commit d8d8209e1bd898acb3455608478a57b9d2ecfc57
parent 70592470957e05df49ed33592c6f9fd68f8c3421
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 8 Jun 2026 22:06:45 -0700
jit/tls: uniform in-image thread-local storage; retire Mach-O TLV thunk
Single-threaded JIT collapses thread-local to a single in-image instance:
the image's .tdata/.tbss IS the storage. Resolve every TLS access to that
storage without touching the host thread pointer (reading host tpidr/fs/tp
aliases into the host process's own TLS — wrong init and unsafe).
- link_jit: map the TLS segment RW in the JIT (AOT keeps it as the RO init
template). Relax the Mach-O TLV access to read storage straight from the
descriptor's +16 slot (ldr xN,[xN,#16]; blr -> nop) — no thunk, no
descriptor patching, no per-thread block. Init comes free from the
copied .tdata.
- kit_jit_tls_addr (replaces kit_jit_tlv_resolve): format-aware,
range-checked resolution for the interpreter/host (Mach-O reads
descriptor[+16]; ELF/COFF the symbol is the storage).
- opt: model IR_TLS_ADDR_OF's clobber set via NATIVE_MOP_TLS_ADDR. The
Mach-O TLV sequence clobbers x0/x16/x17/lr; the optimizer only modeled
its dest, corrupting a value left live in x0 across a second TLS access
at -O1+ (multi-var TLS returned 204 instead of 134). Fixes AOT macOS too.
- Retire the dead machinery: src/jit/ (tlv thunk asm/stub/header),
driver/env/jit_tls_posix.c, the KitJitTls public type + KitJitHost.tls,
Windows FLS-backed TLS, and all wiring (~600 lines removed).
- test/toy 142_threadlocal_multi: multi-var TLS R/I-lane coverage.
- doc/JIT.md: document the uniform model.
ELF Local-Exec is not yet converted (still TP-relative; Step 6).
Verified green on macOS arm64: test-toy 1388/0/39, test-link 122/0,
test-opt, test-cg-api, test-isa, test-aa64-inline, test-smoke-{x64,rv64}.
Diffstat:
24 files changed, 204 insertions(+), 789 deletions(-)
diff --git a/Makefile b/Makefile
@@ -78,12 +78,6 @@ endef
$(foreach s,$(LIB_SRCS_C_GENERAL) $(LIB_ASMS),$(eval $(call libkit_obj_rule,$(s),src,lib)))
$(call flatobjs,$(LIB_SRCS_C_GENERAL) $(LIB_ASMS),src,lib): RULE_CFLAGS = $(LIB_CFLAGS)
-# JIT TLV thunk stub (non-aarch64 only; aarch64 uses the .S version above)
-ifneq ($(LIB_SRCS_JIT_STUB),)
-$(foreach s,$(LIB_SRCS_JIT_STUB),$(eval $(call libkit_obj_rule,$(s),src,lib)))
-$(call flatobjs,$(LIB_SRCS_JIT_STUB),src,lib): RULE_CFLAGS = $(LIB_CFLAGS)
-endif
-
# lang_registry.c is the one libkit source that crosses into lang/*; it
# uses -Ilang so the frontend headers can be reached as "c/c.h" etc.
ifneq ($(LIB_SRCS_LANGREG),)
diff --git a/doc/JIT.md b/doc/JIT.md
@@ -6,8 +6,7 @@ is no separate "JIT compiler": the same linker that writes ELF/Mach-O/PE
files produces a resolved `LinkImage`, and the JIT mapper copies that
image into executable memory, applies relocations against the live
runtime addresses, and exposes a symbol/inspector surface. The mapper is
-`kit_jit_from_image` in `src/link/link_jit.c` — *not* `src/jit/`, which
-holds only the Mach-O TLV thunk. The `kit run` driver
+`kit_jit_from_image` in `src/link/link_jit.c`. The `kit run` driver
(`driver/cmd/run.c`) is the headline consumer; the JIT debugger (see
[DBG.md](DBG.md)) and the emulator's block translator (see [EMU.md](EMU.md))
ride on the same mapping primitives.
@@ -35,8 +34,8 @@ The frontend-to-image half is shared verbatim with the file linker. A
`src/api/link.c`) sets two pieces of state on the `Linker`: `jit_mode`,
which tells layout to skip file serialization and synthesize JIT-only
stubs/GOT, and the *JIT host* (`KitJitHost`), the vtable through which
-libkit reaches the executable-memory allocator and the TLS runtime
-without itself depending on any OS. `kit_link_session_jit` then calls
+libkit reaches the executable-memory allocator without itself depending
+on any OS. `kit_link_session_jit` then calls
`kit_jit_from_image`, transferring ownership of the image (and the
linker that backs it) into the returned `KitJit`.
@@ -93,6 +92,8 @@ handling before reaching `link_reloc_apply`:
offset is computed in image space. AArch64/RISC-V use a 16-byte TCB
bias (`JIT_TLS_TCB_SIZE`) matching `start.c` and the ELF writer;
x86-64 SysV variant II addresses a TLS symbol as `offset - tls_memsz`.
+ (See "Thread-local storage" below — this ELF path is still TP-relative;
+ the single in-image `.tdata`/`.tbss` instance is the model it moves to.)
- RISC-V `PCREL_LO12_I/S`: the low-12 half of an `AUIPC` pair targets a
local anchor at the paired `HI20` site. The mapper finds that paired
reloc, recomputes the displacement against runtime addresses, and feeds
@@ -102,10 +103,12 @@ handling before reaching `link_reloc_apply`:
displacement far outside ±4 GiB once the image is placed away from
address 0, tripping the range check, so the `ADRP` is rewritten to
`MOVZ Xd,#0` and the paired `ADD #0` left as-is.
-- Mach-O `TLVP_LOAD_PAGEOFF12`: Apple's mandatory "LDR→ADD" TLV
- relaxation. Every TLV descriptor is in-image, so the extra indirection
- through a `__thread_ptrs` slot is unnecessary; the load is rewritten to
- an add so the descriptor address lands directly in the register.
+- Mach-O `TLVP_LOAD_PAGEOFF12`: the Mach-O TLV access is collapsed to an
+ ordinary in-image load (see "Thread-local storage" below). The mapper
+ rewrites the `__thread_ptrs` load to an `add` (descriptor address into
+ the register), then rewrites the following thunk-load to
+ `ldr xN,[xN,#16]` and nops the `blr` — so the register ends up holding
+ the variable's in-image storage address with no thunk call.
After relocations are applied, IFUNC resolvers (ELF only) are run
in-process and their results stored into the iplt slots, `.init_array`
@@ -183,35 +186,53 @@ An absent or incomplete `execmem` vtable is a hard error
from the adapter, falling back to `0x4000` if the adapter reports 0; the
POSIX adapter fills it from `sysconf(_SC_PAGESIZE)`.
-## Mach-O thread-local storage: the TLV thunk
-
-`src/jit/` contains exactly one thing: the Mach-O TLV thunk
-(`tlv_thunk_aarch64.S`, with a trapping stub for non-aarch64 hosts in
-`tlv_thunk_stub.c`; contract in `tlv_thunk.h`).
-
-On Mach-O, codegen emits Apple's TLV access sequence: load the variable's
-24-byte descriptor, load `descriptor[+0]` as a thunk pointer, and `blr`
-it with the descriptor in `x0`, expecting the per-thread variable address
-back in `x0`. In an AOT image dyld rewrites that slot to a libdyld thunk
-and allocates a pthread key per descriptor. A JIT image is never seen by
-dyld, so the mapper does dyld's job itself in `jit_patch_tlv_descriptors`:
-it finds every in-image descriptor (reloc-driven, keyed on the
-`__tlv_bootstrap` undef each TU emits), allocates one per-image TLS
-context through the host's `KitJitTls` vtable, and overwrites the three
-descriptor slots with `(&kit_jit_tlv_thunk, ctx, per-thread-offset)`.
-
-The thunk's calling convention is custom — `x0` in/out, every other
-GPR/SIMD register preserved — because it is invoked mid-expression in
-JITed code that has no idea a call is happening. That is why it is hand-
-written assembly: it saves and restores all caller-saved registers around
-a normal C call to the context's `get_block` (the ctx's first field, by
-contract), which lazily allocates and seeds the calling thread's TLS
-block. The driver's `KitJitTls` implementation
-(`driver/env/jit_tls_posix.c`) backs `get_block` with a pthread key whose
-destructor frees the per-thread block on thread exit. `kit_jit_tlv_resolve`
-lets host/interpreter code resolve a thread-local without going through
-the asm path, validating descriptor ownership first so a foreign (dyld)
-descriptor is never blindly called.
+## Thread-local storage
+
+The JIT is single-threaded, which collapses the whole problem: with one
+thread there is exactly one instance of each thread-local, which is
+semantically an ordinary global living in the image's `.tdata`/`.tbss`.
+The mapper already materializes those sections (init bytes copied for
+`.tdata`, zero-filled for `.tbss`), and `perms_for` maps the TLS segment
+read-write in the JIT (the AOT image keeps it as a read-only init
+template each thread copies). So the only work is to make every TLS
+access resolve to the in-image storage **without touching the host thread
+pointer** — addressing through the host's `tpidr_el0`/`fs`/`tp` would alias
+into the host process's own TLS, which is both wrong (no initializer) and
+unsafe (stores scribble on host libc state). Access lowering is therefore
+relaxed at map time to in-image addressing — no thunk, no per-thread block,
+no host TLS vtable — on Mach-O today; ELF Local-Exec is covered below.
+
+- **Mach-O (AArch64):** codegen emits Apple's TLV sequence (load the
+ 24-byte descriptor, load `descriptor[+0]` as a thunk, `blr` it). dyld
+ would rewrite that slot and allocate a pthread key; a JIT image is
+ never seen by dyld. Instead the mapper leaves the descriptor's `+16`
+ slot holding the variable's in-image storage address (the normal
+ `R_ABS64` against the storage symbol) and relaxes the access to read it
+ directly: the `__thread_ptrs` load becomes `add` (descriptor address),
+ the thunk-load becomes `ldr xN,[xN,#16]`, and the `blr` becomes a nop.
+
+- **ELF (AArch64/RISC-V/x86-64):** the in-image relaxation lands first on
+ Mach-O/AArch64; the ELF Local-Exec path is being unified onto the same
+ model (relax `mrs/add/add`, `lui/add tp/addi`, `mov fs:0/lea` to
+ PC-relative in-image addressing, dropping the thread-pointer read). Until
+ that lands, ELF still computes a TP-relative offset against the host
+ thread pointer (the TLS-LE bullet above), which is correct only when the
+ host runtime seeds the thread pointer to the image's TLS block.
+
+`kit_jit_tls_addr` gives host/interpreter code the same resolution from
+the address a thread-local's *symbol* resolves to (Mach-O: read
+`descriptor[+16]`; ELF/COFF: the symbol is the storage), range-checked
+against the image so a foreign/extern thread-local resolved through the
+host is rejected rather than dereferenced.
+
+A subtlety the access lowering imposes on codegen: the Mach-O TLV sequence
+materializes the descriptor in `x0` and (in the AOT form) calls the
+resolver thunk via `x16`, clobbering `x0`/`x16`/`x17`/`lr`. Codegen is
+shared between AOT and JIT, so the optimizer must model that clobber set
+or a value left live in `x0` across a TLS access is corrupted at `-O1`+.
+The backend reports it via `NATIVE_MOP_TLS_ADDR` from
+`machine_op_clobbers` (ELF Local-Exec, which uses only the destination
+register, reports none).
## Symbol and inspector surface
diff --git a/driver/cmd/run.c b/driver/cmd/run.c
@@ -862,19 +862,19 @@ static void* interp_jit_resolve(void* ctx, KitSlice name) {
return p;
}
-/* Thread-local resolver for the interpreter. On Mach-O a thread-local symbol
- * resolves to a TLV descriptor, not the storage; kit_jit_tlv_resolve unwraps
- * our own descriptors to the calling thread's address and returns NULL for
- * anything it can't safely resolve (non-Mach-O images, or a foreign/dyld
- * descriptor such as an extern thread-local). We return NULL in those cases so
- * the engine diagnoses cleanly rather than treating a descriptor — or an
- * unvalidated non-Mach-O symbol address — as the variable's storage. */
+/* Thread-local resolver for the interpreter. A thread-local symbol resolves to
+ * a Mach-O TLV descriptor (whose +16 slot holds the storage address) or, on
+ * ELF/COFF, directly to the in-image storage; kit_jit_tls_addr normalizes both
+ * to the variable's single in-image instance and returns NULL for anything it
+ * can't safely resolve (a foreign/extern thread-local resolved through the
+ * host). We return NULL in those cases so the engine diagnoses cleanly rather
+ * than treating a foreign pointer as the variable's storage. */
static void* interp_jit_resolve_tls(void* ctx, KitSlice name, int64_t addend) {
KitJit* jit = (KitJit*)ctx;
void* sym = interp_jit_resolve(ctx, name);
void* tls;
if (!sym) return NULL;
- tls = kit_jit_tlv_resolve(jit, sym);
+ tls = kit_jit_tls_addr(jit, sym);
return tls ? (uint8_t*)tls + addend : NULL;
}
diff --git a/driver/env.h b/driver/env.h
@@ -13,17 +13,16 @@
* a POSIX file_io implementation (open/read/write on real paths). It is
* the single piece of glue that turns "the host" into a KitContext.
*
- * The execmem / dbg_os / jit_tls vtables that used to live on KitEnv now
- * live on KitJitHost / KitDbgHost. They are still constructed here so
- * that one DriverEnv covers every libkit-using tool, but they're handed
- * to libkit per-call via the appropriate host struct. */
+ * The execmem / dbg_os vtables that used to live on KitEnv now live on
+ * KitJitHost / KitDbgHost. They are still constructed here so that one
+ * DriverEnv covers every libkit-using tool, but they're handed to libkit
+ * per-call via the appropriate host struct. */
typedef struct DriverEnv {
KitHeap* heap;
KitDiagSink* diag;
KitFileIO file_io;
const KitExecMem* execmem;
const KitDbgOs* dbg_os; /* NULL unless `kit dbg` paths run */
- const KitJitTls* jit_tls; /* NULL unless `kit run` w/ TLV paths run */
const KitMetrics* metrics; /* optional scoped metrics sink */
int64_t now; /* unix seconds; -1 = unknown */
const char* cache_dir; /* base cache dir, e.g. ~/.cache/kit */
@@ -37,9 +36,9 @@ void driver_env_fini(DriverEnv*);
* libkit entry that takes `const KitContext *`. */
KitContext driver_env_to_context(const DriverEnv*);
-/* Build a KitJitHost from the DriverEnv (execmem + tls). The returned
- * struct holds borrowed pointers to vtables owned by g_execmem_posix /
- * g_jit_tls_posix; callers must not outlive driver_env_fini. */
+/* Build a KitJitHost from the DriverEnv (execmem). The returned struct
+ * holds a borrowed pointer to the vtable owned by g_execmem_posix;
+ * callers must not outlive driver_env_fini. */
KitJitHost driver_env_to_jit_host(const DriverEnv*);
/* Build a KitDbgHost from the DriverEnv (dbg_os). */
diff --git a/driver/env/env_posix.h b/driver/env/env_posix.h
@@ -2,7 +2,7 @@
#define KIT_DRIVER_ENV_POSIX_H
/* POSIX-shared internal surface used by the POSIX env TUs:
- * driver/env/posix.c, posix_dbg.c, jit_tls_posix.c
+ * driver/env/posix.c, posix_dbg.c
* driver/env/macos.c, linux.c, freebsd.c
* driver/env/linux_exec_hint_<arch>.c
* driver/env/uctx_<arch>_<os>.c
@@ -98,7 +98,6 @@ void os_host_target_fill(KitTargetSpec* t);
/* ---- vtable singletons wired into DriverEnv on POSIX -------------------- */
extern KitExecMem g_execmem_posix; /* posix.c — page_size set in init */
extern KitDbgOs g_dbg_os_posix; /* posix_dbg.c */
-extern KitJitTls g_jit_tls_posix; /* jit_tls_posix.c */
/* posix_dbg.c exposes a worker-thread check for the signal handler. */
int posix_dbg_caller_is_worker(void);
diff --git a/driver/env/jit_tls_posix.c b/driver/env/jit_tls_posix.c
@@ -1,120 +0,0 @@
-/* pthread_key-backed KitJitTls. Backs `kit run` on Mach-O targets:
- * every JIT image with TLS gets one pthread_key, the per-thread block is
- * allocated lazily on first access, and freed via the key's destructor
- * when the thread exits.
- *
- * The ctx layout is fixed by the contract in src/jit/tlv_thunk.h: the
- * first 8 bytes MUST be a function pointer the asm thunk calls with
- * x0 = ctx and expects back an x0 = TLS block. We satisfy this by making
- * `get_block` the first field. */
-
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "env_posix.h"
-
-typedef struct JitTlsCtx {
- void* (*get_block)(void* ctx); /* first; matches tlv_thunk's expectation */
- pthread_key_t key;
- size_t image_size;
- size_t image_filesz;
- size_t align;
- void* init_bytes; /* heap-owned copy of init bytes, or NULL if all BSS */
-} JitTlsCtx;
-
-static void jit_tls_thread_dtor(void* block) {
- /* POSIX pthread_key destructor: called when a thread that touched the
- * TLV exits. `block` is the void* set by pthread_setspecific, never
- * NULL (POSIX skips the destructor if it was NULL). */
- free(block);
-}
-
-static void* jit_tls_alloc_block(JitTlsCtx* ctx) {
- /* macOS aligned_alloc requires alignment >= sizeof(void*); bump
- * smaller request alignments up. Size must be a multiple of
- * alignment too. */
- size_t a = ctx->align ? ctx->align : sizeof(void*);
- size_t sz;
- void* block;
- if (a < sizeof(void*)) a = sizeof(void*);
- sz = (ctx->image_size + a - 1u) & ~(a - 1u);
- if (sz == 0) sz = a; /* zero-size TLS image still needs a non-NULL block */
- block = aligned_alloc(a, sz);
- if (!block) return NULL;
- if (ctx->image_filesz && ctx->init_bytes)
- memcpy(block, ctx->init_bytes, ctx->image_filesz);
- if (ctx->image_size > ctx->image_filesz)
- memset((char*)block + ctx->image_filesz, 0,
- ctx->image_size - ctx->image_filesz);
- return block;
-}
-
-static void* jit_tls_get_block(void* ctx_v) {
- JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
- void* block = pthread_getspecific(ctx->key);
- if (block) return block;
- block = jit_tls_alloc_block(ctx);
- if (!block) {
- fprintf(stderr, "kit run: out of memory allocating per-thread TLS block\n");
- abort();
- }
- if (pthread_setspecific(ctx->key, block) != 0) {
- fprintf(stderr, "kit run: pthread_setspecific failed in TLV thunk\n");
- abort();
- }
- return block;
-}
-
-static void* jit_tls_ctx_new(void* user, const void* init_bytes,
- size_t image_filesz, size_t image_size,
- size_t align) {
- JitTlsCtx* ctx;
- (void)user;
- ctx = (JitTlsCtx*)malloc(sizeof(*ctx));
- if (!ctx) return NULL;
- ctx->get_block = jit_tls_get_block;
- ctx->image_size = image_size;
- ctx->image_filesz = image_filesz;
- ctx->align = align ? align : sizeof(void*);
- ctx->init_bytes = NULL;
- if (image_filesz && init_bytes) {
- ctx->init_bytes = malloc(image_filesz);
- if (!ctx->init_bytes) {
- free(ctx);
- return NULL;
- }
- memcpy(ctx->init_bytes, init_bytes, image_filesz);
- }
- if (pthread_key_create(&ctx->key, jit_tls_thread_dtor) != 0) {
- free(ctx->init_bytes);
- free(ctx);
- return NULL;
- }
- return ctx;
-}
-
-static void jit_tls_ctx_destroy(void* user, void* ctx_v) {
- JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
- void* my_block;
- (void)user;
- if (!ctx) return;
- /* Free the calling thread's block (POSIX won't run our destructor for
- * it; pthread_key_delete also doesn't fire destructors for live
- * threads). Other threads' blocks are reaped when those threads exit. */
- my_block = pthread_getspecific(ctx->key);
- if (my_block) {
- pthread_setspecific(ctx->key, NULL);
- free(my_block);
- }
- pthread_key_delete(ctx->key);
- free(ctx->init_bytes);
- free(ctx);
-}
-
-KitJitTls g_jit_tls_posix = {
- .ctx_new = jit_tls_ctx_new,
- .ctx_destroy = jit_tls_ctx_destroy,
- .user = NULL,
-};
diff --git a/driver/env/posix.c b/driver/env/posix.c
@@ -1191,7 +1191,6 @@ void driver_env_init(DriverEnv* e) {
e->execmem = &g_execmem_posix;
e->dbg_os = &g_dbg_os_posix;
- e->jit_tls = &g_jit_tls_posix;
e->metrics = NULL;
{
@@ -1239,7 +1238,6 @@ KitContext driver_env_to_context(const DriverEnv* e) {
KitJitHost driver_env_to_jit_host(const DriverEnv* e) {
KitJitHost h;
h.execmem = e->execmem;
- h.tls = e->jit_tls;
return h;
}
diff --git a/driver/env/windows.c b/driver/env/windows.c
@@ -1,6 +1,6 @@
-/* Windows host environment. Replaces posix.c, posix_dbg.c, and
- * jit_tls_posix.c on Win32 builds; common.c is reused unchanged. Built
- * against the Win32 API with MinGW-w64 in mind.
+/* Windows host environment. Replaces posix.c and posix_dbg.c on Win32
+ * builds; common.c is reused unchanged. Built against the Win32 API with
+ * MinGW-w64 in mind.
*
* Coverage:
* - file_io (CreateFileW/ReadFile/WriteFile + UTF-8 path conversion)
@@ -19,9 +19,6 @@
* GetThreadContext for the interrupt path (the interrupt's on_fault
* runs on the *caller* thread; natural faults run on the worker
* thread inside the VEH, matching POSIX semantics there).
- * - jit_tls: FlsAlloc/FlsGetValue/FlsSetValue with a per-thread dtor.
- * The TLV thunk's first-field contract is preserved (`get_block` is
- * the first field of JitTlsCtx).
*
* The W^X model on Windows mirrors the Linux/FreeBSD memfd path: a single
* pagefile-backed file-mapping object is mapped twice -- once RW (write
@@ -1682,115 +1679,9 @@ static KitDbgOs g_dbg_os_win = {
.user = NULL,
};
-/* ============================================================
- * jit_tls (FlsAlloc with per-thread dtor)
- * ============================================================ */
-
-/* The TLV thunk's contract (src/jit/tlv_thunk.h): the first 8 bytes of
- * ctx must be a function pointer the asm calls with arg0 == ctx and
- * expects back arg0 == TLS block. We satisfy that by making `get_block`
- * the first field. */
-typedef struct JitTlsCtxWin {
- void* (*get_block)(void* ctx);
- DWORD fls_index;
- size_t image_size;
- size_t image_filesz;
- size_t align;
- void* init_bytes;
-} JitTlsCtxWin;
-
-static void __stdcall jit_tls_thread_dtor(PVOID block) {
- /* FlsAlloc destructor: runs on the thread that's exiting, OR when the
- * FLS index is freed via FlsFree (for live threads' blocks). */
- if (block) free(block);
-}
-
-static void* jit_tls_alloc_block_win(JitTlsCtxWin* ctx) {
- size_t a = ctx->align ? ctx->align : sizeof(void*);
- size_t sz;
- void* block;
- if (a < sizeof(void*)) a = sizeof(void*);
- sz = (ctx->image_size + a - 1u) & ~(a - 1u);
- if (sz == 0) sz = a;
- /* _aligned_malloc is the MSVCRT/UCRT counterpart to aligned_alloc; the
- * matching free is _aligned_free, but only the FLS destructor frees
- * these blocks and it uses plain free. So allocate with malloc-style
- * alignment instead: overallocate and align manually -- but FLS dtor
- * needs to know the original pointer. Simpler: require align <=
- * MEMORY_ALLOCATION_ALIGNMENT (16 on Win64), which the TLS images we
- * see in practice satisfy. */
- (void)a;
- block = malloc(sz);
- if (!block) return NULL;
- if (ctx->image_filesz && ctx->init_bytes)
- memcpy(block, ctx->init_bytes, ctx->image_filesz);
- if (ctx->image_size > ctx->image_filesz)
- memset((char*)block + ctx->image_filesz, 0,
- ctx->image_size - ctx->image_filesz);
- return block;
-}
-
-static void* jit_tls_get_block_win(void* ctx_v) {
- JitTlsCtxWin* ctx = (JitTlsCtxWin*)ctx_v;
- void* block = FlsGetValue(ctx->fls_index);
- if (block) return block;
- block = jit_tls_alloc_block_win(ctx);
- if (!block) {
- fprintf(stderr, "kit run: out of memory allocating per-thread TLS block\n");
- abort();
- }
- if (!FlsSetValue(ctx->fls_index, block)) {
- fprintf(stderr, "kit run: FlsSetValue failed in TLV thunk\n");
- abort();
- }
- return block;
-}
-
-static void* jit_tls_ctx_new_win(void* user, const void* init_bytes,
- size_t image_filesz, size_t image_size,
- size_t align) {
- JitTlsCtxWin* ctx;
- (void)user;
- ctx = (JitTlsCtxWin*)malloc(sizeof(*ctx));
- if (!ctx) return NULL;
- ctx->get_block = jit_tls_get_block_win;
- ctx->image_size = image_size;
- ctx->image_filesz = image_filesz;
- ctx->align = align ? align : sizeof(void*);
- ctx->init_bytes = NULL;
- if (image_filesz && init_bytes) {
- ctx->init_bytes = malloc(image_filesz);
- if (!ctx->init_bytes) {
- free(ctx);
- return NULL;
- }
- memcpy(ctx->init_bytes, init_bytes, image_filesz);
- }
- ctx->fls_index = FlsAlloc(jit_tls_thread_dtor);
- if (ctx->fls_index == FLS_OUT_OF_INDEXES) {
- free(ctx->init_bytes);
- free(ctx);
- return NULL;
- }
- return ctx;
-}
-
-static void jit_tls_ctx_destroy_win(void* user, void* ctx_v) {
- JitTlsCtxWin* ctx = (JitTlsCtxWin*)ctx_v;
- (void)user;
- if (!ctx) return;
- /* FlsFree runs the destructor for every live thread's block before
- * releasing the index. The calling thread's block is reaped here too. */
- FlsFree(ctx->fls_index);
- free(ctx->init_bytes);
- free(ctx);
-}
-
-static KitJitTls g_jit_tls_win = {
- .ctx_new = jit_tls_ctx_new_win,
- .ctx_destroy = jit_tls_ctx_destroy_win,
- .user = NULL,
-};
+/* JIT thread-local access resolves to in-image storage (single-threaded JIT:
+ * the in-image TLS data is the single instance — see src/link/link_jit.c), so
+ * no host-provided per-thread TLS vtable is needed. */
/* ============================================================
* host target
@@ -1857,7 +1748,6 @@ void driver_env_init(DriverEnv* e) {
e->execmem = &g_execmem_win;
e->dbg_os = &g_dbg_os_win;
- e->jit_tls = &g_jit_tls_win;
e->metrics = NULL;
{
@@ -1905,7 +1795,6 @@ KitContext driver_env_to_context(const DriverEnv* e) {
KitJitHost driver_env_to_jit_host(const DriverEnv* e) {
KitJitHost h;
h.execmem = e->execmem;
- h.tls = e->jit_tls;
return h;
}
diff --git a/include/kit/jit.h b/include/kit/jit.h
@@ -35,16 +35,8 @@ typedef struct KitExecMem {
void* user;
} KitExecMem;
-typedef struct KitJitTls {
- void* (*ctx_new)(void* user, const void* init_bytes, size_t image_filesz,
- size_t image_size, size_t align);
- void (*ctx_destroy)(void* user, void* ctx);
- void* user;
-} KitJitTls;
-
typedef struct KitJitHost {
const KitExecMem* execmem;
- const KitJitTls* tls;
} KitJitHost;
KIT_API void kit_jit_free(KitJit*);
@@ -75,11 +67,14 @@ KIT_API KitStatus kit_jit_addr_to_sym(KitJit*, uint64_t addr,
KIT_API uint64_t kit_jit_runtime_to_image(KitJit*, uint64_t runtime_pc);
KIT_API uint64_t kit_jit_image_to_runtime(KitJit*, uint64_t image_vaddr);
-/* Resolve a thread-local variable's address for the calling thread, given the
- * runtime address its symbol resolves to (a Mach-O TLV descriptor). Returns
- * NULL when the image/target uses no TLV descriptor (e.g. non-Mach-O), letting
- * callers diagnose rather than dereference a descriptor as data. */
-KIT_API void* kit_jit_tlv_resolve(KitJit*, void* descriptor);
+/* Resolve a thread-local variable's in-image storage from the address its
+ * SYMBOL resolves to. The JIT is single-threaded, so the in-image .tdata/.tbss
+ * is the variable's single instance. On Mach-O the symbol resolves to a TLV
+ * descriptor whose +16 slot holds the storage address; on ELF/COFF the symbol
+ * already resolves to the storage. Returns NULL for an address outside this
+ * image (e.g. an extern thread-local resolved through the host), letting
+ * callers diagnose rather than dereference a foreign pointer. */
+KIT_API void* kit_jit_tls_addr(KitJit*, void* sym_addr);
typedef struct KitJitSymIter KitJitSymIter;
diff --git a/mk/env.mk b/mk/env.mk
@@ -114,9 +114,9 @@ HOST_LDLIBS += -lpthread
# Export all globals so libc.so can resolve symbols like `environ` from the exe
HOST_ENV_LDFLAGS += -rdynamic
else ifeq ($(HOST_OS),windows)
-# windows.c subsumes posix.c / posix_dbg.c / jit_tls_posix.c and folds in
-# its own CONTEXT-based register marshalling -- there's no POSIX overlap
-# worth sharing. EnumProcessModules pulls in -lpsapi.
+# windows.c subsumes posix.c / posix_dbg.c and folds in its own
+# CONTEXT-based register marshalling -- there's no POSIX overlap worth
+# sharing. EnumProcessModules pulls in -lpsapi.
DRIVER_ENV_OS_CFLAGS := -D_WIN32_WINNT=0x0601
DRIVER_ENV_OS_SRC := driver/env/windows.c
HOST_LDLIBS += -lpsapi
@@ -180,7 +180,6 @@ DRIVER_ENV_SRCS := \
driver/env/common.c \
driver/env/posix.c \
driver/env/posix_dbg.c \
- driver/env/jit_tls_posix.c \
$(DRIVER_ENV_OS_SRC) \
$(DRIVER_ENV_HINT_SRC) \
$(DRIVER_ENV_ICACHE_SRC) \
diff --git a/mk/lib_srcs.mk b/mk/lib_srcs.mk
@@ -128,14 +128,6 @@ LIB_SRCS_LINK := $(shell find src/link -name '*.c' 2>/dev/null)
ifneq ($(KIT_JIT_ENABLED),1)
LIB_SRCS_LINK := $(filter-out %/link_jit.c,$(LIB_SRCS_LINK))
endif
-# The JIT TLV thunk is aarch64-specific asm; every other target uses the C stub.
-ifeq ($(HOST_ARCH),aarch64)
-LIB_SRCS_JIT_ASM := $(shell find src/jit -name '*.S' 2>/dev/null)
-LIB_SRCS_JIT_STUB :=
-else
-LIB_SRCS_JIT_ASM :=
-LIB_SRCS_JIT_STUB := src/jit/tlv_thunk_stub.c
-endif
LIB_SRC_ABI_AAPCS64 = src/abi/abi_aapcs64.c
LIB_SRC_ABI_APPLE_ARM64 = src/abi/abi_apple_arm64.c
@@ -265,10 +257,9 @@ ifeq ($(KIT_LANG_TOY_ENABLED),1)
LANG_OBJS += $(call flatobjs,$(LANG_TOY_SRCS),lang,lang)
endif
+# Library assembly sources (currently none). The compile rules in the Makefile
+# consume LIB_ASMS, so keep it defined even when empty.
LIB_ASMS =
-ifeq ($(KIT_JIT_ENABLED),1)
-LIB_ASMS += $(LIB_SRCS_JIT_ASM)
-endif
# Compile-rule source groups (each shares one compile-flag profile; the rules
# that consume these live in the Makefile). lang_registry.c is split out: it is
@@ -281,6 +272,5 @@ LIB_OBJS = $(call flatobjs,$(LIB_SRCS_C_GENERAL),src,lib) \
$(call flatobjs,$(LIB_SRCS_LANGREG),src,lib) \
$(call flatobjs,$(LIB_SRCS_VENDOR),vendor,vendor) \
$(LANG_OBJS) \
- $(call flatobjs,$(LIB_ASMS),src,lib) \
- $(call flatobjs,$(LIB_SRCS_JIT_STUB),src,lib)
+ $(call flatobjs,$(LIB_ASMS),src,lib)
LIB_DEPS = $(LIB_OBJS:.o=.d)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -3626,8 +3626,20 @@ static void aa_trap(NativeTarget* t) { aa_emit32(t->mc, aa64_brk(0)); }
static int aa_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
- (void)t;
mask[0] = mask[1] = mask[2] = 0;
+ if ((NativeMachineOpKind)op->kind == NATIVE_MOP_TLS_ADDR) {
+ /* ELF Local-Exec materializes the address using only the destination
+ * register (mrs tpidr_el0 + add/add into rd) — no extra clobbers. The
+ * Mach-O TLV sequence loads the descriptor into x0 and calls the resolver
+ * thunk through x16, clobbering x0/x16/x17 and the link register; the JIT
+ * relaxation of that same sequence keeps the x0/x16/x17 footprint. Report
+ * the full AOT set (LR included; codegen is shared with the JIT) so a value
+ * live across a TLS access is not left in one of these registers. */
+ if (!obj_format_tls_via_descriptor(t->c)) return 0;
+ mask[NATIVE_REG_INT] =
+ (1u << 0) | (1u << 16) | (1u << 17) | (1u << AA_LR);
+ return 1;
+ }
if ((NativeMachineOpKind)op->kind != NATIVE_MOP_INTRINSIC ||
(IntrinKind)op->intrin != INTRIN_SYSCALL)
return 0;
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -294,6 +294,13 @@ typedef enum NativeMachineOpKind {
NATIVE_MOP_ATOMIC_CAS,
NATIVE_MOP_ATOMIC_RMW,
NATIVE_MOP_INTRINSIC,
+ /* A thread-local address materialization (IR_TLS_ADDR_OF). On targets whose
+ * TLS access model uses fixed scratch/result registers or a resolver-thunk
+ * call (e.g. Mach-O TLV descriptors → x0/x16/x17/lr), the encoding clobbers
+ * those regs even though the IR op only declares its destination. Targets
+ * whose TLS sequence touches only the destination register (ELF Local-Exec)
+ * report no clobbers. */
+ NATIVE_MOP_TLS_ADDR,
} NativeMachineOpKind;
typedef struct NativeMachineOp {
diff --git a/src/jit/tlv_thunk.h b/src/jit/tlv_thunk.h
@@ -1,50 +0,0 @@
-#ifndef KIT_JIT_TLV_THUNK_H
-#define KIT_JIT_TLV_THUNK_H
-
-/* The JIT-time TLV thunk for Mach-O thread-local access.
- *
- * kit's codegen emits the Apple TLV access sequence whenever a Mach-O
- * target dereferences a `_Thread_local`:
- *
- * adrp x0, sym@TLVPPAGE
- * ldr x0, [x0, sym@TLVPPAGEOFF] ; x0 = descriptor
- * ldr x1, [x0] ; x1 = descriptor[+0] (thunk*)
- * blr x1 ; thunk(x0=desc) -> x0=TLV addr
- *
- * The thunk's ABI is custom: x0 in/out as the descriptor / per-thread
- * TLV address, every other GPR and SIMD register preserved. In an AOT
- * Mach-O image dyld rewrites descriptor[+0] to a libdyld-supplied thunk
- * after allocating a pthread key per descriptor; the JIT image is never
- * walked by dyld, so we install our own thunk and patch every
- * descriptor's slot[0]/[1]/[2] from `kit_jit_from_image`.
- *
- * Per-image descriptor convention (post-patch):
- * [+0] : &kit_jit_tlv_thunk (entry below)
- * [+8] : opaque KitJitTls ctx pointer (per JIT image)
- * [+16] : byte offset within the per-thread TLS block (image-relative)
- *
- * Contract on the ctx pointer (set by KitJitTls.ctx_new): its first 8
- * bytes are a function pointer `void* (*get_block)(void* ctx)` that
- * returns the calling thread's TLS block (lazy-allocating + seeding
- * from the image's init bytes on first per-thread call).
- *
- * The thunk does roughly:
- *
- * void* thunk(void* desc) {
- * void* ctx = *(void**)((u8*)desc + 8);
- * void* (*get_block)(void*) = *(void**)ctx;
- * void* base = get_block(ctx);
- * return (u8*)base + *(u64*)((u8*)desc + 16);
- * }
- *
- * Calling `get_block` from a context that must preserve x1..x18 / q0..q7
- * is the reason the thunk is implemented in asm — a normal C function
- * call would clobber caller-saved regs the JITed access sequence has no
- * idea about. */
-
-/* Declared as a function for &-of, but its calling convention is the
- * custom one described above: callers must come through the access
- * sequence, not a plain C call. */
-void kit_jit_tlv_thunk(void);
-
-#endif
diff --git a/src/jit/tlv_thunk_aarch64.S b/src/jit/tlv_thunk_aarch64.S
@@ -1,113 +0,0 @@
-/* The Mach-O TLV thunk for the JIT path.
- *
- * Called via:
- * ldr x1, [x0] ; x0 = descriptor, x1 = thunk addr
- * blr x1
- * with the contract "x0 in/out as descriptor -> TLV addr, every other
- * GPR/SIMD register preserved". No C frame at entry: the access
- * sequence is mid-expression in JITed code, so we must save and restore
- * everything caller-saved before/after calling out to the host
- * `get_block` helper.
- *
- * See src/jit/tlv_thunk.h for the descriptor layout and ctx contract. */
-
-#if defined(__aarch64__)
-
- .text
- .p2align 2
- .globl _kit_jit_tlv_thunk
- .globl kit_jit_tlv_thunk
-_kit_jit_tlv_thunk:
-kit_jit_tlv_thunk:
- /* Frame layout (544 bytes, 16-byte aligned):
- * sp+ 0 .. sp+127 : x1-x16 (eight stp pairs)
- * sp+128 : x17
- * sp+136 : scratch slot for descriptor pointer
- * sp+144 : x29 (FP)
- * sp+152 : x30 (LR)
- * sp+160 .. sp+543 : v0-v7, v16-v31 (24 q regs)
- *
- * x18 is platform-reserved on Apple aarch64 (don't touch). v8-v15
- * are callee-saved by ABI so the host's get_block won't perturb
- * them; we skip them. */
- sub sp, sp, #544
-
- stp x1, x2, [sp, # 0]
- stp x3, x4, [sp, # 16]
- stp x5, x6, [sp, # 32]
- stp x7, x8, [sp, # 48]
- stp x9, x10, [sp, # 64]
- stp x11, x12, [sp, # 80]
- stp x13, x14, [sp, # 96]
- stp x15, x16, [sp, #112]
- str x17, [sp, #128]
-
- stp x29, x30, [sp, #144]
- add x29, sp, #144
-
- stp q0, q1, [sp, #160]
- stp q2, q3, [sp, #192]
- stp q4, q5, [sp, #224]
- stp q6, q7, [sp, #256]
- stp q16, q17, [sp, #288]
- stp q18, q19, [sp, #320]
- stp q20, q21, [sp, #352]
- stp q22, q23, [sp, #384]
- stp q24, q25, [sp, #416]
- stp q26, q27, [sp, #448]
- stp q28, q29, [sp, #480]
- stp q30, q31, [sp, #512]
-
- /* Stash desc; we'll need [desc + 16] (the byte offset) after the
- * call, but the call clobbers x0. */
- str x0, [sp, #136]
-
- /* ctx = *(desc + 8); get_block = *ctx. */
- ldr x1, [x0, #8]
- ldr x16, [x1]
-
- /* x0 = ctx, call get_block(ctx) -> x0 = block base. */
- mov x0, x1
- blr x16
-
- /* x0 = base; load offset and combine. */
- ldr x1, [sp, #136]
- ldr x1, [x1, #16]
- add x0, x0, x1
-
- /* Restore SIMD. */
- ldp q0, q1, [sp, #160]
- ldp q2, q3, [sp, #192]
- ldp q4, q5, [sp, #224]
- ldp q6, q7, [sp, #256]
- ldp q16, q17, [sp, #288]
- ldp q18, q19, [sp, #320]
- ldp q20, q21, [sp, #352]
- ldp q22, q23, [sp, #384]
- ldp q24, q25, [sp, #416]
- ldp q26, q27, [sp, #448]
- ldp q28, q29, [sp, #480]
- ldp q30, q31, [sp, #512]
-
- /* Restore FP/LR and GPRs (last so the temps we used above don't
- * leak out). */
- ldp x29, x30, [sp, #144]
-
- ldp x1, x2, [sp, # 0]
- ldp x3, x4, [sp, # 16]
- ldp x5, x6, [sp, # 32]
- ldp x7, x8, [sp, # 48]
- ldp x9, x10, [sp, # 64]
- ldp x11, x12, [sp, # 80]
- ldp x13, x14, [sp, # 96]
- ldp x15, x16, [sp, #112]
- ldr x17, [sp, #128]
-
- add sp, sp, #544
- ret
-
-#if defined(__linux__) && defined(__ELF__)
- .section .note.GNU-stack,"",%progbits
-#endif
-
-#endif /* __aarch64__ */
diff --git a/src/jit/tlv_thunk_stub.c b/src/jit/tlv_thunk_stub.c
@@ -1,24 +0,0 @@
-/* Stub `kit_jit_tlv_thunk` for non-aarch64 hosts. The asm version
- * lives in tlv_thunk_aarch64.S; on hosts that can't run the JIT image's
- * Mach-O code anyway (the JIT runs in-process, so target arch must
- * equal host arch), the symbol exists only so taking its address in
- * src/link/link_jit.c links cleanly. */
-
-#include "jit/tlv_thunk.h"
-
-#if !defined(__aarch64__)
-
-void kit_jit_tlv_thunk(void) {
- /* Reachable only if a non-aarch64 host somehow attempted to JIT an
- * aarch64 Mach-O image with TLVs — should have been rejected long
- * before any access reaches the thunk. */
- __builtin_trap();
-}
-
-#else
-
-/* Real implementation lives in tlv_thunk_aarch64.S; keep the TU
- * non-empty for -Wempty-translation-unit. */
-extern void kit_jit_tlv_thunk(void);
-
-#endif
diff --git a/src/link/link_jit.c b/src/link/link_jit.c
@@ -20,7 +20,6 @@
#include "core/pool.h"
#include "core/slice.h"
#include "core/util.h"
-#include "jit/tlv_thunk.h"
#include "link/link.h"
#include "link/link_internal.h"
#include "link/link_reloc_desc.h"
@@ -89,14 +88,7 @@ struct KitJit {
u64 append_cursor[SEG_NBUCKETS];
u64 append_limit[SEG_NBUCKETS];
u64 generation;
- /* Mach-O TLV runtime state. Lazily allocated by jit_patch_tlv_descriptors
- * when the image contains any in-image TLV descriptor. `tls_vtable` is
- * borrowed from KitEnv (lives across the env's lifetime); `tls_ctx`
- * is owned by us and freed via tls_vtable->ctx_destroy in
- * kit_jit_free. */
- const KitJitTls* tls_vtable;
- void* tls_ctx;
- /* Borrowed JIT host: execmem + tls vtables. Mirrors the Linker's
+ /* Borrowed JIT host: execmem vtable. Mirrors the Linker's
* jit_host so the JIT's lifetime accessors don't need to walk back to
* the Linker (which is still live behind jit->linker but may be
* decoupled in incremental flows). */
@@ -143,6 +135,11 @@ static int perms_for(u32 secflags) {
int p = KIT_PROT_READ;
if (secflags & SF_EXEC) p |= KIT_PROT_EXEC;
if (secflags & SF_WRITE) p |= KIT_PROT_WRITE;
+ /* JIT TLS storage is the single live instance (the JIT is single-threaded),
+ * accessed and mutated in place, so the TLS segment must be writable — even
+ * though the AOT image treats .tdata/.tbss as a read-only init template that
+ * each thread copies. */
+ if (secflags & SF_TLS) p |= KIT_PROT_WRITE;
return p;
}
@@ -258,116 +255,6 @@ static void jit_copy_input_section_bytes(LinkImage* img,
}
}
-/* Walk every TLV descriptor and overwrite its three slots with
- * (thunk_addr, ctx, per-thread offset). See src/jit/tlv_thunk.h for
- * the descriptor contract. Iteration is reloc-driven: every descriptor
- * carries one R_ABS64 against the cached `__tlv_bootstrap` undef extern
- * at its base, and one R_ABS64 against the storage symbol at base+16.
- * The +16 reloc gives us the storage symbol's image vaddr; subtracting
- * img->tls_vaddr yields the per-thread byte offset our thunk adds to the
- * caller's TLS block on every access. */
-static void jit_patch_tlv_descriptors(KitJit* jit) {
- LinkImage* img = jit->image;
- Compiler* c = jit->c;
- /* TLV descriptors are the Mach-O TLS-descriptor access model; ask the format
- * via the TLS-model hook rather than naming Mach-O directly. */
- if (!obj_format_tls_via_descriptor(c)) return;
- if (img->tls_memsz == 0) return;
-
- /* Find every LinkSymId whose interned name is __tlv_bootstrap. The
- * symbol is emitted as a weak-undef per TU (one ObjBuilder appends
- * one undef sym); link_resolve_symbols does NOT fan undef externs
- * into img->globals (only definitions go there), so we iterate
- * img->syms directly. One reloc with target = any of these IDs
- * marks a descriptor's +0 slot.
- *
- * Bitmap over LinkSymId so the inner reloc test is O(1). */
- Sym tlv_name = pool_intern_slice(c->global, SLICE_LIT("__tlv_bootstrap"));
- u32 nsyms = LinkSyms_count(&img->syms);
- Heap* h = (Heap*)c->ctx->heap;
- u8* is_tlv_bootstrap = (u8*)h->alloc(h, nsyms + 1u, 1u);
- if (!is_tlv_bootstrap)
- compiler_panic(c, SRCLOC_NONE, "kit_jit: oom on tlv-bootstrap bitmap");
- memset(is_tlv_bootstrap, 0, nsyms + 1u);
- int any_tlv = 0;
- for (u32 si = 0; si < nsyms; ++si) {
- LinkSymbol* s = LinkSyms_at(&img->syms, si);
- if (s && s->name == tlv_name) {
- is_tlv_bootstrap[si + 1u] = 1u;
- any_tlv = 1;
- }
- }
- if (!any_tlv) {
- h->free(h, is_tlv_bootstrap, nsyms + 1u);
- return;
- }
-
- const KitJitTls* tls = jit->jit_host ? jit->jit_host->tls : NULL;
- if (!tls || !tls->ctx_new || !tls->ctx_destroy)
- compiler_panic(c, SRCLOC_NONE,
- "kit_jit: image needs TLV thunk but jit host tls is NULL "
- "or incomplete");
-
- /* Snapshot the TLS image's init bytes from the write alias. The
- * write alias of any SF_TLS section stays readable for the lifetime
- * of KitJit; ctx_new is expected to copy what it needs. */
- const u8* init_bytes = NULL;
- if (img->tls_filesz) {
- init_bytes = (const u8*)vaddr_to_write(img, jit->segs, img->tls_vaddr);
- if (!init_bytes)
- compiler_panic(c, SRCLOC_NONE,
- "kit_jit: tls_vaddr does not map to any segment");
- }
- size_t align = img->tls_align ? (size_t)img->tls_align : 1u;
- void* ctx = tls->ctx_new(tls->user, init_bytes, (size_t)img->tls_filesz,
- (size_t)img->tls_memsz, align);
- if (!ctx)
- compiler_panic(c, SRCLOC_NONE, "kit_jit: jit_tls->ctx_new returned NULL");
-
- jit->tls_vtable = tls;
- jit->tls_ctx = ctx;
-
- uintptr_t thunk_addr = (uintptr_t)&kit_jit_tlv_thunk;
- u32 nrel = LinkRelocs_count(&img->relocs);
- for (u32 i = 0; i < nrel; ++i) {
- const LinkRelocApply* r = LinkRelocs_at(&img->relocs, i);
- if (r->target == LINK_SYM_NONE || !is_tlv_bootstrap[r->target]) continue;
- if (r->kind != R_ABS64) continue;
-
- u64 desc_vaddr = r->write_vaddr;
-
- /* Locate the paired +16 reloc. Quadratic, but reloc counts are
- * small (one reloc pair per TLV var) so the inner scan amortizes. */
- const LinkRelocApply* r16 = NULL;
- for (u32 j = 0; j < nrel; ++j) {
- const LinkRelocApply* q = LinkRelocs_at(&img->relocs, j);
- if (q->kind == R_ABS64 && q->write_vaddr == desc_vaddr + 16u) {
- r16 = q;
- break;
- }
- }
- if (!r16 || r16->target == LINK_SYM_NONE)
- compiler_panic(c, SRCLOC_NONE,
- "kit_jit: TLV descriptor missing data-symbol reloc");
-
- const LinkSymbol* data_sym = LinkSyms_at(&img->syms, r16->target - 1);
- if (!data_sym || !data_sym->defined)
- compiler_panic(c, SRCLOC_NONE,
- "kit_jit: TLV descriptor data symbol is undefined");
- u64 offset_in_image = (data_sym->vaddr + (u64)r16->addend) - img->tls_vaddr;
-
- u8* write = (u8*)vaddr_to_write(img, jit->segs, desc_vaddr);
- if (!write)
- compiler_panic(c, SRCLOC_NONE,
- "kit_jit: TLV descriptor vaddr does not map");
- wr_u64_le(write + 0u, (u64)thunk_addr);
- wr_u64_le(write + 8u, (u64)(uintptr_t)ctx);
- wr_u64_le(write + 16u, offset_in_image);
- }
-
- h->free(h, is_tlv_bootstrap, nsyms + 1u);
-}
-
KitJit* kit_jit_from_image(LinkImage* img) {
Compiler* c;
Heap* heap;
@@ -548,24 +435,37 @@ KitJit* kit_jit_from_image(LinkImage* img) {
continue;
}
}
- /* Apple ld's "LDR -> ADD" TLVP relaxation, mandatory in the JIT.
- *
- * The Mach-O TLV access sequence loads `x0 = *thread_ptrs_slot`,
- * where the slot's content is the descriptor's vaddr. The AOT
- * Mach-O writer synthesizes a __thread_ptrs section to hold those
- * slots; the JIT path does not (and doesn't need to: every TLV
- * descriptor is in-image, so the load is one indirection too many).
+ /* Mach-O TLV access -> ordinary in-image load (single-threaded JIT).
*
- * Rewrite `ldr xd, [xn, #imm12]` -> `add xd, xn, #imm12` so the
- * descriptor address ends up directly in x0. Encoding shifts:
- * LDR (uimm12, 64-bit): bits [31:22] = 1111100101 (0x3E5)
- * ADD (immediate, 64-bit, sh=0): bits [31:22] = 1001000100 (0x244)
- * Rn[9:5] / Rd|Rt[4:0] / imm12[21:10] stay in the same positions. */
+ * Codegen emits the Apple TLV sequence (4 insns):
+ * adrp x0, desc@TLVPPAGE (PAGE21)
+ * ldr x0, [x0, desc@TLVPPAGEOFF] (PAGEOFF12) -- via a __thread_ptrs slot
+ * ldr x16, [x0] -- load the resolver thunk from desc[+0]
+ * blr x16 -- call thunk(desc) -> &var in x0
+ * The AOT writer/dyld back that with a thunk + per-thread block. With one
+ * thread the in-image .tdata/.tbss IS the single instance, and desc[+16]
+ * already holds the variable's in-image storage address (the normal
+ * R_ABS64 against the storage symbol the loop applies). So we collapse the
+ * access to a direct load, dropping the thunk, the per-thread block, and
+ * descriptor patching entirely:
+ * PAGE21 : adrp x0, desc (unchanged, applied below)
+ * PAGEOFF12 : ldr x0,[x0,#imm] -> add x0,x0,#(desc & 0xfff) (x0 = &desc)
+ * +4 : ldr x16,[x0] -> ldr x0,[x0,#16] (x0 = &var)
+ * +8 : blr x16 -> nop
+ * (LDR uimm12 64-bit bits[31:22]=0x3E5; ADD imm sh=0 bits[31:22]=0x244;
+ * Rn[9:5]/Rt[4:0]/imm12[21:10] keep their positions.) */
if (r->kind == R_AARCH64_TLVP_LOAD_PAGEOFF12) {
u64 v = ((u64)S + (u64)r->addend) & 0xfffu;
u32 instr = rd_u32_le(P_bytes);
+ u8* i_thunk = P_bytes + 4u; /* ldr x16, [x0] */
+ u8* i_call = P_bytes + 8u; /* blr x16 */
instr = 0x91000000u | (instr & 0x3ffu) | ((u32)v << 10);
wr_u32_le(P_bytes, instr);
+ if (rd_u32_le(i_thunk) != 0xf9400010u || rd_u32_le(i_call) != 0xd63f0200u)
+ compiler_panic(c, SRCLOC_NONE,
+ "kit_jit: unexpected Mach-O TLV access sequence");
+ wr_u32_le(i_thunk, 0xf9400800u); /* ldr x0, [x0, #16] -> &var */
+ wr_u32_le(i_call, 0xd503201fu); /* nop */
continue;
}
link_reloc_apply(c, r->kind, P_bytes, S, r->addend, P);
@@ -655,8 +555,6 @@ KitJit* kit_jit_from_image(LinkImage* img) {
jit->append_cursor[SEG_TLS] + JIT_APPEND_TLS_SLACK;
jit->generation = 0;
jit->view_built = 0u;
- jit->tls_vtable = NULL;
- jit->tls_ctx = NULL;
jit->jit_host = host;
/* Take ownership of the image: undefer it from the compiler so a
@@ -670,12 +568,6 @@ KitJit* kit_jit_from_image(LinkImage* img) {
jit->linker->deferred = NULL;
}
- /* Mach-O TLV descriptor pass: install our thunk into descriptor[+0],
- * stash the per-image TLS ctx in [+8], and overwrite [+16] with the
- * per-thread byte offset. No-op on ELF (no TLV descriptors) and on
- * Mach-O images that have neither TLS nor TLV access sites. */
- jit_patch_tlv_descriptors(jit);
-
/* Run .init_array constructors in forward order. */
{
typedef void (*VoidFn)(void);
@@ -712,16 +604,6 @@ void kit_jit_free(KitJit* jit) {
jit_view_objfile_free(jit->view);
jit->view = NULL;
}
- /* TLV ctx: pthread_key_delete inside ctx_destroy triggers POSIX
- * per-thread destructors for blocks any live thread might still hold,
- * so this needs to run before we release segments referenced by those
- * destructors. (Currently the destructor only frees the block, but
- * we keep the ordering invariant either way.) */
- if (jit->tls_vtable && jit->tls_ctx) {
- jit->tls_vtable->ctx_destroy(jit->tls_vtable->user, jit->tls_ctx);
- jit->tls_ctx = NULL;
- jit->tls_vtable = NULL;
- }
/* segs[] are views into master — release master only. */
if (mem && mem->release && jit->master.size) {
mem->release(mem->user, &jit->master);
@@ -759,38 +641,32 @@ void* kit_jit_lookup(KitJit* jit, KitSlice name) {
return (void*)vaddr_to_runtime(jit->image, jit->segs, s->vaddr);
}
-void* kit_jit_tlv_resolve(KitJit* jit, void* descriptor) {
- /* Mach-O thread-local access goes through a TLV descriptor, not the data:
- * the symbol resolves to a 24-byte descriptor [thunk, ctx, image-offset]
- * (see src/jit/tlv_thunk.h). JITed code calls thunk(desc); from C we can run
- * the thunk's logic directly — `get_block` is the ctx's first word and is a
- * normal C function (the asm thunk only exists to preserve caller-saved regs
- * for the JITed access sequence). Returns the calling thread's address of the
- * variable, or NULL if this is not one of our descriptors (non-Mach-O image,
- * no in-image TLV ctx, or a *foreign* descriptor — e.g. an extern
- * thread-local resolved through dyld — which we must not dereference/call
- * into). */
- u8* desc = (u8*)descriptor;
- void* slot0;
- void* ctx;
- void* (*get_block)(void*);
- u64 offset;
- u8* base;
- if (!jit || !desc) return NULL;
- /* Mach-O TLS-descriptor JIT bootstrap only; gate via the TLS-model hook. */
- if (!obj_format_tls_via_descriptor(jit->c) || !jit->tls_ctx) return NULL;
- /* Ownership check: our descriptors carry &kit_jit_tlv_thunk at +0 and this
- * image's tls_ctx at +8 (jit_patch_tlv_descriptors). Refuse anything else so
- * a foreign descriptor never becomes a wild indirect call. */
- memcpy(&slot0, desc + 0u, sizeof slot0);
- memcpy(&ctx, desc + 8u, sizeof ctx);
- if (slot0 != (void*)&kit_jit_tlv_thunk || ctx != jit->tls_ctx) return NULL;
- memcpy(&get_block, ctx, sizeof get_block);
- if (!get_block) return NULL;
- memcpy(&offset, desc + 16u, sizeof offset);
- base = (u8*)get_block(ctx);
- if (!base) return NULL;
- return base + offset;
+void* kit_jit_tls_addr(KitJit* jit, void* sym_addr) {
+  /* Resolve a thread-local's in-image storage from the address its SYMBOL
+   * resolves to (single-threaded JIT: the in-image .tdata/.tbss is the single
+   * instance — see the TLVP relaxation in kit_jit_from_image).
+   *
+   * - Mach-O: the symbol resolves to a 24-byte TLV descriptor; its +16 slot
+   *   holds the variable's in-image storage address (the normal R_ABS64
+   *   against the storage symbol, applied during the reloc pass).
+   * - ELF/COFF: the symbol already resolves to the in-image storage.
+   *
+   * Returns NULL for a NULL jit/sym_addr and for anything outside this image's
+   * reservation — e.g. a foreign/extern thread-local resolved through the host
+   * (dlsym) — which we must not dereference. The interpreter relies on this
+   * NULL to diagnose cleanly rather than treat a foreign pointer as storage. */
+  /* Integer bounds: never read a NULL jit or compare unrelated pointers. */
+  uintptr_t p = (uintptr_t)sym_addr;
+  uintptr_t lo = jit ? (uintptr_t)jit->master.runtime : 0;
+  uintptr_t hi = lo ? lo + (uintptr_t)jit->master.size : 0;
+  if (!p || !lo || p < lo || p >= hi) return NULL;
+  if (obj_format_tls_via_descriptor(jit->c)) {
+    void* storage;
+    if (hi - p < 24u) return NULL;
+    memcpy(&storage, (const u8*)sym_addr + 16u, sizeof storage);
+    return storage;
+  }
+  return sym_addr;
+}
uint64_t kit_jit_generation(KitJit* jit) { return jit ? jit->generation : 0; }
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -218,6 +218,9 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
case IR_ATOMIC_RMW:
mop.kind = NATIVE_MOP_ATOMIC_RMW;
break;
+ case IR_TLS_ADDR_OF:
+ mop.kind = NATIVE_MOP_TLS_ADDR;
+ break;
case IR_INTRINSIC: {
const IRIntrinAux* aux = (const IRIntrinAux*)in->extra.aux;
if (!aux) continue;
diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c
@@ -307,89 +307,11 @@ static void* extern_resolver(void* user, KitSlice name) {
return low_extern_default_value();
}
-/* ---- jit_tls (pthread-key backed) ----
- * Mirrors driver/env.c's KitJitTls implementation so 36_tls_basic/J on
- * aa64-macho can resolve TLV accesses without depending on the driver
- * binary. The ctx layout is fixed by src/jit/tlv_thunk.h: first 8 bytes
- * MUST be a function pointer `void* (*)(void* ctx)` that returns the
- * per-thread block. */
-#include <pthread.h>
-typedef struct JitTlsCtx {
- void* (*get_block)(void* ctx);
- pthread_key_t key;
- size_t image_size;
- size_t image_filesz;
- size_t align;
- void* init_bytes;
-} JitTlsCtx;
-static void jit_tls_thread_dtor(void* block) { free(block); }
-static void* jit_tls_alloc_block(JitTlsCtx* ctx) {
- /* macOS aligned_alloc requires alignment >= sizeof(void*) (8); bump
- * smaller request alignments up. Round size to a multiple of align
- * (aligned_alloc requires this). */
- size_t a = ctx->align ? ctx->align : sizeof(void*);
- if (a < sizeof(void*)) a = sizeof(void*);
- size_t sz = (ctx->image_size + a - 1u) & ~(a - 1u);
- if (sz == 0) sz = a;
- void* block = aligned_alloc(a, sz);
- if (!block) return NULL;
- if (ctx->image_filesz && ctx->init_bytes)
- memcpy(block, ctx->init_bytes, ctx->image_filesz);
- if (ctx->image_size > ctx->image_filesz)
- memset((char*)block + ctx->image_filesz, 0,
- ctx->image_size - ctx->image_filesz);
- return block;
-}
-static void* jit_tls_get_block(void* ctx_v) {
- JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
- void* block = pthread_getspecific(ctx->key);
- if (block) return block;
- block = jit_tls_alloc_block(ctx);
- if (!block) abort();
- if (pthread_setspecific(ctx->key, block) != 0) abort();
- return block;
-}
-static void* jit_tls_ctx_new(void* user, const void* init_bytes,
- size_t image_filesz, size_t image_size,
- size_t align) {
- (void)user;
- JitTlsCtx* ctx = (JitTlsCtx*)malloc(sizeof(*ctx));
- if (!ctx) return NULL;
- ctx->get_block = jit_tls_get_block;
- ctx->image_size = image_size;
- ctx->image_filesz = image_filesz;
- ctx->align = align ? align : sizeof(void*);
- ctx->init_bytes = NULL;
- if (image_filesz && init_bytes) {
- ctx->init_bytes = malloc(image_filesz);
- if (!ctx->init_bytes) {
- free(ctx);
- return NULL;
- }
- memcpy(ctx->init_bytes, init_bytes, image_filesz);
- }
- if (pthread_key_create(&ctx->key, jit_tls_thread_dtor) != 0) {
- free(ctx->init_bytes);
- free(ctx);
- return NULL;
- }
- return ctx;
-}
-static void jit_tls_ctx_destroy(void* user, void* ctx_v) {
- JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
- (void)user;
- if (!ctx) return;
- void* my = pthread_getspecific(ctx->key);
- if (my) {
- pthread_setspecific(ctx->key, NULL);
- free(my);
- }
- pthread_key_delete(ctx->key);
- free(ctx->init_bytes);
- free(ctx);
-}
-static KitJitTls g_jit_tls = {jit_tls_ctx_new, jit_tls_ctx_destroy, NULL};
-
+/* The JIT resolves thread-local accesses to in-image storage (single-threaded:
+ * the in-image .tdata/.tbss is the single instance — see the TLS relaxation in
+ * src/link/link_jit.c). The host still seeds the thread pointer below so the
+ * freestanding ELF startup convention is honored on hosts that run the image
+ * natively, but no KitJitTls vtable / per-thread block is needed. */
#if defined(__aarch64__) || defined(__arm64__)
__attribute__((noinline, no_sanitize("address", "undefined"))) static int
call_with_aarch64_tls(int (*fn)(void), void* tls_block) {
@@ -672,7 +594,6 @@ int main(int argc, char** argv) {
KitJitHost jhost;
memset(&jhost, 0, sizeof(jhost));
jhost.execmem = &g_execmem;
- jhost.tls = &g_jit_tls;
KitLinkSessionOptions opts;
KitLinkSession* link = NULL;
diff --git a/test/link/rv32_jit_test.c b/test/link/rv32_jit_test.c
@@ -256,12 +256,10 @@ int main(void) {
return 2;
}
- /* JIT the object. The host's execmem is the W^X dual-map above; for
- * this test we don't need TLS so the jit_host->tls vtable is NULL. */
+ /* JIT the object. The host's execmem is the W^X dual-map above. */
KitJitHost jhost;
memset(&jhost, 0, sizeof(jhost));
jhost.execmem = &g_execmem;
- jhost.tls = NULL;
KitLinkSessionOptions opts;
memset(&opts, 0, sizeof(opts));
diff --git a/test/link/rv64_jit_test.c b/test/link/rv64_jit_test.c
@@ -247,12 +247,10 @@ int main(void) {
return 2;
}
- /* JIT the object. The host's execmem is the W^X dual-map above; for
- * this test we don't need TLS so the jit_host->tls vtable is NULL. */
+ /* JIT the object. The host's execmem is the W^X dual-map above. */
KitJitHost jhost;
memset(&jhost, 0, sizeof(jhost));
jhost.execmem = &g_execmem;
- jhost.tls = NULL;
KitLinkSessionOptions opts;
memset(&opts, 0, sizeof(opts));
diff --git a/test/toy/cases/142_threadlocal_multi.expected b/test/toy/cases/142_threadlocal_multi.expected
@@ -0,0 +1 @@
+134
+\ No newline at end of file
diff --git a/test/toy/cases/142_threadlocal_multi.link.skip b/test/toy/cases/142_threadlocal_multi.link.skip
@@ -0,0 +1 @@
+defining a _Thread_local and linking it into a standalone executable needs PIE/crt TLS setup (see PIE start.c limitation); R/I/C/W cover the semantics
+\ No newline at end of file
diff --git a/test/toy/cases/142_threadlocal_multi.toy b/test/toy/cases/142_threadlocal_multi.toy
@@ -0,0 +1,19 @@
+// Multiple thread-locals: two independent _Thread_local vars, mutated together
+// across calls, must keep separate storage and initializers. Single-threaded,
+// so JIT (R) and interpreter (I) must agree on the same in-image storage.
+var @[.threadlocal] a: i64 = 100;
+var @[.threadlocal] b: i64 = 7;
+
+fn step(): i64 {
+ a = a + 10;
+ b = b - 1;
+ return a + b;
+}
+
+fn __user_main(): i64 {
+ step(); // a=110 b=6 -> 116
+ step(); // a=120 b=5 -> 125
+ return step(); // a=130 b=4 -> 134
+}
+
+fn main(): i32 { return __user_main() as i32; }