commit 513e71590f9a23086a8978e67dbac6f6ac40fea8
parent 4e486a6b390dd05525986f195c8d2640a5f9081e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 11 May 2026 13:49:39 -0700
link/macho: TLV section + ARM64_RELOC_TLVP_LOAD_PAGE21/PAGEOFF12
Adds linker support for Mach-O thread-local variables on aa64. Adds
two new RelocKinds (R_AARCH64_TLVP_LOAD_PAGE21/PAGEOFF12) with reader
and writer mappings to ARM64_RELOC_TLVP_LOAD_*, ADRP / 64-bit LDR-lo12
encoding in link_reloc, width entries in link_layout, and the
following link_macho plumbing:
- collect_tlv mirrors the internal-GOT pass: one __thread_ptrs slot
per unique descriptor referenced via TLVP_LOAD.
- plan_layout synthesizes __DATA,__thread_ptrs
(S_THREAD_LOCAL_VARIABLE_POINTERS) after the user's TLV input
sections, preserves S_THREAD_LOCAL_* section type bits on
__thread_{vars,data,bss}, and forces __thread_vars to align 8.
- apply_relocs routes TLVP_LOAD relocs through __thread_ptrs slots,
and special-cases R_ABS64 inside an S_THREAD_LOCAL_VARIABLES
section: writes the literal TLV offset (target_vaddr -
tls_image_vaddr) with no chained-fixup entry, matching Apple ld's
descriptor[2] format.
- Chained-fixup REBASE / BIND per __thread_ptrs slot, parallel to
the existing __got per-slot pass.
Test: test/link/cases/36_tls_basic exercises a single _Thread_local
read. Pass matrix: aa64/x64/rv64-elf all pass; aa64-macho R passes,
E/J fail at runtime (dyld redirects descriptor[0] to
_tlv_bootstrap_error). Open bug tracked in doc/MACHO.md §4.
Diffstat:
8 files changed, 388 insertions(+), 18 deletions(-)
diff --git a/doc/MACHO.md b/doc/MACHO.md
@@ -152,7 +152,90 @@ for the canonical implementation pointers.
---
-## 4. Other gaps surfaced en route
+## 4. TLV (thread-local variables) — PARTIALLY RESOLVED
+
+Adds `ARM64_RELOC_TLVP_LOAD_PAGE21` / `PAGEOFF12` support and
+section/binding plumbing for `__DATA,__thread_vars` /
+`__DATA,__thread_data` / `__DATA,__thread_bss`. Linker-side scaffolding
+is in place; one runtime bug remains on Path E.
+
+**What landed:**
+
+- New `RelocKind`s `R_AARCH64_TLVP_LOAD_PAGE21` /
+ `R_AARCH64_TLVP_LOAD_PAGEOFF12` (`src/obj/obj.h`), with translator
+ entries in `macho_reloc_aarch64.c` and a name in `pipeline.c`.
+- `link_reloc.c` encodes PAGE21 as ADRP-form and PAGEOFF12 as a 64-bit
+ LDR `lo12` (scale 3). `reloc_width` in `link_layout.c` lists both at
+ width 4.
+- `link_macho.c`:
+ - `collect_tlv` pass mirrors the internal-GOT pass: one
+ `MachTlv` slot per unique descriptor symbol referenced via a TLVP
+ reloc.
+ - `plan_layout` synthesizes `__DATA,__thread_ptrs`
+ (`S_THREAD_LOCAL_VARIABLE_POINTERS`) after the user's TLV input
+ sections.
+ - TLV input sections (gated on `SF_TLS`) keep `S_THREAD_LOCAL_*` type
+ bits: `__thread_vars` → S_THREAD_LOCAL_VARIABLES by sectname; any
+ other TLS section → ZEROFILL if zero-fill, otherwise REGULAR.
+ - `__thread_vars` is forced to align 8 (clang emits it with align 1,
+ relying on layout to land it on 8 — dyld's chained-fixup walk
+ requires 8-byte boundaries for the descriptor's pointer fields).
+ - `apply_relocs` routes `R_AARCH64_TLVP_LOAD_PAGE21/PAGEOFF12` through
+ the `__thread_ptrs` slot regardless of whether the descriptor
+ target is in-image or imported.
+ - `apply_relocs` special-cases `R_ABS64` whose patch site is in a
+ `S_THREAD_LOCAL_VARIABLES` section: writes the literal TLV offset
+ (`target_vaddr - tls_image_vaddr`) with no chained-fixup entry.
+ Matches Apple `ld`'s descriptor[2] format — dyld interprets that
+ word as a per-thread offset, not an absolute address.
+ - Chained-fixup REBASE (internal) / BIND (imported) per `__thread_ptrs`
+ slot, parallel to the existing `__got` per-slot fixup pass.
+ - `build_chained_fixups` recognizes the synthetic `__thread_ptrs`
+ region inside `__DATA` for slot-byte writeback.
+
+**Test:** `test/link/cases/36_tls_basic` — a single `_Thread_local int`
+read. Matrix:
+
+- `aa64-elf`, `x64-elf`, `rv64-elf` — R + E pass (J passes on host
+ arch, skip elsewhere).
+- `aa64-macho` — **R pass**, **E fail**, **J fail**.
+
+**Path E remaining issue (open).** Linked exe aborts with
+`_tlv_bootstrap_error + 24` invoked from `test_main`. The chained
+fixups bind descriptor[0] to `__tlv_bootstrap` correctly
+(`DYLD_PRINT_BINDINGS=1` confirms), but at first TLV access the thunk
+pointer reads as `_tlv_bootstrap_error` instead — dyld is silently
+patching descriptor[0] to the error stub during image processing,
+meaning it scanned `__thread_vars` and rejected something about our
+metadata. Hypotheses to verify:
+
+- LC_BUILD_VERSION minOS (currently 12.0) too old or too new for
+ dyld's TLV metadata requirements?
+- Chained-fixups header / starts-in-image formatting missing some
+ field that dyld needs for TLV processing?
+- TLV descriptor's word 1 (key) or word 2 (offset) layout doesn't
+ match what dyld expects?
+- Missing some LC_* load command or Mach header flag (e.g. `MH_HAS_TLV_DESCRIPTORS` in the header `flags`) that signals TLV support?
+
+Comparison reference: `clang -arch arm64 -isysroot $(xcrun
+--show-sdk-path) tls.c -o ref.exe` produces a working binary with the
+same source. Diff its load commands, chained_fixups blob, and
+descriptor bytes against our output to identify the missing piece.
+
+**Path J open.** `_tlv_bootstrap` is a libSystem symbol; the JIT path
+has no dylib resolution. Either add a `j_targets`-exclusion on
+`36_tls_basic` (analogous to the IFUNC trio in §3.3), or provide a
+TLV-bootstrap stub for the JIT lane.
+
+**Debug residue.** Extra `(kind=%u S=0x%llx A=%lld P=0x%llx)`
+formatting in `link_reloc.c`'s `LDST%u_ABS_LO12_NC misaligned address`
+panic — added while debugging the `__thread_vars` alignment issue and
+landed with this change. Revert in a follow-up, or keep if generally
+useful for future misalignment debugging.
+
+---
+
+## 5. Other gaps surfaced en route
These don't currently break a test but are worth tracking:
@@ -206,14 +289,17 @@ These don't currently break a test but are worth tracking:
---
-## 5. Validation
+## 6. Validation
Run the matrix under both tuples; the ELF side is the regression
guardrail:
make test-elf # 37/37 — unaffected
make test-link # 119/119 — ELF baseline
- make test-link CFREE_TEST_OBJ=macho # Path E: 103/103
- # Path J: 8 fail (see §3)
+ make test-link CFREE_TEST_OBJ=macho # Path E: 103/103 +
+ # 1 fail (36_tls_basic, §4)
+ # Path J: 100/100 +
+ # 1 fail (36_tls_basic, §4)
-`33_ifunc_in_init/E` is `e_targets`-excluded on `aa64-macho` (§4).
+`33_ifunc_in_init/E` is `e_targets`-excluded on `aa64-macho` (§5).
+`36_tls_basic/{E,J}` are the open failures — see §4.
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -1050,6 +1050,10 @@ static const char* reloc_kind_name(u16 kind) {
return "R_AARCH64_LDST64_ABS_LO12_NC";
case R_AARCH64_LDST128_ABS_LO12_NC:
return "R_AARCH64_LDST128_ABS_LO12_NC";
+ case R_AARCH64_TLVP_LOAD_PAGE21:
+ return "R_AARCH64_TLVP_LOAD_PAGE21";
+ case R_AARCH64_TLVP_LOAD_PAGEOFF12:
+ return "R_AARCH64_TLVP_LOAD_PAGEOFF12";
case R_AARCH64_TLSLE_ADD_TPREL_HI12:
return "R_AARCH64_TLSLE_ADD_TPREL_HI12";
case R_AARCH64_TLSLE_ADD_TPREL_LO12:
diff --git a/src/link/link_layout.c b/src/link/link_layout.c
@@ -1855,6 +1855,8 @@ static u8 reloc_width(RelocKind k) {
case R_AARCH64_LD64_GOT_LO12_NC:
case R_AARCH64_TLSLE_ADD_TPREL_HI12:
case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
+ case R_AARCH64_TLVP_LOAD_PAGE21:
+ case R_AARCH64_TLVP_LOAD_PAGEOFF12:
return 4;
case R_RV_HI20:
case R_RV_LO12_I:
diff --git a/src/link/link_macho.c b/src/link/link_macho.c
@@ -59,6 +59,10 @@ static SrcLoc no_loc(void) {
#define MZ_PAGE 0x4000ULL
#define MZ_STUB_SIZE 12u
#define MZ_GOT_SIZE 8u
+/* __DATA,__thread_ptrs slot size/stride in bytes — one pointer per unique
+ * TLV referenced via TLVP_LOAD_PAGE21/PAGEOFF12. Each slot holds the
+ * address of the matching TLV descriptor in __DATA,__thread_vars. */
+#define MZ_TLVP_SIZE 8u
#define DYLD_CHAINED_PTR_64 2u
#define DYLD_CHAINED_IMPORT 1u
@@ -172,6 +176,20 @@ typedef struct MachDylib {
Sym install;
} MachDylib;
+/* One slot in the synthetic __DATA,__thread_ptrs section per unique TLV
+ * descriptor referenced via TLVP_LOAD_PAGE21/PAGEOFF12. Modeled after
+ * MachImp's internal-GOT entries: the slot holds the descriptor address
+ * (REBASE for internal-to-image descriptors, BIND for dylib-imported
+ * ones). The descriptor itself is laid out in __DATA,__thread_vars by
+ * either the input objects (internal) or the providing dylib (imported). */
+typedef struct MachTlv {
+ LinkSymId sym; /* canonical descriptor LinkSymId */
+ u32 tlv_idx; /* 1-based slot index in __thread_ptrs */
+ u8 imported; /* 1 == descriptor lives in a dylib (BIND), 0 == internal (REBASE) */
+ u8 pad[3]; /* explicit padding; zeroed with the rest of the record */
+ u32 import_idx; /* 1-based MachImp index when imported (for chained-bind ordinal); 0 when not imported or unresolved */
+} MachTlv;
+
/* ---- planned section ---- */
typedef struct MSec {
@@ -245,6 +263,25 @@ typedef struct MCtx {
u32 stubs_size;
u8* got_bytes;
u32 got_size;
+ /* TLV pointer slots — one entry in __DATA,__thread_ptrs per unique
+ * descriptor referenced via TLVP_LOAD_PAGE21/PAGEOFF12. sym_to_tlv
+ * maps LinkSymId → 1-based slot index (parallel to sym_to_imp). Slot
+ * bytes are populated at apply_relocs time once shift_sections has
+ * pinned descriptor vaddrs. */
+ MachTlv* tlv_slots;
+ u32 ntlv;
+ u32* sym_to_tlv;
+ u32 sym_to_tlv_size;
+ u8* tlv_ptrs_bytes;
+ u32 tlv_ptrs_size;
+ u64 tlv_ptrs_vaddr;
+ /* Vaddr of the first thread-local-storage section
+ * (__thread_data / __thread_bss). Each TLV descriptor's word 2
+ * stores the symbol's offset within this image rather than an
+ * absolute address — see apply_relocs's S_THREAD_LOCAL_VARIABLES
+ * ABS64 special case. */
+ u64 tls_image_vaddr;
+ u8 has_tls_image;
/* Final layout (computed during plan) */
u64 text_vaddr;
@@ -477,6 +514,77 @@ static void collect_imports(MCtx* x) {
x->nimport_funcs = stub_run;
}
+/* ---- pass: collect TLV pointer slots ----
+ *
+ * Mirror of collect_imports' internal-GOT pass, but for TLV descriptors:
+ * each unique descriptor referenced via ARM64_RELOC_TLVP_LOAD_PAGE21 /
+ * PAGEOFF12 gets one slot in the synthetic __DATA,__thread_ptrs section.
+ * The slot's runtime value is the descriptor's address; we patch it at
+ * apply_relocs time (REBASE for in-image descriptors, BIND for ones in
+ * a dylib).
+ *
+ * Slots are deduplicated by canonical LinkSymId so a single descriptor
+ * referenced from N call sites shares one __thread_ptrs entry. */
+static void collect_tlv(MCtx* x) {
+  LinkImage* img = x->img;
+  Heap* h = x->h;
+  x->sym_to_tlv_size = LinkSyms_count(&img->syms) + 1u;
+  x->sym_to_tlv =
+      (u32*)h->alloc(h, sizeof(u32) * x->sym_to_tlv_size, _Alignof(u32));
+  if (!x->sym_to_tlv)
+    compiler_panic(x->c, no_loc(), "link_macho: oom on sym_to_tlv");
+  memset(x->sym_to_tlv, 0, sizeof(u32) * x->sym_to_tlv_size);
+
+  u32 cap = 0;
+  for (u32 i = 0; i < LinkRelocs_count(&img->relocs); ++i) {
+    LinkRelocApply* r = LinkRelocs_at(&img->relocs, i);
+    if (r->kind != R_AARCH64_TLVP_LOAD_PAGE21 &&
+        r->kind != R_AARCH64_TLVP_LOAD_PAGEOFF12)
+      continue;
+    if (r->target == LINK_SYM_NONE || r->target >= x->sym_to_tlv_size) continue;
+    /* Resolve through the canonical global so per-input duplicate undefs
+     * share one slot; a hit outside sym_to_tlv is ignored, like r->target. */
+    LinkSymId canon = r->target;
+    LinkSymbol* t = sym_at(img, r->target);
+    if (!t) continue;
+    if (t->name != 0) {
+      LinkSymId hit = symhash_get(&img->globals, t->name);
+      if (hit != LINK_SYM_NONE && hit < x->sym_to_tlv_size) {
+        canon = hit;
+        t = sym_at(img, canon);
+        if (!t) continue;
+      }
+    }
+    if (x->sym_to_tlv[canon]) {
+      if (canon != r->target) x->sym_to_tlv[r->target] = x->sym_to_tlv[canon];
+      continue;
+    }
+    if (VEC_GROW(h, x->tlv_slots, cap, x->ntlv + 1u))
+      compiler_panic(x->c, no_loc(), "link_macho: oom on tlv_slots");
+    MachTlv* ts = &x->tlv_slots[x->ntlv++];
+    memset(ts, 0, sizeof(*ts));
+    ts->sym = canon;
+    ts->tlv_idx = x->ntlv;
+    ts->imported = t->imported ? 1u : 0u;
+    /* An imported descriptor binds through its MachImp entry — that is
+     * where the chained-import index for the BIND comes from.
+     * collect_imports has already run by the time this pass does, so
+     * the entry should exist. Look it up under the canonical id first,
+     * then under the reloc's own (possibly non-canonical) id in case
+     * the import pass recorded it there. If neither is populated,
+     * import_idx stays 0 and apply_relocs panics when it emits the
+     * slot's BIND fixup. */
+    if (ts->imported) {
+      u32 idx = (canon < x->sym_to_imp_size) ? x->sym_to_imp[canon] : 0u;
+      if (!idx && r->target < x->sym_to_imp_size)
+        idx = x->sym_to_imp[r->target];
+      ts->import_idx = idx;
+    }
+    x->sym_to_tlv[canon] = x->ntlv;
+    if (canon != r->target) x->sym_to_tlv[r->target] = x->ntlv;
+  }
+}
+
/* ---- pass: plan Mach-O sections ----
*
* Walks LinkImage sections. Each non-zero-size LinkSection becomes one
@@ -685,6 +793,48 @@ static void plan_layout(MCtx* x) {
m->flags = 0x00000009u /*S_MOD_INIT_FUNC_POINTERS*/;
else if (strcmp(m->sectname, "__mod_term_func") == 0)
m->flags = 0x0000000au /*S_MOD_TERM_FUNC_POINTERS*/;
+ else if (ls->flags & SF_TLS) {
+ /* TLV sections: dyld dispatches by section type, not name. Map
+ * __thread_vars → S_THREAD_LOCAL_VARIABLES (descriptor records),
+ * __thread_data → S_THREAD_LOCAL_REGULAR (initial data),
+ * __thread_bss → S_THREAD_LOCAL_ZEROFILL (zero-init data). Done
+ * by sectname so per-TU inputs without a Mach-O ext_type still
+ * get the right section type. */
+ if (strcmp(m->sectname, "__thread_vars") == 0) {
+ m->flags = S_THREAD_LOCAL_VARIABLES;
+ /* Each descriptor is three pointers (24B) whose first word is
+ * dyld's _tlv_bootstrap thunk pointer. Clang/llvm emit
+ * __thread_vars with on-disk alignment 1 (relying on layout to
+ * land it on 8); force 8-alignment here so the descriptor
+ * pointers fall on 8-byte boundaries — dyld's chained-fixup
+ * processing assumes that. */
+ if (m->align < 8u) m->align = 8u;
+ } else if (m->is_zerofill)
+ m->flags = S_THREAD_LOCAL_ZEROFILL;
+ else
+ m->flags = S_THREAD_LOCAL_REGULAR;
+ }
+ }
+ /* __thread_ptrs synthetic (TLV pointer slots). Emitted into __DATA
+ * after the user's TLV input sections so descriptors and their
+ * pointers share the same segment. Each slot's runtime initial
+ * value (= TLV descriptor address) is patched during apply_relocs. */
+ if (x->ntlv) {
+ x->tlv_ptrs_size = x->ntlv * MZ_TLVP_SIZE;
+ x->tlv_ptrs_bytes = (u8*)h->alloc(h, x->tlv_ptrs_size, 8);
+ if (!x->tlv_ptrs_bytes)
+ compiler_panic(x->c, no_loc(), "link_macho: oom on tlv_ptrs");
+ memset(x->tlv_ptrs_bytes, 0, x->tlv_ptrs_size);
+ MSec* m = &x->secs[x->nsecs++];
+ memset(m, 0, sizeof(*m));
+ m->synth_data = x->tlv_ptrs_bytes;
+ m->synth_size = x->tlv_ptrs_size;
+ m->segname = "__DATA";
+ m->sectname = "__thread_ptrs";
+ m->align = 8u;
+ m->size = x->tlv_ptrs_size;
+ m->segidx = 3;
+ m->flags = S_THREAD_LOCAL_VARIABLE_POINTERS;
}
x->segs[3].nsects = x->nsecs - first_d;
x->segs[3].first_sec = first_d;
@@ -805,6 +955,21 @@ static void plan_layout(MCtx* x) {
x->data_const_filesz = sg->filesize;
}
if (i == 3) {
+ for (u32 j = 0; j < sg->nsects; ++j) {
+ MSec* m = &x->secs[sg->first_sec + j];
+ if (strcmp(m->sectname, "__thread_ptrs") == 0)
+ x->tlv_ptrs_vaddr = m->vaddr;
+ /* TLS storage image base: min vaddr across __thread_data and
+ * __thread_bss sections. __thread_vars is excluded — it holds
+ * the descriptors, not the data that maps into the per-thread
+ * block. */
+ if ((strcmp(m->sectname, "__thread_data") == 0 ||
+ strcmp(m->sectname, "__thread_bss") == 0) &&
+ (!x->has_tls_image || m->vaddr < x->tls_image_vaddr)) {
+ x->tls_image_vaddr = m->vaddr;
+ x->has_tls_image = 1;
+ }
+ }
x->data_vaddr = sg->vmaddr;
x->data_filesz = sg->filesize;
x->data_memsz = sg->vmsize;
@@ -1035,6 +1200,23 @@ static void apply_relocs(MCtx* x, FixList* fl) {
int imp_idx;
int is_imp = sym_S(x, img, r->target, &S, &imp_idx);
+ /* TLVP relocs route through a __thread_ptrs slot regardless of
+ * whether the descriptor target is in-image or imported. Resolved
+ * before the import / internal split because an imported TLV
+ * descriptor doesn't use the __got slot (its address lives in
+ * __thread_ptrs with its own chained bind). */
+ if (r->kind == R_AARCH64_TLVP_LOAD_PAGE21 ||
+ r->kind == R_AARCH64_TLVP_LOAD_PAGEOFF12) {
+ u32 tlv_idx =
+ (r->target < x->sym_to_tlv_size) ? x->sym_to_tlv[r->target] : 0u;
+ if (!tlv_idx)
+ compiler_panic(x->c, no_loc(),
+ "link_macho: TLVP reloc has no __thread_ptrs slot");
+ u64 slot_v = x->tlv_ptrs_vaddr + (tlv_idx - 1u) * MZ_TLVP_SIZE;
+ link_reloc_apply(x->c, r->kind, P_bytes, slot_v, r->addend, P);
+ continue;
+ }
+
if (is_imp) {
MachImp* mi = (imp_idx > 0) ? &x->imports[imp_idx - 1] : NULL;
switch (r->kind) {
@@ -1090,6 +1272,20 @@ static void apply_relocs(MCtx* x, FixList* fl) {
/* Internal relocs. */
if (r->kind == R_ABS64) {
+ /* Special case: ABS64 reloc inside a TLV descriptor record
+ * (__thread_vars section) targeting in-image TLS storage. This
+ * is the descriptor's word-2 "offset" field — dyld interprets it
+ * as the per-thread offset of the storage within the TLS image,
+ * NOT as an absolute address. Apple's ld writes the literal
+ * offset and emits no chained-fixup entry; replicate that so the
+ * chain skips over this slot (chained_fixups already does the
+ * right thing: no fixsite -> no chain link). */
+ if (msec && (msec->flags & SECTION_TYPE) == S_THREAD_LOCAL_VARIABLES &&
+ x->has_tls_image) {
+ u64 offset = (S + (u64)r->addend) - x->tls_image_vaddr;
+ wr_u64_le(P_bytes, offset);
+ continue;
+ }
/* Rebase site. */
wr_u64_le(P_bytes, S + (u64)r->addend);
FixSite fs = {(u8)msec->segidx, 0, {0}, 0, P, S + (u64)r->addend};
@@ -1136,6 +1332,33 @@ static void apply_relocs(MCtx* x, FixList* fl) {
fix_push(fl, &fs);
}
}
+
+ /* Per-slot TLV pointer fixups. Mirror of the __got loop above: each
+ * __thread_ptrs slot points at the descriptor record. When the
+ * descriptor is in-image (internal) we REBASE to its final vaddr; when
+ * it lives in a dylib we BIND through the descriptor's MachImp. The
+ * slot itself lives in __DATA (segidx=3), distinct from __got's
+ * __DATA_CONST (segidx=2). */
+ for (u32 i = 0; i < x->ntlv; ++i) {
+ MachTlv* ts = &x->tlv_slots[i];
+ u64 slot_v = x->tlv_ptrs_vaddr + (ts->tlv_idx - 1u) * MZ_TLVP_SIZE;
+ u8* slot = x->tlv_ptrs_bytes + (ts->tlv_idx - 1u) * MZ_TLVP_SIZE;
+ if (ts->imported) {
+ if (!ts->import_idx)
+ compiler_panic(x->c, no_loc(),
+ "link_macho: imported TLV without matching import slot");
+ wr_u64_le(slot, 0);
+ FixSite fs = {3u, 1, {0}, ts->import_idx, slot_v, 0};
+ fix_push(fl, &fs);
+ } else {
+ LinkSymbol* s = sym_at(img, ts->sym);
+ u64 tgt_v = s ? s->vaddr : 0;
+ wr_u64_le(slot, tgt_v);
+ if (tgt_v == 0) continue; /* weak-undef descriptor → NULL */
+ FixSite fs = {3u, 0, {0}, 0, slot_v, tgt_v};
+ fix_push(fl, &fs);
+ }
+ }
}
/* ---- chained fixups blob assembler ----
@@ -1315,20 +1538,26 @@ static void build_chained_fixups(MCtx* x, FixList* fl) {
next4 = (u32)(dist / 4u);
}
/* Find segment bytes; for slot in __DATA_CONST __got use
- * x->got_bytes; for __DATA, find the LinkSegment. */
+ * x->got_bytes; for __DATA, special-case the synthetic
+ * __thread_ptrs region and otherwise walk LinkSections. */
u8* slot = NULL;
if (s->segidx == 2) {
/* __DATA_CONST: __got slot. */
slot = x->got_bytes + (s->vaddr - x->got_vaddr);
} else if (s->segidx == 3) {
- /* __DATA: walk MSecs to find the matching one. */
- MSec* m = msec_for_vaddr(x, s->vaddr);
- if (m) {
- u8* base = bytes_for_section(x, m, x->img);
- if (base) {
- LinkSection* ls = &x->img->sections[m->link_sec_id - 1u];
- u32 in_off = (u32)(ls->input_offset + (s->vaddr - m->vaddr));
- slot = base + in_off;
+ if (x->tlv_ptrs_bytes && s->vaddr >= x->tlv_ptrs_vaddr &&
+ s->vaddr < x->tlv_ptrs_vaddr + x->tlv_ptrs_size) {
+ slot = x->tlv_ptrs_bytes + (s->vaddr - x->tlv_ptrs_vaddr);
+ } else {
+ /* __DATA: walk MSecs to find the matching one. */
+ MSec* m = msec_for_vaddr(x, s->vaddr);
+ if (m && m->link_sec_id) {
+ u8* base = bytes_for_section(x, m, x->img);
+ if (base) {
+ LinkSection* ls = &x->img->sections[m->link_sec_id - 1u];
+ u32 in_off = (u32)(ls->input_offset + (s->vaddr - m->vaddr));
+ slot = base + in_off;
+ }
}
}
}
@@ -2000,6 +2229,7 @@ void link_emit_macho(LinkImage* img, Writer* w) {
compiler_panic(x.c, no_loc(), "link_emit_macho: no resolved entry");
collect_imports(&x);
+ collect_tlv(&x);
plan_layout(&x);
shift_sections(&x);
@@ -2283,4 +2513,8 @@ void link_emit_macho(LinkImage* img, Writer* w) {
if (x.secs) x.h->free(x.h, x.secs, 0);
if (x.stubs_bytes) x.h->free(x.h, x.stubs_bytes, x.stubs_size);
if (x.got_bytes) x.h->free(x.h, x.got_bytes, x.got_size);
+ if (x.tlv_ptrs_bytes) x.h->free(x.h, x.tlv_ptrs_bytes, x.tlv_ptrs_size);
+ if (x.tlv_slots) x.h->free(x.h, x.tlv_slots, 0);
+ if (x.sym_to_tlv)
+ x.h->free(x.h, x.sym_to_tlv, sizeof(u32) * x.sym_to_tlv_size);
}
diff --git a/src/link/link_reloc.c b/src/link/link_reloc.c
@@ -150,6 +150,7 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A,
wr_u32_le(P_bytes, instr);
return;
}
+ case R_AARCH64_TLVP_LOAD_PAGE21:
case R_AARCH64_ADR_GOT_PAGE:
case R_AARCH64_ADR_PREL_PG_HI21:
case R_AARCH64_ADR_PREL_PG_HI21_NC: {
@@ -206,7 +207,8 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A,
case R_AARCH64_LDST32_ABS_LO12_NC:
case R_AARCH64_LDST64_ABS_LO12_NC:
case R_AARCH64_LDST128_ABS_LO12_NC:
- case R_AARCH64_LD64_GOT_LO12_NC: {
+ case R_AARCH64_LD64_GOT_LO12_NC:
+ case R_AARCH64_TLVP_LOAD_PAGEOFF12: {
/* LDR/STR with imm12 at bits [21:10]; the imm is scaled by the
* access size, so we right-shift the low 12 bits of (S+A) by
* the size scale before encoding. NC = no overflow check.
@@ -217,7 +219,8 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A,
: (k == R_AARCH64_LDST16_ABS_LO12_NC) ? 1u
: (k == R_AARCH64_LDST32_ABS_LO12_NC) ? 2u
: (k == R_AARCH64_LDST64_ABS_LO12_NC ||
- k == R_AARCH64_LD64_GOT_LO12_NC)
+ k == R_AARCH64_LD64_GOT_LO12_NC ||
+ k == R_AARCH64_TLVP_LOAD_PAGEOFF12)
? 3u
: 4u;
u64 lo12 = ((u64)S + (u64)A) & 0xfffu;
@@ -225,8 +228,11 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A,
u32 instr = rd_u32_le(P_bytes);
if (lo12 & ((1u << shift) - 1u))
compiler_panic(c, no_loc(),
- "link: LDST%u_ABS_LO12_NC misaligned address",
- 1u << (3 + shift));
+ "link: LDST%u_ABS_LO12_NC misaligned address "
+ "(kind=%u S=0x%llx A=%lld P=0x%llx)",
+ 1u << (3 + shift), (unsigned)k,
+ (unsigned long long)S, (long long)A,
+ (unsigned long long)P);
instr = (instr & ~(0xfffu << 10)) | ((u32)(imm12 & 0xfffu) << 10);
wr_u32_le(P_bytes, instr);
return;
diff --git a/src/obj/macho_reloc_aarch64.c b/src/obj/macho_reloc_aarch64.c
@@ -42,6 +42,10 @@ u32 macho_aarch64_reloc_to(u32 kind /* RelocKind */) {
return ARM64_RELOC_GOT_LOAD_PAGE21;
case R_AARCH64_LD64_GOT_LO12_NC:
return ARM64_RELOC_GOT_LOAD_PAGEOFF12;
+ case R_AARCH64_TLVP_LOAD_PAGE21:
+ return ARM64_RELOC_TLVP_LOAD_PAGE21;
+ case R_AARCH64_TLVP_LOAD_PAGEOFF12:
+ return ARM64_RELOC_TLVP_LOAD_PAGEOFF12;
default:
return (u32)-1;
}
@@ -58,6 +62,7 @@ u32 macho_aarch64_reloc_pcrel(u32 kind /* RelocKind */) {
case R_AARCH64_ADR_PREL_PG_HI21:
case R_AARCH64_ADR_PREL_PG_HI21_NC:
case R_AARCH64_ADR_GOT_PAGE:
+ case R_AARCH64_TLVP_LOAD_PAGE21:
return 1;
default:
return 0;
@@ -98,6 +103,10 @@ u32 macho_aarch64_reloc_from(u32 macho_type) {
return R_AARCH64_ADR_GOT_PAGE;
case ARM64_RELOC_GOT_LOAD_PAGEOFF12:
return R_AARCH64_LD64_GOT_LO12_NC;
+ case ARM64_RELOC_TLVP_LOAD_PAGE21:
+ return R_AARCH64_TLVP_LOAD_PAGE21;
+ case ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
+ return R_AARCH64_TLVP_LOAD_PAGEOFF12;
default:
return (u32)-1;
}
diff --git a/src/obj/obj.h b/src/obj/obj.h
@@ -125,6 +125,21 @@ typedef enum RelocKind {
R_AARCH64_LDST128_ABS_LO12_NC,
R_AARCH64_ABS16,
R_AARCH64_PREL16,
+ /* AArch64 Mach-O TLV (thread-local variable) descriptor access. The
+ * compiler emits these to reference a TLV descriptor in
+ * __DATA,__thread_vars; the linker routes both through a synthetic
+ * __DATA,__thread_ptrs slot (analogous to __got for non-TLV externs).
+ *
+ * adrp x0, _var@TLVPPAGE ; TLVP_LOAD_PAGE21
+ * ldr x0, [x0, _var@TLVPPAGEOFF]; TLVP_LOAD_PAGEOFF12 -> descriptor
+ * ldr x1, [x0] ; thunk (filled by dyld)
+ * blr x1 ; thunk(x0=descriptor) -> x0 = TLV addr
+ *
+ * Encoding-wise PAGE21 is ADRP-form and PAGEOFF12 is a 64-bit-LDR
+ * lo12 (scale=3). The linker rewrites S to the matching __thread_ptrs
+ * slot's vaddr before applying. */
+ R_AARCH64_TLVP_LOAD_PAGE21,
+ R_AARCH64_TLVP_LOAD_PAGEOFF12,
/* AArch64 TLS Local-Exec model. */
R_AARCH64_TLSLE_ADD_TPREL_HI12,
R_AARCH64_TLSLE_ADD_TPREL_LO12,
diff --git a/test/link/cases/36_tls_basic/a.c b/test/link/cases/36_tls_basic/a.c
@@ -0,0 +1,14 @@
+/* Thread-local variable read. Exercises:
+ * - the C frontend's _Thread_local storage-class accepting `int`
+ * - the codegen's TLS access sequence (ELF TLSLE on aa64/x64/rv64
+ * ELF; Mach-O TLVP descriptor + thunk call on aa64-macho)
+ * - the linker's TLS image / TLV section/binding handling
+ *
+ * No targets file: runs on every <arch>-<obj> tuple. Cases that fail
+ * today reveal which lane still needs codegen / linker work. */
+
+_Thread_local int tls_val = 7; /* non-zero initial value the read must observe */
+
+int test_main(void) {
+ return tls_val == 7 ? 0 : 1; /* 0 == initializer read back, 1 == mismatch */
+}