commit c1cf117c880ee320131d5be080375899b7acc02d
parent 3e695b43cadf39b052bc1d70f402d5fdfdf49292
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 14:47:22 -0700
link: TLS local-exec, IFUNC, and exe symtab / build-id / .eh_frame
Three independent gaps closed against doc/linker-status.md.
TLS local-exec (case 31_tls_local_exec):
- 4th SEG_TLS layout bucket; SF_TLS sections form their own
PT_LOAD whose vaddr/filesz/memsz/align flow into a PT_TLS phdr.
- R_AARCH64_TLSLE_ADD_TPREL_{HI12,LO12_NC} apply (caller pre-
adjusts S to TP-relative; +16 AArch64 TCB folded in).
- Synthesizes __tdata_start/__tdata_end and __tbss_size (SK_ABS).
- start.c and jit_runner build the per-thread block (TCB | .tdata
copy | .tbss zero) and msr TPIDR_EL0. On Darwin libc clobbers
TPIDR_EL0, so the harness keeps msr→blr back-to-back with no
libc calls between.
IFUNC trampoline (case 32_ifunc, JIT-only for now):
- Per-arch iplt stub (3-insn adrp/ldr/br on AArch64) + 8-byte
.igot.plt slot per defined STT_GNU_IFUNC; symbol vaddr is
redirected to the stub.
- JIT pre-resolves in-process after applying relocs and writes the
chosen impl pointer into the slot. ELF emit refuses with a
pointer to the documented init-routine plan.
Exe shape upgrade:
- Static ET_EXEC writer now emits .symtab / .strtab / .shstrtab
plus a section header table (per-segment .text/.rodata/.data
/.bss shdrs, .init_array, .fini_array, .eh_frame, .got named
per their input). nm / objdump -t / readelf -s all work.
- Allocatable .note.gnu.build-id with deterministic 16-byte FNV-1a
digest, in the headers PT_LOAD; PT_NOTE phdr makes it
discoverable via dl_iterate_phdr.
- .eh_frame survives into the output with a properly named
PROGBITS+ALLOC shdr at its final vaddr. .eh_frame_hdr +
PT_GNU_EH_FRAME still TODO.
Test counts: test-link 110/110 (R 35, E 34, J 35, bad 2 + 4 skipped
non-aa64); test-musl 3/3 unchanged.
Diffstat:
14 files changed, 1435 insertions(+), 166 deletions(-)
diff --git a/doc/linker-status.md b/doc/linker-status.md
@@ -16,9 +16,9 @@ live in `test/link/` — they are not duplicated in `test/elf/`.
| Harness | Pass | Fail | Notes |
|-----------------|-----:|-----:|--------------------------------------|
| `test-elf` | 37 | 0 | All Layer A/B/C green |
-| `test-link` R | 33 | 0 | object roundtrip via cfree-roundtrip |
-| `test-link` E | 33 | 0 | qemu/podman aarch64 exec |
-| `test-link` J | 33 | 0 | JIT in-process incl. GC subgroup |
+| `test-link` R | 35 | 0 | object roundtrip via cfree-roundtrip |
+| `test-link` E | 34 | 0 | qemu/podman aarch64 exec |
+| `test-link` J | 35 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS |
| `test-link` bad | 2 | 0 | `bad/30_undef_strong` (E + J) |
| `test-musl` | 3 | 0 | static musl 1.2.5: syscall, errno, printf |
@@ -42,7 +42,8 @@ works end-to-end. Beyond that:
CONDBR19, TSTBR14, LD_PREL_LO19, ADR_PREL_LO21, JUMP26 / CALL26,
ADR_PREL_PG_HI21{,_NC}, ADD_ABS_LO12_NC,
LDST{8,16,32,64,128}_ABS_LO12_NC,
- ADR_GOT_PAGE / LD64_GOT_LO12_NC. Plus a synthetic R_ABS64 emitter
+ ADR_GOT_PAGE / LD64_GOT_LO12_NC,
+ TLSLE_ADD_TPREL_{HI12,LO12_NC}. Plus a synthetic R_ABS64 emitter
for GOT slot fill. **Reads every reloc kind in musl 1.2.5 aarch64
libc.a.**
- **Symbol resolution:** STB_GLOBAL/WEAK/LOCAL replacement strength;
@@ -50,22 +51,64 @@ works end-to-end. Beyond that:
pass-through. Weak archive defs satisfy unresolved refs (matches
GNU ld / lld; required for musl's weak `__init_tls`).
- **Linker-synthesized symbols:** `__init_array_start/end`,
- `__fini_array_start/end`, and general `__start_<X>`/`__stop_<X>`
- for any encoding section.
-- **Section / segment layout:** three-bucket RX / R / RW partition,
- BSS, init/fini/preinit_array, synthetic `.got`. **Same-named input
- sections merge by first-occurrence** — required for `_init`/`_fini`
- to be contiguous when `.init` / `.fini` come from crti.o + crtn.o.
- `-ffunction-sections` / `-fdata-sections` flow through naturally.
+ `__fini_array_start/end`, `__tdata_start/end` (vaddrs of the .tdata
+ template), `__tbss_size` (SK_ABS holding the .tbss byte count), and
+ general `__start_<X>`/`__stop_<X>` for any encoding section.
+- **Section / segment layout:** four-bucket RX / R / RW / TLS
+ partition, BSS, init/fini/preinit_array, synthetic `.got`.
+ **Same-named input sections merge by first-occurrence** — required
+ for `_init`/`_fini` to be contiguous when `.init` / `.fini` come
+ from crti.o + crtn.o. `-ffunction-sections` / `-fdata-sections`
+ flow through naturally.
+- **TLS local-exec (AArch64):** `R_AARCH64_TLSLE_ADD_TPREL_{HI12,
+ LO12_NC}` apply against the per-image TLS span; .tdata/.tbss
+ sections (SHF_TLS) layout into a dedicated SEG_TLS segment with
+ natural alignment preserved on PT_TLS (separate from the
+ containing PT_LOAD's page align). The exe writer emits both the
+ PT_LOAD (so the kernel maps the .tdata template) and a PT_TLS
+ pointing at it; the AArch64 ABI's 16-byte TCB offset is folded
+ into S at apply time. The freestanding `_start` (and `jit-runner`)
+ build the per-thread block — TCB(16) | .tdata copy | .tbss zero —
+ using the synthesized boundary symbols and `msr TPIDR_EL0`. On
+ Darwin libc routinely clobbers TPIDR_EL0, so the harness keeps
+ msr → blr back-to-back with no libc calls between.
- **Inputs:** loose `.o`, `.a` (demand + `--whole-archive`),
`--start-group` / `--end-group` cyclic resolution.
- **GC:** `--gc-sections` at section granularity. Roots: entry sym,
init/fini/preinit_array, `SF_RETAIN` (`SHF_GNU_RETAIN`),
`__start_/__stop_` referents. Edges follow per-section relocs to
fixed point.
+- **IFUNC trampoline (JIT only):** every defined `STT_GNU_IFUNC`
+ symbol gets a 12-byte stub in a synthetic `.iplt` (RX) section
+ (`adrp x16, slot ; ldr x16,[x16,:lo12:slot] ; br x16`) and an
+ 8-byte slot in `.igot.plt` (RW); the IFUNC's vaddr is redirected
+ to the stub. JIT load calls each resolver in-process after
+ applying relocs and writes the chosen implementation pointer
+ into the slot. ELF emit refuses (no startup init routine yet —
+ see Gaps).
- **Format fidelity:** ELF read+write byte-stable for the supported
subset; `EI_OSABI=GNU` flips automatically when GNU extensions are
present.
+- **Exe section + symbol tables:** the static ET_EXEC writer emits
+ `.symtab` / `.strtab` / `.shstrtab` and a section header table.
+ Defined symbols carry final absolute addresses (IMAGE_BASE + image
+ vaddr); SK_FILE / SK_ABS / SK_COMMON map to SHN_ABS / SHN_COMMON;
+ per-input undef-vs-canonical-def shadow records are deduped via
+ `img->globals`. Input sections survive into the output as one shdr
+ per `(segment, name)` pair — `.text`, `.rodata`, `.data`, `.bss`,
+ `.init_array`, `.fini_array`, `.eh_frame`, `.got`, etc., named
+ per their input. `nm`, `objdump -t`, `readelf -s` all work.
+- **Build-id:** an allocatable `.note.gnu.build-id` with a 16-byte
+ digest goes into the headers PT_LOAD; a PT_NOTE phdr makes it
+ discoverable via `dl_iterate_phdr`. The digest is FNV-1a 64 over
+ each segment with two seeds, mixed into 128 bits — deterministic
+ given the post-relocation segment bytes.
+- **`.eh_frame` flow-through:** input `.eh_frame` survives into the
+ output with a properly named PROGBITS+ALLOC shdr at its final
+ vaddr. Sufficient for `backtrace()` past the innermost frame on
+ toolchains that scan `.eh_frame` linearly; fast lookup via
+ `.eh_frame_hdr` + PT_GNU_EH_FRAME is still TODO (binary search
+ index over FDEs).
- **Driver:** `cfree ld -static -o out crt1.o crti.o user.o libc.a
libcfree_rt.a crtn.o` works. Output is chmod 0755 on success.
- **JIT path** runs the same resolved image in-process; MAP_JIT on
@@ -80,10 +123,9 @@ ordered by how often the gap actually bites.
| Gap | What breaks | Effort |
|-----|-------------|--------|
-| **`.symtab` / `.strtab` in the exe** | `nm`, `objdump -t`, `gdb` see no symbols; backtraces unsymbolicated. The E-path symbol verifier in `run.sh` already activates the moment this lands. | small |
-| **`.note.gnu.build-id`, `.eh_frame`, `.debug_*` in the exe** | No build-id for deterministic-build tooling. No `.eh_frame` → C++ EH and `backtrace()` broken. No DWARF → `gdb` blind. | medium (eh_frame + debug); small (build-id) |
-| **STT_GNU_IFUNC trampoline** | Reader knows the kind; linker doesn't synthesize the indirection. Anything using `__attribute__((ifunc))` (much of glibc) fails. | medium |
-| **TLS local-exec apply + PT_TLS** | `R_AARCH64_TLSLE_*` are read but `link_reloc_apply` panics; no `PT_TLS` emitted. Not needed for musl-aarch64 (errno routes through `__errno_location()` and there are no `.tdata`/`.tbss` in libc.a) but any user TU using `__thread` would hit this. | medium |
+| **`.eh_frame_hdr` + PT_GNU_EH_FRAME** | `.eh_frame` already flows through with a proper shdr; without `.eh_frame_hdr` libgcc/libunwind fall back to linear FDE scan, and `dl_iterate_phdr` consumers (most modern unwinders) skip the section entirely. Needs FDE parsing + sorted binary-search table emission. | medium |
+| **`.debug_*` in the exe** | No DWARF → `gdb` blind on source lines. cfree's debug pipeline ends at the obj boundary; the linker drops non-`SF_ALLOC` sections. | medium |
+| **STT_GNU_IFUNC in ELF output** | iplt + igot trampoline lands for the JIT path (resolver called in-process at load). The ELF emit path can't run target-arch resolvers itself, so it refuses; needs a synthesized startup init routine that walks `img->iplt_pairs` and patches each slot before `_start` calls into user code. | medium |
| **TLSGD / TLSIE / TLSLD relocs** | Read but not applied. Needed for `-fpic` TLS or shared-lib TLS — moot until dynamic linking lands. | medium |
| **Dynamic linking: PT_DYNAMIC, PT_INTERP, PLT, DT_NEEDED** | Cannot link against any `.so`. Static-only. | large |
| **PIE / ET_DYN executables** | Driver accepts `-pie` but the writer always emits ET_EXEC at fixed `IMAGE_BASE`. Tied to dynamic-linking work. | medium (depends on dynamic) |
@@ -95,22 +137,16 @@ ordered by how often the gap actually bites.
already a working linker — including against real musl. The next
priorities, roughly in order:
-1. **`.symtab` / `.strtab` in the exe** — unblocks `nm`, `objdump -t`,
- `gdb`, the E-path symbol verifier already wired in `run.sh`.
-2. **`.note.gnu.build-id`** — small standalone change; deterministic-
- build tooling, package managers, and crash-report symbol-server
- lookups all key off the build-id.
-3. **`.eh_frame`** — required for C++ exceptions, `_Unwind_*`, and
- `backtrace()` past the innermost frame.
-4. **STT_GNU_IFUNC trampoline** — reader already recognizes it; need
- the call-site indirection so glibc-style `__attribute__((ifunc))`
- resolvers work.
-5. **TLS local-exec apply + PT_TLS** — handle `R_AARCH64_TLSLE_*` and
- emit a PT_TLS segment so user TUs with `__thread` /
- `_Thread_local` link (`.tdata` / `.tbss` flowing through layout).
- Not needed for musl libc.a itself (it routes errno through
- `__errno_location()`), but needed for user code that declares
- thread-locals.
+1. **`.eh_frame_hdr` + PT_GNU_EH_FRAME** — `.eh_frame` already flows
+ through; building the binary-search index over FDEs unblocks fast
+ unwind and `dl_iterate_phdr`-driven consumers (modern libunwind,
+ libgcc's `_Unwind_Find_FDE`).
+2. **STT_GNU_IFUNC in ELF output** — JIT trampoline already works;
+ the ELF emit path needs a startup init routine that walks the
+ pairs table and fills each slot before any user code (including
+ `.init_array` ctors) runs. Plan below.
+3. **`.debug_*` in the exe** — DWARF flow-through; the linker
+ currently drops non-`SF_ALLOC` sections at `section_kept`.
After those the next big lift is full dynamic linking (PT_DYNAMIC +
PLT + PT_INTERP + DT_NEEDED), which also unlocks PIE output and TLS
@@ -118,6 +154,76 @@ GD / IE / LD modes.
---
+## Plan: STT_GNU_IFUNC in ELF output
+
+Static, cross-compile, no dynamic loader. The only entity that can
+run resolvers is the emitted binary itself, after `_start` and
+before any user code that might call an IFUNC (including any
+`.init_array` ctor). The init logic belongs in **`libcfree_rt.a`**
+as portable C; the linker wires it in using machinery it already
+has (boundary symbols, ABS64 reloc apply, `.init_array`).
+
+### Layered placement
+
+| Concern | Where |
+|---------|-------|
+| iplt stub bytes (per-arch: `adrp/ldr/br`, `jmp *rip`, `auipc/ld/jr`) | `src/arch/<arch>.c` (small `LinkArch` hook) |
+| iplt stub reloc kinds | per-arch (linker already knows them) |
+| Pairs table layout `(resolver_addr, slot_addr)` | shared, in `link_layout.c` |
+| Init routine (walks pairs, calls resolvers, fills slots) | `rt/lib/cfree/ifunc_init.c` (~15 lines portable C) |
+| `__start_iplt_pairs` / `__stop_iplt_pairs` | shared (existing `__start_<X>`/`__stop_<X>` machinery) |
+| `.init_array` synthesis hookup | shared |
+| Pulling the rt member in via demand-load | automatic — synthetic ABS64 against `__cfree_ifunc_init` creates the undef ref |
+
+This keeps `link_layout.c` arch-clean: today it hand-encodes
+`0x90000010 / 0xf9400210 / 0xd61f0200` inline, which moves to a
+per-arch `LinkArch.emit_iplt_stub` hook on the existing
+`src/arch/<arch>.c` files alongside CGTarget/MCEmitter. Future
+arches add ~20 lines (x86_64's stub is 6 bytes; rv64 mirrors
+aarch64's three-insn shape).
+
+### Concrete file changes
+
+```
+rt/lib/cfree/ifunc_init.c new — __cfree_ifunc_init walks
+ __start_iplt_pairs..__stop_iplt_pairs,
+ calls each resolver, stores result
+ in slot
+src/arch/aarch64.c + LinkArch with iplt stub bytes
+ (moved out of layout_iplt)
+src/link/link_internal.h + LinkArch decl
+src/link/link_layout.c layout_iplt uses
+ link_arch(...)->emit_iplt_stub;
+ adds .iplt.pairs data section
+ filled via ABS64 relocs; emits
+ .init_array entry referencing
+ __cfree_ifunc_init (ELF emit only)
+src/link/link_elf.c drop the niplt panic
+src/link/link_jit.c unchanged — keep in-process pre-resolve
+test/link/cases/32_ifunc/ drop jit_only marker
+test/link/cases/33_ifunc_in_init new — IFUNC called from a ctor
+test/link/cases/34_ifunc_addr_taken new — &my_fn round-trips
+ through a function pointer
+```
+
+### JIT path interaction
+
+Pre-resolution in `link_jit.c` stays as-is (simpler than firing
+`__cfree_ifunc_init` from the JIT). `layout_iplt` skips synthesizing
+the `.init_array` entry on the JIT path: iplt stubs still get
+produced (so address-taken IFUNCs work), and pre-resolution fills
+slots in-process. One-line gate, no per-call guard in the rt code.
+
+### Adding IFUNC to a future arch
+
+Per-arch surface: `iplt_stub_size` (constant) + `emit_iplt_stub`
+(emits stub bytes, registers reloc-apply records against the slot
+LinkSymbol). Everything else — pairs table, init routine,
+`.init_array` hookup, boundary symbols — is shared. So the cost of
+a new arch is bounded and small.
+
+---
+
## test-link harness — speed and ergonomics
`test/link/run.sh` accepts:
diff --git a/src/link/link.c b/src/link/link.c
@@ -441,6 +441,8 @@ static void link_image_release(LinkImage* img)
sizeof(*img->syms) * img->syms_cap);
if (img->relocs) img->heap->free(img->heap, img->relocs,
sizeof(*img->relocs) * img->relocs_cap);
+ if (img->iplt_pairs) img->heap->free(img->heap, img->iplt_pairs,
+ sizeof(*img->iplt_pairs) * img->niplt * 2u);
if (img->input_maps) {
for (i = 0; i < img->ninput_maps; ++i) {
InputMap* m = &img->input_maps[i];
diff --git a/src/link/link.h b/src/link/link.h
@@ -71,6 +71,9 @@ typedef struct LinkSection {
u64 size;
u32 flags;
u32 align;
+ Sym name; /* section name (interned); 0 if anon */
+ u16 sem; /* SecSem of the source obj section */
+ u16 pad;
} LinkSection;
typedef struct LinkRelocApply {
diff --git a/src/link/link_elf.c b/src/link/link_elf.c
@@ -1,24 +1,50 @@
/* link_emit_elf_aarch64: write a static ET_EXEC ELF64 image to the
* caller-provided Writer.
*
- * Layout: ELF header + program headers occupy a separate first PT_LOAD
- * mapped read-only at IMAGE_BASE. The kept segments follow, each
- * page-aligned. We do this in two passes:
- * 1. Determine headers_size = sizeof(Ehdr) + nseg_total * sizeof(Phdr),
- * where nseg_total = nsegments + 1 (the headers PT_LOAD itself).
- * Bump every image-relative vaddr / file_offset on sections,
- * symbols, and reloc records by align_up(headers_size, PAGE).
- * 2. Apply relocations with the absolute addresses (S, P offset by
- * IMAGE_BASE), then write headers and segment bytes via the
- * Writer.
+ * File layout (in write order):
*
- * Image addresses on the LinkImage remain image-relative (base 0) on
- * entry; we mutate them in place since this consumer takes ownership
- * of the image at link_emit_image_writer + link_image_free time. */
+ * [headers PT_LOAD, PF_R, mapped at IMAGE_BASE]
+ * Ehdr64
+ * Phdr64[nphdr] -- headers PT_LOAD + one PT_LOAD per kept segment
+ *                  + PT_NOTE (+ PT_TLS when the image has TLS sections)
+ * .note.gnu.build-id -- 12 + 4 + 16 = 32 bytes (note header, "GNU\0", deterministic 16-byte id)
+ * pad to PAGE
+ *
+ * [PT_LOAD per kept image segment, in img->segments order]
+ * segment bytes (padded to its file_offset)
+ *
+ * [non-allocatable sections, file-only]
+ * .symtab -- ELF64_SYM_SIZE * nsyms
+ * .strtab -- NUL-led blob
+ * .shstrtab -- NUL-led blob
+ *
+ * [section header table at e_shoff]
+ * Shdr64[nshdr]
+ *
+ * Section header schema (for nm / objdump -t / gdb consumption):
+ *
+ * 0 SHN_UNDEF (zero entry)
+ * N one shdr per loaded sub-region: .text/.rodata/.data/.bss as
+ * the corresponding RX/R/RW segments materialize (.bss split
+ * out as the trailing memsz>filesz tail of the RW segment).
+ * 1 .note.gnu.build-id (allocatable, in headers PT_LOAD)
+ * 1 .symtab (sh_link -> .strtab; sh_info = first non-local idx)
+ * 1 .strtab
+ * 1 .shstrtab (Ehdr64.e_shstrndx)
+ *
+ * Build-id is computed deterministically over the post-relocation
+ * segment bytes (FNV-1a 64 over each segment, mixed into a 128-bit
+ * accumulator). The 16-byte digest is written into the note before the
+ * note is emitted to the Writer.
+ *
+ * The image's image-relative addresses on entry are bumped by
+ * align_up(headers_size, PAGE) before relocs are applied, exactly as
+ * before — segment bytes / symbol vaddrs land at their final IMAGE_BASE
+ * absolute addresses by the time relocs run. */
#include "link/link.h"
#include "link/link_internal.h"
+#include "obj/elf.h"
#include "core/heap.h"
#include "core/pool.h"
@@ -26,35 +52,10 @@
static SrcLoc no_loc(void) { SrcLoc l = {0,0,0}; return l; }
-/* ---- ELF64 constants (subset; sysv-abi values) ---- */
+/* ---- ELF64 wire structs (subset) ---- */
#define EI_NIDENT 16
-/* e_ident */
-#define ELFMAG0 0x7f
-#define ELFMAG1 'E'
-#define ELFMAG2 'L'
-#define ELFMAG3 'F'
-#define ELFCLASS64 2
-#define ELFDATA2LSB 1
-#define EV_CURRENT 1
-#define ELFOSABI_SYSV 0
-
-/* e_type */
-#define ET_EXEC 2
-
-/* e_machine */
-#define EM_AARCH64 183
-
-/* p_type / p_flags */
-#define PT_LOAD 1
-#define PF_X 1u
-#define PF_W 2u
-#define PF_R 4u
-
-#define PAGE_SIZE 0x1000u
-#define IMAGE_BASE 0x400000ULL
-
typedef struct __attribute__((packed)) Ehdr64 {
u8 e_ident[EI_NIDENT];
u16 e_type;
@@ -83,12 +84,53 @@ typedef struct __attribute__((packed)) Phdr64 {
u64 p_align;
} Phdr64;
+/* ELF64 section header record as written to the output file. Packed so
+ * the struct carries no padding between fields; field names follow the
+ * ELF64 section header (Elf64_Shdr) naming. */
+typedef struct __attribute__((packed)) Shdr64 {
+ u32 sh_name;
+ u32 sh_type;
+ u64 sh_flags;
+ u64 sh_addr;
+ u64 sh_offset;
+ u64 sh_size;
+ u32 sh_link;
+ u32 sh_info;
+ u64 sh_addralign;
+ u64 sh_entsize;
+} Shdr64;
+
+#define PT_NOTE 4
+#define PT_TLS 7
+
+#define PAGE_SIZE 0x1000u
+#define IMAGE_BASE 0x400000ULL
+
+#define BUILD_ID_DESC_LEN 16u
+#define NOTE_NAME_GNU "GNU"
+#define NOTE_NAME_GNU_LEN 4u /* "GNU\0" */
+#define NOTE_BUILD_ID_TYPE 3u
+#define BUILD_ID_NOTE_BYTES (12u + NOTE_NAME_GNU_LEN + BUILD_ID_DESC_LEN)
+
+/* ---- byte writer helpers ---- */
+
static u64 align_up_u64(u64 v, u64 a) { return (v + (a - 1u)) & ~(a - 1u); }
+/* Forwards `n` bytes starting at `data` to the Writer's write callback. */
+static void write_bytes(Writer* w, const void* data, size_t n)
+{
+ w->write(w, data, n);
+}
+
+/* Emits `n` zero bytes through the Writer, in chunks of at most
+ * sizeof(zero_chunk) bytes taken from a static all-zero buffer. */
+static void write_zeroes(Writer* w, size_t n)
+{
+    static const u8 zero_chunk[256] = {0};
+    size_t left;
+    for (left = n; left != 0; ) {
+        size_t take = left < sizeof(zero_chunk) ? left : sizeof(zero_chunk);
+        w->write(w, zero_chunk, take);
+        left -= take;
+    }
+}
+
static u32 perms_to_pflags(u32 secflags)
{
- u32 f = 0;
- f |= PF_R;
+ u32 f = PF_R;
if (secflags & SF_EXEC) f |= PF_X;
if (secflags & SF_WRITE) f |= PF_W;
return f;
@@ -111,10 +153,23 @@ static void shift_image_addresses(LinkImage* img, u64 delta)
}
for (i = 0; i < img->nsyms; ++i) {
LinkSymbol* s = &img->syms[i];
- if (s->kind == SK_ABS) continue; /* extern / absolute */
+ if (s->kind == SK_ABS) continue;
if (!s->defined) continue;
s->vaddr += delta;
}
+ /* tls_vaddr lives in the same image-relative coordinate system as
+ * the segments it tracks, so it bumps with them. */
+ if (img->tls_memsz) img->tls_vaddr += delta;
+}
+
+/* AArch64 ELF ABI: the per-thread TLS block starts at TP + 16 bytes
+ * (the TCB sits ahead of the TLS image). */
+#define AARCH64_TCB_SIZE 16ull
+
+/* Returns 1 for the two TLS local-exec ADD relocation kinds
+ * (TPREL_HI12 / TPREL_LO12_NC), 0 for every other kind. */
+static int reloc_is_tlsle(RelocKind k)
+{
+    switch (k) {
+    case R_AARCH64_TLSLE_ADD_TPREL_HI12:
+    case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
+        return 1;
+    default:
+        return 0;
+    }
}
static void apply_all_relocs(LinkImage* img)
@@ -127,8 +182,16 @@ static void apply_all_relocs(LinkImage* img)
const LinkSegment* seg = &img->segments[sec->segment_id - 1];
u64 S, P;
u8* P_bytes;
- S = tgt->vaddr + IMAGE_BASE;
- if (tgt->kind == SK_ABS) S = tgt->vaddr; /* already absolute */
+ if (reloc_is_tlsle(r->kind)) {
+ /* S is the target's TP-relative offset: distance from the
+ * TLS image start plus the 16-byte TCB. Both vaddrs are
+ * in the same (post-shift, image-relative) coordinate
+ * system, so IMAGE_BASE cancels out. */
+ S = (tgt->vaddr - img->tls_vaddr) + AARCH64_TCB_SIZE;
+ } else {
+ S = tgt->vaddr + IMAGE_BASE;
+ if (tgt->kind == SK_ABS) S = tgt->vaddr;
+ }
P = r->write_vaddr + IMAGE_BASE;
P_bytes = img->segment_bytes[seg->id - 1] +
(size_t)(r->write_file_offset - seg->file_offset);
@@ -136,81 +199,546 @@ static void apply_all_relocs(LinkImage* img)
}
}
-static void write_bytes(Writer* w, const void* data, size_t n)
+/* ---- build-id: FNV-1a 64 over segment bytes, mixed to 128 bits ---- */
+
+static u64 fnv1a64(const u8* data, size_t n, u64 seed)
{
- w->write(w, data, n);
+ const u64 PRIME = 0x100000001b3ull;
+ u64 h = seed;
+ size_t i;
+ for (i = 0; i < n; ++i) {
+ h ^= (u64)data[i];
+ h *= PRIME;
+ }
+ return h;
}
-static void write_zeroes(Writer* w, size_t n)
+static void compute_build_id(LinkImage* img, u8 out[16])
{
- static const u8 zeroes[256] = {0};
- while (n) {
- size_t step = n > sizeof(zeroes) ? sizeof(zeroes) : n;
- w->write(w, zeroes, step);
- n -= step;
+ /* Two FNV-1a streams with different seeds → 128 bits. Mix segment
+ * bytes (post-reloc) plus segment vaddrs so the digest changes if
+ * either content or layout shifts. */
+ const u64 SEED_LO = 0xcbf29ce484222325ull;
+ const u64 SEED_HI = 0x14650fb0739d0383ull;
+ u64 lo = SEED_LO, hi = SEED_HI;
+ u32 i;
+ for (i = 0; i < img->nsegments; ++i) {
+ const LinkSegment* seg = &img->segments[i];
+ u64 vaddr = seg->vaddr;
+ u64 fsz = seg->file_size;
+ lo = fnv1a64((const u8*)&vaddr, sizeof vaddr, lo);
+ lo = fnv1a64((const u8*)&fsz, sizeof fsz, lo);
+ hi = fnv1a64((const u8*)&vaddr, sizeof vaddr, hi);
+ hi = fnv1a64((const u8*)&fsz, sizeof fsz, hi);
+ if (img->segment_bytes[i] && fsz) {
+ lo = fnv1a64(img->segment_bytes[i], (size_t)fsz, lo);
+ hi = fnv1a64(img->segment_bytes[i], (size_t)fsz, hi);
+ }
+ }
+ for (i = 0; i < 8; ++i) out[i] = (u8)(lo >> (i * 8));
+ for (i = 0; i < 8; ++i) out[8 + i] = (u8)(hi >> (i * 8));
+}
+
+/* ---- string-table builder ---- */
+
+/* Growable byte buffer used to assemble a string table of
+ * NUL-terminated entries. */
+typedef struct StrBuilder {
+ Heap* heap; /* allocator that owns `data` */
+ u8* data; /* table bytes; NULL when the initial allocation failed */
+ u32 len; /* bytes currently in use */
+ u32 cap; /* bytes allocated for `data` */
+} StrBuilder;
+
+/* Prepares an empty string table backed by heap `h`.
+ *
+ * Allocates max(reserve, 16) bytes and seeds the table with a single
+ * leading NUL, so offset 0 is the empty string. If the allocation
+ * fails the builder is left with data == NULL and cap == len == 0. */
+static void strb_init(StrBuilder* s, Heap* h, u32 reserve)
+{
+    u32 want = reserve;
+    if (want <= 16u) want = 16u;
+
+    s->heap = h;
+    s->data = (u8*)h->alloc(h, want, 1);
+    if (!s->data) {
+        s->cap = 0;
+        s->len = 0;
+        return;
+    }
+    s->cap = want;
+    s->data[0] = 0; /* leading NUL */
+    s->len = 1;
+}
+
+/* Releases the table buffer (if any) back to the builder's heap and
+ * resets the builder to the empty, unallocated state. */
+static void strb_fini(StrBuilder* s)
+{
+    if (s->data) {
+        s->heap->free(s->heap, s->data, s->cap);
+        s->data = NULL;
+    }
+    s->len = 0;
+    s->cap = 0;
+}
+
+/* Ensures the buffer can hold at least `need` bytes, growing
+ * geometrically (doubling from the current capacity, or from 16).
+ *
+ * If the heap's realloc returns NULL the builder is left unchanged, so
+ * callers must re-check s->cap before writing into the buffer. */
+static void strb_grow(StrBuilder* s, u32 need)
+{
+    u32 new_cap;
+    u8* p;
+    if (need <= s->cap) return;
+    new_cap = s->cap ? s->cap : 16u;
+    while (new_cap < need) {
+        /* Doubling past 2^31 would wrap a u32 to 0 and spin forever;
+         * fall back to the exact size requested instead. */
+        if (new_cap > 0xffffffffu / 2u) {
+            new_cap = need;
+            break;
+        }
+        new_cap *= 2u;
+    }
+    p = (u8*)s->heap->realloc(s->heap, s->data, s->cap, new_cap, 1);
+    if (!p) return;
+    s->data = p;
+    s->cap = new_cap;
+}
+
+/* Interns the `slen`-byte string `str` and returns its byte offset in
+ * the table.
+ *
+ * - slen == 0 returns 0 (the leading NUL written by strb_init).
+ * - An existing occurrence is reused when the table already holds the
+ *   same bytes immediately followed by a NUL; this also matches the
+ *   tail of a longer entry.
+ * - Otherwise the string plus a terminating NUL is appended.
+ *
+ * If the table cannot grow (allocation failure, or the new length would
+ * not fit in u32) nothing is written and 0 is returned, so the caller
+ * ends up with the empty name rather than an out-of-bounds write. */
+static u32 strb_add(StrBuilder* s, const char* str, u32 slen)
+{
+    u32 off;
+    u32 need;
+    u64 need64;
+    if (slen == 0) return 0;
+    /* Linear dedup: scan existing data for a matching NUL-terminated
+     * substring. Strtabs are small enough to make this acceptable. */
+    if (s->len > slen) {
+        u32 i;
+        for (i = 0; i + slen < s->len; ++i) {
+            if (s->data[i + slen] == 0 &&
+                memcmp(s->data + i, str, slen) == 0) return i;
+        }
+    }
+    /* Compute the new length in 64 bits so a u32 wrap is detectable. */
+    need64 = (u64)s->len + (u64)slen + 1u;
+    if (need64 > 0xffffffffu) return 0;
+    need = (u32)need64;
+    strb_grow(s, need);
+    /* strb_grow leaves the builder untouched when realloc fails. */
+    if (!s->data || s->cap < need) return 0;
+    off = s->len;
+    memcpy(s->data + off, str, slen);
+    s->data[off + slen] = 0;
+    s->len = need;
+    return off;
+}
+
+/* Convenience wrapper: interns a NUL-terminated C string via strb_add,
+ * measuring it with strlen. */
+static u32 strb_add_cstr(StrBuilder* s, const char* str)
+{
+    size_t n = strlen(str);
+    return strb_add(s, str, (u32)n);
+}
+
+/* ---- symtab builder ---- */
+
+/* In-memory staging form of one symbol-table entry. write_sym_rec
+ * serializes these fields, in this order, into the on-disk record. */
+typedef struct SymRec {
+ u32 st_name;
+ u8 st_info;
+ u8 st_other;
+ u16 st_shndx;
+ u64 st_value;
+ u64 st_size;
+} SymRec;
+
+/* Translates a linker symbol kind (SK_*) to the ELF symbol type (STT_*).
+ * Kinds without a dedicated ELF type map to STT_NOTYPE. */
+static u8 sym_kind_to_st_type(u8 kind)
+{
+    if (kind == SK_FUNC)    return STT_FUNC;
+    if (kind == SK_OBJ)     return STT_OBJECT;
+    if (kind == SK_SECTION) return STT_SECTION;
+    if (kind == SK_FILE)    return STT_FILE;
+    if (kind == SK_TLS)     return STT_TLS;
+    if (kind == SK_IFUNC)   return STT_GNU_IFUNC;
+    /* SK_NOTYPE, SK_ABS, SK_UNDEF and any unrecognized kind. */
+    return STT_NOTYPE;
+}
+
+/* Translates a linker symbol binding (SB_*) to the ELF binding (STB_*).
+ * Anything other than global or weak is emitted as STB_LOCAL. */
+static u8 sym_bind_to_st_bind(u8 bind)
+{
+    if (bind == SB_GLOBAL) return STB_GLOBAL;
+    if (bind == SB_WEAK)   return STB_WEAK;
+    /* SB_LOCAL and any unrecognized binding. */
+    return STB_LOCAL;
+}
+
+/* Serializes one SymRec into an ELF64_SYM_SIZE-byte record, each field
+ * least-significant byte first, and hands it to the Writer.
+ *
+ * Byte layout: st_name[0..3] st_info[4] st_other[5] st_shndx[6..7]
+ *              st_value[8..15] st_size[16..23]. */
+static void write_sym_rec(Writer* w, const SymRec* r)
+{
+    u8 rec[ELF64_SYM_SIZE];
+    u32 b;
+    for (b = 0; b < 4; ++b) rec[b] = (u8)(r->st_name >> (b * 8));
+    rec[4] = r->st_info;
+    rec[5] = r->st_other;
+    for (b = 0; b < 2; ++b) rec[6 + b] = (u8)(r->st_shndx >> (b * 8));
+    for (b = 0; b < 8; ++b) {
+        rec[8 + b]  = (u8)(r->st_value >> (b * 8));
+        rec[16 + b] = (u8)(r->st_size >> (b * 8));
+    }
+    write_bytes(w, rec, sizeof rec);
+}
+
+/* ---- section header layout ---- *
+ *
+ * Per-segment cuts: each kept image segment contributes 1 .text/.rodata
+ * shdr for its file portion, plus a separate .bss shdr for the trailing
+ * NOBITS portion of an RW segment (memsz > filesz). The headers PT_LOAD
+ * contributes a single .note.gnu.build-id shdr. Trailing non-alloc
+ * shdrs: .symtab .strtab .shstrtab (always 3). */
+
+/* Planned output section header: one record per section that will get
+ * an entry in the emitted section header table. */
+typedef struct OutShdr {
+ u32 shdr_idx; /* 1-based; assigned during planning */
+ LinkSegmentId segment_id; /* image segment this shdr is cut from */
+ Sym name;
+ u16 sem; /* SecSem from source LinkSection */
+ u32 flags; /* SF_* from source LinkSection */
+ u32 align;
+ u64 vaddr; /* start address; sym_shndx_for matches symbols against [vaddr, vaddr+size] */
+ u64 file_offset;
+ u64 size;
+ int is_nobits; /* nonzero for a NOBITS shdr -- presumably the .bss tail with no file bytes; confirm against the planner */
+} OutShdr;
+
+/* Maps a LinkSymbol to the st_shndx of the output section header it
+ * belongs to.
+ *
+ * Undefined symbols yield SHN_UNDEF; SK_ABS and SK_FILE yield SHN_ABS;
+ * SK_COMMON yields SHN_COMMON. Any other symbol is located by address
+ * in outshdrs[0..noutshdr):
+ *   1. first, a shdr whose half-open range [vaddr, vaddr+size) contains
+ *      s->vaddr;
+ *   2. failing that, a shdr whose end (vaddr+size) equals s->vaddr, so
+ *      end-of-section boundary symbols and symbols in zero-sized
+ *      sections still resolve.
+ * Containment is tried before the upper-edge match so that a symbol at
+ * the first byte of a section is not attributed to an adjacent
+ * preceding section that happens to end at the same address.
+ * Returns SHN_ABS when no shdr matches. */
+static u16 sym_shndx_for(const LinkSymbol* s,
+                         const OutShdr* outshdrs, u32 noutshdr)
+{
+    u64 v;
+    u32 i;
+    if (!s->defined) return SHN_UNDEF;
+    if (s->kind == SK_ABS) return SHN_ABS;
+    if (s->kind == SK_FILE) return SHN_ABS;
+    if (s->kind == SK_COMMON) return SHN_COMMON;
+    v = s->vaddr;
+    /* Pass 1: strict containment. */
+    for (i = 0; i < noutshdr; ++i) {
+        u64 lo = outshdrs[i].vaddr;
+        if (v >= lo && v - lo < outshdrs[i].size)
+            return (u16)outshdrs[i].shdr_idx;
+    }
+    /* Pass 2: boundary symbols sitting exactly at a section's end. */
+    for (i = 0; i < noutshdr; ++i) {
+        if (v == outshdrs[i].vaddr + outshdrs[i].size)
+            return (u16)outshdrs[i].shdr_idx;
}
+    return SHN_ABS;
+}
+
+/* Translates a section semantic (SSEM_*) to the ELF section type
+ * (SHT_*). Semantics without a dedicated mapping are emitted as
+ * SHT_PROGBITS. */
+static u32 sec_sem_to_sht(u16 sem)
+{
+    if (sem == SSEM_NOBITS)        return SHT_NOBITS;
+    if (sem == SSEM_NOTE)          return SHT_NOTE;
+    if (sem == SSEM_INIT_ARRAY)    return SHT_INIT_ARRAY;
+    if (sem == SSEM_FINI_ARRAY)    return SHT_FINI_ARRAY;
+    if (sem == SSEM_PREINIT_ARRAY) return SHT_PREINIT_ARRAY;
+    /* SSEM_PROGBITS and any unrecognized semantic. */
+    return SHT_PROGBITS;
+}
+
+/* Translates the linker's SF_* section flag bits into the matching
+ * ELF SHF_* bits; SF_* bits without a listed counterpart are dropped. */
+static u64 sec_flags_to_shf(u32 flags)
+{
+    return ((flags & SF_ALLOC)      ? (u64)SHF_ALLOC      : 0)
+         | ((flags & SF_EXEC)       ? (u64)SHF_EXECINSTR  : 0)
+         | ((flags & SF_WRITE)      ? (u64)SHF_WRITE      : 0)
+         | ((flags & SF_TLS)        ? (u64)SHF_TLS        : 0)
+         | ((flags & SF_MERGE)      ? (u64)SHF_MERGE      : 0)
+         | ((flags & SF_STRINGS)    ? (u64)SHF_STRINGS    : 0)
+         | ((flags & SF_LINK_ORDER) ? (u64)SHF_LINK_ORDER : 0)
+         | ((flags & SF_RETAIN)     ? (u64)SHF_GNU_RETAIN : 0);
}
void link_emit_elf_aarch64(LinkImage* img, Writer* w)
{
- Ehdr64 ehdr;
- Phdr64* phdrs;
- Heap* heap = img->heap;
- u32 nphdr_total = img->nsegments + 1u; /* +1 for the headers PT_LOAD */
- u64 headers_size = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64);
- u64 headers_load = align_up_u64(headers_size, PAGE_SIZE);
- u64 cur_off;
- u32 i;
+ Heap* heap = img->heap;
+ Compiler* c = img->c;
- if (img->c->target.arch != CFREE_ARCH_ARM_64) {
- compiler_panic(img->c, no_loc(),
+ if (c->target.arch != CFREE_ARCH_ARM_64)
+ compiler_panic(c, no_loc(),
"link_emit_elf: only AArch64 is implemented");
- }
- if (img->entry_sym == LINK_SYM_NONE) {
- compiler_panic(img->c, no_loc(),
+ if (img->entry_sym == LINK_SYM_NONE)
+ compiler_panic(c, no_loc(),
"link_emit_elf: no resolved entry symbol");
- }
+ /* IFUNC trampolines are wired up in layout_iplt and pre-resolved
+ * by the JIT at load time. The ELF emit path can't run resolvers
+ * itself (cross-compile, no in-process target code), so it would
+ * need a startup init routine that walks img->iplt_pairs. Until
+ * that lands, refuse rather than emit a binary whose iplt slots
+ * stay zero and trap on first call. */
+ if (img->niplt > 0)
+ compiler_panic(c, no_loc(),
+ "link_emit_elf: STT_GNU_IFUNC in ELF output is not "
+ "yet supported (JIT path works); the iplt slots "
+ "need a startup init routine");
- /* Reserve headers_load bytes at the front of the image: bump every
- * vaddr/file_offset on segments/sections/relocs/symbols. */
- shift_image_addresses(img, headers_load);
+ /* ---- plan number of program headers ----
+ *
+ * 1 headers PT_LOAD + nsegments PT_LOAD + 1 PT_NOTE (build-id)
+ * + 1 PT_TLS when this image carries any TLS sections. */
+ u32 has_tls = img->tls_memsz ? 1u : 0u;
+ u32 nphdr_total = 1u + img->nsegments + 1u + has_tls;
+ u64 headers_size = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64)
+ + BUILD_ID_NOTE_BYTES;
+ u64 headers_load = align_up_u64(headers_size, PAGE_SIZE);
+
+ /* The build-id note lives inside the headers PT_LOAD at this offset. */
+ u64 build_id_off = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64);
+ u64 build_id_addr = IMAGE_BASE + build_id_off;
- /* All addresses are now final (image-relative + headers_load).
- * Apply relocations into the segment buffers. */
+    /* ---- shift image addresses, apply relocations ----
+     *
+     * Must happen before outshdrs/symtab construction so they observe
+     * post-shift vaddrs (the values that will land in the file). */
+ shift_image_addresses(img, headers_load);
apply_all_relocs(img);
- /* Build program headers.
- * Phdr 0 is the read-only headers segment mapped at IMAGE_BASE.
- * Phdrs 1..nsegments are the kept segments. */
- phdrs = (Phdr64*)heap->alloc(heap, sizeof(Phdr64) * nphdr_total,
- _Alignof(Phdr64));
- if (!phdrs) compiler_panic(img->c, no_loc(),
- "link_emit_elf: oom on phdrs");
- memset(phdrs, 0, sizeof(Phdr64) * nphdr_total);
+ /* ---- compute build-id (post-reloc, deterministic) ---- */
+ u8 build_id[BUILD_ID_DESC_LEN];
+ compute_build_id(img, build_id);
- phdrs[0].p_type = PT_LOAD;
- phdrs[0].p_flags = PF_R;
- phdrs[0].p_offset = 0;
- phdrs[0].p_vaddr = IMAGE_BASE;
- phdrs[0].p_paddr = IMAGE_BASE;
- phdrs[0].p_filesz = headers_size;
- phdrs[0].p_memsz = headers_size;
- phdrs[0].p_align = PAGE_SIZE;
+    /* ---- plan section headers covering loaded segments ----
+     *
+     * Worst case: one output shdr per LinkSection (nothing merges).
+     * shdr indices: 0=NULL, 1..noutshdr=these, then
+     * build-id/symtab/strtab/shstrtab.
+     */
+ /* Walk img->sections sorted by (segment_id, vaddr) and merge into
+ * one OutShdr per (segment_id, name) run. layout already places
+ * same-name sections adjacent within a segment, so a stable
+ * by-vaddr sort followed by run-length grouping captures it. */
+ OutShdr* outshdrs;
+ u32 noutshdr = 0;
+ u32 outshdr_cap = img->nsections + 1u;
+ outshdrs = (OutShdr*)heap->alloc(heap, sizeof(*outshdrs) * outshdr_cap,
+ _Alignof(OutShdr));
+ if (!outshdrs) compiler_panic(c, no_loc(),
+ "link_emit_elf: oom on outshdrs");
+ memset(outshdrs, 0, sizeof(*outshdrs) * outshdr_cap);
+ {
+ /* Build a sort index over LinkSection ids by (segment_id, vaddr). */
+ u32* order = (u32*)heap->alloc(heap, sizeof(u32) * (img->nsections + 1u),
+ _Alignof(u32));
+ if (!order && img->nsections)
+ compiler_panic(c, no_loc(), "link_emit_elf: oom on shdr sort");
+ u32 i, j;
+ for (i = 0; i < img->nsections; ++i) order[i] = i;
+ /* Insertion sort — section count is small. */
+ for (i = 1; i < img->nsections; ++i) {
+ u32 cur = order[i];
+ const LinkSection* a = &img->sections[cur];
+ j = i;
+ while (j > 0) {
+ const LinkSection* b = &img->sections[order[j - 1]];
+ if ((b->segment_id < a->segment_id) ||
+ (b->segment_id == a->segment_id && b->vaddr <= a->vaddr))
+ break;
+ order[j] = order[j - 1];
+ --j;
+ }
+ order[j] = cur;
+ }
+ for (i = 0; i < img->nsections; ++i) {
+ const LinkSection* ls = &img->sections[order[i]];
+ OutShdr* tail = noutshdr ? &outshdrs[noutshdr - 1] : NULL;
+ int merge = tail
+ && tail->segment_id == ls->segment_id
+ && tail->name == ls->name
+ && tail->is_nobits == (ls->sem == SSEM_NOBITS);
+ if (merge) {
+ u64 end = ls->vaddr + ls->size;
+ u64 prev_end = tail->vaddr + tail->size;
+ if (end > prev_end) tail->size = end - tail->vaddr;
+ if (ls->align > tail->align) tail->align = ls->align;
+ } else {
+ OutShdr* o = &outshdrs[noutshdr];
+ o->shdr_idx = 1u + noutshdr;
+ o->segment_id = ls->segment_id;
+ o->name = ls->name;
+ o->sem = ls->sem;
+ o->flags = ls->flags;
+ o->align = ls->align;
+ o->vaddr = ls->vaddr;
+ o->file_offset = ls->file_offset;
+ o->size = ls->size;
+ o->is_nobits = (ls->sem == SSEM_NOBITS);
+ noutshdr++;
+ }
+ }
+ heap->free(heap, order, sizeof(u32) * (img->nsections + 1u));
+ }
- for (i = 0; i < img->nsegments; ++i) {
- const LinkSegment* seg = &img->segments[i];
- Phdr64* p = &phdrs[i + 1];
- p->p_type = PT_LOAD;
- p->p_flags = perms_to_pflags(seg->flags);
- p->p_offset = seg->file_offset;
- p->p_vaddr = IMAGE_BASE + seg->vaddr;
- p->p_paddr = IMAGE_BASE + seg->vaddr;
- p->p_filesz = seg->file_size;
- p->p_memsz = seg->mem_size;
- p->p_align = seg->align ? seg->align : PAGE_SIZE;
+ /* ---- build .shstrtab ---- */
+ StrBuilder shstrtab;
+ strb_init(&shstrtab, heap, 128);
+ u32 sh_name_symtab = strb_add_cstr(&shstrtab, ".symtab");
+ u32 sh_name_strtab = strb_add_cstr(&shstrtab, ".strtab");
+ u32 sh_name_shstrtab = strb_add_cstr(&shstrtab, ".shstrtab");
+ u32 sh_name_buildid = strb_add_cstr(&shstrtab, ".note.gnu.build-id");
+ /* Per-output-shdr names — interned strings from input section names. */
+ u32* outshdr_name_off = (u32*)heap->alloc(heap, sizeof(u32) * (noutshdr + 1u),
+ _Alignof(u32));
+ if (!outshdr_name_off && noutshdr)
+ compiler_panic(c, no_loc(), "link_emit_elf: oom on shdr name table");
+ {
+ u32 i;
+ for (i = 0; i < noutshdr; ++i) {
+ const OutShdr* o = &outshdrs[i];
+ if (o->name) {
+ size_t nlen;
+ const char* nm = pool_str(c->global, o->name, &nlen);
+ outshdr_name_off[i] = nm && nlen
+ ? strb_add(&shstrtab, nm, (u32)nlen) : 0;
+ } else {
+ outshdr_name_off[i] = 0;
+ }
+ }
+ }
+
+ u32 nshdr = 1u + noutshdr + 4u;
+ u32 shndx_buildid = 1u + noutshdr;
+ u32 shndx_symtab = shndx_buildid + 1u;
+ u32 shndx_strtab = shndx_symtab + 1u;
+ u32 shndx_shstrtab = shndx_strtab + 1u;
+
+ /* ---- build .symtab + .strtab ----
+ *
+ * Two passes (locals first, then globals/weaks). Slot 0 is
+ * STN_UNDEF. Globals are deduped via img->globals — only the
+ * canonical entry per name is emitted, since per-input undef
+ * records keep their own LinkSymId after resolve_undefs's
+ * "copy fields from canonical def" step. sh_info = first non-local
+ * idx. */
+ StrBuilder strtab;
+ strb_init(&strtab, heap, 256);
+
+ SymRec* recs = (SymRec*)heap->alloc(heap,
+ sizeof(*recs) * (img->nsyms + 1u),
+ _Alignof(SymRec));
+ if (!recs) compiler_panic(c, no_loc(), "link_emit_elf: oom on symrecs");
+ u32 nsyms_emit = 0;
+ u32 first_global_idx;
+ memset(&recs[nsyms_emit++], 0, sizeof(*recs)); /* slot 0 */
+ first_global_idx = nsyms_emit;
+
+ {
+ u32 pass, i;
+ for (pass = 0; pass < 2; ++pass) {
+ int want_local = (pass == 0);
+ if (!want_local) first_global_idx = nsyms_emit;
+ for (i = 0; i < img->nsyms; ++i) {
+ const LinkSymbol* s = &img->syms[i];
+ int is_local = (s->bind == SB_LOCAL);
+ size_t namelen = 0;
+ const char* nm;
+ u8 st_type, st_bind;
+ u16 shndx;
+ u64 st_value;
+ SymRec* r;
+ if (want_local != is_local) continue;
+ if (s->name == 0 && s->kind != SK_FILE) continue;
+ /* Dedupe globals: per-input undef-of-X and the canonical
+ * def-of-X are separate img->syms entries (resolve_undefs
+ * mirrors fields onto the undef). Only the canonical
+ * (first registered) entry is in img->globals. Skip the
+ * shadow copies. */
+ if (!is_local && s->name) {
+ LinkSymId canonical = symhash_get(&img->globals, s->name);
+ if (canonical != LINK_SYM_NONE && canonical != s->id)
+ continue;
+ }
+ nm = s->name ? pool_str(c->global, s->name, &namelen) : "";
+ shndx = sym_shndx_for(s, outshdrs, noutshdr);
+ /* st_value: in ET_EXEC, defined non-ABS symbols carry
+ * absolute virtual addresses (IMAGE_BASE + image
+ * vaddr); ABS symbols carry their own value verbatim. */
+ if (s->kind == SK_FILE) st_value = 0;
+ else if (s->kind == SK_ABS) st_value = s->vaddr;
+ else if (s->defined) st_value = IMAGE_BASE + s->vaddr;
+ else st_value = 0;
+ st_type = sym_kind_to_st_type(s->kind);
+ st_bind = sym_bind_to_st_bind(s->bind);
+ r = &recs[nsyms_emit++];
+ memset(r, 0, sizeof(*r));
+ r->st_name = (nm && namelen)
+ ? strb_add(&strtab, nm, (u32)namelen) : 0;
+ r->st_info = ELF64_ST_INFO(st_bind, st_type);
+ r->st_other = STV_DEFAULT;
+ r->st_shndx = shndx;
+ r->st_value = st_value;
+ r->st_size = s->size;
+ }
+ }
+ }
+
+ /* ---- compute file offsets for trailing non-alloc sections ---- */
+ /* End of segment data: the highest (file_offset + file_size) across
+ * loaded segments. */
+ u64 end_of_segs = headers_load;
+ {
+ u32 i;
+ for (i = 0; i < img->nsegments; ++i) {
+ const LinkSegment* seg = &img->segments[i];
+ u64 e = seg->file_offset + seg->file_size;
+ if (e > end_of_segs) end_of_segs = e;
+ }
+ }
+ u64 symtab_off = align_up_u64(end_of_segs, 8u);
+ u64 symtab_size = (u64)ELF64_SYM_SIZE * nsyms_emit;
+ u64 strtab_off = symtab_off + symtab_size;
+ u64 strtab_size = strtab.len;
+ u64 shstrtab_off = strtab_off + strtab_size;
+ u64 shstrtab_size = shstrtab.len;
+ u64 shdr_off = align_up_u64(shstrtab_off + shstrtab_size, 8u);
+
+ /* ---- build phdrs ---- */
+ Phdr64* phdrs = (Phdr64*)heap->alloc(heap, sizeof(Phdr64) * nphdr_total,
+ _Alignof(Phdr64));
+ if (!phdrs) compiler_panic(c, no_loc(), "link_emit_elf: oom on phdrs");
+ memset(phdrs, 0, sizeof(Phdr64) * nphdr_total);
+ {
+ u32 pi = 0;
+ /* Headers PT_LOAD (covers ehdr + phdrs + build-id note). */
+ phdrs[pi].p_type = PT_LOAD;
+ phdrs[pi].p_flags = PF_R;
+ phdrs[pi].p_offset = 0;
+ phdrs[pi].p_vaddr = IMAGE_BASE;
+ phdrs[pi].p_paddr = IMAGE_BASE;
+ phdrs[pi].p_filesz = headers_size;
+ phdrs[pi].p_memsz = headers_size;
+ phdrs[pi].p_align = PAGE_SIZE;
+ pi++;
+ /* Per-segment PT_LOAD. */
+ u32 i;
+ for (i = 0; i < img->nsegments; ++i) {
+ const LinkSegment* seg = &img->segments[i];
+ Phdr64* p = &phdrs[pi++];
+ p->p_type = PT_LOAD;
+ p->p_flags = perms_to_pflags(seg->flags);
+ p->p_offset = seg->file_offset;
+ p->p_vaddr = IMAGE_BASE + seg->vaddr; /* post-shift */
+ p->p_paddr = p->p_vaddr;
+ p->p_filesz = seg->file_size;
+ p->p_memsz = seg->mem_size;
+ p->p_align = seg->align ? seg->align : PAGE_SIZE;
+ }
+ /* PT_NOTE for build-id. */
+ phdrs[pi].p_type = PT_NOTE;
+ phdrs[pi].p_flags = PF_R;
+ phdrs[pi].p_offset = build_id_off;
+ phdrs[pi].p_vaddr = build_id_addr;
+ phdrs[pi].p_paddr = build_id_addr;
+ phdrs[pi].p_filesz = BUILD_ID_NOTE_BYTES;
+ phdrs[pi].p_memsz = BUILD_ID_NOTE_BYTES;
+ phdrs[pi].p_align = 4;
+ pi++;
+ /* PT_TLS describing the .tdata template + .tbss zero-fill.
+ * vaddr/file_offset point at the same bytes the matching
+ * PT_LOAD already covers — the loader uses PT_TLS to size
+ * each thread's TLS block and to seed it from .tdata. */
+ if (has_tls) {
+ phdrs[pi].p_type = PT_TLS;
+ phdrs[pi].p_flags = PF_R;
+ phdrs[pi].p_offset = img->tls_vaddr - headers_load
+ + headers_load; /* image-relative
+ * file offset
+ * matches vaddr
+ * after shift */
+ /* tls_vaddr was bumped by shift_image_addresses; we want
+ * the file offset to track segment file_offset, which
+ * equals the segment's vaddr (post-shift) by construction. */
+ phdrs[pi].p_offset = img->tls_vaddr;
+ phdrs[pi].p_vaddr = IMAGE_BASE + img->tls_vaddr;
+ phdrs[pi].p_paddr = phdrs[pi].p_vaddr;
+ phdrs[pi].p_filesz = img->tls_filesz;
+ phdrs[pi].p_memsz = img->tls_memsz;
+ phdrs[pi].p_align = img->tls_align ? img->tls_align : 1u;
+ pi++;
+ }
+ (void)pi;
}
- /* Build ehdr. */
+ /* ---- build ehdr ---- */
+ Ehdr64 ehdr;
memset(&ehdr, 0, sizeof(ehdr));
ehdr.e_ident[0] = ELFMAG0;
ehdr.e_ident[1] = ELFMAG1;
@@ -219,40 +747,156 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w)
ehdr.e_ident[4] = ELFCLASS64;
ehdr.e_ident[5] = ELFDATA2LSB;
ehdr.e_ident[6] = EV_CURRENT;
- ehdr.e_ident[7] = ELFOSABI_SYSV;
+ ehdr.e_ident[7] = ELFOSABI_NONE;
ehdr.e_type = ET_EXEC;
ehdr.e_machine = EM_AARCH64;
ehdr.e_version = EV_CURRENT;
ehdr.e_entry = IMAGE_BASE + img->syms[img->entry_sym - 1].vaddr;
ehdr.e_phoff = sizeof(Ehdr64);
- ehdr.e_shoff = 0;
+ ehdr.e_shoff = shdr_off;
ehdr.e_flags = 0;
ehdr.e_ehsize = sizeof(Ehdr64);
ehdr.e_phentsize = sizeof(Phdr64);
ehdr.e_phnum = (u16)nphdr_total;
- ehdr.e_shentsize = sizeof(struct { u32 a, b, c, d; }); /* unused */
- ehdr.e_shentsize = 0;
- ehdr.e_shnum = 0;
- ehdr.e_shstrndx = 0;
-
- /* Write to the Writer in file order: ehdr, phdrs, pad to first
- * segment's file_offset, segment bytes (each padded to its
- * own file_offset). */
+ ehdr.e_shentsize = sizeof(Shdr64);
+ ehdr.e_shnum = (u16)nshdr;
+ ehdr.e_shstrndx = (u16)shndx_shstrtab;
+
+ /* ---- write ehdr, phdrs, build-id note, pad ---- */
+ u64 cur_off;
write_bytes(w, &ehdr, sizeof(ehdr));
write_bytes(w, phdrs, sizeof(Phdr64) * nphdr_total);
cur_off = sizeof(ehdr) + sizeof(Phdr64) * nphdr_total;
- /* Pad up to the first segment. */
- for (i = 0; i < img->nsegments; ++i) {
- const LinkSegment* seg = &img->segments[i];
- if (seg->file_size == 0) continue; /* pure-BSS segment */
- if (cur_off < seg->file_offset) {
- write_zeroes(w, (size_t)(seg->file_offset - cur_off));
- cur_off = seg->file_offset;
+ /* .note.gnu.build-id wire format:
+ * u32 namesz = 4 ("GNU\0")
+ * u32 descsz = 16
+ * u32 type = NT_GNU_BUILD_ID (3)
+ * "GNU\0"
+ * <16 bytes of build-id>
+ */
+ {
+ u8 nh[12];
+ u32 v;
+ v = NOTE_NAME_GNU_LEN; nh[0]=(u8)v;nh[1]=(u8)(v>>8);nh[2]=(u8)(v>>16);nh[3]=(u8)(v>>24);
+ v = BUILD_ID_DESC_LEN; nh[4]=(u8)v;nh[5]=(u8)(v>>8);nh[6]=(u8)(v>>16);nh[7]=(u8)(v>>24);
+ v = NOTE_BUILD_ID_TYPE; nh[8]=(u8)v;nh[9]=(u8)(v>>8);nh[10]=(u8)(v>>16);nh[11]=(u8)(v>>24);
+ write_bytes(w, nh, sizeof nh);
+ write_bytes(w, NOTE_NAME_GNU "\0", NOTE_NAME_GNU_LEN);
+ write_bytes(w, build_id, BUILD_ID_DESC_LEN);
+ cur_off += BUILD_ID_NOTE_BYTES;
+ }
+
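+    /* Write each segment's file bytes in order, zero-padding up to its
+     * file_offset first (the first segment's file_offset ==
+     * headers_load). Segments with file_size == 0 are skipped. */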
+ {
+ u32 i;
+ for (i = 0; i < img->nsegments; ++i) {
+ const LinkSegment* seg = &img->segments[i];
+ if (seg->file_size == 0) continue;
+ if (cur_off < seg->file_offset) {
+ write_zeroes(w, (size_t)(seg->file_offset - cur_off));
+ cur_off = seg->file_offset;
+ }
+ write_bytes(w, img->segment_bytes[seg->id - 1],
+ (size_t)seg->file_size);
+ cur_off += seg->file_size;
+ }
+ }
+
+ /* ---- write trailing non-alloc sections ---- */
+ if (cur_off < symtab_off) {
+ write_zeroes(w, (size_t)(symtab_off - cur_off));
+ cur_off = symtab_off;
+ }
+ {
+ u32 i;
+ for (i = 0; i < nsyms_emit; ++i) write_sym_rec(w, &recs[i]);
+ cur_off += symtab_size;
+ }
+ if (strtab.len) {
+ write_bytes(w, strtab.data, strtab.len);
+ cur_off += strtab.len;
+ }
+ if (shstrtab.len) {
+ write_bytes(w, shstrtab.data, shstrtab.len);
+ cur_off += shstrtab.len;
+ }
+
+ /* ---- write section header table ---- */
+ if (cur_off < shdr_off) {
+ write_zeroes(w, (size_t)(shdr_off - cur_off));
+ cur_off = shdr_off;
+ }
+ {
+ Shdr64 sh;
+ u32 i;
+ /* shdr 0: NULL */
+ memset(&sh, 0, sizeof(sh));
+ write_bytes(w, &sh, sizeof(sh));
+ /* per-name output shdrs */
+ for (i = 0; i < noutshdr; ++i) {
+ const OutShdr* o = &outshdrs[i];
+ memset(&sh, 0, sizeof(sh));
+ sh.sh_name = outshdr_name_off[i];
+ sh.sh_type = sec_sem_to_sht(o->sem);
+ sh.sh_flags = sec_flags_to_shf(o->flags);
+ sh.sh_addr = IMAGE_BASE + o->vaddr;
+ sh.sh_offset = o->file_offset;
+ sh.sh_size = o->size;
+ sh.sh_link = 0;
+ sh.sh_info = 0;
+ sh.sh_addralign = o->align ? o->align : 1;
+ sh.sh_entsize = (o->sem == SSEM_INIT_ARRAY ||
+ o->sem == SSEM_FINI_ARRAY ||
+ o->sem == SSEM_PREINIT_ARRAY) ? 8 : 0;
+ write_bytes(w, &sh, sizeof(sh));
}
- write_bytes(w, img->segment_bytes[seg->id - 1], (size_t)seg->file_size);
- cur_off += seg->file_size;
+ /* shdr: .note.gnu.build-id (allocatable; in headers PT_LOAD) */
+ memset(&sh, 0, sizeof(sh));
+ sh.sh_name = sh_name_buildid;
+ sh.sh_type = SHT_NOTE;
+ sh.sh_flags = SHF_ALLOC;
+ sh.sh_addr = build_id_addr;
+ sh.sh_offset = build_id_off;
+ sh.sh_size = BUILD_ID_NOTE_BYTES;
+ sh.sh_addralign = 4;
+ write_bytes(w, &sh, sizeof(sh));
+ /* shdr: .symtab */
+ memset(&sh, 0, sizeof(sh));
+ sh.sh_name = sh_name_symtab;
+ sh.sh_type = SHT_SYMTAB;
+ sh.sh_flags = 0;
+ sh.sh_addr = 0;
+ sh.sh_offset = symtab_off;
+ sh.sh_size = symtab_size;
+ sh.sh_link = shndx_strtab;
+ sh.sh_info = first_global_idx;
+ sh.sh_addralign = 8;
+ sh.sh_entsize = ELF64_SYM_SIZE;
+ write_bytes(w, &sh, sizeof(sh));
+ /* shdr: .strtab */
+ memset(&sh, 0, sizeof(sh));
+ sh.sh_name = sh_name_strtab;
+ sh.sh_type = SHT_STRTAB;
+ sh.sh_offset = strtab_off;
+ sh.sh_size = strtab_size;
+ sh.sh_addralign = 1;
+ write_bytes(w, &sh, sizeof(sh));
+ /* shdr: .shstrtab */
+ memset(&sh, 0, sizeof(sh));
+ sh.sh_name = sh_name_shstrtab;
+ sh.sh_type = SHT_STRTAB;
+ sh.sh_offset = shstrtab_off;
+ sh.sh_size = shstrtab_size;
+ sh.sh_addralign = 1;
+ write_bytes(w, &sh, sizeof(sh));
}
heap->free(heap, phdrs, sizeof(Phdr64) * nphdr_total);
+ heap->free(heap, recs, sizeof(*recs) * (img->nsyms + 1u));
+ heap->free(heap, outshdrs, sizeof(*outshdrs) * outshdr_cap);
+ if (outshdr_name_off)
+ heap->free(heap, outshdr_name_off, sizeof(u32) * (noutshdr + 1u));
+ strb_fini(&strtab);
+ strb_fini(&shstrtab);
}
diff --git a/src/link/link_internal.h b/src/link/link_internal.h
@@ -118,8 +118,29 @@ struct LinkImage {
u32 nrelocs;
u32 relocs_cap;
+ /* IFUNC trampoline table (image-relative vaddrs). One entry per
+ * defined STT_GNU_IFUNC symbol: (resolver_vaddr, slot_vaddr). The
+ * JIT path walks this after applying relocations, calls each
+ * resolver in-process, and stores the result into the slot's write
+ * alias. The ELF emit path uses it to seed a startup init routine
+ * (or panics when the routine is not yet wired in). */
+ u64* iplt_pairs; /* 2 * niplt entries */
+ u32 niplt;
+
LinkSymId entry_sym;
+ /* TLS image span (image-relative). Set when any input contributes
+ * an SF_TLS section. filesz covers the .tdata bytes that initialize
+ * the per-thread block; memsz adds .tbss zero-fill. tls_align is
+ * the natural alignment of the TLS image (max of contributing
+ * sections), distinct from the containing PT_LOAD's page align.
+ * AArch64 ELF ABI: TP-relative offset of a TLS symbol with image
+ * offset `o` is `o + 16` (16-byte TCB ahead of the TLS data). */
+ u64 tls_vaddr;
+ u64 tls_filesz;
+ u64 tls_memsz;
+ u32 tls_align;
+
InputMap* input_maps; /* one per input; indexed by input_id-1 */
u32 ninput_maps;
};
diff --git a/src/link/link_jit.c b/src/link/link_jit.c
@@ -52,6 +52,16 @@ struct CfreeJit {
u32 nsegs;
};
+/* AArch64 ELF ABI: TP points 16 bytes before the TLS image; TLSLE
+ * encodes (target_offset_in_image + 16). */
+#define AARCH64_TCB_SIZE 16ull
+
+/* Returns 1 when `k` is one of the AArch64 TLS local-exec ADD
+ * relocations (TPREL_HI12 / TPREL_LO12_NC), 0 for every other kind. */
+static int reloc_is_tlsle(RelocKind k)
+{
+    switch (k) {
+    case R_AARCH64_TLSLE_ADD_TPREL_HI12:
+    case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
static int perms_for(u32 secflags)
{
int p = CFREE_PROT_READ;
@@ -177,7 +187,12 @@ CfreeJit* cfree_jit_from_image(LinkImage* img)
const LinkSymbol* tgt = &img->syms[r->target - 1];
u64 S, P;
u8* P_bytes;
- if (tgt->kind == SK_ABS) {
+ if (reloc_is_tlsle(r->kind)) {
+ /* TLSLE: S is the TP-relative offset of the target. Both
+ * vaddrs are image-relative, so the runtime alias drops
+ * out and we work in image-space. */
+ S = (tgt->vaddr - img->tls_vaddr) + AARCH64_TCB_SIZE;
+ } else if (tgt->kind == SK_ABS) {
/* extern resolver result OR true absolute symbol — vaddr
* already holds the runtime address. */
S = tgt->vaddr;
@@ -217,6 +232,27 @@ CfreeJit* cfree_jit_from_image(LinkImage* img)
}
}
+ /* IFUNC pre-resolution: now that code is executable and slots are
+ * writable, walk every (resolver_vaddr, slot_vaddr) pair, call the
+ * resolver in-process, and store its return value into the slot's
+ * runtime address. The iplt stub then loads the slot and tail-
+ * calls the chosen implementation on every invocation. */
+ if (img->niplt) {
+ typedef void* (*ResolverFn)(void);
+ for (i = 0; i < img->niplt; ++i) {
+ u64 resolver_vaddr = img->iplt_pairs[2u * i + 0];
+ u64 slot_vaddr = img->iplt_pairs[2u * i + 1];
+ uintptr_t resolver_rt = vaddr_to_runtime(img, segs, resolver_vaddr);
+ uintptr_t slot_rt = vaddr_to_runtime(img, segs, slot_vaddr);
+ void* impl;
+ if (!resolver_rt || !slot_rt)
+ compiler_panic(c, no_loc(),
+ "cfree_jit: iplt vaddr does not map to runtime");
+ impl = ((ResolverFn)resolver_rt)();
+ *(void**)(uintptr_t)slot_rt = impl;
+ }
+ }
+
jit = (CfreeJit*)heap->alloc(heap, sizeof(*jit), _Alignof(CfreeJit));
if (!jit) {
for (i = 0; i < img->nsegments; ++i)
diff --git a/src/link/link_layout.c b/src/link/link_layout.c
@@ -34,12 +34,17 @@ static u64 layout_page_size(Linker* l)
return 0x4000u;
}
-/* Three-bucket segment partitioning by permission. */
+/* Four-bucket segment partitioning by permission. TLS sections live
+ * in their own bucket: they're emitted as a PT_LOAD (so the kernel
+ * maps the .tdata template) and then referenced by a PT_TLS phdr;
+ * symbols in TLS sections need separate vaddr-to-offset arithmetic
+ * for TLSLE relocs. */
typedef enum SegBucket {
- SEG_RX = 0, /* SF_ALLOC | SF_EXEC */
- SEG_R = 1, /* SF_ALLOC, no EXEC, no WRITE */
- SEG_RW = 2, /* SF_ALLOC | SF_WRITE (incl. BSS) */
- SEG_NBUCKETS = 3,
+ SEG_RX = 0, /* SF_ALLOC | SF_EXEC */
+ SEG_R = 1, /* SF_ALLOC, no EXEC, no WRITE */
+ SEG_RW = 2, /* SF_ALLOC | SF_WRITE (incl. BSS) */
+ SEG_TLS = 3, /* SF_ALLOC | SF_TLS (.tdata + .tbss) */
+ SEG_NBUCKETS = 4,
} SegBucket;
static u64 align_up_u64(u64 v, u64 a) { return (v + (a - 1u)) & ~(a - 1u); }
@@ -57,6 +62,7 @@ static int section_kept(const Section* s)
static SegBucket bucket_for(u16 flags)
{
+ if (flags & SF_TLS) return SEG_TLS;
if (flags & SF_EXEC) return SEG_RX;
if (flags & SF_WRITE) return SEG_RW;
return SEG_R;
@@ -700,13 +706,13 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g)
}
}
- /* Three segment buckets; tracks per-bucket size during scan and
+ /* Four segment buckets; tracks per-bucket size during scan and
* per-section file_offset within the bucket. */
u64 seg_size[SEG_NBUCKETS] = {0};
- u32 seg_align[SEG_NBUCKETS] = {1, 1, 1};
+ u32 seg_align[SEG_NBUCKETS] = {1, 1, 1, 1};
u32 seg_count[SEG_NBUCKETS] = {0};
- /* For BSS: track separately to set mem_size > file_size. We track
- * trailing nobits per bucket — only SEG_RW gets BSS in practice. */
+ /* Track trailing NOBITS so segment mem_size > file_size: SEG_RW
+ * for .bss / COMMON, SEG_TLS for .tbss. */
u64 seg_bss_extra[SEG_NBUCKETS] = {0};
/* Pass 2: place sections, grouped by name within each bucket and
@@ -760,6 +766,8 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g)
ls->size = (s->sem == SSEM_NOBITS) ? s->bss_size : s->bytes.total;
ls->flags = s->flags;
ls->align = align;
+ ls->name = s->name;
+ ls->sem = s->sem;
ls->segment_id = (LinkSegmentId)(bucket + 1u); /* 1..3 sentinel */
m->section[pe->obj_sec_id] = lsid;
pe->placed = 1;
@@ -804,9 +812,11 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g)
for (b = 0; b < SEG_NBUCKETS; ++b) {
LinkSegment* seg;
u64 file_size, mem_size, align;
+ u32 nat_align;
u32 perms;
if (!seg_count[b]) continue;
- align = (u64)seg_align[b];
+ nat_align = seg_align[b] ? seg_align[b] : 1u;
+ align = (u64)nat_align;
if (align < layout_page_size(l)) align = layout_page_size(l);
cursor = align_up_u64(cursor, align);
@@ -814,8 +824,9 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g)
file_size = seg_size[b];
mem_size = seg_size[b] + seg_bss_extra[b];
perms = SF_ALLOC;
- if (b == SEG_RX) perms |= SF_EXEC;
- if (b == SEG_RW) perms |= SF_WRITE;
+ if (b == SEG_RX) perms |= SF_EXEC;
+ if (b == SEG_RW) perms |= SF_WRITE;
+ if (b == SEG_TLS) perms |= SF_TLS;
memset(seg, 0, sizeof(*seg));
seg->id = (LinkSegmentId)(img->nsegments + 1u);
@@ -827,6 +838,16 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g)
seg->align = (u32)align;
seg->nsections = seg_count[b];
bucket_seg[b] = seg->id;
+ if (b == SEG_TLS) {
+ /* Record TLS image span for PT_TLS emission and TLSLE
+ * reloc apply. tls_align preserves the natural section
+ * alignment (PT_TLS p_align), distinct from the
+ * containing PT_LOAD's page align. */
+ img->tls_vaddr = cursor;
+ img->tls_filesz = file_size;
+ img->tls_memsz = mem_size;
+ img->tls_align = nat_align;
+ }
cursor += mem_size;
img->nsegments++;
}
@@ -1090,6 +1111,47 @@ static void emit_array_boundaries(Linker* l, LinkImage* img)
emit_boundary_sym(l, img, "__fini_array_end", fini_end);
}
+/* Publish the three symbols a freestanding _start needs to build the
+ * per-thread TLS block:
+ *   __tdata_start / __tdata_end : image vaddrs bracketing the .tdata
+ *                                 template, i.e. the file-backed part
+ *                                 of the TLS segment (copy source).
+ *   __tbss_size                 : SK_ABS symbol whose value is the
+ *                                 .tbss byte count (memsz - filesz),
+ *                                 zero-filled after the .tdata copy.
+ * The symbols are emitted unconditionally; with no TLS in the image
+ * the tls_* fields are zero, so all three resolve to zero and the
+ * _start TLS prologue has nothing to do. */
+static void emit_tls_boundaries(Linker* l, LinkImage* img)
+{
+    const Sym tbss_name = pool_intern_cstr(l->c->global, "__tbss_size");
+    const u64 tdata_lo  = img->tls_vaddr;
+    const u64 tdata_hi  = img->tls_vaddr + img->tls_filesz;
+    LinkSymbol abs_sym;
+    LinkSymId  existing;
+
+    emit_boundary_sym(l, img, "__tdata_start", tdata_lo);
+    emit_boundary_sym(l, img, "__tdata_end",   tdata_hi);
+
+    /* __tbss_size carries a count rather than an address. Making it
+     * SK_ABS keeps shift_image_addresses from relocating it, so
+     * `(size_t)__tbss_size` reads back the size itself. */
+    memset(&abs_sym, 0, sizeof(abs_sym));
+    abs_sym.name    = tbss_name;
+    abs_sym.kind    = SK_ABS;
+    abs_sym.bind    = SB_GLOBAL;
+    abs_sym.defined = 1;
+    abs_sym.vaddr   = img->tls_memsz - img->tls_filesz;
+
+    existing = symhash_get(&img->globals, tbss_name);
+    if (existing == LINK_SYM_NONE) {
+        /* Not referenced by any input yet: register a new global. */
+        LinkSymId fresh = append_symbol(img, &abs_sym);
+        symhash_insert(&img->globals, tbss_name, fresh, &existing);
+        return;
+    }
+    /* Already known under this name: overwrite the record in place,
+     * keeping its id. */
+    abs_sym.id = existing;
+    img->syms[existing - 1] = abs_sym;
+}
+
/* ---- pass 3c: __start_<X>/__stop_<X> encoding-section boundaries ----
*
* For every undef LinkSymbol whose name is __start_<X>/__stop_<X> with
@@ -1165,6 +1227,8 @@ static u8 reloc_width(RelocKind k)
case R_AARCH64_LDST128_ABS_LO12_NC:
case R_AARCH64_ADR_GOT_PAGE:
case R_AARCH64_LD64_GOT_LO12_NC:
+ case R_AARCH64_TLSLE_ADD_TPREL_HI12:
+ case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
return 4;
default:
return 0;
@@ -1405,6 +1469,8 @@ static void layout_got(Linker* l, LinkImage* img, LinkSymId** got_map_out)
gotsec->size = got_size;
gotsec->flags = SF_ALLOC | SF_WRITE;
gotsec->align = 8;
+ gotsec->name = pool_intern_cstr(img->c->global, ".got");
+ gotsec->sem = SSEM_PROGBITS;
img->nsections++;
/* Pass D: per slot, synthesize a LinkSymbol and emit the R_ABS64
@@ -1448,6 +1514,269 @@ static void layout_got(Linker* l, LinkImage* img, LinkSymId** got_map_out)
*got_map_out = got_map;
}
+/* ---- pass 3d: STT_GNU_IFUNC trampoline (.iplt + .igot.plt) ----
+ *
+ * Per defined SK_IFUNC symbol we synthesize:
+ * - A 12-byte stub in a fresh RX segment (.iplt): three AArch64
+ * instructions that load an 8-byte function pointer and tail-call
+ * it. Encoded as ADRP x16 / LDR x16,[x16] / BR x16 with the
+ * ADR_PREL_PG_HI21 + LDST64_ABS_LO12_NC immediates left zero —
+ * the existing reloc machinery patches them against a synthetic
+ * LinkSymbol whose vaddr is the matching slot.
+ * - An 8-byte slot in a fresh RW segment (.igot.plt), zero-initialized.
+ *
+ * The IFUNC LinkSymbol's vaddr is then redirected to the stub, and a
+ * pair (resolver_vaddr, slot_vaddr) is appended to img->iplt_pairs so
+ * the JIT path can fill the slot in-process after relocation apply.
+ * The ELF emit path needs target-arch code to call the resolvers and
+ * is not wired up yet — see link_emit_elf_aarch64.
+ *
+ * Invariant: runs after link_symbols_to_sections so the resolver's
+ * vaddr is final, and before emit_reloc_records so the synthesized
+ * stub relocs ride the same apply path as ordinary input relocs. */
+
+static void wr_u32_le(u8* p, u32 v)
+{
+    u32 k;
+    /* Little-endian store: byte k receives bits [8k+7 : 8k] of v. */
+    for (k = 0; k < 4u; ++k)
+        p[k] = (u8)((v >> (8u * k)) & 0xffu);
+}
+
+static void layout_iplt(Linker* l, LinkImage* img)
+{
+    Heap* h = img->heap;
+    u32 i;
+    u32 nifunc = 0;
+    u64 page;
+    u64 base_vaddr = 0;
+    u64 iplt_vaddr, igot_vaddr;
+    u64 iplt_size, igot_size;
+    u32 iplt_seg_idx, igot_seg_idx;
+    LinkSegment* iplt_seg;
+    LinkSegment* igot_seg;
+    LinkSection* iplt_sec;
+    LinkSection* igot_sec;
+    u8* iplt_bytes;
+    u32 slot_idx;
+
+    /* Pass A: count defined IFUNCs. */
+    for (i = 0; i < img->nsyms; ++i) {
+        const LinkSymbol* s = &img->syms[i];
+        if (s->kind == SK_IFUNC && s->defined) ++nifunc;
+    }
+    if (nifunc == 0) return;
+
+    page = layout_page_size(l);
+
+    /* Pick a base vaddr after every existing segment. Two fresh
+     * segments are appended: one RX for stubs, one RW for slots. */
+    for (i = 0; i < img->nsegments; ++i) {
+        u64 end = img->segments[i].vaddr + img->segments[i].mem_size;
+        if (end > base_vaddr) base_vaddr = end;
+    }
+    base_vaddr = align_up_u64(base_vaddr, page);
+    iplt_vaddr = base_vaddr;
+    iplt_size = (u64)nifunc * 12u; /* three 4-byte instructions per stub */
+    igot_vaddr = align_up_u64(iplt_vaddr + iplt_size, page);
+    igot_size = (u64)nifunc * 8u; /* one 8-byte pointer slot per stub */
+
+    /* Grow segment / segment-bytes arrays by 2. */
+    {
+        u32 new_nseg = img->nsegments + 2u;
+        LinkSegment* nsegs = (LinkSegment*)h->realloc(
+            h, img->segments,
+            sizeof(*img->segments) * img->nsegments,
+            sizeof(*img->segments) * new_nseg, _Alignof(LinkSegment));
+        u8** nsbufs = (u8**)h->realloc(
+            h, img->segment_bytes,
+            sizeof(*img->segment_bytes) * img->nsegments,
+            sizeof(*img->segment_bytes) * new_nseg, _Alignof(u8*));
+        size_t* nscaps = (size_t*)h->realloc(
+            h, img->segment_bytes_cap,
+            sizeof(*img->segment_bytes_cap) * img->nsegments,
+            sizeof(*img->segment_bytes_cap) * new_nseg, _Alignof(size_t));
+        if (!nsegs || !nsbufs || !nscaps)
+            compiler_panic(img->c, no_loc(), "link: oom on iplt segments");
+        img->segments = nsegs;
+        img->segment_bytes = nsbufs;
+        img->segment_bytes_cap = nscaps;
+    }
+
+    iplt_seg_idx = img->nsegments;
+    iplt_seg = &img->segments[iplt_seg_idx];
+    memset(iplt_seg, 0, sizeof(*iplt_seg));
+    iplt_seg->id = (LinkSegmentId)(iplt_seg_idx + 1u);
+    iplt_seg->flags = SF_ALLOC | SF_EXEC;
+    iplt_seg->file_offset = iplt_vaddr;
+    iplt_seg->vaddr = iplt_vaddr;
+    iplt_seg->file_size = iplt_size;
+    iplt_seg->mem_size = iplt_size;
+    iplt_seg->align = (u32)page;
+    iplt_seg->nsections = 1;
+    img->segment_bytes[iplt_seg_idx] = (u8*)h->alloc(h, (size_t)iplt_size, 16);
+    img->segment_bytes_cap[iplt_seg_idx] = (size_t)iplt_size;
+    if (!img->segment_bytes[iplt_seg_idx])
+        compiler_panic(img->c, no_loc(), "link: oom on iplt bytes");
+    memset(img->segment_bytes[iplt_seg_idx], 0, (size_t)iplt_size);
+    img->nsegments++;
+
+    igot_seg_idx = img->nsegments;
+    igot_seg = &img->segments[igot_seg_idx];
+    memset(igot_seg, 0, sizeof(*igot_seg));
+    igot_seg->id = (LinkSegmentId)(igot_seg_idx + 1u);
+    igot_seg->flags = SF_ALLOC | SF_WRITE;
+    igot_seg->file_offset = igot_vaddr;
+    igot_seg->vaddr = igot_vaddr;
+    igot_seg->file_size = igot_size;
+    igot_seg->mem_size = igot_size;
+    igot_seg->align = (u32)page;
+    igot_seg->nsections = 1;
+    img->segment_bytes[igot_seg_idx] = (u8*)h->alloc(h, (size_t)igot_size, 16);
+    img->segment_bytes_cap[igot_seg_idx] = (size_t)igot_size;
+    if (!img->segment_bytes[igot_seg_idx])
+        compiler_panic(img->c, no_loc(), "link: oom on igot bytes");
+    memset(img->segment_bytes[igot_seg_idx], 0, (size_t)igot_size);
+    img->nsegments++;
+
+    /* Append two LinkSections (.iplt and .igot.plt). */
+    {
+        u32 new_nsec = img->nsections + 2u;
+        LinkSection* nsections = (LinkSection*)h->realloc(
+            h, img->sections,
+            sizeof(*img->sections) * img->nsections,
+            sizeof(*img->sections) * new_nsec, _Alignof(LinkSection));
+        if (!nsections)
+            compiler_panic(img->c, no_loc(), "link: oom on iplt sections");
+        img->sections = nsections;
+    }
+    iplt_sec = &img->sections[img->nsections];
+    memset(iplt_sec, 0, sizeof(*iplt_sec));
+    iplt_sec->id = (LinkSectionId)(img->nsections + 1u);
+    iplt_sec->input_id = LINK_INPUT_NONE;
+    iplt_sec->obj_section_id = OBJ_SEC_NONE;
+    iplt_sec->segment_id = iplt_seg->id;
+    iplt_sec->input_offset = 0;
+    iplt_sec->file_offset = iplt_vaddr;
+    iplt_sec->vaddr = iplt_vaddr;
+    iplt_sec->size = iplt_size;
+    iplt_sec->flags = SF_ALLOC | SF_EXEC;
+    iplt_sec->align = 4;
+    img->nsections++;
+
+    igot_sec = &img->sections[img->nsections];
+    memset(igot_sec, 0, sizeof(*igot_sec));
+    igot_sec->id = (LinkSectionId)(img->nsections + 1u);
+    igot_sec->input_id = LINK_INPUT_NONE;
+    igot_sec->obj_section_id = OBJ_SEC_NONE;
+    igot_sec->segment_id = igot_seg->id;
+    igot_sec->input_offset = 0;
+    igot_sec->file_offset = igot_vaddr;
+    igot_sec->vaddr = igot_vaddr;
+    igot_sec->size = igot_size;
+    igot_sec->flags = SF_ALLOC | SF_WRITE;
+    igot_sec->align = 8;
+    img->nsections++;
+
+    /* Allocate the iplt_pairs table (resolver_vaddr, slot_vaddr) per
+     * IFUNC, in the same iteration order as the stub layout below. */
+    img->iplt_pairs = (u64*)h->alloc(
+        h, sizeof(*img->iplt_pairs) * 2u * (size_t)nifunc, _Alignof(u64));
+    if (!img->iplt_pairs)
+        compiler_panic(img->c, no_loc(), "link: oom on iplt pairs");
+    img->niplt = nifunc;
+
+    iplt_bytes = img->segment_bytes[iplt_seg_idx];
+    slot_idx = 0;
+
+    /* Pass B: per IFUNC, write the stub bytes, synthesize a slot
+     * LinkSymbol, and emit the two relocs that fill the stub's ADRP/LDR
+     * immediate fields against the slot. */
+    for (i = 0; i < img->nsyms; ++i) {
+        LinkSymbol* s = &img->syms[i];
+        u64 stub_vaddr;
+        u64 slot_vaddr;
+        u64 resolver_vaddr;
+        LinkSymbol slot_rec;
+        LinkSymId slot_id;
+        LinkRelocApply rrec;
+        u8* stub_dst;
+
+        if (s->kind != SK_IFUNC || !s->defined) continue;
+
+        stub_vaddr = iplt_vaddr + (u64)slot_idx * 12u;
+        slot_vaddr = igot_vaddr + (u64)slot_idx * 8u;
+        resolver_vaddr = s->vaddr;
+
+        img->iplt_pairs[2u * slot_idx + 0] = resolver_vaddr;
+        img->iplt_pairs[2u * slot_idx + 1] = slot_vaddr;
+
+        stub_dst = iplt_bytes + (size_t)(slot_idx * 12u);
+        wr_u32_le(stub_dst + 0, 0x90000010u); /* ADRP x16, #0 */
+        wr_u32_le(stub_dst + 4, 0xf9400210u); /* LDR x16, [x16] */
+        wr_u32_le(stub_dst + 8, 0xd61f0200u); /* BR x16 */
+
+        /* Synthetic local symbol for the slot. */
+        memset(&slot_rec, 0, sizeof(slot_rec));
+        slot_rec.name = 0;
+        slot_rec.kind = SK_OBJ;
+        slot_rec.bind = SB_LOCAL;
+        slot_rec.defined = 1;
+        slot_rec.section_id = igot_sec->id;
+        slot_rec.vaddr = slot_vaddr;
+        slot_rec.size = 8;
+        slot_id = append_symbol(img, &slot_rec);
+
+        /* Reloc on the ADRP at stub+0. */
+        memset(&rrec, 0, sizeof(rrec));
+        rrec.input_id = LINK_INPUT_NONE;
+        rrec.section_id = OBJ_SEC_NONE;
+        rrec.link_section_id = iplt_sec->id;
+        rrec.offset = (u32)(slot_idx * 12u);
+        rrec.width = 4;
+        rrec.write_vaddr = stub_vaddr;
+        rrec.write_file_offset = stub_vaddr;
+        rrec.kind = R_AARCH64_ADR_PREL_PG_HI21;
+        rrec.target = slot_id;
+        rrec.addend = 0;
+        relocs_grow(img, img->nrelocs + 1u);
+        img->relocs[img->nrelocs++] = rrec;
+
+        /* Reloc on the LDR at stub+4. */
+        memset(&rrec, 0, sizeof(rrec));
+        rrec.input_id = LINK_INPUT_NONE;
+        rrec.section_id = OBJ_SEC_NONE;
+        rrec.link_section_id = iplt_sec->id;
+        rrec.offset = (u32)(slot_idx * 12u + 4u);
+        rrec.width = 4;
+        rrec.write_vaddr = stub_vaddr + 4u;
+        rrec.write_file_offset = stub_vaddr + 4u;
+        rrec.kind = R_AARCH64_LDST64_ABS_LO12_NC;
+        rrec.target = slot_id;
+        rrec.addend = 0;
+        relocs_grow(img, img->nrelocs + 1u);
+        img->relocs[img->nrelocs++] = rrec;
+
+        /* append_symbol above may have moved img->syms, so re-derive s
+         * before writing. Then redirect the IFUNC to its stub: name +
+         * binding stay so cfree_jit_lookup and external relocs find it. */
+        s = &img->syms[i];
+        s->kind = SK_FUNC; /* resolver indirection is hidden behind the stub */
+        s->section_id = iplt_sec->id;
+        s->value = (u64)(slot_idx * 12u);
+        s->vaddr = stub_vaddr;
+        s->size = 12;
+
+        ++slot_idx;
+    }
+
+    /* Stale undefs that named these IFUNCs (e.g. a separate TU
+     * referencing my_fn) were re-pointed at the same LinkSymbol slot
+     * during resolve_undefs / link_symbols_to_sections, so updating
+     * the master slot above is sufficient — those undefs see the
+     * post-redirect vaddr through their shared slot. */
+}
+
/* ---- entry symbol ---- */
static void resolve_entry(Linker* l, LinkImage* img)
@@ -1635,9 +1964,11 @@ LinkImage* link_resolve(Linker* l)
emit_segment_bytes(l, img);
link_symbols_to_sections(l, img);
emit_array_boundaries(l, img);
+ emit_tls_boundaries(l, img);
emit_encoding_section_boundaries(l, img);
resolve_undefs(l, img);
gc_drop_dead_globals(l, img, &g);
+ layout_iplt(l, img);
{
LinkSymId* got_map = NULL;
u32 got_map_size = img->nsyms + 1u;
diff --git a/src/link/link_reloc.c b/src/link/link_reloc.c
@@ -180,6 +180,24 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes,
wr_u32_le(P_bytes, instr);
return;
}
+ case R_AARCH64_TLSLE_ADD_TPREL_HI12:
+ case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: {
+ /* AArch64 TLS local-exec. Caller passes S already as the
+ * TP-relative offset (target's image offset minus the TLS
+ * image base, plus the 16-byte AArch64 TCB). HI12 takes
+ * bits 23:12, LO12_NC takes bits 11:0; both encoded as
+ * imm12 at instruction bits [21:10] of an ADD (immediate).
+ * The HI12 form's instruction carries LSL #12 in its opcode,
+ * so bits 11:0 of the operand naturally land at scale 4096. */
+ u64 v = (u64)((i64)S + A);
+ u32 imm12 = (k == R_AARCH64_TLSLE_ADD_TPREL_HI12)
+ ? (u32)((v >> 12) & 0xfffu)
+ : (u32)(v & 0xfffu);
+ u32 instr = rd_u32_le(P_bytes);
+ instr = (instr & ~(0xfffu << 10)) | (imm12 << 10);
+ wr_u32_le(P_bytes, instr);
+ return;
+ }
case R_AARCH64_LDST8_ABS_LO12_NC:
case R_AARCH64_LDST16_ABS_LO12_NC:
case R_AARCH64_LDST32_ABS_LO12_NC:
diff --git a/test/link/CORPUS.md b/test/link/CORPUS.md
@@ -110,6 +110,13 @@ Cases 02–09 all pair ADR_PREL_PG_HI21 with their primary LDST reloc.
|---|------|-----------|
| 28 | `extern_resolver` | `CfreeExternResolver` provides address for undefined symbol |
| 29 | `jit_lookup_miss` | `cfree_jit_lookup` returns NULL for unknown name |
+| 32 | `ifunc` | `STT_GNU_IFUNC` trampoline: iplt stub + igot slot, resolver invoked at JIT load |
+
+### Group F — TLS
+
+| # | Name | Exercises |
+|---|------|-----------|
+| 31 | `tls_local_exec` | `_Thread_local` w/ initializer; `R_AARCH64_TLSLE_ADD_TPREL_{HI12,LO12_NC}` apply + PT_TLS layout |
### bad/ — negative tests
@@ -158,3 +165,7 @@ until the impl lands):
and `whole_archive` (needed by cases 26–27).
7. `cfree_link_jit` must return non-zero on unresolved strong symbols
without `extern_resolver` (needed by case 30).
+8. `link_reloc_apply` must handle `R_AARCH64_TLSLE_ADD_TPREL_HI12` and
+ `R_AARCH64_TLSLE_ADD_TPREL_LO12_NC`; `cfree_link_exe` must emit a
+ `PT_TLS` segment for `.tdata`/`.tbss`; harness `_start` must set
+ up `TPIDR_EL0` (needed by case 31).
diff --git a/test/link/cases/31_tls_local_exec/a.c b/test/link/cases/31_tls_local_exec/a.c
@@ -0,0 +1,5 @@
+/* TLS local-exec: _Thread_local with initializer.
+ * Exercises R_AARCH64_TLSLE_* reloc apply + .tdata layout in PT_TLS.
+ * Before TLSLE support, the linker panicked in link_reloc_apply here. */
+_Thread_local int tls_x = 7; /* initialized, so it comes from the .tdata template */
+int test_main(void) { return tls_x == 7 ? 0 : 1; } /* 0 = pass */
diff --git a/test/link/cases/32_ifunc/a.c b/test/link/cases/32_ifunc/a.c
@@ -0,0 +1,23 @@
+/* STT_GNU_IFUNC: my_fn is dispatched via a runtime resolver.
+ * impl_a returns 42, impl_b returns 99; resolve() picks impl_a so a
+ * call to my_fn() must return 42 once the linker's iplt trampoline
+ * is wired up and the slot is populated. */
+
+extern int impl_a(void);
+extern int impl_b(void);
+extern int (*resolve(void))(void);
+
+int impl_a(void) { return 42; } /* the implementation resolve() selects */
+int impl_b(void) { return 99; } /* decoy: resolve() never returns this one */
+
+/* `volatile` keeps the compiler from constant-folding the choice and
+ * inlining my_fn into a direct branch to impl_a (which would defeat
+ * the trampoline test). */
+int (*resolve(void))(void) {
+    volatile int x = 1;
+    return x ? impl_a : impl_b; /* x is read at run time and is always 1 */
+}
+
+int my_fn(void) __attribute__((ifunc("resolve"))); /* IFUNC: target chosen by resolve() */
+
+int test_main(void) { return my_fn() == 42 ? 0 : 1; } /* 0 = pass */
diff --git a/test/link/cases/32_ifunc/jit_only b/test/link/cases/32_ifunc/jit_only
diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c
@@ -304,7 +304,44 @@ int main(int argc, char** argv)
}
int (*fn)(void) = cfree_jit_lookup(jit, "test_main");
- int result = fn ? fn() : 1;
+
+ /* AArch64 TLS local-exec setup. Build a thread-local image —
+ * 16-byte TCB + .tdata copy + .tbss zero-fill — and point
+ * TPIDR_EL0 at it. On Darwin, libc functions clobber TPIDR_EL0
+ * (probably via dyld stub binding / locale TSD), so msr → call()
+ * must be back-to-back with NO libc invocations between. */
+#if defined(__aarch64__) || defined(__arm64__)
+ static char tls_block[8192] __attribute__((aligned(16)));
+ {
+ char* td_start = (char*)cfree_jit_lookup(jit, "__tdata_start");
+ char* td_end = (char*)cfree_jit_lookup(jit, "__tdata_end");
+ unsigned long bs_n = (unsigned long)(unsigned long long)
+ cfree_jit_lookup(jit, "__tbss_size");
+ if (td_start && td_end) {
+ unsigned long td_n = (unsigned long)(td_end - td_start);
+ unsigned long i;
+ /* Plain loops at -O0 stay loops; do NOT use memcpy/memset
+ * here — those go through dyld's stub binder on first
+ * call and clobber TPIDR_EL0. */
+ for (i = 0; i < td_n; ++i) tls_block[16 + i] = td_start[i];
+ for (i = 0; i < bs_n; ++i) tls_block[16 + td_n + i] = 0;
+ }
+ }
+#endif
+
+ int result;
+ if (fn) {
+#if defined(__aarch64__) || defined(__arm64__)
+ /* msr + blr in immediate succession; the compiler must not
+ * insert anything between. `volatile` and the "memory"
+ * clobber bracket the load of fn so the call uses the
+ * post-msr register state. */
+ __asm__ volatile ("msr tpidr_el0, %0" :: "r"(tls_block) : "memory");
+#endif
+ result = fn();
+ } else {
+ result = 1;
+ }
cfree_jit_run_dtors(jit);
diff --git a/test/link/harness/start.c b/test/link/harness/start.c
@@ -4,10 +4,18 @@
* test_main() — primary test body; returns 0 on pass.
* test_post_fini() — optional post-destructor check (weak default: 0).
*
- * Lifecycle: ctors → test_main → dtors → test_post_fini → exit(result).
+ * Lifecycle: TLS setup → ctors → test_main → dtors → test_post_fini → exit.
*
- * cfree-ld must define the four boundary symbols as linker-synthesized
- * absolute symbols surrounding the sorted init/fini arrays. */
+ * cfree-ld defines:
+ * __init_array_start/end, __fini_array_start/end — sorted ctor/dtor
+ * spans (synthesized around the corresponding SHT_*_ARRAY sections).
+ * __tdata_start, __tdata_end — .tdata template
+ * bytes; identical when no TLS (length 0).
+ * __tbss_size — SK_ABS, holds the
+ * .tbss byte count as its symbol value.
+ *
+ * The TLS prologue runs unconditionally: with no TLS in the image, the
+ * three boundary symbols all read as 0 and the loop is a no-op. */
extern int test_main(void);
__attribute__((weak)) int test_post_fini(void) { return 0; }
@@ -18,6 +26,17 @@ extern VoidFn __init_array_end[];
extern VoidFn __fini_array_start[];
extern VoidFn __fini_array_end[];
+extern char __tdata_start[];
+extern char __tdata_end[];
+extern char __tbss_size[]; /* SK_ABS: address-of yields the byte count */
+
+#define AARCH64_TCB_SIZE 16
+
+/* Per-thread TLS image; the test harness is single-threaded so a
+ * file-scope buffer is enough. Sized generously for any test we run
+ * here. Layout: [TCB(16) | .tdata copy | .tbss zero-fill]. */
+static char g_tls_block[4096] __attribute__((aligned(16)));
+
__attribute__((noreturn)) static void do_exit(int code)
{
register long x8 __asm__("x8") = 94; /* sys_exit_group */
@@ -26,11 +45,24 @@ __attribute__((noreturn)) static void do_exit(int code)
__builtin_unreachable();
}
+static void tls_init(void)
+{
+    unsigned long td_n = (unsigned long)(__tdata_end - __tdata_start); /* .tdata bytes */
+    unsigned long bs_n = (unsigned long)(unsigned long long)__tbss_size; /* .tbss bytes */
+    unsigned long i, room = sizeof(g_tls_block) - AARCH64_TCB_SIZE;
+    if (td_n > room || bs_n > room - td_n) do_exit(125); /* image exceeds g_tls_block: fail, don't overrun */
+    for (i = 0; i < td_n; ++i) g_tls_block[AARCH64_TCB_SIZE + i] = __tdata_start[i];
+    for (i = 0; i < bs_n; ++i) g_tls_block[AARCH64_TCB_SIZE + td_n + i] = 0;
+    __asm__ volatile ("msr tpidr_el0, %0" :: "r"(g_tls_block) : "memory");
+}
+
void _start(void)
{
VoidFn* p;
int result;
+ tls_init();
+
for (p = __init_array_start; p != __init_array_end; ++p)
(*p)();