commit 665cad667b401544c6a1d4e8bda728909e4e0fce
parent dcda57cc574b99b0149dc7d03be22a83e96f4f16
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 17:20:32 -0700
link: dynamic linking phase 5 (PLT body + import-reloc rewriting)
Emits the AArch64 PLT (PLT0 + per-import 16-byte stubs), routes
CALL26/JUMP26 against imported syms to the PLT entry, and emits
R_AARCH64_GLOB_DAT for abs relocs against imports (covers both
layout_got slot fills and direct abs references). shift_image_addresses
now bumps dyn-state vaddrs and rela_plt offsets; link_emit re-serializes
.rela.plt and .got.plt[0] post-shift. BIND_NOW only.
make test-musl now passes 6/6 (3 static + 3 dynamic):
01_syscall_write, 02_errno_touch, 03_printf_hello run end-to-end
against /lib/ld-musl-aarch64.so.1.
doc/DYNLD.md trimmed to current status + remaining work (Phase 7
cfree_link_shared, Phase 8 TLS GD/IE/LD, polish items).
Diffstat:
5 files changed, 486 insertions(+), 797 deletions(-)
diff --git a/doc/DYNLD.md b/doc/DYNLD.md
@@ -1,660 +1,81 @@
-# Dynamic linking — implementation plan
+# Dynamic linking — status & remaining work
-Scope: what it takes for `cfree ld` to produce a dynamic-linked
-aarch64-linux ELF executable that runs against a real musl libc.so.
-This is the gap exposed by `test/musl/run.sh`'s `dynamic` variant
-(`build/musl/<case>/dynamic/link.err`); see `doc/linker-status.md` row
-"Dynamic linking: PT_DYNAMIC, PT_INTERP, PLT, DT_NEEDED" for context.
+Scope: producing dynamically linked aarch64-linux ELF executables
+(and, eventually, shared libs) that run against a real musl libc.so.
## Status
-Phases 1–4 + 6 have landed. Phase 5 (PLT body emit + import-reloc
-rewriting) was deliberately skipped — the dyn-link infrastructure
-(synthetic sections, dynamic phdrs, `.dynamic` body, RELATIVE record
-collection) is in place, but CALL26/ADR_GOT_PAGE references against
-imported symbols still go through the existing reloc-apply path and
-will panic / mis-resolve at link time. The produced binary is
-structurally well-formed (`readelf -d -S` will show all the dynamic
-infrastructure) but won't run end-to-end against musl until Phase 5
-fills the PLT bodies and rewrites the input relocs. Every 3-test
-static variant still passes (no regression).
+`make test-musl` passes 6/6 (3 static + 3 dynamic). The dynamic
+variant produces an ET_DYN PIE that runs end-to-end against
+`/lib/ld-musl-aarch64.so.1` for `01_syscall_write`, `02_errno_touch`,
+and `03_printf_hello`.
-| Phase | State | Where to look |
-|------:|--------------|-----------------------------------------------------------------|
-| 1 | done | `src/obj/elf_read.c::read_elf_dso`, new RelocKinds in `obj.h` |
-| 2 | done | `driver/ld.c` (`-dynamic-linker`, `.so` argv), `lib_resolve.c` |
-| 3 | done | `link_layout.c::find_dso_export` + `resolve_undefs` extension |
-| 4 | done | new `src/link/link_dyn.c::layout_dyn` |
-| 5 | not started | per §3.5 below |
-| 6 | done | `link_elf.c` `img_base` / new phdrs / `.dynamic` body emit |
-| 7 | not started | per §3.7 below |
-| 8 | deferred | TLS GD/IE/LD, IRELATIVE — out of scope for v1 |
+What's wired (don't re-derive — read the code if you need detail):
-Notes that drifted from the original plan during 4 + 6 implementation:
-- Phases 4 and 6 landed in the same change without Phase 5 between
- them. Phase 5's reloc rewriting is what makes imported-symbol
- references actually call into `.plt` / read `.got.plt`; without it,
- any input CALL26 to an imported function still aims at the
- pre-redirect target (vaddr 0) and `link_reloc_apply` panics at
- "CALL26 out of range". The harness intentionally accepts this gap
- — the goal of 4 + 6 is the structural skeleton for the binary,
- not a runnable exe.
-- `LinkImage` carries `pie` + a single `LinkDynState*` rather than
- splitting fields across the image. The state lives on
- `link_internal.h` and is freed via `link_dyn_state_free` from
- `link_image_release`. Open question §5.4's call to add an
- `emit_dynamic_exe` flag was answered with a `Linker.emit_pie` flag
- + `LinkImage.pie` mirror; the existing `emit_static_exe` is
- unchanged and orthogonal.
-- `.rela.dyn` is pre-sized at layout time with a 4096-record
- RELATIVE tail capacity; the `apply_all_relocs` pass appends
- R_AARCH64_RELATIVE records as it walks internal absolute relocs,
- and the section bytes are re-serialized after the apply runs.
- Trailing capacity stays zero (R_AARCH64_NONE) — harmless to the
- loader. The cap is large enough for the existing musl harness;
- a fail-loud panic fires if exceeded.
-- `.gnu.hash`'s bucket population takes a shortcut: it assumes
- symbols hashed into the same bucket appear contiguously in
- `.dynsym`. Phase 4's import set is small enough that this holds
- in practice, but a sort-by-bucket pass would be needed before
- growing the hashed range.
-- `.dynstr` is built once during layout — both import names and
- DT_NEEDED soname strings are appended up front so the
- `.dynstr`/`.dynsym` section sizes are final before placement. Any
- new soname source needs to land in this same pre-pass.
-- `DT_NEEDED` is emitted for every DSO input that carries a soname,
- not just those that satisfied an import. Matches GNU ld without
- `--as-needed`; v1 doesn't plumb `--as-needed` through to the
- consuming pass.
+- DSO ingest (`read_elf_dso`, `LINK_INPUT_DSO_BYTES`, soname tracking).
+- Driver: `-dynamic-linker`, `.so` / `.so.N` positional inputs,
+ `-l<name>` honoring `-Bdynamic`/`-Bstatic`.
+- Synthetic dyn tables: `.interp` / `.dynsym` / `.dynstr` / `.gnu.hash`
+ / `.rela.dyn` / `.rela.plt` / `.plt` / `.got.plt` / `.dynamic`
+ (`src/link/link_dyn.c::layout_dyn`).
+- PIE / ET_DYN emit: `e_type`, `img_base`, PT_PHDR / PT_INTERP /
+ PT_DYNAMIC / PT_GNU_STACK, `R_AARCH64_RELATIVE` for internal abs
+ fixups (`src/link/link_elf.c`).
+- PLT body emit (PLT0 + per-import 16-byte stubs) and import-reloc
+ routing: CALL26/JUMP26 → PLT entry (`sym_plt_vaddr`), abs relocs
+ against imports → GLOB_DAT, GOT-redirected slot fills → GLOB_DAT
+ via the existing `layout_got` path. BIND_NOW only (`DF_1_NOW`), so
+ PLT0 is emitted in canonical form but never reached at run time.
-Notes that drifted from the original plan during 1–3 implementation:
-- The DSO input shares the existing `ObjBuilder` rather than a new
- `DsoBuilder` (open question §5.1). `read_elf_dso` produces an
- `ObjBuilder` with only the DSO's exported dynsym entries and no
- sections; the SONAME lives on `LinkInput.soname`. Symbol-walk
- passes that touched `InputMap` (`resolve_symbols`,
- `link_symbols_to_sections`) early-out for `LINK_INPUT_DSO_BYTES`
- inputs, since DSOs contribute no per-input map. If §4–§6 turn out
- to want a proper `DsoBuilder`, the migration is local — the
- reader's call sites and the input enum are the only public
- surface.
-- `link_add_dso_bytes` falls back to the DSO's filename basename
- when DT_SONAME is absent (matches GNU ld). PT_DYNAMIC walking from
- pure `e_phoff` (no `SHT_DYNAMIC`) is stubbed; not exercised by
- musl `libc.so` since musl does ship `SHT_DYNAMIC`.
-- `-l<name>` resolution under non-`-Bstatic` modes searches `.so`
- across every `-L` dir before falling back to `.a` — suffix-first
- rather than dir-first, matching `clang -l`.
+## Remaining work
-The §2 inventory below catalogues the full set of model, layout,
-emit, and driver gaps as they stood pre-Phase 1. The text describes
-the **original** state of each file so the rest of the plan reads in
-context; what's actually changed since is captured in the Status
-table above and noted inline where useful.
-
----
-
-## 1. What a working dynamic-exe link must look like
-
-End-to-end, for a single PIE exe linked against a single shared libc:
-
-```
-input clang case.c -fPIE -fpic -c -> case.o (ET_REL, PIC)
- musl Scrt1.o crti.o crtn.o -> start files (ET_REL, PIC)
- musl libc.so -> shared dep (ET_DYN)
- libcfree_rt.a -> archive (ET_REL members)
-
-output case.exe ET_DYN exe (PIE)
- PT_PHDR
- PT_INTERP -> "/lib/ld-musl-aarch64.so.1"
- PT_LOAD R (ehdr, phdrs, .interp, .dynsym, .dynstr, .gnu.hash,
- .rela.dyn, .rela.plt, .note.gnu.build-id)
- PT_LOAD RX (.text, .plt)
- PT_LOAD R (.rodata, .eh_frame)
- PT_LOAD RW (.data, .got, .got.plt, .data.rel.ro, .bss)
- PT_DYNAMIC -> .dynamic
- DT_NEEDED libc.so
- DT_STRTAB / DT_SYMTAB / DT_GNU_HASH
- DT_PLTGOT / DT_PLTRELSZ / DT_PLTREL DT_RELA / DT_JMPREL
- DT_RELA / DT_RELASZ / DT_RELAENT
- (DT_RUNPATH, DT_FLAGS_1 PIE, DT_INIT_ARRAY, ...)
- PT_TLS -> .tdata template (already wired)
-
-dynamic relocs in the produced exe (against `libc.so` syms only):
- R_AARCH64_GLOB_DAT on each .got slot for an imported data sym
- R_AARCH64_JUMP_SLOT on each .got.plt slot for an imported function
- R_AARCH64_RELATIVE on each absolute pointer in PIE-emitted data
- (e.g. `.init_array` entries, vtable-shaped data)
-```
-
-Same pipeline, different shape, applies to `-shared` output (no
-PT_INTERP, allow_undefined=1, ET_DYN with DT_SONAME). The exe path is
-the more demanding consumer of the two — it's strictly a superset of
-the shared path.
-
----
-
-## 2. Investigation: current pipeline state
-
-### 2.1 Driver — `driver/ld.c` *(addressed in Phase 2)*
-
-Already had:
-- `-shared`, `-soname`, `-rpath`, `-rpath-link`, `-Bstatic`/`-Bdynamic`,
- `--enable-new-dtags`, `-pie`, `-no-pie`, `-E`/`--export-dynamic`,
- `--whole-archive`, `--start-group`/`--end-group`, `-l<name>` resolution
- (`driver/lib_resolve.c`).
-- `cfree_link_shared` dispatch wired (`driver/ld.c`); shared options
- are populated and passed.
-
-Was missing (now fixed):
-- ~~`-dynamic-linker` / `--dynamic-linker` flag~~ → parsed and plumbed
- through `CfreeLinkOptions.interp_path`.
-- ~~`.so` / `.so.N` filename recognition~~ → `driver_is_so_filename`
- routes positional shared inputs into `LdOptions.dsos[]` →
- `CfreeLinkInputs.dso_bytes`.
-- ~~`-l<name>` honoring `-Bstatic`/`-Bdynamic`~~ → `driver_lib_resolve`
- takes a `LibResolveMode`; non-`-Bstatic` callers prefer `lib<name>.so`
- before falling back to `lib<name>.a`. The function reports which
- suffix matched so the driver routes hits to `dsos[]` vs.
- `archives[]`.
-
-### 2.2 ELF reader — `src/obj/elf_read.c` *(addressed in Phase 1)*
-
-Pre-Phase-1, `read_elf` was the single ingest path used by both
-`link_add_obj_bytes` and archive members. It parsed `e_shoff` / shdrs
-only (program headers ignored), folded `SHT_PROGBITS`/`NOBITS`/`NOTE`
-into ObjBuilder sections, took at most one `SHT_SYMTAB`, and walked
-`SHT_RELA`/`SHT_REL` whose `sh_info` named a kept section.
-
-It rejected `libc.so` because the guard
-`sh->sh_info != 0 && sh->sh_info < e_shnum` tripped on `.rela.dyn` /
-`.rela.plt` (whose `sh_info = 0` is valid ELF — a dynamic reloc isn't
-bound to one output section). It also accepted ET_DYN silently and
-had no SHT_DYNSYM / SHT_DYNAMIC / PT_DYNAMIC reader.
-
-Post-Phase-1:
-- `read_elf` rejects anything other than ET_REL with a diagnostic;
- ET_DYN inputs are routed through a separate `read_elf_dso`.
-- `read_elf_dso` parses `SHT_DYNSYM` (skipping `.symtab` if present),
- walks `SHT_DYNAMIC` for `DT_SONAME` (interned into the compiler's
- global Sym pool), and explicitly skips `.rela.*` / `SHT_GROUP` —
- DSO inputs contribute no relocations or sections to the consumer.
-- Defined dynsym entries are appended as `ObjSym`s with
- `section_id = OBJ_SEC_NONE` and `value = 0`; only the name is
- load-bearing for the consumer's resolver. `STB_LOCAL` and undefined
- dynsym entries (the DSO's own imports) are dropped.
-
-### 2.3 Object model — `src/obj/obj.h` *(addressed in Phases 1, 3)*
-
-Pre-Phase-1, `RelocKind` had no entries for the dynamic-only relocs
-(`R_AARCH64_GLOB_DAT` / `R_AARCH64_JUMP_SLOT` / `R_AARCH64_RELATIVE` /
-`R_AARCH64_COPY`); the mapping in `src/obj/elf_reloc_aarch64.c`
-returned `(u32)-1` on unsupported types and the reader panicked.
-There was also no concept of an **import** symbol or a **DSO input**.
-
-Post-Phase-1/3:
-- `RelocKind` carries `R_AARCH64_GLOB_DAT`, `R_AARCH64_JUMP_SLOT`,
- `R_AARCH64_RELATIVE`, `R_AARCH64_COPY`, with both directions of
- `elf_aarch64_reloc_{to,from}` wired and the objdump-side
- `reloc_kind_name` extended. `R_AARCH64_TLSDESC*` and `R_AARCH64_PLT32`
- are still deferred (Phase 8 / not exercised by the musl harness).
-- `LinkInput.kind` gains `LINK_INPUT_DSO_BYTES`, plus a `Sym soname`
- field; `link_add_dso_bytes` builds these via `read_elf_dso`.
-- `LinkSymbol` gains `imported`, `dso_input_id`, and a flag set
- (`needs_plt`, `needs_got`, `needs_copy`). The flags are reserved
- for Phases 4–5; today only `imported` and `dso_input_id` are set —
- by `resolve_undefs` when an undef is matched against a DSO export.
- An imported symbol stays `defined=0` (the static linker has no
- vaddr for it) but no longer trips the panic.
-
-### 2.4 Linker resolve — `src/link/link_layout.c` *(partially addressed in Phase 3; Phase 4 owns the rest)*
-
-- Static-only by construction. Comment near the IFUNC ctor logic
- keys off `l->emit_static_exe`; the rest of the layout still has no
- symmetric "build dynamic image" branch — Phase 4 work.
-- `resolve_undefs` *(post-Phase-3)*: walks DSO inputs via
- `find_dso_export` before falling through to the resolver / weak-
- zero path. On a hit, the undef is marked `imported=1`,
- `dso_input_id=<DSO id>`, and resolution continues. The
- still-undefined-in-the-static-sense semantics is exactly what
- Phases 4–5 need: the symbol routes through synthetic .plt/.got at
- apply time. DSO inputs themselves are skipped by `resolve_symbols`
- (their exports must not contend with internal definitions in
- `img->globals`) and by `link_symbols_to_sections` (no per-input
- `InputMap` is allocated for them).
-- `section_kept` still drops everything that isn't ALLOC
- PROGBITS/NOBITS/INIT_ARRAY. Synthesized .dynsym / .dynstr /
- .dynamic / .got.plt / .plt / .rela.dyn / .rela.plt still need to
- be added as image-owned synthetic sections (Phase 4, same model
- as `layout_iplt`'s `.iplt`/`.igot.plt`/`.iplt.pairs`).
-- `link_image_alloc` and `LinkImage` still carry no fields for:
- dynamic strtab, dynsym table, hash table, PLT/GOT slot tables,
- dynamic-reloc list, PT_INTERP path, DT_NEEDED list, runpath/rpath
- lists. The DT_NEEDED soname does live on `LinkInput.soname` —
- collecting the actually-used set is Phase 4.
-
-### 2.5 ELF emit — `src/link/link_elf.c`
-
-- Hardcoded `e_type = ET_EXEC` (`link_elf.c:737`).
-- Hardcoded `IMAGE_BASE = 0x400000` (`link_elf.c:106`); ALL applied
- reloc S/P values bake this in (`link_elf.c:192,195`). PIE/ET_DYN
- output requires `IMAGE_BASE = 0` (the loader rebases) **and** any
- pre-applied absolute reloc against an internal target must instead
- emit an `R_AARCH64_RELATIVE` dyn-reloc against the image-relative
- vaddr, which the loader patches.
-- `phdrs` synthesis covers PT_LOAD, PT_NOTE, PT_TLS only
- (`link_elf.c:664-722`). No PT_PHDR, PT_INTERP, PT_DYNAMIC,
- PT_GNU_RELRO, PT_GNU_STACK.
-- `.symtab` / `.strtab` / `.shstrtab` / `.note.gnu.build-id` are the
- only synthesized non-input sections (`link_elf.c:540-867`).
-- `cfree_link_shared` is stubbed (`src/api/pipeline.c:413`):
- `compiler_panic("cfree_link_shared: shared-library codegen is not yet
- implemented in libcfree")`.
-
-### 2.6 Reloc apply — `src/link/link_reloc.c`
-
-Only static relocs are coded. There is no concept of "compute final
-address ⇒ also push a dyn-reloc record into `img->dyn_relocs[]`",
-which is the mode an absolute reloc takes when the target is imported
-or the output is PIE.
-
----
-
-## 3. What's missing — concrete inventory
-
-Grouped by layer, in dependency order.
-
-### 3.1 ELF reader (smallest, gates everything)
-
-1. Accept `e_type ∈ {ET_REL, ET_DYN}`. Reject ET_EXEC, ET_CORE.
-2. For ET_REL, current behavior unchanged.
-3. For ET_DYN, switch into a **DSO-import** mode that produces a
- different ObjBuilder shape (or a sibling `DsoBuilder` — see §5):
- - Parse `.dynsym`/`.dynstr` (NOT `.symtab`).
- - Skip `.rela.dyn`/`.rela.plt` body parsing — only `DT_SONAME` and
- the dynsym table matter for satisfying undefs at link time.
- - Read PT_DYNAMIC to extract DT_SONAME (so DT_NEEDED can record the
- library's runtime name, not its filesystem path).
-4. Relax the `sh_info == 0` guard for ET_DYN (in DSO mode the rela
- sections are skipped anyway). Keep it strict for ET_REL.
-5. Add new RelocKinds and elf_aarch64_reloc_{from,to} cases:
- `R_AARCH64_GLOB_DAT`, `R_AARCH64_JUMP_SLOT`, `R_AARCH64_RELATIVE`,
- `R_AARCH64_COPY`. (TLSDESC / IRELATIVE deferred — see §6.)
-
-### 3.2 Object/link model — symbols and inputs
-
-1. New `LinkInputKind`: `LINK_INPUT_DSO_BYTES`. Carries a
- `DsoBuilder*` (or extended ObjBuilder) holding: soname, dynsym
- list, version mapping (deferred), the original bytes for
- diagnostics.
-2. `LinkSymbol` gains `dso_id` (`LinkInputId` of defining DSO, 0 for
- "local to this image") and a flag set: `imported`, `needs_plt`,
- `needs_got`, `needs_copy_reloc`. `defined=1, imported=1` means
- "satisfied by a DSO; emit a dyn-reloc instead of baking the
- address".
-3. `resolve_undefs` extension: before the panic at
- `link_layout.c:304`, walk DSO inputs and match by name; on hit,
- mark the LinkSymbol imported and record a DT_NEEDED for that DSO.
-4. New image-owned tables on `LinkImage`:
- - `dynsym[]`, `dynstr` (string builder).
- - `gnu_hash` (computed last).
- - `plt_slots[]` (one per imported function), `got_slots[]` (one
- per imported data sym), `gotplt_slots[]` (matched 1:1 with
- plt_slots).
- - `dyn_relocs[]` — `R_AARCH64_GLOB_DAT|JUMP_SLOT|RELATIVE` records.
- - `needed[]` — DT_NEEDED soname list.
- - `interp_path` (PT_INTERP string).
-
-### 3.3 Layout
-
-1. New synthetic-section pass alongside `layout_iplt`. Builds:
- `.interp`, `.dynsym`, `.dynstr`, `.gnu.hash`, `.rela.dyn`,
- `.rela.plt`, `.got`, `.got.plt`, `.plt`, `.dynamic`, `.data.rel.ro`
- (for relocs in initialized data, RELRO target).
-2. PT_LOAD bucket-aware placement:
- - Read-only dyn-data (.interp, .dynsym, .dynstr, .gnu.hash,
- .rela.dyn, .rela.plt) goes into the existing R bucket (or its
- own R PT_LOAD before .text).
- - .plt is RX → joins SEG_RX.
- - .got, .got.plt, .data.rel.ro → SEG_RW.
-3. PT_DYNAMIC, PT_INTERP, PT_PHDR, PT_GNU_RELRO synthesis in
- `link_elf.c`'s phdr build loop.
-4. Drop the hardcoded `IMAGE_BASE = 0x400000`. For ET_DYN output, set
- `e_type = ET_DYN`, `IMAGE_BASE = 0`, `e_entry = entry vaddr`
- (already image-relative). For ET_EXEC, leave today's value. The
- sym-table / reloc-apply code needs to read this from the image
- instead of the macro.
-
-### 3.4 PLT/GOT emit
-
-AArch64 PLT0 + per-symbol PLT entry per the psABI. Each PLT entry is
-16 bytes:
-
-```
-adrp x16, .got.plt + slot ; ADR_PREL_PG_HI21
-ldr x17, [x16, #lo12(slot)] ; LDST64_ABS_LO12_NC
-add x16, x16, #lo12(slot) ; ADD_ABS_LO12_NC
-br x17
-```
-
-PLT0 is the standard 32-byte stub that calls `_dl_runtime_resolve`
-through `.got.plt[1..2]`. Lazy binding works out of the box if
-`.got.plt` slots are initialized to `&PLT0`; **eager binding (BIND_NOW)
-is simpler to implement** — initialize all `.got.plt` slots from
-`.rela.plt` at startup, no PLT0 trickery needed. Recommend
-**BIND_NOW-only as v1**; add lazy later if perf demands.
-
-### 3.5 Reloc apply, dynamic edition
-
-1. PIE-aware abs reloc: when `output is PIE` AND the reloc target is
- `defined && !imported` (an internal symbol), the reloc value is
- **left as the image-relative vaddr** in the file, AND a
- `R_AARCH64_RELATIVE` dyn-reloc is emitted against that file
- offset. The loader adds the load-base.
-2. Imported abs reloc: emit `R_AARCH64_GLOB_DAT` against the .got
- slot. The CALL26/JUMP26 to an imported function instead targets
- that symbol's .plt entry; the .plt entry's `.got.plt` slot is the
- `R_AARCH64_JUMP_SLOT` target.
-3. ADR_GOT_PAGE / LD64_GOT_LO12_NC already-applied path needs to
- route imported syms through the new `.got` (vs. today's `.got` is
- only used for IFUNC).
-
-### 3.6 ELF emit
-
-1. `e_type` selection by output kind: ET_DYN for `-shared` and for
- `-pie` exe; ET_EXEC for `-static` exe (today's path).
-2. New phdr emitters: PT_PHDR (offset/size of the phdr table itself),
- PT_INTERP, PT_DYNAMIC, PT_GNU_RELRO, PT_GNU_STACK (PF_R|PF_W,
- filesz=0 — non-executable stack marker).
-3. Section header emit for `.dynsym` (`SHT_DYNSYM`), `.dynstr`
- (`SHT_STRTAB`), `.gnu.hash` (`SHT_GNU_HASH`), `.dynamic`
- (`SHT_DYNAMIC`), `.rela.dyn`/`.rela.plt` (`SHT_RELA` with
- `sh_info=0` and proper `sh_link` to `.dynsym`).
-4. .dynamic body: emit DT_* entries per §1's list.
-
-### 3.7 Driver
-
-1. `-dynamic-linker PATH` / `--dynamic-linker=PATH` flag → carries
- into a new `CfreeLinkOptions.interp_path`. (Or default per-target:
- `/lib/ld-musl-aarch64.so.1` for aarch64-linux-musl.)
-2. `.so` recognition in argv: filename ending in `.so` (or
- `.so.<digit>...`) routes to a new `CfreeLinkInputs.dso_bytes`
- array, separate from `obj_bytes`.
-3. `-l<name>` honors `-Bdynamic`: `driver/lib_resolve.c` looks for
- `lib<name>.so` first under `-Bdynamic`, then `lib<name>.a`. Today
- it presumably does archive-only.
-4. Default: when any DSO input or `-pie` is present, output is ET_DYN
- with a default interp; otherwise ET_EXEC (current behavior).
-
-### 3.8 Public API *(items 1–2 done in Phase 2; item 3 is Phase 7)*
-
-1. ~~`CfreeLinkInputs` gains `dso_bytes` + `ndso_bytes` fields~~ —
- landed.
-2. ~~`CfreeLinkOptions` gains `interp_path` and a `pie` flag~~ —
- landed (kept as two scalar fields rather than a single
- `output_kind` enum; Phase 4/6 will decide whether to fold them).
-3. `cfree_link_shared` stub at `src/api/pipeline.c` becomes a thin
- wrapper that dispatches into the same layout as `link_exe` but
- with `output_kind = SHARED` (no PT_INTERP, no entry symbol
- required, allow_undefined=1) — Phase 7.
-
----
-
-## 4. Implementation plan in phases
-
-Each phase is independently testable against `test/musl/run.sh`'s
-dynamic variant. Phases (1)-(3) are the ELF-reader cleanup that
-unblocks every later step; (4)-(8) are the actual link work.
-
-### Phase 1 — ELF reader: accept ET_DYN as a DSO input *(done)*
-
-Files: `src/obj/elf_read.c`, `src/obj/elf_reloc_aarch64.c`,
-`src/obj/elf.h`, `src/obj/obj.h`, `src/link/link.{h,c}`.
-
-- ~~Add `read_elf_dso`~~. Returns an `ObjBuilder` (not a sibling
- `DsoBuilder` — see Status notes); the consumer's `LinkInput`
- carries the soname separately.
-- ~~`LINK_INPUT_DSO_BYTES` enum + `link_add_dso_bytes` API.~~
-- ~~New RelocKinds (GLOB_DAT, JUMP_SLOT, RELATIVE, COPY) wired
- through `elf_aarch64_reloc_{to,from}`.~~
-- ~~DSO input is *parsed but not laid out*~~ — its exported dynsym
- entries are searchable during `resolve_undefs` but it contributes
- no sections to the image. `resolve_symbols` and
- `link_symbols_to_sections` short-circuit on
- `LINK_INPUT_DSO_BYTES`.
-
-Test: ✓ harness no longer fails at "rela sh_info 0 out of range";
-ELF read of `libc.so` succeeds.
-
-### Phase 2 — Driver: `-dynamic-linker`, `.so` inputs *(done)*
-
-Files: `driver/ld.c`, `driver/lib_resolve.{h,c}`, `driver/cc.c`
-(call-site update), `include/cfree.h`, `src/api/pipeline.c` (DSO
-input plumbing).
-
-- ~~Parse `-dynamic-linker` / `--dynamic-linker [=]PATH`~~; plumbed
- through `CfreeLinkOptions.interp_path`.
-- ~~Recognize `.so` / `.so.N` filenames~~ via
- `driver_is_so_filename`; positional shared inputs route to
- `LdOptions.dsos[]` → `CfreeLinkInputs.dso_bytes`.
-- ~~`-l<name>` under `-Bdynamic` finds `lib<name>.so` first~~ via
- `LibResolveMode`. `cc.c` uses `LIB_RESOLVE_STATIC_ONLY` (driver
- default unchanged); `ld.c` picks the mode from the current
- `-Bstatic`/`-Bdynamic` state.
-- `CfreeLinkOptions.pie` carries `-pie` through to the linker for
- Phase 6 to consume.
-
-Test: ✓ harness invokes cfree with `-pie` and `libc.so` end-to-end;
-failure is now in the link's emit stage, not a parse error.
-
-### Phase 3 — Resolve: imported-undef path *(done)*
-
-Files: `src/link/link_layout.c`, `src/link/link.h`.
-
-- ~~`LinkSymbol.imported`, `dso_input_id`,
- `needs_{plt,got,copy}` flags.~~ Declared in `link.h`; only
- `imported` and `dso_input_id` are populated today (the `needs_*`
- flags are reserved for Phase 4–5 decisions).
-- ~~`resolve_undefs` extension~~ via `find_dso_export`: walks DSO
- inputs in input order before the resolver/weak-zero/panic
- fallback. On hit, marks the symbol imported and stamps
- `dso_input_id`. The DSO's soname (already on `LinkInput.soname`)
- is the eventual DT_NEEDED entry; collecting the actually-used
- set into the image is Phase 4.
-- ~~Emit-time decisions deferred~~ — imported syms are no longer
- fatal but still have no vaddr. They'll panic the reloc apply
- path the moment a CALL26 / ABS64 / ADR_GOT_PAGE targets one,
- which is the wedge for Phase 4–5.
-
-Test: ✓ link reaches emit. Dynamic harness moved from `(link)`
-failures to `(run rc=139)` runtime crashes — the produced binary
-has no PT_INTERP / PT_DYNAMIC / .plt yet, so the loader can't bind
-it. All 3 static cases still pass; all 756 cg tests, 118 link tests,
-and the elf/ar/lib-deps suites still pass (no regression).
-
-### Phase 4 — Synthetic dyn-tables *(done)*
-
-Files: new `src/link/link_dyn.c`, hooked from `link_layout.c`.
-
-- ~~Walk LinkSymbols, partition imports into PLT (function) / GOT
- (data) slot lists.~~ `collect_imports` walks LinkSyms (canonical
- entry per `img->globals` only), classifies SK_FUNC/SK_IFUNC as
- PLT-bound and the rest as GOT-bound.
-- ~~Build `.dynsym`/`.dynstr`/`.gnu.hash` from the imported set
- plus any `--export-dynamic` exports.~~ Imports + DT_NEEDED
- sonames land in `.dynstr`; STN_UNDEF + each import becomes a
- `.dynsym` entry; `.gnu.hash` is a small psABI hash with one
- bloom word and shift=6. `--export-dynamic` exports remain a
- follow-up — not exercised by the musl harness.
-- ~~Allocate image-relative bytes for `.plt`, `.got.plt`,
- `.rela.plt`, `.rela.dyn`, `.dynamic`, `.interp`.~~ One R-perm
- segment carries `.interp` / `.dynsym` / `.dynstr` / `.gnu.hash`
- / `.rela.dyn` / `.rela.plt` / `.dynamic`; an RX segment carries
- `.plt` (zero body, see Phase 5) and an RW segment carries
- `.got.plt`. The existing `.got` (from `layout_got`) is reused
- as-is for IFUNC / weak-extern slots; imported-data GOT slots
- are deferred to Phase 5.
-- JUMP_SLOT records are pre-populated against the matching
- `.got.plt` slot vaddrs so `.rela.plt` is content-complete from
- Phase 4. GLOB_DAT records for data imports have placeholder
- `r_offset = 0` until Phase 5 wires up the imported-data .got
- slots.
-
-Test: ✓ build clean, all link/api objects compile under
-`-Wpedantic -Wextra -Werror`. The image now has a complete dyn
-infrastructure visible via `readelf -d -S`; running it still
-faults because Phase 5 hasn't been implemented.
-
-### Phase 5 — PLT body emit + reloc rewriting *(medium)*
-
-Files: `src/link/link_dyn.c`, `src/link/link_reloc.c`.
-
-- Emit AArch64 PLT0 + per-symbol PLT entries.
-- Rewrite CALL26/JUMP26 against imported syms to target the symbol's
- PLT entry (S = plt_vaddr).
-- Rewrite ADR_GOT_PAGE / LD64_GOT_LO12_NC against imports to target
- the new `.got` slot.
-- Emit `R_AARCH64_JUMP_SLOT` records into `.rela.plt`,
- `R_AARCH64_GLOB_DAT` into `.rela.dyn`.
-
-Test: `01_syscall_write` (no libc calls) should still link and run;
-`02_errno_touch` exercises the import path for `close` and `errno`.
-
-### Phase 6 — PIE / ET_DYN emit *(done)*
-
-Files: `src/link/link_elf.c`, `src/api/pipeline.c`,
-`src/link/link.{h,c}`.
-
-- ~~Plumb `output_kind` through to emit.~~ Two scalar fields
- (`Linker.emit_pie` + `Sym Linker.interp_path`) set by
- `cfree_link_exe` from `opts->pie` / `opts->interp_path`;
- mirrored to `LinkImage.pie` during resolve. Open question §5.4
- resolved in favour of the parallel-flag option.
-- ~~Set `e_type`, `IMAGE_BASE`, PT_INTERP, PT_DYNAMIC, PT_PHDR,
- PT_GNU_STACK.~~ `e_type = pie ? ET_DYN : ET_EXEC`; the previous
- `IMAGE_BASE` macro is now `IMAGE_BASE_STATIC` and a runtime
- `img_base` (0 under PIE) replaces it everywhere. PT_PHDR /
- PT_INTERP / PT_DYNAMIC / PT_GNU_STACK emit when PIE.
- PT_GNU_RELRO is intentionally omitted — `ro_seg` already lives
- in a PF_R-only PT_LOAD, so RELRO is implicit.
-- ~~For PIE: emit `R_AARCH64_RELATIVE` against any internal
- absolute reloc.~~ `apply_all_relocs` now takes `img_base` and
- appends a RELATIVE record to `img->dyn->rela_dyn` whenever it
- sees a defined-non-imported R_ABS{32,64} target under PIE. The
- patch site is left at the image-relative vaddr; the loader adds
- the load-base via the RELATIVE relocation.
-- ~~Drop `IMAGE_BASE` macro use; read from image.~~ Done — every
- prior `IMAGE_BASE` site reads `img_base` derived from
- `img->pie`.
-- `.dynamic` body is built post-shift inside the emit pass, with
- DT_NEEDED for each tracked soname plus DT_STRTAB / DT_SYMTAB /
- DT_GNU_HASH / DT_PLTGOT / DT_PLTRELSZ / DT_PLTREL=RELA /
- DT_JMPREL / DT_RELA / DT_RELASZ / DT_RELAENT / DT_FLAGS_1 |
- DF_1_NOW / DT_NULL. Section-header overrides for
- `.dynsym`/`.dynstr`/`.gnu.hash`/`.rela.{dyn,plt}`/`.dynamic`/
- `.got.plt` set the proper sh_type/sh_link/sh_info/sh_entsize so
- `readelf` prints them correctly.
-
-Test: ✓ build clean. The full musl harness will still run-fail
-until Phase 5 is implemented (CALL26 against imported `__libc_start_main`
-panics during reloc apply).
-
-### Phase 7 — `cfree_link_shared` for real *(small after 4-6)*
+### Phase 7 — `cfree_link_shared` (small)
Files: `src/api/pipeline.c`, `src/link/`.
-- Replace the panic at `pipeline.c:413` with a dispatch into the
- same machinery as link_exe, with `output_kind = SHARED`,
- `allow_undefined=1`, no entry-symbol requirement, DT_SONAME from
- `opts->soname`, DT_R(UN)PATH from `opts->r(un)paths`, exports
- promoted into dynsym from `opts->exports`.
-- New harness case: build a `libfoo.so` from a single .c; link an
- exe against it; run.
-
-### Phase 8 — TLS GD/IE/LD, IRELATIVE *(separate effort)*
-
-Out of scope for v1 dynamic exe. Required for shared-lib TLS and for
-IFUNCs in dynamic outputs. Deferred — the static-exe IFUNC plan in
-`linker-status.md §"Plan: STT_GNU_IFUNC in ELF output"` covers the
-near-term surface.
-
----
-
-## 5. Open questions
-
-1. **DsoBuilder vs. ObjBuilder reuse.** *(resolved during Phase 1.)*
- Phase 1 reused ObjBuilder rather than introducing a sibling type;
- `LINK_INPUT_DSO_BYTES` plus a soname field on `LinkInput` was
- enough surface and the §4 work didn't need anything richer. The
- migration to a separate type stays cheap if a future need
- surfaces — the reader call sites and the input enum are the only
- external surface.
-
-2. **Lazy vs. eager binding (BIND_NOW).** *(resolved as eager.)*
- Phase 4 emits `DT_FLAGS_1 = DF_1_NOW` and pre-sizes `.got.plt`
- with 3 reserved slots + one zero-initialized slot per import; no
- PLT0 trampoline / `_dl_runtime_resolve` plumbing. The loader
- patches every slot from `.rela.plt` before user code runs. Phase
- 5 still needs to wire CALL26 → `.plt` so the slot reads happen
- on the right path; the lazy-vs-eager binding choice is decoupled.
-
-3. **Where the dyn-link state lives in the image.** *(resolved.)*
- `LinkImage.dyn` is a single owned `LinkDynState*`; segments and
- sections are appended via the existing `realloc`-grow allocator,
- matching `layout_iplt`. Cleanup runs from `link_image_release`.
-
-4. **`emit_static_exe` flag stays.** *(resolved.)* Added a
- parallel `Linker.emit_pie` flag and a `LinkImage.pie` mirror.
- `emit_static_exe`'s meaning is unchanged (IFUNC startup-init
- gating); the two flags are orthogonal.
-
-5. **Versioned symbols (`.gnu.version_r`, `.gnu.version`).** musl
- doesn't use them; glibc does. v1 ignores versions on read
- (matches GNU ld's behavior with unversioned objects against
- versioned libs — the unversioned default version is taken). Adding
- write-side versioning is a follow-up that's invisible to the musl
- harness.
-
-6. **`.eh_frame_hdr` interaction.** Listed as a near-term gap in
- `linker-status.md`. It needs PT_GNU_EH_FRAME and is independent
- of dynamic linking, but the dyn-link work touches the same phdr
- synthesis code. Land `.eh_frame_hdr` first if it sequences in the
- same window — the phdr count growth is shared.
-
----
-
-## 6. Test plan
-
-`test/musl/run.sh dynamic` is the integration test, accessible via
-`make test-musl` (the target declares the sysroot, runtime, and
-driver binary as Make prereqs so a fresh checkout boots cleanly).
-Per-phase expected progressions:
-
-| Phase | `01_syscall_write` | `02_errno_touch` | `03_printf_hello` |
-|--------:|---------------------|-------------------|-------------------|
-| pre | link: rela sh_info | link: rela sh_info| link: rela sh_info|
-| 1 | link: model gap | link: model gap | link: model gap |
-| 2 | link: model gap | link: model gap | link: model gap |
-| 3 | run rc=139 | run rc=139 | run rc=139 |
-| **4+6** | **link: CALL26 oor**| **link: CALL26 oor** | **link: CALL26 oor** |
-| 5 | run pass | run pass | run pass |
-
-(Bold row = current state.) Phases 4 and 6 landed as a pair without
-Phase 5 between them; the structural state of the produced binary
-is now correct (`readelf -d -S` shows `.dynsym`/`.dynstr`/`.gnu.hash`/
-`.rela.dyn`/`.rela.plt`/`.plt`/`.got.plt`/`.dynamic`, and the phdr
-table carries PT_PHDR/PT_INTERP/PT_DYNAMIC/PT_GNU_STACK), but the
-link itself fails earlier than `run` because the existing reloc apply
-panics on a CALL26 against an imported (vaddr=0) target. Phase 5 is
-the wedge that turns this into a runnable binary.
-
-Phases 1–2 didn't surface as the intermediate states predicted in
-the original plan because the implementation landed Phases 1+2+3 in
-sequence inside a single session — there was never a build that
-exposed the "Phase 1 only" or "Phase 2 only" failure shapes.
-
-A unit-level harness for the synthetic-section builder (Phase 4) is
-worth adding under `test/link/dyn/` — round-trip the `.dynsym` /
-`.gnu.hash` / `.rela.{dyn,plt}` against `readelf -d -r --dyn-syms`
-output for a hand-crafted input. This is faster than waiting for a
-full musl run to surface a malformed `.dynamic`.
+Replace the panic at `pipeline.c:413` with a dispatch into the same
+machinery as `link_exe`, with:
+- `output_kind = SHARED` (no PT_INTERP, no entry-symbol requirement,
+ `allow_undefined = 1`).
+- DT_SONAME from `opts->soname`.
+- DT_R(UN)PATH from `opts->r(un)paths`.
+- Exports promoted into `.dynsym` from `opts->exports`.
+
+Add a harness case under `test/musl/` (or a new `test/link/dyn/`):
+build `libfoo.so` from a single `.c`, link an exe against it, run.
+
+### Phase 8 — TLS GD/IE/LD, IRELATIVE (deferred)
+
+Required for shared-lib TLS and IFUNCs in dynamic outputs. Out of
+scope for the v1 dynamic exe; the musl harness doesn't exercise them.
+
+### Polish / follow-ups (none blocking)
+
+- **`--export-dynamic`**: promote internal globals into `.dynsym`.
+ Mechanical; not exercised by the musl harness.
+- **`.gnu.hash` sort-by-bucket**: current code assumes hashed
+ symbols land contiguously in `.dynsym`. Fine for small import
+ sets; needs a sort pass before scaling.
+- **`--as-needed`**: today every DSO with a soname gets a DT_NEEDED.
+ Plumb the flag through to filter on actual import use.
+- **Versioned symbols** (`.gnu.version` / `.gnu.version_r`): musl
+ doesn't use them; glibc does.
+- **Lazy binding**: would need a real `_dl_runtime_resolve` PLT0
+ reference. Skip until perf demands it.
+- **Unit-level dyn-table harness** under `test/link/dyn/`: round-trip
+ `.dynsym` / `.gnu.hash` / `.rela.{dyn,plt}` / `.plt` body against
+ `readelf -d -r --dyn-syms` and `objdump -d --section=.plt`. Faster
+ than waiting on a full musl run to catch a malformed `.dynamic`
+ or mis-encoded PLT stub.
+
+## Open questions
+
+1. **Versioned symbols.** v1 ignores versions on read (matches GNU
+ ld's behavior with unversioned objects against versioned libs —
+ the unversioned default version is taken). Write-side versioning
+ is a follow-up that's invisible to the musl harness.
+
+2. **`.eh_frame_hdr` interaction.** A near-term gap in
+ `linker-status.md` independent of dynamic linking, but it touches
+ the same phdr synthesis code. If it sequences in the same window,
+ land it alongside Phase 7 — phdr count growth is shared.
diff --git a/doc/linker-status.md b/doc/linker-status.md
@@ -20,23 +20,28 @@ live in `test/link/` — they are not duplicated in `test/elf/`.
| `test-link` E | 37 | 0 | qemu/podman aarch64 exec, incl. IFUNC |
| `test-link` J | 38 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS |
| `test-link` bad | 2 | 0 | `bad/30_undef_strong` (E + J) |
-| `test-musl` | 3 | 0 | static musl 1.2.5: syscall, errno, printf |
+| `test-musl` | 6 | 0 | musl 1.2.5 static + dynamic: syscall, errno, printf |
(R = roundtrip; E = link → aarch64 ELF → qemu/podman; J = JIT in-process.)
-`test-musl` links real C against pinned musl libc.a + cfree's own
-`rt/build/aarch64-linux/libcfree_rt.a` (TF / soft-float builtins) and
-runs the result under qemu/podman. Sysroot is produced by
-`test/musl/Containerfile` (Alpine 3.20 + musl 1.2.5-r3). Excluded from
-the default `make test` because it needs podman.
+`test-musl` links real C against pinned musl 1.2.5 in two variants:
+**static** (libc.a + cfree's own `rt/build/aarch64-linux/libcfree_rt.a`,
+classic ET_EXEC) and **dynamic** (libc.so + Scrt1.o, ET_DYN PIE with
+PT_INTERP / PT_DYNAMIC / PLT / .got.plt and BIND_NOW resolution
+against the runtime loader). Both variants run the result under
+qemu/podman. Sysroot is produced by `test/musl/Containerfile`
+(Alpine 3.20 + musl 1.2.5-r3). Excluded from the default `make test`
+because it needs podman.
---
## What works today
-`cfree ld` links real static aarch64-linux executables, including
-against musl libc.a + cfree's own `libcfree_rt.a`. printf("hello, musl")
-works end-to-end. Beyond that:
+`cfree ld` links real aarch64-linux executables in both **static**
+ET_EXEC and **dynamic** ET_DYN PIE shapes, including against real
+musl libc.a / libc.so + cfree's own `libcfree_rt.a`. `printf("hello,
+musl")` works end-to-end against the runtime loader
+(`/lib/ld-musl-aarch64.so.1`). Beyond that:
- **Reloc kinds applied:** ABS{16,32,64}, PREL{16}, REL32, PC32,
CONDBR19, TSTBR14, LD_PREL_LO19, ADR_PREL_LO21, JUMP26 / CALL26,
@@ -45,7 +50,9 @@ works end-to-end. Beyond that:
ADR_GOT_PAGE / LD64_GOT_LO12_NC,
TLSLE_ADD_TPREL_{HI12,LO12_NC}. Plus a synthetic R_ABS64 emitter
for GOT slot fill. **Reads every reloc kind in musl 1.2.5 aarch64
- libc.a.**
+ libc.a.** Dynamic emit pass also produces R_AARCH64_RELATIVE,
+ R_AARCH64_GLOB_DAT, and R_AARCH64_JUMP_SLOT records (.rela.dyn /
+ .rela.plt) for the runtime loader.
- **Symbol resolution:** STB_GLOBAL/WEAK/LOCAL replacement strength;
STV_HIDDEN; SHN_COMMON coalesce-to-largest; STT_FILE / STT_SECTION
pass-through. Weak archive defs satisfy unresolved refs (matches
@@ -121,6 +128,22 @@ works end-to-end. Beyond that:
toolchains that scan `.eh_frame` linearly; fast lookup via
`.eh_frame_hdr` + PT_GNU_EH_FRAME is still TODO (binary search
index over FDEs).
+- **Dynamic linking against `.so` deps:** `cfree ld -pie -o out
+ Scrt1.o crti.o user.o libc.so libcfree_rt.a crtn.o` produces an
+ ET_DYN PIE that runs against the musl runtime loader. The driver
+ parses `-dynamic-linker`, recognizes `.so` / `.so.N` positional
+ inputs, and routes `-l<name>` under `-Bdynamic` to `lib<name>.so`
+ before `lib<name>.a`. The link image carries a synthetic
+ `.interp` / `.dynsym` / `.dynstr` / `.gnu.hash` /
+ `.rela.dyn` / `.rela.plt` / `.plt` / `.got.plt` / `.dynamic`
+ layout, with PT_PHDR / PT_INTERP / PT_DYNAMIC / PT_GNU_STACK
+ phdrs, DT_NEEDED per consumed DSO soname, and `DF_1_NOW`
+ (BIND_NOW eager binding). PLT0 + per-import 16-byte stubs are
+ emitted; CALL26 / JUMP26 against an imported symbol is rewritten
+ to its PLT entry, and abs / GOT-slot references against imports
+ emit `R_AARCH64_GLOB_DAT` so the loader patches the resolved
+ runtime address before user code runs. PIE internal abs64
+ fixups emit `R_AARCH64_RELATIVE`.
- **Driver:** `cfree ld -static -o out crt1.o crti.o user.o libc.a
libcfree_rt.a crtn.o` works. Output is chmod 0755 on success.
- **JIT path** runs the same resolved image in-process; MAP_JIT on
@@ -137,16 +160,17 @@ ordered by how often the gap actually bites.
|-----|-------------|--------|
| **`.eh_frame_hdr` + PT_GNU_EH_FRAME** | `.eh_frame` already flows through with a proper shdr; without `.eh_frame_hdr` libgcc/libunwind fall back to linear FDE scan, and `dl_iterate_phdr` consumers (most modern unwinders) skip the section entirely. Needs FDE parsing + sorted binary-search table emission. | medium |
| **`.debug_*` in the exe** | No DWARF → `gdb` blind on source lines. cfree's debug pipeline ends at the obj boundary; the linker drops non-`SF_ALLOC` sections. | medium |
-| **TLSGD / TLSIE / TLSLD relocs** | Read but not applied. Needed for `-fpic` TLS or shared-lib TLS — moot until dynamic linking lands. | medium |
-| **Dynamic linking: PT_DYNAMIC, PT_INTERP, PLT, DT_NEEDED** | Cannot link against any `.so`. Static-only. | large |
-| **PIE / ET_DYN executables** | Driver accepts `-pie` but the writer always emits ET_EXEC at fixed `IMAGE_BASE`. Tied to dynamic-linking work. | medium (depends on dynamic) |
+| **TLSGD / TLSIE / TLSLD relocs** | Read but not applied. Needed for `-fpic` TLS or shared-lib TLS. Local-exec works; the dyn-link cut leaves GD/IE/LD as Phase 8. | medium |
+| **`cfree_link_shared` (`-shared` ET_DYN libs)** | Driver and inputs are wired (DSO read, dyn tables) but `cfree_link_shared` still panics with "not yet implemented". The parallel path through `link_exe` would only need `output_kind = SHARED`, `allow_undefined = 1`, no entry sym, DT_SONAME, exports promoted into dynsym. | small |
+| **`--export-dynamic` exports in dynsym** | Imports are in `.dynsym`; internal exports the consumer wants visible to dependents (e.g. dlopen plugins, callbacks the loader resolves) are not yet promoted. Not exercised by the musl harness. | small |
| **Linker scripts** | `link_set_script` panics with "not yet implemented". Parser exists in `cfree_link_script_parse` but isn't wired into `link_resolve`. | medium |
| **COMDAT-group atomicity in `--gc-sections`** | C++ inline / weak-template instantiations under `SHF_GROUP` could lose group members. C-only inputs don't exercise it. | small |
| **`crt1.o`/`crti.o`/`crtn.o` auto-link** | Driver doesn't auto-include a C runtime; the user passes `crt1.o crti.o ... crtn.o` explicitly. Cosmetic. | small (driver-only) |
-**Bottom line:** for static aarch64-linux executables, `cfree ld` is
-already a working linker — including against real musl, and
-including STT_GNU_IFUNC in ELF output (rt-driven preinit). The next
+**Bottom line:** for aarch64-linux executables — both static ET_EXEC
+and dynamic ET_DYN PIE against real musl — `cfree ld` is a working
+linker. STT_GNU_IFUNC in ELF output (rt-driven preinit) and BIND_NOW
+dynamic linking against `.so` deps both pass end-to-end. The next
priorities, roughly in order:
1. **`.eh_frame_hdr` + PT_GNU_EH_FRAME** — `.eh_frame` already flows
@@ -155,10 +179,13 @@ priorities, roughly in order:
libgcc's `_Unwind_Find_FDE`).
2. **`.debug_*` in the exe** — DWARF flow-through; the linker
currently drops non-`SF_ALLOC` sections at `section_kept`.
+3. **`cfree_link_shared`** — the dyn-table machinery is reusable
+ from the exe path; producing `libfoo.so` is mostly a dispatch
+ wrapper plus exports-into-dynsym.
-After those the next big lift is full dynamic linking (PT_DYNAMIC +
-PLT + PT_INTERP + DT_NEEDED), which also unlocks PIE output and TLS
-GD / IE / LD modes.
+TLS GD / IE / LD modes remain Phase 8 work; lazy-binding (no
+`DF_1_NOW`) is a follow-up that needs a real `_dl_runtime_resolve`
+PLT0 — eager binding is fine for v1.
The IFUNC iplt stub bytes (`0x90000010 / 0xf9400210 / 0xd61f0200`)
are still hand-encoded inline in `layout_iplt`; moving them behind
diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c
@@ -33,6 +33,7 @@
#include <string.h>
+#include "core/bytes.h"
#include "core/heap.h"
#include "core/pool.h"
#include "core/util.h"
@@ -77,8 +78,7 @@ static u32 dyn_alloc_sections(LinkImage* img, u32 nsec) {
LinkSection* nsections = (LinkSection*)h->realloc(
h, img->sections, sizeof(*img->sections) * img->nsections,
sizeof(*img->sections) * new_nsec, _Alignof(LinkSection));
- if (!nsections)
- compiler_panic(img->c, no_loc(), "link: oom on dyn sections");
+ if (!nsections) compiler_panic(img->c, no_loc(), "link: oom on dyn sections");
img->sections = nsections;
return base;
}
@@ -166,14 +166,39 @@ typedef struct ImportLists {
} ImportLists;
static int sym_is_func_import(const LinkSymbol* s) {
- /* Imports may carry any kind from the DSO's dynsym; we treat
- * STT_FUNC (SK_FUNC) as PLT-bound and everything else (data /
- * notype / TLS) as GOT-bound. The loader-side distinction is the
- * same: JUMP_SLOT vs GLOB_DAT. */
+ /* Most undef shadows have kind = SK_UNDEF (the obj reader keys kind
+ * off shndx, not STT_*), so this check only fires when the canonical
+ * entry carried a real type; collect_imports also tries the DSO lookup. */
return s->kind == SK_FUNC || s->kind == SK_IFUNC;
}
-static void collect_imports(LinkImage* img, Heap* h, ImportLists* il) {
+/* Resolve an import's classifier kind by consulting its providing
+ * DSO's dynsym. read_elf_dso preserves STT_FUNC / STT_OBJECT / etc.
+ * on each defined export; the consumer's undef may have arrived as
+ * SK_UNDEF (clang emits external refs as SHN_UNDEF, which the reader
+ * collapses to SK_UNDEF regardless of STT_*). Returns 1 for func /
+ * ifunc, 0 for everything else (or if the DSO export is missing). */
+static int dso_export_is_func(Linker* l, const LinkSymbol* s) {
+ if (s->dso_input_id == LINK_INPUT_NONE) return 0;
+ if (s->dso_input_id - 1u >= LinkInputs_count(&l->inputs)) return 0;
+ LinkInput* in = LinkInputs_at(&l->inputs, s->dso_input_id - 1u);
+ if (!in->obj) return 0;
+ ObjSymIter* it = obj_symiter_new(in->obj);
+ ObjSymEntry e;
+ int is_func = 0;
+ while (obj_symiter_next(it, &e)) {
+ const ObjSym* es = e.sym;
+ if (!es || es->name != s->name) continue;
+ if (es->kind == SK_UNDEF) continue;
+ is_func = (es->kind == SK_FUNC || es->kind == SK_IFUNC);
+ break;
+ }
+ obj_symiter_free(it);
+ return is_func;
+}
+
+static void collect_imports(Linker* l, LinkImage* img, Heap* h,
+ ImportLists* il) {
u32 i;
u32 cap_f = 0, cap_d = 0;
il->funcs = NULL;
@@ -186,7 +211,8 @@ static void collect_imports(LinkImage* img, Heap* h, ImportLists* il) {
/* Only the canonical (img->globals) entry per name. */
LinkSymId canonical = symhash_get(&img->globals, s->name);
if (canonical != LINK_SYM_NONE && canonical != s->id) continue;
- if (sym_is_func_import(s)) {
+ int is_func = sym_is_func_import(s) || dso_export_is_func(l, s);
+ if (is_func) {
if (VEC_GROW(h, il->funcs, cap_f, il->nfuncs + 1u))
compiler_panic(img->c, no_loc(), "link: oom on import-funcs");
il->funcs[il->nfuncs++] = s->id;
@@ -274,8 +300,7 @@ static void build_dynsym(LinkImage* img, LinkDynState* dyn,
dyn->ndynsym = ndynsym;
dyn->dynsym = (DynSymRec*)h->alloc(h, sizeof(*dyn->dynsym) * ndynsym,
_Alignof(DynSymRec));
- if (!dyn->dynsym)
- compiler_panic(img->c, no_loc(), "link: oom on dynsym");
+ if (!dyn->dynsym) compiler_panic(img->c, no_loc(), "link: oom on dynsym");
memset(dyn->dynsym, 0, sizeof(*dyn->dynsym) * ndynsym);
/* Slot 0: STN_UNDEF. dynstr leads with a NUL so st_name=0 reads as
@@ -289,13 +314,19 @@ static void build_dynsym(LinkImage* img, LinkDynState* dyn,
* upper bound. Clean (zero-filled) by alloc convention; we set
* indices for imports below. */
dyn->sym_dynidx_size = LinkSyms_count(&img->syms) + 1u;
- dyn->sym_dynidx =
- (u32*)h->alloc(h, sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size,
- _Alignof(u32));
+ dyn->sym_dynidx = (u32*)h->alloc(
+ h, sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size, _Alignof(u32));
if (!dyn->sym_dynidx)
compiler_panic(img->c, no_loc(), "link: oom on sym_dynidx");
- memset(dyn->sym_dynidx, 0,
- sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size);
+ memset(dyn->sym_dynidx, 0, sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size);
+ /* sym_plt_vaddr is filled in by layout_dyn's .rela.plt emit loop; here
+ * we only allocate and zero the parallel array. */
+ dyn->sym_plt_vaddr = (u64*)h->alloc(
+ h, sizeof(*dyn->sym_plt_vaddr) * dyn->sym_dynidx_size, _Alignof(u64));
+ if (!dyn->sym_plt_vaddr)
+ compiler_panic(img->c, no_loc(), "link: oom on sym_plt_vaddr");
+ memset(dyn->sym_plt_vaddr, 0,
+ sizeof(*dyn->sym_plt_vaddr) * dyn->sym_dynidx_size);
/* All imports have STB_GLOBAL so first_global is right after the
* single STN_UNDEF slot. (When local exports land via
@@ -325,8 +356,10 @@ static void build_dynsym(LinkImage* img, LinkDynState* dyn,
size_t namelen = 0;
const char* nm = pool_str(img->c->global, s->name, &namelen);
u8 elf_type = STT_OBJECT;
- if (s->kind == SK_TLS) elf_type = STT_TLS;
- else if (s->kind == SK_NOTYPE) elf_type = STT_NOTYPE;
+ if (s->kind == SK_TLS)
+ elf_type = STT_TLS;
+ else if (s->kind == SK_NOTYPE)
+ elf_type = STT_NOTYPE;
r->st_name = bb_append_str(dynstr, nm, (u32)namelen);
r->st_info = ELF64_ST_INFO(STB_GLOBAL, elf_type);
r->st_other = STV_DEFAULT;
@@ -476,13 +509,13 @@ void layout_dyn(Linker* l, LinkImage* img) {
/* PT_INTERP path. Default to musl's aarch64 loader when not set;
* the only target this cut supports is aarch64-linux. */
- dyn->interp_path = l->interp_path
- ? l->interp_path
- : pool_intern_cstr(l->c->global,
- "/lib/ld-musl-aarch64.so.1");
+ dyn->interp_path =
+ l->interp_path
+ ? l->interp_path
+ : pool_intern_cstr(l->c->global, "/lib/ld-musl-aarch64.so.1");
/* Step 1: enumerate imports + DT_NEEDED. */
- collect_imports(img, h, &imports);
+ collect_imports(l, img, h, &imports);
collect_needed(l, img, dyn);
/* Step 2: build .dynstr + .dynsym. .dynstr must also carry the
@@ -527,24 +560,24 @@ void layout_dyn(Linker* l, LinkImage* img) {
if (imports.nfuncs && !dyn->rela_plt)
compiler_panic(img->c, no_loc(), "link: oom on rela_plt");
- /* RELA dyn: GLOB_DAT for each data import + reserve for RELATIVE
- * records emitted during reloc-apply. Cap chosen large enough for
- * the test/musl harness; fail loudly if exceeded. */
+ /* RELA dyn: capacity for GLOB_DAT (data imports referenced via .got)
+ * + RELATIVE (PIE internal abs64 fixups) + any direct-abs imports.
+ * Phase 5 emits all entries dynamically during reloc-apply; layout
+ * just reserves space. Cap chosen large enough for the test/musl
+ * harness; apply panics loudly if exceeded. */
u32 cap_rel = 4096u;
- u32 base_rela_dyn = imports.ndatas; /* GLOB_DAT entries */
- dyn->cap_rela_dyn = base_rela_dyn + cap_rel;
- dyn->rela_dyn = dyn->cap_rela_dyn
- ? (DynRela*)h->alloc(h, sizeof(DynRela) * dyn->cap_rela_dyn,
- _Alignof(DynRela))
- : NULL;
+ dyn->cap_rela_dyn = cap_rel;
+ dyn->rela_dyn =
+ dyn->cap_rela_dyn
+ ? (DynRela*)h->alloc(h, sizeof(DynRela) * dyn->cap_rela_dyn,
+ _Alignof(DynRela))
+ : NULL;
if (dyn->cap_rela_dyn && !dyn->rela_dyn)
compiler_panic(img->c, no_loc(), "link: oom on rela_dyn");
- dyn->nrela_dyn = base_rela_dyn;
- if (base_rela_dyn) memset(dyn->rela_dyn, 0, sizeof(DynRela) * base_rela_dyn);
+ dyn->nrela_dyn = 0;
size_t namelen;
- const char* interp_str =
- pool_str(l->c->global, dyn->interp_path, &namelen);
+ const char* interp_str = pool_str(l->c->global, dyn->interp_path, &namelen);
u64 interp_bytes = (u64)namelen + 1u;
u64 dynsym_bytes = (u64)dyn->ndynsym * 24u;
u64 dynstr_bytes = (u64)dyn->dynstr_len;
@@ -636,7 +669,8 @@ void layout_dyn(Linker* l, LinkImage* img) {
img->segment_bytes_cap[ro_seg_idx] = (size_t)ro_seg_size;
if (ro_seg_size && !img->segment_bytes[ro_seg_idx])
compiler_panic(img->c, no_loc(), "link: oom on ro dyn segment");
- if (ro_seg_size) memset(img->segment_bytes[ro_seg_idx], 0, (size_t)ro_seg_size);
+ if (ro_seg_size)
+ memset(img->segment_bytes[ro_seg_idx], 0, (size_t)ro_seg_size);
if (has_plt) {
LinkSegment* rx_seg = &img->segments[rx_seg_idx];
@@ -653,8 +687,87 @@ void layout_dyn(Linker* l, LinkImage* img) {
img->segment_bytes_cap[rx_seg_idx] = (size_t)plt_bytes;
if (!img->segment_bytes[rx_seg_idx])
compiler_panic(img->c, no_loc(), "link: oom on .plt segment");
- /* Body left zero — Phase 5 owns PLT0 + per-slot stubs. */
memset(img->segment_bytes[rx_seg_idx], 0, (size_t)plt_bytes);
+ /* Stash plt / got.plt vaddrs now — the PLT body emit just below
+ * reads them, and the post-shift fixup in shift_image_addresses
+ * (link_elf.c) keys on these fields too. */
+ dyn->plt_vaddr = rx_vaddr;
+ dyn->plt_size = plt_bytes;
+ dyn->got_plt_vaddr = rw_vaddr;
+ dyn->got_plt_size = gotplt_bytes;
+ /* PLT body emit (Phase 5).
+ *
+ * AArch64 psABI PLT layout:
+ *
+ * PLT0 (32 B):
+ * stp x16, x30, [sp, #-16]!
+ * adrp x16, page(.got.plt + 16)
+ * ldr x17, [x16, #lo12(.got.plt + 16)]
+ * add x16, x16, #lo12(.got.plt + 16)
+ * br x17
+ * nop ; nop ; nop
+ *
+ * per-import (16 B), entry i targets .got.plt[3 + i]:
+ * adrp x16, page(slot)
+ * ldr x17, [x16, #lo12(slot)]
+ * add x16, x16, #lo12(slot)
+ * br x17
+ *
+ * Encoded with raw bit-twiddling — no LinkRelocApply records.
+ * shift_image_addresses later bumps .plt and .got.plt by the same
+ * delta; provided that delta is a multiple of 4 KiB, the ADRP page
+ * displacement and the lo12 bits computed here stay valid.
+ * Under DF_1_NOW the loader patches every .got.plt slot from
+ * .rela.plt before user code runs, so PLT0's resolve stub is
+ * never executed, but it is still emitted in canonical form for
+ * disassembler / unwinder consumption.
+ *
+ * Encoding bases (Rd / Rn / Rt fixed at x16 / x16 / x17):
+ * adrp x16, sym : 0x90000010 | (immlo<<29) | (immhi<<5)
+ * ldr x17, [x16, #i12] : 0xF9400211 | (i12_scaled<<10)
+ * add x16, x16, #i12 : 0x91000210 | (i12<<10)
+ * br x17 : 0xD61F0220
+ * stp x16, x30, [sp,#-16]! : 0xa9bf7bf0
+ * nop : 0xD503201F
+ */
+ {
+ u8* plt_b = img->segment_bytes[rx_seg_idx];
+ /* PLT0: load .got.plt[2] (resolver) into x17 and tail-call. */
+ u64 plt0_pc = dyn->plt_vaddr + 4u;
+ u64 slot2 = dyn->got_plt_vaddr + 16u;
+ i64 page_disp = ((i64)slot2 & ~(i64)0xfff) - ((i64)plt0_pc & ~(i64)0xfff);
+ i64 imm21 = page_disp >> 12;
+ u32 immlo = (u32)(imm21 & 0x3);
+ u32 immhi = (u32)((imm21 >> 2) & 0x7ffff);
+ u32 lo12 = (u32)(slot2 & 0xfff);
+ u32 imm12_ldr = (lo12 >> 3) & 0xfff; /* slot is 8-byte aligned */
+ wr_u32_le(plt_b + 0, 0xa9bf7bf0u);
+ wr_u32_le(plt_b + 4, 0x90000010u | (immlo << 29) | (immhi << 5));
+ wr_u32_le(plt_b + 8, 0xF9400211u | (imm12_ldr << 10));
+ wr_u32_le(plt_b + 12, 0x91000210u | (lo12 << 10));
+ wr_u32_le(plt_b + 16, 0xD61F0220u);
+ wr_u32_le(plt_b + 20, 0xD503201Fu);
+ wr_u32_le(plt_b + 24, 0xD503201Fu);
+ wr_u32_le(plt_b + 28, 0xD503201Fu);
+ /* Per-import 16-byte entries. */
+ u32 ki;
+ for (ki = 0; ki < imports.nfuncs; ++ki) {
+ u64 entry_vaddr = dyn->plt_vaddr + 32u + 16u * (u64)ki;
+ u64 slot_vaddr = dyn->got_plt_vaddr + 8u * (3u + ki);
+ i64 e_page_disp =
+ ((i64)slot_vaddr & ~(i64)0xfff) - ((i64)entry_vaddr & ~(i64)0xfff);
+ i64 e_imm21 = e_page_disp >> 12;
+ u32 e_immlo = (u32)(e_imm21 & 0x3);
+ u32 e_immhi = (u32)((e_imm21 >> 2) & 0x7ffff);
+ u32 e_lo12 = (u32)(slot_vaddr & 0xfff);
+ u32 e_imm12_ldr = (e_lo12 >> 3) & 0xfff;
+ u8* p = plt_b + 32u + 16u * (u64)ki;
+ wr_u32_le(p + 0, 0x90000010u | (e_immlo << 29) | (e_immhi << 5));
+ wr_u32_le(p + 4, 0xF9400211u | (e_imm12_ldr << 10));
+ wr_u32_le(p + 8, 0x91000210u | (e_lo12 << 10));
+ wr_u32_le(p + 12, 0xD61F0220u);
+ }
+ }
LinkSegment* rw_seg = &img->segments[rw_seg_idx];
memset(rw_seg, 0, sizeof(*rw_seg));
@@ -677,11 +790,6 @@ void layout_dyn(Linker* l, LinkImage* img) {
* the loader replaces every slot before user code runs, so zero
* is a fine starting state. */
memset(img->segment_bytes[rw_seg_idx], 0, (size_t)gotplt_bytes);
-
- dyn->plt_vaddr = rx_vaddr;
- dyn->plt_size = plt_bytes;
- dyn->got_plt_vaddr = rw_vaddr;
- dyn->got_plt_size = gotplt_bytes;
}
img->nsegments += nseg;
@@ -703,7 +811,7 @@ void layout_dyn(Linker* l, LinkImage* img) {
Sym name_plt = pool_intern_cstr(l->c->global, ".plt");
Sym name_got_plt = pool_intern_cstr(l->c->global, ".got.plt");
-#define INIT_SEC(IDX, NAME, SEG_IDX, OFF_IN_SEG, SIZE, ALIGN, FLAGS, SEM) \
+#define INIT_SEC(IDX, NAME, SEG_IDX, OFF_IN_SEG, SIZE, ALIGN, FLAGS, SEM) \
do { \
LinkSection* ls = &img->sections[sec_base + (IDX)]; \
memset(ls, 0, sizeof(*ls)); \
@@ -721,27 +829,34 @@ void layout_dyn(Linker* l, LinkImage* img) {
ls->sem = (SEM); \
} while (0)
- INIT_SEC(0, name_interp, ro_seg_idx, interp_off, interp_bytes, 1, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(1, name_dynsym, ro_seg_idx, dynsym_off, dynsym_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(2, name_dynstr, ro_seg_idx, dynstr_off, dynstr_bytes, 1, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(3, name_gnu_hash, ro_seg_idx, gnuhash_off, gnuhash_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(4, name_rela_dyn, ro_seg_idx, rela_dyn_off, rela_dyn_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(5, name_rela_plt, ro_seg_idx, rela_plt_off, rela_plt_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
- INIT_SEC(6, name_dynamic, ro_seg_idx, dynamic_off, dynamic_bytes, 8, SF_ALLOC | SF_WRITE, SSEM_PROGBITS);
-
- dyn->sec_interp = (LinkSectionId)(sec_base + 0 + 1u);
- dyn->sec_dynsym = (LinkSectionId)(sec_base + 1 + 1u);
- dyn->sec_dynstr = (LinkSectionId)(sec_base + 2 + 1u);
+ INIT_SEC(0, name_interp, ro_seg_idx, interp_off, interp_bytes, 1, SF_ALLOC,
+ SSEM_PROGBITS);
+ INIT_SEC(1, name_dynsym, ro_seg_idx, dynsym_off, dynsym_bytes, 8, SF_ALLOC,
+ SSEM_PROGBITS);
+ INIT_SEC(2, name_dynstr, ro_seg_idx, dynstr_off, dynstr_bytes, 1, SF_ALLOC,
+ SSEM_PROGBITS);
+ INIT_SEC(3, name_gnu_hash, ro_seg_idx, gnuhash_off, gnuhash_bytes, 8,
+ SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(4, name_rela_dyn, ro_seg_idx, rela_dyn_off, rela_dyn_bytes, 8,
+ SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(5, name_rela_plt, ro_seg_idx, rela_plt_off, rela_plt_bytes, 8,
+ SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(6, name_dynamic, ro_seg_idx, dynamic_off, dynamic_bytes, 8,
+ SF_ALLOC | SF_WRITE, SSEM_PROGBITS);
+
+ dyn->sec_interp = (LinkSectionId)(sec_base + 0 + 1u);
+ dyn->sec_dynsym = (LinkSectionId)(sec_base + 1 + 1u);
+ dyn->sec_dynstr = (LinkSectionId)(sec_base + 2 + 1u);
dyn->sec_gnu_hash = (LinkSectionId)(sec_base + 3 + 1u);
dyn->sec_rela_dyn = (LinkSectionId)(sec_base + 4 + 1u);
dyn->sec_rela_plt = (LinkSectionId)(sec_base + 5 + 1u);
- dyn->sec_dynamic = (LinkSectionId)(sec_base + 6 + 1u);
+ dyn->sec_dynamic = (LinkSectionId)(sec_base + 6 + 1u);
dyn->dynamic_vaddr = img->segments[ro_seg_idx].vaddr + dynamic_off;
dyn->dynamic_size = dynamic_bytes;
if (has_plt) {
- INIT_SEC(7, name_plt, rx_seg_idx, 0, plt_bytes, 16,
- SF_ALLOC | SF_EXEC, SSEM_PROGBITS);
+ INIT_SEC(7, name_plt, rx_seg_idx, 0, plt_bytes, 16, SF_ALLOC | SF_EXEC,
+ SSEM_PROGBITS);
INIT_SEC(8, name_got_plt, rw_seg_idx, 0, gotplt_bytes, 8,
SF_ALLOC | SF_WRITE, SSEM_PROGBITS);
dyn->sec_plt = (LinkSectionId)(sec_base + 7 + 1u);
@@ -784,51 +899,40 @@ void layout_dyn(Linker* l, LinkImage* img) {
if (gnuhash_bytes && ro_bytes && dyn->gnu_hash)
memcpy(ro_bytes + gnuhash_off, dyn->gnu_hash, dyn->gnu_hash_len);
- /* .rela.plt: emit JUMP_SLOT records, one per imported function.
- * r_offset = .got.plt[3 + i].vaddr; r_info = (dynsym_idx<<32) |
- * R_AARCH64_JUMP_SLOT; addend = 0. The body bytes go into
- * .rela.plt's section bytes within ro_seg. */
+ /* .rela.plt: emit JUMP_SLOT records, one per imported function, and
+ * stash each import's PLT-entry vaddr in `sym_plt_vaddr` so the
+ * apply pass can redirect CALL26/JUMP26 against the import. The
+ * record's r_offset addresses the .got.plt slot the PLT stub reads
+ * through; the loader patches that slot to the resolved runtime
+ * address before user code runs (DF_1_NOW, BIND_NOW). Bytes are
+ * written here at pre-shift vaddrs; link_emit re-serializes them
+ * after shift_image_addresses bumps the dyn vaddrs by headers_load. */
{
u32 ki;
for (ki = 0; ki < imports.nfuncs; ++ki) {
LinkSymId lsid = imports.funcs[ki];
u32 dynidx = dyn->sym_dynidx[lsid];
u64 slot_vaddr = dyn->got_plt_vaddr + 8u * (3u + ki);
+ u64 plt_entry_vaddr = dyn->plt_vaddr + 32u + 16u * (u64)ki;
DynRela* r = &dyn->rela_plt[ki];
r->r_offset = slot_vaddr;
r->r_info = ELF64_R_INFO((u64)dynidx, ELF_R_AARCH64_JUMP_SLOT);
r->r_addend = 0;
- /* Serialize into segment bytes. */
+ /* Serialize into segment bytes (will be re-serialized post-shift). */
u8* p = ro_bytes + rela_plt_off + (u64)ki * 24u;
wr_u64_le(p + 0, r->r_offset);
wr_u64_le(p + 8, r->r_info);
wr_u64_le(p + 16, (u64)r->r_addend);
+ /* sym_plt_vaddr is consulted by apply_all_relocs. */
+ dyn->sym_plt_vaddr[lsid] = plt_entry_vaddr;
}
}
- /* .rela.dyn: emit GLOB_DAT records for data imports. We have not
- * allocated GOT slots for these (Phase 5 work), so for now r_offset
- * is 0 — the loader sees a no-op write. Better to allocate the
- * slot here? Phase 4's plan says "GOT slot per imported data sym",
- * but without rewiring the existing layout_got's slot reuse it's
- * cleaner to defer. Mark the gap so Phase 5 can fix it. */
- {
- u32 ki;
- for (ki = 0; ki < imports.ndatas; ++ki) {
- LinkSymId lsid = imports.datas[ki];
- u32 dynidx = dyn->sym_dynidx[lsid];
- DynRela* r = &dyn->rela_dyn[ki];
- r->r_offset = 0; /* TODO Phase 5: target's .got slot vaddr */
- r->r_info = ELF64_R_INFO((u64)dynidx, ELF_R_AARCH64_GLOB_DAT);
- r->r_addend = 0;
- u8* p = ro_bytes + rela_dyn_off + (u64)ki * 24u;
- wr_u64_le(p + 0, r->r_offset);
- wr_u64_le(p + 8, r->r_info);
- wr_u64_le(p + 16, (u64)r->r_addend);
- }
- /* Trailing capacity (cap_rela_dyn - ndatas) stays zero. emit
- * fills R_AARCH64_RELATIVE entries here as it walks abs64 relocs. */
- }
+ /* .rela.dyn entries (GLOB_DAT for imports referenced via .got, and
+ * RELATIVE for PIE internal abs fixups) are emitted by
+ * apply_all_relocs as it walks every relocation. layout_dyn
+ * leaves .rela.dyn empty here; the bytes are written post-shift in
+ * link_emit_elf_aarch64. */
/* .got.plt prelude: for BIND_NOW we leave the body zero — the
* loader patches every slot from .rela.plt before user code. Some
@@ -859,8 +963,7 @@ void link_dyn_state_free(LinkImage* img) {
Heap* h = img->heap;
LinkDynState* dyn = img->dyn;
if (!dyn) return;
- if (dyn->dynsym)
- h->free(h, dyn->dynsym, sizeof(*dyn->dynsym) * dyn->ndynsym);
+ if (dyn->dynsym) h->free(h, dyn->dynsym, sizeof(*dyn->dynsym) * dyn->ndynsym);
if (dyn->dynstr) h->free(h, dyn->dynstr, dyn->dynstr_len);
if (dyn->gnu_hash) h->free(h, dyn->gnu_hash, dyn->gnu_hash_len);
if (dyn->rela_dyn)
@@ -871,6 +974,9 @@ void link_dyn_state_free(LinkImage* img) {
if (dyn->sym_dynidx)
h->free(h, dyn->sym_dynidx,
sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size);
+ if (dyn->sym_plt_vaddr)
+ h->free(h, dyn->sym_plt_vaddr,
+ sizeof(*dyn->sym_plt_vaddr) * dyn->sym_dynidx_size);
h->free(h, dyn, sizeof(*dyn));
img->dyn = NULL;
}
diff --git a/src/link/link_elf.c b/src/link/link_elf.c
@@ -161,6 +161,28 @@ static void shift_image_addresses(LinkImage* img, u64 delta) {
/* tls_vaddr lives in the same image-relative coordinate system as
* the segments it tracks, so it bumps with them. */
if (img->tls_memsz) img->tls_vaddr += delta;
+ /* Dyn-link state mirrors a few segment / section vaddrs and pre-
+ * populated DynRela.r_offset values from layout_dyn. Bump them so
+ * the post-shift .rela.plt / .dynamic emit and apply_all_relocs see
+ * the right addresses (sym_plt_vaddr is read to redirect CALL26
+ * against imports). */
+ if (img->dyn) {
+ LinkDynState* dyn = img->dyn;
+ if (dyn->plt_vaddr) dyn->plt_vaddr += delta;
+ if (dyn->got_plt_vaddr) dyn->got_plt_vaddr += delta;
+ if (dyn->dynamic_vaddr) dyn->dynamic_vaddr += delta;
+ if (dyn->sym_plt_vaddr) {
+ u32 j;
+ for (j = 0; j < dyn->sym_dynidx_size; ++j)
+ if (dyn->sym_plt_vaddr[j]) dyn->sym_plt_vaddr[j] += delta;
+ }
+ if (dyn->rela_plt) {
+ u32 j;
+ for (j = 0; j < dyn->nrela_plt; ++j) dyn->rela_plt[j].r_offset += delta;
+ }
+ /* rela_dyn is populated by apply_all_relocs (which runs after this
+ * shift), so its records are already in post-shift coordinates. */
+ }
}
/* AArch64 ELF ABI: the per-thread TLS block starts at TP + 16 bytes
@@ -172,23 +194,35 @@ static int reloc_is_tlsle(RelocKind k) {
k == R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
}
-static int reloc_is_abs(RelocKind k) {
- return k == R_ABS32 || k == R_ABS64;
+static int reloc_is_abs(RelocKind k) { return k == R_ABS32 || k == R_ABS64; }
+
+static int reloc_is_branch26(RelocKind k) {
+ return k == R_AARCH64_CALL26 || k == R_AARCH64_JUMP26;
}
-static void emit_relative_record(LinkImage* img, u64 site_vaddr, u64 addend) {
+static void emit_dyn_record(LinkImage* img, u64 site_vaddr, u32 reloc_type,
+ u32 dynidx, i64 addend) {
LinkDynState* dyn = img->dyn;
if (!dyn || !dyn->rela_dyn) return;
if (dyn->nrela_dyn >= dyn->cap_rela_dyn) {
compiler_panic(img->c, no_loc(),
- "link: too many R_AARCH64_RELATIVE records (%u >= %u); "
- "raise cap_rela_dyn in layout_dyn",
+ "link: too many .rela.dyn records (%u >= %u); raise "
+ "cap_rela_dyn in layout_dyn",
dyn->nrela_dyn, dyn->cap_rela_dyn);
}
DynRela* r = &dyn->rela_dyn[dyn->nrela_dyn++];
r->r_offset = site_vaddr;
- r->r_info = ELF64_R_INFO((u64)0, ELF_R_AARCH64_RELATIVE);
- r->r_addend = (i64)addend;
+ r->r_info = ELF64_R_INFO((u64)dynidx, reloc_type);
+ r->r_addend = addend;
+}
+
+static void emit_relative_record(LinkImage* img, u64 site_vaddr, u64 addend) {
+ emit_dyn_record(img, site_vaddr, ELF_R_AARCH64_RELATIVE, 0, (i64)addend);
+}
+
+static void emit_globdat_record(LinkImage* img, u64 site_vaddr, u32 dynidx,
+ i64 addend) {
+ emit_dyn_record(img, site_vaddr, ELF_R_AARCH64_GLOB_DAT, dynidx, addend);
}
static void apply_all_relocs(LinkImage* img, u64 img_base) {
@@ -215,17 +249,77 @@ static void apply_all_relocs(LinkImage* img, u64 img_base) {
P_bytes = img->segment_bytes[seg->id - 1] +
(size_t)(r->write_file_offset - seg->file_offset);
+ /* Imported target: redirect / rewrite per reloc kind (Phase 5).
+ *
+ * - CALL26 / JUMP26: target the import's PLT entry. The PLT stub
+ * reads .got.plt[3+i], which the loader pre-fills via JUMP_SLOT
+ * (.rela.plt). S becomes the PLT-entry vaddr; the existing
+ * apply path computes the disp from there.
+ * - R_ABS{32,64}: leave the patch site at zero and emit a
+ * GLOB_DAT record so the loader writes the resolved address
+ * into the site at load time. This covers both
+ * layout_got-emitted .got slot fills (target = import) and any
+ * direct absolute reference in user data (e.g. a function
+ * pointer initializer).
+ * - GOT-page / LO12-NC against an import: emit_reloc_records has
+ * already redirected the target from the import to the
+ * synthetic .got slot symbol, so the apply path here sees the
+ * slot, not the import — nothing special needed; the slot's
+ * own R_ABS64 fill against the (vaddr=0) import takes the
+ * R_ABS{32,64} path described above and emits GLOB_DAT.
+ *
+ * Anything else against an imported symbol (e.g. PREL19 / ADR)
+ * is rare in real binaries and would need its own
+ * dynamic-reloc kind; panic loudly so a future test that needs
+ * it announces itself. */
+ if (tgt->imported) {
+ /* `tgt` may be a per-input shadow LinkSymbol — resolve_undefs
+ * stamps `imported = 1` on every undef matched by name, but
+ * collect_imports only stashes plt_vaddr / dynidx on the
+ * canonical entry registered in img->globals. Resolve to the
+ * canonical id before indexing the dyn-state arrays. */
+ LinkSymId canon_id = tgt->id;
+ if (tgt->name != 0) {
+ LinkSymId hit = symhash_get(&img->globals, tgt->name);
+ if (hit != LINK_SYM_NONE) canon_id = hit;
+ }
+ u32 dynidx = (img->dyn && canon_id < img->dyn->sym_dynidx_size)
+ ? img->dyn->sym_dynidx[canon_id]
+ : 0u;
+ if (reloc_is_branch26(r->kind)) {
+ u64 plt_v = (img->dyn && canon_id < img->dyn->sym_dynidx_size)
+ ? img->dyn->sym_plt_vaddr[canon_id]
+ : 0u;
+ if (plt_v == 0)
+ compiler_panic(img->c, no_loc(),
+ "link: imported sym has no PLT entry (CALL26)");
+ S = plt_v + img_base;
+ link_reloc_apply(img->c, r->kind, P_bytes, S, r->addend, P);
+ continue;
+ }
+ if (reloc_is_abs(r->kind)) {
+ if (dynidx == 0)
+ compiler_panic(img->c, no_loc(),
+ "link: imported sym has no .dynsym entry");
+ emit_globdat_record(img, r->write_vaddr, dynidx, r->addend);
+ /* Site bytes are irrelevant: the loader's GLOB_DAT writes
+ * (sym_value + r_addend) into r_offset before user code runs,
+ * overwriting whatever's there. Leaving the existing zero
+ * fill saves a write. */
+ continue;
+ }
+ compiler_panic(img->c, no_loc(),
+ "link: unhandled reloc kind %u against imported symbol",
+ (unsigned)r->kind);
+ }
+
/* PIE: an absolute reloc against a defined non-imported symbol
* stays image-relative in the file (the loader adds load-base via
- * a synthesized R_AARCH64_RELATIVE). For an imported target, this
- * cut leaves the apply path as-is — Phase 5 will rewrite to
- * GLOB_DAT/JUMP_SLOT. The image-relative S we want at the site is
- * tgt->vaddr (without img_base, which is already 0 for PIE). */
- if (pie && reloc_is_abs(r->kind) && tgt->defined && !tgt->imported &&
- tgt->kind != SK_ABS) {
- /* img_base is 0 for PIE, so S above is already image-relative.
- * Append the RELATIVE record so the loader patches load_base
- * into the site at runtime. */
+ * a synthesized R_AARCH64_RELATIVE). img_base is 0 for PIE, so
+ * S above is already image-relative — the apply writes that into
+ * the site, and the RELATIVE record (addend = tgt->vaddr) lets the
+ * loader store load_base + tgt->vaddr there at load time. */
+ if (pie && reloc_is_abs(r->kind) && tgt->defined && tgt->kind != SK_ABS) {
emit_relative_record(img, r->write_vaddr, tgt->vaddr);
}
link_reloc_apply(img->c, r->kind, P_bytes, S, r->addend, P);
@@ -532,12 +626,12 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
/* Build DT_* entries in order. Layout matches count_dynamic_entries. */
u32 written = 0;
u8* p = dyn_bytes_at;
-#define DT_PUT(TAG, VAL) \
- do { \
- wr_u64_le(p, (u64)(TAG)); \
- wr_u64_le(p + 8, (u64)(VAL)); \
- p += 16; \
- written++; \
+#define DT_PUT(TAG, VAL) \
+ do { \
+ wr_u64_le(p, (u64)(TAG)); \
+ wr_u64_le(p + 8, (u64)(VAL)); \
+ p += 16; \
+ written++; \
} while (0)
/* DT_NEEDED entries — d_un.d_val is the offset of the soname
@@ -602,10 +696,11 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
written++;
}
- /* Re-serialize .rela.dyn body (GLOB_DAT records were already in
- * place from layout_dyn; RELATIVE records were appended during
- * apply_all_relocs). The trailing capacity stays zero — readers
- * stop at first DT_NULL-equivalent (R_AARCH64_NONE has type 0). */
+ /* Re-serialize .rela.dyn body. GLOB_DAT records (abs relocs against
+ * imports: .got slot fills and direct data references) and RELATIVE
+ * records (PIE internal abs fixups) are both populated during
+ * apply_all_relocs; .rela.dyn was empty coming out of layout_dyn.
+ * Trailing capacity stays zero, i.e. no-op R_AARCH64_NONE records. */
{
u8* rd_bytes = img->segment_bytes[dseg->id - 1] +
(size_t)(sec_reladyn->file_offset - dseg->file_offset);
@@ -618,6 +713,36 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
wr_u64_le(rp + 16, (u64)rr->r_addend);
}
}
+
+ /* Re-serialize .rela.plt body. JUMP_SLOT records were written by
+ * layout_dyn at pre-shift vaddrs; shift_image_addresses bumped
+ * dyn->rela_plt[i].r_offset along with the rest, so the post-shift
+ * values match the .got.plt slot vaddrs the loader will patch. */
+ if (sec_relaplt && dyn->nrela_plt) {
+ u8* rp_bytes = img->segment_bytes[dseg->id - 1] +
+ (size_t)(sec_relaplt->file_offset - dseg->file_offset);
+ u32 i;
+ for (i = 0; i < dyn->nrela_plt; ++i) {
+ const DynRela* rr = &dyn->rela_plt[i];
+ u8* rp = rp_bytes + (u64)i * 24u;
+ wr_u64_le(rp + 0, rr->r_offset);
+ wr_u64_le(rp + 8, rr->r_info);
+ wr_u64_le(rp + 16, (u64)rr->r_addend);
+ }
+ }
+
+ /* Re-write .got.plt[0] = &.dynamic with the post-shift vaddr.
+ * layout_dyn wrote the pre-shift value into the segment bytes;
+ * shift_image_addresses bumped dyn->dynamic_vaddr so we can refill
+ * the slot here. Slots 1 and 2 (link_map cookie,
+ * _dl_runtime_resolve) are loader-owned for lazy binding; under
+ * DF_1_NOW they're never read so leaving them zero is fine. */
+ if (sec_gotplt && dyn->dynamic_vaddr) {
+ const LinkSegment* gpseg = &img->segments[sec_gotplt->segment_id - 1];
+ u8* gp_bytes = img->segment_bytes[gpseg->id - 1] +
+ (size_t)(sec_gotplt->file_offset - gpseg->file_offset);
+ wr_u64_le(gp_bytes, dyn->dynamic_vaddr);
+ }
}
/* ---- compute build-id (post-reloc, deterministic) ---- */
@@ -893,10 +1018,8 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
* PT_GNU_STACK marks the stack as non-executable (filesz=0). */
if (pie && img->dyn) {
LinkDynState* dyn = img->dyn;
- const LinkSection* sec_interp =
- &img->sections[dyn->sec_interp - 1];
- const LinkSection* sec_dynamic =
- &img->sections[dyn->sec_dynamic - 1];
+ const LinkSection* sec_interp = &img->sections[dyn->sec_interp - 1];
+ const LinkSection* sec_dynamic = &img->sections[dyn->sec_dynamic - 1];
phdrs[pi].p_type = PT_INTERP;
phdrs[pi].p_flags = PF_R;
phdrs[pi].p_offset = sec_interp->file_offset;
@@ -951,8 +1074,7 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
ehdr.e_type = pie ? ET_DYN : ET_EXEC;
ehdr.e_machine = EM_AARCH64;
ehdr.e_version = EV_CURRENT;
- ehdr.e_entry =
- img_base + LinkSyms_at(&img->syms, img->entry_sym - 1)->vaddr;
+ ehdr.e_entry = img_base + LinkSyms_at(&img->syms, img->entry_sym - 1)->vaddr;
ehdr.e_phoff = sizeof(Ehdr64);
ehdr.e_shoff = shdr_off;
ehdr.e_flags = 0;
@@ -1070,9 +1192,12 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
for (i = 0; i < noutshdr; ++i) {
Sym nm = outshdrs[i].name;
u32 ix = outshdrs[i].shdr_idx;
- if (nm == n_dynsym) idx_dynsym = ix;
- else if (nm == n_dynstr) idx_dynstr = ix;
- else if (nm == n_gotplt) idx_gotplt = ix;
+ if (nm == n_dynsym)
+ idx_dynsym = ix;
+ else if (nm == n_dynstr)
+ idx_dynstr = ix;
+ else if (nm == n_gotplt)
+ idx_gotplt = ix;
}
}
/* per-name output shdrs */
diff --git a/src/link/link_internal.h b/src/link/link_internal.h
@@ -166,8 +166,8 @@ typedef struct LinkDynState {
/* .dynsym */
LinkSectionId sec_dynsym;
DynSymRec* dynsym;
- u32 ndynsym; /* incl. slot-0 STN_UNDEF */
- u32 first_global; /* sh_info value: index of first non-local entry */
+ u32 ndynsym; /* incl. slot-0 STN_UNDEF */
+ u32 first_global; /* sh_info value: index of first non-local entry */
/* .dynstr */
LinkSectionId sec_dynstr;
@@ -185,8 +185,8 @@ typedef struct LinkDynState {
* filled in during emit. */
LinkSectionId sec_rela_dyn;
DynRela* rela_dyn;
- u32 nrela_dyn; /* number of records currently populated */
- u32 cap_rela_dyn; /* allocation capacity (records, not bytes) */
+ u32 nrela_dyn; /* number of records currently populated */
+ u32 cap_rela_dyn; /* allocation capacity (records, not bytes) */
/* .rela.plt — R_AARCH64_JUMP_SLOT, one per imported function. */
LinkSectionId sec_rela_plt;
@@ -196,8 +196,8 @@ typedef struct LinkDynState {
/* .plt — 32-byte PLT0 stub + 16 bytes per imported function. Body
- * is allocated (zero-initialized) but not emitted in Phase 4. */
+ * is allocated zero-initialized; Phase 5 emits the stub code. */
LinkSectionId sec_plt;
- u32 nplt; /* number of imported functions */
- u64 plt_vaddr; /* image-relative .plt base */
+ u32 nplt; /* number of imported functions */
+ u64 plt_vaddr; /* image-relative .plt base */
u64 plt_size;
/* .got.plt — 24 reserved bytes + 8 per PLT slot. */
@@ -218,8 +218,18 @@ typedef struct LinkDynState {
/* Per-import dynsym index, indexed by LinkSymId. 0 means "not
* imported / not in dynsym". Used by GLOB_DAT / JUMP_SLOT emit. */
- u32* sym_dynidx; /* size = sym_dynidx_size */
+ u32* sym_dynidx; /* size = sym_dynidx_size */
u32 sym_dynidx_size;
+
+ /* Per-import PLT entry vaddr, indexed by LinkSymId (Phase 5). Set
+ * for every imported function: vaddr of its 16-byte PLT stub inside
+ * `.plt`. 0 means "no PLT stub" (symbol is data-only or not
+ * imported). apply_all_relocs reads this when redirecting a
+ * CALL26/JUMP26 against an imported function — S becomes the PLT
+ * entry vaddr instead of the symbol's (zero) vaddr. The vaddrs
+ * stored here track the post-shift values (shift_image_addresses
+ * bumps them along with .plt's segment vaddr). */
+ u64* sym_plt_vaddr; /* size = sym_dynidx_size */
} LinkDynState;
struct LinkImage {