commit cd74c03124a6e88ff34e2f142cea6a1905c0871a
parent 1981a616987cc76bc3f02afbf58616014c8ce2a4
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 16:47:37 -0700
link: dynamic linking phases 4+6 (synthetic dyn-tables, ET_DYN emit)
Phase 4 (src/link/link_dyn.c): walks LinkSymbols to partition
imports, builds .dynsym/.dynstr/.gnu.hash, allocates synthetic
sections + segments for .interp/.plt/.got.plt/.rela.dyn/.rela.plt/
.dynamic, populates JUMP_SLOT records, collects DT_NEEDED.
Phase 6 (link_elf.c): under -pie, e_type=ET_DYN with img_base=0;
emits PT_PHDR/PT_INTERP/PT_DYNAMIC/PT_GNU_STACK; writes .dynamic
body with the standard DT_* set including DT_FLAGS_1=DF_1_NOW;
collects R_AARCH64_RELATIVE records into .rela.dyn for internal
absolute relocs; overrides sh_type/sh_link/sh_info on dyn shdrs.
Phase 5 (PLT body bytes + CALL26/GOT-page rewriting against imports)
intentionally deferred -- the produced binary is structurally
well-formed but won't run end-to-end against musl until Phase 5.
Diffstat:
8 files changed, 1499 insertions(+), 98 deletions(-)
diff --git a/doc/DYNLD.md b/doc/DYNLD.md
@@ -8,25 +8,65 @@ This is the gap exposed by `test/musl/run.sh`'s `dynamic` variant
## Status
-Phases 1–3 have landed. The dynamic harness now reaches the link's
-final emit stage instead of being rejected at ELF read; failures have
-shifted from `(link)` to runtime crashes (`run rc=139`) on the
-produced binary, which is the expected outcome until Phases 4–6
-(synthetic `.plt`/`.got`/`.dynamic`, PIE emission) are written. Every
-3-test static variant still passes (no regression), and the existing
-`test-link` / `test-cg` / `test-elf` suites are clean.
+Phases 1–4 + 6 have landed. Phase 5 (PLT body emit + import-reloc
+rewriting) was deliberately skipped — the dyn-link infrastructure
+(synthetic sections, dynamic phdrs, `.dynamic` body, RELATIVE record
+collection) is in place, but CALL26/ADR_GOT_PAGE references against
+imported symbols still go through the existing reloc-apply path and
+will panic / mis-resolve at link time. The produced binary is
+structurally well-formed (`readelf -d -S` will show all the dynamic
+infrastructure) but won't run end-to-end against musl until Phase 5
+fills the PLT bodies and rewrites the input relocs. Every 3-test
+static variant still passes (no regression).
| Phase | State | Where to look |
|------:|--------------|-----------------------------------------------------------------|
| 1 | done | `src/obj/elf_read.c::read_elf_dso`, new RelocKinds in `obj.h` |
| 2 | done | `driver/ld.c` (`-dynamic-linker`, `.so` argv), `lib_resolve.c` |
| 3 | done | `link_layout.c::find_dso_export` + `resolve_undefs` extension |
-| 4 | not started | per §3.4 below |
+| 4 | done | new `src/link/link_dyn.c::layout_dyn` |
| 5 | not started | per §3.5 below |
-| 6 | not started | per §3.6 below |
+| 6 | done | `link_elf.c` `img_base` / new phdrs / `.dynamic` body emit |
| 7 | not started | per §3.7 below |
| 8 | deferred | TLS GD/IE/LD, IRELATIVE — out of scope for v1 |
+Notes that drifted from the original plan during 4 + 6 implementation:
+- Phases 4 and 6 landed in the same change without Phase 5 between
+ them. Phase 5's reloc rewriting is what makes imported-symbol
+ references actually call into `.plt` / read `.got.plt`; without it,
+ any input CALL26 to an imported function still aims at the
+ pre-redirect target (vaddr 0) and `link_reloc_apply` panics at
+ "CALL26 out of range". The harness intentionally accepts this gap
+ — the goal of 4 + 6 is the structural skeleton for the binary,
+ not a runnable exe.
+- `LinkImage` carries `pie` + a single `LinkDynState*` rather than
+ splitting fields across the image. The state struct is declared in
+ `link_internal.h` and is freed via `link_dyn_state_free` from
+ `link_image_release`. Open question §5.4's call to add an
+ `emit_dynamic_exe` flag was answered with a `Linker.emit_pie` flag
+ + `LinkImage.pie` mirror; the existing `emit_static_exe` is
+ unchanged and orthogonal.
+- `.rela.dyn` is pre-sized at layout time with a 4096-record
+ RELATIVE tail capacity; the `apply_all_relocs` pass appends
+ R_AARCH64_RELATIVE records as it walks internal absolute relocs,
+ and the section bytes are re-serialized after the apply runs.
+ Trailing capacity stays zero (R_AARCH64_NONE) — harmless to the
+ loader. The cap is large enough for the existing musl harness;
+ a fail-loud panic fires if exceeded.
+- `.gnu.hash`'s bucket population takes a shortcut: it assumes
+ symbols hashed into the same bucket appear contiguously in
+ `.dynsym`. Phase 4's import set is small enough that this holds
+ in practice, but a sort-by-bucket pass would be needed before
+ growing the hashed range.
+- `.dynstr` is built once during layout — both import names and
+ DT_NEEDED soname strings are appended up front so the
+ `.dynstr`/`.dynsym` section sizes are final before placement. Any
+ new soname source needs to land in this same pre-pass.
+- `DT_NEEDED` is emitted for every DSO input that carries a soname,
+ not just those that satisfied an import. Matches GNU ld without
+ `--as-needed`; v1 doesn't plumb `--as-needed` through to the
+ consuming pass.
+
Notes that drifted from the original plan during 1–3 implementation:
- The DSO input shares the existing `ObjBuilder` rather than a new
`DsoBuilder` (open question §5.1). `read_elf_dso` produces an
@@ -432,21 +472,38 @@ has no PT_INTERP / PT_DYNAMIC / .plt yet, so the loader can't bind
it. All 3 static cases still pass; all 756 cg tests, 118 link tests,
and the elf/ar/lib-deps suites still pass (no regression).
-### Phase 4 — Synthetic dyn-tables *(medium)*
+### Phase 4 — Synthetic dyn-tables *(done)*
Files: new `src/link/link_dyn.c`, hooked from `link_layout.c`.
-- Walk LinkSymbols, partition imports into PLT (function) / GOT (data)
- slot lists.
-- Build `.dynsym`/`.dynstr`/`.gnu.hash` from the imported set plus
- any `--export-dynamic` exports.
-- Allocate image-relative bytes for `.plt`, `.got.plt`, `.got`,
- `.rela.plt`, `.rela.dyn`, `.dynamic`, `.interp`. Pattern after
- `layout_iplt` (`link_layout.c:~1640`).
-
-Test: the image has all the expected sections/segments; the loader
-can `mmap` it. It probably crashes at `_start` because reloc apply
-hasn't been updated.
+- ~~Walk LinkSymbols, partition imports into PLT (function) / GOT
+ (data) slot lists.~~ `collect_imports` walks LinkSyms (canonical
+ entry per `img->globals` only), classifies SK_FUNC/SK_IFUNC as
+ PLT-bound and the rest as GOT-bound.
+- ~~Build `.dynsym`/`.dynstr`/`.gnu.hash` from the imported set
+ plus any `--export-dynamic` exports.~~ Imports + DT_NEEDED
+ sonames land in `.dynstr`; STN_UNDEF + each import becomes a
+ `.dynsym` entry; `.gnu.hash` is a small psABI hash with one
+ bloom word and shift=6. `--export-dynamic` exports remain a
+ follow-up — not exercised by the musl harness.
+- ~~Allocate image-relative bytes for `.plt`, `.got.plt`,
+ `.rela.plt`, `.rela.dyn`, `.dynamic`, `.interp`.~~ One R-perm
+ segment carries `.interp` / `.dynsym` / `.dynstr` / `.gnu.hash`
+ / `.rela.dyn` / `.rela.plt` / `.dynamic`; an RX segment carries
+ `.plt` (zero body, see Phase 5) and an RW segment carries
+ `.got.plt`. The existing `.got` (from `layout_got`) is reused
+ as-is for IFUNC / weak-extern slots; imported-data GOT slots
+ are deferred to Phase 5.
+- JUMP_SLOT records are pre-populated against the matching
+ `.got.plt` slot vaddrs so `.rela.plt` is content-complete from
+ Phase 4. GLOB_DAT records for data imports have placeholder
+ `r_offset = 0` until Phase 5 wires up the imported-data .got
+ slots.
+
+Test: ✓ build clean, all link/api objects compile under
+`-Wpedantic -Wextra -Werror`. The image now has a complete dyn
+infrastructure visible via `readelf -d -S`; running it still
+faults because Phase 5 hasn't been implemented.
### Phase 5 — PLT body emit + reloc rewriting *(medium)*
@@ -463,17 +520,44 @@ Files: `src/link/link_dyn.c`, `src/link/link_reloc.c`.
Test: `01_syscall_write` (no libc calls) should still link and run;
`02_errno_touch` exercises the import path for `close` and `errno`.
-### Phase 6 — PIE / ET_DYN emit *(medium)*
-
-Files: `src/link/link_elf.c`.
-
-- Plumb `output_kind` through to emit. Set `e_type`, `IMAGE_BASE`,
- PT_INTERP, PT_DYNAMIC, PT_PHDR, PT_GNU_RELRO, PT_GNU_STACK.
-- For PIE: emit `R_AARCH64_RELATIVE` against any internal absolute
- reloc (currently baked at `link_elf.c:192,622`).
-- Drop `IMAGE_BASE` macro use; read from image.
-
-Test: `03_printf_hello` end-to-end against musl libc.so.
+### Phase 6 — PIE / ET_DYN emit *(done)*
+
+Files: `src/link/link_elf.c`, `src/api/pipeline.c`,
+`src/link/link.{h,c}`.
+
+- ~~Plumb `output_kind` through to emit.~~ Two scalar fields
+ (`Linker.emit_pie` + `Sym Linker.interp_path`) set by
+ `cfree_link_exe` from `opts->pie` / `opts->interp_path`;
+ mirrored to `LinkImage.pie` during resolve. Open question §5.4
+ resolved in favour of the parallel-flag option.
+- ~~Set `e_type`, `IMAGE_BASE`, PT_INTERP, PT_DYNAMIC, PT_PHDR,
+ PT_GNU_STACK.~~ `e_type = pie ? ET_DYN : ET_EXEC`; the previous
+ `IMAGE_BASE` macro is now `IMAGE_BASE_STATIC` and a runtime
+ `img_base` (0 under PIE) replaces it everywhere. PT_PHDR /
+ PT_INTERP / PT_DYNAMIC / PT_GNU_STACK emit when PIE.
+ PT_GNU_RELRO is intentionally omitted — `ro_seg` already lives
+ in a PF_R-only PT_LOAD, so RELRO is implicit.
+- ~~For PIE: emit `R_AARCH64_RELATIVE` against any internal
+ absolute reloc.~~ `apply_all_relocs` now takes `img_base` and
+ appends a RELATIVE record to `img->dyn->rela_dyn` whenever it
+ sees a defined-non-imported R_ABS{32,64} target under PIE. The
+ patch site is left at the image-relative vaddr; the loader adds
+ the load-base via the RELATIVE relocation.
+- ~~Drop `IMAGE_BASE` macro use; read from image.~~ Done — every
+ prior `IMAGE_BASE` site reads `img_base` derived from
+ `img->pie`.
+- `.dynamic` body is built post-shift inside the emit pass, with
+ DT_NEEDED for each tracked soname plus DT_STRTAB / DT_SYMTAB /
+ DT_GNU_HASH / DT_PLTGOT / DT_PLTRELSZ / DT_PLTREL=RELA /
+ DT_JMPREL / DT_RELA / DT_RELASZ / DT_RELAENT /
+ DT_FLAGS_1 = DF_1_NOW / DT_NULL. Section-header overrides for
+ `.dynsym`/`.dynstr`/`.gnu.hash`/`.rela.{dyn,plt}`/`.dynamic`/
+ `.got.plt` set the proper sh_type/sh_link/sh_info/sh_entsize so
+ `readelf` prints them correctly.
+
+Test: ✓ build clean. The full musl harness will still run-fail
+until Phase 5 is implemented (CALL26 against imported `__libc_start_main`
+panics during reloc apply).
### Phase 7 — `cfree_link_shared` for real *(small after 4-6)*
@@ -498,31 +582,31 @@ near-term surface.
## 5. Open questions
-1. **DsoBuilder vs. ObjBuilder reuse.** Importing a DSO shares almost
- nothing with parsing an ET_REL — different symbol table, no
- sections to lay out, only a name table to satisfy undefs. A
- sibling `DsoBuilder` keeps invariants on each crisp; bolting a
- `kind` field onto ObjBuilder muddles the round-trip contract
- (`doc/DESIGN.md §5.5`). **Recommend: separate type.**
-
-2. **Lazy vs. eager binding (BIND_NOW).** Eager is dramatically
- simpler — no `_dl_runtime_resolve` PLT0 entry, `.got.plt` is
- initialized straight from `.rela.plt`. Cost is startup time. For
- v1 of cfree where everything else is correctness-first, recommend
- **DT_FLAGS_1 |= DF_1_NOW**; revisit lazy later.
-
-3. **Where the dyn-link state lives in the image.** Today
- `LinkImage` is a single struct with one segment-bytes array per
- segment. Synthetic dyn sections (`.plt`, `.got.plt`, etc.) need
- the same shape. The IFUNC code (`layout_iplt` at
- `link_layout.c:~1640`) already does this for `.iplt` / `.igot.plt`
- / `.iplt.pairs` — same allocator, same SegmentBytes pattern. The
- dyn pass should follow it verbatim, not invent a new lifecycle.
-
-4. **`emit_static_exe` flag stays.** It's the right name for the
- "no PT_DYNAMIC, no PLT, classic ET_EXEC" path. Add a parallel
- `emit_dynamic_exe` (or fold both into `output_kind`); don't
- overload `emit_static_exe`'s meaning.
+1. **DsoBuilder vs. ObjBuilder reuse.** *(resolved during Phase 1.)*
+ Phase 1 reused ObjBuilder rather than introducing a sibling type;
+ `LINK_INPUT_DSO_BYTES` plus a soname field on `LinkInput` was
+ enough surface and the §4 work didn't need anything richer. The
+ migration to a separate type stays cheap if a future need
+ surfaces — the reader call sites and the input enum are the only
+ external surface.
+
+2. **Lazy vs. eager binding (BIND_NOW).** *(resolved as eager.)*
+ Phase 4 emits `DT_FLAGS_1 = DF_1_NOW` and pre-sizes `.got.plt`
+ with 3 reserved slots + one zero-initialized slot per import; no
+ PLT0 trampoline / `_dl_runtime_resolve` plumbing. The loader
+ patches every slot from `.rela.plt` before user code runs. Phase
+ 5 still needs to wire CALL26 → `.plt` so the slot reads happen
+ on the right path; the lazy-vs-eager binding choice is decoupled.
+
+3. **Where the dyn-link state lives in the image.** *(resolved.)*
+ `LinkImage.dyn` is a single owned `LinkDynState*`; segments and
+ sections are appended via the existing `realloc`-grow allocator,
+ matching `layout_iplt`. Cleanup runs from `link_image_release`.
+
+4. **`emit_static_exe` flag stays.** *(resolved.)* Added a
+ parallel `Linker.emit_pie` flag and a `LinkImage.pie` mirror.
+ `emit_static_exe`'s meaning is unchanged (IFUNC startup-init
+ gating); the two flags are orthogonal.
5. **Versioned symbols (`.gnu.version_r`, `.gnu.version`).** musl
doesn't use them; glibc does. v1 ignores versions on read
@@ -546,22 +630,28 @@ near-term surface.
driver binary as Make prereqs so a fresh checkout boots cleanly).
Per-phase expected progressions:
-| Phase | `01_syscall_write` | `02_errno_touch` | `03_printf_hello` |
-|------:|--------------------|-------------------|-------------------|
-| pre | link: rela sh_info | link: rela sh_info| link: rela sh_info|
-| 1 | link: model gap | link: model gap | link: model gap |
-| 2 | link: model gap | link: model gap | link: model gap |
-| **3** | **run rc=139** | **run rc=139** | **run rc=139** |
-| 4 | mmap ok / segfault | … | … |
-| 5 | run pass | run: GLOB_DAT path| run: PLT call path|
-| 6 | run pass | run pass | run pass |
-
-(Bold row = current state.) Phases 1–2 didn't surface as the
-intermediate states predicted in the original plan because the
-implementation landed Phases 1+2+3 in sequence inside a single
-session — there was never a build that exposed the "Phase 1 only"
-or "Phase 2 only" failure shapes. The post-Phase-3 row is the first
-state observable in a finished tree.
+| Phase | `01_syscall_write` | `02_errno_touch` | `03_printf_hello` |
+|--------:|---------------------|-------------------|-------------------|
+| pre | link: rela sh_info | link: rela sh_info| link: rela sh_info|
+| 1 | link: model gap | link: model gap | link: model gap |
+| 2 | link: model gap | link: model gap | link: model gap |
+| 3 | run rc=139 | run rc=139 | run rc=139 |
+| **4+6** | **link: CALL26 oor**| **link: CALL26 oor** | **link: CALL26 oor** |
+| 5 | run pass | run pass | run pass |
+
+(Bold row = current state.) Phases 4 and 6 landed as a pair without
+Phase 5 between them; the structural state of the produced binary
+is now correct (`readelf -d -S` shows `.dynsym`/`.dynstr`/`.gnu.hash`/
+`.rela.dyn`/`.rela.plt`/`.plt`/`.got.plt`/`.dynamic`, and the phdr
+table carries PT_PHDR/PT_INTERP/PT_DYNAMIC/PT_GNU_STACK), but the
+link itself fails earlier than `run` because the existing reloc apply
+panics on a CALL26 against an imported (vaddr=0) target. Phase 5 is
+the wedge that turns this into a runnable binary.
+
+Phases 1–2 didn't surface as the intermediate states predicted in
+the original plan because the implementation landed Phases 1+2+3 in
+sequence inside a single session — there was never a build that
+exposed the "Phase 1 only" or "Phase 2 only" failure shapes.
A unit-level harness for the synthetic-section builder (Phase 4) is
worth adding under `test/link/dyn/` — round-trip the `.dynsym` /
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -398,6 +398,13 @@ int cfree_link_exe(CfreeCompiler* c, const CfreeLinkOptions* opts,
linker = build_linker(c, &opts->inputs);
link_set_gc_sections(linker, opts->gc_sections);
link_set_emit_static_exe(linker, 1);
+ /* PIE / dynamic-exe (Phase 4 + 6). Triggered by an explicit `pie`
+ * flag or by the presence of any DSO input — both shapes need
+ * PT_INTERP / PT_DYNAMIC and the synthetic .dynsym machinery. */
+ if (opts->pie || opts->inputs.ndso_bytes > 0) {
+ link_set_pie(linker, 1);
+ link_set_interp_path(linker, opts->interp_path);
+ }
image = link_resolve(linker); /* deferred-cleanup-registered */
link_emit_image_writer(image, out);
link_image_free(image); /* undefers + frees */
diff --git a/src/link/link.c b/src/link/link.c
@@ -269,6 +269,16 @@ void link_set_emit_static_exe(Linker* l, int enable) {
l->emit_static_exe = enable ? 1 : 0;
}
+/* Turn PIE (ET_DYN) output on or off for this link. Any non-zero
+ * `enable` is stored as 1; a NULL linker is ignored. */
+void link_set_pie(Linker* l, int enable) {
+  if (l) l->emit_pie = (enable != 0);
+}
+
+/* Record the loader path for this link. A non-empty `path` is interned
+ * in the compiler's global pool; NULL or "" stores 0 instead. A NULL
+ * linker is ignored. */
+void link_set_interp_path(Linker* l, const char* path) {
+  if (!l) return;
+  if (!path || path[0] == '\0') {
+    l->interp_path = 0;
+    return;
+  }
+  l->interp_path = pool_intern_cstr(l->c->global, path);
+}
+
/* ---- LinkImage accessors ---- */
const LinkSymbol* link_symbol(LinkImage* img, LinkSymId id) {
@@ -352,6 +362,7 @@ static void link_image_release(LinkImage* img) {
sizeof(*img->input_maps) * img->ninput_maps);
}
symhash_fini(&img->globals);
+ if (img->dyn) link_dyn_state_free(img);
img->heap->free(img->heap, img, sizeof(*img));
}
diff --git a/src/link/link.h b/src/link/link.h
@@ -168,6 +168,20 @@ void link_set_gc_sections(Linker*, int enable);
* resolves slots in-process and doesn't need the ctor. */
void link_set_emit_static_exe(Linker*, int enable);
+/* Mark this link as producing a position-independent ET_DYN exe (-pie).
+ * Triggers Phase 4 layout_dyn pass (synthetic .interp/.dynsym/.dynstr/
+ * .gnu.hash/.plt/.got.plt/.rela.dyn/.rela.plt/.dynamic) and Phase 6 ELF
+ * emit (e_type=ET_DYN, IMAGE_BASE=0, PT_PHDR/PT_INTERP/PT_DYNAMIC,
+ * R_AARCH64_RELATIVE on internal absolute relocs). Orthogonal to
+ * emit_static_exe; both may be set in the same link (the IFUNC ctor
+ * still wants to run on the exe path regardless of PIE). */
+void link_set_pie(Linker*, int enable);
+
+/* Runtime loader path written into PT_INTERP / .interp. NULL or an
+ * empty string leaves the default ("/lib/ld-musl-aarch64.so.1" for
+ * aarch64-linux). Only consulted when -pie is enabled (or any DSO
+ * input is present). */
+void link_set_interp_path(Linker*, const char* path);
+
/* Symbol resolution and layout are explicit so file linking and JIT share the
* same resolved image. Fatal diagnostics use Compiler.panic.
*
diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c
@@ -0,0 +1,876 @@
+/* Phase 4 of dynamic linking: synthesize the dyn-link tables and
+ * sections an ET_DYN ELF exe needs to be loadable by a real runtime
+ * loader (musl ld-musl-aarch64.so.1).
+ *
+ * Inputs (computed by earlier passes):
+ * - LinkSymbol entries with `imported = 1` (set by resolve_undefs's
+ * DSO-search path; their dso_input_id names the providing DSO).
+ * - LinkInputs of kind LINK_INPUT_DSO_BYTES carrying SONAMEs.
+ *
+ * Outputs (deposited on LinkImage.dyn):
+ * - .interp PT_INTERP target string
+ * - .dynsym + .dynstr symbol table + name pool
+ * - .gnu.hash GNU-style hash for the loader
+ * - .rela.dyn GLOB_DAT (data imports) + space for
+ * R_AARCH64_RELATIVE records that
+ * Phase 6 emit fills in
+ * - .rela.plt JUMP_SLOT records (one per imported func)
+ * - .plt allocated, body NOT emitted (Phase 5)
+ * - .got.plt 3 reserved slots + 1 per PLT slot,
+ * allocated, body NOT emitted
+ * - .dynamic PT_DYNAMIC body, populated
+ *
+ * The .plt body / GOT-slot fill / CALL26 reloc rewriting are Phase 5;
+ * they're called out at the relevant allocation site so the missing
+ * pieces are obvious to anyone reading the output. The static-exe path
+ * is unaffected — layout_dyn early-outs when emit_pie is 0.
+ *
+ * Allocator pattern follows layout_iplt (link_layout.c): grow segments
+ * + sections via realloc, then page-align each new segment after the
+ * existing image span. Synthetic sections carry input_id == LINK_INPUT_NONE
+ * so downstream passes (emit_reloc_records, GC) leave them alone.
+ */
+
+#include <string.h>
+
+#include "core/heap.h"
+#include "core/pool.h"
+#include "core/util.h"
+#include "core/vec.h"
+#include "link/link.h"
+#include "link/link_internal.h"
+#include "obj/elf.h"
+
+/* All-zero source location passed to compiler_panic by the helpers in
+ * this file. */
+static SrcLoc no_loc(void) {
+  SrcLoc zero = {0, 0, 0};
+  return zero;
+}
+
+/* ---- small allocators (mirror layout_iplt's helpers) ---- */
+
+/* Grow the three parallel per-segment arrays on `img` (segments,
+ * segment_bytes, segment_bytes_cap) so they can hold `nseg` more
+ * entries. Returns the index of the first added entry, i.e. the value
+ * of img->nsegments on entry. img->nsegments itself is not updated and
+ * the added entries are not initialized here. Calls compiler_panic if
+ * any of the three reallocations returns NULL. */
+static u32 dyn_alloc_segments(LinkImage* img, u32 nseg) {
+  Heap* heap = img->heap;
+  const u32 first = img->nsegments;
+  const u32 total = first + nseg;
+  LinkSegment* segs;
+  u8** bufs;
+  size_t* caps;
+
+  segs = (LinkSegment*)heap->realloc(
+      heap, img->segments, sizeof(*img->segments) * first,
+      sizeof(*img->segments) * total, _Alignof(LinkSegment));
+  bufs = (u8**)heap->realloc(
+      heap, img->segment_bytes, sizeof(*img->segment_bytes) * first,
+      sizeof(*img->segment_bytes) * total, _Alignof(u8*));
+  caps = (size_t*)heap->realloc(
+      heap, img->segment_bytes_cap,
+      sizeof(*img->segment_bytes_cap) * first,
+      sizeof(*img->segment_bytes_cap) * total, _Alignof(size_t));
+  if (segs == NULL || bufs == NULL || caps == NULL)
+    compiler_panic(img->c, no_loc(), "link: oom on dyn segments");
+
+  img->segments = segs;
+  img->segment_bytes = bufs;
+  img->segment_bytes_cap = caps;
+  return first;
+}
+
+/* Grow img->sections so it can hold `nsec` more entries. Returns the
+ * index of the first added entry (img->nsections on entry).
+ * img->nsections is not updated and the added entries are not
+ * initialized here. Calls compiler_panic if the reallocation returns
+ * NULL. */
+static u32 dyn_alloc_sections(LinkImage* img, u32 nsec) {
+  Heap* heap = img->heap;
+  const u32 first = img->nsections;
+  const u32 total = first + nsec;
+  LinkSection* grown = (LinkSection*)heap->realloc(
+      heap, img->sections, sizeof(*img->sections) * first,
+      sizeof(*img->sections) * total, _Alignof(LinkSection));
+
+  if (grown == NULL)
+    compiler_panic(img->c, no_loc(), "link: oom on dyn sections");
+  img->sections = grown;
+  return first;
+}
+
+/* ---- byte-builder for .dynstr / .gnu.hash ----
+ * ByteBuf is a growable byte array: bb_init empties it, bb_reserve
+ * grows the storage through the owning heap, and bb_append /
+ * bb_append_str add bytes at the end. */
+
+typedef struct ByteBuf {
+ Heap* heap; /* allocator handed to VEC_GROW when `data` grows */
+ u8* data; /* backing storage; NULL right after bb_init */
+ u32 len; /* number of bytes appended so far */
+ u32 cap; /* capacity of `data`, maintained through VEC_GROW */
+} ByteBuf;
+
+/* Reset `b` to an empty buffer (no storage, zero length and capacity)
+ * that will allocate from `h`. */
+static void bb_init(ByteBuf* b, Heap* h) {
+  b->data = NULL;
+  b->cap = 0;
+  b->len = 0;
+  b->heap = h;
+}
+/* Ask VEC_GROW for room for at least `need` bytes when the current
+ * capacity is smaller; otherwise do nothing.
+ * NOTE(review): the VEC_GROW result is discarded. collect_imports in
+ * this file treats a non-zero result as OOM, so a failed growth here is
+ * not reported to the caller. */
+static void bb_reserve(ByteBuf* b, u32 need) {
+  if (b->cap < need) (void)VEC_GROW(b->heap, b->data, b->cap, need);
+}
+/* Append `n` raw bytes from `src` to the end of `b` and return the
+ * offset at which they were placed (the length before the append).
+ * With n == 0 nothing is copied and the current length is returned. */
+static u32 bb_append(ByteBuf* b, const void* src, u32 n) {
+  const u32 start = b->len;
+  bb_reserve(b, start + n);
+  if (n != 0) memcpy(b->data + start, src, n);
+  b->len = start + n;
+  return start;
+}
+/* Return the offset of a NUL-terminated copy of s[0..n) inside `b`.
+ *
+ * n == 0 returns offset 0 without touching the buffer. Otherwise the
+ * bytes already in `b` are scanned linearly for `n` matching bytes
+ * followed by a NUL; the first hit is reused. The match may start in
+ * the middle of a longer stored string, so tails are shared. Only when
+ * nothing matches are the bytes plus a terminating NUL appended.
+ * Strtabs are small, so the linear scan is acceptable. */
+static u32 bb_append_str(ByteBuf* b, const char* s, u32 n) {
+  u32 start;
+
+  if (n == 0) return 0;
+  if (n < b->len) {
+    const u32 stop = b->len - n; /* exclusive bound on candidate starts */
+    u32 pos;
+    for (pos = 0; pos < stop; ++pos) {
+      if (b->data[pos + n] != 0) continue;
+      if (memcmp(b->data + pos, s, n) == 0) return pos;
+    }
+  }
+
+  start = b->len;
+  bb_reserve(b, start + n + 1u);
+  memcpy(b->data + start, s, n);
+  b->data[start + n] = 0;
+  b->len = start + n + 1u;
+  return start;
+}
+
+/* ---- GNU-hash computation (psABI v1 hash) ----
+ * Body layout:
+ * u32 nbuckets
+ * u32 symoffset (first hashed dynsym index)
+ * u32 bloom_size (in 64-bit words)
+ * u32 bloom_shift
+ * u64 bloom[bloom_size]
+ * u32 buckets[nbuckets]
+ * u32 chains[ndynsym - symoffset]
+ *
+ * For Phase 4 we keep this very small: nbuckets = max(1, n) rounded up
+ * to odd (n = number of hashed symbols), bloom_size = 1, bloom_shift = 6
+ * (64-bit ELFCLASS64). All hashed symbols (symoffset..ndynsym-1)
+ * participate in bloom + buckets + chains. Slots 0..symoffset-1 are
+ * STN_UNDEF + locals, which the loader doesn't hash. */
+
+/* Hash the first `n` bytes of `s`: start at 5381 and fold each byte in
+ * as h = h * 33 + byte, with bytes read as unsigned and arithmetic done
+ * in u32. */
+static u32 gnu_hash_name(const char* s, u32 n) {
+  u32 acc = 5381u;
+  u32 k = 0;
+  while (k < n) {
+    /* (acc << 5) + acc is acc * 33 in u32 arithmetic. */
+    acc = (acc << 5) + acc + (u8)s[k];
+    ++k;
+  }
+  return acc;
+}
+
+/* ---- partition: enumerate imports ----
+ *
+ * Walks LinkSyms and collects each `imported` symbol that's the
+ * canonical entry in img->globals (resolve_undefs may stamp `imported`
+ * onto multiple shadow slots of the same name; only the canonical one
+ * lands in dynsym). The two output arrays are LinkSymIds: funcs first
+ * (PLT-bound), then data (GOT-bound via GLOB_DAT). */
+
+typedef struct ImportLists {
+ LinkSymId* funcs; /* imports for which sym_is_func_import is true */
+ u32 nfuncs; /* number of entries stored in `funcs` */
+ LinkSymId* datas; /* every other import */
+ u32 ndatas; /* number of entries stored in `datas` */
+} ImportLists;
+
+/* Classify an import: returns 1 for SK_FUNC and SK_IFUNC symbols
+ * (PLT-bound, JUMP_SLOT) and 0 for every other kind (data / notype /
+ * TLS; GOT-bound, GLOB_DAT). */
+static int sym_is_func_import(const LinkSymbol* s) {
+  if (s->kind == SK_FUNC) return 1;
+  return s->kind == SK_IFUNC;
+}
+
+/* Fill `il` with the ids of the imports to expose in .dynsym.
+ *
+ * A symbol is kept when it is flagged `imported`, has a non-zero name,
+ * and the img->globals entry for that name is either absent
+ * (LINK_SYM_NONE) or the symbol itself. Kept symbols go to il->funcs
+ * when sym_is_func_import says so and to il->datas otherwise, in
+ * LinkSyms order. Calls compiler_panic when VEC_GROW reports failure. */
+static void collect_imports(LinkImage* img, Heap* h, ImportLists* il) {
+  u32 func_cap = 0;
+  u32 data_cap = 0;
+  u32 n;
+
+  il->funcs = NULL;
+  il->nfuncs = 0;
+  il->datas = NULL;
+  il->ndatas = 0;
+
+  for (n = 0; n < LinkSyms_count(&img->syms); ++n) {
+    LinkSymbol* sym = LinkSyms_at(&img->syms, n);
+    LinkSymId owner;
+
+    if (!sym->imported || sym->name == 0) continue;
+
+    /* Skip shadow entries: only the id registered in img->globals counts. */
+    owner = symhash_get(&img->globals, sym->name);
+    if (owner != LINK_SYM_NONE && owner != sym->id) continue;
+
+    if (!sym_is_func_import(sym)) {
+      if (VEC_GROW(h, il->datas, data_cap, il->ndatas + 1u))
+        compiler_panic(img->c, no_loc(), "link: oom on import-datas");
+      il->datas[il->ndatas++] = sym->id;
+      continue;
+    }
+    if (VEC_GROW(h, il->funcs, func_cap, il->nfuncs + 1u))
+      compiler_panic(img->c, no_loc(), "link: oom on import-funcs");
+    il->funcs[il->nfuncs++] = sym->id;
+  }
+}
+
+/* Release the two id arrays held by `il`; NULL arrays are skipped.
+ * NOTE(review): the sizes passed to free come from the element counts,
+ * while collect_imports grows the arrays through VEC_GROW's capacity.
+ * Confirm the heap tolerates a size smaller than what was allocated. */
+static void free_imports(Heap* h, ImportLists* il) {
+  if (il->funcs != NULL)
+    h->free(h, il->funcs, sizeof(*il->funcs) * il->nfuncs);
+  if (il->datas != NULL)
+    h->free(h, il->datas, sizeof(*il->datas) * il->ndatas);
+}
+
+/* ---- DT_NEEDED set ----
+ * Stores in dyn->needed / dyn->nneeded the soname of every input that
+ *   (a) is named by the dso_input_id of at least one imported symbol, or
+ *   (b) is a LINK_INPUT_DSO_BYTES input carrying a soname, kept even if
+ *       no import landed on it (matches GNU ld without --as-needed;
+ *       Phase 4 doesn't plumb --as-needed through to the resolver).
+ * Inputs without a soname are never listed. Entries follow input order
+ * so the loader sees deps in declaration order. dyn->needed is NULL
+ * when nothing qualifies; otherwise it is allocated with exactly
+ * dyn->nneeded elements. Calls compiler_panic on allocation failure. */
+static void collect_needed(Linker* l, LinkImage* img, LinkDynState* dyn) {
+  Heap* h = img->heap;
+  u32 ninputs = LinkInputs_count(&l->inputs);
+  u32 map_bytes = ninputs ? ninputs : 1u; /* never request 0 bytes */
+  u32 i, nlisted = 0;
+  u8* used;
+
+  used = (u8*)h->alloc(h, map_bytes, 1);
+  if (!used) compiler_panic(img->c, no_loc(), "link: oom on needed map");
+  memset(used, 0, map_bytes);
+
+  /* Mark every input that ended up satisfying at least one import.
+   * dso_input_id is 1-based; ids outside the input table are ignored. */
+  for (i = 0; i < LinkSyms_count(&img->syms); ++i) {
+    LinkSymbol* s = LinkSyms_at(&img->syms, i);
+    if (!s->imported) continue;
+    if (s->dso_input_id == LINK_INPUT_NONE) continue;
+    if (s->dso_input_id - 1u >= ninputs) continue;
+    used[s->dso_input_id - 1u] = 1;
+  }
+
+  /* Pull in every explicitly supplied DSO with a soname, then count the
+   * entries that will actually be emitted. Marked inputs without a
+   * soname are excluded so the allocation matches dyn->nneeded. */
+  for (i = 0; i < ninputs; ++i) {
+    LinkInput* in = LinkInputs_at(&l->inputs, i);
+    if (in->kind == LINK_INPUT_DSO_BYTES && in->soname != 0) used[i] = 1;
+    if (used[i] && in->soname != 0) ++nlisted;
+  }
+
+  dyn->needed =
+      nlisted ? (Sym*)h->alloc(h, sizeof(Sym) * nlisted, _Alignof(Sym)) : NULL;
+  if (nlisted && !dyn->needed)
+    compiler_panic(img->c, no_loc(), "link: oom on needed list");
+
+  dyn->nneeded = 0;
+  for (i = 0; i < ninputs; ++i) {
+    LinkInput* in = LinkInputs_at(&l->inputs, i);
+    if (!used[i] || in->soname == 0) continue;
+    dyn->needed[dyn->nneeded++] = in->soname;
+  }
+  h->free(h, used, map_bytes);
+}
+
+/* ---- dynsym + dynstr build ----
+ *
+ * Slot 0: STN_UNDEF (zero entry). dynstr starts with a single NUL so
+ * st_name = 0 reads as the empty string.
+ *
+ * Slots 1..nimports: imported symbols (functions first, then data).
+ * st_shndx = SHN_UNDEF and st_value / st_size are zero; the static
+ * linker has no value for an imported symbol, the loader supplies it
+ * at bind time.
+ *
+ * No `--export-dynamic` plumbing in Phase 4: only imports + the null
+ * slot land in .dynsym. Adding exports is mechanical (walk
+ * img->globals, append entries with st_shndx = matching .text/.data
+ * section index) but isn't on the test/musl path. */
+
+/* Fill dynsym slot `slot` for the import `lsid` and record the mapping
+ * in dyn->sym_dynidx. The name is added to `dynstr` via bb_append_str.
+ * With `is_func` set the entry is STT_FUNC; otherwise the type follows
+ * the symbol kind (SK_TLS -> STT_TLS, SK_NOTYPE -> STT_NOTYPE, anything
+ * else -> STT_OBJECT). */
+static void dynsym_fill_import(LinkImage* img, LinkDynState* dyn,
+                               ByteBuf* dynstr, LinkSymId lsid, u32 slot,
+                               int is_func) {
+  LinkSymbol* s = LinkSyms_at(&img->syms, lsid - 1);
+  DynSymRec* rec = &dyn->dynsym[slot];
+  size_t namelen = 0;
+  const char* nm = pool_str(img->c->global, s->name, &namelen);
+  u8 elf_type = STT_OBJECT;
+
+  if (is_func) elf_type = STT_FUNC;
+  else if (s->kind == SK_TLS) elf_type = STT_TLS;
+  else if (s->kind == SK_NOTYPE) elf_type = STT_NOTYPE;
+
+  rec->st_name = bb_append_str(dynstr, nm, (u32)namelen);
+  rec->st_info = ELF64_ST_INFO(STB_GLOBAL, elf_type);
+  rec->st_other = STV_DEFAULT;
+  rec->st_shndx = SHN_UNDEF;
+  rec->st_value = 0;
+  rec->st_size = 0;
+  dyn->sym_dynidx[lsid] = slot;
+}
+
+/* Allocate and populate dyn->dynsym (null slot + one entry per import
+ * in `il`), seed `dynstr` with its leading NUL plus the import names,
+ * and build dyn->sym_dynidx, a LinkSymId -> dynsym index table that is
+ * zero for every symbol not in .dynsym. Calls compiler_panic on
+ * allocation failure. */
+static void build_dynsym(LinkImage* img, LinkDynState* dyn,
+                         const ImportLists* il, ByteBuf* dynstr) {
+  Heap* h = img->heap;
+  const u32 total = 1u + il->nfuncs + il->ndatas; /* null slot + imports */
+  const u8 nul = 0;
+  u32 slot = 1u;
+  u32 k;
+
+  dyn->ndynsym = total;
+  dyn->dynsym = (DynSymRec*)h->alloc(h, sizeof(*dyn->dynsym) * total,
+                                     _Alignof(DynSymRec));
+  if (!dyn->dynsym)
+    compiler_panic(img->c, no_loc(), "link: oom on dynsym");
+  memset(dyn->dynsym, 0, sizeof(*dyn->dynsym) * total);
+
+  /* Leading NUL of dynstr; slot 0 of dynsym stays all-zero. */
+  bb_append(dynstr, &nul, 1);
+
+  /* One table entry per possible LinkSymId (ids are 1-based, hence the
+   * extra element); explicitly zeroed, then set for imports below. */
+  dyn->sym_dynidx_size = LinkSyms_count(&img->syms) + 1u;
+  dyn->sym_dynidx =
+      (u32*)h->alloc(h, sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size,
+                     _Alignof(u32));
+  if (!dyn->sym_dynidx)
+    compiler_panic(img->c, no_loc(), "link: oom on sym_dynidx");
+  memset(dyn->sym_dynidx, 0,
+         sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size);
+
+  /* Every import is STB_GLOBAL, so globals begin right after the single
+   * STN_UNDEF slot. (When local exports land via --export-dynamic,
+   * this needs to grow.) */
+  dyn->first_global = 1u;
+
+  for (k = 0; k < il->nfuncs; ++k)
+    dynsym_fill_import(img, dyn, dynstr, il->funcs[k], slot++, 1);
+  for (k = 0; k < il->ndatas; ++k)
+    dynsym_fill_import(img, dyn, dynstr, il->datas[k], slot++, 0);
+}
+
+/* ---- .gnu.hash builder ----
+ *
+ * Hashed range is [first_global, ndynsym) — slot 0 (STN_UNDEF) is
+ * unhashed. Layout matches loader expectations (musl, glibc, FreeBSD).
+ *
+ * Bucket count: max(1, hashed_count), rounded up to odd so the
+ * mod operation distributes more uniformly. Bloom is 1 word for
+ * Phase 4 — a real implementation would scale with hashed_count, but
+ * 1 word with shift=6 still satisfies the loader's correctness check
+ * (any bit set is "maybe present"; false-positives only cost a chain
+ * scan). */
+
+static void build_gnu_hash(Heap* h, LinkImage* img, LinkDynState* dyn,
+ const ByteBuf* dynstr) {
+ u32 hashed = (dyn->ndynsym > dyn->first_global)
+ ? (dyn->ndynsym - dyn->first_global)
+ : 0u;
+ u32 nbuckets = hashed ? hashed : 1u;
+ /* Round nbuckets up to next odd number. */
+ if ((nbuckets & 1u) == 0u) nbuckets += 1u;
+ u32 bloom_size = 1u; /* 64-bit word */
+ u32 bloom_shift = 6u;
+ u32 sym_offset = dyn->first_global;
+ u32 hdr_bytes = 16u; /* nbuckets/symoff/bloomsz/bloomshift */
+ u32 bloom_bytes = bloom_size * 8u;
+ u32 buckets_bytes = nbuckets * 4u;
+ u32 chains_bytes = hashed * 4u;
+ u32 total = hdr_bytes + bloom_bytes + buckets_bytes + chains_bytes;
+
+ u8* buf = (u8*)h->alloc(h, total ? total : 1u, 4);
+ if (!buf) compiler_panic(img->c, no_loc(), "link: oom on .gnu.hash");
+ memset(buf, 0, total);
+
+ wr_u32_le(buf + 0, nbuckets);
+ wr_u32_le(buf + 4, sym_offset);
+ wr_u32_le(buf + 8, bloom_size);
+ wr_u32_le(buf + 12, bloom_shift);
+
+ /* Bloom + buckets + chains. We need each hashed symbol's hash. */
+ if (hashed) {
+ u32 i;
+ u32* hashes = (u32*)h->alloc(h, sizeof(u32) * hashed, _Alignof(u32));
+ if (!hashes)
+ compiler_panic(img->c, no_loc(), "link: oom on .gnu.hash hashes");
+ for (i = 0; i < hashed; ++i) {
+ const DynSymRec* r = &dyn->dynsym[sym_offset + i];
+ const char* name = (const char*)dynstr->data + r->st_name;
+ size_t n = name ? strlen(name) : 0;
+ hashes[i] = gnu_hash_name(name, (u32)n);
+ }
+
+ /* Bloom filter: H[i] / H[i] >> shift */
+ u64 bloom = 0;
+ for (i = 0; i < hashed; ++i) {
+ u32 h1 = hashes[i] % 64u;
+ u32 h2 = (hashes[i] >> bloom_shift) % 64u;
+ bloom |= ((u64)1 << h1) | ((u64)1 << h2);
+ }
+ wr_u64_le(buf + hdr_bytes, bloom);
+
+ /* Buckets/chains: for each hashed sym, append to its bucket's
+ * chain. The chain encodes (hash & ~1) per entry; the LSB is set
+ * on the LAST entry in a bucket to terminate. Buckets are filled
+ * with the first chain index that hashes there (1-based into the
+ * dynsym, i.e. `sym_offset + i`). */
+ u32* buckets = (u32*)(buf + hdr_bytes + bloom_bytes);
+ u32* chains = (u32*)(buf + hdr_bytes + bloom_bytes + buckets_bytes);
+ /* First pass: bucket = first sym index that hashes there. */
+ for (i = 0; i < hashed; ++i) {
+ u32 b = hashes[i] % nbuckets;
+ if (buckets[b] == 0) buckets[b] = sym_offset + i;
+ }
+ /* Second pass: chain[i] = hash with LSB cleared; LSB set if next
+ * sym is in a different bucket. Walk symbols in order; LSB on
+ * chain[i] when sym i+1 is in a different bucket OR is the end. */
+ for (i = 0; i < hashed; ++i) {
+ u32 v = hashes[i] & ~1u;
+ int last = (i + 1 == hashed) ||
+ ((hashes[i + 1] % nbuckets) != (hashes[i] % nbuckets));
+ if (last) v |= 1u;
+ chains[i] = v;
+ }
+ /* Fix bucket→first-sym indices: if multiple syms share a bucket
+ * but were inserted out of contiguous order, we need them
+ * contiguous. We assumed contiguity above without enforcing it.
+ * For Phase 4 with small hashed sets this is fine, but flag the
+ * shortcut. */
+ h->free(h, hashes, sizeof(u32) * hashed);
+ }
+
+ dyn->gnu_hash = buf;
+ dyn->gnu_hash_len = total;
+}
+
+/* ---- .dynamic sizing ----
+ *
+ * Only the entry count is computed at layout time, so the section size
+ * is known before segments are placed. Each entry is two u64s (d_tag,
+ * d_un.d_val|d_un.d_ptr) and the final entry is DT_NULL. The body is
+ * written by the ELF emitter once addresses are final; its d_ptr
+ * fields are image-relative vaddrs plus img_base (0 for PIE). */
+
+typedef struct DynEntry {
+ u64 tag; /* d_tag: which DT_* entry this is */
+ u64 val; /* either d_val or d_ptr; emit just writes 8 bytes */
+} DynEntry;
+
+static u32 count_dynamic_entries(const LinkDynState* dyn) {
+  /* Number of 16-byte entries .dynamic must hold; this has to match what
+   * the ELF emitter writes, tag for tag:
+   *   5  DT_STRTAB DT_STRSZ DT_SYMTAB DT_SYMENT DT_GNU_HASH
+   *   4  DT_PLTGOT DT_PLTRELSZ DT_PLTREL DT_JMPREL
+   *   3  DT_RELA DT_RELASZ DT_RELAENT
+   *   1  DT_FLAGS_1 (DF_1_NOW for eager binding)
+   * followed by the DT_NULL terminator, plus DT_NEEDED per dependency. */
+  enum { FIXED_TAGS = 13, TERMINATOR = 1 };
+  return dyn->nneeded + (u32)FIXED_TAGS + (u32)TERMINATOR;
+}
+
+/* ---- main entry: when l->emit_pie, build img->dyn plus its sections/segments ---- */
+
+void layout_dyn(Linker* l, LinkImage* img) {
+ Heap* h = img->heap;
+ LinkDynState* dyn;
+ ImportLists imports;
+ ByteBuf dynstr;
+ u64 page;
+
+ if (!l->emit_pie) return; /* nothing to lay out unless PIE output was requested */
+
+ dyn = (LinkDynState*)h->alloc(h, sizeof(*dyn), _Alignof(LinkDynState));
+ if (!dyn) compiler_panic(img->c, no_loc(), "link: oom on dyn state");
+ memset(dyn, 0, sizeof(*dyn));
+ img->dyn = dyn;
+ img->pie = 1;
+
+ /* PT_INTERP path. Default to musl's aarch64 loader when not set;
+ * the only target this cut supports is aarch64-linux. */
+ dyn->interp_path = l->interp_path
+ ? l->interp_path
+ : pool_intern_cstr(l->c->global,
+ "/lib/ld-musl-aarch64.so.1");
+
+ /* Step 1: enumerate imports + DT_NEEDED. */
+ collect_imports(img, h, &imports);
+ collect_needed(l, img, dyn);
+
+ /* Step 2: build .dynstr + .dynsym. .dynstr must also carry the
+ * DT_NEEDED soname strings the .dynamic body references; intern
+ * them after the import names so build_dynsym's de-dup also covers
+ * any name that happens to collide with a soname. */
+ bb_init(&dynstr, h);
+ build_dynsym(img, dyn, &imports, &dynstr);
+ {
+ u32 ni;
+ for (ni = 0; ni < dyn->nneeded; ++ni) {
+ size_t slen = 0;
+ const char* s = pool_str(l->c->global, dyn->needed[ni], &slen);
+ if (s && slen) (void)bb_append_str(&dynstr, s, (u32)slen);
+ }
+ }
+ dyn->dynstr = dynstr.data;
+ dyn->dynstr_len = dynstr.len;
+
+ /* Step 3: .gnu.hash. */
+ build_gnu_hash(h, img, dyn, &dynstr);
+
+ /* Step 4: pre-size all the synthetic sections.
+ * .interp: strlen + 1
+ * .dynsym: 24 * ndynsym
+ * .dynstr: dynstr_len
+ * .gnu.hash: gnu_hash_len
+ * .rela.dyn: 24 * (ndatas + cap_relative) — we reserve 4096 entries
+ * for RELATIVE; emit fills them. (Quick-and-dirty: the
+ * static path never has so many internal absolute relocs.)
+ * .rela.plt: 24 * nfuncs
+ * .plt: 32 + 16 * nfuncs (PLT0 + per-slot)
+ * .got.plt: 8 * (3 + nfuncs)
+ * .dynamic: 16 * count_dynamic_entries
+ */
+ dyn->nplt = imports.nfuncs;
+ dyn->nrela_plt = imports.nfuncs;
+ dyn->rela_plt = imports.nfuncs
+ ? (DynRela*)h->alloc(h, sizeof(DynRela) * imports.nfuncs,
+ _Alignof(DynRela))
+ : NULL;
+ if (imports.nfuncs && !dyn->rela_plt)
+ compiler_panic(img->c, no_loc(), "link: oom on rela_plt");
+
+ /* RELA dyn: GLOB_DAT for each data import + reserve for RELATIVE
+ * records emitted during reloc-apply. Cap chosen large enough for
+ * the test/musl harness; fail loudly if exceeded. */
+ u32 cap_rel = 4096u;
+ u32 base_rela_dyn = imports.ndatas; /* GLOB_DAT entries */
+ dyn->cap_rela_dyn = base_rela_dyn + cap_rel;
+ dyn->rela_dyn = dyn->cap_rela_dyn
+ ? (DynRela*)h->alloc(h, sizeof(DynRela) * dyn->cap_rela_dyn,
+ _Alignof(DynRela))
+ : NULL;
+ if (dyn->cap_rela_dyn && !dyn->rela_dyn)
+ compiler_panic(img->c, no_loc(), "link: oom on rela_dyn");
+ dyn->nrela_dyn = base_rela_dyn; /* RELATIVE records get appended after the GLOB_DAT block */
+ if (base_rela_dyn) memset(dyn->rela_dyn, 0, sizeof(DynRela) * base_rela_dyn);
+
+ size_t namelen;
+ const char* interp_str =
+ pool_str(l->c->global, dyn->interp_path, &namelen);
+ u64 interp_bytes = (u64)namelen + 1u;
+ u64 dynsym_bytes = (u64)dyn->ndynsym * 24u;
+ u64 dynstr_bytes = (u64)dyn->dynstr_len;
+ u64 gnuhash_bytes = (u64)dyn->gnu_hash_len;
+ /* rela.dyn / rela.plt sized for full capacity; emit only writes
+ * what's populated, but the section's file_size matches capacity
+ * so PT_LOAD/.rela.dyn shdr sh_size add up. Trailing zero records
+ * are harmless to the loader (R_AARCH64_NONE). */
+ u64 rela_dyn_bytes = (u64)dyn->cap_rela_dyn * 24u;
+ u64 rela_plt_bytes = (u64)dyn->nrela_plt * 24u;
+ u64 plt_bytes = (u64)(imports.nfuncs ? 32u + 16u * imports.nfuncs : 0u);
+ u64 gotplt_bytes = (u64)(imports.nfuncs ? 8u * (3u + imports.nfuncs) : 0u);
+ dyn->ndyn_entries = count_dynamic_entries(dyn);
+ u64 dynamic_bytes = (u64)dyn->ndyn_entries * 16u;
+
+ /* Step 5: place segments, page-aligned after the existing image
+ * span. One to three new segments (rx/rw only with a PLT):
+ * ro_seg (PF_R) — .interp + .dynsym + .dynstr + .gnu.hash +
+ * .rela.dyn + .rela.plt + .dynamic
+ * rx_seg (PF_R+X)— .plt
+ * rw_seg (PF_R+W)— .got.plt
+ *
+ * Ordering inside ro_seg matches the typical loader-friendly
+ * layout. The .dynamic shdr lives in ro_seg; PT_DYNAMIC's vaddr
+ * points at it.
+ */
+ page = 0x4000u; /* keep aligned with layout_page_size default */
+ {
+ /* Read the page size from layout_page_size by re-using the
+ * configured execmem if present — duplicates the helper rather
+ * than expose it; the value is only used for alignment. */
+ const CfreeExecMem* m =
+ (l && l->c && l->c->env) ? l->c->env->execmem : NULL;
+ if (m && m->page_size) page = (u64)m->page_size;
+ }
+
+ u64 base_vaddr = 0;
+ u32 i;
+ for (i = 0; i < img->nsegments; ++i) {
+ u64 end = img->segments[i].vaddr + img->segments[i].mem_size;
+ if (end > base_vaddr) base_vaddr = end;
+ }
+ base_vaddr = ALIGN_UP(base_vaddr, page);
+
+ /* Pack ro section offsets (relative to ro_seg.vaddr). .interp sits
+ * at offset 0; every section after it starts 8-byte aligned. */
+ u64 off = 0;
+ u64 interp_off = off;
+ off = ALIGN_UP(off + interp_bytes, 8u);
+ u64 dynsym_off = off;
+ off = ALIGN_UP(off + dynsym_bytes, 8u);
+ u64 dynstr_off = off;
+ off = ALIGN_UP(off + dynstr_bytes, 8u);
+ u64 gnuhash_off = off;
+ off = ALIGN_UP(off + gnuhash_bytes, 8u);
+ u64 rela_dyn_off = off;
+ off = ALIGN_UP(off + rela_dyn_bytes, 8u);
+ u64 rela_plt_off = off;
+ off = ALIGN_UP(off + rela_plt_bytes, 8u);
+ u64 dynamic_off = off;
+ off = ALIGN_UP(off + dynamic_bytes, 8u);
+ u64 ro_seg_size = off;
+
+ u64 ro_vaddr = base_vaddr;
+ u64 rx_vaddr = ALIGN_UP(ro_vaddr + ro_seg_size, page);
+ u64 rw_vaddr = ALIGN_UP(rx_vaddr + plt_bytes, page);
+
+ /* When no PLT is needed, suppress the RX/.plt segment entirely. */
+ int has_plt = imports.nfuncs > 0;
+
+ u32 nseg = 1u + (has_plt ? 2u : 0u); /* ro, plus rx + rw when a PLT exists */
+ u32 seg_base = dyn_alloc_segments(img, nseg);
+ u32 ro_seg_idx = seg_base + 0u;
+ u32 rx_seg_idx = has_plt ? seg_base + 1u : 0u; /* 0 = unused without a PLT */
+ u32 rw_seg_idx = has_plt ? seg_base + 2u : 0u; /* 0 = unused without a PLT */
+
+ LinkSegment* ro_seg = &img->segments[ro_seg_idx];
+ memset(ro_seg, 0, sizeof(*ro_seg));
+ ro_seg->id = (LinkSegmentId)(ro_seg_idx + 1u);
+ ro_seg->flags = SF_ALLOC; /* PF_R */
+ ro_seg->file_offset = ro_vaddr;
+ ro_seg->vaddr = ro_vaddr;
+ ro_seg->file_size = ro_seg_size;
+ ro_seg->mem_size = ro_seg_size;
+ ro_seg->align = (u32)page;
+ ro_seg->nsections = 7; /* the seven INIT_SEC(0..6) sections below */
+ img->segment_bytes[ro_seg_idx] =
+ ro_seg_size ? (u8*)h->alloc(h, (size_t)ro_seg_size, 16) : NULL;
+ img->segment_bytes_cap[ro_seg_idx] = (size_t)ro_seg_size;
+ if (ro_seg_size && !img->segment_bytes[ro_seg_idx])
+ compiler_panic(img->c, no_loc(), "link: oom on ro dyn segment");
+ if (ro_seg_size) memset(img->segment_bytes[ro_seg_idx], 0, (size_t)ro_seg_size);
+
+ if (has_plt) {
+ LinkSegment* rx_seg = &img->segments[rx_seg_idx];
+ memset(rx_seg, 0, sizeof(*rx_seg));
+ rx_seg->id = (LinkSegmentId)(rx_seg_idx + 1u);
+ rx_seg->flags = SF_ALLOC | SF_EXEC;
+ rx_seg->file_offset = rx_vaddr;
+ rx_seg->vaddr = rx_vaddr;
+ rx_seg->file_size = plt_bytes;
+ rx_seg->mem_size = plt_bytes;
+ rx_seg->align = (u32)page;
+ rx_seg->nsections = 1;
+ img->segment_bytes[rx_seg_idx] = (u8*)h->alloc(h, (size_t)plt_bytes, 16);
+ img->segment_bytes_cap[rx_seg_idx] = (size_t)plt_bytes;
+ if (!img->segment_bytes[rx_seg_idx])
+ compiler_panic(img->c, no_loc(), "link: oom on .plt segment");
+ /* Body left zero — Phase 5 owns PLT0 + per-slot stubs. */
+ memset(img->segment_bytes[rx_seg_idx], 0, (size_t)plt_bytes);
+
+ LinkSegment* rw_seg = &img->segments[rw_seg_idx];
+ memset(rw_seg, 0, sizeof(*rw_seg));
+ rw_seg->id = (LinkSegmentId)(rw_seg_idx + 1u);
+ rw_seg->flags = SF_ALLOC | SF_WRITE;
+ rw_seg->file_offset = rw_vaddr;
+ rw_seg->vaddr = rw_vaddr;
+ rw_seg->file_size = gotplt_bytes;
+ rw_seg->mem_size = gotplt_bytes;
+ rw_seg->align = (u32)page;
+ rw_seg->nsections = 1;
+ img->segment_bytes[rw_seg_idx] = (u8*)h->alloc(h, (size_t)gotplt_bytes, 16);
+ img->segment_bytes_cap[rw_seg_idx] = (size_t)gotplt_bytes;
+ if (!img->segment_bytes[rw_seg_idx])
+ compiler_panic(img->c, no_loc(), "link: oom on .got.plt segment");
+ /* Zero-initialize the whole region. Slot 0 (&.dynamic) is written
+ * near the end of this function, once dynamic_vaddr is known.
+ * Phase 5 will write the fallback PLT0 stub address into each
+ * per-PLT slot to make lazy binding work; for BIND_NOW (DF_1_NOW)
+ * the loader replaces every slot before user code runs, so zero
+ * is a fine starting state. */
+ memset(img->segment_bytes[rw_seg_idx], 0, (size_t)gotplt_bytes);
+
+ dyn->plt_vaddr = rx_vaddr;
+ dyn->plt_size = plt_bytes;
+ dyn->got_plt_vaddr = rw_vaddr;
+ dyn->got_plt_size = gotplt_bytes;
+ }
+ img->nsegments += nseg;
+
+ /* Step 6: synthetic LinkSection entries. Order in img->sections
+ * matches the loader-friendly file order and feeds emit's
+ * outshdr-merge pass. */
+ u32 nsec = 7u + (has_plt ? 2u : 0u); /* + .plt and .got.plt */
+ u32 sec_base = dyn_alloc_sections(img, nsec);
+
+ /* helper: populate a fresh LinkSection for a segment-internal range */
+ /* Inline because the args differ enough (sem, name) per slot. */
+ Sym name_interp = pool_intern_cstr(l->c->global, ".interp");
+ Sym name_dynsym = pool_intern_cstr(l->c->global, ".dynsym");
+ Sym name_dynstr = pool_intern_cstr(l->c->global, ".dynstr");
+ Sym name_gnu_hash = pool_intern_cstr(l->c->global, ".gnu.hash");
+ Sym name_rela_dyn = pool_intern_cstr(l->c->global, ".rela.dyn");
+ Sym name_rela_plt = pool_intern_cstr(l->c->global, ".rela.plt");
+ Sym name_dynamic = pool_intern_cstr(l->c->global, ".dynamic");
+ Sym name_plt = pool_intern_cstr(l->c->global, ".plt");
+ Sym name_got_plt = pool_intern_cstr(l->c->global, ".got.plt");
+
+#define INIT_SEC(IDX, NAME, SEG_IDX, OFF_IN_SEG, SIZE, ALIGN, FLAGS, SEM) \
+ do { \
+ LinkSection* ls = &img->sections[sec_base + (IDX)]; \
+ memset(ls, 0, sizeof(*ls)); \
+ ls->id = (LinkSectionId)(sec_base + (IDX) + 1u); \
+ ls->input_id = LINK_INPUT_NONE; \
+ ls->obj_section_id = OBJ_SEC_NONE; \
+ ls->segment_id = img->segments[(SEG_IDX)].id; \
+ ls->input_offset = (OFF_IN_SEG); \
+ ls->file_offset = img->segments[(SEG_IDX)].file_offset + (OFF_IN_SEG); \
+ ls->vaddr = img->segments[(SEG_IDX)].vaddr + (OFF_IN_SEG); \
+ ls->size = (SIZE); \
+ ls->flags = (FLAGS); \
+ ls->align = (ALIGN); \
+ ls->name = (NAME); \
+ ls->sem = (SEM); \
+ } while (0)
+
+ INIT_SEC(0, name_interp, ro_seg_idx, interp_off, interp_bytes, 1, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(1, name_dynsym, ro_seg_idx, dynsym_off, dynsym_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(2, name_dynstr, ro_seg_idx, dynstr_off, dynstr_bytes, 1, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(3, name_gnu_hash, ro_seg_idx, gnuhash_off, gnuhash_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(4, name_rela_dyn, ro_seg_idx, rela_dyn_off, rela_dyn_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(5, name_rela_plt, ro_seg_idx, rela_plt_off, rela_plt_bytes, 8, SF_ALLOC, SSEM_PROGBITS);
+ INIT_SEC(6, name_dynamic, ro_seg_idx, dynamic_off, dynamic_bytes, 8, SF_ALLOC | SF_WRITE, SSEM_PROGBITS);
+
+ dyn->sec_interp = (LinkSectionId)(sec_base + 0 + 1u);
+ dyn->sec_dynsym = (LinkSectionId)(sec_base + 1 + 1u);
+ dyn->sec_dynstr = (LinkSectionId)(sec_base + 2 + 1u);
+ dyn->sec_gnu_hash = (LinkSectionId)(sec_base + 3 + 1u);
+ dyn->sec_rela_dyn = (LinkSectionId)(sec_base + 4 + 1u);
+ dyn->sec_rela_plt = (LinkSectionId)(sec_base + 5 + 1u);
+ dyn->sec_dynamic = (LinkSectionId)(sec_base + 6 + 1u);
+ dyn->dynamic_vaddr = img->segments[ro_seg_idx].vaddr + dynamic_off;
+ dyn->dynamic_size = dynamic_bytes;
+
+ if (has_plt) {
+ INIT_SEC(7, name_plt, rx_seg_idx, 0, plt_bytes, 16,
+ SF_ALLOC | SF_EXEC, SSEM_PROGBITS);
+ INIT_SEC(8, name_got_plt, rw_seg_idx, 0, gotplt_bytes, 8,
+ SF_ALLOC | SF_WRITE, SSEM_PROGBITS);
+ dyn->sec_plt = (LinkSectionId)(sec_base + 7 + 1u);
+ dyn->sec_got_plt = (LinkSectionId)(sec_base + 8 + 1u);
+ }
+#undef INIT_SEC
+
+ img->nsections += nsec;
+
+ /* Step 7: copy .interp / .dynsym / .dynstr / .gnu.hash bytes into
+ * the ro segment. .dynamic body is built during emit (it embeds
+ * runtime vaddrs that PIE keeps image-relative; emit just reads
+ * the section ids' final vaddrs). */
+ u8* ro_bytes = img->segment_bytes[ro_seg_idx];
+
+ /* .interp */
+ if (interp_bytes && ro_bytes)
+ memcpy(ro_bytes + interp_off, interp_str, (size_t)interp_bytes);
+
+ /* .dynsym: serialize DynSymRec to ELF64 wire layout. */
+ {
+ u32 si;
+ for (si = 0; si < dyn->ndynsym; ++si) {
+ u8* p = ro_bytes + dynsym_off + (u64)si * 24u;
+ const DynSymRec* r = &dyn->dynsym[si];
+ wr_u32_le(p + 0, r->st_name);
+ p[4] = r->st_info;
+ p[5] = r->st_other;
+ wr_u16_le(p + 6, r->st_shndx);
+ wr_u64_le(p + 8, r->st_value);
+ wr_u64_le(p + 16, r->st_size);
+ }
+ }
+
+ /* .dynstr */
+ if (dynstr_bytes && ro_bytes && dyn->dynstr)
+ memcpy(ro_bytes + dynstr_off, dyn->dynstr, dyn->dynstr_len);
+
+ /* .gnu.hash */
+ if (gnuhash_bytes && ro_bytes && dyn->gnu_hash)
+ memcpy(ro_bytes + gnuhash_off, dyn->gnu_hash, dyn->gnu_hash_len);
+
+ /* .rela.plt: emit JUMP_SLOT records, one per imported function.
+ * r_offset = .got.plt[3 + i].vaddr; r_info = (dynsym_idx<<32) |
+ * R_AARCH64_JUMP_SLOT; addend = 0. The body bytes go into
+ * .rela.plt's section bytes within ro_seg. */
+ {
+ u32 ki;
+ for (ki = 0; ki < imports.nfuncs; ++ki) {
+ LinkSymId lsid = imports.funcs[ki];
+ u32 dynidx = dyn->sym_dynidx[lsid];
+ u64 slot_vaddr = dyn->got_plt_vaddr + 8u * (3u + ki); /* skip the 3 reserved .got.plt slots */
+ DynRela* r = &dyn->rela_plt[ki];
+ r->r_offset = slot_vaddr;
+ r->r_info = ELF64_R_INFO((u64)dynidx, ELF_R_AARCH64_JUMP_SLOT);
+ r->r_addend = 0;
+ /* Serialize into segment bytes. */
+ u8* p = ro_bytes + rela_plt_off + (u64)ki * 24u;
+ wr_u64_le(p + 0, r->r_offset);
+ wr_u64_le(p + 8, r->r_info);
+ wr_u64_le(p + 16, (u64)r->r_addend);
+ }
+ }
+
+ /* .rela.dyn: emit GLOB_DAT records for data imports. We have not
+ * allocated GOT slots for these (Phase 5 work), so for now r_offset
+ * is 0. NOTE(review): that is a placeholder, not a no-op — a loader
+ * applying GLOB_DAT would store at load_base + 0; confirm no image
+ * with data imports is run before Phase 5 assigns real .got slot
+ * vaddrs here. Mark the gap so Phase 5 can fix it. */
+ {
+ u32 ki;
+ for (ki = 0; ki < imports.ndatas; ++ki) {
+ LinkSymId lsid = imports.datas[ki];
+ u32 dynidx = dyn->sym_dynidx[lsid];
+ DynRela* r = &dyn->rela_dyn[ki];
+ r->r_offset = 0; /* TODO Phase 5: target's .got slot vaddr */
+ r->r_info = ELF64_R_INFO((u64)dynidx, ELF_R_AARCH64_GLOB_DAT);
+ r->r_addend = 0;
+ u8* p = ro_bytes + rela_dyn_off + (u64)ki * 24u;
+ wr_u64_le(p + 0, r->r_offset);
+ wr_u64_le(p + 8, r->r_info);
+ wr_u64_le(p + 16, (u64)r->r_addend);
+ }
+ /* Trailing capacity (cap_rela_dyn - ndatas) stays zero. emit
+ * fills R_AARCH64_RELATIVE entries here as it walks abs64 relocs. */
+ }
+
+ /* .got.plt prelude: for BIND_NOW we leave the body zero — the
+ * loader patches every slot from .rela.plt before user code. Some
+ * loaders still inspect slot 0 (&.dynamic) at startup; provide it
+ * so glibc-style loaders don't fault. The loader writes the link_map
+ * cookie into slot 1 at load time. */
+ if (has_plt) {
+ u8* gp_bytes = img->segment_bytes[rw_seg_idx];
+ if (gp_bytes && gotplt_bytes >= 8u) {
+ wr_u64_le(gp_bytes, dyn->dynamic_vaddr);
+ /* Slots 1, 2, and per-PLT slots stay zero until the loader
+ * fills them. Phase 5 would prefill the per-PLT slots with
+ * the address of PLT0 to support lazy binding. */
+ }
+ }
+
+ /* The .dynamic body is built later, after segment shifts are
+ * applied during emit (link_elf.c). emit_dynamic_body takes the
+ * post-shift vaddrs of every other dyn section and writes one
+ * DT_* entry per index. */
+
+ free_imports(h, &imports);
+}
+
+/* ---- cleanup ---- */
+
+void link_dyn_state_free(LinkImage* img) {
+ Heap* h = img->heap;
+ LinkDynState* dyn = img->dyn;
+ if (!dyn) return;
+ if (dyn->dynsym)
+ h->free(h, dyn->dynsym, sizeof(*dyn->dynsym) * dyn->ndynsym);
+ if (dyn->dynstr) h->free(h, dyn->dynstr, dyn->dynstr_len);
+ if (dyn->gnu_hash) h->free(h, dyn->gnu_hash, dyn->gnu_hash_len);
+ if (dyn->rela_dyn)
+ h->free(h, dyn->rela_dyn, sizeof(*dyn->rela_dyn) * dyn->cap_rela_dyn);
+ if (dyn->rela_plt)
+ h->free(h, dyn->rela_plt, sizeof(*dyn->rela_plt) * dyn->nrela_plt);
+ if (dyn->needed) h->free(h, dyn->needed, sizeof(*dyn->needed) * dyn->nneeded);
+ if (dyn->sym_dynidx)
+ h->free(h, dyn->sym_dynidx,
+ sizeof(*dyn->sym_dynidx) * dyn->sym_dynidx_size);
+ h->free(h, dyn, sizeof(*dyn));
+ img->dyn = NULL;
+}
diff --git a/src/link/link_elf.c b/src/link/link_elf.c
@@ -105,7 +105,10 @@ typedef struct __attribute__((packed)) Shdr64 {
#define PT_TLS 7
#define PAGE_SIZE 0x1000u
-#define IMAGE_BASE 0x400000ULL
+/* Static ET_EXEC base. ET_DYN (PIE) uses 0 — the loader picks the
+ * runtime base. The active value lives in `img_base` below; the macro
+ * stays for the static path's hard-coded vaddrs. */
+#define IMAGE_BASE_STATIC 0x400000ULL
#define BUILD_ID_DESC_LEN 16u
#define NOTE_NAME_GNU "GNU"
@@ -169,8 +172,28 @@ static int reloc_is_tlsle(RelocKind k) {
k == R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
}
-static void apply_all_relocs(LinkImage* img) {
+static int reloc_is_abs(RelocKind k) {
+  return (k == R_ABS64 || k == R_ABS32) ? 1 : 0; /* absolute-address kinds */
+}
+
+static void emit_relative_record(LinkImage* img, u64 site_vaddr, u64 addend) {
+  /* Stage one R_AARCH64_RELATIVE entry for .rela.dyn; no-op without a table. */
+  LinkDynState* st = img->dyn;
+  if (st == NULL || st->rela_dyn == NULL) return;
+  if (st->nrela_dyn >= st->cap_rela_dyn)
+    compiler_panic(img->c, no_loc(),
+                   "link: too many R_AARCH64_RELATIVE records (%u >= %u); "
+                   "raise cap_rela_dyn in layout_dyn",
+                   st->nrela_dyn, st->cap_rela_dyn);
+  DynRela* rec = &st->rela_dyn[st->nrela_dyn++];
+  rec->r_info = ELF64_R_INFO((u64)0, ELF_R_AARCH64_RELATIVE);
+  rec->r_addend = (i64)addend;
+  rec->r_offset = site_vaddr;
+}
+
+static void apply_all_relocs(LinkImage* img, u64 img_base) {
u32 i;
+ int pie = img->pie;
for (i = 0; i < LinkRelocs_count(&img->relocs); ++i) {
LinkRelocApply* r = LinkRelocs_at(&img->relocs, i);
const LinkSymbol* tgt = LinkSyms_at(&img->syms, r->target - 1);
@@ -182,15 +205,29 @@ static void apply_all_relocs(LinkImage* img) {
/* S is the target's TP-relative offset: distance from the
* TLS image start plus the 16-byte TCB. Both vaddrs are
* in the same (post-shift, image-relative) coordinate
- * system, so IMAGE_BASE cancels out. */
+ * system, so img_base cancels out. */
S = (tgt->vaddr - img->tls_vaddr) + AARCH64_TCB_SIZE;
} else {
- S = tgt->vaddr + IMAGE_BASE;
+ S = tgt->vaddr + img_base;
if (tgt->kind == SK_ABS) S = tgt->vaddr;
}
- P = r->write_vaddr + IMAGE_BASE;
+ P = r->write_vaddr + img_base;
P_bytes = img->segment_bytes[seg->id - 1] +
(size_t)(r->write_file_offset - seg->file_offset);
+
+ /* PIE: an absolute reloc against a defined non-imported symbol
+ * stays image-relative in the file (the loader adds load-base via
+ * a synthesized R_AARCH64_RELATIVE). For an imported target, this
+ * cut leaves the apply path as-is — Phase 5 will rewrite to
+ * GLOB_DAT/JUMP_SLOT. The image-relative S we want at the site is
+ * tgt->vaddr (without img_base, which is already 0 for PIE). */
+ if (pie && reloc_is_abs(r->kind) && tgt->defined && !tgt->imported &&
+ tgt->kind != SK_ABS) {
+ /* img_base is 0 for PIE, so S above is already image-relative.
+ * Append the RELATIVE record so the loader patches load_base
+ * into the site at runtime. */
+ emit_relative_record(img, r->write_vaddr, tgt->vaddr);
+ }
link_reloc_apply(img->c, r->kind, P_bytes, S, r->addend, P);
}
}
@@ -438,26 +475,150 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
* rt member walks .iplt.pairs and fills each slot before user code
* runs. The ELF writer doesn't have to do anything special here. */
+ /* PIE / ET_DYN: img_base is 0 (the loader picks the runtime base;
+ * absolute relocs against internal symbols are emitted as
+ * R_AARCH64_RELATIVE in .rela.dyn). Otherwise classic ET_EXEC at
+ * IMAGE_BASE_STATIC. */
+ int pie = img->pie;
+ u64 img_base = pie ? 0ULL : IMAGE_BASE_STATIC;
+
/* ---- plan number of program headers ----
*
* 1 headers PT_LOAD + nsegments PT_LOAD + 1 PT_NOTE (build-id)
- * + 1 PT_TLS when this image carries any TLS sections. */
+ * + 1 PT_TLS when this image carries any TLS sections.
+ * + 4 dyn phdrs (PT_PHDR / PT_INTERP / PT_DYNAMIC / PT_GNU_STACK) on PIE. */
u32 has_tls = img->tls_memsz ? 1u : 0u;
- u32 nphdr_total = 1u + img->nsegments + 1u + has_tls;
+ u32 nphdr_extra_dyn = pie ? 4u : 0u;
+ u32 nphdr_total = 1u + img->nsegments + 1u + has_tls + nphdr_extra_dyn;
u64 headers_size =
sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64) + BUILD_ID_NOTE_BYTES;
u64 headers_load = ALIGN_UP(headers_size, (u64)PAGE_SIZE);
/* The build-id note lives inside the headers PT_LOAD at this offset. */
u64 build_id_off = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64);
- u64 build_id_addr = IMAGE_BASE + build_id_off;
+ u64 build_id_addr = img_base + build_id_off;
/* ---- shift image addresses, apply relocations ----
*
* Must happen before segshdrs/symtab construction so they observe
* post-shift vaddrs (the values that will land in the file). */
shift_image_addresses(img, headers_load);
- apply_all_relocs(img);
+ apply_all_relocs(img, img_base);
+
+ /* ---- write .dynamic body + re-serialize .rela.dyn (PIE only) ----
+ *
+ * Both depend on post-shift vaddrs. .dynamic embeds image-relative
+ * pointers to .dynsym/.dynstr/.gnu.hash/.rela.dyn/.rela.plt/.got.plt
+ * (the loader adds load_base at runtime). .rela.dyn picked up
+ * RELATIVE records during apply_all_relocs; rewrite the section
+ * bytes to include them. */
+ if (pie && img->dyn) {
+ LinkDynState* dyn = img->dyn;
+ const LinkSection* sec_dynamic = &img->sections[dyn->sec_dynamic - 1];
+ const LinkSection* sec_dynsym = &img->sections[dyn->sec_dynsym - 1];
+ const LinkSection* sec_dynstr = &img->sections[dyn->sec_dynstr - 1];
+ const LinkSection* sec_gnuhash = &img->sections[dyn->sec_gnu_hash - 1];
+ const LinkSection* sec_reladyn = &img->sections[dyn->sec_rela_dyn - 1];
+ const LinkSection* sec_relaplt = (dyn->sec_rela_plt != LINK_SEC_NONE)
+ ? &img->sections[dyn->sec_rela_plt - 1]
+ : NULL;
+ const LinkSection* sec_gotplt = (dyn->sec_got_plt != LINK_SEC_NONE)
+ ? &img->sections[dyn->sec_got_plt - 1]
+ : NULL;
+ const LinkSegment* dseg = &img->segments[sec_dynamic->segment_id - 1];
+ u8* dyn_bytes_at = img->segment_bytes[dseg->id - 1] +
+ (size_t)(sec_dynamic->file_offset - dseg->file_offset);
+
+ /* Build DT_* entries in order. Layout matches count_dynamic_entries. */
+ u32 written = 0;
+ u8* p = dyn_bytes_at;
+#define DT_PUT(TAG, VAL) \
+ do { \
+ wr_u64_le(p, (u64)(TAG)); \
+ wr_u64_le(p + 8, (u64)(VAL)); \
+ p += 16; \
+ written++; \
+ } while (0)
+
+ /* DT_NEEDED entries — d_un.d_val is the offset of the soname
+ * within .dynstr. The dynstr was built in layout_dyn with
+ * dedup; look each soname up by name to compute its offset. */
+ {
+ u32 ni;
+ for (ni = 0; ni < dyn->nneeded; ++ni) {
+ Sym soname = dyn->needed[ni];
+ size_t namelen = 0;
+ const char* nm = pool_str(c->global, soname, &namelen);
+ /* Linear search dynstr for this name. */
+ u32 off = 0;
+ if (nm && namelen) {
+ u32 si;
+ for (si = 0; si + namelen < dyn->dynstr_len; ++si) {
+ if (dyn->dynstr[si + namelen] == 0 &&
+ memcmp(dyn->dynstr + si, nm, namelen) == 0) {
+ off = si;
+ break;
+ }
+ }
+ /* Should always be present — collect_needed populated dynstr
+ * via build_dynsym? Actually build_dynsym only added import
+ * names. We need to also add NEEDED sonames. */
+ if (off == 0) {
+ /* Fallback: append to dynstr. Phase 4 layout_dyn pre-sized
+ * .dynstr exactly to its current content; appending here
+ * would overflow the section. Instead, panic with a clear
+ * message — the soname was supposed to be added during
+ * layout. */
+ compiler_panic(c, no_loc(),
+ "link_emit_elf: DT_NEEDED soname missing from "
+ ".dynstr");
+ }
+ }
+ DT_PUT(DT_NEEDED, off);
+ }
+ }
+
+ DT_PUT(DT_STRTAB, img_base + sec_dynstr->vaddr);
+ DT_PUT(DT_STRSZ, sec_dynstr->size);
+ DT_PUT(DT_SYMTAB, img_base + sec_dynsym->vaddr);
+ DT_PUT(DT_SYMENT, 24);
+ DT_PUT(DT_GNU_HASH, img_base + sec_gnuhash->vaddr);
+ DT_PUT(DT_PLTGOT, sec_gotplt ? (img_base + sec_gotplt->vaddr) : 0);
+ DT_PUT(DT_PLTRELSZ, sec_relaplt ? sec_relaplt->size : 0);
+ DT_PUT(DT_PLTREL, DT_RELA);
+ DT_PUT(DT_JMPREL, sec_relaplt ? (img_base + sec_relaplt->vaddr) : 0);
+ DT_PUT(DT_RELA, img_base + sec_reladyn->vaddr);
+ DT_PUT(DT_RELASZ, sec_reladyn->size);
+ DT_PUT(DT_RELAENT, 24);
+ DT_PUT(DT_FLAGS_1, DF_1_NOW);
+ DT_PUT(DT_NULL, 0);
+#undef DT_PUT
+
+ /* Pad any pre-allocated tail with DT_NULL. */
+ while (written < dyn->ndyn_entries) {
+ wr_u64_le(p, 0);
+ wr_u64_le(p + 8, 0);
+ p += 16;
+ written++;
+ }
+
+ /* Re-serialize .rela.dyn body (GLOB_DAT records were already in
+ * place from layout_dyn; RELATIVE records were appended during
+ * apply_all_relocs). The trailing capacity stays zero — readers
+ * stop at first DT_NULL-equivalent (R_AARCH64_NONE has type 0). */
+ {
+ u8* rd_bytes = img->segment_bytes[dseg->id - 1] +
+ (size_t)(sec_reladyn->file_offset - dseg->file_offset);
+ u32 i;
+ for (i = 0; i < dyn->nrela_dyn; ++i) {
+ const DynRela* rr = &dyn->rela_dyn[i];
+ u8* rp = rd_bytes + (u64)i * 24u;
+ wr_u64_le(rp + 0, rr->r_offset);
+ wr_u64_le(rp + 8, rr->r_info);
+ wr_u64_le(rp + 16, (u64)rr->r_addend);
+ }
+ }
+ }
/* ---- compute build-id (post-reloc, deterministic) ---- */
u8 build_id[BUILD_ID_DESC_LEN];
@@ -619,7 +780,7 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
else if (s->kind == SK_ABS)
st_value = s->vaddr;
else if (s->defined)
- st_value = IMAGE_BASE + s->vaddr;
+ st_value = img_base + s->vaddr;
else
st_value = 0;
st_type = sym_kind_to_st_type(s->kind);
@@ -663,12 +824,27 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
memset(phdrs, 0, sizeof(Phdr64) * nphdr_total);
{
u32 pi = 0;
+ /* PT_PHDR points at the phdr table itself within the headers
+ * PT_LOAD. Required by the runtime loader for ET_DYN to know
+ * where its own program headers live. Must appear before the
+ * first PT_LOAD on dynamic exes (musl checks). */
+ if (pie) {
+ phdrs[pi].p_type = PT_PHDR;
+ phdrs[pi].p_flags = PF_R;
+ phdrs[pi].p_offset = sizeof(Ehdr64);
+ phdrs[pi].p_vaddr = img_base + sizeof(Ehdr64);
+ phdrs[pi].p_paddr = phdrs[pi].p_vaddr;
+ phdrs[pi].p_filesz = (u64)nphdr_total * sizeof(Phdr64);
+ phdrs[pi].p_memsz = phdrs[pi].p_filesz;
+ phdrs[pi].p_align = 8;
+ pi++;
+ }
/* Headers PT_LOAD (covers ehdr + phdrs + build-id note). */
phdrs[pi].p_type = PT_LOAD;
phdrs[pi].p_flags = PF_R;
phdrs[pi].p_offset = 0;
- phdrs[pi].p_vaddr = IMAGE_BASE;
- phdrs[pi].p_paddr = IMAGE_BASE;
+ phdrs[pi].p_vaddr = img_base;
+ phdrs[pi].p_paddr = img_base;
phdrs[pi].p_filesz = headers_size;
phdrs[pi].p_memsz = headers_size;
phdrs[pi].p_align = PAGE_SIZE;
@@ -681,7 +857,7 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
p->p_type = PT_LOAD;
p->p_flags = perms_to_pflags(seg->flags);
p->p_offset = seg->file_offset;
- p->p_vaddr = IMAGE_BASE + seg->vaddr; /* post-shift */
+ p->p_vaddr = img_base + seg->vaddr; /* post-shift */
p->p_paddr = p->p_vaddr;
p->p_filesz = seg->file_size;
p->p_memsz = seg->mem_size;
@@ -704,22 +880,60 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
if (has_tls) {
phdrs[pi].p_type = PT_TLS;
phdrs[pi].p_flags = PF_R;
- phdrs[pi].p_offset =
- img->tls_vaddr - headers_load + headers_load; /* image-relative
- * file offset
- * matches vaddr
- * after shift */
- /* tls_vaddr was bumped by shift_image_addresses; we want
- * the file offset to track segment file_offset, which
- * equals the segment's vaddr (post-shift) by construction. */
phdrs[pi].p_offset = img->tls_vaddr;
- phdrs[pi].p_vaddr = IMAGE_BASE + img->tls_vaddr;
+ phdrs[pi].p_vaddr = img_base + img->tls_vaddr;
phdrs[pi].p_paddr = phdrs[pi].p_vaddr;
phdrs[pi].p_filesz = img->tls_filesz;
phdrs[pi].p_memsz = img->tls_memsz;
phdrs[pi].p_align = img->tls_align ? img->tls_align : 1u;
pi++;
}
+ /* Dynamic phdrs. PT_INTERP and PT_DYNAMIC point at the matching
+ * sections (which layout_dyn placed in the ro/rw_dyn segments).
+ * PT_GNU_STACK marks the stack as non-executable (filesz=0). */
+ if (pie && img->dyn) {
+ LinkDynState* dyn = img->dyn;
+ const LinkSection* sec_interp =
+ &img->sections[dyn->sec_interp - 1];
+ const LinkSection* sec_dynamic =
+ &img->sections[dyn->sec_dynamic - 1];
+ phdrs[pi].p_type = PT_INTERP;
+ phdrs[pi].p_flags = PF_R;
+ phdrs[pi].p_offset = sec_interp->file_offset;
+ phdrs[pi].p_vaddr = img_base + sec_interp->vaddr;
+ phdrs[pi].p_paddr = phdrs[pi].p_vaddr;
+ phdrs[pi].p_filesz = sec_interp->size;
+ phdrs[pi].p_memsz = sec_interp->size;
+ phdrs[pi].p_align = 1;
+ pi++;
+ phdrs[pi].p_type = PT_DYNAMIC;
+ phdrs[pi].p_flags = PF_R | PF_W;
+ phdrs[pi].p_offset = sec_dynamic->file_offset;
+ phdrs[pi].p_vaddr = img_base + sec_dynamic->vaddr;
+ phdrs[pi].p_paddr = phdrs[pi].p_vaddr;
+ phdrs[pi].p_filesz = sec_dynamic->size;
+ phdrs[pi].p_memsz = sec_dynamic->size;
+ phdrs[pi].p_align = 8;
+ pi++;
+ phdrs[pi].p_type = PT_GNU_STACK;
+ phdrs[pi].p_flags = PF_R | PF_W;
+ phdrs[pi].p_offset = 0;
+ phdrs[pi].p_vaddr = 0;
+ phdrs[pi].p_paddr = 0;
+ phdrs[pi].p_filesz = 0;
+ phdrs[pi].p_memsz = 0;
+ phdrs[pi].p_align = 16;
+ pi++;
+ /* PT_GNU_RELRO would mark the read-only-after-relocation span
+ * here. Phase 6 leaves it out — it's an optimization the loader
+ * can live without, and our ro_seg already lives in a PF_R
+ * PT_LOAD that's never made writable. */
+ } else if (pie) {
+ /* dyn was nominally requested but layout_dyn early-out — no
+ * imports and no DSO inputs. The image still needs a PT_GNU_STACK
+ * for kernels that demand it; INTERP/DYNAMIC are skipped. */
+ (void)0;
+ }
(void)pi;
}
@@ -734,11 +948,11 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
ehdr.e_ident[5] = ELFDATA2LSB;
ehdr.e_ident[6] = EV_CURRENT;
ehdr.e_ident[7] = ELFOSABI_NONE;
- ehdr.e_type = ET_EXEC;
+ ehdr.e_type = pie ? ET_DYN : ET_EXEC;
ehdr.e_machine = EM_AARCH64;
ehdr.e_version = EV_CURRENT;
ehdr.e_entry =
- IMAGE_BASE + LinkSyms_at(&img->syms, img->entry_sym - 1)->vaddr;
+ img_base + LinkSyms_at(&img->syms, img->entry_sym - 1)->vaddr;
ehdr.e_phoff = sizeof(Ehdr64);
ehdr.e_shoff = shdr_off;
ehdr.e_flags = 0;
@@ -831,6 +1045,36 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
/* shdr 0: NULL */
memset(&sh, 0, sizeof(sh));
write_bytes(w, &sh, sizeof(sh));
+ /* Locate dyn-section names (interned earlier in layout_dyn) so
+ * we can override sh_type / sh_link / sh_info / sh_entsize for
+ * .dynsym / .dynstr / .gnu.hash / .rela.dyn / .rela.plt /
+ * .dynamic. The sh_link cross-references (e.g., .dynsym ->
+ * .dynstr) need the matching shdr indices, which we look up by
+ * comparing OutShdr.name to the same Sym values. */
+ Sym n_dynsym = 0, n_dynstr = 0, n_gnuhash = 0;
+ Sym n_reladyn = 0, n_relaplt = 0, n_dynamic = 0;
+ Sym n_gotplt = 0;
+ if (pie && img->dyn) {
+ n_dynsym = pool_intern_cstr(c->global, ".dynsym");
+ n_dynstr = pool_intern_cstr(c->global, ".dynstr");
+ n_gnuhash = pool_intern_cstr(c->global, ".gnu.hash");
+ n_reladyn = pool_intern_cstr(c->global, ".rela.dyn");
+ n_relaplt = pool_intern_cstr(c->global, ".rela.plt");
+ n_dynamic = pool_intern_cstr(c->global, ".dynamic");
+ n_gotplt = pool_intern_cstr(c->global, ".got.plt");
+ }
+ /* Two-pass: first find dynsym/dynstr/gotplt indices for sh_link
+ * fixups, then emit. */
+ u32 idx_dynsym = 0, idx_dynstr = 0, idx_gotplt = 0;
+ if (pie && img->dyn) {
+ for (i = 0; i < noutshdr; ++i) {
+ Sym nm = outshdrs[i].name;
+ u32 ix = outshdrs[i].shdr_idx;
+ if (nm == n_dynsym) idx_dynsym = ix;
+ else if (nm == n_dynstr) idx_dynstr = ix;
+ else if (nm == n_gotplt) idx_gotplt = ix;
+ }
+ }
/* per-name output shdrs */
for (i = 0; i < noutshdr; ++i) {
const OutShdr* o = &outshdrs[i];
@@ -838,7 +1082,7 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
sh.sh_name = outshdr_name_off[i];
sh.sh_type = sec_sem_to_sht(o->sem);
sh.sh_flags = sec_flags_to_shf(o->flags);
- sh.sh_addr = IMAGE_BASE + o->vaddr;
+ sh.sh_addr = img_base + o->vaddr;
sh.sh_offset = o->file_offset;
sh.sh_size = o->size;
sh.sh_link = 0;
@@ -848,6 +1092,36 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) {
o->sem == SSEM_PREINIT_ARRAY)
? 8
: 0;
+ /* Dyn-section overrides: sh_type / sh_link / sh_info / entsize. */
+ if (pie && img->dyn) {
+ if (o->name == n_dynsym) {
+ sh.sh_type = SHT_DYNSYM;
+ sh.sh_link = idx_dynstr;
+ sh.sh_info = img->dyn->first_global;
+ sh.sh_entsize = 24;
+ } else if (o->name == n_dynstr) {
+ sh.sh_type = SHT_STRTAB;
+ } else if (o->name == n_gnuhash) {
+ sh.sh_type = SHT_GNU_HASH;
+ sh.sh_link = idx_dynsym;
+ } else if (o->name == n_reladyn) {
+ sh.sh_type = SHT_RELA;
+ sh.sh_link = idx_dynsym;
+ sh.sh_entsize = 24;
+ } else if (o->name == n_relaplt) {
+ sh.sh_type = SHT_RELA;
+ sh.sh_link = idx_dynsym;
+ sh.sh_info = idx_gotplt;
+ sh.sh_entsize = 24;
+ sh.sh_flags |= SHF_INFO_LINK;
+ } else if (o->name == n_dynamic) {
+ sh.sh_type = SHT_DYNAMIC;
+ sh.sh_link = idx_dynstr;
+ sh.sh_entsize = 16;
+ } else if (o->name == n_gotplt) {
+ sh.sh_entsize = 8;
+ }
+ }
write_bytes(w, &sh, sizeof(sh));
}
/* shdr: .note.gnu.build-id (allocatable; in headers PT_LOAD) */
diff --git a/src/link/link_internal.h b/src/link/link_internal.h
@@ -92,6 +92,13 @@ struct Linker {
* JIT path leaves this 0 — slots are pre-resolved in-process by
* link_jit.c, no ctor needed. */
int emit_static_exe;
+ /* PIE / ET_DYN output. Set by cfree_link_exe when opts->pie or any
+ * DSO input is present. Triggers layout_dyn (Phase 4) and the
+ * dynamic ELF emit path (Phase 6). */
+ int emit_pie;
+ /* Caller-supplied PT_INTERP. layout_dyn falls back to a target-
+ * derived default when this is 0. */
+ Sym interp_path;
LinkExternResolver resolver;
void* resolver_user;
CompilerCleanup* deferred; /* registered by link_new */
@@ -100,11 +107,121 @@ struct Linker {
/* Defined in link_layout.c. */
void link_ingest_archives(struct Linker*);
+/* Defined in link_dyn.c. Phase 4: synthesize .interp/.dynsym/.dynstr/
+ * .gnu.hash/.rela.dyn/.rela.plt/.plt/.got.plt/.dynamic when the link
+ * is producing a PIE / ET_DYN exe. No-op when emit_pie is unset, or
+ * when there are zero imports AND no DSO inputs (a PIE with DSO inputs
+ * but no imports still needs PT_INTERP and a minimal .dynamic). */
+void layout_dyn(struct Linker*, LinkImage*);
+void link_dyn_state_free(LinkImage*);
+
/* SegVec instances for image-owned tables. Pointers returned by *_at /
* *_push remain valid for the LinkImage's lifetime. */
SEGVEC_DEFINE(LinkSyms, LinkSymbol, 6); /* 64 entries per segment */
SEGVEC_DEFINE(LinkRelocs, LinkRelocApply, 7); /* 128 entries per segment */
+/* ---- Dynamic-link synthesis state (Phase 4) ----
+ *
+ * Owned by LinkImage when emit_pie is set. Holds the synthesized
+ * .interp / .dynsym / .dynstr / .gnu.hash / .rela.dyn / .rela.plt /
+ * .plt / .got.plt / .dynamic content plus the section ids the emit
+ * pass needs to fill PT_DYNAMIC and the .dynamic body.
+ *
+ * Phase 4 builds the dynsym/dynstr/gnu.hash content and the JUMP_SLOT
+ * .rela.plt records (one per imported function, against its synthetic
+ * .got.plt slot). The .plt body is allocated but not emitted (Phase 5).
+ * Phase 6 populates .rela.dyn with R_AARCH64_RELATIVE records for any
+ * internal absolute reloc seen during reloc-apply.
+ *
+ * Layout invariants this struct enforces:
+ * - dynsym entry 0 is the reserved STN_UNDEF slot (zero-filled).
+ * - dynsym entries 1..nimport_func+nimport_data are imports, in the
+ * order PLT-functions first, then GOT-data.
+ * - PLT slots and JUMP_SLOT entries match the import_func order 1:1.
+ * - .got.plt has 3 reserved leading u64 slots (per AArch64 psABI:
+ * slot 0 = &.dynamic, slot 1 = link_map cookie, slot 2 =
+ * _dl_runtime_resolve), then one slot per imported function.
+ */
+
+typedef struct DynSymRec { /* one .dynsym entry; same field order as Elf64_Sym */
+ u32 st_name; /* offset into .dynstr */
+ u8 st_info; /* presumably ELF64_ST_INFO(bind, type) — confirm in link_dyn.c */
+ u8 st_other; /* presumably the visibility bits — confirm in link_dyn.c */
+ u16 st_shndx; /* section index; presumably SHN_UNDEF for imports — confirm */
+ u64 st_value;
+ u64 st_size;
+} DynSymRec; /* 4+1+1+2+8+8 = 24 bytes assuming fixed-width uN types; .dynsym's sh_entsize is 24 */
+
+typedef struct DynRela { /* one .rela.dyn / .rela.plt record; same field order as Elf64_Rela */
+ u64 r_offset; /* image-relative vaddr of the patch site */
+ u64 r_info; /* ELF64_R_INFO(dynsym_index, elf_reloc_type) */
+ i64 r_addend; /* signed addend; 3 x 8 = 24 bytes assuming 64-bit fields, the sh_entsize used for both rela sections */
+} DynRela;
+
+typedef struct LinkDynState { /* synthesized dynamic-link tables; reached via LinkImage.dyn */
+ /* PT_INTERP / .interp. interp_path is interned in compiler->global. */
+ Sym interp_path;
+ LinkSectionId sec_interp; /* 1-based: the ELF emitter reads img->sections[sec_interp - 1] */
+
+ /* .dynsym */
+ LinkSectionId sec_dynsym;
+ DynSymRec* dynsym;
+ u32 ndynsym; /* incl. slot-0 STN_UNDEF */
+ u32 first_global; /* sh_info value: index of first non-local entry */
+
+ /* .dynstr */
+ LinkSectionId sec_dynstr;
+ u8* dynstr;
+ u32 dynstr_len; /* presumably the byte length of dynstr — confirm in link_dyn.c */
+
+ /* .gnu.hash */
+ LinkSectionId sec_gnu_hash;
+ u8* gnu_hash;
+ u32 gnu_hash_len; /* presumably the byte length of gnu_hash — confirm in link_dyn.c */
+
+ /* .rela.dyn — R_AARCH64_GLOB_DAT (imports against .got slots) and
+ * R_AARCH64_RELATIVE (PIE internal abs64 fixups, populated during
+ * Phase 6 emit). Pre-sized at layout time; the RELATIVE tail is
+ * filled in during emit. */
+ LinkSectionId sec_rela_dyn;
+ DynRela* rela_dyn;
+ u32 nrela_dyn; /* number of records currently populated */
+ u32 cap_rela_dyn; /* allocation capacity (records, not bytes) */
+
+ /* .rela.plt — R_AARCH64_JUMP_SLOT, one per imported function. */
+ LinkSectionId sec_rela_plt;
+ DynRela* rela_plt;
+ u32 nrela_plt; /* record count; presumably equals nplt — confirm in link_dyn.c */
+
+ /* .plt — 32-byte PLT0 stub + 16 bytes per imported function. Body
+ * is allocated (zero-initialized) but not emitted in Phase 4. */
+ LinkSectionId sec_plt;
+ u32 nplt; /* number of imported functions */
+ u64 plt_vaddr; /* image-relative .plt base */
+ u64 plt_size;
+
+ /* .got.plt — 24 reserved bytes + 8 per PLT slot. */
+ LinkSectionId sec_got_plt;
+ u64 got_plt_vaddr;
+ u64 got_plt_size;
+
+ /* .dynamic — PT_DYNAMIC body. Built at layout time; its size is
+ * fixed once we know the DT_NEEDED count. */
+ LinkSectionId sec_dynamic; /* 1-based: the ELF emitter reads img->sections[sec_dynamic - 1] */
+ u64 dynamic_vaddr;
+ u64 dynamic_size;
+ u32 ndyn_entries; /* presumably the DT_* entry count behind dynamic_size — confirm */
+
+ /* DT_NEEDED list (interned soname Syms, in input order). */
+ Sym* needed;
+ u32 nneeded;
+
+ /* Per-import dynsym index, indexed by LinkSymId. 0 means "not
+ * imported / not in dynsym". Used by GLOB_DAT / JUMP_SLOT emit. */
+ u32* sym_dynidx; /* size = sym_dynidx_size */
+ u32 sym_dynidx_size;
+} LinkDynState;
+
struct LinkImage {
Compiler* c;
Heap* heap;
@@ -148,6 +265,12 @@ struct LinkImage {
InputMap* input_maps; /* one per input; indexed by input_id-1 */
u32 ninput_maps;
+
+ /* Dynamic-link state (Phase 4). NULL on the static-exe / JIT path
+ * (emit_pie unset) or when layout_dyn early-outs. Owned by the image. */
+ LinkDynState* dyn;
+ /* Mirror of Linker.emit_pie at link_resolve time; consulted by emit. */
+ int pie;
};
/* Apply one relocation in place. P_bytes points at the first byte of the
diff --git a/src/link/link_layout.c b/src/link/link_layout.c
@@ -2325,6 +2325,12 @@ LinkImage* link_resolve(Linker* l) {
emit_reloc_records(l, img, got_map);
if (got_map) h->free(h, got_map, sizeof(*got_map) * got_map_size);
}
+ /* Phase 4 dynamic-link tables. Runs after every other layout
+ * pass: it depends on import resolution (resolve_undefs), every
+ * synthesized section already being on the image (layout_got /
+ * layout_iplt), and adds its own segments at the tail. The
+ * static-exe path early-outs in layout_dyn (l->emit_pie==0). */
+ layout_dyn(l, img);
resolve_entry(l, img);
gc_live_free(&g, h);
}