commit c5696650bb92794695f7e82f4cbffeae72f0bec4
parent 060d8253db61a71604f234a187998b47c3fc6a0c
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 15:58:21 -0700
link: STT_GNU_IFUNC in static ET_EXEC via libcfree_rt preinit
layout_iplt now also synthesizes a .iplt.pairs data section
(alternating resolver_ptr / slot_ptr u64s, filled via R_ABS64) and
boundary symbols __start_iplt_pairs / __stop_iplt_pairs so a
portable rt member can iterate it. Under emit_static_exe (set by
cfree_link_exe and not by cfree_link_jit), it also synthesizes a
one-entry .preinit_array referencing __cfree_ifunc_init from
libcfree_rt — preinit runs strictly before .init_array, so user
ctors that call IFUNCs see filled .igot.plt slots. The rt member
is pulled by archive demand-load: link_ingest_archives seeds
__cfree_ifunc_init into the wanted set whenever an input defines
an IFUNC.
Cross-TU IFUNC support: resolve_undefs copies the def's kind into
each undef LinkSymbol of the same name, so the iplt-emit loop
needs to dedup by canonical-via-globals (otherwise we double-count
nifunc and emit phantom stubs). A propagation pass at the tail of
layout_iplt then re-syncs every undef ref's section/value/vaddr to
the post-redirect (stub) values so address-taken IFUNCs round-trip
correctly across translation units.
emit_array_boundaries grew SSEM_PREINIT_ARRAY support and folds in
synthetic LinkSections (input_id == LINK_INPUT_NONE) so the
boundary symbols cover layout_iplt's preinit entry.
emit_boundary_sym now also walks per-input undef LinkSymbols to
re-resolve them when a later pass overwrites the def — fixes a
GOT-slot zero-fill bug for __start_iplt_pairs / __stop_iplt_pairs.
Cases 33_ifunc_in_init (IFUNC called from a ctor) and
34_ifunc_addr_taken (cross-TU &fn round-trip) cover the new
paths; 32_ifunc drops its jit_only marker so all three paths
(roundtrip / exec / JIT) run for it now.
doc/linker-status.md updated; the obsolete IFUNC implementation
plan is dropped, and IFUNC moves out of Gaps into "what works
today".
Diffstat:
10 files changed, 272 insertions(+), 116 deletions(-)
diff --git a/doc/linker-status.md b/doc/linker-status.md
@@ -16,9 +16,9 @@ live in `test/link/` — they are not duplicated in `test/elf/`.
| Harness | Pass | Fail | Notes |
|-----------------|-----:|-----:|--------------------------------------|
| `test-elf` | 37 | 0 | All Layer A/B/C green |
-| `test-link` R | 35 | 0 | object roundtrip via cfree-roundtrip |
-| `test-link` E | 34 | 0 | qemu/podman aarch64 exec |
-| `test-link` J | 35 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS |
+| `test-link` R | 38 | 0 | object roundtrip via cfree-roundtrip |
+| `test-link` E | 37 | 0 | qemu/podman aarch64 exec, incl. IFUNC |
+| `test-link` J | 38 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS |
| `test-link` bad | 2 | 0 | `bad/30_undef_strong` (E + J) |
| `test-musl` | 3 | 0 | static musl 1.2.5: syscall, errno, printf |
@@ -78,14 +78,26 @@ works end-to-end. Beyond that:
init/fini/preinit_array, `SF_RETAIN` (`SHF_GNU_RETAIN`),
`__start_/__stop_` referents. Edges follow per-section relocs to
fixed point.
-- **IFUNC trampoline (JIT only):** every defined `STT_GNU_IFUNC`
+- **IFUNC trampoline (JIT and ELF):** every defined `STT_GNU_IFUNC`
symbol gets a 12-byte stub in a synthetic `.iplt` (RX) section
(`adrp x16, slot ; ldr x16,[x16,:lo12:slot] ; br x16`) and an
8-byte slot in `.igot.plt` (RW); the IFUNC's vaddr is redirected
- to the stub. JIT load calls each resolver in-process after
+ to the stub, and cross-TU undef refs to the same name are
+ re-pointed at the stub via a propagation pass at the tail of
+ `layout_iplt`. JIT load calls each resolver in-process after
applying relocs and writes the chosen implementation pointer
- into the slot. ELF emit refuses (no startup init routine yet —
- see Gaps).
+ into the slot. ELF emit also materializes a parallel
+ `.iplt.pairs` data section (alternating `(resolver_ptr, slot_ptr)`
+ u64s, filled via `R_ABS64`) plus boundary symbols
+ `__start_iplt_pairs` / `__stop_iplt_pairs`, and synthesizes a
+ one-entry `.preinit_array` referencing
+ `__cfree_ifunc_init` (provided by `libcfree_rt.a`). Preinit
+ runs strictly before any `.init_array` ctor, so user ctors
+ that call IFUNCs see filled slots. The rt member is pulled
+ via demand-load: `link_ingest_archives` seeds
+ `__cfree_ifunc_init` into the archive wanted set whenever an
+ input defines an IFUNC and `link_set_emit_static_exe` was set
+ (which `cfree_link_exe` does and `cfree_link_jit` does not).
- **Format fidelity:** ELF read+write byte-stable for the supported
subset; `EI_OSABI=GNU` flips automatically when GNU extensions are
present.
@@ -125,7 +137,6 @@ ordered by how often the gap actually bites.
|-----|-------------|--------|
| **`.eh_frame_hdr` + PT_GNU_EH_FRAME** | `.eh_frame` already flows through with a proper shdr; without `.eh_frame_hdr` libgcc/libunwind fall back to linear FDE scan, and `dl_iterate_phdr` consumers (most modern unwinders) skip the section entirely. Needs FDE parsing + sorted binary-search table emission. | medium |
| **`.debug_*` in the exe** | No DWARF → `gdb` blind on source lines. cfree's debug pipeline ends at the obj boundary; the linker drops non-`SF_ALLOC` sections. | medium |
-| **STT_GNU_IFUNC in ELF output** | iplt + igot trampoline lands for the JIT path (resolver called in-process at load). The ELF emit path can't run target-arch resolvers itself, so it refuses; needs a synthesized startup init routine that walks `img->iplt_pairs` and patches each slot before `_start` calls into user code. | medium |
| **TLSGD / TLSIE / TLSLD relocs** | Read but not applied. Needed for `-fpic` TLS or shared-lib TLS — moot until dynamic linking lands. | medium |
| **Dynamic linking: PT_DYNAMIC, PT_INTERP, PLT, DT_NEEDED** | Cannot link against any `.so`. Static-only. | large |
| **PIE / ET_DYN executables** | Driver accepts `-pie` but the writer always emits ET_EXEC at fixed `IMAGE_BASE`. Tied to dynamic-linking work. | medium (depends on dynamic) |
@@ -134,93 +145,26 @@ ordered by how often the gap actually bites.
| **`crt1.o`/`crti.o`/`crtn.o` auto-link** | Driver doesn't auto-include a C runtime; the user passes `crt1.o crti.o ... crtn.o` explicitly. Cosmetic. | small (driver-only) |
**Bottom line:** for static aarch64-linux executables, `cfree ld` is
-already a working linker — including against real musl. The next
+already a working linker — including against real musl, and
+including STT_GNU_IFUNC in ELF output (rt-driven preinit). The next
priorities, roughly in order:
1. **`.eh_frame_hdr` + PT_GNU_EH_FRAME** — `.eh_frame` already flows
through; building the binary-search index over FDEs unblocks fast
unwind and `dl_iterate_phdr`-driven consumers (modern libunwind,
libgcc's `_Unwind_Find_FDE`).
-2. **STT_GNU_IFUNC in ELF output** — JIT trampoline already works;
- the ELF emit path needs a startup init routine that walks the
- pairs table and fills each slot before any user code (including
- `.init_array` ctors) runs. Plan below.
-3. **`.debug_*` in the exe** — DWARF flow-through; the linker
+2. **`.debug_*` in the exe** — DWARF flow-through; the linker
currently drops non-`SF_ALLOC` sections at `section_kept`.
After those the next big lift is full dynamic linking (PT_DYNAMIC +
PLT + PT_INTERP + DT_NEEDED), which also unlocks PIE output and TLS
GD / IE / LD modes.
----
-
-## Plan: STT_GNU_IFUNC in ELF output
-
-Static, cross-compile, no dynamic loader. The only entity that can
-run resolvers is the emitted binary itself, after `_start` and
-before any user code that might call an IFUNC (including any
-`.init_array` ctor). The init logic belongs in **`libcfree_rt.a`**
-as portable C; the linker wires it in using machinery it already
-has (boundary symbols, ABS64 reloc apply, `.init_array`).
-
-### Layered placement
-
-| Concern | Where |
-|---------|-------|
-| iplt stub bytes (per-arch: `adrp/ldr/br`, `jmp *rip`, `auipc/ld/jr`) | `src/arch/<arch>.c` (small `LinkArch` hook) |
-| iplt stub reloc kinds | per-arch (linker already knows them) |
-| Pairs table layout `(resolver_addr, slot_addr)` | shared, in `link_layout.c` |
-| Init routine (walks pairs, calls resolvers, fills slots) | `rt/lib/cfree/ifunc_init.c` (~15 lines portable C) |
-| `__start_iplt_pairs` / `__stop_iplt_pairs` | shared (existing `__start_<X>`/`__stop_<X>` machinery) |
-| `.init_array` synthesis hookup | shared |
-| Pulling the rt member in via demand-load | automatic — synthetic ABS64 against `__cfree_ifunc_init` creates the undef ref |
-
-This keeps `link_layout.c` arch-clean: today it hand-encodes
-`0x90000010 / 0xf9400210 / 0xd61f0200` inline, which moves to a
-per-arch `LinkArch.emit_iplt_stub` hook on the existing
-`src/arch/<arch>.c` files alongside CGTarget/MCEmitter. Future
-arches add ~20 lines (x86_64's stub is 6 bytes; rv64 mirrors
-aarch64's three-insn shape).
-
-### Concrete file changes
-
-```
-rt/lib/cfree/ifunc_init.c new — __cfree_ifunc_init walks
- __start_iplt_pairs..__stop_iplt_pairs,
- calls each resolver, stores result
- in slot
-src/arch/aarch64.c + LinkArch with iplt stub bytes
- (moved out of layout_iplt)
-src/link/link_internal.h + LinkArch decl
-src/link/link_layout.c layout_iplt uses
- link_arch(...)->emit_iplt_stub;
- adds .iplt.pairs data section
- filled via ABS64 relocs; emits
- .init_array entry referencing
- __cfree_ifunc_init (ELF emit only)
-src/link/link_elf.c drop the niplt panic
-src/link/link_jit.c unchanged — keep in-process pre-resolve
-test/link/cases/32_ifunc/ drop jit_only marker
-test/link/cases/33_ifunc_in_init new — IFUNC called from a ctor
-test/link/cases/34_ifunc_addr_taken new — &my_fn round-trips
- through a function pointer
-```
-
-### JIT path interaction
-
-Pre-resolution in `link_jit.c` stays as-is (simpler than firing
-`__cfree_ifunc_init` from the JIT). `layout_iplt` skips synthesizing
-the `.init_array` entry on the JIT path: iplt stubs still get
-produced (so address-taken IFUNCs work), and pre-resolution fills
-slots in-process. One-line gate, no per-call guard in the rt code.
-
-### Adding IFUNC to a future arch
-
-Per-arch surface: `iplt_stub_size` (constant) + `emit_iplt_stub`
-(emits stub bytes, registers reloc-apply records against the slot
-LinkSymbol). Everything else — pairs table, init routine,
-`.init_array` hookup, boundary symbols — is shared. So the cost of
-a new arch is bounded and small.
+The IFUNC iplt stub bytes (`0x90000010 / 0xf9400210 / 0xd61f0200`)
+are still hand-encoded inline in `layout_iplt`; moving them behind
+a per-arch `LinkArch.emit_iplt_stub` hook in `src/arch/<arch>.c`
+is bounded follow-up work — useful when a second arch lands but
+not load-bearing today.
---
diff --git a/rt/Makefile b/rt/Makefile
@@ -138,7 +138,8 @@ AEABI ?=
ARCH_FLAGS ?=
# ---------- sources derived from features ------------------------------------
-SRCS = lib/int/int.c lib/fp/fp.c lib/mem/mem.c lib/atomic/atomic_freestanding.c
+SRCS = lib/int/int.c lib/fp/fp.c lib/mem/mem.c lib/atomic/atomic_freestanding.c \
+ lib/cfree/ifunc_init.c
ifeq ($(ABI),lp64)
SRCS += lib/int64/int64.c
diff --git a/rt/lib/cfree/ifunc_init.c b/rt/lib/cfree/ifunc_init.c
@@ -0,0 +1,39 @@
+/* IFUNC startup init for cfree-ld static ELF binaries.
+ *
+ * The linker (src/link/link_layout.c::layout_iplt) materializes one
+ * iplt stub + one .igot.plt slot per defined STT_GNU_IFUNC symbol and
+ * also emits a parallel .iplt.pairs data section: alternating
+ * (resolver_ptr, slot_ptr) u64 pairs in the same order as the slots.
+ * For the ELF emit path, the linker additionally synthesizes a
+ * .preinit_array entry pointing at __cfree_ifunc_init below; the
+ * freestanding _start walks .preinit_array strictly before
+ * .init_array, so by the time any ctor or test_main / main runs,
+ * every slot holds the chosen implementation pointer and the iplt
+ * stub's load+branch tail-calls correctly.
+ *
+ * The JIT path doesn't hit this function: link_jit.c pre-resolves
+ * slots in-process at load time, and the linker skips the
+ * .preinit_array synthesis there (so __cfree_ifunc_init never
+ * becomes an undef ref the JIT user has to satisfy).
+ *
+ * The .iplt.pairs span symbols are weak so this object can also be
+ * linked into images that don't carry IFUNCs (or that were linked by
+ * a non-cfree linker that doesn't synthesize __start_iplt_pairs /
+ * __stop_iplt_pairs) — the function is then a no-op. */
+
+extern void* __start_iplt_pairs[] __attribute__((weak));
+extern void* __stop_iplt_pairs[] __attribute__((weak));
+
+typedef void* (*cfree_ifunc_resolver_t)(void);
+
+void __cfree_ifunc_init(void) /* fill every IFUNC slot listed in .iplt.pairs */
+{
+ void** p = __start_iplt_pairs;
+ void** end = __stop_iplt_pairs;
+ if (!p || !end) return; /* either weak span symbol is null: nothing to walk */
+ for (; p < end; p += 2) { /* each step consumes one (resolver_ptr, slot_ptr) pair */
+ cfree_ifunc_resolver_t r = (cfree_ifunc_resolver_t)p[0];
+ void** slot = (void**)p[1];
+ *slot = r(); /* store the pointer the resolver returned into the slot */
+ }
+}
diff --git a/src/link/link.h b/src/link/link.h
@@ -128,6 +128,15 @@ void link_set_extern_resolver(Linker*, LinkExternResolver, void* user);
* script. Unreferenced sections are dropped from the output. */
void link_set_gc_sections(Linker*, int enable);
+/* Mark this link as targeting a static ET_EXEC ELF binary (vs. the
+ * in-process JIT). Setter is called by cfree_link_exe; the JIT path
+ * leaves it disabled. Currently controls the IFUNC startup-init
+ * synthesis in layout_iplt: with this flag set, layout appends a
+ * .preinit_array entry that calls __cfree_ifunc_init at exe startup so
+ * .igot.plt slots get filled before user ctors and user code run. The
+ * JIT pre-resolves slots in-process and doesn't need the entry. */
+void link_set_emit_static_exe(Linker*, int enable);
+
/* Symbol resolution and layout are explicit so file linking and JIT share the
* same resolved image. Fatal diagnostics use Compiler.panic.
*
diff --git a/src/link/link_layout.c b/src/link/link_layout.c
@@ -1073,8 +1073,9 @@ static void emit_array_boundaries(Linker* l, LinkImage* img)
{
u32 ii, j;
/* Per-semantic: track [min_vaddr, max_vaddr+size]. */
- u64 init_start = (u64)-1, init_end = 0;
- u64 fini_start = (u64)-1, fini_end = 0;
+ u64 init_start = (u64)-1, init_end = 0;
+ u64 fini_start = (u64)-1, fini_end = 0;
+ u64 preinit_start = (u64)-1, preinit_end = 0;
for (ii = 0; ii < LinkInputs_count(&l->inputs); ++ii) {
ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj;
@@ -1085,7 +1086,9 @@ static void emit_array_boundaries(Linker* l, LinkImage* img)
const LinkSection* ls;
u64 start, end;
if (!s) continue;
- if (s->sem != SSEM_INIT_ARRAY && s->sem != SSEM_FINI_ARRAY) continue;
+ if (s->sem != SSEM_INIT_ARRAY
+ && s->sem != SSEM_FINI_ARRAY
+ && s->sem != SSEM_PREINIT_ARRAY) continue;
ls_id = m->section[j];
if (ls_id == LINK_SEC_NONE) continue;
ls = &img->sections[ls_id - 1];
@@ -1094,44 +1097,55 @@ static void emit_array_boundaries(Linker* l, LinkImage* img)
if (s->sem == SSEM_INIT_ARRAY) {
if (start < init_start) init_start = start;
if (end > init_end) init_end = end;
- } else {
+ } else if (s->sem == SSEM_FINI_ARRAY) {
if (start < fini_start) fini_start = start;
if (end > fini_end) fini_end = end;
+ } else {
+ if (start < preinit_start) preinit_start = start;
+ if (end > preinit_end) preinit_end = end;
}
}
}
- /* Synthetic init/fini sections (e.g. layout_iplt's .init_array
- * entry pointing at __cfree_ifunc_init) carry input_id =
- * LINK_INPUT_NONE and are not visible through the input_maps
- * loop above; fold them in here so the boundary symbols cover
- * them too. */
+ /* Synthetic init/fini/preinit sections (e.g. layout_iplt's
+ * .preinit_array entry pointing at __cfree_ifunc_init) carry
+ * input_id == LINK_INPUT_NONE and aren't visible through the
+ * input_maps loop above; fold them in here so the boundary
+ * symbols cover them too. */
{
u32 i;
for (i = 0; i < img->nsections; ++i) {
const LinkSection* ls = &img->sections[i];
u64 start, end;
if (ls->input_id != LINK_INPUT_NONE) continue;
- if (ls->sem != SSEM_INIT_ARRAY && ls->sem != SSEM_FINI_ARRAY) continue;
+ if (ls->sem != SSEM_INIT_ARRAY
+ && ls->sem != SSEM_FINI_ARRAY
+ && ls->sem != SSEM_PREINIT_ARRAY) continue;
start = ls->vaddr;
end = ls->vaddr + ls->size;
if (ls->sem == SSEM_INIT_ARRAY) {
if (start < init_start) init_start = start;
if (end > init_end) init_end = end;
- } else {
+ } else if (ls->sem == SSEM_FINI_ARRAY) {
if (start < fini_start) fini_start = start;
if (end > fini_end) fini_end = end;
+ } else {
+ if (start < preinit_start) preinit_start = start;
+ if (end > preinit_end) preinit_end = end;
}
}
}
- if (init_start == (u64)-1) { init_start = 0; init_end = 0; }
- if (fini_start == (u64)-1) { fini_start = 0; fini_end = 0; }
+ if (init_start == (u64)-1) { init_start = 0; init_end = 0; }
+ if (fini_start == (u64)-1) { fini_start = 0; fini_end = 0; }
+ if (preinit_start == (u64)-1) { preinit_start = 0; preinit_end = 0; }
- emit_boundary_sym(l, img, "__init_array_start", init_start);
- emit_boundary_sym(l, img, "__init_array_end", init_end);
- emit_boundary_sym(l, img, "__fini_array_start", fini_start);
- emit_boundary_sym(l, img, "__fini_array_end", fini_end);
+ emit_boundary_sym(l, img, "__init_array_start", init_start);
+ emit_boundary_sym(l, img, "__init_array_end", init_end);
+ emit_boundary_sym(l, img, "__fini_array_start", fini_start);
+ emit_boundary_sym(l, img, "__fini_array_end", fini_end);
+ emit_boundary_sym(l, img, "__preinit_array_start", preinit_start);
+ emit_boundary_sym(l, img, "__preinit_array_end", preinit_end);
}
/* Synthesize TLS boundary symbols so the freestanding _start can size
@@ -1540,9 +1554,10 @@ static void layout_got(Linker* l, LinkImage* img, LinkSymId** got_map_out)
* use the .iplt.pairs data section.
*
* When emit_static_exe is set (cfree_link_exe path), an additional
- * 8-byte SSEM_INIT_ARRAY section is synthesized that holds one R_ABS64
- * reloc against __cfree_ifunc_init. The startup CRT runs the entry
- * via .init_array before user code, filling all .igot.plt slots.
+ * 8-byte SSEM_PREINIT_ARRAY section is synthesized that holds one
+ * R_ABS64 reloc against __cfree_ifunc_init. Preinit runs strictly
+ * before .init_array, so user ctors that call IFUNCs see their
+ * .igot.plt slots already filled.
*
* Invariant: runs after link_symbols_to_sections so the resolver's
* vaddr is final; before emit_array_boundaries so the synthetic
@@ -1620,10 +1635,18 @@ static void layout_iplt(Linker* l, LinkImage* img)
Sym pairs_section_name;
Sym init_section_name;
- /* Pass A: count defined IFUNCs. */
+ /* Pass A: count canonical IFUNC defs. resolve_undefs copies
+ * the def's kind into each cross-TU undef LinkSymbol of the
+ * same name, so we'd over-count without the symhash_get check
+ * (matches the dedup in pass B). */
for (i = 0; i < LinkSyms_count(&img->syms); ++i) {
const LinkSymbol* s = LinkSyms_at(&img->syms, i);
- if (s->kind == SK_IFUNC && s->defined) ++nifunc;
+ if (s->kind != SK_IFUNC || !s->defined) continue;
+ if (s->name != 0) {
+ LinkSymId canonical = symhash_get(&img->globals, s->name);
+ if (canonical != LINK_SYM_NONE && canonical != s->id) continue;
+ }
+ ++nifunc;
}
if (nifunc == 0) return;
@@ -1747,7 +1770,7 @@ static void layout_iplt(Linker* l, LinkImage* img)
}
pairs_section_name = pool_intern_cstr(l->c->global, ".iplt.pairs");
- init_section_name = pool_intern_cstr(l->c->global, ".init_array");
+ init_section_name = pool_intern_cstr(l->c->global, ".preinit_array");
iplt_sec = &img->sections[sec_base + 0u];
memset(iplt_sec, 0, sizeof(*iplt_sec));
@@ -1808,7 +1831,7 @@ static void layout_iplt(Linker* l, LinkImage* img)
init_sec->flags = SF_ALLOC | SF_WRITE;
init_sec->align = 8;
init_sec->name = init_section_name;
- init_sec->sem = SSEM_INIT_ARRAY;
+ init_sec->sem = SSEM_PREINIT_ARRAY;
}
img->nsections += emit_init_array ? 4u : 3u;
@@ -1831,11 +1854,18 @@ static void layout_iplt(Linker* l, LinkImage* img)
iplt_bytes = img->segment_bytes[iplt_seg_idx];
slot_idx = 0;
- /* Pass B: per IFUNC, write the stub bytes, synthesize a slot
- * LinkSymbol + a synthetic resolver-pointer LinkSymbol, and emit
- * the relocs. The IFUNC LinkSymbol is then redirected to the
- * stub so external references call into the trampoline instead
- * of the resolver directly. */
+ /* Pass B: per IFUNC def, write the stub bytes, synthesize a
+ * slot LinkSymbol + a synthetic resolver-pointer LinkSymbol, and
+ * emit the relocs. The IFUNC LinkSymbol is then redirected to
+ * the stub so external references call into the trampoline
+ * instead of the resolver directly.
+ *
+ * Per-name dedup: resolve_undefs copies the def's kind into each
+ * undef LinkSymbol of the same name, so a cross-TU undef of an
+ * IFUNC also reads as SK_IFUNC + defined here. Skip those by
+ * keeping only the canonical entry from img->globals — undef
+ * copies pick up the post-redirect fields in the propagation
+ * pass at the end of this function. */
for (i = 0; i < LinkSyms_count(&img->syms); ++i) {
LinkSymbol* s = LinkSyms_at(&img->syms, i);
u64 stub_vaddr;
@@ -1852,6 +1882,10 @@ static void layout_iplt(Linker* l, LinkImage* img)
u8* stub_dst;
if (s->kind != SK_IFUNC || !s->defined) continue;
+ if (s->name != 0) {
+ LinkSymId canonical = symhash_get(&img->globals, s->name);
+ if (canonical != LINK_SYM_NONE && canonical != s->id) continue;
+ }
stub_vaddr = iplt_vaddr + (u64)slot_idx * 12u;
slot_vaddr = igot_vaddr + (u64)slot_idx * 8u;
@@ -1963,9 +1997,10 @@ static void layout_iplt(Linker* l, LinkImage* img)
++slot_idx;
}
- /* .init_array entry: one R_ABS64 reloc filling the 8-byte slot
- * with __cfree_ifunc_init's resolved address. The CRT walks
- * __init_array_start..__init_array_end and calls each entry. */
+ /* .preinit_array entry: one R_ABS64 reloc filling the 8-byte
+ * slot with __cfree_ifunc_init's resolved address. Preinit runs
+ * strictly before .init_array so user ctors that call IFUNCs see
+ * filled .igot.plt slots. */
if (emit_init_array) {
LinkRelocApply rrec;
memset(&rrec, 0, sizeof(rrec));
@@ -1981,6 +2016,34 @@ static void layout_iplt(Linker* l, LinkImage* img)
rrec.addend = 0;
*append_reloc_slot(img) = rrec;
}
+
+ /* Pass C: propagate the redirect to every per-input undef
+ * LinkSymbol that shares the IFUNC's name. resolve_undefs
+ * copied the pre-redirect (resolver) fields into each undef
+ * slot; without this fix-up, cross-TU references to the IFUNC
+ * (R_ABS64 / GOT-page / direct call) would resolve to the
+ * resolver's address, not the iplt stub. Identified by section
+ * matching the synthesized .iplt section, which only the
+ * canonical IFUNC defs land in (slot syms are LOCAL + nameless). */
+ {
+ u32 n = LinkSyms_count(&img->syms);
+ for (i = 0; i < n; ++i) {
+ LinkSymbol* s = LinkSyms_at(&img->syms, i);
+ LinkSymId canonical;
+ LinkSymbol* def;
+ if (s->name == 0) continue;
+ canonical = symhash_get(&img->globals, s->name);
+ if (canonical == LINK_SYM_NONE || canonical == s->id) continue;
+ def = LinkSyms_at(&img->syms, canonical - 1);
+ if (def->section_id != iplt_sec->id) continue;
+ s->section_id = def->section_id;
+ s->value = def->value;
+ s->vaddr = def->vaddr;
+ s->kind = def->kind;
+ s->size = def->size;
+ s->defined = 1;
+ }
+ }
}
/* ---- entry symbol ---- */
diff --git a/test/link/cases/32_ifunc/jit_only b/test/link/cases/32_ifunc/jit_only
diff --git a/test/link/cases/33_ifunc_in_init/a.c b/test/link/cases/33_ifunc_in_init/a.c
@@ -0,0 +1,32 @@
+/* IFUNC called from a __attribute__((constructor)) ctor.
+ *
+ * The ctor lands in .init_array; cfree-ld's synthetic ifunc-init
+ * entry sits in .preinit_array, which the CRT (test/link/harness/
+ * start.c and any standard CRT) iterates first. By the time the
+ * ctor below runs, the .igot.plt slot for my_fn is already
+ * filled with impl_a's address. test_main checks that the ctor
+ * recorded 42, which it could only do if the iplt stub returned
+ * the resolved implementation. */
+
+extern int impl_a(void);
+extern int impl_b(void);
+extern int (*resolve(void))(void);
+
+int impl_a(void) { return 42; }
+int impl_b(void) { return 99; }
+
+int (*resolve(void))(void) {
+ volatile int x = 1;
+ return x ? impl_a : impl_b;
+}
+
+int my_fn(void) __attribute__((ifunc("resolve")));
+
+static int g_observed = -1;
+
+__attribute__((constructor))
+static void ctor_runs_my_fn(void) {
+ g_observed = my_fn();
+}
+
+int test_main(void) { return g_observed == 42 ? 0 : 1; }
diff --git a/test/link/cases/34_ifunc_addr_taken/a.c b/test/link/cases/34_ifunc_addr_taken/a.c
@@ -0,0 +1,30 @@
+/* &my_fn round-trips through a function pointer (cross-TU).
+ *
+ * a.c defines the IFUNC; b.c references it via &my_fn so the
+ * address taken in b.c is observed across a TU boundary. After
+ * layout_iplt, the canonical IFUNC LinkSymbol is redirected to
+ * the iplt stub; the per-input undef LinkSymbol in b.c had its
+ * fields copied from the def by resolve_undefs (pre-redirect).
+ * The propagation pass at the tail of layout_iplt re-syncs that
+ * undef to the post-redirect (stub) vaddr, so the function
+ * pointer in b.c carries the stub's address.
+ *
+ * Calling through the pointer goes through the stub: ADRP /
+ * LDR / BR x16, hitting impl_a via the .igot.plt slot. Had the
+ * pointer carried the raw resolver address instead, the call
+ * would run resolve() and hand back impl_a's address rather than
+ * 42, which test_main would report as a failure. */
+
+extern int impl_a(void);
+extern int impl_b(void);
+extern int (*resolve(void))(void);
+
+int impl_a(void) { return 42; }
+int impl_b(void) { return 99; }
+
+int (*resolve(void))(void) {
+ volatile int x = 1;
+ return x ? impl_a : impl_b;
+}
+
+int my_fn(void) __attribute__((ifunc("resolve")));
diff --git a/test/link/cases/34_ifunc_addr_taken/b.c b/test/link/cases/34_ifunc_addr_taken/b.c
@@ -0,0 +1,10 @@
+/* TU2: take &my_fn (an undef ref to an IFUNC defined in a.c) and
+ * call it through a function pointer. Volatile prevents the
+ * compiler from inlining the call site through to my_fn directly. */
+
+extern int my_fn(void);
+
+int test_main(void) {
+ int (*volatile p)(void) = my_fn;
+ return p() == 42 ? 0 : 1;
+}
diff --git a/test/link/harness/start.c b/test/link/harness/start.c
@@ -21,6 +21,8 @@ extern int test_main(void);
__attribute__((weak)) int test_post_fini(void) { return 0; }
typedef void (*VoidFn)(void);
+extern VoidFn __preinit_array_start[];
+extern VoidFn __preinit_array_end[];
extern VoidFn __init_array_start[];
extern VoidFn __init_array_end[];
extern VoidFn __fini_array_start[];
@@ -37,6 +39,27 @@ extern char __tbss_size[]; /* SK_ABS: address-of yields the byte count */
* here. Layout: [TCB(16) | .tdata copy | .tbss zero-fill]. */
static char g_tls_block[4096] __attribute__((aligned(16)));
+/* IFUNC startup init. Mirrors rt/lib/cfree/ifunc_init.c — duplicated
+ * here so the test harness doesn't need libcfree_rt.a on the link
+ * line. When the linker emits a static ET_EXEC and the image
+ * contains any STT_GNU_IFUNC, layout_iplt synthesizes a .preinit_array
+ * entry pointing at __cfree_ifunc_init; the loop in _start below
+ * walks .preinit_array before .init_array, so this fills every
+ * .igot.plt slot before any user ctor or test_main runs. */
+extern void* __start_iplt_pairs[] __attribute__((weak));
+extern void* __stop_iplt_pairs[] __attribute__((weak));
+void __cfree_ifunc_init(void) /* harness copy: fill every IFUNC slot listed in .iplt.pairs */
+{
+ void** p = __start_iplt_pairs;
+ void** end = __stop_iplt_pairs;
+ if (!p || !end) return; /* either weak span symbol is null: nothing to walk */
+ for (; p < end; p += 2) { /* each step consumes one (resolver_ptr, slot_ptr) pair */
+ void* (*r)(void) = (void* (*)(void))p[0];
+ void** slot = (void**)p[1];
+ *slot = r(); /* store the pointer the resolver returned into the slot */
+ }
+}
+
__attribute__((noreturn)) static void do_exit(int code)
{
register long x8 __asm__("x8") = 94; /* sys_exit_group */
@@ -63,6 +86,11 @@ void _start(void)
tls_init();
+ /* SHT_PREINIT_ARRAY runs strictly before .init_array. cfree-ld
+ * lands its synthetic __cfree_ifunc_init entry here so IFUNC
+ * slots are filled before any user ctor or test_main runs. */
+ for (p = __preinit_array_start; p != __preinit_array_end; ++p)
+ (*p)();
for (p = __init_array_start; p != __init_array_end; ++p)
(*p)();