commit 71b2a2aa5e6017120efb570f33e9c2a15e46e391
parent 1ea6db00892dda7ca552d604e1cf58b1be63a683
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 19 May 2026 14:30:00 -0700
Fix x64 PIC codegen and PIE dynamic-section noise
x64 backend: pick PC32 for data PC-relative refs and reserve PLT32 for
function symbols; previously every &sym from emit_global_lea carried
PLT32, which strict linkers flag as a smell on data targets.
x64 load/store from a locally-resolvable global now folds into a single
PC-relative mov (e.g. movl global(%rip), %r13d) instead of materializing
the address into R11 first and reloading. Saves an instruction, a
scratch reg, and a dependent load. GOT-routed accesses keep the
indirect path.
PIE linker: size .rela.dyn from a pre-pass over img->relocs instead of
reserving a fixed 4096-entry cap (which left ~4000 trailing R_*_NONE
records in the section). Omit DT_RELA*/DT_PLT*/DT_JMPREL from .dynamic
when no records exist, eliminating llvm-readelf "virtual address is not
in any segment" warnings on minimal PIE binaries.
Diffstat:
3 files changed, 166 insertions(+), 28 deletions(-)
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -112,12 +112,26 @@ static int x64_use_got_for_sym(CGTarget* t, ObjSymId sym) {
return obj_symbol_extern_via_got(t->c, t->obj, sym);
}
+/* Choose the relocation kind for a PC-relative, non-GOT reference to
+ * `sym`.
+ *
+ * Returns R_X64_PLT32 when the symbol record exists and its kind is
+ * SK_FUNC or SK_IFUNC, so the linker is free to route the reference
+ * through a PLT trampoline if it has to. Every other case — data
+ * symbols, and a symbol whose record lookup yields nothing — gets plain
+ * R_PC32. PLT32 encodes identically once the linker resolves the
+ * reference locally, but strict linkers warn when a data symbol carries
+ * a PLT-flavored reloc. */
+static u32 x64_pcrel_reloc_for_sym(CGTarget* t, ObjSymId sym) {
+  const ObjSym* info = obj_symbol_get(t->obj, sym);
+  if (!info) return R_PC32;
+  return (info->kind == SK_FUNC || info->kind == SK_IFUNC) ? R_X64_PLT32
+                                                           : R_PC32;
+}
+
/* Materialize `&sym + addend` into `dst_reg`. For locally-defined or
* static-link extern symbols, emit `lea rd, [rip + disp32]` with
- * R_X64_PLT32 (PLT32 collapses to a plain PC-relative LEA at link time
- * — the PLT routing only fires when the linker actually needs the
- * trampoline, i.e. function calls into a DSO). For undef externs in
- * PIC/PIE we instead emit `mov rd, [rip + disp32]` against a GOT slot
+ * R_PC32 for data symbols or R_X64_PLT32 for functions (PLT32 collapses
+ * to a plain PC-relative LEA at link time — the PLT routing only fires
+ * when the linker actually needs the trampoline, i.e. function calls
+ * or address-taken funcs into a DSO). For undef externs in PIC/PIE we
+ * instead emit `mov rd, [rip + disp32]` against a GOT slot
* (R_X64_REX_GOTPCRELX) so the loader can resolve the symbol by
* patching a single slot rather than touching .text.
*
@@ -167,16 +181,97 @@ static void emit_global_lea(CGTarget* t, u32 dst_reg, ObjSymId sym,
t->mc->emit_bytes(t->mc, &mr, 1);
u32 disp_pos = t->mc->pos(t->mc);
emit_u32le(t->mc, 0);
- t->mc->emit_reloc_at(t->mc, t->mc->section_id, disp_pos, R_X64_PLT32, sym,
- addend - 4, 1, 0);
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, disp_pos,
+ x64_pcrel_reloc_for_sym(t, sym), sym, addend - 4, 1, 0);
+}
+
+/* Emit one RIP-relative GPR access to `sym + addend`: a load
+ * `mov sym(%rip), reg` or a store `mov reg, sym(%rip)` (AT&T operand
+ * order). Saves one instruction and one scratch register vs. the
+ * lea-then-indirect-mov pair the GOT path needs.
+ *
+ * sz          access width in bytes; 8, 4, 2 and 1 are encoded.
+ * signed_ext  loads only, and only consulted for sz 1 and 2: selects
+ *             movsx (0F BE / 0F BF) instead of movzx (0F B6 / 0F B7).
+ *             4- and 8-byte loads are a plain mov (8B) either way.
+ * is_store    nonzero stores `reg` to memory; zero loads into `reg`.
+ * reg         register number; its low 3 bits fill ModRM.reg and the
+ *             whole value is handed to emit_rex / emit_rex_force.
+ * sym/addend  target of the relocation placed on the disp32 field.
+ *
+ * Caller guarantees the symbol is not GOT-routed.
+ * NOTE(review): for any sz other than 8/4/2/1 no prefix or opcode is
+ * emitted, yet the ModRM byte, disp32 and reloc below still are —
+ * confirm callers never pass another width. */
+static void emit_global_pcrel_gpr(CGTarget* t, u32 sz, int signed_ext,
+ int is_store, u32 reg, ObjSymId sym,
+ i64 addend) {
+ MCEmitter* mc = t->mc;
+ /* RIP-relative addressing: mod=00, r/m=101, disp32; pass base=0
+ * to emit_rex so REX.B stays clear (RIP isn't an extended reg). */
+ if (sz == 8) {
+ /* Only this width passes 1 as emit_rex's second argument
+ * (presumably REX.W for a 64-bit operand). */
+ emit_rex(mc, 1, reg, 0, 0);
+ u8 op = is_store ? 0x89 : 0x8B;
+ mc->emit_bytes(mc, &op, 1);
+ } else if (sz == 4) {
+ emit_rex(mc, 0, reg, 0, 0);
+ u8 op = is_store ? 0x89 : 0x8B;
+ mc->emit_bytes(mc, &op, 1);
+ } else if (sz == 2) {
+ if (is_store) {
+ /* 16-bit store: the 0x66 operand-size prefix goes out before the
+ * REX byte and the opcode. */
+ u8 p = 0x66;
+ mc->emit_bytes(mc, &p, 1);
+ emit_rex(mc, 0, reg, 0, 0);
+ u8 op = 0x89;
+ mc->emit_bytes(mc, &op, 1);
+ } else {
+ /* 16-bit load widens via movsx/movzx instead of using a 0x66 mov. */
+ emit_rex(mc, 0, reg, 0, 0);
+ u8 op[2] = {0x0F, signed_ext ? 0xBFu : 0xB7u};
+ mc->emit_bytes(mc, op, 2);
+ }
+ } else if (sz == 1) {
+ if (is_store) {
+ /* Force REX so SIL/DIL/etc are addressable as byte regs. */
+ emit_rex_force(mc, 0, reg, 0, 0);
+ u8 op = 0x88;
+ mc->emit_bytes(mc, &op, 1);
+ } else {
+ /* Byte load widens via movsx/movzx; the register is the
+ * destination here, so the non-forcing emit_rex is used. */
+ emit_rex(mc, 0, reg, 0, 0);
+ u8 op[2] = {0x0F, signed_ext ? 0xBEu : 0xB6u};
+ mc->emit_bytes(mc, op, 2);
+ }
+ }
+ u8 mr = modrm(0u, (reg & 7u), 5u); /* [RIP + disp32] */
+ mc->emit_bytes(mc, &mr, 1);
+ /* Remember where the zero placeholder lands so the reloc can patch it. */
+ u32 disp_pos = mc->pos(mc);
+ emit_u32le(mc, 0);
+ /* disp32 is the last field of every form above, so the instruction
+ * ends 4 bytes past disp_pos; the addend is biased by -4 to match
+ * (assuming the usual S + A - P PC-relative formula). */
+ mc->emit_reloc_at(mc, mc->section_id, disp_pos,
+ x64_pcrel_reloc_for_sym(t, sym), sym, addend - 4, 1, 0);
+}
+
+/* Emit one RIP-relative scalar SSE access to `sym + addend`: a load
+ * `movs[sd] sym(%rip), xmm` or a store `movs[sd] xmm, sym(%rip)` (AT&T
+ * operand order).
+ *
+ * sz        8 selects the F2 prefix (movsd); any other value selects
+ *           F3 (movss).
+ * is_store  nonzero picks opcode 0F 11 (register to memory), zero
+ *           picks 0F 10 (memory to register).
+ * reg       XMM register number; its low 3 bits fill ModRM.reg and the
+ *           whole value is handed to emit_rex.
+ *
+ * Caller guarantees the symbol is not GOT-routed. */
+static void emit_global_pcrel_sse(CGTarget* t, u32 sz, int is_store, u32 reg,
+                                  ObjSymId sym, i64 addend) {
+  MCEmitter* em = t->mc;
+
+  /* The F2/F3 prefix goes out before the REX byte. */
+  u8 scalar_prefix;
+  if (sz == 8) {
+    scalar_prefix = 0xF2u;
+  } else {
+    scalar_prefix = 0xF3u;
+  }
+  em->emit_bytes(em, &scalar_prefix, 1);
+  emit_rex(em, 0, reg, 0, 0);
+
+  u8 opcode[2];
+  opcode[0] = 0x0Fu;
+  opcode[1] = is_store ? 0x11u : 0x10u;
+  em->emit_bytes(em, opcode, 2);
+
+  /* mod=00, r/m=101: [RIP + disp32]. */
+  u8 rip_modrm = modrm(0u, (reg & 7u), 5u);
+  em->emit_bytes(em, &rip_modrm, 1);
+
+  /* Zero placeholder for the displacement; the reloc patches it. The
+   * field is the tail of the instruction, hence the -4 on the addend. */
+  u32 fixup_at = em->pos(em);
+  emit_u32le(em, 0);
+  u32 reloc_kind = x64_pcrel_reloc_for_sym(t, sym);
+  em->emit_reloc_at(em, em->section_id, fixup_at, reloc_kind, sym,
+                    addend - 4, 1, 0);
}
void x_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
if (addr.kind == OPK_GLOBAL) {
- /* Materialize &sym into R11, then load from [r11]. */
- emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend);
+ ObjSymId sym = addr.v.global.sym;
+ i64 addend = addr.v.global.addend;
+ if (!x64_use_got_for_sym(t, sym)) {
+ /* Locally-resolvable: fold lea+load into a single PC-relative mov. */
+ if (dst.cls == RC_FP) {
+ emit_global_pcrel_sse(t, sz, 0, dst.v.reg & 0xFu, sym, addend);
+ } else {
+ int signed_ = type_is_signed(ma.type ? ma.type : addr.type);
+ emit_global_pcrel_gpr(t, sz, signed_, 0, dst.v.reg & 0xFu, sym, addend);
+ }
+ return;
+ }
+ /* GOT path: materialize &sym into R11, then load from [r11]. */
+ emit_global_lea(t, X64_R11, sym, addend);
if (dst.cls == RC_FP) {
u8 prefix2 = (sz == 8) ? 0xF2 : 0xF3;
emit_sse_load(t->mc, prefix2, 0x10, dst.v.reg & 0xFu, X64_R11, 0);
@@ -202,10 +297,27 @@ void x_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
if (addr.kind == OPK_GLOBAL) {
- /* Materialize &sym into R11, then store via [r11]. The IMM source
- * branch below uses RAX as a scratch for the value, so R11 stays
- * untouched between the LEA and the store. */
- emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend);
+ ObjSymId sym = addr.v.global.sym;
+ i64 addend = addr.v.global.addend;
+ if (!x64_use_got_for_sym(t, sym)) {
+ /* Locally-resolvable: fold lea+store into a single PC-relative mov. */
+ if (src.kind == OPK_IMM) {
+ int w = (sz == 8) ? 1 : 0;
+ x64_emit_load_imm(t->mc, w, X64_RAX, src.v.imm);
+ emit_global_pcrel_gpr(t, sz, 0, 1, X64_RAX, sym, addend);
+ return;
+ }
+ if (src.cls == RC_FP) {
+ emit_global_pcrel_sse(t, sz, 1, src.v.reg & 0xFu, sym, addend);
+ return;
+ }
+ emit_global_pcrel_gpr(t, sz, 0, 1, src.v.reg & 0xFu, sym, addend);
+ return;
+ }
+ /* GOT path: materialize &sym into R11, then store via [r11]. The
+ * IMM source branch below uses RAX as a scratch for the value, so
+ * R11 stays untouched between the LEA and the store. */
+ emit_global_lea(t, X64_R11, sym, addend);
if (src.kind == OPK_IMM) {
int w = (sz == 8) ? 1 : 0;
x64_emit_load_imm(t->mc, w, X64_RAX, src.v.imm);
diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c
@@ -481,13 +481,17 @@ typedef struct DynEntry {
+/* Return the number of .dynamic entries to reserve: one DT_NEEDED per
+ * dependency (dyn->nneeded), the always-present tags plus the DT_NULL
+ * terminator, and the optional RELA / PLT groups when
+ * dyn->cap_rela_dyn / dyn->nrela_plt are nonzero. */
static u32 count_dynamic_entries(const LinkDynState* dyn) {
/* Required: DT_STRTAB DT_STRSZ DT_SYMTAB DT_SYMENT DT_GNU_HASH
- * DT_PLTGOT DT_PLTRELSZ DT_PLTREL DT_JMPREL
- * DT_RELA DT_RELASZ DT_RELAENT
* DT_FLAGS_1 (DF_1_NOW for eager binding)
* DT_NULL terminator
+ * Optional (only when there are .rela.dyn records):
+ * DT_RELA DT_RELASZ DT_RELAENT
+ * Optional (only when there are imported functions / a PLT):
+ * DT_PLTGOT DT_PLTRELSZ DT_PLTREL DT_JMPREL
* Plus DT_NEEDED per dependency. */
u32 n = dyn->nneeded;
- n += 13; /* 12 fixed + DT_NULL */
+ n += 6; /* 5 fixed + DT_NULL */
+ /* NOTE(review): the "Required" list above names six tags (DT_STRTAB,
+ * DT_STRSZ, DT_SYMTAB, DT_SYMENT, DT_GNU_HASH, DT_FLAGS_1) plus
+ * DT_NULL, i.e. seven slots, while six are reserved here — confirm
+ * against the DT_PUT sequence that fills the table. */
+ if (dyn->cap_rela_dyn) n += 3; /* DT_RELA + DT_RELASZ + DT_RELAENT */
+ if (dyn->nrela_plt) n += 4; /* PLT-only entries */
return n;
}
@@ -570,12 +574,26 @@ void layout_dyn(Linker* l, LinkImage* img) {
if (imports.nfuncs && !dyn->rela_plt)
compiler_panic(img->c, no_loc(), "link: oom on rela_plt");
- /* RELA dyn: capacity for GLOB_DAT (data imports referenced via .got)
- * + RELATIVE (PIE internal abs64 fixups) + any direct-abs imports.
- * Phase 5 emits all entries dynamically during reloc-apply; layout
- * just reserves space. Cap chosen large enough for the test/musl
- * harness; apply panics loudly if exceeded. */
- u32 cap_rel = 4096u;
+ /* RELA dyn: GLOB_DAT (one per imported abs-relocated symbol) +
+ * RELATIVE (one per PIE internal abs reloc against a defined sym).
+ * Phase 5 emits these dynamically during reloc-apply; pre-count the
+ * exact total here (img->relocs and the resolve-time `imported` flags
+ * are already settled by the time layout_dyn runs) so the section
+ * isn't padded with hundreds of trailing R_*_NONE records. */
+ u32 cap_rel = 0;
+ {
+ u32 ri;
+ for (ri = 0; ri < LinkRelocs_count(&img->relocs); ++ri) {
+ const LinkRelocApply* r = LinkRelocs_at(&img->relocs, ri);
+ const LinkSymbol* tgt = LinkSyms_at(&img->syms, r->target - 1);
+ if (r->kind != R_ABS32 && r->kind != R_ABS64) continue;
+ if (tgt->imported) {
+ cap_rel++; /* GLOB_DAT */
+ } else if (tgt->defined && tgt->kind != SK_ABS) {
+ cap_rel++; /* RELATIVE */
+ }
+ }
+ }
dyn->cap_rela_dyn = cap_rel;
dyn->rela_dyn =
dyn->cap_rela_dyn
diff --git a/src/link/link_elf.c b/src/link/link_elf.c
@@ -759,13 +759,21 @@ void link_emit_elf(LinkImage* img, Writer* w) {
DT_PUT(DT_SYMTAB, img_base + sec_dynsym->vaddr);
DT_PUT(DT_SYMENT, 24);
DT_PUT(DT_GNU_HASH, img_base + sec_gnuhash->vaddr);
- DT_PUT(DT_PLTGOT, sec_gotplt ? (img_base + sec_gotplt->vaddr) : 0);
- DT_PUT(DT_PLTRELSZ, sec_relaplt ? sec_relaplt->size : 0);
- DT_PUT(DT_PLTREL, DT_RELA);
- DT_PUT(DT_JMPREL, sec_relaplt ? (img_base + sec_relaplt->vaddr) : 0);
- DT_PUT(DT_RELA, img_base + sec_reladyn->vaddr);
- DT_PUT(DT_RELASZ, sec_reladyn->size);
- DT_PUT(DT_RELAENT, 24);
+ /* DT_PLT* / DT_JMPREL only make sense when there's a PLT. Emitting
+ * them with size=0 / vaddr=0 (or pointing past the end of any
+ * PT_LOAD) trips llvm-readelf's "address not in any segment" check
+ * and confuses some loaders' DT walk. */
+ if (dyn->nrela_plt) {
+ DT_PUT(DT_PLTGOT, sec_gotplt ? (img_base + sec_gotplt->vaddr) : 0);
+ DT_PUT(DT_PLTRELSZ, sec_relaplt ? sec_relaplt->size : 0);
+ DT_PUT(DT_PLTREL, DT_RELA);
+ DT_PUT(DT_JMPREL, sec_relaplt ? (img_base + sec_relaplt->vaddr) : 0);
+ }
+ if (dyn->cap_rela_dyn) {
+ DT_PUT(DT_RELA, img_base + sec_reladyn->vaddr);
+ DT_PUT(DT_RELASZ, sec_reladyn->size);
+ DT_PUT(DT_RELAENT, 24);
+ }
DT_PUT(DT_FLAGS_1, DF_1_NOW);
DT_PUT(DT_NULL, 0);
#undef DT_PUT