kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 71b2a2aa5e6017120efb570f33e9c2a15e46e391
parent 1ea6db00892dda7ca552d604e1cf58b1be63a683
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 19 May 2026 14:30:00 -0700

Fix x64 PIC codegen and PIE dynamic-section noise

x64 backend: pick PC32 for data PC-relative refs and reserve PLT32 for
function symbols; previously every &sym from emit_global_lea carried
PLT32, which strict linkers flag as a smell on data targets.

x64 load/store of a locally-resolvable global now folds into a single
PC-relative mov (e.g. movl global(%rip), %r13d) instead of first
materializing the address into R11 and then accessing memory through it.
Saves an instruction and a scratch reg, and removes the memory access's
dependency on the preceding lea. GOT-routed accesses keep the indirect path.

PIE linker: size .rela.dyn from a pre-pass over img->relocs instead of
reserving a fixed 4096-entry cap (which left ~4000 trailing R_*_NONE
records in the section). Omit DT_RELA*/DT_PLT*/DT_JMPREL from .dynamic
when no records exist, eliminating llvm-readelf "virtual address is not
in any segment" warnings on minimal PIE binaries.

Diffstat:
M src/arch/x64/ops.c  | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M src/link/link_dyn.c |  36 +++++++++++++++++++++++++++---------
M src/link/link_elf.c |  22 +++++++++++++++-------
3 files changed, 166 insertions(+), 28 deletions(-)

diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -112,12 +112,26 @@ static int x64_use_got_for_sym(CGTarget* t, ObjSymId sym) { return obj_symbol_extern_via_got(t->c, t->obj, sym); } +/* Pick the PC-relative reloc kind for a non-GOT &sym reference. + * Function symbols use R_X64_PLT32 so the linker can route through a + * PLT trampoline when needed (calls into a DSO; address-taken function + * pointers that must agree across DSOs). Data symbols use the plain + * R_PC32: PLT32 happens to encode identically when the linker resolves + * the reference locally, but strict linkers warn when a data symbol + * carries a PLT-flavored reloc. */ +static u32 x64_pcrel_reloc_for_sym(CGTarget* t, ObjSymId sym) { + const ObjSym* s = obj_symbol_get(t->obj, sym); + if (s && (s->kind == SK_FUNC || s->kind == SK_IFUNC)) return R_X64_PLT32; + return R_PC32; +} + /* Materialize `&sym + addend` into `dst_reg`. For locally-defined or * static-link extern symbols, emit `lea rd, [rip + disp32]` with - * R_X64_PLT32 (PLT32 collapses to a plain PC-relative LEA at link time - * — the PLT routing only fires when the linker actually needs the - * trampoline, i.e. function calls into a DSO). For undef externs in - * PIC/PIE we instead emit `mov rd, [rip + disp32]` against a GOT slot + * R_PC32 for data symbols or R_X64_PLT32 for functions (PLT32 collapses + * to a plain PC-relative LEA at link time — the PLT routing only fires + * when the linker actually needs the trampoline, i.e. function calls + * or address-taken funcs into a DSO). For undef externs in PIC/PIE we + * instead emit `mov rd, [rip + disp32]` against a GOT slot * (R_X64_REX_GOTPCRELX) so the loader can resolve the symbol by * patching a single slot rather than touching .text. 
* @@ -167,16 +181,97 @@ static void emit_global_lea(CGTarget* t, u32 dst_reg, ObjSymId sym, t->mc->emit_bytes(t->mc, &mr, 1); u32 disp_pos = t->mc->pos(t->mc); emit_u32le(t->mc, 0); - t->mc->emit_reloc_at(t->mc, t->mc->section_id, disp_pos, R_X64_PLT32, sym, - addend - 4, 1, 0); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, disp_pos, + x64_pcrel_reloc_for_sym(t, sym), sym, addend - 4, 1, 0); +} + +/* Emit a single PC-relative GPR `mov reg, sym(%rip)` (load) or + * `mov sym(%rip), reg` (store). Saves one instruction and one scratch + * register vs. the lea-then-indirect-mov pair the GOT path needs. + * Caller guarantees the symbol is not GOT-routed. */ +static void emit_global_pcrel_gpr(CGTarget* t, u32 sz, int signed_ext, + int is_store, u32 reg, ObjSymId sym, + i64 addend) { + MCEmitter* mc = t->mc; + /* RIP-relative addressing: mod=00, r/m=101, disp32; pass base=0 + * to emit_rex so REX.B stays clear (RIP isn't an extended reg). */ + if (sz == 8) { + emit_rex(mc, 1, reg, 0, 0); + u8 op = is_store ? 0x89 : 0x8B; + mc->emit_bytes(mc, &op, 1); + } else if (sz == 4) { + emit_rex(mc, 0, reg, 0, 0); + u8 op = is_store ? 0x89 : 0x8B; + mc->emit_bytes(mc, &op, 1); + } else if (sz == 2) { + if (is_store) { + u8 p = 0x66; + mc->emit_bytes(mc, &p, 1); + emit_rex(mc, 0, reg, 0, 0); + u8 op = 0x89; + mc->emit_bytes(mc, &op, 1); + } else { + emit_rex(mc, 0, reg, 0, 0); + u8 op[2] = {0x0F, signed_ext ? 0xBFu : 0xB7u}; + mc->emit_bytes(mc, op, 2); + } + } else if (sz == 1) { + if (is_store) { + /* Force REX so SIL/DIL/etc are addressable as byte regs. */ + emit_rex_force(mc, 0, reg, 0, 0); + u8 op = 0x88; + mc->emit_bytes(mc, &op, 1); + } else { + emit_rex(mc, 0, reg, 0, 0); + u8 op[2] = {0x0F, signed_ext ? 
0xBEu : 0xB6u}; + mc->emit_bytes(mc, op, 2); + } + } + u8 mr = modrm(0u, (reg & 7u), 5u); /* [RIP + disp32] */ + mc->emit_bytes(mc, &mr, 1); + u32 disp_pos = mc->pos(mc); + emit_u32le(mc, 0); + mc->emit_reloc_at(mc, mc->section_id, disp_pos, + x64_pcrel_reloc_for_sym(t, sym), sym, addend - 4, 1, 0); +} + +/* Emit a single PC-relative SSE `movs[sd] xmm, sym(%rip)` (load) or + * `movs[sd] sym(%rip), xmm` (store). Caller guarantees the symbol is + * not GOT-routed. */ +static void emit_global_pcrel_sse(CGTarget* t, u32 sz, int is_store, u32 reg, + ObjSymId sym, i64 addend) { + MCEmitter* mc = t->mc; + u8 prefix2 = (sz == 8) ? 0xF2u : 0xF3u; + mc->emit_bytes(mc, &prefix2, 1); + emit_rex(mc, 0, reg, 0, 0); + u8 op[2] = {0x0Fu, is_store ? 0x11u : 0x10u}; + mc->emit_bytes(mc, op, 2); + u8 mr = modrm(0u, (reg & 7u), 5u); /* [RIP + disp32] */ + mc->emit_bytes(mc, &mr, 1); + u32 disp_pos = mc->pos(mc); + emit_u32le(mc, 0); + mc->emit_reloc_at(mc, mc->section_id, disp_pos, + x64_pcrel_reloc_for_sym(t, sym), sym, addend - 4, 1, 0); } void x_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { u32 sz = ma.size ? ma.size : type_byte_size(addr.type); if (addr.kind == OPK_GLOBAL) { - /* Materialize &sym into R11, then load from [r11]. */ - emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend); + ObjSymId sym = addr.v.global.sym; + i64 addend = addr.v.global.addend; + if (!x64_use_got_for_sym(t, sym)) { + /* Locally-resolvable: fold lea+load into a single PC-relative mov. */ + if (dst.cls == RC_FP) { + emit_global_pcrel_sse(t, sz, 0, dst.v.reg & 0xFu, sym, addend); + } else { + int signed_ = type_is_signed(ma.type ? ma.type : addr.type); + emit_global_pcrel_gpr(t, sz, signed_, 0, dst.v.reg & 0xFu, sym, addend); + } + return; + } + /* GOT path: materialize &sym into R11, then load from [r11]. */ + emit_global_lea(t, X64_R11, sym, addend); if (dst.cls == RC_FP) { u8 prefix2 = (sz == 8) ? 
0xF2 : 0xF3; emit_sse_load(t->mc, prefix2, 0x10, dst.v.reg & 0xFu, X64_R11, 0); @@ -202,10 +297,27 @@ void x_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { u32 sz = ma.size ? ma.size : type_byte_size(addr.type); if (addr.kind == OPK_GLOBAL) { - /* Materialize &sym into R11, then store via [r11]. The IMM source - * branch below uses RAX as a scratch for the value, so R11 stays - * untouched between the LEA and the store. */ - emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend); + ObjSymId sym = addr.v.global.sym; + i64 addend = addr.v.global.addend; + if (!x64_use_got_for_sym(t, sym)) { + /* Locally-resolvable: fold lea+store into a single PC-relative mov. */ + if (src.kind == OPK_IMM) { + int w = (sz == 8) ? 1 : 0; + x64_emit_load_imm(t->mc, w, X64_RAX, src.v.imm); + emit_global_pcrel_gpr(t, sz, 0, 1, X64_RAX, sym, addend); + return; + } + if (src.cls == RC_FP) { + emit_global_pcrel_sse(t, sz, 1, src.v.reg & 0xFu, sym, addend); + return; + } + emit_global_pcrel_gpr(t, sz, 0, 1, src.v.reg & 0xFu, sym, addend); + return; + } + /* GOT path: materialize &sym into R11, then store via [r11]. The + * IMM source branch below uses RAX as a scratch for the value, so + * R11 stays untouched between the LEA and the store. */ + emit_global_lea(t, X64_R11, sym, addend); if (src.kind == OPK_IMM) { int w = (sz == 8) ? 
1 : 0; x64_emit_load_imm(t->mc, w, X64_RAX, src.v.imm); diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c @@ -481,13 +481,17 @@ typedef struct DynEntry { static u32 count_dynamic_entries(const LinkDynState* dyn) { /* Required: DT_STRTAB DT_STRSZ DT_SYMTAB DT_SYMENT DT_GNU_HASH - * DT_PLTGOT DT_PLTRELSZ DT_PLTREL DT_JMPREL - * DT_RELA DT_RELASZ DT_RELAENT * DT_FLAGS_1 (DF_1_NOW for eager binding) * DT_NULL terminator + * Optional (only when there are .rela.dyn records): + * DT_RELA DT_RELASZ DT_RELAENT + * Optional (only when there are imported functions / a PLT): + * DT_PLTGOT DT_PLTRELSZ DT_PLTREL DT_JMPREL * Plus DT_NEEDED per dependency. */ u32 n = dyn->nneeded; - n += 13; /* 12 fixed + DT_NULL */ + n += 6; /* 5 fixed + DT_NULL */ + if (dyn->cap_rela_dyn) n += 3; /* DT_RELA + DT_RELASZ + DT_RELAENT */ + if (dyn->nrela_plt) n += 4; /* PLT-only entries */ return n; } @@ -570,12 +574,26 @@ void layout_dyn(Linker* l, LinkImage* img) { if (imports.nfuncs && !dyn->rela_plt) compiler_panic(img->c, no_loc(), "link: oom on rela_plt"); - /* RELA dyn: capacity for GLOB_DAT (data imports referenced via .got) - * + RELATIVE (PIE internal abs64 fixups) + any direct-abs imports. - * Phase 5 emits all entries dynamically during reloc-apply; layout - * just reserves space. Cap chosen large enough for the test/musl - * harness; apply panics loudly if exceeded. */ - u32 cap_rel = 4096u; + /* RELA dyn: GLOB_DAT (one per imported abs-relocated symbol) + + * RELATIVE (one per PIE internal abs reloc against a defined sym). + * Phase 5 emits these dynamically during reloc-apply; pre-count the + * exact total here (img->relocs and the resolve-time `imported` flags + * are already settled by the time layout_dyn runs) so the section + * isn't padded with hundreds of trailing R_*_NONE records. 
*/ + u32 cap_rel = 0; + { + u32 ri; + for (ri = 0; ri < LinkRelocs_count(&img->relocs); ++ri) { + const LinkRelocApply* r = LinkRelocs_at(&img->relocs, ri); + const LinkSymbol* tgt = LinkSyms_at(&img->syms, r->target - 1); + if (r->kind != R_ABS32 && r->kind != R_ABS64) continue; + if (tgt->imported) { + cap_rel++; /* GLOB_DAT */ + } else if (tgt->defined && tgt->kind != SK_ABS) { + cap_rel++; /* RELATIVE */ + } + } + } dyn->cap_rela_dyn = cap_rel; dyn->rela_dyn = dyn->cap_rela_dyn diff --git a/src/link/link_elf.c b/src/link/link_elf.c @@ -759,13 +759,21 @@ void link_emit_elf(LinkImage* img, Writer* w) { DT_PUT(DT_SYMTAB, img_base + sec_dynsym->vaddr); DT_PUT(DT_SYMENT, 24); DT_PUT(DT_GNU_HASH, img_base + sec_gnuhash->vaddr); - DT_PUT(DT_PLTGOT, sec_gotplt ? (img_base + sec_gotplt->vaddr) : 0); - DT_PUT(DT_PLTRELSZ, sec_relaplt ? sec_relaplt->size : 0); - DT_PUT(DT_PLTREL, DT_RELA); - DT_PUT(DT_JMPREL, sec_relaplt ? (img_base + sec_relaplt->vaddr) : 0); - DT_PUT(DT_RELA, img_base + sec_reladyn->vaddr); - DT_PUT(DT_RELASZ, sec_reladyn->size); - DT_PUT(DT_RELAENT, 24); + /* DT_PLT* / DT_JMPREL only make sense when there's a PLT. Emitting + * them with size=0 / vaddr=0 (or pointing past the end of any + * PT_LOAD) trips llvm-readelf's "address not in any segment" check + * and confuses some loaders' DT walk. */ + if (dyn->nrela_plt) { + DT_PUT(DT_PLTGOT, sec_gotplt ? (img_base + sec_gotplt->vaddr) : 0); + DT_PUT(DT_PLTRELSZ, sec_relaplt ? sec_relaplt->size : 0); + DT_PUT(DT_PLTREL, DT_RELA); + DT_PUT(DT_JMPREL, sec_relaplt ? (img_base + sec_relaplt->vaddr) : 0); + } + if (dyn->cap_rela_dyn) { + DT_PUT(DT_RELA, img_base + sec_reladyn->vaddr); + DT_PUT(DT_RELASZ, sec_reladyn->size); + DT_PUT(DT_RELAENT, 24); + } DT_PUT(DT_FLAGS_1, DF_1_NOW); DT_PUT(DT_NULL, 0); #undef DT_PUT