link.c (9454B)
1 /* AArch64 link-time descriptor. 2 * 3 * Implements the LinkArchDesc contract from link_arch.h for the 4 * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve 5 * trampolines emitted in canonical form even under DF_1_NOW), and the 6 * 12-byte IPLT stub used by ifunc resolvers. All instruction bytes 7 * come from the encoders in arch/aa64/isa.h — no raw hex literals 8 * here. 9 * 10 * The byte layout matches the previous inline encodings in 11 * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the 12 * linker to descriptor dispatch is a no-op on the output image. */ 13 14 #include "arch/aa64/isa.h" 15 #include "core/bytes.h" 16 #include "core/core.h" 17 #include "link/link_arch.h" 18 #include "obj/obj.h" 19 20 /* Fixed register assignments mandated by the AArch64 PLT ABI. */ 21 #define AA64_PLT_SCRATCH_X16 16u /* PLT/IPLT scratch (slot address) */ 22 #define AA64_PLT_SCRATCH_X17 17u /* PLT scratch (loaded function ptr) */ 23 24 /* PLT geometry. Documented in link_arch.h; redeclared here as the 25 * descriptor table needs them at file scope. */ 26 #define AA64_PLT0_SIZE 32u 27 #define AA64_PLT_ENTRY_SIZE 16u 28 #define AA64_IPLT_STUB_SIZE 12u 29 30 /* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import 31 * entries start at .got.plt[3]. */ 32 #define AA64_GOTPLT_RESOLVER_INDEX 2u 33 34 /* Page mask for ADRP: ADRP encodes (page(target) - page(PC)) >> 12, 35 * where page(x) clears the low 12 bits. */ 36 #define AA64_PAGE_MASK ((u64)0xfffu) 37 38 /* Compute the (immlo, immhi) ADRP immediate halves for the page- 39 * relative displacement from `pc` to `target`. Both addresses are 40 * post-shift final image vaddrs; ADRP discards the low 12 bits of 41 * each before subtracting, so the result is invariant under any 42 * segment-base shift that moves both endpoints by the same delta. */ 43 static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo, 44 u32* immhi) { 45 i64 page_disp = (i64)(target & ~AA64_PAGE_MASK) - (i64)(pc & ~AA64_PAGE_MASK); 46 i64 imm21 = page_disp >> 12; 47 *immlo = (u32)(imm21 & 0x3); 48 *immhi = (u32)((imm21 >> 2) & 0x7ffff); 49 } 50 51 /* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr` 52 * (a .got.plt entry) into x16, loads the resolved function pointer 53 * into x17, and tail-calls it. Used by both PLT0 (after its STP) and 54 * each per-import entry — the only thing that varies is `pc`, which 55 * starts at the ADRP itself. */ 56 static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) { 57 u32 immlo, immhi; 58 aa64_adrp_imm_halves(pc, slot_vaddr, &immlo, &immhi); 59 u32 lo12 = (u32)(slot_vaddr & AA64_PAGE_MASK); 60 /* LDR Xt encodes the byte offset divided by 8. .got.plt slots are 61 * 8-byte aligned so the low 3 bits of lo12 are always 0. */ 62 u32 ldr_imm12 = (lo12 >> 3) & 0xfffu; 63 64 wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); 65 wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X17, 66 AA64_PLT_SCRATCH_X16, ldr_imm12)); 67 wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, AA64_PLT_SCRATCH_X16, 68 AA64_PLT_SCRATCH_X16, lo12, /*sh=*/0)); 69 wr_u32_le(dst + 12, aa64_br(AA64_PLT_SCRATCH_X17)); 70 } 71 72 static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { 73 /* PLT0: 74 * stp x16, x30, [sp, #-16]! 75 * adrp x16, page(.got.plt[2]) 76 * ldr x17, [x16, #lo12(.got.plt[2])] 77 * add x16, x16, #lo12(.got.plt[2]) 78 * br x17 79 * nop ; nop ; nop 80 * 81 * Under DF_1_NOW the loader patches every .got.plt slot from 82 * .rela.plt before running PLT0, so this trampoline never executes. 83 * It is still emitted in canonical form so disassemblers and 84 * unwinders see the layout the psABI specifies. */ 85 u64 slot2 = gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX; 86 /* The ADRP sits at plt0+4 (one instruction past the leading STP). */ 87 u64 adrp_pc = plt0_vaddr + 4u; 88 89 /* `stp x16, x30, [sp, #-16]!` — pre-indexed pair store with imm7 90 * scaled by 8, so the encoded field is -16/8 = -2. */ 91 wr_u32_le(dst + 0, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP, 92 /*imm7_scaled=*/-2)); 93 aa64_emit_adrp_load_br(dst + 4, adrp_pc, slot2); 94 wr_u32_le(dst + 20, aa64_nop()); 95 wr_u32_le(dst + 24, aa64_nop()); 96 wr_u32_le(dst + 28, aa64_nop()); 97 } 98 99 static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { 100 /* Per-import 16-byte entry: ADRP+LDR+ADD+BR where ADRP's PC is the 101 * entry's first instruction (no leading STP here — the resolved 102 * function returns to the original caller, not into PLT0). */ 103 aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr); 104 } 105 106 static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, 107 LinkArchIPltReloc out[2]) { 108 /* IPLT stub: ADRP x16, %page(slot) ; LDR x16, [x16, :lo12:slot] ; 109 * BR x16. 110 * 111 * We deliberately emit the two address-bearing instructions with 112 * zero immediates: the linker enqueues an ADR_PREL_PG_HI21 reloc on 113 * the ADRP and an LDST64_ABS_LO12_NC reloc on the LDR, both 114 * targeting the slot's synthetic local symbol. Reloc-apply runs 115 * after final vaddr assignment, which is the only point at which 116 * both endpoints' page-relative displacement is known. */ 117 (void)stub_vaddr; 118 (void)slot_vaddr; 119 120 wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, /*immlo=*/0, 121 /*immhi=*/0)); 122 wr_u32_le(dst + 4, 123 aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, AA64_PLT_SCRATCH_X16, 124 /*imm12_scaled=*/0)); 125 wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16)); 126 127 out[0].offset_in_stub = 0; 128 out[0].width = 4; 129 out[0].kind = R_AARCH64_ADR_PREL_PG_HI21; 130 out[1].offset_in_stub = 4; 131 out[1].width = 4; 132 out[1].kind = R_AARCH64_LDST64_ABS_LO12_NC; 133 return 2; 134 } 135 136 /* PE/COFF IAT stub for aarch64 (12 B): 137 * 138 * adrp x16, iat_slot@PAGE ; x16 = page-aligned base 139 * ldr x16, [x16, #iat_off] ; x16 = *iat_slot (function ptr) 140 * br x16 ; tail-call 141 * 142 * Uses x16 (intra-procedure-call scratch) so the called function 143 * sees an unperturbed x30 / argument registers. Page+offset are 144 * baked from the post-shift IAT slot vaddr; no apply-time reloc 145 * needed because both ends move together under image-base shift. */ 146 void aa64_emit_coff_iat_stub(u8* dst, u64 stub_vaddr, u64 iat_slot_vaddr) { 147 u32 immlo, immhi; 148 aa64_adrp_imm_halves(stub_vaddr, iat_slot_vaddr, &immlo, &immhi); 149 u32 lo12 = (u32)(iat_slot_vaddr & AA64_PAGE_MASK); 150 /* IAT slots are 8-byte aligned (function pointers), so the low 3 151 * bits of lo12 are always 0; LDR Xt scales the imm12 by 8. */ 152 u32 ldr_imm12 = (lo12 >> 3) & 0xfffu; 153 154 wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); 155 wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, 156 AA64_PLT_SCRATCH_X16, ldr_imm12)); 157 wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16)); 158 } 159 160 void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) { 161 i64 page_s = ((i64)got_slot_vaddr) & ~(i64)0xfff; 162 i64 page_p = ((i64)stub_vaddr) & ~(i64)0xfff; 163 i64 imm21 = (page_s - page_p) >> 12; 164 u32 immlo = (u32)(imm21 & 0x3u); 165 u32 immhi = (u32)((imm21 >> 2) & 0x7ffffu); 166 u32 lo12 = (u32)(got_slot_vaddr & 0xfffu); 167 u32 imm12_ldr = (lo12 >> 3) & 0xfffu; 168 169 wr_u32_le(out + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); 170 wr_u32_le(out + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, 171 AA64_PLT_SCRATCH_X16, imm12_ldr)); 172 wr_u32_le(out + 8, aa64_br(AA64_PLT_SCRATCH_X16)); 173 } 174 175 /* Width + classification rows + instruction-immediate byte encoders for 176 * AArch64's relocation kinds; defined in src/arch/aa64/reloc.c and consulted 177 * through the .reloc_desc / .reloc_apply_insn hooks. */ 178 const RelocDesc* aa64_reloc_desc(RelocKind); 179 int aa64_reloc_apply_insn(Compiler*, RelocKind, u8*, u64, i64, u64); 180 void aa64_jit_tls_le_relax(Compiler*, RelocKind, u8*, u64, u64); 181 int aa64_jit_reloc_relax(Compiler*, RelocKind, const JitRelaxCtx*); 182 183 /* AArch64 __chkstk for PE/COFF: probes `x15 * 16` bytes of stack one page at a 184 * time, then returns. Mirrors the LLVM compiler-rt implementation (chkstk.S in 185 * builtins/aarch64). 28 bytes. x64 needs no equivalent — it emits inline stack 186 * probes. link_synth_coff_ctor_dtor_list emits these bytes into a retained 187 * .text$chkstk section for COFF targets that carry them. */ 188 static const u8 aa64_coff_chkstk[28] = { 189 0xf0, 0xed, 0x7c, 0xd3, /* lsl x16, x15, #4 */ 190 0xf1, 0x03, 0x00, 0x91, /* mov x17, sp */ 191 0x31, 0x06, 0x40, 0xd1, /* sub x17, x17, #0x1, lsl #12 */ 192 0x10, 0x06, 0x40, 0xf1, /* subs x16, x16, #0x1, lsl #12 */ 193 0x3f, 0x02, 0x40, 0xf9, /* ldr xzr, [x17] */ 194 0xac, 0xff, 0xff, 0x54, /* b.gt #-0x14 */ 195 0xc0, 0x03, 0x5f, 0xd6, /* ret */ 196 }; 197 198 const LinkArchDesc link_arch_aa64 = { 199 .plt0_size = AA64_PLT0_SIZE, 200 .plt_entry_size = AA64_PLT_ENTRY_SIZE, 201 .iplt_stub_size = AA64_IPLT_STUB_SIZE, 202 203 .emit_plt0 = aa64_emit_plt0, 204 .emit_plt_entry = aa64_emit_plt_entry, 205 .emit_iplt_stub = aa64_emit_iplt_stub, 206 207 .reloc_desc = aa64_reloc_desc, 208 .reloc_apply_insn = aa64_reloc_apply_insn, 209 .jit_tls_le_relax = aa64_jit_tls_le_relax, 210 .jit_reloc_relax = aa64_jit_reloc_relax, 211 212 .coff_chkstk_bytes = aa64_coff_chkstk, 213 .coff_chkstk_len = sizeof aa64_coff_chkstk, 214 };