reloc.c (16658B)
1 /* AArch64 relocation descriptors (width + classification). 2 * 3 * One row per relocation kind this backend applies. Reached through 4 * LinkArchDesc.reloc_desc (wired in link.c) and the arch-aware reloc_desc() 5 * dispatcher. The wire encoding + diagnostic name live in 6 * src/obj/<fmt>/reloc_aarch64.c; the instruction byte encoders live in the 7 * shared byte-patcher (src/obj/reloc_apply.c) until WS-C moves them here. 8 * 9 * Kinds with no row (the dynamic-only GLOB_DAT/JUMP_SLOT/RELATIVE/COPY, the 10 * MCEmitter-only INTRA_LABEL_ADDR, and the unused TLSLE LDST variants) are 11 * never applied through the static reloc record path and intentionally carry 12 * no descriptor. R_ABS16 and R_PREL16 are now neutral and live in the 13 * neutral_rows table; R_TPOFF64 is neutral and also lives there. */ 14 15 #include "obj/reloc.h" 16 17 #include "core/bytes.h" 18 #include "link/link_arch.h" 19 20 static const RelocDescRow aa64_rows[] = { 21 {R_AARCH64_JUMP26, {4, RELOC_IS_BRANCH}}, 22 {R_AARCH64_CALL26, {4, RELOC_IS_BRANCH}}, 23 {R_AARCH64_CONDBR19, {4, 0}}, 24 {R_AARCH64_TSTBR14, {4, 0}}, 25 {R_AARCH64_LD_PREL_LO19, {4, 0}}, 26 {R_AARCH64_ADR_PREL_LO21, {4, 0}}, 27 {R_AARCH64_ADR_PREL_PG_HI21, {4, RELOC_DIRECT_PAGE}}, 28 {R_AARCH64_ADR_PREL_PG_HI21_NC, {4, RELOC_DIRECT_PAGE}}, 29 {R_AARCH64_ADD_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 30 {R_AARCH64_LDST8_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 31 {R_AARCH64_LDST16_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 32 {R_AARCH64_LDST32_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 33 {R_AARCH64_LDST64_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 34 {R_AARCH64_LDST128_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}}, 35 {R_AARCH64_ADR_GOT_PAGE, {4, RELOC_USES_GOT}}, 36 {R_AARCH64_LD64_GOT_LO12_NC, {4, RELOC_USES_GOT}}, 37 {R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, {4, RELOC_IS_TLS_GOT}}, 38 {R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, {4, RELOC_IS_TLS_GOT}}, 39 {R_AARCH64_TLSLE_ADD_TPREL_HI12, {4, RELOC_IS_TLS_LE}}, 40 {R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, {4, RELOC_IS_TLS_LE}}, 41 {R_AARCH64_TLVP_LOAD_PAGE21, {4, RELOC_IS_TLVP}}, 42 {R_AARCH64_TLVP_LOAD_PAGEOFF12, {4, RELOC_IS_TLVP}}, 43 /* COFF AArch64 TLS SECREL imm12 pair: ADD-imm12 instruction relocs, 44 * AArch64-only, applied only into PE/COFF output. TLS-only, so the JIT 45 * classifies them as Local-Exec accesses via RELOC_IS_TLS_LE. */ 46 {R_COFF_AARCH64_SECREL_LOW12A, {4, RELOC_IS_TLS_LE}}, 47 {R_COFF_AARCH64_SECREL_HIGH12A, {4, RELOC_IS_TLS_LE}}, 48 }; 49 50 const RelocDesc* aa64_reloc_desc(RelocKind k) { 51 return reloc_desc_row_find(aa64_rows, 52 (u32)(sizeof aa64_rows / sizeof aa64_rows[0]), k); 53 } 54 55 /* AArch64 instruction-immediate byte encoders (WS-C). Moved verbatim from the 56 * format-neutral byte-patcher; reached via LinkArchDesc.reloc_apply_insn for 57 * the instruction-embedded kinds. Encoding references: ARM ARMv8-A "ELF for 58 * the ARM 64-bit Architecture (AArch64)" §5.7. Returns 1 if it owns `k`. */ 59 int aa64_reloc_apply_insn(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A, 60 u64 P) { 61 switch (k) { 62 case R_AARCH64_CONDBR19: 63 case R_AARCH64_LD_PREL_LO19: { 64 /* B.cond / CB(N)Z / LDR (literal) — imm19 in 4-byte units, 65 * signed, at bits [23:5]. Range: ±1MiB. */ 66 i64 disp = (i64)S + A - (i64)P; 67 u32 instr; 68 u32 imm19; 69 if (disp & 3) 70 compiler_panic(c, SRCLOC_NONE, 71 "link: imm19 reloc misaligned displacement"); 72 if (disp < -(i64)(1 << 20) || disp >= (i64)(1 << 20)) 73 compiler_panic(c, SRCLOC_NONE, 74 "link: imm19 reloc out of range (need ±1MiB)"); 75 imm19 = (u32)((disp >> 2) & 0x7ffffu); 76 instr = rd_u32_le(P_bytes); 77 instr = (instr & ~(0x7ffffu << 5)) | (imm19 << 5); 78 wr_u32_le(P_bytes, instr); 79 return 1; 80 } 81 case R_AARCH64_TSTBR14: { 82 /* TBZ/TBNZ — imm14 in 4-byte units, signed, at bits [18:5]. 83 * Range: ±32KiB. */ 84 i64 disp = (i64)S + A - (i64)P; 85 u32 instr; 86 u32 imm14; 87 if (disp & 3) 88 compiler_panic(c, SRCLOC_NONE, "link: TSTBR14 misaligned displacement"); 89 if (disp < -(i64)(1 << 15) || disp >= (i64)(1 << 15)) 90 compiler_panic(c, SRCLOC_NONE, 91 "link: TSTBR14 out of range (need ±32KiB)"); 92 imm14 = (u32)((disp >> 2) & 0x3fffu); 93 instr = rd_u32_le(P_bytes); 94 instr = (instr & ~(0x3fffu << 5)) | (imm14 << 5); 95 wr_u32_le(P_bytes, instr); 96 return 1; 97 } 98 case R_AARCH64_ADR_PREL_LO21: { 99 /* ADR — byte-granularity imm21, encoded as immlo[30:29] + 100 * immhi[23:5]. No 12-bit shift (unlike ADRP). Range: ±1MiB. */ 101 i64 disp = (i64)S + A - (i64)P; 102 u32 instr; 103 u32 immlo, immhi; 104 if (disp < -(i64)(1 << 20) || disp >= (i64)(1 << 20)) 105 compiler_panic(c, SRCLOC_NONE, 106 "link: ADR_PREL_LO21 out of range (need ±1MiB)"); 107 immlo = (u32)(disp & 0x3u); 108 immhi = (u32)((disp >> 2) & 0x7ffffu); 109 instr = rd_u32_le(P_bytes); 110 instr = (instr & 0x9f00001fu) | (immlo << 29) | (immhi << 5); 111 wr_u32_le(P_bytes, instr); 112 return 1; 113 } 114 case R_AARCH64_JUMP26: 115 case R_AARCH64_CALL26: { 116 /* B/BL imm26 — branch displacement in 4-byte units, signed. 117 * Clear bits [25:0] of the existing instruction and OR in the 118 * new imm26. Range check: ±128MiB. */ 119 i64 disp = (i64)S + A - (i64)P; 120 u32 instr; 121 u32 imm26; 122 if (disp & 3) 123 compiler_panic(c, SRCLOC_NONE, "link: CALL26 misaligned displacement"); 124 if (disp < -(i64)(1 << 27) || disp >= (i64)(1 << 27)) 125 compiler_panic(c, SRCLOC_NONE, 126 "link: CALL26 out of range (need ±128MiB)"); 127 imm26 = (u32)((disp >> 2) & 0x3ffffffu); 128 instr = rd_u32_le(P_bytes); 129 instr = (instr & 0xfc000000u) | imm26; 130 wr_u32_le(P_bytes, instr); 131 return 1; 132 } 133 case R_AARCH64_TLVP_LOAD_PAGE21: 134 case R_AARCH64_ADR_GOT_PAGE: 135 case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: 136 case R_AARCH64_ADR_PREL_PG_HI21: 137 case R_AARCH64_ADR_PREL_PG_HI21_NC: { 138 /* ADRP — page-relative imm21, encoded as immlo[30:29] + 139 * immhi[23:5]. Effective immediate is (S+A) page minus P page, 140 * shifted right by 12, sign-extended to 33 bits. _NC variant 141 * skips the range check (compiler asserts it can't overflow, 142 * e.g. when paired with explicit page bracketing). */ 143 i64 page_s = ((i64)S + A) & ~(i64)0xfff; 144 i64 page_p = (i64)P & ~(i64)0xfff; 145 i64 disp = page_s - page_p; 146 i64 imm21 = disp >> 12; 147 u32 instr; 148 u32 immlo, immhi; 149 if (k != R_AARCH64_ADR_PREL_PG_HI21_NC && 150 (imm21 < -(i64)(1 << 20) || imm21 >= (i64)(1 << 20))) 151 compiler_panic(c, SRCLOC_NONE, 152 "link: ADR_PREL_PG_HI21 out of range (need ±4GiB)"); 153 immlo = (u32)(imm21 & 0x3u); 154 immhi = (u32)((imm21 >> 2) & 0x7ffffu); 155 instr = rd_u32_le(P_bytes); 156 instr = (instr & 0x9f00001fu) | (immlo << 29) | (immhi << 5); 157 wr_u32_le(P_bytes, instr); 158 return 1; 159 } 160 case R_AARCH64_ADD_ABS_LO12_NC: { 161 /* ADD (immediate) imm12 at bits [21:10]. NC = no overflow check. */ 162 u64 v = ((u64)S + (u64)A) & 0xfffu; 163 u32 instr = rd_u32_le(P_bytes); 164 instr = (instr & ~(0xfffu << 10)) | ((u32)v << 10); 165 wr_u32_le(P_bytes, instr); 166 return 1; 167 } 168 case R_AARCH64_TLSLE_ADD_TPREL_HI12: 169 case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: { 170 /* AArch64 TLS local-exec. Caller passes S already as the 171 * TP-relative offset (target's image offset minus the TLS 172 * image base, plus the 16-byte AArch64 TCB). HI12 takes 173 * bits 23:12, LO12_NC takes bits 11:0; both encoded as 174 * imm12 at instruction bits [21:10] of an ADD (immediate). 175 * The HI12 form's instruction carries LSL #12 in its opcode, 176 * so bits 11:0 of the operand naturally land at scale 4096. */ 177 u64 v = (u64)((i64)S + A); 178 u32 imm12 = (k == R_AARCH64_TLSLE_ADD_TPREL_HI12) 179 ? (u32)((v >> 12) & 0xfffu) 180 : (u32)(v & 0xfffu); 181 u32 instr = rd_u32_le(P_bytes); 182 instr = (instr & ~(0xfffu << 10)) | (imm12 << 10); 183 wr_u32_le(P_bytes, instr); 184 return 1; 185 } 186 case R_AARCH64_LDST8_ABS_LO12_NC: 187 case R_AARCH64_LDST16_ABS_LO12_NC: 188 case R_AARCH64_LDST32_ABS_LO12_NC: 189 case R_AARCH64_LDST64_ABS_LO12_NC: 190 case R_AARCH64_LDST128_ABS_LO12_NC: 191 case R_AARCH64_LD64_GOT_LO12_NC: 192 case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: 193 case R_AARCH64_TLVP_LOAD_PAGEOFF12: { 194 /* LDR/STR with imm12 at bits [21:10]; the imm is scaled by the 195 * access size, so we right-shift the low 12 bits of (S+A) by 196 * the size scale before encoding. NC = no overflow check. 197 * 198 * LD64_GOT_LO12_NC has the same encoding as LDST64_ABS_LO12_NC; 199 * the linker has already redirected `S` to the GOT slot. */ 200 u32 shift = (k == R_AARCH64_LDST8_ABS_LO12_NC) ? 0u 201 : (k == R_AARCH64_LDST16_ABS_LO12_NC) ? 1u 202 : (k == R_AARCH64_LDST32_ABS_LO12_NC) ? 2u 203 : (k == R_AARCH64_LDST64_ABS_LO12_NC || 204 k == R_AARCH64_LD64_GOT_LO12_NC || 205 k == R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC || 206 k == R_AARCH64_TLVP_LOAD_PAGEOFF12) 207 ? 3u 208 : 4u; 209 u64 lo12 = ((u64)S + (u64)A) & 0xfffu; 210 u64 imm12 = lo12 >> shift; 211 u32 instr = rd_u32_le(P_bytes); 212 if (lo12 & ((1u << shift) - 1u)) 213 compiler_panic(c, SRCLOC_NONE, 214 "link: LDST%u_ABS_LO12_NC misaligned address " 215 "(kind=%u S=0x%llx A=%lld P=0x%llx)", 216 1u << (3 + shift), (unsigned)k, (unsigned long long)S, 217 (long long)A, (unsigned long long)P); 218 instr = (instr & ~(0xfffu << 10)) | ((u32)(imm12 & 0xfffu) << 10); 219 wr_u32_le(P_bytes, instr); 220 return 1; 221 } 222 default: 223 return 0; 224 } 225 } 226 227 /* In-process JIT TLS Local-Exec relaxation (LinkArchDesc.jit_tls_le_relax). 228 * 229 * ELF, per access: 230 * mrs rd, tpidr_el0 (no reloc) 231 * add rd, rd, #hi12 R_AARCH64_TLSLE_ADD_TPREL_HI12 <- `site` 232 * add rd, rd, #lo12 R_AARCH64_TLSLE_ADD_TPREL_LO12_NC 233 * 234 * Windows/COFF, per access — the 7-instruction TEB idiom (see 235 * aa_tls_addr_of_win): 236 * site-20 ldr rd, [x18, #0x58] TEB.ThreadLocalStoragePointer 237 * site-16 adrp x16, _tls_index } 238 * site-12 add x16, x16, :lo12:_tls_index } &_tls_index (relocs dropped) 239 * site-8 ldr w16, [x16] module TLS index 240 * site-4 ldr rd, [rd, x16, lsl #3] this module's TLS block base 241 * site add rd, rd, :secrel_hi12:sym R_COFF_AARCH64_SECREL_HIGH12A <- 242 * site+4 add rd, rd, :secrel_lo12:sym R_COFF_AARCH64_SECREL_LOW12A 243 * 244 * Single-threaded JIT: in both cases address the in-image storage directly, 245 * dropping the thread-pointer read (and, on Windows, the `_tls_index` / TEB 246 * indirection): adrp rd, &var ; add rd, rd, :lo12:&var ; nop(s). */ 247 void aa64_jit_tls_le_relax(Compiler* c, RelocKind k, u8* site, u64 storage, 248 u64 site_pc) { 249 u8* mrs; 250 u8* add_lo; 251 u32 rd; 252 /* Windows COFF idiom: the terminal HIGH12A drives the whole rewrite; the 253 * LOW12A half is then a no-op (mirrors the ELF HI12/LO12 split). */ 254 if (k == R_COFF_AARCH64_SECREL_LOW12A) return; /* handled with HIGH12A */ 255 if (k == R_COFF_AARCH64_SECREL_HIGH12A) { 256 u8* p; 257 rd = rd_u32_le(site) & 0x1fu; 258 /* nop the TEB read, the _tls_index materialize + load, and the block load 259 * (site-20 .. site-8); reuse the block-load slot at site-4 for the ADRP. */ 260 for (p = site - 20; p <= site - 8; p += 4) wr_u32_le(p, 0xd503201fu); 261 wr_u32_le(site - 4, 0x90000000u | rd); /* adrp rd, #0 */ 262 aa64_reloc_apply_insn(c, R_AARCH64_ADR_PREL_PG_HI21, site - 4, storage, 0, 263 site_pc - 4u); 264 wr_u32_le(site, 0x91000000u | (rd << 5) | rd); /* add rd, rd, #0 */ 265 aa64_reloc_apply_insn(c, R_AARCH64_ADD_ABS_LO12_NC, site, storage, 0, 266 site_pc); 267 wr_u32_le(site + 4, 0xd503201fu); /* nop the secrel_lo12 add */ 268 return; 269 } 270 if (k == R_AARCH64_TLSLE_ADD_TPREL_LO12_NC) return; /* handled with HI12 */ 271 if (k != R_AARCH64_TLSLE_ADD_TPREL_HI12) 272 compiler_panic(c, SRCLOC_NONE, "aa64 jit tls: unexpected reloc kind %u", 273 (unsigned)k); 274 mrs = site - 4; /* mrs rd, tpidr_el0 */ 275 add_lo = site + 4; /* add rd, rd, #lo12 -> nop */ 276 rd = rd_u32_le(site) & 0x1fu; 277 if (rd_u32_le(mrs) != (0xd53bd040u | rd)) 278 compiler_panic(c, SRCLOC_NONE, "aa64 jit tls: unexpected access sequence"); 279 wr_u32_le(mrs, 0x90000000u | rd); /* adrp rd, #0 */ 280 aa64_reloc_apply_insn(c, R_AARCH64_ADR_PREL_PG_HI21, mrs, storage, 0, 281 site_pc - 4u); 282 wr_u32_le(site, 0x91000000u | (rd << 5) | rd); /* add rd, rd, #0 */ 283 aa64_reloc_apply_insn(c, R_AARCH64_ADD_ABS_LO12_NC, site, storage, 0, site_pc); 284 wr_u32_le(add_lo, 0xd503201fu); /* nop */ 285 } 286 287 /* In-process JIT relaxation of AArch64 indirection idioms (LinkArchDesc 288 * .jit_reloc_relax): the single-threaded JIT has no dynamic loader, GOT, or 289 * TLV resolver, so each access idiom is rewritten to address the in-image 290 * instance directly. Returns 1 if it owned `k`, 0 to fall through to the 291 * ordinary reloc apply. */ 292 int aa64_jit_reloc_relax(Compiler* c, RelocKind k, const JitRelaxCtx* ctx) { 293 u8* P_bytes = ctx->site; 294 295 /* Weak undefined target: address-of must evaluate to NULL. An ADRP + ADD 296 * pair would form a PC-relative address to vaddr 0 that exceeds ±4 GiB once 297 * the JIT places segments far from 0 (tripping link_reloc's range check). 298 * Rewrite the ADRP to MOVZ rd,#0 so rd becomes 0 directly; the paired ADD's 299 * assembled imm12 of 0 already gives rd += 0, so leave it as add rd,rd,#0. */ 300 if (ctx->weak_undef_zero) { 301 if (k == R_AARCH64_ADR_PREL_PG_HI21 || 302 k == R_AARCH64_ADR_PREL_PG_HI21_NC) { 303 u32 rd = rd_u32_le(P_bytes) & 0x1fu; 304 wr_u32_le(P_bytes, 0xd2800000u | rd); /* movz rd, #0 */ 305 return 1; 306 } 307 if (k == R_AARCH64_ADD_ABS_LO12_NC) return 1; /* leave add rd,rd,#0 */ 308 } 309 310 /* Mach-O TLV access -> ordinary in-image load. Codegen emits the 4-insn 311 * Apple TLV sequence: 312 * adrp x0, desc@TLVPPAGE (PAGE21) 313 * ldr x0, [x0, desc@TLVPPAGEOFF] (PAGEOFF12) <- this reloc 314 * ldr xN, [x0] -- load the resolver thunk from desc[0] 315 * blr xN -- call thunk(desc) -> &var in x0 316 * With one thread the in-image .tdata/.tbss IS the single instance, and 317 * desc[+16] already holds the variable's in-image storage address (filled by 318 * the normal R_ABS64 against the storage symbol). Collapse to a direct load, 319 * dropping the thunk and the per-thread block: 320 * PAGEOFF12 : ldr x0,[x0,#imm] -> add x0,x0,#(desc & 0xfff) (x0 = &desc) 321 * +4 : ldr xN,[x0] -> ldr x0,[x0,#16] (x0 = &var) 322 * +8 : blr xN -> nop 323 * The thunk register N is scratch (the Apple TLV ABI fixes only x0: 324 * descriptor in, &var out); kit's codegen uses x16, clang picks any free 325 * register (e.g. x8). Accept any N so long as the pair is `ldr xN,[x0]` 326 * (Rn=x0, imm12=0) followed by `blr xN`. */ 327 if (k == R_AARCH64_TLVP_LOAD_PAGEOFF12) { 328 u64 v = ((u64)ctx->S + (u64)ctx->addend) & 0xfffu; 329 u32 instr = rd_u32_le(P_bytes); 330 u8* i_thunk = P_bytes + 4u; 331 u8* i_call = P_bytes + 8u; 332 u32 thunk = rd_u32_le(i_thunk); 333 u32 call = rd_u32_le(i_call); 334 u32 n = thunk & 0x1fu; 335 wr_u32_le(P_bytes, 0x91000000u | (instr & 0x3ffu) | ((u32)v << 10)); 336 if ((thunk & ~0x1fu) != 0xf9400000u || call != (0xd63f0000u | (n << 5u))) 337 compiler_panic(c, SRCLOC_NONE, 338 "kit_jit: unexpected Mach-O TLV access sequence"); 339 wr_u32_le(i_thunk, 0xf9400800u); /* ldr x0, [x0, #16] -> &var */ 340 wr_u32_le(i_call, 0xd503201fu); /* nop */ 341 return 1; 342 } 343 344 /* No real GOT in the append image: the GOT load becomes a direct add, so the 345 * register holds the symbol address itself instead of loading it from a slot. 346 * LD64_GOT_LO12_NC shares the LDR uimm12 encoding; rewrite to ADD imm12. */ 347 if (ctx->got_relaxed && k == R_AARCH64_LD64_GOT_LO12_NC) { 348 u64 v = ((u64)ctx->S + (u64)ctx->addend) & 0xfffu; 349 u32 instr = rd_u32_le(P_bytes); 350 u32 rd = instr & 0x1fu; 351 u32 rn = (instr >> 5) & 0x1fu; 352 wr_u32_le(P_bytes, 0x91000000u | rd | (rn << 5) | ((u32)v << 10)); 353 return 1; 354 } 355 356 return 0; 357 }