kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

reloc.c (16658B)


      1 /* AArch64 relocation descriptors (width + classification).
      2  *
      3  * One row per relocation kind this backend applies.  Reached through
      4  * LinkArchDesc.reloc_desc (wired in link.c) and the arch-aware reloc_desc()
      5  * dispatcher.  The wire encoding + diagnostic name live in
      6  * src/obj/<fmt>/reloc_aarch64.c; the instruction byte encoders, moved here
      7  * from the shared byte-patcher by WS-C, are aa64_reloc_apply_insn below.
      8  *
      9  * Kinds with no row (the dynamic-only GLOB_DAT/JUMP_SLOT/RELATIVE/COPY, the
     10  * MCEmitter-only INTRA_LABEL_ADDR, and the unused TLSLE LDST variants) are
     11  * never applied through the static reloc record path and intentionally carry
     12  * no descriptor.  R_ABS16 and R_PREL16 are now neutral and live in the
     13  * neutral_rows table; R_TPOFF64 is neutral and also lives there. */
     14 
     15 #include "obj/reloc.h"
     16 
     17 #include "core/bytes.h"
     18 #include "link/link_arch.h"
     19 
     20 static const RelocDescRow aa64_rows[] = {
     21     {R_AARCH64_JUMP26, {4, RELOC_IS_BRANCH}},
     22     {R_AARCH64_CALL26, {4, RELOC_IS_BRANCH}},
     23     {R_AARCH64_CONDBR19, {4, 0}},
     24     {R_AARCH64_TSTBR14, {4, 0}},
     25     {R_AARCH64_LD_PREL_LO19, {4, 0}},
     26     {R_AARCH64_ADR_PREL_LO21, {4, 0}},
     27     {R_AARCH64_ADR_PREL_PG_HI21, {4, RELOC_DIRECT_PAGE}},
     28     {R_AARCH64_ADR_PREL_PG_HI21_NC, {4, RELOC_DIRECT_PAGE}},
     29     {R_AARCH64_ADD_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     30     {R_AARCH64_LDST8_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     31     {R_AARCH64_LDST16_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     32     {R_AARCH64_LDST32_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     33     {R_AARCH64_LDST64_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     34     {R_AARCH64_LDST128_ABS_LO12_NC, {4, RELOC_DIRECT_PAGE}},
     35     {R_AARCH64_ADR_GOT_PAGE, {4, RELOC_USES_GOT}},
     36     {R_AARCH64_LD64_GOT_LO12_NC, {4, RELOC_USES_GOT}},
     37     {R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, {4, RELOC_IS_TLS_GOT}},
     38     {R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, {4, RELOC_IS_TLS_GOT}},
     39     {R_AARCH64_TLSLE_ADD_TPREL_HI12, {4, RELOC_IS_TLS_LE}},
     40     {R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, {4, RELOC_IS_TLS_LE}},
     41     {R_AARCH64_TLVP_LOAD_PAGE21, {4, RELOC_IS_TLVP}},
     42     {R_AARCH64_TLVP_LOAD_PAGEOFF12, {4, RELOC_IS_TLVP}},
     43     /* COFF AArch64 TLS SECREL imm12 pair: ADD-imm12 instruction relocs,
     44      * AArch64-only, applied only into PE/COFF output. TLS-only, so the JIT
     45      * classifies them as Local-Exec accesses via RELOC_IS_TLS_LE. */
     46     {R_COFF_AARCH64_SECREL_LOW12A, {4, RELOC_IS_TLS_LE}},
     47     {R_COFF_AARCH64_SECREL_HIGH12A, {4, RELOC_IS_TLS_LE}},
     48 };
     49 
     50 const RelocDesc* aa64_reloc_desc(RelocKind k) {
     51   return reloc_desc_row_find(aa64_rows,
     52                              (u32)(sizeof aa64_rows / sizeof aa64_rows[0]), k);
     53 }
     54 
     55 /* AArch64 instruction-immediate byte encoders (WS-C).  Moved verbatim from the
     56  * format-neutral byte-patcher; reached via LinkArchDesc.reloc_apply_insn for
     57  * the instruction-embedded kinds.  Encoding references: ARM ARMv8-A "ELF for
     58  * the ARM 64-bit Architecture (AArch64)" §5.7.  Returns 1 if it owns `k`. */
     59 int aa64_reloc_apply_insn(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A,
     60                           u64 P) {
     61   switch (k) {
     62     case R_AARCH64_CONDBR19:
     63     case R_AARCH64_LD_PREL_LO19: {
     64       /* B.cond / CB(N)Z / LDR (literal) — imm19 in 4-byte units,
     65        * signed, at bits [23:5]. Range: ±1MiB. */
     66       i64 disp = (i64)S + A - (i64)P;
     67       u32 instr;
     68       u32 imm19;
     69       if (disp & 3)
     70         compiler_panic(c, SRCLOC_NONE,
     71                        "link: imm19 reloc misaligned displacement");
     72       if (disp < -(i64)(1 << 20) || disp >= (i64)(1 << 20))
     73         compiler_panic(c, SRCLOC_NONE,
     74                        "link: imm19 reloc out of range (need ±1MiB)");
     75       imm19 = (u32)((disp >> 2) & 0x7ffffu);
     76       instr = rd_u32_le(P_bytes);
     77       instr = (instr & ~(0x7ffffu << 5)) | (imm19 << 5);
     78       wr_u32_le(P_bytes, instr);
     79       return 1;
     80     }
     81     case R_AARCH64_TSTBR14: {
     82       /* TBZ/TBNZ — imm14 in 4-byte units, signed, at bits [18:5].
     83        * Range: ±32KiB. */
     84       i64 disp = (i64)S + A - (i64)P;
     85       u32 instr;
     86       u32 imm14;
     87       if (disp & 3)
     88         compiler_panic(c, SRCLOC_NONE, "link: TSTBR14 misaligned displacement");
     89       if (disp < -(i64)(1 << 15) || disp >= (i64)(1 << 15))
     90         compiler_panic(c, SRCLOC_NONE,
     91                        "link: TSTBR14 out of range (need ±32KiB)");
     92       imm14 = (u32)((disp >> 2) & 0x3fffu);
     93       instr = rd_u32_le(P_bytes);
     94       instr = (instr & ~(0x3fffu << 5)) | (imm14 << 5);
     95       wr_u32_le(P_bytes, instr);
     96       return 1;
     97     }
     98     case R_AARCH64_ADR_PREL_LO21: {
     99       /* ADR — byte-granularity imm21, encoded as immlo[30:29] +
    100        * immhi[23:5]. No 12-bit shift (unlike ADRP). Range: ±1MiB. */
    101       i64 disp = (i64)S + A - (i64)P;
    102       u32 instr;
    103       u32 immlo, immhi;
    104       if (disp < -(i64)(1 << 20) || disp >= (i64)(1 << 20))
    105         compiler_panic(c, SRCLOC_NONE,
    106                        "link: ADR_PREL_LO21 out of range (need ±1MiB)");
    107       immlo = (u32)(disp & 0x3u);
    108       immhi = (u32)((disp >> 2) & 0x7ffffu);
    109       instr = rd_u32_le(P_bytes);
    110       instr = (instr & 0x9f00001fu) | (immlo << 29) | (immhi << 5);
    111       wr_u32_le(P_bytes, instr);
    112       return 1;
    113     }
    114     case R_AARCH64_JUMP26:
    115     case R_AARCH64_CALL26: {
    116       /* B/BL imm26 — branch displacement in 4-byte units, signed.
    117        * Clear bits [25:0] of the existing instruction and OR in the
    118        * new imm26. Range check: ±128MiB. */
    119       i64 disp = (i64)S + A - (i64)P;
    120       u32 instr;
    121       u32 imm26;
    122       if (disp & 3)
    123         compiler_panic(c, SRCLOC_NONE, "link: CALL26 misaligned displacement");
    124       if (disp < -(i64)(1 << 27) || disp >= (i64)(1 << 27))
    125         compiler_panic(c, SRCLOC_NONE,
    126                        "link: CALL26 out of range (need ±128MiB)");
    127       imm26 = (u32)((disp >> 2) & 0x3ffffffu);
    128       instr = rd_u32_le(P_bytes);
    129       instr = (instr & 0xfc000000u) | imm26;
    130       wr_u32_le(P_bytes, instr);
    131       return 1;
    132     }
    133     case R_AARCH64_TLVP_LOAD_PAGE21:
    134     case R_AARCH64_ADR_GOT_PAGE:
    135     case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
    136     case R_AARCH64_ADR_PREL_PG_HI21:
    137     case R_AARCH64_ADR_PREL_PG_HI21_NC: {
    138       /* ADRP — page-relative imm21, encoded as immlo[30:29] +
    139        * immhi[23:5]. Effective immediate is (S+A) page minus P page,
    140        * shifted right by 12, sign-extended to 33 bits. _NC variant
    141        * skips the range check (compiler asserts it can't overflow,
    142        * e.g. when paired with explicit page bracketing). */
    143       i64 page_s = ((i64)S + A) & ~(i64)0xfff;
    144       i64 page_p = (i64)P & ~(i64)0xfff;
    145       i64 disp = page_s - page_p;
    146       i64 imm21 = disp >> 12;
    147       u32 instr;
    148       u32 immlo, immhi;
    149       if (k != R_AARCH64_ADR_PREL_PG_HI21_NC &&
    150           (imm21 < -(i64)(1 << 20) || imm21 >= (i64)(1 << 20)))
    151         compiler_panic(c, SRCLOC_NONE,
    152                        "link: ADR_PREL_PG_HI21 out of range (need ±4GiB)");
    153       immlo = (u32)(imm21 & 0x3u);
    154       immhi = (u32)((imm21 >> 2) & 0x7ffffu);
    155       instr = rd_u32_le(P_bytes);
    156       instr = (instr & 0x9f00001fu) | (immlo << 29) | (immhi << 5);
    157       wr_u32_le(P_bytes, instr);
    158       return 1;
    159     }
    160     case R_AARCH64_ADD_ABS_LO12_NC: {
    161       /* ADD (immediate) imm12 at bits [21:10]. NC = no overflow check. */
    162       u64 v = ((u64)S + (u64)A) & 0xfffu;
    163       u32 instr = rd_u32_le(P_bytes);
    164       instr = (instr & ~(0xfffu << 10)) | ((u32)v << 10);
    165       wr_u32_le(P_bytes, instr);
    166       return 1;
    167     }
    168     case R_AARCH64_TLSLE_ADD_TPREL_HI12:
    169     case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: {
    170       /* AArch64 TLS local-exec.  Caller passes S already as the
    171        * TP-relative offset (target's image offset minus the TLS
    172        * image base, plus the 16-byte AArch64 TCB).  HI12 takes
    173        * bits 23:12, LO12_NC takes bits 11:0; both encoded as
    174        * imm12 at instruction bits [21:10] of an ADD (immediate).
    175        * The HI12 form's instruction carries LSL #12 in its opcode,
    176        * so bits 11:0 of the operand naturally land at scale 4096. */
    177       u64 v = (u64)((i64)S + A);
    178       u32 imm12 = (k == R_AARCH64_TLSLE_ADD_TPREL_HI12)
    179                       ? (u32)((v >> 12) & 0xfffu)
    180                       : (u32)(v & 0xfffu);
    181       u32 instr = rd_u32_le(P_bytes);
    182       instr = (instr & ~(0xfffu << 10)) | (imm12 << 10);
    183       wr_u32_le(P_bytes, instr);
    184       return 1;
    185     }
    186     case R_AARCH64_LDST8_ABS_LO12_NC:
    187     case R_AARCH64_LDST16_ABS_LO12_NC:
    188     case R_AARCH64_LDST32_ABS_LO12_NC:
    189     case R_AARCH64_LDST64_ABS_LO12_NC:
    190     case R_AARCH64_LDST128_ABS_LO12_NC:
    191     case R_AARCH64_LD64_GOT_LO12_NC:
    192     case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
    193     case R_AARCH64_TLVP_LOAD_PAGEOFF12: {
    194       /* LDR/STR with imm12 at bits [21:10]; the imm is scaled by the
    195        * access size, so we right-shift the low 12 bits of (S+A) by
    196        * the size scale before encoding. NC = no overflow check.
    197        *
    198        * LD64_GOT_LO12_NC has the same encoding as LDST64_ABS_LO12_NC;
    199        * the linker has already redirected `S` to the GOT slot. */
    200       u32 shift = (k == R_AARCH64_LDST8_ABS_LO12_NC)    ? 0u
    201                   : (k == R_AARCH64_LDST16_ABS_LO12_NC) ? 1u
    202                   : (k == R_AARCH64_LDST32_ABS_LO12_NC) ? 2u
    203                   : (k == R_AARCH64_LDST64_ABS_LO12_NC ||
    204                      k == R_AARCH64_LD64_GOT_LO12_NC ||
    205                      k == R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC ||
    206                      k == R_AARCH64_TLVP_LOAD_PAGEOFF12)
    207                       ? 3u
    208                       : 4u;
    209       u64 lo12 = ((u64)S + (u64)A) & 0xfffu;
    210       u64 imm12 = lo12 >> shift;
    211       u32 instr = rd_u32_le(P_bytes);
    212       if (lo12 & ((1u << shift) - 1u))
    213         compiler_panic(c, SRCLOC_NONE,
    214                        "link: LDST%u_ABS_LO12_NC misaligned address "
    215                        "(kind=%u S=0x%llx A=%lld P=0x%llx)",
    216                        1u << (3 + shift), (unsigned)k, (unsigned long long)S,
    217                        (long long)A, (unsigned long long)P);
    218       instr = (instr & ~(0xfffu << 10)) | ((u32)(imm12 & 0xfffu) << 10);
    219       wr_u32_le(P_bytes, instr);
    220       return 1;
    221     }
    222     default:
    223       return 0;
    224   }
    225 }
    226 
    227 /* In-process JIT TLS Local-Exec relaxation (LinkArchDesc.jit_tls_le_relax).
    228  *
    229  * ELF, per access:
    230  *   mrs rd, tpidr_el0           (no reloc)
    231  *   add rd, rd, #hi12           R_AARCH64_TLSLE_ADD_TPREL_HI12   <- `site`
    232  *   add rd, rd, #lo12           R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
    233  *
    234  * Windows/COFF, per access — the 7-instruction TEB idiom (see
    235  * aa_tls_addr_of_win):
    236  *   site-20  ldr  rd, [x18, #0x58]            TEB.ThreadLocalStoragePointer
    237  *   site-16  adrp x16, _tls_index             }
    238  *   site-12  add  x16, x16, :lo12:_tls_index  } &_tls_index (relocs dropped)
    239  *   site-8   ldr  w16, [x16]                  module TLS index
    240  *   site-4   ldr  rd, [rd, x16, lsl #3]       this module's TLS block base
    241  *   site     add  rd, rd, :secrel_hi12:sym    R_COFF_AARCH64_SECREL_HIGH12A <-
    242  *   site+4   add  rd, rd, :secrel_lo12:sym    R_COFF_AARCH64_SECREL_LOW12A
    243  *
    244  * Single-threaded JIT: in both cases address the in-image storage directly,
    245  * dropping the thread-pointer read (and, on Windows, the `_tls_index` / TEB
    246  * indirection):  adrp rd, &var ; add rd, rd, :lo12:&var ; nop(s). */
    247 void aa64_jit_tls_le_relax(Compiler* c, RelocKind k, u8* site, u64 storage,
    248                            u64 site_pc) {
    249   u8* mrs;
    250   u8* add_lo;
    251   u32 rd;
    252   /* Windows COFF idiom: the terminal HIGH12A drives the whole rewrite; the
    253    * LOW12A half is then a no-op (mirrors the ELF HI12/LO12 split). */
    254   if (k == R_COFF_AARCH64_SECREL_LOW12A) return; /* handled with HIGH12A */
    255   if (k == R_COFF_AARCH64_SECREL_HIGH12A) {
    256     u8* p;
    257     rd = rd_u32_le(site) & 0x1fu;
    258     /* nop the TEB read, the _tls_index materialize + load, and the block load
    259      * (site-20 .. site-8); reuse the block-load slot at site-4 for the ADRP. */
    260     for (p = site - 20; p <= site - 8; p += 4) wr_u32_le(p, 0xd503201fu);
    261     wr_u32_le(site - 4, 0x90000000u | rd); /* adrp rd, #0 */
    262     aa64_reloc_apply_insn(c, R_AARCH64_ADR_PREL_PG_HI21, site - 4, storage, 0,
    263                           site_pc - 4u);
    264     wr_u32_le(site, 0x91000000u | (rd << 5) | rd); /* add rd, rd, #0 */
    265     aa64_reloc_apply_insn(c, R_AARCH64_ADD_ABS_LO12_NC, site, storage, 0,
    266                           site_pc);
    267     wr_u32_le(site + 4, 0xd503201fu); /* nop the secrel_lo12 add */
    268     return;
    269   }
    270   if (k == R_AARCH64_TLSLE_ADD_TPREL_LO12_NC) return; /* handled with HI12 */
    271   if (k != R_AARCH64_TLSLE_ADD_TPREL_HI12)
    272     compiler_panic(c, SRCLOC_NONE, "aa64 jit tls: unexpected reloc kind %u",
    273                    (unsigned)k);
    274   mrs = site - 4;    /* mrs rd, tpidr_el0 */
    275   add_lo = site + 4; /* add rd, rd, #lo12 -> nop */
    276   rd = rd_u32_le(site) & 0x1fu;
    277   if (rd_u32_le(mrs) != (0xd53bd040u | rd))
    278     compiler_panic(c, SRCLOC_NONE, "aa64 jit tls: unexpected access sequence");
    279   wr_u32_le(mrs, 0x90000000u | rd); /* adrp rd, #0 */
    280   aa64_reloc_apply_insn(c, R_AARCH64_ADR_PREL_PG_HI21, mrs, storage, 0,
    281                         site_pc - 4u);
    282   wr_u32_le(site, 0x91000000u | (rd << 5) | rd); /* add rd, rd, #0 */
    283   aa64_reloc_apply_insn(c, R_AARCH64_ADD_ABS_LO12_NC, site, storage, 0, site_pc);
    284   wr_u32_le(add_lo, 0xd503201fu); /* nop */
    285 }
    286 
    287 /* In-process JIT relaxation of AArch64 indirection idioms (LinkArchDesc
    288  * .jit_reloc_relax): the single-threaded JIT has no dynamic loader, GOT, or
    289  * TLV resolver, so each access idiom is rewritten to address the in-image
    290  * instance directly.  Returns 1 if it owned `k`, 0 to fall through to the
    291  * ordinary reloc apply. */
    292 int aa64_jit_reloc_relax(Compiler* c, RelocKind k, const JitRelaxCtx* ctx) {
    293   u8* P_bytes = ctx->site;
    294 
    295   /* Weak undefined target: address-of must evaluate to NULL.  An ADRP + ADD
    296    * pair would form a PC-relative address to vaddr 0 that exceeds ±4 GiB once
    297    * the JIT places segments far from 0 (tripping link_reloc's range check).
    298    * Rewrite the ADRP to MOVZ rd,#0 so rd becomes 0 directly; the paired ADD's
    299    * assembled imm12 of 0 already gives rd += 0, so leave it as add rd,rd,#0. */
    300   if (ctx->weak_undef_zero) {
    301     if (k == R_AARCH64_ADR_PREL_PG_HI21 ||
    302         k == R_AARCH64_ADR_PREL_PG_HI21_NC) {
    303       u32 rd = rd_u32_le(P_bytes) & 0x1fu;
    304       wr_u32_le(P_bytes, 0xd2800000u | rd); /* movz rd, #0 */
    305       return 1;
    306     }
    307     if (k == R_AARCH64_ADD_ABS_LO12_NC) return 1; /* leave add rd,rd,#0 */
    308   }
    309 
    310   /* Mach-O TLV access -> ordinary in-image load.  Codegen emits the 4-insn
    311    * Apple TLV sequence:
    312    *   adrp x0, desc@TLVPPAGE          (PAGE21)
    313    *   ldr  x0, [x0, desc@TLVPPAGEOFF] (PAGEOFF12)   <- this reloc
    314    *   ldr  xN, [x0]                   -- load the resolver thunk from desc[0]
    315    *   blr  xN                         -- call thunk(desc) -> &var in x0
    316    * With one thread the in-image .tdata/.tbss IS the single instance, and
    317    * desc[+16] already holds the variable's in-image storage address (filled by
    318    * the normal R_ABS64 against the storage symbol).  Collapse to a direct load,
    319    * dropping the thunk and the per-thread block:
    320    *   PAGEOFF12 : ldr x0,[x0,#imm] -> add x0,x0,#(desc & 0xfff)   (x0 = &desc)
    321    *   +4        : ldr xN,[x0]       -> ldr x0,[x0,#16]            (x0 = &var)
    322    *   +8        : blr xN            -> nop
    323    * The thunk register N is scratch (the Apple TLV ABI fixes only x0:
    324    * descriptor in, &var out); kit's codegen uses x16, clang picks any free
    325    * register (e.g. x8).  Accept any N so long as the pair is `ldr xN,[x0]`
    326    * (Rn=x0, imm12=0) followed by `blr xN`. */
    327   if (k == R_AARCH64_TLVP_LOAD_PAGEOFF12) {
    328     u64 v = ((u64)ctx->S + (u64)ctx->addend) & 0xfffu;
    329     u32 instr = rd_u32_le(P_bytes);
    330     u8* i_thunk = P_bytes + 4u;
    331     u8* i_call = P_bytes + 8u;
    332     u32 thunk = rd_u32_le(i_thunk);
    333     u32 call = rd_u32_le(i_call);
    334     u32 n = thunk & 0x1fu;
    335     wr_u32_le(P_bytes, 0x91000000u | (instr & 0x3ffu) | ((u32)v << 10));
    336     if ((thunk & ~0x1fu) != 0xf9400000u || call != (0xd63f0000u | (n << 5u)))
    337       compiler_panic(c, SRCLOC_NONE,
    338                      "kit_jit: unexpected Mach-O TLV access sequence");
    339     wr_u32_le(i_thunk, 0xf9400800u); /* ldr x0, [x0, #16] -> &var */
    340     wr_u32_le(i_call, 0xd503201fu);  /* nop */
    341     return 1;
    342   }
    343 
    344   /* No real GOT in the append image: the GOT load becomes a direct add, so the
    345    * register holds the symbol address itself instead of loading it from a slot.
    346    * LD64_GOT_LO12_NC shares the LDR uimm12 encoding; rewrite to ADD imm12. */
    347   if (ctx->got_relaxed && k == R_AARCH64_LD64_GOT_LO12_NC) {
    348     u64 v = ((u64)ctx->S + (u64)ctx->addend) & 0xfffu;
    349     u32 instr = rd_u32_le(P_bytes);
    350     u32 rd = instr & 0x1fu;
    351     u32 rn = (instr >> 5) & 0x1fu;
    352     wr_u32_le(P_bytes, 0x91000000u | rd | (rn << 5) | ((u32)v << 10));
    353     return 1;
    354   }
    355 
    356   return 0;
    357 }