kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

link.c (9454B)


      1 /* AArch64 link-time descriptor.
      2  *
      3  * Implements the LinkArchDesc contract from link_arch.h for the
      4  * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve
      5  * trampolines emitted in canonical form even under DF_1_NOW), and the
      6  * 12-byte IPLT stub used by ifunc resolvers.  PLT/IPLT instruction
      7  * bytes come from the encoders in arch/aa64/isa.h — no raw hex
      8  * literals there; the COFF __chkstk blob below is the one exception.
      9  *
     10  * The byte layout matches the previous inline encodings in
     11  * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the
     12  * linker to descriptor dispatch is a no-op on the output image. */
     13 
     14 #include "arch/aa64/isa.h"
     15 #include "core/bytes.h"
     16 #include "core/core.h"
     17 #include "link/link_arch.h"
     18 #include "obj/obj.h"
     19 
     20 /* Fixed register assignments mandated by the AArch64 PLT ABI. */
     21 #define AA64_PLT_SCRATCH_X16 16u /* x16: slot address; also the branch target in the 3-insn stubs */
     22 #define AA64_PLT_SCRATCH_X17 17u /* x17: function pointer loaded by PLT0 / per-import entries */
     23 
     24 /* PLT geometry, in bytes. Documented in link_arch.h; redeclared here as
     25  * the descriptor table needs them at file scope. */
     26 #define AA64_PLT0_SIZE 32u /* STP + ADRP/LDR/ADD/BR + 3 NOPs */
     27 #define AA64_PLT_ENTRY_SIZE 16u /* ADRP/LDR/ADD/BR */
     28 #define AA64_IPLT_STUB_SIZE 12u /* ADRP/LDR/BR */
     29 
     30 /* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import
     31  * entries start at .got.plt[3].  Slots are 8 bytes each. */
     32 #define AA64_GOTPLT_RESOLVER_INDEX 2u
     33 
     34 /* Mask of the low 12 (in-page offset) bits.  ADRP encodes
     35  * (page(target) - page(PC)) >> 12, where page(x) = x & ~AA64_PAGE_MASK. */
     36 #define AA64_PAGE_MASK ((u64)0xfffu)
     37 
     38 /* Compute the (immlo, immhi) ADRP immediate halves for the page-
     39  * relative displacement from `pc` to `target`.  Both addresses are
     40  * post-shift final image vaddrs; ADRP discards the low 12 bits of
     41  * each before subtracting, so the result is invariant under any
     42  * segment-base shift that moves both endpoints by the same delta. */
     43 static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo,
     44                                         u32* immhi) {
     45   i64 page_disp = (i64)(target & ~AA64_PAGE_MASK) - (i64)(pc & ~AA64_PAGE_MASK);
     46   i64 imm21 = page_disp >> 12;
     47   *immlo = (u32)(imm21 & 0x3);
     48   *immhi = (u32)((imm21 >> 2) & 0x7ffff);
     49 }
     50 
     51 /* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr`
     52  * (a .got.plt entry) into x16, loads the resolved function pointer
     53  * into x17, and tail-calls it.  Used by both PLT0 (after its STP) and
     54  * each per-import entry — the only thing that varies is `pc`, which
     55  * starts at the ADRP itself. */
     56 static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) {
     57   u32 immlo, immhi;
     58   aa64_adrp_imm_halves(pc, slot_vaddr, &immlo, &immhi);
     59   u32 lo12 = (u32)(slot_vaddr & AA64_PAGE_MASK);
     60   /* LDR Xt encodes the byte offset divided by 8.  .got.plt slots are
     61    * 8-byte aligned so the low 3 bits of lo12 are always 0. */
     62   u32 ldr_imm12 = (lo12 >> 3) & 0xfffu;
     63 
     64   wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi));
     65   wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X17,
     66                                        AA64_PLT_SCRATCH_X16, ldr_imm12));
     67   wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, AA64_PLT_SCRATCH_X16,
     68                                   AA64_PLT_SCRATCH_X16, lo12, /*sh=*/0));
     69   wr_u32_le(dst + 12, aa64_br(AA64_PLT_SCRATCH_X17));
     70 }
     71 
     72 static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
     73   /* PLT0:
     74    *   stp  x16, x30, [sp, #-16]!
     75    *   adrp x16, page(.got.plt[2])
     76    *   ldr  x17, [x16, #lo12(.got.plt[2])]
     77    *   add  x16, x16,   #lo12(.got.plt[2])
     78    *   br   x17
     79    *   nop ; nop ; nop
     80    *
     81    * Under DF_1_NOW the loader patches every .got.plt slot from
     82    * .rela.plt before running PLT0, so this trampoline never executes.
     83    * It is still emitted in canonical form so disassemblers and
     84    * unwinders see the layout the psABI specifies. */
     85   u64 slot2 = gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX;
     86   /* The ADRP sits at plt0+4 (one instruction past the leading STP). */
     87   u64 adrp_pc = plt0_vaddr + 4u;
     88 
     89   /* `stp x16, x30, [sp, #-16]!` — pre-indexed pair store with imm7
     90    * scaled by 8, so the encoded field is -16/8 = -2. */
     91   wr_u32_le(dst + 0, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP,
     92                                     /*imm7_scaled=*/-2));
     93   aa64_emit_adrp_load_br(dst + 4, adrp_pc, slot2);
     94   wr_u32_le(dst + 20, aa64_nop());
     95   wr_u32_le(dst + 24, aa64_nop());
     96   wr_u32_le(dst + 28, aa64_nop());
     97 }
     98 
     99 static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
    100   /* Per-import 16-byte entry: ADRP+LDR+ADD+BR where ADRP's PC is the
    101    * entry's first instruction (no leading STP here — the resolved
    102    * function returns to the original caller, not into PLT0). */
    103   aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr);
    104 }
    105 
    106 static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
    107                                LinkArchIPltReloc out[2]) {
    108   /* IPLT stub: ADRP x16, %page(slot) ; LDR x16, [x16, :lo12:slot] ;
    109    * BR x16.
    110    *
    111    * We deliberately emit the two address-bearing instructions with
    112    * zero immediates: the linker enqueues an ADR_PREL_PG_HI21 reloc on
    113    * the ADRP and an LDST64_ABS_LO12_NC reloc on the LDR, both
    114    * targeting the slot's synthetic local symbol.  Reloc-apply runs
    115    * after final vaddr assignment, which is the only point at which
    116    * both endpoints' page-relative displacement is known. */
    117   (void)stub_vaddr;
    118   (void)slot_vaddr;
    119 
    120   wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, /*immlo=*/0,
    121                                /*immhi=*/0));
    122   wr_u32_le(dst + 4,
    123             aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, AA64_PLT_SCRATCH_X16,
    124                               /*imm12_scaled=*/0));
    125   wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16));
    126 
    127   out[0].offset_in_stub = 0;
    128   out[0].width = 4;
    129   out[0].kind = R_AARCH64_ADR_PREL_PG_HI21;
    130   out[1].offset_in_stub = 4;
    131   out[1].width = 4;
    132   out[1].kind = R_AARCH64_LDST64_ABS_LO12_NC;
    133   return 2;
    134 }
    135 
    136 /* PE/COFF IAT stub for aarch64 (12 B):
    137  *
    138  *   adrp x16, iat_slot@PAGE       ; x16 = page-aligned base
    139  *   ldr  x16, [x16, #iat_off]     ; x16 = *iat_slot (function ptr)
    140  *   br   x16                       ; tail-call
    141  *
    142  * Uses x16 (intra-procedure-call scratch) so the called function
    143  * sees an unperturbed x30 / argument registers. Page+offset are
    144  * baked from the post-shift IAT slot vaddr; no apply-time reloc
    145  * needed because both ends move together under image-base shift. */
    146 void aa64_emit_coff_iat_stub(u8* dst, u64 stub_vaddr, u64 iat_slot_vaddr) {
    147   u32 immlo, immhi;
    148   aa64_adrp_imm_halves(stub_vaddr, iat_slot_vaddr, &immlo, &immhi);
    149   u32 lo12 = (u32)(iat_slot_vaddr & AA64_PAGE_MASK);
    150   /* IAT slots are 8-byte aligned (function pointers), so the low 3
    151    * bits of lo12 are always 0; LDR Xt scales the imm12 by 8. */
    152   u32 ldr_imm12 = (lo12 >> 3) & 0xfffu;
    153 
    154   wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi));
    155   wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16,
    156                                        AA64_PLT_SCRATCH_X16, ldr_imm12));
    157   wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16));
    158 }
    159 
    160 void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) {
    161   i64 page_s = ((i64)got_slot_vaddr) & ~(i64)0xfff;
    162   i64 page_p = ((i64)stub_vaddr) & ~(i64)0xfff;
    163   i64 imm21 = (page_s - page_p) >> 12;
    164   u32 immlo = (u32)(imm21 & 0x3u);
    165   u32 immhi = (u32)((imm21 >> 2) & 0x7ffffu);
    166   u32 lo12 = (u32)(got_slot_vaddr & 0xfffu);
    167   u32 imm12_ldr = (lo12 >> 3) & 0xfffu;
    168 
    169   wr_u32_le(out + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi));
    170   wr_u32_le(out + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16,
    171                                        AA64_PLT_SCRATCH_X16, imm12_ldr));
    172   wr_u32_le(out + 8, aa64_br(AA64_PLT_SCRATCH_X16));
    173 }
    174 
    175 /* Width + classification rows + instruction-immediate byte encoders for
    176  * AArch64's relocation kinds; defined in src/arch/aa64/reloc.c and consulted
    177  * through the .reloc_desc / .reloc_apply_insn hooks. */
    178 const RelocDesc* aa64_reloc_desc(RelocKind); /* fills link_arch_aa64.reloc_desc */
    179 int aa64_reloc_apply_insn(Compiler*, RelocKind, u8*, u64, i64, u64); /* fills .reloc_apply_insn */
    180 void aa64_jit_tls_le_relax(Compiler*, RelocKind, u8*, u64, u64); /* fills .jit_tls_le_relax */
    181 int aa64_jit_reloc_relax(Compiler*, RelocKind, const JitRelaxCtx*); /* fills .jit_reloc_relax */
    182 
    183 /* AArch64 __chkstk for PE/COFF: probes `x15 * 16` bytes of stack one page at a
    184  * time, then returns.  Mirrors the LLVM compiler-rt implementation (chkstk.S in
    185  * builtins/aarch64). 28 bytes.  x64 needs no equivalent — it emits inline stack
    186  * probes.  link_synth_coff_ctor_dtor_list emits these bytes into a retained
    187  * .text$chkstk section for COFF targets that carry them. */
    188 static const u8 aa64_coff_chkstk[28] = {
    189     0xf0, 0xed, 0x7c, 0xd3, /* lsl  x16, x15, #4              ; x16 = x15 * 16 */
    190     0xf1, 0x03, 0x00, 0x91, /* mov  x17, sp                   */
    191     0x31, 0x06, 0x40, 0xd1, /* sub  x17, x17, #0x1, lsl #12   ; loop head (offset 8) */
    192     0x10, 0x06, 0x40, 0xf1, /* subs x16, x16, #0x1, lsl #12   */
    193     0x3f, 0x02, 0x40, 0xf9, /* ldr  xzr, [x17]                ; touch the page */
    194     0xac, 0xff, 0xff, 0x54, /* b.gt #-0xc                     ; imm19 = -3: back to the sub at offset 8 */
    195     0xc0, 0x03, 0x5f, 0xd6, /* ret                            */
    196 };
    197 
    198 const LinkArchDesc link_arch_aa64 = {
    199     .plt0_size = AA64_PLT0_SIZE,
    200     .plt_entry_size = AA64_PLT_ENTRY_SIZE,
    201     .iplt_stub_size = AA64_IPLT_STUB_SIZE,
    202 
    203     .emit_plt0 = aa64_emit_plt0,
    204     .emit_plt_entry = aa64_emit_plt_entry,
    205     .emit_iplt_stub = aa64_emit_iplt_stub,
    206 
    207     .reloc_desc = aa64_reloc_desc,
    208     .reloc_apply_insn = aa64_reloc_apply_insn,
    209     .jit_tls_le_relax = aa64_jit_tls_le_relax,
    210     .jit_reloc_relax = aa64_jit_reloc_relax,
    211 
    212     .coff_chkstk_bytes = aa64_coff_chkstk,
    213     .coff_chkstk_len = sizeof aa64_coff_chkstk,
    214 };