link.c (6621B)
1 /* RV64 link-time arch descriptor. See link_arch.h for the contract. 2 * 3 * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to 4 * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the 5 * vtable refactor; comments preserve the WHY (notably the +0x800 bias 6 * on AUIPC immediates). */ 7 8 #include "arch/riscv/isa.h" 9 #include "core/bytes.h" 10 #include "core/core.h" 11 #include "link/link_arch.h" 12 13 /* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is 14 * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively. 15 * Encoded once here so the descriptor and emitters stay in sync. */ 16 #define RV64_PLT0_SIZE 32u 17 #define RV64_PLT_ENTRY_SIZE 16u 18 #define RV64_IPLT_STUB_SIZE 12u 19 20 /* Split a PC-relative displacement into the (hi20, lo12) pair consumed 21 * by the AUIPC + I-type sequence. The +0x800 bias is the standard 22 * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate 23 * shifted left 12, then the second instruction adds a sign-extended 24 * 12-bit lo12. If we naively split disp into (disp>>12, disp&0xfff) 25 * the lo12 sign-extends as a *negative* number whenever bit 11 is set, 26 * which underflows the AUIPC result by 0x1000. Adding 0x800 before 27 * the shift rounds the high half up in exactly the cases that need it 28 * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */ 29 static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) { 30 *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu; 31 *lo12_out = (u32)((u64)disp & 0xfffu); 32 } 33 34 /* PLT0 under DF_1_NOW is never executed — the loader resolves every 35 * JUMP_SLOT before transferring control — but we still emit it in 36 * canonical form (8 NOPs) so disassemblers and unwinders see a well- 37 * formed prologue at the top of .plt. */ 38 static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { 39 u32 i; 40 (void)plt0_vaddr; 41 (void)gotplt_vaddr; 42 for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop()); 43 } 44 45 /* Per-import PLT entry: load the GOT slot pre-filled by the loader 46 * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard 47 * psABI scratch for the trampoline return-address (clobbered by the 48 * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */ 49 static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { 50 i64 disp = (i64)slot_vaddr - (i64)entry_vaddr; 51 u32 hi20; 52 u32 lo12; 53 rv64_split_pcrel(disp, &hi20, &lo12); 54 wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20)); 55 wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12)); 56 wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0)); 57 wr_u32_le(dst + 12, rv_nop()); 58 } 59 60 /* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and 61 * tail-call to it. The stub->slot displacement is invariant under the 62 * segment-base shift (both addresses live in the same image), so we 63 * bake it directly into the instructions and report zero apply-time 64 * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */ 65 static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, 66 LinkArchIPltReloc out[2]) { 67 i64 disp = (i64)slot_vaddr - (i64)stub_vaddr; 68 u32 hi20; 69 u32 lo12; 70 (void)out; 71 rv64_split_pcrel(disp, &hi20, &lo12); 72 wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20)); 73 wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12)); 74 wr_u32_le(dst + 8, rv_jr(RV_T1)); 75 return 0u; 76 } 77 78 /* RV32 PLT entry: identical to rv64_emit_plt_entry except the GOT slot 79 * is 4 bytes (one XLEN word), so the load is LW not LD. Entry stays 80 * 16 bytes / 4 insns; the AUIPC + (hi20,lo12) split is XLEN-neutral. */ 81 static void rv32_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { 82 i64 disp = (i64)slot_vaddr - (i64)entry_vaddr; 83 u32 hi20; 84 u32 lo12; 85 rv64_split_pcrel(disp, &hi20, &lo12); 86 wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20)); 87 wr_u32_le(dst + 4, rv_lw(RV_T3, RV_T3, (i32)lo12)); 88 wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0)); 89 wr_u32_le(dst + 12, rv_nop()); 90 } 91 92 /* RV32 IPLT stub: identical to rv64_emit_iplt_stub except the 93 * .igot.plt slot is 4 bytes, so LW not LD. Stub stays 12 bytes / 94 * 3 insns; displacement is baked inline, so zero apply-time relocs. */ 95 static u32 rv32_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, 96 LinkArchIPltReloc out[2]) { 97 i64 disp = (i64)slot_vaddr - (i64)stub_vaddr; 98 u32 hi20; 99 u32 lo12; 100 (void)out; 101 rv64_split_pcrel(disp, &hi20, &lo12); 102 wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20)); 103 wr_u32_le(dst + 4, rv_lw(RV_T1, RV_T1, (i32)lo12)); 104 wr_u32_le(dst + 8, rv_jr(RV_T1)); 105 return 0u; 106 } 107 108 /* Width + classification rows for RISC-V's relocation kinds (shared by rv64 109 * and rv32); defined in src/arch/riscv/reloc.c and consulted through the 110 * .reloc_desc hook. R_RV_CALL / R_PLT32 carry RELOC_IS_BRANCH: a direct 111 * AUIPC+JALR reaches only ±2GiB, so a too-far target (e.g. a JIT-resolved 112 * host libc symbol) routes through the call-stub pass, the same safety net 113 * aa64 and x64 wire. rv_reloc_apply_insn (same file) holds the matching 114 * U/I/S/B/J + RVC instruction-immediate byte encoders. */ 115 const RelocDesc* rv_reloc_desc(RelocKind); 116 int rv_reloc_apply_insn(Compiler*, RelocKind, u8*, u64, i64, u64); 117 void rv_jit_tls_le_relax(Compiler*, RelocKind, u8*, u64, u64); 118 int rv_jit_reloc_relax(Compiler*, RelocKind, const JitRelaxCtx*); 119 120 const LinkArchDesc link_arch_rv64 = { 121 .plt0_size = RV64_PLT0_SIZE, 122 .plt_entry_size = RV64_PLT_ENTRY_SIZE, 123 .iplt_stub_size = RV64_IPLT_STUB_SIZE, 124 .global_pointer_symbol = "__global_pointer$", 125 .global_pointer_rw_offset = 0x800u, 126 .emit_plt0 = rv64_emit_plt0, 127 .emit_plt_entry = rv64_emit_plt_entry, 128 .emit_iplt_stub = rv64_emit_iplt_stub, 129 .reloc_desc = rv_reloc_desc, 130 .reloc_apply_insn = rv_reloc_apply_insn, 131 .jit_tls_le_relax = rv_jit_tls_le_relax, 132 .jit_reloc_relax = rv_jit_reloc_relax, 133 }; 134 135 /* RV32 link descriptor: identical to rv64 (PLT0/entry/stub byte sizes, 136 * __global_pointer$ + 0x800 RW bias, canonical 8-NOP PLT0, and the JIT 137 * call-stub predicate) EXCEPT the PLT/IPLT emitters load 4-byte GOT 138 * slots with LW instead of LD. */ 139 const LinkArchDesc link_arch_rv32 = { 140 .plt0_size = RV64_PLT0_SIZE, 141 .plt_entry_size = RV64_PLT_ENTRY_SIZE, 142 .iplt_stub_size = RV64_IPLT_STUB_SIZE, 143 .global_pointer_symbol = "__global_pointer$", 144 .global_pointer_rw_offset = 0x800u, 145 .emit_plt0 = rv64_emit_plt0, 146 .emit_plt_entry = rv32_emit_plt_entry, 147 .emit_iplt_stub = rv32_emit_iplt_stub, 148 .reloc_desc = rv_reloc_desc, 149 .reloc_apply_insn = rv_reloc_apply_insn, 150 .jit_reloc_relax = rv_jit_reloc_relax, 151 };