jit_tls_relax_test.c (8844B)
1 /* In-process JIT TLS Local-Exec relaxation for the Windows (COFF) TEB idiom. 2 * 3 * The single-threaded JIT rewrites the PE TLS-access idiom (read TEB 4 * ThreadLocalStoragePointer, index by `_tls_index`, then `+ SECREL(var)`) to 5 * address the in-image .tls instance directly, dropping the TEB/`_tls_index` 6 * indirection (see src/link/link_jit.c and the per-arch jit_tls_le_relax). The 7 * aarch64 path is verified end-to-end on the Windows VM; this pins the 8 * byte-level rewrite for BOTH arches, and is the only automated guard for the 9 * x86-64 idiom (x64-windows self-host can't run end-to-end yet). 10 * 11 * Each case hand-encodes the exact idiom the codegen emits 12 * (x64_tls_addr_of_win64 / aa_tls_addr_of_win), drives the relax, and asserts 13 * the result is the in-image address materialization. The x64 builder is pinned 14 * to a golden captured from real `kit cc -c` disassembly so it can't drift away 15 * from the codegen it must match. 16 * 17 * Exit 0 = pass; non-zero = fail. */ 18 19 #include <stdint.h> 20 #include <string.h> 21 22 #include <kit/core.h> 23 24 #include "core/core.h" 25 #include "lib/kit_unit.h" 26 #include "obj/obj.h" 27 28 /* Reached via LinkArchDesc.jit_tls_le_relax; not exposed in a header. */ 29 void x64_jit_tls_le_relax(Compiler* c, RelocKind k, u8* site, u64 storage, 30 u64 site_pc); 31 void aa64_jit_tls_le_relax(Compiler* c, RelocKind k, u8* site, u64 storage, 32 u64 site_pc); 33 34 static KitUnit g_u; 35 #define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__) 36 37 static KitCompiler* compiler_for(KitArchKind arch) { 38 static KitCompiler* aa64 = NULL; 39 static KitCompiler* x64 = NULL; 40 KitCompiler** slot = arch == KIT_ARCH_ARM_64 ? &aa64 : &x64; 41 if (!*slot) { 42 /* Windows/COFF so the compiler matches the idiom's origin; only the panic 43 * path reads it, so the exact spec barely matters. */ 44 KitTargetSpec t = kit_unit_target(arch, KIT_OS_WINDOWS, KIT_OBJ_COFF); 45 if (kit_unit_compiler_new(&g_u, t, slot) != KIT_OK || !*slot) { 46 fprintf(stderr, "compiler_new failed for arch=%d\n", (int)arch); 47 exit(2); 48 } 49 } 50 return *slot; 51 } 52 53 static u32 rd32(const u8* p) { 54 return (u32)p[0] | ((u32)p[1] << 8) | ((u32)p[2] << 16) | ((u32)p[3] << 24); 55 } 56 57 /* ---- x86-64 -------------------------------------------------------------- */ 58 59 /* Emit the 4-instruction Win64 TLS idiom for destination register `rd` into 60 * `b`, mirroring x64_tls_addr_of_win64 byte-for-byte. Returns total length; 61 * *site_off receives the offset of the trailing lea's SECREL disp32. */ 62 static u32 x64_emit_idiom(u8* b, u32 rd, u32* site_off) { 63 u32 n = 0; 64 /* (1) mov rd, gs:[0x58] (9 bytes) */ 65 b[n++] = 0x65; 66 b[n++] = (u8)(0x48u | ((rd & 8u) ? 0x04u : 0u)); 67 b[n++] = 0x8B; 68 b[n++] = (u8)(((rd & 7u) << 3) | 4u); 69 b[n++] = 0x25; 70 b[n++] = 0x58; 71 b[n++] = 0; 72 b[n++] = 0; 73 b[n++] = 0; 74 /* (2) mov r11d, [rip+_tls_index] (7 bytes) */ 75 b[n++] = 0x44; 76 b[n++] = 0x8B; 77 b[n++] = 0x1D; 78 b[n++] = 0; 79 b[n++] = 0; 80 b[n++] = 0; 81 b[n++] = 0; 82 /* (3) mov rd, [rd+r11*8] (4 or 5 bytes) */ 83 b[n++] = (u8)(0x4Au | ((rd & 8u) ? 0x05u : 0u)); 84 b[n++] = 0x8B; 85 if ((rd & 7u) == 5u) { 86 b[n++] = (u8)((1u << 6) | ((rd & 7u) << 3) | 4u); 87 b[n++] = (u8)(0xD8u | (rd & 7u)); 88 b[n++] = 0; 89 } else { 90 b[n++] = (u8)(((rd & 7u) << 3) | 4u); 91 b[n++] = (u8)(0xD8u | (rd & 7u)); 92 } 93 /* (4) lea rd, [rd + sym@SECREL] (7 or 8 bytes) */ 94 b[n++] = (u8)(0x48u | ((rd & 8u) ? 0x05u : 0u)); 95 b[n++] = 0x8D; 96 if ((rd & 7u) == 4u) { 97 b[n++] = (u8)((2u << 6) | ((rd & 7u) << 3) | 4u); 98 b[n++] = (u8)((4u << 3) | (rd & 7u)); 99 } else { 100 b[n++] = (u8)((2u << 6) | ((rd & 7u) << 3) | (rd & 7u)); 101 } 102 *site_off = n; 103 b[n++] = 0; 104 b[n++] = 0; 105 b[n++] = 0; 106 b[n++] = 0; 107 return n; 108 } 109 110 static void x64_check(u32 rd) { 111 u8 buf[40]; 112 u32 site_off; 113 u32 total = x64_emit_idiom(buf, rd, &site_off); 114 /* Distinct write/runtime "addresses": disp must use site_pc, not &buf. */ 115 const u64 site_pc = 0x140005000ull + site_off; 116 const u64 storage = 0x140009123ull; 117 u32 lea = total - 7u; /* the rewritten 7-byte rip-lea sits at the block end */ 118 i64 want_disp = (i64)storage - (i64)(site_pc + 4u); 119 u32 i; 120 x64_jit_tls_le_relax(compiler_for(KIT_ARCH_X86_64), R_COFF_SECREL, 121 &buf[site_off], storage, site_pc); 122 for (i = 0; i < lea; ++i) 123 EXPECT(buf[i] == 0x90u, "x64 rd=%u: byte %u not NOP (0x%02x)", rd, i, 124 buf[i]); 125 EXPECT(buf[lea] == (u8)(0x48u | ((rd >= 8u) ? 0x04u : 0u)), 126 "x64 rd=%u: lea REX 0x%02x", rd, buf[lea]); 127 EXPECT(buf[lea + 1] == 0x8Du, "x64 rd=%u: lea opcode 0x%02x", rd, 128 buf[lea + 1]); 129 EXPECT(buf[lea + 2] == (u8)(((rd & 7u) << 3) | 5u), 130 "x64 rd=%u: lea modrm 0x%02x", rd, buf[lea + 2]); 131 EXPECT((i32)rd32(&buf[lea + 3]) == (i32)want_disp, 132 "x64 rd=%u: disp 0x%08x want 0x%08x", rd, rd32(&buf[lea + 3]), 133 (u32)want_disp); 134 } 135 136 /* Pin x64_emit_idiom to real codegen: the rd=r8 idiom captured from 137 * `kit cc -target x86_64-windows -c` disassembly. A drift in the builder (and 138 * thus a relax tested against a fictional idiom) turns this red. */ 139 static void x64_golden(void) { 140 static const u8 want[] = {0x65, 0x4c, 0x8b, 0x04, 0x25, 0x58, 0x00, 0x00, 141 0x00, 0x44, 0x8b, 0x1d, 0x00, 0x00, 0x00, 0x00, 142 0x4f, 0x8b, 0x04, 0xd8, 0x4d, 0x8d, 0x80, 0x00, 143 0x00, 0x00, 0x00}; 144 u8 buf[40]; 145 u32 site_off; 146 u32 total = x64_emit_idiom(buf, 8u /* r8 */, &site_off); 147 EXPECT(total == sizeof want, "x64 golden length %u != %zu", total, 148 sizeof want); 149 EXPECT(memcmp(buf, want, sizeof want) == 0, "x64 golden idiom mismatch"); 150 } 151 152 /* ---- aarch64 ------------------------------------------------------------- */ 153 154 #define AA_NOP 0xd503201fu 155 156 static u32 aa_add_imm(u32 rd, u32 rn, u32 imm12, u32 sh) { 157 return 0x91000000u | (sh << 22) | ((imm12 & 0xfffu) << 10) | ((rn & 0x1fu) 158 << 5) | (rd & 0x1fu); 159 } 160 161 /* Emit the 7-instruction Win64/aarch64 TLS idiom for `rd` (aa_tls_addr_of_win). 162 * Only instruction (6) — the HIGH12A add the relax keys on — needs a faithful 163 * encoding; the rest are placeholders the relax overwrites unconditionally. */ 164 static u32 aa_emit_idiom(u32* w, u32 rd) { 165 w[0] = 0xf9400240u | (18u << 5) | rd; /* ldr rd,[x18,#0x58] (placeholder) */ 166 w[1] = 0x90000010u; /* adrp x16,_tls_index */ 167 w[2] = aa_add_imm(16u, 16u, 0u, 0u); /* add x16,x16,:lo12: */ 168 w[3] = 0xb9400210u; /* ldr w16,[x16] */ 169 w[4] = 0xf8607a00u | rd; /* ldr rd,[rd,x16,lsl#3] */ 170 w[5] = aa_add_imm(rd, rd, 0u, 1u); /* add rd,rd,:secrel_hi12: (HIGH12A) */ 171 w[6] = aa_add_imm(rd, rd, 0u, 0u); /* add rd,rd,:secrel_lo12: (LOW12A) */ 172 return 5u; /* word index of the HIGH12A site the relax keys on */ 173 } 174 175 /* Decode ADRP+ADD (the relax's output) back to the absolute address it 176 * materializes, so we confirm it equals `storage`. */ 177 static u64 aa_decode_adrp_add(u32 adrp, u32 add, u64 adrp_pc) { 178 u32 immlo = (adrp >> 29) & 0x3u; 179 u32 immhi = (adrp >> 5) & 0x7ffffu; 180 i64 imm = (i64)(((u64)immhi << 2) | immlo); 181 if (imm & (1ll << 20)) imm -= (1ll << 21); /* sign-extend 21 bits */ 182 u64 page = (adrp_pc & ~0xfffull) + ((u64)imm << 12); 183 return page + ((add >> 10) & 0xfffu); 184 } 185 186 static void aa_check(u32 rd) { 187 u32 w[7]; 188 u32 site_i = aa_emit_idiom(w, rd); 189 const u64 site_pc = 0x140005000ull + site_i * 4u; 190 const u64 storage = 0x140009123ull; 191 u8* site = (u8*)&w[site_i]; 192 u32 i; 193 aa64_jit_tls_le_relax(compiler_for(KIT_ARCH_ARM_64), 194 R_COFF_AARCH64_SECREL_HIGH12A, site, storage, site_pc); 195 for (i = 0; i < 4u; ++i) /* instructions (1)-(4) -> NOP */ 196 EXPECT(w[i] == AA_NOP, "aa64 rd=%u: insn %u not NOP (0x%08x)", rd, i, w[i]); 197 EXPECT((w[4] & 0x9f000000u) == 0x90000000u && (w[4] & 0x1fu) == rd, 198 "aa64 rd=%u: insn(5) not ADRP rd (0x%08x)", rd, w[4]); 199 EXPECT((w[5] & 0xff800000u) == 0x91000000u && (w[5] & 0x1fu) == rd && 200 ((w[5] >> 5) & 0x1fu) == rd, 201 "aa64 rd=%u: insn(6) not ADD rd,rd,#imm (0x%08x)", rd, w[5]); 202 EXPECT(w[6] == AA_NOP, "aa64 rd=%u: insn(7) not NOP (0x%08x)", rd, w[6]); 203 EXPECT(aa_decode_adrp_add(w[4], w[5], site_pc - 4u) == storage, 204 "aa64 rd=%u: ADRP+ADD != storage", rd); 205 } 206 207 int main(void) { 208 /* Cover the x64 length-variant branches: rd&7==4 (rsp/r12 -> lea sib), 209 * rd&7==5 (rbp/r13 -> mov disp8), and rd>=8 (REX.R/B). */ 210 static const u32 x64_rds[] = {0, 1, 4, 5, 8, 12, 13}; 211 static const u32 aa_rds[] = {0, 1, 9, 20}; 212 size_t i; 213 kit_unit_init(&g_u); 214 x64_golden(); 215 for (i = 0; i < sizeof x64_rds / sizeof x64_rds[0]; ++i) x64_check(x64_rds[i]); 216 for (i = 0; i < sizeof aa_rds / sizeof aa_rds[0]; ++i) aa_check(aa_rds[i]); 217 kit_unit_summary(&g_u, "jit_tls_relax_test"); 218 return kit_unit_status(&g_u); 219 }