kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

start.c (8097B)


/* Freestanding _start for Path E (ELF exec) tests.
 *
 * Convention:
 *   test_main()      — primary test body; returns 0 on pass.
 *   test_post_fini() — optional post-destructor check (weak default: 0).
 *
 * Lifecycle: TLS setup → preinit → ctors → test_main → dtors →
 * test_post_fini (only when test_main returned 0) → exit.
 *
 * kit-ld defines:
 *   __preinit_array_start/end                      — SHT_PREINIT_ARRAY
 *     span, walked before .init_array.
 *   __init_array_start/end, __fini_array_start/end — sorted ctor/dtor
 *     spans (synthesized around the corresponding SHT_*_ARRAY sections).
 *   __tdata_start, __tdata_end                     — .tdata template
 *     bytes; identical when no TLS (length 0).
 *   __tbss_size                                    — SK_ABS, holds the
 *     .tbss byte count as its symbol value.
 *
 * The TLS prologue runs unconditionally: with no TLS in the image, the
 * three boundary symbols all read as 0 and both the .tdata copy loop and
 * the .tbss zero loop are no-ops. */
     19 
     20 extern int test_main(void);
     21 __attribute__((weak)) int test_post_fini(void) { return 0; }
     22 
     23 typedef void (*VoidFn)(void);
     24 extern VoidFn __preinit_array_start[];
     25 extern VoidFn __preinit_array_end[];
     26 extern VoidFn __init_array_start[];
     27 extern VoidFn __init_array_end[];
     28 extern VoidFn __fini_array_start[];
     29 extern VoidFn __fini_array_end[];
     30 
     31 extern char __tdata_start[];
     32 extern char __tdata_end[];
     33 extern char __tbss_size[]; /* SK_ABS: address-of yields the byte count */
     34 
     35 /* TLS-block prologue layout — per-arch ABI dictates whether the TCB sits
     36  * before or after .tdata in the thread-pointer-relative image. AArch64
     37  * keeps a 16-byte reserved TCB; SysV-x86_64 uses TLS variant II (negative
     38  * offsets from the thread pointer, see below); RISC-V LP64 follows
     39  * variant I and points the thread pointer at the TCB end. */
     40 #define AARCH64_TCB_SIZE 16
     41 
     42 /* Per-thread TLS image; the test harness is single-threaded so a
     43  * file-scope buffer is enough.  Sized generously for any test we run
     44  * here.  Layout: [TCB | .tdata copy | .tbss zero-fill] for variants
     45  * that put the TCB first. */
     46 static char g_tls_block[4096] __attribute__((aligned(16)));
     47 
     48 /* IFUNC startup init.  Mirrors rt/lib/kit/ifunc_init.c — duplicated
     49  * here so the test harness doesn't need libkit_rt.a on the link
     50  * line.  When the linker emits a static ET_EXEC and the image
     51  * contains any STT_GNU_IFUNC, layout_iplt synthesizes a .init_array
     52  * entry pointing at __kit_ifunc_init; the loop in _start below
     53  * walks .init_array and calls each entry, so this fills every
     54  * .igot.plt slot before test_main runs. */
     55 extern void* __start_iplt_pairs[] __attribute__((weak));
     56 extern void* __stop_iplt_pairs[] __attribute__((weak));
     57 void __kit_ifunc_init(void) {
     58   void** p = __start_iplt_pairs;
     59   void** end = __stop_iplt_pairs;
     60   if (!p || !end) return;
     61   for (; p < end; p += 2) {
     62     void* (*r)(void) = (void* (*)(void))p[0];
     63     void** slot = (void**)p[1];
     64     *slot = r();
     65   }
     66 }
     67 
     68 #if defined(__APPLE__)
     69 /* macOS doesn't expose a stable syscall ABI — all syscalls must go
     70  * through libSystem.dylib.  start.c on macOS therefore calls libc
     71  * `exit` rather than emitting `svc #0x80` inline; the kit Mach-O
     72  * exe linker resolves the import via LC_LOAD_DYLIB libSystem.B.dylib
     73  * and the dyld bind info / chained-fixups stream. */
     74 extern void exit(int) __attribute__((noreturn));
     75 #endif
     76 
     77 __attribute__((noreturn)) static void do_exit(int code) {
     78 #if defined(__APPLE__)
     79   exit(code);
     80   __builtin_unreachable();
     81 #elif defined(__aarch64__)
     82   register long x8 __asm__("x8") = 94; /* sys_exit_group */
     83   register long x0 __asm__("x0") = code;
     84   __asm__ volatile("svc #0" ::"r"(x8), "r"(x0) : "memory");
     85 #elif defined(__x86_64__)
     86   register long rax __asm__("rax") = 231; /* sys_exit_group */
     87   register long rdi __asm__("rdi") = code;
     88   __asm__ volatile("syscall" ::"r"(rax), "r"(rdi) : "memory");
     89 #elif defined(__riscv) && __riscv_xlen == 64
     90   register long a7 __asm__("a7") = 94; /* sys_exit_group */
     91   register long a0 __asm__("a0") = code;
     92   __asm__ volatile("ecall" ::"r"(a7), "r"(a0) : "memory");
     93 #else
     94 #error "start.c: unsupported architecture"
     95 #endif
     96   __builtin_unreachable();
     97 }
     98 
     99 static void tls_init(void) {
    100 #if defined(__APPLE__)
    101   /* On Darwin, tpidr_el0 is owned by libsystem/dyld; freestanding
    102    * tests don't synthesize TLS roots (31_tls_local_exec is N/A on
    103    * Mach-O), so the prologue is a no-op. */
    104   return;
    105 #else
    106   unsigned long td_n = (unsigned long)(__tdata_end - __tdata_start);
    107   unsigned long bs_n = (unsigned long)(unsigned long long)__tbss_size;
    108   unsigned long i;
    109   /* Launder bs_n past clang's "extern char[] has non-null address"
    110    * assumption — without this the .tbss zero loop is peeled and
    111    * unconditionally writes one byte at tls[td_n], which on the SysV
    112    * x86_64 variant II layout (TCB sits at tls[td_n]) clobbers the
    113    * thread-pointer self-pointer for any TLS image with bs_n == 0. */
    114   __asm__ volatile("" : "+r"(bs_n));
    115 #if defined(__aarch64__)
    116   /* Variant I (TCB first): tp -> [TCB(16) | tdata | tbss] */
    117   char* dst = g_tls_block + AARCH64_TCB_SIZE;
    118   for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i];
    119   for (i = 0; i < bs_n; ++i) dst[td_n + i] = 0;
    120   __asm__ volatile("msr tpidr_el0, %0" ::"r"(g_tls_block) : "memory");
    121 #elif defined(__x86_64__)
    122   /* SysV TLS variant II: TLS bytes at *negative* offsets from the
    123    * thread pointer (fs base). Lay out [tdata | tbss | TCB] where the
    124    * TCB self-pointer sits at offset 0. The first slot of the TCB
    125    * must be the thread pointer (self) per ELF ABI. */
    126   char* tcb = g_tls_block + sizeof(g_tls_block) - 64;
    127   *(void**)tcb = tcb;
    128   char* tls = tcb - (td_n + bs_n);
    129   for (i = 0; i < td_n; ++i) tls[i] = __tdata_start[i];
    130   for (i = 0; i < bs_n; ++i) tls[td_n + i] = 0;
    131   /* arch_prctl(ARCH_SET_FS, tcb): syscall 158, code 0x1002. */
    132   register long rax __asm__("rax") = 158;
    133   register long rdi __asm__("rdi") = 0x1002;
    134   register long rsi __asm__("rsi") = (long)tcb;
    135   __asm__ volatile("syscall"
    136                    : "+r"(rax)
    137                    : "r"(rdi), "r"(rsi)
    138                    : "rcx", "r11", "memory");
    139 #elif defined(__riscv) && __riscv_xlen == 64
    140   /* Variant I: tp -> [TCB | tdata | tbss], TCB is reserved (here just
    141    * the first 16 bytes of the block); RISC-V psABI puts tp 16 bytes
    142    * past the start of the static TLS block convention varies, but
    143    * the unwind/glibc convention used by linker-generated code
    144    * resolves &var via tp + offset_from_TLS_image_start. We place
    145    * .tdata immediately after a 16-byte reservation. */
    146   char* dst = g_tls_block + 16;
    147   for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i];
    148   for (i = 0; i < bs_n; ++i) dst[td_n + i] = 0;
    149   __asm__ volatile("mv tp, %0" ::"r"(g_tls_block) : "memory");
    150 #else
    151 #error "start.c: unsupported architecture"
    152 #endif
    153 #endif /* !__APPLE__ */
    154 }
    155 
    156 /* On x86_64 the kernel hands _start an rsp that is 16-aligned (so argc
    157  * lands on a 16-byte boundary), but clang compiles _start as an ordinary
    158  * function assuming the standard SysV contract of rsp ≡ 8 (mod 16) on
    159  * entry — off by 8. force_align_arg_pointer makes the prologue realign
    160  * rsp itself so every `call` downstream lands at the canonical
    161  * rsp ≡ 8 (mod 16). aarch64/rv64 ABIs keep SP 16-aligned at all times,
    162  * so no analogue is needed there. */
    163 #if defined(__x86_64__)
    164 __attribute__((force_align_arg_pointer))
    165 #endif
    166 void _start(void) {
    167   VoidFn* p;
    168   int result;
    169 
    170   tls_init();
    171 
    172 #if defined(__APPLE__)
    173   /* Mach-O: dyld walks __DATA,__mod_init_func before _start runs, so
    174    * the harness must NOT walk __init_array_start/end — the boundary
    175    * symbols are synthesized into the __got region (no real init array
    176    * on Mach-O) and dereferencing them faults. */
    177   (void)p;
    178 #else
    179   /* SHT_PREINIT_ARRAY runs strictly before .init_array.  kit-ld
    180    * lands its synthetic __kit_ifunc_init entry here so IFUNC
    181    * slots are filled before any user ctor or test_main runs. */
    182   for (p = __preinit_array_start; p != __preinit_array_end; ++p) (*p)();
    183   for (p = __init_array_start; p != __init_array_end; ++p) (*p)();
    184 #endif
    185 
    186   result = test_main();
    187 
    188 #if !defined(__APPLE__)
    189   for (p = __fini_array_end; p-- != __fini_array_start;) (*p)();
    190 #endif
    191 
    192   if (result == 0) result = test_post_fini();
    193 
    194   do_exit(result);
    195 }