kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit a3e7b03a7d9a882ee7f73e48a7ccfffd0f44dd44
parent 5fec69282f697c6a701638e7da969c3f60fd52b0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 14:53:22 -0700

Exercise toy cross TLS startup with kit

Diffstat:
M driver/cmd/ld.c | 11 ++++++++++-
M test/toy/run.sh | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/driver/cmd/ld.c b/driver/cmd/ld.c @@ -1265,7 +1265,16 @@ static int ld_run_link(LdOptions* o) { lopts.build_id_len = o->build_id_len; lopts.gc_sections = o->gc_sections; lopts.strip_debug = o->strip_debug; - lopts.pie = driver_link_pie(o->target, o->pie, o->shared, o->relocatable); + /* GNU ld and lld default to a non-PIE ET_EXEC; PIE is opt-in via -pie. + * The linker imposes no hosted-PIE default the way a compiler driver + * does (kit cc still defaults hosted executables to PIE through the link + * API) — a freestanding or static link has no dynamic loader to apply a + * PIE image's relocations or choose its base, so a PIE default produces a + * binary that faults under a direct/qemu loader (its writable segments + * sit below mmap_min_addr at the vaddr-0 PIE base). -shared/-r never PIE. + * Callers that want a hosted PIE executable pass -pie explicitly (as the + * musl dynamic lane and real compiler drivers do). */ + lopts.pie = o->pie && !o->shared && !o->relocatable; lopts.pe_subsystem = o->pe_subsystem; lopts.interp_path = kit_slice_cstr(o->interp_path); lopts.soname = kit_slice_cstr(o->soname); diff --git a/test/toy/run.sh b/test/toy/run.sh @@ -9,8 +9,11 @@ # sidecar substring check (separate :objdump verdict) and a # <name>.link.skip sidecar. # X cross-arch: kit cc -O{level} -target -> kit ld -> exec_target for the -# aa64/x64/rv64 Linux targets. Exec is deferred to the engine's batched -# exec_target flush (kit_queue_e). Opt-in (not in the default paths). +# aa64/x64/rv64 Linux targets. The freestanding _start/TLS stub is also +# built by kit cc -target (no host toolchain), so the whole path — case +# object, startup stub, and link — is kit end to end. Exec is deferred to +# the engine's batched exec_target flush (kit_queue_e). Opt-in (not in the +# default paths). # C kit cc --emit=c case.toy -> host cc -> native exec. 
Exercises the # --emit=c C-source backend driven by a non-C frontend (validates that the # CGTarget seam is frontend-agnostic). Phased-rollout panics from the C @@ -210,6 +213,56 @@ cross_make_start_obj() { cat > "$start_c" <<'EOF_START' extern int main(void); +/* kit-ld synthesizes the .tdata template boundary symbols for a static + * image. The per-thread TLS image size is their difference, computed as a + * pointer subtraction so each symbol's PC-relative bias cancels — a lone + * kit boundary symbol doesn't resolve cleanly under -fno-pic. The toy + * corpus has no uninitialized (.tbss) thread-locals, so the image is + * exactly .tdata; any .tbss tail is covered by g_tls_block's zero-fill. */ +extern char __tdata_start[]; +extern char __tdata_end[]; + +/* Per-thread static-TLS image. kit resolves a local-exec var at .tdata + * image offset `o` to a thread-pointer-relative `o + 16` on aarch64/riscv64 + * (a 16-byte TCB sits ahead of .tdata; see src/obj/elf/link.c) and to a + * negative offset on x86_64 (variant II, TCB after the image). The + * freestanding stub must seed this block and set the thread pointer ITSELF: + * the cross runtimes don't do it for a no-libc static binary — notably + * qemu-riscv64 user-mode leaves tp pointing at uninitialized memory. The + * block is zero-initialized (.bss); single-threaded, so file scope is fine. */ +static char g_tls_block[4096] __attribute__((aligned(16))); + +static void tls_init(void) { + unsigned long td_n = (unsigned long)(__tdata_end - __tdata_start); + unsigned long i; +#if defined(__aarch64__) + /* Variant I: tp -> [TCB(16) | tdata]; var at tp + (off + 16). */ + char* dst = g_tls_block + 16; + for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i]; + __asm__ volatile("msr tpidr_el0, %0" ::"r"(g_tls_block) : "memory"); +#elif defined(__riscv) && __riscv_xlen == 64 + /* Variant I: tp -> [TCB(16) | tdata]; var at tp + (off + 16). 
*/ + char* dst = g_tls_block + 16; + for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i]; + __asm__ volatile("mv tp, %0" ::"r"(g_tls_block) : "memory"); +#elif defined(__x86_64__) + /* Variant II: TLS bytes at negative offsets from %fs, which points at the + * TCB whose first word is a self-pointer (kit reads it via %fs:0). Lay out + * [tdata | TCB] and arch_prctl(ARCH_SET_FS, &TCB) so a var at offset `o` + * lands at fs + (o - td_n) = &tdata[o]. */ + char* tcb = g_tls_block + td_n; + for (i = 0; i < td_n; ++i) g_tls_block[i] = __tdata_start[i]; + *(void**)tcb = tcb; + register long rax __asm__("rax") = 158; /* arch_prctl */ + register long rdi __asm__("rdi") = 0x1002; /* ARCH_SET_FS */ + register long rsi __asm__("rsi") = (long)tcb; + __asm__ volatile("syscall" : "+r"(rax) : "r"(rdi), "r"(rsi) + : "rcx", "r11", "memory"); +#else +#error unsupported target +#endif +} + __attribute__((noreturn)) static void do_exit(int code) { #if defined(__aarch64__) register long x8 __asm__("x8") = 94; @@ -233,10 +286,16 @@ __attribute__((noreturn)) static void do_exit(int code) { __attribute__((force_align_arg_pointer)) #endif void _start(void) { + tls_init(); do_exit(main()); } EOF_START - if ! clang --target="$triple" -O1 -ffreestanding -fno-stack-protector \ + # Compile the startup stub with kit cc (the tool under test), not a host + # toolchain: the X path exercises kit cc + kit ld end to end, stub + # included. -fno-PIC/-fno-pie give absolute (non-GOT) relocations for the + # kit-ld-synthesized __tdata_{start,end} boundary symbols, matching the + # static ET_EXEC the link produces. (kit cc has no -fno-stack-protector.) + if ! 
"$KIT" cc -O1 -target "$triple" -ffreestanding \ -fno-PIC -fno-pie -c "$start_c" -o "$start_o" \ > "$work/$arch.start.out" 2> "$work/$arch.start.err"; then return 1 @@ -261,6 +320,18 @@ cross_one() { kit_skip "$label" "asmnop is target-specific before toy asm selectors" return fi + # Arch-specific cases name themselves with an _aa64/_x64/_rv64 basename + # suffix and use intrinsics that only lower on that arch (e.g. the aa64 + # privileged wfi/wfe/DAIF pair has no x64/rv64 lowering). Cross-compiling + # them onto a sibling arch is expected to fail, so skip the mismatches. + case "$KIT_BASE" in + *_aa64) case "$arch" in aa64|aarch64) ;; *) + kit_skip "$label" "aa64-only case (intrinsics have no $arch lowering)"; return ;; esac ;; + *_x64) case "$arch" in x64|x86_64) ;; *) + kit_skip "$label" "x64-only case (intrinsics have no $arch lowering)"; return ;; esac ;; + *_rv64) case "$arch" in rv64|riscv64) ;; *) + kit_skip "$label" "rv64-only case (intrinsics have no $arch lowering)"; return ;; esac ;; + esac if ! exec_target_supported "$tag"; then kit_skip "$label" "no runner for $tag" return @@ -285,11 +356,19 @@ cross_one() { return fi + # The stub is compiled by kit cc (the tool under test), so a failure here + # is a real codegen bug, not a missing host toolchain — surface it as FAIL. start_obj="$(cross_make_start_obj "$arch" "$triple" "$KIT_WORK")" || { - kit_skip "$label" "clang --target=$triple unavailable for startup" + kit_fail "$label" "kit cc -target $triple failed on startup stub" + sed 's/^/ | /' "$KIT_WORK/$arch.start.err" return } + # kit ld links a non-PIE static ET_EXEC by default (no -pie here), so the + # freestanding image lands at IMAGE_BASE_STATIC (0x400000): a PIE/ET_DYN + # would be based at vaddr 0, putting its writable segments below the + # loader's mmap_min_addr where qemu-user maps them non-writable and the + # stub's TLS-block store faults. if ! 
"$KIT" ld "$obj" "$start_obj" -o "$exe" \ > "$KIT_WORK/$arch.ld.out" 2> "$ld_err"; then kit_fail "$label" "kit ld failed"