commit a3e7b03a7d9a882ee7f73e48a7ccfffd0f44dd44
parent 5fec69282f697c6a701638e7da969c3f60fd52b0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 3 Jun 2026 14:53:22 -0700
Exercise toy cross TLS startup with kit
Diffstat:
2 files changed, 93 insertions(+), 5 deletions(-)
diff --git a/driver/cmd/ld.c b/driver/cmd/ld.c
@@ -1265,7 +1265,16 @@ static int ld_run_link(LdOptions* o) {
lopts.build_id_len = o->build_id_len;
lopts.gc_sections = o->gc_sections;
lopts.strip_debug = o->strip_debug;
- lopts.pie = driver_link_pie(o->target, o->pie, o->shared, o->relocatable);
+ /* GNU ld and lld default to a non-PIE ET_EXEC; PIE is opt-in via -pie.
+ * The linker imposes no hosted-PIE default the way a compiler driver
+ * does (kit cc still defaults hosted executables to PIE through the link
+ * API) — a freestanding or static link has no dynamic loader to apply a
+ * PIE image's relocations or choose its base, so a PIE default produces a
+ * binary that faults under a direct/qemu loader (its writable segments
+ * sit below mmap_min_addr at the vaddr-0 PIE base). -shared/-r never PIE.
+ * Callers that want a hosted PIE executable pass -pie explicitly (as the
+ * musl dynamic lane and real compiler drivers do). */
+ lopts.pie = o->pie && !o->shared && !o->relocatable;
lopts.pe_subsystem = o->pe_subsystem;
lopts.interp_path = kit_slice_cstr(o->interp_path);
lopts.soname = kit_slice_cstr(o->soname);
diff --git a/test/toy/run.sh b/test/toy/run.sh
@@ -9,8 +9,11 @@
# sidecar substring check (separate :objdump verdict) and a
# <name>.link.skip sidecar.
# X cross-arch: kit cc -O{level} -target -> kit ld -> exec_target for the
-# aa64/x64/rv64 Linux targets. Exec is deferred to the engine's batched
-# exec_target flush (kit_queue_e). Opt-in (not in the default paths).
+# aa64/x64/rv64 Linux targets. The freestanding _start/TLS stub is also
+# built by kit cc -target (no host toolchain), so the whole path — case
+# object, startup stub, and link — is kit end to end. Exec is deferred to
+# the engine's batched exec_target flush (kit_queue_e). Opt-in (not in the
+# default paths).
# C kit cc --emit=c case.toy -> host cc -> native exec. Exercises the
# --emit=c C-source backend driven by a non-C frontend (validates that the
# CGTarget seam is frontend-agnostic). Phased-rollout panics from the C
@@ -210,6 +213,56 @@ cross_make_start_obj() {
cat > "$start_c" <<'EOF_START'
extern int main(void);
+/* kit-ld synthesizes the .tdata template boundary symbols for a static
+ * image. The per-thread TLS image size is their difference, computed as a
+ * pointer subtraction so each symbol's PC-relative bias cancels — a lone
+ * kit boundary symbol doesn't resolve cleanly under -fno-pic. The toy
+ * corpus has no uninitialized (.tbss) thread-locals, so the image is
+ * exactly .tdata. On aarch64/riscv64 a .tbss tail would still fall inside
+ * g_tls_block's zero-fill; on x86_64 the stub places the TCB directly
+ * after .tdata, so a .tbss tail would first need room reserved ahead of
+ * the TCB. */
+extern char __tdata_start[];
+extern char __tdata_end[];
+
+/* Per-thread static-TLS image. kit resolves a local-exec var at .tdata
+ * image offset `o` to a thread-pointer-relative `o + 16` on aarch64/riscv64
+ * (a 16-byte TCB sits ahead of .tdata; see src/obj/elf/link.c) and to a
+ * negative offset on x86_64 (variant II, TCB after the image). The
+ * freestanding stub must seed this block and set the thread pointer ITSELF:
+ * the cross runtimes don't do it for a no-libc static binary — notably
+ * qemu-riscv64 user-mode leaves tp pointing at uninitialized memory. The
+ * block is zero-initialized (.bss); single-threaded, so file scope is fine. */
+static char g_tls_block[4096] __attribute__((aligned(16)));
+
+/* Arbitrary distinct nonzero exit statuses for a TLS setup failure. */
+enum { TLS_EXIT_IMAGE_TOO_BIG = 120, TLS_EXIT_SET_FS_FAILED = 121 };
+
+/* Defined further down in this stub; declared early so tls_init can stop
+ * the process instead of writing past g_tls_block. */
+__attribute__((noreturn)) static void do_exit(int code);
+
+/* Copy the .tdata template [__tdata_start, __tdata_end) into g_tls_block
+ * and install the thread pointer for the target's TLS variant. Must run
+ * before any thread-local access.
+ *
+ * Exits with TLS_EXIT_IMAGE_TOO_BIG when the template plus the TCB does not
+ * fit in g_tls_block (this also catches __tdata_end < __tdata_start, which
+ * the unsigned cast turns into a huge length), and on x86_64 with
+ * TLS_EXIT_SET_FS_FAILED when arch_prctl reports an error. */
+static void tls_init(void) {
+  unsigned long td_n = (unsigned long)(__tdata_end - __tdata_start);
+  unsigned long i;
+#if defined(__aarch64__)
+  /* Variant I: tp -> [TCB(16) | tdata]; var at tp + (off + 16). */
+  if (td_n > sizeof(g_tls_block) - 16) do_exit(TLS_EXIT_IMAGE_TOO_BIG);
+  char* dst = g_tls_block + 16;
+  for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i];
+  __asm__ volatile("msr tpidr_el0, %0" ::"r"(g_tls_block) : "memory");
+#elif defined(__riscv) && __riscv_xlen == 64
+  /* Variant I: tp -> [TCB(16) | tdata]; var at tp + (off + 16). */
+  if (td_n > sizeof(g_tls_block) - 16) do_exit(TLS_EXIT_IMAGE_TOO_BIG);
+  char* dst = g_tls_block + 16;
+  for (i = 0; i < td_n; ++i) dst[i] = __tdata_start[i];
+  __asm__ volatile("mv tp, %0" ::"r"(g_tls_block) : "memory");
+#elif defined(__x86_64__)
+  /* Variant II: TLS bytes at negative offsets from %fs, which points at the
+   * TCB whose first word is a self-pointer (kit reads it via %fs:0). Lay out
+   * [tdata | TCB] and arch_prctl(ARCH_SET_FS, &TCB) so a var at offset `o`
+   * lands at fs + (o - td_n) = &tdata[o]. */
+  /* The self-pointer word after the image must fit in the block too. */
+  if (td_n > sizeof(g_tls_block) - sizeof(void*)) do_exit(TLS_EXIT_IMAGE_TOO_BIG);
+  /* NOTE(review): tcb is pointer-aligned only when td_n is a multiple of
+   * sizeof(void*) — confirm kit-ld pads the .tdata template accordingly. */
+  char* tcb = g_tls_block + td_n;
+  for (i = 0; i < td_n; ++i) g_tls_block[i] = __tdata_start[i];
+  *(void**)tcb = tcb;
+  register long rax __asm__("rax") = 158;    /* arch_prctl */
+  register long rdi __asm__("rdi") = 0x1002; /* ARCH_SET_FS */
+  register long rsi __asm__("rsi") = (long)tcb;
+  __asm__ volatile("syscall" : "+r"(rax) : "r"(rdi), "r"(rsi)
+                   : "rcx", "r11", "memory");
+  /* The syscall result comes back in rax (0 on success). Without a valid
+   * %fs base every later TLS access would fault, so stop here with a
+   * recognizable status instead. */
+  if (rax != 0) do_exit(TLS_EXIT_SET_FS_FAILED);
+#else
+#error unsupported target
+#endif
+}
+
__attribute__((noreturn)) static void do_exit(int code) {
#if defined(__aarch64__)
register long x8 __asm__("x8") = 94;
@@ -233,10 +286,16 @@ __attribute__((noreturn)) static void do_exit(int code) {
__attribute__((force_align_arg_pointer))
#endif
void _start(void) {
+ tls_init(); /* set the thread pointer before main() can touch a thread-local */
do_exit(main());
}
EOF_START
- if ! clang --target="$triple" -O1 -ffreestanding -fno-stack-protector \
+ # Compile the startup stub with kit cc (the tool under test), not a host
+ # toolchain: the X path exercises kit cc + kit ld end to end, stub
+ # included. -fno-PIC/-fno-pie give absolute (non-GOT) relocations for the
+ # kit-ld-synthesized __tdata_{start,end} boundary symbols, matching the
+ # static ET_EXEC the link produces. (kit cc has no -fno-stack-protector.)
+ if ! "$KIT" cc -O1 -target "$triple" -ffreestanding \
-fno-PIC -fno-pie -c "$start_c" -o "$start_o" \
> "$work/$arch.start.out" 2> "$work/$arch.start.err"; then
return 1
@@ -261,6 +320,18 @@ cross_one() {
kit_skip "$label" "asmnop is target-specific before toy asm selectors"
return
fi
+ # Arch-specific cases name themselves with an _aa64/_x64/_rv64 basename
+ # suffix and use intrinsics that only lower on that arch (e.g. the aa64
+ # privileged wfi/wfe/DAIF pair has no x64/rv64 lowering). Cross-compiling
+ # them onto a sibling arch is expected to fail, so skip the mismatches.
+ case "$KIT_BASE" in
+ *_aa64) case "$arch" in aa64|aarch64) ;; *)
+ kit_skip "$label" "aa64-only case (intrinsics have no $arch lowering)"; return ;; esac ;;
+ *_x64) case "$arch" in x64|x86_64) ;; *)
+ kit_skip "$label" "x64-only case (intrinsics have no $arch lowering)"; return ;; esac ;;
+ *_rv64) case "$arch" in rv64|riscv64) ;; *)
+ kit_skip "$label" "rv64-only case (intrinsics have no $arch lowering)"; return ;; esac ;;
+ esac
if ! exec_target_supported "$tag"; then
kit_skip "$label" "no runner for $tag"
return
@@ -285,11 +356,19 @@ cross_one() {
return
fi
+ # The stub is compiled by kit cc (the tool under test), so a failure here
+ # is a real codegen bug, not a missing host toolchain — surface it as FAIL.
start_obj="$(cross_make_start_obj "$arch" "$triple" "$KIT_WORK")" || {
- kit_skip "$label" "clang --target=$triple unavailable for startup"
+ kit_fail "$label" "kit cc -target $triple failed on startup stub"
+ sed 's/^/ | /' "$KIT_WORK/$arch.start.err"
return
}
+ # kit ld links a non-PIE static ET_EXEC by default (no -pie here), so the
+ # freestanding image lands at IMAGE_BASE_STATIC (0x400000): a PIE/ET_DYN
+ # would be based at vaddr 0, putting its writable segments below the
+ # loader's mmap_min_addr where qemu-user maps them non-writable and the
+ # stub's TLS-block store faults.
if ! "$KIT" ld "$obj" "$start_obj" -o "$exe" \
> "$KIT_WORK/$arch.ld.out" 2> "$ld_err"; then
kit_fail "$label" "kit ld failed"