commit 49d093effdd28f9e16f08f184e9d4cd99a5b8a6d
parent 8e04ae6420498beedf0939e5e12509290dd97157
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 4 May 2026 17:42:40 -0700
boot4: vendor riscv64 musl overrides + amd64/riscv64 calibration
Brings boot.sh end-to-end on amd64 and riscv64 (previously only
aarch64 was wired up).
riscv64 was missing the per-arch musl-overrides aarch64 already had:
tcc 0.9.26's riscv64 frontend errors with "RISCV64 asm not implemented"
on any inline asm carrying named operand constraints (musl's
syscall_arch.h, pthread_arch.h, atomic_arch.h all use them), and
riscv64-asm.c has no LR/SC, `tail`, `j`, `ret`, or `.option` pseudos
(stock atomic_arch.h needs LR/SC, crt_arch.h needs `.option` and `tail`, and __set_thread_area.s needs `ret`). Add:
- arch/riscv64/syscall_arch.h: route __syscallN through extern
__syscall trampoline (mirrors aarch64/x86_64 strategy)
- src/internal/riscv64/syscall.s: a0..a6 -> a7+a0..a5 shuffle, ecall
- arch/riscv64/pthread_arch.h: extern __get_tp instead of inline asm
- src/internal/riscv64/get_tp.s: `mv a0, tp` helper
- src/thread/riscv64/__set_thread_area.s: replaces upstream's `ret`
- arch/riscv64/atomic_arch.h: single-threaded C-inline a_cas/a_cas_p
(true atomicity needs raw .long LR/SC encoding; not worth the
complexity for a single-threaded smoke binary)
- arch/riscv64/crt_arch.h: minimal `mv a0, sp` + `jal zero, _start_c`
trampoline; drops gp setup (no relaxation), _DYNAMIC (static-only)
Vendor calibrated skip lists for the two newly-supported arches:
amd64=12 (long-double math, math/pow_data + an x86_64 exp2l.s), riscv64=3 (math/log,
math/pow_data, thread/__unmapself). Status table in docs/MUSL.md
updated to match.
Diffstat:
10 files changed, 212 insertions(+), 3 deletions(-)
diff --git a/docs/MUSL.md b/docs/MUSL.md
@@ -118,7 +118,8 @@ surfaces tcc 0.9.26 cannot compile:
| Area | Rule |
|------|------|
| syscalls | replace GCC register-asm-variable wrappers with per-arch asm trampolines |
-| atomics / thread pointer | replace inline asm operands with extern asm helpers on aarch64/riscv64 |
+| atomics / thread pointer | replace inline asm operands: aarch64 uses extern asm helpers (true atomic via raw `.long` LL/SC); riscv64 uses an extern asm helper for the thread pointer and a non-atomic, single-threaded C-inline a_cas for atomics — sufficient for the boot4 hello smoke binary; tcc-asm has no LR/SC mnemonics |
+| crt entry trampoline | aarch64 + riscv64: replace upstream `crt_arch.h` with a minimal `_start` that passes `sp` to `_start_c` and tail-jumps. On riscv64 this drops `.option`, `lla gp`, and the `tail` pseudo (none parseable by tcc-asm). |
| weak aliases | implement `weak_alias` via assembler `.weak`/`.set` directives |
| C99 array parameters | remove `[static N]` qualifiers tcc does not parse |
| `_Complex` | stub `complex.h` and remove complex sources |
@@ -150,8 +151,8 @@ command is expected to succeed.
| arch | calibration vendored | skipped sources |
|------|----------------------|-----------------|
| `aarch64` | yes | 8 |
-| `amd64` | pending — run `scripts/boot4-calibrate.sh amd64` | — |
-| `riscv64` | pending — run `scripts/boot4-calibrate.sh riscv64` | — |
+| `amd64` | yes | 12 |
+| `riscv64` | yes | 3 |
Skipped sources are outside the boot4 hello closure. They fall into two
categories:
diff --git a/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/atomic_arch.h b/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/atomic_arch.h
@@ -0,0 +1,44 @@
+/* tcc-build riscv64 atomic_arch.h replacement.
+ *
+ * Stock musl uses inline asm with named operand constraints
+ * (`"=&r"(old), "r"(p)`) and the LR/SC mnemonics
+ * (`lr.w.aqrl` / `sc.w.aqrl`). tcc 0.9.26's riscv64 frontend lacks
+ * the operand-substitution path entirely (errors with "RISCV64 asm
+ * not implemented"), and riscv64-asm.c has no LR/SC entries in its
+ * mnemonic table.
+ *
+ * boot4 produces a static, single-threaded smoke binary, so we don't
+ * need true atomicity. Provide a_cas / a_cas_p as ordinary C inline
+ * functions; musl's src/internal/atomic.h derives a_swap, a_inc,
+ * a_dec, a_or, a_and, a_fetch_add, a_store, etc. from a_cas. A real
+ * atomic implementation would need raw .long encoding of LR/SC inside
+ * an extern asm helper (cf. aarch64), but that adds encoding
+ * complexity for a property the smoke binary doesn't depend on.
+ */
+
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s) /* compare-and-swap on *p: t = expected value, s = replacement */
+{
+ int old = *p; /* single plain load; nothing makes the load/store pair atomic */
+ if (old == t) *p = s; /* store the replacement only when the expected value was seen */
+ return old; /* value read before any store; equals t exactly when the swap happened */
+}
+
+#define a_cas_p a_cas_p
+static inline void *a_cas_p(volatile void *p, void *t, void *s) /* pointer compare-and-swap: p = slot, t = expected pointer, s = replacement */
+{
+ void *volatile *pp = (void *volatile *)p; /* keep the slot volatile-qualified, as a_cas does, instead of casting the qualifier away */
+ void *old = *pp; /* single plain load; the load/store pair is not atomic */
+ if (old == t) *pp = s; /* store the replacement only when the expected pointer was seen */
+ return old; /* pointer read before any store; equals t exactly when the swap happened */
+}
+
+#define a_barrier a_barrier
+static inline void a_barrier(void)
+{
+ /* `fence` (nullary) is in tcc's riscv64-asm.c table; tcc encodes
+ it with all pred/succ bits zero, so it is not a full memory
+ fence (effectively a hardware no-op) and only marks the barrier
+ point. The stock `fence rw,rw` form needs operand parsing tcc lacks. */
+ __asm__ __volatile__ ("fence"); /* NOTE(review): no "memory" clobber, so any compiler-barrier effect is compiler-dependent — confirm */
+}
diff --git a/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/crt_arch.h b/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/crt_arch.h
@@ -0,0 +1,32 @@
+/* tcc-build riscv64 crt_arch.h replacement.
+ *
+ * Stock musl uses several things tcc 0.9.26's riscv64 assembler can't:
+ *
+ * 1. `.option push/norelax/pop` directives — tcc has no .option
+ * handling at all. We don't run linker relaxation, so dropping
+ * these is harmless.
+ *
+ * 2. `lla gp, __global_pointer$` — sets the global pointer for
+ * gp-relative addressing under linker relaxation. Without
+ * relaxation, no code uses gp; leaving it at process-entry value
+ * is fine for static-only boot4 binaries.
+ *
+ * 3. `tail _start_c` — the `tail` pseudo (auipc+jalr indirect) is
+ * not in tcc's mnemonic table. Replace with `jal zero, _start_c`,
+ * which tcc accepts; jal only reaches +/-1 MiB, which is ample here
+ * because crt1.c defines _start_c in the same translation unit.
+ *
+ * 4. _DYNAMIC handling — boot4 only links static binaries, so
+ * _start_c's second argument is unused. Don't pass it.
+ *
+ * What's left is the minimum kernel-to-userland glue: pass sp to
+ * _start_c so it can read argc/argv/envp/auxv, then jump.
+ */
+__asm__(
+".text\n"
+".global " START "\n" /* export the entry symbol whose name is the START string */
+".type " START ",%function\n"
+START ":\n"
+" mv a0, sp\n" /* first argument of the C entry: the initial stack pointer. NOTE(review): sp is not realigned here; presumably the kernel-provided alignment is relied on — confirm */
+" jal zero, " START "_c\n" /* rd = zero discards the link address, so this is a plain jump to the START-prefixed `_c` function */
+);
diff --git a/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/pthread_arch.h b/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/pthread_arch.h
@@ -0,0 +1,17 @@
+/* tcc-build riscv64 pthread_arch.h replacement.
+ *
+ * Stock musl reads the thread pointer via inline asm with output
+ * operand: `__asm__ ("mv %0, tp" : "=r"(tp))`. tcc 0.9.26's riscv64
+ * frontend has no inline-asm operand-substitution path, so route
+ * through extern __get_tp implemented in
+ * src/internal/riscv64/get_tp.s. Mirrors the aarch64 override.
+ */
+
+extern unsigned long __get_tp(void); /* returns the thread pointer; implemented in src/internal/riscv64/get_tp.s */
+
+#define TLS_ABOVE_TP /* NOTE(review): this and the three macros below presumably mirror upstream's riscv64 pthread_arch.h — confirm */
+#define GAP_ABOVE_TP 0
+
+#define DTP_OFFSET 0x800
+
+#define MC_PC __gregs[0]
diff --git a/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/syscall_arch.h b/vendor/upstream/musl-1.2.5-overrides/arch/riscv64/syscall_arch.h
@@ -0,0 +1,50 @@
+/* tcc-build riscv64 syscall_arch.h replacement.
+ *
+ * Stock musl uses GCC register-asm-variable inline asm
+ * (`register long a7 __asm__("a7") = n; ... ecall`); tcc 0.9.26 lacks
+ * that GCC extension, and its riscv64 inline-asm operand path
+ * ("RISCV64 asm not implemented") can't process the constraint list
+ * either. Route every __syscallN through one variadic-style trampoline
+ * implemented in pure asm (src/internal/riscv64/syscall.s); the
+ * trampoline shuffles C-ABI a0..a6 into kernel-ABI a7 + a0..a5 and
+ * issues `ecall`.
+ *
+ * Mirrors the aarch64 / x86_64 trampoline strategy.
+ */
+
+#define __SYSCALL_LL_E(x) (x) /* identity: the argument is passed through unchanged */
+#define __SYSCALL_LL_O(x) (x) /* identity: the argument is passed through unchanged */
+
+extern long __syscall(long, ...); /* asm trampoline in src/internal/riscv64/syscall.s: syscall number first, then up to six arguments */
+
+static __inline long __syscall0(long n) /* n = syscall number, no arguments; returns __syscall's result unchanged */
+{
+ return __syscall(n);
+}
+static __inline long __syscall1(long n, long a) /* one argument, forwarded after the number */
+{
+ return __syscall(n, a);
+}
+static __inline long __syscall2(long n, long a, long b) /* two arguments, forwarded in order */
+{
+ return __syscall(n, a, b);
+}
+static __inline long __syscall3(long n, long a, long b, long c) /* three arguments, forwarded in order */
+{
+ return __syscall(n, a, b, c);
+}
+static __inline long __syscall4(long n, long a, long b, long c, long d) /* four arguments, forwarded in order */
+{
+ return __syscall(n, a, b, c, d);
+}
+static __inline long __syscall5(long n, long a, long b, long c, long d, long e) /* five arguments, forwarded in order */
+{
+ return __syscall(n, a, b, c, d, e);
+}
+static __inline long __syscall6(long n, long a, long b, long c, long d, long e, long f) /* six arguments: the most the trampoline shuffles */
+{
+ return __syscall(n, a, b, c, d, e, f);
+}
+
+#define SYSCALL_FADVISE_6_ARG
+#define SYSCALL_IPC_BROKEN_MODE
diff --git a/vendor/upstream/musl-1.2.5-overrides/src/internal/riscv64/get_tp.s b/vendor/upstream/musl-1.2.5-overrides/src/internal/riscv64/get_tp.s
@@ -0,0 +1,14 @@
+/* tcc-build riscv64 thread-pointer reader.
+ *
+ * Returns `tp` (x4) in a0. Stock musl spells this as inline asm with
+ * an output operand; tcc 0.9.26 has no riscv64 inline-asm operand
+ * path. tcc-asm has `mv` and `jalr`; it lacks the `ret` pseudo, so
+ * the canonical `jalr zero, ra, 0` form is used (matches tcc-libc/
+ * riscv64/sys_stubs.S).
+ */
+
+ .global __get_tp
+ .type __get_tp, %function
+__get_tp:
+ mv a0, tp /* return value: copy the thread pointer register into a0 */
+ jalr zero, ra, 0 /* return to the caller via ra; stands in for the `ret` pseudo tcc-asm lacks */
diff --git a/vendor/upstream/musl-1.2.5-overrides/src/internal/riscv64/syscall.s b/vendor/upstream/musl-1.2.5-overrides/src/internal/riscv64/syscall.s
@@ -0,0 +1,23 @@
+/* tcc-build riscv64 syscall trampoline.
+ *
+ * C-ABI brings the syscall number in a0 and arguments in a1-a6;
+ * Linux riscv64 ABI wants the number in a7 and arguments in a0-a5.
+ * Shuffle and ecall; the kernel returns into a0 already.
+ *
+ * tcc-asm lacks the `ret` pseudo, so use the canonical
+ * `jalr zero, ra, 0` form (matches tcc-libc/riscv64/sys_stubs.S).
+ */
+
+ .global __syscall
+ .hidden __syscall
+ .type __syscall, %function
+__syscall:
+ mv a7, a0 /* syscall number: first C argument moves to a7 */
+ mv a0, a1 /* shift the C arguments down one register, lowest first, so each source is read before it is overwritten */
+ mv a1, a2
+ mv a2, a3
+ mv a3, a4
+ mv a4, a5
+ mv a5, a6 /* sixth and last syscall argument */
+ ecall /* enter the kernel; the result comes back in a0 */
+ jalr zero, ra, 0 /* return to the caller via ra; stands in for the `ret` pseudo tcc-asm lacks */
diff --git a/vendor/upstream/musl-1.2.5-overrides/src/thread/riscv64/__set_thread_area.s b/vendor/upstream/musl-1.2.5-overrides/src/thread/riscv64/__set_thread_area.s
@@ -0,0 +1,13 @@
+/* tcc-build riscv64 __set_thread_area.
+ *
+ * Replaces upstream's three-line stub which uses the `ret` pseudo
+ * tcc-asm doesn't recognize. Behaviorally identical: copy a0 into the
+ * thread-pointer register tp, return 0.
+ */
+
+ .global __set_thread_area
+ .type __set_thread_area, %function
+__set_thread_area:
+ mv tp, a0 /* install the caller-supplied value as the thread pointer */
+ li a0, 0 /* return 0 */
+ jalr zero, ra, 0 /* return to the caller via ra; stands in for the `ret` pseudo tcc-asm lacks */
diff --git a/vendor/upstream/musl-1.2.5-skip-amd64.txt b/vendor/upstream/musl-1.2.5-skip-amd64.txt
@@ -0,0 +1,12 @@
+src/math/__rem_pio2l.c
+src/math/__sinl.c
+src/math/__tanl.c
+src/math/erfl.c
+src/math/lgammal.c
+src/math/modfl.c
+src/math/pow_data.c
+src/math/powl.c
+src/math/rintl.c
+src/math/roundl.c
+src/math/tgammal.c
+src/math/x86_64/exp2l.s
diff --git a/vendor/upstream/musl-1.2.5-skip-riscv64.txt b/vendor/upstream/musl-1.2.5-skip-riscv64.txt
@@ -0,0 +1,3 @@
+src/math/log.c
+src/math/pow_data.c
+src/thread/__unmapself.c