commit 23596093076569e952f968e2a764d44310332f82
parent 35bf8d6e9e0113647636411fd4b58c1d06fe94c5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 4 May 2026 13:09:16 -0700
boot4: riscv64 musl patches — 135 -> 3 skips, hello runs
Add riscv64 to the musl patch set, mirroring the aarch64 strategy:
route every inline-asm site that riscv64-asm.c's stub
subst_asm_operand can't handle through pure-asm trampolines or
extern function calls.
Patch additions in vendor/upstream/musl-1.2.5-tcc.patch:
- arch/riscv64/syscall_arch.h static __inline wrappers calling __syscall
- src/internal/riscv64/syscall.s C-ABI -> kernel-ABI shuffle, ecall
- arch/riscv64/pthread_arch.h __get_tp as extern
- src/internal/riscv64/get_tp.s mv a0, tp
- arch/riscv64/atomic_arch.h a_barrier/a_cas/a_cas_p as externs
- src/internal/riscv64/atomic.s lr/sc/bne/fence as raw .word; control
flow as plain mnemonics; `ret` spelled `jalr x0, x1, 0`
- arch/riscv64/crt_arch.h drop tail/.option/lla gp (static-only, GP
relaxation unnecessary); pass _DYNAMIC=NULL; jal x0, _start_c
- src/thread/riscv64/__set_thread_area.s same instructions, but `ret`
-> `jalr x0, x1, 0`. Load-bearing for __init_tp on riscv64 (Linux
has no SYS_set_thread_area; the generic fallback would a_crash).
scripts/boot4-musl-shim-riscv64.h: replace the placeholder with the
lp64 va_arg pointer-arithmetic macro from tcc/include/stdarg.h
(__builtin_va_list = char *, no helper-call required, unlike amd64).
scripts/boot4.sh riscv64 deletion sweep: src/math/riscv64/*.c
(FPU "=f" inline asm), src/{fenv,setjmp,signal,thread,process}/
riscv64/*.{s,S,c} except __set_thread_area.s. Portable C in
src/{math,fenv} takes over; libc.a will lack clone, syscall_cp,
__unmapself helper, vfork, restore, setjmp/longjmp/sigsetjmp.
Result: riscv64 compile reaches 1268/1271; the 3 remaining skips are
src/math/{log,pow_data}.c (long-double constant folding, same tcc
0.9.26 bug as amd64) and src/thread/__unmapself.c (CRTJMP inline asm
needs subst_asm_operand). Hello links to a 69 KB static binary, runs
in scratch+busybox container, prints argc + strdup demo. amd64 still
green; aarch64 unchanged from previous commit.
Diffstat:
4 files changed, 488 insertions(+), 16 deletions(-)
diff --git a/docs/MUSL.md b/docs/MUSL.md
@@ -4,8 +4,9 @@ Working doc. boot3 produces a self-host fixed-point tcc (`tcc2 == tcc3`).
boot4 takes that compiler and uses it to build [musl
1.2.5](https://musl.libc.org/) from upstream source plus a small set of
tcc-compatibility patches, then links and runs a static hello world.
-The harness is wired for amd64, aarch64, and riscv64; only **amd64** is
-verified end-to-end today. Modeled on the in-image build in
+The harness is wired for amd64, aarch64, and riscv64; **amd64** and
+**riscv64** are verified end-to-end (aarch64 still blocks at link due
+to tcc 0.9.26 codegen bugs). Modeled on the in-image build in
[/Users/ryan/tmp/musltcc](file:///Users/ryan/tmp/musltcc) but driven
from this repo's bootstrap and constrained to scratch+busybox.
@@ -41,7 +42,7 @@ The container is the same `boot2-scratch:$ARCH` boot3 uses (FROM scratch
|---------|:--:|:--:|:--:|:--:|
| amd64 | ✓ | ✓ | ✓ | ✓ verified |
| aarch64 | ✓ | aarch64-targeted patches landed (syscall trampoline, atomic externs, get_tp helper, replacement crt_arch.h, deletion sweep). Compile reaches 1251/1271; 20 skips. | mirrors tcc 0.9.26 AAPCS register-save struct, gets past the `va_list` typename | **blocks at hello link** — load-bearing files hit tcc codegen bugs (see below) |
-| riscv64 | ✓ | partial — same shared-hunk caveat | placeholder ([scripts/boot4-musl-shim-riscv64.h](../scripts/boot4-musl-shim-riscv64.h)) — `typedef char *__builtin_va_list;`, may conflict with tcc's builtin keyword recognition | not yet exercised |
+| riscv64 | ✓ | riscv64-targeted patches landed (mirrors aarch64: syscall trampoline, atomic externs, get_tp helper, replacement crt_arch.h, patched __set_thread_area, deletion sweep). Compile reaches **1268/1271**; 3 skips. | mirrors tcc 0.9.26 riscv64 stdarg.h: `__builtin_va_list = char *`, va_arg as the lp64 pointer-arithmetic macro | ✓ verified |
### aarch64 status after patch round 1
@@ -127,6 +128,66 @@ The 17 codegen-bug files are the irreducible residual either way — tcc
0.9.26 has aarch64 codegen issues that would need fixing in the
compiler itself.
+### riscv64 status after patch round 1
+
+riscv64 mirrors the aarch64 strategy: tcc 0.9.26's `riscv64-asm.c`
+has a real upstream assembler for the base ISA, but
+`subst_asm_operand` is a stub (`tcc_error("RISCV64 asm not
+implemented.")`), so every musl inline-asm site that uses operands
+fails. The lr/sc atomics, the named `fence rw,rw` form, and a handful
+of pseudo-instructions (`tail`, `j`, `ret`) are also absent.
+
+Patches added (mirrors the aarch64 set):
+
+- `arch/riscv64/syscall_arch.h` — static `__inline` wrappers calling
+ one variadic `__syscall` trampoline
+- `src/internal/riscv64/syscall.s` — C-ABI → kernel-ABI shuffle, `ecall`
+- `arch/riscv64/pthread_arch.h` — `__get_tp` extern
+- `src/internal/riscv64/get_tp.s` — `mv a0, tp; jalr x0, x1, 0`
+- `arch/riscv64/atomic_arch.h` — `a_barrier` / `a_cas` / `a_cas_p` as externs
+- `src/internal/riscv64/atomic.s` — `lr.w/d.aqrl`, `sc.w/d.aqrl`,
+ `bne ±12`, `fence rw, rw` as raw `.word` encodings; `mv` and the
+ `jalr x0, x1, 0` return as plain mnemonics
+- `arch/riscv64/crt_arch.h` — drop `tail` / `.option push/norelax/pop`
+ / `lla gp, __global_pointer$` (static-only, GP relaxation
+ unnecessary); pass `_DYNAMIC = NULL`; tail-call via `jal x0, _start_c`
+- `src/thread/riscv64/__set_thread_area.s` — spell the `ret` pseudo
+ as `jalr x0, x1, 0` and `li a0, 0` as `addi a0, x0, 0` (`mv` is stock).
+ This file is on the `__init_tp` startup path; without it the
+ generic C fallback returns -ENOSYS (no `SYS_set_thread_area` on
+ riscv64) and `a_crash()` fires before main runs.
+
+Plus the per-arch va_list shim. tcc 0.9.26's stock riscv64 `stdarg.h`
+spells `__builtin_va_list` as `char *` and implements `va_arg` as the
+lp64 pointer-arithmetic macro (no helper-call required, unlike the
+amd64 path). The shim mirrors that exactly so musl's `<stdarg.h>` /
+`bits/alltypes.h` typedefs and macros resolve under `-nostdinc`.
+
+Plus `boot4.sh` deletion sweep: `src/math/riscv64/*.c` (FPU inline asm
+with `"=f"` constraints — portable C in `src/math/` takes over),
+`src/fenv/riscv64/*`, `src/setjmp/riscv64/*.S`, `src/signal/riscv64/*.s`,
+the remaining `src/thread/riscv64/*.s` (clone, syscall_cp,
+__unmapself), `src/process/riscv64/vfork.s` — all use displacement
+load/store syntax (`sd rs, off(rd)`), `csr*` mnemonics, or the
+missing pseudos (`j`, `ret`). libc.a will lack clone, syscall_cp,
+setjmp/longjmp/sigsetjmp, vfork, restore, riscv64 fenv overrides — fine for hello.
+
+Result: **1268/1271 sources compile**, libc.a 2.77 MB, hello 69 KB,
+runs in scratch+busybox container. The 3 remaining skips are:
+
+- `src/math/log.c`, `src/math/pow_data.c` — long-double constant-
+ initializer folding (same tcc 0.9.26 bug that skips 11 files on
+ amd64).
+- `src/thread/__unmapself.c` — `arch/riscv64/reloc.h`'s `CRTJMP`
+ macro is `__asm__("mv sp, %1 ; jr %0" : : "r"(pc), "r"(sp) :
+ "memory")`, which needs `subst_asm_operand`. Not on the hello
+ path; called only when threads exit.
+
+Cleaner residual than amd64 (3 vs 11) and dramatically cleaner than
+aarch64 (3 vs 20) — riscv64 has no equivalent of aarch64's tcc
+codegen-bug bucket, so every asm-shaped failure except
+`__unmapself.c`'s `CRTJMP` cleared with the patch round.
+
## Inputs
| Path | Contents |
@@ -219,9 +280,10 @@ hello ~55 KB static ELF, runs in container, prints argc + strdup dem
with what tcc's internal linker synthesizes for `-static` binaries.
- `compat/time32` skipped — 32-bit time_t aliases, irrelevant on
x86_64 and produces duplicate-symbol errors.
-- aarch64 / riscv64: not yet. Patches are x86_64-specific (syscall
- trampoline, dropped overrides) and aarch64 also has no in-tcc
- assembler in 0.9.26 (see `docs/TCC-ARM64-ASM.md`).
+- aarch64: blocks at link — see "aarch64 status after patch round 1"
+ for the remaining tcc 0.9.26 codegen bugs.
+- riscv64: end-to-end works after patch round 1 (see "riscv64 status
+ after patch round 1"). 3 residual skips, none on the hello path.
- No `make`, `busybox`, or further userland — boot4 stops at hello.
The musltcc demo continues to GNU make 4.4.1 and busybox 1.36.1; that
pipeline could plug in here once the libc is solid.
diff --git a/scripts/boot4-musl-shim-riscv64.h b/scripts/boot4-musl-shim-riscv64.h
@@ -1,18 +1,40 @@
/* boot4 va_list shim for compiling musl with tcc 0.9.26 on riscv64.
*
- * On riscv64, tcc 0.9.26's <stdarg.h> spells va_list as `char *` and
- * relies on `__builtin_va_start` / `_arg` / `_copy` / `_end` being
- * compiler-recognized intrinsics — there is no per-arg-type
- * decomposition macro to provide here. The shim only needs to make
- * `__builtin_va_list` resolvable so musl's bits/alltypes.h
+ * musl's stdarg.h and bits/alltypes.h spell varargs the GCC way:
*
* typedef __builtin_va_list va_list;
+ * #define va_start(v,l) __builtin_va_start(v,l)
+ * ...
*
- * succeeds; tcc handles the va_* operations internally.
+ * tcc 0.9.26's riscv64 stdarg.h spells va_list as `char *` and
+ * implements va_arg / va_copy / va_end as plain pointer-arithmetic
+ * macros. Mirror that here, since musl is compiled with -nostdinc
+ * (tcc's stdarg.h is unreachable on its own); this header is
+ * `-include`d on every musl translation unit.
*
- * Untested — see docs/MUSL.md "multi-arch status". May conflict with
- * tcc's own builtin keyword recognition (tcc 0.9.26 may treat
- * __builtin_va_list as a reserved name on riscv64); revisit when first
- * exercising this arch. */
+ * The va_arg expansion is the riscv64 lp64/lp64d branch from
+ * tcc/include/stdarg.h: 8-byte slots, args ≤ 16 bytes by-value, args
+ * > 16 bytes by-pointer. __builtin_va_start is recognized internally
+ * by tcc's frontend; we only need to bridge the typedef and the
+ * remaining macros.
+ *
+ * Exercised end-to-end only by the boot4 hello — may need adjustment
+ * once scanf/syslog/openat-style varargs callers are first exercised. */
typedef char *__builtin_va_list;
+
+#define __va_reg_size 8 /* __riscv_xlen / 8, fixed for rv64 */
+
+#define _tcc_align(addr,type) \
+ (((unsigned long)(addr) + __alignof__(type) - 1) & -(__alignof__(type)))
+
+#define __builtin_va_arg(ap, type) \
+ (*(sizeof(type) > (2 * __va_reg_size) \
+ ? *(type **)((ap += __va_reg_size) - __va_reg_size) \
+ : (ap = (__builtin_va_list)(_tcc_align(ap, type) + \
+ (sizeof(type) + __va_reg_size - 1) & -__va_reg_size), \
+ (type *)(ap - ((sizeof(type) + __va_reg_size - 1) & \
+ -__va_reg_size)))))
+
+#define __builtin_va_copy(dest, src) ((dest) = (src))
+#define __builtin_va_end(ap) ((void)(ap))
diff --git a/scripts/boot4.sh b/scripts/boot4.sh
@@ -197,6 +197,27 @@ rm -f musl-1.2.5/src/thread/aarch64/__unmapself.s
rm -f musl-1.2.5/src/setjmp/aarch64/setjmp.s
rm -f musl-1.2.5/src/setjmp/aarch64/longjmp.s
rm -f musl-1.2.5/src/signal/aarch64/sigsetjmp.s
+# riscv64 sweep: math overrides use FPU inline asm with "=f" operand
+# constraints (subst_asm_operand stub on riscv64-asm.c); fenv.S /
+# setjmp.S / clone.s / etc. use displacement-load syntax, csr*
+# mnemonics, and the `j` / `ret` pseudos absent from riscv64-tok.h.
+# Drop them — portable C in src/{math,fenv} takes over; libc.a will
+# lack clone, syscall_cp, setjmp/longjmp/sigsetjmp, vfork, restore,
+# and the riscv64 fenv overrides. Boot4 hello doesn't reach them.
+rm -f musl-1.2.5/src/math/riscv64/*.c
+rm -f musl-1.2.5/src/fenv/riscv64/fenv.S
+rm -f musl-1.2.5/src/fenv/riscv64/fenv-sf.c
+rm -f musl-1.2.5/src/setjmp/riscv64/setjmp.S
+rm -f musl-1.2.5/src/setjmp/riscv64/longjmp.S
+rm -f musl-1.2.5/src/signal/riscv64/sigsetjmp.s
+rm -f musl-1.2.5/src/signal/riscv64/restore.s
+rm -f musl-1.2.5/src/thread/riscv64/clone.s
+rm -f musl-1.2.5/src/thread/riscv64/syscall_cp.s
+rm -f musl-1.2.5/src/thread/riscv64/__unmapself.s
+# (keep src/thread/riscv64/__set_thread_area.s — patched to use the
+# explicit `jalr x0, x1, 0` form of `ret`. It's on the __init_tp /
+# __libc_start_main path, so hello cannot start without it.)
+rm -f musl-1.2.5/src/process/riscv64/vfork.s
# ── Stage C: configure + generate musl headers ────────────────────────
cd /tmp/musl-1.2.5
diff --git a/vendor/upstream/musl-1.2.5-tcc.patch b/vendor/upstream/musl-1.2.5-tcc.patch
@@ -281,6 +281,243 @@ diff -urN orig_musl/musl-1.2.5/arch/aarch64/syscall_arch.h patched_musl/musl-1.2
-#define IPC_64 0
+#define SYSCALL_FADVISE_6_ARG
+#define SYSCALL_IPC_BROKEN_MODE
+diff -urN orig_musl/musl-1.2.5/arch/riscv64/atomic_arch.h patched_musl/musl-1.2.5/arch/riscv64/atomic_arch.h
+--- a/musl-1.2.5/arch/riscv64/atomic_arch.h 2026-05-04 12:39:15
++++ b/musl-1.2.5/arch/riscv64/atomic_arch.h 2026-05-04 12:39:35
+@@ -1,38 +1,22 @@
+-#define a_barrier a_barrier
+-static inline void a_barrier()
+-{
+- __asm__ __volatile__ ("fence rw,rw" : : : "memory");
+-}
++/* tcc-build riscv64 atomic_arch.h replacement.
++ *
++ * Stock musl provides a_barrier / a_cas / a_cas_p as inline asm with
++ * output operand constraints + lr.w.aqrl / sc.w.aqrl / bne-to-label.
++ * riscv64-asm.c's subst_asm_operand is a stub
++ * (`tcc_error("RISCV64 asm not implemented.")`); the lr/sc family,
++ * the named-fence form, and bne-to-label are also outside the
++ * in-tree assembler's surface. Route through pure-asm externs in
++ * src/internal/riscv64/atomic.s, which spell the lr/sc/branch
++ * instructions as raw .word encodings.
++ *
++ * Mirrors the aarch64 strategy (different primitive set: aarch64 musl
++ * exposes a_ll/a_sc and lets atomic.h derive a_cas; riscv64 musl
++ * exposes a_cas/a_cas_p directly). */
+
+-#define a_cas a_cas
+-static inline int a_cas(volatile int *p, int t, int s)
+-{
+- int old, tmp;
+- __asm__ __volatile__ (
+- "\n1: lr.w.aqrl %0, (%2)\n"
+- " bne %0, %3, 1f\n"
+- " sc.w.aqrl %1, %4, (%2)\n"
+- " bnez %1, 1b\n"
+- "1:"
+- : "=&r"(old), "=&r"(tmp)
+- : "r"(p), "r"((long)t), "r"((long)s)
+- : "memory");
+- return old;
+-}
++extern void a_barrier (void);
++extern int a_cas (volatile int *, int, int);
++extern void *a_cas_p (volatile void *, void *, void *);
+
+-#define a_cas_p a_cas_p
+-static inline void *a_cas_p(volatile void *p, void *t, void *s)
+-{
+- void *old;
+- int tmp;
+- __asm__ __volatile__ (
+- "\n1: lr.d.aqrl %0, (%2)\n"
+- " bne %0, %3, 1f\n"
+- " sc.d.aqrl %1, %4, (%2)\n"
+- " bnez %1, 1b\n"
+- "1:"
+- : "=&r"(old), "=&r"(tmp)
+- : "r"(p), "r"(t), "r"(s)
+- : "memory");
+- return old;
+-}
++#define a_barrier a_barrier
++#define a_cas a_cas
++#define a_cas_p a_cas_p
+diff -urN orig_musl/musl-1.2.5/arch/riscv64/crt_arch.h patched_musl/musl-1.2.5/arch/riscv64/crt_arch.h
+--- a/musl-1.2.5/arch/riscv64/crt_arch.h 2026-05-04 12:39:15
++++ b/musl-1.2.5/arch/riscv64/crt_arch.h 2026-05-04 12:39:47
+@@ -1,19 +1,24 @@
++/* tcc-build riscv64 crt_arch.h replacement.
++ *
++ * Stock musl uses the `tail` pseudo-instruction (auipc + jalr), the
++ * `.option push/norelax/pop` directives, and `lla gp,
++ * __global_pointer$` to set up GP-relative addressing for linker
++ * relaxation. tcc 0.9.26's riscv64 assembler has none of those:
++ * `tail` isn't in riscv64-tok.h, `.option` has no handler in
++ * tccasm.c, and the internal linker doesn't relax — so GP setup is
++ * unnecessary. Drop the GP block and tail-call via plain
++ * `jal x0, _start_c`; R_RISCV_JAL is in range (boot4 hello is ~69
++ * KB and _start_c sits in the same TU).
++ *
++ * _DYNAMIC is unused for static-only builds — pass NULL as arg1.
++ */
+ __asm__(
+-".section .sdata,\"aw\"\n"
+ ".text\n"
+ ".global " START "\n"
+ ".type " START ",%function\n"
+ START ":\n"
+-".weak __global_pointer$\n"
+-".hidden __global_pointer$\n"
+-".option push\n"
+-".option norelax\n\t"
+-"lla gp, __global_pointer$\n"
+-".option pop\n\t"
+ "mv a0, sp\n"
+-".weak _DYNAMIC\n"
+-".hidden _DYNAMIC\n\t"
+-"lla a1, _DYNAMIC\n\t"
++"mv a1, x0\n"
+ "andi sp, sp, -16\n\t"
+-"tail " START "_c"
++"jal x0, " START "_c"
+ );
+diff -urN orig_musl/musl-1.2.5/arch/riscv64/pthread_arch.h patched_musl/musl-1.2.5/arch/riscv64/pthread_arch.h
+--- a/musl-1.2.5/arch/riscv64/pthread_arch.h 2026-05-04 12:39:15
++++ b/musl-1.2.5/arch/riscv64/pthread_arch.h 2026-05-04 12:39:58
+@@ -1,10 +1,12 @@
+-static inline uintptr_t __get_tp()
+-{
+- uintptr_t tp;
+- __asm__ __volatile__("mv %0, tp" : "=r"(tp));
+- return tp;
+-}
++/* tcc-build riscv64 pthread_arch.h replacement.
++ *
++ * Stock `__asm__("mv %0, tp" : "=r"(tp))` needs inline-asm operand
++ * constraints — riscv64-asm.c's subst_asm_operand is a stub. Route
++ * through extern __get_tp implemented in src/internal/riscv64/get_tp.s
++ * (single-instruction `mv a0, tp`). */
+
++extern unsigned long __get_tp(void);
++
+ #define TLS_ABOVE_TP
+ #define GAP_ABOVE_TP 0
+
+diff -urN orig_musl/musl-1.2.5/arch/riscv64/syscall_arch.h patched_musl/musl-1.2.5/arch/riscv64/syscall_arch.h
+--- a/musl-1.2.5/arch/riscv64/syscall_arch.h 2026-05-04 12:39:15
++++ b/musl-1.2.5/arch/riscv64/syscall_arch.h 2026-05-04 12:40:13
+@@ -1,78 +1,53 @@
++/* tcc-build riscv64 syscall_arch.h replacement.
++ *
++ * Stock musl uses GCC register-asm-variable inline asm
++ * (`register long a7 __asm__("a7") = n; ... ecall`); tcc 0.9.26
++ * lacks that GCC extension and riscv64-asm.c's subst_asm_operand is
++ * a stub. Route every __syscallN through one trampoline implemented
++ * in pure asm (src/internal/riscv64/syscall.s); the trampoline
++ * shuffles C-ABI a0..a6 into kernel-ABI a7 + a0..a5 and issues
++ * `ecall`.
++ *
++ * The wrappers are static __inline functions, not macros — musl's
++ * src/internal/syscall.h applies a `#define __syscallN(...)` wrapper
++ * which would defeat a macro-form replacement (CPP self-reference
++ * rule).
++ *
++ * Mirrors the aarch64 / x86_64 strategy. */
++
+ #define __SYSCALL_LL_E(x) (x)
+ #define __SYSCALL_LL_O(x) (x)
+
+-#define __asm_syscall(...) \
+- __asm__ __volatile__ ("ecall\n\t" \
+- : "=r"(a0) : __VA_ARGS__ : "memory"); \
+- return a0; \
++extern long __syscall(long, ...);
+
+-static inline long __syscall0(long n)
++static __inline long __syscall0(long n)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0");
+- __asm_syscall("r"(a7))
++ return __syscall(n);
+ }
+-
+-static inline long __syscall1(long n, long a)
++static __inline long __syscall1(long n, long a)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- __asm_syscall("r"(a7), "0"(a0))
++ return __syscall(n, a);
+ }
+-
+-static inline long __syscall2(long n, long a, long b)
++static __inline long __syscall2(long n, long a, long b)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- register long a1 __asm__("a1") = b;
+- __asm_syscall("r"(a7), "0"(a0), "r"(a1))
++ return __syscall(n, a, b);
+ }
+-
+-static inline long __syscall3(long n, long a, long b, long c)
++static __inline long __syscall3(long n, long a, long b, long c)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- register long a1 __asm__("a1") = b;
+- register long a2 __asm__("a2") = c;
+- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2))
++ return __syscall(n, a, b, c);
+ }
+-
+-static inline long __syscall4(long n, long a, long b, long c, long d)
++static __inline long __syscall4(long n, long a, long b, long c, long d)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- register long a1 __asm__("a1") = b;
+- register long a2 __asm__("a2") = c;
+- register long a3 __asm__("a3") = d;
+- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3))
++ return __syscall(n, a, b, c, d);
+ }
+-
+-static inline long __syscall5(long n, long a, long b, long c, long d, long e)
++static __inline long __syscall5(long n, long a, long b, long c, long d, long e)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- register long a1 __asm__("a1") = b;
+- register long a2 __asm__("a2") = c;
+- register long a3 __asm__("a3") = d;
+- register long a4 __asm__("a4") = e;
+- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4))
++ return __syscall(n, a, b, c, d, e);
+ }
+-
+-static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
++static __inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
+ {
+- register long a7 __asm__("a7") = n;
+- register long a0 __asm__("a0") = a;
+- register long a1 __asm__("a1") = b;
+- register long a2 __asm__("a2") = c;
+- register long a3 __asm__("a3") = d;
+- register long a4 __asm__("a4") = e;
+- register long a5 __asm__("a5") = f;
+- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5))
++ return __syscall(n, a, b, c, d, e, f);
+ }
+
+ #define VDSO_USEFUL
+-/* We don't have a clock_gettime function.
+-#define VDSO_CGT_SYM "__vdso_clock_gettime"
+-#define VDSO_CGT_VER "LINUX_2.6" */
+-
+ #define IPC_64 0
diff -urN orig_musl/musl-1.2.5/arch/x86_64/syscall_arch.h patched_musl/musl-1.2.5/arch/x86_64/syscall_arch.h
--- a/musl-1.2.5/arch/x86_64/syscall_arch.h 2024-02-29 18:07:33
+++ b/musl-1.2.5/arch/x86_64/syscall_arch.h 2026-04-28 20:23:48
@@ -3020,6 +3257,109 @@ diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/syscall.s patched_musl/musl-
+ mov x5, x6
+ svc #0
+ ret
+diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/atomic.s patched_musl/musl-1.2.5/src/internal/riscv64/atomic.s
+--- a/musl-1.2.5/src/internal/riscv64/atomic.s 1969-12-31 16:00:00
++++ b/musl-1.2.5/src/internal/riscv64/atomic.s 2026-05-04 12:40:46
+@@ -0,0 +1,51 @@
++/* tcc-build riscv64 atomic primitives.
++ *
++ * Implements the three externs that arch/riscv64/atomic_arch.h
++ * declares; musl's src/internal/atomic.h derives the rest (a_swap,
++ * a_fetch_add, a_or, a_and, a_inc, a_dec, a_store, a_ctz_*,
++ * a_clz_*, …).
++ *
++ * Mnemonics lr.w/lr.d/sc.w/sc.d (with .aqrl ordering), the named
++ * `fence rw,rw` form, and bne-to-immediate-offset are outside the
++ * surface of tcc 0.9.26's riscv64-asm.c — emit raw .word encodings.
++ * `mv` and `jalr` are stock; `jalr x0, x1, 0` is the canonical
++ * encoding of the `ret` pseudo (which tcc's token table omits).
++ *
++ * Encoding cheat sheet (RV64A + base):
++ * lr.w.aqrl rd=a3, (a0) 0x160526AF
++ * lr.d.aqrl rd=a3, (a0) 0x160536AF
++ * sc.w.aqrl rd=a4, a2, (a0) 0x1EC5272F
++ * sc.d.aqrl rd=a4, a2, (a0) 0x1EC5372F
++ * bne a3, a1, +12 0x00B69663
++ * bne a4, x0, -12 0xFE071AE3
++ * fence rw, rw 0x0330000F
++ */
++
++/* void a_barrier(void) -- fence rw, rw */
++.global a_barrier
++.type a_barrier,@function
++a_barrier:
++ .word 0x0330000F
++ jalr x0, x1, 0
++
++/* int a_cas(volatile int *p, int t, int s) -- 32-bit CAS loop */
++.global a_cas
++.type a_cas,@function
++a_cas:
++ .word 0x160526AF /* lr.w.aqrl a3, (a0) */
++ .word 0x00B69663 /* bne a3, a1, +12 (skip to mv if old != expected) */
++ .word 0x1EC5272F /* sc.w.aqrl a4, a2, (a0) */
++ .word 0xFE071AE3 /* bne a4, x0, -12 (retry on SC failure) */
++ mv a0, a3
++ jalr x0, x1, 0
++
++/* void *a_cas_p(volatile void *p, void *t, void *s) -- 64-bit CAS loop */
++.global a_cas_p
++.type a_cas_p,@function
++a_cas_p:
++ .word 0x160536AF /* lr.d.aqrl a3, (a0) */
++ .word 0x00B69663 /* bne a3, a1, +12 */
++ .word 0x1EC5372F /* sc.d.aqrl a4, a2, (a0) */
++ .word 0xFE071AE3 /* bne a4, x0, -12 */
++ mv a0, a3
++ jalr x0, x1, 0
+diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/get_tp.s patched_musl/musl-1.2.5/src/internal/riscv64/get_tp.s
+--- a/musl-1.2.5/src/internal/riscv64/get_tp.s 1969-12-31 16:00:00
++++ b/musl-1.2.5/src/internal/riscv64/get_tp.s 2026-05-04 12:40:30
+@@ -0,0 +1,15 @@
++/* tcc-build riscv64 thread-pointer reader.
++ *
++ * Reads tp (= x4) into a0 and returns. Stock musl's pthread_arch.h
++ * uses `__asm__("mv %0, tp" : "=r"(tp))`; the inline-asm operand
++ * syntax is unsupported by tcc 0.9.26's riscv64-asm.c
++ * (subst_asm_operand is a stub). Trivial single-instruction wrapper
++ * with a tail return; phase-3-equivalent inline-asm support would
++ * let us inline this again.
++ */
++
++.global __get_tp
++.type __get_tp,@function
++__get_tp:
++ mv a0, tp
++ jalr x0, x1, 0
+diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/syscall.s patched_musl/musl-1.2.5/src/internal/riscv64/syscall.s
+--- a/musl-1.2.5/src/internal/riscv64/syscall.s 1969-12-31 16:00:00
++++ b/musl-1.2.5/src/internal/riscv64/syscall.s 2026-05-04 12:40:24
+@@ -0,0 +1,25 @@
++/* tcc-build riscv64 syscall trampoline.
++ *
++ * Mirrors src/internal/x86_64/syscall.s and src/internal/aarch64/
++ * syscall.s. C-ABI passes the syscall number in a0 and arguments
++ * 1..6 in a1..a6; Linux riscv64 syscall ABI wants the number in a7
++ * and arguments in a0..a5. Shuffle and `ecall`. Return value is
++ * already in a0 from the kernel side.
++ *
++ * Mnemonics here (mv, ecall, jalr) are all in tcc 0.9.26's stock
++ * riscv64-asm.c. `ret` is a pseudo not in the token table — emit
++ * the canonical encoding `jalr x0, x1, 0` instead.
++ */
++
++.global __syscall
++.type __syscall,@function
++__syscall:
++ mv a7, a0
++ mv a0, a1
++ mv a1, a2
++ mv a2, a3
++ mv a3, a4
++ mv a4, a5
++ mv a5, a6
++ ecall
++ jalr x0, x1, 0
diff -urN orig_musl/musl-1.2.5/src/internal/syscall.h patched_musl/musl-1.2.5/src/internal/syscall.h
--- a/musl-1.2.5/src/internal/syscall.h 2024-02-29 18:07:33
+++ b/musl-1.2.5/src/internal/syscall.h 2026-04-28 22:05:59
@@ -3550,3 +3890,30 @@ diff -urN orig_musl/musl-1.2.5/src/signal/x86_64/sigsetjmp.s patched_musl/musl-1
- jmp __sigsetjmp_tail
-
-1: jmp setjmp@PLT
+diff -urN orig_musl/musl-1.2.5/src/thread/riscv64/__set_thread_area.s patched_musl/musl-1.2.5/src/thread/riscv64/__set_thread_area.s
+--- a/musl-1.2.5/src/thread/riscv64/__set_thread_area.s 2026-05-04 12:39:16
++++ b/musl-1.2.5/src/thread/riscv64/__set_thread_area.s 2026-05-04 13:00:50
+@@ -1,6 +1,20 @@
++/* tcc-build riscv64 __set_thread_area replacement.
++ *
++ * Identical semantics to the upstream three-instruction file (set
++ * tp, return 0). Differences: `ret` is spelled `jalr x0, x1, 0`
++ * (tcc 0.9.26's riscv64-tok.h has no `ret` token), `li a0, 0` is
++ * spelled `addi a0, x0, 0`, and .type uses `@function`.
++ *
++ * On riscv64 the userspace-visible thread pointer is just the `tp`
++ * register; Linux exposes no SYS_set_thread_area (the generic C
++ * fallback in src/thread/__set_thread_area.c would return -ENOSYS
++ * and __init_tp would a_crash). This file is on the startup path —
++ * called from __init_tp via __libc_start_main, so it must link.
++ */
++
+ .global __set_thread_area
+-.type __set_thread_area, %function
++.type __set_thread_area,@function
+ __set_thread_area:
+ mv tp, a0
+- li a0, 0
+- ret
++ addi a0, x0, 0
++ jalr x0, x1, 0