commit b797882f164a5440762926256347aed75997d2ef
parent 23596093076569e952f968e2a764d44310332f82
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 4 May 2026 13:58:08 -0700
boot4: aarch64 round 2 — tcc store/load VT_CONST|VT_LVAL fix + atomic CAS
tcc 0.9.26 arm64-gen.c's store/load functions handle VT_CONST|VT_LVAL
only when VT_SYM is also set; bare VT_CONST|VT_LVAL (store/load via
literal integer address) fell through to printf+assert(0). Fixed by
mirroring the existing |VT_SYM case but using arm64_movimm to
materialize the address. Trip site: musl files that constant-fold
weak-hidden externs (__libc_start_main, mallocng, abort, asctime_r,
…). Regression test in tests/cc/338-literal-addr-deref.c.
aarch64 musl patch reworked: atomic CAS now lives in a single asm
function (extern a_ll/a_sc deadlooped — call boundary clears the
exclusive monitor); __set_thread_area restored as a replacement
(deletion let __init_tls call into garbage); crt_arch.h simplified to
two instructions (avoids broken mov-xN-imm form, _DYNAMIC handling,
sp realignment); .word→.long throughout (tcc's .word is 2 bytes).
Branch layout reordered with exit blocks before function entries:
arm64-asm.c phase 2 mishandles forward branches to local labels: `b` is
silently emitted as branch-to-self, `b.cond` fails with a CONDBR19 error.
Skip count drops 153 → 20 → 8 across the two rounds; libc.a archives
at 2.95 MB and hello links at 87 KB. Hello prints its first line then
segfaults on a malloc-init oddity that's the next investigation.
Diffstat:
9 files changed, 378 insertions(+), 519 deletions(-)
diff --git a/docs/MUSL.md b/docs/MUSL.md
@@ -41,42 +41,106 @@ The container is the same `boot2-scratch:$ARCH` boot3 uses (FROM scratch
| arch | dispatch | musl patch | va_list shim | end-to-end |
|---------|:--:|:--:|:--:|:--:|
| amd64 | ✓ | ✓ | ✓ | ✓ verified |
-| aarch64 | ✓ | aarch64-targeted patches landed (syscall trampoline, atomic externs, get_tp helper, replacement crt_arch.h, deletion sweep). Compile reaches 1251/1271; 20 skips. | mirrors tcc 0.9.26 AAPCS register-save struct, gets past the `va_list` typename | **blocks at hello link** — load-bearing files hit tcc codegen bugs (see below) |
+| aarch64 | ✓ | aarch64-targeted patches landed (syscall trampoline, atomic CAS via single-fn LL/SC, get_tp helper, replacement crt_arch.h, replacement __set_thread_area, deletion sweep) + arm64-gen.c VT_CONST\|VT_LVAL store/load fix. Compile reaches **1263/1271**; 8 skips. | mirrors tcc 0.9.26 AAPCS register-save struct, gets past the `va_list` typename | hello starts, runs first printf, then segfaults — open mystery (see below) |
| riscv64 | ✓ | riscv64-targeted patches landed (mirrors aarch64: syscall trampoline, atomic externs, get_tp helper, replacement crt_arch.h, patched __set_thread_area, deletion sweep). Compile reaches **1268/1271**; 3 skips. | mirrors tcc 0.9.26 riscv64 stdarg.h: `__builtin_va_list = char *`, va_arg as the lp64 pointer-arithmetic macro | ✓ verified |
-### aarch64 status after patch round 1
-
-After landing the 8-patch set above, the skip count drops from 153
-to 20. All asm-shaped failures are gone (inline-asm operands, dsb
-named-options, missing mnemonics, CONDBR19 relocs); what remains is a
-single tcc 0.9.26 aarch64 codegen bug:
-
-```
-store(0, (1011, 5130, 0))
-assert fail: 0
-```
-
-reproducing across **17 of the 20 remaining skips** (the other 3 are
-long-double, same as amd64). It fires from a tccgen-level store
-assertion that doesn't reference a line number, so the trigger is
-inferred. The 17 affected files include `__libc_start_main`,
-`__init_tls`, `__stack_chk_fail`, `abort`, the entire `mallocng/*` and
-`oldmalloc/malloc.c`, `pthread_join`, `asctime_r`, plus a few math
-sources; both malloc implementations fail the same way, so swapping
-allocators doesn't help.
-
-Hello can't link without `__libc_start_main` and `__libc_free`, both
-in the codegen-bug bucket. Closing the gap requires either:
-
-1. Fixing the codegen path in tcc 0.9.26 aarch64 (out of scope for the
- musl patch).
-2. Writing replacement libc internals as a separate compilation unit:
- minimal `__libc_start_main` (no TLS init, no `__init_ssp`, no
- inline-asm barrier), a tiny mmap-or-bss-backed bump allocator
- exporting `malloc/free/calloc/realloc/aligned_alloc/__libc_free/
- malloc_usable_size`, and a stub `__errno_location` if needed once
- TLS init is bypassed. Substantial and brittle (printf et al. assume
- musl's full init).
+### aarch64 status after patch round 1 + tcc fix
+
+The first round of musl patches dropped the skip count from 153 to 20.
+Of those 20, 17 were tccgen-level `store(0, (1011, 5130, 0)) / assert
+fail: 0` failures across `__libc_start_main`, `__init_tls`, `abort`,
+the entire `mallocng/*`, `oldmalloc/malloc.c`, `pthread_join`,
+`asctime_r`, etc. — the load-bearing files hello needs to link.
+
+Root cause **found and fixed in tcc**:
+[`scripts/simple-patches/tcc-0.9.26/arm64-{store,load}-const-lvalue.{before,after}`](../scripts/simple-patches/tcc-0.9.26/).
+`arm64-gen.c`'s `store` and `load` handle `VT_CONST | VT_LVAL | VT_SYM`
+(store/load via symbol address) but never plain `VT_CONST | VT_LVAL`
+(via integer address). Trips on `*(volatile T *)addr = v;` patterns
+that fall out of musl's weak-hidden-extern code paths after constant
+folding. x86_64 routes through generic `gen_modrm`, riscv64 has an
+explicit `fr == VT_CONST` branch, arm64 just ran into `printf + assert`.
+The patch mirrors the existing `|VT_SYM` case but materializes the
+address with `arm64_movimm` instead of `arm64_sym`. Regression test:
+`tests/cc/338-literal-addr-deref.c`. After this fix the skip count
+drops from 20 to 8.
+
+### aarch64 status after patch round 2 (current)
+
+Patches added in round 2 to push past the residual asm-shaped issues:
+
+- **`arch/aarch64/atomic_arch.h` redesigned** to expose only `a_cas` /
+ `a_cas_p` as externs (plus `a_barrier` / `a_ctz_64` / `a_clz_64`).
+ An earlier attempt with extern `a_ll` / `a_sc` deadlooped: the
+ function-call boundary between `ldaxr` and `stlxr` clears the
+ exclusive monitor on real hardware and on QEMU/Apple Silicon, so
+ the LL/SC retry loop never made progress. musl's
+ `src/internal/atomic.h` derives `a_swap` / `a_fetch_add` / `a_or` /
+ `a_and` / `a_inc` / `a_dec` / `a_store` from `a_cas`.
+- **`src/internal/aarch64/atomic.s`** holds the entire LL/SC pair
+ inside one call. Two arm64-asm.c phase-2 quirks shape the layout:
+ - **forward `b.cond` / `cbz` / `cbnz` to a same-file label** errors
+ with `"CONDBR19 reloc unsupported"`,
+ - **forward unconditional `b` to a same-file label** silently
+ assembles as `b +0` (branch-to-self) — no error, but the function
+ becomes an infinite loop.
+ Backward branches resolve correctly; branches to extern symbols
+ (CALL26/JUMP26) work in either direction. So each function defines
+ its exit block BEFORE the function entry, making every conditional
+ branch backward.
+- **`src/thread/aarch64/__set_thread_area.s` restored** as a
+ replacement (not deletion). Stock musl uses `msr tpidr_el0, x0`;
+ arm64-asm.c phase 1+2 doesn't recognize the `msr` mnemonic, so the
+ encoding is emitted as a raw `.long`. Without this file,
+ `__init_tls` calls undefined `__set_thread_area` whose static-link
+ reference gets silently resolved by tcc, then jumps to garbage
+ before main runs.
+- **`arch/aarch64/crt_arch.h` simplified** to just `mov x0, sp; b _start_c`
+ — drops the `adrp`/`:lo12:_DYNAMIC` sequence (unused for static
+ builds), the `and sp, x0, #-16` alignment (`bic` rejects bitmask-
+ immediate; Linux/AAPCS already 16-byte-aligns sp at process entry),
+ and the `mov x29, #0` / `mov x30, #0` register zeroing (arm64-asm.c
+ encodes `mov xN, #imm` as the 32-bit `MOVZ wN, #imm` form; this was believed to leave the
+ upper 32 bits stale, but W-register writes zero-extend, so re-verify — kernel zeroes GPRs at process entry anyway).
+- Two compounding `.word` → `.long` fixes: tcc's `.word` is **2 bytes**
+ (gas-style for x86), not 4. Every raw-encoding line in `atomic.s`,
+ `get_tp.s`, and `__set_thread_area.s` would have emitted only half
+ the instruction, misaligning subsequent function symbols and tripping
+ `R_AARCH64_(JUMP|CALL)26 relocation failed (val=…, addr=…)` at
+ link.
+
+Result: **1263/1271 compile, 8 skips, libc.a archives at ~2.95 MB,
+hello links at 87 KB.** The 8 skips are the same long-double
+constant-folding files as amd64 (`__cosl.c`, `__sinl.c`, `__tanl.c`,
+`exp2l.c`, `fmaf.c`, `j1f.c`, `pow_data.c`) plus
+`src/thread/__unmapself.c` (inline asm with output operand —
+phase-3-blocked).
+
+### Hello segfault — open mystery
+
+Hello starts up, prints `hello from boot4 (tcc-built musl); argc=4`,
+then segfaults before the second printf. Isolating shows
+`malloc(8)` returns NULL deterministically in some link closures and
+succeeds in others. So far:
+
+- Direct `__syscall(SYS_brk, 0)` works (returns valid break).
+- Direct `__syscall(SYS_mmap, ...)` works (returns valid page).
+- `int main(void) { malloc(8); }` — NULL.
+- `int main(void) { malloc(8); printf(...); }` — still NULL.
+- Same with `extern int *__errno_location(void);` declared — still NULL.
+- Same with `putchar` before malloc — still NULL.
+- Same with extern `__syscall` reference — still NULL.
+- BUT: same after explicitly calling `__syscall(SYS_brk, 0)` once
+ before malloc — succeeds. Repeated mallocs after that all succeed.
+
+The malloc trampoline path is right (verified in isolation); the asm
+primitives are right (verified). The trigger isn't an unresolved
+weak-alias `___errno_location` either — adding strong references to
+it doesn't change the behavior. Looks like an actual bug in mallocng's
+first-call init that depends on something subtle about call ordering
+or the kernel's brk-state-on-first-call semantics under QEMU emulation.
+
+**Pursuing root cause; not papering over with a `brk(0)` warm-up call.**
### aarch64 skip taxonomy (pre-patch snapshot — 153 skipped sources)
@@ -94,40 +158,6 @@ Compiling each skipped file in isolation and bucketing the first error:
| **1** | `pow_data.c` "initializer element is not constant" | same long-double constant-folding bug as on amd64. |
| **1** | crt step: `arch/aarch64/crt_arch.h:15: known instruction expected` | (separate from the 153 — kills the `crt1.o` build.) |
-### Two paths forward for aarch64
-
-The 109 inline-asm failures (79 + 30) plus the 17 `dsb` failures are
-~80% of the wall. All three sit in the assembler/parser. Two ways to
-clear them:
-
-1. **Patch musl** to bypass the failing inline-asm sites (mirrors what
- the x86_64 patch does for syscall_arch.h). Concretely:
- - new `arch/aarch64/syscall_arch.h` calling pure-asm trampolines, +
- `src/internal/aarch64/syscall.s` providing `__syscall0`..`__syscall6`
- - new `arch/aarch64/atomic_arch.h` using `__sync_*` builtins or a
- pure-asm helper file
- - tweak `pthread_arch.h`'s `__get_tp` to call a one-instruction asm
- helper (`mrs x0, tpidr_el0; ret`)
- - delete `src/math/aarch64/*.c` overrides (the portable C falls
- back, like the amd64 patch deletes `src/math/x86_64/*.c`)
- - drop the rest of `src/{thread,setjmp,signal,fenv,string}/aarch64/*.{s,S}`
- and let the portable C versions cover them
- - skip the 17 tcc-codegen-bug files like the long-double set
-
- Scope: similar to the existing musl patch — a few hundred lines of
- replacement code + a deletion list. Boot4 reaches a working hello.
-
-2. **Finish arm64-asm.c phase 3** ([docs/TCC-ARM64-ASM.md](TCC-ARM64-ASM.md)).
- Then nearly all 109 + 17 + a few mnemonics resolve themselves and
- musl's aarch64 source is consumed unmodified. Larger lift but
- higher leverage — also unlocks any other inline-asm-using code we
- compile later. Doesn't address the 17 codegen bugs, which still
- need either tcc fixes or per-file skips.
-
-The 17 codegen-bug files are the irreducible residual either way — tcc
-0.9.26 has aarch64 codegen issues that would need fixing in the
-compiler itself.
-
### riscv64 status after patch round 1
riscv64 mirrors the aarch64 strategy: tcc 0.9.26's `riscv64-asm.c`
diff --git a/scripts/boot4.sh b/scripts/boot4.sh
@@ -192,8 +192,9 @@ rm -f musl-1.2.5/src/string/aarch64/memcpy.S
rm -f musl-1.2.5/src/fenv/aarch64/fenv.s
rm -f musl-1.2.5/src/thread/aarch64/clone.s
rm -f musl-1.2.5/src/thread/aarch64/syscall_cp.s
-rm -f musl-1.2.5/src/thread/aarch64/__set_thread_area.s
rm -f musl-1.2.5/src/thread/aarch64/__unmapself.s
+# __set_thread_area.s is REPLACED in the patch (msr as raw .long), not deleted —
+# __init_tls.c calls it during process init.
rm -f musl-1.2.5/src/setjmp/aarch64/setjmp.s
rm -f musl-1.2.5/src/setjmp/aarch64/longjmp.s
rm -f musl-1.2.5/src/signal/aarch64/sigsetjmp.s
diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-load-const-lvalue.after b/scripts/simple-patches/tcc-0.9.26/arm64-load-const-lvalue.after
@@ -0,0 +1,26 @@
+ if (svr == (VT_CONST | VT_LVAL | VT_SYM)) {
+ arm64_sym(30, sv->sym, svcul); // use x30 for address
+ if (IS_FREG(r))
+ arm64_ldrv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_ldrx(!(svtt & VT_UNSIGNED), arm64_type_size(svtt),
+ intr(r), 30, 0);
+ return;
+ }
+
+ if (svr == (VT_CONST | VT_LVAL)) {
+ /* boot2: load via literal integer address (no VT_SYM).
+ * Mirror of the store-side patch (arm64-store-const-lvalue):
+ * stock arm64-gen.c handles VT_CONST|VT_LVAL only when VT_SYM
+ * is also set. Materialize the constant address in x30, then
+ * use the standard indirect-load path. */
+ arm64_movimm(30, svcul);
+ if (IS_FREG(r))
+ arm64_ldrv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_ldrx(!(svtt & VT_UNSIGNED), arm64_type_size(svtt),
+ intr(r), 30, 0);
+ return;
+ }
+
+ if (svr == (VT_CONST | VT_SYM)) {
diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-load-const-lvalue.before b/scripts/simple-patches/tcc-0.9.26/arm64-load-const-lvalue.before
@@ -0,0 +1,11 @@
+ if (svr == (VT_CONST | VT_LVAL | VT_SYM)) {
+ arm64_sym(30, sv->sym, svcul); // use x30 for address
+ if (IS_FREG(r))
+ arm64_ldrv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_ldrx(!(svtt & VT_UNSIGNED), arm64_type_size(svtt),
+ intr(r), 30, 0);
+ return;
+ }
+
+ if (svr == (VT_CONST | VT_SYM)) {
diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-store-const-lvalue.after b/scripts/simple-patches/tcc-0.9.26/arm64-store-const-lvalue.after
@@ -0,0 +1,28 @@
+ if (svr == (VT_CONST | VT_LVAL | VT_SYM)) {
+ arm64_sym(30, sv->sym, svcul); // use x30 for address
+ if (IS_FREG(r))
+ arm64_strv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_strx(arm64_type_size(svtt), intr(r), 30, 0);
+ return;
+ }
+
+ if (svr == (VT_CONST | VT_LVAL)) {
+ /* boot2: store via literal integer address (no VT_SYM).
+ * Stock arm64-gen.c lacks this case; x86_64 routes through
+ * gen_modrm and riscv64 has an explicit fr==VT_CONST branch.
+ * Triggered by musl when storing through pointers tcc has
+ * folded to constants (e.g. weak-hidden refs in
+ * __libc_start_main, mallocng). Mirrors the |VT_SYM case
+ * above but materializes the address with arm64_movimm
+ * instead of arm64_sym. */
+ arm64_movimm(30, svcul);
+ if (IS_FREG(r))
+ arm64_strv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_strx(arm64_type_size(svtt), intr(r), 30, 0);
+ return;
+ }
+
+ printf("store(%x, (%x, %x, %llx))\n", r, svtt, sv->r, (long long)svcul);
+ assert(0);
diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-store-const-lvalue.before b/scripts/simple-patches/tcc-0.9.26/arm64-store-const-lvalue.before
@@ -0,0 +1,11 @@
+ if (svr == (VT_CONST | VT_LVAL | VT_SYM)) {
+ arm64_sym(30, sv->sym, svcul); // use x30 for address
+ if (IS_FREG(r))
+ arm64_strv(arm64_type_size(svtt), fltr(r), 30, 0);
+ else
+ arm64_strx(arm64_type_size(svtt), intr(r), 30, 0);
+ return;
+ }
+
+ printf("store(%x, (%x, %x, %llx))\n", r, svtt, sv->r, (long long)svcul);
+ assert(0);
diff --git a/scripts/stage1-flatten.sh b/scripts/stage1-flatten.sh
@@ -174,6 +174,16 @@ apply_our_patch aarch64-stdarg-array "$SRC/include/stdarg.h"
apply_our_patch arm64-va-pointer-operand "$SRC/arm64-gen.c"
apply_our_patch arm64-va-arg-pointer "$SRC/arm64-gen.c"
+# AArch64 codegen: store/load through a literal integer address
+# (VT_CONST | VT_LVAL without VT_SYM). Stock arm64-gen.c only handles
+# the |VT_SYM case; bare integer addresses fall through to the
+# `printf + assert(0)` tail. Hits in musl when tcc folds weak-hidden
+# refs in __libc_start_main/mallocng. Patch is gated by the
+# surrounding store/load functions which exist only under
+# TCC_TARGET_ARM64.
+apply_our_patch arm64-store-const-lvalue "$SRC/arm64-gen.c"
+apply_our_patch arm64-load-const-lvalue "$SRC/arm64-gen.c"
+
# AArch64 assembler — phase 1. Drops in arm64-asm.c + arm64-tok.h and
# wires their includes into tcc.h, libtcc.c, and tcctok.h. Patches are
# gated by TCC_TARGET_ARM64 in the surrounding source so they no-op on
diff --git a/tests/cc/338-literal-addr-deref.c b/tests/cc/338-literal-addr-deref.c
@@ -0,0 +1,40 @@
+/* Store/load through a pointer whose address is a plain integer
+ * constant (no symbol). On aarch64, stock arm64-gen.c (0.9.26) handles
+ * only the VT_CONST | VT_LVAL | VT_SYM case in store/load and asserts
+ * out on bare VT_CONST | VT_LVAL — i.e. `*(volatile T*)0x1234`. The
+ * matching x86_64 path goes through gen_modrm and the riscv64 path has
+ * an explicit fr==VT_CONST branch, so neither tripped before. The
+ * scripts/simple-patches/tcc-0.9.26/arm64-{store,load}-const-lvalue
+ * pair adds the missing case.
+ *
+ * The volatile global keeps tcc from constant-propagating the
+ * branch to false, so the body must emit code; the runtime value
+ * stays 0 so we never actually dereference the literal addresses. The test
+ * passes if the program both compiles and exits 0.
+ *
+ * Surfaced when building musl on aarch64 (boot4): __libc_start_main,
+ * the mallocng family, abort, asctime_r and friends emit this pattern
+ * through folded weak-hidden references and trip the assert.
+ */
+
+static volatile int never = 0;
+
+int main(void)
+{
+ if (never) { /* volatile read of `never` (initialised to 0): body is compiled but not run */
+ *(volatile unsigned char *)0x1234 = 5; /* store */
+ unsigned char b = *(volatile unsigned char *)0x1234; /* load back from the same literal address */
+
+ *(volatile unsigned short *)0x2000 = 0xbeef; /* sized variants */
+ unsigned short s = *(volatile unsigned short *)0x2000;
+
+ *(volatile unsigned int *)0x3000 = 0xdeadbeef;
+ unsigned int i = *(volatile unsigned int *)0x3000;
+
+ *(volatile unsigned long *)0x4000 = 0x1122334455667788UL;
+ unsigned long l = *(volatile unsigned long *)0x4000;
+
+ return (int)(b + s + i + l); /* every loaded value feeds the return expression */
+ }
+ return 0; /* path taken when `never` reads as 0 */
+}
diff --git a/vendor/upstream/musl-1.2.5-tcc.patch b/vendor/upstream/musl-1.2.5-tcc.patch
@@ -1,7 +1,7 @@
diff -urN orig_musl/musl-1.2.5/arch/aarch64/atomic_arch.h patched_musl/musl-1.2.5/arch/aarch64/atomic_arch.h
--- a/musl-1.2.5/arch/aarch64/atomic_arch.h 2024-02-29 18:07:33
-+++ b/musl-1.2.5/arch/aarch64/atomic_arch.h 2026-05-04 12:05:56
-@@ -1,82 +1,27 @@
++++ b/musl-1.2.5/arch/aarch64/atomic_arch.h 2026-05-04 13:33:15
+@@ -1,82 +1,28 @@
-#define a_ll a_ll
-static inline int a_ll(volatile int *p)
-{
@@ -12,14 +12,19 @@ diff -urN orig_musl/musl-1.2.5/arch/aarch64/atomic_arch.h patched_musl/musl-1.2.
+/* tcc-build aarch64 atomic_arch.h replacement.
+ *
+ * Stock musl's atomic primitives are inline asm with output operands
-+ * and the `dmb ish` named option — none of which arm64-asm.c phase
-+ * 1+2 handles. Declare a_ll / a_sc / a_barrier / a_ll_p / a_sc_p /
-+ * a_ctz_64 / a_clz_64 as externs implemented in
-+ * src/internal/aarch64/atomic.s; musl's src/internal/atomic.h derives
-+ * a_cas / a_swap / a_fetch_add / a_or / a_and / a_inc / a_dec / a_store
-+ * from a_ll + a_sc automatically.
++ * (LL/SC pairs and the `dmb ish` named option), none of which
++ * arm64-asm.c phase 1+2 handles. We can't decompose into separate
++ * extern a_ll / a_sc — function-call boundaries between ldaxr and
++ * stlxr clear the exclusive monitor on real hardware (and on
++ * QEMU/Apple Silicon), making the LL/SC retry loop deadloop.
+ *
-+ * Phase 3 of arm64-asm.c will let us inline these again. */
++ * Provide a_cas (32-bit) and a_cas_p (64-bit, since aarch64 pointers
++ * are 64-bit) as single-function externs implemented in
++ * src/internal/aarch64/atomic.s; the LL/SC pair lives inside one
++ * call. musl's src/internal/atomic.h derives a_swap, a_fetch_add,
++ * a_or, a_and, a_inc, a_dec, a_store and friends from a_cas.
++ *
++ * Phase 3 of arm64-asm.c will let us inline these. */
-#define a_sc a_sc
-static inline int a_sc(volatile int *p, int v)
@@ -28,17 +33,13 @@ diff -urN orig_musl/musl-1.2.5/arch/aarch64/atomic_arch.h patched_musl/musl-1.2.
- __asm__ __volatile__ ("stlxr %w0,%w2,%1" : "=&r"(r), "=Q"(*p) : "r"(v) : "memory");
- return !r;
-}
-+extern int a_ll (volatile int *);
-+extern int a_sc (volatile int *, int);
-+extern void a_barrier (void);
-+extern void *a_ll_p (volatile void *);
-+extern int a_sc_p (volatile int *, void *);
-+extern int a_ctz_64 (unsigned long long);
-+extern int a_clz_64 (unsigned long long);
++extern int a_cas (volatile int *, int, int);
++extern void *a_cas_p (volatile void *, void *, void *);
++extern void a_barrier(void);
++extern int a_ctz_64 (unsigned long long);
++extern int a_clz_64 (unsigned long long);
-+#define a_ll a_ll
-+#define a_sc a_sc
- #define a_barrier a_barrier
+-#define a_barrier a_barrier
-static inline void a_barrier()
-{
- __asm__ __volatile__ ("dmb ish" : : : "memory");
@@ -104,41 +105,56 @@ diff -urN orig_musl/musl-1.2.5/arch/aarch64/atomic_arch.h patched_musl/musl-1.2.
- __asm__("clz %0, %1" : "=r"(x) : "r"(x));
- return x;
-}
-+#define a_ll_p a_ll_p
-+#define a_sc_p a_sc_p
-+#define a_ctz_64 a_ctz_64
-+#define a_clz_64 a_clz_64
++#define a_cas a_cas
++#define a_cas_p a_cas_p
++#define a_barrier a_barrier
++#define a_ctz_64 a_ctz_64
++#define a_clz_64 a_clz_64
diff -urN orig_musl/musl-1.2.5/arch/aarch64/crt_arch.h patched_musl/musl-1.2.5/arch/aarch64/crt_arch.h
--- a/musl-1.2.5/arch/aarch64/crt_arch.h 2024-02-29 18:07:33
-+++ b/musl-1.2.5/arch/aarch64/crt_arch.h 2026-05-04 12:09:24
-@@ -1,3 +1,17 @@
++++ b/musl-1.2.5/arch/aarch64/crt_arch.h 2026-05-04 13:25:22
+@@ -1,15 +1,36 @@
+/* tcc-build aarch64 crt_arch.h replacement.
+ *
-+ * Stock musl uses `adrp x1, _DYNAMIC` + `add x1, x1, #:lo12:_DYNAMIC`
-+ * to pass _DYNAMIC's address to __dls2/__libc_start_main; arm64-asm.c
-+ * phase 1+2 has neither the `adrp` mnemonic nor `:lo12:` ELF reloc
-+ * syntax. boot4 only links static binaries — _DYNAMIC is unused — so
-+ * pass NULL instead.
++ * Three things stock musl does that we can't:
++ *
++ * 1. `adrp x1, _DYNAMIC` + `add x1, x1, #:lo12:_DYNAMIC` to pass
++ * _DYNAMIC's address. arm64-asm.c phase 1+2 has neither the
++ * `adrp` mnemonic nor the `:lo12:` ELF reloc syntax, and boot4
++ * only links static binaries (no _DYNAMIC) anyway. _start_c
++ * ignores its second argument; just don't pass it.
++ *
++ * 2. `and sp, x0, #-16` to align the stack. Phase 2's `bic` rejects
++ * the bitmask-immediate form ("invert form requires a register").
++ * Linux/AAPCS already guarantees a 16-byte-aligned sp at process
++ * entry, so the realignment is defensive only.
++ *
++ * 3. `mov x29, #0` / `mov x30, #0` / `mov x1, #0` — phase 1 emits
++ * these as the 32-bit MOVZ form (sf=0). Subsequent code treats
++ * x29 as a frame-pointer chain anchor; the original diagnosis was
++ * that stale high bits crash on the first `stp x29, x30, [...]`.
++ * NOTE(review): W-register writes zero-extend into the X register,
++ * so that cause is doubtful — TODO confirm. The Linux kernel
++ * zeroes all GPRs except sp at process entry (see `start_thread`
++ * in arch/arm64/include/asm/processor.h), so omitting the zeroing
++ * is safe — when phase 3 lands, restore it for defense-in-depth.
+ *
-+ * We also omit the `and sp, x0, #-16` alignment step: `bic` in phase 2
-+ * only handles the register form (bitmask-immediate is rejected with
-+ * "invert form requires a register"), and Linux/AAPCS already guarantees
-+ * a 16-byte-aligned sp at process entry. Phase 3 will let us reinstate
-+ * the explicit alignment for safety.
++ * What's left is the minimum kernel-to-userland glue: pass sp to
++ * _start_c so it can read argc/argv/envp/auxv, then jump.
+ */
__asm__(
".text \n"
".global " START "\n"
-@@ -6,10 +20,6 @@
- " mov x29, #0\n"
- " mov x30, #0\n"
+ ".type " START ",%function\n"
+ START ":\n"
+-" mov x29, #0\n"
+-" mov x30, #0\n"
" mov x0, sp\n"
-".weak _DYNAMIC\n"
-".hidden _DYNAMIC\n"
-" adrp x1, _DYNAMIC\n"
-" add x1, x1, #:lo12:_DYNAMIC\n"
-" and sp, x0, #-16\n"
-+" mov x1, #0\n"
" b " START "_c\n"
);
diff -urN orig_musl/musl-1.2.5/arch/aarch64/pthread_arch.h patched_musl/musl-1.2.5/arch/aarch64/pthread_arch.h
@@ -281,243 +297,6 @@ diff -urN orig_musl/musl-1.2.5/arch/aarch64/syscall_arch.h patched_musl/musl-1.2
-#define IPC_64 0
+#define SYSCALL_FADVISE_6_ARG
+#define SYSCALL_IPC_BROKEN_MODE
-diff -urN orig_musl/musl-1.2.5/arch/riscv64/atomic_arch.h patched_musl/musl-1.2.5/arch/riscv64/atomic_arch.h
---- a/musl-1.2.5/arch/riscv64/atomic_arch.h 2026-05-04 12:39:15
-+++ b/musl-1.2.5/arch/riscv64/atomic_arch.h 2026-05-04 12:39:35
-@@ -1,38 +1,22 @@
--#define a_barrier a_barrier
--static inline void a_barrier()
--{
-- __asm__ __volatile__ ("fence rw,rw" : : : "memory");
--}
-+/* tcc-build riscv64 atomic_arch.h replacement.
-+ *
-+ * Stock musl provides a_barrier / a_cas / a_cas_p as inline asm with
-+ * output operand constraints + lr.w.aqrl / sc.w.aqrl / bne-to-label.
-+ * riscv64-asm.c's subst_asm_operand is a stub
-+ * (`tcc_error("RISCV64 asm not implemented.")`); the lr/sc family,
-+ * the named-fence form, and bne-to-label are also outside the
-+ * in-tree assembler's surface. Route through pure-asm externs in
-+ * src/internal/riscv64/atomic.s, which spell the lr/sc/branch
-+ * instructions as raw .word encodings.
-+ *
-+ * Mirrors the aarch64 strategy (different primitive set: aarch64 musl
-+ * exposes a_ll/a_sc and lets atomic.h derive a_cas; riscv64 musl
-+ * exposes a_cas/a_cas_p directly). */
-
--#define a_cas a_cas
--static inline int a_cas(volatile int *p, int t, int s)
--{
-- int old, tmp;
-- __asm__ __volatile__ (
-- "\n1: lr.w.aqrl %0, (%2)\n"
-- " bne %0, %3, 1f\n"
-- " sc.w.aqrl %1, %4, (%2)\n"
-- " bnez %1, 1b\n"
-- "1:"
-- : "=&r"(old), "=&r"(tmp)
-- : "r"(p), "r"((long)t), "r"((long)s)
-- : "memory");
-- return old;
--}
-+extern void a_barrier (void);
-+extern int a_cas (volatile int *, int, int);
-+extern void *a_cas_p (volatile void *, void *, void *);
-
--#define a_cas_p a_cas_p
--static inline void *a_cas_p(volatile void *p, void *t, void *s)
--{
-- void *old;
-- int tmp;
-- __asm__ __volatile__ (
-- "\n1: lr.d.aqrl %0, (%2)\n"
-- " bne %0, %3, 1f\n"
-- " sc.d.aqrl %1, %4, (%2)\n"
-- " bnez %1, 1b\n"
-- "1:"
-- : "=&r"(old), "=&r"(tmp)
-- : "r"(p), "r"(t), "r"(s)
-- : "memory");
-- return old;
--}
-+#define a_barrier a_barrier
-+#define a_cas a_cas
-+#define a_cas_p a_cas_p
-diff -urN orig_musl/musl-1.2.5/arch/riscv64/crt_arch.h patched_musl/musl-1.2.5/arch/riscv64/crt_arch.h
---- a/musl-1.2.5/arch/riscv64/crt_arch.h 2026-05-04 12:39:15
-+++ b/musl-1.2.5/arch/riscv64/crt_arch.h 2026-05-04 12:39:47
-@@ -1,19 +1,24 @@
-+/* tcc-build riscv64 crt_arch.h replacement.
-+ *
-+ * Stock musl uses the `tail` pseudo-instruction (auipc + jalr), the
-+ * `.option push/norelax/pop` directives, and `lla gp,
-+ * __global_pointer$` to set up GP-relative addressing for linker
-+ * relaxation. tcc 0.9.26's riscv64 assembler has none of those:
-+ * `tail` isn't in riscv64-tok.h, `.option` has no handler in
-+ * tccasm.c, and the internal linker doesn't relax — so GP setup is
-+ * unnecessary. Drop the GP block and tail-call via plain
-+ * `jal x0, _start_c`; R_RISCV_JAL is in range (boot4 hello is ~55
-+ * KB and _start_c sits in the same TU).
-+ *
-+ * _DYNAMIC is unused for static-only builds — pass NULL as arg1.
-+ */
- __asm__(
--".section .sdata,\"aw\"\n"
- ".text\n"
- ".global " START "\n"
- ".type " START ",%function\n"
- START ":\n"
--".weak __global_pointer$\n"
--".hidden __global_pointer$\n"
--".option push\n"
--".option norelax\n\t"
--"lla gp, __global_pointer$\n"
--".option pop\n\t"
- "mv a0, sp\n"
--".weak _DYNAMIC\n"
--".hidden _DYNAMIC\n\t"
--"lla a1, _DYNAMIC\n\t"
-+"mv a1, x0\n"
- "andi sp, sp, -16\n\t"
--"tail " START "_c"
-+"jal x0, " START "_c"
- );
-diff -urN orig_musl/musl-1.2.5/arch/riscv64/pthread_arch.h patched_musl/musl-1.2.5/arch/riscv64/pthread_arch.h
---- a/musl-1.2.5/arch/riscv64/pthread_arch.h 2026-05-04 12:39:15
-+++ b/musl-1.2.5/arch/riscv64/pthread_arch.h 2026-05-04 12:39:58
-@@ -1,10 +1,12 @@
--static inline uintptr_t __get_tp()
--{
-- uintptr_t tp;
-- __asm__ __volatile__("mv %0, tp" : "=r"(tp));
-- return tp;
--}
-+/* tcc-build riscv64 pthread_arch.h replacement.
-+ *
-+ * Stock `__asm__("mv %0, tp" : "=r"(tp))` needs inline-asm operand
-+ * constraints — riscv64-asm.c's subst_asm_operand is a stub. Route
-+ * through extern __get_tp implemented in src/internal/riscv64/get_tp.s
-+ * (single-instruction `mv a0, tp`). */
-
-+extern unsigned long __get_tp(void);
-+
- #define TLS_ABOVE_TP
- #define GAP_ABOVE_TP 0
-
-diff -urN orig_musl/musl-1.2.5/arch/riscv64/syscall_arch.h patched_musl/musl-1.2.5/arch/riscv64/syscall_arch.h
---- a/musl-1.2.5/arch/riscv64/syscall_arch.h 2026-05-04 12:39:15
-+++ b/musl-1.2.5/arch/riscv64/syscall_arch.h 2026-05-04 12:40:13
-@@ -1,78 +1,53 @@
-+/* tcc-build riscv64 syscall_arch.h replacement.
-+ *
-+ * Stock musl uses GCC register-asm-variable inline asm
-+ * (`register long a7 __asm__("a7") = n; ... ecall`); tcc 0.9.26
-+ * lacks that GCC extension and riscv64-asm.c's subst_asm_operand is
-+ * a stub. Route every __syscallN through one trampoline implemented
-+ * in pure asm (src/internal/riscv64/syscall.s); the trampoline
-+ * shuffles C-ABI a0..a6 into kernel-ABI a7 + a0..a5 and issues
-+ * `ecall`.
-+ *
-+ * The wrappers are static __inline functions, not macros — musl's
-+ * src/internal/syscall.h applies a `#define __syscallN(...)` wrapper
-+ * which would defeat a macro-form replacement (CPP self-reference
-+ * rule).
-+ *
-+ * Mirrors the aarch64 / x86_64 strategy. */
-+
- #define __SYSCALL_LL_E(x) (x)
- #define __SYSCALL_LL_O(x) (x)
-
--#define __asm_syscall(...) \
-- __asm__ __volatile__ ("ecall\n\t" \
-- : "=r"(a0) : __VA_ARGS__ : "memory"); \
-- return a0; \
-+extern long __syscall(long, ...);
-
--static inline long __syscall0(long n)
-+static __inline long __syscall0(long n)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0");
-- __asm_syscall("r"(a7))
-+ return __syscall(n);
- }
--
--static inline long __syscall1(long n, long a)
-+static __inline long __syscall1(long n, long a)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- __asm_syscall("r"(a7), "0"(a0))
-+ return __syscall(n, a);
- }
--
--static inline long __syscall2(long n, long a, long b)
-+static __inline long __syscall2(long n, long a, long b)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- register long a1 __asm__("a1") = b;
-- __asm_syscall("r"(a7), "0"(a0), "r"(a1))
-+ return __syscall(n, a, b);
- }
--
--static inline long __syscall3(long n, long a, long b, long c)
-+static __inline long __syscall3(long n, long a, long b, long c)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- register long a1 __asm__("a1") = b;
-- register long a2 __asm__("a2") = c;
-- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2))
-+ return __syscall(n, a, b, c);
- }
--
--static inline long __syscall4(long n, long a, long b, long c, long d)
-+static __inline long __syscall4(long n, long a, long b, long c, long d)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- register long a1 __asm__("a1") = b;
-- register long a2 __asm__("a2") = c;
-- register long a3 __asm__("a3") = d;
-- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3))
-+ return __syscall(n, a, b, c, d);
- }
--
--static inline long __syscall5(long n, long a, long b, long c, long d, long e)
-+static __inline long __syscall5(long n, long a, long b, long c, long d, long e)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- register long a1 __asm__("a1") = b;
-- register long a2 __asm__("a2") = c;
-- register long a3 __asm__("a3") = d;
-- register long a4 __asm__("a4") = e;
-- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4))
-+ return __syscall(n, a, b, c, d, e);
- }
--
--static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
-+static __inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
- {
-- register long a7 __asm__("a7") = n;
-- register long a0 __asm__("a0") = a;
-- register long a1 __asm__("a1") = b;
-- register long a2 __asm__("a2") = c;
-- register long a3 __asm__("a3") = d;
-- register long a4 __asm__("a4") = e;
-- register long a5 __asm__("a5") = f;
-- __asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5))
-+ return __syscall(n, a, b, c, d, e, f);
- }
-
- #define VDSO_USEFUL
--/* We don't have a clock_gettime function.
--#define VDSO_CGT_SYM "__vdso_clock_gettime"
--#define VDSO_CGT_VER "LINUX_2.6" */
--
- #define IPC_64 0
diff -urN orig_musl/musl-1.2.5/arch/x86_64/syscall_arch.h patched_musl/musl-1.2.5/arch/x86_64/syscall_arch.h
--- a/musl-1.2.5/arch/x86_64/syscall_arch.h 2024-02-29 18:07:33
+++ b/musl-1.2.5/arch/x86_64/syscall_arch.h 2026-04-28 20:23:48
@@ -3135,84 +2914,108 @@ diff -urN orig_musl/musl-1.2.5/src/include/features.h patched_musl/musl-1.2.5/sr
#endif
diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/atomic.s patched_musl/musl-1.2.5/src/internal/aarch64/atomic.s
--- a/musl-1.2.5/src/internal/aarch64/atomic.s 1969-12-31 16:00:00
-+++ b/musl-1.2.5/src/internal/aarch64/atomic.s 2026-05-04 12:06:50
-@@ -0,0 +1,73 @@
++++ b/musl-1.2.5/src/internal/aarch64/atomic.s 2026-05-04 13:48:27
+@@ -0,0 +1,97 @@
+/* tcc-build aarch64 atomic primitives.
+ *
-+ * Implements the seven externs that arch/aarch64/atomic_arch.h
-+ * declares; musl's src/internal/atomic.h derives the rest (a_cas,
-+ * a_swap, a_fetch_add, a_or, a_and, a_inc, a_dec, a_store, …).
++ * Stock musl's atomic primitives are inline asm with output operands
++ * (LL/SC pairs and the named-option `dmb ish`), which arm64-asm.c
++ * phase 1+2 does not support. Splitting them into extern a_ll/a_sc
++ * does not work: the call/return between ldaxr and stlxr clears the
++ * exclusive monitor (on real hardware as well as QEMU/Apple Silicon),
++ * so the LL/SC retry loop never terminates. Instead, a_cas / a_cas_p
++ * are single asm functions that keep the LL/SC pair inside one call.
++ *
++ * Two arm64-asm.c phase-2 quirks shape the layout below:
++ * 1. Forward `b.cond` / `cbz` / `cbnz` to a same-file label emits
++ * `CONDBR19 reloc (unsupported)`.
++ * 2. Forward unconditional `b` to a same-file label silently
++ * assembles as `b +0` (branch-to-self) — no error, but the
++ * function turns into an infinite loop.
++ * Backward branches resolve correctly (offset known at emit time);
++ * branches to external symbols (e.g. `bl __syscall`) go through
++ * JUMP26/CALL26 relocations which arm64-link.c handles. So the trick
++ * is: define each function's "exit" block BEFORE the function entry,
++ * so every conditional branch out of the loop is backward, and the
++ * tail unconditional `b` is also backward.
+ *
-+ * Mnemonics ldaxr, stlxr, dmb-with-named-option, rbit, clz are
-+ * outside arm64-asm.c phase 1+2 — emit raw .word encodings. Control-
-+ * flow mnemonics (mov, cmp, cset, ret) are phase 1+2.
++ * Mnemonics ldaxr, stlxr, dmb, rbit, clz are outside arm64-asm.c
++ * phase 1+2; emit them as raw .long words. Phase 1+2 covers cmp,
++ * b.cond (backward only), cbnz (backward), mov, ret, b (backward to
++ * same-file or any-direction to extern).
+ *
+ * Encoding cheat sheet:
-+ * 0x885FFC00 ldaxr w0, [x0]
-+ * 0xC85FFC00 ldaxr x0, [x0]
-+ * 0x8802FC01 stlxr w2, w1, [x0]
-+ * 0xC802FC01 stlxr w2, x1, [x0]
++ * 0x885FFC03 ldaxr w3, [x0]
++ * 0xC85FFC03 ldaxr x3, [x0]
++ * 0x8804FC02 stlxr w4, w2, [x0]
++ * 0xC804FC02 stlxr w4, x2, [x0]
+ * 0xD5033BBF dmb ish
+ * 0xDAC00000 rbit x0, x0
+ * 0xDAC01000 clz x0, x0
-+ */
++ *
++ * Note: tcc treats `.word` as 2 bytes; use `.long` for 4. */
+
-+/* int a_ll(volatile int *p) -- load-acquire-exclusive 32-bit */
-+.global a_ll
-+.type a_ll,@function
-+a_ll:
-+ .word 0x885FFC00
++.text
++
++/* Exit blocks defined first so a_cas / a_cas_p only emit BACKWARD
++ * branches. Not part of any function: each is entered only by the
++ * backward b.ne (old != t) or b (store succeeded) further below. */
++.Lcas_done:
++ .long 0xD5033BBF /* dmb ish */
++ mov w0, w3 /* return old (32-bit) */
+ ret
+
-+/* int a_sc(volatile int *p, int v) -- store-release-exclusive; return !status */
-+.global a_sc
-+.type a_sc,@function
-+a_sc:
-+ .word 0x8802FC01
-+ cmp w2, #0
-+ cset w0, eq
++.Lcasp_done:
++ .long 0xD5033BBF /* dmb ish */
++ mov x0, x3 /* return old (64-bit) */
+ ret
+
-+/* void a_barrier(void) -- dmb ish */
++/* int a_cas(volatile int *p, int t, int s) — 32-bit CAS. */
++.global a_cas
++.type a_cas,@function
++a_cas:
++ .long 0x885FFC03 /* ldaxr w3, [x0] : old = *p, mark exclusive */
++ cmp w3, w1 /* old == t ? */
++ b.ne .Lcas_done /* backward — OK */
++ .long 0x8804FC02 /* stlxr w4, w2, [x0]: try *p = s, w4 = status */
++ cbnz w4, a_cas /* backward — failed, retry from a_cas entry */
++ b .Lcas_done /* backward — succeeded */
++
++/* void *a_cas_p(volatile void *p, void *t, void *s) — 64-bit CAS. */
++.global a_cas_p
++.type a_cas_p,@function
++a_cas_p:
++ .long 0xC85FFC03 /* ldaxr x3, [x0] */
++ cmp x3, x1
++ b.ne .Lcasp_done
++ .long 0xC804FC02 /* stlxr w4, x2, [x0] */
++ cbnz w4, a_cas_p
++ b .Lcasp_done
++
++/* void a_barrier(void) — dmb ish */
+.global a_barrier
+.type a_barrier,@function
+a_barrier:
-+ .word 0xD5033BBF
++ .long 0xD5033BBF
+ ret
+
-+/* void *a_ll_p(volatile void *p) -- load-acquire-exclusive 64-bit */
-+.global a_ll_p
-+.type a_ll_p,@function
-+a_ll_p:
-+ .word 0xC85FFC00
-+ ret
-+
-+/* int a_sc_p(volatile int *p, void *v) -- store-release-exclusive 64-bit */
-+.global a_sc_p
-+.type a_sc_p,@function
-+a_sc_p:
-+ .word 0xC802FC01
-+ cmp w2, #0
-+ cset w0, eq
-+ ret
-+
-+/* int a_ctz_64(uint64_t x) -- count trailing zeros: rbit + clz */
++/* int a_ctz_64(uint64_t x) — count trailing zeros: rbit + clz. */
+.global a_ctz_64
+.type a_ctz_64,@function
+a_ctz_64:
-+ .word 0xDAC00000
-+ .word 0xDAC01000
++ .long 0xDAC00000 /* rbit x0, x0 */
++ .long 0xDAC01000 /* clz x0, x0 */
+ ret
+
+/* int a_clz_64(uint64_t x) */
+.global a_clz_64
+.type a_clz_64,@function
+a_clz_64:
-+ .word 0xDAC01000
++ .long 0xDAC01000
+ ret
diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/get_tp.s patched_musl/musl-1.2.5/src/internal/aarch64/get_tp.s
--- a/musl-1.2.5/src/internal/aarch64/get_tp.s 1969-12-31 16:00:00
-+++ b/musl-1.2.5/src/internal/aarch64/get_tp.s 2026-05-04 12:05:41
++++ b/musl-1.2.5/src/internal/aarch64/get_tp.s 2026-05-04 13:20:36
@@ -0,0 +1,17 @@
+/* tcc-build aarch64 thread-pointer reader.
+ *
@@ -3229,7 +3032,7 @@ diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/get_tp.s patched_musl/musl-1
+.global __get_tp
+.type __get_tp,@function
+__get_tp:
-+ .word 0xd53bd040
++ .long 0xd53bd040
+ ret
diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/syscall.s patched_musl/musl-1.2.5/src/internal/aarch64/syscall.s
--- a/musl-1.2.5/src/internal/aarch64/syscall.s 1969-12-31 16:00:00
@@ -3257,109 +3060,6 @@ diff -urN orig_musl/musl-1.2.5/src/internal/aarch64/syscall.s patched_musl/musl-
+ mov x5, x6
+ svc #0
+ ret
-diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/atomic.s patched_musl/musl-1.2.5/src/internal/riscv64/atomic.s
---- a/musl-1.2.5/src/internal/riscv64/atomic.s 1969-12-31 16:00:00
-+++ b/musl-1.2.5/src/internal/riscv64/atomic.s 2026-05-04 12:40:46
-@@ -0,0 +1,51 @@
-+/* tcc-build riscv64 atomic primitives.
-+ *
-+ * Implements the three externs that arch/riscv64/atomic_arch.h
-+ * declares; musl's src/internal/atomic.h derives the rest (a_swap,
-+ * a_fetch_add, a_or, a_and, a_inc, a_dec, a_store, a_ctz_*,
-+ * a_clz_*, …).
-+ *
-+ * Mnemonics lr.w/lr.d/sc.w/sc.d (with .aqrl ordering), the named
-+ * `fence rw,rw` form, and bne-to-immediate-offset are outside the
-+ * surface of tcc 0.9.26's riscv64-asm.c — emit raw .word encodings.
-+ * `mv` and `jalr` are stock; `jalr x0, x1, 0` is the canonical
-+ * encoding of the `ret` pseudo (which tcc's token table omits).
-+ *
-+ * Encoding cheat sheet (RV64A + base):
-+ * lr.w.aqrl rd=a3, (a0) 0x160526AF
-+ * lr.d.aqrl rd=a3, (a0) 0x160536AF
-+ * sc.w.aqrl rd=a4, a2, (a0) 0x1EC5272F
-+ * sc.d.aqrl rd=a4, a2, (a0) 0x1EC5372F
-+ * bne a3, a1, +12 0x00B69663
-+ * bne a4, x0, -12 0xFE071AE3
-+ * fence rw, rw 0x0330000F
-+ */
-+
-+/* void a_barrier(void) -- fence rw, rw */
-+.global a_barrier
-+.type a_barrier,@function
-+a_barrier:
-+ .word 0x0330000F
-+ jalr x0, x1, 0
-+
-+/* int a_cas(volatile int *p, int t, int s) -- 32-bit CAS loop */
-+.global a_cas
-+.type a_cas,@function
-+a_cas:
-+ .word 0x160526AF /* lr.w.aqrl a3, (a0) */
-+ .word 0x00B69663 /* bne a3, a1, +12 (skip to mv if old != expected) */
-+ .word 0x1EC5272F /* sc.w.aqrl a4, a2, (a0) */
-+ .word 0xFE071AE3 /* bne a4, x0, -12 (retry on SC failure) */
-+ mv a0, a3
-+ jalr x0, x1, 0
-+
-+/* void *a_cas_p(volatile void *p, void *t, void *s) -- 64-bit CAS loop */
-+.global a_cas_p
-+.type a_cas_p,@function
-+a_cas_p:
-+ .word 0x160536AF /* lr.d.aqrl a3, (a0) */
-+ .word 0x00B69663 /* bne a3, a1, +12 */
-+ .word 0x1EC5372F /* sc.d.aqrl a4, a2, (a0) */
-+ .word 0xFE071AE3 /* bne a4, x0, -12 */
-+ mv a0, a3
-+ jalr x0, x1, 0
-diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/get_tp.s patched_musl/musl-1.2.5/src/internal/riscv64/get_tp.s
---- a/musl-1.2.5/src/internal/riscv64/get_tp.s 1969-12-31 16:00:00
-+++ b/musl-1.2.5/src/internal/riscv64/get_tp.s 2026-05-04 12:40:30
-@@ -0,0 +1,15 @@
-+/* tcc-build riscv64 thread-pointer reader.
-+ *
-+ * Reads tp (= x4) into a0 and returns. Stock musl's pthread_arch.h
-+ * uses `__asm__("mv %0, tp" : "=r"(tp))`; the inline-asm operand
-+ * syntax is unsupported by tcc 0.9.26's riscv64-asm.c
-+ * (subst_asm_operand is a stub). Trivial single-instruction wrapper
-+ * with a tail return; phase-3-equivalent inline-asm support would
-+ * let us inline this again.
-+ */
-+
-+.global __get_tp
-+.type __get_tp,@function
-+__get_tp:
-+ mv a0, tp
-+ jalr x0, x1, 0
-diff -urN orig_musl/musl-1.2.5/src/internal/riscv64/syscall.s patched_musl/musl-1.2.5/src/internal/riscv64/syscall.s
---- a/musl-1.2.5/src/internal/riscv64/syscall.s 1969-12-31 16:00:00
-+++ b/musl-1.2.5/src/internal/riscv64/syscall.s 2026-05-04 12:40:24
-@@ -0,0 +1,25 @@
-+/* tcc-build riscv64 syscall trampoline.
-+ *
-+ * Mirrors src/internal/x86_64/syscall.s and src/internal/aarch64/
-+ * syscall.s. C-ABI passes the syscall number in a0 and arguments
-+ * 1..6 in a1..a6; Linux riscv64 syscall ABI wants the number in a7
-+ * and arguments in a0..a5. Shuffle and `ecall`. Return value is
-+ * already in a0 from the kernel side.
-+ *
-+ * Mnemonics here (mv, ecall, jalr) are all in tcc 0.9.26's stock
-+ * riscv64-asm.c. `ret` is a pseudo not in the token table — emit
-+ * the canonical encoding `jalr x0, x1, 0` instead.
-+ */
-+
-+.global __syscall
-+.type __syscall,@function
-+__syscall:
-+ mv a7, a0
-+ mv a0, a1
-+ mv a1, a2
-+ mv a2, a3
-+ mv a3, a4
-+ mv a4, a5
-+ mv a5, a6
-+ ecall
-+ jalr x0, x1, 0
diff -urN orig_musl/musl-1.2.5/src/internal/syscall.h patched_musl/musl-1.2.5/src/internal/syscall.h
--- a/musl-1.2.5/src/internal/syscall.h 2024-02-29 18:07:33
+++ b/musl-1.2.5/src/internal/syscall.h 2026-04-28 22:05:59
@@ -3890,30 +3590,32 @@ diff -urN orig_musl/musl-1.2.5/src/signal/x86_64/sigsetjmp.s patched_musl/musl-1
- jmp __sigsetjmp_tail
-
-1: jmp setjmp@PLT
-diff -urN orig_musl/musl-1.2.5/src/thread/riscv64/__set_thread_area.s patched_musl/musl-1.2.5/src/thread/riscv64/__set_thread_area.s
---- a/musl-1.2.5/src/thread/riscv64/__set_thread_area.s 2026-05-04 12:39:16
-+++ b/musl-1.2.5/src/thread/riscv64/__set_thread_area.s 2026-05-04 13:00:50
-@@ -1,6 +1,20 @@
-+/* tcc-build riscv64 __set_thread_area replacement.
+diff -urN orig_musl/musl-1.2.5/src/thread/aarch64/__set_thread_area.s patched_musl/musl-1.2.5/src/thread/aarch64/__set_thread_area.s
+--- a/musl-1.2.5/src/thread/aarch64/__set_thread_area.s 2024-02-29 18:07:33
++++ b/musl-1.2.5/src/thread/aarch64/__set_thread_area.s 2026-05-04 13:29:45
+@@ -1,7 +1,23 @@
++/* tcc-build aarch64 __set_thread_area.
+ *
-+ * Identical semantics to the upstream three-instruction file (set
-+ * tp, return 0). Only difference: replace the `ret` pseudo
-+ * (`jalr x0, x1, 0`) with the explicit jalr form, since tcc
-+ * 0.9.26's riscv64-tok.h has no `ret` token.
++ * Sets the EL0 thread pointer (TPIDR_EL0) to x0 and returns 0.
++ * Stock musl writes `msr tpidr_el0, x0` as a plain mnemonic; the
++ * `msr` system-register move is outside arm64-asm.c phase 1+2, so
++ * we emit the encoding as a raw 32-bit `.long`.
+ *
-+ * On riscv64 the userspace-visible thread pointer is just the `tp`
-+ * register; Linux exposes no SYS_set_thread_area (the generic C
-+ * fallback in src/thread/__set_thread_area.c would return -ENOSYS
-+ * and __init_tp would a_crash). This file is on the startup path —
-+ * called from __init_tp via __libc_start_main, so it must link.
-+ */
++ * d51bd040 msr tpidr_el0, x0 (op0=3 op1=3 CRn=13 CRm=0 op2=2 Rt=0)
++ *
++ * Without this, the call to __set_thread_area made by __init_tp (on
++ * the __init_tls startup path) resolves to an invalid address: tcc's
++ * static linker doesn't fail on undefined archive references, so
++ * hello segfaults inside __libc_start_main before main runs.
++ *
++ * Note: tcc treats `.word` as 2 bytes; use `.long` for 4. */
+
.global __set_thread_area
--.type __set_thread_area, %function
-+.type __set_thread_area,@function
+ .hidden __set_thread_area
+ .type __set_thread_area,@function
__set_thread_area:
- mv tp, a0
-- li a0, 0
-- ret
-+ addi a0, x0, 0
-+ jalr x0, x1, 0
+- msr tpidr_el0,x0
+- mov w0,#0
++ .long 0xd51bd040
++ mov w0, #0
+ ret