atomic.s (3393B)
/* AArch64 atomic primitives for the tcc-based build.
 *
 * Stock musl implements its atomic primitives as inline asm with
 * output operands (LL/SC pairs, and `dmb ish` with a named option),
 * which arm64-asm.c phase 1+2 does not support.  Splitting the pair
 * into separate extern a_ll / a_sc functions is not an option: the
 * function-call boundary between ldaxr and stlxr clears the exclusive
 * monitor on real hardware (and on QEMU / Apple Silicon), so the
 * LL/SC retry loop would spin forever.  a_cas and a_cas_p are
 * therefore provided as single asm functions whose LL/SC pair lives
 * entirely inside one call.
 *
 * Two arm64-asm.c phase-2 quirks dictate the layout below:
 *   1. A forward `b.cond` / `cbz` / `cbnz` to a same-file label emits
 *      `CONDBR19 reloc (unsupported)`.
 *   2. A forward unconditional `b` to a same-file label silently
 *      assembles as `b +0` (branch-to-self): no error, but the
 *      function turns into an infinite loop.
 * Backward branches resolve correctly (the offset is known at emit
 * time), and branches to external symbols (e.g. `bl __syscall`) go
 * through JUMP26/CALL26 relocations, which arm64-link.c handles.
 * The trick is therefore to define each function's exit block BEFORE
 * the function entry, so that every conditional branch out of the
 * loop, and the trailing unconditional `b`, is a backward branch.
 *
 * The mnemonics ldaxr, stlxr, dmb, rbit and clz are outside
 * arm64-asm.c phase 1+2 and are emitted as raw .long words.  Phase
 * 1+2 does cover cmp, b.cond (backward only), cbnz (backward), mov,
 * ret, and b (backward to a same-file label, or any direction to an
 * extern).
 *
 * Encoding cheat sheet:
 *   0x885FFC03   ldaxr w3, [x0]
 *   0xC85FFC03   ldaxr x3, [x0]
 *   0x8804FC02   stlxr w4, w2, [x0]
 *   0xC804FC02   stlxr w4, x2, [x0]
 *   0xD5033BBF   dmb   ish
 *   0xDAC00000   rbit  x0, x0
 *   0xDAC01000   clz   x0, x0
 *
 * Note: tcc treats `.word` as 2 bytes; use `.long` for 4-byte words.
 */

        .text

/* Exit stub for a_cas.  Placed ahead of the entry point so that both
 * branches to it are backward.  It is not a function in its own right
 * and is only entered through the b.ne / b inside a_cas. */
.Lcas32_out:
        .long   0xD5033BBF              /* dmb ish */
        mov     w0, w3                  /* result = value that was loaded */
        ret

/* int a_cas(volatile int *p, int t, int s) -- 32-bit compare-and-swap.
 *   in:   x0 = p, w1 = t (expected value), w2 = s (replacement)
 *   out:  w0 = value loaded from *p
 *   uses: w3 (loaded value), w4 (stlxr status), condition flags */
        .global a_cas
        .type   a_cas,@function
a_cas:
        .long   0x885FFC03              /* ldaxr w3, [x0]: w3 = *p, exclusive */
        cmp     w3, w1                  /* does *p still hold t? */
        b.ne    .Lcas32_out             /* no: give up, hand back what we saw */
        .long   0x8804FC02              /* stlxr w4, w2, [x0]: try *p = s */
        cbnz    w4, a_cas               /* status != 0: store failed, start over */
        b       .Lcas32_out             /* store succeeded */

/* Exit stub for a_cas_p; same arrangement as the 32-bit stub, entered
 * only through the b.ne / b inside a_cas_p. */
.Lcas64_out:
        .long   0xD5033BBF              /* dmb ish */
        mov     x0, x3                  /* result = value that was loaded */
        ret

/* void *a_cas_p(volatile void *p, void *t, void *s) -- 64-bit
 * compare-and-swap.
 *   in:   x0 = p, x1 = t (expected value), x2 = s (replacement)
 *   out:  x0 = value loaded from *p
 *   uses: x3 (loaded value), w4 (stlxr status), condition flags */
        .global a_cas_p
        .type   a_cas_p,@function
a_cas_p:
        .long   0xC85FFC03              /* ldaxr x3, [x0]: x3 = *p, exclusive */
        cmp     x3, x1                  /* does *p still hold t? */
        b.ne    .Lcas64_out             /* no: give up, hand back what we saw */
        .long   0xC804FC02              /* stlxr w4, x2, [x0]: try *p = s */
        cbnz    w4, a_cas_p             /* status != 0: store failed, start over */
        b       .Lcas64_out             /* store succeeded */

/* void a_barrier(void) -- issues a single `dmb ish`. */
        .global a_barrier
        .type   a_barrier,@function
a_barrier:
        .long   0xD5033BBF              /* dmb ish */
        ret

/* int a_ctz_64(uint64_t x) -- count trailing zeros: bit-reverse the
 * argument, then count leading zeros of the reversed value.
 *   in:  x0 = x
 *   out: x0 = result of the rbit + clz pair */
        .global a_ctz_64
        .type   a_ctz_64,@function
a_ctz_64:
        .long   0xDAC00000              /* rbit x0, x0 */
        .long   0xDAC01000              /* clz  x0, x0 */
        ret

/* int a_clz_64(uint64_t x) -- count leading zeros.
 *   in:  x0 = x
 *   out: x0 = result of clz */
        .global a_clz_64
        .type   a_clz_64,@function
a_clz_64:
        .long   0xDAC01000              /* clz x0, x0 */
        ret