commit bd3ed18a176f836e8e4ad927673b63da714c33e4
parent 626bb143300e32d23b3b80d4e92a4ba5bc80dd34
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 15:36:21 -0700
cg/aa64: implement Groups N, O, Q (TLS, globals, multi-function)
- aa_tls_addr_of: TLS Local-Exec sequence (mrs tpidr_el0; add hi12;
add lo12_nc), using R_AARCH64_TLSLE_ADD_TPREL_{HI12,LO12_NC}.
- aa_load/aa_store: direct OPK_GLOBAL via ADRP + LDR/STR with
R_AARCH64_ADR_PREL_PG_HI21 + LDST{8,16,32,64}_ABS_LO12_NC; addr_base
also handles OPK_GLOBAL by materializing &sym+addend through ADRP+ADD.
- New encoding helpers: aa64_ldr_uimm, aa64_str_fp_uimm,
aa64_mrs_tpidr_el0.
- cg-runner --jit: build a per-thread TLS image and msr tpidr_el0
immediately before invoking test_main (mirrors jit_runner.c) so
Group N passes path D on the host.
Diffstat:
2 files changed, 172 insertions(+), 8 deletions(-)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -98,6 +98,21 @@ static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
u32 sc = byte_off >> size;
return 0x39000000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
}
+/* LDR (immediate, unsigned offset); imm12 is byte_off scaled down by `size`. */
+static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    return (Rt&0x1f) | ((Rn&0x1f)<<5) | (((byte_off >> size) & 0xfffu)<<10) | (size<<30) | 0x39400000u;
+}
+/* STR (SIMD & FP, unsigned offset). size: 2=S (32), 3=D (64);
+ * imm12 is byte_off scaled down by `size`. */
+static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    return (Rt&0x1f) | ((Rn&0x1f)<<5) | (((byte_off >> size) & 0xfffu)<<10) | (size<<30) | 0x3D000000u;
+}
+
+/* MRS Xt, TPIDR_EL0: read the AArch64 user thread pointer into Xt. */
+static inline u32 aa64_mrs_tpidr_el0(u32 Rt)
+{ u32 rt = Rt & 0x1fu; return 0xD53BD040u | rt; }
/* Branch (unconditional, 26-bit imm). Emitted with imm26=0 when paired
* with a JUMP26/CALL26 relocation; the patcher fills in imm26. */
static inline u32 aa64_b_base(void) { return 0x14000000u; }
@@ -989,6 +1004,33 @@ static void aa_copy(CGTarget* t, Operand dst, Operand src)
/* ---- load / store / addr_of ---- */
+/* Map an access width in bytes to the LO12_NC reloc for LDR/STR (immediate,
+ * unsigned offset). Widths other than 1/2/4 get the 64-bit reloc.
+ * NOTE(review): a 16-byte access would presumably need LDST128 — confirm. */
+static RelocKind ldst_lo12_reloc_for(u32 nbytes)
+{
+    if (nbytes == 1) return R_AARCH64_LDST8_ABS_LO12_NC;
+    if (nbytes == 2) return R_AARCH64_LDST16_ABS_LO12_NC;
+    if (nbytes == 4) return R_AARCH64_LDST32_ABS_LO12_NC;
+    /* 8 bytes, and every other width, use the 64-bit form. */
+    return R_AARCH64_LDST64_ABS_LO12_NC;
+}
+
+/* Emit `ADRP dst_reg, sym+addend` then `ADD dst_reg, dst_reg, #lo12`, each
+ * tagged with its reloc (PG_HI21, ADD_ABS_LO12_NC), so dst_reg = &sym+addend. */
+static void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend)
+{
+    MCEmitter* em = t->mc;
+    const u32 section = em->section_id;
+    u32 at = em->pos(em); /* offset of the ADRP, taken before emitting it */
+    emit32(em, aa64_adrp_base(dst_reg));
+    em->emit_reloc_at(em, section, at, R_AARCH64_ADR_PREL_PG_HI21, sym, addend, 0, 0);
+
+    at = em->pos(em); /* offset of the ADD that supplies the low 12 bits */
+    emit32(em, aa64_add_imm(/*sf=*/1, dst_reg, dst_reg, /*imm12=*/0, /*sh=*/0));
+    em->emit_reloc_at(em, section, at, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0, 0);
+}
+
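-/* Resolve an address operand (LOCAL or INDIRECT) into (base_reg, signed
- * offset) via a possibly-temporary base register. Returns the base reg. */
+/* Resolve an address operand (LOCAL, INDIRECT, or GLOBAL) into (base_reg,
+ * signed offset) via a possibly-temporary base register. Returns the base reg. */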
static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg)
@@ -1005,9 +1047,10 @@ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg)
return reg_num((Operand){.kind=OPK_REG, .v.reg = addr.v.ind.base});
}
if (addr.kind == OPK_GLOBAL) {
- compiler_panic(t->c, a->loc, "aarch64: GLOBAL address not yet supported");
+ emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend);
+ *out_off = 0;
+ return tmp_reg;
}
- (void)tmp_reg;
compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d",
(int)addr.kind);
}
@@ -1015,10 +1058,33 @@ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg)
static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma)
{
AAImpl* a = impl_of(t);
- i32 off;
- u32 base = addr_base(t, addr, &off, 9);
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
u32 sidx = size_idx_for_bytes(sz);
+
+ /* OPK_GLOBAL: ADRP scratch, sym ; LDR Wd, [scratch, #:lo12:sym].
+ * The LO12_NC reloc requires the scaled-offset LDR encoding, not LDUR. */
+ if (addr.kind == OPK_GLOBAL) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ ObjSymId sym = addr.v.global.sym;
+ i64 add = addr.v.global.addend;
+ u32 adrp_pos = mc->pos(mc);
+ emit32(mc, aa64_adrp_base(/*Rd=*/9));
+ mc->emit_reloc_at(mc, sec, adrp_pos,
+ R_AARCH64_ADR_PREL_PG_HI21, sym, add, 0, 0);
+ u32 ld_pos = mc->pos(mc);
+ if (dst.cls == RC_FP) {
+ emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0));
+ } else {
+ emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), 9, 0));
+ }
+ mc->emit_reloc_at(mc, sec, ld_pos,
+ ldst_lo12_reloc_for(sz), sym, add, 0, 0);
+ return;
+ }
+
+ i32 off;
+ u32 base = addr_base(t, addr, &off, 9);
if (off < -256 || off > 255) {
compiler_panic(t->c, a->loc, "aarch64 load: offset %d out of LDUR range", off);
}
@@ -1032,10 +1098,48 @@ static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma)
static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma)
{
AAImpl* a = impl_of(t);
- i32 off;
- u32 base = addr_base(t, addr, &off, 9);
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
u32 sidx = size_idx_for_bytes(sz);
+
+ /* OPK_GLOBAL: ADRP scratch, sym ; STR Wt, [scratch, #:lo12:sym].
+ * For OPK_IMM source, materialize the value first into x9, then use
+ * x10 for the global base so the two scratches don't collide. */
+ if (addr.kind == OPK_GLOBAL) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ ObjSymId sym = addr.v.global.sym;
+ i64 add = addr.v.global.addend;
+
+ u32 src_reg;
+ u32 src_is_fp = 0;
+ if (src.kind == OPK_IMM) {
+ u32 sf = (sz == 8) ? 1u : 0u;
+ emit_load_imm(mc, sf, /*Rd=*/9, src.v.imm);
+ src_reg = 9;
+ } else if (src.cls == RC_FP) {
+ src_reg = reg_num(src);
+ src_is_fp = 1;
+ } else {
+ src_reg = reg_num(src);
+ }
+ u32 base = (src.kind == OPK_IMM) ? 10u : 9u;
+ u32 adrp_pos = mc->pos(mc);
+ emit32(mc, aa64_adrp_base(base));
+ mc->emit_reloc_at(mc, sec, adrp_pos,
+ R_AARCH64_ADR_PREL_PG_HI21, sym, add, 0, 0);
+ u32 st_pos = mc->pos(mc);
+ if (src_is_fp) {
+ emit32(mc, aa64_str_fp_uimm(sidx, src_reg, base, 0));
+ } else {
+ emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0));
+ }
+ mc->emit_reloc_at(mc, sec, st_pos,
+ ldst_lo12_reloc_for(sz), sym, add, 0, 0);
+ return;
+ }
+
+ i32 off;
+ u32 base = addr_base(t, addr, &off, 9);
if (off < -256 || off > 255) {
compiler_panic(t->c, a->loc, "aarch64 store: offset %d out of STUR range", off);
}
@@ -1098,7 +1202,33 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv)
aa_panic(t, "addr_of");
}
-static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a) { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); }
+/* AArch64 TLS Local-Exec: compute the address of sym+addend into dst.
+ *     mrs x9,   tpidr_el0
+ *     add xdst, x9,   #:tprel_hi12:sym, lsl #12
+ *     add xdst, xdst, #:tprel_lo12_nc:sym
+ * Both ADDs are emitted with imm12 = 0 and carry the TLSLE HI12 / LO12_NC
+ * relocations; the linker fills in the per-target TP-relative offset
+ * (image offset + AARCH64_TCB_SIZE). x9 is clobbered as scratch. */
+static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend)
+{
+    MCEmitter* em = t->mc;
+    const u32 section = em->section_id;
+    const u32 xdst = reg_num(dst);
+    const u32 xtp = 9; /* scratch register that receives TPIDR_EL0 */
+    u32 at;
+
+    emit32(em, aa64_mrs_tpidr_el0(xtp));
+
+    /* High part of the TP-relative offset: ADD with LSL #12. */
+    at = em->pos(em);
+    emit32(em, aa64_add_imm(/*sf=*/1, xdst, /*Rn=*/xtp, /*imm12=*/0, /*sh=*/1));
+    em->emit_reloc_at(em, section, at, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, addend, 0, 0);
+
+    /* Low part, accumulated onto the partial result already in xdst. */
+    at = em->pos(em);
+    emit32(em, aa64_add_imm(/*sf=*/1, xdst, /*Rn=*/xdst, /*imm12=*/0, /*sh=*/0));
+    em->emit_reloc_at(em, section, at, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym, addend, 0, 0);
+}
/* Resolve a dst/src address operand for the aggregate ops below.
* Accepts OPK_REG (already a pointer) and OPK_LOCAL (= fp - off);
diff --git a/test/cg/harness/cg_runner.c b/test/cg/harness/cg_runner.c
@@ -346,7 +346,41 @@ static int mode_jit(const char* name)
}
int (*fn)(void) = (int(*)(void))cfree_jit_lookup(jit, "test_main");
- int result = fn ? fn() : 1;
+
+ /* AArch64 TLS Local-Exec setup, mirroring jit_runner.c. Build a
+ * thread-local image (16-byte TCB + .tdata copy + .tbss zero-fill) and
+ * point TPIDR_EL0 at it just before invoking test_main. On Darwin,
+ * libc functions clobber TPIDR_EL0 (probably via dyld stub binding /
+ * locale TSD), so msr → call() must be back-to-back with NO libc
+ * invocations between. */
+#if defined(__aarch64__) || defined(__arm64__)
+    static char tls_block[8192] __attribute__((aligned(16)));
+    {
+        char* td_start = (char*)cfree_jit_lookup(jit, "__tdata_start");
+        char* td_end = (char*)cfree_jit_lookup(jit, "__tdata_end");
+        unsigned long bs_n = (unsigned long)(unsigned long long)cfree_jit_lookup(jit, "__tbss_size");
+        if (td_start && td_end) {
+            unsigned long td_n = (unsigned long)(td_end - td_start);
+            unsigned long room = (unsigned long)sizeof tls_block - 16, i; /* bytes after the TCB */
+            /* Image too big for tls_block: null fn so the run fails (result = 1)
+             * instead of writing out of bounds. Byte loops, not memcpy/memset:
+             * the author found those clobber TPIDR_EL0 on Darwin. */
+            if (td_n > room || bs_n > room - td_n) { fn = 0; td_n = 0; bs_n = 0; }
+            for (i = 0; i < td_n; ++i) tls_block[16 + i] = td_start[i];
+            for (i = 0; i < bs_n; ++i) tls_block[16 + td_n + i] = 0;
+        }
+    }
+#endif
+
+ int result;
+ if (fn) {
+#if defined(__aarch64__) || defined(__arm64__)
+ __asm__ volatile ("msr tpidr_el0, %0" :: "r"(tls_block) : "memory");
+#endif
+ result = fn();
+ } else {
+ result = 1;
+ }
cfree_jit_free(jit);
link_free(lk);