kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit bd3ed18a176f836e8e4ad927673b63da714c33e4
parent 626bb143300e32d23b3b80d4e92a4ba5bc80dd34
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 15:36:21 -0700

cg/aa64: implement Groups N, O, Q (TLS, globals, multi-function)

- aa_tls_addr_of: TLS Local-Exec sequence (mrs tpidr_el0; add hi12;
  add lo12_nc), using R_AARCH64_TLSLE_ADD_TPREL_{HI12,LO12_NC}.
- aa_load/aa_store: direct OPK_GLOBAL via ADRP + LDR/STR with
  R_AARCH64_ADR_PREL_PG_HI21 + LDST{8,16,32,64}_ABS_LO12_NC; addr_base
  also handles OPK_GLOBAL by materializing &sym+addend through ADRP+ADD.
- New encoding helpers: aa64_ldr_uimm, aa64_str_fp_uimm,
  aa64_mrs_tpidr_el0.
- cg-runner --jit: build a per-thread TLS image and point TPIDR_EL0 at
  it (msr tpidr_el0) immediately before invoking test_main (mirrors
  jit_runner.c), so that Group N passes path D on the host.

Diffstat:
M src/arch/aarch64.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M test/cg/harness/cg_runner.c | 36 +++++++++++++++++++++++++++++++++++-
2 files changed, 172 insertions(+), 8 deletions(-)

diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c @@ -98,6 +98,21 @@ static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) u32 sc = byte_off >> size; return 0x39000000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) +{ + u32 sc = byte_off >> size; + return 0x39400000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +/* STR (SIMD & FP, unsigned offset). size: 2=S (32), 3=D (64). */ +static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) +{ + u32 sc = byte_off >> size; + return 0x3D000000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} + +/* MRS Xt, TPIDR_EL0 — read AArch64 user thread pointer. */ +static inline u32 aa64_mrs_tpidr_el0(u32 Rt) +{ return 0xD53BD040u | (Rt & 0x1fu); } /* Branch (unconditional, 26-bit imm). Emitted with imm26=0 when paired * with a JUMP26/CALL26 relocation; the patcher fills in imm26. */ static inline u32 aa64_b_base(void) { return 0x14000000u; } @@ -989,6 +1004,33 @@ static void aa_copy(CGTarget* t, Operand dst, Operand src) /* ---- load / store / addr_of ---- */ +/* Reloc kind for an LDR/STR (immediate, unsigned offset) of `nbytes`. */ +static RelocKind ldst_lo12_reloc_for(u32 nbytes) +{ + switch (nbytes) { + case 1: return R_AARCH64_LDST8_ABS_LO12_NC; + case 2: return R_AARCH64_LDST16_ABS_LO12_NC; + case 4: return R_AARCH64_LDST32_ABS_LO12_NC; + case 8: return R_AARCH64_LDST64_ABS_LO12_NC; + default: return R_AARCH64_LDST64_ABS_LO12_NC; + } +} + +/* Materialize &sym+addend into `dst_reg` via ADRP + ADD (LO12_NC). 
*/ +static void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) +{ + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + u32 adrp_pos = mc->pos(mc); + emit32(mc, aa64_adrp_base(dst_reg)); + mc->emit_reloc_at(mc, sec, adrp_pos, + R_AARCH64_ADR_PREL_PG_HI21, sym, addend, 0, 0); + u32 add_pos = mc->pos(mc); + emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0)); + mc->emit_reloc_at(mc, sec, add_pos, + R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0, 0); +} + /* Resolve an address operand (LOCAL or INDIRECT) into (base_reg, signed * offset) via a possibly-temporary base register. Returns the base reg. */ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) @@ -1005,9 +1047,10 @@ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) return reg_num((Operand){.kind=OPK_REG, .v.reg = addr.v.ind.base}); } if (addr.kind == OPK_GLOBAL) { - compiler_panic(t->c, a->loc, "aarch64: GLOBAL address not yet supported"); + emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend); + *out_off = 0; + return tmp_reg; } - (void)tmp_reg; compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d", (int)addr.kind); } @@ -1015,10 +1058,33 @@ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { AAImpl* a = impl_of(t); - i32 off; - u32 base = addr_base(t, addr, &off, 9); u32 sz = ma.size ? ma.size : type_byte_size(addr.type); u32 sidx = size_idx_for_bytes(sz); + + /* OPK_GLOBAL: ADRP scratch, sym ; LDR Wd, [scratch, #:lo12:sym]. + * The LO12_NC reloc requires the scaled-offset LDR encoding, not LDUR. 
*/ + if (addr.kind == OPK_GLOBAL) { + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + ObjSymId sym = addr.v.global.sym; + i64 add = addr.v.global.addend; + u32 adrp_pos = mc->pos(mc); + emit32(mc, aa64_adrp_base(/*Rd=*/9)); + mc->emit_reloc_at(mc, sec, adrp_pos, + R_AARCH64_ADR_PREL_PG_HI21, sym, add, 0, 0); + u32 ld_pos = mc->pos(mc); + if (dst.cls == RC_FP) { + emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0)); + } else { + emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), 9, 0)); + } + mc->emit_reloc_at(mc, sec, ld_pos, + ldst_lo12_reloc_for(sz), sym, add, 0, 0); + return; + } + + i32 off; + u32 base = addr_base(t, addr, &off, 9); if (off < -256 || off > 255) { compiler_panic(t->c, a->loc, "aarch64 load: offset %d out of LDUR range", off); } @@ -1032,10 +1098,48 @@ static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { AAImpl* a = impl_of(t); - i32 off; - u32 base = addr_base(t, addr, &off, 9); u32 sz = ma.size ? ma.size : type_byte_size(addr.type); u32 sidx = size_idx_for_bytes(sz); + + /* OPK_GLOBAL: ADRP scratch, sym ; STR Wt, [scratch, #:lo12:sym]. + * For OPK_IMM source, materialize the value first into x9, then use + * x10 for the global base so the two scratches don't collide. */ + if (addr.kind == OPK_GLOBAL) { + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + ObjSymId sym = addr.v.global.sym; + i64 add = addr.v.global.addend; + + u32 src_reg; + u32 src_is_fp = 0; + if (src.kind == OPK_IMM) { + u32 sf = (sz == 8) ? 1u : 0u; + emit_load_imm(mc, sf, /*Rd=*/9, src.v.imm); + src_reg = 9; + } else if (src.cls == RC_FP) { + src_reg = reg_num(src); + src_is_fp = 1; + } else { + src_reg = reg_num(src); + } + u32 base = (src.kind == OPK_IMM) ? 
10u : 9u; + u32 adrp_pos = mc->pos(mc); + emit32(mc, aa64_adrp_base(base)); + mc->emit_reloc_at(mc, sec, adrp_pos, + R_AARCH64_ADR_PREL_PG_HI21, sym, add, 0, 0); + u32 st_pos = mc->pos(mc); + if (src_is_fp) { + emit32(mc, aa64_str_fp_uimm(sidx, src_reg, base, 0)); + } else { + emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0)); + } + mc->emit_reloc_at(mc, sec, st_pos, + ldst_lo12_reloc_for(sz), sym, add, 0, 0); + return; + } + + i32 off; + u32 base = addr_base(t, addr, &off, 9); if (off < -256 || off > 255) { compiler_panic(t->c, a->loc, "aarch64 store: offset %d out of STUR range", off); } @@ -1098,7 +1202,33 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) aa_panic(t, "addr_of"); } -static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a) { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); } +/* AArch64 TLS Local-Exec materialization. + * mrs xtmp, tpidr_el0 + * add xdst, xtmp, #:tprel_hi12:sym, lsl #12 + * add xdst, xdst, #:tprel_lo12_nc:sym + * The two ADDs carry HI12 / LO12_NC TLSLE relocations; the linker fills in + * the per-target TP-relative offset (image offset + AARCH64_TCB_SIZE). */ +static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) +{ + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + u32 rd = reg_num(dst); + + /* Read thread pointer into x9 (scratch). */ + emit32(mc, aa64_mrs_tpidr_el0(/*Rt=*/9)); + + /* add xdst, x9, #:tprel_hi12:sym, lsl #12 */ + u32 hi_pos = mc->pos(mc); + emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/9, /*imm12=*/0, /*sh=*/1)); + mc->emit_reloc_at(mc, sec, hi_pos, + R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, addend, 0, 0); + + /* add xdst, xdst, #:tprel_lo12_nc:sym */ + u32 lo_pos = mc->pos(mc); + emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0)); + mc->emit_reloc_at(mc, sec, lo_pos, + R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym, addend, 0, 0); +} /* Resolve a dst/src address operand for the aggregate ops below. 
* Accepts OPK_REG (already a pointer) and OPK_LOCAL (= fp - off); diff --git a/test/cg/harness/cg_runner.c b/test/cg/harness/cg_runner.c @@ -346,7 +346,41 @@ static int mode_jit(const char* name) } int (*fn)(void) = (int(*)(void))cfree_jit_lookup(jit, "test_main"); - int result = fn ? fn() : 1; + + /* AArch64 TLS Local-Exec setup, mirroring jit_runner.c. Build a + * thread-local image (16-byte TCB + .tdata copy + .tbss zero-fill) and + * point TPIDR_EL0 at it just before invoking test_main. On Darwin, + * libc functions clobber TPIDR_EL0 (probably via dyld stub binding / + * locale TSD), so msr → call() must be back-to-back with NO libc + * invocations between. */ +#if defined(__aarch64__) || defined(__arm64__) + static char tls_block[8192] __attribute__((aligned(16))); + { + char* td_start = (char*)cfree_jit_lookup(jit, "__tdata_start"); + char* td_end = (char*)cfree_jit_lookup(jit, "__tdata_end"); + unsigned long bs_n = (unsigned long)(unsigned long long) + cfree_jit_lookup(jit, "__tbss_size"); + if (td_start && td_end) { + unsigned long td_n = (unsigned long)(td_end - td_start); + unsigned long i; + /* Plain loops at -O0 stay loops; do NOT use memcpy/memset + * here — those go through dyld's stub binder on first call + * and clobber TPIDR_EL0. */ + for (i = 0; i < td_n; ++i) tls_block[16 + i] = td_start[i]; + for (i = 0; i < bs_n; ++i) tls_block[16 + td_n + i] = 0; + } + } +#endif + + int result; + if (fn) { +#if defined(__aarch64__) || defined(__arm64__) + __asm__ volatile ("msr tpidr_el0, %0" :: "r"(tls_block) : "memory"); +#endif + result = fn(); + } else { + result = 1; + } cfree_jit_free(jit); link_free(lk);