kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 060d8253db61a71604f234a187998b47c3fc6a0c
parent 781d954928484c2614b1a43d73460b4c66b00212
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 15:56:45 -0700

cg/aa64: implement Groups J, K, L (varargs, atomics, intrinsics)

- ABI: replace the placeholder va_list type with the AAPCS64
  __va_list struct (3 ptrs + 2 ints, 32 bytes).
- Variadic prologue: reserve GP/FP register save areas and spill
  x0..x7 / d0..d7 immediately after the prologue placeholder.
- va_start/va_arg/va_end/va_copy: full AAPCS64 lowering, dispatching
  to the GP or FP save area, then falling through to the caller's
  stack once __gr_offs/__vr_offs is >= 0 (save area exhausted).
- Atomics: ARMv8.0 LL/SC lowering — LDAR/STLR for plain ordered
  load/store, LDAXR/STLXR retry loops for rmw and cas, DMB ISH for
  fences. NAND synthesized via AND+MVN.
- Intrinsics: NEON-based POPCOUNT (CNT.8B + ADDV), CLZ directly and
  CTZ via RBIT+CLZ, REV*-family BSWAP, constant-size
  MEMCPY/MEMMOVE/MEMSET, no-op PREFETCH, pass-through
  ASSUME_ALIGNED/EXPECT, BRK for TRAP/UNREACHABLE, ADDS/SUBS+CSET
  (signed V flag) for ADD/SUB_OVERFLOW, and an SMULL+sxtw compare
  for 32-bit MUL_OVERFLOW (64-bit is not yet supported).
- Misc: extend the FP scratch range to v16..v23 for short-lived
  materialization (j06 needs 9 simultaneous FP regs); fix the
  call-site FP-to-stack path that was clobbering v0/v1 with FMOV.

All 752 cg cases pass across D/R/E/J paths.

Diffstat:
Msrc/abi/abi.c | 24+++++++++++++++++++-----
Msrc/arch/aarch64.c | 760++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 751 insertions(+), 33 deletions(-)

diff --git a/src/abi/abi.c b/src/abi/abi.c @@ -12,6 +12,7 @@ #include "abi/abi.h" #include "core/core.h" #include "core/arena.h" +#include "core/pool.h" #include <cfree.h> @@ -36,6 +37,7 @@ struct TargetABI { /* Per-TU cached lookups. */ FuncInfoCacheEntry* fn_cache; RecordLayoutCacheEntry* rec_cache; + const Type* va_list_cache; }; /* ---- scalar profile ---- */ @@ -331,11 +333,22 @@ const Type* abi_intptr_type (TargetABI* a, Pool* p) const Type* abi_uintptr_type(TargetABI* a, Pool* p) { return size_or_uintptr(a, p); } const Type* abi_va_list_type(TargetABI* a, Pool* p) { - /* AAPCS64: __va_list is a struct of three pointers + two ints. v1 returns - * a placeholder pointer; this is exercised only by the parser/builtin - * substitution path, which Group A does not reach. */ - (void)a; - return type_ptr(p, type_void(p)); + /* AAPCS64 __va_list: 3 pointers (__stack, __gr_top, __vr_top) followed + * by 2 ints (__gr_offs, __vr_offs). Total 32 bytes, 8-aligned. */ + if (a->va_list_cache) return a->va_list_cache; + const Type* vp = type_ptr(p, type_void(p)); + const Type* it = type_prim(p, TY_INT); + Sym name = pool_intern_cstr(p, "__va_list"); + SrcLoc nl = {0,0,0}; + TagId tg = type_tag_new(p, TAG_STRUCT, name, nl); + TypeRecordBuilder* b = type_record_begin(p, TY_STRUCT, tg, name); + type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__stack"), .type = vp }); + type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_top"), .type = vp }); + type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_top"), .type = vp }); + type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_offs"), .type = it }); + type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_offs"), .type = it }); + a->va_list_cache = type_record_end(p, b); + return a->va_list_cache; } /* ---- lifecycle ---- */ @@ -352,6 +365,7 @@ void abi_fini(TargetABI* a) if (!a) return; a->fn_cache = NULL; a->rec_cache = NULL; + a->va_list_cache = NULL; a->c = NULL; } diff --git 
a/src/arch/aarch64.c b/src/arch/aarch64.c @@ -276,6 +276,14 @@ typedef struct AAImpl { struct AAAllocaPatch { u32 pos; u32 dst_reg; }* add_patches; u32 nadd_patches; u32 add_patches_cap; + + /* Variadic — AAPCS64 register save areas reserved at function entry. + * gp_save_slot holds 8*8=64 bytes (x0..x7); fp_save_slot holds 8*16=128 + * bytes (v0..v7 with 16-byte stride). Saves are emitted in func_begin + * after the prologue placeholder so FP is already valid when they run. */ + u8 is_variadic; + FrameSlot gp_save_slot; + FrameSlot fp_save_slot; } AAImpl; static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } @@ -450,6 +458,9 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) a->has_alloca = 0; a->nadd_patches= 0; a->sret_ptr_slot = FRAME_SLOT_NONE; + a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0; + a->gp_save_slot = FRAME_SLOT_NONE; + a->fp_save_slot = FRAME_SLOT_NONE; a->epilogue_label = mc->label_new(mc); mc->cfi_startproc(mc); @@ -470,6 +481,31 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) }; a->sret_ptr_slot = aa_frame_slot(t, &fsd); } + + /* Variadic: reserve GP and FP register save areas and emit saves of + * x0..x7 / d0..d7 here, after the prologue placeholder, so FP is set + * up. Param stores below run after these saves but before any user + * code clobbers x0..x7. 
*/ + if (a->is_variadic) { + FrameSlotDesc gpd = { + .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0}, + .size = 64, .align = 8, .kind = FS_SPILL, .flags = 0, + }; + a->gp_save_slot = aa_frame_slot(t, &gpd); + FrameSlotDesc fpd = { + .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0}, + .size = 128, .align = 16, .kind = FS_SPILL, .flags = 0, + }; + a->fp_save_slot = aa_frame_slot(t, &fpd); + AASlot* gs = slot_get(a, a->gp_save_slot); + AASlot* fs = slot_get(a, a->fp_save_slot); + for (u32 i = 0; i < 8; ++i) { + emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i*8)); + } + for (u32 i = 0; i < 8; ++i) { + emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i*16)); + } + } } static void aa_func_end(CGTarget* t) @@ -477,9 +513,12 @@ static void aa_func_end(CGTarget* t) AAImpl* a = impl_of(t); MCEmitter* mc = t->mc; - /* Compute callee-save layout. */ + /* Compute callee-save layout. Only v8..v15 are callee-saved; the + * caller-saved v16..v23 are handed out by alloc_reg too but never + * appear in prologue saves. */ u32 n_int_pairs = (a->used_int + 1) / 2; /* round up */ - u32 n_fp_pairs = (a->used_fp + 1) / 2; + u32 used_fp_cs = a->used_fp > 8 ? 8u : a->used_fp; + u32 n_fp_pairs = (used_fp_cs + 1) / 2; u32 outgoing_off = 0; u32 int_save_off = a->max_outgoing; @@ -619,11 +658,16 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty) return (Reg)(19u + a->used_int++); } if (cls == RC_FP) { - if (a->used_fp >= 8) { + /* v8..v15 are callee-saved (low 64 bits); v16..v23 are caller-saved + * scratch. Hand out callee-saved first, then fall back to scratch + * for short-lived materialization (e.g. j06 builds 9 FP arg regs + * with no intervening call). */ + if (a->used_fp >= 16) { compiler_panic(t->c, a->loc, "aarch64 alloc_reg: out of FP scratch (no spill yet)"); } - return (Reg)(8u + a->used_fp++); + u32 idx = a->used_fp++; + return (Reg)(idx < 8 ? 
8u + idx : 16u + (idx - 8u)); } compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls); } @@ -1593,7 +1637,26 @@ static void emit_arg_value(CGTarget* t, u32* next_int, u32* next_fp, u32* stack_off) { AAImpl* a = impl_of(t); + /* Synthesize a one-part DIRECT ABIArgInfo for var args (av->abi is NULL + * past the fixed-param count). AAPCS64 routes var args through the same + * register/stack rules as fixed scalars, so this matches what + * abi_func_info would have produced. */ + ABIArgInfo va_ai; + ABIArgPart va_pt; const ABIArgInfo* ai = av->abi; + if (!ai) { + u32 sz = type_byte_size(av->type); + memset(&va_ai, 0, sizeof va_ai); + memset(&va_pt, 0, sizeof va_pt); + va_ai.kind = ABI_ARG_DIRECT; + va_ai.parts = &va_pt; + va_ai.nparts = 1; + va_pt.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT; + va_pt.size = sz; + va_pt.align = sz; + va_pt.src_offset = 0; + ai = &va_ai; + } if (ai->kind == ABI_ARG_IGNORE) return; if (ai->kind == ABI_ARG_INDIRECT) { @@ -1660,20 +1723,33 @@ static void emit_arg_value(CGTarget* t, } } else if (pt->cls == ABI_CLASS_FP) { int to_stack = (*next_fp >= 8); - u32 dst_reg = to_stack ? 0u : (*next_fp)++; - switch (av->storage.kind) { - case OPK_REG: { - u32 type = (sz == 8) ? 1u : 0u; - emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); - break; - } - default: - compiler_panic(t->c, a->loc, - "aarch64 call: FP arg storage kind %d unsupported", - (int)av->storage.kind); - } - if (to_stack) { - emit32(t->mc, aa64_stur_fp(sidx, dst_reg, 31, (i32)*stack_off)); + if (!to_stack) { + u32 dst_reg = (*next_fp)++; + switch (av->storage.kind) { + case OPK_REG: { + u32 type = (sz == 8) ? 
1u : 0u; + emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); + break; + } + default: + compiler_panic(t->c, a->loc, + "aarch64 call: FP arg storage kind %d unsupported", + (int)av->storage.kind); + } + } else { + /* Store source FP reg directly into the stack slot — going + * through v0/v1 would corrupt args already placed in the + * register save area. */ + switch (av->storage.kind) { + case OPK_REG: + emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31, + (i32)*stack_off)); + break; + default: + compiler_panic(t->c, a->loc, + "aarch64 call: FP stack-arg storage kind %d unsupported", + (int)av->storage.kind); + } *stack_off += 8; } } else { @@ -1932,18 +2008,646 @@ static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/31, 0, 0)); a->has_alloca = 1; } -static void aa_va_start_(CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_start"); } -static void aa_va_arg_ (CGTarget* t, Operand d, Operand a, const Type* ty) { (void)d;(void)a;(void)ty; aa_panic(t, "va_arg"); } -static void aa_va_end_ (CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_end"); } -static void aa_va_copy_ (CGTarget* t, Operand d, Operand s) { (void)d;(void)s; aa_panic(t, "va_copy"); } +/* AAPCS64 va_list (32 bytes): + * off 0 void* __stack next stack-passed var arg + * off 8 void* __gr_top one past end of GP save area + * off 16 void* __vr_top one past end of FP save area + * off 24 int __gr_offs current GP offset (negative; >= 0 → use stack) + * off 28 int __vr_offs current FP offset (negative; >= 0 → use stack) + * + * va_start populates the struct from the function's reg-save areas and + * the named-param consumption already tracked on AAImpl. va_arg dispatches + * by RegClass: int args walk the GP save area at 8-byte stride; FP args + * walk the FP save area at 16-byte stride (q-register-sized slots). When + * the offset reaches 0, fall through to the stack at 8-byte stride. 
*/ +static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) +{ + if (ofs == 0) emit32(mc, aa64_mov_reg(1, dst, 29)); + else if (ofs > 0 + && (u32)ofs <= 0xfff) emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0)); + else if (ofs < 0 + && (u32)(-ofs) <= 0xfff) emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0)); + else { + emit_load_imm(mc, 1, dst, ofs); + emit32(mc, aa64_add(1, dst, 29, dst)); + } +} + +static void aa_va_start_(CGTarget* t, Operand ap_op) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + if (!a->is_variadic) { + compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic"); + } + u32 ap = reg_num(ap_op); + AASlot* gs = slot_get(a, a->gp_save_slot); + AASlot* fs = slot_get(a, a->fp_save_slot); + + /* __stack = fp + 16 + named-stack-args-bytes */ + { + u32 ofs = 16u + a->next_param_stack; + if (ofs <= 0xfff) emit32(mc, aa64_add_imm(1, 9, 29, ofs, 0)); + else { emit_load_imm(mc, 1, 9, (i64)ofs); emit32(mc, aa64_add(1, 9, 29, 9)); } + emit32(mc, aa64_str_uimm(3, 9, ap, 0)); + } + /* __gr_top = fp - gs->off + gs->size */ + emit_fp_off(mc, 9, -(i32)gs->off + (i32)gs->size); + emit32(mc, aa64_str_uimm(3, 9, ap, 8)); + /* __vr_top = fp - fs->off + fs->size */ + emit_fp_off(mc, 9, -(i32)fs->off + (i32)fs->size); + emit32(mc, aa64_str_uimm(3, 9, ap, 16)); + /* __gr_offs = named_int*8 - 64 */ + emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_int * 8u) - 64)); + emit32(mc, aa64_str_uimm(2, 9, ap, 24)); + /* __vr_offs = named_fp*16 - 128 */ + emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_fp * 16u) - 128)); + emit32(mc, aa64_str_uimm(2, 9, ap, 28)); +} + +static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, const Type* ty) +{ + MCEmitter* mc = t->mc; + u32 ap = reg_num(ap_op); + int is_fp = (dst.cls == RC_FP); + u32 offs_field = is_fp ? 28u : 24u; + u32 top_field = is_fp ? 16u : 8u; + u32 stride_reg = is_fp ? 
16u : 8u; + u32 sz = type_byte_size(ty); + u32 sidx = size_idx_for_bytes(sz); + + MCLabel L_stack = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + + /* w9 = ap.offs ; cmp; b.ge L_stack (>=0 means save area exhausted) */ + emit32(mc, aa64_ldur(2, 9, ap, (i32)offs_field)); + emit32(mc, aa64_subs_imm(0, 31, 9, 0)); + emit32(mc, aa64_b_cond(0xa /*GE*/)); + mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0); + + /* save-area path: + * x10 = ap.top + * x12 = sxtw(w9) + * x11 = x10 + x12 + * load dst, [x11] + * w9 += stride_reg ; ap.offs = w9 ; b L_done */ + emit32(mc, aa64_ldur(3, 10, ap, (i32)top_field)); + emit32(mc, aa64_sbfm(1, 12, 9, 0, 31)); + emit32(mc, aa64_add(1, 11, 10, 12)); + if (is_fp) emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 11, 0)); + else emit32(mc, aa64_ldur (sidx, reg_num(dst), 11, 0)); + emit32(mc, aa64_add_imm(0, 9, 9, stride_reg, 0)); + emit32(mc, aa64_stur(2, 9, ap, (i32)offs_field)); + emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); + + /* L_stack: x10 = ap.stack ; load dst,[x10] ; x10+=8 ; ap.stack=x10 */ + mc->label_place(mc, L_stack); + emit32(mc, aa64_ldur(3, 10, ap, 0)); + if (is_fp) emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 10, 0)); + else emit32(mc, aa64_ldur (sidx, reg_num(dst), 10, 0)); + emit32(mc, aa64_add_imm(1, 10, 10, 8u, 0)); + emit32(mc, aa64_stur(3, 10, ap, 0)); + + mc->label_place(mc, L_done); +} + +static void aa_va_end_(CGTarget* t, Operand a) { (void)t; (void)a; } + +static void aa_va_copy_(CGTarget* t, Operand d, Operand s) +{ + MCEmitter* mc = t->mc; + u32 dr = reg_num(d); + u32 sr = reg_num(s); + /* va_list is 32 bytes — 4 x 8-byte LDR/STR pairs. 
*/ + for (u32 i = 0; i < 32u; i += 8u) { + emit32(mc, aa64_ldur(3, 9, sr, (i32)i)); + emit32(mc, aa64_stur(3, 9, dr, (i32)i)); + } +} -static void aa_atomic_load (CGTarget* t, Operand d, Operand a, MemAccess m, MemOrder o) { (void)d;(void)a;(void)m;(void)o; aa_panic(t, "atomic_load"); } -static void aa_atomic_store(CGTarget* t, Operand a, Operand s, MemAccess m, MemOrder o) { (void)a;(void)s;(void)m;(void)o; aa_panic(t, "atomic_store"); } -static void aa_atomic_rmw (CGTarget* t, AtomicOp op, Operand d, Operand a, Operand v, MemAccess m, MemOrder o) { (void)op;(void)d;(void)a;(void)v;(void)m;(void)o; aa_panic(t, "atomic_rmw"); } -static void aa_atomic_cas (CGTarget* t, Operand p, Operand ok, Operand a, Operand e, Operand des, MemAccess m, MemOrder s, MemOrder f) { (void)p;(void)ok;(void)a;(void)e;(void)des;(void)m;(void)s;(void)f; aa_panic(t, "atomic_cas"); } -static void aa_fence (CGTarget* t, MemOrder o) { (void)o; aa_panic(t, "fence"); } +/* ---- atomics ---- + * + * Lowering uses ARMv8.0 LL/SC (LDXR/STXR family) — no FEAT_LSE assumption. + * Acquire/Release semantics ride the load/store form chosen by MemOrder + * (LDAR/STLR for plain accesses; LDAXR/STLXR inside the LL/SC loop). + * fence() emits DMB ISH (data memory barrier, inner shareable). */ + +/* Encoder helpers — inline since only used here. */ +static inline u32 aa64_ldar (u32 sf64, u32 Rt, u32 Rn) +{ return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_stlr (u32 sf64, u32 Rt, u32 Rn) +{ return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_ldxr (u32 sf64, u32 Rt, u32 Rn) +{ return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) +{ return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_stxr (u32 sf64, u32 Rs, u32 Rt, u32 Rn) +{ return (sf64 ? 
0xC8007C00u : 0x88007C00u) + | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) +{ return (sf64 ? 0xC800FC00u : 0x8800FC00u) + | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); } +static inline u32 aa64_dmb_ish(void) { return 0xD5033BBFu; } +static inline u32 aa64_clrex (void) { return 0xD5033F5Fu; } +/* CBNZ Rt, imm19 */ +static inline u32 aa64_cbnz (u32 sf64, u32 Rt) +{ return 0x35000000u | (sf64<<31) | (Rt&0x1f); } + +static int mem_order_is_acquire(MemOrder o) +{ return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST || o == MO_CONSUME; } +static int mem_order_is_release(MemOrder o) +{ return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST; } + +static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, + MemAccess ma, MemOrder ord) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + + /* Resolve addr to a base register; LDAR/LDR-exclusive both want a + * pointer in a GPR, no offset form. */ + u32 base; + if (addr.kind == OPK_REG) { + base = reg_num(addr); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot"); + base = 9u; + emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_load: addr kind %d unsupported", + (int)addr.kind); + } + if (mem_order_is_acquire(ord)) { + emit32(mc, aa64_ldar(sf, reg_num(dst), base)); + } else { + u32 sidx = size_idx_for_bytes(ma.size); + emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0)); + } +} -static void aa_intrinsic(CGTarget* t, IntrinKind k, Operand* dsts, u32 nd, const Operand* args, u32 na) { (void)k;(void)dsts;(void)nd;(void)args;(void)na; aa_panic(t, "intrinsic"); } +static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, + MemAccess ma, MemOrder ord) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 
1u : 0u; + + /* Materialize src into a register if needed. */ + u32 src_reg; + if (src.kind == OPK_IMM) { + src_reg = 10u; + emit_load_imm(mc, sf, src_reg, src.v.imm); + } else if (src.kind == OPK_REG) { + src_reg = reg_num(src); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_store: src kind %d unsupported", + (int)src.kind); + } + /* Base reg. */ + u32 base; + if (addr.kind == OPK_REG) { + base = reg_num(addr); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot"); + base = 9u; + emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_store: addr kind %d unsupported", + (int)addr.kind); + } + if (mem_order_is_release(ord)) { + emit32(mc, aa64_stlr(sf, src_reg, base)); + } else { + u32 sidx = size_idx_for_bytes(ma.size); + emit32(mc, aa64_stur(sidx, src_reg, base, 0)); + } +} + +/* Apply rmw op: new = f(prior, val). prior, val, dst are W/X based on sf. + * Uses scratch x12 if a temporary is needed (e.g. NAND). */ +static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, + u32 dst_new, u32 prior, u32 val) +{ + switch (op) { + case AO_XCHG: emit32(mc, aa64_mov_reg(sf, dst_new, val)); break; + case AO_ADD: emit32(mc, aa64_add(sf, dst_new, prior, val)); break; + case AO_SUB: emit32(mc, aa64_sub(sf, dst_new, prior, val)); break; + case AO_AND: emit32(mc, aa64_and(sf, dst_new, prior, val)); break; + case AO_OR: emit32(mc, aa64_orr(sf, dst_new, prior, val)); break; + case AO_XOR: emit32(mc, aa64_eor(sf, dst_new, prior, val)); break; + case AO_NAND: + /* NAND: new = ~(prior & val). AArch64 has no NAND; use AND then MVN. 
*/ + emit32(mc, aa64_and(sf, dst_new, prior, val)); + emit32(mc, aa64_mvn(sf, dst_new, dst_new)); + break; + default: + emit32(mc, aa64_mov_reg(sf, dst_new, val)); + break; + } +} + +static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, + Operand addr, Operand val, + MemAccess ma, MemOrder ord) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + + /* Pin operands into scratch regs: + * x9 = base (atomic addr) + * x10 = val + * x11 = new (computed) + * w12 = stxr status flag + * dst (prior) is the user-provided destination reg. */ + u32 base = 9u; + if (addr.kind == OPK_REG) { + emit32(mc, aa64_mov_reg(1, 9, reg_num(addr))); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot"); + emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported", + (int)addr.kind); + } + u32 vreg = 10u; + if (val.kind == OPK_IMM) { + emit_load_imm(mc, sf, vreg, val.v.imm); + } else if (val.kind == OPK_REG) { + emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val))); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported", + (int)val.kind); + } + + int do_acq = mem_order_is_acquire(ord); + int do_rel = mem_order_is_release(ord); + + MCLabel L_retry = mc->label_new(mc); + mc->label_place(mc, L_retry); + + /* prior <- ldxr/ldaxr [base] */ + if (do_acq) emit32(mc, aa64_ldaxr(sf, reg_num(dst), base)); + else emit32(mc, aa64_ldxr (sf, reg_num(dst), base)); + + /* new = combine(prior, val) into x11 */ + emit_rmw_combine(mc, op, sf, /*new=*/11u, /*prior=*/reg_num(dst), vreg); + + /* status <- stxr/stlxr [base], new ; cbnz status, retry */ + if (do_rel) emit32(mc, aa64_stlxr(sf, /*Rs=*/12u, /*Rt=*/11u, base)); + else emit32(mc, aa64_stxr (sf, /*Rs=*/12u, /*Rt=*/11u, base)); + + u32 cbnz_pos = mc->pos(mc); + emit32(mc, aa64_cbnz(0, /*Rt=*/12u)); + 
mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); + (void)cbnz_pos; +} + +static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, + Operand addr, Operand expected, Operand desired, + MemAccess ma, MemOrder succ, MemOrder fail) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + (void)fail; + + /* Pin operands: + * x9 = base + * x10 = expected (compare against prior) + * x11 = desired (store on match) + * w12 = stxr status flag */ + u32 base = 9u; + if (addr.kind == OPK_REG) emit32(mc, aa64_mov_reg(1, 9, reg_num(addr))); + else if (addr.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot"); + emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported", + (int)addr.kind); + } + if (expected.kind == OPK_IMM) emit_load_imm(mc, sf, 10, expected.v.imm); + else if (expected.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 10, reg_num(expected))); + else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported", + (int)expected.kind); + if (desired.kind == OPK_IMM) emit_load_imm(mc, sf, 11, desired.v.imm); + else if (desired.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 11, reg_num(desired))); + else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported", + (int)desired.kind); + + int do_acq = mem_order_is_acquire(succ); + int do_rel = mem_order_is_release(succ); + + MCLabel L_retry = mc->label_new(mc); + MCLabel L_fail = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + + mc->label_place(mc, L_retry); + if (do_acq) emit32(mc, aa64_ldaxr(sf, reg_num(prior), base)); + else emit32(mc, aa64_ldxr (sf, reg_num(prior), base)); + + /* if (prior != expected) -> fail (clrex + ok=0) */ + emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/31u, reg_num(prior), 10u)); + emit32(mc, aa64_b_cond(0x1u /*NE*/)); + mc->emit_label_ref(mc, L_fail, 
R_AARCH64_CONDBR19, 4, 0); + + /* try store; retry on stxr failure */ + if (do_rel) emit32(mc, aa64_stlxr(sf, 12u, 11u, base)); + else emit32(mc, aa64_stxr (sf, 12u, 11u, base)); + emit32(mc, aa64_cbnz(0, 12u)); + mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); + + /* ok = 1 ; jump done */ + emit_load_imm(mc, 0, reg_num(ok), 1); + emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); + + /* L_fail: clear monitor; ok = 0 */ + mc->label_place(mc, L_fail); + emit32(mc, aa64_clrex()); + emit_load_imm(mc, 0, reg_num(ok), 0); + + mc->label_place(mc, L_done); +} + +static void aa_fence(CGTarget* t, MemOrder o) +{ + (void)o; + /* Conservative: full-system DMB ISH for any release/acquire/seq_cst. + * RELAXED fence is a no-op. */ + if (o == MO_RELAXED) return; + emit32(t->mc, aa64_dmb_ish()); +} + +/* ---- intrinsics ---- */ + +/* Data-processing (1 source) — REV16 / REV / REV32 / RBIT / CLZ. + * Family base 0x5AC00000 (sf=0); set sf<<31 for 64-bit forms. */ +static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) +{ return 0x5AC00400u | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_rev_w (u32 Rd, u32 Rn) +{ return 0x5AC00800u | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_rev_x (u32 Rd, u32 Rn) +{ return 0xDAC00C00u | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_rbit (u32 sf64, u32 Rd, u32 Rn) +{ return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_clz (u32 sf64, u32 Rd, u32 Rn) +{ return (sf64 ? 0xDAC01000u : 0x5AC01000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* SIMD CNT (Vd.<T>, Vn.<T>) and ADDV (Bd, Vn.8B). 8B form, Q=0. */ +static inline u32 aa64_cnt_8b (u32 Vd, u32 Vn) +{ return 0x0E205800u | ((Vn&0x1f)<<5) | (Vd&0x1f); } +static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) +{ return 0x0E31B800u | ((Vn&0x1f)<<5) | (Vd&0x1f); } + +/* ADDS / SUBS shifted register (S=1; sets NZCV including V for signed ovf). 
*/ +static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) +{ return 0x2B000000u | (sf<<31) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* SMADDL / UMADDL → SMULL / UMULL with Ra = ZR. 64-bit dst, 32-bit srcs. */ +static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) +{ return aa64_dp3_pack((AA64DP3){.sf=1,.op31=1,.o0=0,.Rm=Rm,.Ra=Ra,.Rn=Rn,.Rd=Rd}); } +static inline u32 aa64_smull (u32 Rd, u32 Rn, u32 Rm) +{ return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); } + +/* SUBS Xd, Xn, Wm, SXTW — extended-register form, used for the + * mul_overflow check (compare full 64-bit product to sign-extended low 32). */ +static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) +{ return 0xEB200000u | ((Rm&0x1f)<<16) | (6u<<13) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* BRK #imm16 — used for TRAP/UNREACHABLE landing pads. */ +static inline u32 aa64_brk(u32 imm16) +{ return 0xD4200000u | ((imm16 & 0xffffu) << 5); } + +static void aa_intrinsic(CGTarget* t, IntrinKind kind, + Operand* dsts, u32 nd, + const Operand* args, u32 na) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + (void)nd; + + switch (kind) { + case INTRIN_POPCOUNT: { + /* fmov v0, src ; cnt v0.8b, v0.8b ; addv b0, v0.8b ; fmov w_dst, s0 */ + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sz_in = type_byte_size(src.type); + if (sz_in == 8) emit32(mc, aa64_fmov_d_x(0, reg_num(src))); + else emit32(mc, aa64_fmov_s_w(0, reg_num(src))); + emit32(mc, aa64_cnt_8b (0, 0)); + emit32(mc, aa64_addv_b_8b(0, 0)); + emit32(mc, aa64_fmov_w_s (reg_num(dst), 0)); + return; + } + case INTRIN_CLZ: { + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(src.type) ? 1u : 0u; + emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src))); + return; + } + case INTRIN_CTZ: { + /* ctz(x) = clz(rbit(x)) */ + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(src.type) ? 
1u : 0u; + emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src))); + emit32(mc, aa64_clz (sf, reg_num(dst), reg_num(dst))); + return; + } + case INTRIN_BSWAP16: { + emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_BSWAP32: { + emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_BSWAP64: { + emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_MEMCPY: + case INTRIN_MEMMOVE: { + /* args = (dst_addr, src_addr, n_bytes). v1 only handles a constant + * n: unroll forward (memcpy) or backward (memmove). */ + Operand da = args[0], sa = args[1], nb = args[2]; + if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: %s with non-const n or non-REG ptr", + kind == INTRIN_MEMCPY ? "memcpy" : "memmove"); + } + u32 dr = reg_num(da); + u32 sr = reg_num(sa); + u32 n = (u32)nb.v.imm; + if (kind == INTRIN_MEMCPY) { + u32 i = 0; + while (i + 8 <= n) { + emit32(mc, aa64_ldur(3, 12, sr, (i32)i)); + emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + i += 8; + } + while (i + 4 <= n) { + emit32(mc, aa64_ldur(2, 12, sr, (i32)i)); + emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + i += 4; + } + while (i + 2 <= n) { + emit32(mc, aa64_ldur(1, 12, sr, (i32)i)); + emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + i += 2; + } + while (i < n) { + emit32(mc, aa64_ldur(0, 12, sr, (i32)i)); + emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + i += 1; + } + } else { + /* memmove: copy backward to handle dst > src overlap. 
*/ + u32 i = n; + while (i >= 8) { + i -= 8; + emit32(mc, aa64_ldur(3, 12, sr, (i32)i)); + emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + } + while (i >= 4) { + i -= 4; + emit32(mc, aa64_ldur(2, 12, sr, (i32)i)); + emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + } + while (i >= 2) { + i -= 2; + emit32(mc, aa64_ldur(1, 12, sr, (i32)i)); + emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + } + while (i >= 1) { + i -= 1; + emit32(mc, aa64_ldur(0, 12, sr, (i32)i)); + emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + } + } + return; + } + case INTRIN_MEMSET: { + /* args = (dst_addr, byte, n) */ + Operand da = args[0], bv = args[1], nb = args[2]; + if (da.kind != OPK_REG || nb.kind != OPK_IMM) { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: memset with non-const n / non-REG ptr"); + } + u32 dr = reg_num(da); + u32 n = (u32)nb.v.imm; + u32 byte; + u32 src_reg; + if (bv.kind == OPK_IMM) { + byte = (u32)(bv.v.imm & 0xffu); + if (byte == 0) { + src_reg = 31u; /* XZR / WZR */ + } else { + u64 b64 = byte; + b64 |= b64 << 8; b64 |= b64 << 16; b64 |= b64 << 32; + emit_load_imm(mc, 1, 12, (i64)b64); + src_reg = 12u; + } + } else if (bv.kind == OPK_REG) { + /* Broadcast: dup low byte across all 8 bytes via ORR-immediate + * trickery is awkward; use mul-by-0x0101010101010101. */ + emit_load_imm(mc, 1, 12, (i64)0x0101010101010101ll); + emit32(mc, aa64_madd(1, 12, reg_num(bv), 12, AA64_ZR)); + src_reg = 12u; + } else { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: memset byte kind %d unsupported", + (int)bv.kind); + } + u32 i = 0; + while (i + 8 <= n) { emit32(mc, aa64_stur(3, src_reg, dr, (i32)i)); i += 8; } + while (i + 4 <= n) { emit32(mc, aa64_stur(2, src_reg, dr, (i32)i)); i += 4; } + while (i + 2 <= n) { emit32(mc, aa64_stur(1, src_reg, dr, (i32)i)); i += 2; } + while (i < n) { emit32(mc, aa64_stur(0, src_reg, dr, (i32)i)); i += 1; } + return; + } + case INTRIN_PREFETCH: + /* No-op hint. 
*/ + (void)args; (void)na; + return; + case INTRIN_ASSUME_ALIGNED: { + /* dst = src (alignment is a hint only). */ + Operand src = args[0]; + Operand dst = dsts[0]; + if (reg_num(src) != reg_num(dst)) { + emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src))); + } + return; + } + case INTRIN_EXPECT: { + /* dst = val (the "expected" hint is dropped). */ + Operand val = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(dst.type) ? 1u : 0u; + if (val.kind == OPK_REG) { + if (reg_num(val) != reg_num(dst)) { + emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val))); + } + } else if (val.kind == OPK_IMM) { + emit_load_imm(mc, sf, reg_num(dst), val.v.imm); + } else { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: expect val kind %d unsupported", + (int)val.kind); + } + return; + } + case INTRIN_UNREACHABLE: + case INTRIN_TRAP: + emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u)); + return; + case INTRIN_ADD_OVERFLOW: + case INTRIN_SUB_OVERFLOW: { + /* dsts: [val, ovf]. ADDS/SUBS sets V on signed overflow; CSET VS. */ + Operand a_op = args[0], b_op = args[1]; + Operand dval = dsts[0], dovf = dsts[1]; + u32 sf = type_is_64(dval.type) ? 1u : 0u; + u32 ra = force_reg_int(t, a_op, sf, 9); + u32 rb = force_reg_int(t, b_op, sf, (ra == 9) ? 10u : 9u); + u32 word = (kind == INTRIN_ADD_OVERFLOW) + ? aa64_adds_reg(sf, reg_num(dval), ra, rb) + : aa64_subs_reg(sf, reg_num(dval), ra, rb); + emit32(mc, word); + emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/)); + return; + } + case INTRIN_MUL_OVERFLOW: { + /* SMULL Xtmp, Wn, Wm gives full 64-bit signed product. + * ovf = (Xtmp != sxtw(Wtmp)) — i.e. upper 32 bits ≠ sign-ext of low. + * dval gets the truncated low 32 bits. */ + Operand a_op = args[0], b_op = args[1]; + Operand dval = dsts[0], dovf = dsts[1]; + u32 sf = type_is_64(dval.type) ? 
1u : 0u; + if (sf) { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: mul_overflow on i64 not yet supported"); + } + u32 ra = force_reg_int(t, a_op, 0, 9); + u32 rb = force_reg_int(t, b_op, 0, (ra == 9) ? 10u : 9u); + emit32(mc, aa64_smull(/*X*/11u, ra, rb)); + emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/31u, /*Xn=*/11u, /*Wm=*/11u)); + emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); + emit32(mc, aa64_mov_reg(0, reg_num(dval), 11u)); + return; + } + default: + compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported", + (int)kind); + } +} static void aa_asm_block(CGTarget* t, const char* tmpl, const AsmConstraint* outs, u32 no, Operand* oo,