commit 060d8253db61a71604f234a187998b47c3fc6a0c
parent 781d954928484c2614b1a43d73460b4c66b00212
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 15:56:45 -0700
cg/aa64: implement Groups J, K, L (varargs, atomics, intrinsics)
- ABI: replace the placeholder va_list type with the AAPCS64
__va_list struct (3 ptrs + 2 ints, 32 bytes).
- Variadic prologue: reserve GP/FP register save areas and spill
x0..x7 / d0..d7 immediately after the prologue placeholder.
- va_start/va_arg/va_end/va_copy: full AAPCS64 lowering; va_arg reads
from the GP or FP register save area while __gr_offs/__vr_offs is
negative and falls through to the caller's stack once it is >= 0.
- Atomics: ARMv8.0 LL/SC lowering — LDAR/STLR for plain ordered
load/store, LDAXR/STLXR retry loops for rmw and cas, DMB ISH for
fences. NAND synthesized via AND+MVN.
- Intrinsics: NEON-based POPCOUNT (CNT.8B + ADDV), CLZ/CTZ via RBIT,
REV*-family BSWAP, constant-size MEMCPY/MEMMOVE/MEMSET, no-op
PREFETCH/ASSUME_ALIGNED/EXPECT, BRK for TRAP/UNREACHABLE, and
ADDS/SUBS+CSET (signed V flag) for ADD/SUB_OVERFLOW with
SMULL+sxtw compare for MUL_OVERFLOW.
- Misc: extend the FP scratch range to v16..v23 for short-lived
materialization (j06 needs 9 simultaneous FP regs); fix the
call-site FP-to-stack path that was clobbering v0/v1 with FMOV.
All 752 cg cases pass across D/R/E/J paths.
Diffstat:
2 files changed, 751 insertions(+), 33 deletions(-)
diff --git a/src/abi/abi.c b/src/abi/abi.c
@@ -12,6 +12,7 @@
#include "abi/abi.h"
#include "core/core.h"
#include "core/arena.h"
+#include "core/pool.h"
#include <cfree.h>
@@ -36,6 +37,7 @@ struct TargetABI {
/* Per-TU cached lookups. */
FuncInfoCacheEntry* fn_cache;
RecordLayoutCacheEntry* rec_cache;
+ const Type* va_list_cache;
};
/* ---- scalar profile ---- */
@@ -331,11 +333,22 @@ const Type* abi_intptr_type (TargetABI* a, Pool* p)
const Type* abi_uintptr_type(TargetABI* a, Pool* p) { return size_or_uintptr(a, p); }
const Type* abi_va_list_type(TargetABI* a, Pool* p)
{
- /* AAPCS64: __va_list is a struct of three pointers + two ints. v1 returns
- * a placeholder pointer; this is exercised only by the parser/builtin
- * substitution path, which Group A does not reach. */
- (void)a;
- return type_ptr(p, type_void(p));
+ /* AAPCS64 __va_list: 3 pointers (__stack, __gr_top, __vr_top) followed
+ * by 2 ints (__gr_offs, __vr_offs). Total 32 bytes, 8-aligned.
+ *
+ * The record is built on the first call and memoized in
+ * a->va_list_cache; every later call returns that cached Type without
+ * touching p again. Fields are appended in the order listed above via
+ * the type_record_begin / type_record_field / type_record_end builder,
+ * under a struct tag named "__va_list" with a zeroed source location. */
+ if (a->va_list_cache) return a->va_list_cache;
+ const Type* vp = type_ptr(p, type_void(p));
+ const Type* it = type_prim(p, TY_INT);
+ Sym name = pool_intern_cstr(p, "__va_list");
+ SrcLoc nl = {0,0,0};
+ TagId tg = type_tag_new(p, TAG_STRUCT, name, nl);
+ TypeRecordBuilder* b = type_record_begin(p, TY_STRUCT, tg, name);
+ type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__stack"), .type = vp });
+ type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_top"), .type = vp });
+ type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_top"), .type = vp });
+ type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_offs"), .type = it });
+ type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_offs"), .type = it });
+ a->va_list_cache = type_record_end(p, b);
+ return a->va_list_cache;
}
/* ---- lifecycle ---- */
@@ -352,6 +365,7 @@ void abi_fini(TargetABI* a)
if (!a) return;
a->fn_cache = NULL;
a->rec_cache = NULL;
+ a->va_list_cache = NULL;
a->c = NULL;
}
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -276,6 +276,14 @@ typedef struct AAImpl {
struct AAAllocaPatch { u32 pos; u32 dst_reg; }* add_patches;
u32 nadd_patches;
u32 add_patches_cap;
+
+ /* Variadic — AAPCS64 register save areas reserved at function entry.
+ * gp_save_slot holds 8*8=64 bytes (x0..x7); fp_save_slot holds 8*16=128
+ * bytes (v0..v7 with 16-byte stride). Saves are emitted in func_begin
+ * after the prologue placeholder so FP is already valid when they run. */
+ u8 is_variadic;
+ FrameSlot gp_save_slot;
+ FrameSlot fp_save_slot;
} AAImpl;
static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
@@ -450,6 +458,9 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
a->has_alloca = 0;
a->nadd_patches= 0;
a->sret_ptr_slot = FRAME_SLOT_NONE;
+ a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+ a->gp_save_slot = FRAME_SLOT_NONE;
+ a->fp_save_slot = FRAME_SLOT_NONE;
a->epilogue_label = mc->label_new(mc);
mc->cfi_startproc(mc);
@@ -470,6 +481,31 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
};
a->sret_ptr_slot = aa_frame_slot(t, &fsd);
}
+
+ /* Variadic: reserve GP and FP register save areas and emit saves of
+ * x0..x7 / d0..d7 here, after the prologue placeholder, so FP is set
+ * up. Param stores below run after these saves but before any user
+ * code clobbers x0..x7. */
+ if (a->is_variadic) {
+ FrameSlotDesc gpd = {
+ .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+ .size = 64, .align = 8, .kind = FS_SPILL, .flags = 0,
+ };
+ a->gp_save_slot = aa_frame_slot(t, &gpd);
+ FrameSlotDesc fpd = {
+ .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+ .size = 128, .align = 16, .kind = FS_SPILL, .flags = 0,
+ };
+ a->fp_save_slot = aa_frame_slot(t, &fpd);
+ AASlot* gs = slot_get(a, a->gp_save_slot);
+ AASlot* fs = slot_get(a, a->fp_save_slot);
+ for (u32 i = 0; i < 8; ++i) {
+ emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i*8));
+ }
+ for (u32 i = 0; i < 8; ++i) {
+ emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i*16));
+ }
+ }
}
static void aa_func_end(CGTarget* t)
@@ -477,9 +513,12 @@ static void aa_func_end(CGTarget* t)
AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
- /* Compute callee-save layout. */
+ /* Compute callee-save layout. Only v8..v15 are callee-saved; the
+ * caller-saved v16..v23 are handed out by alloc_reg too but never
+ * appear in prologue saves. */
u32 n_int_pairs = (a->used_int + 1) / 2; /* round up */
- u32 n_fp_pairs = (a->used_fp + 1) / 2;
+ u32 used_fp_cs = a->used_fp > 8 ? 8u : a->used_fp;
+ u32 n_fp_pairs = (used_fp_cs + 1) / 2;
u32 outgoing_off = 0;
u32 int_save_off = a->max_outgoing;
@@ -619,11 +658,16 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty)
return (Reg)(19u + a->used_int++);
}
if (cls == RC_FP) {
- if (a->used_fp >= 8) {
+ /* v8..v15 are callee-saved (low 64 bits); v16..v23 are caller-saved
+ * scratch. Hand out callee-saved first, then fall back to scratch
+ * for short-lived materialization (e.g. j06 builds 9 FP arg regs
+ * with no intervening call). */
+ if (a->used_fp >= 16) {
compiler_panic(t->c, a->loc,
"aarch64 alloc_reg: out of FP scratch (no spill yet)");
}
- return (Reg)(8u + a->used_fp++);
+ u32 idx = a->used_fp++;
+ return (Reg)(idx < 8 ? 8u + idx : 16u + (idx - 8u));
}
compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
}
@@ -1593,7 +1637,26 @@ static void emit_arg_value(CGTarget* t,
u32* next_int, u32* next_fp, u32* stack_off)
{
AAImpl* a = impl_of(t);
+ /* Synthesize a one-part DIRECT ABIArgInfo for var args (av->abi is NULL
+ * past the fixed-param count). AAPCS64 routes var args through the same
+ * register/stack rules as fixed scalars, so this matches what
+ * abi_func_info would have produced. */
+ ABIArgInfo va_ai;
+ ABIArgPart va_pt;
const ABIArgInfo* ai = av->abi;
+ if (!ai) {
+ u32 sz = type_byte_size(av->type);
+ memset(&va_ai, 0, sizeof va_ai);
+ memset(&va_pt, 0, sizeof va_pt);
+ va_ai.kind = ABI_ARG_DIRECT;
+ va_ai.parts = &va_pt;
+ va_ai.nparts = 1;
+ va_pt.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT;
+ va_pt.size = sz;
+ va_pt.align = sz;
+ va_pt.src_offset = 0;
+ ai = &va_ai;
+ }
if (ai->kind == ABI_ARG_IGNORE) return;
if (ai->kind == ABI_ARG_INDIRECT) {
@@ -1660,20 +1723,33 @@ static void emit_arg_value(CGTarget* t,
}
} else if (pt->cls == ABI_CLASS_FP) {
int to_stack = (*next_fp >= 8);
- u32 dst_reg = to_stack ? 0u : (*next_fp)++;
- switch (av->storage.kind) {
- case OPK_REG: {
- u32 type = (sz == 8) ? 1u : 0u;
- emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
- break;
- }
- default:
- compiler_panic(t->c, a->loc,
- "aarch64 call: FP arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- if (to_stack) {
- emit32(t->mc, aa64_stur_fp(sidx, dst_reg, 31, (i32)*stack_off));
+ if (!to_stack) {
+ u32 dst_reg = (*next_fp)++;
+ switch (av->storage.kind) {
+ case OPK_REG: {
+ u32 type = (sz == 8) ? 1u : 0u;
+ emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
+ break;
+ }
+ default:
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: FP arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
+ } else {
+ /* Store source FP reg directly into the stack slot — going
+ * through v0/v1 would corrupt args already placed in the
+ * register save area. */
+ switch (av->storage.kind) {
+ case OPK_REG:
+ emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31,
+ (i32)*stack_off));
+ break;
+ default:
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: FP stack-arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
*stack_off += 8;
}
} else {
@@ -1932,18 +2008,646 @@ static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align)
emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/31, 0, 0));
a->has_alloca = 1;
}
-static void aa_va_start_(CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_start"); }
-static void aa_va_arg_ (CGTarget* t, Operand d, Operand a, const Type* ty) { (void)d;(void)a;(void)ty; aa_panic(t, "va_arg"); }
-static void aa_va_end_ (CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_end"); }
-static void aa_va_copy_ (CGTarget* t, Operand d, Operand s) { (void)d;(void)s; aa_panic(t, "va_copy"); }
+/* AAPCS64 va_list (32 bytes):
+ * off 0 void* __stack next stack-passed var arg
+ * off 8 void* __gr_top one past end of GP save area
+ * off 16 void* __vr_top one past end of FP save area
+ * off 24 int __gr_offs current GP offset (negative; >= 0 → use stack)
+ * off 28 int __vr_offs current FP offset (negative; >= 0 → use stack)
+ *
+ * va_start populates the struct from the function's reg-save areas and
+ * the named-param consumption already tracked on AAImpl. va_arg dispatches
+ * by RegClass: int args walk the GP save area at 8-byte stride; FP args
+ * walk the FP save area at 16-byte stride (q-register-sized slots). When
+ * the offset reaches 0, fall through to the stack at 8-byte stride. */
+/* Emit code leaving (x29 + ofs) in X register dst.
+ * Picks the shortest form: a register move for 0, a single ADD/SUB
+ * immediate when |ofs| fits in 12 bits, otherwise the offset is loaded
+ * into dst and x29 is added to it. */
+static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs)
+{
+    if (ofs == 0) {
+        emit32(mc, aa64_mov_reg(1, dst, 29));
+        return;
+    }
+    if (ofs > 0 && (u32)ofs <= 0xfff) {
+        emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0));
+        return;
+    }
+    if (ofs < 0 && (u32)(-ofs) <= 0xfff) {
+        emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0));
+        return;
+    }
+    /* Out of immediate range: build the offset, then add the frame pointer. */
+    emit_load_imm(mc, 1, dst, ofs);
+    emit32(mc, aa64_add(1, dst, 29, dst));
+}
+
+/* Lower va_start: fill in the five __va_list fields of the object whose
+ * address is held in the register named by ap_op. x9/w9 is the scratch
+ * register for every computed field. Panics when the current function was
+ * not marked variadic at function entry. */
+static void aa_va_start_(CGTarget* t, Operand ap_op)
+{
+    AAImpl*    impl = impl_of(t);
+    MCEmitter* mc   = t->mc;
+    const u32  tmp  = 9u;
+
+    if (!impl->is_variadic) {
+        compiler_panic(t->c, impl->loc, "aarch64 va_start: function not variadic");
+    }
+
+    u32     va      = reg_num(ap_op);
+    AASlot* gp_area = slot_get(impl, impl->gp_save_slot);
+    AASlot* fp_area = slot_get(impl, impl->fp_save_slot);
+
+    /* offset 0: __stack = x29 + 16 + bytes of named stack-passed params */
+    u32 stack_ofs = 16u + impl->next_param_stack;
+    if (stack_ofs > 0xfff) {
+        emit_load_imm(mc, 1, tmp, (i64)stack_ofs);
+        emit32(mc, aa64_add(1, tmp, 29, tmp));
+    } else {
+        emit32(mc, aa64_add_imm(1, tmp, 29, stack_ofs, 0));
+    }
+    emit32(mc, aa64_str_uimm(3, tmp, va, 0));
+
+    /* offset 8: __gr_top = one past the end of the GP save area */
+    i32 gr_top_ofs = -(i32)gp_area->off + (i32)gp_area->size;
+    emit_fp_off(mc, tmp, gr_top_ofs);
+    emit32(mc, aa64_str_uimm(3, tmp, va, 8));
+
+    /* offset 16: __vr_top = one past the end of the FP save area */
+    i32 vr_top_ofs = -(i32)fp_area->off + (i32)fp_area->size;
+    emit_fp_off(mc, tmp, vr_top_ofs);
+    emit32(mc, aa64_str_uimm(3, tmp, va, 16));
+
+    /* offset 24: __gr_offs = 8 * (named GP regs) - 64 */
+    i32 gr_offs = (i32)(impl->next_param_int * 8u) - 64;
+    emit_load_imm(mc, 0, tmp, (i64)gr_offs);
+    emit32(mc, aa64_str_uimm(2, tmp, va, 24));
+
+    /* offset 28: __vr_offs = 16 * (named FP regs) - 128 */
+    i32 vr_offs = (i32)(impl->next_param_fp * 16u) - 128;
+    emit_load_imm(mc, 0, tmp, (i64)vr_offs);
+    emit32(mc, aa64_str_uimm(2, tmp, va, 28));
+}
+
+/* Lower va_arg: fetch the next variadic argument of type ty into dst,
+ * reading the __va_list addressed by the register in ap_op.
+ *
+ * The register class of dst picks the FP (__vr_*) or GP (__gr_*) fields.
+ * While the selected offset field is negative the value is read from the
+ * register save area at (top + offset) and the offset advances by 16 (FP)
+ * or 8 (GP); otherwise it is read from __stack, which then advances by 8.
+ * Scratch registers: w9 (offset), x10 (top / stack pointer), x11 (address),
+ * x12 (sign-extended offset). */
+static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, const Type* ty)
+{
+    MCEmitter* mc = t->mc;
+    u32 va = reg_num(ap_op);
+    int want_fp = (dst.cls == RC_FP);
+
+    u32 offs_at;     /* byte offset of __vr_offs / __gr_offs in va_list */
+    u32 top_at;      /* byte offset of __vr_top / __gr_top in va_list   */
+    u32 slot_bytes;  /* save-area stride for one argument               */
+    if (want_fp) {
+        offs_at = 28u; top_at = 16u; slot_bytes = 16u;
+    } else {
+        offs_at = 24u; top_at = 8u;  slot_bytes = 8u;
+    }
+
+    u32 nbytes = type_byte_size(ty);
+    u32 sidx   = size_idx_for_bytes(nbytes);
+
+    MCLabel L_stack = mc->label_new(mc);
+    MCLabel L_done  = mc->label_new(mc);
+
+    /* w9 = offset field; offset >= 0 means the save area is used up. */
+    emit32(mc, aa64_ldur(2, 9, va, (i32)offs_at));
+    emit32(mc, aa64_subs_imm(0, 31, 9, 0));
+    emit32(mc, aa64_b_cond(0xa /*GE*/));
+    mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0);
+
+    /* Register save area: address = top + sxtw(offset). */
+    emit32(mc, aa64_ldur(3, 10, va, (i32)top_at));
+    emit32(mc, aa64_sbfm(1, 12, 9, 0, 31));
+    emit32(mc, aa64_add(1, 11, 10, 12));
+    if (want_fp) {
+        emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 11, 0));
+    } else {
+        emit32(mc, aa64_ldur(sidx, reg_num(dst), 11, 0));
+    }
+    /* Bump the offset past the consumed slot and write it back. */
+    emit32(mc, aa64_add_imm(0, 9, 9, slot_bytes, 0));
+    emit32(mc, aa64_stur(2, 9, va, (i32)offs_at));
+    emit32(mc, aa64_b_base());
+    mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+    /* Stack path: read at __stack, then advance __stack by 8. */
+    mc->label_place(mc, L_stack);
+    emit32(mc, aa64_ldur(3, 10, va, 0));
+    if (want_fp) {
+        emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 10, 0));
+    } else {
+        emit32(mc, aa64_ldur(sidx, reg_num(dst), 10, 0));
+    }
+    emit32(mc, aa64_add_imm(1, 10, 10, 8u, 0));
+    emit32(mc, aa64_stur(3, 10, va, 0));
+
+    mc->label_place(mc, L_done);
+}
+
+/* va_end emits no instructions; both parameters are intentionally unused. */
+static void aa_va_end_(CGTarget* t, Operand a)
+{
+    (void)t;
+    (void)a;
+}
+
+/* Lower va_copy: copy the 32-byte va_list addressed by register s to the
+ * one addressed by register d, as four 8-byte load/store pairs through
+ * scratch x9. */
+static void aa_va_copy_(CGTarget* t, Operand d, Operand s)
+{
+    MCEmitter* mc       = t->mc;
+    const u32  dst_base = reg_num(d);
+    const u32  src_base = reg_num(s);
+    const u32  tmp      = 9u;
+    const i32  total    = 32;   /* sizeof(va_list) */
+    const i32  word     = 8;
+
+    for (i32 off = 0; off < total; off += word) {
+        emit32(mc, aa64_ldur(3, tmp, src_base, off));
+        emit32(mc, aa64_stur(3, tmp, dst_base, off));
+    }
+}
-static void aa_atomic_load (CGTarget* t, Operand d, Operand a, MemAccess m, MemOrder o) { (void)d;(void)a;(void)m;(void)o; aa_panic(t, "atomic_load"); }
-static void aa_atomic_store(CGTarget* t, Operand a, Operand s, MemAccess m, MemOrder o) { (void)a;(void)s;(void)m;(void)o; aa_panic(t, "atomic_store"); }
-static void aa_atomic_rmw (CGTarget* t, AtomicOp op, Operand d, Operand a, Operand v, MemAccess m, MemOrder o) { (void)op;(void)d;(void)a;(void)v;(void)m;(void)o; aa_panic(t, "atomic_rmw"); }
-static void aa_atomic_cas (CGTarget* t, Operand p, Operand ok, Operand a, Operand e, Operand des, MemAccess m, MemOrder s, MemOrder f) { (void)p;(void)ok;(void)a;(void)e;(void)des;(void)m;(void)s;(void)f; aa_panic(t, "atomic_cas"); }
-static void aa_fence (CGTarget* t, MemOrder o) { (void)o; aa_panic(t, "fence"); }
+/* ---- atomics ----
+ *
+ * Lowering uses ARMv8.0 LL/SC (LDXR/STXR family) — no FEAT_LSE assumption.
+ * Acquire/Release semantics ride the load/store form chosen by MemOrder
+ * (LDAR/STLR for plain accesses; LDAXR/STLXR inside the LL/SC loop).
+ * fence() emits DMB ISH (data memory barrier, inner shareable). */
+
+/* Encoder helpers — inline since only used here.
+ * Each returns one 32-bit instruction word. A non-zero sf64 selects the
+ * 64-bit opcode constant, zero the 32-bit one. Register numbers are masked
+ * to 5 bits: Rt goes in bits 0..4, Rn (the address register) in bits 5..9,
+ * and, for the store-exclusive forms, Rs in bits 16..20. The callers in
+ * this file treat Rs as the register that receives the store status. */
+static inline u32 aa64_ldar (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stlr (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_ldxr (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stxr (u32 sf64, u32 Rs, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC8007C00u : 0x88007C00u)
+ | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC800FC00u : 0x8800FC00u)
+ | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_dmb_ish(void) { return 0xD5033BBFu; }
+static inline u32 aa64_clrex (void) { return 0xD5033F5Fu; }
+/* CBNZ Rt, imm19 — the imm19 field is left zero here; callers in this file
+ * follow the emit with emit_label_ref(..., R_AARCH64_CONDBR19, ...) to
+ * supply the branch target. sf64 is shifted into bit 31. */
+static inline u32 aa64_cbnz (u32 sf64, u32 Rt)
+{ return 0x35000000u | (sf64<<31) | (Rt&0x1f); }
+
+/* Non-zero when ordering o needs acquire semantics on the load side:
+ * acquire, consume, acq_rel and seq_cst. */
+static int mem_order_is_acquire(MemOrder o)
+{
+    if (o == MO_ACQUIRE || o == MO_CONSUME) {
+        return 1;
+    }
+    return o == MO_ACQ_REL || o == MO_SEQ_CST;
+}
+/* Non-zero when ordering o needs release semantics on the store side:
+ * release, acq_rel and seq_cst. */
+static int mem_order_is_release(MemOrder o)
+{
+    if (o == MO_RELEASE) {
+        return 1;
+    }
+    return o == MO_ACQ_REL || o == MO_SEQ_CST;
+}
+
+/* Emit an atomic load of ma.size bytes from addr into the register dst.
+ *
+ * addr may be a register holding the pointer (OPK_REG) or a frame slot
+ * (OPK_LOCAL), whose address x29 - off is materialized into scratch x9;
+ * any other operand kind panics. Orderings for which
+ * mem_order_is_acquire() holds use the load-acquire form, everything else
+ * a plain LDUR.
+ *
+ * The load-acquire is emitted with the access width in its size field, so
+ * 1- and 2-byte loads become LDARB/LDARH instead of a 32-bit LDAR that
+ * would read past the object. */
+static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr,
+                           MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    /* Width index used as the instruction size field; assumes
+     * size_idx_for_bytes() yields log2(bytes) in 0..3, as the aa64_ldur /
+     * aa64_stur call sites in this file imply. */
+    u32 sidx = size_idx_for_bytes(ma.size);
+
+    /* Resolve addr to a base register; LDAR has no offset form. */
+    u32 base;
+    if (addr.kind == OPK_REG) {
+        base = reg_num(addr);
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot");
+        base = 9u;
+        emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_load: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+
+    if (mem_order_is_acquire(ord)) {
+        /* LDARB / LDARH / LDAR Wt / LDAR Xt: opcode 0x08DFFC00 with the
+         * size in bits 31:30 (size 2 and 3 reproduce 0x88DFFC00 and
+         * 0xC8DFFC00). */
+        emit32(mc, 0x08DFFC00u | ((sidx & 3u) << 30)
+                   | ((base & 0x1fu) << 5) | (reg_num(dst) & 0x1fu));
+    } else {
+        emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0));
+    }
+}
-static void aa_intrinsic(CGTarget* t, IntrinKind k, Operand* dsts, u32 nd, const Operand* args, u32 na) { (void)k;(void)dsts;(void)nd;(void)args;(void)na; aa_panic(t, "intrinsic"); }
+/* Emit an atomic store of ma.size bytes of src to addr.
+ *
+ * src may be an immediate (materialized into scratch x10/w10) or a
+ * register. addr may be a register holding the pointer or a frame slot,
+ * whose address x29 - off is materialized into scratch x9. Other operand
+ * kinds panic. Orderings for which mem_order_is_release() holds use the
+ * store-release form, everything else a plain STUR.
+ *
+ * The store-release is emitted with the access width in its size field,
+ * so 1- and 2-byte stores become STLRB/STLRH instead of a 32-bit STLR
+ * that would overwrite the neighbouring bytes. */
+static void aa_atomic_store(CGTarget* t, Operand addr, Operand src,
+                            MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+    /* Width index used as the instruction size field; assumes
+     * size_idx_for_bytes() yields log2(bytes) in 0..3, as the aa64_ldur /
+     * aa64_stur call sites in this file imply. */
+    u32 sidx = size_idx_for_bytes(ma.size);
+
+    /* Materialize src into a register if needed. */
+    u32 src_reg;
+    if (src.kind == OPK_IMM) {
+        src_reg = 10u;
+        emit_load_imm(mc, sf, src_reg, src.v.imm);
+    } else if (src.kind == OPK_REG) {
+        src_reg = reg_num(src);
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_store: src kind %d unsupported",
+                       (int)src.kind);
+    }
+
+    /* Base register. */
+    u32 base;
+    if (addr.kind == OPK_REG) {
+        base = reg_num(addr);
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot");
+        base = 9u;
+        emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_store: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+
+    if (mem_order_is_release(ord)) {
+        /* STLRB / STLRH / STLR Wt / STLR Xt: opcode 0x089FFC00 with the
+         * size in bits 31:30 (size 2 and 3 reproduce 0x889FFC00 and
+         * 0xC89FFC00). */
+        emit32(mc, 0x089FFC00u | ((sidx & 3u) << 30)
+                   | ((base & 0x1fu) << 5) | (src_reg & 0x1fu));
+    } else {
+        emit32(mc, aa64_stur(sidx, src_reg, base, 0));
+    }
+}
+
+/* Emit the instruction(s) that compute the value an atomic RMW will store:
+ * dst_new = op(prior, val), with all three registers W or X according to
+ * sf. Only dst_new is written; NAND is formed in place in dst_new (AND
+ * followed by MVN) because AArch64 has no NAND instruction. Any op not
+ * listed is emitted as a plain exchange, exactly like AO_XCHG. */
+static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf,
+                             u32 dst_new, u32 prior, u32 val)
+{
+    u32 insn;
+
+    switch (op) {
+    case AO_ADD:
+        insn = aa64_add(sf, dst_new, prior, val);
+        break;
+    case AO_SUB:
+        insn = aa64_sub(sf, dst_new, prior, val);
+        break;
+    case AO_AND:
+        insn = aa64_and(sf, dst_new, prior, val);
+        break;
+    case AO_OR:
+        insn = aa64_orr(sf, dst_new, prior, val);
+        break;
+    case AO_XOR:
+        insn = aa64_eor(sf, dst_new, prior, val);
+        break;
+    case AO_NAND:
+        /* new = ~(prior & val) */
+        emit32(mc, aa64_and(sf, dst_new, prior, val));
+        insn = aa64_mvn(sf, dst_new, dst_new);
+        break;
+    case AO_XCHG:
+    default:
+        insn = aa64_mov_reg(sf, dst_new, val);
+        break;
+    }
+    emit32(mc, insn);
+}
+
+/* Emit an atomic read-modify-write as a load-exclusive / store-exclusive
+ * retry loop: dst receives the value that was in memory, and memory
+ * receives op(prior, val) as computed by emit_rmw_combine().
+ *
+ * Scratch usage:
+ *   x9  = address (copied from the addr register, or x29 - off for a slot)
+ *   x10 = val (copied from a register or materialized from an immediate)
+ *   x11 = new value to store
+ *   w12 = store-exclusive status; the loop repeats while it is non-zero
+ * Unsupported addr/val operand kinds panic.
+ *
+ * The exclusive pair is emitted with the access width in its size field,
+ * so 1- and 2-byte objects use the byte/halfword exclusives instead of a
+ * 32-bit access that would also rewrite the neighbouring bytes. The
+ * combine itself still runs at W (or X for 8 bytes) width; the narrow
+ * store only writes the low bytes of x11. */
+static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst,
+                          Operand addr, Operand val,
+                          MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+    /* Width index used as the instruction size field; assumes
+     * size_idx_for_bytes() yields log2(bytes) in 0..3, as the aa64_ldur /
+     * aa64_stur call sites in this file imply. */
+    u32 sz_bits = (size_idx_for_bytes(ma.size) & 3u) << 30;
+
+    u32 base = 9u;
+    if (addr.kind == OPK_REG) {
+        emit32(mc, aa64_mov_reg(1, 9, reg_num(addr)));
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot");
+        emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+
+    u32 vreg = 10u;
+    if (val.kind == OPK_IMM) {
+        emit_load_imm(mc, sf, vreg, val.v.imm);
+    } else if (val.kind == OPK_REG) {
+        emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val)));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported",
+                       (int)val.kind);
+    }
+
+    int do_acq = mem_order_is_acquire(ord);
+    int do_rel = mem_order_is_release(ord);
+
+    MCLabel L_retry = mc->label_new(mc);
+    mc->label_place(mc, L_retry);
+
+    /* prior <- LDXR*/
+    /* LDAXR*: opcode 0x085F7C00 (plain) / 0x085FFC00 (acquire), size in
+     * bits 31:30. */
+    emit32(mc, (do_acq ? 0x085FFC00u : 0x085F7C00u) | sz_bits
+               | ((base & 0x1fu) << 5) | (reg_num(dst) & 0x1fu));
+
+    /* new = combine(prior, val) into x11 */
+    emit_rmw_combine(mc, op, sf, /*new=*/11u, /*prior=*/reg_num(dst), vreg);
+
+    /* w12 <- STXR*/
+    /* STLXR* of x11: opcode 0x08007C00 (plain) / 0x0800FC00 (release),
+     * size in bits 31:30, status register in bits 16..20. */
+    emit32(mc, (do_rel ? 0x0800FC00u : 0x08007C00u) | sz_bits
+               | (12u << 16) | ((base & 0x1fu) << 5) | 11u);
+
+    /* Retry while the store-exclusive reported failure. */
+    emit32(mc, aa64_cbnz(0, /*Rt=*/12u));
+    mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+}
+
+static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok,
+ Operand addr, Operand expected, Operand desired,
+ MemAccess ma, MemOrder succ, MemOrder fail)
+{
+ AAImpl* a = impl_of(t);
+ MCEmitter* mc = t->mc;
+ u32 sf = (ma.size == 8) ? 1u : 0u;
+ (void)fail;
+
+ /* Pin operands:
+ * x9 = base
+ * x10 = expected (compare against prior)
+ * x11 = desired (store on match)
+ * w12 = stxr status flag
+ *
+ * Overall shape of the emitted compare-and-swap: load-exclusive into
+ * prior, compare with x10, then either store-exclusive x11 (looping
+ * back to L_retry while the status in w12 is non-zero) and set ok = 1,
+ * or branch to L_fail, CLREX, and set ok = 0. The acquire/release
+ * instruction forms are chosen from succ only; fail is ignored.
+ * addr may be a register or a frame slot (x29 - off); expected and
+ * desired may each be a register or an immediate; other kinds panic.
+ *
+ * NOTE(review): sf only distinguishes 8-byte accesses from everything
+ * else, so 1- and 2-byte objects appear to be accessed with 32-bit
+ * exclusives and a 32-bit compare — confirm whether sub-word CAS can
+ * reach this path. */
+ u32 base = 9u;
+ if (addr.kind == OPK_REG) emit32(mc, aa64_mov_reg(1, 9, reg_num(addr)));
+ else if (addr.kind == OPK_LOCAL) {
+ AASlot* s = slot_get(a, addr.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot");
+ emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0));
+ } else {
+ compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported",
+ (int)addr.kind);
+ }
+ if (expected.kind == OPK_IMM) emit_load_imm(mc, sf, 10, expected.v.imm);
+ else if (expected.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 10, reg_num(expected)));
+ else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported",
+ (int)expected.kind);
+ if (desired.kind == OPK_IMM) emit_load_imm(mc, sf, 11, desired.v.imm);
+ else if (desired.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 11, reg_num(desired)));
+ else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported",
+ (int)desired.kind);
+
+ int do_acq = mem_order_is_acquire(succ);
+ int do_rel = mem_order_is_release(succ);
+
+ MCLabel L_retry = mc->label_new(mc);
+ MCLabel L_fail = mc->label_new(mc);
+ MCLabel L_done = mc->label_new(mc);
+
+ mc->label_place(mc, L_retry);
+ if (do_acq) emit32(mc, aa64_ldaxr(sf, reg_num(prior), base));
+ else emit32(mc, aa64_ldxr (sf, reg_num(prior), base));
+
+ /* if (prior != expected) -> fail (clrex + ok=0).
+ * The compare is a SUBS into the zero register, i.e. flags only. */
+ emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/31u, reg_num(prior), 10u));
+ emit32(mc, aa64_b_cond(0x1u /*NE*/));
+ mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0);
+
+ /* try store; retry on stxr failure (non-zero status in w12) */
+ if (do_rel) emit32(mc, aa64_stlxr(sf, 12u, 11u, base));
+ else emit32(mc, aa64_stxr (sf, 12u, 11u, base));
+ emit32(mc, aa64_cbnz(0, 12u));
+ mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+
+ /* ok = 1 ; jump done */
+ emit_load_imm(mc, 0, reg_num(ok), 1);
+ emit32(mc, aa64_b_base());
+ mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+ /* L_fail: clear monitor; ok = 0. prior keeps the value that was loaded. */
+ mc->label_place(mc, L_fail);
+ emit32(mc, aa64_clrex());
+ emit_load_imm(mc, 0, reg_num(ok), 0);
+
+ mc->label_place(mc, L_done);
+}
+
+/* Emit a memory fence for ordering o.
+ * A relaxed fence emits nothing. Every other ordering is lowered
+ * conservatively to a single DMB ISH, a barrier over the inner-shareable
+ * domain rather than the full system. */
+static void aa_fence(CGTarget* t, MemOrder o)
+{
+    if (o == MO_RELAXED) {
+        return;
+    }
+    emit32(t->mc, aa64_dmb_ish());
+}
+
+/* ---- intrinsics ---- */
+
+/* Data-processing (1 source) — REV16 / REV / REV32 / RBIT / CLZ.
+ * Family base 0x5AC00000 (sf=0); set sf<<31 for 64-bit forms.
+ * Every helper below returns one instruction word with the 5-bit source
+ * register in bits 5..9 and the 5-bit destination in bits 0..4. The _w /
+ * _x suffixed helpers are fixed-width; rbit and clz pick the 64-bit
+ * opcode when sf64 is non-zero. */
+static inline u32 aa64_rev16_w(u32 Rd, u32 Rn)
+{ return 0x5AC00400u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rev_w (u32 Rd, u32 Rn)
+{ return 0x5AC00800u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rev_x (u32 Rd, u32 Rn)
+{ return 0xDAC00C00u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rbit (u32 sf64, u32 Rd, u32 Rn)
+{ return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_clz (u32 sf64, u32 Rd, u32 Rn)
+{ return (sf64 ? 0xDAC01000u : 0x5AC01000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* SIMD CNT (Vd.<T>, Vn.<T>) and ADDV (Bd, Vn.8B). 8B form, Q=0.
+ * Used together by the POPCOUNT lowering: per-byte bit counts, then a
+ * horizontal add of the eight byte lanes. */
+static inline u32 aa64_cnt_8b (u32 Vd, u32 Vn)
+{ return 0x0E205800u | ((Vn&0x1f)<<5) | (Vd&0x1f); }
+static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn)
+{ return 0x0E31B800u | ((Vn&0x1f)<<5) | (Vd&0x1f); }
+
+/* ADDS / SUBS shifted register (S=1; sets NZCV including V for signed ovf).
+ * Only the ADDS encoder is defined here; sf is shifted into bit 31 and the
+ * shift fields are left zero. */
+static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x2B000000u | (sf<<31) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* SMADDL / UMADDL → SMULL / UMULL with Ra = ZR. 64-bit dst, 32-bit srcs.
+ * Only the signed pair (SMADDL, SMULL) is defined in this section; both go
+ * through aa64_dp3_pack. */
+static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra)
+{ return aa64_dp3_pack((AA64DP3){.sf=1,.op31=1,.o0=0,.Rm=Rm,.Ra=Ra,.Rn=Rn,.Rd=Rd}); }
+static inline u32 aa64_smull (u32 Rd, u32 Rn, u32 Rm)
+{ return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); }
+
+/* SUBS Xd, Xn, Wm, SXTW — extended-register form, used for the
+ * mul_overflow check (compare full 64-bit product to sign-extended low 32).
+ * The extend option field (bits 13..15) is fixed to 6, i.e. SXTW. */
+static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm)
+{ return 0xEB200000u | ((Rm&0x1f)<<16) | (6u<<13) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* BRK #imm16 — used for TRAP/UNREACHABLE landing pads.
+ * imm16 is masked to 16 bits and placed at bits 5..20. */
+static inline u32 aa64_brk(u32 imm16)
+{ return 0xD4200000u | ((imm16 & 0xffffu) << 5); }
+
+/* Emit an unrolled n-byte copy from [sr] to [dr] walking addresses upward:
+ * 8-byte chunks first, then 4-, 2- and 1-byte chunks for the tail, each
+ * moved through scratch x12. Safe for overlapping regions only when
+ * dr <= sr.
+ * NOTE(review): byte offsets are passed straight to aa64_ldur/aa64_stur;
+ * confirm those encoders cope with offsets beyond the unscaled-immediate
+ * range when n is large. */
+static void aa_emit_copy_ascending(MCEmitter* mc, u32 dr, u32 sr, u32 n)
+{
+    u32 i = 0;
+    for (u32 sidx = 4; sidx-- > 0; ) {
+        u32 step = 1u << sidx;
+        while (i + step <= n) {
+            emit32(mc, aa64_ldur(sidx, 12, sr, (i32)i));
+            emit32(mc, aa64_stur(sidx, 12, dr, (i32)i));
+            i += step;
+        }
+    }
+}
+
+/* Same chunking as aa_emit_copy_ascending but walking addresses downward
+ * from offset n, so it is safe for overlapping regions only when
+ * dr >= sr. */
+static void aa_emit_copy_descending(MCEmitter* mc, u32 dr, u32 sr, u32 n)
+{
+    u32 i = n;
+    for (u32 sidx = 4; sidx-- > 0; ) {
+        u32 step = 1u << sidx;
+        while (i >= step) {
+            i -= step;
+            emit32(mc, aa64_ldur(sidx, 12, sr, (i32)i));
+            emit32(mc, aa64_stur(sidx, 12, dr, (i32)i));
+        }
+    }
+}
+
+/* Lower one compiler intrinsic.
+ *
+ *   kind      which intrinsic to emit
+ *   dsts, nd  destination operands (nd is not inspected)
+ *   args, na  source operands (na is not inspected)
+ *
+ * Scratch registers x9..x12 and v0 are clobbered by some lowerings.
+ * Unsupported kinds, and unsupported operand shapes for a supported kind,
+ * end in compiler_panic().
+ *
+ * MEMMOVE picks the copy direction at run time (descending when dst > src,
+ * ascending otherwise) so that overlapping regions are handled in both
+ * directions. MEMSET truncates a register fill value to its low 8 bits
+ * before broadcasting it. */
+static void aa_intrinsic(CGTarget* t, IntrinKind kind,
+                         Operand* dsts, u32 nd,
+                         const Operand* args, u32 na)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    (void)nd;
+
+    switch (kind) {
+    case INTRIN_POPCOUNT: {
+        /* fmov v0, src ; cnt v0.8b, v0.8b ; addv b0, v0.8b ; fmov w_dst, s0
+         * (v0 is used as scratch). */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sz_in = type_byte_size(src.type);
+        if (sz_in == 8) emit32(mc, aa64_fmov_d_x(0, reg_num(src)));
+        else            emit32(mc, aa64_fmov_s_w(0, reg_num(src)));
+        emit32(mc, aa64_cnt_8b   (0, 0));
+        emit32(mc, aa64_addv_b_8b(0, 0));
+        emit32(mc, aa64_fmov_w_s (reg_num(dst), 0));
+        return;
+    }
+    case INTRIN_CLZ: {
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(src.type) ? 1u : 0u;
+        emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src)));
+        return;
+    }
+    case INTRIN_CTZ: {
+        /* ctz(x) = clz(rbit(x)) */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(src.type) ? 1u : 0u;
+        emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src)));
+        emit32(mc, aa64_clz (sf, reg_num(dst), reg_num(dst)));
+        return;
+    }
+    case INTRIN_BSWAP16: {
+        emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_BSWAP32: {
+        emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_BSWAP64: {
+        emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_MEMCPY:
+    case INTRIN_MEMMOVE: {
+        /* args = (dst_addr, src_addr, n_bytes). v1 only handles a constant
+         * n, fully unrolled. */
+        Operand da = args[0], sa = args[1], nb = args[2];
+        if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 intrinsic: %s with non-const n or non-REG ptr",
+                           kind == INTRIN_MEMCPY ? "memcpy" : "memmove");
+        }
+        u32 dr = reg_num(da);
+        u32 sr = reg_num(sa);
+        u32 n  = (u32)nb.v.imm;
+
+        /* A copy that is a single load followed by a single store (n of
+         * 0, 1, 2, 4 or 8) is overlap-safe in either direction. */
+        int single_chunk = (n <= 8u) && ((n & (n - 1u)) == 0u);
+
+        if (kind == INTRIN_MEMCPY || single_chunk) {
+            aa_emit_copy_ascending(mc, dr, sr, n);
+            return;
+        }
+
+        /* memmove with several chunks: each chunk is stored before the
+         * next one is loaded, so the walk must move away from the overlap.
+         *     cmp  dst, src
+         *     b.hi L_down          ; dst above src -> copy high-to-low
+         *     <ascending copy>
+         *     b    L_done
+         *   L_down:
+         *     <descending copy>
+         *   L_done: */
+        MCLabel L_down = mc->label_new(mc);
+        MCLabel L_done = mc->label_new(mc);
+        emit32(mc, aa64_subs_reg(1, /*Rd=ZR*/31u, dr, sr));
+        emit32(mc, aa64_b_cond(0x8u /*HI*/));
+        mc->emit_label_ref(mc, L_down, R_AARCH64_CONDBR19, 4, 0);
+        aa_emit_copy_ascending(mc, dr, sr, n);
+        emit32(mc, aa64_b_base());
+        mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+        mc->label_place(mc, L_down);
+        aa_emit_copy_descending(mc, dr, sr, n);
+        mc->label_place(mc, L_done);
+        return;
+    }
+    case INTRIN_MEMSET: {
+        /* args = (dst_addr, byte, n) */
+        Operand da = args[0], bv = args[1], nb = args[2];
+        if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 intrinsic: memset with non-const n / non-REG ptr");
+        }
+        u32 dr = reg_num(da);
+        u32 n  = (u32)nb.v.imm;
+        u32 src_reg;
+        if (bv.kind == OPK_IMM) {
+            u32 byte = (u32)(bv.v.imm & 0xffu);
+            if (byte == 0) {
+                src_reg = 31u;  /* XZR / WZR */
+            } else {
+                /* Replicate the byte into all 8 lanes at compile time. */
+                u64 b64 = byte;
+                b64 |= b64 << 8; b64 |= b64 << 16; b64 |= b64 << 32;
+                emit_load_imm(mc, 1, 12, (i64)b64);
+                src_reg = 12u;
+            }
+        } else if (bv.kind == OPK_REG) {
+            /* Run-time broadcast: keep only the low 8 bits of the fill
+             * value (memset converts it to unsigned char), then multiply
+             * by 0x0101010101010101 to copy it into every byte lane.
+             * x11 holds the multiplier, x12 the result. */
+            emit_load_imm(mc, 0, 12, 0xff);
+            emit32(mc, aa64_and(0, 12, reg_num(bv), 12));
+            emit_load_imm(mc, 1, 11, (i64)0x0101010101010101ll);
+            emit32(mc, aa64_madd(1, 12, 12, 11, AA64_ZR));
+            src_reg = 12u;
+        } else {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 intrinsic: memset byte kind %d unsupported",
+                           (int)bv.kind);
+        }
+        /* Widest stores first, then the tail. */
+        u32 i = 0;
+        while (i + 8 <= n) { emit32(mc, aa64_stur(3, src_reg, dr, (i32)i)); i += 8; }
+        while (i + 4 <= n) { emit32(mc, aa64_stur(2, src_reg, dr, (i32)i)); i += 4; }
+        while (i + 2 <= n) { emit32(mc, aa64_stur(1, src_reg, dr, (i32)i)); i += 2; }
+        while (i < n)      { emit32(mc, aa64_stur(0, src_reg, dr, (i32)i)); i += 1; }
+        return;
+    }
+    case INTRIN_PREFETCH:
+        /* No-op hint. */
+        (void)args; (void)na;
+        return;
+    case INTRIN_ASSUME_ALIGNED: {
+        /* dst = src (alignment is a hint only). */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        if (reg_num(src) != reg_num(dst)) {
+            emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src)));
+        }
+        return;
+    }
+    case INTRIN_EXPECT: {
+        /* dst = val (the "expected" hint is dropped). */
+        Operand val = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(dst.type) ? 1u : 0u;
+        if (val.kind == OPK_REG) {
+            if (reg_num(val) != reg_num(dst)) {
+                emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val)));
+            }
+        } else if (val.kind == OPK_IMM) {
+            emit_load_imm(mc, sf, reg_num(dst), val.v.imm);
+        } else {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 intrinsic: expect val kind %d unsupported",
+                           (int)val.kind);
+        }
+        return;
+    }
+    case INTRIN_UNREACHABLE:
+    case INTRIN_TRAP:
+        /* BRK #1 for TRAP, BRK #0 for UNREACHABLE. */
+        emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u));
+        return;
+    case INTRIN_ADD_OVERFLOW:
+    case INTRIN_SUB_OVERFLOW: {
+        /* dsts: [val, ovf]. ADDS/SUBS sets V on signed overflow; CSET VS. */
+        Operand a_op = args[0], b_op = args[1];
+        Operand dval = dsts[0], dovf = dsts[1];
+        u32 sf = type_is_64(dval.type) ? 1u : 0u;
+        u32 ra = force_reg_int(t, a_op, sf, 9);
+        /* Pick a second scratch that differs from the first operand's. */
+        u32 rb = force_reg_int(t, b_op, sf, (ra == 9) ? 10u : 9u);
+        u32 word = (kind == INTRIN_ADD_OVERFLOW)
+            ? aa64_adds_reg(sf, reg_num(dval), ra, rb)
+            : aa64_subs_reg(sf, reg_num(dval), ra, rb);
+        emit32(mc, word);
+        emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/));
+        return;
+    }
+    case INTRIN_MUL_OVERFLOW: {
+        /* SMULL Xtmp, Wn, Wm gives full 64-bit signed product.
+         * ovf = (Xtmp != sxtw(Wtmp)) — i.e. upper 32 bits ≠ sign-ext of low.
+         * dval gets the truncated low 32 bits. */
+        Operand a_op = args[0], b_op = args[1];
+        Operand dval = dsts[0], dovf = dsts[1];
+        u32 sf = type_is_64(dval.type) ? 1u : 0u;
+        if (sf) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 intrinsic: mul_overflow on i64 not yet supported");
+        }
+        u32 ra = force_reg_int(t, a_op, 0, 9);
+        u32 rb = force_reg_int(t, b_op, 0, (ra == 9) ? 10u : 9u);
+        emit32(mc, aa64_smull(/*X*/11u, ra, rb));
+        emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/31u, /*Xn=*/11u, /*Wm=*/11u));
+        emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
+        emit32(mc, aa64_mov_reg(0, reg_num(dval), 11u));
+        return;
+    }
+    default:
+        compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported",
+                       (int)kind);
+    }
+}
static void aa_asm_block(CGTarget* t, const char* tmpl,
const AsmConstraint* outs, u32 no, Operand* oo,