cg/aa64: implement Groups J, K, L (varargs, atomics, intrinsics) - kit

commit 060d8253db61a71604f234a187998b47c3fc6a0c
parent 781d954928484c2614b1a43d73460b4c66b00212
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 15:56:45 -0700

cg/aa64: implement Groups J, K, L (varargs, atomics, intrinsics)

- ABI: replace the placeholder va_list type with the AAPCS64
  __va_list struct (3 ptrs + 2 ints, 32 bytes).
- Variadic prologue: reserve GP/FP register save areas and spill
  x0..x7 / d0..d7 immediately after the prologue placeholder.
- va_start/va_arg/va_end/va_copy: full AAPCS64 lowering, dispatching
  to the GP or FP save area then falling through to the caller's
  stack at __gr_offs/__vr_offs == 0.
- Atomics: ARMv8.0 LL/SC lowering — LDAR/STLR for plain ordered
  load/store, LDAXR/STLXR retry loops for rmw and cas, DMB ISH for
  fences. NAND synthesized via AND+MVN.
- Intrinsics: NEON-based POPCOUNT (CNT.8B + ADDV), CLZ/CTZ via RBIT,
  REV*-family BSWAP, constant-size MEMCPY/MEMMOVE/MEMSET, no-op
  PREFETCH/ASSUME_ALIGNED/EXPECT, BRK for TRAP/UNREACHABLE, and
  ADDS/SUBS+CSET (signed V flag) for ADD/SUB_OVERFLOW with
  SMULL+sxtw compare for MUL_OVERFLOW.
- Misc: extend the FP scratch range to v16..v23 for short-lived
  materialization (j06 needs 9 simultaneous FP regs); fix the
  call-site FP-to-stack path that was clobbering v0/v1 with FMOV.

All 752 cg cases pass across D/R/E/J paths.

Diffstat:
M src/abi/abi.c  | 24 +++++++++++++++++++-----
M src/arch/aarch64.c  | 760 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---

2 files changed, 751 insertions(+), 33 deletions(-)
diff --git a/src/abi/abi.c b/src/abi/abi.c
@@ -12,6 +12,7 @@
 #include "abi/abi.h"
 #include "core/core.h"
 #include "core/arena.h"
+#include "core/pool.h"
 
 #include <cfree.h>
 
@@ -36,6 +37,7 @@ struct TargetABI {
     /* Per-TU cached lookups. */
     FuncInfoCacheEntry*     fn_cache;
     RecordLayoutCacheEntry* rec_cache;
+    const Type*             va_list_cache;
 };
 
 /* ---- scalar profile ---- */
@@ -331,11 +333,22 @@ const Type* abi_intptr_type (TargetABI* a, Pool* p)
 const Type* abi_uintptr_type(TargetABI* a, Pool* p) { return size_or_uintptr(a, p); }
 const Type* abi_va_list_type(TargetABI* a, Pool* p)
 {
-    /* AAPCS64: __va_list is a struct of three pointers + two ints. v1 returns
-     * a placeholder pointer; this is exercised only by the parser/builtin
-     * substitution path, which Group A does not reach. */
-    (void)a;
-    return type_ptr(p, type_void(p));
+    /* AAPCS64 __va_list: 3 pointers (__stack, __gr_top, __vr_top) followed
+     * by 2 ints (__gr_offs, __vr_offs). Total 32 bytes, 8-aligned. */
+    if (a->va_list_cache) return a->va_list_cache;
+    const Type* vp = type_ptr(p, type_void(p));
+    const Type* it = type_prim(p, TY_INT);
+    Sym name = pool_intern_cstr(p, "__va_list");
+    SrcLoc nl = {0,0,0};
+    TagId  tg = type_tag_new(p, TAG_STRUCT, name, nl);
+    TypeRecordBuilder* b = type_record_begin(p, TY_STRUCT, tg, name);
+    type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__stack"),    .type = vp });
+    type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_top"),   .type = vp });
+    type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_top"),   .type = vp });
+    type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__gr_offs"),  .type = it });
+    type_record_field(b, (Field){ .name = pool_intern_cstr(p, "__vr_offs"),  .type = it });
+    a->va_list_cache = type_record_end(p, b);
+    return a->va_list_cache;
 }
 
 /* ---- lifecycle ---- */
@@ -352,6 +365,7 @@ void abi_fini(TargetABI* a)
     if (!a) return;
     a->fn_cache = NULL;
     a->rec_cache = NULL;
+    a->va_list_cache = NULL;
     a->c = NULL;
 }
 
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -276,6 +276,14 @@ typedef struct AAImpl {
     struct AAAllocaPatch { u32 pos; u32 dst_reg; }* add_patches;
     u32        nadd_patches;
     u32        add_patches_cap;
+
+    /* Variadic — AAPCS64 register save areas reserved at function entry.
+     * gp_save_slot holds 8*8=64 bytes (x0..x7); fp_save_slot holds 8*16=128
+     * bytes (v0..v7 with 16-byte stride). Saves are emitted in func_begin
+     * after the prologue placeholder so FP is already valid when they run. */
+    u8         is_variadic;
+    FrameSlot  gp_save_slot;
+    FrameSlot  fp_save_slot;
 } AAImpl;
 
 static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
@@ -450,6 +458,9 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
     a->has_alloca  = 0;
     a->nadd_patches= 0;
     a->sret_ptr_slot = FRAME_SLOT_NONE;
+    a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+    a->gp_save_slot = FRAME_SLOT_NONE;
+    a->fp_save_slot = FRAME_SLOT_NONE;
     a->epilogue_label = mc->label_new(mc);
 
     mc->cfi_startproc(mc);
@@ -470,6 +481,31 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
         };
         a->sret_ptr_slot = aa_frame_slot(t, &fsd);
     }
+
+    /* Variadic: reserve GP and FP register save areas and emit saves of
+     * x0..x7 / d0..d7 here, after the prologue placeholder, so FP is set
+     * up. Param stores below run after these saves but before any user
+     * code clobbers x0..x7. */
+    if (a->is_variadic) {
+        FrameSlotDesc gpd = {
+            .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+            .size = 64, .align = 8, .kind = FS_SPILL, .flags = 0,
+        };
+        a->gp_save_slot = aa_frame_slot(t, &gpd);
+        FrameSlotDesc fpd = {
+            .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+            .size = 128, .align = 16, .kind = FS_SPILL, .flags = 0,
+        };
+        a->fp_save_slot = aa_frame_slot(t, &fpd);
+        AASlot* gs = slot_get(a, a->gp_save_slot);
+        AASlot* fs = slot_get(a, a->fp_save_slot);
+        for (u32 i = 0; i < 8; ++i) {
+            emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i*8));
+        }
+        for (u32 i = 0; i < 8; ++i) {
+            emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i*16));
+        }
+    }
 }
 
 static void aa_func_end(CGTarget* t)
@@ -477,9 +513,12 @@ static void aa_func_end(CGTarget* t)
     AAImpl*    a  = impl_of(t);
     MCEmitter* mc = t->mc;
 
-    /* Compute callee-save layout. */
+    /* Compute callee-save layout. Only v8..v15 are callee-saved; the
+     * caller-saved v16..v23 are handed out by alloc_reg too but never
+     * appear in prologue saves. */
     u32 n_int_pairs = (a->used_int + 1) / 2;   /* round up */
-    u32 n_fp_pairs  = (a->used_fp  + 1) / 2;
+    u32 used_fp_cs  = a->used_fp > 8 ? 8u : a->used_fp;
+    u32 n_fp_pairs  = (used_fp_cs + 1) / 2;
 
     u32 outgoing_off = 0;
     u32 int_save_off = a->max_outgoing;
@@ -619,11 +658,16 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty)
         return (Reg)(19u + a->used_int++);
     }
     if (cls == RC_FP) {
-        if (a->used_fp >= 8) {
+        /* v8..v15 are callee-saved (low 64 bits); v16..v23 are caller-saved
+         * scratch. Hand out callee-saved first, then fall back to scratch
+         * for short-lived materialization (e.g. j06 builds 9 FP arg regs
+         * with no intervening call). */
+        if (a->used_fp >= 16) {
             compiler_panic(t->c, a->loc,
                 "aarch64 alloc_reg: out of FP scratch (no spill yet)");
         }
-        return (Reg)(8u + a->used_fp++);
+        u32 idx = a->used_fp++;
+        return (Reg)(idx < 8 ? 8u + idx : 16u + (idx - 8u));
     }
     compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
 }
@@ -1593,7 +1637,26 @@ static void emit_arg_value(CGTarget* t,
                             u32* next_int, u32* next_fp, u32* stack_off)
 {
     AAImpl* a = impl_of(t);
+    /* Synthesize a one-part DIRECT ABIArgInfo for var args (av->abi is NULL
+     * past the fixed-param count). AAPCS64 routes var args through the same
+     * register/stack rules as fixed scalars, so this matches what
+     * abi_func_info would have produced. */
+    ABIArgInfo  va_ai;
+    ABIArgPart  va_pt;
     const ABIArgInfo* ai = av->abi;
+    if (!ai) {
+        u32 sz = type_byte_size(av->type);
+        memset(&va_ai, 0, sizeof va_ai);
+        memset(&va_pt, 0, sizeof va_pt);
+        va_ai.kind   = ABI_ARG_DIRECT;
+        va_ai.parts  = &va_pt;
+        va_ai.nparts = 1;
+        va_pt.cls    = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT;
+        va_pt.size   = sz;
+        va_pt.align  = sz;
+        va_pt.src_offset = 0;
+        ai = &va_ai;
+    }
     if (ai->kind == ABI_ARG_IGNORE) return;
 
     if (ai->kind == ABI_ARG_INDIRECT) {
@@ -1660,20 +1723,33 @@ static void emit_arg_value(CGTarget* t,
             }
         } else if (pt->cls == ABI_CLASS_FP) {
             int to_stack = (*next_fp >= 8);
-            u32 dst_reg = to_stack ? 0u : (*next_fp)++;
-            switch (av->storage.kind) {
-            case OPK_REG: {
-                u32 type = (sz == 8) ? 1u : 0u;
-                emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
-                break;
-            }
-            default:
-                compiler_panic(t->c, a->loc,
-                    "aarch64 call: FP arg storage kind %d unsupported",
-                    (int)av->storage.kind);
-            }
-            if (to_stack) {
-                emit32(t->mc, aa64_stur_fp(sidx, dst_reg, 31, (i32)*stack_off));
+            if (!to_stack) {
+                u32 dst_reg = (*next_fp)++;
+                switch (av->storage.kind) {
+                case OPK_REG: {
+                    u32 type = (sz == 8) ? 1u : 0u;
+                    emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
+                    break;
+                }
+                default:
+                    compiler_panic(t->c, a->loc,
+                        "aarch64 call: FP arg storage kind %d unsupported",
+                        (int)av->storage.kind);
+                }
+            } else {
+                /* Store source FP reg directly into the stack slot — going
+                 * through v0/v1 would corrupt args already placed in the
+                 * register save area. */
+                switch (av->storage.kind) {
+                case OPK_REG:
+                    emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31,
+                                               (i32)*stack_off));
+                    break;
+                default:
+                    compiler_panic(t->c, a->loc,
+                        "aarch64 call: FP stack-arg storage kind %d unsupported",
+                        (int)av->storage.kind);
+                }
                 *stack_off += 8;
             }
         } else {
@@ -1932,18 +2008,646 @@ static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align)
     emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/31, 0, 0));
     a->has_alloca = 1;
 }
-static void aa_va_start_(CGTarget* t, Operand a)                  { (void)a; aa_panic(t, "va_start"); }
-static void aa_va_arg_  (CGTarget* t, Operand d, Operand a, const Type* ty) { (void)d;(void)a;(void)ty; aa_panic(t, "va_arg"); }
-static void aa_va_end_  (CGTarget* t, Operand a)                  { (void)a; aa_panic(t, "va_end"); }
-static void aa_va_copy_ (CGTarget* t, Operand d, Operand s)       { (void)d;(void)s; aa_panic(t, "va_copy"); }
+/* AAPCS64 va_list (32 bytes):
+ *   off  0  void* __stack    next stack-passed var arg
+ *   off  8  void* __gr_top   one past end of GP save area
+ *   off 16  void* __vr_top   one past end of FP save area
+ *   off 24  int   __gr_offs  current GP offset (negative; >= 0 → use stack)
+ *   off 28  int   __vr_offs  current FP offset (negative; >= 0 → use stack)
+ *
+ * va_start populates the struct from the function's reg-save areas and
+ * the named-param consumption already tracked on AAImpl. va_arg dispatches
+ * by RegClass: int args walk the GP save area at 8-byte stride; FP args
+ * walk the FP save area at 16-byte stride (q-register-sized slots). When
+ * the offset reaches 0, fall through to the stack at 8-byte stride. */
+static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs)
+{
+    if (ofs == 0)            emit32(mc, aa64_mov_reg(1, dst, 29));
+    else if (ofs > 0
+          && (u32)ofs <= 0xfff) emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0));
+    else if (ofs < 0
+          && (u32)(-ofs) <= 0xfff) emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0));
+    else {
+        emit_load_imm(mc, 1, dst, ofs);
+        emit32(mc, aa64_add(1, dst, 29, dst));
+    }
+}
+
+static void aa_va_start_(CGTarget* t, Operand ap_op)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    if (!a->is_variadic) {
+        compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic");
+    }
+    u32 ap = reg_num(ap_op);
+    AASlot* gs = slot_get(a, a->gp_save_slot);
+    AASlot* fs = slot_get(a, a->fp_save_slot);
+
+    /* __stack = fp + 16 + named-stack-args-bytes */
+    {
+        u32 ofs = 16u + a->next_param_stack;
+        if (ofs <= 0xfff) emit32(mc, aa64_add_imm(1, 9, 29, ofs, 0));
+        else { emit_load_imm(mc, 1, 9, (i64)ofs); emit32(mc, aa64_add(1, 9, 29, 9)); }
+        emit32(mc, aa64_str_uimm(3, 9, ap, 0));
+    }
+    /* __gr_top = fp - gs->off + gs->size */
+    emit_fp_off(mc, 9, -(i32)gs->off + (i32)gs->size);
+    emit32(mc, aa64_str_uimm(3, 9, ap, 8));
+    /* __vr_top = fp - fs->off + fs->size */
+    emit_fp_off(mc, 9, -(i32)fs->off + (i32)fs->size);
+    emit32(mc, aa64_str_uimm(3, 9, ap, 16));
+    /* __gr_offs = named_int*8 - 64 */
+    emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_int * 8u) - 64));
+    emit32(mc, aa64_str_uimm(2, 9, ap, 24));
+    /* __vr_offs = named_fp*16 - 128 */
+    emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_fp * 16u) - 128));
+    emit32(mc, aa64_str_uimm(2, 9, ap, 28));
+}
+
+static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, const Type* ty)
+{
+    MCEmitter* mc = t->mc;
+    u32 ap = reg_num(ap_op);
+    int is_fp = (dst.cls == RC_FP);
+    u32 offs_field = is_fp ? 28u : 24u;
+    u32 top_field  = is_fp ? 16u :  8u;
+    u32 stride_reg = is_fp ? 16u :  8u;
+    u32 sz = type_byte_size(ty);
+    u32 sidx = size_idx_for_bytes(sz);
+
+    MCLabel L_stack = mc->label_new(mc);
+    MCLabel L_done  = mc->label_new(mc);
+
+    /* w9 = ap.offs ; cmp; b.ge L_stack (>=0 means save area exhausted) */
+    emit32(mc, aa64_ldur(2, 9, ap, (i32)offs_field));
+    emit32(mc, aa64_subs_imm(0, 31, 9, 0));
+    emit32(mc, aa64_b_cond(0xa /*GE*/));
+    mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0);
+
+    /* save-area path:
+     *   x10 = ap.top
+     *   x12 = sxtw(w9)
+     *   x11 = x10 + x12
+     *   load dst, [x11]
+     *   w9 += stride_reg ; ap.offs = w9 ; b L_done */
+    emit32(mc, aa64_ldur(3, 10, ap, (i32)top_field));
+    emit32(mc, aa64_sbfm(1, 12, 9, 0, 31));
+    emit32(mc, aa64_add(1, 11, 10, 12));
+    if (is_fp) emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 11, 0));
+    else       emit32(mc, aa64_ldur   (sidx, reg_num(dst), 11, 0));
+    emit32(mc, aa64_add_imm(0, 9, 9, stride_reg, 0));
+    emit32(mc, aa64_stur(2, 9, ap, (i32)offs_field));
+    emit32(mc, aa64_b_base());
+    mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+    /* L_stack: x10 = ap.stack ; load dst,[x10] ; x10+=8 ; ap.stack=x10 */
+    mc->label_place(mc, L_stack);
+    emit32(mc, aa64_ldur(3, 10, ap, 0));
+    if (is_fp) emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 10, 0));
+    else       emit32(mc, aa64_ldur   (sidx, reg_num(dst), 10, 0));
+    emit32(mc, aa64_add_imm(1, 10, 10, 8u, 0));
+    emit32(mc, aa64_stur(3, 10, ap, 0));
+
+    mc->label_place(mc, L_done);
+}
+
+static void aa_va_end_(CGTarget* t, Operand a) { (void)t; (void)a; }
+
+static void aa_va_copy_(CGTarget* t, Operand d, Operand s)
+{
+    MCEmitter* mc = t->mc;
+    u32 dr = reg_num(d);
+    u32 sr = reg_num(s);
+    /* va_list is 32 bytes — 4 x 8-byte LDR/STR pairs. */
+    for (u32 i = 0; i < 32u; i += 8u) {
+        emit32(mc, aa64_ldur(3, 9, sr, (i32)i));
+        emit32(mc, aa64_stur(3, 9, dr, (i32)i));
+    }
+}
 
-static void aa_atomic_load (CGTarget* t, Operand d, Operand a, MemAccess m, MemOrder o) { (void)d;(void)a;(void)m;(void)o; aa_panic(t, "atomic_load"); }
-static void aa_atomic_store(CGTarget* t, Operand a, Operand s, MemAccess m, MemOrder o) { (void)a;(void)s;(void)m;(void)o; aa_panic(t, "atomic_store"); }
-static void aa_atomic_rmw  (CGTarget* t, AtomicOp op, Operand d, Operand a, Operand v, MemAccess m, MemOrder o) { (void)op;(void)d;(void)a;(void)v;(void)m;(void)o; aa_panic(t, "atomic_rmw"); }
-static void aa_atomic_cas  (CGTarget* t, Operand p, Operand ok, Operand a, Operand e, Operand des, MemAccess m, MemOrder s, MemOrder f) { (void)p;(void)ok;(void)a;(void)e;(void)des;(void)m;(void)s;(void)f; aa_panic(t, "atomic_cas"); }
-static void aa_fence       (CGTarget* t, MemOrder o)                                                         { (void)o; aa_panic(t, "fence"); }
+/* ---- atomics ----
+ *
+ * Lowering uses ARMv8.0 LL/SC (LDXR/STXR family) — no FEAT_LSE assumption.
+ * Acquire/Release semantics ride the load/store form chosen by MemOrder
+ * (LDAR/STLR for plain accesses; LDAXR/STLXR inside the LL/SC loop).
+ * fence() emits DMB ISH (data memory barrier, inner shareable). */
+
+/* Encoder helpers — inline since only used here. */
+static inline u32 aa64_ldar (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stlr (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_ldxr (u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stxr (u32 sf64, u32 Rs, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC8007C00u : 0x88007C00u)
+       | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn)
+{ return (sf64 ? 0xC800FC00u : 0x8800FC00u)
+       | ((Rs&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rt&0x1f); }
+static inline u32 aa64_dmb_ish(void)  { return 0xD5033BBFu; }
+static inline u32 aa64_clrex  (void)  { return 0xD5033F5Fu; }
+/* CBNZ Rt, imm19 */
+static inline u32 aa64_cbnz   (u32 sf64, u32 Rt)
+{ return 0x35000000u | (sf64<<31) | (Rt&0x1f); }
+
+static int mem_order_is_acquire(MemOrder o)
+{ return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST || o == MO_CONSUME; }
+static int mem_order_is_release(MemOrder o)
+{ return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST; }
+
+static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr,
+                           MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+
+    /* Resolve addr to a base register; LDAR/LDR-exclusive both want a
+     * pointer in a GPR, no offset form. */
+    u32 base;
+    if (addr.kind == OPK_REG) {
+        base = reg_num(addr);
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot");
+        base = 9u;
+        emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_load: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+    if (mem_order_is_acquire(ord)) {
+        emit32(mc, aa64_ldar(sf, reg_num(dst), base));
+    } else {
+        u32 sidx = size_idx_for_bytes(ma.size);
+        emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0));
+    }
+}
 
-static void aa_intrinsic(CGTarget* t, IntrinKind k, Operand* dsts, u32 nd, const Operand* args, u32 na) { (void)k;(void)dsts;(void)nd;(void)args;(void)na; aa_panic(t, "intrinsic"); }
+static void aa_atomic_store(CGTarget* t, Operand addr, Operand src,
+                            MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+
+    /* Materialize src into a register if needed. */
+    u32 src_reg;
+    if (src.kind == OPK_IMM) {
+        src_reg = 10u;
+        emit_load_imm(mc, sf, src_reg, src.v.imm);
+    } else if (src.kind == OPK_REG) {
+        src_reg = reg_num(src);
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_store: src kind %d unsupported",
+                       (int)src.kind);
+    }
+    /* Base reg. */
+    u32 base;
+    if (addr.kind == OPK_REG) {
+        base = reg_num(addr);
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot");
+        base = 9u;
+        emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_store: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+    if (mem_order_is_release(ord)) {
+        emit32(mc, aa64_stlr(sf, src_reg, base));
+    } else {
+        u32 sidx = size_idx_for_bytes(ma.size);
+        emit32(mc, aa64_stur(sidx, src_reg, base, 0));
+    }
+}
+
+/* Apply rmw op: new = f(prior, val). prior, val, dst are W/X based on sf.
+ * Uses scratch x12 if a temporary is needed (e.g. NAND). */
+static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf,
+                              u32 dst_new, u32 prior, u32 val)
+{
+    switch (op) {
+    case AO_XCHG: emit32(mc, aa64_mov_reg(sf, dst_new, val));        break;
+    case AO_ADD:  emit32(mc, aa64_add(sf, dst_new, prior, val));     break;
+    case AO_SUB:  emit32(mc, aa64_sub(sf, dst_new, prior, val));     break;
+    case AO_AND:  emit32(mc, aa64_and(sf, dst_new, prior, val));     break;
+    case AO_OR:   emit32(mc, aa64_orr(sf, dst_new, prior, val));     break;
+    case AO_XOR:  emit32(mc, aa64_eor(sf, dst_new, prior, val));     break;
+    case AO_NAND:
+        /* NAND: new = ~(prior & val). AArch64 has no NAND; use AND then MVN. */
+        emit32(mc, aa64_and(sf, dst_new, prior, val));
+        emit32(mc, aa64_mvn(sf, dst_new, dst_new));
+        break;
+    default:
+        emit32(mc, aa64_mov_reg(sf, dst_new, val));
+        break;
+    }
+}
+
+static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst,
+                          Operand addr, Operand val,
+                          MemAccess ma, MemOrder ord)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+
+    /* Pin operands into scratch regs:
+     *   x9   = base (atomic addr)
+     *   x10  = val
+     *   x11  = new (computed)
+     *   w12  = stxr status flag
+     * dst (prior) is the user-provided destination reg.  */
+    u32 base = 9u;
+    if (addr.kind == OPK_REG) {
+        emit32(mc, aa64_mov_reg(1, 9, reg_num(addr)));
+    } else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot");
+        emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+    u32 vreg = 10u;
+    if (val.kind == OPK_IMM) {
+        emit_load_imm(mc, sf, vreg, val.v.imm);
+    } else if (val.kind == OPK_REG) {
+        emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val)));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported",
+                       (int)val.kind);
+    }
+
+    int do_acq = mem_order_is_acquire(ord);
+    int do_rel = mem_order_is_release(ord);
+
+    MCLabel L_retry = mc->label_new(mc);
+    mc->label_place(mc, L_retry);
+
+    /* prior <- ldxr/ldaxr [base] */
+    if (do_acq) emit32(mc, aa64_ldaxr(sf, reg_num(dst), base));
+    else        emit32(mc, aa64_ldxr (sf, reg_num(dst), base));
+
+    /* new = combine(prior, val) into x11 */
+    emit_rmw_combine(mc, op, sf, /*new=*/11u, /*prior=*/reg_num(dst), vreg);
+
+    /* status <- stxr/stlxr [base], new   ; cbnz status, retry */
+    if (do_rel) emit32(mc, aa64_stlxr(sf, /*Rs=*/12u, /*Rt=*/11u, base));
+    else        emit32(mc, aa64_stxr (sf, /*Rs=*/12u, /*Rt=*/11u, base));
+
+    u32 cbnz_pos = mc->pos(mc);
+    emit32(mc, aa64_cbnz(0, /*Rt=*/12u));
+    mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+    (void)cbnz_pos;
+}
+
+static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok,
+                          Operand addr, Operand expected, Operand desired,
+                          MemAccess ma, MemOrder succ, MemOrder fail)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 sf = (ma.size == 8) ? 1u : 0u;
+    (void)fail;
+
+    /* Pin operands:
+     *   x9  = base
+     *   x10 = expected (compare against prior)
+     *   x11 = desired  (store on match)
+     *   w12 = stxr status flag */
+    u32 base = 9u;
+    if (addr.kind == OPK_REG) emit32(mc, aa64_mov_reg(1, 9, reg_num(addr)));
+    else if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot");
+        emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0));
+    } else {
+        compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported",
+                       (int)addr.kind);
+    }
+    if (expected.kind == OPK_IMM)      emit_load_imm(mc, sf, 10, expected.v.imm);
+    else if (expected.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 10, reg_num(expected)));
+    else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported",
+                        (int)expected.kind);
+    if (desired.kind == OPK_IMM)      emit_load_imm(mc, sf, 11, desired.v.imm);
+    else if (desired.kind == OPK_REG) emit32(mc, aa64_mov_reg(sf, 11, reg_num(desired)));
+    else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported",
+                        (int)desired.kind);
+
+    int do_acq = mem_order_is_acquire(succ);
+    int do_rel = mem_order_is_release(succ);
+
+    MCLabel L_retry = mc->label_new(mc);
+    MCLabel L_fail  = mc->label_new(mc);
+    MCLabel L_done  = mc->label_new(mc);
+
+    mc->label_place(mc, L_retry);
+    if (do_acq) emit32(mc, aa64_ldaxr(sf, reg_num(prior), base));
+    else        emit32(mc, aa64_ldxr (sf, reg_num(prior), base));
+
+    /* if (prior != expected) -> fail (clrex + ok=0) */
+    emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/31u, reg_num(prior), 10u));
+    emit32(mc, aa64_b_cond(0x1u /*NE*/));
+    mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0);
+
+    /* try store; retry on stxr failure */
+    if (do_rel) emit32(mc, aa64_stlxr(sf, 12u, 11u, base));
+    else        emit32(mc, aa64_stxr (sf, 12u, 11u, base));
+    emit32(mc, aa64_cbnz(0, 12u));
+    mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+
+    /* ok = 1 ; jump done */
+    emit_load_imm(mc, 0, reg_num(ok), 1);
+    emit32(mc, aa64_b_base());
+    mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+    /* L_fail: clear monitor; ok = 0 */
+    mc->label_place(mc, L_fail);
+    emit32(mc, aa64_clrex());
+    emit_load_imm(mc, 0, reg_num(ok), 0);
+
+    mc->label_place(mc, L_done);
+}
+
+static void aa_fence(CGTarget* t, MemOrder o)
+{
+    (void)o;
+    /* Conservative: full-system DMB ISH for any release/acquire/seq_cst.
+     * RELAXED fence is a no-op. */
+    if (o == MO_RELAXED) return;
+    emit32(t->mc, aa64_dmb_ish());
+}
+
+/* ---- intrinsics ---- */
+
+/* Data-processing (1 source) — REV16 / REV / REV32 / RBIT / CLZ.
+ * Family base 0x5AC00000 (sf=0); set sf<<31 for 64-bit forms. */
+static inline u32 aa64_rev16_w(u32 Rd, u32 Rn)
+{ return 0x5AC00400u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rev_w  (u32 Rd, u32 Rn)
+{ return 0x5AC00800u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rev_x  (u32 Rd, u32 Rn)
+{ return 0xDAC00C00u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_rbit   (u32 sf64, u32 Rd, u32 Rn)
+{ return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_clz    (u32 sf64, u32 Rd, u32 Rn)
+{ return (sf64 ? 0xDAC01000u : 0x5AC01000u) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* SIMD CNT (Vd.<T>, Vn.<T>) and ADDV (Bd, Vn.8B). 8B form, Q=0. */
+static inline u32 aa64_cnt_8b   (u32 Vd, u32 Vn)
+{ return 0x0E205800u | ((Vn&0x1f)<<5) | (Vd&0x1f); }
+static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn)
+{ return 0x0E31B800u | ((Vn&0x1f)<<5) | (Vd&0x1f); }
+
+/* ADDS / SUBS shifted register (S=1; sets NZCV including V for signed ovf). */
+static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x2B000000u | (sf<<31) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* SMADDL / UMADDL → SMULL / UMULL with Ra = ZR. 64-bit dst, 32-bit srcs. */
+static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra)
+{ return aa64_dp3_pack((AA64DP3){.sf=1,.op31=1,.o0=0,.Rm=Rm,.Ra=Ra,.Rn=Rn,.Rd=Rd}); }
+static inline u32 aa64_smull (u32 Rd, u32 Rn, u32 Rm)
+{ return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); }
+
+/* SUBS Xd, Xn, Wm, SXTW — extended-register form, used for the
+ * mul_overflow check (compare full 64-bit product to sign-extended low 32). */
+static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm)
+{ return 0xEB200000u | ((Rm&0x1f)<<16) | (6u<<13) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* BRK #imm16 — used for TRAP/UNREACHABLE landing pads. */
+static inline u32 aa64_brk(u32 imm16)
+{ return 0xD4200000u | ((imm16 & 0xffffu) << 5); }
+
+static void aa_intrinsic(CGTarget* t, IntrinKind kind,
+                         Operand* dsts, u32 nd,
+                         const Operand* args, u32 na)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    (void)nd;
+
+    switch (kind) {
+    case INTRIN_POPCOUNT: {
+        /* fmov v0, src ; cnt v0.8b, v0.8b ; addv b0, v0.8b ; fmov w_dst, s0 */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sz_in  = type_byte_size(src.type);
+        if (sz_in == 8) emit32(mc, aa64_fmov_d_x(0, reg_num(src)));
+        else            emit32(mc, aa64_fmov_s_w(0, reg_num(src)));
+        emit32(mc, aa64_cnt_8b   (0, 0));
+        emit32(mc, aa64_addv_b_8b(0, 0));
+        emit32(mc, aa64_fmov_w_s (reg_num(dst), 0));
+        return;
+    }
+    case INTRIN_CLZ: {
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(src.type) ? 1u : 0u;
+        emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src)));
+        return;
+    }
+    case INTRIN_CTZ: {
+        /* ctz(x) = clz(rbit(x)) */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(src.type) ? 1u : 0u;
+        emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src)));
+        emit32(mc, aa64_clz (sf, reg_num(dst), reg_num(dst)));
+        return;
+    }
+    case INTRIN_BSWAP16: {
+        emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_BSWAP32: {
+        emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_BSWAP64: {
+        emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0])));
+        return;
+    }
+    case INTRIN_MEMCPY:
+    case INTRIN_MEMMOVE: {
+        /* args = (dst_addr, src_addr, n_bytes). v1 only handles a constant
+         * n: unroll forward (memcpy) or backward (memmove). */
+        Operand da = args[0], sa = args[1], nb = args[2];
+        if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 intrinsic: %s with non-const n or non-REG ptr",
+                kind == INTRIN_MEMCPY ? "memcpy" : "memmove");
+        }
+        u32 dr = reg_num(da);
+        u32 sr = reg_num(sa);
+        u32 n  = (u32)nb.v.imm;
+        if (kind == INTRIN_MEMCPY) {
+            u32 i = 0;
+            while (i + 8 <= n) {
+                emit32(mc, aa64_ldur(3, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(3, 12, dr, (i32)i));
+                i += 8;
+            }
+            while (i + 4 <= n) {
+                emit32(mc, aa64_ldur(2, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(2, 12, dr, (i32)i));
+                i += 4;
+            }
+            while (i + 2 <= n) {
+                emit32(mc, aa64_ldur(1, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(1, 12, dr, (i32)i));
+                i += 2;
+            }
+            while (i < n) {
+                emit32(mc, aa64_ldur(0, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(0, 12, dr, (i32)i));
+                i += 1;
+            }
+        } else {
+            /* memmove: copy backward to handle dst > src overlap. */
+            u32 i = n;
+            while (i >= 8) {
+                i -= 8;
+                emit32(mc, aa64_ldur(3, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(3, 12, dr, (i32)i));
+            }
+            while (i >= 4) {
+                i -= 4;
+                emit32(mc, aa64_ldur(2, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(2, 12, dr, (i32)i));
+            }
+            while (i >= 2) {
+                i -= 2;
+                emit32(mc, aa64_ldur(1, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(1, 12, dr, (i32)i));
+            }
+            while (i >= 1) {
+                i -= 1;
+                emit32(mc, aa64_ldur(0, 12, sr, (i32)i));
+                emit32(mc, aa64_stur(0, 12, dr, (i32)i));
+            }
+        }
+        return;
+    }
+    case INTRIN_MEMSET: {
+        /* args = (dst_addr, byte, n) */
+        Operand da = args[0], bv = args[1], nb = args[2];
+        if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 intrinsic: memset with non-const n / non-REG ptr");
+        }
+        u32 dr = reg_num(da);
+        u32 n  = (u32)nb.v.imm;
+        u32 byte;
+        u32 src_reg;
+        if (bv.kind == OPK_IMM) {
+            byte = (u32)(bv.v.imm & 0xffu);
+            if (byte == 0) {
+                src_reg = 31u;  /* XZR / WZR */
+            } else {
+                u64 b64 = byte;
+                b64 |= b64 << 8; b64 |= b64 << 16; b64 |= b64 << 32;
+                emit_load_imm(mc, 1, 12, (i64)b64);
+                src_reg = 12u;
+            }
+        } else if (bv.kind == OPK_REG) {
+            /* Broadcast: dup low byte across all 8 bytes via ORR-immediate
+             * trickery is awkward; use mul-by-0x0101010101010101. */
+            emit_load_imm(mc, 1, 12, (i64)0x0101010101010101ll);
+            emit32(mc, aa64_madd(1, 12, reg_num(bv), 12, AA64_ZR));
+            src_reg = 12u;
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 intrinsic: memset byte kind %d unsupported",
+                (int)bv.kind);
+        }
+        u32 i = 0;
+        while (i + 8 <= n) { emit32(mc, aa64_stur(3, src_reg, dr, (i32)i)); i += 8; }
+        while (i + 4 <= n) { emit32(mc, aa64_stur(2, src_reg, dr, (i32)i)); i += 4; }
+        while (i + 2 <= n) { emit32(mc, aa64_stur(1, src_reg, dr, (i32)i)); i += 2; }
+        while (i < n)      { emit32(mc, aa64_stur(0, src_reg, dr, (i32)i)); i += 1; }
+        return;
+    }
+    case INTRIN_PREFETCH:
+        /* No-op hint. */
+        (void)args; (void)na;
+        return;
+    case INTRIN_ASSUME_ALIGNED: {
+        /* dst = src (alignment is a hint only). */
+        Operand src = args[0];
+        Operand dst = dsts[0];
+        if (reg_num(src) != reg_num(dst)) {
+            emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src)));
+        }
+        return;
+    }
+    case INTRIN_EXPECT: {
+        /* dst = val (the "expected" hint is dropped). */
+        Operand val = args[0];
+        Operand dst = dsts[0];
+        u32 sf = type_is_64(dst.type) ? 1u : 0u;
+        if (val.kind == OPK_REG) {
+            if (reg_num(val) != reg_num(dst)) {
+                emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val)));
+            }
+        } else if (val.kind == OPK_IMM) {
+            emit_load_imm(mc, sf, reg_num(dst), val.v.imm);
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 intrinsic: expect val kind %d unsupported",
+                (int)val.kind);
+        }
+        return;
+    }
+    case INTRIN_UNREACHABLE:
+    case INTRIN_TRAP:
+        emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u));
+        return;
+    case INTRIN_ADD_OVERFLOW:
+    case INTRIN_SUB_OVERFLOW: {
+        /* dsts: [val, ovf]. ADDS/SUBS sets V on signed overflow; CSET VS. */
+        Operand a_op = args[0], b_op = args[1];
+        Operand dval = dsts[0], dovf = dsts[1];
+        u32 sf = type_is_64(dval.type) ? 1u : 0u;
+        u32 ra = force_reg_int(t, a_op, sf, 9);
+        u32 rb = force_reg_int(t, b_op, sf, (ra == 9) ? 10u : 9u);
+        u32 word = (kind == INTRIN_ADD_OVERFLOW)
+                 ? aa64_adds_reg(sf, reg_num(dval), ra, rb)
+                 : aa64_subs_reg(sf, reg_num(dval), ra, rb);
+        emit32(mc, word);
+        emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/));
+        return;
+    }
+    case INTRIN_MUL_OVERFLOW: {
+        /* SMULL Xtmp, Wn, Wm gives full 64-bit signed product.
+         *   ovf = (Xtmp != sxtw(Wtmp)) — i.e. upper 32 bits ≠ sign-ext of low.
+         * dval gets the truncated low 32 bits. */
+        Operand a_op = args[0], b_op = args[1];
+        Operand dval = dsts[0], dovf = dsts[1];
+        u32 sf = type_is_64(dval.type) ? 1u : 0u;
+        if (sf) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 intrinsic: mul_overflow on i64 not yet supported");
+        }
+        u32 ra = force_reg_int(t, a_op, 0, 9);
+        u32 rb = force_reg_int(t, b_op, 0, (ra == 9) ? 10u : 9u);
+        emit32(mc, aa64_smull(/*X*/11u, ra, rb));
+        emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/31u, /*Xn=*/11u, /*Wm=*/11u));
+        emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
+        emit32(mc, aa64_mov_reg(0, reg_num(dval), 11u));
+        return;
+    }
+    default:
+        compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported",
+                       (int)kind);
+    }
+}
 
 static void aa_asm_block(CGTarget* t, const char* tmpl,
                          const AsmConstraint* outs, u32 no, Operand* oo,

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/abi/abi.c	\|	24	+++++++++++++++++++-----
M	src/arch/aarch64.c	\|	760	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---