cg/aa64: bring up frames, params, calls, FP — pass Groups A/B/C - kit

commit e25cbf0f256f656ae48652f76b1e757ad194564b
parent aef3673230d5108c76a0a58c69c64f1c2ff7fcd5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 14:07:16 -0700

cg/aa64: bring up frames, params, calls, FP — pass Groups A/B/C

Frame layout uses a fixed-size prologue placeholder patched at func_end
so frame_size and the callee-save count are knowable when the prologue
is finally written. Slots are FP-relative so per-slot offsets stay
stable while the eventual frame size is unknown; outgoing stack args
are SP-relative.

New surface:
- frame_slot, param (incoming x0..x7 / v0..v7 + stack overflow + sret x8)
- load/store/addr_of for LOCAL and INDIRECT operands
- call: direct BL with arg materialization (IMM/REG/LOCAL/BYVAL),
  sret pointer in x8, return-value placement into REG/LOCAL incl.
  small-struct-in-regs
- load_const for FP via .rodata + ADRP/LDR with the standard relocs;
  RC_FP allocator; FCVTZS; FMOV reg-reg
- binop accepts IMM via scratch; SREM/UREM via SDIV/UDIV+MSUB; UO_NOT
  via SUBS+CSET (previously panicked and "passed" by exit-code
  coincidence)

All 124 test/cg cases pass across D/R/E/J paths.

Diffstat:
M src/arch/aarch64.c  | 1073 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M test/cg/CORPUS.md  | 44 ++++++++++++++++++++++----------------------

2 files changed, 985 insertions(+), 132 deletions(-)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -1,17 +1,29 @@
 /* Minimal AArch64 CGTarget.
  *
- * Initial coverage matches the spine A + C corpus (function lifecycle and
- * integer arithmetic). Other CGTarget methods panic with a clear "unimpl"
- * diagnostic so test cases that touch them fail visibly rather than
- * silently emitting nothing.
+ * Single-pass codegen for the cg test corpus (Groups A, B, C). Frame
+ * layout uses a fixed-size prologue placeholder patched at func_end so
+ * frame_size and the callee-save register count are knowable when the
+ * prologue is finally written. FP-relative (x29) addressing is used for
+ * local slots and incoming stack args so that per-slot offsets can be
+ * assigned at frame_slot() time without depending on the eventual
+ * frame_size or callee-save count. SP-relative addressing is used for
+ * outgoing stack args.
  *
- * Single-pass register allocation: alloc_reg hands out W19..W28 in order
- * and panics on exhaustion. No live-range tracking, no spills. Suitable
- * for short straight-line fixtures only; replaced when CG's
- * value-stack-aware spill/reload arrives.
+ * Frame layout (low SP -> high):
+ *   outgoing args  (max_outgoing bytes, 16-aligned)
+ *   int reg saves  (n_int_pairs * 16)        -- x19/x20, x21/x22, ...
+ *   fp  reg saves  (n_fp_pairs  * 16)        -- d8/d9, d10/d11, ...
+ *   local slots    (cum_off bytes)
+ *   x29, x30 save  (16 bytes)                -- x29 = sp + frame_size - 16
  *
- * Width is derived from Operand.type via type_is_64(). For the test
- * harness this is enough; full ABI integration arrives with TargetABI. */
+ * Single-pass register allocator: alloc_reg(RC_INT) hands out x19..x28 in
+ * order; alloc_reg(RC_FP) hands out v8..v15. Both ranges are callee-saved
+ * and only the prefix actually used is saved by the prologue. Width
+ * derives from Operand.type via type_is_64. Spill/reload not implemented.
+ *
+ * Multi-function: each func_begin/func_end pair owns its own frame state
+ * via the AAImpl fields, so the harness can build several functions in
+ * one TU. */
 
 #include "arch/arch.h"
 #include "arch/aa64_isa.h"
@@ -21,16 +33,153 @@
 
 #include <string.h>
 
+/* ============================================================
+ * Local encoding helpers (kept here, not in aa64_isa.h, while the
+ * disassembler-shared table only needs the Group A/C subset).
+ * ============================================================ */
+
+#define AA64_NOP 0xD503201Fu
+
+/* ADD/SUB immediate (12-bit imm, optional shift-12). Rd/Rn = 31 means SP
+ * for these encodings (not ZR). */
+static inline u32 aa64_add_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh)
+{ return 0x11000000u | (sf<<31) | ((sh&1)<<22) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_sub_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh)
+{ return 0x51000000u | (sf<<31) | ((sh&1)<<22) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* STP/LDP signed offset, X registers. Offset is byte offset, must be a
+ * multiple of 8; encoded value = byte_offset / 8 in a signed 7-bit field
+ * (range -512..504). */
+static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    i32 sc = byte_off >> 3;
+    return 0xA9000000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    i32 sc = byte_off >> 3;
+    return 0xA9400000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+/* STP/LDP signed offset, D registers (64-bit FP, scale 8). */
+static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    i32 sc = byte_off >> 3;
+    return 0x6D000000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    i32 sc = byte_off >> 3;
+    return 0x6D400000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+
+/* LDUR / STUR (general regs, unscaled simm9 in -256..255).
+ * size: 0=B, 1=H, 2=W, 3=X. */
+static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    return 0x38000000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    return 0x38400000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+/* LDUR/STUR for SIMD & FP registers (V=1). size: 2=S (32-bit), 3=D (64-bit). */
+static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    return 0x3C000000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    return 0x3C400000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+
+/* STR/LDR scaled (unsigned imm12). byte_off must be a multiple of (1<<size). */
+static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    u32 sc = byte_off >> size;
+    return 0x39000000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+/* Branch (unconditional, 26-bit imm). Emitted with imm26=0 when paired
+ * with a JUMP26/CALL26 relocation; the patcher fills in imm26. */
+static inline u32 aa64_b_base(void)  { return 0x14000000u; }
+static inline u32 aa64_bl_base(void) { return 0x94000000u; }
+
+/* ADRP base (Rd in low 5 bits). imm bits filled by relocation. */
+static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd&0x1f); }
+
+/* LDR (unsigned offset) for SIMD & FP, used after ADRP for FP literals.
+ * size 2 => S (32-bit), 3 => D (64-bit). imm12 patched by linker. */
+static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    u32 sc = byte_off >> size;
+    return 0x3D400000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f);
+}
+
+/* FMOV (scalar register). type: 0=single, 1=double. */
+static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn)
+{ return 0x1E204000u | ((type&3)<<22) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* SUBS immediate (used to encode CMP Xn, #imm via SUBS ZR, Xn, #imm). */
+static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12)
+{ return 0x71000000u | (sf<<31) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* CSET Wd/Xd, EQ — alias of CSINC Rd, ZR, ZR, NE (inverted EQ). */
+static inline u32 aa64_cset_eq(u32 sf, u32 Rd)
+{ return 0x1A800400u | (sf<<31) | (31u<<16) | (0x1u<<12) | (31u<<5) | (Rd&0x1f); }
+
+/* FCVTZS (scalar fp -> integer, round toward zero, signed).
+ * sf: 0=W, 1=X. type: 0=S, 1=D. */
+static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn)
+{ return 0x1E380000u | (sf<<31) | ((type&3)<<22) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
+/* ============================================================
+ * AAImpl
+ * ============================================================ */
+
+#define AA_PROLOGUE_WORDS 12u    /* 1 sub sp + 2 stp/add fp + 5 int + 4 fp = 12; a 2-insn sub sp or the sret x8 save on top of that panics in func_end */
+
+typedef struct AASlot {
+    u32 off;        /* bytes below fp; address = x29 - off */
+    u32 size;
+    u32 align;
+    u8  kind;       /* FrameSlotKind */
+    u8  pad[3];
+} AASlot;
+
 typedef struct AAImpl {
     CGTarget   base;
     SrcLoc     loc;
     const CGFuncDesc* fd;
+
+    /* Function emission. */
     u32        func_start;
-    u32        next_alloc;
+    u32        prologue_pos;
+    MCLabel    epilogue_label;
+
+    /* Frame layout (in bytes; final frame_size computed at func_end). */
+    AASlot*    slots;
+    u32        nslots;
+    u32        slots_cap;
+    u32        cum_off;          /* total bytes consumed by local slots */
+    u32        max_outgoing;     /* max stack arg bytes for any call */
+
+    /* Param incoming tracking — set by func_begin from ABIFuncInfo. */
+    u32        next_param_int;   /* x0..x7 consumed so far */
+    u32        next_param_fp;    /* v0..v7 consumed so far */
+    u32        next_param_stack; /* offset into caller's stack arg area */
+    u8         has_sret;         /* sret pointer arrived in x8 */
+    FrameSlot  sret_ptr_slot;    /* hidden slot holding incoming x8 */
+
+    /* Reg allocator (callee-saved prefix). */
+    u32        used_int;         /* x19 + i, i in [0, used_int) */
+    u32        used_fp;          /* v8  + i, i in [0, used_fp ) */
 } AAImpl;
 
 static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
 
+/* Forward decls used before definition. */
+static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
+static AASlot* slot_get(AAImpl* a, FrameSlot fs);
+
 /* ---- helpers ---- */
 
 static int type_is_64(const Type* t)
@@ -47,6 +196,46 @@ static int type_is_64(const Type* t)
     }
 }
 
+static int type_is_fp_double(const Type* t)
+{ return t && (t->kind == TY_DOUBLE || t->kind == TY_LDOUBLE); }
+
+static int type_is_signed(const Type* t)
+{
+    if (!t) return 0;
+    switch (t->kind) {
+    case TY_CHAR: case TY_SCHAR:
+    case TY_SHORT: case TY_INT: case TY_LONG: case TY_LLONG:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+static u32 type_byte_size(const Type* t)
+{
+    if (!t) return 4;
+    switch (t->kind) {
+    case TY_CHAR: case TY_SCHAR: case TY_UCHAR: case TY_BOOL: return 1;
+    case TY_SHORT: case TY_USHORT: return 2;
+    case TY_INT: case TY_UINT: case TY_FLOAT: return 4;
+    case TY_LONG: case TY_ULONG: case TY_LLONG: case TY_ULLONG:
+    case TY_PTR: case TY_DOUBLE: return 8;
+    default: return 8;
+    }
+}
+
+/* Encode size index for STUR/LDUR (0=B,1=H,2=W,3=X). */
+static u32 size_idx_for_bytes(u32 nbytes)
+{
+    switch (nbytes) {
+    case 1: return 0;
+    case 2: return 1;
+    case 4: return 2;
+    case 8: return 3;
+    default: return 3;
+    }
+}
+
 static u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
 
 static void emit32(MCEmitter* mc, u32 word)
@@ -59,12 +248,78 @@ static void emit32(MCEmitter* mc, u32 word)
     mc->emit_bytes(mc, b, 4);
 }
 
+static void patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word)
+{
+    u8 b[4];
+    b[0] = (u8)(word        & 0xff);
+    b[1] = (u8)((word >> 8) & 0xff);
+    b[2] = (u8)((word >> 16)& 0xff);
+    b[3] = (u8)((word >> 24)& 0xff);
+    obj_patch(obj, sec_id, ofs, b, 4);
+}
+
 static _Noreturn void aa_panic(CGTarget* t, const char* what)
 {
     SrcLoc loc = impl_of(t)->loc;
     compiler_panic(t->c, loc, "aarch64: %s not implemented", what);
 }
 
+/* ---- AArch64 immediate encoding helpers ---- */
+
+/* Materialize imm into Rd using MOVZ/MOVN/MOVK; when sf == 0 only the low
+ * 32 bits are used. Shared by load_imm() and internal immediate synthesis. */
+static void emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm)
+{
+    const u32 nslots = sf ? 4u : 2u;
+    u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
+
+    for (u32 i = 0; i < nslots; ++i) {
+        u32 slot   = (u32)((v >> (i * 16)) & 0xffffu);
+        u64 cleared = v & ~((u64)0xffffu << (i * 16));
+        if (slot != 0 && cleared == 0) {
+            emit32(mc, aa64_movz(sf, Rd, slot, i));
+            return;
+        }
+    }
+
+    {
+        u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
+        for (u32 i = 0; i < nslots; ++i) {
+            u32 slot   = (u32)((inv >> (i * 16)) & 0xffffu);
+            u64 cleared = inv & ~((u64)0xffffu << (i * 16));
+            if (cleared == 0) {
+                emit32(mc, aa64_movn(sf, Rd, slot, i));
+                return;
+            }
+        }
+    }
+
+    int placed = 0;
+    for (u32 i = 0; i < nslots; ++i) {
+        u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
+        if (!placed) {
+            if (slot == 0) continue;
+            emit32(mc, aa64_movz(sf, Rd, slot, i));
+            placed = 1;
+        } else if (slot != 0) {
+            emit32(mc, aa64_movk(sf, Rd, slot, i));
+        }
+    }
+    if (!placed) emit32(mc, aa64_movz(sf, Rd, 0, 0));
+}
+
+static void emit_sp_add(MCEmitter* mc, u32 imm)
+{
+    if (imm <= 0xfff) {
+        emit32(mc, aa64_add_imm(1, 31, 31, imm, 0));
+    } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) {
+        emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1));
+    } else {
+        emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1));
+        emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0));
+    }
+}
+
 /* ---- function lifecycle ---- */
 
 static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
@@ -73,26 +328,144 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
     MCEmitter* mc = t->mc;
 
     mc->set_section(mc, fd->text_section_id);
-    mc->emit_align(mc, 4, 0);    /* instruction alignment */
+    mc->emit_align(mc, 4, 0);
 
     a->fd          = fd;
     a->func_start  = mc->pos(mc);
-    a->next_alloc  = 0;
+    a->next_param_int = 0;
+    a->next_param_fp  = 0;
+    a->next_param_stack = 0;
+    a->has_sret    = (fd->abi && fd->abi->has_sret) ? 1 : 0;
+    a->cum_off     = 0;
+    a->max_outgoing= 0;
+    a->used_int    = 0;
+    a->used_fp     = 0;
+    a->nslots      = 0;
+    a->sret_ptr_slot = FRAME_SLOT_NONE;
+    a->epilogue_label = mc->label_new(mc);
 
     mc->cfi_startproc(mc);
+
+    /* Reserve a fixed-size prologue placeholder, NOP-filled. We patch the
+     * prefix at func_end with the real prologue once frame_size and the
+     * callee-save count are known. */
+    a->prologue_pos = mc->pos(mc);
+    for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) emit32(mc, AA64_NOP);
+
+    /* If the function returns indirect (sret), x8 holds the destination
+     * pointer on entry. Reserve a hidden slot to spill it into so the
+     * body can use x8 as scratch and ret can recover the dest pointer. */
+    if (a->has_sret) {
+        FrameSlotDesc fsd = {
+            .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+            .size = 8, .align = 8, .kind = FS_SPILL, .flags = 0,
+        };
+        a->sret_ptr_slot = aa_frame_slot(t, &fsd);
+    }
 }
 
 static void aa_func_end(CGTarget* t)
 {
     AAImpl*    a  = impl_of(t);
     MCEmitter* mc = t->mc;
-    u32        end = mc->pos(mc);
 
-    obj_symbol_define(t->obj,
-                      a->fd->sym,
-                      a->fd->text_section_id,
-                      (u64)a->func_start,
-                      (u64)(end - a->func_start));
+    /* Compute callee-save layout. */
+    u32 n_int_pairs = (a->used_int + 1) / 2;   /* round up */
+    u32 n_fp_pairs  = (a->used_fp  + 1) / 2;
+
+    u32 outgoing_off = 0;
+    u32 int_save_off = a->max_outgoing;
+    u32 fp_save_off  = int_save_off + n_int_pairs * 16;
+    u32 locals_off   = fp_save_off + n_fp_pairs * 16;
+    u32 fp_lr_off    = locals_off + a->cum_off;
+    u32 frame_size   = fp_lr_off + 16;
+    /* round to 16. */
+    frame_size = (frame_size + 15u) & ~15u;
+    fp_lr_off  = frame_size - 16;
+
+    (void)outgoing_off;
+
+    /* Place the epilogue label at the current pos, then emit the epilogue.
+     * The label must point at the first instruction of the epilogue so
+     * `b epilogue` branches land here. */
+    mc->label_place(mc, a->epilogue_label);
+
+    /* Restore FP saves, then INT saves, then fp/lr, then add sp + ret. */
+    for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) {
+        u32 r0 = 8u + (u32)i * 2u;
+        u32 r1 = r0 + 1u;
+        emit32(mc, aa64_ldp_d(r0, r1, 31, (i32)(fp_save_off + (u32)i*16u)));
+    }
+    for (i32 i = (i32)n_int_pairs - 1; i >= 0; --i) {
+        u32 r0 = 19u + (u32)i * 2u;
+        u32 r1 = r0 + 1u;
+        emit32(mc, aa64_ldp_x(r0, r1, 31, (i32)(int_save_off + (u32)i*16u)));
+    }
+    emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+    emit_sp_add(mc, frame_size);
+    emit32(mc, aa64_ret(AA64_LR));
+
+    /* Now patch prologue placeholder. */
+    u32 pos = a->prologue_pos;
+    ObjBuilder* obj = t->obj;
+    u32 sec = a->fd->text_section_id;
+
+    u32 words[AA_PROLOGUE_WORDS];
+    for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP;
+    u32 wi = 0;
+
+    /* sub sp, sp, #frame_size — may take 2 insns if > 4095. */
+    if (frame_size <= 0xfff) {
+        words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
+    } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
+        words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
+    } else {
+        if (wi + 2 > AA_PROLOGUE_WORDS) {
+            compiler_panic(t->c, a->loc,
+                "aarch64: prologue overflow for frame_size %u", frame_size);
+        }
+        words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
+        words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
+    }
+    /* stp x29, x30, [sp, #fp_lr_off]; add x29, sp, #fp_lr_off */
+    words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
+    words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+    /* If sret, save incoming x8 (caller's destination pointer). */
+    if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
+        AASlot* s = slot_get(a, a->sret_ptr_slot);
+        if (s) {
+            if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+            words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off);
+        }
+    }
+    /* INT pair saves. */
+    for (u32 i = 0; i < n_int_pairs; ++i) {
+        u32 r0 = 19u + i*2u;
+        u32 r1 = r0 + 1u;
+        if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+        words[wi++] = aa64_stp_x(r0, r1, 31, (i32)(int_save_off + i*16u));
+    }
+    for (u32 i = 0; i < n_fp_pairs; ++i) {
+        u32 r0 = 8u + i*2u;
+        u32 r1 = r0 + 1u;
+        if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+        words[wi++] = aa64_stp_d(r0, r1, 31, (i32)(fp_save_off + i*16u));
+    }
+    if (0) {
+overflow:
+        compiler_panic(t->c, a->loc,
+            "aarch64: prologue placeholder too small (used %u of %u words)",
+            wi, AA_PROLOGUE_WORDS);
+    }
+
+    for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) {
+        patch32(obj, sec, pos + i*4u, words[i]);
+    }
+
+    /* Define the function symbol. */
+    u32 end = mc->pos(mc);
+    obj_symbol_define(obj, a->fd->sym, sec,
+                      (u64)a->func_start, (u64)(end - a->func_start));
 
     mc->cfi_endproc(mc);
     a->fd = NULL;
@@ -104,25 +477,128 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty)
 {
     AAImpl* a = impl_of(t);
     (void)ty;
-    if (cls != RC_INT) {
-        compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
+    if (cls == RC_INT) {
+        if (a->used_int >= 10) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 alloc_reg: out of INT scratch (no spill yet)");
+        }
+        return (Reg)(19u + a->used_int++);
     }
-    if (a->next_alloc >= 10) {
-        compiler_panic(t->c, a->loc,
-                       "aarch64 alloc_reg: out of scratch regs (no spill yet)");
+    if (cls == RC_FP) {
+        if (a->used_fp >= 8) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 alloc_reg: out of FP scratch (no spill yet)");
+        }
+        return (Reg)(8u + a->used_fp++);
     }
-    return (Reg)(19u + a->next_alloc++);
+    compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
 }
 
 static void aa_free_reg(CGTarget* t, Reg r) { (void)t; (void)r; }
 
-static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { (void)d; aa_panic(t, "frame_slot"); }
-static void      aa_param     (CGTarget* t, const CGParamDesc* p)    { (void)p; aa_panic(t, "param"); }
+static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d)
+{
+    AAImpl* a = impl_of(t);
+    if (a->nslots == a->slots_cap) {
+        u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8;
+        AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap);
+        if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots);
+        a->slots = nbuf;
+        a->slots_cap = ncap;
+    }
+    u32 size  = d->size  ? d->size  : 8;
+    u32 align = d->align ? d->align : 1;
+    u32 next  = a->cum_off + size;
+    /* Round up so that slot start (= fp - off) is align-aligned. fp is
+     * 16-aligned, so aligning off suffices for power-of-two align <= 16. */
+    u32 mask  = align - 1;
+    next = (next + mask) & ~mask;
+
+    AASlot* s = &a->slots[a->nslots];
+    s->off   = next;
+    s->size  = size;
+    s->align = align;
+    s->kind  = d->kind;
+
+    a->cum_off = next;
+    a->nslots++;
+    return (FrameSlot)(a->nslots);    /* 1-based; FRAME_SLOT_NONE == 0 */
+}
+
+static AASlot* slot_get(AAImpl* a, FrameSlot fs)
+{
+    if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL;
+    return &a->slots[fs - 1];
+}
+
+/* ---- param: store incoming arg(s) into the home slot ---- */
+
+static void aa_param(CGTarget* t, const CGParamDesc* p)
+{
+    AAImpl* a = impl_of(t);
+    AASlot* s = slot_get(a, p->slot);
+    if (!s) {
+        compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
+    }
+    const ABIArgInfo* ai = p->abi;
+
+    if (ai->kind == ABI_ARG_IGNORE) return;
+    if (ai->kind == ABI_ARG_INDIRECT) {
+        /* Caller passes a pointer to the data. Pointer comes in next
+         * INT arg reg; store it into the home slot (which holds the
+         * pointer-sized address). */
+        if (a->next_param_int < 8) {
+            u32 reg = a->next_param_int++;
+            emit32(t->mc, aa64_stur(3, reg, 29, -(i32)s->off));
+        } else {
+            /* Pointer on stack — load and store. */
+            u32 caller_off = a->next_param_stack;
+            a->next_param_stack += 8;
+            emit32(t->mc, aa64_ldur(3, 9, 29, (i32)(16 + caller_off)));
+            emit32(t->mc, aa64_stur(3, 9, 29, -(i32)s->off));
+        }
+        return;
+    }
+    /* DIRECT: place each part. */
+    for (u16 i = 0; i < ai->nparts; ++i) {
+        const ABIArgPart* pt = &ai->parts[i];
+        u32 part_off = pt->src_offset;
+        u32 sz = pt->size;
+        u32 sidx = size_idx_for_bytes(sz);
+
+        if (pt->cls == ABI_CLASS_INT) {
+            if (a->next_param_int < 8) {
+                u32 reg = a->next_param_int++;
+                emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+            } else {
+                /* Each stack-passed slot is 8 bytes regardless of part size. */
+                u32 caller_off = a->next_param_stack;
+                a->next_param_stack += 8;
+                emit32(t->mc, aa64_ldur(sidx, 9, 29, (i32)(16 + caller_off)));
+                emit32(t->mc, aa64_stur(sidx, 9, 29, -(i32)s->off + (i32)part_off));
+            }
+        } else if (pt->cls == ABI_CLASS_FP) {
+            if (a->next_param_fp < 8) {
+                u32 reg = a->next_param_fp++;
+                emit32(t->mc, aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+            } else {
+                u32 caller_off = a->next_param_stack;
+                a->next_param_stack += 8;
+                emit32(t->mc, aa64_ldur_fp(sidx, 0, 29, (i32)(16 + caller_off)));
+                emit32(t->mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off + (i32)part_off));
+            }
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 param: ABI class %d unimpl", (int)pt->cls);
+        }
+    }
+}
+
 static const Reg* aa_clobbers (CGTarget* t, RegClass c, u32* n)      { (void)c; (void)n; aa_panic(t, "clobbers"); }
 static void      aa_spill_reg (CGTarget* t, Operand s, FrameSlot f, MemAccess m) { (void)s; (void)f; (void)m; aa_panic(t, "spill_reg"); }
 static void      aa_reload_reg(CGTarget* t, Operand d, FrameSlot f, MemAccess m) { (void)d; (void)f; (void)m; aa_panic(t, "reload_reg"); }
 
-/* ---- labels / control flow (deferred) ---- */
+/* ---- labels / control flow (deferred for D-group; ret uses internal label) ---- */
 
 static Label aa_label_new  (CGTarget* t)                         { aa_panic(t, "label_new"); }
 static void  aa_label_place(CGTarget* t, Label l)                { (void)l; aa_panic(t, "label_place"); }
@@ -139,74 +615,169 @@ static void    aa_continue_to(CGTarget* t, CGScope s)             { (void)s; aa_
 
 static void aa_load_imm(CGTarget* t, Operand dst, i64 imm)
 {
-    MCEmitter* mc = t->mc;
     u32 sf = type_is_64(dst.type) ? 1u : 0u;
-    u32 rd = reg_num(dst);
-
-    /* Effective bit-width: 32 unless we're materializing into Xd. The 32-bit
-     * encoding zero-extends the result, so we mask to 32 bits when sf==0
-     * so a "negative" int constant materializes its low 32 bits exactly. */
-    const u32 nslots = sf ? 4u : 2u;
-    u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
+    emit_load_imm(t->mc, sf, reg_num(dst), imm);
+}
 
-    /* Single MOVZ when only one 16-bit slot is non-zero. */
-    for (u32 i = 0; i < nslots; ++i) {
-        u32 slot   = (u32)((v >> (i * 16)) & 0xffffu);
-        u64 cleared = v & ~((u64)0xffffu << (i * 16));
-        if (slot != 0 && cleared == 0) {
-            emit32(mc, aa64_movz(sf, rd, slot, i));
-            return;
-        }
+/* load_const: emit ADRP + LDR Sd/Dd, [Xt, #:lo12:sym] against a fresh
+ * symbol in .rodata. Used by b08 to materialize a float bit pattern. */
+static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb)
+{
+    AAImpl* a = impl_of(t);
+    if (dst.cls != RC_FP) {
+        compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1");
     }
 
-    /* Single MOVN when one slot of the inverted value covers the rest.
-     * For sf==1 the "rest is all ones" test is over the full 64 bits;
-     * for sf==0 we work in the 32-bit space. */
+    /* Find or create .rodata. */
+    Sym ro_name = pool_intern_cstr(t->c->global, ".rodata");
+    ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, cb.align ? cb.align : 4);
+
+    u32 cur_section = t->mc->section_id;
+    t->mc->set_section(t->mc, ro);
+    t->mc->emit_align(t->mc, cb.align ? cb.align : 4, 0);
+    u32 ro_off = t->mc->pos(t->mc);
+    t->mc->emit_bytes(t->mc, cb.bytes, cb.size);
+
+    /* Local symbol pointing at the literal. */
+    char namebuf[64];
+    static u32 lit_seq = 0;
+    int len = 0;
     {
-        u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
-        u64 all = sf ? ~(u64)0 : 0xffffffffu;
-        (void)all;
-        for (u32 i = 0; i < nslots; ++i) {
-            u32 slot   = (u32)((inv >> (i * 16)) & 0xffffu);
-            u64 cleared = inv & ~((u64)0xffffu << (i * 16));
-            if (cleared == 0) {
-                emit32(mc, aa64_movn(sf, rd, slot, i));
-                return;
-            }
-        }
+        const char* prefix = ".LCFP";
+        for (; prefix[len]; ++len) namebuf[len] = prefix[len];
+        u32 v = lit_seq++;
+        char tmp[16]; int tn = 0;
+        if (v == 0) tmp[tn++] = '0';
+        else { while (v) { tmp[tn++] = '0' + (char)(v % 10); v /= 10; } }
+        for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i];
+        namebuf[len] = 0;
     }
+    Sym sname = pool_intern_cstr(t->c->global, namebuf);
+    ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro,
+                              (u64)ro_off, (u64)cb.size);
+
+    t->mc->set_section(t->mc, cur_section);
+
+    /* ADRP X9, sym ; LDR Sd, [X9, #:lo12:sym] */
+    u32 adrp_pos = t->mc->pos(t->mc);
+    emit32(t->mc, aa64_adrp_base(9));
+    t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos,
+                         R_AARCH64_ADR_PREL_PG_HI21, sym, 0, 0, 0);
+
+    u32 ldr_pos = t->mc->pos(t->mc);
+    u32 sidx = (cb.size == 8) ? 3u : 2u;
+    emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0));
+    RelocKind lo12 = (cb.size == 8)
+        ? R_AARCH64_LDST64_ABS_LO12_NC
+        : R_AARCH64_LDST32_ABS_LO12_NC;
+    t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0);
+}
 
-    /* General path: MOVZ the lowest non-zero slot, then MOVK any other
-     * non-zero slot. v==0 was caught by the single-MOVZ branch above. */
-    int placed = 0;
-    for (u32 i = 0; i < nslots; ++i) {
-        u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
-        if (!placed) {
-            if (slot == 0) continue;
-            emit32(mc, aa64_movz(sf, rd, slot, i));
-            placed = 1;
-        } else if (slot != 0) {
-            emit32(mc, aa64_movk(sf, rd, slot, i));
-        }
+static void aa_copy(CGTarget* t, Operand dst, Operand src)
+{
+    if (dst.cls == RC_FP || src.cls == RC_FP) {
+        u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+        emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src)));
+        return;
     }
-    if (!placed) {
-        /* Defensive: should be unreachable (v==0 caught above). */
-        emit32(mc, aa64_movz(sf, rd, 0, 0));
+    u32 sf = type_is_64(dst.type) ? 1u : 0u;
+    emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
+}
+
+/* ---- load / store / addr_of ---- */
+
+/* Resolve an address operand (LOCAL or INDIRECT) into (base_reg, signed
+ * offset) via a possibly-temporary base register. Returns the base reg. */
+static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg)
+{
+    AAImpl* a = impl_of(t);
+    if (addr.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, addr.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot");
+        *out_off = -(i32)s->off;
+        return 29;          /* x29 = fp */
+    }
+    if (addr.kind == OPK_INDIRECT) {
+        *out_off = addr.v.ind.ofs;
+        return reg_num((Operand){.kind=OPK_REG, .v.reg = addr.v.ind.base});
     }
+    if (addr.kind == OPK_GLOBAL) {
+        compiler_panic(t->c, a->loc, "aarch64: GLOBAL address not yet supported");
+    }
+    (void)tmp_reg;
+    compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d",
+                   (int)addr.kind);
 }
 
-static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb)
-{ (void)dst; (void)cb; aa_panic(t, "load_const"); }
+static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma)
+{
+    AAImpl* a = impl_of(t);
+    i32 off;
+    u32 base = addr_base(t, addr, &off, 9);
+    u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
+    u32 sidx = size_idx_for_bytes(sz);
+    if (off < -256 || off > 255) {
+        compiler_panic(t->c, a->loc, "aarch64 load: offset %d out of LDUR range", off);
+    }
+    if (dst.cls == RC_FP) {
+        emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off));
+    } else {
+        emit32(t->mc, aa64_ldur(sidx, reg_num(dst), base, off));
+    }
+}
 
-static void aa_copy(CGTarget* t, Operand dst, Operand src)
+static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma)
 {
-    u32 sf = type_is_64(dst.type) ? 1u : 0u;
-    emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
+    AAImpl* a = impl_of(t);
+    i32 off;
+    u32 base = addr_base(t, addr, &off, 9);
+    u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
+    u32 sidx = size_idx_for_bytes(sz);
+    if (off < -256 || off > 255) {
+        compiler_panic(t->c, a->loc, "aarch64 store: offset %d out of STUR range", off);
+    }
+
+    if (src.kind == OPK_IMM) {
+        /* Materialize through a scratch register. Use x9 (caller-saved). */
+        u32 sf = (sz == 8) ? 1u : 0u;
+        emit_load_imm(t->mc, sf, 9, src.v.imm);
+        emit32(t->mc, aa64_stur(sidx, 9, base, off));
+        return;
+    }
+    if (src.cls == RC_FP) {
+        emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off));
+    } else {
+        emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off));
+    }
+}
+
+static void aa_addr_of(CGTarget* t, Operand dst, Operand lv)
+{
+    AAImpl* a = impl_of(t);
+    if (lv.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, lv.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
+        /* dst = x29 - off */
+        emit32(t->mc, aa64_sub_imm(1, reg_num(dst), 29, s->off, 0));
+        return;
+    }
+    if (lv.kind == OPK_INDIRECT) {
+        i32 ofs = lv.v.ind.ofs;
+        u32 base = lv.v.ind.base & 0x1f;
+        if (ofs == 0) {
+            emit32(t->mc, aa64_mov_reg(1, reg_num(dst), base));
+        } else if (ofs > 0 && ofs <= 0xfff) {
+            emit32(t->mc, aa64_add_imm(1, reg_num(dst), base, (u32)ofs, 0));
+        } else if (ofs < 0 && -ofs <= 0xfff) {
+            emit32(t->mc, aa64_sub_imm(1, reg_num(dst), base, (u32)(-ofs), 0));
+        } else {
+            compiler_panic(t->c, a->loc, "aarch64 addr_of: indirect offset %d unsupported", ofs);
+        }
+        return;
+    }
+    aa_panic(t, "addr_of");
 }
 
-static void aa_load    (CGTarget* t, Operand d, Operand a, MemAccess m) { (void)d;(void)a;(void)m; aa_panic(t, "load"); }
-static void aa_store   (CGTarget* t, Operand a, Operand s, MemAccess m) { (void)a;(void)s;(void)m; aa_panic(t, "store"); }
-static void aa_addr_of (CGTarget* t, Operand d, Operand l)              { (void)d;(void)l; aa_panic(t, "addr_of"); }
 static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a)   { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); }
 static void aa_copy_bytes(CGTarget* t, Operand d, Operand s, AggregateAccess g) { (void)d;(void)s;(void)g; aa_panic(t, "copy_bytes"); }
 static void aa_set_bytes (CGTarget* t, Operand d, Operand b, AggregateAccess g) { (void)d;(void)b;(void)g; aa_panic(t, "set_bytes"); }
@@ -215,21 +786,28 @@ static void aa_bitfield_store(CGTarget* t, Operand a, Operand s, BitFieldAccess 
 
 /* ---- arithmetic ---- */
 
-static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
+/* Return the register number that holds `op`: REG operands are used as-is,
+ * IMM operands are first loaded into `scratch`; any other kind panics. */
+static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch)
+{
+    if (op.kind == OPK_REG) return reg_num(op);
+    if (op.kind == OPK_IMM) {
+        emit_load_imm(t->mc, sf, scratch, op.v.imm);
+        return scratch;
+    }
+    compiler_panic(t->c, impl_of(t)->loc,
+        "aarch64 binop: operand kind %d unsupported", (int)op.kind);
+}
+
+static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, Operand b_op)
 {
     MCEmitter* mc = t->mc;
     u32 sf = type_is_64(dst.type) ? 1u : 0u;
     u32 rd = reg_num(dst);
-    u32 rn = reg_num(a);
-    u32 rm = reg_num(b);
+    u32 rn = force_reg_int(t, a_op, sf, 9);
+    u32 rm = force_reg_int(t, b_op, sf, (rn == 9) ? 10 : 9);
     u32 word;
 
-    /* All operands must be REG. CG materializes immediates first. */
-    if (a.kind != OPK_REG || b.kind != OPK_REG) {
-        compiler_panic(t->c, impl_of(t)->loc,
-                       "aarch64 binop: non-REG operands not yet supported");
-    }
-
     switch (op) {
     case BO_IADD:  word = aa64_add (sf, rd, rn, rm); break;
     case BO_ISUB:  word = aa64_sub (sf, rd, rn, rm); break;
@@ -242,8 +820,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
     case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break;
     case BO_UDIV:  word = aa64_udiv(sf, rd, rn, rm); break;
     case BO_SDIV:  word = aa64_sdiv(sf, rd, rn, rm); break;
+    /* rem = a - (a/b)*b → SDIV/UDIV into x11, then MSUB rd, x11, b, a. */
     case BO_SREM:
+        emit32(mc, aa64_sdiv(sf, 11, rn, rm));
+        word = aa64_msub(sf, rd, 11, rm, rn);
+        break;
     case BO_UREM:
+        emit32(mc, aa64_udiv(sf, 11, rn, rm));
+        word = aa64_msub(sf, rd, 11, rm, rn);
+        break;
     case BO_FADD: case BO_FSUB: case BO_FMUL: case BO_FDIV:
     default:
         compiler_panic(t->c, impl_of(t)->loc,
@@ -252,15 +837,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
     emit32(mc, word);
 }
 
-static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
+static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op)
 {
     MCEmitter* mc = t->mc;
     u32 sf = type_is_64(dst.type) ? 1u : 0u;
     u32 rd = reg_num(dst);
-    u32 rn = reg_num(a);
+    u32 rn = reg_num(a_op);
     u32 word;
 
-    if (a.kind != OPK_REG) {
+    if (a_op.kind != OPK_REG) {
         compiler_panic(t->c, impl_of(t)->loc,
                        "aarch64 unop: non-REG operand not yet supported");
     }
@@ -269,6 +854,10 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
     case UO_NEG:  word = aa64_neg(sf, rd, rn); break;
     case UO_BNOT: word = aa64_mvn(sf, rd, rn); break;
     case UO_NOT:
+        /* !x → cmp Xn, #0 ; cset Xd, EQ */
+        emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
+        word = aa64_cset_eq(sf, rd);
+        break;
     default:
         compiler_panic(t->c, impl_of(t)->loc,
                        "aarch64 unop: op %d unimpl", (int)op);
@@ -276,26 +865,291 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
     emit32(mc, word);
 }
 
-static void aa_cmp    (CGTarget* t, CmpOp op, Operand d, Operand a, Operand b)  { (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); }
-static void aa_convert(CGTarget* t, ConvKind k, Operand d, Operand s)           { (void)k;(void)d;(void)s; aa_panic(t, "convert"); }
+static void aa_cmp(CGTarget* t, CmpOp op, Operand d, Operand a, Operand b)
+{ (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); } /* stub: ignores its arguments and reports "cmp" via aa_panic */
+
+static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src)
+{
+    AAImpl* a = impl_of(t);
+    switch (k) {
+    case CV_FTOI_S: {  /* FP register -> integer register, emitted as one FCVTZS */
+        if (src.cls != RC_FP || dst.cls != RC_INT) {
+            compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
+        }
+        u32 sf   = type_is_64(dst.type) ? 1u : 0u;  /* 1 when the destination type is 64-bit */
+        u32 type = type_is_fp_double(src.type) ? 1u : 0u;  /* 1 when the source is a double */
+        emit32(t->mc, aa64_fcvtzs(sf, type, reg_num(dst), reg_num(src)));
+        return;
+    }
+    default:  /* every other conversion kind panics as unimplemented */
+        compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
+    }
+}
 
 /* ---- calls / return ---- */
 
-static void aa_call(CGTarget* t, const CGCallDesc* d) { (void)d; aa_panic(t, "call"); }
+/* Materialize one CGABIValue as an outgoing argument: register parts go
+ * to x0..x7 / v0..v7 (tracked by *next_int / *next_fp), overflow goes to
+ * [sp, #*stack_off] in 8-byte slots. INDIRECT passes the address of the
+ * LOCAL storage; DIRECT LOCAL storage is loaded part by part. Stacked FP
+ * parts are stored straight from their source register (no scratch). */
+static void emit_arg_value(CGTarget* t,
+                            const CGABIValue* av,
+                            u32* next_int, u32* next_fp, u32* stack_off)
+{
+    AAImpl* a = impl_of(t);
+    const ABIArgInfo* ai = av->abi;
+    if (ai->kind == ABI_ARG_IGNORE) return;
+
+    if (ai->kind == ABI_ARG_INDIRECT) {
+        /* Pass the address of the storage. storage is OPK_LOCAL holding
+         * the byval source. */
+        u32 dst_reg;
+        int to_stack = (*next_int >= 8);
+        if (!to_stack) dst_reg = (*next_int)++;
+        else dst_reg = 9;
+        if (av->storage.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, av->storage.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot");
+            emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0));
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 call: INDIRECT arg storage kind %d unsupported",
+                (int)av->storage.kind);
+        }
+        if (to_stack) {
+            emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+            *stack_off += 8;
+        }
+        return;
+    }
 
+    /* DIRECT — possibly multiple parts. */
+    for (u16 i = 0; i < ai->nparts; ++i) {
+        const ABIArgPart* pt = &ai->parts[i];
+        u32 sz = pt->size;
+        u32 sidx = size_idx_for_bytes(sz);
+
+        if (pt->cls == ABI_CLASS_INT) {
+            int to_stack = (*next_int >= 8);
+            u32 dst_reg = to_stack ? 9u : (*next_int)++;
+            /* Source bits for this part. */
+            switch (av->storage.kind) {
+            case OPK_IMM: {
+                u32 sf = (sz == 8) ? 1u : 0u;
+                emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm);
+                break;
+            }
+            case OPK_REG: {
+                u32 sf = (sz == 8) ? 1u : 0u;
+                emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage)));
+                break;
+            }
+            case OPK_LOCAL: {
+                /* BYVAL aggregate carried in registers: load chunks from
+                 * the source local's address + part->src_offset. */
+                AASlot* s = slot_get(a, av->storage.v.frame_slot);
+                if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
+                i32 off = -(i32)s->off + (i32)pt->src_offset;
+                emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
+                break;
+            }
+            default:
+                compiler_panic(t->c, a->loc,
+                    "aarch64 call: arg storage kind %d unsupported",
+                    (int)av->storage.kind);
+            }
+            if (to_stack) {
+                emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+                *stack_off += 8;
+            }
+        } else if (pt->cls == ABI_CLASS_FP) {
+            int to_stack = (*next_fp >= 8);
+            u32 dst_reg = to_stack ? 0u : (*next_fp)++;  /* unused when the part is stacked */
+            switch (av->storage.kind) {
+            case OPK_REG: {
+                u32 type = (sz == 8) ? 1u : 0u;
+                if (!to_stack) emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
+                break;
+            }
+            default:
+                compiler_panic(t->c, a->loc,
+                    "aarch64 call: FP arg storage kind %d unsupported",
+                    (int)av->storage.kind);
+            }
+            if (to_stack) {  /* v0..v7 already hold earlier args: store from the source register */
+                emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31, (i32)*stack_off));
+                *stack_off += 8;
+            }
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 call: ABI class %d unimpl", (int)pt->cls);
+        }
+    }
+}
+
+static void aa_call(CGTarget* t, const CGCallDesc* d)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+
+    /* Direct call: sret pointer, argument setup, BL, then the return value. */
+    u32 next_int = 0, next_fp = 0, stack_off = 0;
+
+    /* sret: caller passes destination pointer in x8. */
+    if (d->abi && d->abi->has_sret) {
+        if (d->ret.storage.kind != OPK_LOCAL) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 call: sret destination must be LOCAL");
+        }
+        AASlot* s = slot_get(a, d->ret.storage.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot");
+        emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0));
+    }
+
+    for (u32 i = 0; i < d->nargs; ++i) {
+        emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off);
+    }
+
+    /* Track outgoing-arg high-water mark, 16-aligned. */
+    u32 needed = (stack_off + 15u) & ~15u;
+    if (needed > a->max_outgoing) a->max_outgoing = needed;
+
+    /* BL <callee> — direct only. */
+    if (d->callee.kind != OPK_GLOBAL) {
+        compiler_panic(t->c, a->loc,
+            "aarch64 call: indirect call not yet supported");
+    }
+    u32 bl_pos = mc->pos(mc);
+    emit32(mc, aa64_bl_base());
+    mc->emit_reloc_at(mc, mc->section_id, bl_pos,
+                      R_AARCH64_CALL26, d->callee.v.global.sym,
+                      d->callee.v.global.addend, 0, 0);
+
+    /* Receive return value. d->abi may be NULL (same guard as the sret check). */
+    const ABIArgInfo* ri = d->abi ? &d->abi->ret : NULL;
+    if (!ri || ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) {
+        /* Nothing to copy: no ABI info, no value, or sret wrote the dst slot. */
+        return;
+    }
+    /* DIRECT scalar in our coverage: a single INT or FP part placed in
+     * x0 / v0. Move into ret_storage. */
+    if (ri->nparts == 0) return;
+    const ABIArgPart* p0 = &ri->parts[0];
+    Operand rs = d->ret.storage;
+    if (p0->cls == ABI_CLASS_INT) {
+        u32 sf = (p0->size == 8) ? 1u : 0u;
+        if (rs.kind == OPK_REG) {
+            emit32(mc, aa64_mov_reg(sf, reg_num(rs), 0));
+        } else if (rs.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, rs.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
+            u32 sidx = size_idx_for_bytes(p0->size);
+            emit32(mc, aa64_stur(sidx, 0, 29, -(i32)s->off));
+        }
+    } else if (p0->cls == ABI_CLASS_FP) {
+        u32 type = (p0->size == 8) ? 1u : 0u;
+        if (rs.kind == OPK_REG) {
+            emit32(mc, aa64_fmov_reg(type, reg_num(rs), 0));
+        } else if (rs.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, rs.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
+            u32 sidx = size_idx_for_bytes(p0->size);
+            emit32(mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off));
+        }
+    }
+    /* Multi-part returns: not exercised yet. */
+    if (ri->nparts > 1) {
+        compiler_panic(t->c, a->loc,
+            "aarch64 call: multi-part return not yet supported");
+    }
+}
+
+/* Place the return value (x0/v0, register parts, or sret copy), then branch to the function epilogue. */
 static void aa_ret(CGTarget* t, const CGABIValue* val)
 {
+    AAImpl* a  = impl_of(t);
     MCEmitter* mc = t->mc;
 
-    if (val && val->storage.kind == OPK_REG) {
-        u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
-        emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
-    } else if (val && val->storage.kind == OPK_IMM) {
-        /* MOV W0, #imm via load_imm */
-        Operand w0 = { OPK_REG, RC_INT, 0, val->storage.type, .v.reg = 0 };
-        aa_load_imm(t, w0, val->storage.v.imm);
+    if (val) {
+        const ABIArgInfo* ri = val->abi;
+        if (ri && ri->kind == ABI_ARG_INDIRECT) {
+            /* sret: the destination pointer the caller passed in x8 is
+             * reloaded from sret_ptr_slot (when one exists), then the source
+             * slot is copied to [x8] in 8/4/2/1-byte chunks through x9. */
+            if (val->storage.kind == OPK_LOCAL) {
+                AASlot* s = slot_get(a, val->storage.v.frame_slot);
+                if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot");
+                if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
+                    AASlot* sp = slot_get(a, a->sret_ptr_slot);
+                    if (sp) emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
+                }
+                u32 nbytes = s->size;
+                u32 i = 0;  /* byte offset copied so far */
+                while (i + 8 <= nbytes) {  /* 8-byte chunks */
+                    emit32(mc, aa64_ldur(3, 9, 29, -(i32)s->off + (i32)i));
+                    emit32(mc, aa64_str_uimm(3, 9, 8, i));
+                    i += 8;
+                }
+                while (i + 4 <= nbytes) {  /* then 4-byte chunks */
+                    emit32(mc, aa64_ldur(2, 9, 29, -(i32)s->off + (i32)i));
+                    emit32(mc, aa64_str_uimm(2, 9, 8, i));
+                    i += 4;
+                }
+                while (i + 2 <= nbytes) {  /* then 2-byte chunks */
+                    emit32(mc, aa64_ldur(1, 9, 29, -(i32)s->off + (i32)i));
+                    emit32(mc, aa64_str_uimm(1, 9, 8, i));
+                    i += 2;
+                }
+                while (i < nbytes) {  /* remaining single bytes */
+                    emit32(mc, aa64_ldur(0, 9, 29, -(i32)s->off + (i32)i));
+                    emit32(mc, aa64_str_uimm(0, 9, 8, i));
+                    i += 1;
+                }
+            } else {
+                compiler_panic(t->c, a->loc,
+                    "aarch64 ret indirect: storage kind %d unsupported",
+                    (int)val->storage.kind);
+            }
+        } else if (val->storage.kind == OPK_REG) {
+            if (val->storage.cls == RC_FP) {
+                u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
+                emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
+            } else {
+                u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+                emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
+            }
+        } else if (val->storage.kind == OPK_IMM) {
+            u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+            emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm);
+        } else if (val->storage.kind == OPK_LOCAL) {
+            /* DIRECT return whose source is a local: load each part into
+             * x0/x1 (or v0/v1) per the ABI classification. Used for
+             * small structs returned in registers. */
+            AASlot* s = slot_get(a, val->storage.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot");
+            const ABIArgInfo* ri = val->abi;  /* shadows the outer ri; same value */
+            for (u16 i = 0; i < (ri ? ri->nparts : 0); ++i) {
+                const ABIArgPart* pt = &ri->parts[i];
+                u32 sidx = size_idx_for_bytes(pt->size);
+                i32 off = -(i32)s->off + (i32)pt->src_offset;
+                if (pt->cls == ABI_CLASS_INT) {
+                    emit32(mc, aa64_ldur(sidx, /*Rt=*/i, 29, off));
+                } else if (pt->cls == ABI_CLASS_FP) {
+                    emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, 29, off));
+                } else {
+                    compiler_panic(t->c, a->loc,
+                        "aarch64 ret: ret part cls %d unimpl", (int)pt->cls);
+                }
+            }
+        }
     }
-    emit32(mc, aa64_ret(AA64_LR));
+    /* Branch to the epilogue. mc->emit_label_ref records a fixup that
+     * resolves to a JUMP26-encoded displacement when the label is placed. */
+    u32 bpos = mc->pos(mc);  /* NOTE(review): never read; only the (void) cast below uses it */
+    emit32(mc, aa64_b_base());
+    mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
+    (void)bpos;
 }
 
 static void aa_alloca_ (CGTarget* t, Operand d, Operand s, u32 a) { (void)d;(void)s;(void)a; aa_panic(t, "alloca"); }
@@ -334,8 +1188,6 @@ static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); }
 
 CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
 {
-    /* v1: only AArch64 implemented. Other targets fall back to a
-     * "not implemented" diagnostic at construction. */
     if (c->target.arch != CFREE_ARCH_ARM_64) {
         SrcLoc loc = {0,0,0};
         compiler_panic(c, loc,
@@ -399,7 +1251,7 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
     t->va_end_    = aa_va_end_;
     t->va_copy_   = aa_va_copy_;
 
-    t->setjmp_    = NULL;       /* parser lowers via __cfree_setjmp */
+    t->setjmp_    = NULL;
     t->longjmp_   = NULL;
 
     t->atomic_load = aa_atomic_load;
@@ -415,6 +1267,9 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
     t->finalize    = aa_finalize;
     t->destroy     = aa_destroy;
 
+    /* Keep type_is_signed referenced (presumably to silence an unused-function warning). */
+    (void)type_is_signed;
+
     compiler_defer(c, cgt_cleanup, t);
     return t;
 }
@@ -424,7 +1279,5 @@ void cgtarget_finalize(CGTarget* t) { if (t && t->finalize) t->finalize(t); }
 void cgtarget_free(CGTarget* t)
 {
     if (!t) return;
-    /* Arena-backed; nothing to free. The compiler_defer cleanup callback
-     * arrives here at panic; intentional double-call from explicit free
-     * after success is safe because everything is arena memory. */
+    /* Arena-backed; nothing to free. */
 }
diff --git a/test/cg/CORPUS.md b/test/cg/CORPUS.md
@@ -44,12 +44,12 @@ parser will, and fail at runtime until those land — that is intentional.
 | `a02_return_zero`         | ★ | `load_imm 0; ret reg`                       |   0 |
 | `a03_ret_imm`             | ★ | `ret IMM 17` (backend materializes)         |  17 |
 | `a04_copy_reg`            | ★ | `load_imm 7; copy r1->r2; ret r2`           |   7 |
-| `a05_return_neg_small`    | · | `load_imm -7` via MOVN; ret                 | 249 (= -7 & 0xff) |
-| `a06_return_i64`          | · | i64 `load_imm 0x1_0000_002A`; ret as i64    |  42 (low 32 of x0) |
-| `a07_void_return`         | · | `ret(NULL)`                                 |   0 (via _start zeroing x0) |
-| `a08_multiple_returns`    | · | `ret_imm 1; ret_imm 2` (second is dead)     |   1 |
-| `a09_load_imm_movz_movk`  | · | `load_imm 0xABCD` (multi-step materialize)  | 205 (= 0xCD) |
-| `a10_return_u8`           | · | `load_imm 200` into u8 reg; ret             | 200 |
+| `a05_return_neg_small`    | ★ | `load_imm -7` via MOVN; ret                 | 249 (= -7 & 0xff) |
+| `a06_return_i64`          | ★ | i64 `load_imm 0x1_0000_002A`; ret as i64    |  42 (low 32 of x0) |
+| `a07_void_return`         | ★ | `ret(NULL)`                                 |   0 (via _start zeroing x0) |
+| `a08_multiple_returns`    | ★ | `ret_imm 1; ret_imm 2` (second is dead)     |   1 |
+| `a09_load_imm_movz_movk`  | ★ | `load_imm 0xABCD` (multi-step materialize)  | 205 (= 0xCD) |
+| `a10_return_u8`           | ★ | `load_imm 200` into u8 reg; ret             | 200 |
 
 ## Group B — frame slots, parameters, locals
 
@@ -62,14 +62,14 @@ materialization, slot allocation, and call lowering use the live
 
 | Case | Status | Body | Expected |
 |---|---|---|---|
-| `b01_param_int`        | · | `int echo(int x){return x;}; echo(201)`                                    | 201 |
-| `b02_param_sum`        | · | `int sum2(int a,int b){return a+b;}; sum2(40,2)`                           |  42 |
-| `b03_param_spill`      | · | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)`           |  45 |
-| `b04_local_int`        | · | local int slot; `*p = 42; return *p`                                       |  42 |
-| `b05_addr_taken_local` | · | `int x=17; int*p=&x; *p+=1; return *p`                                     |  18 |
-| `b06_sret`             | · | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b`         |  42 |
-| `b07_byval_param`      | · | `int take(struct Pt p){return p.a+p.b;}; take({15,27})`                    |  42 |
-| `b08_fp_param`         | · | `int trunc(float f){return (int)f;}; trunc(7.5f)`                          |   7 |
+| `b01_param_int`        | ★ | `int echo(int x){return x;}; echo(201)`                                    | 201 |
+| `b02_param_sum`        | ★ | `int sum2(int a,int b){return a+b;}; sum2(40,2)`                           |  42 |
+| `b03_param_spill`      | ★ | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)`           |  45 |
+| `b04_local_int`        | ★ | local int slot; `*p = 42; return *p`                                       |  42 |
+| `b05_addr_taken_local` | ★ | `int x=17; int*p=&x; *p+=1; return *p`                                     |  18 |
+| `b06_sret`             | ★ | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b`         |  42 |
+| `b07_byval_param`      | ★ | `int take(struct Pt p){return p.a+p.b;}; take({15,27})`                    |  42 |
+| `b08_fp_param`         | ★ | `int trunc(float f){return (int)f;}; trunc(7.5f)`                          |   7 |
 
 ## Group C — integer arithmetic
 
@@ -79,14 +79,14 @@ materialization, slot allocation, and call lowering use the live
 | `c02_sub_mul`        | ★ | `7 * 3 - 4`                          |  17 |
 | `c03_bitwise`        | ★ | `(~3) & 0xff`                        | 252 |
 | `c04_shift`          | ★ | `(1<<5) \| (16>>1)` (logical shr)    |  40 |
-| `c05_div_mod`        | · | `23 / 4 + 23 % 4` (signed)           |   8 |
-| `c06_xor`            | · | `0xa5 ^ 0x5a`                        | 255 |
-| `c07_iadd_i64`       | · | i64 `0x1_0000_0029 + 0x1_0000_0001`  |  42 (low 32) |
-| `c08_unsigned_div`   | · | `100u / 7u`                          |  14 |
-| `c09_neg`            | · | `UO_NEG` 42                          | 214 (= -42 & 0xff) |
-| `c10_logical_not`    | · | `UO_NOT 0` (zero-test → 0/1)         |   1 |
-| `c11_shr_signed`     | · | `-16 >>(s) 2`                        | 252 (= -4 & 0xff) |
-| `c12_imul_i64`       | · | i64 `7 * 6`                          |  42 |
+| `c05_div_mod`        | ★ | `23 / 4 + 23 % 4` (signed)           |   8 |
+| `c06_xor`            | ★ | `0xa5 ^ 0x5a`                        | 255 |
+| `c07_iadd_i64`       | ★ | i64 `0x1_0000_0029 + 0x1_0000_0001`  |  42 (low 32) |
+| `c08_unsigned_div`   | ★ | `100u / 7u`                          |  14 |
+| `c09_neg`            | ★ | `UO_NEG` 42                          | 214 (= -42 & 0xff) |
+| `c10_logical_not`    | ★ | `UO_NOT 0` (zero-test → 0/1)         |   1 |
+| `c11_shr_signed`     | ★ | `-16 >>(s) 2`                        | 252 (= -4 & 0xff) |
+| `c12_imul_i64`       | ★ | i64 `7 * 6`                          |  42 |
 
 ## Deferred groups

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aarch64.c	\|	1073	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	test/cg/CORPUS.md	\|	44	++++++++++++++++++++++----------------------