cg/aa64: implement Groups G, H, I (calls, control flow, alloca) - kit

commit 5ae1703a745e575657fee65c946f9e1f241d4a3a
parent b871959ddf3e4188f507041b2f9f7181d2662750
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 15:17:20 -0700

cg/aa64: implement Groups G, H, I (calls, control flow, alloca)

- Calls: FP binops (FADD/FSUB/FMUL/FDIV), addr_of for OPK_GLOBAL via
  ADRP+ADD, indirect calls (BLR Xn), multi-part returns (HFA into
  LOCAL storage), and memcpy-on-entry for ABI_ARG_INDIRECT params.
- Control flow: SCOPE_LOOP / SCOPE_BLOCK as bookkeeping over the
  caller-driven label_place/jump, with break_to/continue_to forwarding
  to the recorded labels.
- Alloca: SUB SP by an aligned const or runtime size, return
  SP + max_outgoing via a placeholder ADD patched at func_end. For
  has_alloca functions, restore SP from FP at the epilogue.

Diffstat:
M src/arch/aarch64.c  | 328 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------

1 file changed, 279 insertions(+), 49 deletions(-)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -153,6 +153,12 @@ static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn)  /* GPR→FP, double */
 static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn)  /* FP→GPR, double */
 { return 0x9E660000u | ((Rn&0x1f)<<5) | (Rd&0x1f); }
 
+/* SUB (extended register), 64-bit, UXTX, shift 0. Unlike SUB shifted-reg
+ * (where Rd=31 means ZR), this form treats Rd/Rn=31 as SP — needed to
+ * decrement SP by a register amount during alloca. */
+static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm)
+{ return 0xCB206000u | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
 /* SUBS shifted register (Rd=ZR encodes CMP). */
 static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm)
 { return 0x6B000000u | (sf<<31) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
@@ -168,6 +174,16 @@ static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond)
 static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond)
 { return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u); }
 
+/* FADD / FSUB / FMUL / FDIV (scalar). type: 0=S (float), 1=D (double). */
+static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x1E202800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x1E203800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x1E200800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm)
+{ return 0x1E201800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+
 /* SBFM / UBFM / BFM (bitfield move family).
  *   sf opc(2) 100110 N immr(6) imms(6) Rn(5) Rd(5)
  * opc: 00=SBFM, 01=BFM, 10=UBFM. N must equal sf. */
@@ -198,6 +214,8 @@ typedef struct AAScope {
     u8       pad[2];
     MCLabel  else_label;    /* SCOPE_IF: false branch target / end-of-then */
     MCLabel  end_label;     /* SCOPE_IF: join point past the whole if/else */
+    Label    break_label;   /* SCOPE_LOOP/BLOCK: explicit break target */
+    Label    continue_label;/* SCOPE_LOOP: explicit continue target */
 } AAScope;
 
 typedef struct AAImpl {
@@ -234,6 +252,15 @@ typedef struct AAImpl {
     AAScope*   scopes;
     u32        nscopes;
     u32        scopes_cap;
+
+    /* alloca: each call emits an `ADD result, SP, #0` placeholder; at
+     * func_end the imm12 is patched with the final max_outgoing. Tracks
+     * (instruction pos, dst reg) for each placeholder. has_alloca also
+     * triggers SP-from-FP restoration in the epilogue. */
+    u8         has_alloca;
+    struct AAAllocaPatch { u32 pos; u32 dst_reg; }* add_patches;
+    u32        nadd_patches;
+    u32        add_patches_cap;
 } AAImpl;
 
 static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
@@ -405,6 +432,8 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
     a->used_fp     = 0;
     a->nslots      = 0;
     a->nscopes     = 0;
+    a->has_alloca  = 0;
+    a->nadd_patches= 0;
     a->sret_ptr_slot = FRAME_SLOT_NONE;
     a->epilogue_label = mc->label_new(mc);
 
@@ -454,6 +483,19 @@ static void aa_func_end(CGTarget* t)
      * branches land here. */
     mc->label_place(mc, a->epilogue_label);
 
+    /* If the body called alloca, SP may sit below the locals area.
+     * Restore SP from FP before reloading callee-saves, since those use
+     * SP-relative offsets. */
+    if (a->has_alloca) {
+        if (fp_lr_off <= 0xfff) {
+            emit32(mc, aa64_sub_imm(1, /*Rd=SP*/31, /*Rn=*/29, fp_lr_off, 0));
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64: has_alloca + fp_lr_off %u out of imm12 range",
+                fp_lr_off);
+        }
+    }
+
     /* Restore FP saves, then INT saves, then fp/lr, then add sp + ret. */
     for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) {
         u32 r0 = 8u + (u32)i * 2u;
@@ -526,6 +568,19 @@ overflow:
         patch32(obj, sec, pos + i*4u, words[i]);
     }
 
+    /* Patch each alloca's `ADD dst, SP, #0` placeholder with the final
+     * max_outgoing offset, now that the high-water mark is known. */
+    if (a->max_outgoing > 0xfff) {
+        compiler_panic(t->c, a->loc,
+            "aarch64: max_outgoing %u out of imm12 range for alloca patch",
+            a->max_outgoing);
+    }
+    for (u32 i = 0; i < a->nadd_patches; ++i) {
+        u32 dr = a->add_patches[i].dst_reg;
+        u32 word = aa64_add_imm(1, dr, /*Rn=SP*/31, a->max_outgoing, 0);
+        patch32(obj, sec, a->add_patches[i].pos, word);
+    }
+
     /* Define the function symbol. */
     u32 end = mc->pos(mc);
     obj_symbol_define(obj, a->fd->sym, sec,
@@ -608,18 +663,40 @@ static void aa_param(CGTarget* t, const CGParamDesc* p)
 
     if (ai->kind == ABI_ARG_IGNORE) return;
     if (ai->kind == ABI_ARG_INDIRECT) {
-        /* Caller passes a pointer to the data. Pointer comes in next
-         * INT arg reg; store it into the home slot (which holds the
-         * pointer-sized address). */
+        /* Caller passes a pointer to a copy. Materialize that pointer
+         * into a scratch reg, then memcpy `s->size` bytes from there
+         * into the slot — so subsequent LOCAL_op(slot) reads/writes the
+         * struct contents directly, not the pointer. */
+        u32 ptr_reg;
         if (a->next_param_int < 8) {
-            u32 reg = a->next_param_int++;
-            emit32(t->mc, aa64_stur(3, reg, 29, -(i32)s->off));
+            ptr_reg = a->next_param_int++;
         } else {
-            /* Pointer on stack — load and store. */
             u32 caller_off = a->next_param_stack;
             a->next_param_stack += 8;
             emit32(t->mc, aa64_ldur(3, 9, 29, (i32)(16 + caller_off)));
-            emit32(t->mc, aa64_stur(3, 9, 29, -(i32)s->off));
+            ptr_reg = 9;
+        }
+        u32 nbytes = s->size;
+        u32 i = 0;
+        while (i + 8 <= nbytes) {
+            emit32(t->mc, aa64_ldur(3, 10, ptr_reg, (i32)i));
+            emit32(t->mc, aa64_stur(3, 10, 29, -(i32)s->off + (i32)i));
+            i += 8;
+        }
+        while (i + 4 <= nbytes) {
+            emit32(t->mc, aa64_ldur(2, 10, ptr_reg, (i32)i));
+            emit32(t->mc, aa64_stur(2, 10, 29, -(i32)s->off + (i32)i));
+            i += 4;
+        }
+        while (i + 2 <= nbytes) {
+            emit32(t->mc, aa64_ldur(1, 10, ptr_reg, (i32)i));
+            emit32(t->mc, aa64_stur(1, 10, 29, -(i32)s->off + (i32)i));
+            i += 2;
+        }
+        while (i < nbytes) {
+            emit32(t->mc, aa64_ldur(0, 10, ptr_reg, (i32)i));
+            emit32(t->mc, aa64_stur(0, 10, 29, -(i32)s->off + (i32)i));
+            i += 1;
         }
         return;
     }
@@ -751,12 +828,16 @@ static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d)
         a->scopes_cap = ncap;
     }
     AAScope* sc = &a->scopes[a->nscopes];
-    sc->kind       = (u8)d->kind;
-    sc->has_else   = 0;
-    sc->else_label = t->mc->label_new(t->mc);
-    sc->end_label  = t->mc->label_new(t->mc);
+    sc->kind            = (u8)d->kind;
+    sc->has_else        = 0;
+    sc->else_label      = 0;
+    sc->end_label       = 0;
+    sc->break_label     = d->break_label;
+    sc->continue_label  = d->continue_label;
 
     if (d->kind == SCOPE_IF) {
+        sc->else_label = t->mc->label_new(t->mc);
+        sc->end_label  = t->mc->label_new(t->mc);
         /* Test cond against zero, branch to else_label on EQ (false). */
         u32 sf = type_is_64(d->cond.type) ? 1u : 0u;
         u32 rn = force_reg_int(t, d->cond, sf, 9);
@@ -764,8 +845,11 @@ static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d)
         emit32(t->mc, aa64_b_cond(0x0u /*EQ*/));
         t->mc->emit_label_ref(t->mc, sc->else_label,
                               R_AARCH64_CONDBR19, 4, 0);
+    } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) {
+        /* Structured loop/block: bookkeep only. The caller drives
+         * label_place + jump itself; break_to/continue_to forward to the
+         * recorded labels. No instructions emitted here. */
     } else {
-        /* BLOCK / LOOP not yet exercised by the corpus. */
         compiler_panic(t->c, a->loc,
             "aarch64 scope_begin: kind %d not yet implemented", (int)d->kind);
     }
@@ -798,15 +882,37 @@ static void aa_scope_end(CGTarget* t, CGScope s)
                        (unsigned)s);
     }
     AAScope* sc = &a->scopes[s - 1];
-    if (sc->kind == SCOPE_IF && !sc->has_else) {
-        /* No else body — false-branch lands at scope_end. */
-        t->mc->label_place(t->mc, sc->else_label);
+    if (sc->kind == SCOPE_IF) {
+        if (!sc->has_else) {
+            /* No else body — false-branch lands at scope_end. */
+            t->mc->label_place(t->mc, sc->else_label);
+        }
+        t->mc->label_place(t->mc, sc->end_label);
+    }
+    /* SCOPE_LOOP / SCOPE_BLOCK: caller has already placed the break_label. */
+}
+
+static void aa_break_to(CGTarget* t, CGScope s)
+{
+    AAImpl* a = impl_of(t);
+    if (s == CG_SCOPE_NONE || s > a->nscopes) {
+        compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u",
+                       (unsigned)s);
     }
-    t->mc->label_place(t->mc, sc->end_label);
+    AAScope* sc = &a->scopes[s - 1];
+    aa_jump(t, sc->break_label);
 }
 
-static void aa_break_to   (CGTarget* t, CGScope s) { (void)s; aa_panic(t, "break_to"); }
-static void aa_continue_to(CGTarget* t, CGScope s) { (void)s; aa_panic(t, "continue_to"); }
+static void aa_continue_to(CGTarget* t, CGScope s)
+{
+    AAImpl* a = impl_of(t);
+    if (s == CG_SCOPE_NONE || s > a->nscopes) {
+        compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u",
+                       (unsigned)s);
+    }
+    AAScope* sc = &a->scopes[s - 1];
+    aa_jump(t, sc->continue_label);
+}
 
 /* ---- data movement ---- */
 
@@ -972,6 +1078,23 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv)
         }
         return;
     }
+    if (lv.kind == OPK_GLOBAL) {
+        /* ADRP Xd, sym ; ADD Xd, Xd, #:lo12:sym (with addend baked into both
+         * relocations). Used to materialize a function or data pointer. */
+        u32 rd = reg_num(dst);
+        u32 sec = t->mc->section_id;
+        u32 adrp_pos = t->mc->pos(t->mc);
+        emit32(t->mc, aa64_adrp_base(rd));
+        t->mc->emit_reloc_at(t->mc, sec, adrp_pos,
+                             R_AARCH64_ADR_PREL_PG_HI21,
+                             lv.v.global.sym, lv.v.global.addend, 0, 0);
+        u32 add_pos = t->mc->pos(t->mc);
+        emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
+        t->mc->emit_reloc_at(t->mc, sec, add_pos,
+                             R_AARCH64_ADD_ABS_LO12_NC,
+                             lv.v.global.sym, lv.v.global.addend, 0, 0);
+        return;
+    }
     aa_panic(t, "addr_of");
 }
 
@@ -1144,6 +1267,29 @@ static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch)
 static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, Operand b_op)
 {
     MCEmitter* mc = t->mc;
+
+    /* FP binops route through scalar FADD/FSUB/FMUL/FDIV. */
+    if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) {
+        if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) {
+            compiler_panic(t->c, impl_of(t)->loc,
+                "aarch64 binop: FP op requires REG operands");
+        }
+        u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+        u32 rd = reg_num(dst);
+        u32 rn = reg_num(a_op);
+        u32 rm = reg_num(b_op);
+        u32 w;
+        switch (op) {
+        case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break;
+        case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break;
+        case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break;
+        case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break;
+        default:      w = 0; break;     /* unreachable */
+        }
+        emit32(mc, w);
+        return;
+    }
+
     u32 sf = type_is_64(dst.type) ? 1u : 0u;
     u32 rd = reg_num(dst);
     u32 rn = force_reg_int(t, a_op, sf, 9);
@@ -1434,16 +1580,19 @@ static void aa_call(CGTarget* t, const CGCallDesc* d)
     u32 needed = (stack_off + 15u) & ~15u;
     if (needed > a->max_outgoing) a->max_outgoing = needed;
 
-    /* BL <callee> — direct only. */
-    if (d->callee.kind != OPK_GLOBAL) {
+    /* Direct (BL <sym>) vs. indirect (BLR Xn). */
+    if (d->callee.kind == OPK_GLOBAL) {
+        u32 bl_pos = mc->pos(mc);
+        emit32(mc, aa64_bl_base());
+        mc->emit_reloc_at(mc, mc->section_id, bl_pos,
+                          R_AARCH64_CALL26, d->callee.v.global.sym,
+                          d->callee.v.global.addend, 0, 0);
+    } else if (d->callee.kind == OPK_REG) {
+        emit32(mc, aa64_blr(reg_num(d->callee)));
+    } else {
         compiler_panic(t->c, a->loc,
-            "aarch64 call: indirect call not yet supported");
+            "aarch64 call: callee kind %d unsupported", (int)d->callee.kind);
     }
-    u32 bl_pos = mc->pos(mc);
-    emit32(mc, aa64_bl_base());
-    mc->emit_reloc_at(mc, mc->section_id, bl_pos,
-                      R_AARCH64_CALL26, d->callee.v.global.sym,
-                      d->callee.v.global.addend, 0, 0);
 
     /* Receive return value. */
     const ABIArgInfo* ri = &d->abi->ret;
@@ -1451,37 +1600,54 @@ static void aa_call(CGTarget* t, const CGCallDesc* d)
         /* Nothing to copy — sret was placed directly into the dst slot. */
         return;
     }
-    /* DIRECT scalar in our coverage: a single INT or FP part placed in
-     * x0 / v0. Move into ret_storage. */
     if (ri->nparts == 0) return;
-    const ABIArgPart* p0 = &ri->parts[0];
+
     Operand rs = d->ret.storage;
-    if (p0->cls == ABI_CLASS_INT) {
-        u32 sf = (p0->size == 8) ? 1u : 0u;
-        if (rs.kind == OPK_REG) {
-            emit32(mc, aa64_mov_reg(sf, reg_num(rs), 0));
-        } else if (rs.kind == OPK_LOCAL) {
-            AASlot* s = slot_get(a, rs.v.frame_slot);
-            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
-            u32 sidx = size_idx_for_bytes(p0->size);
-            emit32(mc, aa64_stur(sidx, 0, 29, -(i32)s->off));
+    /* Walk parts; INT parts come from x0, x1, ...; FP parts from v0, v1, .... */
+    u32 next_int_ret = 0, next_fp_ret = 0;
+    for (u16 i = 0; i < ri->nparts; ++i) {
+        const ABIArgPart* p = &ri->parts[i];
+        u32 src_reg;
+        if (p->cls == ABI_CLASS_INT) {
+            src_reg = next_int_ret++;
+        } else if (p->cls == ABI_CLASS_FP) {
+            src_reg = next_fp_ret++;
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 call: ret part cls %d unimpl", (int)p->cls);
         }
-    } else if (p0->cls == ABI_CLASS_FP) {
-        u32 type = (p0->size == 8) ? 1u : 0u;
+
         if (rs.kind == OPK_REG) {
-            emit32(mc, aa64_fmov_reg(type, reg_num(rs), 0));
+            if (ri->nparts != 1) {
+                compiler_panic(t->c, a->loc,
+                    "aarch64 call: REG ret_storage with %u parts", (unsigned)ri->nparts);
+            }
+            if (p->cls == ABI_CLASS_INT) {
+                u32 sf = (p->size == 8) ? 1u : 0u;
+                emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg));
+            } else {
+                u32 type = (p->size == 8) ? 1u : 0u;
+                emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg));
+            }
         } else if (rs.kind == OPK_LOCAL) {
             AASlot* s = slot_get(a, rs.v.frame_slot);
             if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
-            u32 sidx = size_idx_for_bytes(p0->size);
-            emit32(mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off));
+            u32 sidx = size_idx_for_bytes(p->size);
+            i32 off = -(i32)s->off + (i32)p->src_offset;
+            if (p->cls == ABI_CLASS_INT) {
+                emit32(mc, aa64_stur(sidx, src_reg, 29, off));
+            } else {
+                emit32(mc, aa64_stur_fp(sidx, src_reg, 29, off));
+            }
+        } else if (rs.kind == OPK_IMM && rs.type
+                   && rs.type->kind == TY_VOID) {
+            /* Void return placeholder — nothing to do. */
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 call: ret_storage kind %d unsupported",
+                (int)rs.kind);
         }
     }
-    /* Multi-part returns: not exercised yet. */
-    if (ri->nparts > 1) {
-        compiler_panic(t->c, a->loc,
-            "aarch64 call: multi-part return not yet supported");
-    }
 }
 
 /* Materialize the return value, then branch to the function epilogue. */
@@ -1571,7 +1737,71 @@ static void aa_ret(CGTarget* t, const CGABIValue* val)
     (void)bpos;
 }
 
-static void aa_alloca_ (CGTarget* t, Operand d, Operand s, u32 a) { (void)d;(void)s;(void)a; aa_panic(t, "alloca"); }
+/* Dynamic stack allocation. Layout: outgoing-args (max_outgoing bytes,
+ * 16-aligned) sit at the bottom of SP; the alloca block goes immediately
+ * above. After lowering SP by an aligned size, the new block's address is
+ * (SP + max_outgoing). max_outgoing is only known at func_end, so each
+ * alloca emits a placeholder `ADD dst, SP, #0` and registers a patch site;
+ * func_end rewrites the imm12 with the final max_outgoing. */
+static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align)
+{
+    AAImpl*    a  = impl_of(t);
+    MCEmitter* mc = t->mc;
+
+    if (d.kind != OPK_REG) {
+        compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG");
+    }
+    /* SP is 16-aligned and we lower it by an aligned amount, so result
+     * inherits 16-byte alignment. Larger requests would need an
+     * additional mask on the result; reject so we notice. */
+    if (align > 16) {
+        compiler_panic(t->c, a->loc,
+            "aarch64 alloca: align %u > 16 not yet supported", align);
+    }
+
+    if (sz.kind == OPK_IMM) {
+        i64 v = sz.v.imm;
+        if (v < 0) {
+            compiler_panic(t->c, a->loc, "aarch64 alloca: negative size");
+        }
+        u64 aligned = ((u64)v + 15u) & ~(u64)15u;
+        if (aligned == 0) aligned = 16;     /* keep SP changing */
+        if (aligned > 0xfffu) {
+            compiler_panic(t->c, a->loc,
+                "aarch64 alloca: const size %llu too large for v1",
+                (unsigned long long)aligned);
+        }
+        emit32(mc, aa64_sub_imm(1, /*Rd=SP*/31, /*Rn=SP*/31, (u32)aligned, 0));
+    } else if (sz.kind == OPK_REG) {
+        /* Round size up to a 16-byte multiple, then `sub sp, sp, x9`
+         * (extended-register form so Rd/Rn=SP work). */
+        u32 sz_reg = reg_num(sz);
+        emit32(mc, aa64_add_imm(1, 9, sz_reg, 15u, 0));    /* x9 = size+15 */
+        emit32(mc, aa64_ubfm(1, 9, 9, 4, 63));              /* lsr x9, x9, #4 */
+        emit32(mc, aa64_ubfm(1, 9, 9, 60, 59));             /* lsl x9, x9, #4 */
+        emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/31, /*SP*/31, 9));
+    } else {
+        compiler_panic(t->c, a->loc,
+            "aarch64 alloca: size kind %d unsupported", (int)sz.kind);
+    }
+
+    /* Placeholder ADD dst, SP, #<max_outgoing>. Patched at func_end. */
+    if (a->nadd_patches == a->add_patches_cap) {
+        u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4;
+        struct AAAllocaPatch* nb = arena_array(t->c->tu,
+                                                struct AAAllocaPatch, ncap);
+        if (a->add_patches) memcpy(nb, a->add_patches,
+                                    sizeof(*nb) * a->nadd_patches);
+        a->add_patches = nb;
+        a->add_patches_cap = ncap;
+    }
+    u32 dst_reg = reg_num(d);
+    a->add_patches[a->nadd_patches].pos     = mc->pos(mc);
+    a->add_patches[a->nadd_patches].dst_reg = dst_reg;
+    a->nadd_patches++;
+    emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/31, 0, 0));
+    a->has_alloca = 1;
+}
 static void aa_va_start_(CGTarget* t, Operand a)                  { (void)a; aa_panic(t, "va_start"); }
 static void aa_va_arg_  (CGTarget* t, Operand d, Operand a, const Type* ty) { (void)d;(void)a;(void)ty; aa_panic(t, "va_arg"); }
 static void aa_va_end_  (CGTarget* t, Operand a)                  { (void)a; aa_panic(t, "va_end"); }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README