commit e25cbf0f256f656ae48652f76b1e757ad194564b
parent aef3673230d5108c76a0a58c69c64f1c2ff7fcd5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 14:07:16 -0700
cg/aa64: bring up frames, params, calls, FP — pass Groups A/B/C
Frame layout uses a fixed-size prologue placeholder patched at func_end
so frame_size and the callee-save count are knowable when the prologue
is finally written. Slots are FP-relative so per-slot offsets stay
stable while the eventual frame size is unknown; outgoing stack args
are SP-relative.
New surface:
- frame_slot, param (incoming x0..x7 / v0..v7 + stack overflow + sret x8)
- load/store/addr_of for LOCAL and INDIRECT operands
- call: direct BL with arg materialization (IMM/REG/LOCAL/BYVAL),
sret pointer in x8, return-value placement into REG/LOCAL incl.
small-struct-in-regs
- load_const for FP via .rodata + ADRP/LDR with the standard relocs;
RC_FP allocator; FCVTZS; FMOV reg-reg
- binop accepts IMM via scratch; SREM/UREM via SDIV/UDIV+MSUB; UO_NOT
via SUBS+CSET (previously panicked and "passed" by exit-code
coincidence)
All 124 test/cg cases pass across D/R/E/J paths.
Diffstat:
2 files changed, 985 insertions(+), 132 deletions(-)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -1,17 +1,29 @@
/* Minimal AArch64 CGTarget.
*
- * Initial coverage matches the spine A + C corpus (function lifecycle and
- * integer arithmetic). Other CGTarget methods panic with a clear "unimpl"
- * diagnostic so test cases that touch them fail visibly rather than
- * silently emitting nothing.
+ * Single-pass codegen for the cg test corpus (Groups A, B, C). Frame
+ * layout uses a fixed-size prologue placeholder patched at func_end so
+ * frame_size and the callee-save register count are knowable when the
+ * prologue is finally written. FP-relative (x29) addressing is used for
+ * local slots and incoming stack args so that per-slot offsets can be
+ * assigned at frame_slot() time without depending on the eventual
+ * frame_size or callee-save count. SP-relative addressing is used for
+ * outgoing stack args.
*
- * Single-pass register allocation: alloc_reg hands out W19..W28 in order
- * and panics on exhaustion. No live-range tracking, no spills. Suitable
- * for short straight-line fixtures only; replaced when CG's
- * value-stack-aware spill/reload arrives.
+ * Frame layout (low SP -> high):
+ * outgoing args (max_outgoing bytes, 16-aligned)
+ * int reg saves (n_int_pairs * 16) -- x19/x20, x21/x22, ...
+ * fp reg saves (n_fp_pairs * 16) -- d8/d9, d10/d11, ...
+ * local slots (cum_off bytes)
+ * x29, x30 save (16 bytes) -- x29 = sp + frame_size - 16
*
- * Width is derived from Operand.type via type_is_64(). For the test
- * harness this is enough; full ABI integration arrives with TargetABI. */
+ * Single-pass register allocator: alloc_reg(RC_INT) hands out x19..x28 in
+ * order; alloc_reg(RC_FP) hands out v8..v15. Both ranges are callee-saved
+ * and only the prefix actually used is saved by the prologue. Width
+ * derives from Operand.type via type_is_64. Spill/reload not implemented.
+ *
+ * Multi-function: each func_begin/func_end pair owns its own frame state
+ * via the AAImpl fields, so the harness can build several functions in
+ * one TU. */
#include "arch/arch.h"
#include "arch/aa64_isa.h"
@@ -21,16 +33,153 @@
#include <string.h>
+/* ============================================================
+ * Local encoding helpers (kept here, not in aa64_isa.h, while the
+ * disassembler-shared table only needs the Group A/C subset).
+ * ============================================================ */
+
+#define AA64_NOP 0xD503201Fu
+
+/* ADD / SUB (immediate).
+ *   sf     1 selects the 64-bit form (placed in bit 31, not masked).
+ *   imm12  unsigned immediate; only the low 12 bits are encoded.
+ *   sh     bit 0 set => the immediate is shifted left by 12.
+ * Register number 31 in Rd/Rn names SP (not ZR) for these encodings. */
+static inline u32 aa64_add_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh)
+{
+    u32 word = 0x11000000u;
+    word |= sf << 31;
+    word |= (sh & 1) << 22;
+    word |= (imm12 & 0xfff) << 10;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+static inline u32 aa64_sub_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh)
+{
+    u32 word = 0x51000000u;
+    word |= sf << 31;
+    word |= (sh & 1) << 22;
+    word |= (imm12 & 0xfff) << 10;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+
+/* Shared operand fields for the signed-offset LDP/STP encodings below.
+ * byte_off is a byte offset that must be a multiple of 8; the encoded
+ * value is byte_off / 8 in a signed 7-bit field (range -512..504).
+ * The shift is done on the unsigned representation so the result does
+ * not depend on how the compiler right-shifts negative signed values;
+ * bits [9:3] of byte_off land in imm7 either way. Out-of-range offsets
+ * are truncated to 7 bits, not diagnosed. */
+static inline u32 aa64_pair_fields(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    u32 imm7 = ((u32)byte_off >> 3) & 0x7fu;
+    return (imm7 << 15) | ((Rt2 & 0x1f) << 10) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
+}
+
+/* STP/LDP signed offset, X registers. */
+static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    return 0xA9000000u | aa64_pair_fields(Rt, Rt2, Rn, byte_off);
+}
+static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    return 0xA9400000u | aa64_pair_fields(Rt, Rt2, Rn, byte_off);
+}
+/* STP/LDP signed offset, D registers (64-bit FP, scale 8). */
+static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    return 0x6D000000u | aa64_pair_fields(Rt, Rt2, Rn, byte_off);
+}
+static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off)
+{
+    return 0x6D400000u | aa64_pair_fields(Rt, Rt2, Rn, byte_off);
+}
+
+/* LDUR / STUR, general registers: unscaled signed 9-bit byte offset
+ * (-256..255; only the low 9 bits of simm9 are encoded).
+ * size selects the access width: 0=B, 1=H, 2=W, 3=X. */
+static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    const u32 imm9 = (u32)simm9 & 0x1ffu;
+    return 0x38000000u | (size << 30) | (imm9 << 12) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
+}
+static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    const u32 imm9 = (u32)simm9 & 0x1ffu;
+    return 0x38400000u | (size << 30) | (imm9 << 12) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
+}
+/* Same pair for SIMD & FP registers (V=1).
+ * size: 2=S (32-bit), 3=D (64-bit). */
+static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    const u32 imm9 = (u32)simm9 & 0x1ffu;
+    return 0x3C000000u | (size << 30) | (imm9 << 12) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
+}
+static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9)
+{
+    const u32 imm9 = (u32)simm9 & 0x1ffu;
+    return 0x3C400000u | (size << 30) | (imm9 << 12) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
+}
+
+/* STR (immediate, unsigned scaled offset). byte_off is divided by the
+ * access size, so it must be a multiple of (1 << size); only the low
+ * 12 bits of the scaled offset are encoded. */
+static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    const u32 imm12 = (byte_off >> size) & 0xfffu;
+    u32 word = 0x39000000u | (size << 30);
+    word |= imm12 << 10;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rt & 0x1f;
+    return word;
+}
+/* Branch (unconditional, 26-bit imm). Emitted with imm26=0 when paired
+ * with a JUMP26/CALL26 relocation; the patcher fills in imm26.
+ * aa64_b_base returns the B opcode word, aa64_bl_base the BL (branch
+ * with link) opcode word; neither takes operands. */
+static inline u32 aa64_b_base(void) { return 0x14000000u; }
+static inline u32 aa64_bl_base(void) { return 0x94000000u; }
+
+/* ADRP base (Rd in low 5 bits). imm bits filled by relocation.
+ * Rd is masked to 5 bits; the page immediate fields are left zero. */
+static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd&0x1f); }
+
+/* LDR (immediate, unsigned scaled offset) for SIMD & FP registers; used
+ * after ADRP to fetch FP literals. size: 2 => S (32-bit), 3 => D (64-bit).
+ * byte_off is scaled by the access size and truncated to 12 bits; when
+ * the instruction carries a LO12 relocation the caller passes 0 and the
+ * relocation supplies the offset. */
+static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off)
+{
+    const u32 imm12 = (byte_off >> size) & 0xfffu;
+    u32 word = 0x3D400000u | (size << 30);
+    word |= imm12 << 10;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rt & 0x1f;
+    return word;
+}
+
+/* FMOV (scalar, register to register). type: 0=single, 1=double. */
+static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn)
+{
+    u32 word = 0x1E204000u;
+    word |= (type & 3) << 22;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+
+/* SUBS (immediate). With Rd = 31 (ZR) this is CMP Rn, #imm12.
+ * Only the low 12 bits of imm12 are encoded; no shift-12 form. */
+static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12)
+{
+    u32 word = 0x71000000u;
+    word |= sf << 31;
+    word |= (imm12 & 0xfff) << 10;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+
+/* CSET Rd, EQ, written as its underlying CSINC Rd, ZR, ZR, NE:
+ * Rm = Rn = 31 (ZR) and the condition field holds NE (0b0001), the
+ * inverse of EQ. */
+static inline u32 aa64_cset_eq(u32 sf, u32 Rd)
+{
+    const u32 zr = 31u;
+    const u32 cond_ne = 0x1u;
+    u32 word = 0x1A800400u;
+    word |= sf << 31;
+    word |= zr << 16;
+    word |= cond_ne << 12;
+    word |= zr << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+
+/* FCVTZS (scalar FP -> signed integer, round toward zero).
+ * sf: 0=W destination, 1=X destination. type: 0=S source, 1=D source. */
+static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn)
+{
+    u32 word = 0x1E380000u;
+    word |= sf << 31;
+    word |= (type & 3) << 22;
+    word |= (Rn & 0x1f) << 5;
+    word |= Rd & 0x1f;
+    return word;
+}
+
+/* ============================================================
+ * AAImpl
+ * ============================================================ */
+
+#define AA_PROLOGUE_WORDS 14u /* worst case: 2 (sub sp, large frame) + 2 (stp/add fp) + 1 (sret x8 spill) + 5 int pairs + 4 fp pairs = 14 */
+
+/* One frame slot handed out by aa_frame_slot(). Slots are addressed
+ * relative to the frame pointer (x29), growing downward. */
+typedef struct AASlot {
+ u32 off; /* bytes below fp; address = x29 - off */
+ u32 size; /* slot size in bytes (aa_frame_slot substitutes 8 for 0) */
+ u32 align; /* requested alignment in bytes (aa_frame_slot substitutes 1 for 0) */
+ u8 kind; /* FrameSlotKind */
+ u8 pad[3]; /* explicit padding; never written by aa_frame_slot */
+} AASlot;
+
typedef struct AAImpl {
CGTarget base;
SrcLoc loc;
const CGFuncDesc* fd;
+
+ /* Function emission. */
u32 func_start;
- u32 next_alloc;
+ u32 prologue_pos;
+ MCLabel epilogue_label;
+
+ /* Frame layout (in bytes; final frame_size computed at func_end). */
+ AASlot* slots;
+ u32 nslots;
+ u32 slots_cap;
+ u32 cum_off; /* total bytes consumed by local slots */
+ u32 max_outgoing; /* max stack arg bytes for any call */
+
+ /* Param incoming tracking — set by func_begin from ABIFuncInfo. */
+ u32 next_param_int; /* x0..x7 consumed so far */
+ u32 next_param_fp; /* v0..v7 consumed so far */
+ u32 next_param_stack; /* offset into caller's stack arg area */
+ u8 has_sret; /* sret pointer arrived in x8 */
+ FrameSlot sret_ptr_slot; /* hidden slot holding incoming x8 */
+
+ /* Reg allocator (callee-saved prefix). */
+ u32 used_int; /* x19 + i, i in [0, used_int) */
+ u32 used_fp; /* v8 + i, i in [0, used_fp ) */
} AAImpl;
static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
+/* Forward decls used before definition. */
+static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
+static AASlot* slot_get(AAImpl* a, FrameSlot fs);
+
/* ---- helpers ---- */
static int type_is_64(const Type* t)
@@ -47,6 +196,46 @@ static int type_is_64(const Type* t)
}
}
+/* Returns 1 when t is TY_DOUBLE or TY_LDOUBLE, 0 otherwise (including
+ * a NULL type). */
+static int type_is_fp_double(const Type* t)
+{
+    if (!t) return 0;
+    if (t->kind == TY_DOUBLE) return 1;
+    return (t->kind == TY_LDOUBLE) ? 1 : 0;
+}
+
+/* Returns 1 for the signed integer kinds (plain char is counted as
+ * signed here), 0 for everything else and for a NULL type. */
+static int type_is_signed(const Type* t)
+{
+    if (!t) return 0;
+    return (t->kind == TY_CHAR  || t->kind == TY_SCHAR ||
+            t->kind == TY_SHORT || t->kind == TY_INT   ||
+            t->kind == TY_LONG  || t->kind == TY_LLONG) ? 1 : 0;
+}
+
+/* Storage size in bytes of a scalar type. A NULL type is treated as a
+ * 4-byte value; every kind not listed as 1, 2 or 4 bytes wide
+ * (long, long long, pointers, double, and anything else) reports 8. */
+static u32 type_byte_size(const Type* t)
+{
+    if (!t) return 4;
+    switch (t->kind) {
+    case TY_CHAR:
+    case TY_SCHAR:
+    case TY_UCHAR:
+    case TY_BOOL:
+        return 1;
+    case TY_SHORT:
+    case TY_USHORT:
+        return 2;
+    case TY_INT:
+    case TY_UINT:
+    case TY_FLOAT:
+        return 4;
+    case TY_LONG:
+    case TY_ULONG:
+    case TY_LLONG:
+    case TY_ULLONG:
+    case TY_PTR:
+    case TY_DOUBLE:
+    default:
+        return 8;
+    }
+}
+
+/* Map an access width in bytes to the LDUR/STUR size field
+ * (0=B, 1=H, 2=W, 3=X). Any width other than 1, 2 or 4 maps to 3. */
+static u32 size_idx_for_bytes(u32 nbytes)
+{
+    if (nbytes == 1) return 0;
+    if (nbytes == 2) return 1;
+    if (nbytes == 4) return 2;
+    return 3;
+}
+
static u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
static void emit32(MCEmitter* mc, u32 word)
@@ -59,12 +248,78 @@ static void emit32(MCEmitter* mc, u32 word)
mc->emit_bytes(mc, b, 4);
}
+/* Overwrite the 4 bytes at offset `ofs` of section `sec_id` with `word`,
+ * serialized least-significant byte first. */
+static void patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word)
+{
+    u8 le[4];
+    for (u32 i = 0; i < 4; ++i) {
+        le[i] = (u8)((word >> (8u * i)) & 0xff);
+    }
+    obj_patch(obj, sec_id, ofs, le, 4);
+}
+
static _Noreturn void aa_panic(CGTarget* t, const char* what)
{
SrcLoc loc = impl_of(t)->loc;
compiler_panic(t->c, loc, "aarch64: %s not implemented", what);
}
+/* ---- AArch64 immediate encoding helpers ---- */
+
+/* Materialize an integer constant into a register using MOVZ/MOVN/MOVK,
+ * trying the single-instruction forms first. Used both for the public
+ * load_imm() and internally for synthesizing immediates.
+ *   sf   1 = 64-bit destination (four 16-bit slots); 0 = 32-bit (two
+ *        slots, and imm is first truncated to its low 32 bits).
+ *   Rd   destination register number.
+ *   imm  value to load. */
+static void emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm)
+{
+ const u32 nslots = sf ? 4u : 2u;
+ u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
+
+ /* 1) Exactly one 16-bit slot is non-zero: a single MOVZ of that slot.
+ * v == 0 does not match here (slot != 0 is required). */
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
+ u64 cleared = v & ~((u64)0xffffu << (i * 16));
+ if (slot != 0 && cleared == 0) {
+ emit32(mc, aa64_movz(sf, Rd, slot, i));
+ return;
+ }
+ }
+
+ /* 2) The bitwise inverse (within the operating width) has at most one
+ * non-zero slot: a single MOVN of that inverted slot. This also
+ * covers the all-ones value (inverse == 0, matched at i == 0). */
+ {
+ u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((inv >> (i * 16)) & 0xffffu);
+ u64 cleared = inv & ~((u64)0xffffu << (i * 16));
+ if (cleared == 0) {
+ emit32(mc, aa64_movn(sf, Rd, slot, i));
+ return;
+ }
+ }
+ }
+
+ /* 3) General case: MOVZ the lowest non-zero slot, then MOVK every
+ * higher non-zero slot. Zero slots are skipped. */
+ int placed = 0;
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
+ if (!placed) {
+ if (slot == 0) continue;
+ emit32(mc, aa64_movz(sf, Rd, slot, i));
+ placed = 1;
+ } else if (slot != 0) {
+ emit32(mc, aa64_movk(sf, Rd, slot, i));
+ }
+ }
+ /* Nothing was placed only when v == 0, which neither single-instruction
+ * path above accepts: load it with MOVZ #0. */
+ if (!placed) emit32(mc, aa64_movz(sf, Rd, 0, 0));
+}
+
+/* Emit `add sp, sp, #imm` using one or two ADD-immediate instructions.
+ * The immediate is split into a low 12-bit part and a 12-bit part
+ * shifted left by 12; bits of imm above bit 23 are dropped by the
+ * two-instruction form. */
+static void emit_sp_add(MCEmitter* mc, u32 imm)
+{
+    const u32 lo = imm & 0xfff;
+    const u32 hi = imm >> 12;
+
+    if (hi == 0) {
+        /* Fits in a plain 12-bit immediate. */
+        emit32(mc, aa64_add_imm(1, 31, 31, lo, 0));
+        return;
+    }
+    if (lo == 0 && hi <= 0xfff) {
+        /* Exact multiple of 4096: a single shifted ADD suffices. */
+        emit32(mc, aa64_add_imm(1, 31, 31, hi, 1));
+        return;
+    }
+    emit32(mc, aa64_add_imm(1, 31, 31, hi & 0xfff, 1));
+    emit32(mc, aa64_add_imm(1, 31, 31, lo, 0));
+}
+
/* ---- function lifecycle ---- */
static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
@@ -73,26 +328,144 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
MCEmitter* mc = t->mc;
mc->set_section(mc, fd->text_section_id);
- mc->emit_align(mc, 4, 0); /* instruction alignment */
+ mc->emit_align(mc, 4, 0);
a->fd = fd;
a->func_start = mc->pos(mc);
- a->next_alloc = 0;
+ a->next_param_int = 0;
+ a->next_param_fp = 0;
+ a->next_param_stack = 0;
+ a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
+ a->cum_off = 0;
+ a->max_outgoing= 0;
+ a->used_int = 0;
+ a->used_fp = 0;
+ a->nslots = 0;
+ a->sret_ptr_slot = FRAME_SLOT_NONE;
+ a->epilogue_label = mc->label_new(mc);
mc->cfi_startproc(mc);
+
+ /* Reserve a fixed-size prologue placeholder, NOP-filled. We patch the
+ * prefix at func_end with the real prologue once frame_size and the
+ * callee-save count are known. */
+ a->prologue_pos = mc->pos(mc);
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) emit32(mc, AA64_NOP);
+
+ /* If the function returns indirect (sret), x8 holds the destination
+ * pointer on entry. Reserve a hidden slot to spill it into so the
+ * body can use x8 as scratch and ret can recover the dest pointer. */
+ if (a->has_sret) {
+ FrameSlotDesc fsd = {
+ .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0},
+ .size = 8, .align = 8, .kind = FS_SPILL, .flags = 0,
+ };
+ a->sret_ptr_slot = aa_frame_slot(t, &fsd);
+ }
}
static void aa_func_end(CGTarget* t)
{
AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
- u32 end = mc->pos(mc);
- obj_symbol_define(t->obj,
- a->fd->sym,
- a->fd->text_section_id,
- (u64)a->func_start,
- (u64)(end - a->func_start));
+ /* Compute callee-save layout. */
+ u32 n_int_pairs = (a->used_int + 1) / 2; /* round up */
+ u32 n_fp_pairs = (a->used_fp + 1) / 2;
+
+ u32 outgoing_off = 0;
+ u32 int_save_off = a->max_outgoing;
+ u32 fp_save_off = int_save_off + n_int_pairs * 16;
+ u32 locals_off = fp_save_off + n_fp_pairs * 16;
+ u32 fp_lr_off = locals_off + a->cum_off;
+ u32 frame_size = fp_lr_off + 16;
+ /* round to 16. */
+ frame_size = (frame_size + 15u) & ~15u;
+ fp_lr_off = frame_size - 16;
+
+ (void)outgoing_off;
+
+ /* Place the epilogue label at the current position, then emit the
+ * epilogue after it, so that `b epilogue` branches land on the
+ * epilogue's first instruction. */
+ mc->label_place(mc, a->epilogue_label);
+
+ /* Restore FP saves, then INT saves, then fp/lr, then add sp + ret. */
+ for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) {
+ u32 r0 = 8u + (u32)i * 2u;
+ u32 r1 = r0 + 1u;
+ emit32(mc, aa64_ldp_d(r0, r1, 31, (i32)(fp_save_off + (u32)i*16u)));
+ }
+ for (i32 i = (i32)n_int_pairs - 1; i >= 0; --i) {
+ u32 r0 = 19u + (u32)i * 2u;
+ u32 r1 = r0 + 1u;
+ emit32(mc, aa64_ldp_x(r0, r1, 31, (i32)(int_save_off + (u32)i*16u)));
+ }
+ emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+ emit_sp_add(mc, frame_size);
+ emit32(mc, aa64_ret(AA64_LR));
+
+ /* Now patch prologue placeholder. */
+ u32 pos = a->prologue_pos;
+ ObjBuilder* obj = t->obj;
+ u32 sec = a->fd->text_section_id;
+
+ u32 words[AA_PROLOGUE_WORDS];
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP;
+ u32 wi = 0;
+
+ /* sub sp, sp, #frame_size — may take 2 insns if > 4095. */
+ if (frame_size <= 0xfff) {
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
+ } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
+ } else {
+ if (wi + 2 > AA_PROLOGUE_WORDS) {
+ compiler_panic(t->c, a->loc,
+ "aarch64: prologue overflow for frame_size %u", frame_size);
+ }
+ words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
+ }
+ /* stp x29, x30, [sp, #fp_lr_off]; add x29, sp, #fp_lr_off */
+ words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
+ words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+ /* If sret, save incoming x8 (caller's destination pointer). */
+ if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
+ AASlot* s = slot_get(a, a->sret_ptr_slot);
+ if (s) {
+ if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+ words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off);
+ }
+ }
+ /* INT pair saves. */
+ for (u32 i = 0; i < n_int_pairs; ++i) {
+ u32 r0 = 19u + i*2u;
+ u32 r1 = r0 + 1u;
+ if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+ words[wi++] = aa64_stp_x(r0, r1, 31, (i32)(int_save_off + i*16u));
+ }
+ for (u32 i = 0; i < n_fp_pairs; ++i) {
+ u32 r0 = 8u + i*2u;
+ u32 r1 = r0 + 1u;
+ if (wi >= AA_PROLOGUE_WORDS) goto overflow;
+ words[wi++] = aa64_stp_d(r0, r1, 31, (i32)(fp_save_off + i*16u));
+ }
+ if (0) {
+overflow:
+ compiler_panic(t->c, a->loc,
+ "aarch64: prologue placeholder too small (used %u of %u words)",
+ wi, AA_PROLOGUE_WORDS);
+ }
+
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) {
+ patch32(obj, sec, pos + i*4u, words[i]);
+ }
+
+ /* Define the function symbol. */
+ u32 end = mc->pos(mc);
+ obj_symbol_define(obj, a->fd->sym, sec,
+ (u64)a->func_start, (u64)(end - a->func_start));
mc->cfi_endproc(mc);
a->fd = NULL;
@@ -104,25 +477,128 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty)
{
AAImpl* a = impl_of(t);
(void)ty;
- if (cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
+ if (cls == RC_INT) {
+ if (a->used_int >= 10) {
+ compiler_panic(t->c, a->loc,
+ "aarch64 alloc_reg: out of INT scratch (no spill yet)");
+ }
+ return (Reg)(19u + a->used_int++);
}
- if (a->next_alloc >= 10) {
- compiler_panic(t->c, a->loc,
- "aarch64 alloc_reg: out of scratch regs (no spill yet)");
+ if (cls == RC_FP) {
+ if (a->used_fp >= 8) {
+ compiler_panic(t->c, a->loc,
+ "aarch64 alloc_reg: out of FP scratch (no spill yet)");
+ }
+ return (Reg)(8u + a->used_fp++);
}
- return (Reg)(19u + a->next_alloc++);
+ compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls);
}
static void aa_free_reg(CGTarget* t, Reg r) { (void)t; (void)r; }
-static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { (void)d; aa_panic(t, "frame_slot"); }
-static void aa_param (CGTarget* t, const CGParamDesc* p) { (void)p; aa_panic(t, "param"); }
+/* Allocate a new FP-relative frame slot described by `d` and return its
+ * 1-based handle (FRAME_SLOT_NONE == 0 is never returned).
+ * A zero size defaults to 8 bytes and a zero alignment to 1. The slot's
+ * offset below fp is the running total plus its size, rounded up to the
+ * alignment; `align` is used as a power-of-two mask. Because fp itself
+ * is 16-byte aligned, an aligned offset yields an aligned address. */
+static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d)
+{
+    AAImpl* a = impl_of(t);
+
+    /* Grow the slot table (doubling, starting at 8) when it is full. */
+    if (a->nslots == a->slots_cap) {
+        u32 grown = a->slots_cap ? a->slots_cap * 2 : 8;
+        AASlot* fresh = arena_array(t->c->tu, AASlot, grown);
+        if (a->slots) memcpy(fresh, a->slots, sizeof(AASlot) * a->nslots);
+        a->slots = fresh;
+        a->slots_cap = grown;
+    }
+
+    const u32 size = d->size ? d->size : 8;
+    const u32 align = d->align ? d->align : 1;
+    const u32 round = align - 1;
+    const u32 end = (a->cum_off + size + round) & ~round;
+
+    AASlot* s = &a->slots[a->nslots];
+    s->off = end;
+    s->size = size;
+    s->align = align;
+    s->kind = d->kind;
+
+    a->nslots += 1;
+    a->cum_off = end;
+    return (FrameSlot)(a->nslots); /* 1-based; FRAME_SLOT_NONE == 0 */
+}
+
+/* Look up the slot record for a 1-based FrameSlot handle. Returns NULL
+ * for FRAME_SLOT_NONE or a handle beyond the slots allocated so far. */
+static AASlot* slot_get(AAImpl* a, FrameSlot fs)
+{
+    if (fs != FRAME_SLOT_NONE && fs <= a->nslots) {
+        return &a->slots[fs - 1];
+    }
+    return NULL;
+}
+
+/* ---- param: store incoming arg(s) into the home slot ---- */
+
+/* Panic unless `off` fits the signed 9-bit LDUR/STUR immediate. Without
+ * this check the encoders would silently truncate the offset. */
+static void aa_param_check_simm9(CGTarget* t, i32 off)
+{
+    if (off < -256 || off > 255) {
+        compiler_panic(t->c, impl_of(t)->loc,
+                       "aarch64 param: offset %d out of LDUR/STUR range", off);
+    }
+}
+
+/* Claim the next 8-byte incoming stack-argument slot and return its
+ * fp-relative offset. Incoming stack args start at x29 + 16, just above
+ * the saved x29/x30 pair. */
+static i32 aa_param_take_stack(CGTarget* t)
+{
+    AAImpl* a = impl_of(t);
+    i32 off = (i32)(16 + a->next_param_stack);
+    a->next_param_stack += 8;
+    aa_param_check_simm9(t, off);
+    return off;
+}
+
+/* Copy one incoming parameter from its ABI location (x0..x7, v0..v7, or
+ * the caller's stack area) into its home frame slot `p->slot`.
+ *   ABI_ARG_IGNORE    nothing is emitted.
+ *   ABI_ARG_INDIRECT  the 8-byte pointer is stored into the slot.
+ *   otherwise         each part is stored at slot + part->src_offset.
+ * Panics on a bad slot, an unsupported part class, or an offset that
+ * does not fit LDUR/STUR. */
+static void aa_param(CGTarget* t, const CGParamDesc* p)
+{
+    AAImpl* a = impl_of(t);
+    AASlot* s = slot_get(a, p->slot);
+    if (!s) {
+        compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
+    }
+    const ABIArgInfo* ai = p->abi;
+
+    if (ai->kind == ABI_ARG_IGNORE) return;
+    if (ai->kind == ABI_ARG_INDIRECT) {
+        /* Caller passes a pointer to the data. Pointer comes in the next
+         * INT arg reg, or on the stack once x0..x7 are used up; either
+         * way it is stored into the (pointer-sized) home slot. */
+        i32 home = -(i32)s->off;
+        aa_param_check_simm9(t, home);
+        if (a->next_param_int < 8) {
+            u32 reg = a->next_param_int++;
+            emit32(t->mc, aa64_stur(3, reg, 29, home));
+        } else {
+            i32 src = aa_param_take_stack(t);
+            emit32(t->mc, aa64_ldur(3, 9, 29, src));
+            emit32(t->mc, aa64_stur(3, 9, 29, home));
+        }
+        return;
+    }
+    /* DIRECT: place each part. */
+    for (u16 i = 0; i < ai->nparts; ++i) {
+        const ABIArgPart* pt = &ai->parts[i];
+        u32 sidx = size_idx_for_bytes(pt->size);
+
+        if (pt->cls != ABI_CLASS_INT && pt->cls != ABI_CLASS_FP) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 param: ABI class %d unimpl", (int)pt->cls);
+        }
+
+        i32 home = -(i32)s->off + (i32)pt->src_offset;
+        aa_param_check_simm9(t, home);
+
+        if (pt->cls == ABI_CLASS_INT) {
+            if (a->next_param_int < 8) {
+                u32 reg = a->next_param_int++;
+                emit32(t->mc, aa64_stur(sidx, reg, 29, home));
+            } else {
+                /* Each stack-passed slot is 8 bytes regardless of part
+                 * size; x9 is the scratch register for the copy. */
+                i32 src = aa_param_take_stack(t);
+                emit32(t->mc, aa64_ldur(sidx, 9, 29, src));
+                emit32(t->mc, aa64_stur(sidx, 9, 29, home));
+            }
+        } else {
+            if (a->next_param_fp < 8) {
+                u32 reg = a->next_param_fp++;
+                emit32(t->mc, aa64_stur_fp(sidx, reg, 29, home));
+            } else {
+                /* v0 is the scratch here; this path is only reached once
+                 * all eight FP argument registers have been consumed. */
+                i32 src = aa_param_take_stack(t);
+                emit32(t->mc, aa64_ldur_fp(sidx, 0, 29, src));
+                emit32(t->mc, aa64_stur_fp(sidx, 0, 29, home));
+            }
+        }
+    }
+}
+
static const Reg* aa_clobbers (CGTarget* t, RegClass c, u32* n) { (void)c; (void)n; aa_panic(t, "clobbers"); }
static void aa_spill_reg (CGTarget* t, Operand s, FrameSlot f, MemAccess m) { (void)s; (void)f; (void)m; aa_panic(t, "spill_reg"); }
static void aa_reload_reg(CGTarget* t, Operand d, FrameSlot f, MemAccess m) { (void)d; (void)f; (void)m; aa_panic(t, "reload_reg"); }
-/* ---- labels / control flow (deferred) ---- */
+/* ---- labels / control flow (deferred for D-group; ret uses internal label) ---- */
static Label aa_label_new (CGTarget* t) { aa_panic(t, "label_new"); }
static void aa_label_place(CGTarget* t, Label l) { (void)l; aa_panic(t, "label_place"); }
@@ -139,74 +615,169 @@ static void aa_continue_to(CGTarget* t, CGScope s) { (void)s; aa_
static void aa_load_imm(CGTarget* t, Operand dst, i64 imm)
{
- MCEmitter* mc = t->mc;
u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
-
- /* Effective bit-width: 32 unless we're materializing into Xd. The 32-bit
- * encoding zero-extends the result, so we mask to 32 bits when sf==0
- * so a "negative" int constant materializes its low 32 bits exactly. */
- const u32 nslots = sf ? 4u : 2u;
- u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
+ emit_load_imm(t->mc, sf, reg_num(dst), imm);
+}
- /* Single MOVZ when only one 16-bit slot is non-zero. */
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- u64 cleared = v & ~((u64)0xffffu << (i * 16));
- if (slot != 0 && cleared == 0) {
- emit32(mc, aa64_movz(sf, rd, slot, i));
- return;
- }
+/* load_const: emit ADRP + LDR Sd, [Xt, #:lo12:sym] against a fresh
+ * symbol in .rodata. Used by b08 to materialize a float bit pattern. */
+static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb)
+{
+ AAImpl* a = impl_of(t);
+ if (dst.cls != RC_FP) {
+ compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1");
}
- /* Single MOVN when one slot of the inverted value covers the rest.
- * For sf==1 the "rest is all ones" test is over the full 64 bits;
- * for sf==0 we work in the 32-bit space. */
+ /* Find or create .rodata. */
+ Sym ro_name = pool_intern_cstr(t->c->global, ".rodata");
+ ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, cb.align ? cb.align : 4);
+
+ u32 cur_section = t->mc->section_id;
+ t->mc->set_section(t->mc, ro);
+ t->mc->emit_align(t->mc, cb.align ? cb.align : 4, 0);
+ u32 ro_off = t->mc->pos(t->mc);
+ t->mc->emit_bytes(t->mc, cb.bytes, cb.size);
+
+ /* Local symbol pointing at the literal. */
+ char namebuf[64];
+ static u32 lit_seq = 0;
+ int len = 0;
{
- u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
- u64 all = sf ? ~(u64)0 : 0xffffffffu;
- (void)all;
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((inv >> (i * 16)) & 0xffffu);
- u64 cleared = inv & ~((u64)0xffffu << (i * 16));
- if (cleared == 0) {
- emit32(mc, aa64_movn(sf, rd, slot, i));
- return;
- }
- }
+ const char* prefix = ".LCFP";
+ for (; prefix[len]; ++len) namebuf[len] = prefix[len];
+ u32 v = lit_seq++;
+ char tmp[16]; int tn = 0;
+ if (v == 0) tmp[tn++] = '0';
+ else { while (v) { tmp[tn++] = '0' + (char)(v % 10); v /= 10; } }
+ for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i];
+ namebuf[len] = 0;
}
+ Sym sname = pool_intern_cstr(t->c->global, namebuf);
+ ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro,
+ (u64)ro_off, (u64)cb.size);
+
+ t->mc->set_section(t->mc, cur_section);
+
+ /* ADRP X9, sym ; LDR Sd, [X9, #:lo12:sym] */
+ u32 adrp_pos = t->mc->pos(t->mc);
+ emit32(t->mc, aa64_adrp_base(9));
+ t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos,
+ R_AARCH64_ADR_PREL_PG_HI21, sym, 0, 0, 0);
+
+ u32 ldr_pos = t->mc->pos(t->mc);
+ u32 sidx = (cb.size == 8) ? 3u : 2u;
+ emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0));
+ RelocKind lo12 = (cb.size == 8)
+ ? R_AARCH64_LDST64_ABS_LO12_NC
+ : R_AARCH64_LDST32_ABS_LO12_NC;
+ t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0);
+}
- /* General path: MOVZ the lowest non-zero slot, then MOVK any other
- * non-zero slot. v==0 was caught by the single-MOVZ branch above. */
- int placed = 0;
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- if (!placed) {
- if (slot == 0) continue;
- emit32(mc, aa64_movz(sf, rd, slot, i));
- placed = 1;
- } else if (slot != 0) {
- emit32(mc, aa64_movk(sf, rd, slot, i));
- }
+static void aa_copy(CGTarget* t, Operand dst, Operand src)
+{
+ if (dst.cls == RC_FP || src.cls == RC_FP) {
+ u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+ emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src)));
+ return;
}
- if (!placed) {
- /* Defensive: should be unreachable (v==0 caught above). */
- emit32(mc, aa64_movz(sf, rd, 0, 0));
+ u32 sf = type_is_64(dst.type) ? 1u : 0u;
+ emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
+}
+
+/* ---- load / store / addr_of ---- */
+
+/* Resolve an address operand into a (base register, signed byte offset)
+ * pair. The offset is written to *out_off and the base register number
+ * is returned.
+ *   OPK_LOCAL     base is x29 (fp), offset is minus the slot's `off`.
+ *   OPK_INDIRECT  base is the operand's base register, offset its `ofs`.
+ * Any other kind (including OPK_GLOBAL) panics. tmp_reg is currently
+ * unused: no path needs a temporary base register yet. */
+static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg)
+{
+ AAImpl* a = impl_of(t);
+ if (addr.kind == OPK_LOCAL) {
+ AASlot* s = slot_get(a, addr.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot");
+ *out_off = -(i32)s->off;
+ return 29; /* x29 = fp */
+ }
+ if (addr.kind == OPK_INDIRECT) {
+ *out_off = addr.v.ind.ofs;
+ /* Wrap the base in a temporary REG operand so reg_num() applies its 5-bit mask. */
+ return reg_num((Operand){.kind=OPK_REG, .v.reg = addr.v.ind.base});
}
+ if (addr.kind == OPK_GLOBAL) {
+ compiler_panic(t->c, a->loc, "aarch64: GLOBAL address not yet supported");
+ }
+ (void)tmp_reg;
+ compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d",
+ (int)addr.kind);
}
-static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb)
-{ (void)dst; (void)cb; aa_panic(t, "load_const"); }
+/* Load from `addr` (LOCAL or INDIRECT) into register `dst` with a single
+ * LDUR. The access width is ma.size, or the size of addr's type when
+ * ma.size is 0. FP destinations use the SIMD&FP form of LDUR. Panics if
+ * the resolved offset does not fit LDUR's signed 9-bit immediate. */
+static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma)
+{
+    AAImpl* a = impl_of(t);
+    i32 disp;
+    const u32 rn = addr_base(t, addr, &disp, 9);
+    const u32 nbytes = ma.size ? ma.size : type_byte_size(addr.type);
+    const u32 szfield = size_idx_for_bytes(nbytes);
+
+    if (disp < -256 || disp > 255) {
+        compiler_panic(t->c, a->loc, "aarch64 load: offset %d out of LDUR range", disp);
+    }
+
+    const u32 rt = reg_num(dst);
+    const u32 word = (dst.cls == RC_FP)
+        ? aa64_ldur_fp(szfield, rt, rn, disp)
+        : aa64_ldur(szfield, rt, rn, disp);
+    emit32(t->mc, word);
+}
-static void aa_copy(CGTarget* t, Operand dst, Operand src)
+static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma)
{
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
+ AAImpl* a = impl_of(t);
+ i32 off;
+ u32 base = addr_base(t, addr, &off, 9);
+ u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
+ u32 sidx = size_idx_for_bytes(sz);
+ if (off < -256 || off > 255) {
+ compiler_panic(t->c, a->loc, "aarch64 store: offset %d out of STUR range", off);
+ }
+
+ if (src.kind == OPK_IMM) {
+ /* Materialize through a scratch register. Use x9 (caller-saved). */
+ u32 sf = (sz == 8) ? 1u : 0u;
+ emit_load_imm(t->mc, sf, 9, src.v.imm);
+ emit32(t->mc, aa64_stur(sidx, 9, base, off));
+ return;
+ }
+ if (src.cls == RC_FP) {
+ emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off));
+ } else {
+ emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off));
+ }
+}
+
+/* Compute the address of lvalue `lv` into register `dst`.
+ *   OPK_LOCAL     dst = x29 - slot offset. Offsets up to 0xffffff are
+ *                 supported (one SUB, or a shifted SUB plus a plain SUB);
+ *                 larger offsets panic instead of being truncated to
+ *                 12 bits.
+ *   OPK_INDIRECT  dst = base + ofs for |ofs| <= 0xfff (MOV when ofs is
+ *                 0); other offsets panic.
+ * Any other operand kind panics via aa_panic. */
+static void aa_addr_of(CGTarget* t, Operand dst, Operand lv)
+{
+    AAImpl* a = impl_of(t);
+    u32 rd = reg_num(dst);
+
+    if (lv.kind == OPK_LOCAL) {
+        AASlot* s = slot_get(a, lv.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
+        u32 lo = s->off & 0xfffu;
+        u32 hi = s->off >> 12;
+        if (hi > 0xfffu) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 addr_of: local offset %u unsupported", s->off);
+        }
+        if (hi == 0) {
+            /* dst = x29 - off */
+            emit32(t->mc, aa64_sub_imm(1, rd, 29, lo, 0));
+            return;
+        }
+        /* dst = x29 - (hi << 12); dst -= lo */
+        emit32(t->mc, aa64_sub_imm(1, rd, 29, hi, 1));
+        if (lo != 0) {
+            emit32(t->mc, aa64_sub_imm(1, rd, rd, lo, 0));
+        }
+        return;
+    }
+    if (lv.kind == OPK_INDIRECT) {
+        i32 ofs = lv.v.ind.ofs;
+        u32 base = lv.v.ind.base & 0x1f;
+        if (ofs == 0) {
+            emit32(t->mc, aa64_mov_reg(1, rd, base));
+        } else if (ofs > 0 && ofs <= 0xfff) {
+            emit32(t->mc, aa64_add_imm(1, rd, base, (u32)ofs, 0));
+        } else if (ofs < 0 && ofs >= -0xfff) {
+            /* Range is tested before negating so INT_MIN is never negated. */
+            emit32(t->mc, aa64_sub_imm(1, rd, base, (u32)(-ofs), 0));
+        } else {
+            compiler_panic(t->c, a->loc, "aarch64 addr_of: indirect offset %d unsupported", ofs);
+        }
+        return;
+    }
+    aa_panic(t, "addr_of");
+}
-static void aa_load (CGTarget* t, Operand d, Operand a, MemAccess m) { (void)d;(void)a;(void)m; aa_panic(t, "load"); }
-static void aa_store (CGTarget* t, Operand a, Operand s, MemAccess m) { (void)a;(void)s;(void)m; aa_panic(t, "store"); }
-static void aa_addr_of (CGTarget* t, Operand d, Operand l) { (void)d;(void)l; aa_panic(t, "addr_of"); }
static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a) { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); }
static void aa_copy_bytes(CGTarget* t, Operand d, Operand s, AggregateAccess g) { (void)d;(void)s;(void)g; aa_panic(t, "copy_bytes"); }
static void aa_set_bytes (CGTarget* t, Operand d, Operand b, AggregateAccess g) { (void)d;(void)b;(void)g; aa_panic(t, "set_bytes"); }
@@ -215,21 +786,28 @@ static void aa_bitfield_store(CGTarget* t, Operand a, Operand s, BitFieldAccess
/* ---- arithmetic ---- */
-static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
+/* Ensure an integer operand lives in a register and return that
+ * register's number for use as Rn/Rm. A REG operand is returned as-is;
+ * an IMM operand is materialized into `scratch` (width chosen by sf)
+ * and `scratch` is returned. Any other operand kind panics. */
+static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch)
+{
+    switch (op.kind) {
+    case OPK_REG:
+        return reg_num(op);
+    case OPK_IMM:
+        emit_load_imm(t->mc, sf, scratch, op.v.imm);
+        return scratch;
+    default:
+        break;
+    }
+    compiler_panic(t->c, impl_of(t)->loc,
+                   "aarch64 binop: operand kind %d unsupported", (int)op.kind);
+}
+
+static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, Operand b_op)
{
MCEmitter* mc = t->mc;
u32 sf = type_is_64(dst.type) ? 1u : 0u;
u32 rd = reg_num(dst);
- u32 rn = reg_num(a);
- u32 rm = reg_num(b);
+ u32 rn = force_reg_int(t, a_op, sf, 9);
+ u32 rm = force_reg_int(t, b_op, sf, (rn == 9) ? 10 : 9);
u32 word;
- /* All operands must be REG. CG materializes immediates first. */
- if (a.kind != OPK_REG || b.kind != OPK_REG) {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 binop: non-REG operands not yet supported");
- }
-
switch (op) {
case BO_IADD: word = aa64_add (sf, rd, rn, rm); break;
case BO_ISUB: word = aa64_sub (sf, rd, rn, rm); break;
@@ -242,8 +820,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break;
case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break;
case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break;
+ /* rem = a - (a/b)*b → SDIV/UDIV into x11, then MSUB rd, x11, b, a. */
case BO_SREM:
+ emit32(mc, aa64_sdiv(sf, 11, rn, rm));
+ word = aa64_msub(sf, rd, 11, rm, rn);
+ break;
case BO_UREM:
+ emit32(mc, aa64_udiv(sf, 11, rn, rm));
+ word = aa64_msub(sf, rd, 11, rm, rn);
+ break;
case BO_FADD: case BO_FSUB: case BO_FMUL: case BO_FDIV:
default:
compiler_panic(t->c, impl_of(t)->loc,
@@ -252,15 +837,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b)
emit32(mc, word);
}
-static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
+static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op)
{
MCEmitter* mc = t->mc;
u32 sf = type_is_64(dst.type) ? 1u : 0u;
u32 rd = reg_num(dst);
- u32 rn = reg_num(a);
+ u32 rn = reg_num(a_op);
u32 word;
- if (a.kind != OPK_REG) {
+ if (a_op.kind != OPK_REG) {
compiler_panic(t->c, impl_of(t)->loc,
"aarch64 unop: non-REG operand not yet supported");
}
@@ -269,6 +854,10 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
case UO_NEG: word = aa64_neg(sf, rd, rn); break;
case UO_BNOT: word = aa64_mvn(sf, rd, rn); break;
case UO_NOT:
+ /* !x → cmp Xn, #0 ; cset Xd, EQ */
+ emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
+ word = aa64_cset_eq(sf, rd);
+ break;
default:
compiler_panic(t->c, impl_of(t)->loc,
"aarch64 unop: op %d unimpl", (int)op);
@@ -276,26 +865,291 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a)
emit32(mc, word);
}
-static void aa_cmp (CGTarget* t, CmpOp op, Operand d, Operand a, Operand b) { (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); }
-static void aa_convert(CGTarget* t, ConvKind k, Operand d, Operand s) { (void)k;(void)d;(void)s; aa_panic(t, "convert"); }
+static void aa_cmp(CGTarget* t, CmpOp op, Operand d, Operand a, Operand b)
+{ (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); } /* stub: operands ignored, reports "cmp" through aa_panic */
+/* Conversions: only CV_FTOI_S (FP register -> integer register, via FCVTZS) is handled; every other kind panics. */
+static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src)
+{
+ AAImpl* a = impl_of(t);
+ switch (k) {
+ case CV_FTOI_S: {
+ if (src.cls != RC_FP || dst.cls != RC_INT) { /* source must be FP class, destination INT class */
+ compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
+ }
+ u32 sf = type_is_64(dst.type) ? 1u : 0u; /* 1 when the destination type is 64-bit */
+ u32 type = type_is_fp_double(src.type) ? 1u : 0u; /* 1 when the source type is double */
+ emit32(t->mc, aa64_fcvtzs(sf, type, reg_num(dst), reg_num(src)));
+ return;
+ }
+ default:
+ compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
+ }
+}
/* ---- calls / return ---- */
-static void aa_call(CGTarget* t, const CGCallDesc* d) { (void)d; aa_panic(t, "call"); }
+/* Place one call argument according to its ABI classification. INT parts
+ * go to x0..x7 and FP parts to v0..v7 while *next_int / *next_fp are below
+ * 8; later parts are stored at [sp, #*stack_off], which then advances by 8.
+ * INDIRECT args pass the address of their LOCAL storage; DIRECT args move,
+ * materialize, or load each part from IMM / REG / LOCAL storage. */
+static void emit_arg_value(CGTarget* t,
+                           const CGABIValue* av,
+                           u32* next_int, u32* next_fp, u32* stack_off)
+{
+    AAImpl* a = impl_of(t);
+    const ABIArgInfo* ai = av->abi;
+    if (ai->kind == ABI_ARG_IGNORE) return;
+
+    if (ai->kind == ABI_ARG_INDIRECT) {
+        /* Pass the storage address (SUB from x29 by the slot offset); x9 stages overflow. */
+        u32 dst_reg;
+        int to_stack = (*next_int >= 8);
+        if (!to_stack) dst_reg = (*next_int)++;
+        else dst_reg = 9;
+        if (av->storage.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, av->storage.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot");
+            emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0));
+        } else {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 call: INDIRECT arg storage kind %d unsupported",
+                           (int)av->storage.kind);
+        }
+        if (to_stack) {
+            emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+            *stack_off += 8;
+        }
+        return;
+    }
+    /* DIRECT — possibly multiple parts. */
+    for (u16 i = 0; i < ai->nparts; ++i) {
+        const ABIArgPart* pt = &ai->parts[i];
+        u32 sz = pt->size;
+        u32 sidx = size_idx_for_bytes(sz);
+
+        if (pt->cls == ABI_CLASS_INT) {
+            int to_stack = (*next_int >= 8);
+            u32 dst_reg = to_stack ? 9u : (*next_int)++;
+            /* Source bits for this part; x9 stages values bound for the stack. */
+            switch (av->storage.kind) {
+            case OPK_IMM: {
+                u32 sf = (sz == 8) ? 1u : 0u;
+                emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm);
+                break;
+            }
+            case OPK_REG: {
+                u32 sf = (sz == 8) ? 1u : 0u;
+                emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage)));
+                break;
+            }
+            case OPK_LOCAL: {
+                /* BYVAL aggregate carried in registers: load chunks from
+                 * the source local's address + part->src_offset. */
+                AASlot* s = slot_get(a, av->storage.v.frame_slot);
+                if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
+                i32 off = -(i32)s->off + (i32)pt->src_offset;
+                emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
+                break;
+            }
+            default:
+                compiler_panic(t->c, a->loc,
+                               "aarch64 call: arg storage kind %d unsupported",
+                               (int)av->storage.kind);
+            }
+            if (to_stack) {
+                emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+                *stack_off += 8;
+            }
+        } else if (pt->cls == ABI_CLASS_FP) {
+            int to_stack = (*next_fp >= 8);
+            u32 dst_reg = to_stack ? 0u : (*next_fp)++; /* overflow: replaced by the source reg below */
+            switch (av->storage.kind) {
+            case OPK_REG: {
+                u32 src = reg_num(av->storage);
+                if (to_stack) dst_reg = src; /* v0..v7 already hold args: store from the source, never via v0 */
+                else emit32(t->mc, aa64_fmov_reg((sz == 8) ? 1u : 0u, dst_reg, src));
+                break;
+            }
+            default:
+                compiler_panic(t->c, a->loc,
+                               "aarch64 call: FP arg storage kind %d unsupported",
+                               (int)av->storage.kind);
+            }
+            if (to_stack) {
+                emit32(t->mc, aa64_stur_fp(sidx, dst_reg, 31, (i32)*stack_off));
+                *stack_off += 8;
+            }
+        } else {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 call: ABI class %d unimpl", (int)pt->cls);
+        }
+    }
+}
+/* Lower a direct call: point x8 at the sret destination when required,
+ * place the arguments, emit BL with a CALL26 reloc against the callee, then
+ * move a single-part DIRECT result from x0/v0 into d->ret.storage. */
+static void aa_call(CGTarget* t, const CGCallDesc* d)
+{
+    AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 next_int = 0, next_fp = 0, stack_off = 0;
+
+    /* sret: caller passes destination pointer in x8. */
+    if (d->abi && d->abi->has_sret) {
+        if (d->ret.storage.kind != OPK_LOCAL) {
+            compiler_panic(t->c, a->loc,
+                           "aarch64 call: sret destination must be LOCAL");
+        }
+        AASlot* s = slot_get(a, d->ret.storage.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot");
+        emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0));
+    }
+
+    for (u32 i = 0; i < d->nargs; ++i) {
+        emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off);
+    }
+
+    /* Track outgoing-arg high-water mark, 16-aligned. */
+    u32 needed = (stack_off + 15u) & ~15u;
+    if (needed > a->max_outgoing) a->max_outgoing = needed;
+
+    /* BL <callee> — direct only. */
+    if (d->callee.kind != OPK_GLOBAL) {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 call: indirect call not yet supported");
+    }
+    u32 bl_pos = mc->pos(mc);
+    emit32(mc, aa64_bl_base());
+    mc->emit_reloc_at(mc, mc->section_id, bl_pos,
+                      R_AARCH64_CALL26, d->callee.v.global.sym,
+                      d->callee.v.global.addend, 0, 0);
+
+    /* Receive return value. d->abi is optional (see the sret guard above),
+     * so without it there is no result classification to act on. */
+    if (!d->abi) return;
+    const ABIArgInfo* ri = &d->abi->ret;
+    if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) {
+        /* Nothing to copy — sret was placed directly into the dst slot. */
+        return;
+    }
+    if (ri->nparts == 0) return;
+    /* Only a single INT or FP part in x0 / v0 is covered; reject more up front. */
+    if (ri->nparts > 1) {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 call: multi-part return not yet supported");
+    }
+    const ABIArgPart* p0 = &ri->parts[0];
+    Operand rs = d->ret.storage;
+    if (p0->cls == ABI_CLASS_INT) {
+        u32 sf = (p0->size == 8) ? 1u : 0u;
+        if (rs.kind == OPK_REG) {
+            emit32(mc, aa64_mov_reg(sf, reg_num(rs), 0));
+        } else if (rs.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, rs.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
+            u32 sidx = size_idx_for_bytes(p0->size);
+            emit32(mc, aa64_stur(sidx, 0, 29, -(i32)s->off));
+        }
+    } else if (p0->cls == ABI_CLASS_FP) {
+        u32 type = (p0->size == 8) ? 1u : 0u;
+        if (rs.kind == OPK_REG) {
+            emit32(mc, aa64_fmov_reg(type, reg_num(rs), 0));
+        } else if (rs.kind == OPK_LOCAL) {
+            AASlot* s = slot_get(a, rs.v.frame_slot);
+            if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
+            u32 sidx = size_idx_for_bytes(p0->size);
+            emit32(mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off));
+        }
+    }
+}
+
+/* Materialize the return value (sret copy through x8, x0/v0 for REG and IMM, per-part loads for LOCAL), then branch to the function epilogue. */
static void aa_ret(CGTarget* t, const CGABIValue* val)
{
+ AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
- if (val && val->storage.kind == OPK_REG) {
- u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
- } else if (val && val->storage.kind == OPK_IMM) {
- /* MOV W0, #imm via load_imm */
- Operand w0 = { OPK_REG, RC_INT, 0, val->storage.type, .v.reg = 0 };
- aa_load_imm(t, w0, val->storage.v.imm);
+ if (val) { /* val == NULL: no value to place, only the branch below is emitted */
+ const ABIArgInfo* ri = val->abi;
+ if (ri && ri->kind == ABI_ARG_INDIRECT) {
+ /* sret: caller passed the destination pointer in x8 at entry,
+ * which we spilled into sret_ptr_slot. Reload x8 from there,
+ * then memcpy the source storage into [x8]. */
+ if (val->storage.kind == OPK_LOCAL) {
+ AASlot* s = slot_get(a, val->storage.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot");
+ if (a->sret_ptr_slot != FRAME_SLOT_NONE) { /* NOTE(review): otherwise x8 is used as-is — confirm that is intended */
+ AASlot* sp = slot_get(a, a->sret_ptr_slot);
+ if (sp) emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); /* a failed lookup also skips the reload silently */
+ }
+ u32 nbytes = s->size; /* copy the whole slot, widest chunks first, staging through x9 */
+ u32 i = 0;
+ while (i + 8 <= nbytes) { /* 8-byte chunks */
+ emit32(mc, aa64_ldur(3, 9, 29, -(i32)s->off + (i32)i));
+ emit32(mc, aa64_str_uimm(3, 9, 8, i));
+ i += 8;
+ }
+ while (i + 4 <= nbytes) { /* 4-byte remainder */
+ emit32(mc, aa64_ldur(2, 9, 29, -(i32)s->off + (i32)i));
+ emit32(mc, aa64_str_uimm(2, 9, 8, i));
+ i += 4;
+ }
+ while (i + 2 <= nbytes) { /* 2-byte remainder */
+ emit32(mc, aa64_ldur(1, 9, 29, -(i32)s->off + (i32)i));
+ emit32(mc, aa64_str_uimm(1, 9, 8, i));
+ i += 2;
+ }
+ while (i < nbytes) { /* trailing single bytes */
+ emit32(mc, aa64_ldur(0, 9, 29, -(i32)s->off + (i32)i));
+ emit32(mc, aa64_str_uimm(0, 9, 8, i));
+ i += 1;
+ }
+ } else {
+ compiler_panic(t->c, a->loc,
+ "aarch64 ret indirect: storage kind %d unsupported",
+ (int)val->storage.kind);
+ }
+ } else if (val->storage.kind == OPK_REG) {
+ if (val->storage.cls == RC_FP) { /* FP register class: move into register 0 with FMOV */
+ u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
+ emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
+ } else { /* any other class: integer move into register 0 */
+ u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+ emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
+ }
+ } else if (val->storage.kind == OPK_IMM) { /* immediate: materialize directly into register 0 */
+ u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+ emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm);
+ } else if (val->storage.kind == OPK_LOCAL) {
+ /* DIRECT return whose source is a local: load each part into
+ * x0/x1 (or v0/v1) per the ABI classification. Used for
+ * small structs returned in registers. */
+ AASlot* s = slot_get(a, val->storage.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot");
+ const ABIArgInfo* ri = val->abi; /* shadows the outer ri; same value */
+ for (u16 i = 0; i < (ri ? ri->nparts : 0); ++i) { /* no ABI info: nothing is loaded */
+ const ABIArgPart* pt = &ri->parts[i];
+ u32 sidx = size_idx_for_bytes(pt->size);
+ i32 off = -(i32)s->off + (i32)pt->src_offset;
+ if (pt->cls == ABI_CLASS_INT) { /* the part index doubles as the target register number */
+ emit32(mc, aa64_ldur(sidx, /*Rt=*/i, 29, off));
+ } else if (pt->cls == ABI_CLASS_FP) {
+ emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, 29, off));
+ } else {
+ compiler_panic(t->c, a->loc,
+ "aarch64 ret: ret part cls %d unimpl", (int)pt->cls);
+ }
+ }
+ }
}
- emit32(mc, aa64_ret(AA64_LR));
+ /* Branch to the epilogue. mc->emit_label_ref records a fixup that
+ * resolves to a JUMP26-encoded displacement when the label is placed. */
+ u32 bpos = mc->pos(mc); /* recorded but not otherwise used; see the (void) below */
+ emit32(mc, aa64_b_base());
+ mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
+ (void)bpos;
}
static void aa_alloca_ (CGTarget* t, Operand d, Operand s, u32 a) { (void)d;(void)s;(void)a; aa_panic(t, "alloca"); }
@@ -334,8 +1188,6 @@ static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); }
CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
{
- /* v1: only AArch64 implemented. Other targets fall back to a
- * "not implemented" diagnostic at construction. */
if (c->target.arch != CFREE_ARCH_ARM_64) {
SrcLoc loc = {0,0,0};
compiler_panic(c, loc,
@@ -399,7 +1251,7 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
t->va_end_ = aa_va_end_;
t->va_copy_ = aa_va_copy_;
- t->setjmp_ = NULL; /* parser lowers via __cfree_setjmp */
+ t->setjmp_ = NULL;
t->longjmp_ = NULL;
t->atomic_load = aa_atomic_load;
@@ -415,6 +1267,9 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m)
t->finalize = aa_finalize;
t->destroy = aa_destroy;
+ /* Reference type_is_signed so it is not flagged as unused. */
+ (void)type_is_signed;
+
compiler_defer(c, cgt_cleanup, t);
return t;
}
@@ -424,7 +1279,5 @@ void cgtarget_finalize(CGTarget* t) { if (t && t->finalize) t->finalize(t); }
void cgtarget_free(CGTarget* t)
{
if (!t) return;
- /* Arena-backed; nothing to free. The compiler_defer cleanup callback
- * arrives here at panic; intentional double-call from explicit free
- * after success is safe because everything is arena memory. */
+ /* Arena-backed; nothing to free. Also reached through the cgt_cleanup callback registered with compiler_defer. */
}
diff --git a/test/cg/CORPUS.md b/test/cg/CORPUS.md
@@ -44,12 +44,12 @@ parser will, and fail at runtime until those land — that is intentional.
| `a02_return_zero` | ★ | `load_imm 0; ret reg` | 0 |
| `a03_ret_imm` | ★ | `ret IMM 17` (backend materializes) | 17 |
| `a04_copy_reg` | ★ | `load_imm 7; copy r1->r2; ret r2` | 7 |
-| `a05_return_neg_small` | · | `load_imm -7` via MOVN; ret | 249 (= -7 & 0xff) |
-| `a06_return_i64` | · | i64 `load_imm 0x1_0000_002A`; ret as i64 | 42 (low 32 of x0) |
-| `a07_void_return` | · | `ret(NULL)` | 0 (via _start zeroing x0) |
-| `a08_multiple_returns` | · | `ret_imm 1; ret_imm 2` (second is dead) | 1 |
-| `a09_load_imm_movz_movk` | · | `load_imm 0xABCD` (multi-step materialize) | 205 (= 0xCD) |
-| `a10_return_u8` | · | `load_imm 200` into u8 reg; ret | 200 |
+| `a05_return_neg_small` | ★ | `load_imm -7` via MOVN; ret | 249 (= -7 & 0xff) |
+| `a06_return_i64` | ★ | i64 `load_imm 0x1_0000_002A`; ret as i64 | 42 (low 32 of x0) |
+| `a07_void_return` | ★ | `ret(NULL)` | 0 (via _start zeroing x0) |
+| `a08_multiple_returns` | ★ | `ret_imm 1; ret_imm 2` (second is dead) | 1 |
+| `a09_load_imm_movz_movk` | ★ | `load_imm 0xABCD` (multi-step materialize) | 205 (= 0xCD) |
+| `a10_return_u8` | ★ | `load_imm 200` into u8 reg; ret | 200 |
## Group B — frame slots, parameters, locals
@@ -62,14 +62,14 @@ materialization, slot allocation, and call lowering use the live
| Case | Status | Body | Expected |
|---|---|---|---|
-| `b01_param_int` | · | `int echo(int x){return x;}; echo(201)` | 201 |
-| `b02_param_sum` | · | `int sum2(int a,int b){return a+b;}; sum2(40,2)` | 42 |
-| `b03_param_spill` | · | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)` | 45 |
-| `b04_local_int` | · | local int slot; `*p = 42; return *p` | 42 |
-| `b05_addr_taken_local` | · | `int x=17; int*p=&x; *p+=1; return *p` | 18 |
-| `b06_sret` | · | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b` | 42 |
-| `b07_byval_param` | · | `int take(struct Pt p){return p.a+p.b;}; take({15,27})` | 42 |
-| `b08_fp_param` | · | `int trunc(float f){return (int)f;}; trunc(7.5f)` | 7 |
+| `b01_param_int` | ★ | `int echo(int x){return x;}; echo(201)` | 201 |
+| `b02_param_sum` | ★ | `int sum2(int a,int b){return a+b;}; sum2(40,2)` | 42 |
+| `b03_param_spill` | ★ | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)` | 45 |
+| `b04_local_int` | ★ | local int slot; `*p = 42; return *p` | 42 |
+| `b05_addr_taken_local` | ★ | `int x=17; int*p=&x; *p+=1; return *p` | 18 |
+| `b06_sret` | ★ | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b` | 42 |
+| `b07_byval_param` | ★ | `int take(struct Pt p){return p.a+p.b;}; take({15,27})` | 42 |
+| `b08_fp_param` | ★ | `int trunc(float f){return (int)f;}; trunc(7.5f)` | 7 |
## Group C — integer arithmetic
@@ -79,14 +79,14 @@ materialization, slot allocation, and call lowering use the live
| `c02_sub_mul` | ★ | `7 * 3 - 4` | 17 |
| `c03_bitwise` | ★ | `(~3) & 0xff` | 252 |
| `c04_shift` | ★ | `(1<<5) \| (16>>1)` (logical shr) | 40 |
-| `c05_div_mod` | · | `23 / 4 + 23 % 4` (signed) | 8 |
-| `c06_xor` | · | `0xa5 ^ 0x5a` | 255 |
-| `c07_iadd_i64` | · | i64 `0x1_0000_0029 + 0x1_0000_0001` | 42 (low 32) |
-| `c08_unsigned_div` | · | `100u / 7u` | 14 |
-| `c09_neg` | · | `UO_NEG` 42 | 214 (= -42 & 0xff) |
-| `c10_logical_not` | · | `UO_NOT 0` (zero-test → 0/1) | 1 |
-| `c11_shr_signed` | · | `-16 >>(s) 2` | 252 (= -4 & 0xff) |
-| `c12_imul_i64` | · | i64 `7 * 6` | 42 |
+| `c05_div_mod` | ★ | `23 / 4 + 23 % 4` (signed) | 8 |
+| `c06_xor` | ★ | `0xa5 ^ 0x5a` | 255 |
+| `c07_iadd_i64` | ★ | i64 `0x1_0000_0029 + 0x1_0000_0001` | 42 (low 32) |
+| `c08_unsigned_div` | ★ | `100u / 7u` | 14 |
+| `c09_neg` | ★ | `UO_NEG` 42 | 214 (= -42 & 0xff) |
+| `c10_logical_not` | ★ | `UO_NOT 0` (zero-test → 0/1) | 1 |
+| `c11_shr_signed` | ★ | `-16 >>(s) 2` | 252 (= -4 & 0xff) |
+| `c12_imul_i64` | ★ | i64 `7 * 6` | 42 |
## Deferred groups