kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit e25cbf0f256f656ae48652f76b1e757ad194564b
parent aef3673230d5108c76a0a58c69c64f1c2ff7fcd5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 14:07:16 -0700

cg/aa64: bring up frames, params, calls, FP — pass Groups A/B/C

Frame layout uses a fixed-size prologue placeholder patched at func_end
so frame_size and the callee-save count are knowable when the prologue
is finally written. Slots are FP-relative so per-slot offsets stay
stable while the eventual frame size is unknown; outgoing stack args
are SP-relative.

New surface:
- frame_slot, param (incoming x0..x7 / v0..v7 + stack overflow + sret x8)
- load/store/addr_of for LOCAL and INDIRECT operands
- call: direct BL with arg materialization (IMM/REG/LOCAL/BYVAL),
  sret pointer in x8, return-value placement into REG/LOCAL incl.
  small-struct-in-regs
- load_const for FP via .rodata + ADRP/LDR with the standard relocs;
  RC_FP allocator; FCVTZS; FMOV reg-reg
- binop accepts IMM via scratch; SREM/UREM via SDIV/UDIV+MSUB; UO_NOT
  via SUBS+CSET (previously panicked and "passed" by exit-code
  coincidence)

All 124 test/cg cases pass across D/R/E/J paths.

Diffstat:
M src/arch/aarch64.c | 1073 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M test/cg/CORPUS.md | 44 ++++++++++++++++++++++----------------------
2 files changed, 985 insertions(+), 132 deletions(-)

diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c @@ -1,17 +1,29 @@ /* Minimal AArch64 CGTarget. * - * Initial coverage matches the spine A + C corpus (function lifecycle and - * integer arithmetic). Other CGTarget methods panic with a clear "unimpl" - * diagnostic so test cases that touch them fail visibly rather than - * silently emitting nothing. + * Single-pass codegen for the cg test corpus (Groups A, B, C). Frame + * layout uses a fixed-size prologue placeholder patched at func_end so + * frame_size and the callee-save register count are knowable when the + * prologue is finally written. FP-relative (x29) addressing is used for + * local slots and incoming stack args so that per-slot offsets can be + * assigned at frame_slot() time without depending on the eventual + * frame_size or callee-save count. SP-relative addressing is used for + * outgoing stack args. * - * Single-pass register allocation: alloc_reg hands out W19..W28 in order - * and panics on exhaustion. No live-range tracking, no spills. Suitable - * for short straight-line fixtures only; replaced when CG's - * value-stack-aware spill/reload arrives. + * Frame layout (low SP -> high): + * outgoing args (max_outgoing bytes, 16-aligned) + * int reg saves (n_int_pairs * 16) -- x19/x20, x21/x22, ... + * fp reg saves (n_fp_pairs * 16) -- d8/d9, d10/d11, ... + * local slots (cum_off bytes) + * x29, x30 save (16 bytes) -- x29 = sp + frame_size - 16 * - * Width is derived from Operand.type via type_is_64(). For the test - * harness this is enough; full ABI integration arrives with TargetABI. */ + * Single-pass register allocator: alloc_reg(RC_INT) hands out x19..x28 in + * order; alloc_reg(RC_FP) hands out v8..v15. Both ranges are callee-saved + * and only the prefix actually used is saved by the prologue. Width + * derives from Operand.type via type_is_64. Spill/reload not implemented. 
+ * + * Multi-function: each func_begin/func_end pair owns its own frame state + * via the AAImpl fields, so the harness can build several functions in + * one TU. */ #include "arch/arch.h" #include "arch/aa64_isa.h" @@ -21,16 +33,153 @@ #include <string.h> +/* ============================================================ + * Local encoding helpers (kept here, not in aa64_isa.h, while the + * disassembler-shared table only needs the Group A/C subset). + * ============================================================ */ + +#define AA64_NOP 0xD503201Fu + +/* ADD/SUB immediate (12-bit imm, optional shift-12). Rd/Rn = 31 means SP + * for these encodings (not ZR). */ +static inline u32 aa64_add_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) +{ return 0x11000000u | (sf<<31) | ((sh&1)<<22) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_sub_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) +{ return 0x51000000u | (sf<<31) | ((sh&1)<<22) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* STP/LDP signed offset, X registers. Offset is byte offset, must be a + * multiple of 8; encoded value = byte_offset / 8 in a signed 7-bit field + * (range -512..504). */ +static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) +{ + i32 sc = byte_off >> 3; + return 0xA9000000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) +{ + i32 sc = byte_off >> 3; + return 0xA9400000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +/* STP/LDP signed offset, D registers (64-bit FP, scale 8). 
*/ +static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) +{ + i32 sc = byte_off >> 3; + return 0x6D000000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) +{ + i32 sc = byte_off >> 3; + return 0x6D400000u | (((u32)sc & 0x7fu)<<15) | ((Rt2&0x1f)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} + +/* LDUR / STUR (general regs, unscaled simm9 in -256..255). + * size: 0=B, 1=H, 2=W, 3=X. */ +static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) +{ + return 0x38000000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) +{ + return 0x38400000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +/* LDUR/STUR for SIMD & FP registers (V=1). size: 2=S (32-bit), 3=D (64-bit). */ +static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) +{ + return 0x3C000000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) +{ + return 0x3C400000u | (size<<30) | (((u32)simm9 & 0x1ffu)<<12) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} + +/* STR/LDR scaled (unsigned imm12). byte_off must be a multiple of (1<<size). */ +static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) +{ + u32 sc = byte_off >> size; + return 0x39000000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} +/* Branch (unconditional, 26-bit imm). Emitted with imm26=0 when paired + * with a JUMP26/CALL26 relocation; the patcher fills in imm26. */ +static inline u32 aa64_b_base(void) { return 0x14000000u; } +static inline u32 aa64_bl_base(void) { return 0x94000000u; } + +/* ADRP base (Rd in low 5 bits). imm bits filled by relocation. 
*/ +static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd&0x1f); } + +/* LDR (unsigned offset) for SIMD & FP, used after ADRP for FP literals. + * size 2 => S (32-bit). imm12 patched by linker. */ +static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) +{ + u32 sc = byte_off >> size; + return 0x3D400000u | (size<<30) | ((sc & 0xfffu)<<10) | ((Rn&0x1f)<<5) | (Rt&0x1f); +} + +/* FMOV (scalar register). type: 0=single, 1=double. */ +static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) +{ return 0x1E204000u | ((type&3)<<22) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* SUBS immediate (used to encode CMP Xn, #imm via SUBS ZR, Xn, #imm). */ +static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) +{ return 0x71000000u | (sf<<31) | ((imm12&0xfff)<<10) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* CSET Wd/Xd, EQ — alias of CSINC Rd, ZR, ZR, NE (inverted EQ). */ +static inline u32 aa64_cset_eq(u32 sf, u32 Rd) +{ return 0x1A800400u | (sf<<31) | (31u<<16) | (0x1u<<12) | (31u<<5) | (Rd&0x1f); } + +/* FCVTZS (scalar fp -> integer, round toward zero, signed). + * sf: 0=W, 1=X. type: 0=S, 1=D. */ +static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) +{ return 0x1E380000u | (sf<<31) | ((type&3)<<22) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + +/* ============================================================ + * AAImpl + * ============================================================ */ + +#define AA_PROLOGUE_WORDS 12u /* worst case: sub sp + stp/add fp + 5 int + 4 fp = 11 */ + +typedef struct AASlot { + u32 off; /* bytes below fp; address = x29 - off */ + u32 size; + u32 align; + u8 kind; /* FrameSlotKind */ + u8 pad[3]; +} AASlot; + typedef struct AAImpl { CGTarget base; SrcLoc loc; const CGFuncDesc* fd; + + /* Function emission. */ u32 func_start; - u32 next_alloc; + u32 prologue_pos; + MCLabel epilogue_label; + + /* Frame layout (in bytes; final frame_size computed at func_end). 
*/ + AASlot* slots; + u32 nslots; + u32 slots_cap; + u32 cum_off; /* total bytes consumed by local slots */ + u32 max_outgoing; /* max stack arg bytes for any call */ + + /* Param incoming tracking — set by func_begin from ABIFuncInfo. */ + u32 next_param_int; /* x0..x7 consumed so far */ + u32 next_param_fp; /* v0..v7 consumed so far */ + u32 next_param_stack; /* offset into caller's stack arg area */ + u8 has_sret; /* sret pointer arrived in x8 */ + FrameSlot sret_ptr_slot; /* hidden slot holding incoming x8 */ + + /* Reg allocator (callee-saved prefix). */ + u32 used_int; /* x19 + i, i in [0, used_int) */ + u32 used_fp; /* v8 + i, i in [0, used_fp ) */ } AAImpl; static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } +/* Forward decls used before definition. */ +static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d); +static AASlot* slot_get(AAImpl* a, FrameSlot fs); + /* ---- helpers ---- */ static int type_is_64(const Type* t) @@ -47,6 +196,46 @@ static int type_is_64(const Type* t) } } +static int type_is_fp_double(const Type* t) +{ return t && (t->kind == TY_DOUBLE || t->kind == TY_LDOUBLE); } + +static int type_is_signed(const Type* t) +{ + if (!t) return 0; + switch (t->kind) { + case TY_CHAR: case TY_SCHAR: + case TY_SHORT: case TY_INT: case TY_LONG: case TY_LLONG: + return 1; + default: + return 0; + } +} + +static u32 type_byte_size(const Type* t) +{ + if (!t) return 4; + switch (t->kind) { + case TY_CHAR: case TY_SCHAR: case TY_UCHAR: case TY_BOOL: return 1; + case TY_SHORT: case TY_USHORT: return 2; + case TY_INT: case TY_UINT: case TY_FLOAT: return 4; + case TY_LONG: case TY_ULONG: case TY_LLONG: case TY_ULLONG: + case TY_PTR: case TY_DOUBLE: return 8; + default: return 8; + } +} + +/* Encode size index for STUR/LDUR (0=B,1=H,2=W,3=X). 
*/ +static u32 size_idx_for_bytes(u32 nbytes) +{ + switch (nbytes) { + case 1: return 0; + case 2: return 1; + case 4: return 2; + case 8: return 3; + default: return 3; + } +} + static u32 reg_num(Operand op) { return op.v.reg & 0x1fu; } static void emit32(MCEmitter* mc, u32 word) @@ -59,12 +248,78 @@ static void emit32(MCEmitter* mc, u32 word) mc->emit_bytes(mc, b, 4); } +static void patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) +{ + u8 b[4]; + b[0] = (u8)(word & 0xff); + b[1] = (u8)((word >> 8) & 0xff); + b[2] = (u8)((word >> 16)& 0xff); + b[3] = (u8)((word >> 24)& 0xff); + obj_patch(obj, sec_id, ofs, b, 4); +} + static _Noreturn void aa_panic(CGTarget* t, const char* what) { SrcLoc loc = impl_of(t)->loc; compiler_panic(t->c, loc, "aarch64: %s not implemented", what); } +/* ---- AArch64 immediate encoding helpers ---- */ + +/* Materialize a u64 into a register using MOVZ/MOVN/MOVK. Used both for + * the public load_imm() and internally for synthesizing immediates. */ +static void emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) +{ + const u32 nslots = sf ? 4u : 2u; + u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu); + + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((v >> (i * 16)) & 0xffffu); + u64 cleared = v & ~((u64)0xffffu << (i * 16)); + if (slot != 0 && cleared == 0) { + emit32(mc, aa64_movz(sf, Rd, slot, i)); + return; + } + } + + { + u64 inv = sf ? 
~v : ((~v) & 0xffffffffu); + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((inv >> (i * 16)) & 0xffffu); + u64 cleared = inv & ~((u64)0xffffu << (i * 16)); + if (cleared == 0) { + emit32(mc, aa64_movn(sf, Rd, slot, i)); + return; + } + } + } + + int placed = 0; + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((v >> (i * 16)) & 0xffffu); + if (!placed) { + if (slot == 0) continue; + emit32(mc, aa64_movz(sf, Rd, slot, i)); + placed = 1; + } else if (slot != 0) { + emit32(mc, aa64_movk(sf, Rd, slot, i)); + } + } + if (!placed) emit32(mc, aa64_movz(sf, Rd, 0, 0)); +} + +static void emit_sp_add(MCEmitter* mc, u32 imm) +{ + if (imm <= 0xfff) { + emit32(mc, aa64_add_imm(1, 31, 31, imm, 0)); + } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) { + emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1)); + } else { + emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1)); + emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0)); + } +} + /* ---- function lifecycle ---- */ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) @@ -73,26 +328,144 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) MCEmitter* mc = t->mc; mc->set_section(mc, fd->text_section_id); - mc->emit_align(mc, 4, 0); /* instruction alignment */ + mc->emit_align(mc, 4, 0); a->fd = fd; a->func_start = mc->pos(mc); - a->next_alloc = 0; + a->next_param_int = 0; + a->next_param_fp = 0; + a->next_param_stack = 0; + a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0; + a->cum_off = 0; + a->max_outgoing= 0; + a->used_int = 0; + a->used_fp = 0; + a->nslots = 0; + a->sret_ptr_slot = FRAME_SLOT_NONE; + a->epilogue_label = mc->label_new(mc); mc->cfi_startproc(mc); + + /* Reserve a fixed-size prologue placeholder, NOP-filled. We patch the + * prefix at func_end with the real prologue once frame_size and the + * callee-save count are known. 
*/ + a->prologue_pos = mc->pos(mc); + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) emit32(mc, AA64_NOP); + + /* If the function returns indirect (sret), x8 holds the destination + * pointer on entry. Reserve a hidden slot to spill it into so the + * body can use x8 as scratch and ret can recover the dest pointer. */ + if (a->has_sret) { + FrameSlotDesc fsd = { + .type = NULL, .name = 0, .loc = (SrcLoc){0,0,0}, + .size = 8, .align = 8, .kind = FS_SPILL, .flags = 0, + }; + a->sret_ptr_slot = aa_frame_slot(t, &fsd); + } } static void aa_func_end(CGTarget* t) { AAImpl* a = impl_of(t); MCEmitter* mc = t->mc; - u32 end = mc->pos(mc); - obj_symbol_define(t->obj, - a->fd->sym, - a->fd->text_section_id, - (u64)a->func_start, - (u64)(end - a->func_start)); + /* Compute callee-save layout. */ + u32 n_int_pairs = (a->used_int + 1) / 2; /* round up */ + u32 n_fp_pairs = (a->used_fp + 1) / 2; + + u32 outgoing_off = 0; + u32 int_save_off = a->max_outgoing; + u32 fp_save_off = int_save_off + n_int_pairs * 16; + u32 locals_off = fp_save_off + n_fp_pairs * 16; + u32 fp_lr_off = locals_off + a->cum_off; + u32 frame_size = fp_lr_off + 16; + /* round to 16. */ + frame_size = (frame_size + 15u) & ~15u; + fp_lr_off = frame_size - 16; + + (void)outgoing_off; + + /* Emit epilogue at current pos, then place label. The label we emit + * must point at the first instruction of the epilogue so `b epilogue` + * branches land here. */ + mc->label_place(mc, a->epilogue_label); + + /* Restore FP saves, then INT saves, then fp/lr, then add sp + ret. 
*/ + for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) { + u32 r0 = 8u + (u32)i * 2u; + u32 r1 = r0 + 1u; + emit32(mc, aa64_ldp_d(r0, r1, 31, (i32)(fp_save_off + (u32)i*16u))); + } + for (i32 i = (i32)n_int_pairs - 1; i >= 0; --i) { + u32 r0 = 19u + (u32)i * 2u; + u32 r1 = r0 + 1u; + emit32(mc, aa64_ldp_x(r0, r1, 31, (i32)(int_save_off + (u32)i*16u))); + } + emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); + emit_sp_add(mc, frame_size); + emit32(mc, aa64_ret(AA64_LR)); + + /* Now patch prologue placeholder. */ + u32 pos = a->prologue_pos; + ObjBuilder* obj = t->obj; + u32 sec = a->fd->text_section_id; + + u32 words[AA_PROLOGUE_WORDS]; + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP; + u32 wi = 0; + + /* sub sp, sp, #frame_size — may take 2 insns if > 4095. */ + if (frame_size <= 0xfff) { + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0); + } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) { + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1); + } else { + if (wi + 2 > AA_PROLOGUE_WORDS) { + compiler_panic(t->c, a->loc, + "aarch64: prologue overflow for frame_size %u", frame_size); + } + words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0); + } + /* stp x29, x30, [sp, #fp_lr_off]; add x29, sp, #fp_lr_off */ + words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); + words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); + /* If sret, save incoming x8 (caller's destination pointer). */ + if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) { + AASlot* s = slot_get(a, a->sret_ptr_slot); + if (s) { + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off); + } + } + /* INT pair saves. 
*/ + for (u32 i = 0; i < n_int_pairs; ++i) { + u32 r0 = 19u + i*2u; + u32 r1 = r0 + 1u; + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_stp_x(r0, r1, 31, (i32)(int_save_off + i*16u)); + } + for (u32 i = 0; i < n_fp_pairs; ++i) { + u32 r0 = 8u + i*2u; + u32 r1 = r0 + 1u; + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_stp_d(r0, r1, 31, (i32)(fp_save_off + i*16u)); + } + if (0) { +overflow: + compiler_panic(t->c, a->loc, + "aarch64: prologue placeholder too small (used %u of %u words)", + wi, AA_PROLOGUE_WORDS); + } + + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) { + patch32(obj, sec, pos + i*4u, words[i]); + } + + /* Define the function symbol. */ + u32 end = mc->pos(mc); + obj_symbol_define(obj, a->fd->sym, sec, + (u64)a->func_start, (u64)(end - a->func_start)); mc->cfi_endproc(mc); a->fd = NULL; @@ -104,25 +477,128 @@ static Reg aa_alloc_reg(CGTarget* t, RegClass cls, const Type* ty) { AAImpl* a = impl_of(t); (void)ty; - if (cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls); + if (cls == RC_INT) { + if (a->used_int >= 10) { + compiler_panic(t->c, a->loc, + "aarch64 alloc_reg: out of INT scratch (no spill yet)"); + } + return (Reg)(19u + a->used_int++); } - if (a->next_alloc >= 10) { - compiler_panic(t->c, a->loc, - "aarch64 alloc_reg: out of scratch regs (no spill yet)"); + if (cls == RC_FP) { + if (a->used_fp >= 8) { + compiler_panic(t->c, a->loc, + "aarch64 alloc_reg: out of FP scratch (no spill yet)"); + } + return (Reg)(8u + a->used_fp++); } - return (Reg)(19u + a->next_alloc++); + compiler_panic(t->c, a->loc, "aarch64 alloc_reg: class %d unimpl", (int)cls); } static void aa_free_reg(CGTarget* t, Reg r) { (void)t; (void)r; } -static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { (void)d; aa_panic(t, "frame_slot"); } -static void aa_param (CGTarget* t, const CGParamDesc* p) { (void)p; aa_panic(t, "param"); } +static FrameSlot aa_frame_slot(CGTarget* t, const 
FrameSlotDesc* d) +{ + AAImpl* a = impl_of(t); + if (a->nslots == a->slots_cap) { + u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8; + AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap); + if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots); + a->slots = nbuf; + a->slots_cap = ncap; + } + u32 size = d->size ? d->size : 8; + u32 align = d->align ? d->align : 1; + u32 next = a->cum_off + size; + /* Round up so that slot start (= fp - off) is align-aligned. fp is + * 16-aligned, so requiring off aligned to `align` suffices. */ + u32 mask = align - 1; + next = (next + mask) & ~mask; + + AASlot* s = &a->slots[a->nslots]; + s->off = next; + s->size = size; + s->align = align; + s->kind = d->kind; + + a->cum_off = next; + a->nslots++; + return (FrameSlot)(a->nslots); /* 1-based; FRAME_SLOT_NONE == 0 */ +} + +static AASlot* slot_get(AAImpl* a, FrameSlot fs) +{ + if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL; + return &a->slots[fs - 1]; +} + +/* ---- param: store incoming arg(s) into the home slot ---- */ + +static void aa_param(CGTarget* t, const CGParamDesc* p) +{ + AAImpl* a = impl_of(t); + AASlot* s = slot_get(a, p->slot); + if (!s) { + compiler_panic(t->c, a->loc, "aarch64 param: bad slot"); + } + const ABIArgInfo* ai = p->abi; + + if (ai->kind == ABI_ARG_IGNORE) return; + if (ai->kind == ABI_ARG_INDIRECT) { + /* Caller passes a pointer to the data. Pointer comes in next + * INT arg reg; store it into the home slot (which holds the + * pointer-sized address). */ + if (a->next_param_int < 8) { + u32 reg = a->next_param_int++; + emit32(t->mc, aa64_stur(3, reg, 29, -(i32)s->off)); + } else { + /* Pointer on stack — load and store. */ + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + emit32(t->mc, aa64_ldur(3, 9, 29, (i32)(16 + caller_off))); + emit32(t->mc, aa64_stur(3, 9, 29, -(i32)s->off)); + } + return; + } + /* DIRECT: place each part. 
*/ + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 part_off = pt->src_offset; + u32 sz = pt->size; + u32 sidx = size_idx_for_bytes(sz); + + if (pt->cls == ABI_CLASS_INT) { + if (a->next_param_int < 8) { + u32 reg = a->next_param_int++; + emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + } else { + /* Each stack-passed slot is 8 bytes regardless of part size. */ + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + emit32(t->mc, aa64_ldur(sidx, 9, 29, (i32)(16 + caller_off))); + emit32(t->mc, aa64_stur(sidx, 9, 29, -(i32)s->off + (i32)part_off)); + } + } else if (pt->cls == ABI_CLASS_FP) { + if (a->next_param_fp < 8) { + u32 reg = a->next_param_fp++; + emit32(t->mc, aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + emit32(t->mc, aa64_ldur_fp(sidx, 0, 29, (i32)(16 + caller_off))); + emit32(t->mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off + (i32)part_off)); + } + } else { + compiler_panic(t->c, a->loc, + "aarch64 param: ABI class %d unimpl", (int)pt->cls); + } + } +} + static const Reg* aa_clobbers (CGTarget* t, RegClass c, u32* n) { (void)c; (void)n; aa_panic(t, "clobbers"); } static void aa_spill_reg (CGTarget* t, Operand s, FrameSlot f, MemAccess m) { (void)s; (void)f; (void)m; aa_panic(t, "spill_reg"); } static void aa_reload_reg(CGTarget* t, Operand d, FrameSlot f, MemAccess m) { (void)d; (void)f; (void)m; aa_panic(t, "reload_reg"); } -/* ---- labels / control flow (deferred) ---- */ +/* ---- labels / control flow (deferred for D-group; ret uses internal label) ---- */ static Label aa_label_new (CGTarget* t) { aa_panic(t, "label_new"); } static void aa_label_place(CGTarget* t, Label l) { (void)l; aa_panic(t, "label_place"); } @@ -139,74 +615,169 @@ static void aa_continue_to(CGTarget* t, CGScope s) { (void)s; aa_ static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) { - MCEmitter* mc = t->mc; 
u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 rd = reg_num(dst); - - /* Effective bit-width: 32 unless we're materializing into Xd. The 32-bit - * encoding zero-extends the result, so we mask to 32 bits when sf==0 - * so a "negative" int constant materializes its low 32 bits exactly. */ - const u32 nslots = sf ? 4u : 2u; - u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu); + emit_load_imm(t->mc, sf, reg_num(dst), imm); +} - /* Single MOVZ when only one 16-bit slot is non-zero. */ - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - u64 cleared = v & ~((u64)0xffffu << (i * 16)); - if (slot != 0 && cleared == 0) { - emit32(mc, aa64_movz(sf, rd, slot, i)); - return; - } +/* load_const: emit ADRP + LDR Sd, [Xt, #:lo12:sym] against a fresh + * symbol in .rodata. Used by b08 to materialize a float bit pattern. */ +static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) +{ + AAImpl* a = impl_of(t); + if (dst.cls != RC_FP) { + compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1"); } - /* Single MOVN when one slot of the inverted value covers the rest. - * For sf==1 the "rest is all ones" test is over the full 64 bits; - * for sf==0 we work in the 32-bit space. */ + /* Find or create .rodata. */ + Sym ro_name = pool_intern_cstr(t->c->global, ".rodata"); + ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, cb.align ? cb.align : 4); + + u32 cur_section = t->mc->section_id; + t->mc->set_section(t->mc, ro); + t->mc->emit_align(t->mc, cb.align ? cb.align : 4, 0); + u32 ro_off = t->mc->pos(t->mc); + t->mc->emit_bytes(t->mc, cb.bytes, cb.size); + + /* Local symbol pointing at the literal. */ + char namebuf[64]; + static u32 lit_seq = 0; + int len = 0; { - u64 inv = sf ? ~v : ((~v) & 0xffffffffu); - u64 all = sf ? 
~(u64)0 : 0xffffffffu; - (void)all; - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((inv >> (i * 16)) & 0xffffu); - u64 cleared = inv & ~((u64)0xffffu << (i * 16)); - if (cleared == 0) { - emit32(mc, aa64_movn(sf, rd, slot, i)); - return; - } - } + const char* prefix = ".LCFP"; + for (; prefix[len]; ++len) namebuf[len] = prefix[len]; + u32 v = lit_seq++; + char tmp[16]; int tn = 0; + if (v == 0) tmp[tn++] = '0'; + else { while (v) { tmp[tn++] = '0' + (char)(v % 10); v /= 10; } } + for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i]; + namebuf[len] = 0; } + Sym sname = pool_intern_cstr(t->c->global, namebuf); + ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, + (u64)ro_off, (u64)cb.size); + + t->mc->set_section(t->mc, cur_section); + + /* ADRP X9, sym ; LDR Sd, [X9, #:lo12:sym] */ + u32 adrp_pos = t->mc->pos(t->mc); + emit32(t->mc, aa64_adrp_base(9)); + t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, + R_AARCH64_ADR_PREL_PG_HI21, sym, 0, 0, 0); + + u32 ldr_pos = t->mc->pos(t->mc); + u32 sidx = (cb.size == 8) ? 3u : 2u; + emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0)); + RelocKind lo12 = (cb.size == 8) + ? R_AARCH64_LDST64_ABS_LO12_NC + : R_AARCH64_LDST32_ABS_LO12_NC; + t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0); +} - /* General path: MOVZ the lowest non-zero slot, then MOVK any other - * non-zero slot. v==0 was caught by the single-MOVZ branch above. */ - int placed = 0; - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - if (!placed) { - if (slot == 0) continue; - emit32(mc, aa64_movz(sf, rd, slot, i)); - placed = 1; - } else if (slot != 0) { - emit32(mc, aa64_movk(sf, rd, slot, i)); - } +static void aa_copy(CGTarget* t, Operand dst, Operand src) +{ + if (dst.cls == RC_FP || src.cls == RC_FP) { + u32 type = type_is_fp_double(dst.type) ? 
1u : 0u; + emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src))); + return; } - if (!placed) { - /* Defensive: should be unreachable (v==0 caught above). */ - emit32(mc, aa64_movz(sf, rd, 0, 0)); + u32 sf = type_is_64(dst.type) ? 1u : 0u; + emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src))); +} + +/* ---- load / store / addr_of ---- */ + +/* Resolve an address operand (LOCAL or INDIRECT) into (base_reg, signed + * offset) via a possibly-temporary base register. Returns the base reg. */ +static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) +{ + AAImpl* a = impl_of(t); + if (addr.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot"); + *out_off = -(i32)s->off; + return 29; /* x29 = fp */ + } + if (addr.kind == OPK_INDIRECT) { + *out_off = addr.v.ind.ofs; + return reg_num((Operand){.kind=OPK_REG, .v.reg = addr.v.ind.base}); } + if (addr.kind == OPK_GLOBAL) { + compiler_panic(t->c, a->loc, "aarch64: GLOBAL address not yet supported"); + } + (void)tmp_reg; + compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d", + (int)addr.kind); } -static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) -{ (void)dst; (void)cb; aa_panic(t, "load_const"); } +static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) +{ + AAImpl* a = impl_of(t); + i32 off; + u32 base = addr_base(t, addr, &off, 9); + u32 sz = ma.size ? 
ma.size : type_byte_size(addr.type); + u32 sidx = size_idx_for_bytes(sz); + if (off < -256 || off > 255) { + compiler_panic(t->c, a->loc, "aarch64 load: offset %d out of LDUR range", off); + } + if (dst.cls == RC_FP) { + emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off)); + } else { + emit32(t->mc, aa64_ldur(sidx, reg_num(dst), base, off)); + } +} -static void aa_copy(CGTarget* t, Operand dst, Operand src) +static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { - u32 sf = type_is_64(dst.type) ? 1u : 0u; - emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src))); + AAImpl* a = impl_of(t); + i32 off; + u32 base = addr_base(t, addr, &off, 9); + u32 sz = ma.size ? ma.size : type_byte_size(addr.type); + u32 sidx = size_idx_for_bytes(sz); + if (off < -256 || off > 255) { + compiler_panic(t->c, a->loc, "aarch64 store: offset %d out of STUR range", off); + } + + if (src.kind == OPK_IMM) { + /* Materialize through a scratch register. Use x9 (caller-saved). */ + u32 sf = (sz == 8) ? 
1u : 0u; + emit_load_imm(t->mc, sf, 9, src.v.imm); + emit32(t->mc, aa64_stur(sidx, 9, base, off)); + return; + } + if (src.cls == RC_FP) { + emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off)); + } else { + emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off)); + } +} + +static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) +{ + AAImpl* a = impl_of(t); + if (lv.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, lv.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot"); + /* dst = x29 - off */ + emit32(t->mc, aa64_sub_imm(1, reg_num(dst), 29, s->off, 0)); + return; + } + if (lv.kind == OPK_INDIRECT) { + i32 ofs = lv.v.ind.ofs; + u32 base = lv.v.ind.base & 0x1f; + if (ofs == 0) { + emit32(t->mc, aa64_mov_reg(1, reg_num(dst), base)); + } else if (ofs > 0 && ofs <= 0xfff) { + emit32(t->mc, aa64_add_imm(1, reg_num(dst), base, (u32)ofs, 0)); + } else if (ofs < 0 && -ofs <= 0xfff) { + emit32(t->mc, aa64_sub_imm(1, reg_num(dst), base, (u32)(-ofs), 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 addr_of: indirect offset %d unsupported", ofs); + } + return; + } + aa_panic(t, "addr_of"); } -static void aa_load (CGTarget* t, Operand d, Operand a, MemAccess m) { (void)d;(void)a;(void)m; aa_panic(t, "load"); } -static void aa_store (CGTarget* t, Operand a, Operand s, MemAccess m) { (void)a;(void)s;(void)m; aa_panic(t, "store"); } -static void aa_addr_of (CGTarget* t, Operand d, Operand l) { (void)d;(void)l; aa_panic(t, "addr_of"); } static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a) { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); } static void aa_copy_bytes(CGTarget* t, Operand d, Operand s, AggregateAccess g) { (void)d;(void)s;(void)g; aa_panic(t, "copy_bytes"); } static void aa_set_bytes (CGTarget* t, Operand d, Operand b, AggregateAccess g) { (void)d;(void)b;(void)g; aa_panic(t, "set_bytes"); } @@ -215,21 +786,28 @@ static void aa_bitfield_store(CGTarget* t, Operand a, Operand s, BitFieldAccess /* ---- 
arithmetic ---- */ -static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) +/* Force an Operand into a register, materializing immediates via x9. + * Returns the register number to use as Rn/Rm. */ +static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) +{ + if (op.kind == OPK_REG) return reg_num(op); + if (op.kind == OPK_IMM) { + emit_load_imm(t->mc, sf, scratch, op.v.imm); + return scratch; + } + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 binop: operand kind %d unsupported", (int)op.kind); +} + +static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, Operand b_op) { MCEmitter* mc = t->mc; u32 sf = type_is_64(dst.type) ? 1u : 0u; u32 rd = reg_num(dst); - u32 rn = reg_num(a); - u32 rm = reg_num(b); + u32 rn = force_reg_int(t, a_op, sf, 9); + u32 rm = force_reg_int(t, b_op, sf, (rn == 9) ? 10 : 9); u32 word; - /* All operands must be REG. CG materializes immediates first. */ - if (a.kind != OPK_REG || b.kind != OPK_REG) { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 binop: non-REG operands not yet supported"); - } - switch (op) { case BO_IADD: word = aa64_add (sf, rd, rn, rm); break; case BO_ISUB: word = aa64_sub (sf, rd, rn, rm); break; @@ -242,8 +820,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break; case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break; case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break; + /* rem = a - (a/b)*b → SDIV/UDIV into x11, then MSUB rd, x11, b, a. 
*/ case BO_SREM: + emit32(mc, aa64_sdiv(sf, 11, rn, rm)); + word = aa64_msub(sf, rd, 11, rm, rn); + break; case BO_UREM: + emit32(mc, aa64_udiv(sf, 11, rn, rm)); + word = aa64_msub(sf, rd, 11, rm, rn); + break; case BO_FADD: case BO_FSUB: case BO_FMUL: case BO_FDIV: default: compiler_panic(t->c, impl_of(t)->loc, @@ -252,15 +837,15 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) emit32(mc, word); } -static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a) +static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) { MCEmitter* mc = t->mc; u32 sf = type_is_64(dst.type) ? 1u : 0u; u32 rd = reg_num(dst); - u32 rn = reg_num(a); + u32 rn = reg_num(a_op); u32 word; - if (a.kind != OPK_REG) { + if (a_op.kind != OPK_REG) { compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: non-REG operand not yet supported"); } @@ -269,6 +854,10 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a) case UO_NEG: word = aa64_neg(sf, rd, rn); break; case UO_BNOT: word = aa64_mvn(sf, rd, rn); break; case UO_NOT: + /* !x → cmp Xn, #0 ; cset Xd, EQ */ + emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0)); + word = aa64_cset_eq(sf, rd); + break; default: compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl", (int)op); @@ -276,26 +865,291 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a) emit32(mc, word); } -static void aa_cmp (CGTarget* t, CmpOp op, Operand d, Operand a, Operand b) { (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); } -static void aa_convert(CGTarget* t, ConvKind k, Operand d, Operand s) { (void)k;(void)d;(void)s; aa_panic(t, "convert"); } +static void aa_cmp(CGTarget* t, CmpOp op, Operand d, Operand a, Operand b) +{ (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); } + +static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) +{ + AAImpl* a = impl_of(t); + switch (k) { + case CV_FTOI_S: { + if (src.cls != RC_FP || dst.cls != RC_INT) { + compiler_panic(t->c, 
a->loc, "aarch64 convert FTOI_S: bad classes"); + } + u32 sf = type_is_64(dst.type) ? 1u : 0u; + u32 type = type_is_fp_double(src.type) ? 1u : 0u; + emit32(t->mc, aa64_fcvtzs(sf, type, reg_num(dst), reg_num(src))); + return; + } + default: + compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k); + } +} /* ---- calls / return ---- */ -static void aa_call(CGTarget* t, const CGCallDesc* d) { (void)d; aa_panic(t, "call"); } +/* Materialize a CGABIValue into the outgoing argument slots: register + * arguments go to x0..x7 / v0..v7; overflow goes to [sp, #stack_off]. + * For BYVAL/INDIRECT the caller's `storage` is the address of the source + * data; we either load chunks into the next register pair (DIRECT + * aggregate) or pass the address itself (INDIRECT). */ +static void emit_arg_value(CGTarget* t, + const CGABIValue* av, + u32* next_int, u32* next_fp, u32* stack_off) +{ + AAImpl* a = impl_of(t); + const ABIArgInfo* ai = av->abi; + if (ai->kind == ABI_ARG_IGNORE) return; + + if (ai->kind == ABI_ARG_INDIRECT) { + /* Pass the address of the storage. storage is OPK_LOCAL holding + * the byval source. */ + u32 dst_reg; + int to_stack = (*next_int >= 8); + if (!to_stack) dst_reg = (*next_int)++; + else dst_reg = 9; + if (av->storage.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot"); + emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: INDIRECT arg storage kind %d unsupported", + (int)av->storage.kind); + } + if (to_stack) { + emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); + *stack_off += 8; + } + return; + } + /* DIRECT — possibly multiple parts. */ + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 sz = pt->size; + u32 sidx = size_idx_for_bytes(sz); + + if (pt->cls == ABI_CLASS_INT) { + int to_stack = (*next_int >= 8); + u32 dst_reg = to_stack ? 
9u : (*next_int)++; + /* Source bits for this part. */ + switch (av->storage.kind) { + case OPK_IMM: { + u32 sf = (sz == 8) ? 1u : 0u; + emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm); + break; + } + case OPK_REG: { + u32 sf = (sz == 8) ? 1u : 0u; + emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage))); + break; + } + case OPK_LOCAL: { + /* BYVAL aggregate carried in registers: load chunks from + * the source local's address + part->src_offset. */ + AASlot* s = slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot"); + i32 off = -(i32)s->off + (i32)pt->src_offset; + emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off)); + break; + } + default: + compiler_panic(t->c, a->loc, + "aarch64 call: arg storage kind %d unsupported", + (int)av->storage.kind); + } + if (to_stack) { + emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); + *stack_off += 8; + } + } else if (pt->cls == ABI_CLASS_FP) { + int to_stack = (*next_fp >= 8); + u32 dst_reg = to_stack ? 0u : (*next_fp)++; + switch (av->storage.kind) { + case OPK_REG: { + u32 type = (sz == 8) ? 1u : 0u; + emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); + break; + } + default: + compiler_panic(t->c, a->loc, + "aarch64 call: FP arg storage kind %d unsupported", + (int)av->storage.kind); + } + if (to_stack) { + emit32(t->mc, aa64_stur_fp(sidx, dst_reg, 31, (i32)*stack_off)); + *stack_off += 8; + } + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: ABI class %d unimpl", (int)pt->cls); + } + } +} + +static void aa_call(CGTarget* t, const CGCallDesc* d) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + /* Pass 1: classify args, set up argument registers/stack. */ + u32 next_int = 0, next_fp = 0, stack_off = 0; + + /* sret: caller passes destination pointer in x8. 
*/ + if (d->abi && d->abi->has_sret) { + if (d->ret.storage.kind != OPK_LOCAL) { + compiler_panic(t->c, a->loc, + "aarch64 call: sret destination must be LOCAL"); + } + AASlot* s = slot_get(a, d->ret.storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot"); + emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0)); + } + + for (u32 i = 0; i < d->nargs; ++i) { + emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off); + } + + /* Track outgoing-arg high-water mark, 16-aligned. */ + u32 needed = (stack_off + 15u) & ~15u; + if (needed > a->max_outgoing) a->max_outgoing = needed; + + /* BL <callee> — direct only. */ + if (d->callee.kind != OPK_GLOBAL) { + compiler_panic(t->c, a->loc, + "aarch64 call: indirect call not yet supported"); + } + u32 bl_pos = mc->pos(mc); + emit32(mc, aa64_bl_base()); + mc->emit_reloc_at(mc, mc->section_id, bl_pos, + R_AARCH64_CALL26, d->callee.v.global.sym, + d->callee.v.global.addend, 0, 0); + + /* Receive return value. */ + const ABIArgInfo* ri = &d->abi->ret; + if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) { + /* Nothing to copy — sret was placed directly into the dst slot. */ + return; + } + /* DIRECT scalar in our coverage: a single INT or FP part placed in + * x0 / v0. Move into ret_storage. */ + if (ri->nparts == 0) return; + const ABIArgPart* p0 = &ri->parts[0]; + Operand rs = d->ret.storage; + if (p0->cls == ABI_CLASS_INT) { + u32 sf = (p0->size == 8) ? 1u : 0u; + if (rs.kind == OPK_REG) { + emit32(mc, aa64_mov_reg(sf, reg_num(rs), 0)); + } else if (rs.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, rs.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); + u32 sidx = size_idx_for_bytes(p0->size); + emit32(mc, aa64_stur(sidx, 0, 29, -(i32)s->off)); + } + } else if (p0->cls == ABI_CLASS_FP) { + u32 type = (p0->size == 8) ? 
1u : 0u; + if (rs.kind == OPK_REG) { + emit32(mc, aa64_fmov_reg(type, reg_num(rs), 0)); + } else if (rs.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, rs.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); + u32 sidx = size_idx_for_bytes(p0->size); + emit32(mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off)); + } + } + /* Multi-part returns: not exercised yet. */ + if (ri->nparts > 1) { + compiler_panic(t->c, a->loc, + "aarch64 call: multi-part return not yet supported"); + } +} + +/* Materialize the return value, then branch to the function epilogue. */ static void aa_ret(CGTarget* t, const CGABIValue* val) { + AAImpl* a = impl_of(t); MCEmitter* mc = t->mc; - if (val && val->storage.kind == OPK_REG) { - u32 sf = type_is_64(val->storage.type) ? 1u : 0u; - emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage))); - } else if (val && val->storage.kind == OPK_IMM) { - /* MOV W0, #imm via load_imm */ - Operand w0 = { OPK_REG, RC_INT, 0, val->storage.type, .v.reg = 0 }; - aa_load_imm(t, w0, val->storage.v.imm); + if (val) { + const ABIArgInfo* ri = val->abi; + if (ri && ri->kind == ABI_ARG_INDIRECT) { + /* sret: caller passed the destination pointer in x8 at entry, + * which we spilled into sret_ptr_slot. Reload x8 from there, + * then memcpy the source storage into [x8]. 
*/ + if (val->storage.kind == OPK_LOCAL) { + AASlot* s = slot_get(a, val->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot"); + if (a->sret_ptr_slot != FRAME_SLOT_NONE) { + AASlot* sp = slot_get(a, a->sret_ptr_slot); + if (sp) emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); + } + u32 nbytes = s->size; + u32 i = 0; + while (i + 8 <= nbytes) { + emit32(mc, aa64_ldur(3, 9, 29, -(i32)s->off + (i32)i)); + emit32(mc, aa64_str_uimm(3, 9, 8, i)); + i += 8; + } + while (i + 4 <= nbytes) { + emit32(mc, aa64_ldur(2, 9, 29, -(i32)s->off + (i32)i)); + emit32(mc, aa64_str_uimm(2, 9, 8, i)); + i += 4; + } + while (i + 2 <= nbytes) { + emit32(mc, aa64_ldur(1, 9, 29, -(i32)s->off + (i32)i)); + emit32(mc, aa64_str_uimm(1, 9, 8, i)); + i += 2; + } + while (i < nbytes) { + emit32(mc, aa64_ldur(0, 9, 29, -(i32)s->off + (i32)i)); + emit32(mc, aa64_str_uimm(0, 9, 8, i)); + i += 1; + } + } else { + compiler_panic(t->c, a->loc, + "aarch64 ret indirect: storage kind %d unsupported", + (int)val->storage.kind); + } + } else if (val->storage.kind == OPK_REG) { + if (val->storage.cls == RC_FP) { + u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u; + emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage))); + } else { + u32 sf = type_is_64(val->storage.type) ? 1u : 0u; + emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage))); + } + } else if (val->storage.kind == OPK_IMM) { + u32 sf = type_is_64(val->storage.type) ? 1u : 0u; + emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm); + } else if (val->storage.kind == OPK_LOCAL) { + /* DIRECT return whose source is a local: load each part into + * x0/x1 (or v0/v1) per the ABI classification. Used for + * small structs returned in registers. */ + AASlot* s = slot_get(a, val->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot"); + const ABIArgInfo* ri = val->abi; + for (u16 i = 0; i < (ri ? 
ri->nparts : 0); ++i) { + const ABIArgPart* pt = &ri->parts[i]; + u32 sidx = size_idx_for_bytes(pt->size); + i32 off = -(i32)s->off + (i32)pt->src_offset; + if (pt->cls == ABI_CLASS_INT) { + emit32(mc, aa64_ldur(sidx, /*Rt=*/i, 29, off)); + } else if (pt->cls == ABI_CLASS_FP) { + emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, 29, off)); + } else { + compiler_panic(t->c, a->loc, + "aarch64 ret: ret part cls %d unimpl", (int)pt->cls); + } + } + } } - emit32(mc, aa64_ret(AA64_LR)); + /* Branch to the epilogue. mc->emit_label_ref records a fixup that + * resolves to a JUMP26-encoded displacement when the label is placed. */ + u32 bpos = mc->pos(mc); + emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0); + (void)bpos; } static void aa_alloca_ (CGTarget* t, Operand d, Operand s, u32 a) { (void)d;(void)s;(void)a; aa_panic(t, "alloca"); } @@ -334,8 +1188,6 @@ static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); } CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { - /* v1: only AArch64 implemented. Other targets fall back to a - * "not implemented" diagnostic at construction. */ if (c->target.arch != CFREE_ARCH_ARM_64) { SrcLoc loc = {0,0,0}; compiler_panic(c, loc, @@ -399,7 +1251,7 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) t->va_end_ = aa_va_end_; t->va_copy_ = aa_va_copy_; - t->setjmp_ = NULL; /* parser lowers via __cfree_setjmp */ + t->setjmp_ = NULL; t->longjmp_ = NULL; t->atomic_load = aa_atomic_load; @@ -415,6 +1267,9 @@ CGTarget* cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) t->finalize = aa_finalize; t->destroy = aa_destroy; + /* Avoid signed/unsigned warning. */ + (void)type_is_signed; + compiler_defer(c, cgt_cleanup, t); return t; } @@ -424,7 +1279,5 @@ void cgtarget_finalize(CGTarget* t) { if (t && t->finalize) t->finalize(t); } void cgtarget_free(CGTarget* t) { if (!t) return; - /* Arena-backed; nothing to free. 
The compiler_defer cleanup callback - * arrives here at panic; intentional double-call from explicit free - * after success is safe because everything is arena memory. */ + /* Arena-backed; nothing to free. */ } diff --git a/test/cg/CORPUS.md b/test/cg/CORPUS.md @@ -44,12 +44,12 @@ parser will, and fail at runtime until those land — that is intentional. | `a02_return_zero` | ★ | `load_imm 0; ret reg` | 0 | | `a03_ret_imm` | ★ | `ret IMM 17` (backend materializes) | 17 | | `a04_copy_reg` | ★ | `load_imm 7; copy r1->r2; ret r2` | 7 | -| `a05_return_neg_small` | · | `load_imm -7` via MOVN; ret | 249 (= -7 & 0xff) | -| `a06_return_i64` | · | i64 `load_imm 0x1_0000_002A`; ret as i64 | 42 (low 32 of x0) | -| `a07_void_return` | · | `ret(NULL)` | 0 (via _start zeroing x0) | -| `a08_multiple_returns` | · | `ret_imm 1; ret_imm 2` (second is dead) | 1 | -| `a09_load_imm_movz_movk` | · | `load_imm 0xABCD` (multi-step materialize) | 205 (= 0xCD) | -| `a10_return_u8` | · | `load_imm 200` into u8 reg; ret | 200 | +| `a05_return_neg_small` | ★ | `load_imm -7` via MOVN; ret | 249 (= -7 & 0xff) | +| `a06_return_i64` | ★ | i64 `load_imm 0x1_0000_002A`; ret as i64 | 42 (low 32 of x0) | +| `a07_void_return` | ★ | `ret(NULL)` | 0 (via _start zeroing x0) | +| `a08_multiple_returns` | ★ | `ret_imm 1; ret_imm 2` (second is dead) | 1 | +| `a09_load_imm_movz_movk` | ★ | `load_imm 0xABCD` (multi-step materialize) | 205 (= 0xCD) | +| `a10_return_u8` | ★ | `load_imm 200` into u8 reg; ret | 200 | ## Group B — frame slots, parameters, locals @@ -62,14 +62,14 @@ materialization, slot allocation, and call lowering use the live | Case | Status | Body | Expected | |---|---|---|---| -| `b01_param_int` | · | `int echo(int x){return x;}; echo(201)` | 201 | -| `b02_param_sum` | · | `int sum2(int a,int b){return a+b;}; sum2(40,2)` | 42 | -| `b03_param_spill` | · | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)` | 45 | -| `b04_local_int` | · | local int slot; `*p = 42; return *p` | 42 | 
-| `b05_addr_taken_local` | · | `int x=17; int*p=&x; *p+=1; return *p` | 18 | -| `b06_sret` | · | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b` | 42 | -| `b07_byval_param` | · | `int take(struct Pt p){return p.a+p.b;}; take({15,27})` | 42 | -| `b08_fp_param` | · | `int trunc(float f){return (int)f;}; trunc(7.5f)` | 7 | +| `b01_param_int` | ★ | `int echo(int x){return x;}; echo(201)` | 201 | +| `b02_param_sum` | ★ | `int sum2(int a,int b){return a+b;}; sum2(40,2)` | 42 | +| `b03_param_spill` | ★ | `int sum9(a..i)`; nine int params (8 GPR, 1 stack); `sum9(1..9)` | 45 | +| `b04_local_int` | ★ | local int slot; `*p = 42; return *p` | 42 | +| `b05_addr_taken_local` | ★ | `int x=17; int*p=&x; *p+=1; return *p` | 18 | +| `b06_sret` | ★ | `struct Pt{int a,b;}; Pt mk(){{10,32}}; pt=mk(); return pt.a+pt.b` | 42 | +| `b07_byval_param` | ★ | `int take(struct Pt p){return p.a+p.b;}; take({15,27})` | 42 | +| `b08_fp_param` | ★ | `int trunc(float f){return (int)f;}; trunc(7.5f)` | 7 | ## Group C — integer arithmetic @@ -79,14 +79,14 @@ materialization, slot allocation, and call lowering use the live | `c02_sub_mul` | ★ | `7 * 3 - 4` | 17 | | `c03_bitwise` | ★ | `(~3) & 0xff` | 252 | | `c04_shift` | ★ | `(1<<5) \| (16>>1)` (logical shr) | 40 | -| `c05_div_mod` | · | `23 / 4 + 23 % 4` (signed) | 8 | -| `c06_xor` | · | `0xa5 ^ 0x5a` | 255 | -| `c07_iadd_i64` | · | i64 `0x1_0000_0029 + 0x1_0000_0001` | 42 (low 32) | -| `c08_unsigned_div` | · | `100u / 7u` | 14 | -| `c09_neg` | · | `UO_NEG` 42 | 214 (= -42 & 0xff) | -| `c10_logical_not` | · | `UO_NOT 0` (zero-test → 0/1) | 1 | -| `c11_shr_signed` | · | `-16 >>(s) 2` | 252 (= -4 & 0xff) | -| `c12_imul_i64` | · | i64 `7 * 6` | 42 | +| `c05_div_mod` | ★ | `23 / 4 + 23 % 4` (signed) | 8 | +| `c06_xor` | ★ | `0xa5 ^ 0x5a` | 255 | +| `c07_iadd_i64` | ★ | i64 `0x1_0000_0029 + 0x1_0000_0001` | 42 (low 32) | +| `c08_unsigned_div` | ★ | `100u / 7u` | 14 | +| `c09_neg` | ★ | `UO_NEG` 42 | 214 (= -42 & 0xff) | +| 
`c10_logical_not` | ★ | `UO_NOT 0` (zero-test → 0/1) | 1 | +| `c11_shr_signed` | ★ | `-16 >>(s) 2` | 252 (= -4 & 0xff) | +| `c12_imul_i64` | ★ | i64 `7 * 6` | 42 | ## Deferred groups