kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit edbd83e9f471e5ce53f0bd6716d5d7d9476ba302
parent 19d5d73f838dcb2dfede65d2724fc9a0c891a6c8
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 26 May 2026 15:41:39 -0700

aa64: delete old CGTarget backend, switch to NativeDirectTarget

Remove the entire old CGTarget-level aarch64 backend: alloc.c (spill/reload,
labels, scopes), emit.c (function lifecycle, frame layout, prologue/epilogue),
internal.h (AAImpl struct and helpers), ops.c (per-instruction CGTarget
methods), and opt_coord.c (optimizer coordination).

Replace aa64_cgtarget_new with aa64_native_target_new +
aa64_native_direct_ops, routing through NativeDirectTarget.  Add native.c
implementing the NativeTarget vtable (func lifecycle, frame slots, move,
load/store, binop/cmp, call planning, and prologue/epilogue patching).

Update the inline-asm operand binder in asm.c/asm.h to use arch-private
AA64_INLINE_OPK_REG / AA64_INLINE_OPCLS_* pseudo-kinds instead of the
semantic OPK_REG, matching the new division where semantic targets never
expose physical registers.

Diffstat:
M src/arch/aa64/aa64.h | 6 +++++-
D src/arch/aa64/alloc.c | 319 -------------------------------------------------------------------------------
M src/arch/aa64/arch.c | 34 ++++++++++++++++++++++++++++------
M src/arch/aa64/asm.c | 40 ++++++++++++++++++++++++++++++++++------
M src/arch/aa64/asm.h | 12 ++++++++++++
D src/arch/aa64/emit.c | 874 -------------------------------------------------------------------------------
D src/arch/aa64/internal.h | 355 -------------------------------------------------------------------------------
A src/arch/aa64/native.c | 3226 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/arch/aa64/ops.c | 2908 -------------------------------------------------------------------------------
D src/arch/aa64/opt_coord.c | 373 -------------------------------------------------------------------------------
10 files changed, 3305 insertions(+), 4842 deletions(-)

diff --git a/src/arch/aa64/aa64.h b/src/arch/aa64/aa64.h @@ -2,7 +2,11 @@ #define CFREE_ARCH_AA64_H #include "arch/arch.h" +#include "arch/native_target.h" -CGTarget* aa64_cgtarget_new(Compiler*, ObjBuilder*, MCEmitter*); +typedef struct NativeOps NativeOps; + +NativeTarget* aa64_native_target_new(Compiler*, ObjBuilder*, MCEmitter*); +const NativeOps* aa64_native_direct_ops(void); #endif diff --git a/src/arch/aa64/alloc.c b/src/arch/aa64/alloc.c @@ -1,319 +0,0 @@ -/* aarch64/alloc.c — spill/reload, labels, control flow, structured scopes. */ - -#include "arch/aa64/internal.h" - -/* ============================================================ - * AAImpl accessor - * ============================================================ */ - -AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } - -/* ============================================================ - * Slot accessor - * ============================================================ */ - -AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) { - if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL; - return &a->slots[fs - 1]; -} - -static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out, - RegClass* cls_out) { - (void)t; - Slice ns = pool_slice(t->c->global, name); - if (!ns.s || !ns.len) return 1; - char buf[8]; - if (ns.len >= sizeof buf) return 1; - memcpy(buf, ns.s, ns.len); - buf[ns.len] = '\0'; - u32 dwarf; - if (aa64_register_index(buf, &dwarf) != 0) return 1; - if (dwarf <= 30u) { - if (out) *out = (Reg)dwarf; - if (cls_out) *cls_out = RC_INT; - return 0; - } - if (dwarf >= 64u && dwarf <= 95u) { - if (out) *out = (Reg)(dwarf - 64u); - if (cls_out) *cls_out = RC_FP; - return 0; - } - return 1; -} - -static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot, - MemAccess ma) { - AAImpl* a = impl_of(t); - if (src.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 spill_reg: src is not OPK_REG"); - } - Operand addr; - memset(&addr, 0, sizeof addr); - addr.kind = OPK_LOCAL; - addr.cls = RC_INT; - 
addr.type = ma.type; - addr.v.frame_slot = slot; - aa_store(t, addr, src, ma); -} - -static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot, - MemAccess ma) { - AAImpl* a = impl_of(t); - if (dst.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 reload_reg: dst is not OPK_REG"); - } - Operand addr; - memset(&addr, 0, sizeof addr); - addr.kind = OPK_LOCAL; - addr.cls = RC_INT; - addr.type = ma.type; - addr.v.frame_slot = slot; - aa_load(t, dst, addr, ma); -} - -/* ============================================================ - * Labels / control flow - * ============================================================ */ - -static Label aa_label_new(CGTarget* t) { - return (Label)t->mc->label_new(t->mc); -} - -static void aa_label_place(CGTarget* t, Label l) { - t->mc->label_place(t->mc, (MCLabel)l); -} - -void aa_jump(CGTarget* t, Label l) { - MCEmitter* mc = t->mc; - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_JUMP26, 4, 0); -} - -static void aa_emit_zero64(MCEmitter* mc) { - static const u8 zero[8] = {0}; - mc->emit_bytes(mc, zero, sizeof zero); -} - -static void aa_load_label_addr(CGTarget* t, Operand dst, Label l) { - /* Reserve: - * insn0: ADR Xdst, label (patched to LDR literal if out of range) - * insn1: B .+12 (skip the inline literal) - * lit: .quad label (relocated fallback target if needed) - * - * The MC fixup range-checks ADR at label placement. In-range labels use the - * first instruction; out-of-range labels use the relocated literal slot. 
*/ - MCEmitter* mc = t->mc; - u32 rd; - if (dst.kind != OPK_REG) { - compiler_panic(t->c, mc->loc, "aa64: load_label_addr dst must be REG"); - } - rd = reg_num(dst); - aa64_emit32(mc, aa64_adr(rd, 0u, 0u)); - aa64_emit32(mc, aa64_b_base() | 3u); - aa_emit_zero64(mc); - mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_INTRA_LABEL_ADDR, 16, 0); -} - -static void aa_indirect_branch(CGTarget* t, Operand addr, const Label* targets, - u32 ntargets) { - /* BR Xn — register-indirect branch (no fixup needed). */ - MCEmitter* mc = t->mc; - (void)targets; - (void)ntargets; - if (addr.kind != OPK_REG) { - compiler_panic(t->c, mc->loc, "aa64: indirect_branch expects REG operand"); - } - aa64_emit32(mc, aa64_br(reg_num(addr))); -} - -static u32 cmp_to_cond(CmpOp op) { - switch (op) { - case CMP_EQ: - return 0x0u; - case CMP_NE: - return 0x1u; - case CMP_LT_U: - return 0x3u; - case CMP_LE_U: - return 0x9u; - case CMP_GT_U: - return 0x8u; - case CMP_GE_U: - return 0x2u; - case CMP_LT_S: - return 0xbu; - case CMP_LE_S: - return 0xdu; - case CMP_GT_S: - return 0xcu; - case CMP_GE_S: - return 0xau; - default: - return 0x0u; - } -} - -static u32 fp_cmp_to_cond(CmpOp op) { - switch (op) { - case CMP_EQ: - return 0x0u; /* equal; unordered is false */ - case CMP_NE: - return 0x1u; /* not equal; unordered is true */ - case CMP_LT_F: - return 0x4u; /* MI: less-than only */ - case CMP_LE_F: - return 0x9u; /* LS: less-or-equal only */ - case CMP_GT_F: - return 0xcu; /* GT excludes unordered */ - case CMP_GE_F: - return 0xau; /* GE excludes unordered */ - default: - return cmp_to_cond(op); - } -} - -void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) { - MCEmitter* mc = t->mc; - u32 sf = type_is_64(a_op.type) ? 
1u : 0u; - if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { - u32 imm12, sh; - if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) { - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh)); - return; - } - } - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); - aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm)); -} - -static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, - Label l) { - MCEmitter* mc = t->mc; - emit_cmp_ab(t, a, b); - aa64_emit32(mc, aa64_b_cond(cmp_to_cond(op))); - mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0); -} - -static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { - u32 sf_dst = type_is_64(dst.type) ? 1u : 0u; - if (a.cls == RC_FP || b.cls == RC_FP) { - u32 type = type_is_fp_double(a.type) ? 1u : 0u; - aa64_emit32(t->mc, aa64_fcmp(type, reg_num(a), reg_num(b))); - aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), fp_cmp_to_cond(op))); - return; - } - emit_cmp_ab(t, a, b); - aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), cmp_to_cond(op))); -} - -/* ============================================================ - * Structured scopes - * ============================================================ */ - -static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) { - AAImpl* a = impl_of(t); - if (a->nscopes == a->scopes_cap) { - u32 ncap = a->scopes_cap ? 
a->scopes_cap * 2u : 4u; - AAScope* nb = arena_array(t->c->tu, AAScope, ncap); - if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes); - a->scopes = nb; - a->scopes_cap = ncap; - } - AAScope* sc = &a->scopes[a->nscopes]; - sc->kind = (u8)d->kind; - sc->has_else = 0; - sc->else_label = 0; - sc->end_label = 0; - sc->break_label = d->break_label; - sc->continue_label = d->continue_label; - - if (d->kind == SCOPE_IF) { - sc->else_label = t->mc->label_new(t->mc); - sc->end_label = t->mc->label_new(t->mc); - u32 sf = type_is_64(d->cond.type) ? 1u : 0u; - u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0); - aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0)); - aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/)); - t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0); - } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) { - /* bookkeep only */ - } else { - compiler_panic(t->c, a->loc, - "aarch64 scope_begin: kind %d not yet implemented", - (int)d->kind); - } - - a->nscopes++; - return (CGScope)a->nscopes; -} - -static void aa_scope_else(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa64_emit32(t->mc, aa64_b_base()); - t->mc->emit_label_ref(t->mc, sc->end_label, R_AARCH64_JUMP26, 4, 0); - t->mc->label_place(t->mc, sc->else_label); - sc->has_else = 1; -} - -static void aa_scope_end(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - if (sc->kind == SCOPE_IF) { - if (!sc->has_else) { - t->mc->label_place(t->mc, sc->else_label); - } - t->mc->label_place(t->mc, sc->end_label); - } -} - -static void aa_break_to(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > 
a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa_jump(t, sc->break_label); -} - -static void aa_continue_to(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa_jump(t, sc->continue_label); -} - -/* Expose vtable entries to ops.c constructor via a registration helper. - * ops.c calls this after the basic ops vtable is populated. */ -void aa_alloc_vtable_init(CGTarget* t) { - t->spill_reg = aa_spill_reg; - t->reload_reg = aa_reload_reg; - t->resolve_reg_name = aa_resolve_reg_name; - - t->label_new = aa_label_new; - t->label_place = aa_label_place; - t->jump = aa_jump; - t->cmp_branch = aa_cmp_branch; - t->cmp = aa_cmp; - t->load_label_addr = aa_load_label_addr; - t->indirect_branch = aa_indirect_branch; - - t->scope_begin = aa_scope_begin; - t->scope_else = aa_scope_else; - t->scope_end = aa_scope_end; - t->break_to = aa_break_to; - t->continue_to = aa_continue_to; -} diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c @@ -1,10 +1,13 @@ #include "arch/arch.h" +#include <string.h> + #include "arch/aa64/aa64.h" #include "arch/aa64/asm.h" #include "arch/aa64/disasm.h" #include "arch/aa64/isa.h" #include "arch/aa64/regs.h" +#include "cg/native_direct_target.h" #include "core/bytes.h" #include "link/link_arch.h" #include "obj/obj.h" @@ -107,18 +110,37 @@ static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) { return 0; } -static CGTarget* aa64_backend_make(Compiler* c, ObjBuilder* o, +static CgTarget* aa64_backend_make(Compiler* c, ObjBuilder* o, const CfreeCodeOptions* opts) { MCEmitter* mc = NULL; Debug* debug = NULL; - CGTarget* t; + CgTarget* t; + NativeTarget* native; + NativeDirectTargetConfig cfg; if (cg_mc_debug_new(c, o, opts, &mc, &debug) != CFREE_OK) return NULL; - t = 
aa64_cgtarget_new(c, o, mc); - if (!t) return NULL; - t->debug = debug; + (void)debug; + native = aa64_native_target_new(c, o, mc); + if (!native) return NULL; + memset(&cfg, 0, sizeof cfg); + cfg.native = native; + cfg.ops = aa64_native_direct_ops(); + t = native_direct_target_new(c, o, &cfg); return t; } +static CgTarget* aa64_semantic_target_new(Compiler* c, ObjBuilder* o, + MCEmitter* mc) { + NativeTarget* native; + NativeDirectTargetConfig cfg; + if (!mc) mc = mc_new(c, o); + native = aa64_native_target_new(c, o, mc); + if (!native) return NULL; + memset(&cfg, 0, sizeof cfg); + cfg.native = native; + cfg.ops = aa64_native_direct_ops(); + return native_direct_target_new(c, o, &cfg); +} + static const CfreePredefinedMacro aa64_predefined_macros[] = { {CFREE_SLICE_LIT("__aarch64__"), CFREE_SLICE_LIT("1")}, {CFREE_SLICE_LIT("__AARCH64EL__"), CFREE_SLICE_LIT("1")}, @@ -136,7 +158,7 @@ const ArchImpl arch_impl_aa64 = { .backend = {.name = "aa64", .make = aa64_backend_make}, .kind = CFREE_ARCH_ARM_64, .name = "aa64", - .cgtarget_new = aa64_cgtarget_new, + .cgtarget_new = aa64_semantic_target_new, .asm_new = aa64_arch_asm_new, .disasm_new = aa64_disasm_new, .apply_label_fixup = aa64_apply_label_fixup, diff --git a/src/arch/aa64/asm.c b/src/arch/aa64/asm.c @@ -22,6 +22,7 @@ #include "arch/arch.h" #include "asm/asm_helpers.h" #include "asm/asm_lex.h" +#include "cg/type.h" #include "core/arena.h" #include "core/pool.h" #include "core/slice.h" @@ -1232,6 +1233,8 @@ void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) { * deliberate reason. */ #define AA64_INLINE_LINE_CAP 1024 +_Noreturn static void inline_panic(AA64Asm* a, const char* msg); + /* Render a 5-bit register number into the StrBuf using the requested * width form. 
is64 picks x-form vs w-form; SP / ZR encode as * register #31 and we render them as wzr/xzr or wsp/sp depending on @@ -1247,6 +1250,24 @@ static void render_reg(StrBuf* sb, u32 reg, int is64) { strbuf_putc(sb, (char)('0' + (reg % 10u))); } +static void render_fp_reg(StrBuf* sb, u32 reg, u32 nbytes) { + strbuf_putc(sb, nbytes <= 4u ? 's' : 'd'); + if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u))); + strbuf_putc(sb, (char)('0' + (reg % 10u))); +} + +static u32 inline_op_size(AA64Asm* a, const Operand* op) { + if (!op->type) return 8u; + u64 n = cg_type_size(a->c, op->type); + if (!n) return 8u; + if (n > 16u) inline_panic(a, "inline asm operand is too large"); + return (u32)n; +} + +static int inline_op_is_ptr(AA64Asm* a, const Operand* op) { + return op->type && cg_type_is_ptr(a->c, op->type); +} + /* Render a signed 64-bit integer prefixed with '#'. */ static void render_imm(StrBuf* sb, i64 v) { strbuf_putc(sb, '#'); @@ -1279,12 +1300,14 @@ static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) { (idx < a->nout) ? 
&a->out_ops[idx] : &a->in_ops[idx - a->nout]; switch (form) { case 1: /* %wN — force 32-bit register form */ - if (op->kind != OPK_REG) inline_panic(a, "%w on non-register operand"); - render_reg(sb, (u32)op->v.reg, /*is64=*/0); + if (op->kind != AA64_INLINE_OPK_REG || op->pad[0] != AA64_INLINE_OPCLS_INT) + inline_panic(a, "%w on non-integer-register operand"); + render_reg(sb, (u32)op->v.local, 0); return; case 2: /* %xN — force 64-bit register form */ - if (op->kind != OPK_REG) inline_panic(a, "%x on non-register operand"); - render_reg(sb, (u32)op->v.reg, /*is64=*/1); + if (op->kind != AA64_INLINE_OPK_REG || op->pad[0] != AA64_INLINE_OPCLS_INT) + inline_panic(a, "%x on non-integer-register operand"); + render_reg(sb, (u32)op->v.local, 1); return; case 3: /* %aN — memory addressing form */ if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand"); @@ -1299,8 +1322,13 @@ static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) { } /* Default rendering by operand kind. */ switch (op->kind) { - case OPK_REG: - render_reg(sb, (u32)op->v.reg, /*is64=*/1); + case AA64_INLINE_OPK_REG: + if (op->pad[0] == AA64_INLINE_OPCLS_FP) { + render_fp_reg(sb, (u32)op->v.local, inline_op_size(a, op)); + } else { + render_reg(sb, (u32)op->v.local, + inline_op_is_ptr(a, op) || inline_op_size(a, op) > 4u); + } return; case OPK_IMM: render_imm(sb, op->v.imm); diff --git a/src/arch/aa64/asm.h b/src/arch/aa64/asm.h @@ -20,6 +20,18 @@ typedef struct ArchAsm ArchAsm; typedef struct AA64Asm AA64Asm; +/* Private pseudo operand used by the aa64 inline-asm binder. Semantic CG + * operands never expose physical registers, so native.c lowers register + * constraints into this arch-private shape before template substitution. + * Operand.v.local carries the 5-bit physical register number; pad[0] carries + * AA64_INLINE_OPCLS_*. + */ +enum { + AA64_INLINE_OPK_REG = 0xf0u, + AA64_INLINE_OPCLS_INT = 0u, + AA64_INLINE_OPCLS_FP = 1u, +}; + /* Construct/destroy. 
Pure: no allocations beyond the AA64Asm struct * itself (which lives on the compiler's TU arena). */ AA64Asm* aa64_asm_open(Compiler* c); diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c @@ -1,874 +0,0 @@ -/* aarch64/emit.c — instruction encoding helpers, function lifecycle, - * frame layout, parameter ABI, address materialization. */ - -#include "arch/aa64/internal.h" - -extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); -extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs, - u32 end_ofs); - -static void aa_emit_cfi_frame(CGTarget* t, u32 post_prologue_off, u32 fp_lr_off, - u32 int_save_off, u32 fp_save_off, u32 frame_size, - const u32* int_regs, u32 n_int_saves, - const u32* fp_regs, u32 n_fp_saves, - int omit_frame); - -/* ============================================================ - * Shared type / operand helpers - * ============================================================ */ - -int type_is_64(CfreeCgTypeId t) { - return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) || - t >= (CfreeCgTypeId)(2u << 6); -} - -int type_is_fp_double(CfreeCgTypeId t) { - return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64); -} - -int type_is_signed(CfreeCgTypeId t) { - (void)t; - return 0; -} - -u32 type_byte_size(CfreeCgTypeId t) { - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL)) - return 1; - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2; - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) - return 4; - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F128)) return 16; - return 8; -} - -u32 size_idx_for_bytes(u32 nbytes) { - switch (nbytes) { - case 1: - return 0; - case 2: - return 1; - case 4: - return 2; - case 8: - return 3; - case 16: - return 4; - default: - return 3; - } -} - -u32 reg_num(Operand op) { return op.v.reg & 0x1fu; } - -static u32 collect_mask_regs(u32 mask, u32 first, u32 
last, u32* out) { - u32 n = 0; - for (u32 r = first; r <= last; ++r) { - if (mask & (1u << r)) out[n++] = r; - } - return n; -} - -static u32 count_mask_regs(u32 mask, u32 first, u32 last) { - u32 n = 0; - for (u32 r = first; r <= last; ++r) { - if (mask & (1u << r)) ++n; - } - return n; -} - -static u32 aa_planned_prologue_words(const AAImpl* a) { - u32 n = AA_PROLOGUE_FRAME_WORDS; - if (a->has_sret) ++n; - n += count_mask_regs(a->planned_cs_int_mask, 19u, 28u); - n += count_mask_regs(a->planned_cs_fp_mask, 8u, 15u); - return n ? n : 1u; -} - -static void aa_func_begin_init(CGTarget* t, const CGFuncDesc* fd) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - mc->set_section(mc, fd->text_section_id); - mc->emit_align(mc, 4, 0); - - a->fd = fd; - a->func_start = mc->pos(mc); - mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); - a->next_param_int = 0; - a->next_param_fp = 0; - a->next_param_stack = 0; - a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0; - a->cum_off = 0; - a->max_outgoing = 0; - a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0; - a->used_cs_fp_mask = a->has_planned_regs ? a->planned_cs_fp_mask : 0; - a->prologue_words = - a->has_planned_regs ? aa_planned_prologue_words(a) : AA_PROLOGUE_WORDS; - a->post_prologue_off = 0; - a->planned_cs_int_mask = 0; - a->planned_cs_fp_mask = 0; - a->has_planned_regs = 0; - a->nslots = 0; - a->nscopes = 0; - a->has_alloca = 0; - a->known_frame = 0; - a->omit_frame = 0; - a->nadd_patches = 0; - a->sret_ptr_slot = FRAME_SLOT_NONE; - a->is_variadic = (fd->abi && fd->abi->variadic) ? 
1 : 0; - a->gp_save_slot = FRAME_SLOT_NONE; - a->fp_save_slot = FRAME_SLOT_NONE; - a->epilogue_label = mc->label_new(mc); - - mc->cfi_startproc(mc); -} - -static void aa_add_entry_frame_slots(CGTarget* t) { - AAImpl* a = impl_of(t); - - if (a->has_sret) { - FrameSlotDesc fsd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 8, - .align = 8, - .kind = FS_SPILL, - .flags = 0, - }; - a->sret_ptr_slot = aa_frame_slot(t, &fsd); - } - - if (a->is_variadic) { - FrameSlotDesc gpd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 64, - .align = 8, - .kind = FS_SPILL, - .flags = 0, - }; - a->gp_save_slot = aa_frame_slot(t, &gpd); - FrameSlotDesc fpd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 128, - .align = 16, - .kind = FS_SPILL, - .flags = 0, - }; - a->fp_save_slot = aa_frame_slot(t, &fpd); - } -} - -static void aa_emit_variadic_reg_saves(CGTarget* t) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (!a->is_variadic) return; - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - AASlot* fs = aa64_slot_get(a, a->fp_save_slot); - for (u32 i = 0; i < 8; ++i) - aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8)); - for (u32 i = 0; i < 8; ++i) - aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16)); -} - -/* ============================================================ - * Low-level emission - * ============================================================ */ - -void aa64_emit32(MCEmitter* mc, u32 word) { - u32 ofs = obj_pos(mc->obj, mc->section_id); - u8 b[4]; - b[0] = (u8)(word & 0xff); - b[1] = (u8)((word >> 8) & 0xff); - b[2] = (u8)((word >> 16) & 0xff); - b[3] = (u8)((word >> 24) & 0xff); - mc->emit_bytes(mc, b, 4); - if (mc->debug) { - debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); - } -} - -void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) { - u8 b[4]; - b[0] = (u8)(word & 0xff); - b[1] = (u8)((word >> 
8) & 0xff); - b[2] = (u8)((word >> 16) & 0xff); - b[3] = (u8)((word >> 24) & 0xff); - obj_patch(obj, sec_id, ofs, b, 4); -} - -/* ============================================================ - * Immediate encoding helpers - * ============================================================ */ - -void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) { - const u32 nslots = sf ? 4u : 2u; - u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu); - - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - u64 cleared = v & ~((u64)0xffffu << (i * 16)); - if (slot != 0 && cleared == 0) { - aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); - return; - } - } - - { - u64 inv = sf ? ~v : ((~v) & 0xffffffffu); - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((inv >> (i * 16)) & 0xffffu); - u64 cleared = inv & ~((u64)0xffffu << (i * 16)); - if (cleared == 0) { - aa64_emit32(mc, aa64_movn(sf, Rd, slot, i)); - return; - } - } - } - - int placed = 0; - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - if (!placed) { - if (slot == 0) continue; - aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); - placed = 1; - } else if (slot != 0) { - aa64_emit32(mc, aa64_movk(sf, Rd, slot, i)); - } - } - if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0)); -} - -void emit_sp_add(MCEmitter* mc, u32 imm) { - if (imm <= 0xfff) { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0)); - } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1)); - } else { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1)); - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0)); - } -} - -/* ============================================================ - * Function lifecycle - * ============================================================ */ - -void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - aa_func_begin_init(t, fd); - - 
a->prologue_pos = mc->pos(mc); - for (u32 i = 0; i < a->prologue_words; ++i) aa64_emit32(mc, AA64_NOP); - - aa_add_entry_frame_slots(t); - aa_emit_variadic_reg_saves(t); - /* Capture end-of-prologue position for CFI emission in func_end. */ - a->post_prologue_off = mc->pos(mc) - a->func_start; -} - -static u32 aa_build_prologue(CGTarget* t, u32* words, u32 cap, u32 frame_size, - u32 fp_lr_off, u32 int_save_off, u32 fp_save_off, - const u32* int_regs, u32 n_int_saves, - const u32* fp_regs, u32 n_fp_saves) { - AAImpl* a = impl_of(t); - u32 wi = 0; - - if (frame_size <= 0xfff) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0); - } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1); - } else { - if (wi + 2 > cap) goto overflow; - words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0); - } - if (fp_lr_off <= 504u) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); - } else { - if (wi + 2 > cap) goto overflow; - words[wi++] = aa64_str_uimm(3, 29, 31, fp_lr_off); - words[wi++] = aa64_str_uimm(3, 30, 31, fp_lr_off + 8u); - } - if (wi >= cap) goto overflow; - if (fp_lr_off <= 0xfffu) { - words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); - } else if ((fp_lr_off >> 24) == 0) { - u32 hi = (fp_lr_off >> 12) & 0xfffu; - u32 lo = fp_lr_off & 0xfffu; - if (hi) { - words[wi++] = aa64_add_imm(1, 29, 31, hi, 1); - if (lo) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_add_imm(1, 29, 29, lo, 0); - } - } else if (lo) { - words[wi++] = aa64_add_imm(1, 29, 31, lo, 0); - } - } else { - compiler_panic(t->c, a->loc, - "aarch64: fp/lr offset %u out of prologue range", fp_lr_off); - } - if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* s = aa64_slot_get(a, a->sret_ptr_slot); - if (s) { - if (wi >= cap) 
goto overflow; - words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off); - } - } - for (u32 i = 0; i < n_int_saves; ++i) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_str_uimm(3, int_regs[i], 31, int_save_off + i * 8u); - } - for (u32 i = 0; i < n_fp_saves; ++i) { - if (wi >= cap) goto overflow; - words[wi++] = aa64_str_fp_uimm(3, fp_regs[i], 31, fp_save_off + i * 8u); - } - return wi; - -overflow: - compiler_panic(t->c, a->loc, - "aarch64: prologue too small (used more than %u words)", cap); - return 0; -} - -static void aa_compute_frame(const AAImpl* a, u32 n_int_saves, u32 n_fp_saves, - u32* int_save_off, u32* fp_save_off, - u32* fp_lr_off, u32* frame_size) { - *int_save_off = a->max_outgoing; - *fp_save_off = *int_save_off + n_int_saves * 8u; - u32 locals_off = *fp_save_off + n_fp_saves * 8u; - *fp_lr_off = locals_off + a->cum_off; - *frame_size = *fp_lr_off + 16; - *frame_size = (*frame_size + 15u) & ~15u; - *fp_lr_off = *frame_size - 16; -} - -void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd, - const CGKnownFrameDesc* frame, - FrameSlot* out_slots) { - AAImpl* a = impl_of(t); - u32 int_regs[10]; - u32 fp_regs[8]; - u32 int_save_off, fp_save_off, fp_lr_off, frame_size; - u32 words[AA_PROLOGUE_WORDS]; - - aa_func_begin_init(t, fd); - a->known_frame = 1; - aa_add_entry_frame_slots(t); - for (u32 i = 0; frame && i < frame->nslots; ++i) { - FrameSlot fs = aa_frame_slot(t, &frame->slots[i]); - if (out_slots) out_slots[i] = fs; - } - if (frame) { - a->max_outgoing = frame->max_outgoing; - a->has_alloca = frame->has_alloca ? 
1u : 0u; - } - - u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs); - u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs); - if (frame && frame->may_omit_frame && frame->nslots == 0 && - frame->max_outgoing == 0 && !frame->has_alloca && !frame->has_call && - !a->has_sret && !a->is_variadic && n_int_saves == 0 && n_fp_saves == 0) { - a->omit_frame = 1; - return; - } - aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off, - &fp_lr_off, &frame_size); - - a->prologue_pos = t->mc->pos(t->mc); - u32 nwords = aa_build_prologue(t, words, AA_PROLOGUE_WORDS, frame_size, - fp_lr_off, int_save_off, fp_save_off, int_regs, - n_int_saves, fp_regs, n_fp_saves); - for (u32 i = 0; i < nwords; ++i) aa64_emit32(t->mc, words[i]); - aa_emit_variadic_reg_saves(t); - { - u32 post = t->mc->pos(t->mc) - a->func_start; - aa_emit_cfi_frame(t, post, fp_lr_off, int_save_off, fp_save_off, frame_size, - int_regs, n_int_saves, fp_regs, n_fp_saves, - /*omit_frame=*/0); - } -} - -/* CFI for the post-prologue state of an AArch64 frame. - * CFA = x29 + 16 (x29 points to saved-FP/LR pair; pre-call sp = x29+16) - * x29 saved at CFA-16, x30 (LR) at CFA-8 - * callee-saved ints/fps at their slot offsets - * pc_offset = end-of-prologue offset within the function. 
*/ -static void aa_emit_cfi_frame(CGTarget* t, u32 post_prologue_off, u32 fp_lr_off, - u32 int_save_off, u32 fp_save_off, u32 frame_size, - const u32* int_regs, u32 n_int_saves, - const u32* fp_regs, u32 n_fp_saves, - int omit_frame) { - MCEmitter* mc = t->mc; - if (omit_frame) return; - (void)fp_lr_off; - mc->cfi_set_next_pc_offset(mc, post_prologue_off); - mc->cfi_def_cfa(mc, 29u, 16); - mc->cfi_offset(mc, 29u, -16); - mc->cfi_offset(mc, 30u, -8); - { - u32 i; - for (i = 0; i < n_int_saves; ++i) { - i32 sp_off = (i32)int_save_off + (i32)i * 8; - i32 cfa_off = sp_off - (i32)frame_size; - mc->cfi_offset(mc, int_regs[i], cfa_off); - } - for (i = 0; i < n_fp_saves; ++i) { - /* AAPCS DWARF: V0=64, so D8..D15 → DWARF 72..79. */ - i32 sp_off = (i32)fp_save_off + (i32)i * 8; - i32 cfa_off = sp_off - (i32)frame_size; - mc->cfi_offset(mc, 64u + fp_regs[i], cfa_off); - } - } -} - -void aa_func_end(CGTarget* t) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - ObjBuilder* obj = t->obj; - u32 sec = a->fd->text_section_id; - - u32 int_regs[10]; - u32 fp_regs[8]; - u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs); - u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs); - - u32 int_save_off, fp_save_off, fp_lr_off, frame_size; - aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off, - &fp_lr_off, &frame_size); - - if (!a->known_frame) { - aa_emit_cfi_frame(t, a->post_prologue_off, fp_lr_off, int_save_off, - fp_save_off, frame_size, int_regs, n_int_saves, fp_regs, - n_fp_saves, /*omit_frame=*/a->omit_frame); - } - - if (a->omit_frame) goto finish; - - mc->label_place(mc, a->epilogue_label); - - if (a->has_alloca) { - if (fp_lr_off <= 0xfff) { - aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0)); - } else { - compiler_panic(t->c, a->loc, - "aarch64: has_alloca + fp_lr_off %u out of imm12 range", - fp_lr_off); - } - } - - for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { - u32 r0 = 
fp_regs[i]; - aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31, fp_save_off + (u32)i * 8u)); - } - for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { - u32 r0 = int_regs[i]; - aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u)); - } - if (fp_lr_off <= 504u) { - aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); - } else { - aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off)); - aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u)); - } - emit_sp_add(mc, frame_size); - aa64_emit32(mc, aa64_ret(AA64_LR)); - - if (!a->known_frame) { - u32 pos = a->prologue_pos; - u32 words[AA_PROLOGUE_WORDS]; - u32 prologue_words = - a->prologue_words ? a->prologue_words : AA_PROLOGUE_WORDS; - for (u32 i = 0; i < prologue_words; ++i) words[i] = AA64_NOP; - (void)aa_build_prologue(t, words, prologue_words, frame_size, fp_lr_off, - int_save_off, fp_save_off, int_regs, n_int_saves, - fp_regs, n_fp_saves); - for (u32 i = 0; i < prologue_words; ++i) - aa64_patch32(obj, sec, pos + i * 4u, words[i]); - } - - if (a->max_outgoing > 0xfff) { - compiler_panic( - t->c, a->loc, - "aarch64: max_outgoing %u out of imm12 range for alloca patch", - a->max_outgoing); - } - for (u32 i = 0; i < a->nadd_patches; ++i) { - u32 dr = a->add_patches[i].dst_reg; - u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0); - aa64_patch32(obj, sec, a->add_patches[i].pos, word); - } - -finish:; - u32 end = mc->pos(mc); - obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start, - (u64)(end - a->func_start)); - if (a->fd->atomize) { - obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->fd->sym, - 0); - } - if (t->debug) debug_func_pc_range(t->debug, sec, a->func_start, end); - - mc->cfi_endproc(mc); - mc_end_function(mc); - a->fd = NULL; -} - -/* ============================================================ - * Frame slots - * ============================================================ */ - -FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { - AAImpl* a = 
impl_of(t); - if (a->nslots == a->slots_cap) { - u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8; - AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap); - if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots); - a->slots = nbuf; - a->slots_cap = ncap; - } - u32 size = d->size ? d->size : 8; - u32 align = d->align ? d->align : 1; - u32 next = a->cum_off + size; - u32 mask = align - 1; - next = (next + mask) & ~mask; - - AASlot* s = &a->slots[a->nslots]; - s->off = next; - s->size = size; - s->align = align; - s->kind = d->kind; - - a->cum_off = next; - a->nslots++; - return (FrameSlot)(a->nslots); -} - -/* ============================================================ - * Parameters - * ============================================================ */ - -static void aa_consume_param_location(AAImpl* a, const ABIArgInfo* ai) { - if (!ai || ai->kind == ABI_ARG_IGNORE) return; - if (ai->kind == ABI_ARG_INDIRECT) { - if (a->next_param_int < 8) - ++a->next_param_int; - else { - a->next_param_stack += 8; - } - return; - } - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - if (pt->cls == ABI_CLASS_INT) { - if (a->next_param_int < 8) - ++a->next_param_int; - else - a->next_param_stack += 8; - } else if (pt->cls == ABI_CLASS_FP) { - if (a->next_param_fp < 8) - ++a->next_param_fp; - else - a->next_param_stack += pt->size > 8 ? pt->size : 8; - } - } -} - -CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p) { - AAImpl* a = impl_of(t); - CGLocalStorage st = p->storage; - if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { - FrameSlotDesc fsd = {0}; - fsd.type = p->type; - fsd.name = p->name; - fsd.loc = p->loc; - fsd.size = p->size; - fsd.align = p->align; - fsd.kind = FS_PARAM; - if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; - st.v.frame_slot = aa_frame_slot(t, &fsd); - } - AASlot* s = st.kind == CG_LOCAL_STORAGE_FRAME - ? 
aa64_slot_get(a, st.v.frame_slot) - : NULL; - if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) { - compiler_panic(t->c, a->loc, "aarch64 param: bad slot"); - } - const ABIArgInfo* ai = p->abi; - u32 incoming_stack_base = a->omit_frame ? 31u : 29u; - i32 incoming_stack_bias = a->omit_frame ? 0 : 16; - - if (ai->kind == ABI_ARG_IGNORE) return st; - if (st.kind == CG_LOCAL_STORAGE_REG && st.v.reg == (Reg)REG_NONE) { - aa_consume_param_location(a, ai); - return st; - } - if (st.kind == CG_LOCAL_STORAGE_REG) { - if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) { - compiler_panic( - t->c, a->loc, - "aarch64 param: register storage requires one direct part"); - } - const ABIArgPart* pt = &ai->parts[0]; - u32 sz = pt->size; - u32 sidx = size_idx_for_bytes(sz); - if (pt->cls == ABI_CLASS_INT) { - u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg}); - if (a->next_param_int < 8) { - u32 src = a->next_param_int++; - if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64)) { - aa64_emit32(t->mc, aa64_fmov_d_x(dst, src)); - } else if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) { - aa64_emit32(t->mc, aa64_fmov_s_w(dst, src)); - } else { - u32 sf = (sz == 8) ? 1u : 0u; - if (dst != src) aa64_emit32(t->mc, aa64_mov_reg(sf, dst, src)); - } - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) || - p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) { - aa64_emit_ldur_fp_off(t->mc, sidx, dst, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - } else { - aa64_emit_ldur_off(t->mc, sidx, dst, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - } - } - } else if (pt->cls == ABI_CLASS_FP) { - u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg}); - if (a->next_param_fp < 8) { - u32 src = a->next_param_fp++; - if (sz == 16) { - if (dst != src) aa64_emit32(t->mc, aa64_mov_v16b(dst, src)); - } else { - u32 type = (sz == 8) ? 
1u : 0u; - if (dst != src) aa64_emit32(t->mc, aa64_fmov_reg(type, dst, src)); - } - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += sz > 8 ? sz : 8; - if (sz == 16) - aa64_emit32(t->mc, - aa64_ldur_q(dst, incoming_stack_base, - incoming_stack_bias + (i32)caller_off)); - else - aa64_emit_ldur_fp_off(t->mc, sidx, dst, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - } - } else { - compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", - (int)pt->cls); - } - return st; - } - if (ai->kind == ABI_ARG_INDIRECT) { - u32 ptr_reg; - if (a->next_param_int < 8) { - ptr_reg = a->next_param_int++; - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - aa64_emit_ldur_off(t->mc, 3, AA_TMP0, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - ptr_reg = AA_TMP0; - } - u32 nbytes = s->size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit_ldur_off(t->mc, 3, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); - aa64_emit_stur_off(t->mc, 3, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit_ldur_off(t->mc, 2, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); - aa64_emit_stur_off(t->mc, 2, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit_ldur_off(t->mc, 1, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); - aa64_emit_stur_off(t->mc, 1, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2); - i += 2; - } - while (i < nbytes) { - aa64_emit_ldur_off(t->mc, 0, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); - aa64_emit_stur_off(t->mc, 0, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2); - i += 1; - } - return st; - } - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - u32 part_off = pt->src_offset; - u32 sz = pt->size; - u32 sidx = size_idx_for_bytes(sz); - - if (pt->cls == ABI_CLASS_INT) { - if (a->next_param_int < 8) { - u32 reg = a->next_param_int++; - aa64_emit_stur_off(t->mc, sidx, reg, 29, -(i32)s->off + 
(i32)part_off, - AA_TMP0); - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29, - -(i32)s->off + (i32)part_off, AA_TMP1); - } - } else if (pt->cls == ABI_CLASS_FP) { - if (a->next_param_fp < 8) { - u32 reg = a->next_param_fp++; - if (sz == 16) - aa64_emit32(t->mc, - aa64_stur_q(reg, 29, -(i32)s->off + (i32)part_off)); - else - aa64_emit_stur_fp_off(t->mc, sidx, reg, 29, - -(i32)s->off + (i32)part_off, AA_TMP0); - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += sz > 8 ? sz : 8; - if (sz == 16) { - aa64_emit32(t->mc, - aa64_ldur_q(AA_FP_TMP0, incoming_stack_base, - incoming_stack_bias + (i32)caller_off)); - aa64_emit32( - t->mc, aa64_stur_q(AA_FP_TMP0, 29, -(i32)s->off + (i32)part_off)); - } else { - aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, incoming_stack_base, - incoming_stack_bias + (i32)caller_off, AA_TMP0); - aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29, - -(i32)s->off + (i32)part_off, AA_TMP0); - } - } - } else { - compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", - (int)pt->cls); - } - } - return st; -} - -/* ============================================================ - * Address materialization helpers - * ============================================================ */ - -static int use_got_for_sym(CGTarget* t, ObjSymId sym) { - return obj_symbol_extern_via_got(t->c, t->obj, sym); -} - -void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(dst_reg)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0); - u32 ldr_pos = mc->pos(mc); - aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0)); - mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 
0, 0, 0); -} - -void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) { - MCEmitter* mc = t->mc; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, dst_reg, sym); - if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend); - return; - } - u32 sec = mc->section_id; - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(dst_reg)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend, - 0, 0); - u32 add_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0)); - mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0, - 0); -} - -void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) { - if (off == 0) { - aa64_emit32(mc, aa64_mov_reg(1, Rd, base)); - return; - } - u32 abs_off = (off < 0) ? (u32)(-off) : (u32)off; - if (abs_off <= 0xfff) { - if (off < 0) - aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0)); - else - aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0)); - return; - } - if ((abs_off >> 24) == 0) { - u32 hi = (abs_off >> 12) & 0xfff; - u32 lo = abs_off & 0xfff; - if (off < 0) { - if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1)); - if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0)); - } else { - if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1)); - if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? 
Rd : base, lo, 0)); - } - return; - } - aa64_emit_load_imm(mc, 1, Rd, off); - aa64_emit32(mc, aa64_add(1, Rd, base, Rd)); -} - -static int aa64_simm9_fits(i32 off) { return off >= -256 && off <= 255; } - -void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp) { - if (aa64_simm9_fits(off)) { - aa64_emit32(mc, aa64_ldur(size, Rt, Rn, off)); - return; - } - aa64_emit_addr_adjust(mc, tmp, Rn, off); - aa64_emit32(mc, aa64_ldur(size, Rt, tmp, 0)); -} - -void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp) { - if (aa64_simm9_fits(off)) { - aa64_emit32(mc, aa64_stur(size, Rt, Rn, off)); - return; - } - aa64_emit_addr_adjust(mc, tmp, Rn, off); - aa64_emit32(mc, aa64_stur(size, Rt, tmp, 0)); -} - -void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp) { - if (aa64_simm9_fits(off)) { - aa64_emit32(mc, aa64_ldur_fp(size, Rt, Rn, off)); - return; - } - aa64_emit_addr_adjust(mc, tmp, Rn, off); - aa64_emit32(mc, aa64_ldur_fp(size, Rt, tmp, 0)); -} - -void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp) { - if (aa64_simm9_fits(off)) { - aa64_emit32(mc, aa64_stur_fp(size, Rt, Rn, off)); - return; - } - aa64_emit_addr_adjust(mc, tmp, Rn, off); - aa64_emit32(mc, aa64_stur_fp(size, Rt, tmp, 0)); -} diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h @@ -1,355 +0,0 @@ -/* aarch64/internal.h — private types and forward decls shared across - * emit.c / alloc.c / ops.c. NOT part of the public API. */ -#pragma once - -#include <string.h> - -#include "arch/aa64/asm.h" -#include "arch/aa64/isa.h" -#include "arch/aa64/regs.h" -#include "arch/arch.h" -#include "core/arena.h" -#include "core/pool.h" -#include "obj/obj.h" - -/* ============================================================ - * Local encoding helpers (kept here, not in aa64_isa.h). 
- * ============================================================ */ - -#define AA64_NOP 0xD503201Fu - -/* Hidden backend temporaries. These must stay outside the allocable pools and - * outside optimizer scratch registers because CGTarget ops may clobber them - * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */ -enum { - AA_TMP0 = 9u, - AA_TMP1 = 10u, - AA_TMP2 = 11u, - AA_FP_TMP0 = 31u, -}; -#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k))) - -static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0xA9000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0xA9400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0x6D000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0x6D400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x38000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x38400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x3C000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x3C400000u | (size << 30) | (((u32)simm9 & 0x1ffu) 
<< 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stur_q(u32 Rt, u32 Rn, i32 simm9) { - return 0x3C800000u | (((u32)simm9 & 0x1ffu) << 12) | ((Rn & 0x1f) << 5) | - (Rt & 0x1f); -} -static inline u32 aa64_ldur_q(u32 Rt, u32 Rn, i32 simm9) { - return 0x3CC00000u | (((u32)simm9 & 0x1ffu) << 12) | ((Rn & 0x1f) << 5) | - (Rt & 0x1f); -} - -static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x39000000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x39400000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x3D000000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_str_q_uimm(u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> 4; - return 0x3D800000u | ((sc & 0xfffu) << 10) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_mrs_tpidr_el0(u32 Rt) { - return 0xD53BD040u | (Rt & 0x1fu); -} -static inline u32 aa64_b_base(void) { return 0x14000000u; } -static inline u32 aa64_bl_base(void) { return 0x94000000u; } - -static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); } - -static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x3D400000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldr_q_uimm(u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> 4; - return 0x3DC00000u | ((sc & 0xfffu) << 10) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) { - return 0x1E204000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 
aa64_mov_v16b(u32 Rd, u32 Rn) { - return 0x4EA01C00u | ((Rn & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) { - return 0x71000000u | (sf << 31) | ((imm12 & 0xfff) << 10) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_cset_eq(u32 sf, u32 Rd) { - return 0x1A800400u | (sf << 31) | (31u << 16) | (0x1u << 12) | (31u << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E380000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E390000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E220000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E230000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) { - return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) { - return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) { - return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) { - return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) { - return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) { - return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) { - return 0xCB206000u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { - return 0x6B000000u 
| (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); } - -static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) { - return 0x1A800400u | (sf << 31) | ((Rm & 0x1f) << 16) | - ((cond & 0xfu) << 12) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) { - return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u); -} - -static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E202800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E203800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E200800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E201800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fneg(u32 type, u32 Rd, u32 Rn) { - return 0x1E214000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_fcmp(u32 type, u32 Rn, u32 Rm) { - return 0x1E202000u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5); -} - -static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x13000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x53000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x33000000u | (sf << 31) | (sf << 22) | ((immr & 
0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -/* ============================================================ - * AAImpl types - * ============================================================ */ - -#define AA_PROLOGUE_WORDS \ - 25u /* worst case: sub sp + str/str/add-add fp + sret + 10 int + 8 fp */ -#define AA_PROLOGUE_FRAME_WORDS \ - 6u /* worst-case frame adjust + split fp/lr saves + add-add fp */ - -typedef struct AASlot { - u32 off; - u32 size; - u32 align; - u8 kind; - u8 pad[3]; -} AASlot; - -typedef struct AAScope { - u8 kind; - u8 has_else; - u8 pad[2]; - MCLabel else_label; - MCLabel end_label; - Label break_label; - Label continue_label; -} AAScope; - -typedef struct AAImpl { - CGTarget base; - SrcLoc loc; - const CGFuncDesc* fd; - - u32 func_start; - u32 prologue_pos; - u32 prologue_words; - u32 post_prologue_off; /* end-of-prologue offset within function, for CFI */ - MCLabel epilogue_label; - u8 known_frame; - u8 omit_frame; - u8 pad0[2]; - - AASlot* slots; - u32 nslots; - u32 slots_cap; - u32 cum_off; - u32 max_outgoing; - - u32 next_param_int; - u32 next_param_fp; - u32 next_param_stack; - u8 has_sret; - FrameSlot sret_ptr_slot; - - u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */ - u32 used_cs_fp_mask; /* bit reg set when d8-d15 must be preserved */ - u32 planned_cs_int_mask; - u32 planned_cs_fp_mask; - u8 has_planned_regs; - u8 pad1[3]; - - AAScope* scopes; - u32 nscopes; - u32 scopes_cap; - - u8 has_alloca; - struct AAAllocaPatch { - u32 pos; - u32 dst_reg; - }* add_patches; - u32 nadd_patches; - u32 add_patches_cap; - - u8 is_variadic; - FrameSlot gp_save_slot; - FrameSlot fp_save_slot; -} AAImpl; - -/* ============================================================ - * Cross-file forward declarations - * ============================================================ */ - -/* emit.c helpers used in alloc.c / ops.c */ -void aa64_emit32(MCEmitter* mc, u32 word); -void aa64_patch32(ObjBuilder* 
obj, u32 sec_id, u32 ofs, u32 word); -void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm); -void emit_sp_add(MCEmitter* mc, u32 imm); -void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off); -void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp); -void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp); -void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp); -void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, - u32 tmp); -void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym); -void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend); - -/* emit.c public surface */ -FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d); -void aa_func_begin(CGTarget* t, const CGFuncDesc* fd); -void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd, - const CGKnownFrameDesc* frame, - FrameSlot* out_slots); -void aa_func_end(CGTarget* t); -CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p); - -/* alloc.c helpers used in emit.c / ops.c */ -AAImpl* impl_of(CGTarget* t); -AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs); -void aa_jump(CGTarget* t, Label l); - -/* ops.c helpers used in alloc.c */ -void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma); -void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma); -u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch); - -/* alloc.c helpers used in ops.c */ -void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op); -void aa_alloc_vtable_init(CGTarget* t); -void aa_coord_vtable_init(CGTarget* t); - -/* shared type helpers (defined in emit.c, used broadly) */ -int type_is_64(CfreeCgTypeId t); -int type_is_fp_double(CfreeCgTypeId t); -int type_is_signed(CfreeCgTypeId t); -u32 type_byte_size(CfreeCgTypeId t); -u32 size_idx_for_bytes(u32 nbytes); -u32 reg_num(Operand op); diff --git a/src/arch/aa64/native.c 
b/src/arch/aa64/native.c @@ -0,0 +1,3226 @@ +/* aa64 NativeTarget production-readiness checklist: + * - ABI completeness: finish AAPCS64/Linux va_list and register-save-area + * lowering, verify Apple/AAPCS64/Windows arm64 differences, handle all + * homogeneous aggregates, indirect/byval/sret corner cases, small aggregate + * splitting, multi-register returns, stack alignment, and ABI diagnostics. + * - Calls and returns: replace call-plus-return tail handling with true direct + * and indirect sibling calls, preserve musttail ABI guarantees, support stack + * argument reshuffling without clobbering live inputs, and cover all sret, + * variadic, FP, aggregate, and many-argument combinations. + * - Frame lowering: implement known-frame/prologue integration for optimized + * emission, spill/reload hooks, callee-save tracking for integer and FP/SIMD + * registers, large-frame probing/materialization as needed by each platform, + * dynamic alloca restoration, and unwind/debug frame metadata. + * - Operations and intrinsics: fill remaining scalar, FP, conversion, rounding, + * overflow, bit, vector/SIMD, trap, prefetch, and target-specific intrinsics; + * validate NaN/ordered/unordered FP compare semantics and integer narrowing + * behavior for every supported width. + * - Aggregates and memory: support large constants, overlap-safe memmove, + * optimized bulk copy/set selection, bitfield load/store, packed/unaligned + * accesses, volatile access constraints, and record/slice edge cases across + * direct and optimized lowering. + * - Atomics: replace ordinary load/store RMW/CAS sequences with correct LL/SC + * or LSE loops, implement acquire/release/seq_cst mappings precisely, handle + * failure ordering, byte/halfword/word/dword widths, and retry/clobber rules. 
+ * - Inline and file-scope asm: complete register/memory/immediate constraints, + * named operands, tied operands, early-clobber and clobber validation, hard + * register conflicts, memory barriers, outputs for aggregates/FP values, and + * file-scope asm integration. */ + +#include "arch/aa64/aa64.h" + +#include <string.h> + +#include "arch/aa64/asm.h" +#include "arch/aa64/isa.h" +#include "arch/aa64/regs.h" +#include "abi/abi.h" +#include "asm/asm.h" +#include "asm/asm_lex.h" +#include "cg/native_direct_target.h" +#include "cg/type.h" +#include "core/arena.h" +#include "core/bytes.h" +#include "core/pool.h" +#include "core/slice.h" +#include "obj/obj.h" + +#if defined(__GNUC__) || defined(__clang__) +#define AA_UNUSED_FN __attribute__((unused)) +#else +#define AA_UNUSED_FN +#endif + +enum { + AA_TMP0 = 16u, + AA_TMP1 = 17u, + AA_FP = 29u, + AA_LR = 30u, + AA_SP = 31u, + AA_FRAME_SAVE_SIZE = 16u, + AA_PROLOGUE_WORDS = 24u, + AA_TAIL_WORDS = 16u, +}; + +typedef struct AANativeSlot { + u32 off; + u32 size; + u32 align; + u8 kind; + u8 pad[3]; +} AANativeSlot; + +typedef struct AATailSite { + u32 pos; + NativeLoc callee; +} AATailSite; + +typedef struct AAAllocaPatch { + u32 pos; + u32 dst_reg; +} AAAllocaPatch; + +typedef struct AANativeTarget { + NativeTarget base; + SrcLoc loc; + const CGFuncDesc* func; + + AANativeSlot* slots; + u32 nslots; + u32 slots_cap; + u32 cum_off; + u32 max_outgoing; + u32 incoming_stack_size; + u32 next_param_int; + u32 next_param_fp; + u32 next_param_stack; + NativeFrameSlot sret_ptr_slot; + NativeFrameSlot saved_tmp_slot; + NativeFrameSlot va_gr_slot; + NativeFrameSlot va_vr_slot; + + AATailSite* tail_sites; + u32 ntail_sites; + u32 tail_sites_cap; + AAAllocaPatch* alloca_patches; + u32 nalloca_patches; + u32 alloca_patches_cap; + + u32 func_start; + u32 prologue_pos; + MCLabel epilogue_label; +} AANativeTarget; + +static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; } + +static void aa_panic(AANativeTarget* a, 
/* Round `v` up to the next multiple of `align`.
 *
 * An `align` of 0 or 1 leaves `v` unchanged.  Power-of-two alignments use the
 * usual mask form.  Any other alignment is handled with a remainder, because
 * the mask form silently returns a value that is not a multiple of `align`
 * (e.g. 5 aligned to 3 used to yield 5).  Like the mask form, the result
 * wraps modulo 2^32 if `v` is within `align` of UINT32_MAX. */
static u32 align_up_u32(u32 v, u32 align) {
  u32 rem;
  if (align <= 1u) return v;
  if ((align & (align - 1u)) == 0u) {
    u32 mask = align - 1u;
    return (v + mask) & ~mask;
  }
  rem = v % align;
  return rem ? v + (align - rem) : v;
}
4u : 2u; + u32 n = 0; + for (u32 i = 0; i < words; ++i) { + u32 part = (u32)((v >> (i * 16u)) & 0xffffu); + if (!part && n) continue; + if (n >= cap) return 0; + out[n] = n ? aa64_movk(sf, rd, part, i) : aa64_movz(sf, rd, part, i); + ++n; + } + if (!n) { + if (!cap) return 0; + out[n++] = aa64_movz(sf, rd, 0, 0); + } + return n; +} + +static void aa_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) { + u32 words[4]; + u32 n = aa_load_imm_words(words, 4u, sf, rd, imm); + for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]); +} + +static void aa_emit_add_imm(AANativeTarget* a, u32 rd, u32 rn, i32 off) { + u32 imm12, sh; + MCEmitter* mc = a->base.mc; + if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) { + aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh)); + return; + } + if (off < 0 && aa64_addsub_imm_fits(-(i64)off, &imm12, &sh)) { + aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh)); + return; + } + aa_emit_load_imm(mc, 1, rd, off); + aa_emit32(mc, aa64_add(1, rd, rn, rd)); +} + +static __attribute__((unused)) void aa_emit_add_i64(AANativeTarget* a, u32 rd, + u32 rn, i64 off) { + u32 imm12, sh; + MCEmitter* mc = a->base.mc; + if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) { + aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh)); + return; + } + if (off < 0 && aa64_addsub_imm_fits(-off, &imm12, &sh)) { + aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh)); + return; + } + aa_emit_load_imm(mc, 1, rd, off); + aa_emit32(mc, aa64_add(1, rd, rn, rd)); +} + +static u32 aa_ldur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) { + return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size, + .V = v, + .opc = AA64_LDST_OPC_LDR, + .imm9 = (u32)simm9 & 0x1ffu, + .Rn = rn, + .Rt = rt}); +} + +static u32 aa_stur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) { + return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size, + .V = v, + .opc = AA64_LDST_OPC_STR, + .imm9 = (u32)simm9 & 0x1ffu, + .Rn = rn, + .Rt = rt}); +} + +static u32 aa_ldr_uimm_v(u32 size, u32 v, u32 rt, u32 
/* Encode a scalar FP two-source data-processing instruction.  `op` supplies
 * the opcode bits and is OR-ed in unchanged; `is_double` selects the
 * double-precision base word.  Register numbers are masked to 5 bits. */
static u32 aa_fp_bin(u32 op, u32 is_double, u32 rd, u32 rn, u32 rm) {
  u32 insn = 0x1e200000u;
  if (is_double) insn = 0x1e600000u;
  insn |= op;
  insn |= (rm & 0x1fu) << 16;
  insn |= (rn & 0x1fu) << 5;
  insn |= rd & 0x1fu;
  return insn;
}

/* Encode a scalar FP compare of `rn` against `rm` (no destination field). */
static u32 aa_fcmp(u32 is_double, u32 rn, u32 rm) {
  u32 insn = 0x1e202000u;
  if (is_double) insn = 0x1e602000u;
  insn |= (rm & 0x1fu) << 16;
  insn |= (rn & 0x1fu) << 5;
  return insn;
}

/* Encode a scalar FP negate: rd = -rn. */
static u32 aa_fneg(u32 is_double, u32 rd, u32 rn) {
  u32 insn = 0x1e214000u;
  if (is_double) insn = 0x1e614000u;
  insn |= (rn & 0x1fu) << 5;
  insn |= rd & 0x1fu;
  return insn;
}
/* Encode a signed integer -> FP conversion.  `is64_src` selects the 64-bit
 * source base word; `is_double_dst` sets bit 22. */
static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
  u32 insn = 0x1e220000u;
  if (is64_src) insn = 0x9e220000u;
  if (is_double_dst) insn |= 0x00400000u;
  insn |= (rn & 0x1fu) << 5;
  insn |= fd & 0x1fu;
  return insn;
}

/* Encode an unsigned integer -> FP conversion; same field layout as
 * aa_scvtf. */
static u32 aa_ucvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
  u32 insn = 0x1e230000u;
  if (is64_src) insn = 0x9e230000u;
  if (is_double_dst) insn |= 0x00400000u;
  insn |= (rn & 0x1fu) << 5;
  insn |= fd & 0x1fu;
  return insn;
}

/* Encode an FP -> signed integer conversion.  `is64_dst` selects the 64-bit
 * destination base word; `is_double_src` sets bit 22. */
static u32 aa_fcvtzs(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
  u32 insn = 0x1e380000u;
  if (is64_dst) insn = 0x9e380000u;
  if (is_double_src) insn |= 0x00400000u;
  insn |= (fn & 0x1fu) << 5;
  insn |= rd & 0x1fu;
  return insn;
}

/* Encode an FP -> unsigned integer conversion; same field layout as
 * aa_fcvtzs. */
static u32 aa_fcvtzu(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
  u32 insn = 0x1e390000u;
  if (is64_dst) insn = 0x9e390000u;
  if (is_double_src) insn |= 0x00400000u;
  insn |= (fn & 0x1fu) << 5;
  insn |= rd & 0x1fu;
  return insn;
}
/* Encoders for the exclusive and acquire/release load/store words.  In each,
 * `size` lands in bits 31:30 (only its low two bits survive the 32-bit
 * shift), `rn` is the base register and `rt` the data register.  The store
 * exclusives also take `rs`, placed in bits 20:16. */

static __attribute__((unused)) u32 aa_ldaxr(u32 size, u32 rt, u32 rn) {
  u32 insn = 0x085ffc00u;
  insn |= (size & 3u) << 30;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}

static __attribute__((unused)) u32 aa_ldxr(u32 size, u32 rt, u32 rn) {
  u32 insn = 0x085f7c00u;
  insn |= (size & 3u) << 30;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}

static __attribute__((unused)) u32 aa_stlxr(u32 size, u32 rs, u32 rt, u32 rn) {
  u32 insn = 0x0800fc00u;
  insn |= (size & 3u) << 30;
  insn |= (rs & 0x1fu) << 16;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}

static __attribute__((unused)) u32 aa_stxr(u32 size, u32 rs, u32 rt, u32 rn) {
  u32 insn = 0x08007c00u;
  insn |= (size & 3u) << 30;
  insn |= (rs & 0x1fu) << 16;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}

static __attribute__((unused)) u32 aa_ldar(u32 size, u32 rt, u32 rn) {
  u32 insn = 0x08dffc00u;
  insn |= (size & 3u) << 30;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}

static __attribute__((unused)) u32 aa_stlr(u32 size, u32 rt, u32 rn) {
  u32 insn = 0x089ffc00u;
  insn |= (size & 3u) << 30;
  insn |= (rn & 0x1fu) << 5;
  insn |= rt & 0x1fu;
  return insn;
}
aa64_addsubsr_pack((AA64AddSubSR){ + .sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd}); +} + +static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) { + return aa64_addsubsr_pack((AA64AddSubSR){ + .sf = 1, .op = 0, .S = 0, .shift = 0, .Rm = rm, .imm6 = shift, + .Rn = rn, .Rd = rd}); +} + +static u32 aa_cset(u32 sf, u32 rd, u32 cond) { + return aa64_csinc_enc(sf, rd, AA64_ZR, AA64_ZR, cond ^ 1u); +} + +static u32 cmp_cond(CmpOp op) { + switch (op) { + case CMP_EQ: + return 0x0u; + case CMP_NE: + return 0x1u; + case CMP_LT_U: + return 0x3u; + case CMP_LE_U: + return 0x9u; + case CMP_GT_U: + return 0x8u; + case CMP_GE_U: + return 0x2u; + case CMP_LT_S: + return 0xbu; + case CMP_LE_S: + return 0xdu; + case CMP_GT_S: + return 0xcu; + case CMP_GE_S: + return 0xau; + case CMP_LT_F: + return 0x4u; + case CMP_LE_F: + return 0x9u; + case CMP_GT_F: + return 0xcu; + case CMP_GE_F: + return 0xau; + default: + return 0x0u; + } +} + +static AANativeSlot* aa_slot(AANativeTarget* a, NativeFrameSlot slot) { + if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->nslots) + aa_panic(a, "bad frame slot"); + return &a->slots[slot - 1u]; +} + +static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out, + i32* off_out) { + *base_out = AA_TMP0; + *off_out = addr.offset; + switch ((NativeAddrBaseKind)addr.base_kind) { + case NATIVE_ADDR_BASE_REG: + *base_out = addr.base.reg; + return; + case NATIVE_ADDR_BASE_FRAME: { + AANativeSlot* s = aa_slot(a, addr.base.frame); + *base_out = AA_FP; + *off_out = -(i32)s->off + addr.offset; + return; + } + case NATIVE_ADDR_BASE_GLOBAL: { + NativeLoc tmp; + memset(&tmp, 0, sizeof tmp); + tmp.kind = NATIVE_LOC_REG; + tmp.cls = NATIVE_REG_INT; + tmp.type = builtin_id(CFREE_CG_BUILTIN_I64); + tmp.v.reg = AA_TMP0; + a->base.load_addr(&a->base, tmp, addr); + *base_out = AA_TMP0; + *off_out = 0; + return; + } + default: + aa_panic(a, "unsupported address base"); + } +} + +static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off); 
/* Emit one load (`load` != 0) or store between register `reg` and memory at
 * `addr`.
 *
 * The access width is mem.size when non-zero, otherwise the size of reg.type
 * (or mem.type when reg.type is unset).  16-byte FP accesses are delegated to
 * aa_emit_mem_q; FP accesses narrower than 4 bytes are widened to 4.
 *
 * Addressing forms, tried in order:
 *   - un-indexed global: ADRP-based sequence with relocations (via the GOT
 *     when aa_use_got_for_sym says so);
 *   - register index: register-offset form, scale 0 or the access size only;
 *   - scaled unsigned 12-bit offset;
 *   - signed 9-bit unscaled offset;
 *   - otherwise the address is formed in AA_TMP1 first. */
static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg,
                        NativeAddr addr, MemAccess mem) {
  u32 base, rt, sz;
  i32 off;
  MCEmitter* mc = a->base.mc;
  rt = loc_reg(reg);
  /* sz is the log2 size index (0..3) used by the encoders. */
  sz = size_idx(mem.size ? mem.size
                         : type_size32(&a->base, reg.type ? reg.type
                                                          : mem.type));
  if (loc_is_fp(reg) && (mem.size ? mem.size
                                  : type_size32(&a->base, reg.type
                                                              ? reg.type
                                                              : mem.type)) ==
                            16u) {
    aa_emit_mem_q(a, load, reg, addr);
    return;
  }
  if (loc_is_fp(reg) && sz < 2u) sz = 2u;
  if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
      addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
    i64 addend = addr.base.global.addend + (i64)addr.offset;
    /* Keep the address scratch distinct from a store's source register. */
    u32 scratch = (!load && rt == AA_TMP0) ? AA_TMP1 : AA_TMP0;
    /* Each relocation is attached at the position captured just before the
     * instruction it applies to. */
    u32 pos = mc->pos(mc);
    if (aa_use_got_for_sym(&a->base, addr.base.global.sym)) {
      /* GOT path: load the symbol address from its GOT entry, add the addend
       * in code, then access through the scratch register. */
      aa_emit32(mc, aa64_adrp(scratch, 0, 0));
      mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_GOT_PAGE,
                        addr.base.global.sym, 0, 0, 0);
      pos = mc->pos(mc);
      aa_emit32(mc, aa_ldr_uimm(3, scratch, scratch, 0));
      mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_LD64_GOT_LO12_NC,
                        addr.base.global.sym, 0, 0, 0);
      if (addend) aa_emit_add_i64(a, scratch, scratch, addend);
      aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, scratch, 0)
                         : aa_stur_v(sz, loc_is_fp(reg), rt, scratch, 0));
      return;
    }
    /* Direct path: ADRP for the page, and the load/store itself carries the
     * size-specific low-12-bit relocation. */
    aa_emit32(mc, aa64_adrp(scratch, 0, 0));
    mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21,
                      addr.base.global.sym, addend, 0, 0);
    pos = mc->pos(mc);
    aa_emit32(mc, load ? aa_ldr_uimm_v(sz, loc_is_fp(reg), rt, scratch, 0)
                       : aa_str_uimm_v(sz, loc_is_fp(reg), rt, scratch, 0));
    mc->emit_reloc_at(mc, mc->section_id, pos, aa_ldst_reloc_for_size(sz),
                      addr.base.global.sym, addend, 0, 0);
    return;
  }
  aa_addr_base(a, addr, &base, &off);
  if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) {
    u32 use_base = base;
    u32 scaled = 0;
    if (addr.index_kind != NATIVE_ADDR_INDEX_REG)
      aa_panic(a, "unsupported address index");
    /* The register-offset form has no displacement, so a non-zero offset is
     * folded into AA_TMP1 first.
     * NOTE(review): this would clobber an index or store source living in
     * x17 -- confirm callers never pass one. */
    if (off) {
      use_base = AA_TMP1;
      aa_emit_add_imm(a, use_base, base, off);
    }
    if (addr.log2_scale == 0) {
      scaled = 0;
    } else if (addr.log2_scale == sz) {
      scaled = 1;
    } else {
      aa_panic(a, "unsupported memory address scale");
    }
    aa_emit32(mc, aa_ldst_regoff_v(sz, loc_is_fp(reg), load, rt, use_base,
                                   addr.index.reg, scaled));
    return;
  }
  /* Non-negative, size-aligned offsets that fit 12 bits after scaling. */
  if (off >= 0 && (((u32)off & ((1u << sz) - 1u)) == 0) &&
      ((u32)off >> sz) <= 0xfffu) {
    aa_emit32(mc, load ? aa_ldr_uimm_v(sz, loc_is_fp(reg), rt, base, (u32)off)
                       : aa_str_uimm_v(sz, loc_is_fp(reg), rt, base, (u32)off));
    return;
  }
  /* Small signed offsets use the unscaled form. */
  if (off >= -256 && off <= 255) {
    aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, base, off)
                       : aa_stur_v(sz, loc_is_fp(reg), rt, base, off));
    return;
  }
  /* Anything else: compute the full address into AA_TMP1 and access it with
   * a zero offset.
   * NOTE(review): unlike the global path above, this does not avoid a store
   * source that is itself x17 -- confirm that cannot happen. */
  aa_emit_add_imm(a, AA_TMP1, base, off);
  aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, AA_TMP1, 0)
                     : aa_stur_v(sz, loc_is_fp(reg), rt, AA_TMP1, 0));
}
addr->index_type + : builtin_id(CFREE_CG_BUILTIN_I64); + memset(&idx, 0, sizeof idx); + idx.kind = NATIVE_LOC_REG; + idx.cls = NATIVE_REG_INT; + idx.type = load.base_type; + idx.v.reg = reg; + memset(&mem, 0, sizeof mem); + mem.type = load.base_type; + mem.size = 8; + mem.align = 8; + aa_emit_mem(a, 1, idx, load, mem); + addr->index_kind = NATIVE_ADDR_INDEX_REG; + addr->index.reg = reg; +} + +static NativeLoc aa_reg_loc(CfreeCgTypeId type, NativeAllocClass cls, Reg reg); + +static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off) { + return aa64_ldst_uimm_pack((AA64LdStUimm){.size = 0, + .V = 1, + .opc = load ? 3u : 2u, + .imm12 = byte_off >> 4, + .Rn = rn, + .Rt = rt}); +} + +static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) { + return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = 0, + .V = 1, + .opc = load ? 3u : 2u, + .imm9 = (u32)byte_off & 0x1ffu, + .Rn = rn, + .Rt = rt}); +} + +static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg, + NativeFrameSlot slot, u32 offset) { + AANativeSlot* s = aa_slot(a, slot); + i32 off = -(i32)s->off + (i32)offset; + MCEmitter* mc = a->base.mc; + if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) { + aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off)); + return; + } + if (off >= -256 && off <= 255) { + aa_emit32(mc, aa_ldst_q_simm9(load, qreg, AA_FP, off)); + return; + } + aa_emit_add_imm(a, AA_TMP1, AA_FP, off); + aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_TMP1, 0)); +} + +static void aa_emit_variadic_reg_saves(AANativeTarget* a) { + NativeFrameSlotDesc sd; + NativeAddr addr; + MemAccess mem; + CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); + ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi); + if (vai.kind != ABI_VA_LIST_AAPCS64) return; + memset(&sd, 0, sizeof sd); + sd.type = i64; + sd.size = vai.gp_reg_count * vai.gp_slot_size; + sd.align = 8; + sd.kind = NATIVE_FRAME_SLOT_SAVE; + a->va_gr_slot = a->base.frame_slot(&a->base, &sd); + sd.size = 
/* Begin emitting function `fd`.
 *
 * Resets all per-function state, opens the function in the emitter, reserves
 * AA_PROLOGUE_WORDS NOP words that aa_patch_prologue overwrites at function
 * end, and creates the epilogue label.  If the ABI info reports an sret
 * return, register 8 is saved to a fresh frame slot; if the function is
 * variadic, the argument registers are saved via
 * aa_emit_variadic_reg_saves. */
static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
  AANativeTarget* a = aa_of(t);
  MCEmitter* mc = t->mc;
  const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
  a->func = fd;
  a->nslots = 0;
  /* Slot allocation starts below the saved FP/LR pair. */
  a->cum_off = AA_FRAME_SAVE_SIZE;
  a->max_outgoing = 0;
  a->incoming_stack_size = 0;
  a->next_param_int = 0;
  a->next_param_fp = 0;
  a->next_param_stack = 0;
  a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
  a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
  a->va_gr_slot = NATIVE_FRAME_SLOT_NONE;
  a->va_vr_slot = NATIVE_FRAME_SLOT_NONE;
  a->ntail_sites = 0;
  a->nalloca_patches = 0;
  mc->set_section(mc, fd->text_section_id);
  mc->emit_align(mc, 4, 0);
  a->func_start = mc->pos(mc);
  mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
  if (mc->cfi_startproc) mc->cfi_startproc(mc);
  /* Placeholder prologue: the frame size is not known yet, so reserve a
   * fixed-size run of NOPs (0xd503201f) to be patched in aa_func_end. */
  a->prologue_pos = mc->pos(mc);
  for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu);
  a->epilogue_label = mc->label_new(mc);
  if (abi && abi->has_sret) {
    /* Save the incoming sret pointer (register 8) to an 8-byte frame slot. */
    NativeFrameSlotDesc sd;
    NativeAddr addr;
    NativeLoc src;
    MemAccess mem;
    memset(&sd, 0, sizeof sd);
    sd.type = builtin_id(CFREE_CG_BUILTIN_I64);
    sd.size = 8;
    sd.align = 8;
    sd.kind = NATIVE_FRAME_SLOT_SAVE;
    a->sret_ptr_slot = t->frame_slot(t, &sd);
    memset(&addr, 0, sizeof addr);
    addr.base_kind = NATIVE_ADDR_BASE_FRAME;
    addr.base.frame = a->sret_ptr_slot;
    addr.base_type = sd.type;
    memset(&src, 0, sizeof src);
    src.kind = NATIVE_LOC_REG;
    src.cls = NATIVE_REG_INT;
    src.type = sd.type;
    src.v.reg = 8u;
    memset(&mem, 0, sizeof mem);
    mem.type = sd.type;
    mem.size = 8;
    mem.align = 8;
    aa_emit_mem(a, 0, src, addr, mem);
  }
  if (abi && abi->variadic) aa_emit_variadic_reg_saves(a);
}
AA_SP, 0, 0); + words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0); +} + +static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap, + u32* n, u32 frame_size) { + u32 save_off = frame_size - AA_FRAME_SAVE_SIZE; + u32 imm12, sh; + if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) { + if (*n >= cap) aa_panic(a, "instruction patch too small"); + words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, imm12, sh); + return; + } + aa_words_load_imm(a, words, cap, n, AA_TMP0, save_off); + if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); + words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); + words[(*n)++] = aa64_add(1, AA_TMP1, AA_TMP1, AA_TMP0); +} + +static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap, + u32* n, u32 frame_size) { + if (!frame_size) return; + if (*n + 4u > cap) aa_panic(a, "instruction patch too small"); + words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0); + words[(*n)++] = aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16); + words[(*n)++] = aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8); + words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0); +} + +static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) { + u32 words[AA_PROLOGUE_WORDS]; + u32 n = 0; + ObjSecId sec = a->func->text_section_id; + memset(words, 0, sizeof words); + if (frame_size) { + aa_words_sub_sp_frame(a, words, AA_PROLOGUE_WORDS, &n, frame_size); + aa_words_saved_pair_addr(a, words, AA_PROLOGUE_WORDS, &n, frame_size); + if (n + 2u > AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large"); + words[n++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0); + words[n++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8); + aa_words_frame_ptr_from_sp(a, words, AA_PROLOGUE_WORDS, &n, frame_size); + } + while (n < AA_PROLOGUE_WORDS) words[n++] = 0xd503201fu; + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) + aa_patch32(a->base.obj, sec, a->prologue_pos + i * 4u, words[i]); +} + +static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) { + MCEmitter* mc = a->base.mc; + if 
(!frame_size) return; + aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0)); + aa_emit32(mc, aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16)); + aa_emit32(mc, aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8)); + aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0)); +} + +static void aa_patch_allocas(AANativeTarget* a) { + ObjSecId sec = a->func->text_section_id; + u32 imm12, sh; + for (u32 i = 0; i < a->nalloca_patches; ++i) { + AAAllocaPatch* p = &a->alloca_patches[i]; + if (!aa64_addsub_imm_fits(a->max_outgoing, &imm12, &sh)) + aa_panic(a, "outgoing area too large for alloca result"); + aa_patch32(a->base.obj, sec, p->pos, + aa64_add_imm(1, p->dst_reg, AA_SP, imm12, sh)); + } +} + +static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { + ObjSecId sec = a->func->text_section_id; + for (u32 i = 0; i < a->ntail_sites; ++i) { + AATailSite* site = &a->tail_sites[i]; + u32 words[AA_TAIL_WORDS]; + u32 n = 0; + memset(words, 0, sizeof words); + aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size); + if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small"); + if (site->callee.kind == NATIVE_LOC_REG) { + words[n++] = aa64_br(loc_reg(site->callee)); + } else if (site->callee.kind == NATIVE_LOC_GLOBAL) { + while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu; + words[n++] = aa64_b(0); + } else { + aa_panic(a, "unsupported tail target"); + } + while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu; + for (u32 w = 0; w < AA_TAIL_WORDS; ++w) + aa_patch32(a->base.obj, sec, site->pos + w * 4u, words[w]); + } +} + +static void aa_func_end(NativeTarget* t) { + AANativeTarget* a = aa_of(t); + MCEmitter* mc = t->mc; + u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u); + mc->label_place(mc, a->epilogue_label); + aa_emit_restore_frame(a, frame_size); + aa_emit32(mc, aa64_ret(AA_LR)); + aa_patch_prologue(a, frame_size); + aa_patch_allocas(a); + aa_patch_tail_sites(a, frame_size); + if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { + 
mc->cfi_set_next_pc_offset(mc, AA_PROLOGUE_WORDS * 4u); + mc->cfi_def_cfa(mc, AA_FP, 0); + mc->cfi_offset(mc, AA_FP, -16); + mc->cfi_offset(mc, AA_LR, -8); + } + obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id, + a->func_start, mc->pos(mc) - a->func_start); + if (a->func->atomize) { + obj_atom_define(t->obj, a->func->text_section_id, a->func_start, + mc->pos(mc) - a->func_start, a->func->sym, 0); + } + if (mc->cfi_endproc) mc->cfi_endproc(mc); + mc_end_function(mc); + a->func = NULL; +} + +static NativeFrameSlot aa_frame_slot(NativeTarget* t, + const NativeFrameSlotDesc* d) { + AANativeTarget* a = aa_of(t); + AANativeSlot* s; + u32 size = d->size ? d->size : 8u; + u32 align = d->align ? d->align : 1u; + if (a->nslots == a->slots_cap) { + u32 cap = a->slots_cap ? a->slots_cap * 2u : 16u; + AANativeSlot* nb = arena_zarray(t->c->tu, AANativeSlot, cap); + if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots); + a->slots = nb; + a->slots_cap = cap; + } + a->cum_off = align_up_u32(a->cum_off + size, align); + s = &a->slots[a->nslots++]; + s->off = a->cum_off; + s->size = size; + s->align = align; + s->kind = d->kind; + return a->nslots; +} + +static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, + const NativeKnownFrameDesc* frame, + NativeFrameSlot* out_slots) { + aa_func_begin(t, fd); + if (frame) { + AANativeTarget* a = aa_of(t); + if (frame->max_outgoing > a->max_outgoing) + a->max_outgoing = frame->max_outgoing; + for (u32 i = 0; i < frame->nslots; ++i) { + NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]); + if (out_slots) out_slots[i] = slot; + } + } +} + +static void aa_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot, + MemAccess mem) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = slot; + addr.base_type = src.type; + aa_emit_mem(aa_of(t), 0, src, addr, mem); +} + +static void aa_reload(NativeTarget* t, NativeLoc dst, 
NativeFrameSlot slot, + MemAccess mem) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = slot; + addr.base_type = dst.type; + aa_emit_mem(aa_of(t), 1, dst, addr, mem); +} + +static MCLabel aa_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); } + +static void aa_label_place(NativeTarget* t, MCLabel label) { + t->mc->label_place(t->mc, label); +} + +static void aa_jump(NativeTarget* t, MCLabel label) { + aa_emit32(t->mc, aa64_b(0)); + t->mc->emit_label_ref(t->mc, label, R_AARCH64_JUMP26, 4, 0); +} + +static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs, + NativeLoc rhs, MCLabel label) { + if (loc_is_fp(lhs)) { + aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), + loc_reg(rhs))); + } else { + u32 sf = loc_is_64(t, lhs) ? 1u : 0u; + aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs))); + } + aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)})); + t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0); +} + +static void aa_indirect_branch(NativeTarget* t, NativeLoc addr, + const MCLabel* valid_targets, u32 ntargets) { + (void)valid_targets; + (void)ntargets; + aa_emit32(t->mc, aa64_br(loc_reg(addr))); +} + +static void aa_load_label_addr(NativeTarget* t, NativeLoc dst, + MCLabel target) { + aa_emit32(t->mc, aa64_adr(loc_reg(dst), 0, 0)); + aa_emit32(t->mc, aa64_b(3)); + aa_emit32(t->mc, 0); + aa_emit32(t->mc, 0); + t->mc->emit_label_ref(t->mc, target, R_AARCH64_INTRA_LABEL_ADDR, 16, 0); +} + +static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { + if (loc_is_fp(dst) && loc_is_fp(src)) { + aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst), + loc_reg(src))); + } else if (loc_is_fp(dst)) { + aa_emit32(t->mc, + aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src))); + } else if (loc_is_fp(src)) { + aa_emit32(t->mc, + aa_fmov_fp_to_gpr(loc_is_64(t, dst), loc_reg(dst), 
loc_reg(src))); + } else { + aa_emit32(t->mc, + aa64_mov_reg(loc_is_64(t, dst), loc_reg(dst), loc_reg(src))); + } +} + +static NativeLoc aa_tmp_loc(CfreeCgTypeId type, Reg reg); + +static void aa_load_imm_native(NativeTarget* t, NativeLoc dst, i64 imm) { + aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), imm); +} + +static void aa_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cbytes) { + u64 v = 0; + if (cbytes.size > 8u) + compiler_panic(t->c, ((AANativeTarget*)t)->loc, + "aarch64 native target: byte constant too large"); + for (u32 i = 0; i < cbytes.size; ++i) v |= (u64)cbytes.bytes[i] << (i * 8u); + if (loc_is_fp(dst)) { + NativeLoc tmp = aa_tmp_loc(cbytes.type, AA_TMP0); + aa_emit_load_imm(t->mc, cbytes.size == 8u, AA_TMP0, (i64)v); + aa_move(t, dst, tmp); + } else { + aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), (i64)v); + } +} + +static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { + AANativeTarget* a = aa_of(t); + u32 rd = loc_reg(dst); + aa_materialize_frame_index(a, &addr, rd); + switch ((NativeAddrBaseKind)addr.base_kind) { + case NATIVE_ADDR_BASE_FRAME: { + AANativeSlot* s = aa_slot(a, addr.base.frame); + aa_emit_add_imm(a, rd, AA_FP, -(i32)s->off + addr.offset); + aa_apply_index(a, rd, &addr); + return; + } + case NATIVE_ADDR_BASE_FRAME_VALUE: { + NativeAddr load; + MemAccess mem; + memset(&load, 0, sizeof load); + load.base_kind = NATIVE_ADDR_BASE_FRAME; + load.base.frame = addr.base.frame; + load.base_type = addr.base_type ? 
addr.base_type + : builtin_id(CFREE_CG_BUILTIN_I64); + memset(&mem, 0, sizeof mem); + mem.type = load.base_type; + mem.size = 8; + mem.align = 8; + aa_emit_mem(a, 1, dst, load, mem); + if (addr.offset) aa_emit_add_imm(a, rd, rd, addr.offset); + aa_apply_index(a, rd, &addr); + return; + } + case NATIVE_ADDR_BASE_REG: + aa_emit_add_imm(a, rd, addr.base.reg, addr.offset); + aa_apply_index(a, rd, &addr); + return; + case NATIVE_ADDR_BASE_GLOBAL: { + i64 addend = addr.base.global.addend + (i64)addr.offset; + u32 pos = t->mc->pos(t->mc); + if (aa_use_got_for_sym(t, addr.base.global.sym)) { + aa_emit32(t->mc, aa64_adrp(rd, 0, 0)); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, + R_AARCH64_ADR_GOT_PAGE, addr.base.global.sym, 0, + 0, 0); + pos = t->mc->pos(t->mc); + aa_emit32(t->mc, aa_ldr_uimm(3, rd, rd, 0)); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, + R_AARCH64_LD64_GOT_LO12_NC, + addr.base.global.sym, 0, 0, 0); + if (addend) aa_emit_add_i64(a, rd, rd, addend); + aa_apply_index(a, rd, &addr); + return; + } + aa_emit32(t->mc, aa64_adrp(rd, 0, 0)); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, + R_AARCH64_ADR_PREL_PG_HI21, addr.base.global.sym, + addend, 0, 0); + pos = t->mc->pos(t->mc); + aa_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0)); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, + R_AARCH64_ADD_ABS_LO12_NC, addr.base.global.sym, + addend, 0, 0); + aa_apply_index(a, rd, &addr); + return; + } + default: + aa_panic(a, "unsupported load_addr"); + } +} + +static void aa_load_native(NativeTarget* t, NativeLoc dst, NativeAddr addr, + MemAccess mem) { + aa_emit_mem(aa_of(t), 1, dst, addr, mem); +} + +static void aa_store_native(NativeTarget* t, NativeAddr addr, NativeLoc src, + MemAccess mem) { + aa_emit_mem(aa_of(t), 0, src, addr, mem); +} + +static void aa_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym, + i64 addend) { + AANativeTarget* a = aa_of(t); + MCEmitter* mc = t->mc; + u32 rd = loc_reg(dst); + u32 pos; + if 
(obj_format_tls_via_descriptor(t->c)) { + aa_emit32(mc, aa64_adrp(0, 0, 0)); + pos = mc->pos(mc) - 4u; + mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGE21, + sym, 0, 0, 0); + aa_emit32(mc, aa_ldr_uimm(3, 0, 0, 0)); + pos = mc->pos(mc) - 4u; + mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, + sym, 0, 0, 0); + aa_emit32(mc, aa_ldr_uimm(3, AA_TMP0, 0, 0)); + aa_emit32(mc, aa64_blr(AA_TMP0)); + if (addend) aa_emit_add_i64(a, 0, 0, addend); + if (rd != 0) aa_emit32(mc, aa64_mov_reg(1, rd, 0)); + return; + } + if (t->c->target.obj != CFREE_OBJ_ELF) { + aa_panic(a, "unsupported TLS object format"); + } + aa_emit32(mc, aa_mrs_tpidr_el0(rd)); + pos = mc->pos(mc); + aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1)); + mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, + sym, addend, 0, 0); + pos = mc->pos(mc); + aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0)); + mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, + sym, addend, 0, 0); +} + +static NativeLoc aa_tmp_loc(CfreeCgTypeId type, Reg reg) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_REG; + loc.cls = NATIVE_REG_INT; + loc.type = type; + loc.v.reg = reg; + return loc; +} + +static NativeAddr aa_addr_plus(NativeAddr addr, u32 off) { + addr.offset += (i32)off; + return addr; +} + +static void aa_copy_bytes_dir(NativeTarget* t, NativeAddr dst, NativeAddr src, + AggregateAccess access, int backward) { + CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); + CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32); + CfreeCgTypeId i16 = builtin_id(CFREE_CG_BUILTIN_I16); + CfreeCgTypeId i8 = builtin_id(CFREE_CG_BUILTIN_I8); + NativeLoc tmp = aa_tmp_loc(i64, AA_TMP0); + u32 off = 0; + while (off < access.size) { + u32 rem = access.size - off; + u32 pos; + MemAccess mem = access.mem; + if (rem >= 8u) { + mem.type = i64; + mem.size = 8u; + } else if (rem >= 4u) { + mem.type = i32; + mem.size = 4u; + tmp.type = i32; 
+ } else if (rem >= 2u) { + mem.type = i16; + mem.size = 2u; + tmp.type = i16; + } else { + mem.type = i8; + mem.size = 1u; + tmp.type = i8; + } + mem.align = mem.size; + pos = backward ? access.size - off - mem.size : off; + aa_load_native(t, tmp, aa_addr_plus(src, pos), mem); + aa_store_native(t, aa_addr_plus(dst, pos), tmp, mem); + off += mem.size; + tmp.type = i64; + } +} + +static void aa_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, + AggregateAccess access) { + aa_copy_bytes_dir(t, dst, src, access, 0); +} + +static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, + AggregateAccess access) { + CfreeCgTypeId i8 = builtin_id(CFREE_CG_BUILTIN_I8); + NativeLoc byte = byte_value; + MemAccess mem = access.mem; + mem.type = i8; + mem.size = 1u; + mem.align = 1u; + byte.type = i8; + for (u32 off = 0; off < access.size; ++off) + aa_store_native(t, aa_addr_plus(dst, off), byte, mem); +} + +static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs, + NativeLoc rhs) { + u32 sf = loc_is_64(t, dst) ? 
1u : 0u; + u32 rd = loc_reg(dst), rn = loc_reg(lhs), rm = loc_reg(rhs); + if (loc_is_fp(dst)) { + u32 d = type_size32(t, dst.type) == 8u; + switch (op) { + case BO_FADD: + aa_emit32(t->mc, aa_fp_bin(0x002800u, d, rd, rn, rm)); + return; + case BO_FSUB: + aa_emit32(t->mc, aa_fp_bin(0x003800u, d, rd, rn, rm)); + return; + case BO_FMUL: + aa_emit32(t->mc, aa_fp_bin(0x000800u, d, rd, rn, rm)); + return; + case BO_FDIV: + aa_emit32(t->mc, aa_fp_bin(0x001800u, d, rd, rn, rm)); + return; + default: + aa_panic(aa_of(t), "unsupported floating binary op"); + } + } + switch (op) { + case BO_IADD: + aa_emit32(t->mc, aa64_add(sf, rd, rn, rm)); + return; + case BO_ISUB: + aa_emit32(t->mc, aa64_sub(sf, rd, rn, rm)); + return; + case BO_IMUL: + aa_emit32(t->mc, aa64_mul(sf, rd, rn, rm)); + return; + case BO_SDIV: + aa_emit32(t->mc, aa64_sdiv(sf, rd, rn, rm)); + return; + case BO_UDIV: + aa_emit32(t->mc, aa64_udiv(sf, rd, rn, rm)); + return; + case BO_SREM: + aa_emit32(t->mc, aa64_sdiv(sf, AA_TMP0, rn, rm)); + aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm)); + aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0)); + return; + case BO_UREM: + aa_emit32(t->mc, aa64_udiv(sf, AA_TMP0, rn, rm)); + aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm)); + aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0)); + return; + case BO_AND: + aa_emit32(t->mc, aa64_and(sf, rd, rn, rm)); + return; + case BO_OR: + aa_emit32(t->mc, aa64_orr(sf, rd, rn, rm)); + return; + case BO_XOR: + aa_emit32(t->mc, aa64_eor(sf, rd, rn, rm)); + return; + case BO_SHL: + aa_emit32(t->mc, aa64_lslv(sf, rd, rn, rm)); + return; + case BO_SHR_U: + aa_emit32(t->mc, aa64_lsrv(sf, rd, rn, rm)); + return; + case BO_SHR_S: + aa_emit32(t->mc, aa64_asrv(sf, rd, rn, rm)); + return; + default: + aa_panic(aa_of(t), "unsupported binary op"); + } +} + +static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { + u32 sf = loc_is_64(t, dst) ? 
1u : 0u; + if (loc_is_fp(dst)) { + switch (op) { + case UO_FNEG: + case UO_NEG: + aa_emit32(t->mc, aa_fneg(type_size32(t, dst.type) == 8u, loc_reg(dst), + loc_reg(src))); + return; + default: + aa_panic(aa_of(t), "unsupported floating unary op"); + } + } + switch (op) { + case UO_NEG: + aa_emit32(t->mc, aa64_neg(sf, loc_reg(dst), loc_reg(src))); + return; + case UO_BNOT: + aa_emit32(t->mc, aa64_mvn(sf, loc_reg(dst), loc_reg(src))); + return; + case UO_NOT: + aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(src), 0, 0)); + aa_emit32(t->mc, aa_cset(sf, loc_reg(dst), 0x0u)); + return; + default: + aa_panic(aa_of(t), "unsupported unary op"); + } +} + +static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs, + NativeLoc rhs) { + if (loc_is_fp(lhs)) { + aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), + loc_reg(rhs))); + } else { + u32 sf = loc_is_64(t, lhs) ? 1u : 0u; + aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs))); + } + aa_emit32(t->mc, aa_cset(loc_is_64(t, dst), loc_reg(dst), cmp_cond(op))); +} + +static void aa_convert(NativeTarget* t, ConvKind op, NativeLoc dst, + NativeLoc src) { + int dst_fp = loc_is_fp(dst); + int src_fp = loc_is_fp(src); + switch (op) { + case CV_TRUNC: + case CV_BITCAST: + aa_move(t, dst, src); + return; + case CV_ZEXT: { + u32 src_bits = type_size32(t, src.type) * 8u; + u32 dst_bits = type_size32(t, dst.type) * 8u; + u32 sf = dst_bits > 32u; + if (src_bits >= dst_bits) { + aa_move(t, dst, src); + } else if (src_bits >= 32u) { + aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dst), loc_reg(src))); + } else { + aa_emit32(t->mc, + aa_ubfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 1u)); + } + return; + } + case CV_SEXT: { + u32 src_bits = type_size32(t, src.type) * 8u; + u32 dst_bits = type_size32(t, dst.type) * 8u; + u32 sf = dst_bits > 32u; + if (src_bits >= dst_bits) { + aa_move(t, dst, src); + } else { + aa_emit32(t->mc, + aa_sbfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 
1u)); + } + return; + } + case CV_ITOF_S: + aa_emit32(t->mc, aa_scvtf(type_size32(t, dst.type) == 8u, + loc_is_64(t, src), loc_reg(dst), + loc_reg(src))); + return; + case CV_ITOF_U: + aa_emit32(t->mc, aa_ucvtf(type_size32(t, dst.type) == 8u, + loc_is_64(t, src), loc_reg(dst), + loc_reg(src))); + return; + case CV_FTOI_S: + aa_emit32(t->mc, aa_fcvtzs(loc_is_64(t, dst), + type_size32(t, src.type) == 8u, loc_reg(dst), + loc_reg(src))); + return; + case CV_FTOI_U: + aa_emit32(t->mc, aa_fcvtzu(loc_is_64(t, dst), + type_size32(t, src.type) == 8u, loc_reg(dst), + loc_reg(src))); + return; + case CV_FEXT: + if (dst_fp && src_fp) aa_emit32(t->mc, aa_fcvt_d_s(loc_reg(dst), loc_reg(src))); + else aa_move(t, dst, src); + return; + case CV_FTRUNC: + if (dst_fp && src_fp) aa_emit32(t->mc, aa_fcvt_s_d(loc_reg(dst), loc_reg(src))); + else aa_move(t, dst, src); + return; + default: + aa_panic(aa_of(t), "unsupported conversion"); + } +} + +static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, + u32 align) { + AANativeTarget* a = aa_of(t); + u32 use_align = align < 16u ? 16u : align; + if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2"); + if (a->nalloca_patches == a->alloca_patches_cap) { + u32 cap = a->alloca_patches_cap ? 
a->alloca_patches_cap * 2u : 8u; + AAAllocaPatch* nb = arena_zarray(t->c->tu, AAAllocaPatch, cap); + if (a->alloca_patches) + memcpy(nb, a->alloca_patches, sizeof(*nb) * a->nalloca_patches); + a->alloca_patches = nb; + a->alloca_patches_cap = cap; + } + aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u)); + aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align); + aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0)); + aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0)); + aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0)); + a->alloca_patches[a->nalloca_patches].pos = t->mc->pos(t->mc); + a->alloca_patches[a->nalloca_patches].dst_reg = loc_reg(dst); + a->nalloca_patches++; + aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0)); +} + +static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size) { + MemAccess mem; + memset(&mem, 0, sizeof mem); + mem.type = type; + mem.size = size ? 
size : type_size32(t, type); + mem.align = type_align32(t, type); + if (mem.align > mem.size && mem.size) mem.align = mem.size; + return mem; +} + +static NativeLoc aa_reg_loc(CfreeCgTypeId type, NativeAllocClass cls, Reg reg) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_REG; + loc.cls = (u8)cls; + loc.type = type; + loc.v.reg = reg; + return loc; +} + +static NativeLoc aa_stack_loc(CfreeCgTypeId type, NativeFrameSlot slot, + i32 offset) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_STACK; + loc.cls = NATIVE_REG_INT; + loc.type = type; + loc.v.stack.slot = slot; + loc.v.stack.offset = offset; + return loc; +} + +static NativeAddr aa_loc_addr(AANativeTarget* a, NativeLoc loc, u32 offset) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + switch ((NativeLocKind)loc.kind) { + case NATIVE_LOC_FRAME: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = loc.v.frame; + addr.base_type = loc.type; + addr.offset = (i32)offset; + return addr; + case NATIVE_LOC_STACK: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = loc.v.stack.slot; + addr.base_type = loc.type; + addr.offset = loc.v.stack.offset + (i32)offset; + return addr; + case NATIVE_LOC_ADDR: + addr = loc.v.addr; + addr.offset += (i32)offset; + return addr; + default: + aa_panic(a, "location is not addressable"); + } + return addr; +} + +static void aa_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { + AANativeTarget* a = aa_of(t); + NativeAddr addr = aa_loc_addr(a, src, 0); + aa_load_addr(t, dst, addr); +} + +static void aa_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src, + u32 offset, u32 size) { + AANativeTarget* a = aa_of(t); + MemAccess mem = aa_mem_for_type(t, dst.type, size); + if (src.kind == NATIVE_LOC_REG) { + aa_move(t, dst, src); + return; + } + if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK || + src.kind == NATIVE_LOC_ADDR) { + NativeAddr addr = aa_loc_addr(a, src, offset); + 
addr.base_type = dst.type; + aa_emit_mem(a, 1, dst, addr, mem); + return; + } + if (src.kind == NATIVE_LOC_IMM) { + aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), src.v.imm); + return; + } + aa_panic(a, "unsupported call argument source"); +} + +static void aa_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src, + u32 offset, u32 size) { + AANativeTarget* a = aa_of(t); + MemAccess mem = aa_mem_for_type(t, src.type, size); + if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK || + dst.kind == NATIVE_LOC_ADDR) { + NativeAddr addr = aa_loc_addr(a, dst, offset); + addr.base_type = src.type; + aa_emit_mem(a, 0, src, addr, mem); + return; + } + if (dst.kind == NATIVE_LOC_REG) { + aa_move(t, dst, src); + return; + } + aa_panic(a, "unsupported call return destination"); +} + +static void aa_store_outgoing_part(NativeTarget* t, int tail_call, + u32 stack_off, NativeLoc src, u32 size) { + NativeAddr addr; + MemAccess mem = aa_mem_for_type(t, src.type, size); + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.base.reg = tail_call ? AA_FP : AA_SP; + addr.base_type = src.type; + addr.offset = (i32)stack_off; + aa_emit_mem(aa_of(t), 0, src, addr, mem); +} + +static const ABIArgInfo* aa_param_abi(NativeTarget* t, + const ABIFuncInfo* abi, + const NativeCallDesc* desc, u32 i, + ABIArgInfo* scratch) { + if (abi && i < abi->nparams) return &abi->params[i]; + memset(scratch, 0, sizeof *scratch); + scratch->kind = ABI_ARG_DIRECT; + scratch->flags = ABI_AF_NONE; + scratch->nparts = 1; + scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); + ((ABIArgPart*)scratch->parts)[0].cls = + cg_type_is_float(t->c, desc->args[i].type) ? 
ABI_CLASS_FP : ABI_CLASS_INT; + ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; + ((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].src_offset = 0; + return scratch; +} + +static u32 aa_class_stack_size(const ABIArgInfo* ai) { + if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; + if (ai->kind == ABI_ARG_INDIRECT) return 8u; + return align_up_u32(ai->nparts ? ai->nparts * 8u : 8u, 8u); +} + +static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); + u32 next_int = 0, next_fp = 0, stack = 0; + for (u32 i = 0; i < desc->nargs; ++i) { + ABIArgInfo tmp; + const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp); + int force_stack = abi && abi->variadic && abi->vararg_on_stack && + i >= abi->nparams; + if (ai->kind == ABI_ARG_IGNORE) continue; + if (force_stack) { + stack += aa_class_stack_size(ai); + continue; + } + if (ai->kind == ABI_ARG_INDIRECT) { + if (next_int < 8u) + next_int++; + else + stack += 8u; + continue; + } + for (u32 p = 0; p < ai->nparts; ++p) { + const ABIArgPart* part = &ai->parts[p]; + if (part->cls == ABI_CLASS_FP) { + if (next_fp < 8u) + next_fp++; + else + stack += 8u; + } else { + if (next_int < 8u) + next_int++; + else + stack += 8u; + } + } + } + return align_up_u32(stack, 16u); +} + +static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, + NativeCallPlan* plan) { + NativeCallPlanRet* rets; + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); + memset(plan, 0, sizeof *plan); + rets = desc->nresults ? 
arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL; + plan->callee = desc->callee; + plan->rets = rets; + plan->flags = desc->flags; + plan->has_sret = abi && abi->has_sret; + plan->is_variadic = abi && abi->variadic; + plan->stack_arg_size = aa_call_stack_size(t, desc); + if (plan->stack_arg_size > aa_of(t)->max_outgoing) + aa_of(t)->max_outgoing = plan->stack_arg_size; + { + u32 next_int = 0, next_fp = 0, stack = 0; + int tail_call = (desc->flags & CG_CALL_TAIL) != 0; + if (abi && abi->has_sret) { + NativeLoc x8 = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), + NATIVE_REG_INT, 8u); + if (desc->flags & CG_CALL_TAIL) { + AANativeTarget* a = aa_of(t); + NativeLoc saved = + aa_stack_loc(x8.type, a->sret_ptr_slot, 0); + aa_load_part(t, x8, saved, 0, 8); + } else if (desc->nresults) { + aa_addr_of_loc(t, x8, desc->results[0]); + } + } + for (u32 i = 0; i < desc->nargs; ++i) { + ABIArgInfo tmp; + const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp); + int force_stack = abi && abi->variadic && abi->vararg_on_stack && + i >= abi->nparams; + if (ai->kind == ABI_ARG_IGNORE) continue; + if (force_stack) { + NativeLoc tmpreg = + aa_reg_loc(desc->args[i].type, NATIVE_REG_INT, AA_TMP0); + u32 n = aa_class_stack_size(ai); + u32 off = 0; + while (off < n) { + aa_load_part(t, tmpreg, desc->args[i], off, 8); + aa_store_outgoing_part(t, tail_call, stack + off, tmpreg, 8); + off += 8; + } + stack += n; + continue; + } + if (ai->kind == ABI_ARG_INDIRECT) { + NativeLoc ptr; + if (next_int < 8u) { + ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + next_int++); + aa_addr_of_loc(t, ptr, desc->args[i]); + } else { + ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + AA_TMP0); + aa_addr_of_loc(t, ptr, desc->args[i]); + aa_store_outgoing_part(t, tail_call, stack, ptr, 8); + stack += 8u; + } + continue; + } + for (u32 p = 0; p < ai->nparts; ++p) { + const ABIArgPart* part = &ai->parts[p]; + NativeAllocClass cls = + part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; + if (cls == NATIVE_REG_FP && next_fp < 8u) { + NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_fp++); + aa_load_part(t, dst, desc->args[i], part->src_offset, part->size); + } else if (cls == NATIVE_REG_INT && next_int < 8u) { + NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_int++); + aa_load_part(t, dst, desc->args[i], part->src_offset, part->size); + } else { + NativeLoc tmpreg = + aa_reg_loc(desc->args[i].type, cls, cls == NATIVE_REG_FP ? 16u + : AA_TMP0); + aa_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); + aa_store_outgoing_part(t, tail_call, stack, tmpreg, part->size); + stack += 8u; + } + } + } + } + if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { + u32 nr = 0, ni = 0, nf = 0; + for (u32 p = 0; p < abi->ret.nparts; ++p) { + const ABIArgPart* part = &abi->ret.parts[p]; + NativeAllocClass cls = + part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; + rets[nr].src = aa_reg_loc(desc->results[0].type, cls, + cls == NATIVE_REG_FP ? 
nf++ : ni++); + rets[nr].dst = desc->results[0]; + if (rets[nr].dst.kind == NATIVE_LOC_FRAME) + rets[nr].dst = aa_stack_loc(desc->results[0].type, + desc->results[0].v.frame, + (i32)part->src_offset); + else if (rets[nr].dst.kind == NATIVE_LOC_STACK) + rets[nr].dst.v.stack.offset += (i32)part->src_offset; + else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) + rets[nr].dst.v.addr.offset += (i32)part->src_offset; + rets[nr].mem = aa_mem_for_type(t, desc->results[0].type, part->size); + nr++; + } + plan->nrets = nr; + } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { + plan->nrets = 0; + } else if (!abi && desc->nresults) { + rets[0].src = aa_reg_loc(desc->results[0].type, NATIVE_REG_INT, 0); + rets[0].dst = desc->results[0]; + rets[0].mem = aa_mem_for_type(t, desc->results[0].type, 0); + plan->nrets = 1; + } +} + +static void aa_ret(NativeTarget* t); + +static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) { + AANativeTarget* a = aa_of(t); + if (a->ntail_sites == a->tail_sites_cap) { + u32 cap = a->tail_sites_cap ? 
a->tail_sites_cap * 2u : 8u; + AATailSite* nb = arena_zarray(t->c->tu, AATailSite, cap); + if (a->tail_sites) memcpy(nb, a->tail_sites, sizeof(*nb) * a->ntail_sites); + a->tail_sites = nb; + a->tail_sites_cap = cap; + } + a->tail_sites[a->ntail_sites].pos = t->mc->pos(t->mc); + a->tail_sites[a->ntail_sites].callee = callee; + a->ntail_sites++; + for (u32 i = 0; i < AA_TAIL_WORDS; ++i) aa_emit32(t->mc, 0xd503201fu); + if (callee.kind == NATIVE_LOC_GLOBAL) { + t->mc->emit_reloc_at(t->mc, t->mc->section_id, + a->tail_sites[a->ntail_sites - 1u].pos + + (AA_TAIL_WORDS - 1u) * 4u, + R_AARCH64_JUMP26, callee.v.global.sym, + callee.v.global.addend, 0, 0); + } +} + +static void aa_emit_call(NativeTarget* t, const NativeCallPlan* plan) { + int is_tail = (plan->flags & CG_CALL_TAIL) != 0; + if (is_tail) { + if (plan->callee.kind != NATIVE_LOC_GLOBAL && + plan->callee.kind != NATIVE_LOC_REG) + aa_panic(aa_of(t), "unsupported tail target"); + aa_emit_tail_site(t, plan->callee); + return; + } + if (plan->callee.kind == NATIVE_LOC_GLOBAL) { + aa_emit32(t->mc, aa64_bl(0)); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, t->mc->pos(t->mc) - 4u, + R_AARCH64_CALL26, plan->callee.v.global.sym, + plan->callee.v.global.addend, 0, 0); + return; + } + if (plan->callee.kind == NATIVE_LOC_REG) { + aa_emit32(t->mc, aa64_blr(loc_reg(plan->callee))); + return; + } + aa_panic(aa_of(t), "unsupported call target"); +} + +static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd, + const NativeLoc* values, u32 nvalues, + NativeCallPlanRet** out_rets, u32* out_nrets) { + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); + NativeCallPlanRet* rets = NULL; + u32 nr = 0; + if (nvalues > 1u) aa_panic(aa_of(t), "multiple returns unsupported"); + if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4); + if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) { + AANativeTarget* a = aa_of(t); + NativeLoc dstp = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), + NATIVE_REG_INT, 
AA_TMP1); + NativeLoc saved = aa_stack_loc(dstp.type, a->sret_ptr_slot, 0); + NativeAddr dst_addr, src_addr; + AggregateAccess access; + aa_load_part(t, dstp, saved, 0, 8); + memset(&dst_addr, 0, sizeof dst_addr); + dst_addr.base_kind = NATIVE_ADDR_BASE_REG; + dst_addr.base.reg = AA_TMP1; + dst_addr.base_type = values[0].type; + src_addr = aa_loc_addr(a, values[0], 0); + src_addr.base_type = values[0].type; + memset(&access, 0, sizeof access); + access.type = values[0].type; + access.size = (u32)cg_type_size(t->c, values[0].type); + access.align = type_align32(t, values[0].type); + aa_copy_bytes(t, dst_addr, src_addr, access); + *out_rets = NULL; + *out_nrets = 0; + return; + } + if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) { + u32 ni = 0, nf = 0; + for (u32 p = 0; p < abi->ret.nparts; ++p) { + const ABIArgPart* part = &abi->ret.parts[p]; + NativeAllocClass cls = + part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; + rets[nr].src = values[0]; + if (rets[nr].src.kind == NATIVE_LOC_FRAME) + rets[nr].src = aa_stack_loc(values[0].type, values[0].v.frame, + (i32)part->src_offset); + else if (rets[nr].src.kind == NATIVE_LOC_STACK) + rets[nr].src.v.stack.offset += (i32)part->src_offset; + else if (rets[nr].src.kind == NATIVE_LOC_ADDR) + rets[nr].src.v.addr.offset += (i32)part->src_offset; + rets[nr].dst = aa_reg_loc(values[0].type, cls, + cls == NATIVE_REG_FP ? nf++ : ni++); + rets[nr].mem = aa_mem_for_type(t, values[0].type, part->size); + nr++; + } + } else if (nvalues) { + rets[0].src = values[0]; + rets[0].dst = aa_reg_loc(values[0].type, NATIVE_REG_INT, 0); + rets[0].mem = aa_mem_for_type(t, values[0].type, 0); + nr = 1; + } + *out_rets = rets; + *out_nrets = nr; +} + +static void aa_ret(NativeTarget* t) { + AANativeTarget* a = aa_of(t); + aa_jump(t, a->epilogue_label); +} + +static u32 aa_bit_storage_reg_bits(u32 storage_bytes) { + return storage_bytes == 8u ? 
64u : 32u; +} + +static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { + u32 bits = sf ? 64u : 32u; + if (!sh) { + if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); + return; + } + aa_emit32(t->mc, aa_ubfm(sf, rd, rn, bits - sh, bits - 1u - sh)); +} + +static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { + if (!sh) { + if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); + return; + } + aa_emit32(t->mc, aa_ubfm(sf, rd, rn, sh, sf ? 63u : 31u)); +} + +static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { + if (!sh) { + if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); + return; + } + aa_emit32(t->mc, aa_sbfm(sf, rd, rn, sh, sf ? 63u : 31u)); +} + +static void aa_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, + BitFieldAccess bf) { + u32 storage = bf.storage.size ? bf.storage.size : 4u; + u32 bits = aa_bit_storage_reg_bits(storage); + u32 width = bf.bit_width ? bf.bit_width : 1u; + u32 sf = bits == 64u; + NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset); + NativeLoc tmp = dst; + tmp.type = bf.storage.type ? bf.storage.type : dst.type; + aa_load_native(t, tmp, saddr, bf.storage); + aa_lsl_imm(t, sf, loc_reg(dst), loc_reg(dst), + bits - (u32)bf.bit_offset - width); + if (bf.signed_) + aa_asr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width); + else + aa_lsr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width); +} + +static void aa_bitfield_store(NativeTarget* t, NativeAddr addr, NativeLoc src, + BitFieldAccess bf) { + u32 storage = bf.storage.size ? bf.storage.size : 4u; + u32 bits = aa_bit_storage_reg_bits(storage); + u32 width = bf.bit_width ? bf.bit_width : 1u; + u32 sf = bits == 64u; + u64 ones = width >= 64u ? ~(u64)0 : ((1ull << width) - 1ull); + u64 field_mask = ones << bf.bit_offset; + NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset); + NativeLoc word = aa_tmp_loc(bf.storage.type ? 
bf.storage.type : src.type, + AA_TMP0); + aa_load_native(t, word, saddr, bf.storage); + aa_emit_load_imm(t->mc, sf, AA_TMP1, (i64)~field_mask); + aa_emit32(t->mc, aa64_and(sf, AA_TMP0, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa_ubfm(sf, AA_TMP1, loc_reg(src), 0, width - 1u)); + aa_lsl_imm(t, sf, AA_TMP1, AA_TMP1, bf.bit_offset); + aa_emit32(t->mc, aa64_orr(sf, AA_TMP0, AA_TMP0, AA_TMP1)); + aa_store_native(t, saddr, word, bf.storage); +} + +static void aa_trap(NativeTarget* t); + +static int aa_order_acquire(MemOrder order) { + return order == MO_CONSUME || order == MO_ACQUIRE || order == MO_ACQ_REL || + order == MO_SEQ_CST; +} + +static int aa_order_release(MemOrder order) { + return order == MO_RELEASE || order == MO_ACQ_REL || order == MO_SEQ_CST; +} + +static NativeLoc aa_i64_reg_loc(u32 reg) { + return aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, reg); +} + +static void aa_atomic_addr_reg(NativeTarget* t, NativeAddr addr, u32 reg) { + NativeLoc dst = aa_i64_reg_loc(reg); + t->load_addr(t, dst, addr); +} + +static u32 aa_saved_tmp_pick(u32 a, u32 b, u32 c) { + static const u32 regs[] = {11u, 12u, 13u, 14u, 15u}; + for (u32 i = 0; i < sizeof regs / sizeof regs[0]; ++i) { + if (regs[i] != a && regs[i] != b && regs[i] != c) return regs[i]; + } + return 15u; +} + +static void aa_saved_tmp_spill(AANativeTarget* a, u32 reg) { + NativeFrameSlotDesc sd; + NativeAddr addr; + MemAccess mem; + memset(&sd, 0, sizeof sd); + if (a->saved_tmp_slot == NATIVE_FRAME_SLOT_NONE) { + sd.type = builtin_id(CFREE_CG_BUILTIN_I64); + sd.size = 8; + sd.align = 8; + sd.kind = NATIVE_FRAME_SLOT_SPILL; + a->saved_tmp_slot = a->base.frame_slot(&a->base, &sd); + } + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = a->saved_tmp_slot; + addr.base_type = builtin_id(CFREE_CG_BUILTIN_I64); + mem = aa_mem_for_type(&a->base, addr.base_type, 8); + aa_store_native(&a->base, addr, aa_i64_reg_loc(reg), mem); +} + +static void 
aa_saved_tmp_restore(AANativeTarget* a, u32 reg) { + NativeAddr addr; + MemAccess mem; + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = a->saved_tmp_slot; + addr.base_type = builtin_id(CFREE_CG_BUILTIN_I64); + mem = aa_mem_for_type(&a->base, addr.base_type, 8); + aa_load_native(&a->base, aa_i64_reg_loc(reg), addr, mem); +} + +static void aa_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, + MemAccess mem, MemOrder order) { + u32 base = AA_TMP0; + u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type)); + aa_atomic_addr_reg(t, addr, base); + aa_emit32(t->mc, aa_order_acquire(order) ? aa_ldar(sz, loc_reg(dst), base) + : aa_ldr_uimm(sz, loc_reg(dst), + base, 0)); + if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +static void aa_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, + MemAccess mem, MemOrder order) { + u32 base = AA_TMP0; + u32 sz = size_idx(mem.size ? mem.size : type_size32(t, src.type)); + if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); + aa_atomic_addr_reg(t, addr, base); + aa_emit32(t->mc, aa_order_release(order) ? aa_stlr(sz, loc_reg(src), base) + : aa_str_uimm(sz, loc_reg(src), + base, 0)); + if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +static void aa_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst, + NativeAddr addr, NativeLoc val, MemAccess mem, + MemOrder order) { + AANativeTarget* a = aa_of(t); + u32 base = AA_TMP0; + u32 next_reg = AA_TMP1; + u32 status = aa_saved_tmp_pick(loc_reg(dst), loc_reg(val), base); + NativeLoc next = aa_tmp_loc(dst.type, next_reg); + MCLabel retry = t->mc->label_new(t->mc); + u32 sz = size_idx(mem.size ? 
mem.size : type_size32(t, dst.type)); + if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); + aa_saved_tmp_spill(a, status); + aa_atomic_addr_reg(t, addr, base); + t->mc->label_place(t->mc, retry); + aa_emit32(t->mc, aa_order_acquire(order) ? aa_ldaxr(sz, loc_reg(dst), base) + : aa_ldxr(sz, loc_reg(dst), base)); + switch (op) { + case AO_XCHG: + aa_move(t, next, val); + break; + case AO_ADD: + aa_binop(t, BO_IADD, next, dst, val); + break; + case AO_SUB: + aa_binop(t, BO_ISUB, next, dst, val); + break; + case AO_AND: + aa_binop(t, BO_AND, next, dst, val); + break; + case AO_OR: + aa_binop(t, BO_OR, next, dst, val); + break; + case AO_XOR: + aa_binop(t, BO_XOR, next, dst, val); + break; + case AO_NAND: + aa_binop(t, BO_AND, next, dst, val); + aa_unop(t, UO_BNOT, next, next); + break; + default: + aa_panic(a, "unsupported atomic rmw op"); + } + aa_emit32(t->mc, aa_order_release(order) ? aa_stlxr(sz, status, next_reg, base) + : aa_stxr(sz, status, next_reg, + base)); + aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0)); + t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0); + aa_saved_tmp_restore(a, status); + if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +static void aa_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, + NativeAddr addr, NativeLoc expected, + NativeLoc desired, MemAccess mem, MemOrder success, + MemOrder failure) { + u32 base = AA_TMP0; + u32 status = AA_TMP1; + u32 sz = size_idx(mem.size ? 
mem.size : type_size32(t, prior.type)); + u32 sf = sz == 3u; + int acquire = aa_order_acquire(success) || aa_order_acquire(failure); + int release = aa_order_release(success); + MCLabel retry = t->mc->label_new(t->mc); + MCLabel fail = t->mc->label_new(t->mc); + MCLabel done = t->mc->label_new(t->mc); + if (success == MO_SEQ_CST || failure == MO_SEQ_CST) + aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); + aa_atomic_addr_reg(t, addr, base); + t->mc->label_place(t->mc, retry); + aa_emit32(t->mc, acquire ? aa_ldaxr(sz, loc_reg(prior), base) + : aa_ldxr(sz, loc_reg(prior), base)); + aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(prior), loc_reg(expected))); + aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_NE)})); + t->mc->emit_label_ref(t->mc, fail, R_AARCH64_CONDBR19, 4, 0); + aa_emit32(t->mc, release ? aa_stlxr(sz, status, loc_reg(desired), base) + : aa_stxr(sz, status, loc_reg(desired), base)); + aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0)); + t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0); + aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 1); + aa_jump(t, done); + t->mc->label_place(t->mc, fail); + aa_emit32(t->mc, aa64_clrex(AA64_BARRIER_OPT_SY)); + aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 0); + t->mc->label_place(t->mc, done); + if (success == MO_SEQ_CST || failure == MO_SEQ_CST) + aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +static void aa_fence(NativeTarget* t, MemOrder order) { + if (order != MO_RELAXED) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +static void aa_intrinsic(NativeTarget* t, IntrinKind kind, + const NativeLoc* dsts, u32 ndst, + const NativeLoc* args, u32 narg) { + AggregateAccess access; + NativeAddr dst_addr; + NativeAddr src_addr; + memset(&access, 0, sizeof access); + memset(&dst_addr, 0, sizeof dst_addr); + memset(&src_addr, 0, sizeof src_addr); + switch (kind) { + case INTRIN_NONE: + if (ndst == 1u && narg == 3u && loc_is_fp(dsts[0])) { + u32 d = 
type_size32(t, dsts[0].type) == 8u; + aa_emit32(t->mc, aa_fp_bin(0x000800u, d, loc_reg(dsts[0]), + loc_reg(args[0]), loc_reg(args[1]))); + aa_emit32(t->mc, aa_fp_bin(0x002800u, d, loc_reg(dsts[0]), + loc_reg(dsts[0]), loc_reg(args[2]))); + return; + } + break; + case INTRIN_CLZ: + if (ndst == 1u && narg == 1u) { + aa_emit32(t->mc, aa_clz(loc_is_64(t, args[0]), loc_reg(dsts[0]), + loc_reg(args[0]))); + return; + } + break; + case INTRIN_CTZ: +/* Count trailing zeros as bit-reverse followed by count-leading-zeros. */ if (ndst == 1u && narg == 1u) { + u32 sf = loc_is_64(t, args[0]); + aa_emit32(t->mc, aa_rbit(sf, loc_reg(dsts[0]), loc_reg(args[0]))); + aa_emit32(t->mc, aa_clz(sf, loc_reg(dsts[0]), loc_reg(dsts[0]))); + return; + } + break; + case INTRIN_POPCOUNT: +/* Bit-at-a-time loop: add the low bit of a working copy (AA_TMP0) to rd and shift the copy right by one until it is zero. */ if (ndst == 1u && narg == 1u) { + u32 sf = loc_is_64(t, args[0]); + u32 rd = loc_reg(dsts[0]); + u32 rn = loc_reg(args[0]); + MCLabel loop = t->mc->label_new(t->mc); + MCLabel done = t->mc->label_new(t->mc); +/* Copy the source to the scratch register before clearing rd, so the count is still correct when rd and rn are the same register. */ aa_emit32(t->mc, aa64_mov_reg(sf, AA_TMP0, rn)); + aa_emit_load_imm(t->mc, sf, rd, 0); + t->mc->label_place(t->mc, loop); + aa_emit32(t->mc, aa64_cbz(sf, AA_TMP0, 0)); + t->mc->emit_label_ref(t->mc, done, R_AARCH64_CONDBR19, 4, 0); + aa_emit_load_imm(t->mc, sf, AA_TMP1, 1); + aa_emit32(t->mc, aa64_and(sf, AA_TMP1, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa64_add(sf, rd, rd, AA_TMP1)); + aa_emit_load_imm(t->mc, sf, AA_TMP1, 1); + aa_emit32(t->mc, aa64_lsrv(sf, AA_TMP0, AA_TMP0, AA_TMP1)); + aa_jump(t, loop); + t->mc->label_place(t->mc, done); + return; + } + break; + case INTRIN_BSWAP16: + case INTRIN_BSWAP32: + case INTRIN_BSWAP64: +/* Byte-reverse the register (64-bit form only for BSWAP64); for BSWAP16 the 32-bit result is then shifted right by 16. */ if (ndst == 1u && narg == 1u) { + u32 sf = kind == INTRIN_BSWAP64; + aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); + if (kind == INTRIN_BSWAP16) { + aa_emit_load_imm(t->mc, 0, AA_TMP0, 16); + aa_emit32(t->mc, aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), + AA_TMP0)); + } + return; + } + break; + case INTRIN_SADD_OVERFLOW: + case INTRIN_UADD_OVERFLOW: + case INTRIN_SSUB_OVERFLOW: + case INTRIN_USUB_OVERFLOW: + if (ndst == 
2u && narg == 2u) { + u32 sf = loc_is_64(t, dsts[0]); + u32 rd = loc_reg(dsts[0]); +/* Flag-setting (S = 1) add for the *ADD kinds (op = 0), subtract otherwise (op = 1); the arithmetic result goes to dsts[0]. */ if (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW) + aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){ + .sf = sf, .op = 0, .S = 1, + .Rm = loc_reg(args[1]), .Rn = loc_reg(args[0]), + .Rd = rd})); + else + aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){ + .sf = sf, .op = 1, .S = 1, + .Rm = loc_reg(args[1]), .Rn = loc_reg(args[0]), + .Rd = rd})); +/* dsts[1] = overflow flag via cset; condition 0x6 for the signed kinds, 0x2 for unsigned add, 0x3 for unsigned subtract - presumably the VS / CS / CC condition encodings; confirm against aa_cset. */ aa_emit32(t->mc, + aa_cset(loc_is_64(t, dsts[1]), loc_reg(dsts[1]), + (kind == INTRIN_SADD_OVERFLOW || + kind == INTRIN_SSUB_OVERFLOW) + ? 0x6u + : (kind == INTRIN_UADD_OVERFLOW ? 0x2u : 0x3u))); + return; + } + break; + case INTRIN_SMUL_OVERFLOW: + case INTRIN_UMUL_OVERFLOW: + if (ndst == 2u && narg == 2u) { + u32 sf = loc_is_64(t, dsts[0]); + if (sf) { + if (kind == INTRIN_SMUL_OVERFLOW) { +/* 64-bit signed: the high half (smulh) goes to AA_TMP0 and is compared with bits 63..63 of the low product sign-extended into AA_TMP1; overflow is reported when they differ. */ aa_emit32(t->mc, aa_smulh(AA_TMP0, loc_reg(args[0]), + loc_reg(args[1]))); + aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]), + loc_reg(args[1]))); + aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 63, 63)); + aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); + } else { +/* 64-bit unsigned: overflow when the high half (umulh) is non-zero. */ aa_emit32(t->mc, aa_umulh(AA_TMP0, loc_reg(args[0]), + loc_reg(args[1]))); + aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]), + loc_reg(args[1]))); + aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA64_ZR)); + aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); + } + } else if (kind == INTRIN_SMUL_OVERFLOW) { +/* 32-bit signed: the widening multiply goes to AA_TMP0, the low word is moved to dsts[0], and the full product is compared with bits 0..31 of the result sign-extended. */ aa_emit32(t->mc, aa_smaddl(AA_TMP0, loc_reg(args[0]), + loc_reg(args[1]), AA64_ZR)); + aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0)); + aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 0, 31)); + aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); + } else { + aa_emit32(t->mc, aa_umaddl(AA_TMP0, loc_reg(args[0]), + 
loc_reg(args[1]), AA64_ZR)); + aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0)); +/* 32-bit unsigned: overflow when the wide product shifted right by 32 is non-zero. */ aa_emit_load_imm(t->mc, 1, AA_TMP1, 32); + aa_emit32(t->mc, aa64_lsrv(1, AA_TMP1, AA_TMP0, AA_TMP1)); + aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP1, AA64_ZR)); + aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); + } + return; + } + break; + case INTRIN_MEMCPY: +/* Requires register dst/src pointers and an immediate size in [0, 0xffffffff]; the copy is emitted with byte alignment. */ if (narg != 3u || args[0].kind != NATIVE_LOC_REG || + args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) + aa_panic(aa_of(t), "unsupported memory intrinsic operands"); + if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) + aa_panic(aa_of(t), "unsupported memory intrinsic size"); + access.size = (u32)args[2].v.imm; + access.align = 1u; + dst_addr.base_kind = NATIVE_ADDR_BASE_REG; + dst_addr.base.reg = args[0].v.reg; + src_addr.base_kind = NATIVE_ADDR_BASE_REG; + src_addr.base.reg = args[1].v.reg; + aa_copy_bytes(t, dst_addr, src_addr, access); + return; + case INTRIN_MEMMOVE: { + MCLabel forward = t->mc->label_new(t->mc); + MCLabel done = t->mc->label_new(t->mc); + if (narg != 3u || args[0].kind != NATIVE_LOC_REG || + args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) + aa_panic(aa_of(t), "unsupported memory intrinsic operands"); + if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) + aa_panic(aa_of(t), "unsupported memory intrinsic size"); + access.size = (u32)args[2].v.imm; + access.align = 1u; + dst_addr.base_kind = NATIVE_ADDR_BASE_REG; + dst_addr.base.reg = args[0].v.reg; + src_addr.base_kind = NATIVE_ADDR_BASE_REG; + src_addr.base.reg = args[1].v.reg; +/* Compare dst with src at run time: when dst is below src (unsigned) take the `forward` path (direction argument 0), otherwise emit the copy with direction argument 1 - presumably a backward copy; confirm in aa_copy_bytes_dir. */ aa_emit32(t->mc, + aa_subs_reg(1, AA64_ZR, args[0].v.reg, args[1].v.reg)); + aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_LT_U)})); + t->mc->emit_label_ref(t->mc, forward, R_AARCH64_CONDBR19, 4, 0); + aa_copy_bytes_dir(t, dst_addr, src_addr, access, 1); + aa_jump(t, done); + t->mc->label_place(t->mc, forward); + aa_copy_bytes_dir(t, dst_addr, src_addr, access, 0); + 
t->mc->label_place(t->mc, done); + return; + } + case INTRIN_MEMSET: + if (narg != 3u || args[0].kind != NATIVE_LOC_REG || + args[2].kind != NATIVE_LOC_IMM) + aa_panic(aa_of(t), "unsupported memset operands"); + if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) + aa_panic(aa_of(t), "unsupported memset size"); + access.size = (u32)args[2].v.imm; + access.align = 1u; + dst_addr.base_kind = NATIVE_ADDR_BASE_REG; + dst_addr.base.reg = args[0].v.reg; +/* An immediate fill value is masked to its low byte and materialised in AA_TMP0; otherwise the caller's location is passed through. */ if (args[1].kind == NATIVE_LOC_IMM) { + NativeLoc byte = aa_tmp_loc(builtin_id(CFREE_CG_BUILTIN_I8), AA_TMP0); + aa_emit_load_imm(t->mc, 0, AA_TMP0, args[1].v.imm & 0xff); + aa_set_bytes(t, dst_addr, byte, access); + } else { + aa_set_bytes(t, dst_addr, args[1], access); + } + return; + case INTRIN_EXPECT: + case INTRIN_ASSUME_ALIGNED: +/* These hints only forward args[0] to dsts[0]. */ if (ndst == 1u && narg >= 1u) { + if (args[0].kind == NATIVE_LOC_IMM) + aa_load_imm_native(t, dsts[0], args[0].v.imm); + else + aa_move(t, dsts[0], args[0]); + } + return; + case INTRIN_PREFETCH: + return; + case INTRIN_TRAP: + case INTRIN_UNREACHABLE: + aa_trap(t); + return; + default: + aa_panic(aa_of(t), "unsupported compiler intrinsic"); + } +/* NOTE(review): cases that `break` on an operand-count mismatch reach this point and emit nothing, without a diagnostic - confirm that is intended. */} +/* Emit a BRK instruction with immediate 0. */ +static void aa_trap(NativeTarget* t) { aa_emit32(t->mc, aa64_brk(0)); } +/* Assemble a file-scope asm string: lex `src` (length `len`) from memory and parse it into the target's MC emitter. */ +static void aa_file_scope_asm(NativeTarget* t, const char* src, size_t len) { + AsmLexer* lex = asm_lex_open_mem(t->c, "<file-scope-asm>", src, len); + asm_parse(t->c, lex, t->mc); + asm_lex_close(lex); +} +/* Record the current source location on the target and forward it to the MC emitter when it provides set_loc. */ +static void aa_set_loc(NativeTarget* t, SrcLoc loc) { + AANativeTarget* a = aa_of(t); + a->loc = loc; + if (t->mc && t->mc->set_loc) t->mc->set_loc(t->mc, loc); +} +/* Finalize the target: emit the eh_frame through the MC emitter, if there is one. */ +static void aa_finalize(NativeTarget* t) { + if (t->mc) mc_emit_eh_frame(t->mc); +} + +static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, + NativeFrameSlot home); +/* Register pools referenced by aa_classes: allocable and scratch register numbers per class. */ +static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u}; +static const Reg aa_int_scratch[] = {9u, 10u}; +static const Reg aa_fp_allocable[] = {18u, 19u}; +static const Reg aa_fp_scratch[] = {20u, 
21u}; +/* Initializer macros for NativePhysRegInfo entries of the integer class. Only the ARG form carries a real abi_index (the register number); the others use 0xff. ARG registers below 2 are also tagged as return registers. */ +#define AA_PHYS_INT_ALLOC(r) \ + {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = 0xffu, \ + .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ + .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_INT_CALLER(r) \ + {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLER_SAVED, .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_INT_ARG(r) \ + {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = (r), \ + .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ + ((r) < 2u ? NATIVE_REG_RET : 0), \ + .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_INT_CALLEE(r) \ + {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLEE_SAVED, .spill_cost = 4u, .copy_cost = 1u} +#define AA_PHYS_INT_RESERVED(r) \ + {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = 0xffu, \ + .flags = NATIVE_REG_RESERVED, .spill_cost = 0u, .copy_cost = 0u} +/* Per-register description of integer registers 0..31: 0-7 argument, 8 and 11-15 allocable, 9-10, 16-18 and 29-31 reserved, 19-28 callee-saved. */ +static const NativePhysRegInfo aa_int_phys[] = { + AA_PHYS_INT_ARG(0u), AA_PHYS_INT_ARG(1u), + AA_PHYS_INT_ARG(2u), AA_PHYS_INT_ARG(3u), + AA_PHYS_INT_ARG(4u), AA_PHYS_INT_ARG(5u), + AA_PHYS_INT_ARG(6u), AA_PHYS_INT_ARG(7u), + AA_PHYS_INT_ALLOC(8u), AA_PHYS_INT_RESERVED(9u), + AA_PHYS_INT_RESERVED(10u), AA_PHYS_INT_ALLOC(11u), + AA_PHYS_INT_ALLOC(12u), AA_PHYS_INT_ALLOC(13u), + AA_PHYS_INT_ALLOC(14u), AA_PHYS_INT_ALLOC(15u), + AA_PHYS_INT_RESERVED(16u), AA_PHYS_INT_RESERVED(17u), + AA_PHYS_INT_RESERVED(18u), AA_PHYS_INT_CALLEE(19u), + AA_PHYS_INT_CALLEE(20u), AA_PHYS_INT_CALLEE(21u), + AA_PHYS_INT_CALLEE(22u), AA_PHYS_INT_CALLEE(23u), + AA_PHYS_INT_CALLEE(24u), AA_PHYS_INT_CALLEE(25u), + AA_PHYS_INT_CALLEE(26u), AA_PHYS_INT_CALLEE(27u), + AA_PHYS_INT_CALLEE(28u), AA_PHYS_INT_RESERVED(29u), + AA_PHYS_INT_RESERVED(30u), AA_PHYS_INT_RESERVED(31u), +}; +/* Initializer macros for NativePhysRegInfo entries of the FP class, mirroring the integer ones. */ +#define AA_PHYS_FP_ALLOC(r) \ + {.reg = (r), .cls = NATIVE_REG_FP, .abi_index = 0xffu, \ + .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ + .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_FP_CALLER(r) \ + {.reg 
= (r), .cls = NATIVE_REG_FP, .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLER_SAVED, .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_FP_ARG(r) \ + {.reg = (r), .cls = NATIVE_REG_FP, .abi_index = (r), \ + .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ + ((r) < 4u ? NATIVE_REG_RET : 0), \ + .spill_cost = 1u, .copy_cost = 1u} +#define AA_PHYS_FP_CALLEE(r) \ + {.reg = (r), .cls = NATIVE_REG_FP, .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLEE_SAVED, .spill_cost = 4u, .copy_cost = 1u} +#define AA_PHYS_FP_RESERVED(r) \ + {.reg = (r), .cls = NATIVE_REG_FP, .abi_index = 0xffu, \ + .flags = NATIVE_REG_RESERVED, .spill_cost = 0u, .copy_cost = 0u} +/* Per-register description of FP registers 0..31: 0-7 argument (0-3 also return), 8-15 callee-saved, 18-19 allocable, 20-21 reserved, the rest caller-saved only. */ +static const NativePhysRegInfo aa_fp_phys[] = { + AA_PHYS_FP_ARG(0u), AA_PHYS_FP_ARG(1u), + AA_PHYS_FP_ARG(2u), AA_PHYS_FP_ARG(3u), + AA_PHYS_FP_ARG(4u), AA_PHYS_FP_ARG(5u), + AA_PHYS_FP_ARG(6u), AA_PHYS_FP_ARG(7u), + AA_PHYS_FP_CALLEE(8u), AA_PHYS_FP_CALLEE(9u), + AA_PHYS_FP_CALLEE(10u), AA_PHYS_FP_CALLEE(11u), + AA_PHYS_FP_CALLEE(12u), AA_PHYS_FP_CALLEE(13u), + AA_PHYS_FP_CALLEE(14u), AA_PHYS_FP_CALLEE(15u), + AA_PHYS_FP_CALLER(16u), AA_PHYS_FP_CALLER(17u), + AA_PHYS_FP_ALLOC(18u), AA_PHYS_FP_ALLOC(19u), + AA_PHYS_FP_RESERVED(20u), AA_PHYS_FP_RESERVED(21u), + AA_PHYS_FP_CALLER(22u), AA_PHYS_FP_CALLER(23u), + AA_PHYS_FP_CALLER(24u), AA_PHYS_FP_CALLER(25u), + AA_PHYS_FP_CALLER(26u), AA_PHYS_FP_CALLER(27u), + AA_PHYS_FP_CALLER(28u), AA_PHYS_FP_CALLER(29u), + AA_PHYS_FP_CALLER(30u), AA_PHYS_FP_CALLER(31u), +}; +/* Allocation-class table (integer class first, then FP): register pools, the per-register tables and ABI bit masks indexed by register number. */ +static const NativeAllocClassInfo aa_classes[] = { + {.cls = NATIVE_REG_INT, + .allocable = aa_int_allocable, + .nallocable = sizeof aa_int_allocable / sizeof aa_int_allocable[0], + .scratch = aa_int_scratch, + .nscratch = sizeof aa_int_scratch / sizeof aa_int_scratch[0], + .phys = aa_int_phys, + .nphys = sizeof aa_int_phys / sizeof aa_int_phys[0], + .caller_saved_mask = 0x0007ffffu, + .callee_saved_mask = 0x1ff80000u, + .arg_mask = 0x000000ffu, + .ret_mask = 0x00000003u, + .reserved_mask = (1u << AA_TMP0) | 
(1u << AA_TMP1) | (1u << AA_FP) | + (1u << AA_LR)}, +/* NOTE(review): the FP entry marks all 32 registers caller-saved and sets no callee_saved_mask or reserved_mask, while aa_fp_phys tags 8-15 as CALLEE_SAVED and 20-21 as RESERVED - confirm which description consumers rely on. */ {.cls = NATIVE_REG_FP, + .allocable = aa_fp_allocable, + .nallocable = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0], + .scratch = aa_fp_scratch, + .nscratch = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0], + .phys = aa_fp_phys, + .nphys = sizeof aa_fp_phys / sizeof aa_fp_phys[0], + .caller_saved_mask = 0xffffffffu, + .arg_mask = 0x000000ffu, + .ret_mask = 0x0000000fu}, +}; +/* Register description handed to the generic native layer through NativeTarget.regs. */ +static const NativeRegInfo aa_reg_info = { + .classes = aa_classes, + .nclasses = sizeof aa_classes / sizeof aa_classes[0], +}; +/* Create the AArch64 NativeTarget: allocate a zeroed AANativeTarget from the translation-unit arena, record the compiler, object builder and MC emitter, and fill in the callback table. Returns NULL when the allocation fails. */ +NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, + MCEmitter* mc) { + AANativeTarget* a = arena_znew(c->tu, AANativeTarget); + NativeTarget* t; + if (!a) return NULL; + t = &a->base; + t->c = c; + t->obj = obj; + t->mc = mc; + t->regs = &aa_reg_info; + t->class_for_type = aa_class_for_type; + t->addr_legal = aa_addr_legal; + t->func_begin = aa_func_begin; + t->func_begin_known_frame = aa_func_begin_known_frame; + t->note_frame_state = aa_note_frame_state; + t->func_end = aa_func_end; + t->frame_slot = aa_frame_slot; + t->bind_param = aa_bind_native_param; + t->label_new = aa_label_new; + t->label_place = aa_label_place; + t->jump = aa_jump; + t->cmp_branch = aa_cmp_branch; + t->indirect_branch = aa_indirect_branch; + t->load_label_addr = aa_load_label_addr; + t->move = aa_move; + t->load_imm = aa_load_imm_native; + t->load_const = aa_load_const; + t->load_addr = aa_load_addr; + t->load = aa_load_native; + t->store = aa_store_native; + t->tls_addr_of = aa_tls_addr_of; + t->copy_bytes = aa_copy_bytes; + t->set_bytes = aa_set_bytes; + t->bitfield_load = aa_bitfield_load; + t->bitfield_store = aa_bitfield_store; + t->binop = aa_binop; + t->unop = aa_unop; + t->cmp = aa_cmp; + t->convert = aa_convert; + t->alloca_ = aa_alloca; + t->spill = aa_spill; + t->reload = aa_reload; + t->plan_call = aa_plan_call; + t->emit_call = aa_emit_call; + t->plan_ret = aa_plan_ret; + t->ret = aa_ret; + 
t->atomic_load = aa_atomic_load; + t->atomic_store = aa_atomic_store; + t->atomic_rmw = aa_atomic_rmw; + t->atomic_cas = aa_atomic_cas; + t->fence = aa_fence; + t->intrinsic = aa_intrinsic; + t->file_scope_asm = aa_file_scope_asm; + t->trap = aa_trap; + t->set_loc = aa_set_loc; + t->finalize = aa_finalize; + return t; +} +/* Move one incoming parameter from its ABI location into its frame slot `home`. Register parts are taken from the next free argument register of their class (first eight of each); further parts are loaded from the stack relative to AA_FP at next_param_stack, which advances by 8 per part. Indirect parameters are copied byte-wise from the address passed by the caller. Parameters the ABI ignores, or whose index is outside the ABI info, are skipped. */ +static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, + NativeFrameSlot home) { + AANativeTarget* a = aa_of(t); + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); + const ABIArgInfo* ai = p->index < abi->nparams ? &abi->params[p->index] : NULL; + if (!ai || ai->kind == ABI_ARG_IGNORE) return; + if (ai->kind == ABI_ARG_INDIRECT) { +/* The pointer to the aggregate arrives in the next integer argument register or, once those are used up, is loaded from the stack into AA_TMP0. */ NativeLoc src = aa_reg_loc(p->type, NATIVE_REG_INT, + a->next_param_int < 8u ? a->next_param_int++ + : AA_TMP0); + if (src.v.reg == AA_TMP0) { + NativeAddr saddr; + memset(&saddr, 0, sizeof saddr); + saddr.base_kind = NATIVE_ADDR_BASE_REG; + saddr.base.reg = AA_FP; + saddr.offset = (i32)a->next_param_stack; + aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8)); + a->next_param_stack += 8u; + } + NativeAddr dst, from; + AggregateAccess access; + memset(&dst, 0, sizeof dst); + dst.base_kind = NATIVE_ADDR_BASE_FRAME; + dst.base.frame = home; + dst.base_type = p->type; + memset(&from, 0, sizeof from); + from.base_kind = NATIVE_ADDR_BASE_REG; + from.base.reg = src.v.reg; + from.base_type = p->type; + memset(&access, 0, sizeof access); + access.type = p->type; + access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); + access.align = p->align ? p->align : type_align32(t, p->type); + aa_copy_bytes(t, dst, from, access); +/* NOTE(review): this early return skips the incoming_stack_size update at the end of the function even when next_param_stack was advanced above - confirm that is intended. */ return; + } + for (u32 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* part = &ai->parts[i]; + NativeAllocClass cls = + part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; + NativeLoc src; + if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) { + src = aa_reg_loc(p->type, cls, a->next_param_fp++); + } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) { + src = aa_reg_loc(p->type, cls, a->next_param_int++); + } else { +/* No argument register left for this class: load the part from the incoming stack area into a temporary (FP register 16 or AA_TMP0). */ src = aa_reg_loc(p->type, cls, cls == NATIVE_REG_FP ? 16u : AA_TMP0); + NativeAddr saddr; + memset(&saddr, 0, sizeof saddr); + saddr.base_kind = NATIVE_ADDR_BASE_REG; + saddr.base.reg = AA_FP; + saddr.base_type = p->type; + saddr.offset = (i32)a->next_param_stack; + aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size)); + a->next_param_stack += 8u; + } + aa_store_part(t, aa_stack_loc(p->type, home, (i32)part->src_offset), + src, 0, part->size); + } +/* Record the incoming stack-argument area seen so far, rounded up to 16 bytes; aa_no_tail compares against it. */ a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); +} +/* NativeDirectTarget adapter: bind parameter `p` into the home slot of its direct local; `local` itself is unused. */ +static void aa_bind_param(NativeDirectTarget* d, const CGParamDesc* p, + CGLocal local, NativeDirectLocal* l) { + (void)local; + aa_bind_native_param(d->native, p, l->home); +} +/* Decide whether `call` can be emitted as a tail call. Builds a NativeCallDesc whose arguments and results are the frame homes of the call's locals, computes the callee's stack-argument size, and returns a reason string when it exceeds this function's incoming stack area; returns NULL when the tail call is allowed. */ +static const char* aa_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { + NativeCallDesc nd; + NativeLoc* args = NULL; + NativeLoc* results = NULL; + u32 stack; + memset(&nd, 0, sizeof nd); + if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); + if (call->nresults) + results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults); +/* Local ids are 1-based indexes into d->locals. */ for (u32 i = 0; i < call->nargs; ++i) { + args[i].kind = NATIVE_LOC_FRAME; + args[i].type = d->locals[call->args[i] - 1u].type; + args[i].cls = d->locals[call->args[i] - 1u].cls; + args[i].v.frame = d->locals[call->args[i] - 1u].home; + } + for (u32 i = 0; i < call->nresults; ++i) { + results[i].kind = NATIVE_LOC_FRAME; + results[i].type = d->locals[call->results[i] - 1u].type; + results[i].cls = d->locals[call->results[i] - 1u].cls; + results[i].v.frame = d->locals[call->results[i] - 1u].home; + } + nd.fn_type = call->fn_type; + nd.args = args; + nd.results = results; + nd.nargs = call->nargs; + nd.nresults = 
call->nresults; + stack = aa_call_stack_size(d->native, &nd); + if (stack > aa_of(d->native)->incoming_stack_size) + return "aarch64 tail call: stack argument area too small"; + return NULL; +} +/* Translate an addressable operand into a NativeAddr without emitting code: OPK_LOCAL becomes the local's frame slot, OPK_INDIRECT becomes a FRAME_VALUE address (pointer held in the base local's slot, plus the operand offset). Any other operand kind panics. */ +static NativeAddr aa_direct_addr(NativeDirectTarget* d, Operand op) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + switch ((OpKind)op.kind) { + case OPK_LOCAL: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = d->locals[op.v.local - 1u].home; + addr.base_type = op.type; + return addr; + case OPK_INDIRECT: + addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; + addr.base.frame = d->locals[op.v.ind.base - 1u].home; + addr.cls = d->locals[op.v.ind.base - 1u].cls; + addr.base_type = d->locals[op.v.ind.base - 1u].type; + addr.offset = op.v.ind.ofs; + return addr; + default: + compiler_panic(d->base.c, d->loc, + "aarch64 native target: operand is not addressable"); + } +} +/* Like aa_direct_addr, but a FRAME_VALUE base is resolved by loading the pointer from its frame slot into AA_TMP1, so the returned address is register-based. Clobbers AA_TMP1 in that case. */ +static NativeAddr aa_direct_materialize_addr(NativeDirectTarget* d, + Operand op) { + NativeAddr addr = aa_direct_addr(d, op); + if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { + NativeLoc base = aa_reg_loc(addr.base_type, NATIVE_REG_INT, AA_TMP1); + NativeAddr load; + memset(&load, 0, sizeof load); + load.base_kind = NATIVE_ADDR_BASE_FRAME; + load.base.frame = addr.base.frame; + load.base_type = addr.base_type; + aa_emit_mem(aa_of(d->native), 1, base, load, + aa_mem_for_type(d->native, addr.base_type, 8)); + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.base.reg = AA_TMP1; + } + return addr; +} +/* Treat `op` as holding a pointer and return the address it points to: for a local, the pointer value is loaded from the local's slot into AA_TMP1 and used as the base; other operands go through aa_direct_materialize_addr. */ +static NativeAddr aa_direct_pointer_addr(NativeDirectTarget* d, Operand op) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + if (op.kind == OPK_LOCAL) { + NativeLoc base = aa_reg_loc(op.type, NATIVE_REG_INT, AA_TMP1); + NativeAddr load; + memset(&load, 0, sizeof load); + load.base_kind = NATIVE_ADDR_BASE_FRAME; + load.base.frame = d->locals[op.v.local - 1u].home; + load.base_type = op.type; + aa_emit_mem(aa_of(d->native), 1, base, load, + aa_mem_for_type(d->native, op.type, 8)); + 
addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.base.reg = AA_TMP1; + addr.base_type = op.type; + return addr; + } + return aa_direct_materialize_addr(d, op); +} +/* Build a register-based NativeAddr: base register `reg`, byte offset `offset`, base type `type`. */ +static NativeAddr aa_reg_addr(CfreeCgTypeId type, u32 reg, i32 offset) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.base.reg = reg; + addr.base_type = type; + addr.offset = offset; + return addr; +} +/* Load the address of the va_list object designated by `ap_addr` into integer register `dst_reg`. */ +static void aa_load_ap_addr(NativeDirectTarget* d, Operand ap_addr, + u32 dst_reg) { + NativeLoc dst = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), + NATIVE_REG_INT, dst_reg); + NativeAddr ap = aa_direct_pointer_addr(d, ap_addr); + d->native->load_addr(d->native, dst, ap); +} +/* Emit va_start. For a pointer-style va_list the object receives AA_FP + next_param_stack. For the AAPCS64 layout the stack pointer field, the tops of the GR/VR save areas and the negative GR/VR offsets (bytes of unused register slots) are stored through register 15. Other layouts panic. */ +static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) { + AANativeTarget* a = aa_of(d->native); + ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi); + NativeLoc ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + AA_TMP0); + NativeAddr dst = aa_direct_pointer_addr(d, ap_addr); + if (vai.kind == ABI_VA_LIST_POINTER) { + aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack); + aa_emit_mem(a, 0, ptr, dst, aa_mem_for_type(d->native, ptr.type, 8)); + return; + } + if (vai.kind == ABI_VA_LIST_AAPCS64) { + CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32); + NativeLoc i32tmp = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1); + MemAccess ptr_mem = aa_mem_for_type(d->native, ptr.type, 8); + MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4); + AANativeSlot* gr = aa_slot(a, a->va_gr_slot); + AANativeSlot* vr = aa_slot(a, a->va_vr_slot); +/* Number of argument registers already consumed by named parameters, clamped to the size of each save area. */ u32 used_gr = a->next_param_int < vai.gp_reg_count ? a->next_param_int + : vai.gp_reg_count; + u32 used_vr = a->next_param_fp < vai.fp_reg_count ? 
a->next_param_fp + : vai.fp_reg_count; +/* Register 15 holds the address of the va_list object for the stores below. */ aa_load_ap_addr(d, ap_addr, 15u); +/* stack field = AA_FP + next_param_stack. */ aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack); + aa_emit_mem(a, 0, ptr, + aa_reg_addr(ptr.type, 15u, (i32)vai.stack_offset), ptr_mem); +/* gr_top field = end of the GR save slot (AA_FP - gr->off + gp_reg_count * gp_slot_size). */ aa_emit_add_imm(a, AA_TMP0, AA_FP, + -(i32)gr->off + + (i32)(vai.gp_reg_count * vai.gp_slot_size)); + aa_emit_mem(a, 0, ptr, + aa_reg_addr(ptr.type, 15u, (i32)vai.gr_top_offset), ptr_mem); +/* vr_top field, computed the same way from the VR save slot. */ aa_emit_add_imm(a, AA_TMP0, AA_FP, + -(i32)vr->off + + (i32)(vai.fp_reg_count * vai.fp_slot_size)); + aa_emit_mem(a, 0, ptr, + aa_reg_addr(ptr.type, 15u, (i32)vai.vr_top_offset), ptr_mem); +/* gr_offs / vr_offs fields = minus the bytes of register slots still unused. */ aa_emit_load_imm(a->base.mc, 0, AA_TMP1, + -(i32)((vai.gp_reg_count - used_gr) * vai.gp_slot_size)); + aa_emit_mem(a, 0, i32tmp, + aa_reg_addr(i32_ty, 15u, (i32)vai.gr_offs_offset), i32_mem); + aa_emit_load_imm(a->base.mc, 0, AA_TMP1, + -(i32)((vai.fp_reg_count - used_vr) * vai.fp_slot_size)); + aa_emit_mem(a, 0, i32tmp, + aa_reg_addr(i32_ty, 15u, (i32)vai.vr_offs_offset), i32_mem); + return; + } + { + compiler_panic(d->base.c, d->loc, + "aarch64 native target: unsupported va_list layout"); + } +} +/* Emit va_arg for a value of `type`: fetch the next variadic argument through the va_list designated by `ap_addr`, advance the list, and store the value to `dst_op`. The value travels in FP register 16 for floating-point types and integer register 9 otherwise. Handles the pointer-style and AAPCS64 layouts; other layouts panic. */ +static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr, + CfreeCgTypeId type) { + AANativeTarget* a = aa_of(d->native); + ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi); + NativeLoc cur = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + AA_TMP0); + NativeLoc val = + aa_reg_loc(type, cg_type_is_float(d->base.c, type) ? NATIVE_REG_FP + : NATIVE_REG_INT, + cg_type_is_float(d->base.c, type) ? 
16u : 9u); + NativeAddr src, dst; + MemAccess ptr_mem = aa_mem_for_type(d->native, cur.type, 8); + MemAccess val_mem = aa_mem_for_type(d->native, type, type_size32(d->native, type)); + if (vai.kind == ABI_VA_LIST_POINTER) { +/* Pointer-style list: load the cursor, read the value it points at, advance the cursor by 8 and write it back, then store the value to the destination. */ NativeAddr ap = aa_direct_pointer_addr(d, ap_addr); + aa_emit_mem(a, 1, cur, ap, ptr_mem); + src = aa_reg_addr(type, AA_TMP0, 0); + aa_emit_mem(a, 1, val, src, val_mem); + aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8); + aa_emit_mem(a, 0, cur, ap, ptr_mem); + dst = aa_direct_materialize_addr(d, dst_op); + aa_emit_mem(a, 0, val, dst, val_mem); + return; + } + if (vai.kind == ABI_VA_LIST_AAPCS64) { + CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32); + NativeLoc off = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1); + MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4); + int is_fp = cg_type_is_float(d->base.c, type); +/* Pick the VR or GR bookkeeping fields depending on whether the type is floating-point. */ u32 offs_field = is_fp ? vai.vr_offs_offset : vai.gr_offs_offset; + u32 top_field = is_fp ? vai.vr_top_offset : vai.gr_top_offset; + u32 slot_size = is_fp ? 
vai.fp_slot_size : vai.gp_slot_size; + MCLabel stack_label = d->native->mc->label_new(d->native->mc); + MCLabel done_label = d->native->mc->label_new(d->native->mc); + aa_load_ap_addr(d, ap_addr, 15u); +/* Load the 32-bit register-area offset; a value >= 0 means the register save area is used up, so take the stack path. */ aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem); + aa_emit32(a->base.mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0)); + aa_emit32(a->base.mc, + aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)})); + a->base.mc->emit_label_ref(a->base.mc, stack_label, R_AARCH64_CONDBR19, 4, + 0); +/* Register path: the value lives at top + sign-extended offset; afterwards the offset advances by one slot and is stored back. */ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)top_field), + ptr_mem); + aa_emit32(a->base.mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31)); + aa_emit32(a->base.mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1)); + aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem); + aa_emit_add_imm(a, AA_TMP1, AA_TMP1, (i32)slot_size); + aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem); + aa_emit32(a->base.mc, aa64_b(0)); + a->base.mc->emit_label_ref(a->base.mc, done_label, R_AARCH64_JUMP26, 4, 0); + a->base.mc->label_place(a->base.mc, stack_label); +/* Stack path: read the value at the stack cursor and advance the cursor by 8. */ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset), + ptr_mem); + aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem); + aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8); + aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset), + ptr_mem); + a->base.mc->label_place(a->base.mc, done_label); + dst = aa_direct_materialize_addr(d, dst_op); + aa_emit_mem(a, 0, val, dst, val_mem); + return; + } + compiler_panic(d->base.c, d->loc, + "aarch64 native target: unsupported va_list layout"); +} +/* va_end emits no code on this target. */ +static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) { + (void)d; + (void)ap_addr; +} +/* Emit va_copy: a pointer-style list is copied as one 8-byte value; an AAPCS64 list is copied in 8-byte steps over vai.type.size bytes using registers 14 (source) and 15 (destination). Other layouts panic. */ +static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr, + Operand src_ap_addr) { + AANativeTarget* a = aa_of(d->native); + ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi); + NativeLoc tmp = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + 
AA_TMP0); + MemAccess mem = aa_mem_for_type(d->native, tmp.type, 8); + if (vai.kind == ABI_VA_LIST_POINTER) { + NativeAddr src = aa_direct_pointer_addr(d, src_ap_addr); + NativeAddr dst; + aa_emit_mem(a, 1, tmp, src, mem); + dst = aa_direct_pointer_addr(d, dst_ap_addr); + aa_emit_mem(a, 0, tmp, dst, mem); + return; + } + if (vai.kind == ABI_VA_LIST_AAPCS64) { + aa_load_ap_addr(d, src_ap_addr, 14u); + aa_load_ap_addr(d, dst_ap_addr, 15u); + for (u32 off = 0; off < vai.type.size; off += 8u) { + aa_emit_mem(a, 1, tmp, aa_reg_addr(tmp.type, 14u, (i32)off), mem); + aa_emit_mem(a, 0, tmp, aa_reg_addr(tmp.type, 15u, (i32)off), mem); + } + return; + } + compiler_panic(d->base.c, d->loc, + "aarch64 native target: unsupported va_list layout"); +} +/* Return the constraint string with its leading modifier removed: "=&" or "+&" (two characters), or a single '=', '+' or '&'. A NULL constraint yields "". */ +AA_UNUSED_FN static const char* aa_asm_constraint_body(const char* s) { + if (!s) return ""; + if ((s[0] == '=' || s[0] == '+') && s[1] == '&') return s + 2; + if (s[0] == '=' || s[0] == '+' || s[0] == '&') return s + 1; + return s; +} +/* Return non-zero when the constraint carries the early-clobber modifier, i.e. starts with "=&", "+&" or '&'. */ +AA_UNUSED_FN static int aa_asm_constraint_early(const char* s) { + if (!s) return 0; + return ((s[0] == '=' || s[0] == '+') && s[1] == '&') || s[0] == '&'; +} +/* Parse a matching constraint: return the leading decimal number of `s`, or -1 when `s` is NULL or does not start with a digit. */ +AA_UNUSED_FN static int aa_asm_match_index(const char* s) { + int n = 0; + if (!s || s[0] < '0' || s[0] > '9') return -1; + for (const char* p = s; *p >= '0' && *p <= '9'; ++p) { + n = n * 10 + (*p - '0'); + } + return n; +} +/* Abort compilation with an "aarch64 inline asm: <msg>" diagnostic at the current source location. */ +_Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) { + compiler_panic(d->base.c, d->loc, "aarch64 inline asm: %s", msg); +} +/* Fill `out` with an arch-private register operand: kind AA64_INLINE_OPK_REG, the class tag in pad[0], and the physical register number in v.local. */ +AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, CfreeCgTypeId type, + NativeAllocClass cls, Reg reg) { + memset(out, 0, sizeof *out); + out->kind = AA64_INLINE_OPK_REG; + out->pad[0] = (cls == NATIVE_REG_FP) ? 
AA64_INLINE_OPCLS_FP + : AA64_INLINE_OPCLS_INT; + out->type = type; + out->v.local = (CGLocal)reg; +} +/* Fill `out` with an OPK_INDIRECT memory operand whose base field carries the physical register `base` and which has no index. */ +AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, CfreeCgTypeId type, + Reg base) { + memset(out, 0, sizeof *out); + out->kind = OPK_INDIRECT; + out->type = type; + out->v.ind.base = (CGLocal)base; + out->v.ind.index = CG_LOCAL_NONE; +} +/* Classify one clobber name. Returns 0 for empty names and for "cc" / "memory"; returns 1 and sets *cls_out / *reg_out for a register. Register numbers up to 30 from aa64_register_index are integer registers, 64..95 map to FP registers 0..31. Unknown, over-long or unsupported names panic. */ +static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name, + NativeAllocClass* cls_out, Reg* reg_out) { + Slice s = pool_slice(d->base.c->global, name); + char buf[16]; + uint32_t dwarf; + if (!s.s || !s.len) return 0; + if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; + if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0; +/* Copy into a NUL-terminated buffer for aa64_register_index; names that do not fit are rejected. */ if (s.len >= sizeof buf) aa_asm_panic(d, "clobber name is too long"); + memcpy(buf, s.s, s.len); + buf[s.len] = '\0'; + if (aa64_register_index(buf, &dwarf) != 0) + aa_asm_panic(d, "unknown clobber register"); + if (dwarf <= 30u) { + *cls_out = NATIVE_REG_INT; + *reg_out = (Reg)dwarf; + return 1; + } + if (dwarf >= 64u && dwarf <= 95u) { + *cls_out = NATIVE_REG_FP; + *reg_out = (Reg)(dwarf - 64u); + return 1; + } + aa_asm_panic(d, "unsupported clobber register"); + return 0; +} +/* Collect the register clobbers into one bit mask per class (bit n = register n); non-register clobbers are ignored. */ +AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d, + const Sym* clobbers, u32 nclob, + u32* int_mask, u32* fp_mask) { + *int_mask = 0; + *fp_mask = 0; + for (u32 i = 0; i < nclob; ++i) { + NativeAllocClass cls; + Reg reg; + if (!aa_asm_parse_reg_clobber(d, clobbers[i], &cls, &reg)) continue; + if (cls == NATIVE_REG_INT) + *int_mask |= 1u << reg; + else if (cls == NATIVE_REG_FP) + *fp_mask |= 1u << reg; + } +} +/* Pick the first register of class `cls` from a fixed pool that is not yet set in the class's used mask, mark it used and return it; panics when the pool is exhausted. */ +AA_UNUSED_FN static Reg aa_asm_alloc_reg(NativeDirectTarget* d, + NativeAllocClass cls, u32* used_int, + u32* used_fp) { + static const Reg int_pool[] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, + 7u, 8u, 11u, 12u, 13u, 14u, 15u}; + static const Reg fp_pool[] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, + 16u, 17u, 18u, 19u, 22u, 23u, 24u, 25u, + 26u, 27u, 28u, 29u, 30u, 31u}; + const Reg* pool 
= cls == NATIVE_REG_FP ? fp_pool : int_pool; + u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0]) + : (u32)(sizeof int_pool / sizeof int_pool[0]); + u32* used = cls == NATIVE_REG_FP ? used_fp : used_int; + for (u32 i = 0; i < n; ++i) { + Reg r = pool[i]; + if ((*used & (1u << r)) != 0) continue; + *used |= 1u << r; + return r; + } + aa_asm_panic(d, "out of registers for asm operands"); + return REG_NONE; +} +/* Map a stripped constraint body to a register class: 'r' is integer, 'w' is FP; anything else panics. */ +AA_UNUSED_FN static NativeAllocClass +aa_asm_constraint_class(NativeDirectTarget* d, const char* body) { + if (body[0] == 'r') return NATIVE_REG_INT; + if (body[0] == 'w') return NATIVE_REG_FP; + aa_asm_panic(d, "constraint is not a register constraint"); + return NATIVE_REG_INT; +} +/* Load the value of an asm input operand into register location `dst`: immediates (integer class only) via load_imm, locals from their frame slot, globals as their address, indirect operands through a materialized address. Other operand kinds panic. */ +AA_UNUSED_FN static void aa_direct_load_operand_to_reg(NativeDirectTarget* d, + Operand op, + NativeLoc dst) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + switch ((OpKind)op.kind) { + case OPK_IMM: + if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) + aa_asm_panic(d, "floating-point immediate asm input is unsupported"); + d->native->load_imm(d->native, dst, op.v.imm); + return; + case OPK_LOCAL: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = d->locals[op.v.local - 1u].home; + addr.base_type = op.type; + aa_emit_mem(aa_of(d->native), 1, dst, addr, + aa_mem_for_type(d->native, op.type, 0)); + return; + case OPK_GLOBAL: + addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; + addr.base.global.sym = op.v.global.sym; + addr.base.global.addend = op.v.global.addend; + addr.base_type = op.type; + d->native->load_addr(d->native, dst, addr); + return; + case OPK_INDIRECT: + addr = aa_direct_materialize_addr(d, op); + aa_emit_mem(aa_of(d->native), 1, dst, addr, + aa_mem_for_type(d->native, op.type, 0)); + return; + } + aa_asm_panic(d, "unsupported asm input operand"); +} +/* Load the address (not the value) of an addressable operand into register location `dst`. */ +AA_UNUSED_FN static void aa_direct_load_address_to_reg(NativeDirectTarget* d, + Operand op, + NativeLoc dst) { + NativeAddr addr = aa_direct_addr(d, op); + 
d->native->load_addr(d->native, dst, addr); +} +/* Store register location `src` back to an asm output operand: a local's frame slot directly, anything else through a materialized address. */ +AA_UNUSED_FN static void aa_direct_store_reg_to_operand(NativeDirectTarget* d, + Operand op, + NativeLoc src) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + if (op.kind == OPK_LOCAL) { + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = d->locals[op.v.local - 1u].home; + addr.base_type = op.type; + } else { + addr = aa_direct_materialize_addr(d, op); + } + aa_emit_mem(aa_of(d->native), 0, src, addr, + aa_mem_for_type(d->native, op.type, 0)); +} +/* One register saved around an inline-asm block: the frame slot holding it, its class and number, and the type used for the save/restore access. */ +typedef struct AAAsmSavedClobber { + NativeFrameSlot slot; + NativeAllocClass cls; + Reg reg; + CfreeCgTypeId type; +} AAAsmSavedClobber; +/* Allocate an 8-byte, 8-aligned SAVE frame slot for `s`, record it in s->slot and store the register into it. */ +static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) { + NativeFrameSlotDesc desc; + NativeAddr addr; + NativeLoc reg; + memset(&desc, 0, sizeof desc); + desc.type = s->type; + desc.size = 8; + desc.align = 8; + desc.kind = NATIVE_FRAME_SLOT_SAVE; + s->slot = d->native->frame_slot(d->native, &desc); + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = s->slot; + addr.base_type = s->type; + reg = aa_reg_loc(s->type, s->cls, s->reg); + aa_emit_mem(aa_of(d->native), 0, reg, addr, + aa_mem_for_type(d->native, s->type, 8)); +} +/* Reload the register described by `s` from the frame slot recorded by aa_asm_save_one. */ +AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d, + const AAAsmSavedClobber* s) { + NativeAddr addr; + NativeLoc reg = aa_reg_loc(s->type, s->cls, s->reg); + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = s->slot; + addr.base_type = s->type; + aa_emit_mem(aa_of(d->native), 1, reg, addr, + aa_mem_for_type(d->native, s->type, 8)); +} +/* Save every clobbered register that falls in the ranges scanned here (integer 19..28, FP 8..15) to fresh frame slots. Returns the arena-allocated array of saved entries and writes their count to *nsaved_out. The array has room for 20 entries; at most 18 can be produced by the two loops. */ +AA_UNUSED_FN static AAAsmSavedClobber* +aa_asm_save_callee_clobbers(NativeDirectTarget* d, u32 int_mask, u32 fp_mask, + u32* nsaved_out) { + AAAsmSavedClobber* saved = + arena_zarray(d->base.c->tu, AAAsmSavedClobber, 20u); + u32 n = 0; + CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); + CfreeCgTypeId f64 = 
builtin_id(CFREE_CG_BUILTIN_F64); + for (Reg r = 19u; r <= 28u; ++r) { + if ((int_mask & (1u << r)) == 0) continue; + saved[n].cls = NATIVE_REG_INT; + saved[n].reg = r; + saved[n].type = i64; + aa_asm_save_one(d, &saved[n++]); + } + for (Reg r = 8u; r <= 15u; ++r) { + if ((fp_mask & (1u << r)) == 0) continue; + saved[n].cls = NATIVE_REG_FP; + saved[n].reg = r; + saved[n].type = f64; + aa_asm_save_one(d, &saved[n++]); + } + *nsaved_out = n; + return saved; +} + +static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, + const AsmConstraint* outs, u32 nout, + Operand* out_ops, const AsmConstraint* ins, + u32 nin, const Operand* in_ops, + const Sym* clobbers, u32 nclob) { + Operand* bound_outs = + nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL; + Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL; + u32 clob_int, clob_fp, used_int, used_fp; + AAAsmSavedClobber* saved; + u32 nsaved; + AA64Asm* a; + + aa_asm_clobber_masks(d, clobbers, nclob, &clob_int, &clob_fp); + used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) | + (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP); + used_fp = clob_fp | (1u << 20u) | (1u << 21u); + + for (u32 i = 0; i < nout; ++i) { + const char* body = aa_asm_constraint_body(outs[i].str); + if (body[0] == 'r' || body[0] == 'w') { + NativeAllocClass cls = aa_asm_constraint_class(d, body); + Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp); + CfreeCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; + aa_asm_bound_reg(&bound_outs[i], type, cls, reg); + if (outs[i].dir == ASM_INOUT) { + NativeLoc loc = aa_reg_loc(type, cls, reg); + aa_direct_load_operand_to_reg(d, out_ops[i], loc); + } + } else if (body[0] == 'm') { + Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); + NativeLoc loc = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), + NATIVE_REG_INT, reg); + CfreeCgTypeId type = outs[i].type ? 
outs[i].type : out_ops[i].type; + aa_direct_load_address_to_reg(d, out_ops[i], loc); + aa_asm_bound_mem(&bound_outs[i], type, reg); + } else { + aa_asm_panic(d, "unsupported output constraint"); + } + } + + for (u32 i = 0; i < nin; ++i) { + const char* body = aa_asm_constraint_body(ins[i].str); + int matched = aa_asm_match_index(body); + if (matched >= 0) { + if ((u32)matched >= nout) + aa_asm_panic(d, "matching constraint out of range"); + if (aa_asm_constraint_early(outs[matched].str)) + aa_asm_panic(d, "matching input names early-clobber output"); + if (bound_outs[matched].kind != AA64_INLINE_OPK_REG) + aa_asm_panic(d, "matching constraint requires register output"); + bound_ins[i] = bound_outs[matched]; + aa_direct_load_operand_to_reg( + d, in_ops[i], + aa_reg_loc(bound_ins[i].type, + bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT, + (Reg)bound_ins[i].v.local)); + continue; + } + if (body[0] == 'r' || body[0] == 'w') { + NativeAllocClass cls = aa_asm_constraint_class(d, body); + Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp); + CfreeCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; + aa_asm_bound_reg(&bound_ins[i], type, cls, reg); + aa_direct_load_operand_to_reg(d, in_ops[i], + aa_reg_loc(type, cls, reg)); + } else if (body[0] == 'i') { + if (in_ops[i].kind != OPK_IMM) + aa_asm_panic(d, "immediate constraint requires immediate operand"); + bound_ins[i] = in_ops[i]; + } else if (body[0] == 'm') { + Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); + NativeLoc loc = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), + NATIVE_REG_INT, reg); + CfreeCgTypeId type = ins[i].type ? 
ins[i].type : in_ops[i].type; + aa_direct_load_address_to_reg(d, in_ops[i], loc); + aa_asm_bound_mem(&bound_ins[i], type, reg); + } else { + aa_asm_panic(d, "unsupported input constraint"); + } + } + + saved = aa_asm_save_callee_clobbers(d, clob_int, clob_fp, &nsaved); + a = aa64_asm_open(d->base.c); + aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, + nclob); + aa64_asm_run_template(a, d->native->mc, tmpl); + aa64_asm_close(a); + + for (u32 i = 0; i < nout; ++i) { + NativeAllocClass cls; + NativeLoc src; + if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue; + cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP + : NATIVE_REG_INT; + src = aa_reg_loc(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); + aa_direct_store_reg_to_operand(d, out_ops[i], src); + } + for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(d, &saved[i - 1u]); +} + +static const NativeOps aa_direct_ops = { + .bind_param = aa_bind_param, + .tail_call_unrealizable_reason = aa_no_tail, + .va_start_ = aa_va_start_, + .va_arg_ = aa_va_arg_, + .va_end_ = aa_va_end_, + .va_copy_ = aa_va_copy_, + .asm_block = aa_direct_asm_block, +}; + +const NativeOps* aa64_native_direct_ops(void) { return &aa_direct_ops; } diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -1,2908 +0,0 @@ -/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics, - * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */ - -#include "arch/aa64/internal.h" -#include "cfree/config.h" -#include "core/slice.h" - -/* ============================================================ - * Data movement - * ============================================================ */ - -static RelocKind ldst_lo12_reloc_for(u32 nbytes); - -static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) { - u32 sf = type_is_64(dst.type) ? 
1u : 0u; - aa64_emit_load_imm(t->mc, sf, reg_num(dst), imm); -} - -static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) { - AAImpl* a = impl_of(t); - if (dst.cls != RC_FP) { - compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1"); - } - - Sym ro_name = pool_intern_slice(t->c->global, SLICE_LIT(".rodata")); - ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u); - - u32 cur_section = t->mc->section_id; - t->mc->set_section(t->mc, ro); - u32 ro_off = obj_align_to(t->obj, ro, cb.align ? cb.align : 4); - t->mc->emit_bytes(t->mc, cb.bytes, cb.size); - - char namebuf[64]; - static u32 lit_seq = 0; - int len = 0; - { - const char* prefix = ".LCFP"; - for (; prefix[len]; ++len) namebuf[len] = prefix[len]; - u32 v = lit_seq++; - char tmp[16]; - int tn = 0; - if (v == 0) - tmp[tn++] = '0'; - else { - while (v) { - tmp[tn++] = '0' + (char)(v % 10); - v /= 10; - } - } - for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i]; - namebuf[len] = 0; - } - Sym sname = pool_intern_slice(t->c->global, slice_from_cstr(namebuf)); - ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off, - (u64)cb.size); - - t->mc->set_section(t->mc, cur_section); - - u32 adrp_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0)); - t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, - sym, 0, 0, 0); - - u32 ldr_pos = t->mc->pos(t->mc); - u32 sidx = size_idx_for_bytes(cb.size); - if (cb.size == 16) - aa64_emit32(t->mc, aa64_ldr_q_uimm(reg_num(dst), AA_TMP0, 0)); - else - aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); - RelocKind lo12 = ldst_lo12_reloc_for(cb.size); - t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0); -} - -static void aa_copy(CGTarget* t, Operand dst, Operand src) { - if (dst.cls == RC_FP && src.cls == RC_INT) { - u32 sz = type_byte_size(dst.type); - aa64_emit32(t->mc, sz == 8 ? 
aa64_fmov_d_x(reg_num(dst), reg_num(src)) - : aa64_fmov_s_w(reg_num(dst), reg_num(src))); - return; - } - if (dst.cls == RC_INT && src.cls == RC_FP) { - u32 sz = type_byte_size(src.type); - aa64_emit32(t->mc, sz == 8 ? aa64_fmov_x_d(reg_num(dst), reg_num(src)) - : aa64_fmov_w_s(reg_num(dst), reg_num(src))); - return; - } - if (dst.cls == RC_FP || src.cls == RC_FP) { - if (type_byte_size(dst.type) == 16) { - aa64_emit32(t->mc, aa64_mov_v16b(reg_num(dst), reg_num(src))); - } else { - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src))); - } - return; - } - u32 sf = type_is_64(dst.type) ? 1u : 0u; - aa64_emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src))); -} - -/* ============================================================ - * Load / store - * ============================================================ */ - -static RelocKind ldst_lo12_reloc_for(u32 nbytes) { - switch (nbytes) { - case 1: - return R_AARCH64_LDST8_ABS_LO12_NC; - case 2: - return R_AARCH64_LDST16_ABS_LO12_NC; - case 4: - return R_AARCH64_LDST32_ABS_LO12_NC; - case 8: - return R_AARCH64_LDST64_ABS_LO12_NC; - case 16: - return R_AARCH64_LDST128_ABS_LO12_NC; - default: - return R_AARCH64_LDST64_ABS_LO12_NC; - } -} - -static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, - i32 off) { - if (off < -256 || off > 255) { - aa64_emit_addr_adjust(mc, AA_TMP0, rn, off); - rn = AA_TMP0; - off = 0; - } - if (sidx == 4) - aa64_emit32(mc, aa64_ldur_q(rt, rn, off)); - else - aa64_emit32(mc, aa64_ldur_fp(sidx, rt, rn, off)); -} - -static void aa_emit_str_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, - i32 off) { - if (off < -256 || off > 255) { - aa64_emit_addr_adjust(mc, AA_TMP0, rn, off); - rn = AA_TMP0; - off = 0; - } - if (sidx == 4) - aa64_emit32(mc, aa64_stur_q(rt, rn, off)); - else - aa64_emit32(mc, aa64_stur_fp(sidx, rt, rn, off)); -} - -static void aa_emit_ldr_fp_uimm_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, - 
u32 off) { - if (sidx == 4) - aa64_emit32(mc, aa64_ldr_q_uimm(rt, rn, off)); - else - aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, rt, rn, off)); -} - -static void aa_emit_str_fp_uimm_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, - u32 off) { - if (sidx == 4) - aa64_emit32(mc, aa64_str_q_uimm(rt, rn, off)); - else - aa64_emit32(mc, aa64_str_fp_uimm(sidx, rt, rn, off)); -} - -static int use_got_for_sym(CGTarget* t, ObjSymId sym) { - return obj_symbol_extern_via_got(t->c, t->obj, sym); -} - -/* Effective-address descriptor produced by addr_mode. Mirrors the - * Operand.v.ind shape after any required fixups (offset folded into a - * scratch register when out of range, GLOBAL materialized into a register). - * `index == REG_NONE` means plain base+offset; otherwise the indexed - * register-offset form should be used and ofs is always 0. */ -typedef struct AAAddrMode { - u32 base; /* physical register holding the base */ - u32 index; /* physical register holding the index, or REG_NONE */ - u32 log2_scale; /* 0..3 — only valid when index != REG_NONE */ - i32 ofs; /* signed displacement; 0 when index != REG_NONE */ -} AAAddrMode; - -/* Resolve an Operand addressing form to an AAAddrMode usable by the - * load/store emitters. Handles all base kinds (LOCAL, INDIRECT, GLOBAL) - * and folds out-of-range offsets through `tmp_reg` via - * aa64_emit_addr_adjust, matching the prior addr_base contract. - * - * When the input INDIRECT carries an index, this routine preserves it in - * the result. If a nonzero displacement is also present, it is added to - * the base via the temp register so the indexed register-offset - * instruction (which encodes no displacement) can use {tmp, index, 0}. 
*/ -static AAAddrMode addr_mode(CGTarget* t, Operand addr, u32 tmp_reg) { - AAImpl* a = impl_of(t); - AAAddrMode m; - m.base = 0u; - m.index = REG_NONE; - m.log2_scale = 0u; - m.ofs = 0; - - if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_mode: bad slot"); - i32 off = -(i32)s->off; - if (off >= -256 && off <= 255) { - m.base = 29u; - m.ofs = off; - } else { - aa64_emit_addr_adjust(t->mc, tmp_reg, 29u, off); - m.base = tmp_reg; - m.ofs = 0; - } - return m; - } - if (addr.kind == OPK_INDIRECT) { - i32 off = addr.v.ind.ofs; - u32 base = addr.v.ind.base & 0x1fu; - Reg idx = addr.v.ind.index; - if (idx == REG_NONE) { - if (off >= -256 && off <= 255) { - m.base = base; - m.ofs = off; - } else { - aa64_emit_addr_adjust(t->mc, tmp_reg, base, off); - m.base = tmp_reg; - m.ofs = 0; - } - return m; - } - /* Indexed: fold any displacement into the base so the indexed - * register-offset instruction can encode just {base, index, scale}. */ - if (off != 0) { - aa64_emit_addr_adjust(t->mc, tmp_reg, base, off); - m.base = tmp_reg; - } else { - m.base = base; - } - m.index = (u32)idx & 0x1fu; - m.log2_scale = addr.v.ind.log2_scale & 0x3u; - m.ofs = 0; - return m; - } - if (addr.kind == OPK_GLOBAL) { - emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend); - m.base = tmp_reg; - m.ofs = 0; - return m; - } - compiler_panic(t->c, a->loc, "aarch64 addr_mode: unsupported kind %d", - (int)addr.kind); -} - -/* Assert that an Operand consumed by a non-load/store path carries no - * EA index. Per doc/INDIRECT.md the cg layer never routes an indexed - * OPK_INDIRECT to spill/reload, bitfield, atomics, copy_bytes/set_bytes, - * inline asm, or addr_of; the assert catches upstream misrouting before - * it silently produces incorrect addressing. 
*/ -static inline void aa_assert_no_index(CGTarget* t, Operand addr, - const char* where) { - if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 %.*s: OPK_INDIRECT with index unexpected", - SLICE_ARG(slice_from_cstr(where))); - } -} - -/* LDR (register), 32-bit option=LSL. Encodes - * LDR<size> Wt|Xt, [Xn, Xm{, LSL #amt}] (integer) - * where size in {0..3} selects byte/half/word/double; opc=01 (load). - * S=0 -> no shift (amt=0); S=1 -> shift by `size` (amt=size). - * The aarch64 register-offset addressing mode supports only those two - * shift amounts (other values must be lowered upstream). */ -static inline u32 aa64_ldr_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x38606800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} -static inline u32 aa64_str_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x38206800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} -static inline u32 aa64_ldr_fp_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x3C606800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} -static inline u32 aa64_str_fp_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x3C206800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} -/* 128-bit Q register-offset variants (size encoded as size=00, opc bit - * pattern 11 selects 128b). 
*/ -static inline u32 aa64_ldr_q_reg(u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x3CE06800u | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} -static inline u32 aa64_str_q_reg(u32 Rt, u32 Rn, u32 Rm, u32 S) { - return 0x3CA06800u | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) | - ((Rn & 0x1fu) << 5) | (Rt & 0x1fu); -} - -/* True if `log2_scale` is legal for the aarch64 register-offset form at - * a given access size index (sidx). The encoding supports S=0 (LSL #0) - * and S=1 (LSL #sidx) — any other scale must be lowered by adding - * `index << log2_scale` into the base via arch_lower_indexed before the - * load/store. */ -static inline int aa_indexed_scale_legal(u32 sidx, u32 log2_scale, u32* S_out) { - if (log2_scale == 0u) { - *S_out = 0u; - return 1; - } - if (log2_scale == sidx) { - *S_out = 1u; - return 1; - } - return 0; -} - -void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { - u32 sz = ma.size ? ma.size : type_byte_size(addr.type); - u32 sidx = size_idx_for_bytes(sz); - - if (addr.kind == OPK_GLOBAL) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - ObjSymId sym = addr.v.global.sym; - i64 add = addr.v.global.addend; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, AA_TMP0, sym); - if (dst.cls == RC_FP) { - aa_emit_ldr_fp_any(mc, sidx, reg_num(dst), AA_TMP0, (i32)add); - } else { - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add)); - } - return; - } - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(AA_TMP0)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, - 0, 0); - u32 ld_pos = mc->pos(mc); - if (dst.cls == RC_FP) { - aa_emit_ldr_fp_uimm_any(mc, sidx, reg_num(dst), AA_TMP0, 0); - } else { - aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0)); - } - mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0); - return; - } - - /* Indexed register-offset form: emit `LDR Rt, [Rn, Rm{, LSL #s}]` when - * the 
EA carries an index and the scale matches the encoding (S=0 → - * LSL #0, S=1 → LSL #sidx). Otherwise fall back to - * arch_lower_indexed, which materializes base+(index<<scale) into a - * scratch and gives us a plain base+disp shape. */ - if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) { - u32 S; - if (aa_indexed_scale_legal(sidx, addr.v.ind.log2_scale & 0x3u, &S)) { - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (dst.cls == RC_FP) { - if (sidx == 4u) - aa64_emit32(t->mc, aa64_ldr_q_reg(reg_num(dst), m.base, m.index, S)); - else - aa64_emit32(t->mc, - aa64_ldr_fp_reg(sidx, reg_num(dst), m.base, m.index, S)); - } else { - aa64_emit32(t->mc, - aa64_ldr_reg(sidx, reg_num(dst), m.base, m.index, S)); - } - return; - } - addr = arch_lower_indexed(t, addr, AA_TMP0); - } - - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (dst.cls == RC_FP) { - aa_emit_ldr_fp_any(t->mc, sidx, reg_num(dst), m.base, m.ofs); - } else { - aa64_emit32(t->mc, aa64_ldur(sidx, reg_num(dst), m.base, m.ofs)); - } -} - -void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { - u32 sz = ma.size ? ma.size : type_byte_size(addr.type); - u32 sidx = size_idx_for_bytes(sz); - - if (addr.kind == OPK_GLOBAL) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - ObjSymId sym = addr.v.global.sym; - i64 add = addr.v.global.addend; - - u32 src_reg; - u32 src_is_fp = 0; - /* Zero immediate stores use wzr/xzr directly (reg 31). Avoids a - * separate `mov wN, #0` and frees AA_TMP0 for the address base. */ - int src_imm_zero = - (src.kind == OPK_IMM && src.v.imm == 0 && src.cls != RC_FP); - if (src_imm_zero) { - src_reg = 31u; - } else if (src.kind == OPK_IMM) { - u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm); - src_reg = AA_TMP0; - } else if (src.cls == RC_FP) { - src_reg = reg_num(src); - src_is_fp = 1; - } else { - src_reg = reg_num(src); - } - u32 base = (src.kind == OPK_IMM && !src_imm_zero) ? 
AA_TMP1 : AA_TMP0; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, base, sym); - if (src_is_fp) { - aa_emit_str_fp_any(mc, sidx, src_reg, base, (i32)add); - } else { - aa64_emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add)); - } - return; - } - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(base)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, - 0, 0); - u32 st_pos = mc->pos(mc); - if (src_is_fp) { - aa_emit_str_fp_uimm_any(mc, sidx, src_reg, base, 0); - } else { - aa64_emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0)); - } - mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0); - return; - } - - /* Zero immediate stores use wzr/xzr directly (reg 31). */ - int src_imm_zero = - (src.kind == OPK_IMM && src.v.imm == 0 && src.cls != RC_FP); - u32 addr_tmp = (src.kind == OPK_IMM && !src_imm_zero) ? AA_TMP1 : AA_TMP0; - - /* Indexed register-offset form for STR when the EA's scale is legal. - * Falls back to arch_lower_indexed when LSL doesn't fit the - * instruction encoding (e.g. byte access with log2_scale=3). */ - if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) { - u32 S; - if (aa_indexed_scale_legal(sidx, addr.v.ind.log2_scale & 0x3u, &S)) { - AAAddrMode m = addr_mode(t, addr, addr_tmp); - u32 src_reg; - if (src_imm_zero) { - src_reg = 31u; - } else if (src.kind == OPK_IMM) { - u32 sf = (sz == 8) ? 
1u : 0u; - aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm); - src_reg = AA_TMP0; - } else { - src_reg = reg_num(src); - } - if (src.cls == RC_FP && !src_imm_zero) { - if (sidx == 4u) - aa64_emit32(t->mc, aa64_str_q_reg(src_reg, m.base, m.index, S)); - else - aa64_emit32(t->mc, - aa64_str_fp_reg(sidx, src_reg, m.base, m.index, S)); - } else { - aa64_emit32(t->mc, aa64_str_reg(sidx, src_reg, m.base, m.index, S)); - } - return; - } - addr = arch_lower_indexed(t, addr, addr_tmp); - } - - AAAddrMode m = addr_mode(t, addr, addr_tmp); - - if (src_imm_zero) { - aa64_emit32(t->mc, aa64_stur(sidx, 31u, m.base, m.ofs)); - return; - } - if (src.kind == OPK_IMM) { - u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm); - aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, m.base, m.ofs)); - return; - } - if (src.cls == RC_FP) { - aa_emit_str_fp_any(t->mc, sidx, reg_num(src), m.base, m.ofs); - } else { - aa64_emit32(t->mc, aa64_stur(sidx, reg_num(src), m.base, m.ofs)); - } -} - -static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) { - AAImpl* a = impl_of(t); - if (lv.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, lv.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot"); - aa64_emit_addr_adjust(t->mc, reg_num(dst), 29, -(i32)s->off); - return; - } - if (lv.kind == OPK_INDIRECT) { - aa_assert_no_index(t, lv, "addr_of"); - i32 ofs = lv.v.ind.ofs; - u32 base = lv.v.ind.base & 0x1f; - aa64_emit_addr_adjust(t->mc, reg_num(dst), base, ofs); - return; - } - if (lv.kind == OPK_GLOBAL) { - u32 rd = reg_num(dst); - ObjSymId sym = lv.v.global.sym; - i64 addend = lv.v.global.addend; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, rd, sym); - if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend); - return; - } - u32 sec = t->mc->section_id; - u32 adrp_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_adrp_base(rd)); - t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, - addend, 0, 
0); - u32 add_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0)); - t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, - addend, 0, 0); - return; - } - compiler_panic(t->c, impl_of(t)->loc, "aarch64: addr_of not implemented"); -} - -static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - u32 rd = reg_num(dst); - - if (obj_format_tls_via_descriptor(t->c)) { - /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's - * ABI is custom — x0 in/out as descriptor → TLV addr, all other - * regs preserved — so we materialize via x0 and copy to `dst` only - * when they differ. x0/x1 are scratch here (the regalloc only hands - * out x19-x28), and x30 was saved at the prologue. - * - * adrp x0, sym@TLVPPAGE ; R_AARCH64_TLVP_LOAD_PAGE21 - * ldr x0, [x0, sym@TLVPPAGEOFF] ; R_AARCH64_TLVP_LOAD_PAGEOFF12 - * ldr x1, [x0] ; descriptor[0] = thunk pointer - * blr x1 ; x0 in/out - * mov xdst, x0 ; only if dst != x0 - * - * TLVP relocs do not carry an addend; nonzero addends are applied - * after the call as a follow-on ADD/SUB on `dst`. */ - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0, - 0); - u32 ldr_pos = mc->pos(mc); - aa64_emit32(mc, - aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0)); - mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0, - 0, 0); - aa64_emit32(mc, - aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0)); - aa64_emit32(mc, aa64_blr(/*Rn=*/1)); - if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0)); - if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend); - return; - } - - /* Windows-on-ARM64 TLS Local-Exec. 
- * - * ldr xd, [x18, #0x58] ; xd = TEB->TlsSlots (TLS array) - * adrp x16, _tls_index ; ADR_PREL_PG_HI21 - * ldr w16, [x16, :lo12:_tls_index] ; LDST32_ABS_LO12_NC - * add xd, xd, x16, lsl #3 ; xd += index*8 - * ldr xd, [xd] ; xd = per-image TLS block base - * add xd, xd, #:secrel_hi12:sym, lsl#12 ; SECREL_HIGH12A - * add xd, xd, #:secrel_lo12:sym ; SECREL_LOW12A - * - * x16 (IP0) is a caller-saved intra-procedure-call scratch reg, - * always safe to clobber inside a function body. The two ADD-imm12 - * SECREL fixups assume the merged .tls section is < 16 MiB; cfree - * panics with a clear diagnostic at link time if that ever fails. */ - if (t->c->target.os == CFREE_OS_WINDOWS) { - Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index")); - ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name); - if (idx_sym == 0) { - idx_sym = - obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); - } - /* Windows ARM64 reserves x18 as the TEB pointer. Do not read - * TPIDR_EL0 here; Wine and real Windows expose the TLS slots via - * x18 + 0x58, matching clang/llvm-mingw codegen. */ - aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, rd, /*Rn=*/18, - /*byte_off=*/0x58)); - - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(/*Rd=*/16)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, idx_sym, 0, - 0, 0); - u32 ldr_pos = mc->pos(mc); - aa64_emit32( - mc, aa64_ldr_uimm(/*size=*/2, /*Rt=*/16, /*Rn=*/16, /*byte_off=*/0)); - mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LDST32_ABS_LO12_NC, idx_sym, - 0, 0, 0); - - /* add xd, xd, x16, LSL #3: - * 0x8B000000 | (Rm << 16) | (3 << 10) | (Rn << 5) | Rd - * sf=1, shift=LSL (00), Rm=16. */ - u32 add_shr = 0x8B000000u | (16u << 16) | (3u << 10) | ((rd & 0x1fu) << 5) | - (rd & 0x1fu); - aa64_emit32(mc, add_shr); - aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, rd, rd, /*byte_off=*/0)); - - /* add xd, xd, #(0 << 12), then patch HIGH12A: sh=1 in the encoding. 
*/ - u32 hi_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, rd, /*imm12=*/0, /*sh=*/1)); - mc->emit_reloc_at(mc, sec, hi_pos, R_COFF_AARCH64_SECREL_HIGH12A, sym, - addend, 0, 0); - u32 lo_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, rd, /*imm12=*/0, /*sh=*/0)); - mc->emit_reloc_at(mc, sec, lo_pos, R_COFF_AARCH64_SECREL_LOW12A, sym, - addend, 0, 0); - return; - } - - aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0)); - - u32 hi_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1)); - mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, - addend, 0, 0); - - u32 lo_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0)); - mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym, - addend, 0, 0); -} - -/* ============================================================ - * Aggregate helpers - * ============================================================ */ - -static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) { - if (op.kind == OPK_REG) return reg_num(op); - if (op.kind == OPK_LOCAL) { - AAImpl* a = impl_of(t); - AASlot* s = aa64_slot_get(a, op.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot"); - aa64_emit_addr_adjust(t->mc, scratch, 29, -(i32)s->off); - return scratch; - } - if (op.kind == OPK_GLOBAL) { - emit_global_addr(t, scratch, op.v.global.sym, op.v.global.addend); - return scratch; - } - if (op.kind == OPK_INDIRECT) { - /* Aggregate helpers (copy_bytes/set_bytes, bitfield_*) take plain - * pointer addresses; the cg contract guarantees no EA index here. 
*/ - aa_assert_no_index(t, op, "agg address"); - u32 base = op.v.ind.base & 0x1fu; - i32 ofs = op.v.ind.ofs; - if (ofs == 0) return base; - aa64_emit_addr_adjust(t->mc, scratch, base, ofs); - return scratch; - } - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 agg: address kind %d unsupported", (int)op.kind); -} - -static void aa_emit_load_at(MCEmitter* mc, u32 size, u32 rt, u32 rn, u32 off) { - if (off <= 255u) - aa64_emit32(mc, aa64_ldur(size, rt, rn, (i32)off)); - else - aa64_emit32(mc, aa64_ldr_uimm(size, rt, rn, off)); -} - -static void aa_emit_store_at(MCEmitter* mc, u32 size, u32 rt, u32 rn, u32 off) { - if (off <= 255u) - aa64_emit32(mc, aa64_stur(size, rt, rn, (i32)off)); - else - aa64_emit32(mc, aa64_str_uimm(size, rt, rn, off)); -} - -static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr, - AggregateAccess agg) { - MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); - u32 sr = agg_addr_reg(t, src_addr, (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1); - u32 nbytes = agg.size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa_emit_load_at(mc, 3, AA_TMP2, sr, i); - aa_emit_store_at(mc, 3, AA_TMP2, dr, i); - i += 8; - } - while (i + 4 <= nbytes) { - aa_emit_load_at(mc, 2, AA_TMP2, sr, i); - aa_emit_store_at(mc, 2, AA_TMP2, dr, i); - i += 4; - } - while (i + 2 <= nbytes) { - aa_emit_load_at(mc, 1, AA_TMP2, sr, i); - aa_emit_store_at(mc, 1, AA_TMP2, dr, i); - i += 2; - } - while (i < nbytes) { - aa_emit_load_at(mc, 0, AA_TMP2, sr, i); - aa_emit_store_at(mc, 0, AA_TMP2, dr, i); - i += 1; - } -} - -static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, - AggregateAccess agg) { - MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); - - u32 byte; - if (byte_value.kind == OPK_IMM) { - byte = (u32)(byte_value.v.imm & 0xffu); - } else { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 set_bytes: REG byte not yet supported"); - } - u32 nbytes = agg.size; - - if (byte == 0) { - u32 i = 0; - while (i + 8 
<= nbytes) { - aa_emit_store_at(mc, 3, 31, dr, i); - i += 8; - } - while (i + 4 <= nbytes) { - aa_emit_store_at(mc, 2, 31, dr, i); - i += 4; - } - while (i + 2 <= nbytes) { - aa_emit_store_at(mc, 1, 31, dr, i); - i += 2; - } - while (i < nbytes) { - aa_emit_store_at(mc, 0, 31, dr, i); - i += 1; - } - return; - } - - u64 b64 = byte; - b64 |= b64 << 8; - b64 |= b64 << 16; - b64 |= b64 << 32; - aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64); - - u32 i = 0; - while (i + 8 <= nbytes) { - aa_emit_store_at(mc, 3, AA_TMP1, dr, i); - i += 8; - } - while (i + 4 <= nbytes) { - aa_emit_store_at(mc, 2, AA_TMP1, dr, i); - i += 4; - } - while (i + 2 <= nbytes) { - aa_emit_store_at(mc, 1, AA_TMP1, dr, i); - i += 2; - } - while (i < nbytes) { - aa_emit_store_at(mc, 0, AA_TMP1, dr, i); - i += 1; - } -} - -/* ============================================================ - * Bitfields - * ============================================================ */ - -static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr, - BitFieldAccess bf) { - MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, AA_TMP0); - u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; - u32 sf = (storage_bytes == 8u) ? 1u : 0u; - u32 sidx = size_idx_for_bytes(storage_bytes); - u32 rd = reg_num(dst); - - aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset)); - u32 lsb = bf.bit_offset; - u32 width = bf.bit_width ? bf.bit_width : 1u; - u32 imms = lsb + width - 1u; - if (bf.signed_) { - aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms)); - } else { - aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms)); - } -} - -static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src, - BitFieldAccess bf) { - MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, AA_TMP0); - u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; - u32 sf = (storage_bytes == 8u) ? 
1u : 0u; - u32 sidx = size_idx_for_bytes(storage_bytes); - - aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); - - u32 src_reg; - if (src.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm); - src_reg = AA_TMP2; - } else if (src.kind == OPK_REG) { - src_reg = reg_num(src); - } else { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 bitfield_store: src kind %d unsupported", - (int)src.kind); - } - - u32 reg_size = sf ? 64u : 32u; - u32 lsb = bf.bit_offset; - u32 width = bf.bit_width ? bf.bit_width : 1u; - u32 immr = (reg_size - lsb) % reg_size; - u32 imms = width - 1u; - aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms)); - - aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); -} - -/* ============================================================ - * Arithmetic helpers - * ============================================================ */ - -u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) { - if (op.kind == OPK_REG) return reg_num(op); - if (op.kind == OPK_IMM) { - aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm); - return scratch; - } - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 binop: operand kind %d unsupported", (int)op.kind); -} - -static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, - Operand b_op) { - MCEmitter* mc = t->mc; - - if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { - if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 binop: FP op requires REG operands"); - } - u32 type = type_is_fp_double(dst.type) ? 
1u : 0u; - u32 rd = reg_num(dst); - u32 rn = reg_num(a_op); - u32 rm = reg_num(b_op); - u32 w; - switch (op) { - case BO_FADD: - w = aa64_fadd(type, rd, rn, rm); - break; - case BO_FSUB: - w = aa64_fsub(type, rd, rn, rm); - break; - case BO_FMUL: - w = aa64_fmul(type, rd, rn, rm); - break; - case BO_FDIV: - w = aa64_fdiv(type, rd, rn, rm); - break; - default: - w = 0; - break; - } - aa64_emit32(mc, w); - return; - } - - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 rd = reg_num(dst); - - switch (op) { - case BO_IADD: - case BO_AND: - case BO_OR: - case BO_XOR: { - if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) { - Operand t_op = a_op; - a_op = b_op; - b_op = t_op; - } - break; - } - default: - break; - } - - if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { - u32 rn_reg = reg_num(a_op); - i64 imm = b_op.v.imm; - u32 imm12, sh, N, immr, imms; - switch (op) { - case BO_IADD: - if (aa64_addsub_imm_fits(imm, &imm12, &sh)) { - aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh)); - return; - } - break; - case BO_ISUB: - if (aa64_addsub_imm_fits(imm, &imm12, &sh)) { - aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh)); - return; - } - break; - case BO_AND: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_OR: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_XOR: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_SHL: { - u32 width = sf ? 64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - case BO_SHR_U: { - u32 width = sf ? 
64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - case BO_SHR_S: { - u32 width = sf ? 64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - default: - break; - } - } - - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); - - u32 word; - switch (op) { - case BO_IADD: - word = aa64_add(sf, rd, rn, rm); - break; - case BO_ISUB: - word = aa64_sub(sf, rd, rn, rm); - break; - case BO_IMUL: - word = aa64_mul(sf, rd, rn, rm); - break; - case BO_AND: - word = aa64_and(sf, rd, rn, rm); - break; - case BO_OR: - word = aa64_orr(sf, rd, rn, rm); - break; - case BO_XOR: - word = aa64_eor(sf, rd, rn, rm); - break; - case BO_SHL: - word = aa64_lslv(sf, rd, rn, rm); - break; - case BO_SHR_U: - word = aa64_lsrv(sf, rd, rn, rm); - break; - case BO_SHR_S: - word = aa64_asrv(sf, rd, rn, rm); - break; - case BO_UDIV: - word = aa64_udiv(sf, rd, rn, rm); - break; - case BO_SDIV: - word = aa64_sdiv(sf, rd, rn, rm); - break; - case BO_SREM: - aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm)); - word = aa64_msub(sf, rd, AA_TMP2, rm, rn); - break; - case BO_UREM: - aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm)); - word = aa64_msub(sf, rd, AA_TMP2, rm, rn); - break; - case BO_FADD: - case BO_FSUB: - case BO_FMUL: - case BO_FDIV: - default: - compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl", - (int)op); - } - aa64_emit32(mc, word); -} - -static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) { - MCEmitter* mc = t->mc; - u32 rd = reg_num(dst); - u32 word; - - if (op == UO_FNEG) { - if (dst.cls != RC_FP || a_op.kind != OPK_REG || a_op.cls != RC_FP) { - compiler_panic(t->c, impl_of(t)->loc, - 
"aarch64 unop: FP neg requires FP REG operand"); - } - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fneg(type, rd, reg_num(a_op))); - return; - } - - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - switch (op) { - case UO_NEG: - word = aa64_neg(sf, rd, rn); - break; - case UO_BNOT: - word = aa64_mvn(sf, rd, rn); - break; - case UO_NOT: - aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0)); - word = aa64_cset_eq(sf, rd); - break; - default: - compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl", - (int)op); - } - aa64_emit32(mc, word); -} - -static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 rd = reg_num(dst); - u32 rn = reg_num(src); - - switch (k) { - case CV_SEXT: { - if (src.cls != RC_INT || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes"); - } - u32 src_bits = type_byte_size(src.type) * 8u; - u32 dst_bits = type_byte_size(dst.type) * 8u; - u32 sf_dst = type_is_64(dst.type) ? 1u : 0u; - if (src_bits >= dst_bits) { - aa64_emit32(mc, aa64_mov_reg(sf_dst, rd, rn)); - return; - } - aa64_emit32( - mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u)); - return; - } - case CV_ZEXT: { - if (src.cls != RC_INT || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes"); - } - u32 src_bits = type_byte_size(src.type) * 8u; - u32 dst_bits = type_byte_size(dst.type) * 8u; - u32 sf_dst = type_is_64(dst.type) ? 1u : 0u; - if (src_bits >= dst_bits || src_bits == 32u) { - aa64_emit32(mc, aa64_mov_reg(src_bits == 32u ? 0u : sf_dst, rd, rn)); - } else { - aa64_emit32( - mc, aa64_ubfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u)); - } - return; - } - case CV_TRUNC: { - aa64_emit32(mc, aa64_mov_reg(0, rd, rn)); - return; - } - case CV_ITOF_S: { - u32 sf_src = type_is_64(src.type) ? 
1u : 0u; - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn)); - return; - } - case CV_ITOF_U: { - u32 sf_src = type_is_64(src.type) ? 1u : 0u; - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn)); - return; - } - case CV_FTOI_S: { - if (src.cls != RC_FP || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes"); - } - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 type = type_is_fp_double(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn)); - return; - } - case CV_FTOI_U: { - if (src.cls != RC_FP || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes"); - } - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 type = type_is_fp_double(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn)); - return; - } - case CV_FEXT: { - aa64_emit32(mc, aa64_fcvt_d_s(rd, rn)); - return; - } - case CV_FTRUNC: { - aa64_emit32(mc, aa64_fcvt_s_d(rd, rn)); - return; - } - case CV_BITCAST: { - if (src.cls == RC_INT && dst.cls == RC_FP) { - u32 sz = type_byte_size(dst.type); - aa64_emit32(mc, - sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn)); - } else if (src.cls == RC_FP && dst.cls == RC_INT) { - u32 sz = type_byte_size(src.type); - aa64_emit32(mc, - sz == 8 ? 
aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn)); - } else { - compiler_panic(t->c, a->loc, - "aarch64 convert BITCAST: same-class not yet supported"); - } - return; - } - default: - compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k); - } -} - -/* ============================================================ - * Calls - * ============================================================ */ - -static Operand aa_call_stack_arg_addr(CGTarget* t, u32 stack_offset, int tail) { - AAImpl* a = impl_of(t); - Operand addr; - memset(&addr, 0, sizeof addr); - addr.kind = OPK_INDIRECT; - addr.cls = RC_INT; - addr.v.ind.base = tail && !a->omit_frame ? 29u : 31u; - addr.v.ind.index = REG_NONE; - addr.v.ind.ofs = (i32)stack_offset; - if (tail && !a->omit_frame) addr.v.ind.ofs += 16; - return addr; -} - -static void aa_check_tail_stack_args(CGTarget* t, u32 stack_size) { - AAImpl* a = impl_of(t); - if (stack_size > a->next_param_stack) { - compiler_panic(t->c, a->loc, - "aarch64 tail call: stack argument area too small"); - } -} - -static u32 aa_call_plan_stack_raw_size(const CGCallPlan* p) { - u32 size = 0; - for (u32 i = 0; i < p->nargs; ++i) { - const CGCallPlanMove* m = &p->args[i]; - if (m->dst_kind == CG_CALL_PLAN_STACK || - m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); - if (end > size) size = end; - } - } - return size; -} - -static void aa_store_stack_reg(CGTarget* t, u32 reg, RegClass cls, - CfreeCgTypeId type, u32 size, u32 stack_offset, - int tail) { - Operand addr = aa_call_stack_arg_addr(t, stack_offset, tail); - Operand src; - MemAccess ma; - memset(&src, 0, sizeof src); - memset(&ma, 0, sizeof ma); - src.kind = OPK_REG; - src.cls = (u8)cls; - src.type = type; - src.v.reg = reg; - addr.type = type; - ma.type = type; - ma.size = size; - ma.align = size ? 
size : 1u; - aa_store(t, addr, src, ma); -} - -static int aa_windows_fp_vararg(const CGTarget* t, const CGABIValue* av) { - return t->c->target.os == CFREE_OS_WINDOWS && av && av->abi == NULL && - av->storage.cls == RC_FP; -} - -static void aa_move_fp_to_int_reg(MCEmitter* mc, u32 dst_reg, Operand src, - u32 size) { - if (size == 8) - aa64_emit32(mc, aa64_fmov_x_d(dst_reg, reg_num(src))); - else - aa64_emit32(mc, aa64_fmov_w_s(dst_reg, reg_num(src))); -} - -static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, - const CGABIValue* av, u32* next_int, u32* next_fp, - u32* stack_off, int tail) { - AAImpl* a = impl_of(t); - ABIArgInfo va_ai; - ABIArgPart va_pt; - const ABIArgInfo* ai = av->abi; - if (!ai) { - u32 sz = type_byte_size(av->type); - memset(&va_ai, 0, sizeof va_ai); - memset(&va_pt, 0, sizeof va_pt); - va_ai.kind = ABI_ARG_DIRECT; - va_ai.parts = &va_pt; - va_ai.nparts = 1; - va_pt.cls = - aa_windows_fp_vararg(t, av) - ? ABI_CLASS_INT - : ((av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT); - va_pt.size = sz; - va_pt.align = sz; - va_pt.src_offset = 0; - ai = &va_ai; - if (fi && fi->vararg_on_stack) { - *next_int = 8; - *next_fp = 8; - } - } - if (ai->kind == ABI_ARG_IGNORE) return; - - if (ai->kind == ABI_ARG_INDIRECT) { - u32 dst_reg; - int to_stack = (*next_int >= 8); - if (!to_stack) - dst_reg = (*next_int)++; - else - dst_reg = AA_TMP0; - if (av->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot"); - aa64_emit_addr_adjust(t->mc, dst_reg, 29, -(i32)s->off); - } else if (av->storage.kind == OPK_INDIRECT) { - aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f, - av->storage.v.ind.ofs); - } else if (av->storage.kind == OPK_GLOBAL) { - emit_global_addr(t, dst_reg, av->storage.v.global.sym, - av->storage.v.global.addend); - } else { - compiler_panic(t->c, a->loc, - "aarch64 call: INDIRECT arg storage kind %d unsupported", 
- (int)av->storage.kind); - } - if (to_stack) { - aa_store_stack_reg(t, dst_reg, RC_INT, av->type, 8, *stack_off, tail); - *stack_off += 8; - } - return; - } - - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - u32 sz = pt->size; - u32 sidx = size_idx_for_bytes(sz); - - if (pt->cls == ABI_CLASS_INT) { - int to_stack = (*next_int >= 8); - u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++; - switch (av->storage.kind) { - case OPK_IMM: { - u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm); - break; - } - case OPK_REG: { - u32 sf = (sz == 8) ? 1u : 0u; - if (av->storage.cls == RC_FP) - aa_move_fp_to_int_reg(t->mc, dst_reg, av->storage, sz); - else - aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage))); - break; - } - case OPK_LOCAL: { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - aa64_emit_ldur_off(t->mc, sidx, dst_reg, 29, off, dst_reg); - break; - } - case OPK_INDIRECT: { - aa_assert_no_index(t, av->storage, "call INT arg storage"); - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.index = REG_NONE; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - AAAddrMode m = addr_mode(t, src, AA_TMP0); - aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, m.base, m.ofs)); - break; - } - default: - compiler_panic(t->c, a->loc, - "aarch64 call: arg storage kind %d unsupported", - (int)av->storage.kind); - } - if (to_stack) { - aa_store_stack_reg(t, dst_reg, RC_INT, av->type, 8, *stack_off, tail); - *stack_off += 8; - } - } else if (pt->cls == ABI_CLASS_FP) { - int to_stack = (*next_fp >= 8); - if (!to_stack) { - u32 dst_reg = (*next_fp)++; - switch (av->storage.kind) { - case OPK_REG: { - if (sz == 16) - aa64_emit32(t->mc, aa64_mov_v16b(dst_reg, reg_num(av->storage))); - else { - u32 
type = (sz == 8) ? 1u : 0u; - aa64_emit32(t->mc, - aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); - } - break; - } - case OPK_LOCAL: { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) - compiler_panic(t->c, a->loc, "aarch64 call: bad FP arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - aa_emit_ldr_fp_any(t->mc, sidx, dst_reg, 29, off); - break; - } - case OPK_INDIRECT: { - aa_assert_no_index(t, av->storage, "call FP arg storage"); - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.index = REG_NONE; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - AAAddrMode m = addr_mode(t, src, AA_TMP0); - aa_emit_ldr_fp_any(t->mc, sidx, dst_reg, m.base, m.ofs); - break; - } - default: - compiler_panic(t->c, a->loc, - "aarch64 call: FP arg storage kind %d unsupported", - (int)av->storage.kind); - } - } else { - switch (av->storage.kind) { - case OPK_REG: - aa_store_stack_reg(t, reg_num(av->storage), RC_FP, av->type, sz, - *stack_off, tail); - break; - case OPK_LOCAL: { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) - compiler_panic(t->c, a->loc, "aarch64 call: bad FP arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - aa_emit_ldr_fp_any(t->mc, sidx, AA_FP_TMP0, 29, off); - aa_store_stack_reg(t, AA_FP_TMP0, RC_FP, av->type, sz, *stack_off, - tail); - break; - } - case OPK_INDIRECT: { - aa_assert_no_index(t, av->storage, "call FP stack-arg storage"); - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.index = REG_NONE; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - AAAddrMode m = addr_mode(t, src, AA_TMP0); - aa_emit_ldr_fp_any(t->mc, sidx, AA_FP_TMP0, m.base, m.ofs); - aa_store_stack_reg(t, AA_FP_TMP0, RC_FP, av->type, sz, *stack_off, - tail); - break; - } - default: - compiler_panic( - t->c, a->loc, - "aarch64 call: FP stack-arg 
storage kind %d unsupported", - (int)av->storage.kind); - } - *stack_off += sz > 8 ? sz : 8; - } - } else { - compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl", - (int)pt->cls); - } - } -} - -static void count_arg_stack(CGTarget* t, const ABIFuncInfo* fi, - const CGABIValue* av, u32* next_int, u32* next_fp, - u32* stack_off) { - ABIArgInfo va_ai; - ABIArgPart va_pt; - const ABIArgInfo* ai = av->abi; - if (!ai) { - u32 sz = type_byte_size(av->type); - memset(&va_ai, 0, sizeof va_ai); - memset(&va_pt, 0, sizeof va_pt); - va_ai.kind = ABI_ARG_DIRECT; - va_ai.parts = &va_pt; - va_ai.nparts = 1; - va_pt.cls = - aa_windows_fp_vararg(t, av) - ? ABI_CLASS_INT - : ((av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT); - va_pt.size = sz; - va_pt.align = sz; - va_pt.src_offset = 0; - ai = &va_ai; - if (fi && fi->vararg_on_stack) { - *next_int = 8; - *next_fp = 8; - } - } - if (ai->kind == ABI_ARG_IGNORE) return; - if (ai->kind == ABI_ARG_INDIRECT) { - if (*next_int < 8) - ++*next_int; - else - *stack_off += 8; - return; - } - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - if (pt->cls == ABI_CLASS_INT) { - if (*next_int < 8) - ++*next_int; - else - *stack_off += 8; - } else if (pt->cls == ABI_CLASS_FP) { - if (*next_fp < 8) - ++*next_fp; - else - *stack_off += pt->size > 8 ? pt->size : 8; - } - } -} - -static u32 aa_call_stack_size(CGTarget* t, const CGCallDesc* d) { - (void)t; - u32 next_int = 0, next_fp = 0, stack_off = 0; - for (u32 i = 0; i < d->nargs; ++i) - count_arg_stack(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off); - return (stack_off + 15u) & ~15u; -} - -/* Realizability of a sibling call (see CGTarget.tail_call_unrealizable_reason). - * The callee's outgoing stack arguments must fit the area this function itself - * received (next_param_stack); the tail prologue restore reuses those slots. 
- * Variadic callees need no special handling — their arguments are placed by - * the ordinary register/stack rules and the same fit check covers them. sret - * callees are realizable too: aa_call forwards this function's own incoming - * sret pointer (the return-shape precondition guarantees it matches). */ -static const char* aa_tail_call_unrealizable_reason(CGTarget* t, - const CGCallDesc* d) { - AAImpl* a = impl_of(t); - u32 next_int = 0, next_fp = 0, stack_off = 0; - for (u32 i = 0; i < d->nargs; ++i) - count_arg_stack(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off); - if (stack_off > a->next_param_stack) - return "tail call stack arguments exceed the caller's parameter area"; - return NULL; -} - -static u32 aa_collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) { - u32 n = 0; - for (u32 r = first; r <= last; ++r) { - if (mask & (1u << r)) out[n++] = r; - } - return n; -} - -static void aa_tail_restore_frame(CGTarget* t) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 int_regs[10]; - u32 fp_regs[8]; - u32 n_int_saves = - aa_collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs); - u32 n_fp_saves = aa_collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs); - u32 int_save_off = a->max_outgoing; - u32 fp_save_off = int_save_off + n_int_saves * 8u; - u32 locals_off = fp_save_off + n_fp_saves * 8u; - u32 fp_lr_off = locals_off + a->cum_off; - u32 frame_size = (fp_lr_off + 16u + 15u) & ~15u; - fp_lr_off = frame_size - 16u; - - if (a->omit_frame) return; - if (a->has_alloca) { - if (fp_lr_off <= 0xfff) { - aa64_emit32(mc, aa64_sub_imm(1, 31, 29, fp_lr_off, 0)); - } else { - compiler_panic(t->c, a->loc, "aarch64 tail call: fp/lr offset too large"); - } - } - for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { - aa64_emit32(mc, - aa64_ldr_fp_uimm(3, fp_regs[i], 31, fp_save_off + (u32)i * 8u)); - } - for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { - aa64_emit32(mc, - aa64_ldr_uimm(3, int_regs[i], 31, int_save_off + (u32)i * 8u)); - } - if 
(fp_lr_off <= 504u) { - aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); - } else { - aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off)); - aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u)); - } - emit_sp_add(mc, frame_size); -} - -static void aa_tail_branch(CGTarget* t, Operand callee) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - if (callee.kind == OPK_REG) { - if (reg_num(callee) != AA_TMP0) - aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(callee))); - aa_tail_restore_frame(t); - aa64_emit32(mc, aa64_br(AA_TMP0)); - } else if (callee.kind == OPK_GLOBAL) { - aa_tail_restore_frame(t); - u32 b_pos = mc->pos(mc); - aa64_emit32(mc, aa64_b_base()); - mc->emit_reloc_at(mc, mc->section_id, b_pos, R_AARCH64_JUMP26, - callee.v.global.sym, callee.v.global.addend, 0, 0); - } else { - compiler_panic(t->c, a->loc, - "aarch64 tail call: callee kind %d unsupported", - (int)callee.kind); - } -} - -static void aa_call(CGTarget* t, const CGCallDesc* d) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - u32 next_int = 0, next_fp = 0, stack_off = 0; - - /* Ordinary sret call: point x8 at the caller-provided destination local. - * A tail call instead forwards this function's own incoming sret pointer - * (handled below), so skip this here. 
*/ - if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { - if (d->ret.storage.kind != OPK_LOCAL) { - compiler_panic(t->c, a->loc, - "aarch64 call: sret destination must be LOCAL"); - } - AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot"); - aa64_emit_addr_adjust(mc, 8, 29, -(i32)s->off); - } - - for (u32 i = 0; i < d->nargs; ++i) { - emit_arg_value(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off, - (d->flags & CG_CALL_TAIL) != 0); - } - - u32 needed = (stack_off + 15u) & ~15u; - if ((d->flags & CG_CALL_TAIL) == 0 && needed > a->max_outgoing) { - if (a->known_frame) { - compiler_panic(t->c, a->loc, - "aarch64 call: known frame outgoing area too small"); - } - a->max_outgoing = needed; - } - - if (d->flags & CG_CALL_TAIL) { - if (d->abi && d->abi->has_sret) { - /* Forward this function's own incoming sret pointer (spilled to - * sret_ptr_slot at entry) into x8 for the callee. The return-shape - * precondition guarantees the callee writes the same type, so the - * forwarded pointer is correct. Load while x29 still addresses this - * frame, before aa_tail_branch tears it down; x8 is untouched by the - * frame restore and any indirect-callee move (AA_TMP0 = x9). */ - AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE) - ? 
aa64_slot_get(a, a->sret_ptr_slot) - : NULL; - if (!s) - compiler_panic(t->c, a->loc, - "aarch64 tail call: missing incoming sret slot"); - aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off)); - } - aa_check_tail_stack_args(t, stack_off); - aa_tail_branch(t, d->callee); - return; - } - - if (d->callee.kind == OPK_GLOBAL) { - u32 bl_pos = mc->pos(mc); - aa64_emit32(mc, aa64_bl_base()); - mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26, - d->callee.v.global.sym, d->callee.v.global.addend, 0, 0); - } else if (d->callee.kind == OPK_REG) { - aa64_emit32(mc, aa64_blr(reg_num(d->callee))); - } else { - compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported", - (int)d->callee.kind); - } - - const ABIArgInfo* ri = &d->abi->ret; - if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) { - return; - } - if (ri->nparts == 0) return; - - Operand rs = d->ret.storage; - u32 next_int_ret = 0, next_fp_ret = 0; - for (u16 i = 0; i < ri->nparts; ++i) { - const ABIArgPart* p = &ri->parts[i]; - u32 src_reg; - if (p->cls == ABI_CLASS_INT) { - src_reg = next_int_ret++; - } else if (p->cls == ABI_CLASS_FP) { - src_reg = next_fp_ret++; - } else { - compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl", - (int)p->cls); - } - - if (rs.kind == OPK_REG) { - if (ri->nparts != 1) { - compiler_panic(t->c, a->loc, - "aarch64 call: REG ret_storage with %u parts", - (unsigned)ri->nparts); - } - if (p->cls == ABI_CLASS_INT) { - u32 sf = (p->size == 8) ? 1u : 0u; - aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg)); - } else { - if (p->size == 16) - aa64_emit32(mc, aa64_mov_v16b(reg_num(rs), src_reg)); - else { - u32 type = (p->size == 8) ? 
1u : 0u; - aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg)); - } - } - } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (rs.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, rs.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); - base_reg = 29; - base_off = -(i32)s->off; - } else { - base_reg = rs.v.ind.base & 0x1f; - base_off = rs.v.ind.ofs; - } - u32 sidx = size_idx_for_bytes(p->size); - i32 off = base_off + (i32)p->src_offset; - if (p->cls == ABI_CLASS_INT) { - aa64_emit_stur_off(mc, sidx, src_reg, base_reg, off, AA_TMP0); - } else { - aa_emit_str_fp_any(mc, sidx, src_reg, base_reg, off); - } - } else if (rs.kind == OPK_IMM && - rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) { - /* void return placeholder */ - } else { - compiler_panic(t->c, a->loc, - "aarch64 call: ret_storage kind %d unsupported", - (int)rs.kind); - } - } -} - -static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (p->flags & CG_CALL_TAIL) { - if (p->has_sret) { - /* Forward the function's own incoming sret pointer into x8 (see the - * O0 path in aa_call). Load before aa_tail_branch tears the frame - * down; x8 survives the restore and any indirect-callee move. */ - AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE) - ? 
aa64_slot_get(a, a->sret_ptr_slot) - : NULL; - if (!s) - compiler_panic(t->c, a->loc, - "aarch64 tail call: missing incoming sret slot"); - aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off)); - } - aa_check_tail_stack_args(t, aa_call_plan_stack_raw_size(p)); - aa_tail_branch(t, p->callee); - return; - } - - { - u32 needed = (aa_call_plan_stack_raw_size(p) + 15u) & ~15u; - if (needed > a->max_outgoing) { - if (a->known_frame) - compiler_panic( - t->c, a->loc, - "aarch64 call plan: known frame outgoing area too small"); - a->max_outgoing = needed; - } - } - - if (p->callee.kind == OPK_GLOBAL) { - u32 bl_pos = mc->pos(mc); - aa64_emit32(mc, aa64_bl_base()); - mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26, - p->callee.v.global.sym, p->callee.v.global.addend, 0, 0); - } else if (p->callee.kind == OPK_REG) { - aa64_emit32(mc, aa64_blr(reg_num(p->callee))); - } else { - compiler_panic(t->c, a->loc, - "aarch64 emit_call_plan: callee kind %d unsupported", - (int)p->callee.kind); - } -} - -static Operand aa_call_plan_offset_operand(CGTarget* t, Operand op, - u32 offset) { - if (!offset) return op; - if (op.kind == OPK_INDIRECT) { - aa_assert_no_index(t, op, "call plan offset operand"); - op.v.ind.ofs += (i32)offset; - } else if (op.kind == OPK_LOCAL) { - AAImpl* a = impl_of(t); - AASlot* s = aa64_slot_get(a, op.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call plan: bad slot"); - op.kind = OPK_INDIRECT; - op.v.ind.base = 29; - op.v.ind.index = REG_NONE; - op.v.ind.log2_scale = 0; - op.v.ind.ofs = -(i32)s->off + (i32)offset; - } - return op; -} - -static void aa_load_call_arg(CGTarget* t, Operand dst, - const CGCallPlanMove* m) { - Operand src = aa_call_plan_offset_operand(t, m->src, m->src_offset); - if (m->src_kind == CG_CALL_PLAN_SRC_ADDR) { - aa_addr_of(t, dst, src); - return; - } - if (src.kind == OPK_GLOBAL) { - aa_addr_of(t, dst, src); - return; - } - aa_load(t, dst, src, m->mem); -} - -static void aa_store_call_ret(CGTarget* t, const 
CGCallPlanRet* r, - Operand src) { - Operand dst = aa_call_plan_offset_operand(t, r->dst, r->dst_offset); - aa_store(t, dst, src, r->mem); -} - -static void aa_store_call_arg(CGTarget* t, const CGCallPlanMove* m) { - Operand addr; - addr = aa_call_stack_arg_addr(t, m->stack_offset, - m->dst_kind == CG_CALL_PLAN_TAIL_STACK); - addr.type = m->mem.type; - - if (m->src_kind == CG_CALL_PLAN_SRC_ADDR) { - Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type}; - tmp.v.reg = AA_TMP0; - aa_load_call_arg(t, tmp, m); - aa_store(t, addr, tmp, m->mem); - return; - } - - if (m->src.kind == OPK_REG || m->src.kind == OPK_IMM) { - aa_store(t, addr, m->src, m->mem); - return; - } - if (m->src.kind == OPK_GLOBAL) { - Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type}; - tmp.v.reg = AA_TMP0; - aa_load_call_arg(t, tmp, m); - aa_store(t, addr, tmp, m->mem); - return; - } - if (m->src.kind == OPK_LOCAL || m->src.kind == OPK_INDIRECT) { - Operand tmp = {.kind = OPK_REG, .cls = m->cls, .type = m->mem.type}; - tmp.v.reg = m->cls == RC_FP ? 
AA_FP_TMP0 : AA_TMP0; - aa_load_call_arg(t, tmp, m); - aa_store(t, addr, tmp, m->mem); - return; - } - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 store_call_arg: source kind %d unsupported", - (int)m->src.kind); -} - -static void aa_ret(CGTarget* t, const CGABIValue* val) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (val) { - const ABIArgInfo* ri = val->abi; - if (ri && ri->kind == ABI_ARG_INDIRECT) { - if (val->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot"); - if (a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); - if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); - } - u32 nbytes = s->size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit_ldur_off(mc, 3, AA_TMP0, 29, -(i32)s->off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit_ldur_off(mc, 2, AA_TMP0, 29, -(i32)s->off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit_ldur_off(mc, 1, AA_TMP0, 29, -(i32)s->off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); - i += 2; - } - while (i < nbytes) { - aa64_emit_ldur_off(mc, 0, AA_TMP0, 29, -(i32)s->off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); - i += 1; - } - } else if (val->storage.kind == OPK_INDIRECT) { - u32 nbytes = val->size; - if (!nbytes) { - compiler_panic(t->c, a->loc, - "aarch64 ret indirect: missing aggregate size"); - } - if (a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); - if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); - } - u32 base_reg = val->storage.v.ind.base & 0x1f; - i32 base_off = val->storage.v.ind.ofs; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit_ldur_off(mc, 3, AA_TMP0, base_reg, base_off 
+ (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit_ldur_off(mc, 2, AA_TMP0, base_reg, base_off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit_ldur_off(mc, 1, AA_TMP0, base_reg, base_off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); - i += 2; - } - while (i < nbytes) { - aa64_emit_ldur_off(mc, 0, AA_TMP0, base_reg, base_off + (i32)i, - AA_TMP0); - aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); - i += 1; - } - } else { - compiler_panic(t->c, a->loc, - "aarch64 ret indirect: storage kind %d unsupported", - (int)val->storage.kind); - } - } else if (val->storage.kind == OPK_REG) { - if (val->storage.cls == RC_FP) { - if (type_byte_size(val->storage.type) == 16) { - if (reg_num(val->storage) != 0) - aa64_emit32(mc, aa64_mov_v16b(/*Rd=*/0, reg_num(val->storage))); - } else { - u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u; - if (reg_num(val->storage) != 0) - aa64_emit32(mc, - aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage))); - } - } else { - u32 sf = type_is_64(val->storage.type) ? 1u : 0u; - if (reg_num(val->storage) != 0) - aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage))); - } - } else if (val->storage.kind == OPK_IMM) { - u32 sf = type_is_64(val->storage.type) ? 1u : 0u; - aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm); - } else if (val->storage.kind == OPK_LOCAL || - val->storage.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (val->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot"); - base_reg = 29; - base_off = -(i32)s->off; - } else { - base_reg = val->storage.v.ind.base & 0x1f; - base_off = val->storage.v.ind.ofs; - } - const ABIArgInfo* ri2 = val->abi; - u16 nparts = ri2 ? 
ri2->nparts : 0; - /* INT parts load into x0..x{n-1}. If the base address sits in one of - * those registers, loading that part clobbers the base before later - * parts are read (e.g. `ldur x0,[x0]; ldur w1,[x0,#8]`). Park the base - * in a scratch (x10, never a return reg) when an earlier INT part would - * overwrite it. FP parts target v-regs and never alias the int base. */ - u32 load_base = base_reg; - for (u16 i = 0; i + 1u < nparts; ++i) { - if (ri2->parts[i].cls == ABI_CLASS_INT && (u32)i == base_reg) { - aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, AA_TMP1, base_reg)); - load_base = AA_TMP1; - break; - } - } - for (u16 i = 0; i < nparts; ++i) { - const ABIArgPart* pt = &ri2->parts[i]; - u32 sidx = size_idx_for_bytes(pt->size); - i32 off = base_off + (i32)pt->src_offset; - if (pt->cls == ABI_CLASS_INT) { - aa64_emit_ldur_off(mc, sidx, /*Rt=*/i, load_base, off, AA_TMP0); - } else if (pt->cls == ABI_CLASS_FP) { - aa_emit_ldr_fp_any(mc, sidx, /*Rt=*/i, load_base, off); - } else { - compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl", - (int)pt->cls); - } - } - } - } - if (a->omit_frame) { - aa64_emit32(mc, aa64_ret(AA64_LR)); - return; - } - u32 bpos = mc->pos(mc); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0); - (void)bpos; -} - -/* ============================================================ - * alloca - * ============================================================ */ - -static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (d.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG"); - } - if (align > 16) { - compiler_panic(t->c, a->loc, - "aarch64 alloca: align %u > 16 not yet supported", align); - } - - if (sz.kind == OPK_IMM) { - i64 v = sz.v.imm; - if (v < 0) { - compiler_panic(t->c, a->loc, "aarch64 alloca: negative size"); - } - u64 aligned = ((u64)v + 15u) & ~(u64)15u; - if (aligned == 
0) aligned = 16; - if (aligned > 0xfffu) { - compiler_panic(t->c, a->loc, - "aarch64 alloca: const size %llu too large for v1", - (unsigned long long)aligned); - } - aa64_emit32(mc, - aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0)); - } else if (sz.kind == OPK_REG) { - u32 sz_reg = reg_num(sz); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0)); - aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63)); - aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59)); - aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0)); - } else { - compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported", - (int)sz.kind); - } - - if (a->nadd_patches == a->add_patches_cap) { - u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4; - struct AAAllocaPatch* nb = - arena_array(t->c->tu, struct AAAllocaPatch, ncap); - if (a->add_patches) - memcpy(nb, a->add_patches, sizeof(*nb) * a->nadd_patches); - a->add_patches = nb; - a->add_patches_cap = ncap; - } - u32 dst_reg = reg_num(d); - a->add_patches[a->nadd_patches].pos = mc->pos(mc); - a->add_patches[a->nadd_patches].dst_reg = dst_reg; - a->nadd_patches++; - aa64_emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/ 31, 0, 0)); - a->has_alloca = 1; -} - -/* ============================================================ - * Varargs - * ============================================================ */ - -static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) { - if (ofs == 0) - aa64_emit32(mc, aa64_mov_reg(1, dst, 29)); - else if (ofs > 0 && (u32)ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0)); - else if (ofs < 0 && (u32)(-ofs) <= 0xfff) - aa64_emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0)); - else { - aa64_emit_load_imm(mc, 1, dst, ofs); - aa64_emit32(mc, aa64_add(1, dst, 29, dst)); - } -} - -static void aa_va_start_(CGTarget* t, Operand ap_op) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - if (!a->is_variadic) { - compiler_panic(t->c, a->loc, "aarch64 
va_start: function not variadic"); - } - u32 ap = reg_num(ap_op); - if (t->c->target.os == CFREE_OS_MACOS) { - u32 ofs = 16u + a->next_param_stack; - if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); - else { - aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); - aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); - } - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); - return; - } - if (t->c->target.os == CFREE_OS_WINDOWS) { - if (a->next_param_int < 8) { - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)(a->next_param_int * 8u)); - } else { - u32 ofs = 16u + a->next_param_stack; - if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); - else { - aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); - aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); - } - } - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); - return; - } - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - AASlot* fs = aa64_slot_get(a, a->fp_save_slot); - - { - u32 ofs = 16u + a->next_param_stack; - if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); - else { - aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); - aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); - } - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); - } - emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8)); - emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16)); - aa64_emit_load_imm(mc, 0, AA_TMP0, (i64)((i32)(a->next_param_int * 8u) - 64)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24)); - aa64_emit_load_imm(mc, 0, AA_TMP0, - (i64)((i32)(a->next_param_fp * 16u) - 128)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28)); -} - -static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, - CfreeCgTypeId ty) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 ap = reg_num(ap_op); - int is_fp = 
(dst.cls == RC_FP); - u32 offs_field = is_fp ? 28u : 24u; - u32 top_field = is_fp ? 16u : 8u; - u32 stride_reg = is_fp ? 16u : 8u; - u32 sz = type_byte_size(ty); - u32 sidx = size_idx_for_bytes(sz); - - if (t->c->target.os == CFREE_OS_MACOS) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); - if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); - aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); - return; - } - if (t->c->target.os == CFREE_OS_WINDOWS) { - MCLabel L_store = mc->label_new(mc); - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); - if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); - - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - if (gs) { - emit_fp_off(mc, AA_TMP2, -(i32)gs->off + 64); - aa64_emit32(mc, aa64_subs_reg(1, 31u, AA_TMP1, AA_TMP2)); - aa64_emit32(mc, aa64_b_cond(0x1 /*NE*/)); - mc->emit_label_ref(mc, L_store, R_AARCH64_CONDBR19, 4, 0); - u32 ofs = 16u + a->next_param_stack; - if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, 29, ofs, 0)); - else { - aa64_emit_load_imm(mc, 1, AA_TMP1, (i64)ofs); - aa64_emit32(mc, aa64_add(1, AA_TMP1, 29, AA_TMP1)); - } - } - mc->label_place(mc, L_store); - aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); - return; - } - - MCLabel L_stack = mc->label_new(mc); - MCLabel L_done = mc->label_new(mc); - - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field)); - aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0)); - aa64_emit32(mc, aa64_b_cond(0xa /*GE*/)); - mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0); - - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field)); - aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31)); - aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2)); - if 
(is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0)); - aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0)); - aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field)); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); - - mc->label_place(mc, L_stack); - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); - if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); - aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); - - mc->label_place(mc, L_done); -} - -static void aa_va_end_(CGTarget* t, Operand a) { - (void)t; - (void)a; -} - -static void aa_va_copy_(CGTarget* t, Operand d, Operand s) { - MCEmitter* mc = t->mc; - u32 dr = reg_num(d); - u32 sr = reg_num(s); - if (t->c->target.os == CFREE_OS_MACOS) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, 0)); - aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, 0)); - return; - } - if (t->c->target.os == CFREE_OS_WINDOWS) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, 0)); - aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, 0)); - return; - } - for (u32 i = 0; i < 32u; i += 8u) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i)); - } -} - -/* ============================================================ - * Atomics - * ============================================================ */ - -static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 
0xC85F7C00u : 0x885F7C00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { - return (sf64 ? 0xC8007C00u : 0x88007C00u) | ((Rs & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { - return (sf64 ? 0xC800FC00u : 0x8800FC00u) | ((Rs & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_cbnz(u32 sf64, u32 Rt) { - return 0x35000000u | (sf64 << 31) | (Rt & 0x1f); -} - -static int mem_order_is_acquire(MemOrder o) { - return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST || - o == MO_CONSUME; -} -static int mem_order_is_release(MemOrder o) { - return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST; -} - -static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma, - MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 
1u : 0u; - - aa_assert_no_index(t, addr, "atomic_load"); - u32 base; - if (addr.kind == OPK_REG) { - base = reg_num(addr); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot"); - base = AA_TMP0; - aa64_emit_addr_adjust(mc, base, 29, -(i32)s->off); - } else if (addr.kind == OPK_INDIRECT) { - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (m.ofs != 0) { - aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs); - base = AA_TMP0; - } else { - base = m.base; - } - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_load: addr kind %d unsupported", - (int)addr.kind); - } - if (mem_order_is_acquire(ord)) { - aa64_emit32(mc, aa64_ldar(sf, reg_num(dst), base)); - } else { - u32 sidx = size_idx_for_bytes(ma.size); - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0)); - } -} - -static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, - MemAccess ma, MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 
1u : 0u; - - u32 src_reg; - if (src.kind == OPK_IMM) { - src_reg = AA_TMP1; - aa64_emit_load_imm(mc, sf, src_reg, src.v.imm); - } else if (src.kind == OPK_REG) { - src_reg = reg_num(src); - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_store: src kind %d unsupported", - (int)src.kind); - } - aa_assert_no_index(t, addr, "atomic_store"); - u32 base; - if (addr.kind == OPK_REG) { - base = reg_num(addr); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot"); - base = AA_TMP0; - aa64_emit_addr_adjust(mc, base, 29, -(i32)s->off); - } else if (addr.kind == OPK_INDIRECT) { - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (m.ofs != 0) { - aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs); - base = AA_TMP0; - } else { - base = m.base; - } - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_store: addr kind %d unsupported", - (int)addr.kind); - } - if (mem_order_is_release(ord)) { - aa64_emit32(mc, aa64_stlr(sf, src_reg, base)); - } else { - u32 sidx = size_idx_for_bytes(ma.size); - aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0)); - } -} - -static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new, - u32 prior, u32 val) { - switch (op) { - case AO_XCHG: - aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); - break; - case AO_ADD: - aa64_emit32(mc, aa64_add(sf, dst_new, prior, val)); - break; - case AO_SUB: - aa64_emit32(mc, aa64_sub(sf, dst_new, prior, val)); - break; - case AO_AND: - aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); - break; - case AO_OR: - aa64_emit32(mc, aa64_orr(sf, dst_new, prior, val)); - break; - case AO_XOR: - aa64_emit32(mc, aa64_eor(sf, dst_new, prior, val)); - break; - case AO_NAND: - aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); - aa64_emit32(mc, aa64_mvn(sf, dst_new, dst_new)); - break; - default: - aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); - break; - } -} - -static void 
aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, - Operand val, MemAccess ma, MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 1u : 0u; - - aa_assert_no_index(t, addr, "atomic_rmw"); - u32 base = AA_TMP0; - if (addr.kind == OPK_REG) { - aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot"); - aa64_emit_addr_adjust(mc, AA_TMP0, 29, -(i32)s->off); - } else if (addr.kind == OPK_INDIRECT) { - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (m.base != AA_TMP0 || m.ofs != 0) - aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported", - (int)addr.kind); - } - u32 vreg = AA_TMP1; - if (val.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, vreg, val.v.imm); - } else if (val.kind == OPK_REG) { - aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val))); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported", - (int)val.kind); - } - - int do_acq = mem_order_is_acquire(ord); - int do_rel = mem_order_is_release(ord); - - MCLabel L_retry = mc->label_new(mc); - mc->label_place(mc, L_retry); - - if (do_acq) - aa64_emit32(mc, aa64_ldaxr(sf, reg_num(dst), base)); - else - aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base)); - - emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg); - - if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base)); - else - aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base)); - - u32 cbnz_pos = mc->pos(mc); - aa64_emit32(mc, aa64_cbnz(0, vreg)); - mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); - (void)cbnz_pos; -} - -static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, - Operand expected, Operand desired, MemAccess ma, - MemOrder succ, MemOrder fail) { - AAImpl* a = 
impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 1u : 0u; - (void)fail; - - aa_assert_no_index(t, addr, "atomic_cas"); - u32 base = AA_TMP0; - if (addr.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); - else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot"); - aa64_emit_addr_adjust(mc, AA_TMP0, 29, -(i32)s->off); - } else if (addr.kind == OPK_INDIRECT) { - AAAddrMode m = addr_mode(t, addr, AA_TMP0); - if (m.base != AA_TMP0 || m.ofs != 0) - aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported", - (int)addr.kind); - } - if (expected.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm); - else if (expected.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected))); - else - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported", - (int)expected.kind); - if (desired.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm); - else if (desired.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired))); - else - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported", - (int)desired.kind); - - int do_acq = mem_order_is_acquire(succ); - int do_rel = mem_order_is_release(succ); - - MCLabel L_retry = mc->label_new(mc); - MCLabel L_fail = mc->label_new(mc); - MCLabel L_done = mc->label_new(mc); - - mc->label_place(mc, L_retry); - if (do_acq) - aa64_emit32(mc, aa64_ldaxr(sf, reg_num(prior), base)); - else - aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base)); - - aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1)); - aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/)); - mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0); - - if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base)); - else - 
aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base)); - aa64_emit32(mc, aa64_cbnz(0, AA_TMP1)); - mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); - - aa64_emit_load_imm(mc, 0, reg_num(ok), 1); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); - - mc->label_place(mc, L_fail); - aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY)); - aa64_emit_load_imm(mc, 0, reg_num(ok), 0); - - mc->label_place(mc, L_done); -} - -static void aa_fence(CGTarget* t, MemOrder o) { - (void)o; - if (o == MO_RELAXED) return; - aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); -} - -/* ============================================================ - * Intrinsics - * ============================================================ */ - -static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) { - return 0x5AC00400u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rev_w(u32 Rd, u32 Rn) { - return 0x5AC00800u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rev_x(u32 Rd, u32 Rn) { - return 0xDAC00C00u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) { - return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) { - return (sf64 ? 
0xDAC01000u : 0x5AC01000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) { - return 0x0E205800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); -} -static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) { - return 0x0E31B800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); -} -static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { - return 0x2B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) { - return aa64_dp3_pack((AA64DP3){ - .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd}); -} -static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) { - return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); -} -static inline u32 aa64_umaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) { - return aa64_dp3_pack((AA64DP3){ - .sf = 1, .op31 = 5, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd}); -} -static inline u32 aa64_umull(u32 Rd, u32 Rn, u32 Rm) { - return aa64_umaddl(Rd, Rn, Rm, AA64_ZR); -} -static inline u32 aa64_smulh(u32 Rd, u32 Rn, u32 Rm) { - return 0x9B407C00u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_umulh(u32 Rd, u32 Rn, u32 Rm) { - return 0x9BC07C00u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) { - return 0xEB200000u | ((Rm & 0x1f) << 16) | (6u << 13) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, - const Operand* args, u32 na) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - (void)nd; - - switch (kind) { - case INTRIN_POPCOUNT: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sz_in = type_byte_size(src.type); - if (sz_in == 8) - aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src))); - else - aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src))); - aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0)); - aa64_emit32(mc, 
aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0)); - aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0)); - return; - } - case INTRIN_CLZ: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src))); - return; - } - case INTRIN_CTZ: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src))); - aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst))); - return; - } - case INTRIN_BSWAP16: { - aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_BSWAP32: { - aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_BSWAP64: { - aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_MEMCPY: - case INTRIN_MEMMOVE: { - Operand da = args[0], sa = args[1], nb = args[2]; - if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) { - compiler_panic( - t->c, a->loc, - "aarch64 intrinsic: %.*s with non-const n or non-REG ptr", - SLICE_ARG( - slice_from_cstr(kind == INTRIN_MEMCPY ? 
"memcpy" : "memmove"))); - } - u32 dr = reg_num(da); - u32 sr = reg_num(sa); - u32 n = (u32)nb.v.imm; - if (kind == INTRIN_MEMCPY) { - u32 i = 0; - while (i + 8 <= n) { - aa_emit_load_at(mc, 3, AA_TMP2, sr, i); - aa_emit_store_at(mc, 3, AA_TMP2, dr, i); - i += 8; - } - while (i + 4 <= n) { - aa_emit_load_at(mc, 2, AA_TMP2, sr, i); - aa_emit_store_at(mc, 2, AA_TMP2, dr, i); - i += 4; - } - while (i + 2 <= n) { - aa_emit_load_at(mc, 1, AA_TMP2, sr, i); - aa_emit_store_at(mc, 1, AA_TMP2, dr, i); - i += 2; - } - while (i < n) { - aa_emit_load_at(mc, 0, AA_TMP2, sr, i); - aa_emit_store_at(mc, 0, AA_TMP2, dr, i); - i += 1; - } - } else { - u32 i = n; - while (i >= 8) { - i -= 8; - aa_emit_load_at(mc, 3, AA_TMP2, sr, i); - aa_emit_store_at(mc, 3, AA_TMP2, dr, i); - } - while (i >= 4) { - i -= 4; - aa_emit_load_at(mc, 2, AA_TMP2, sr, i); - aa_emit_store_at(mc, 2, AA_TMP2, dr, i); - } - while (i >= 2) { - i -= 2; - aa_emit_load_at(mc, 1, AA_TMP2, sr, i); - aa_emit_store_at(mc, 1, AA_TMP2, dr, i); - } - while (i >= 1) { - i -= 1; - aa_emit_load_at(mc, 0, AA_TMP2, sr, i); - aa_emit_store_at(mc, 0, AA_TMP2, dr, i); - } - } - return; - } - case INTRIN_MEMSET: { - Operand da = args[0], bv = args[1], nb = args[2]; - if (da.kind != OPK_REG || nb.kind != OPK_IMM) { - compiler_panic( - t->c, a->loc, - "aarch64 intrinsic: memset with non-const n / non-REG ptr"); - } - u32 dr = reg_num(da); - u32 n = (u32)nb.v.imm; - u32 byte; - u32 src_reg; - if (bv.kind == OPK_IMM) { - byte = (u32)(bv.v.imm & 0xffu); - if (byte == 0) { - src_reg = 31u; - } else { - u64 b64 = byte; - b64 |= b64 << 8; - b64 |= b64 << 16; - b64 |= b64 << 32; - aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64); - src_reg = AA_TMP2; - } - } else if (bv.kind == OPK_REG) { - aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll); - aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR)); - src_reg = AA_TMP2; - } else { - compiler_panic(t->c, a->loc, - "aarch64 intrinsic: memset byte kind %d unsupported", - 
(int)bv.kind); - } - u32 i = 0; - while (i + 8 <= n) { - aa_emit_store_at(mc, 3, src_reg, dr, i); - i += 8; - } - while (i + 4 <= n) { - aa_emit_store_at(mc, 2, src_reg, dr, i); - i += 4; - } - while (i + 2 <= n) { - aa_emit_store_at(mc, 1, src_reg, dr, i); - i += 2; - } - while (i < n) { - aa_emit_store_at(mc, 0, src_reg, dr, i); - i += 1; - } - return; - } - case INTRIN_PREFETCH: - (void)args; - (void)na; - return; - case INTRIN_ASSUME_ALIGNED: { - Operand src = args[0]; - Operand dst = dsts[0]; - if (reg_num(src) != reg_num(dst)) { - aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src))); - } - return; - } - case INTRIN_EXPECT: { - Operand val = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(dst.type) ? 1u : 0u; - if (val.kind == OPK_REG) { - if (reg_num(val) != reg_num(dst)) { - aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val))); - } - } else if (val.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm); - } else { - compiler_panic(t->c, a->loc, - "aarch64 intrinsic: expect val kind %d unsupported", - (int)val.kind); - } - return; - } - case INTRIN_UNREACHABLE: - case INTRIN_TRAP: - aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u)); - return; - case INTRIN_SADD_OVERFLOW: - case INTRIN_UADD_OVERFLOW: - case INTRIN_SSUB_OVERFLOW: - case INTRIN_USUB_OVERFLOW: { - Operand a_op = args[0], b_op = args[1]; - Operand dval = dsts[0], dovf = dsts[1]; - u32 sf = type_is_64(dval.type) ? 1u : 0u; - u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rb = - aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); - u32 word = (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW) - ? aa64_adds_reg(sf, reg_num(dval), ra, rb) - : aa64_subs_reg(sf, reg_num(dval), ra, rb); - u32 cond = (kind == INTRIN_UADD_OVERFLOW) ? 0x2u /*CS*/ - : (kind == INTRIN_USUB_OVERFLOW) ? 
0x3u /*CC*/ - : 0x6u /*VS*/; - aa64_emit32(mc, word); - aa64_emit32(mc, aa64_cset(0, reg_num(dovf), cond)); - return; - } - case INTRIN_SMUL_OVERFLOW: { - Operand a_op = args[0], b_op = args[1]; - Operand dval = dsts[0], dovf = dsts[1]; - u32 sf = type_is_64(dval.type) ? 1u : 0u; - u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rb = - aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); - if (sf) { - aa64_emit32(mc, aa64_mul(1, reg_num(dval), ra, rb)); - aa64_emit32(mc, aa64_smulh(reg_num(dovf), ra, rb)); - aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, reg_num(dval), 63, 63)); - aa64_emit32(mc, aa64_subs_reg(1, 31u, reg_num(dovf), AA_TMP2)); - } else { - aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb)); - aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2)); - aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2)); - } - aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); - return; - } - case INTRIN_UMUL_OVERFLOW: { - Operand a_op = args[0], b_op = args[1]; - Operand dval = dsts[0], dovf = dsts[1]; - u32 sf = type_is_64(dval.type) ? 1u : 0u; - u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rb = - aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? 
AA_TMP1 : AA_TMP0); - if (sf) { - aa64_emit32(mc, aa64_mul(1, reg_num(dval), ra, rb)); - aa64_emit32(mc, aa64_umulh(reg_num(dovf), ra, rb)); - } else { - aa64_emit32(mc, aa64_umull(AA_TMP2, ra, rb)); - aa64_emit32(mc, aa64_ubfm(1, reg_num(dovf), AA_TMP2, 32, 63)); - aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2)); - } - aa64_emit32(mc, aa64_subs_imm(1, 31u, reg_num(dovf), 0)); - aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); - return; - } - default: - compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported", - (int)kind); - } -} - -/* ============================================================ - * Inline asm block - * ============================================================ */ - -static void aa_asm_block(CGTarget* t, const char* tmpl, - const AsmConstraint* outs, u32 no, Operand* oo, - const AsmConstraint* ins, u32 ni, const Operand* io, - const Sym* clobs, u32 nc) { - AAImpl* a_impl = impl_of(t); - for (u32 i = 0; i < nc; ++i) { - Reg phys; - RegClass cls; - if (t->resolve_reg_name(t, clobs[i], &phys, &cls) != 0) continue; - if (cls == RC_INT) { - if (phys >= 19u && phys <= 28u) a_impl->used_cs_int_mask |= 1u << phys; - } else if (cls == RC_FP) { - if (phys >= 8u && phys <= 15u) a_impl->used_cs_fp_mask |= 1u << phys; - } - } - AA64Asm* a = aa64_asm_open(t->c); - aa64_inline_bind(a, outs, no, oo, ins, ni, io, clobs, nc); - aa64_asm_run_template(a, t->mc, tmpl); - aa64_asm_close(a); -} - -/* ============================================================ - * Lifecycle / vtable constructor - * ============================================================ */ - -static void aa_set_loc(CGTarget* t, SrcLoc loc) { - impl_of(t)->loc = loc; - t->mc->set_loc(t->mc, loc); -} - -static void aa_finalize(CGTarget* t) { (void)t; } - -static void aa_destroy(CGTarget* t) { (void)t; } - -static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); } - -CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { - AAImpl* a = 
arena_new(c->tu, AAImpl); - memset(a, 0, sizeof *a); - - CGTarget* t = &a->base; - t->c = c; - t->obj = o; - t->mc = m; - - t->func_begin = aa_func_begin; - t->func_begin_known_frame = aa_func_begin_known_frame; - t->func_end = aa_func_end; - t->frame_slot = aa_frame_slot; - t->param = aa_param; - - t->load_imm = aa_load_imm; - t->load_const = aa_load_const; - t->copy = aa_copy; - t->load = aa_load; - t->store = aa_store; - t->addr_of = aa_addr_of; - t->tls_addr_of = aa_tls_addr_of; - t->copy_bytes = aa_copy_bytes; - t->set_bytes = aa_set_bytes; - t->bitfield_load = aa_bitfield_load; - t->bitfield_store = aa_bitfield_store; - - t->binop = aa_binop; - t->unop = aa_unop; - t->convert = aa_convert; - - t->call = aa_call; - t->load_call_arg = aa_load_call_arg; - t->emit_call_plan = aa_emit_call_plan; - t->store_call_arg = aa_store_call_arg; - t->store_call_ret = aa_store_call_ret; - t->call_stack_size = aa_call_stack_size; - t->tail_call_unrealizable_reason = aa_tail_call_unrealizable_reason; - t->ret = aa_ret; - - t->alloca_ = aa_alloca_; - t->va_start_ = aa_va_start_; - t->va_arg_ = aa_va_arg_; - t->va_end_ = aa_va_end_; - t->va_copy_ = aa_va_copy_; - - t->atomic_load = aa_atomic_load; - t->atomic_store = aa_atomic_store; - t->atomic_rmw = aa_atomic_rmw; - t->atomic_cas = aa_atomic_cas; - t->fence = aa_fence; - - t->intrinsic = aa_intrinsic; - t->asm_block = aa_asm_block; - - t->set_loc = aa_set_loc; - t->finalize = aa_finalize; - t->destroy = aa_destroy; - - /* alloc/label/scope vtable entries */ - aa_alloc_vtable_init(t); -#if CFREE_OPT_ENABLED - aa_coord_vtable_init(t); -#endif - - /* Suppress unused warning. */ - (void)type_is_signed; - - compiler_defer(c, cgt_cleanup, t); - return t; -} diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c @@ -1,373 +0,0 @@ -/* aarch64/opt_coord.c — opt/backend register coordination hooks. - * Static arrays so opt_machinize can query the backend instead of - * hard-coding arch knowledge. 
*/ - -#include "arch/aa64/internal.h" - -/* ============================================================ - * Static register tables reported to caller-owned allocators. */ - -static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23, 24, 25, 26, 27, 28}; -static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23}; - -static const Reg aa_int_scratch[] = {16, 17}; -static const Reg aa_fp_scratch[] = {24, 25}; - -static const CGPhysRegInfo aa_int_phys[] = { - {0, RC_INT, 0, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {1, RC_INT, 1, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {2, RC_INT, 2, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {3, RC_INT, 3, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {4, RC_INT, 4, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {5, RC_INT, 5, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {6, RC_INT, 6, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {7, RC_INT, 7, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, - {8, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, - 0}, - {12, RC_INT, 0xff, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0}, - {13, RC_INT, 0xff, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0}, - {14, RC_INT, 0xff, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0}, - {15, RC_INT, 0xff, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0}, - {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {20, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {21, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {22, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {23, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {24, RC_INT, 0xff, CG_REG_ALLOCABLE | 
CG_REG_CALLEE_SAVED, 50, 4}, - {25, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {26, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {27, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {28, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, -}; -static const CGPhysRegInfo aa_fp_phys[] = { - {0, RC_FP, 0, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {1, RC_FP, 1, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {2, RC_FP, 2, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {3, RC_FP, 3, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {4, RC_FP, 4, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {5, RC_FP, 5, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {6, RC_FP, 6, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {7, RC_FP, 7, - CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0}, - {8, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {9, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {10, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {11, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {12, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {13, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {14, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {15, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, - {16, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {17, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {18, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {19, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {20, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {21, RC_FP, 0xff, CG_REG_ALLOCABLE | 
CG_REG_CALLER_SAVED, 0, 0}, - {22, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {23, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {26, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {27, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {28, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {29, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, - {30, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, -}; - -/* ============================================================ - * Vtable methods */ - -static void aa_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out, - u32* nregs) { - (void)t; - switch (cls) { - case RC_INT: - *out = aa_int_allocable; - *nregs = sizeof aa_int_allocable / sizeof aa_int_allocable[0]; - break; - case RC_FP: - *out = aa_fp_allocable; - *nregs = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0]; - break; - default: - *out = NULL; - *nregs = 0; - break; - } -} - -static void aa_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out, - u32* nregs) { - (void)t; - switch (cls) { - case RC_INT: - *out = aa_int_scratch; - *nregs = sizeof aa_int_scratch / sizeof aa_int_scratch[0]; - break; - case RC_FP: - *out = aa_fp_scratch; - *nregs = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0]; - break; - default: - *out = NULL; - *nregs = 0; - break; - } -} - -static void aa_get_phys_regs(CGTarget* t, RegClass cls, - const CGPhysRegInfo** out, u32* nregs) { - (void)t; - switch (cls) { - case RC_INT: - *out = aa_int_phys; - *nregs = sizeof aa_int_phys / sizeof aa_int_phys[0]; - break; - case RC_FP: - *out = aa_fp_phys; - *nregs = sizeof aa_fp_phys / sizeof aa_fp_phys[0]; - break; - default: - *out = NULL; - *nregs = 0; - break; - } -} - -static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) { - (void)t; - switch (cls) { - case RC_INT: - /* AAPCS64 caller-saved: x0-x18, x30 */ - return reg <= 18 || reg == 30; - case RC_FP: - /* AAPCS64 
caller-saved: v0-v7, v16-v31 */ - return reg <= 7 || reg >= 16; - default: - return 0; - } -} - -static u32 aa_call_clobber_mask(CGTarget* t, const CGCallDesc* d, - RegClass cls) { - (void)t; - (void)d; - if (cls == RC_INT) return ((1u << 19) - 1u) | (1u << 30); - if (cls == RC_FP) return 0xFFFF00FFu; - return 0; -} - -static u32 aa_callee_save_mask(CGTarget* t, RegClass cls) { - (void)t; - if (cls == RC_INT) { - u32 mask = 0; - for (u32 r = 19; r <= 28; ++r) mask |= 1u << r; - return mask; - } - if (cls == RC_FP) return 0x0000FF00u; - return 0; -} - -static u32 aa_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi, - RegClass cls) { - (void)t; - if (!abi || abi->ret.kind == ABI_ARG_IGNORE || - abi->ret.kind == ABI_ARG_INDIRECT) - return 0; - u32 mask = 0, ni = 0, nf = 0; - for (u16 i = 0; i < abi->ret.nparts; ++i) { - const ABIArgPart* p = &abi->ret.parts[i]; - if (cls == RC_INT && p->cls == ABI_CLASS_INT) - mask |= 1u << ni++; - else if (cls == RC_FP && p->cls == ABI_CLASS_FP) - mask |= 1u << nf++; - } - return mask; -} - -static int aa_windows_fp_vararg_plan(CGTarget* t, const CGABIValue* av) { - return t->c->target.os == CFREE_OS_WINDOWS && av && av->abi == NULL && - av->storage.cls == RC_FP; -} - -static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { - memset(out, 0, sizeof *out); - out->callee = d->callee; - out->flags = d->flags; - out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0; - out->has_sret = d->abi && d->abi->has_sret; - out->is_variadic = d->abi && d->abi->variadic; - for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { - out->clobber_mask[c] = aa_call_clobber_mask(t, d, (RegClass)c); - out->return_mask[c] = aa_return_reg_mask(t, d->abi, (RegClass)c); - } - u32 cap = d->nargs * 2u + 2u; - out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? 
cap : 1u); - out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4); - u32 next_int = 0, next_fp = 0, stack = 0; - /* Ordinary sret call: pass the destination address in x8. A tail call - * instead forwards the function's own incoming sret pointer (handled in - * aa_emit_call_plan), and ret.storage is the void sentinel, so skip it. */ - if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { - CGCallPlanMove* m = &out->args[out->nargs++]; - m->src = d->ret.storage; - m->src_kind = CG_CALL_PLAN_SRC_ADDR; - m->dst_kind = CG_CALL_PLAN_REG; - m->cls = RC_INT; - m->dst_reg = 8; - m->mem.type = d->ret.type; - m->mem.size = 8; - m->mem.align = 8; - } - for (u32 a = 0; a < d->nargs; ++a) { - const CGABIValue* av = &d->args[a]; - const ABIArgInfo* ai = av->abi; - ABIArgInfo vai; - ABIArgPart vap; - if (!ai) { - memset(&vai, 0, sizeof vai); - memset(&vap, 0, sizeof vap); - vap.cls = aa_windows_fp_vararg_plan(t, av) - ? ABI_CLASS_INT - : (av->storage.cls == RC_FP ? ABI_CLASS_FP : ABI_CLASS_INT); - vap.size = type_byte_size(av->type); - vai.kind = ABI_ARG_DIRECT; - vai.nparts = 1; - vai.parts = &vap; - ai = &vai; - if (d->abi && d->abi->vararg_on_stack) next_int = next_fp = 8; - } - if (ai->kind == ABI_ARG_IGNORE) continue; - if (ai->kind == ABI_ARG_INDIRECT) { - CGCallPlanMove* m = &out->args[out->nargs++]; - m->src = av->storage; - m->src_kind = CG_CALL_PLAN_SRC_ADDR; - m->cls = RC_INT; - if (next_int < 8) { - m->dst_kind = CG_CALL_PLAN_REG; - m->dst_reg = next_int++; - } else { - m->dst_kind = CG_CALL_PLAN_STACK; - m->stack_offset = stack; - stack += 8; - } - m->mem.type = av->type; - m->mem.size = 8; - m->mem.align = 8; - continue; - } - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* p = &ai->parts[i]; - CGCallPlanMove* m = &out->args[out->nargs++]; - m->src = av->nparts ? av->parts[i].op : av->storage; - m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset; - m->mem.type = av->type; - m->mem.size = p->size; - m->mem.align = p->align ? 
p->align : p->size; - if (p->cls == ABI_CLASS_FP) { - m->cls = RC_FP; - if (next_fp < 8) { - m->dst_kind = CG_CALL_PLAN_REG; - m->dst_reg = next_fp++; - } else { - m->dst_kind = CG_CALL_PLAN_STACK; - m->stack_offset = stack; - stack += p->size > 8 ? p->size : 8; - } - } else { - m->cls = RC_INT; - if (next_int < 8) { - m->dst_kind = CG_CALL_PLAN_REG; - m->dst_reg = next_int++; - } else { - m->dst_kind = CG_CALL_PLAN_STACK; - m->stack_offset = stack; - stack += 8; - } - } - } - } - if ((d->flags & CG_CALL_TAIL) == 0 && d->abi && - d->abi->ret.kind != ABI_ARG_IGNORE && - d->abi->ret.kind != ABI_ARG_INDIRECT) { - u32 ni = 0, nf = 0; - for (u16 i = 0; i < d->abi->ret.nparts; ++i) { - const ABIArgPart* p = &d->abi->ret.parts[i]; - CGCallPlanRet* r = &out->rets[out->nrets++]; - r->dst = d->ret.storage; - r->dst_offset = p->src_offset; - r->mem.type = d->ret.type; - r->mem.size = p->size; - r->mem.align = p->align ? p->align : p->size; - if (p->cls == ABI_CLASS_FP) { - r->cls = RC_FP; - r->src_reg = nf++; - } else { - r->cls = RC_INT; - r->src_reg = ni++; - } - } - } -} - -static void aa_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, - u32 n) { - AAImpl* a = impl_of(t); - for (u32 i = 0; i < n; ++i) { - Reg r = regs[i]; - switch (cls) { - case RC_INT: - if (r >= 19u && r <= 28u) a->used_cs_int_mask |= 1u << r; - break; - case RC_FP: - if (r >= 8u && r <= 15u) a->used_cs_fp_mask |= 1u << r; - break; - default: - break; - } - } -} - -static void aa_plan_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, - u32 n) { - AAImpl* a = impl_of(t); - a->has_planned_regs = 1; - for (u32 i = 0; i < n; ++i) { - Reg r = regs[i]; - switch (cls) { - case RC_INT: - if (r >= 19u && r <= 28u) a->planned_cs_int_mask |= 1u << r; - break; - case RC_FP: - if (r >= 8u && r <= 15u) a->planned_cs_fp_mask |= 1u << r; - break; - default: - break; - } - } -} - -void aa_coord_vtable_init(CGTarget* t) { - t->get_allocable_regs = aa_get_allocable_regs; - t->get_phys_regs = 
aa_get_phys_regs; - t->get_scratch_regs = aa_get_scratch_regs; - t->is_caller_saved = aa_is_caller_saved; - t->call_clobber_mask = aa_call_clobber_mask; - t->return_reg_mask = aa_return_reg_mask; - t->callee_save_mask = aa_callee_save_mask; - t->plan_call = aa_plan_call; - t->plan_hard_regs = aa_plan_hard_regs; - t->reserve_hard_regs = aa_reserve_hard_regs; -}