commit edbd83e9f471e5ce53f0bd6716d5d7d9476ba302
parent 19d5d73f838dcb2dfede65d2724fc9a0c891a6c8
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 26 May 2026 15:41:39 -0700
aa64: delete old CGTarget backend, switch to NativeDirectTarget
Remove the entire old CGTarget-level aarch64 backend: alloc.c (spill/reload,
labels, scopes), emit.c (function lifecycle, frame layout, prologue/epilogue),
internal.h (AAImpl struct and helpers), ops.c (per-instruction CGTarget
methods), and opt_coord.c (optimizer coordination).
Replace aa64_cgtarget_new with aa64_native_target_new +
aa64_native_direct_ops, routing through NativeDirectTarget. Add native.c
implementing the NativeTarget vtable (func lifecycle, frame slots, move,
load/store, binop/cmp, call planning, and prologue/epilogue patching).
Update the inline-asm operand binder in asm.c/asm.h to use arch-private
AA64_INLINE_OPK_REG / AA64_INLINE_OPCLS_* pseudo-kinds instead of the
semantic OPK_REG, matching the new division where semantic targets never
expose physical registers.
Diffstat:
10 files changed, 3305 insertions(+), 4842 deletions(-)
diff --git a/src/arch/aa64/aa64.h b/src/arch/aa64/aa64.h
@@ -2,7 +2,11 @@
#define CFREE_ARCH_AA64_H
#include "arch/arch.h"
+#include "arch/native_target.h"
-CGTarget* aa64_cgtarget_new(Compiler*, ObjBuilder*, MCEmitter*);
+typedef struct NativeOps NativeOps;
+
+NativeTarget* aa64_native_target_new(Compiler*, ObjBuilder*, MCEmitter*);
+const NativeOps* aa64_native_direct_ops(void);
#endif
diff --git a/src/arch/aa64/alloc.c b/src/arch/aa64/alloc.c
@@ -1,319 +0,0 @@
-/* aarch64/alloc.c — spill/reload, labels, control flow, structured scopes. */
-
-#include "arch/aa64/internal.h"
-
-/* ============================================================
- * AAImpl accessor
- * ============================================================ */
-
-AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
-
-/* ============================================================
- * Slot accessor
- * ============================================================ */
-
-AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) {
- if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL;
- return &a->slots[fs - 1];
-}
-
-static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out,
- RegClass* cls_out) {
- (void)t;
- Slice ns = pool_slice(t->c->global, name);
- if (!ns.s || !ns.len) return 1;
- char buf[8];
- if (ns.len >= sizeof buf) return 1;
- memcpy(buf, ns.s, ns.len);
- buf[ns.len] = '\0';
- u32 dwarf;
- if (aa64_register_index(buf, &dwarf) != 0) return 1;
- if (dwarf <= 30u) {
- if (out) *out = (Reg)dwarf;
- if (cls_out) *cls_out = RC_INT;
- return 0;
- }
- if (dwarf >= 64u && dwarf <= 95u) {
- if (out) *out = (Reg)(dwarf - 64u);
- if (cls_out) *cls_out = RC_FP;
- return 0;
- }
- return 1;
-}
-
-static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot,
- MemAccess ma) {
- AAImpl* a = impl_of(t);
- if (src.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 spill_reg: src is not OPK_REG");
- }
- Operand addr;
- memset(&addr, 0, sizeof addr);
- addr.kind = OPK_LOCAL;
- addr.cls = RC_INT;
- addr.type = ma.type;
- addr.v.frame_slot = slot;
- aa_store(t, addr, src, ma);
-}
-
-static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot,
- MemAccess ma) {
- AAImpl* a = impl_of(t);
- if (dst.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 reload_reg: dst is not OPK_REG");
- }
- Operand addr;
- memset(&addr, 0, sizeof addr);
- addr.kind = OPK_LOCAL;
- addr.cls = RC_INT;
- addr.type = ma.type;
- addr.v.frame_slot = slot;
- aa_load(t, dst, addr, ma);
-}
-
-/* ============================================================
- * Labels / control flow
- * ============================================================ */
-
-static Label aa_label_new(CGTarget* t) {
- return (Label)t->mc->label_new(t->mc);
-}
-
-static void aa_label_place(CGTarget* t, Label l) {
- t->mc->label_place(t->mc, (MCLabel)l);
-}
-
-void aa_jump(CGTarget* t, Label l) {
- MCEmitter* mc = t->mc;
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_JUMP26, 4, 0);
-}
-
-static void aa_emit_zero64(MCEmitter* mc) {
- static const u8 zero[8] = {0};
- mc->emit_bytes(mc, zero, sizeof zero);
-}
-
-static void aa_load_label_addr(CGTarget* t, Operand dst, Label l) {
- /* Reserve:
- * insn0: ADR Xdst, label (patched to LDR literal if out of range)
- * insn1: B .+12 (skip the inline literal)
- * lit: .quad label (relocated fallback target if needed)
- *
- * The MC fixup range-checks ADR at label placement. In-range labels use the
- * first instruction; out-of-range labels use the relocated literal slot. */
- MCEmitter* mc = t->mc;
- u32 rd;
- if (dst.kind != OPK_REG) {
- compiler_panic(t->c, mc->loc, "aa64: load_label_addr dst must be REG");
- }
- rd = reg_num(dst);
- aa64_emit32(mc, aa64_adr(rd, 0u, 0u));
- aa64_emit32(mc, aa64_b_base() | 3u);
- aa_emit_zero64(mc);
- mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_INTRA_LABEL_ADDR, 16, 0);
-}
-
-static void aa_indirect_branch(CGTarget* t, Operand addr, const Label* targets,
- u32 ntargets) {
- /* BR Xn — register-indirect branch (no fixup needed). */
- MCEmitter* mc = t->mc;
- (void)targets;
- (void)ntargets;
- if (addr.kind != OPK_REG) {
- compiler_panic(t->c, mc->loc, "aa64: indirect_branch expects REG operand");
- }
- aa64_emit32(mc, aa64_br(reg_num(addr)));
-}
-
-static u32 cmp_to_cond(CmpOp op) {
- switch (op) {
- case CMP_EQ:
- return 0x0u;
- case CMP_NE:
- return 0x1u;
- case CMP_LT_U:
- return 0x3u;
- case CMP_LE_U:
- return 0x9u;
- case CMP_GT_U:
- return 0x8u;
- case CMP_GE_U:
- return 0x2u;
- case CMP_LT_S:
- return 0xbu;
- case CMP_LE_S:
- return 0xdu;
- case CMP_GT_S:
- return 0xcu;
- case CMP_GE_S:
- return 0xau;
- default:
- return 0x0u;
- }
-}
-
-static u32 fp_cmp_to_cond(CmpOp op) {
- switch (op) {
- case CMP_EQ:
- return 0x0u; /* equal; unordered is false */
- case CMP_NE:
- return 0x1u; /* not equal; unordered is true */
- case CMP_LT_F:
- return 0x4u; /* MI: less-than only */
- case CMP_LE_F:
- return 0x9u; /* LS: less-or-equal only */
- case CMP_GT_F:
- return 0xcu; /* GT excludes unordered */
- case CMP_GE_F:
- return 0xau; /* GE excludes unordered */
- default:
- return cmp_to_cond(op);
- }
-}
-
-void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) {
- MCEmitter* mc = t->mc;
- u32 sf = type_is_64(a_op.type) ? 1u : 0u;
- if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
- u32 imm12, sh;
- if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) {
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh));
- return;
- }
- }
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm));
-}
-
-static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b,
- Label l) {
- MCEmitter* mc = t->mc;
- emit_cmp_ab(t, a, b);
- aa64_emit32(mc, aa64_b_cond(cmp_to_cond(op)));
- mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0);
-}
-
-static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
- u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
- if (a.cls == RC_FP || b.cls == RC_FP) {
- u32 type = type_is_fp_double(a.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_fcmp(type, reg_num(a), reg_num(b)));
- aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), fp_cmp_to_cond(op)));
- return;
- }
- emit_cmp_ab(t, a, b);
- aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), cmp_to_cond(op)));
-}
-
-/* ============================================================
- * Structured scopes
- * ============================================================ */
-
-static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) {
- AAImpl* a = impl_of(t);
- if (a->nscopes == a->scopes_cap) {
- u32 ncap = a->scopes_cap ? a->scopes_cap * 2u : 4u;
- AAScope* nb = arena_array(t->c->tu, AAScope, ncap);
- if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes);
- a->scopes = nb;
- a->scopes_cap = ncap;
- }
- AAScope* sc = &a->scopes[a->nscopes];
- sc->kind = (u8)d->kind;
- sc->has_else = 0;
- sc->else_label = 0;
- sc->end_label = 0;
- sc->break_label = d->break_label;
- sc->continue_label = d->continue_label;
-
- if (d->kind == SCOPE_IF) {
- sc->else_label = t->mc->label_new(t->mc);
- sc->end_label = t->mc->label_new(t->mc);
- u32 sf = type_is_64(d->cond.type) ? 1u : 0u;
- u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0);
- aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0));
- aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/));
- t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0);
- } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) {
- /* bookkeep only */
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 scope_begin: kind %d not yet implemented",
- (int)d->kind);
- }
-
- a->nscopes++;
- return (CGScope)a->nscopes;
-}
-
-static void aa_scope_else(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa64_emit32(t->mc, aa64_b_base());
- t->mc->emit_label_ref(t->mc, sc->end_label, R_AARCH64_JUMP26, 4, 0);
- t->mc->label_place(t->mc, sc->else_label);
- sc->has_else = 1;
-}
-
-static void aa_scope_end(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- if (sc->kind == SCOPE_IF) {
- if (!sc->has_else) {
- t->mc->label_place(t->mc, sc->else_label);
- }
- t->mc->label_place(t->mc, sc->end_label);
- }
-}
-
-static void aa_break_to(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa_jump(t, sc->break_label);
-}
-
-static void aa_continue_to(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa_jump(t, sc->continue_label);
-}
-
-/* Expose vtable entries to ops.c constructor via a registration helper.
- * ops.c calls this after the basic ops vtable is populated. */
-void aa_alloc_vtable_init(CGTarget* t) {
- t->spill_reg = aa_spill_reg;
- t->reload_reg = aa_reload_reg;
- t->resolve_reg_name = aa_resolve_reg_name;
-
- t->label_new = aa_label_new;
- t->label_place = aa_label_place;
- t->jump = aa_jump;
- t->cmp_branch = aa_cmp_branch;
- t->cmp = aa_cmp;
- t->load_label_addr = aa_load_label_addr;
- t->indirect_branch = aa_indirect_branch;
-
- t->scope_begin = aa_scope_begin;
- t->scope_else = aa_scope_else;
- t->scope_end = aa_scope_end;
- t->break_to = aa_break_to;
- t->continue_to = aa_continue_to;
-}
diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c
@@ -1,10 +1,13 @@
#include "arch/arch.h"
+#include <string.h>
+
#include "arch/aa64/aa64.h"
#include "arch/aa64/asm.h"
#include "arch/aa64/disasm.h"
#include "arch/aa64/isa.h"
#include "arch/aa64/regs.h"
+#include "cg/native_direct_target.h"
#include "core/bytes.h"
#include "link/link_arch.h"
#include "obj/obj.h"
@@ -107,18 +110,37 @@ static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) {
return 0;
}
-static CGTarget* aa64_backend_make(Compiler* c, ObjBuilder* o,
+static CgTarget* aa64_backend_make(Compiler* c, ObjBuilder* o,
const CfreeCodeOptions* opts) {
MCEmitter* mc = NULL;
Debug* debug = NULL;
- CGTarget* t;
+ CgTarget* t;
+ NativeTarget* native;
+ NativeDirectTargetConfig cfg;
if (cg_mc_debug_new(c, o, opts, &mc, &debug) != CFREE_OK) return NULL;
- t = aa64_cgtarget_new(c, o, mc);
- if (!t) return NULL;
- t->debug = debug;
+ (void)debug;
+ native = aa64_native_target_new(c, o, mc);
+ if (!native) return NULL;
+ memset(&cfg, 0, sizeof cfg);
+ cfg.native = native;
+ cfg.ops = aa64_native_direct_ops();
+ t = native_direct_target_new(c, o, &cfg);
return t;
}
+/* Build the semantic (CgTarget-level) aa64 target on top of the native
+ * backend. A NULL `mc` is replaced with an emitter obtained from mc_new().
+ * Returns NULL when aa64_native_target_new() fails; otherwise returns the
+ * result of native_direct_target_new() for a zero-initialised config that
+ * carries the native target and the aa64 direct-ops table. */
+static CgTarget* aa64_semantic_target_new(Compiler* c, ObjBuilder* o,
+                                          MCEmitter* mc) {
+  MCEmitter* emitter = mc ? mc : mc_new(c, o);
+  NativeTarget* backend = aa64_native_target_new(c, o, emitter);
+  NativeDirectTargetConfig config;
+
+  if (backend == NULL) {
+    return NULL;
+  }
+  memset(&config, 0, sizeof config);
+  config.native = backend;
+  config.ops = aa64_native_direct_ops();
+  return native_direct_target_new(c, o, &config);
+}
+
static const CfreePredefinedMacro aa64_predefined_macros[] = {
{CFREE_SLICE_LIT("__aarch64__"), CFREE_SLICE_LIT("1")},
{CFREE_SLICE_LIT("__AARCH64EL__"), CFREE_SLICE_LIT("1")},
@@ -136,7 +158,7 @@ const ArchImpl arch_impl_aa64 = {
.backend = {.name = "aa64", .make = aa64_backend_make},
.kind = CFREE_ARCH_ARM_64,
.name = "aa64",
- .cgtarget_new = aa64_cgtarget_new,
+ .cgtarget_new = aa64_semantic_target_new,
.asm_new = aa64_arch_asm_new,
.disasm_new = aa64_disasm_new,
.apply_label_fixup = aa64_apply_label_fixup,
diff --git a/src/arch/aa64/asm.c b/src/arch/aa64/asm.c
@@ -22,6 +22,7 @@
#include "arch/arch.h"
#include "asm/asm_helpers.h"
#include "asm/asm_lex.h"
+#include "cg/type.h"
#include "core/arena.h"
#include "core/pool.h"
#include "core/slice.h"
@@ -1232,6 +1233,8 @@ void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) {
* deliberate reason. */
#define AA64_INLINE_LINE_CAP 1024
+_Noreturn static void inline_panic(AA64Asm* a, const char* msg);
+
/* Render a 5-bit register number into the StrBuf using the requested
* width form. is64 picks x-form vs w-form; SP / ZR encode as
* register #31 and we render them as wzr/xzr or wsp/sp depending on
@@ -1247,6 +1250,24 @@ static void render_reg(StrBuf* sb, u32 reg, int is64) {
strbuf_putc(sb, (char)('0' + (reg % 10u)));
}
+/* Append the scalar FP register name for `reg` to `sb`: an 's' prefix when
+ * the operand is at most 4 bytes, 'd' otherwise, followed by the register
+ * number in decimal with no leading zero.
+ * NOTE(review): operands wider than 8 bytes (inline_op_size admits up to 16)
+ * also get the d-form here; confirm whether a q-form is wanted. */
+static void render_fp_reg(StrBuf* sb, u32 reg, u32 nbytes) {
+  const char width = (nbytes > 4u) ? 'd' : 's';
+  const u32 tens = reg / 10u;
+
+  strbuf_putc(sb, width);
+  if (tens != 0u) {
+    strbuf_putc(sb, (char)('0' + tens));
+  }
+  strbuf_putc(sb, (char)('0' + (reg % 10u)));
+}
+
+/* Byte size used when rendering inline-asm operand `op`. Operands with no
+ * type, or whose cg_type_size() is zero, are treated as 8 bytes; anything
+ * above 16 bytes is rejected through inline_panic(). */
+static u32 inline_op_size(AA64Asm* a, const Operand* op) {
+  u64 bytes = op->type ? cg_type_size(a->c, op->type) : 0u;
+
+  if (bytes == 0u) {
+    return 8u;
+  }
+  if (bytes > 16u) {
+    inline_panic(a, "inline asm operand is too large");
+  }
+  return (u32)bytes;
+}
+
+/* Returns 1 when `op` carries a type that cg_type_is_ptr() reports as a
+ * pointer, and 0 otherwise (including operands with no type). */
+static int inline_op_is_ptr(AA64Asm* a, const Operand* op) {
+  if (!op->type) {
+    return 0;
+  }
+  return cg_type_is_ptr(a->c, op->type) ? 1 : 0;
+}
+
/* Render a signed 64-bit integer prefixed with '#'. */
static void render_imm(StrBuf* sb, i64 v) {
strbuf_putc(sb, '#');
@@ -1279,12 +1300,14 @@ static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) {
(idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout];
switch (form) {
case 1: /* %wN — force 32-bit register form */
- if (op->kind != OPK_REG) inline_panic(a, "%w on non-register operand");
- render_reg(sb, (u32)op->v.reg, /*is64=*/0);
+ if (op->kind != AA64_INLINE_OPK_REG || op->pad[0] != AA64_INLINE_OPCLS_INT)
+ inline_panic(a, "%w on non-integer-register operand");
+ render_reg(sb, (u32)op->v.local, 0);
return;
case 2: /* %xN — force 64-bit register form */
- if (op->kind != OPK_REG) inline_panic(a, "%x on non-register operand");
- render_reg(sb, (u32)op->v.reg, /*is64=*/1);
+ if (op->kind != AA64_INLINE_OPK_REG || op->pad[0] != AA64_INLINE_OPCLS_INT)
+ inline_panic(a, "%x on non-integer-register operand");
+ render_reg(sb, (u32)op->v.local, 1);
return;
case 3: /* %aN — memory addressing form */
if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
@@ -1299,8 +1322,13 @@ static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) {
}
/* Default rendering by operand kind. */
switch (op->kind) {
- case OPK_REG:
- render_reg(sb, (u32)op->v.reg, /*is64=*/1);
+ case AA64_INLINE_OPK_REG:
+ if (op->pad[0] == AA64_INLINE_OPCLS_FP) {
+ render_fp_reg(sb, (u32)op->v.local, inline_op_size(a, op));
+ } else {
+ render_reg(sb, (u32)op->v.local,
+ inline_op_is_ptr(a, op) || inline_op_size(a, op) > 4u);
+ }
return;
case OPK_IMM:
render_imm(sb, op->v.imm);
diff --git a/src/arch/aa64/asm.h b/src/arch/aa64/asm.h
@@ -20,6 +20,18 @@ typedef struct ArchAsm ArchAsm;
typedef struct AA64Asm AA64Asm;
+/* Private pseudo operand used by the aa64 inline-asm binder. Semantic CG
+ * operands never expose physical registers, so native.c lowers register
+ * constraints into this arch-private shape before template substitution.
+ * Operand.v.local carries the 5-bit physical register number; pad[0] carries
+ * AA64_INLINE_OPCLS_*.
+ */
+enum {
+ AA64_INLINE_OPK_REG = 0xf0u, /* Operand.kind value marking a bound physical register */
+ AA64_INLINE_OPCLS_INT = 0u, /* pad[0]: integer register; the only class %w / %x accept */
+ AA64_INLINE_OPCLS_FP = 1u, /* pad[0]: FP register; default rendering uses the s/d form */
+};
+
/* Construct/destroy. Pure: no allocations beyond the AA64Asm struct
* itself (which lives on the compiler's TU arena). */
AA64Asm* aa64_asm_open(Compiler* c);
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -1,874 +0,0 @@
-/* aarch64/emit.c — instruction encoding helpers, function lifecycle,
- * frame layout, parameter ABI, address materialization. */
-
-#include "arch/aa64/internal.h"
-
-extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
-extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
- u32 end_ofs);
-
-static void aa_emit_cfi_frame(CGTarget* t, u32 post_prologue_off, u32 fp_lr_off,
- u32 int_save_off, u32 fp_save_off, u32 frame_size,
- const u32* int_regs, u32 n_int_saves,
- const u32* fp_regs, u32 n_fp_saves,
- int omit_frame);
-
-/* ============================================================
- * Shared type / operand helpers
- * ============================================================ */
-
-int type_is_64(CfreeCgTypeId t) {
- return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) ||
- t >= (CfreeCgTypeId)(2u << 6);
-}
-
-int type_is_fp_double(CfreeCgTypeId t) {
- return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64);
-}
-
-int type_is_signed(CfreeCgTypeId t) {
- (void)t;
- return 0;
-}
-
-u32 type_byte_size(CfreeCgTypeId t) {
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL))
- return 1;
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2;
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32))
- return 4;
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F128)) return 16;
- return 8;
-}
-
-u32 size_idx_for_bytes(u32 nbytes) {
- switch (nbytes) {
- case 1:
- return 0;
- case 2:
- return 1;
- case 4:
- return 2;
- case 8:
- return 3;
- case 16:
- return 4;
- default:
- return 3;
- }
-}
-
-u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
-
-static u32 collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) {
- u32 n = 0;
- for (u32 r = first; r <= last; ++r) {
- if (mask & (1u << r)) out[n++] = r;
- }
- return n;
-}
-
-static u32 count_mask_regs(u32 mask, u32 first, u32 last) {
- u32 n = 0;
- for (u32 r = first; r <= last; ++r) {
- if (mask & (1u << r)) ++n;
- }
- return n;
-}
-
-static u32 aa_planned_prologue_words(const AAImpl* a) {
- u32 n = AA_PROLOGUE_FRAME_WORDS;
- if (a->has_sret) ++n;
- n += count_mask_regs(a->planned_cs_int_mask, 19u, 28u);
- n += count_mask_regs(a->planned_cs_fp_mask, 8u, 15u);
- return n ? n : 1u;
-}
-
-static void aa_func_begin_init(CGTarget* t, const CGFuncDesc* fd) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- mc->set_section(mc, fd->text_section_id);
- mc->emit_align(mc, 4, 0);
-
- a->fd = fd;
- a->func_start = mc->pos(mc);
- mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
- a->next_param_int = 0;
- a->next_param_fp = 0;
- a->next_param_stack = 0;
- a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
- a->cum_off = 0;
- a->max_outgoing = 0;
- a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0;
- a->used_cs_fp_mask = a->has_planned_regs ? a->planned_cs_fp_mask : 0;
- a->prologue_words =
- a->has_planned_regs ? aa_planned_prologue_words(a) : AA_PROLOGUE_WORDS;
- a->post_prologue_off = 0;
- a->planned_cs_int_mask = 0;
- a->planned_cs_fp_mask = 0;
- a->has_planned_regs = 0;
- a->nslots = 0;
- a->nscopes = 0;
- a->has_alloca = 0;
- a->known_frame = 0;
- a->omit_frame = 0;
- a->nadd_patches = 0;
- a->sret_ptr_slot = FRAME_SLOT_NONE;
- a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
- a->gp_save_slot = FRAME_SLOT_NONE;
- a->fp_save_slot = FRAME_SLOT_NONE;
- a->epilogue_label = mc->label_new(mc);
-
- mc->cfi_startproc(mc);
-}
-
-static void aa_add_entry_frame_slots(CGTarget* t) {
- AAImpl* a = impl_of(t);
-
- if (a->has_sret) {
- FrameSlotDesc fsd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 8,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->sret_ptr_slot = aa_frame_slot(t, &fsd);
- }
-
- if (a->is_variadic) {
- FrameSlotDesc gpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 64,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->gp_save_slot = aa_frame_slot(t, &gpd);
- FrameSlotDesc fpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 128,
- .align = 16,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->fp_save_slot = aa_frame_slot(t, &fpd);
- }
-}
-
-static void aa_emit_variadic_reg_saves(CGTarget* t) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (!a->is_variadic) return;
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
- for (u32 i = 0; i < 8; ++i)
- aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8));
- for (u32 i = 0; i < 8; ++i)
- aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16));
-}
-
-/* ============================================================
- * Low-level emission
- * ============================================================ */
-
-void aa64_emit32(MCEmitter* mc, u32 word) {
- u32 ofs = obj_pos(mc->obj, mc->section_id);
- u8 b[4];
- b[0] = (u8)(word & 0xff);
- b[1] = (u8)((word >> 8) & 0xff);
- b[2] = (u8)((word >> 16) & 0xff);
- b[3] = (u8)((word >> 24) & 0xff);
- mc->emit_bytes(mc, b, 4);
- if (mc->debug) {
- debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
- }
-}
-
-void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) {
- u8 b[4];
- b[0] = (u8)(word & 0xff);
- b[1] = (u8)((word >> 8) & 0xff);
- b[2] = (u8)((word >> 16) & 0xff);
- b[3] = (u8)((word >> 24) & 0xff);
- obj_patch(obj, sec_id, ofs, b, 4);
-}
-
-/* ============================================================
- * Immediate encoding helpers
- * ============================================================ */
-
-void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) {
- const u32 nslots = sf ? 4u : 2u;
- u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
-
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- u64 cleared = v & ~((u64)0xffffu << (i * 16));
- if (slot != 0 && cleared == 0) {
- aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
- return;
- }
- }
-
- {
- u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((inv >> (i * 16)) & 0xffffu);
- u64 cleared = inv & ~((u64)0xffffu << (i * 16));
- if (cleared == 0) {
- aa64_emit32(mc, aa64_movn(sf, Rd, slot, i));
- return;
- }
- }
- }
-
- int placed = 0;
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- if (!placed) {
- if (slot == 0) continue;
- aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
- placed = 1;
- } else if (slot != 0) {
- aa64_emit32(mc, aa64_movk(sf, Rd, slot, i));
- }
- }
- if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0));
-}
-
-void emit_sp_add(MCEmitter* mc, u32 imm) {
- if (imm <= 0xfff) {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0));
- } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1));
- } else {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1));
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0));
- }
-}
-
-/* ============================================================
- * Function lifecycle
- * ============================================================ */
-
-void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- aa_func_begin_init(t, fd);
-
- a->prologue_pos = mc->pos(mc);
- for (u32 i = 0; i < a->prologue_words; ++i) aa64_emit32(mc, AA64_NOP);
-
- aa_add_entry_frame_slots(t);
- aa_emit_variadic_reg_saves(t);
- /* Capture end-of-prologue position for CFI emission in func_end. */
- a->post_prologue_off = mc->pos(mc) - a->func_start;
-}
-
-static u32 aa_build_prologue(CGTarget* t, u32* words, u32 cap, u32 frame_size,
- u32 fp_lr_off, u32 int_save_off, u32 fp_save_off,
- const u32* int_regs, u32 n_int_saves,
- const u32* fp_regs, u32 n_fp_saves) {
- AAImpl* a = impl_of(t);
- u32 wi = 0;
-
- if (frame_size <= 0xfff) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
- } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
- } else {
- if (wi + 2 > cap) goto overflow;
- words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
- }
- if (fp_lr_off <= 504u) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
- } else {
- if (wi + 2 > cap) goto overflow;
- words[wi++] = aa64_str_uimm(3, 29, 31, fp_lr_off);
- words[wi++] = aa64_str_uimm(3, 30, 31, fp_lr_off + 8u);
- }
- if (wi >= cap) goto overflow;
- if (fp_lr_off <= 0xfffu) {
- words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
- } else if ((fp_lr_off >> 24) == 0) {
- u32 hi = (fp_lr_off >> 12) & 0xfffu;
- u32 lo = fp_lr_off & 0xfffu;
- if (hi) {
- words[wi++] = aa64_add_imm(1, 29, 31, hi, 1);
- if (lo) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_add_imm(1, 29, 29, lo, 0);
- }
- } else if (lo) {
- words[wi++] = aa64_add_imm(1, 29, 31, lo, 0);
- }
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64: fp/lr offset %u out of prologue range", fp_lr_off);
- }
- if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* s = aa64_slot_get(a, a->sret_ptr_slot);
- if (s) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off);
- }
- }
- for (u32 i = 0; i < n_int_saves; ++i) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_str_uimm(3, int_regs[i], 31, int_save_off + i * 8u);
- }
- for (u32 i = 0; i < n_fp_saves; ++i) {
- if (wi >= cap) goto overflow;
- words[wi++] = aa64_str_fp_uimm(3, fp_regs[i], 31, fp_save_off + i * 8u);
- }
- return wi;
-
-overflow:
- compiler_panic(t->c, a->loc,
- "aarch64: prologue too small (used more than %u words)", cap);
- return 0;
-}
-
-static void aa_compute_frame(const AAImpl* a, u32 n_int_saves, u32 n_fp_saves,
- u32* int_save_off, u32* fp_save_off,
- u32* fp_lr_off, u32* frame_size) {
- *int_save_off = a->max_outgoing;
- *fp_save_off = *int_save_off + n_int_saves * 8u;
- u32 locals_off = *fp_save_off + n_fp_saves * 8u;
- *fp_lr_off = locals_off + a->cum_off;
- *frame_size = *fp_lr_off + 16;
- *frame_size = (*frame_size + 15u) & ~15u;
- *fp_lr_off = *frame_size - 16;
-}
-
-void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd,
- const CGKnownFrameDesc* frame,
- FrameSlot* out_slots) {
- AAImpl* a = impl_of(t);
- u32 int_regs[10];
- u32 fp_regs[8];
- u32 int_save_off, fp_save_off, fp_lr_off, frame_size;
- u32 words[AA_PROLOGUE_WORDS];
-
- aa_func_begin_init(t, fd);
- a->known_frame = 1;
- aa_add_entry_frame_slots(t);
- for (u32 i = 0; frame && i < frame->nslots; ++i) {
- FrameSlot fs = aa_frame_slot(t, &frame->slots[i]);
- if (out_slots) out_slots[i] = fs;
- }
- if (frame) {
- a->max_outgoing = frame->max_outgoing;
- a->has_alloca = frame->has_alloca ? 1u : 0u;
- }
-
- u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
- u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
- if (frame && frame->may_omit_frame && frame->nslots == 0 &&
- frame->max_outgoing == 0 && !frame->has_alloca && !frame->has_call &&
- !a->has_sret && !a->is_variadic && n_int_saves == 0 && n_fp_saves == 0) {
- a->omit_frame = 1;
- return;
- }
- aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
- &fp_lr_off, &frame_size);
-
- a->prologue_pos = t->mc->pos(t->mc);
- u32 nwords = aa_build_prologue(t, words, AA_PROLOGUE_WORDS, frame_size,
- fp_lr_off, int_save_off, fp_save_off, int_regs,
- n_int_saves, fp_regs, n_fp_saves);
- for (u32 i = 0; i < nwords; ++i) aa64_emit32(t->mc, words[i]);
- aa_emit_variadic_reg_saves(t);
- {
- u32 post = t->mc->pos(t->mc) - a->func_start;
- aa_emit_cfi_frame(t, post, fp_lr_off, int_save_off, fp_save_off, frame_size,
- int_regs, n_int_saves, fp_regs, n_fp_saves,
- /*omit_frame=*/0);
- }
-}
-
-/* CFI for the post-prologue state of an AArch64 frame.
- * CFA = x29 + 16 (x29 points to saved-FP/LR pair; pre-call sp = x29+16)
- * x29 saved at CFA-16, x30 (LR) at CFA-8
- * callee-saved ints/fps at their slot offsets
- * pc_offset = end-of-prologue offset within the function. */
-static void aa_emit_cfi_frame(CGTarget* t, u32 post_prologue_off, u32 fp_lr_off,
- u32 int_save_off, u32 fp_save_off, u32 frame_size,
- const u32* int_regs, u32 n_int_saves,
- const u32* fp_regs, u32 n_fp_saves,
- int omit_frame) {
- MCEmitter* mc = t->mc;
- if (omit_frame) return;
- (void)fp_lr_off;
- mc->cfi_set_next_pc_offset(mc, post_prologue_off);
- mc->cfi_def_cfa(mc, 29u, 16);
- mc->cfi_offset(mc, 29u, -16);
- mc->cfi_offset(mc, 30u, -8);
- {
- u32 i;
- for (i = 0; i < n_int_saves; ++i) {
- i32 sp_off = (i32)int_save_off + (i32)i * 8;
- i32 cfa_off = sp_off - (i32)frame_size;
- mc->cfi_offset(mc, int_regs[i], cfa_off);
- }
- for (i = 0; i < n_fp_saves; ++i) {
- /* AAPCS DWARF: V0=64, so D8..D15 → DWARF 72..79. */
- i32 sp_off = (i32)fp_save_off + (i32)i * 8;
- i32 cfa_off = sp_off - (i32)frame_size;
- mc->cfi_offset(mc, 64u + fp_regs[i], cfa_off);
- }
- }
-}
-
-void aa_func_end(CGTarget* t) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- ObjBuilder* obj = t->obj;
- u32 sec = a->fd->text_section_id;
-
- u32 int_regs[10];
- u32 fp_regs[8];
- u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
- u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
-
- u32 int_save_off, fp_save_off, fp_lr_off, frame_size;
- aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
- &fp_lr_off, &frame_size);
-
- if (!a->known_frame) {
- aa_emit_cfi_frame(t, a->post_prologue_off, fp_lr_off, int_save_off,
- fp_save_off, frame_size, int_regs, n_int_saves, fp_regs,
- n_fp_saves, /*omit_frame=*/a->omit_frame);
- }
-
- if (a->omit_frame) goto finish;
-
- mc->label_place(mc, a->epilogue_label);
-
- if (a->has_alloca) {
- if (fp_lr_off <= 0xfff) {
- aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64: has_alloca + fp_lr_off %u out of imm12 range",
- fp_lr_off);
- }
- }
-
- for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
- u32 r0 = fp_regs[i];
- aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31, fp_save_off + (u32)i * 8u));
- }
- for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
- u32 r0 = int_regs[i];
- aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u));
- }
- if (fp_lr_off <= 504u) {
- aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
- } else {
- aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off));
- aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u));
- }
- emit_sp_add(mc, frame_size);
- aa64_emit32(mc, aa64_ret(AA64_LR));
-
- if (!a->known_frame) {
- u32 pos = a->prologue_pos;
- u32 words[AA_PROLOGUE_WORDS];
- u32 prologue_words =
- a->prologue_words ? a->prologue_words : AA_PROLOGUE_WORDS;
- for (u32 i = 0; i < prologue_words; ++i) words[i] = AA64_NOP;
- (void)aa_build_prologue(t, words, prologue_words, frame_size, fp_lr_off,
- int_save_off, fp_save_off, int_regs, n_int_saves,
- fp_regs, n_fp_saves);
- for (u32 i = 0; i < prologue_words; ++i)
- aa64_patch32(obj, sec, pos + i * 4u, words[i]);
- }
-
- if (a->max_outgoing > 0xfff) {
- compiler_panic(
- t->c, a->loc,
- "aarch64: max_outgoing %u out of imm12 range for alloca patch",
- a->max_outgoing);
- }
- for (u32 i = 0; i < a->nadd_patches; ++i) {
- u32 dr = a->add_patches[i].dst_reg;
- u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0);
- aa64_patch32(obj, sec, a->add_patches[i].pos, word);
- }
-
-finish:;
- u32 end = mc->pos(mc);
- obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start,
- (u64)(end - a->func_start));
- if (a->fd->atomize) {
- obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->fd->sym,
- 0);
- }
- if (t->debug) debug_func_pc_range(t->debug, sec, a->func_start, end);
-
- mc->cfi_endproc(mc);
- mc_end_function(mc);
- a->fd = NULL;
-}
-
-/* ============================================================
- * Frame slots
- * ============================================================ */
-
-FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) {
- AAImpl* a = impl_of(t);
- if (a->nslots == a->slots_cap) {
- u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8;
- AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap);
- if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots);
- a->slots = nbuf;
- a->slots_cap = ncap;
- }
- u32 size = d->size ? d->size : 8;
- u32 align = d->align ? d->align : 1;
- u32 next = a->cum_off + size;
- u32 mask = align - 1;
- next = (next + mask) & ~mask;
-
- AASlot* s = &a->slots[a->nslots];
- s->off = next;
- s->size = size;
- s->align = align;
- s->kind = d->kind;
-
- a->cum_off = next;
- a->nslots++;
- return (FrameSlot)(a->nslots);
-}
-
-/* ============================================================
- * Parameters
- * ============================================================ */
-
-static void aa_consume_param_location(AAImpl* a, const ABIArgInfo* ai) {
- if (!ai || ai->kind == ABI_ARG_IGNORE) return;
- if (ai->kind == ABI_ARG_INDIRECT) {
- if (a->next_param_int < 8)
- ++a->next_param_int;
- else {
- a->next_param_stack += 8;
- }
- return;
- }
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- if (pt->cls == ABI_CLASS_INT) {
- if (a->next_param_int < 8)
- ++a->next_param_int;
- else
- a->next_param_stack += 8;
- } else if (pt->cls == ABI_CLASS_FP) {
- if (a->next_param_fp < 8)
- ++a->next_param_fp;
- else
- a->next_param_stack += pt->size > 8 ? pt->size : 8;
- }
- }
-}
-
-CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p) {
- AAImpl* a = impl_of(t);
- CGLocalStorage st = p->storage;
- if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) {
- FrameSlotDesc fsd = {0};
- fsd.type = p->type;
- fsd.name = p->name;
- fsd.loc = p->loc;
- fsd.size = p->size;
- fsd.align = p->align;
- fsd.kind = FS_PARAM;
- if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN;
- st.v.frame_slot = aa_frame_slot(t, &fsd);
- }
- AASlot* s = st.kind == CG_LOCAL_STORAGE_FRAME
- ? aa64_slot_get(a, st.v.frame_slot)
- : NULL;
- if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) {
- compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
- }
- const ABIArgInfo* ai = p->abi;
- u32 incoming_stack_base = a->omit_frame ? 31u : 29u;
- i32 incoming_stack_bias = a->omit_frame ? 0 : 16;
-
- if (ai->kind == ABI_ARG_IGNORE) return st;
- if (st.kind == CG_LOCAL_STORAGE_REG && st.v.reg == (Reg)REG_NONE) {
- aa_consume_param_location(a, ai);
- return st;
- }
- if (st.kind == CG_LOCAL_STORAGE_REG) {
- if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
- compiler_panic(
- t->c, a->loc,
- "aarch64 param: register storage requires one direct part");
- }
- const ABIArgPart* pt = &ai->parts[0];
- u32 sz = pt->size;
- u32 sidx = size_idx_for_bytes(sz);
- if (pt->cls == ABI_CLASS_INT) {
- u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg});
- if (a->next_param_int < 8) {
- u32 src = a->next_param_int++;
- if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64)) {
- aa64_emit32(t->mc, aa64_fmov_d_x(dst, src));
- } else if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) {
- aa64_emit32(t->mc, aa64_fmov_s_w(dst, src));
- } else {
- u32 sf = (sz == 8) ? 1u : 0u;
- if (dst != src) aa64_emit32(t->mc, aa64_mov_reg(sf, dst, src));
- }
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- if (p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) ||
- p->type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) {
- aa64_emit_ldur_fp_off(t->mc, sidx, dst, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- } else {
- aa64_emit_ldur_off(t->mc, sidx, dst, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- }
- }
- } else if (pt->cls == ABI_CLASS_FP) {
- u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg});
- if (a->next_param_fp < 8) {
- u32 src = a->next_param_fp++;
- if (sz == 16) {
- if (dst != src) aa64_emit32(t->mc, aa64_mov_v16b(dst, src));
- } else {
- u32 type = (sz == 8) ? 1u : 0u;
- if (dst != src) aa64_emit32(t->mc, aa64_fmov_reg(type, dst, src));
- }
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += sz > 8 ? sz : 8;
- if (sz == 16)
- aa64_emit32(t->mc,
- aa64_ldur_q(dst, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off));
- else
- aa64_emit_ldur_fp_off(t->mc, sidx, dst, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- }
- } else {
- compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
- (int)pt->cls);
- }
- return st;
- }
- if (ai->kind == ABI_ARG_INDIRECT) {
- u32 ptr_reg;
- if (a->next_param_int < 8) {
- ptr_reg = a->next_param_int++;
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- aa64_emit_ldur_off(t->mc, 3, AA_TMP0, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- ptr_reg = AA_TMP0;
- }
- u32 nbytes = s->size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit_ldur_off(t->mc, 3, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
- aa64_emit_stur_off(t->mc, 3, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2);
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit_ldur_off(t->mc, 2, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
- aa64_emit_stur_off(t->mc, 2, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2);
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit_ldur_off(t->mc, 1, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
- aa64_emit_stur_off(t->mc, 1, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2);
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit_ldur_off(t->mc, 0, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
- aa64_emit_stur_off(t->mc, 0, AA_TMP1, 29, -(i32)s->off + (i32)i, AA_TMP2);
- i += 1;
- }
- return st;
- }
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- u32 part_off = pt->src_offset;
- u32 sz = pt->size;
- u32 sidx = size_idx_for_bytes(sz);
-
- if (pt->cls == ABI_CLASS_INT) {
- if (a->next_param_int < 8) {
- u32 reg = a->next_param_int++;
- aa64_emit_stur_off(t->mc, sidx, reg, 29, -(i32)s->off + (i32)part_off,
- AA_TMP0);
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29,
- -(i32)s->off + (i32)part_off, AA_TMP1);
- }
- } else if (pt->cls == ABI_CLASS_FP) {
- if (a->next_param_fp < 8) {
- u32 reg = a->next_param_fp++;
- if (sz == 16)
- aa64_emit32(t->mc,
- aa64_stur_q(reg, 29, -(i32)s->off + (i32)part_off));
- else
- aa64_emit_stur_fp_off(t->mc, sidx, reg, 29,
- -(i32)s->off + (i32)part_off, AA_TMP0);
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += sz > 8 ? sz : 8;
- if (sz == 16) {
- aa64_emit32(t->mc,
- aa64_ldur_q(AA_FP_TMP0, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off));
- aa64_emit32(
- t->mc, aa64_stur_q(AA_FP_TMP0, 29, -(i32)s->off + (i32)part_off));
- } else {
- aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, incoming_stack_base,
- incoming_stack_bias + (i32)caller_off, AA_TMP0);
- aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
- -(i32)s->off + (i32)part_off, AA_TMP0);
- }
- }
- } else {
- compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
- (int)pt->cls);
- }
- }
- return st;
-}
-
-/* ============================================================
- * Address materialization helpers
- * ============================================================ */
-
-static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
- return obj_symbol_extern_via_got(t->c, t->obj, sym);
-}
-
-void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(dst_reg));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0);
- u32 ldr_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0));
- mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0);
-}
-
-void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) {
- MCEmitter* mc = t->mc;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, dst_reg, sym);
- if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend);
- return;
- }
- u32 sec = mc->section_id;
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(dst_reg));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend,
- 0, 0);
- u32 add_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0));
- mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0,
- 0);
-}
-
-void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) {
- if (off == 0) {
- aa64_emit32(mc, aa64_mov_reg(1, Rd, base));
- return;
- }
- u32 abs_off = (off < 0) ? (u32)(-off) : (u32)off;
- if (abs_off <= 0xfff) {
- if (off < 0)
- aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0));
- else
- aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0));
- return;
- }
- if ((abs_off >> 24) == 0) {
- u32 hi = (abs_off >> 12) & 0xfff;
- u32 lo = abs_off & 0xfff;
- if (off < 0) {
- if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1));
- if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0));
- } else {
- if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1));
- if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? Rd : base, lo, 0));
- }
- return;
- }
- aa64_emit_load_imm(mc, 1, Rd, off);
- aa64_emit32(mc, aa64_add(1, Rd, base, Rd));
-}
-
-static int aa64_simm9_fits(i32 off) { return off >= -256 && off <= 255; }
-
-void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp) {
- if (aa64_simm9_fits(off)) {
- aa64_emit32(mc, aa64_ldur(size, Rt, Rn, off));
- return;
- }
- aa64_emit_addr_adjust(mc, tmp, Rn, off);
- aa64_emit32(mc, aa64_ldur(size, Rt, tmp, 0));
-}
-
-void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp) {
- if (aa64_simm9_fits(off)) {
- aa64_emit32(mc, aa64_stur(size, Rt, Rn, off));
- return;
- }
- aa64_emit_addr_adjust(mc, tmp, Rn, off);
- aa64_emit32(mc, aa64_stur(size, Rt, tmp, 0));
-}
-
-void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp) {
- if (aa64_simm9_fits(off)) {
- aa64_emit32(mc, aa64_ldur_fp(size, Rt, Rn, off));
- return;
- }
- aa64_emit_addr_adjust(mc, tmp, Rn, off);
- aa64_emit32(mc, aa64_ldur_fp(size, Rt, tmp, 0));
-}
-
-void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp) {
- if (aa64_simm9_fits(off)) {
- aa64_emit32(mc, aa64_stur_fp(size, Rt, Rn, off));
- return;
- }
- aa64_emit_addr_adjust(mc, tmp, Rn, off);
- aa64_emit32(mc, aa64_stur_fp(size, Rt, tmp, 0));
-}
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -1,355 +0,0 @@
-/* aarch64/internal.h — private types and forward decls shared across
- * emit.c / alloc.c / ops.c. NOT part of the public API. */
-#pragma once
-
-#include <string.h>
-
-#include "arch/aa64/asm.h"
-#include "arch/aa64/isa.h"
-#include "arch/aa64/regs.h"
-#include "arch/arch.h"
-#include "core/arena.h"
-#include "core/pool.h"
-#include "obj/obj.h"
-
-/* ============================================================
- * Local encoding helpers (kept here, not in aa64_isa.h).
- * ============================================================ */
-
-#define AA64_NOP 0xD503201Fu
-
-/* Hidden backend temporaries. These must stay outside the allocable pools and
- * outside optimizer scratch registers because CGTarget ops may clobber them
- * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */
-enum {
- AA_TMP0 = 9u,
- AA_TMP1 = 10u,
- AA_TMP2 = 11u,
- AA_FP_TMP0 = 31u,
-};
-#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k)))
-
-static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0xA9000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0xA9400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0x6D000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0x6D400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x38000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x38400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x3C000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x3C400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stur_q(u32 Rt, u32 Rn, i32 simm9) {
- return 0x3C800000u | (((u32)simm9 & 0x1ffu) << 12) | ((Rn & 0x1f) << 5) |
- (Rt & 0x1f);
-}
-static inline u32 aa64_ldur_q(u32 Rt, u32 Rn, i32 simm9) {
- return 0x3CC00000u | (((u32)simm9 & 0x1ffu) << 12) | ((Rn & 0x1f) << 5) |
- (Rt & 0x1f);
-}
-
-static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x39000000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x39400000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x3D000000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_str_q_uimm(u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> 4;
- return 0x3D800000u | ((sc & 0xfffu) << 10) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_mrs_tpidr_el0(u32 Rt) {
- return 0xD53BD040u | (Rt & 0x1fu);
-}
-static inline u32 aa64_b_base(void) { return 0x14000000u; }
-static inline u32 aa64_bl_base(void) { return 0x94000000u; }
-
-static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); }
-
-static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x3D400000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldr_q_uimm(u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> 4;
- return 0x3DC00000u | ((sc & 0xfffu) << 10) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) {
- return 0x1E204000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_mov_v16b(u32 Rd, u32 Rn) {
- return 0x4EA01C00u | ((Rn & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) {
- return 0x71000000u | (sf << 31) | ((imm12 & 0xfff) << 10) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_cset_eq(u32 sf, u32 Rd) {
- return 0x1A800400u | (sf << 31) | (31u << 16) | (0x1u << 12) | (31u << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E380000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E390000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E220000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E230000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) {
- return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) {
- return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) {
- return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) {
- return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) {
- return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) {
- return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) {
- return 0xCB206000u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
- return 0x6B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); }
-
-static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
- return 0x1A800400u | (sf << 31) | ((Rm & 0x1f) << 16) |
- ((cond & 0xfu) << 12) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) {
- return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u);
-}
-
-static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E202800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E203800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E200800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E201800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fneg(u32 type, u32 Rd, u32 Rn) {
- return 0x1E214000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_fcmp(u32 type, u32 Rn, u32 Rm) {
- return 0x1E202000u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5);
-}
-
-static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x13000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x53000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x33000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-/* ============================================================
- * AAImpl types
- * ============================================================ */
-
-#define AA_PROLOGUE_WORDS \
- 25u /* worst case: sub sp + str/str/add-add fp + sret + 10 int + 8 fp */
-#define AA_PROLOGUE_FRAME_WORDS \
- 6u /* worst-case frame adjust + split fp/lr saves + add-add fp */
-
-typedef struct AASlot {
- u32 off;
- u32 size;
- u32 align;
- u8 kind;
- u8 pad[3];
-} AASlot;
-
-typedef struct AAScope {
- u8 kind;
- u8 has_else;
- u8 pad[2];
- MCLabel else_label;
- MCLabel end_label;
- Label break_label;
- Label continue_label;
-} AAScope;
-
-typedef struct AAImpl {
- CGTarget base;
- SrcLoc loc;
- const CGFuncDesc* fd;
-
- u32 func_start;
- u32 prologue_pos;
- u32 prologue_words;
- u32 post_prologue_off; /* end-of-prologue offset within function, for CFI */
- MCLabel epilogue_label;
- u8 known_frame;
- u8 omit_frame;
- u8 pad0[2];
-
- AASlot* slots;
- u32 nslots;
- u32 slots_cap;
- u32 cum_off;
- u32 max_outgoing;
-
- u32 next_param_int;
- u32 next_param_fp;
- u32 next_param_stack;
- u8 has_sret;
- FrameSlot sret_ptr_slot;
-
- u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */
- u32 used_cs_fp_mask; /* bit reg set when d8-d15 must be preserved */
- u32 planned_cs_int_mask;
- u32 planned_cs_fp_mask;
- u8 has_planned_regs;
- u8 pad1[3];
-
- AAScope* scopes;
- u32 nscopes;
- u32 scopes_cap;
-
- u8 has_alloca;
- struct AAAllocaPatch {
- u32 pos;
- u32 dst_reg;
- }* add_patches;
- u32 nadd_patches;
- u32 add_patches_cap;
-
- u8 is_variadic;
- FrameSlot gp_save_slot;
- FrameSlot fp_save_slot;
-} AAImpl;
-
-/* ============================================================
- * Cross-file forward declarations
- * ============================================================ */
-
-/* emit.c helpers used in alloc.c / ops.c */
-void aa64_emit32(MCEmitter* mc, u32 word);
-void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word);
-void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm);
-void emit_sp_add(MCEmitter* mc, u32 imm);
-void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off);
-void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp);
-void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp);
-void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp);
-void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
- u32 tmp);
-void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym);
-void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
-
-/* emit.c public surface */
-FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
-void aa_func_begin(CGTarget* t, const CGFuncDesc* fd);
-void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd,
- const CGKnownFrameDesc* frame,
- FrameSlot* out_slots);
-void aa_func_end(CGTarget* t);
-CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p);
-
-/* alloc.c helpers used in emit.c / ops.c */
-AAImpl* impl_of(CGTarget* t);
-AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs);
-void aa_jump(CGTarget* t, Label l);
-
-/* ops.c helpers used in alloc.c */
-void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma);
-void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma);
-u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch);
-
-/* alloc.c helpers used in ops.c */
-void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op);
-void aa_alloc_vtable_init(CGTarget* t);
-void aa_coord_vtable_init(CGTarget* t);
-
-/* shared type helpers (defined in emit.c, used broadly) */
-int type_is_64(CfreeCgTypeId t);
-int type_is_fp_double(CfreeCgTypeId t);
-int type_is_signed(CfreeCgTypeId t);
-u32 type_byte_size(CfreeCgTypeId t);
-u32 size_idx_for_bytes(u32 nbytes);
-u32 reg_num(Operand op);
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -0,0 +1,3226 @@
+/* aa64 NativeTarget production-readiness checklist:
+ * - ABI completeness: finish AAPCS64/Linux va_list and register-save-area
+ * lowering, verify Apple/AAPCS64/Windows arm64 differences, handle all
+ * homogeneous aggregates, indirect/byval/sret corner cases, small aggregate
+ * splitting, multi-register returns, stack alignment, and ABI diagnostics.
+ * - Calls and returns: replace call-plus-return tail handling with true direct
+ * and indirect sibling calls, preserve musttail ABI guarantees, support stack
+ * argument reshuffling without clobbering live inputs, and cover all sret,
+ * variadic, FP, aggregate, and many-argument combinations.
+ * - Frame lowering: implement known-frame/prologue integration for optimized
+ * emission, spill/reload hooks, callee-save tracking for integer and FP/SIMD
+ * registers, large-frame probing/materialization as needed by each platform,
+ * dynamic alloca restoration, and unwind/debug frame metadata.
+ * - Operations and intrinsics: fill remaining scalar, FP, conversion, rounding,
+ * overflow, bit, vector/SIMD, trap, prefetch, and target-specific intrinsics;
+ * validate NaN/ordered/unordered FP compare semantics and integer narrowing
+ * behavior for every supported width.
+ * - Aggregates and memory: support large constants, overlap-safe memmove,
+ * optimized bulk copy/set selection, bitfield load/store, packed/unaligned
+ * accesses, volatile access constraints, and record/slice edge cases across
+ * direct and optimized lowering.
+ * - Atomics: replace ordinary load/store RMW/CAS sequences with correct LL/SC
+ * or LSE loops, implement acquire/release/seq_cst mappings precisely, handle
+ * failure ordering, byte/halfword/word/dword widths, and retry/clobber rules.
+ * - Inline and file-scope asm: complete register/memory/immediate constraints,
+ * named operands, tied operands, early-clobber and clobber validation, hard
+ * register conflicts, memory barriers, outputs for aggregates/FP values, and
+ * file-scope asm integration. */
+
+#include "arch/aa64/aa64.h"
+
+#include <string.h>
+
+#include "arch/aa64/asm.h"
+#include "arch/aa64/isa.h"
+#include "arch/aa64/regs.h"
+#include "abi/abi.h"
+#include "asm/asm.h"
+#include "asm/asm_lex.h"
+#include "cg/native_direct_target.h"
+#include "cg/type.h"
+#include "core/arena.h"
+#include "core/bytes.h"
+#include "core/pool.h"
+#include "core/slice.h"
+#include "obj/obj.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define AA_UNUSED_FN __attribute__((unused))
+#else
+#define AA_UNUSED_FN
+#endif
+
+enum {
+ AA_TMP0 = 16u, /* scratch register number (x16 by AArch64 numbering) */
+ AA_TMP1 = 17u, /* second scratch register number (x17) */
+ AA_FP = 29u, /* frame pointer register number */
+ AA_LR = 30u, /* link register number */
+ AA_SP = 31u, /* stack pointer register number */
+ AA_FRAME_SAVE_SIZE = 16u, /* presumably bytes for the saved FP/LR pair -- confirm */
+ AA_PROLOGUE_WORDS = 24u, /* presumably 32-bit words reserved for the prologue -- confirm */
+ AA_TAIL_WORDS = 16u, /* presumably 32-bit words reserved per tail site -- confirm */
+};
+
+typedef struct AANativeSlot {
+ u32 off; /* slot offset within the frame; base/sign convention not visible here */
+ u32 size; /* presumably the slot size in bytes */
+ u32 align; /* presumably the slot alignment in bytes */
+ u8 kind; /* slot kind tag; its enum is not visible in this chunk */
+ u8 pad[3]; /* explicit padding so kind + pad occupy four bytes */
+} AANativeSlot;
+
+typedef struct AATailSite {
+ u32 pos; /* presumably the code offset of the tail-call site -- confirm */
+ NativeLoc callee; /* presumably where the call target lives -- confirm */
+} AATailSite;
+
+typedef struct AAAllocaPatch {
+ u32 pos; /* presumably the code offset of the instruction to patch -- confirm */
+ u32 dst_reg; /* presumably the destination register of that instruction -- confirm */
+} AAAllocaPatch;
+
+typedef struct AANativeTarget {
+ NativeTarget base; /* must stay first: aa_of() casts NativeTarget* to this type */
+ SrcLoc loc; /* location aa_panic() reports errors at */
+ const CGFuncDesc* func; /* presumably the function currently being emitted */
+
+ AANativeSlot* slots; /* frame-slot records; presumably nslots used of slots_cap */
+ u32 nslots;
+ u32 slots_cap;
+ u32 cum_off; /* presumably the running frame offset handed to new slots */
+ u32 max_outgoing; /* presumably the largest outgoing-argument area seen */
+ u32 incoming_stack_size;
+ u32 next_param_int; /* next_param_*: presumably incoming-argument cursors */
+ u32 next_param_fp;
+ u32 next_param_stack;
+ NativeFrameSlot sret_ptr_slot;
+ NativeFrameSlot saved_tmp_slot;
+ NativeFrameSlot va_gr_slot;
+ NativeFrameSlot va_vr_slot;
+
+ AATailSite* tail_sites; /* presumably ntail_sites used of tail_sites_cap */
+ u32 ntail_sites;
+ u32 tail_sites_cap;
+ AAAllocaPatch* alloca_patches; /* presumably nalloca_patches used of alloca_patches_cap */
+ u32 nalloca_patches;
+ u32 alloca_patches_cap;
+
+ u32 func_start;
+ u32 prologue_pos;
+ MCLabel epilogue_label;
+} AANativeTarget;
+
+static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; } /* downcast; `base` is the first member of AANativeTarget */
+
+static void aa_panic(AANativeTarget* a, const char* msg) { /* report msg at a->loc */
+ compiler_panic(a->base.c, a->loc, "aarch64 native target: %s", msg);
+}
+
+static void aa_emit32(MCEmitter* mc, u32 word) { /* append one 32-bit word to mc */
+ u8 b[4];
+ wr_u32_le(b, word); /* serialize with wr_u32_le (little-endian, per its name) */
+ mc->emit_bytes(mc, b, sizeof b);
+}
+
+static void aa_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
+ u8 b[4];
+ wr_u32_le(b, word); /* same byte order as aa_emit32 */
+ obj_patch(obj, sec, off, b, sizeof b); /* rewrite 4 bytes at `off` in section `sec` */
+}
+
+/* Round v up to a multiple of align (a power of two); align 0 returns v. */
+static u32 align_up_u32(u32 v, u32 align) {
+  return align ? (v + (align - 1u)) & ~(align - 1u) : v;
+}
+
+/* Scalar byte size of `type`; a zero id or zero size maps to 8, >16 panics. */
+static u32 type_size32(NativeTarget* t, CfreeCgTypeId type) {
+  u64 bytes = type ? cg_type_size(t->c, type) : 0u;
+  if (bytes > 16u) compiler_panic(t->c, (SrcLoc){0, 0, 0},
+                                  "aarch64 native target: scalar too large");
+  return bytes ? (u32)bytes : 8u;
+}
+
+/* Alignment of `type` clamped to [1, 16]; a zero type id yields 8. */
+static u32 type_align32(NativeTarget* t, CfreeCgTypeId type) {
+  u64 al = type ? cg_type_align(t->c, type) : 8u;
+  if (al == 0) return 1u;
+  return al > 16u ? 16u : (u32)al;
+}
+
+/* Size index for an n-byte access: 0..1 -> 0, 2 -> 1, 3..4 -> 2, larger -> 3
+ * (ceil(log2(n)) capped at 3). */
+static u32 size_idx(u32 n) {
+  u32 idx = (u32)((n > 1u) + (n > 2u) + (n > 4u));
+  return idx;
+}
+
+static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; } /* low 5 bits of the register field */
+
+static int loc_is_64(NativeTarget* t, NativeLoc loc) { /* 8-byte type or pointer type */
+ return type_size32(t, loc.type) == 8u || cg_type_is_ptr(t->c, loc.type);
+}
+
+static int loc_is_fp(NativeLoc loc) { /* nonzero when loc's class is NATIVE_REG_FP */
+ return (NativeAllocClass)loc.cls == NATIVE_REG_FP;
+}
+
+/* Result of obj_symbol_extern_via_got() for `sym`: nonzero means use the GOT. */
+static AA_UNUSED_FN int aa_use_got_for_sym(NativeTarget* t, ObjSymId sym) {
+  return obj_symbol_extern_via_got(t->c, t->obj, sym);
+}
+
+/* LO12 reloc for size index 0..3 (8/16/32/64-bit); anything else -> 64-bit. */
+static AA_UNUSED_FN RelocKind aa_ldst_reloc_for_size(u32 size) {
+  switch (size) {
+    case 0:
+      return R_AARCH64_LDST8_ABS_LO12_NC;
+    case 1:
+      return R_AARCH64_LDST16_ABS_LO12_NC;
+    case 2:
+      return R_AARCH64_LDST32_ABS_LO12_NC;
+    case 3: /* shares the 64-bit answer with the default */
+    default:
+      return R_AARCH64_LDST64_ABS_LO12_NC;
+  }
+}
+
+/* Encode "rd = imm" as one MOVZ plus MOVKs into out[0..cap). Returns the word
+ * count, or 0 when cap is too small. Zero halfwords are skipped; imm 0 -> MOVZ. */
+static u32 aa_load_imm_words(u32* out, u32 cap, u32 sf, u32 rd, i64 imm) {
+  u64 v = (u64)imm;
+  u32 words = sf ? 4u : 2u, n = 0; /* halfwords in a 64- or 32-bit register */
+  for (u32 i = 0; i < words; ++i) {
+    u32 part = (u32)((v >> (i * 16u)) & 0xffffu);
+    if (!part) continue; /* MOVZ already zeroes every other halfword */
+    if (n >= cap) return 0;
+    out[n] = n ? aa64_movk(sf, rd, part, i) : aa64_movz(sf, rd, part, i);
+    ++n;
+  }
+  if (n) return n;
+  if (!cap) return 0;
+  out[0] = aa64_movz(sf, rd, 0, 0); /* all halfwords were zero */
+  return 1u;
+}
+
+static void aa_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) {
+  u32 seq[4]; /* aa_load_imm_words() produces at most four words */
+  const u32* end = seq + aa_load_imm_words(seq, 4u, sf, rd, imm);
+  for (const u32* w = seq; w != end; ++w) aa_emit32(mc, *w);
+}
+
+/* rd = rn + off: one ADD/SUB immediate when encodable, else load imm + ADD. */
+static void aa_emit_add_imm(AANativeTarget* a, u32 rd, u32 rn, i32 off) {
+  MCEmitter* em = a->base.mc;
+  u32 field = 0, shift = 0;
+  i64 mag = off < 0 ? -(i64)off : (i64)off;
+  if (aa64_addsub_imm_fits(mag, &field, &shift)) {
+    aa_emit32(em, off < 0 ? aa64_sub_imm(1, rd, rn, field, shift)
+                          : aa64_add_imm(1, rd, rn, field, shift));
+    return;
+  }
+  /* NOTE(review): rd doubles as the scratch, so rd == rn would clobber rn. */
+  aa_emit_load_imm(em, 1, rd, off);
+  aa_emit32(em, aa64_add(1, rd, rn, rd));
+}
+
+/* rd = rn + off (64-bit): ADD/SUB immediate when encodable, else imm + ADD. */
+static AA_UNUSED_FN void aa_emit_add_i64(AANativeTarget* a, u32 rd, u32 rn,
+                                         i64 off) {
+  u32 imm12, sh;
+  MCEmitter* mc = a->base.mc;
+  /* Range-check before negating: -off is UB for the minimum i64; -1 = none. */
+  i64 mag = off >= 0 ? off : off >= -(i64)0xfff000 ? -off : -1;
+  if (mag >= 0 && aa64_addsub_imm_fits(mag, &imm12, &sh)) {
+    aa_emit32(mc, off < 0 ? aa64_sub_imm(1, rd, rn, imm12, sh)
+                          : aa64_add_imm(1, rd, rn, imm12, sh));
+    return;
+  }
+  aa_emit_load_imm(mc, 1, rd, off); /* NOTE(review): clobbers rn if rd == rn */
+  aa_emit32(mc, aa64_add(1, rd, rn, rd));
+}
+
+/* LDUR-form load word (signed 9-bit offset); `v` is forwarded as the V bit. */
+static u32 aa_ldur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) {
+  AA64LdStSimm9 f = {.size = size, .V = v, .opc = AA64_LDST_OPC_LDR};
+  f.imm9 = (u32)simm9 & 0x1ffu; /* keep the low 9 bits of the offset */
+  f.Rn = rn;
+  f.Rt = rt;
+  return aa64_ldst_simm9_pack(f);
+}
+
+/* STUR-form store word (signed 9-bit offset); `v` is forwarded as the V bit. */
+static u32 aa_stur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) {
+  AA64LdStSimm9 f = {.size = size, .V = v, .opc = AA64_LDST_OPC_STR};
+  f.imm9 = (u32)simm9 & 0x1ffu; /* keep the low 9 bits of the offset */
+  f.Rn = rn;
+  f.Rt = rt;
+  return aa64_ldst_simm9_pack(f);
+}
+
+/* LDR word with an unsigned scaled offset; `v` is forwarded as the V bit.
+ * byte_off is shifted right by `size`, so misaligned low bits are dropped. */
+static u32 aa_ldr_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) {
+  AA64LdStUimm f = {.size = size, .V = v, .opc = AA64_LDST_OPC_LDR};
+  f.imm12 = byte_off >> size; /* bytes -> units of (1 << size) */
+  f.Rn = rn;
+  f.Rt = rt;
+  return aa64_ldst_uimm_pack(f);
+}
+
+ /* Pack a store (STR opcode) in the unsigned-immediate form.  The imm12 field
+  * is `byte_off` shifted right by `size`, so the low `size` bits of the byte
+  * offset are discarded. */
+ static u32 aa_str_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) {
+   AA64LdStUimm f = {
+       .size = size, .V = v, .opc = AA64_LDST_OPC_STR, .Rn = rn, .Rt = rt};
+   f.imm12 = byte_off >> size;
+   return aa64_ldst_uimm_pack(f);
+}
+
+ /* Unsigned-immediate load with the V field cleared. */
+ static u32 aa_ldr_uimm(u32 size, u32 rt, u32 rn, u32 byte_off) {
+   const u32 v_field = 0;
+   return aa_ldr_uimm_v(size, v_field, rt, rn, byte_off);
+}
+
+ /* Unsigned-immediate store with the V field cleared. */
+ static __attribute__((unused)) u32 aa_str_uimm(u32 size, u32 rt, u32 rn,
+                                                u32 byte_off) {
+   const u32 v_field = 0;
+   return aa_str_uimm_v(size, v_field, rt, rn, byte_off);
+}
+
+ /* Encode a register-offset load/store: [rn, rm] with option field 3 and the
+  * S bit taken from `scaled`.  `load` selects the LDR or STR opcode and `v`
+  * feeds bit 26. */
+ static __attribute__((unused)) u32 aa_ldst_regoff_v(u32 size, u32 v, u32 load,
+                                                     u32 rt, u32 rn, u32 rm,
+                                                     u32 scaled) {
+   u32 insn = ((size & 3u) << 30) | 0x38200800u;
+   insn |= (v & 1u) << 26;
+   insn |= (load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR) << 22;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= 3u << 13;
+   insn |= (scaled & 1u) << 12;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode a 64-bit literal load of `rt`; `imm19` is masked to 19 bits. */
+ static __attribute__((unused)) u32 aa_ldr_lit64(u32 rt, u32 imm19) {
+   u32 insn = 0x58000000u;
+   insn |= (imm19 & 0x7ffffu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode MRS <rt>, TPIDR_EL0. */
+ static __attribute__((unused)) u32 aa_mrs_tpidr_el0(u32 rt) {
+   const u32 opcode = 0xd53bd040u;
+   return opcode | (rt & 0x1fu);
+}
+
+ /* Encode a two-source scalar FP instruction.  `op` carries the
+  * operation-selecting bits and is OR-ed in unmasked; `is_double` picks the
+  * double-precision base opcode. */
+ static u32 aa_fp_bin(u32 op, u32 is_double, u32 rd, u32 rn, u32 rm) {
+   u32 insn = is_double ? 0x1e600000u : 0x1e200000u;
+   insn |= op;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FCMP <rn>, <rm>; `is_double` selects the double-precision form. */
+ static u32 aa_fcmp(u32 is_double, u32 rn, u32 rm) {
+   u32 insn = is_double ? 0x1e602000u : 0x1e202000u;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   return insn;
+}
+
+ /* Encode FNEG <rd>, <rn>; `is_double` selects the double-precision form. */
+ static u32 aa_fneg(u32 is_double, u32 rd, u32 rn) {
+   u32 insn = is_double ? 0x1e614000u : 0x1e214000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode an FP-register-to-FP-register FMOV; `is_double` selects the
+  * double-precision form. */
+ static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) {
+   u32 insn = is_double ? 0x1e604000u : 0x1e204000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode SCVTF (signed integer to FP).  `is64_src` picks the 64-bit source
+  * opcode; `is_double_dst` sets bit 22. */
+ static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
+   u32 insn = is64_src ? 0x9e220000u : 0x1e220000u;
+   if (is_double_dst) insn |= 0x00400000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= fd & 0x1fu;
+   return insn;
+}
+
+ /* Encode UCVTF (unsigned integer to FP).  `is64_src` picks the 64-bit source
+  * opcode; `is_double_dst` sets bit 22. */
+ static u32 aa_ucvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
+   u32 insn = is64_src ? 0x9e230000u : 0x1e230000u;
+   if (is_double_dst) insn |= 0x00400000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= fd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FCVTZS (FP to signed integer).  `is64_dst` picks the 64-bit
+  * destination opcode; `is_double_src` sets bit 22. */
+ static u32 aa_fcvtzs(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
+   u32 insn = is64_dst ? 0x9e380000u : 0x1e380000u;
+   if (is_double_src) insn |= 0x00400000u;
+   insn |= (fn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FCVTZU (FP to unsigned integer).  `is64_dst` picks the 64-bit
+  * destination opcode; `is_double_src` sets bit 22. */
+ static u32 aa_fcvtzu(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
+   u32 insn = is64_dst ? 0x9e390000u : 0x1e390000u;
+   if (is_double_src) insn |= 0x00400000u;
+   insn |= (fn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FCVT from single precision (rn) to double precision (rd). */
+ static u32 aa_fcvt_d_s(u32 rd, u32 rn) {
+   u32 insn = 0x1e22c000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FCVT from double precision (rn) to single precision (rd). */
+ static u32 aa_fcvt_s_d(u32 rd, u32 rn) {
+   u32 insn = 0x1e624000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FMOV from general register `rn` to FP register `fd`; `is64`
+  * selects the 64-bit form. */
+ static u32 aa_fmov_gpr_to_fp(u32 is64, u32 fd, u32 rn) {
+   u32 insn = is64 ? 0x9e670000u : 0x1e270000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= fd & 0x1fu;
+   return insn;
+}
+
+ /* Encode FMOV from FP register `fn` to general register `rd`; `is64`
+  * selects the 64-bit form. */
+ static u32 aa_fmov_fp_to_gpr(u32 is64, u32 rd, u32 fn) {
+   u32 insn = is64 ? 0x9e660000u : 0x1e260000u;
+   insn |= (fn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode CLZ <rd>, <rn>; `sf` selects the 64-bit form. */
+ static u32 aa_clz(u32 sf, u32 rd, u32 rn) {
+   u32 insn = sf ? 0xdac01000u : 0x5ac01000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode RBIT <rd>, <rn>; `sf` selects the 64-bit form. */
+ static u32 aa_rbit(u32 sf, u32 rd, u32 rn) {
+   u32 insn = sf ? 0xdac00000u : 0x5ac00000u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode REV <rd>, <rn>; `sf` selects the 64-bit form, which uses a
+  * different opcode constant from the 32-bit one. */
+ static u32 aa_rev(u32 sf, u32 rd, u32 rn) {
+   u32 insn = sf ? 0xdac00c00u : 0x5ac00800u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode SBFM <rd>, <rn>, #immr, #imms; both immediates are masked to six
+  * bits and `sf` selects the 64-bit form. */
+ static u32 aa_sbfm(u32 sf, u32 rd, u32 rn, u32 immr, u32 imms) {
+   u32 insn = sf ? 0x93400000u : 0x13000000u;
+   insn |= (immr & 0x3fu) << 16;
+   insn |= (imms & 0x3fu) << 10;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode UBFM <rd>, <rn>, #immr, #imms; both immediates are masked to six
+  * bits and `sf` selects the 64-bit form. */
+ static __attribute__((unused)) u32 aa_ubfm(u32 sf, u32 rd, u32 rn, u32 immr,
+                                            u32 imms) {
+   u32 insn = sf ? 0xd3400000u : 0x53000000u;
+   insn |= (immr & 0x3fu) << 16;
+   insn |= (imms & 0x3fu) << 10;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode LDAXR <rt>, [rn].  `size` is shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_ldaxr(u32 size, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x085ffc00u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode LDXR <rt>, [rn].  `size` is shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_ldxr(u32 size, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x085f7c00u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode STLXR <rs>, <rt>, [rn]; `rs` goes in bits 16..20.  `size` is
+  * shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_stlxr(u32 size, u32 rs, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x0800fc00u;
+   insn |= (rs & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode STXR <rs>, <rt>, [rn]; `rs` goes in bits 16..20.  `size` is
+  * shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_stxr(u32 size, u32 rs, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x08007c00u;
+   insn |= (rs & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode LDAR <rt>, [rn].  `size` is shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_ldar(u32 size, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x08dffc00u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode STLR <rt>, [rn].  `size` is shifted into bits 30..31 unmasked. */
+ static __attribute__((unused)) u32 aa_stlr(u32 size, u32 rt, u32 rn) {
+   u32 insn = (size << 30) | 0x089ffc00u;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rt & 0x1fu;
+   return insn;
+}
+
+ /* Encode UMADDL <rd>, <rn>, <rm>, <ra>. */
+ static u32 aa_umaddl(u32 rd, u32 rn, u32 rm, u32 ra) {
+   u32 insn = 0x9ba00000u;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (ra & 0x1fu) << 10;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode SMADDL <rd>, <rn>, <rm>, <ra>. */
+ static u32 aa_smaddl(u32 rd, u32 rn, u32 rm, u32 ra) {
+   u32 insn = 0x9b200000u;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (ra & 0x1fu) << 10;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode SMULH <rd>, <rn>, <rm>. */
+ static u32 aa_smulh(u32 rd, u32 rn, u32 rm) {
+   u32 insn = 0x9b407c00u;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Encode UMULH <rd>, <rn>, <rm>. */
+ static u32 aa_umulh(u32 rd, u32 rn, u32 rm) {
+   u32 insn = 0x9bc07c00u;
+   insn |= (rm & 0x1fu) << 16;
+   insn |= (rn & 0x1fu) << 5;
+   insn |= rd & 0x1fu;
+   return insn;
+}
+
+ /* Pack a shifted-register add/sub with op = 1 and S = 1 (a flag-setting
+  * subtract); the shift and imm6 fields are left at zero. */
+ static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) {
+   AA64AddSubSR f = {.sf = sf, .op = 1, .S = 1};
+   f.Rm = rm;
+   f.Rn = rn;
+   f.Rd = rd;
+   return aa64_addsubsr_pack(f);
+}
+
+ /* Pack a 64-bit shifted-register ADD with shift type 0 and the shift amount
+  * `shift` placed in the imm6 field. */
+ static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) {
+   AA64AddSubSR f = {.sf = 1, .op = 0, .S = 0, .shift = 0};
+   f.Rm = rm;
+   f.imm6 = shift;
+   f.Rn = rn;
+   f.Rd = rd;
+   return aa64_addsubsr_pack(f);
+}
+
+ /* Build CSET as CSINC rd, ZR, ZR with the lowest bit of `cond` flipped. */
+ static u32 aa_cset(u32 sf, u32 rd, u32 cond) {
+   u32 flipped = cond ^ 1u;
+   return aa64_csinc_enc(sf, rd, AA64_ZR, AA64_ZR, flipped);
+}
+
+ /* Map a comparison operator to the 4-bit condition code used by conditional
+  * branches.  Operators not listed map to 0x0, the same value as CMP_EQ. */
+ static u32 cmp_cond(CmpOp op) {
+   u32 cc = 0x0u;
+   switch (op) {
+     case CMP_EQ:
+       cc = 0x0u;
+       break;
+     case CMP_NE:
+       cc = 0x1u;
+       break;
+     case CMP_GE_U:
+       cc = 0x2u;
+       break;
+     case CMP_LT_U:
+       cc = 0x3u;
+       break;
+     case CMP_LT_F:
+       cc = 0x4u;
+       break;
+     case CMP_GT_U:
+       cc = 0x8u;
+       break;
+     case CMP_LE_U:
+     case CMP_LE_F:
+       cc = 0x9u;
+       break;
+     case CMP_GE_S:
+     case CMP_GE_F:
+       cc = 0xau;
+       break;
+     case CMP_LT_S:
+       cc = 0xbu;
+       break;
+     case CMP_GT_S:
+     case CMP_GT_F:
+       cc = 0xcu;
+       break;
+     case CMP_LE_S:
+       cc = 0xdu;
+       break;
+     default:
+       break;
+   }
+   return cc;
+}
+
+ /* Look up the record for a frame-slot handle.  Handles are 1-based indices
+  * into a->slots; NATIVE_FRAME_SLOT_NONE or a handle above a->nslots
+  * panics. */
+ static AANativeSlot* aa_slot(AANativeTarget* a, NativeFrameSlot slot) {
+   int valid = slot != NATIVE_FRAME_SLOT_NONE && slot <= a->nslots;
+   if (!valid) aa_panic(a, "bad frame slot");
+   return a->slots + (slot - 1u);
+}
+
+ /* Resolve the base part of `addr` into a register and a byte offset.
+  *
+  *   REG    -> the register itself, offset = addr.offset
+  *   FRAME  -> AA_FP, offset = addr.offset minus the slot's frame offset
+  *   GLOBAL -> the address is loaded into AA_TMP0 through the load_addr
+  *             hook (which also receives addr.offset), offset = 0
+  *
+  * Any other base kind panics.  The index is never folded in here: callers
+  * apply it themselves.  The GLOBAL case therefore strips the index from the
+  * copy handed to load_addr, which would otherwise add the index once there
+  * and a second time in the caller's indexed addressing form.
+  * NOTE(review): an index already held in AA_TMP0 is still overwritten by
+  * the GLOBAL case -- confirm callers never pass one. */
+ static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out,
+                          i32* off_out) {
+   *base_out = AA_TMP0;
+   *off_out = addr.offset;
+   switch ((NativeAddrBaseKind)addr.base_kind) {
+     case NATIVE_ADDR_BASE_REG:
+       *base_out = addr.base.reg;
+       return;
+     case NATIVE_ADDR_BASE_FRAME: {
+       AANativeSlot* s = aa_slot(a, addr.base.frame);
+       *base_out = AA_FP;
+       *off_out = -(i32)s->off + addr.offset;
+       return;
+     }
+     case NATIVE_ADDR_BASE_GLOBAL: {
+       NativeLoc tmp;
+       NativeAddr base_only = addr;
+       /* Only the base is wanted; leave the index to the caller. */
+       base_only.index_kind = NATIVE_ADDR_INDEX_NONE;
+       memset(&tmp, 0, sizeof tmp);
+       tmp.kind = NATIVE_LOC_REG;
+       tmp.cls = NATIVE_REG_INT;
+       tmp.type = builtin_id(CFREE_CG_BUILTIN_I64);
+       tmp.v.reg = AA_TMP0;
+       a->base.load_addr(&a->base, tmp, base_only);
+       *base_out = AA_TMP0;
+       *off_out = 0;
+       return;
+     }
+     default:
+       aa_panic(a, "unsupported address base");
+   }
+}
+
+static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off);
+static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off);
+
+ /* Emit a 16-byte (Q register) load or store of `reg` at `addr`.
+  * Indexed addresses are rejected.  The displacement is encoded, in order of
+  * preference, as a 16-byte-scaled unsigned immediate, as a signed 9-bit
+  * immediate, or by first computing the address into AA_TMP1. */
+ static void aa_emit_mem_q(AANativeTarget* a, int load, NativeLoc reg,
+                           NativeAddr addr) {
+   MCEmitter* out = a->base.mc;
+   u32 base_reg, qreg;
+   i32 disp;
+   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
+     aa_panic(a, "unsupported q-register indexed memory access");
+   aa_addr_base(a, addr, &base_reg, &disp);
+   qreg = loc_reg(reg);
+   if (disp >= 0 && ((u32)disp & 15u) == 0 && ((u32)disp >> 4) <= 0xfffu) {
+     aa_emit32(out, aa_ldst_q_uimm(load, qreg, base_reg, (u32)disp));
+   } else if (disp >= -256 && disp <= 255) {
+     aa_emit32(out, aa_ldst_q_simm9(load, qreg, base_reg, disp));
+   } else {
+     aa_emit_add_imm(a, AA_TMP1, base_reg, disp);
+     aa_emit32(out, aa_ldst_q_uimm(load, qreg, AA_TMP1, 0));
+   }
+}
+
+ /* Emit one load (load != 0) or store (load == 0) of register `reg` at
+  * `addr`.
+  *
+  * The access width comes from mem.size when non-zero, otherwise from the
+  * size of reg.type (or mem.type when reg.type is unset).  FP accesses of
+  * 16 bytes are delegated to aa_emit_mem_q().  AA_TMP0 / AA_TMP1 are used as
+  * scratch for global addresses and for displacements that do not fit an
+  * immediate form. */
+ static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg,
+ NativeAddr addr, MemAccess mem) {
+ u32 base, rt, sz;
+ i32 off;
+ MCEmitter* mc = a->base.mc;
+ rt = loc_reg(reg);
+ /* sz is used below both as the instruction's size field and as a log2
+  * byte width (see the alignment mask in the immediate path). */
+ sz = size_idx(mem.size ? mem.size
+ : type_size32(&a->base, reg.type ? reg.type
+ : mem.type));
+ if (loc_is_fp(reg) && (mem.size ? mem.size
+ : type_size32(&a->base, reg.type
+ ? reg.type
+ : mem.type)) ==
+ 16u) {
+ aa_emit_mem_q(a, load, reg, addr);
+ return;
+ }
+ /* FP accesses narrower than size index 2 are widened to index 2. */
+ if (loc_is_fp(reg) && sz < 2u) sz = 2u;
+ /* Un-indexed global: address the symbol directly with relocations. */
+ if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
+ addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
+ i64 addend = addr.base.global.addend + (i64)addr.offset;
+ /* A store whose source is AA_TMP0 must not have it overwritten by the
+  * address computation, so AA_TMP1 holds the address in that case. */
+ u32 scratch = (!load && rt == AA_TMP0) ? AA_TMP1 : AA_TMP0;
+ u32 pos = mc->pos(mc);
+ if (aa_use_got_for_sym(&a->base, addr.base.global.sym)) {
+ /* GOT path: ADRP + LDR fetch the symbol address (relocations carry a
+  * zero addend); the addend is then added explicitly. */
+ aa_emit32(mc, aa64_adrp(scratch, 0, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_GOT_PAGE,
+ addr.base.global.sym, 0, 0, 0);
+ pos = mc->pos(mc);
+ aa_emit32(mc, aa_ldr_uimm(3, scratch, scratch, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_LD64_GOT_LO12_NC,
+ addr.base.global.sym, 0, 0, 0);
+ if (addend) aa_emit_add_i64(a, scratch, scratch, addend);
+ aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, scratch, 0)
+ : aa_stur_v(sz, loc_is_fp(reg), rt, scratch, 0));
+ return;
+ }
+ /* Direct path: ADRP for the page, then the access itself carries the
+  * low-12-bit relocation chosen by aa_ldst_reloc_for_size(). */
+ aa_emit32(mc, aa64_adrp(scratch, 0, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21,
+ addr.base.global.sym, addend, 0, 0);
+ pos = mc->pos(mc);
+ aa_emit32(mc, load ? aa_ldr_uimm_v(sz, loc_is_fp(reg), rt, scratch, 0)
+ : aa_str_uimm_v(sz, loc_is_fp(reg), rt, scratch, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, aa_ldst_reloc_for_size(sz),
+ addr.base.global.sym, addend, 0, 0);
+ return;
+ }
+ aa_addr_base(a, addr, &base, &off);
+ /* Indexed access: only a register index is supported, and the scale must
+  * be either 0 or equal to the access size index. */
+ if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) {
+ u32 use_base = base;
+ u32 scaled = 0;
+ if (addr.index_kind != NATIVE_ADDR_INDEX_REG)
+ aa_panic(a, "unsupported address index");
+ /* A non-zero displacement is folded into AA_TMP1 first. */
+ if (off) {
+ use_base = AA_TMP1;
+ aa_emit_add_imm(a, use_base, base, off);
+ }
+ if (addr.log2_scale == 0) {
+ scaled = 0;
+ } else if (addr.log2_scale == sz) {
+ scaled = 1;
+ } else {
+ aa_panic(a, "unsupported memory address scale");
+ }
+ aa_emit32(mc, aa_ldst_regoff_v(sz, loc_is_fp(reg), load, rt, use_base,
+ addr.index.reg, scaled));
+ return;
+ }
+ /* Non-negative, size-aligned displacement whose scaled value fits 12 bits:
+  * unsigned-immediate form. */
+ if (off >= 0 && (((u32)off & ((1u << sz) - 1u)) == 0) &&
+ ((u32)off >> sz) <= 0xfffu) {
+ aa_emit32(mc, load ? aa_ldr_uimm_v(sz, loc_is_fp(reg), rt, base, (u32)off)
+ : aa_str_uimm_v(sz, loc_is_fp(reg), rt, base, (u32)off));
+ return;
+ }
+ /* Small signed displacement: signed 9-bit immediate form. */
+ if (off >= -256 && off <= 255) {
+ aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, base, off)
+ : aa_stur_v(sz, loc_is_fp(reg), rt, base, off));
+ return;
+ }
+ /* Anything else: compute the address into AA_TMP1 and access it at 0. */
+ aa_emit_add_imm(a, AA_TMP1, base, off);
+ aa_emit32(mc, load ? aa_ldur_v(sz, loc_is_fp(reg), rt, AA_TMP1, 0)
+ : aa_stur_v(sz, loc_is_fp(reg), rt, AA_TMP1, 0));
+}
+
+ /* Pick the register class for a value of `type`: FP for float types of at
+  * most 8 bytes, integer for everything else (including a zero type id). */
+ static NativeAllocClass aa_class_for_type(NativeTarget* t,
+                                           CfreeCgTypeId type) {
+   if (!type) return NATIVE_REG_INT;
+   if (!cg_type_is_float(t->c, type)) return NATIVE_REG_INT;
+   if (!(cg_type_size(t->c, type) <= 8u)) return NATIVE_REG_INT;
+   return NATIVE_REG_FP;
+}
+
+ /* Report whether `addr` can be used directly for an access described by
+  * `mem`.  Un-indexed addresses are always accepted.  A register index is
+  * accepted when its scale is 0 or equals the size index of the access
+  * (mem.size, defaulting to 8 bytes).  Other index kinds and a NULL `addr`
+  * are rejected. */
+ static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr,
+                          MemAccess mem) {
+   (void)t;
+   if (!addr) return 0;
+   if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return 1;
+   if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
+     u32 want;
+     if (addr->log2_scale == 0) return 1;
+     want = size_idx(mem.size ? mem.size : 8u);
+     return addr->log2_scale == want;
+   }
+   return 0;
+}
+
+ /* Add the (shifted) index register of `addr` into `rd`.  Does nothing for
+  * un-indexed addresses; panics for non-register index kinds and for scales
+  * above 4. */
+ static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) {
+   u32 insn;
+   if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return;
+   if (addr->index_kind != NATIVE_ADDR_INDEX_REG)
+     aa_panic(a, "unsupported address index");
+   if (addr->log2_scale > 4u) aa_panic(a, "unsupported address scale");
+   insn = aa_add_lsl(rd, rd, addr->index.reg, addr->log2_scale);
+   aa_emit32(a->base.mc, insn);
+}
+
+ /* If the index of `addr` lives in a frame slot, load it (as 8 bytes) into a
+  * scratch register and rewrite `addr` to use that register as its index.
+  * The scratch is AA_TMP1 unless `avoid_reg` is AA_TMP1, in which case
+  * AA_TMP0 is used.  Other index kinds are left untouched. */
+ static void aa_materialize_frame_index(AANativeTarget* a, NativeAddr* addr,
+                                        u32 avoid_reg) {
+   NativeAddr slot_addr;
+   NativeLoc index_loc;
+   MemAccess access;
+   u32 scratch;
+   if (addr->index_kind != NATIVE_ADDR_INDEX_FRAME_VALUE) return;
+   scratch = AA_TMP1;
+   if (avoid_reg == AA_TMP1) scratch = AA_TMP0;
+
+   memset(&slot_addr, 0, sizeof slot_addr);
+   slot_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+   slot_addr.base.frame = addr->index.frame;
+   if (addr->index_type)
+     slot_addr.base_type = addr->index_type;
+   else
+     slot_addr.base_type = builtin_id(CFREE_CG_BUILTIN_I64);
+
+   memset(&index_loc, 0, sizeof index_loc);
+   index_loc.kind = NATIVE_LOC_REG;
+   index_loc.cls = NATIVE_REG_INT;
+   index_loc.type = slot_addr.base_type;
+   index_loc.v.reg = scratch;
+
+   memset(&access, 0, sizeof access);
+   access.type = slot_addr.base_type;
+   access.size = 8;
+   access.align = 8;
+
+   aa_emit_mem(a, 1, index_loc, slot_addr, access);
+   addr->index_kind = NATIVE_ADDR_INDEX_REG;
+   addr->index.reg = scratch;
+}
+
+static NativeLoc aa_reg_loc(CfreeCgTypeId type, NativeAllocClass cls, Reg reg);
+
+ /* Pack a Q-register load/store in the unsigned-immediate form: size = 0,
+  * V = 1, opc = 3 for loads and 2 for stores, imm12 = byte_off / 16. */
+ static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off) {
+   AA64LdStUimm f = {.size = 0, .V = 1, .Rn = rn, .Rt = rt};
+   f.opc = load ? 3u : 2u;
+   f.imm12 = byte_off >> 4;
+   return aa64_ldst_uimm_pack(f);
+}
+
+ /* Pack a Q-register load/store in the signed 9-bit immediate form:
+  * size = 0, V = 1, opc = 3 for loads and 2 for stores. */
+ static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) {
+   AA64LdStSimm9 f = {.size = 0, .V = 1, .Rn = rn, .Rt = rt};
+   f.opc = load ? 3u : 2u;
+   f.imm9 = (u32)byte_off & 0x1ffu;
+   return aa64_ldst_simm9_pack(f);
+}
+
+ /* Load or store Q register `qreg` at byte `offset` inside frame slot
+  * `slot`, addressed relative to AA_FP.  Uses the scaled unsigned-immediate
+  * form, then the signed 9-bit form, and otherwise computes the address into
+  * AA_TMP1. */
+ static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg,
+                             NativeFrameSlot slot, u32 offset) {
+   AANativeSlot* rec = aa_slot(a, slot);
+   i32 disp = -(i32)rec->off + (i32)offset;
+   MCEmitter* out = a->base.mc;
+   if (disp >= 0 && ((u32)disp & 15u) == 0 && ((u32)disp >> 4) <= 0xfffu) {
+     aa_emit32(out, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)disp));
+   } else if (disp >= -256 && disp <= 255) {
+     aa_emit32(out, aa_ldst_q_simm9(load, qreg, AA_FP, disp));
+   } else {
+     aa_emit_add_imm(a, AA_TMP1, AA_FP, disp);
+     aa_emit32(out, aa_ldst_q_uimm(load, qreg, AA_TMP1, 0));
+   }
+}
+
+ /* For variadic functions using the AAPCS64 va_list layout, allocate two
+  * save areas in the frame and spill the argument registers into them.
+  * Returns without doing anything for any other va_list kind. */
+ static void aa_emit_variadic_reg_saves(AANativeTarget* a) {
+ NativeFrameSlotDesc sd;
+ NativeAddr addr;
+ MemAccess mem;
+ CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64);
+ ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi);
+ if (vai.kind != ABI_VA_LIST_AAPCS64) return;
+ /* General-register save area (recorded in va_gr_slot), 8-byte aligned. */
+ memset(&sd, 0, sizeof sd);
+ sd.type = i64;
+ sd.size = vai.gp_reg_count * vai.gp_slot_size;
+ sd.align = 8;
+ sd.kind = NATIVE_FRAME_SLOT_SAVE;
+ a->va_gr_slot = a->base.frame_slot(&a->base, &sd);
+ /* FP/vector-register save area (va_vr_slot); reuses `sd` with a new size
+  * and 16-byte alignment. */
+ sd.size = vai.fp_reg_count * vai.fp_slot_size;
+ sd.align = 16;
+ a->va_vr_slot = a->base.frame_slot(&a->base, &sd);
+ memset(&mem, 0, sizeof mem);
+ mem.type = i64;
+ mem.size = 8;
+ mem.align = 8;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = a->va_gr_slot;
+ addr.base_type = i64;
+ /* Store integer registers 0..N-1 (at most eight) at consecutive slots. */
+ for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) {
+ NativeLoc src = aa_reg_loc(i64, NATIVE_REG_INT, r);
+ addr.offset = (i32)(r * vai.gp_slot_size);
+ aa_emit_mem(a, 0, src, addr, mem);
+ }
+ /* Store Q registers 0..N-1 (at most eight) at consecutive slots. */
+ for (u32 r = 0; r < vai.fp_reg_count && r < 8u; ++r)
+ aa_emit_q_frame(a, 0, r, a->va_vr_slot, r * vai.fp_slot_size);
+}
+
+ /* Start emitting function `fd`: reset per-function state, switch to the
+  * function's text section, reserve the prologue and emit the incoming
+  * register saves that must precede the body. */
+ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
+ AANativeTarget* a = aa_of(t);
+ MCEmitter* mc = t->mc;
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
+ /* Reset per-function bookkeeping.  Frame offsets start past the
+  * AA_FRAME_SAVE_SIZE bytes reserved at the top of the frame. */
+ a->func = fd;
+ a->nslots = 0;
+ a->cum_off = AA_FRAME_SAVE_SIZE;
+ a->max_outgoing = 0;
+ a->incoming_stack_size = 0;
+ a->next_param_int = 0;
+ a->next_param_fp = 0;
+ a->next_param_stack = 0;
+ a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
+ a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
+ a->va_gr_slot = NATIVE_FRAME_SLOT_NONE;
+ a->va_vr_slot = NATIVE_FRAME_SLOT_NONE;
+ a->ntail_sites = 0;
+ a->nalloca_patches = 0;
+ mc->set_section(mc, fd->text_section_id);
+ mc->emit_align(mc, 4, 0);
+ a->func_start = mc->pos(mc);
+ mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
+ if (mc->cfi_startproc) mc->cfi_startproc(mc);
+ /* Reserve AA_PROLOGUE_WORDS NOPs (0xd503201f); aa_patch_prologue()
+  * overwrites them once the final frame size is known. */
+ a->prologue_pos = mc->pos(mc);
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu);
+ a->epilogue_label = mc->label_new(mc);
+ /* Functions returning through a hidden pointer: save register 8 into a
+  * dedicated frame slot so it survives the body. */
+ if (abi && abi->has_sret) {
+ NativeFrameSlotDesc sd;
+ NativeAddr addr;
+ NativeLoc src;
+ MemAccess mem;
+ memset(&sd, 0, sizeof sd);
+ sd.type = builtin_id(CFREE_CG_BUILTIN_I64);
+ sd.size = 8;
+ sd.align = 8;
+ sd.kind = NATIVE_FRAME_SLOT_SAVE;
+ a->sret_ptr_slot = t->frame_slot(t, &sd);
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = a->sret_ptr_slot;
+ addr.base_type = sd.type;
+ memset(&src, 0, sizeof src);
+ src.kind = NATIVE_LOC_REG;
+ src.cls = NATIVE_REG_INT;
+ src.type = sd.type;
+ src.v.reg = 8u;
+ memset(&mem, 0, sizeof mem);
+ mem.type = sd.type;
+ mem.size = 8;
+ mem.align = 8;
+ aa_emit_mem(a, 0, src, addr, mem);
+ }
+ if (abi && abi->variadic) aa_emit_variadic_reg_saves(a);
+}
+
+ /* Raise the recorded outgoing-argument area to state->max_outgoing when
+  * that is larger; a NULL `state` is ignored. */
+ static void aa_note_frame_state(NativeTarget* t,
+                                 const NativeFramePatchState* state) {
+   AANativeTarget* a = aa_of(t);
+   if (!state) return;
+   if (state->max_outgoing > a->max_outgoing)
+     a->max_outgoing = state->max_outgoing;
+}
+
+ /* Append the 64-bit load-immediate sequence for `imm` into `rd` to the
+  * patch buffer `words` (capacity `cap`, fill level `*n`).  Panics when the
+  * sequence is empty or does not fit. */
+ static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n,
+                               u32 rd, i64 imm) {
+   u32 seq[4];
+   u32 count = aa_load_imm_words(seq, 4u, 1, rd, imm);
+   u32 k;
+   if (!count || *n + count > cap) aa_panic(a, "instruction patch too small");
+   for (k = 0; k < count; ++k) {
+     words[*n] = seq[k];
+     *n += 1;
+   }
+}
+
+ /* Append instructions that lower SP by `frame_size`.  A single SUB
+  * immediate is used when the size fits; otherwise the size is loaded into
+  * AA_TMP0 and SP is adjusted through AA_TMP1. */
+ static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap,
+                                   u32* n, u32 frame_size) {
+   u32 field, shift_flag;
+   if (!aa64_addsub_imm_fits(frame_size, &field, &shift_flag)) {
+     aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
+     if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
+     words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
+     words[(*n)++] = aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0);
+     words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0);
+     return;
+   }
+   if (*n >= cap) aa_panic(a, "instruction patch too small");
+   words[(*n)++] = aa64_sub_imm(1, AA_SP, AA_SP, field, shift_flag);
+}
+
+ /* Append instructions that set AA_FP = SP + frame_size.  A single ADD
+  * immediate is used when the size fits; otherwise the size is loaded into
+  * AA_TMP0 and added to a copy of SP held in AA_TMP1. */
+ static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap,
+                                        u32* n, u32 frame_size) {
+   u32 field, shift_flag;
+   if (!aa64_addsub_imm_fits(frame_size, &field, &shift_flag)) {
+     aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
+     if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
+     words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
+     words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0);
+     return;
+   }
+   if (*n >= cap) aa_panic(a, "instruction patch too small");
+   words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, field, shift_flag);
+}
+
+ /* Append instructions that set AA_TMP1 = SP + (frame_size -
+  * AA_FRAME_SAVE_SIZE), the address the prologue stores FP and LR at.  Falls
+  * back to loading the offset into AA_TMP0 when it does not fit an ADD
+  * immediate. */
+ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
+                                      u32* n, u32 frame_size) {
+   u32 pair_off = frame_size - AA_FRAME_SAVE_SIZE;
+   u32 field, shift_flag;
+   if (!aa64_addsub_imm_fits(pair_off, &field, &shift_flag)) {
+     aa_words_load_imm(a, words, cap, n, AA_TMP0, pair_off);
+     if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
+     words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
+     words[(*n)++] = aa64_add(1, AA_TMP1, AA_TMP1, AA_TMP0);
+     return;
+   }
+   if (*n >= cap) aa_panic(a, "instruction patch too small");
+   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, field, shift_flag);
+}
+
+ /* Append the four-instruction frame teardown to a patch buffer: copy AA_FP
+  * to AA_TMP0, reload FP from [AA_TMP0 - 16] and LR from [AA_TMP0 - 8], then
+  * set SP to AA_TMP0.  Appends nothing when `frame_size` is zero. */
+ static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
+                                    u32* n, u32 frame_size) {
+   u32* dst;
+   if (!frame_size) return;
+   if (*n + 4u > cap) aa_panic(a, "instruction patch too small");
+   dst = words + *n;
+   dst[0] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
+   dst[1] = aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16);
+   dst[2] = aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8);
+   dst[3] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0);
+   *n += 4u;
+}
+
+ /* Overwrite the AA_PROLOGUE_WORDS placeholder words reserved at
+  * a->prologue_pos with the real prologue for `frame_size`: lower SP, store
+  * FP and LR at the top of the frame, then point AA_FP at the frame top.
+  * Unused words (all of them when frame_size is zero) are filled with NOPs. */
+ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) {
+   ObjSecId sec = a->func->text_section_id;
+   u32 patch[AA_PROLOGUE_WORDS];
+   u32 used = 0;
+   memset(patch, 0, sizeof patch);
+   if (frame_size) {
+     aa_words_sub_sp_frame(a, patch, AA_PROLOGUE_WORDS, &used, frame_size);
+     aa_words_saved_pair_addr(a, patch, AA_PROLOGUE_WORDS, &used, frame_size);
+     if (used + 2u > AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large");
+     patch[used++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0);
+     patch[used++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8);
+     aa_words_frame_ptr_from_sp(a, patch, AA_PROLOGUE_WORDS, &used,
+                                frame_size);
+   }
+   for (; used < AA_PROLOGUE_WORDS; ++used) patch[used] = 0xd503201fu;
+   for (u32 w = 0; w < AA_PROLOGUE_WORDS; ++w) {
+     u32 at = a->prologue_pos + w * 4u;
+     aa_patch32(a->base.obj, sec, at, patch[w]);
+   }
+}
+
+ /* Emit the frame teardown directly into the instruction stream: copy AA_FP
+  * to AA_TMP0, reload FP from [AA_TMP0 - 16] and LR from [AA_TMP0 - 8], then
+  * set SP to AA_TMP0.  Emits nothing when `frame_size` is zero. */
+ static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) {
+   MCEmitter* mc = a->base.mc;
+   u32 seq[4];
+   if (!frame_size) return;
+   seq[0] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
+   seq[1] = aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16);
+   seq[2] = aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8);
+   seq[3] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0);
+   for (u32 k = 0; k < 4u; ++k) aa_emit32(mc, seq[k]);
+}
+
+ /* Rewrite every recorded alloca site to "dst = SP + max_outgoing", now that
+  * the outgoing-argument area is final.  Panics if that size does not fit an
+  * ADD immediate; the check only runs when at least one site exists. */
+ static void aa_patch_allocas(AANativeTarget* a) {
+   ObjSecId sec = a->func->text_section_id;
+   u32 field, shift_flag;
+   u32 idx = 0;
+   while (idx < a->nalloca_patches) {
+     AAAllocaPatch* site = &a->alloca_patches[idx];
+     if (!aa64_addsub_imm_fits(a->max_outgoing, &field, &shift_flag))
+       aa_panic(a, "outgoing area too large for alloca result");
+     aa_patch32(a->base.obj, sec, site->pos,
+                aa64_add_imm(1, site->dst_reg, AA_SP, field, shift_flag));
+     ++idx;
+   }
+}
+
+ /* Rewrite every recorded tail-call site (AA_TAIL_WORDS words each) with the
+  * frame teardown followed by the jump to the callee. */
+ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
+ ObjSecId sec = a->func->text_section_id;
+ for (u32 i = 0; i < a->ntail_sites; ++i) {
+ AATailSite* site = &a->tail_sites[i];
+ u32 words[AA_TAIL_WORDS];
+ u32 n = 0;
+ memset(words, 0, sizeof words);
+ aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size);
+ /* At least one word must remain for the branch itself. */
+ if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
+ if (site->callee.kind == NATIVE_LOC_REG) {
+ /* Register callee: BR immediately after the teardown. */
+ words[n++] = aa64_br(loc_reg(site->callee));
+ } else if (site->callee.kind == NATIVE_LOC_GLOBAL) {
+ /* Symbol callee: pad with NOPs so the B lands in the last word of the
+  * site.  NOTE(review): presumably the relocation recorded when the site
+  * was emitted targets that last word -- confirm at the emission site. */
+ while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
+ words[n++] = aa64_b(0);
+ } else {
+ aa_panic(a, "unsupported tail target");
+ }
+ /* Fill whatever is left with NOPs. */
+ while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
+ for (u32 w = 0; w < AA_TAIL_WORDS; ++w)
+ aa_patch32(a->base.obj, sec, site->pos + w * 4u, words[w]);
+ }
+}
+
+ /* Finish the current function: emit the shared epilogue, back-patch every
+  * site that depends on the final frame size, record CFI and define the
+  * function symbol. */
+ static void aa_func_end(NativeTarget* t) {
+ AANativeTarget* a = aa_of(t);
+ MCEmitter* mc = t->mc;
+ /* Final frame size: slot area plus outgoing-argument area, rounded up to
+  * 16 bytes. */
+ u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u);
+ mc->label_place(mc, a->epilogue_label);
+ aa_emit_restore_frame(a, frame_size);
+ aa_emit32(mc, aa64_ret(AA_LR));
+ /* The frame size is only known now, so the reserved prologue, the alloca
+  * sites and the tail-call sites are patched after the body is emitted. */
+ aa_patch_prologue(a, frame_size);
+ aa_patch_allocas(a);
+ aa_patch_tail_sites(a, frame_size);
+ /* CFI is recorded only when the emitter provides all three hooks: after
+  * the prologue words the CFA is AA_FP + 0, with FP saved at CFA-16 and LR
+  * at CFA-8. */
+ if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
+ mc->cfi_set_next_pc_offset(mc, AA_PROLOGUE_WORDS * 4u);
+ mc->cfi_def_cfa(mc, AA_FP, 0);
+ mc->cfi_offset(mc, AA_FP, -16);
+ mc->cfi_offset(mc, AA_LR, -8);
+ }
+ obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id,
+ a->func_start, mc->pos(mc) - a->func_start);
+ if (a->func->atomize) {
+ obj_atom_define(t->obj, a->func->text_section_id, a->func_start,
+ mc->pos(mc) - a->func_start, a->func->sym, 0);
+ }
+ if (mc->cfi_endproc) mc->cfi_endproc(mc);
+ mc_end_function(mc);
+ a->func = NULL;
+}
+
+ /* Allocate a new frame slot described by `d` and return its 1-based handle.
+  * A zero size defaults to 8 bytes and a zero alignment to 1.  The slot's
+  * offset is the running frame offset after adding the size and rounding up
+  * to the alignment.  The slot table doubles (starting at 16 entries) when
+  * full and is allocated from the translation unit's arena. */
+ static NativeFrameSlot aa_frame_slot(NativeTarget* t,
+                                      const NativeFrameSlotDesc* d) {
+   AANativeTarget* a = aa_of(t);
+   u32 bytes = d->size ? d->size : 8u;
+   u32 alignment = d->align ? d->align : 1u;
+   AANativeSlot* entry;
+   if (a->nslots == a->slots_cap) {
+     u32 grown = a->slots_cap ? a->slots_cap * 2u : 16u;
+     AANativeSlot* fresh = arena_zarray(t->c->tu, AANativeSlot, grown);
+     if (a->slots) memcpy(fresh, a->slots, sizeof(*fresh) * a->nslots);
+     a->slots = fresh;
+     a->slots_cap = grown;
+   }
+   a->cum_off = align_up_u32(a->cum_off + bytes, alignment);
+   entry = &a->slots[a->nslots];
+   a->nslots += 1;
+   entry->off = a->cum_off;
+   entry->size = bytes;
+   entry->align = alignment;
+   entry->kind = d->kind;
+   return a->nslots;
+}
+
+ /* Begin a function whose frame contents are known up front: run the normal
+  * aa_func_begin(), then raise max_outgoing to the descriptor's value and
+  * allocate each described slot in order, reporting the handles through
+  * `out_slots` when it is non-NULL.  A NULL `frame` adds nothing. */
+ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
+                                       const NativeKnownFrameDesc* frame,
+                                       NativeFrameSlot* out_slots) {
+   AANativeTarget* a;
+   u32 idx;
+   aa_func_begin(t, fd);
+   if (!frame) return;
+   a = aa_of(t);
+   if (frame->max_outgoing > a->max_outgoing)
+     a->max_outgoing = frame->max_outgoing;
+   for (idx = 0; idx < frame->nslots; ++idx) {
+     NativeFrameSlot handle = aa_frame_slot(t, &frame->slots[idx]);
+     if (out_slots) out_slots[idx] = handle;
+   }
+}
+
+ /* Store register `src` to the start of frame slot `slot`. */
+ static void aa_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
+                      MemAccess mem) {
+   NativeAddr where;
+   memset(&where, 0, sizeof where);
+   where.base_type = src.type;
+   where.base.frame = slot;
+   where.base_kind = NATIVE_ADDR_BASE_FRAME;
+   aa_emit_mem(aa_of(t), 0, src, where, mem);
+}
+
+ /* Load register `dst` from the start of frame slot `slot`. */
+ static void aa_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
+                       MemAccess mem) {
+   NativeAddr where;
+   memset(&where, 0, sizeof where);
+   where.base_type = dst.type;
+   where.base.frame = slot;
+   where.base_kind = NATIVE_ADDR_BASE_FRAME;
+   aa_emit_mem(aa_of(t), 1, dst, where, mem);
+}
+
+ /* Create a fresh label through the machine-code emitter. */
+ static MCLabel aa_label_new(NativeTarget* t) {
+   MCEmitter* mc = t->mc;
+   return mc->label_new(mc);
+}
+
+ /* Bind `label` at the emitter's current position. */
+ static void aa_label_place(NativeTarget* t, MCLabel label) {
+   MCEmitter* mc = t->mc;
+   mc->label_place(mc, label);
+}
+
+ /* Emit an unconditional branch to `label`: a B with zero displacement
+  * followed by a JUMP26 label reference recorded with the emitter. */
+ static void aa_jump(NativeTarget* t, MCLabel label) {
+   MCEmitter* mc = t->mc;
+   aa_emit32(mc, aa64_b(0));
+   mc->emit_label_ref(mc, label, R_AARCH64_JUMP26, 4, 0);
+}
+
+ /* Compare `lhs` with `rhs` and branch to `label` when `op` holds.  FP
+  * operands use FCMP (double precision when lhs is 8 bytes); integer
+  * operands use a flag-setting subtract into the zero register.  The
+  * conditional branch carries a CONDBR19 label reference. */
+ static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs,
+                           NativeLoc rhs, MCLabel label) {
+   MCEmitter* mc = t->mc;
+   u32 compare;
+   if (loc_is_fp(lhs)) {
+     compare =
+         aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), loc_reg(rhs));
+   } else {
+     u32 wide = loc_is_64(t, lhs) ? 1u : 0u;
+     compare = aa_subs_reg(wide, AA64_ZR, loc_reg(lhs), loc_reg(rhs));
+   }
+   aa_emit32(mc, compare);
+   aa_emit32(mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)}));
+   mc->emit_label_ref(mc, label, R_AARCH64_CONDBR19, 4, 0);
+}
+
+ /* Emit BR through the register holding `addr`.  The list of valid targets
+  * is not used by this backend. */
+ static void aa_indirect_branch(NativeTarget* t, NativeLoc addr,
+                                const MCLabel* valid_targets, u32 ntargets) {
+   u32 target_reg = loc_reg(addr);
+   (void)ntargets;
+   (void)valid_targets;
+   aa_emit32(t->mc, aa64_br(target_reg));
+}
+
+ /* Load the address of code label `target` into `dst`.
+  * Emits a fixed four-word sequence -- ADR with zero displacement, B with
+  * immediate 3, and two zero words -- and then records one label reference
+  * of kind R_AARCH64_INTRA_LABEL_ADDR with a width argument of 16.
+  * NOTE(review): presumably the B skips the two zero words and the
+  * reference is resolved over the whole 16-byte sequence -- confirm against
+  * the handler for this relocation kind. */
+ static void aa_load_label_addr(NativeTarget* t, NativeLoc dst,
+ MCLabel target) {
+ aa_emit32(t->mc, aa64_adr(loc_reg(dst), 0, 0));
+ aa_emit32(t->mc, aa64_b(3));
+ aa_emit32(t->mc, 0);
+ aa_emit32(t->mc, 0);
+ t->mc->emit_label_ref(t->mc, target, R_AARCH64_INTRA_LABEL_ADDR, 16, 0);
+}
+
+ /* Copy register `src` to register `dst`, choosing the instruction by the
+  * register classes involved: FP to FP (double when dst is 8 bytes),
+  * integer to FP, FP to integer, or a plain integer move. */
+ static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
+   u32 insn;
+   if (loc_is_fp(dst)) {
+     if (loc_is_fp(src)) {
+       insn = aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
+                         loc_reg(src));
+     } else {
+       insn = aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src));
+     }
+   } else if (loc_is_fp(src)) {
+     insn = aa_fmov_fp_to_gpr(loc_is_64(t, dst), loc_reg(dst), loc_reg(src));
+   } else {
+     insn = aa64_mov_reg(loc_is_64(t, dst), loc_reg(dst), loc_reg(src));
+   }
+   aa_emit32(t->mc, insn);
+}
+
+static NativeLoc aa_tmp_loc(CfreeCgTypeId type, Reg reg);
+
+ /* Load integer constant `imm` into `dst`, using the 64-bit form when
+  * loc_is_64() reports the destination as 64-bit. */
+ static void aa_load_imm_native(NativeTarget* t, NativeLoc dst, i64 imm) {
+   u32 wide = loc_is_64(t, dst);
+   u32 rd = loc_reg(dst);
+   aa_emit_load_imm(t->mc, wide, rd, imm);
+}
+
+ /* Load a byte constant of at most 8 bytes into `dst`.  The bytes are
+  * assembled least-significant first.  FP destinations are filled through
+  * AA_TMP0 followed by a register move; integer destinations are loaded
+  * directly.  Larger constants panic. */
+ static void aa_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cbytes) {
+   u64 bits = 0;
+   u32 k;
+   if (cbytes.size > 8u)
+     compiler_panic(t->c, ((AANativeTarget*)t)->loc,
+                    "aarch64 native target: byte constant too large");
+   for (k = 0; k < cbytes.size; ++k) bits |= (u64)cbytes.bytes[k] << (k * 8u);
+   if (!loc_is_fp(dst)) {
+     aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), (i64)bits);
+     return;
+   }
+   {
+     NativeLoc via = aa_tmp_loc(cbytes.type, AA_TMP0);
+     aa_emit_load_imm(t->mc, cbytes.size == 8u, AA_TMP0, (i64)bits);
+     aa_move(t, dst, via);
+   }
+}
+
+ /* Compute the effective address described by `addr` (base + offset +
+  * optional scaled index) into register `dst`.  Unsupported base kinds
+  * panic. */
+ static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
+ AANativeTarget* a = aa_of(t);
+ u32 rd = loc_reg(dst);
+ /* An index stored in the frame is first loaded into a scratch register
+  * other than rd. */
+ aa_materialize_frame_index(a, &addr, rd);
+ switch ((NativeAddrBaseKind)addr.base_kind) {
+ case NATIVE_ADDR_BASE_FRAME: {
+ /* Address of a frame slot: AA_FP minus the slot offset, plus addr.offset. */
+ AANativeSlot* s = aa_slot(a, addr.base.frame);
+ aa_emit_add_imm(a, rd, AA_FP, -(i32)s->off + addr.offset);
+ aa_apply_index(a, rd, &addr);
+ return;
+ }
+ case NATIVE_ADDR_BASE_FRAME_VALUE: {
+ /* The base pointer itself is stored in a frame slot: load it as 8 bytes
+  * into rd, then add the offset and index. */
+ NativeAddr load;
+ MemAccess mem;
+ memset(&load, 0, sizeof load);
+ load.base_kind = NATIVE_ADDR_BASE_FRAME;
+ load.base.frame = addr.base.frame;
+ load.base_type = addr.base_type ? addr.base_type
+ : builtin_id(CFREE_CG_BUILTIN_I64);
+ memset(&mem, 0, sizeof mem);
+ mem.type = load.base_type;
+ mem.size = 8;
+ mem.align = 8;
+ aa_emit_mem(a, 1, dst, load, mem);
+ if (addr.offset) aa_emit_add_imm(a, rd, rd, addr.offset);
+ aa_apply_index(a, rd, &addr);
+ return;
+ }
+ case NATIVE_ADDR_BASE_REG:
+ aa_emit_add_imm(a, rd, addr.base.reg, addr.offset);
+ aa_apply_index(a, rd, &addr);
+ return;
+ case NATIVE_ADDR_BASE_GLOBAL: {
+ /* The symbol addend and addr.offset are combined into one addend. */
+ i64 addend = addr.base.global.addend + (i64)addr.offset;
+ u32 pos = t->mc->pos(t->mc);
+ if (aa_use_got_for_sym(t, addr.base.global.sym)) {
+ /* GOT path: ADRP + LDR fetch the symbol address (relocations carry a
+  * zero addend); the addend is added afterwards. */
+ aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
+ R_AARCH64_ADR_GOT_PAGE, addr.base.global.sym, 0,
+ 0, 0);
+ pos = t->mc->pos(t->mc);
+ aa_emit32(t->mc, aa_ldr_uimm(3, rd, rd, 0));
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
+ R_AARCH64_LD64_GOT_LO12_NC,
+ addr.base.global.sym, 0, 0, 0);
+ if (addend) aa_emit_add_i64(a, rd, rd, addend);
+ aa_apply_index(a, rd, &addr);
+ return;
+ }
+ /* Direct path: ADRP + ADD, both relocated against symbol + addend. */
+ aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
+ R_AARCH64_ADR_PREL_PG_HI21, addr.base.global.sym,
+ addend, 0, 0);
+ pos = t->mc->pos(t->mc);
+ aa_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
+ R_AARCH64_ADD_ABS_LO12_NC, addr.base.global.sym,
+ addend, 0, 0);
+ aa_apply_index(a, rd, &addr);
+ return;
+ }
+ default:
+ aa_panic(a, "unsupported load_addr");
+ }
+}
+
+ /* NativeTarget load entry point: forwards to aa_emit_mem() in load mode. */
+ static void aa_load_native(NativeTarget* t, NativeLoc dst, NativeAddr addr,
+                            MemAccess mem) {
+   AANativeTarget* a = aa_of(t);
+   aa_emit_mem(a, 1, dst, addr, mem);
+}
+
+ /* NativeTarget store entry point: forwards to aa_emit_mem() in store mode. */
+ static void aa_store_native(NativeTarget* t, NativeAddr addr, NativeLoc src,
+                             MemAccess mem) {
+   AANativeTarget* a = aa_of(t);
+   aa_emit_mem(a, 0, src, addr, mem);
+}
+
+ /* Compute the address of thread-local symbol `sym` plus `addend` into
+  * `dst`.  Two schemes are implemented: a descriptor call when
+  * obj_format_tls_via_descriptor() says so, and a thread-pointer-relative
+  * sequence for ELF.  Any other object format panics. */
+ static void aa_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
+ i64 addend) {
+ AANativeTarget* a = aa_of(t);
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst);
+ u32 pos;
+ if (obj_format_tls_via_descriptor(t->c)) {
+ /* Descriptor path.  Register 0 is hard-coded: it receives the descriptor
+  * address, is read after the call as the result, and is then copied to
+  * rd.  AA_TMP0 holds the function pointer loaded from the descriptor. */
+ aa_emit32(mc, aa64_adrp(0, 0, 0));
+ pos = mc->pos(mc) - 4u;
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGE21,
+ sym, 0, 0, 0);
+ aa_emit32(mc, aa_ldr_uimm(3, 0, 0, 0));
+ pos = mc->pos(mc) - 4u;
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGEOFF12,
+ sym, 0, 0, 0);
+ aa_emit32(mc, aa_ldr_uimm(3, AA_TMP0, 0, 0));
+ aa_emit32(mc, aa64_blr(AA_TMP0));
+ /* The relocations above carry no addend; it is added to the result. */
+ if (addend) aa_emit_add_i64(a, 0, 0, addend);
+ if (rd != 0) aa_emit32(mc, aa64_mov_reg(1, rd, 0));
+ return;
+ }
+ if (t->c->target.obj != CFREE_OBJ_ELF) {
+ aa_panic(a, "unsupported TLS object format");
+ }
+ /* ELF path: read TPIDR_EL0 into rd, then add the symbol's offset in two
+  * ADD immediates (the first with the shift flag set) relocated with
+  * TLSLE_ADD_TPREL_HI12 and TLSLE_ADD_TPREL_LO12_NC. */
+ aa_emit32(mc, aa_mrs_tpidr_el0(rd));
+ pos = mc->pos(mc);
+ aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_HI12,
+ sym, addend, 0, 0);
+ pos = mc->pos(mc);
+ aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC,
+ sym, addend, 0, 0);
+}
+
+ /* Build a NativeLoc naming integer-class register `reg` with value type
+  * `type`; every other field is zero. */
+ static NativeLoc aa_tmp_loc(CfreeCgTypeId type, Reg reg) {
+   NativeLoc out;
+   memset(&out, 0, sizeof out);
+   out.v.reg = reg;
+   out.type = type;
+   out.cls = NATIVE_REG_INT;
+   out.kind = NATIVE_LOC_REG;
+   return out;
+}
+
+ /* Return a copy of `addr` with `off` bytes added to its offset. */
+ static NativeAddr aa_addr_plus(NativeAddr addr, u32 off) {
+   NativeAddr shifted = addr;
+   shifted.offset += (i32)off;
+   return shifted;
+}
+
+ /* Copy access.size bytes from `src` to `dst` as a sequence of load/store
+  * pairs through AA_TMP0.  Each step moves the largest of 8, 4, 2 or 1 bytes
+  * that still fits in the remaining length.  With `backward` set the chunks
+  * are placed from the end of the region toward its start; otherwise from
+  * the start toward the end. */
+ static void aa_copy_bytes_dir(NativeTarget* t, NativeAddr dst, NativeAddr src,
+                               AggregateAccess access, int backward) {
+   CfreeCgTypeId ty8 = builtin_id(CFREE_CG_BUILTIN_I64);
+   CfreeCgTypeId ty4 = builtin_id(CFREE_CG_BUILTIN_I32);
+   CfreeCgTypeId ty2 = builtin_id(CFREE_CG_BUILTIN_I16);
+   CfreeCgTypeId ty1 = builtin_id(CFREE_CG_BUILTIN_I8);
+   NativeLoc via = aa_tmp_loc(ty8, AA_TMP0);
+   u32 done;
+   for (done = 0; done < access.size;) {
+     u32 left = access.size - done;
+     u32 width;
+     u32 at;
+     CfreeCgTypeId ty;
+     MemAccess chunk = access.mem;
+     if (left >= 8u) {
+       width = 8u;
+       ty = ty8;
+     } else if (left >= 4u) {
+       width = 4u;
+       ty = ty4;
+     } else if (left >= 2u) {
+       width = 2u;
+       ty = ty2;
+     } else {
+       width = 1u;
+       ty = ty1;
+     }
+     chunk.type = ty;
+     chunk.size = width;
+     chunk.align = width;
+     via.type = ty;
+     at = backward ? access.size - done - width : done;
+     aa_load_native(t, via, aa_addr_plus(src, at), chunk);
+     aa_store_native(t, aa_addr_plus(dst, at), via, chunk);
+     done += width;
+   }
+}
+
+/* Forward (low-to-high) variant of aa_copy_bytes_dir. */
+static void aa_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
+                          AggregateAccess access) {
+  const int backward = 0;
+  aa_copy_bytes_dir(t, dst, src, access, backward);
+}
+
+/* Fill `access.size` bytes at `dst` with `byte_value`, emitting one
+ * single-byte store per byte. */
+static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
+                         AggregateAccess access) {
+  CfreeCgTypeId byte_ty = builtin_id(CFREE_CG_BUILTIN_I8);
+  MemAccess one = access.mem;
+  NativeLoc fill = byte_value;
+  u32 at = 0;
+  fill.type = byte_ty;
+  one.type = byte_ty;
+  one.size = 1u;
+  one.align = 1u;
+  while (at < access.size) {
+    aa_store_native(t, aa_addr_plus(dst, at), fill, one);
+    ++at;
+  }
+}
+
+/* Emit `dst = lhs op rhs` on registers.
+ *
+ * A floating-point `dst` selects the FP forms (FADD/FSUB/FMUL/FDIV only);
+ * otherwise the integer forms are used at the width of `dst`. Remainders are
+ * computed as lhs - (lhs / rhs) * rhs with AA_TMP0 holding the intermediate.
+ * Unsupported operators panic. */
+static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
+                     NativeLoc rhs) {
+  u32 sf = loc_is_64(t, dst) ? 1u : 0u;
+  u32 rd = loc_reg(dst), rn = loc_reg(lhs), rm = loc_reg(rhs);
+  u32 insn;
+  if (loc_is_fp(dst)) {
+    u32 dbl = type_size32(t, dst.type) == 8u;
+    u32 fp_opc = 0;
+    int known = 1;
+    switch (op) {
+      case BO_FADD:
+        fp_opc = 0x002800u;
+        break;
+      case BO_FSUB:
+        fp_opc = 0x003800u;
+        break;
+      case BO_FMUL:
+        fp_opc = 0x000800u;
+        break;
+      case BO_FDIV:
+        fp_opc = 0x001800u;
+        break;
+      default:
+        known = 0;
+        break;
+    }
+    if (known) {
+      aa_emit32(t->mc, aa_fp_bin(fp_opc, dbl, rd, rn, rm));
+      return;
+    }
+    aa_panic(aa_of(t), "unsupported floating binary op");
+  }
+  switch (op) {
+    case BO_IADD:
+      insn = aa64_add(sf, rd, rn, rm);
+      break;
+    case BO_ISUB:
+      insn = aa64_sub(sf, rd, rn, rm);
+      break;
+    case BO_IMUL:
+      insn = aa64_mul(sf, rd, rn, rm);
+      break;
+    case BO_SDIV:
+      insn = aa64_sdiv(sf, rd, rn, rm);
+      break;
+    case BO_UDIV:
+      insn = aa64_udiv(sf, rd, rn, rm);
+      break;
+    case BO_SREM:
+    case BO_UREM: {
+      /* quotient -> AA_TMP0, quotient * rhs -> AA_TMP0, lhs - that -> rd */
+      u32 quot = op == BO_SREM ? aa64_sdiv(sf, AA_TMP0, rn, rm)
+                               : aa64_udiv(sf, AA_TMP0, rn, rm);
+      aa_emit32(t->mc, quot);
+      aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm));
+      aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0));
+      return;
+    }
+    case BO_AND:
+      insn = aa64_and(sf, rd, rn, rm);
+      break;
+    case BO_OR:
+      insn = aa64_orr(sf, rd, rn, rm);
+      break;
+    case BO_XOR:
+      insn = aa64_eor(sf, rd, rn, rm);
+      break;
+    case BO_SHL:
+      insn = aa64_lslv(sf, rd, rn, rm);
+      break;
+    case BO_SHR_U:
+      insn = aa64_lsrv(sf, rd, rn, rm);
+      break;
+    case BO_SHR_S:
+      insn = aa64_asrv(sf, rd, rn, rm);
+      break;
+    default:
+      aa_panic(aa_of(t), "unsupported binary op");
+      return;
+  }
+  aa_emit32(t->mc, insn);
+}
+
+/* Emit `dst = op src` on registers.
+ *
+ * A floating-point `dst` accepts UO_FNEG / UO_NEG only. Integer operands
+ * support UO_NEG, UO_BNOT and UO_NOT; anything else panics.
+ *
+ * UO_NOT tests `src` against zero at the width of `src` (as aa_cmp does for
+ * its operands) and materializes the flag at the width of `dst`, so a 64-bit
+ * source feeding a 32-bit result is tested in full. */
+static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
+  u32 sf = loc_is_64(t, dst) ? 1u : 0u;
+  if (loc_is_fp(dst)) {
+    switch (op) {
+      case UO_FNEG:
+      case UO_NEG:
+        aa_emit32(t->mc, aa_fneg(type_size32(t, dst.type) == 8u, loc_reg(dst),
+                                 loc_reg(src)));
+        return;
+      default:
+        aa_panic(aa_of(t), "unsupported floating unary op");
+    }
+  }
+  switch (op) {
+    case UO_NEG:
+      aa_emit32(t->mc, aa64_neg(sf, loc_reg(dst), loc_reg(src)));
+      return;
+    case UO_BNOT:
+      aa_emit32(t->mc, aa64_mvn(sf, loc_reg(dst), loc_reg(src)));
+      return;
+    case UO_NOT: {
+      /* The compare width follows the operand, not the result. */
+      u32 src_sf = loc_is_64(t, src) ? 1u : 0u;
+      aa_emit32(t->mc, aa64_subs_imm12(src_sf, AA64_ZR, loc_reg(src), 0, 0));
+      /* 0x0 is the A64 EQ condition; assumes aa_cset takes the condition
+       * under which the result is 1, as its use in aa_cmp suggests. */
+      aa_emit32(t->mc, aa_cset(sf, loc_reg(dst), 0x0u));
+      return;
+    }
+    default:
+      aa_panic(aa_of(t), "unsupported unary op");
+  }
+}
+
+/* Emit a comparison of `lhs` with `rhs` and set `dst` from the condition
+ * that cmp_cond() maps `op` to. The compare runs at the width/class of
+ * `lhs`; the result is written at the width of `dst`. */
+static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
+                   NativeLoc rhs) {
+  if (!loc_is_fp(lhs)) {
+    u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
+    aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
+  } else {
+    aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
+                             loc_reg(rhs)));
+  }
+  aa_emit32(t->mc, aa_cset(loc_is_64(t, dst), loc_reg(dst), cmp_cond(op)));
+}
+
+/* Shared body of CV_ZEXT / CV_SEXT. When the source is at least as wide as
+ * the destination this is a plain move. Otherwise a signed extension uses
+ * SBFM, and an unsigned one uses a 32-bit register move for sources of 32
+ * bits or more and UBFM for narrower ones. */
+static void aa_convert_extend(NativeTarget* t, NativeLoc dst, NativeLoc src,
+                              int is_signed) {
+  u32 from_bits = type_size32(t, src.type) * 8u;
+  u32 to_bits = type_size32(t, dst.type) * 8u;
+  u32 sf = to_bits > 32u;
+  if (from_bits >= to_bits) {
+    aa_move(t, dst, src);
+    return;
+  }
+  if (is_signed) {
+    aa_emit32(t->mc,
+              aa_sbfm(sf, loc_reg(dst), loc_reg(src), 0, from_bits - 1u));
+    return;
+  }
+  if (from_bits >= 32u) {
+    aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dst), loc_reg(src)));
+    return;
+  }
+  aa_emit32(t->mc, aa_ubfm(sf, loc_reg(dst), loc_reg(src), 0, from_bits - 1u));
+}
+
+/* Emit the conversion `dst = op(src)`.
+ *
+ * Truncation and bitcast are moves; integer extension is delegated to
+ * aa_convert_extend; int<->float conversions pick the 32/64-bit forms from
+ * the operand types; CV_FEXT / CV_FTRUNC emit an FCVT only when both sides
+ * are floating-point locations and otherwise fall back to a move.
+ * Unknown kinds panic. */
+static void aa_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
+                       NativeLoc src) {
+  int dst_fp = loc_is_fp(dst);
+  int src_fp = loc_is_fp(src);
+  switch (op) {
+    case CV_TRUNC:
+    case CV_BITCAST:
+      aa_move(t, dst, src);
+      return;
+    case CV_ZEXT:
+      aa_convert_extend(t, dst, src, 0);
+      return;
+    case CV_SEXT:
+      aa_convert_extend(t, dst, src, 1);
+      return;
+    case CV_ITOF_S:
+      aa_emit32(t->mc,
+                aa_scvtf(type_size32(t, dst.type) == 8u, loc_is_64(t, src),
+                         loc_reg(dst), loc_reg(src)));
+      return;
+    case CV_ITOF_U:
+      aa_emit32(t->mc,
+                aa_ucvtf(type_size32(t, dst.type) == 8u, loc_is_64(t, src),
+                         loc_reg(dst), loc_reg(src)));
+      return;
+    case CV_FTOI_S:
+      aa_emit32(t->mc,
+                aa_fcvtzs(loc_is_64(t, dst), type_size32(t, src.type) == 8u,
+                          loc_reg(dst), loc_reg(src)));
+      return;
+    case CV_FTOI_U:
+      aa_emit32(t->mc,
+                aa_fcvtzu(loc_is_64(t, dst), type_size32(t, src.type) == 8u,
+                          loc_reg(dst), loc_reg(src)));
+      return;
+    case CV_FEXT:
+      if (!(dst_fp && src_fp)) {
+        aa_move(t, dst, src);
+        return;
+      }
+      aa_emit32(t->mc, aa_fcvt_d_s(loc_reg(dst), loc_reg(src)));
+      return;
+    case CV_FTRUNC:
+      if (!(dst_fp && src_fp)) {
+        aa_move(t, dst, src);
+        return;
+      }
+      aa_emit32(t->mc, aa_fcvt_s_d(loc_reg(dst), loc_reg(src)));
+      return;
+    default:
+      aa_panic(aa_of(t), "unsupported conversion");
+  }
+}
+
+/* Emit a dynamic stack allocation of `size` bytes and put a pointer derived
+ * from the new SP in `dst`.
+ *
+ * `align` is raised to at least 16 and must be a power of two (else panic).
+ * The position and destination register of the final `add dst, sp, #0` are
+ * appended to alloca_patches, presumably so a later pass can rewrite its
+ * immediate -- TODO confirm where the patches are consumed. */
+static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
+ u32 align) {
+ AANativeTarget* a = aa_of(t);
+ u32 use_align = align < 16u ? 16u : align;
+ if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2");
+ /* Grow the patch list by doubling (8 entries initially); the previous
+ * array is simply abandoned to the arena. */
+ if (a->nalloca_patches == a->alloca_patches_cap) {
+ u32 cap = a->alloca_patches_cap ? a->alloca_patches_cap * 2u : 8u;
+ AAAllocaPatch* nb = arena_zarray(t->c->tu, AAAllocaPatch, cap);
+ if (a->alloca_patches)
+ memcpy(nb, a->alloca_patches, sizeof(*nb) * a->nalloca_patches);
+ a->alloca_patches = nb;
+ a->alloca_patches_cap = cap;
+ }
+ /* AA_TMP0 = (size + use_align - 1) & -use_align: the byte count rounded up. */
+ aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u));
+ aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align);
+ aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1));
+ /* SP = SP - AA_TMP0, staged through AA_TMP1.
+ * NOTE(review): only the size is rounded to use_align; the new SP itself
+ * is not masked, so for align > 16 the result looks only as aligned as the
+ * incoming SP -- confirm whether over-aligned allocas can reach here. */
+ aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0));
+ aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0));
+ aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0));
+ /* Record the instruction emitted next, then emit it with a zero immediate. */
+ a->alloca_patches[a->nalloca_patches].pos = t->mc->pos(t->mc);
+ a->alloca_patches[a->nalloca_patches].dst_reg = loc_reg(dst);
+ a->nalloca_patches++;
+ aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0));
+}
+
+/* Describe a memory access of `type`. `size` overrides the type's own size
+ * when non-zero, and the alignment is capped at the (non-zero) access
+ * size. All other MemAccess fields are zero. */
+static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size) {
+  MemAccess access;
+  memset(&access, 0, sizeof access);
+  access.type = type;
+  if (size)
+    access.size = size;
+  else
+    access.size = type_size32(t, type);
+  access.align = type_align32(t, type);
+  if (access.size && access.size < access.align) access.align = access.size;
+  return access;
+}
+
+/* Build a register location for register `reg` of allocation class `cls`,
+ * viewed as `type`. Remaining fields are zero. */
+static NativeLoc aa_reg_loc(CfreeCgTypeId type, NativeAllocClass cls, Reg reg) {
+  NativeLoc where;
+  memset(&where, 0, sizeof where);
+  where.v.reg = reg;
+  where.type = type;
+  where.cls = (u8)cls;
+  where.kind = NATIVE_LOC_REG;
+  return where;
+}
+
+/* Build a stack location: byte `offset` within frame slot `slot`, viewed as
+ * `type`. The class is always NATIVE_REG_INT; remaining fields are zero. */
+static NativeLoc aa_stack_loc(CfreeCgTypeId type, NativeFrameSlot slot,
+                              i32 offset) {
+  NativeLoc where;
+  memset(&where, 0, sizeof where);
+  where.v.stack.offset = offset;
+  where.v.stack.slot = slot;
+  where.type = type;
+  where.cls = NATIVE_REG_INT;
+  where.kind = NATIVE_LOC_STACK;
+  return where;
+}
+
+/* Turn a memory-backed location into an address `offset` bytes into it.
+ *
+ * FRAME and STACK locations become frame-slot-relative addresses (STACK adds
+ * its own offset); ADDR locations are copied with `offset` added. Any other
+ * kind panics, and a zeroed address is returned should the panic return. */
+static NativeAddr aa_loc_addr(AANativeTarget* a, NativeLoc loc, u32 offset) {
+  NativeAddr out;
+  memset(&out, 0, sizeof out);
+  if (loc.kind == NATIVE_LOC_ADDR) {
+    out = loc.v.addr;
+    out.offset += (i32)offset;
+    return out;
+  }
+  if (loc.kind == NATIVE_LOC_FRAME || loc.kind == NATIVE_LOC_STACK) {
+    out.base_kind = NATIVE_ADDR_BASE_FRAME;
+    out.base_type = loc.type;
+    if (loc.kind == NATIVE_LOC_FRAME) {
+      out.base.frame = loc.v.frame;
+      out.offset = (i32)offset;
+    } else {
+      out.base.frame = loc.v.stack.slot;
+      out.offset = loc.v.stack.offset + (i32)offset;
+    }
+    return out;
+  }
+  aa_panic(a, "location is not addressable");
+  return out;
+}
+
+/* Load the address of memory-backed location `src` into `dst`. */
+static void aa_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
+  aa_load_addr(t, dst, aa_loc_addr(aa_of(t), src, 0));
+}
+
+/* Bring one piece of `src` into register location `dst`.
+ *
+ * Register sources are moved and immediates materialized (`offset` and
+ * `size` are not consulted for either); FRAME / STACK / ADDR sources are
+ * loaded from `offset` bytes in, using `size` bytes (or the size of
+ * `dst.type` when `size` is 0). Other kinds panic. */
+static void aa_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
+                         u32 offset, u32 size) {
+  AANativeTarget* a = aa_of(t);
+  MemAccess mem = aa_mem_for_type(t, dst.type, size);
+  switch ((NativeLocKind)src.kind) {
+    case NATIVE_LOC_REG:
+      aa_move(t, dst, src);
+      return;
+    case NATIVE_LOC_FRAME:
+    case NATIVE_LOC_STACK:
+    case NATIVE_LOC_ADDR: {
+      NativeAddr from = aa_loc_addr(a, src, offset);
+      from.base_type = dst.type;
+      aa_emit_mem(a, 1, dst, from, mem);
+      return;
+    }
+    case NATIVE_LOC_IMM:
+      aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), src.v.imm);
+      return;
+    default:
+      break;
+  }
+  aa_panic(a, "unsupported call argument source");
+}
+
+/* Write register value `src` into one piece of `dst`.
+ *
+ * FRAME / STACK / ADDR destinations are stored to at `offset` bytes in,
+ * using `size` bytes (or the size of `src.type` when `size` is 0); register
+ * destinations receive a move. Other kinds panic. */
+static void aa_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
+                          u32 offset, u32 size) {
+  AANativeTarget* a = aa_of(t);
+  MemAccess mem = aa_mem_for_type(t, src.type, size);
+  switch ((NativeLocKind)dst.kind) {
+    case NATIVE_LOC_FRAME:
+    case NATIVE_LOC_STACK:
+    case NATIVE_LOC_ADDR: {
+      NativeAddr to = aa_loc_addr(a, dst, offset);
+      to.base_type = src.type;
+      aa_emit_mem(a, 0, src, to, mem);
+      return;
+    }
+    case NATIVE_LOC_REG:
+      aa_move(t, dst, src);
+      return;
+    default:
+      break;
+  }
+  aa_panic(a, "unsupported call return destination");
+}
+
+/* Store register value `src` at byte `stack_off` of the outgoing argument
+ * area. The base register is AA_SP for ordinary calls and AA_FP for tail
+ * calls. */
+static void aa_store_outgoing_part(NativeTarget* t, int tail_call,
+                                   u32 stack_off, NativeLoc src, u32 size) {
+  MemAccess mem = aa_mem_for_type(t, src.type, size);
+  NativeAddr slot;
+  memset(&slot, 0, sizeof slot);
+  slot.offset = (i32)stack_off;
+  slot.base_type = src.type;
+  slot.base_kind = NATIVE_ADDR_BASE_REG;
+  if (tail_call)
+    slot.base.reg = AA_FP;
+  else
+    slot.base.reg = AA_SP;
+  aa_emit_mem(aa_of(t), 0, src, slot, mem);
+}
+
+/* Return the ABI classification for argument `i` of a call.
+ *
+ * Named parameters use the entry from `abi`. For any other argument (no ABI
+ * info, or an index past abi->nparams) `scratch` is filled with a synthetic
+ * single-part direct argument: FP class for float types, INT otherwise,
+ * sized and aligned from the argument's type. The part is arena-allocated. */
+static const ABIArgInfo* aa_param_abi(NativeTarget* t,
+                                      const ABIFuncInfo* abi,
+                                      const NativeCallDesc* desc, u32 i,
+                                      ABIArgInfo* scratch) {
+  ABIArgPart* only;
+  if (abi && i < abi->nparams) return &abi->params[i];
+  memset(scratch, 0, sizeof *scratch);
+  scratch->kind = ABI_ARG_DIRECT;
+  scratch->flags = ABI_AF_NONE;
+  scratch->nparts = 1;
+  scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
+  only = (ABIArgPart*)scratch->parts;
+  if (cg_type_is_float(t->c, desc->args[i].type))
+    only->cls = ABI_CLASS_FP;
+  else
+    only->cls = ABI_CLASS_INT;
+  only->loc = ABI_LOC_REG;
+  only->size = type_size32(t, desc->args[i].type);
+  only->align = type_align32(t, desc->args[i].type);
+  only->src_offset = 0;
+  return scratch;
+}
+
+/* Bytes of stack an argument occupies when forced onto the stack: 0 for a
+ * missing or ignored argument, 8 for an indirect one, otherwise 8 per part
+ * (at least 8). */
+static u32 aa_class_stack_size(const ABIArgInfo* ai) {
+  if (!ai) return 0;
+  switch (ai->kind) {
+    case ABI_ARG_IGNORE:
+      return 0;
+    case ABI_ARG_INDIRECT:
+      return 8u;
+    default:
+      break;
+  }
+  return align_up_u32(ai->nparts ? ai->nparts * 8u : 8u, 8u);
+}
+
+/* Compute the size of the outgoing stack argument area for a call, rounded
+ * up to 16 bytes.
+ *
+ * Up to eight integer-class and eight FP-class pieces are counted as
+ * register-passed; every further piece takes 8 bytes of stack. Variadic
+ * arguments past the named parameters go wholly to the stack when the ABI
+ * sets vararg_on_stack. */
+static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
+  const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
+  u32 used_int = 0, used_fp = 0, bytes = 0;
+  u32 i;
+  for (i = 0; i < desc->nargs; ++i) {
+    ABIArgInfo synth;
+    const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &synth);
+    int on_stack = abi && abi->variadic && abi->vararg_on_stack &&
+                   i >= abi->nparams;
+    u32 p;
+    if (ai->kind == ABI_ARG_IGNORE) continue;
+    if (on_stack) {
+      bytes += aa_class_stack_size(ai);
+      continue;
+    }
+    if (ai->kind == ABI_ARG_INDIRECT) {
+      /* An indirect argument is one pointer: integer register or 8 bytes. */
+      if (used_int < 8u)
+        used_int++;
+      else
+        bytes += 8u;
+      continue;
+    }
+    for (p = 0; p < ai->nparts; ++p) {
+      u32* used = ai->parts[p].cls == ABI_CLASS_FP ? &used_fp : &used_int;
+      if (*used < 8u)
+        ++*used;
+      else
+        bytes += 8u;
+    }
+  }
+  return align_up_u32(bytes, 16u);
+}
+
+/* Build the call plan for `desc` into `plan` and emit the argument set-up
+ * code as a side effect.
+ *
+ * The plan receives the callee, flags, sret/variadic markers and the
+ * outgoing stack size (which also raises the target's max_outgoing). Each
+ * argument piece is then loaded into the next free register of its class
+ * (0..7) or stored to the outgoing stack area, mirroring the counting done
+ * in aa_call_stack_size. Finally plan->rets / plan->nrets describe where the
+ * call's result pieces arrive and where they must be stored. */
+static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
+ NativeCallPlan* plan) {
+ NativeCallPlanRet* rets;
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
+ memset(plan, 0, sizeof *plan);
+ /* Room for four return pieces.
+ * NOTE(review): abi->ret.nparts is not checked against this capacity in
+ * the DIRECT-return loop below -- confirm it can never exceed 4. */
+ rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
+ plan->callee = desc->callee;
+ plan->rets = rets;
+ plan->flags = desc->flags;
+ plan->has_sret = abi && abi->has_sret;
+ plan->is_variadic = abi && abi->variadic;
+ plan->stack_arg_size = aa_call_stack_size(t, desc);
+ if (plan->stack_arg_size > aa_of(t)->max_outgoing)
+ aa_of(t)->max_outgoing = plan->stack_arg_size;
+ {
+ u32 next_int = 0, next_fp = 0, stack = 0;
+ int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
+ /* Register 8 carries the sret pointer: a tail call reloads the one saved
+ * in sret_ptr_slot, an ordinary call passes the address of results[0]. */
+ if (abi && abi->has_sret) {
+ NativeLoc x8 = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64),
+ NATIVE_REG_INT, 8u);
+ if (desc->flags & CG_CALL_TAIL) {
+ AANativeTarget* a = aa_of(t);
+ NativeLoc saved =
+ aa_stack_loc(x8.type, a->sret_ptr_slot, 0);
+ aa_load_part(t, x8, saved, 0, 8);
+ } else if (desc->nresults) {
+ aa_addr_of_loc(t, x8, desc->results[0]);
+ }
+ }
+ for (u32 i = 0; i < desc->nargs; ++i) {
+ ABIArgInfo tmp;
+ const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp);
+ int force_stack = abi && abi->variadic && abi->vararg_on_stack &&
+ i >= abi->nparams;
+ if (ai->kind == ABI_ARG_IGNORE) continue;
+ /* Variadic tail forced to the stack: copied in 8-byte pieces through
+ * AA_TMP0. */
+ if (force_stack) {
+ NativeLoc tmpreg =
+ aa_reg_loc(desc->args[i].type, NATIVE_REG_INT, AA_TMP0);
+ u32 n = aa_class_stack_size(ai);
+ u32 off = 0;
+ while (off < n) {
+ aa_load_part(t, tmpreg, desc->args[i], off, 8);
+ aa_store_outgoing_part(t, tail_call, stack + off, tmpreg, 8);
+ off += 8;
+ }
+ stack += n;
+ continue;
+ }
+ /* Indirect argument: pass the address of the value, in the next integer
+ * register or, once those run out, on the stack via AA_TMP0. */
+ if (ai->kind == ABI_ARG_INDIRECT) {
+ NativeLoc ptr;
+ if (next_int < 8u) {
+ ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ next_int++);
+ aa_addr_of_loc(t, ptr, desc->args[i]);
+ } else {
+ ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ AA_TMP0);
+ aa_addr_of_loc(t, ptr, desc->args[i]);
+ aa_store_outgoing_part(t, tail_call, stack, ptr, 8);
+ stack += 8u;
+ }
+ continue;
+ }
+ /* Direct argument: each part goes to the next register of its class, or
+ * to an 8-byte stack slot staged through AA_TMP0 / FP register 16. */
+ for (u32 p = 0; p < ai->nparts; ++p) {
+ const ABIArgPart* part = &ai->parts[p];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ if (cls == NATIVE_REG_FP && next_fp < 8u) {
+ NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_fp++);
+ aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
+ } else if (cls == NATIVE_REG_INT && next_int < 8u) {
+ NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_int++);
+ aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
+ } else {
+ NativeLoc tmpreg =
+ aa_reg_loc(desc->args[i].type, cls, cls == NATIVE_REG_FP ? 16u
+ : AA_TMP0);
+ aa_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
+ aa_store_outgoing_part(t, tail_call, stack, tmpreg, part->size);
+ stack += 8u;
+ }
+ }
+ }
+ }
+ /* Direct return: one plan entry per ABI part, sourced from consecutive
+ * registers of the part's class and stored at the part's offset within
+ * results[0] when that is memory-backed. */
+ if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
+ u32 nr = 0, ni = 0, nf = 0;
+ for (u32 p = 0; p < abi->ret.nparts; ++p) {
+ const ABIArgPart* part = &abi->ret.parts[p];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ rets[nr].src = aa_reg_loc(desc->results[0].type, cls,
+ cls == NATIVE_REG_FP ? nf++ : ni++);
+ rets[nr].dst = desc->results[0];
+ if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
+ rets[nr].dst = aa_stack_loc(desc->results[0].type,
+ desc->results[0].v.frame,
+ (i32)part->src_offset);
+ else if (rets[nr].dst.kind == NATIVE_LOC_STACK)
+ rets[nr].dst.v.stack.offset += (i32)part->src_offset;
+ else if (rets[nr].dst.kind == NATIVE_LOC_ADDR)
+ rets[nr].dst.v.addr.offset += (i32)part->src_offset;
+ rets[nr].mem = aa_mem_for_type(t, desc->results[0].type, part->size);
+ nr++;
+ }
+ plan->nrets = nr;
+ } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
+ plan->nrets = 0;
+ } else if (!abi && desc->nresults) {
+ /* No ABI info: a single result is taken from integer register 0,
+ * whatever its type. */
+ rets[0].src = aa_reg_loc(desc->results[0].type, NATIVE_REG_INT, 0);
+ rets[0].dst = desc->results[0];
+ rets[0].mem = aa_mem_for_type(t, desc->results[0].type, 0);
+ plan->nrets = 1;
+ }
+}
+
+static void aa_ret(NativeTarget* t);
+
+/* Reserve a tail-call site: record its position and callee in tail_sites,
+ * then emit AA_TAIL_WORDS placeholder words (0xd503201f, the A64 NOP
+ * encoding). For a global callee a JUMP26 relocation is attached to the last
+ * placeholder word. */
+static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+  AANativeTarget* a = aa_of(t);
+  AATailSite* site;
+  u32 word;
+  if (a->ntail_sites == a->tail_sites_cap) {
+    /* Double the list (8 entries initially) in fresh arena storage. */
+    u32 grown = a->tail_sites_cap ? a->tail_sites_cap * 2u : 8u;
+    AATailSite* fresh = arena_zarray(t->c->tu, AATailSite, grown);
+    if (a->tail_sites)
+      memcpy(fresh, a->tail_sites, sizeof(*fresh) * a->ntail_sites);
+    a->tail_sites = fresh;
+    a->tail_sites_cap = grown;
+  }
+  site = &a->tail_sites[a->ntail_sites];
+  site->pos = t->mc->pos(t->mc);
+  site->callee = callee;
+  a->ntail_sites++;
+  for (word = 0; word < AA_TAIL_WORDS; ++word) aa_emit32(t->mc, 0xd503201fu);
+  if (callee.kind != NATIVE_LOC_GLOBAL) return;
+  t->mc->emit_reloc_at(t->mc, t->mc->section_id,
+                       site->pos + (AA_TAIL_WORDS - 1u) * 4u, R_AARCH64_JUMP26,
+                       callee.v.global.sym, callee.v.global.addend, 0, 0);
+}
+
+/* Emit the transfer of control for a planned call.
+ *
+ * Tail calls (CG_CALL_TAIL) reserve a tail site instead of branching
+ * directly. Ordinary calls emit BL with a CALL26 relocation for a global
+ * callee or BLR for a register callee. Any other callee kind panics. */
+static void aa_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
+  int callee_is_global = plan->callee.kind == NATIVE_LOC_GLOBAL;
+  int callee_is_reg = plan->callee.kind == NATIVE_LOC_REG;
+  if ((plan->flags & CG_CALL_TAIL) != 0) {
+    if (!callee_is_global && !callee_is_reg)
+      aa_panic(aa_of(t), "unsupported tail target");
+    aa_emit_tail_site(t, plan->callee);
+    return;
+  }
+  if (callee_is_global) {
+    aa_emit32(t->mc, aa64_bl(0));
+    /* The relocation targets the BL just emitted. */
+    t->mc->emit_reloc_at(t->mc, t->mc->section_id, t->mc->pos(t->mc) - 4u,
+                         R_AARCH64_CALL26, plan->callee.v.global.sym,
+                         plan->callee.v.global.addend, 0, 0);
+    return;
+  }
+  if (callee_is_reg) {
+    aa_emit32(t->mc, aa64_blr(loc_reg(plan->callee)));
+    return;
+  }
+  aa_panic(aa_of(t), "unsupported call target");
+}
+
+/* Plan how the current function hands `values` back to its caller.
+ *
+ * At most one value is supported (more panics). Outputs:
+ * - indirect (sret) return: the value is copied right here to the pointer
+ *   saved in sret_ptr_slot (via AA_TMP1) and no plan entries are produced
+ *   (*out_rets = NULL, *out_nrets = 0);
+ * - direct return: one entry per ABI part, sourced at the part's offset
+ *   within values[0] and destined for consecutive registers of the part's
+ *   class;
+ * - otherwise, a single entry targeting integer register 0.
+ *
+ * The entry array holds `max_parts` elements; an ABI description with more
+ * parts panics instead of writing past the array. */
+static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
+                        const NativeLoc* values, u32 nvalues,
+                        NativeCallPlanRet** out_rets, u32* out_nrets) {
+  const u32 max_parts = 4u;
+  const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
+  NativeCallPlanRet* rets = NULL;
+  u32 nr = 0;
+  if (nvalues > 1u) aa_panic(aa_of(t), "multiple returns unsupported");
+  if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
+    AANativeTarget* a = aa_of(t);
+    NativeLoc dstp = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64),
+                                NATIVE_REG_INT, AA_TMP1);
+    NativeLoc saved = aa_stack_loc(dstp.type, a->sret_ptr_slot, 0);
+    NativeAddr dst_addr, src_addr;
+    AggregateAccess access;
+    /* Reload the caller-provided result pointer and copy the value to it. */
+    aa_load_part(t, dstp, saved, 0, 8);
+    memset(&dst_addr, 0, sizeof dst_addr);
+    dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+    dst_addr.base.reg = AA_TMP1;
+    dst_addr.base_type = values[0].type;
+    src_addr = aa_loc_addr(a, values[0], 0);
+    src_addr.base_type = values[0].type;
+    memset(&access, 0, sizeof access);
+    access.type = values[0].type;
+    access.size = (u32)cg_type_size(t->c, values[0].type);
+    access.align = type_align32(t, values[0].type);
+    aa_copy_bytes(t, dst_addr, src_addr, access);
+    *out_rets = NULL;
+    *out_nrets = 0;
+    return;
+  }
+  /* Allocated only once the sret path (which needs no entries) is ruled
+   * out. */
+  if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, max_parts);
+  if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) {
+    u32 ni = 0, nf = 0;
+    if (abi->ret.nparts > max_parts) {
+      aa_panic(aa_of(t), "too many return parts");
+      /* Only reached if aa_panic returns: report an empty plan. */
+      *out_rets = NULL;
+      *out_nrets = 0;
+      return;
+    }
+    for (u32 p = 0; p < abi->ret.nparts; ++p) {
+      const ABIArgPart* part = &abi->ret.parts[p];
+      NativeAllocClass cls =
+          part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+      /* Memory-backed sources are narrowed to this part's offset. */
+      rets[nr].src = values[0];
+      if (rets[nr].src.kind == NATIVE_LOC_FRAME)
+        rets[nr].src = aa_stack_loc(values[0].type, values[0].v.frame,
+                                    (i32)part->src_offset);
+      else if (rets[nr].src.kind == NATIVE_LOC_STACK)
+        rets[nr].src.v.stack.offset += (i32)part->src_offset;
+      else if (rets[nr].src.kind == NATIVE_LOC_ADDR)
+        rets[nr].src.v.addr.offset += (i32)part->src_offset;
+      rets[nr].dst = aa_reg_loc(values[0].type, cls,
+                                cls == NATIVE_REG_FP ? nf++ : ni++);
+      rets[nr].mem = aa_mem_for_type(t, values[0].type, part->size);
+      nr++;
+    }
+  } else if (nvalues) {
+    rets[0].src = values[0];
+    rets[0].dst = aa_reg_loc(values[0].type, NATIVE_REG_INT, 0);
+    rets[0].mem = aa_mem_for_type(t, values[0].type, 0);
+    nr = 1;
+  }
+  *out_rets = rets;
+  *out_nrets = nr;
+}
+
+/* Return from the current function by jumping to its epilogue label. */
+static void aa_ret(NativeTarget* t) {
+  aa_jump(t, aa_of(t)->epilogue_label);
+}
+
+/* Register width, in bits, used to manipulate a bit-field storage unit of
+ * `storage_bytes` bytes: 64 for 8-byte units, 32 for everything else. */
+static u32 aa_bit_storage_reg_bits(u32 storage_bytes) {
+  if (storage_bytes == 8u) return 64u;
+  return 32u;
+}
+
+/* Emit `rd = rn << sh` for a constant shift, encoded as UBFM. A zero shift
+ * degenerates to a register move, omitted when rd == rn. */
+static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
+  u32 width = sf ? 64u : 32u;
+  if (sh != 0) {
+    aa_emit32(t->mc, aa_ubfm(sf, rd, rn, width - sh, width - 1u - sh));
+    return;
+  }
+  if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
+}
+
+/* Emit `rd = rn >> sh` (logical) for a constant shift, encoded as UBFM. A
+ * zero shift degenerates to a register move, omitted when rd == rn. */
+static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
+  if (sh != 0) {
+    u32 top_bit = sf ? 63u : 31u;
+    aa_emit32(t->mc, aa_ubfm(sf, rd, rn, sh, top_bit));
+    return;
+  }
+  if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
+}
+
+/* Emit `rd = rn >> sh` (arithmetic) for a constant shift, encoded as SBFM. A
+ * zero shift degenerates to a register move, omitted when rd == rn. */
+static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
+  if (sh != 0) {
+    u32 top_bit = sf ? 63u : 31u;
+    aa_emit32(t->mc, aa_sbfm(sf, rd, rn, sh, top_bit));
+    return;
+  }
+  if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
+}
+
+/* Load a bit-field into `dst`.
+ *
+ * The storage unit at `addr + bf.storage_offset` is loaded into the
+ * destination register, shifted left so the field's top bit reaches the top
+ * of the working register, then shifted right (arithmetically for signed
+ * fields, logically otherwise) to bring it down to bit 0. A zero storage
+ * size is treated as 4 bytes and a zero width as 1 bit. */
+static void aa_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
+                             BitFieldAccess bf) {
+  u32 unit_bytes = bf.storage.size ? bf.storage.size : 4u;
+  u32 reg_bits = aa_bit_storage_reg_bits(unit_bytes);
+  u32 field_bits = bf.bit_width ? bf.bit_width : 1u;
+  u32 sf = reg_bits == 64u;
+  u32 down = reg_bits - field_bits;
+  NativeLoc raw = dst;
+  raw.type = bf.storage.type ? bf.storage.type : dst.type;
+  aa_load_native(t, raw, aa_addr_plus(addr, bf.storage_offset), bf.storage);
+  aa_lsl_imm(t, sf, loc_reg(dst), loc_reg(dst),
+             reg_bits - (u32)bf.bit_offset - field_bits);
+  if (!bf.signed_) {
+    aa_lsr_imm(t, sf, loc_reg(dst), loc_reg(dst), down);
+    return;
+  }
+  aa_asr_imm(t, sf, loc_reg(dst), loc_reg(dst), down);
+}
+
+/* Store `src` into a bit-field with a read-modify-write of its storage unit.
+ *
+ * The unit at `addr + bf.storage_offset` is loaded into AA_TMP0 and the
+ * field's bits cleared with a mask held in AA_TMP1; the low `width` bits of
+ * `src` are then extracted into AA_TMP1, shifted to the field position,
+ * OR-ed in, and the unit written back. A zero storage size is treated as 4
+ * bytes and a zero width as 1 bit. */
+static void aa_bitfield_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
+                              BitFieldAccess bf) {
+  u32 unit_bytes = bf.storage.size ? bf.storage.size : 4u;
+  u32 reg_bits = aa_bit_storage_reg_bits(unit_bytes);
+  u32 field_bits = bf.bit_width ? bf.bit_width : 1u;
+  u32 sf = reg_bits == 64u;
+  u64 low_ones = field_bits >= 64u ? ~(u64)0 : ((1ull << field_bits) - 1ull);
+  u64 in_place = low_ones << bf.bit_offset;
+  NativeAddr unit_addr = aa_addr_plus(addr, bf.storage_offset);
+  NativeLoc unit =
+      aa_tmp_loc(bf.storage.type ? bf.storage.type : src.type, AA_TMP0);
+  /* unit &= ~mask */
+  aa_load_native(t, unit, unit_addr, bf.storage);
+  aa_emit_load_imm(t->mc, sf, AA_TMP1, (i64)~in_place);
+  aa_emit32(t->mc, aa64_and(sf, AA_TMP0, AA_TMP0, AA_TMP1));
+  /* unit |= (src & low_ones) << bit_offset */
+  aa_emit32(t->mc, aa_ubfm(sf, AA_TMP1, loc_reg(src), 0, field_bits - 1u));
+  aa_lsl_imm(t, sf, AA_TMP1, AA_TMP1, bf.bit_offset);
+  aa_emit32(t->mc, aa64_orr(sf, AA_TMP0, AA_TMP0, AA_TMP1));
+  aa_store_native(t, unit_addr, unit, bf.storage);
+}
+
+static void aa_trap(NativeTarget* t);
+
+/* Non-zero when `order` includes acquire semantics (consume, acquire,
+ * acq_rel or seq_cst). */
+static int aa_order_acquire(MemOrder order) {
+  switch (order) {
+    case MO_CONSUME:
+    case MO_ACQUIRE:
+    case MO_ACQ_REL:
+    case MO_SEQ_CST:
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+/* Non-zero when `order` includes release semantics (release, acq_rel or
+ * seq_cst). */
+static int aa_order_release(MemOrder order) {
+  switch (order) {
+    case MO_RELEASE:
+    case MO_ACQ_REL:
+    case MO_SEQ_CST:
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+/* Location for integer register `reg` viewed as the builtin 64-bit integer
+ * type. */
+static NativeLoc aa_i64_reg_loc(u32 reg) {
+  CfreeCgTypeId word_ty = builtin_id(CFREE_CG_BUILTIN_I64);
+  return aa_reg_loc(word_ty, NATIVE_REG_INT, reg);
+}
+
+/* Materialize `addr` into integer register `reg` through the target's
+ * load_addr hook, so atomic instructions can use a plain base register. */
+static void aa_atomic_addr_reg(NativeTarget* t, NativeAddr addr, u32 reg) {
+  t->load_addr(t, aa_i64_reg_loc(reg), addr);
+}
+
+/* Pick the lowest-numbered register in 11..15 that differs from `a`, `b`
+ * and `c`. Falls back to 15 if none qualifies. */
+static u32 aa_saved_tmp_pick(u32 a, u32 b, u32 c) {
+  u32 cand;
+  for (cand = 11u; cand <= 15u; ++cand) {
+    if (cand == a || cand == b || cand == c) continue;
+    return cand;
+  }
+  return 15u;
+}
+
+/* Save integer register `reg` to the target's saved-temporary frame slot,
+ * creating that 8-byte spill slot on first use. */
+static void aa_saved_tmp_spill(AANativeTarget* a, u32 reg) {
+  CfreeCgTypeId word_ty = builtin_id(CFREE_CG_BUILTIN_I64);
+  NativeAddr slot_addr;
+  if (a->saved_tmp_slot == NATIVE_FRAME_SLOT_NONE) {
+    NativeFrameSlotDesc want;
+    memset(&want, 0, sizeof want);
+    want.type = word_ty;
+    want.size = 8;
+    want.align = 8;
+    want.kind = NATIVE_FRAME_SLOT_SPILL;
+    a->saved_tmp_slot = a->base.frame_slot(&a->base, &want);
+  }
+  memset(&slot_addr, 0, sizeof slot_addr);
+  slot_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+  slot_addr.base.frame = a->saved_tmp_slot;
+  slot_addr.base_type = word_ty;
+  aa_store_native(&a->base, slot_addr, aa_i64_reg_loc(reg),
+                  aa_mem_for_type(&a->base, slot_addr.base_type, 8));
+}
+
+/* Reload integer register `reg` from the target's saved-temporary frame
+ * slot (the counterpart of aa_saved_tmp_spill). */
+static void aa_saved_tmp_restore(AANativeTarget* a, u32 reg) {
+  NativeAddr slot_addr;
+  memset(&slot_addr, 0, sizeof slot_addr);
+  slot_addr.base_type = builtin_id(CFREE_CG_BUILTIN_I64);
+  slot_addr.base.frame = a->saved_tmp_slot;
+  slot_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+  aa_load_native(&a->base, aa_i64_reg_loc(reg), slot_addr,
+                 aa_mem_for_type(&a->base, slot_addr.base_type, 8));
+}
+
+/* Emit an atomic load of `mem.size` bytes (or the size of `dst.type` when
+ * that is 0) from `addr` into `dst`.
+ *
+ * The address is first materialized in AA_TMP0. Orders with acquire
+ * semantics use LDAR, others a plain LDR; seq_cst additionally emits a DMB
+ * ISH after the load. */
+static void aa_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
+                           MemAccess mem, MemOrder order) {
+  u32 ptr = AA_TMP0;
+  u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type));
+  aa_atomic_addr_reg(t, addr, ptr);
+  if (aa_order_acquire(order))
+    aa_emit32(t->mc, aa_ldar(sz, loc_reg(dst), ptr));
+  else
+    aa_emit32(t->mc, aa_ldr_uimm(sz, loc_reg(dst), ptr, 0));
+  if (order != MO_SEQ_CST) return;
+  aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+}
+
+/* Emit an atomic store of `src` to `addr`, `mem.size` bytes wide (or the
+ * size of `src.type` when that is 0).
+ *
+ * The address is materialized in AA_TMP0. Orders with release semantics use
+ * STLR, others a plain STR; seq_cst brackets the whole sequence with DMB ISH
+ * barriers. */
+static void aa_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
+                            MemAccess mem, MemOrder order) {
+  u32 ptr = AA_TMP0;
+  u32 sz = size_idx(mem.size ? mem.size : type_size32(t, src.type));
+  int fenced = order == MO_SEQ_CST;
+  if (fenced) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+  aa_atomic_addr_reg(t, addr, ptr);
+  if (aa_order_release(order))
+    aa_emit32(t->mc, aa_stlr(sz, loc_reg(src), ptr));
+  else
+    aa_emit32(t->mc, aa_str_uimm(sz, loc_reg(src), ptr, 0));
+  if (fenced) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+}
+
+/* Emit an atomic read-modify-write as an exclusive-load / exclusive-store
+ * retry loop.
+ *
+ * `dst` receives the value read from `addr`; the updated value is computed
+ * into AA_TMP1 and stored back. The store-exclusive status lands in a
+ * register chosen by aa_saved_tmp_pick (distinct from `dst`, `val` and the
+ * address register), whose previous contents are spilled before and
+ * restored after the loop. seq_cst brackets the sequence with DMB ISH.
+ * Unsupported operations panic. */
+static void aa_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst,
+ NativeAddr addr, NativeLoc val, MemAccess mem,
+ MemOrder order) {
+ AANativeTarget* a = aa_of(t);
+ u32 base = AA_TMP0;
+ u32 next_reg = AA_TMP1;
+ u32 status = aa_saved_tmp_pick(loc_reg(dst), loc_reg(val), base);
+ NativeLoc next = aa_tmp_loc(dst.type, next_reg);
+ MCLabel retry = t->mc->label_new(t->mc);
+ u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type));
+ if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+ /* Preserve the status register, then pin the address in `base`; both
+ * happen once, outside the retry loop. */
+ aa_saved_tmp_spill(a, status);
+ aa_atomic_addr_reg(t, addr, base);
+ t->mc->label_place(t->mc, retry);
+ aa_emit32(t->mc, aa_order_acquire(order) ? aa_ldaxr(sz, loc_reg(dst), base)
+ : aa_ldxr(sz, loc_reg(dst), base));
+ /* next = f(dst, val); recomputed on every retry. */
+ switch (op) {
+ case AO_XCHG:
+ aa_move(t, next, val);
+ break;
+ case AO_ADD:
+ aa_binop(t, BO_IADD, next, dst, val);
+ break;
+ case AO_SUB:
+ aa_binop(t, BO_ISUB, next, dst, val);
+ break;
+ case AO_AND:
+ aa_binop(t, BO_AND, next, dst, val);
+ break;
+ case AO_OR:
+ aa_binop(t, BO_OR, next, dst, val);
+ break;
+ case AO_XOR:
+ aa_binop(t, BO_XOR, next, dst, val);
+ break;
+ case AO_NAND:
+ aa_binop(t, BO_AND, next, dst, val);
+ aa_unop(t, UO_BNOT, next, next);
+ break;
+ default:
+ aa_panic(a, "unsupported atomic rmw op");
+ }
+ aa_emit32(t->mc, aa_order_release(order) ? aa_stlxr(sz, status, next_reg, base)
+ : aa_stxr(sz, status, next_reg,
+ base));
+ /* A non-zero status means the exclusive store did not succeed: go again. */
+ aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0));
+ t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0);
+ aa_saved_tmp_restore(a, status);
+ if (order == MO_SEQ_CST) aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+}
+
+/* Emit an atomic compare-and-swap as an exclusive-load / exclusive-store
+ * loop.
+ *
+ * `prior` receives the value read from `addr`. If it equals `expected`,
+ * `desired` is stored (retrying while the exclusive store reports failure)
+ * and `ok` is set to 1; otherwise the exclusive monitor is cleared and `ok`
+ * is set to 0. AA_TMP0 holds the address and AA_TMP1 the store status.
+ * The load is an acquire if either order has acquire semantics; the store is
+ * a release if `success` has release semantics; DMB ISH brackets the
+ * sequence when either order is seq_cst. */
+static void aa_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
+ NativeAddr addr, NativeLoc expected,
+ NativeLoc desired, MemAccess mem, MemOrder success,
+ MemOrder failure) {
+ u32 base = AA_TMP0;
+ u32 status = AA_TMP1;
+ u32 sz = size_idx(mem.size ? mem.size : type_size32(t, prior.type));
+ /* NOTE(review): the compare below is 64-bit only for sz == 3 and 32-bit
+ * otherwise, so for sub-word accesses it presumably relies on `expected`
+ * already matching how the exclusive load fills the register -- confirm. */
+ u32 sf = sz == 3u;
+ int acquire = aa_order_acquire(success) || aa_order_acquire(failure);
+ int release = aa_order_release(success);
+ MCLabel retry = t->mc->label_new(t->mc);
+ MCLabel fail = t->mc->label_new(t->mc);
+ MCLabel done = t->mc->label_new(t->mc);
+ if (success == MO_SEQ_CST || failure == MO_SEQ_CST)
+ aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+ aa_atomic_addr_reg(t, addr, base);
+ t->mc->label_place(t->mc, retry);
+ aa_emit32(t->mc, acquire ? aa_ldaxr(sz, loc_reg(prior), base)
+ : aa_ldxr(sz, loc_reg(prior), base));
+ /* Mismatch: leave for `fail` without attempting the store. */
+ aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(prior), loc_reg(expected)));
+ aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_NE)}));
+ t->mc->emit_label_ref(t->mc, fail, R_AARCH64_CONDBR19, 4, 0);
+ aa_emit32(t->mc, release ? aa_stlxr(sz, status, loc_reg(desired), base)
+ : aa_stxr(sz, status, loc_reg(desired), base));
+ /* A non-zero status means the exclusive store did not succeed: retry. */
+ aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0));
+ t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0);
+ aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 1);
+ aa_jump(t, done);
+ t->mc->label_place(t->mc, fail);
+ /* Failure path: clear the exclusive monitor and report 0. */
+ aa_emit32(t->mc, aa64_clrex(AA64_BARRIER_OPT_SY));
+ aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 0);
+ t->mc->label_place(t->mc, done);
+ if (success == MO_SEQ_CST || failure == MO_SEQ_CST)
+ aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+}
+
+/* Emit a memory fence: DMB ISH for every order except relaxed, which emits
+ * nothing. */
+static void aa_fence(NativeTarget* t, MemOrder order) {
+  if (order == MO_RELAXED) return;
+  aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+}
+
+static void aa_intrinsic(NativeTarget* t, IntrinKind kind,
+ const NativeLoc* dsts, u32 ndst,
+ const NativeLoc* args, u32 narg) {
+ AggregateAccess access;
+ NativeAddr dst_addr;
+ NativeAddr src_addr;
+ memset(&access, 0, sizeof access);
+ memset(&dst_addr, 0, sizeof dst_addr);
+ memset(&src_addr, 0, sizeof src_addr);
+ switch (kind) {
+ case INTRIN_NONE:
+ if (ndst == 1u && narg == 3u && loc_is_fp(dsts[0])) {
+ u32 d = type_size32(t, dsts[0].type) == 8u;
+ aa_emit32(t->mc, aa_fp_bin(0x000800u, d, loc_reg(dsts[0]),
+ loc_reg(args[0]), loc_reg(args[1])));
+ aa_emit32(t->mc, aa_fp_bin(0x002800u, d, loc_reg(dsts[0]),
+ loc_reg(dsts[0]), loc_reg(args[2])));
+ return;
+ }
+ break;
+ case INTRIN_CLZ:
+ if (ndst == 1u && narg == 1u) {
+ aa_emit32(t->mc, aa_clz(loc_is_64(t, args[0]), loc_reg(dsts[0]),
+ loc_reg(args[0])));
+ return;
+ }
+ break;
+ case INTRIN_CTZ:
+ if (ndst == 1u && narg == 1u) {
+ u32 sf = loc_is_64(t, args[0]);
+ aa_emit32(t->mc, aa_rbit(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+ aa_emit32(t->mc, aa_clz(sf, loc_reg(dsts[0]), loc_reg(dsts[0])));
+ return;
+ }
+ break;
+ case INTRIN_POPCOUNT:
+ if (ndst == 1u && narg == 1u) {
+ u32 sf = loc_is_64(t, args[0]);
+ u32 rd = loc_reg(dsts[0]);
+ u32 rn = loc_reg(args[0]);
+ MCLabel loop = t->mc->label_new(t->mc);
+ MCLabel done = t->mc->label_new(t->mc);
+ aa_emit_load_imm(t->mc, sf, rd, 0);
+ aa_emit32(t->mc, aa64_mov_reg(sf, AA_TMP0, rn));
+ t->mc->label_place(t->mc, loop);
+ aa_emit32(t->mc, aa64_cbz(sf, AA_TMP0, 0));
+ t->mc->emit_label_ref(t->mc, done, R_AARCH64_CONDBR19, 4, 0);
+ aa_emit_load_imm(t->mc, sf, AA_TMP1, 1);
+ aa_emit32(t->mc, aa64_and(sf, AA_TMP1, AA_TMP0, AA_TMP1));
+ aa_emit32(t->mc, aa64_add(sf, rd, rd, AA_TMP1));
+ aa_emit_load_imm(t->mc, sf, AA_TMP1, 1);
+ aa_emit32(t->mc, aa64_lsrv(sf, AA_TMP0, AA_TMP0, AA_TMP1));
+ aa_jump(t, loop);
+ t->mc->label_place(t->mc, done);
+ return;
+ }
+ break;
+ case INTRIN_BSWAP16:
+ case INTRIN_BSWAP32:
+ case INTRIN_BSWAP64:
+ if (ndst == 1u && narg == 1u) {
+ u32 sf = kind == INTRIN_BSWAP64;
+ aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+ if (kind == INTRIN_BSWAP16) {
+ aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
+ aa_emit32(t->mc, aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]),
+ AA_TMP0));
+ }
+ return;
+ }
+ break;
+ case INTRIN_SADD_OVERFLOW:
+ case INTRIN_UADD_OVERFLOW:
+ case INTRIN_SSUB_OVERFLOW:
+ case INTRIN_USUB_OVERFLOW:
+ if (ndst == 2u && narg == 2u) {
+ u32 sf = loc_is_64(t, dsts[0]);
+ u32 rd = loc_reg(dsts[0]);
+ if (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW)
+ aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){
+ .sf = sf, .op = 0, .S = 1,
+ .Rm = loc_reg(args[1]), .Rn = loc_reg(args[0]),
+ .Rd = rd}));
+ else
+ aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){
+ .sf = sf, .op = 1, .S = 1,
+ .Rm = loc_reg(args[1]), .Rn = loc_reg(args[0]),
+ .Rd = rd}));
+ aa_emit32(t->mc,
+ aa_cset(loc_is_64(t, dsts[1]), loc_reg(dsts[1]),
+ (kind == INTRIN_SADD_OVERFLOW ||
+ kind == INTRIN_SSUB_OVERFLOW)
+ ? 0x6u
+ : (kind == INTRIN_UADD_OVERFLOW ? 0x2u : 0x3u)));
+ return;
+ }
+ break;
+ case INTRIN_SMUL_OVERFLOW:
+ case INTRIN_UMUL_OVERFLOW:
+ if (ndst == 2u && narg == 2u) {
+ u32 sf = loc_is_64(t, dsts[0]);
+ if (sf) {
+ if (kind == INTRIN_SMUL_OVERFLOW) {
+ aa_emit32(t->mc, aa_smulh(AA_TMP0, loc_reg(args[0]),
+ loc_reg(args[1])));
+ aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]),
+ loc_reg(args[1])));
+ aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 63, 63));
+ aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1));
+ aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
+ } else {
+ aa_emit32(t->mc, aa_umulh(AA_TMP0, loc_reg(args[0]),
+ loc_reg(args[1])));
+ aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]),
+ loc_reg(args[1])));
+ aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA64_ZR));
+ aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
+ }
+ } else if (kind == INTRIN_SMUL_OVERFLOW) {
+ aa_emit32(t->mc, aa_smaddl(AA_TMP0, loc_reg(args[0]),
+ loc_reg(args[1]), AA64_ZR));
+ aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0));
+ aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 0, 31));
+ aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1));
+ aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
+ } else {
+ aa_emit32(t->mc, aa_umaddl(AA_TMP0, loc_reg(args[0]),
+ loc_reg(args[1]), AA64_ZR));
+ aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0));
+ aa_emit_load_imm(t->mc, 1, AA_TMP1, 32);
+ aa_emit32(t->mc, aa64_lsrv(1, AA_TMP1, AA_TMP0, AA_TMP1));
+ aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP1, AA64_ZR));
+ aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
+ }
+ return;
+ }
+ break;
+ case INTRIN_MEMCPY:
+ if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
+ args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
+ aa_panic(aa_of(t), "unsupported memory intrinsic operands");
+ if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
+ aa_panic(aa_of(t), "unsupported memory intrinsic size");
+ access.size = (u32)args[2].v.imm;
+ access.align = 1u;
+ dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ dst_addr.base.reg = args[0].v.reg;
+ src_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ src_addr.base.reg = args[1].v.reg;
+ aa_copy_bytes(t, dst_addr, src_addr, access);
+ return;
+ case INTRIN_MEMMOVE: {
+ MCLabel forward = t->mc->label_new(t->mc);
+ MCLabel done = t->mc->label_new(t->mc);
+ if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
+ args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
+ aa_panic(aa_of(t), "unsupported memory intrinsic operands");
+ if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
+ aa_panic(aa_of(t), "unsupported memory intrinsic size");
+ access.size = (u32)args[2].v.imm;
+ access.align = 1u;
+ dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ dst_addr.base.reg = args[0].v.reg;
+ src_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ src_addr.base.reg = args[1].v.reg;
+ aa_emit32(t->mc,
+ aa_subs_reg(1, AA64_ZR, args[0].v.reg, args[1].v.reg));
+ aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_LT_U)}));
+ t->mc->emit_label_ref(t->mc, forward, R_AARCH64_CONDBR19, 4, 0);
+ aa_copy_bytes_dir(t, dst_addr, src_addr, access, 1);
+ aa_jump(t, done);
+ t->mc->label_place(t->mc, forward);
+ aa_copy_bytes_dir(t, dst_addr, src_addr, access, 0);
+ t->mc->label_place(t->mc, done);
+ return;
+ }
+ case INTRIN_MEMSET:
+ if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
+ args[2].kind != NATIVE_LOC_IMM)
+ aa_panic(aa_of(t), "unsupported memset operands");
+ if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
+ aa_panic(aa_of(t), "unsupported memset size");
+ access.size = (u32)args[2].v.imm;
+ access.align = 1u;
+ dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ dst_addr.base.reg = args[0].v.reg;
+ if (args[1].kind == NATIVE_LOC_IMM) {
+ NativeLoc byte = aa_tmp_loc(builtin_id(CFREE_CG_BUILTIN_I8), AA_TMP0);
+ aa_emit_load_imm(t->mc, 0, AA_TMP0, args[1].v.imm & 0xff);
+ aa_set_bytes(t, dst_addr, byte, access);
+ } else {
+ aa_set_bytes(t, dst_addr, args[1], access);
+ }
+ return;
+ case INTRIN_EXPECT:
+ case INTRIN_ASSUME_ALIGNED:
+ if (ndst == 1u && narg >= 1u) {
+ if (args[0].kind == NATIVE_LOC_IMM)
+ aa_load_imm_native(t, dsts[0], args[0].v.imm);
+ else
+ aa_move(t, dsts[0], args[0]);
+ }
+ return;
+ case INTRIN_PREFETCH:
+ return;
+ case INTRIN_TRAP:
+ case INTRIN_UNREACHABLE:
+ aa_trap(t);
+ return;
+ default:
+ aa_panic(aa_of(t), "unsupported compiler intrinsic");
+ }
+}
+
+ /* Emit a single BRK instruction with immediate 0 as the trap sequence. */
+ static void aa_trap(NativeTarget* t) {
+   const u32 brk = aa64_brk(0);
+   aa_emit32(t->mc, brk);
+ }
+
+ /* Assemble a file-scope asm string: open an in-memory lexer over the `len`
+  * bytes at `src`, run the assembler parser against the target's emitter,
+  * then close the lexer. */
+ static void aa_file_scope_asm(NativeTarget* t, const char* src, size_t len) {
+   AsmLexer* lexer;
+
+   lexer = asm_lex_open_mem(t->c, "<file-scope-asm>", src, len);
+   asm_parse(t->c, lexer, t->mc);
+   asm_lex_close(lexer);
+ }
+
+ /* Remember the current source location on the target and, when the emitter
+  * exists and provides a set_loc hook, forward the location to it. */
+ static void aa_set_loc(NativeTarget* t, SrcLoc loc) {
+   aa_of(t)->loc = loc;
+   if (!t->mc || !t->mc->set_loc) return;
+   t->mc->set_loc(t->mc, loc);
+ }
+
+ /* Finalization hook: emit the EH frame through the emitter, if there is one. */
+ static void aa_finalize(NativeTarget* t) {
+   if (!t->mc) return;
+   mc_emit_eh_frame(t->mc);
+ }
+
+ /* Forward declaration: aa64_native_target_new installs this function as the
+  * bind_param hook before its definition appears later in the file. */
+ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
+ NativeFrameSlot home);
+
+ /* Register-number lists referenced by aa_classes: per class, the registers
+  * offered for allocation and the registers listed as scratch. */
+ static const Reg aa_int_allocable[] = {
+     8u, 11u, 12u, 13u, 14u, 15u,
+ };
+ static const Reg aa_int_scratch[] = {
+     9u, 10u,
+ };
+ static const Reg aa_fp_allocable[] = {
+     18u, 19u,
+ };
+ static const Reg aa_fp_scratch[] = {
+     20u, 21u,
+ };
+
+ /* Initializer builders for integer-class NativePhysRegInfo entries.  Every
+  * variant funnels through AA_PHYS_INT_ENTRY_, differing only in abi_index,
+  * flags and costs:
+  *   ALLOC    - allocable, caller-saved
+  *   CALLER   - caller-saved only
+  *   ARG      - caller-saved argument register, abi_index == register number;
+  *              registers 0 and 1 additionally carry NATIVE_REG_RET
+  *   CALLEE   - callee-saved, spill cost 4
+  *   RESERVED - reserved, zero costs */
+ #define AA_PHYS_INT_ENTRY_(r, abi, fl, spill, copy)                       \
+   {.reg = (r), .cls = NATIVE_REG_INT, .abi_index = (abi), .flags = (fl),  \
+    .spill_cost = (spill), .copy_cost = (copy)}
+ #define AA_PHYS_INT_ALLOC(r)                                              \
+   AA_PHYS_INT_ENTRY_((r), 0xffu,                                          \
+                      NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 1u, 1u)
+ #define AA_PHYS_INT_CALLER(r) \
+   AA_PHYS_INT_ENTRY_((r), 0xffu, NATIVE_REG_CALLER_SAVED, 1u, 1u)
+ #define AA_PHYS_INT_ARG(r)                                                \
+   AA_PHYS_INT_ENTRY_((r), (r),                                            \
+                      NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG |           \
+                          ((r) < 2u ? NATIVE_REG_RET : 0),                 \
+                      1u, 1u)
+ #define AA_PHYS_INT_CALLEE(r) \
+   AA_PHYS_INT_ENTRY_((r), 0xffu, NATIVE_REG_CALLEE_SAVED, 4u, 1u)
+ #define AA_PHYS_INT_RESERVED(r) \
+   AA_PHYS_INT_ENTRY_((r), 0xffu, NATIVE_REG_RESERVED, 0u, 0u)
+
+ /* Per-register description of integer registers 0..31, in register order. */
+ static const NativePhysRegInfo aa_int_phys[] = {
+     /* 0-7: argument registers */
+     AA_PHYS_INT_ARG(0u),
+     AA_PHYS_INT_ARG(1u),
+     AA_PHYS_INT_ARG(2u),
+     AA_PHYS_INT_ARG(3u),
+     AA_PHYS_INT_ARG(4u),
+     AA_PHYS_INT_ARG(5u),
+     AA_PHYS_INT_ARG(6u),
+     AA_PHYS_INT_ARG(7u),
+     /* 8-15: allocable, except 9 and 10 which are reserved */
+     AA_PHYS_INT_ALLOC(8u),
+     AA_PHYS_INT_RESERVED(9u),
+     AA_PHYS_INT_RESERVED(10u),
+     AA_PHYS_INT_ALLOC(11u),
+     AA_PHYS_INT_ALLOC(12u),
+     AA_PHYS_INT_ALLOC(13u),
+     AA_PHYS_INT_ALLOC(14u),
+     AA_PHYS_INT_ALLOC(15u),
+     /* 16-18: reserved */
+     AA_PHYS_INT_RESERVED(16u),
+     AA_PHYS_INT_RESERVED(17u),
+     AA_PHYS_INT_RESERVED(18u),
+     /* 19-28: callee-saved */
+     AA_PHYS_INT_CALLEE(19u),
+     AA_PHYS_INT_CALLEE(20u),
+     AA_PHYS_INT_CALLEE(21u),
+     AA_PHYS_INT_CALLEE(22u),
+     AA_PHYS_INT_CALLEE(23u),
+     AA_PHYS_INT_CALLEE(24u),
+     AA_PHYS_INT_CALLEE(25u),
+     AA_PHYS_INT_CALLEE(26u),
+     AA_PHYS_INT_CALLEE(27u),
+     AA_PHYS_INT_CALLEE(28u),
+     /* 29-31: reserved */
+     AA_PHYS_INT_RESERVED(29u),
+     AA_PHYS_INT_RESERVED(30u),
+     AA_PHYS_INT_RESERVED(31u),
+ };
+
+ /* Initializer builders for FP-class NativePhysRegInfo entries.  Every variant
+  * funnels through AA_PHYS_FP_ENTRY_, differing only in abi_index, flags and
+  * costs:
+  *   ALLOC    - allocable, caller-saved
+  *   CALLER   - caller-saved only
+  *   ARG      - caller-saved argument register, abi_index == register number;
+  *              registers 0..3 additionally carry NATIVE_REG_RET
+  *   CALLEE   - callee-saved, spill cost 4
+  *   RESERVED - reserved, zero costs */
+ #define AA_PHYS_FP_ENTRY_(r, abi, fl, spill, copy)                       \
+   {.reg = (r), .cls = NATIVE_REG_FP, .abi_index = (abi), .flags = (fl),  \
+    .spill_cost = (spill), .copy_cost = (copy)}
+ #define AA_PHYS_FP_ALLOC(r)                                              \
+   AA_PHYS_FP_ENTRY_((r), 0xffu,                                          \
+                     NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 1u, 1u)
+ #define AA_PHYS_FP_CALLER(r) \
+   AA_PHYS_FP_ENTRY_((r), 0xffu, NATIVE_REG_CALLER_SAVED, 1u, 1u)
+ #define AA_PHYS_FP_ARG(r)                                                \
+   AA_PHYS_FP_ENTRY_((r), (r),                                            \
+                     NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG |           \
+                         ((r) < 4u ? NATIVE_REG_RET : 0),                 \
+                     1u, 1u)
+ #define AA_PHYS_FP_CALLEE(r) \
+   AA_PHYS_FP_ENTRY_((r), 0xffu, NATIVE_REG_CALLEE_SAVED, 4u, 1u)
+ #define AA_PHYS_FP_RESERVED(r) \
+   AA_PHYS_FP_ENTRY_((r), 0xffu, NATIVE_REG_RESERVED, 0u, 0u)
+
+ /* Per-register description of FP registers 0..31, in register order. */
+ static const NativePhysRegInfo aa_fp_phys[] = {
+     /* 0-7: argument registers */
+     AA_PHYS_FP_ARG(0u),
+     AA_PHYS_FP_ARG(1u),
+     AA_PHYS_FP_ARG(2u),
+     AA_PHYS_FP_ARG(3u),
+     AA_PHYS_FP_ARG(4u),
+     AA_PHYS_FP_ARG(5u),
+     AA_PHYS_FP_ARG(6u),
+     AA_PHYS_FP_ARG(7u),
+     /* 8-15: callee-saved */
+     AA_PHYS_FP_CALLEE(8u),
+     AA_PHYS_FP_CALLEE(9u),
+     AA_PHYS_FP_CALLEE(10u),
+     AA_PHYS_FP_CALLEE(11u),
+     AA_PHYS_FP_CALLEE(12u),
+     AA_PHYS_FP_CALLEE(13u),
+     AA_PHYS_FP_CALLEE(14u),
+     AA_PHYS_FP_CALLEE(15u),
+     /* 16-17: caller-saved, not allocable */
+     AA_PHYS_FP_CALLER(16u),
+     AA_PHYS_FP_CALLER(17u),
+     /* 18-19: allocable */
+     AA_PHYS_FP_ALLOC(18u),
+     AA_PHYS_FP_ALLOC(19u),
+     /* 20-21: reserved */
+     AA_PHYS_FP_RESERVED(20u),
+     AA_PHYS_FP_RESERVED(21u),
+     /* 22-31: caller-saved, not allocable */
+     AA_PHYS_FP_CALLER(22u),
+     AA_PHYS_FP_CALLER(23u),
+     AA_PHYS_FP_CALLER(24u),
+     AA_PHYS_FP_CALLER(25u),
+     AA_PHYS_FP_CALLER(26u),
+     AA_PHYS_FP_CALLER(27u),
+     AA_PHYS_FP_CALLER(28u),
+     AA_PHYS_FP_CALLER(29u),
+     AA_PHYS_FP_CALLER(30u),
+     AA_PHYS_FP_CALLER(31u),
+ };
+
+ /* Allocation-class descriptors: one entry for the integer class and one for
+  * the FP class.  Fields not named in an entry are zero-initialized. */
+ static const NativeAllocClassInfo aa_classes[] = {
+     {
+         .cls = NATIVE_REG_INT,
+         .allocable = aa_int_allocable,
+         .nallocable = sizeof(aa_int_allocable) / sizeof(aa_int_allocable[0]),
+         .scratch = aa_int_scratch,
+         .nscratch = sizeof(aa_int_scratch) / sizeof(aa_int_scratch[0]),
+         .phys = aa_int_phys,
+         .nphys = sizeof(aa_int_phys) / sizeof(aa_int_phys[0]),
+         .caller_saved_mask = 0x0007ffffu, /* bits 0..18 */
+         .callee_saved_mask = 0x1ff80000u, /* bits 19..28 */
+         .arg_mask = 0x000000ffu,          /* bits 0..7 */
+         .ret_mask = 0x00000003u,          /* bits 0..1 */
+         .reserved_mask = (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << AA_FP) |
+                          (1u << AA_LR),
+     },
+     {
+         .cls = NATIVE_REG_FP,
+         .allocable = aa_fp_allocable,
+         .nallocable = sizeof(aa_fp_allocable) / sizeof(aa_fp_allocable[0]),
+         .scratch = aa_fp_scratch,
+         .nscratch = sizeof(aa_fp_scratch) / sizeof(aa_fp_scratch[0]),
+         .phys = aa_fp_phys,
+         .nphys = sizeof(aa_fp_phys) / sizeof(aa_fp_phys[0]),
+         /* NOTE(review): every FP register is in the caller-saved mask and no
+          * callee_saved_mask/reserved_mask is given, although aa_fp_phys flags
+          * 8..15 as callee-saved and 20..21 as reserved - confirm intended. */
+         .caller_saved_mask = 0xffffffffu,
+         .arg_mask = 0x000000ffu, /* bits 0..7 */
+         .ret_mask = 0x0000000fu, /* bits 0..3 */
+     },
+ };
+
+ /* Register description handed to the NativeTarget via its `regs` field. */
+ static const NativeRegInfo aa_reg_info = {
+     .nclasses = sizeof(aa_classes) / sizeof(aa_classes[0]),
+     .classes = aa_classes,
+ };
+
+ /* Create the aarch64 NativeTarget.
+  *
+  * Allocates a zeroed AANativeTarget from the compiler's `tu` arena, records
+  * the compiler, object builder and emitter, and fills in every hook of the
+  * embedded NativeTarget with the aa_* implementations from this file.
+  * Returns the embedded NativeTarget, or NULL if the allocation yields NULL. */
+ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
+                                      MCEmitter* mc) {
+   AANativeTarget* self = arena_znew(c->tu, AANativeTarget);
+   NativeTarget* vt;
+
+   if (self == NULL) return NULL;
+   vt = &self->base;
+
+   /* Context. */
+   vt->c = c;
+   vt->obj = obj;
+   vt->mc = mc;
+   vt->regs = &aa_reg_info;
+
+   /* Type/address queries. */
+   vt->class_for_type = aa_class_for_type;
+   vt->addr_legal = aa_addr_legal;
+
+   /* Function lifecycle and frame. */
+   vt->func_begin = aa_func_begin;
+   vt->func_begin_known_frame = aa_func_begin_known_frame;
+   vt->note_frame_state = aa_note_frame_state;
+   vt->func_end = aa_func_end;
+   vt->frame_slot = aa_frame_slot;
+   vt->bind_param = aa_bind_native_param;
+
+   /* Labels and control flow. */
+   vt->label_new = aa_label_new;
+   vt->label_place = aa_label_place;
+   vt->jump = aa_jump;
+   vt->cmp_branch = aa_cmp_branch;
+   vt->indirect_branch = aa_indirect_branch;
+   vt->load_label_addr = aa_load_label_addr;
+
+   /* Moves, loads and stores. */
+   vt->move = aa_move;
+   vt->load_imm = aa_load_imm_native;
+   vt->load_const = aa_load_const;
+   vt->load_addr = aa_load_addr;
+   vt->load = aa_load_native;
+   vt->store = aa_store_native;
+   vt->tls_addr_of = aa_tls_addr_of;
+   vt->copy_bytes = aa_copy_bytes;
+   vt->set_bytes = aa_set_bytes;
+   vt->bitfield_load = aa_bitfield_load;
+   vt->bitfield_store = aa_bitfield_store;
+
+   /* Arithmetic and conversions. */
+   vt->binop = aa_binop;
+   vt->unop = aa_unop;
+   vt->cmp = aa_cmp;
+   vt->convert = aa_convert;
+
+   /* Stack and register management. */
+   vt->alloca_ = aa_alloca;
+   vt->spill = aa_spill;
+   vt->reload = aa_reload;
+
+   /* Calls and returns. */
+   vt->plan_call = aa_plan_call;
+   vt->emit_call = aa_emit_call;
+   vt->plan_ret = aa_plan_ret;
+   vt->ret = aa_ret;
+
+   /* Atomics. */
+   vt->atomic_load = aa_atomic_load;
+   vt->atomic_store = aa_atomic_store;
+   vt->atomic_rmw = aa_atomic_rmw;
+   vt->atomic_cas = aa_atomic_cas;
+   vt->fence = aa_fence;
+
+   /* Miscellaneous. */
+   vt->intrinsic = aa_intrinsic;
+   vt->file_scope_asm = aa_file_scope_asm;
+   vt->trap = aa_trap;
+   vt->set_loc = aa_set_loc;
+   vt->finalize = aa_finalize;
+   return vt;
+ }
+
+ /* Copy one incoming parameter from where the ABI delivered it into its frame
+  * home slot.
+  *
+  *   t    - native target; the current function's ABI info is looked up from
+  *          its fn_type.
+  *   p    - parameter descriptor (index, type, optional size/align).
+  *   home - frame slot that receives the parameter's value.
+  *
+  * Parameters with no ABI entry, or classified ABI_ARG_IGNORE, are skipped.
+  * Incoming registers are consumed in order per class through next_param_int
+  * and next_param_fp (eight of each); once a class is exhausted the value is
+  * loaded from [AA_FP + next_param_stack] and next_param_stack advances by 8.
+  *
+  * incoming_stack_size is refreshed on both the indirect and the direct path,
+  * so a trailing by-reference parameter whose pointer arrived on the stack is
+  * accounted for as well (the indirect path previously returned without
+  * updating it). */
+ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
+                                  NativeFrameSlot home) {
+   AANativeTarget* a = aa_of(t);
+   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
+   const ABIArgInfo* ai = p->index < abi->nparams ? &abi->params[p->index] : NULL;
+   if (!ai || ai->kind == ABI_ARG_IGNORE) return;
+   if (ai->kind == ABI_ARG_INDIRECT) {
+     /* The caller passed a pointer to the aggregate: take it from the next
+      * integer argument register, or from the stack via AA_TMP0. */
+     NativeLoc src = aa_reg_loc(p->type, NATIVE_REG_INT,
+                                a->next_param_int < 8u ? a->next_param_int++
+                                                       : AA_TMP0);
+     if (src.v.reg == AA_TMP0) {
+       NativeAddr saddr;
+       memset(&saddr, 0, sizeof saddr);
+       saddr.base_kind = NATIVE_ADDR_BASE_REG;
+       saddr.base.reg = AA_FP;
+       saddr.offset = (i32)a->next_param_stack;
+       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8));
+       a->next_param_stack += 8u;
+     }
+     /* Copy the pointed-to bytes into the home slot. */
+     NativeAddr dst, from;
+     AggregateAccess access;
+     memset(&dst, 0, sizeof dst);
+     dst.base_kind = NATIVE_ADDR_BASE_FRAME;
+     dst.base.frame = home;
+     dst.base_type = p->type;
+     memset(&from, 0, sizeof from);
+     from.base_kind = NATIVE_ADDR_BASE_REG;
+     from.base.reg = src.v.reg;
+     from.base_type = p->type;
+     memset(&access, 0, sizeof access);
+     access.type = p->type;
+     access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
+     access.align = p->align ? p->align : type_align32(t, p->type);
+     aa_copy_bytes(t, dst, from, access);
+     /* Keep the incoming stack-area size in step with next_param_stack, the
+      * same way the direct path below does. */
+     a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
+     return;
+   }
+   for (u32 i = 0; i < ai->nparts; ++i) {
+     const ABIArgPart* part = &ai->parts[i];
+     NativeAllocClass cls =
+         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+     NativeLoc src;
+     if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
+       src = aa_reg_loc(p->type, cls, a->next_param_fp++);
+     } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
+       src = aa_reg_loc(p->type, cls, a->next_param_int++);
+     } else {
+       /* Out of argument registers for this class: load the part from the
+        * incoming stack area into FP register 16 or AA_TMP0. */
+       src = aa_reg_loc(p->type, cls, cls == NATIVE_REG_FP ? 16u : AA_TMP0);
+       NativeAddr saddr;
+       memset(&saddr, 0, sizeof saddr);
+       saddr.base_kind = NATIVE_ADDR_BASE_REG;
+       saddr.base.reg = AA_FP;
+       saddr.base_type = p->type;
+       saddr.offset = (i32)a->next_param_stack;
+       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size));
+       a->next_param_stack += 8u;
+     }
+     aa_store_part(t, aa_stack_loc(p->type, home, (i32)part->src_offset),
+                   src, 0, part->size);
+   }
+   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
+ }
+
+ /* NativeDirectTarget adapter: bind parameter `p` into the home slot of the
+  * direct local `l`.  The CGLocal handle itself is not needed here. */
+ static void aa_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
+                           CGLocal local, NativeDirectLocal* l) {
+   NativeTarget* native = d->native;
+   (void)local;
+   aa_bind_native_param(native, p, l->home);
+ }
+
+ /* Decide whether `call` may NOT be emitted as a tail call.
+  *
+  * Builds a NativeCallDesc whose arguments and results are the frame homes of
+  * the referenced locals, asks aa_call_stack_size how much stack the call
+  * needs, and compares that with the current function's incoming stack size.
+  * Returns a diagnostic string when the call needs more, otherwise NULL. */
+ static const char* aa_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
+   NativeCallDesc desc;
+   NativeLoc* arg_locs = NULL;
+   NativeLoc* res_locs = NULL;
+   u32 needed;
+   u32 i;
+
+   memset(&desc, 0, sizeof desc);
+   if (call->nargs)
+     arg_locs = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+   if (call->nresults)
+     res_locs = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+
+   /* CGLocal handles are 1-based indices into d->locals. */
+   for (i = 0; i < call->nargs; ++i) {
+     NativeLoc* out = &arg_locs[i];
+     out->kind = NATIVE_LOC_FRAME;
+     out->type = d->locals[call->args[i] - 1u].type;
+     out->cls = d->locals[call->args[i] - 1u].cls;
+     out->v.frame = d->locals[call->args[i] - 1u].home;
+   }
+   for (i = 0; i < call->nresults; ++i) {
+     NativeLoc* out = &res_locs[i];
+     out->kind = NATIVE_LOC_FRAME;
+     out->type = d->locals[call->results[i] - 1u].type;
+     out->cls = d->locals[call->results[i] - 1u].cls;
+     out->v.frame = d->locals[call->results[i] - 1u].home;
+   }
+
+   desc.fn_type = call->fn_type;
+   desc.args = arg_locs;
+   desc.nargs = call->nargs;
+   desc.results = res_locs;
+   desc.nresults = call->nresults;
+
+   needed = aa_call_stack_size(d->native, &desc);
+   return needed > aa_of(d->native)->incoming_stack_size
+              ? "aarch64 tail call: stack argument area too small"
+              : NULL;
+ }
+
+ /* Translate an addressable operand into a NativeAddr without emitting code.
+  *
+  *   OPK_LOCAL    -> the local's frame home (base kind FRAME).
+  *   OPK_INDIRECT -> base kind FRAME_VALUE naming the base local's home, plus
+  *                   the operand's offset; class and type come from that local.
+  * Any other operand kind is a compiler panic. */
+ static NativeAddr aa_direct_addr(NativeDirectTarget* d, Operand op) {
+   const OpKind kind = (OpKind)op.kind;
+   NativeAddr result;
+
+   memset(&result, 0, sizeof result);
+   if (kind == OPK_LOCAL) {
+     result.base_kind = NATIVE_ADDR_BASE_FRAME;
+     result.base.frame = d->locals[op.v.local - 1u].home;
+     result.base_type = op.type;
+     return result;
+   }
+   if (kind == OPK_INDIRECT) {
+     result.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
+     result.base.frame = d->locals[op.v.ind.base - 1u].home;
+     result.cls = d->locals[op.v.ind.base - 1u].cls;
+     result.base_type = d->locals[op.v.ind.base - 1u].type;
+     result.offset = op.v.ind.ofs;
+     return result;
+   }
+   compiler_panic(d->base.c, d->loc,
+                  "aarch64 native target: operand is not addressable");
+ }
+
+ /* Like aa_direct_addr, but resolves a FRAME_VALUE base into a register base:
+  * the pointer stored in the frame slot is loaded into AA_TMP1 and the
+  * returned address uses AA_TMP1 as its base register.  Other address kinds
+  * are returned unchanged and emit nothing. */
+ static NativeAddr aa_direct_materialize_addr(NativeDirectTarget* d,
+                                              Operand op) {
+   NativeAddr addr = aa_direct_addr(d, op);
+   NativeAddr slot;
+   NativeLoc ptr;
+
+   if (addr.base_kind != NATIVE_ADDR_BASE_FRAME_VALUE) return addr;
+
+   ptr = aa_reg_loc(addr.base_type, NATIVE_REG_INT, AA_TMP1);
+   memset(&slot, 0, sizeof slot);
+   slot.base_kind = NATIVE_ADDR_BASE_FRAME;
+   slot.base.frame = addr.base.frame;
+   slot.base_type = addr.base_type;
+   aa_emit_mem(aa_of(d->native), 1, ptr, slot,
+               aa_mem_for_type(d->native, addr.base_type, 8));
+
+   addr.base_kind = NATIVE_ADDR_BASE_REG;
+   addr.base.reg = AA_TMP1;
+   return addr;
+ }
+
+ /* Treat `op` as holding a pointer and return the address it points at.
+  *
+  * For an OPK_LOCAL operand the local's frame slot is loaded into AA_TMP1 and
+  * the result is a register-based address on AA_TMP1.  Every other operand
+  * kind is delegated to aa_direct_materialize_addr. */
+ static NativeAddr aa_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
+   NativeAddr result;
+   NativeAddr slot;
+   NativeLoc ptr;
+
+   if (op.kind != OPK_LOCAL) return aa_direct_materialize_addr(d, op);
+
+   ptr = aa_reg_loc(op.type, NATIVE_REG_INT, AA_TMP1);
+   memset(&slot, 0, sizeof slot);
+   slot.base_kind = NATIVE_ADDR_BASE_FRAME;
+   slot.base.frame = d->locals[op.v.local - 1u].home;
+   slot.base_type = op.type;
+   aa_emit_mem(aa_of(d->native), 1, ptr, slot,
+               aa_mem_for_type(d->native, op.type, 8));
+
+   memset(&result, 0, sizeof result);
+   result.base_kind = NATIVE_ADDR_BASE_REG;
+   result.base.reg = AA_TMP1;
+   result.base_type = op.type;
+   return result;
+ }
+
+ /* Build a zeroed NativeAddr describing [reg + offset] with base type `type`. */
+ static NativeAddr aa_reg_addr(CfreeCgTypeId type, u32 reg, i32 offset) {
+   NativeAddr out;
+
+   memset(&out, 0, sizeof out);
+   out.base_type = type;
+   out.base_kind = NATIVE_ADDR_BASE_REG;
+   out.base.reg = reg;
+   out.offset = offset;
+   return out;
+ }
+
+ /* Compute the address designated by the va_list operand `ap_addr` (via
+  * aa_direct_pointer_addr) and place it in integer register `dst_reg` using
+  * the target's load_addr hook. */
+ static void aa_load_ap_addr(NativeDirectTarget* d, Operand ap_addr,
+                             u32 dst_reg) {
+   NativeTarget* native = d->native;
+   NativeLoc into = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64),
+                               NATIVE_REG_INT, dst_reg);
+   NativeAddr where = aa_direct_pointer_addr(d, ap_addr);
+
+   native->load_addr(native, into, where);
+ }
+
+ /* Emit va_start for the va_list designated by `ap_addr`.
+  *
+  * Two layouts reported by abi_va_list_layout are handled; anything else is a
+  * compiler panic:
+  *   ABI_VA_LIST_POINTER - the va_list is a single pointer, set to
+  *                         AA_FP + next_param_stack.
+  *   ABI_VA_LIST_AAPCS64 - the five fields at the offsets given by `vai` are
+  *                         filled in, addressing the va_list through
+  *                         register 15.
+  * AA_TMP0 carries pointer values and AA_TMP1 carries the 32-bit offsets. */
+ static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) {
+ AANativeTarget* a = aa_of(d->native);
+ ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+ NativeLoc ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ AA_TMP0);
+ /* NOTE(review): dst is only consumed by the POINTER layout, yet for an
+  * OPK_LOCAL operand this call emits a load into AA_TMP1 in every case. */
+ NativeAddr dst = aa_direct_pointer_addr(d, ap_addr);
+ if (vai.kind == ABI_VA_LIST_POINTER) {
+ /* *ap = AA_FP + next_param_stack */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+ aa_emit_mem(a, 0, ptr, dst, aa_mem_for_type(d->native, ptr.type, 8));
+ return;
+ }
+ if (vai.kind == ABI_VA_LIST_AAPCS64) {
+ CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32);
+ NativeLoc i32tmp = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1);
+ MemAccess ptr_mem = aa_mem_for_type(d->native, ptr.type, 8);
+ MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4);
+ AANativeSlot* gr = aa_slot(a, a->va_gr_slot);
+ AANativeSlot* vr = aa_slot(a, a->va_vr_slot);
+ /* Number of named parameters that consumed registers, clamped to the
+  * register counts of the va_list layout. */
+ u32 used_gr = a->next_param_int < vai.gp_reg_count ? a->next_param_int
+ : vai.gp_reg_count;
+ u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp
+ : vai.fp_reg_count;
+ /* Register 15 = address of the va_list object. */
+ aa_load_ap_addr(d, ap_addr, 15u);
+ /* stack field = AA_FP + next_param_stack */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+ aa_emit_mem(a, 0, ptr,
+ aa_reg_addr(ptr.type, 15u, (i32)vai.stack_offset), ptr_mem);
+ /* gr_top field = AA_FP - gr->off + gp_reg_count * gp_slot_size */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP,
+ -(i32)gr->off +
+ (i32)(vai.gp_reg_count * vai.gp_slot_size));
+ aa_emit_mem(a, 0, ptr,
+ aa_reg_addr(ptr.type, 15u, (i32)vai.gr_top_offset), ptr_mem);
+ /* vr_top field = AA_FP - vr->off + fp_reg_count * fp_slot_size */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP,
+ -(i32)vr->off +
+ (i32)(vai.fp_reg_count * vai.fp_slot_size));
+ aa_emit_mem(a, 0, ptr,
+ aa_reg_addr(ptr.type, 15u, (i32)vai.vr_top_offset), ptr_mem);
+ /* gr_offs field = -(unused GP registers * gp_slot_size) */
+ aa_emit_load_imm(a->base.mc, 0, AA_TMP1,
+ -(i32)((vai.gp_reg_count - used_gr) * vai.gp_slot_size));
+ aa_emit_mem(a, 0, i32tmp,
+ aa_reg_addr(i32_ty, 15u, (i32)vai.gr_offs_offset), i32_mem);
+ /* vr_offs field = -(unused FP registers * fp_slot_size) */
+ aa_emit_load_imm(a->base.mc, 0, AA_TMP1,
+ -(i32)((vai.fp_reg_count - used_vr) * vai.fp_slot_size));
+ aa_emit_mem(a, 0, i32tmp,
+ aa_reg_addr(i32_ty, 15u, (i32)vai.vr_offs_offset), i32_mem);
+ return;
+ }
+ {
+ compiler_panic(d->base.c, d->loc,
+ "aarch64 native target: unsupported va_list layout");
+ }
+ }
+
+ /* Emit va_arg: fetch the next variadic argument of `type` from the va_list
+  * designated by `ap_addr` and store it to `dst_op`.
+  *
+  * The value travels through FP register 16 for floating-point types and
+  * integer register 9 otherwise; AA_TMP0 holds the pointer being read and
+  * AA_TMP1 the 32-bit save-area offset.  Layouts other than POINTER and
+  * AAPCS64 are a compiler panic. */
+ static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr,
+ CfreeCgTypeId type) {
+ AANativeTarget* a = aa_of(d->native);
+ ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+ NativeLoc cur = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ AA_TMP0);
+ NativeLoc val =
+ aa_reg_loc(type, cg_type_is_float(d->base.c, type) ? NATIVE_REG_FP
+ : NATIVE_REG_INT,
+ cg_type_is_float(d->base.c, type) ? 16u : 9u);
+ NativeAddr src, dst;
+ MemAccess ptr_mem = aa_mem_for_type(d->native, cur.type, 8);
+ MemAccess val_mem = aa_mem_for_type(d->native, type, type_size32(d->native, type));
+ if (vai.kind == ABI_VA_LIST_POINTER) {
+ /* cur = *ap; val = *cur; *ap = cur + 8; *dst = val */
+ NativeAddr ap = aa_direct_pointer_addr(d, ap_addr);
+ aa_emit_mem(a, 1, cur, ap, ptr_mem);
+ src = aa_reg_addr(type, AA_TMP0, 0);
+ aa_emit_mem(a, 1, val, src, val_mem);
+ aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8);
+ aa_emit_mem(a, 0, cur, ap, ptr_mem);
+ dst = aa_direct_materialize_addr(d, dst_op);
+ aa_emit_mem(a, 0, val, dst, val_mem);
+ return;
+ }
+ if (vai.kind == ABI_VA_LIST_AAPCS64) {
+ CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32);
+ NativeLoc off = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1);
+ MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4);
+ int is_fp = cg_type_is_float(d->base.c, type);
+ /* Pick the FP or GP set of va_list fields according to the value type. */
+ u32 offs_field = is_fp ? vai.vr_offs_offset : vai.gr_offs_offset;
+ u32 top_field = is_fp ? vai.vr_top_offset : vai.gr_top_offset;
+ u32 slot_size = is_fp ? vai.fp_slot_size : vai.gp_slot_size;
+ MCLabel stack_label = d->native->mc->label_new(d->native->mc);
+ MCLabel done_label = d->native->mc->label_new(d->native->mc);
+ /* Register 15 = address of the va_list; AA_TMP1 = current offs field. */
+ aa_load_ap_addr(d, ap_addr, 15u);
+ aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem);
+ /* Compare offs with 0 and take the stack path on CMP_GE_S. */
+ aa_emit32(a->base.mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0));
+ aa_emit32(a->base.mc,
+ aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)}));
+ a->base.mc->emit_label_ref(a->base.mc, stack_label, R_AARCH64_CONDBR19, 4,
+ 0);
+ /* Register save area path: val = *(top + offs); offs += slot_size. */
+ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)top_field),
+ ptr_mem);
+ /* presumably sign-extends the 32-bit offset to 64 bits - confirm against
+  * aa_sbfm's encoding */
+ aa_emit32(a->base.mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31));
+ aa_emit32(a->base.mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1));
+ aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
+ aa_emit_add_imm(a, AA_TMP1, AA_TMP1, (i32)slot_size);
+ aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem);
+ aa_emit32(a->base.mc, aa64_b(0));
+ a->base.mc->emit_label_ref(a->base.mc, done_label, R_AARCH64_JUMP26, 4, 0);
+ /* Stack path: val = *stack; stack += 8. */
+ a->base.mc->label_place(a->base.mc, stack_label);
+ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset),
+ ptr_mem);
+ aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
+ aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8);
+ aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset),
+ ptr_mem);
+ /* Both paths join here with the fetched value in `val`. */
+ a->base.mc->label_place(a->base.mc, done_label);
+ dst = aa_direct_materialize_addr(d, dst_op);
+ aa_emit_mem(a, 0, val, dst, val_mem);
+ return;
+ }
+ compiler_panic(d->base.c, d->loc,
+ "aarch64 native target: unsupported va_list layout");
+ }
+
+ /* va_end emits no code on this target; both parameters are unused. */
+ static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) {
+   (void)ap_addr;
+   (void)d;
+ }
+
+ /* Emit va_copy from the va_list at `src_ap_addr` to the one at `dst_ap_addr`.
+  *
+  * POINTER layout: a single 8-byte value is copied through AA_TMP0.
+  * AAPCS64 layout: the source address goes to register 14, the destination
+  * address to register 15, and the object is copied 8 bytes at a time through
+  * AA_TMP0 for vai.type.size bytes.  Other layouts are a compiler panic. */
+ static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr,
+                         Operand src_ap_addr) {
+   AANativeTarget* a = aa_of(d->native);
+   ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+   NativeLoc word = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+                               AA_TMP0);
+   MemAccess word_mem = aa_mem_for_type(d->native, word.type, 8);
+
+   switch (vai.kind) {
+     case ABI_VA_LIST_POINTER: {
+       /* Load the source word before resolving the destination address. */
+       NativeAddr from = aa_direct_pointer_addr(d, src_ap_addr);
+       NativeAddr to;
+       aa_emit_mem(a, 1, word, from, word_mem);
+       to = aa_direct_pointer_addr(d, dst_ap_addr);
+       aa_emit_mem(a, 0, word, to, word_mem);
+       return;
+     }
+     case ABI_VA_LIST_AAPCS64: {
+       u32 off = 0;
+       aa_load_ap_addr(d, src_ap_addr, 14u);
+       aa_load_ap_addr(d, dst_ap_addr, 15u);
+       while (off < vai.type.size) {
+         aa_emit_mem(a, 1, word, aa_reg_addr(word.type, 14u, (i32)off),
+                     word_mem);
+         aa_emit_mem(a, 0, word, aa_reg_addr(word.type, 15u, (i32)off),
+                     word_mem);
+         off += 8u;
+       }
+       return;
+     }
+     default:
+       break;
+   }
+   compiler_panic(d->base.c, d->loc,
+                  "aarch64 native target: unsupported va_list layout");
+ }
+
+ /* Return the constraint string with its leading modifiers removed.
+  *
+  * Strips one optional direction modifier ('=' or '+') followed by one
+  * optional early-clobber marker ('&'), so "=r", "+r", "=&r", "+&r" and "&r"
+  * all yield "r".  A NULL constraint yields the empty string.  The returned
+  * pointer aliases `s` (or a string literal for NULL).
+  *
+  * "+&" used to be stripped only up to the '&', leaving "&r", which callers
+  * then rejected as an unsupported constraint. */
+ AA_UNUSED_FN static const char* aa_asm_constraint_body(const char* s) {
+   if (!s) return "";
+   if (s[0] == '=' || s[0] == '+') ++s;
+   if (s[0] == '&') ++s;
+   return s;
+ }
+
+ /* Return 1 when the constraint string carries the early-clobber marker '&',
+  * either first or directly after a direction modifier ('=' or '+'), and 0
+  * otherwise (including for NULL).
+  *
+  * The read-write form "+&" was previously not recognised. */
+ AA_UNUSED_FN static int aa_asm_constraint_early(const char* s) {
+   if (!s) return 0;
+   if (s[0] == '=' || s[0] == '+') ++s;
+   return s[0] == '&';
+ }
+
+ /* Parse the leading decimal digits of a matching constraint such as "0".
+  *
+  * Returns -1 when `s` is NULL or does not start with a digit; otherwise the
+  * value of the leading digit run (characters after the run are ignored).
+  * A run too large for `int` saturates at the largest int value rather than
+  * overflowing, which is undefined behaviour for signed arithmetic. */
+ AA_UNUSED_FN static int aa_asm_match_index(const char* s) {
+   const int limit = (int)(~0u >> 1); /* largest int value */
+   int n = 0;
+   if (!s || s[0] < '0' || s[0] > '9') return -1;
+   for (const char* p = s; *p >= '0' && *p <= '9'; ++p) {
+     int digit = *p - '0';
+     /* n * 10 + digit would exceed `limit`: clamp instead of wrapping. */
+     if (n > (limit - digit) / 10) return limit;
+     n = n * 10 + digit;
+   }
+   return n;
+ }
+
+ /* Abort compilation with an inline-asm diagnostic: `msg` is reported at the
+  * target's current location, prefixed with "aarch64 inline asm: ". */
+ _Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) {
+   compiler_panic(d->base.c,
+                  d->loc,
+                  "aarch64 inline asm: %s",
+                  msg);
+ }
+
+ /* Fill `*out` with an arch-private register operand for the inline-asm
+  * binder: kind AA64_INLINE_OPK_REG, the register class encoded in pad[0],
+  * and the register number stored in v.local. */
+ AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, CfreeCgTypeId type,
+                                           NativeAllocClass cls, Reg reg) {
+   memset(out, 0, sizeof *out);
+   out->type = type;
+   out->kind = AA64_INLINE_OPK_REG;
+   if (cls == NATIVE_REG_FP)
+     out->pad[0] = AA64_INLINE_OPCLS_FP;
+   else
+     out->pad[0] = AA64_INLINE_OPCLS_INT;
+   out->v.local = (CGLocal)reg;
+ }
+
+ /* Fill `*out` with an OPK_INDIRECT operand whose base field carries the
+  * register number `base` and whose index is CG_LOCAL_NONE. */
+ AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, CfreeCgTypeId type,
+                                           Reg base) {
+   memset(out, 0, sizeof *out);
+   out->type = type;
+   out->kind = OPK_INDIRECT;
+   out->v.ind.index = CG_LOCAL_NONE;
+   out->v.ind.base = (CGLocal)base;
+ }
+
+ /* Resolve a clobber name to a register.
+  *
+  * Returns 0 for an empty name and for the non-register clobbers "cc" and
+  * "memory".  Otherwise the name is looked up with aa64_register_index; a
+  * result of 0..30 is an integer register and 64..95 is FP register
+  * (result - 64), written to *cls_out / *reg_out with a return value of 1.
+  * Names that are too long, unknown, or map to any other register number are
+  * an inline-asm panic. */
+ static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name,
+                                     NativeAllocClass* cls_out, Reg* reg_out) {
+   Slice text = pool_slice(d->base.c->global, name);
+   char name_buf[16];
+   uint32_t regno;
+
+   if (!text.s || !text.len) return 0;
+   if (text.len == 2 && memcmp(text.s, "cc", 2) == 0) return 0;
+   if (text.len == 6 && memcmp(text.s, "memory", 6) == 0) return 0;
+
+   /* Copy into a NUL-terminated buffer for the register lookup. */
+   if (text.len >= sizeof name_buf) aa_asm_panic(d, "clobber name is too long");
+   memcpy(name_buf, text.s, text.len);
+   name_buf[text.len] = '\0';
+   if (aa64_register_index(name_buf, &regno) != 0)
+     aa_asm_panic(d, "unknown clobber register");
+
+   if (regno <= 30u) {
+     *reg_out = (Reg)regno;
+     *cls_out = NATIVE_REG_INT;
+     return 1;
+   }
+   if (regno >= 64u && regno <= 95u) {
+     *reg_out = (Reg)(regno - 64u);
+     *cls_out = NATIVE_REG_FP;
+     return 1;
+   }
+   aa_asm_panic(d, "unsupported clobber register");
+   return 0;
+ }
+
+ /* Build per-class bit masks of the registers named in an inline-asm clobber
+  * list.
+  *
+  *   clobbers/nclob - clobber names; entries that aa_asm_parse_reg_clobber
+  *                    reports as non-registers are skipped.
+  *   int_mask       - receives bit r for every clobbered integer register r.
+  *   fp_mask        - receives bit r for every clobbered FP register r.
+  * Both masks are reset to 0 first. */
+ AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d,
+                                               const Sym* clobbers, u32 nclob,
+                                               u32* int_mask, u32* fp_mask) {
+   *int_mask = 0;
+   *fp_mask = 0;
+   for (u32 i = 0; i < nclob; ++i) {
+     NativeAllocClass cls;
+     Reg reg;
+     /* `&reg` had been mangled into a registered-trademark sign here. */
+     if (!aa_asm_parse_reg_clobber(d, clobbers[i], &cls, &reg)) continue;
+     if (cls == NATIVE_REG_INT)
+       *int_mask |= 1u << reg;
+     else if (cls == NATIVE_REG_FP)
+       *fp_mask |= 1u << reg;
+   }
+ }
+
+ /* Pick the first register of class `cls` that is not yet marked in the
+  * matching used-mask (*used_fp for FP, *used_int otherwise), mark it, and
+  * return it.  Candidates are tried in the fixed order of the pools below;
+  * running out is an inline-asm panic. */
+ AA_UNUSED_FN static Reg aa_asm_alloc_reg(NativeDirectTarget* d,
+                                          NativeAllocClass cls, u32* used_int,
+                                          u32* used_fp) {
+   static const Reg int_pool[] = {0u, 1u, 2u,  3u,  4u,  5u,  6u,
+                                  7u, 8u, 11u, 12u, 13u, 14u, 15u};
+   static const Reg fp_pool[] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,
+                                 16u, 17u, 18u, 19u, 22u, 23u, 24u, 25u,
+                                 26u, 27u, 28u, 29u, 30u, 31u};
+   const Reg* candidates;
+   u32 count;
+   u32* taken;
+
+   if (cls == NATIVE_REG_FP) {
+     candidates = fp_pool;
+     count = (u32)(sizeof fp_pool / sizeof fp_pool[0]);
+     taken = used_fp;
+   } else {
+     candidates = int_pool;
+     count = (u32)(sizeof int_pool / sizeof int_pool[0]);
+     taken = used_int;
+   }
+
+   for (u32 i = 0; i < count; ++i) {
+     const u32 bit = 1u << candidates[i];
+     if ((*taken & bit) == 0) {
+       *taken |= bit;
+       return candidates[i];
+     }
+   }
+   aa_asm_panic(d, "out of registers for asm operands");
+   return REG_NONE;
+ }
+
+ /* Map the first character of a stripped constraint to a register class:
+  * 'r' -> integer, 'w' -> FP.  Anything else is an inline-asm panic. */
+ AA_UNUSED_FN static NativeAllocClass
+ aa_asm_constraint_class(NativeDirectTarget* d, const char* body) {
+   switch (body[0]) {
+     case 'r':
+       return NATIVE_REG_INT;
+     case 'w':
+       return NATIVE_REG_FP;
+     default:
+       break;
+   }
+   aa_asm_panic(d, "constraint is not a register constraint");
+   return NATIVE_REG_INT;
+ }
+
+ /* Load the value of an inline-asm input operand into register `dst`.
+  *
+  *   OPK_IMM      - load the immediate (integer class only; an FP destination
+  *                  is an inline-asm panic).
+  *   OPK_LOCAL    - load from the local's frame home.
+  *   OPK_GLOBAL   - load the symbol's address (sym + addend) via load_addr.
+  *   OPK_INDIRECT - materialize the address, then load from it.
+  * Any other operand kind is an inline-asm panic. */
+ AA_UNUSED_FN static void aa_direct_load_operand_to_reg(NativeDirectTarget* d,
+                                                        Operand op,
+                                                        NativeLoc dst) {
+   const OpKind kind = (OpKind)op.kind;
+   NativeAddr from;
+
+   memset(&from, 0, sizeof from);
+   if (kind == OPK_IMM) {
+     if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
+       aa_asm_panic(d, "floating-point immediate asm input is unsupported");
+     d->native->load_imm(d->native, dst, op.v.imm);
+     return;
+   }
+   if (kind == OPK_LOCAL) {
+     from.base_kind = NATIVE_ADDR_BASE_FRAME;
+     from.base.frame = d->locals[op.v.local - 1u].home;
+     from.base_type = op.type;
+     aa_emit_mem(aa_of(d->native), 1, dst, from,
+                 aa_mem_for_type(d->native, op.type, 0));
+     return;
+   }
+   if (kind == OPK_GLOBAL) {
+     from.base_kind = NATIVE_ADDR_BASE_GLOBAL;
+     from.base.global.sym = op.v.global.sym;
+     from.base.global.addend = op.v.global.addend;
+     from.base_type = op.type;
+     d->native->load_addr(d->native, dst, from);
+     return;
+   }
+   if (kind == OPK_INDIRECT) {
+     from = aa_direct_materialize_addr(d, op);
+     aa_emit_mem(aa_of(d->native), 1, dst, from,
+                 aa_mem_for_type(d->native, op.type, 0));
+     return;
+   }
+   aa_asm_panic(d, "unsupported asm input operand");
+ }
+
+ /* Load the address of an addressable operand (as computed by aa_direct_addr)
+  * into register `dst` through the target's load_addr hook. */
+ AA_UNUSED_FN static void aa_direct_load_address_to_reg(NativeDirectTarget* d,
+                                                        Operand op,
+                                                        NativeLoc dst) {
+   NativeTarget* native = d->native;
+   NativeAddr where = aa_direct_addr(d, op);
+
+   native->load_addr(native, dst, where);
+ }
+
+ /* Store register `src` to the memory designated by `op`: the frame home for
+  * an OPK_LOCAL operand, otherwise the address produced by
+  * aa_direct_materialize_addr. */
+ AA_UNUSED_FN static void aa_direct_store_reg_to_operand(NativeDirectTarget* d,
+                                                         Operand op,
+                                                         NativeLoc src) {
+   NativeAddr target;
+
+   memset(&target, 0, sizeof target);
+   if (op.kind != OPK_LOCAL) {
+     target = aa_direct_materialize_addr(d, op);
+   } else {
+     target.base_kind = NATIVE_ADDR_BASE_FRAME;
+     target.base.frame = d->locals[op.v.local - 1u].home;
+     target.base_type = op.type;
+   }
+   aa_emit_mem(aa_of(d->native), 0, src, target,
+               aa_mem_for_type(d->native, op.type, 0));
+ }
+
+ /* Record of one register saved around an inline-asm block, as produced by
+  * aa_asm_save_callee_clobbers and consumed by aa_asm_restore_one. */
+ typedef struct AAAsmSavedClobber {
+ NativeFrameSlot slot; /* frame slot holding the saved value (set by aa_asm_save_one) */
+ NativeAllocClass cls; /* register class: NATIVE_REG_INT or NATIVE_REG_FP */
+ Reg reg; /* register number within cls */
+ CfreeCgTypeId type; /* type used for the save/restore memory access */
+ } AAAsmSavedClobber;
+
+ /* Allocate an 8-byte, 8-aligned save slot for the register described by `s`,
+  * record the slot in s->slot, and store the register into it. */
+ static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) {
+   NativeFrameSlotDesc want;
+   NativeAddr where;
+   NativeLoc value;
+
+   memset(&want, 0, sizeof want);
+   want.kind = NATIVE_FRAME_SLOT_SAVE;
+   want.type = s->type;
+   want.size = 8;
+   want.align = 8;
+   s->slot = d->native->frame_slot(d->native, &want);
+
+   memset(&where, 0, sizeof where);
+   where.base_kind = NATIVE_ADDR_BASE_FRAME;
+   where.base_type = s->type;
+   where.base.frame = s->slot;
+
+   value = aa_reg_loc(s->type, s->cls, s->reg);
+   aa_emit_mem(aa_of(d->native), 0, value, where,
+               aa_mem_for_type(d->native, s->type, 8));
+ }
+
+ /* Reload the register described by `s` from the frame slot it was saved to. */
+ AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d,
+                                             const AAAsmSavedClobber* s) {
+   NativeLoc value = aa_reg_loc(s->type, s->cls, s->reg);
+   NativeAddr where;
+
+   memset(&where, 0, sizeof where);
+   where.base_type = s->type;
+   where.base_kind = NATIVE_ADDR_BASE_FRAME;
+   where.base.frame = s->slot;
+   aa_emit_mem(aa_of(d->native), 1, value, where,
+               aa_mem_for_type(d->native, s->type, 8));
+ }
+
+ /* Save every clobbered register in the ranges this function scans: integer
+  * registers 19..28 selected by `int_mask` (saved as I64) and FP registers
+  * 8..15 selected by `fp_mask` (saved as F64).
+  *
+  * Returns an arena-allocated array with room for 20 records and writes the
+  * number of records filled in to *nsaved_out; at most 18 can be produced by
+  * the two ranges. */
+ AA_UNUSED_FN static AAAsmSavedClobber*
+ aa_asm_save_callee_clobbers(NativeDirectTarget* d, u32 int_mask, u32 fp_mask,
+                             u32* nsaved_out) {
+   AAAsmSavedClobber* records =
+       arena_zarray(d->base.c->tu, AAAsmSavedClobber, 20u);
+   CfreeCgTypeId int_ty = builtin_id(CFREE_CG_BUILTIN_I64);
+   CfreeCgTypeId fp_ty = builtin_id(CFREE_CG_BUILTIN_F64);
+   u32 count = 0;
+
+   for (Reg r = 19u; r <= 28u; ++r) {
+     if (int_mask & (1u << r)) {
+       AAAsmSavedClobber* rec = &records[count++];
+       rec->cls = NATIVE_REG_INT;
+       rec->reg = r;
+       rec->type = int_ty;
+       aa_asm_save_one(d, rec);
+     }
+   }
+   for (Reg r = 8u; r <= 15u; ++r) {
+     if (fp_mask & (1u << r)) {
+       AAAsmSavedClobber* rec = &records[count++];
+       rec->cls = NATIVE_REG_FP;
+       rec->reg = r;
+       rec->type = fp_ty;
+       aa_asm_save_one(d, rec);
+     }
+   }
+   *nsaved_out = count;
+   return records;
+ }
+
+static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
+ const AsmConstraint* outs, u32 nout,
+ Operand* out_ops, const AsmConstraint* ins,
+ u32 nin, const Operand* in_ops,
+ const Sym* clobbers, u32 nclob) { /* Lowers one inline-asm block. */
+ Operand* bound_outs = /* per-output binding passed to aa64_inline_bind */
+ nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL;
+ Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL;
+ u32 clob_int, clob_fp, used_int, used_fp; /* register bitmasks per class */
+ AAAsmSavedClobber* saved;
+ u32 nsaved;
+ AA64Asm* a;
+
+ aa_asm_clobber_masks(d, clobbers, nclob, &clob_int, &clob_fp);
+ used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) |
+ (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP); /* pre-marked as used */
+ used_fp = clob_fp | (1u << 20u) | (1u << 21u); /* NOTE(review): why 20/21? */
+
+ for (u32 i = 0; i < nout; ++i) { /* step 1: bind every output operand */
+ const char* body = aa_asm_constraint_body(outs[i].str);
+ if (body[0] == 'r' || body[0] == 'w') { /* register-class output */
+ NativeAllocClass cls = aa_asm_constraint_class(d, body);
+ Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp);
+ CfreeCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
+ aa_asm_bound_reg(&bound_outs[i], type, cls, reg);
+ if (outs[i].dir == ASM_INOUT) { /* in/out: preload the current value */
+ NativeLoc loc = aa_reg_loc(type, cls, reg);
+ aa_direct_load_operand_to_reg(d, out_ops[i], loc);
+ }
+ } else if (body[0] == 'm') { /* memory output: address in an int register */
+ Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
+ NativeLoc loc = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64),
+ NATIVE_REG_INT, reg);
+ CfreeCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
+ aa_direct_load_address_to_reg(d, out_ops[i], loc);
+ aa_asm_bound_mem(&bound_outs[i], type, reg);
+ } else {
+ aa_asm_panic(d, "unsupported output constraint");
+ }
+ }
+
+ for (u32 i = 0; i < nin; ++i) { /* step 2: bind and load every input */
+ const char* body = aa_asm_constraint_body(ins[i].str);
+ int matched = aa_asm_match_index(body); /* >= 0: matching constraint */
+ if (matched >= 0) {
+ if ((u32)matched >= nout)
+ aa_asm_panic(d, "matching constraint out of range");
+ if (aa_asm_constraint_early(outs[matched].str))
+ aa_asm_panic(d, "matching input names early-clobber output");
+ if (bound_outs[matched].kind != AA64_INLINE_OPK_REG)
+ aa_asm_panic(d, "matching constraint requires register output");
+ bound_ins[i] = bound_outs[matched]; /* reuse the output's binding */
+ aa_direct_load_operand_to_reg(
+ d, in_ops[i],
+ aa_reg_loc(bound_ins[i].type,
+ bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP /* class is in pad[0] */
+ ? NATIVE_REG_FP
+ : NATIVE_REG_INT,
+ (Reg)bound_ins[i].v.local)); /* register number is in v.local */
+ continue;
+ }
+ if (body[0] == 'r' || body[0] == 'w') { /* register-class input */
+ NativeAllocClass cls = aa_asm_constraint_class(d, body);
+ Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp);
+ CfreeCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
+ aa_asm_bound_reg(&bound_ins[i], type, cls, reg);
+ aa_direct_load_operand_to_reg(d, in_ops[i],
+ aa_reg_loc(type, cls, reg));
+ } else if (body[0] == 'i') { /* immediate: operand passed through as-is */
+ if (in_ops[i].kind != OPK_IMM)
+ aa_asm_panic(d, "immediate constraint requires immediate operand");
+ bound_ins[i] = in_ops[i];
+ } else if (body[0] == 'm') { /* memory input: address in an int register */
+ Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
+ NativeLoc loc = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64),
+ NATIVE_REG_INT, reg);
+ CfreeCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
+ aa_direct_load_address_to_reg(d, in_ops[i], loc);
+ aa_asm_bound_mem(&bound_ins[i], type, reg);
+ } else {
+ aa_asm_panic(d, "unsupported input constraint");
+ }
+ }
+
+ saved = aa_asm_save_callee_clobbers(d, clob_int, clob_fp, &nsaved);
+ a = aa64_asm_open(d->base.c); /* step 3: assemble the template */
+ aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
+ nclob);
+ aa64_asm_run_template(a, d->native->mc, tmpl);
+ aa64_asm_close(a);
+
+ for (u32 i = 0; i < nout; ++i) { /* step 4: write register outputs back */
+ NativeAllocClass cls;
+ NativeLoc src;
+ if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue; /* not a register */
+ cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP
+ : NATIVE_REG_INT;
+ src = aa_reg_loc(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
+ aa_direct_store_reg_to_operand(d, out_ops[i], src);
+ }
+ for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(d, &saved[i - 1u]);
+}
+
+static const NativeOps aa_direct_ops = { /* members not named here are zero */
+ .bind_param = aa_bind_param,
+ .tail_call_unrealizable_reason = aa_no_tail,
+ .va_start_ = aa_va_start_,
+ .va_arg_ = aa_va_arg_,
+ .va_end_ = aa_va_end_,
+ .va_copy_ = aa_va_copy_,
+ .asm_block = aa_direct_asm_block, /* inline-asm lowering */
+}; /* handed out by aa64_native_direct_ops() below */
+
+const NativeOps* aa64_native_direct_ops(void) { return &aa_direct_ops; }
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -1,2908 +0,0 @@
-/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics,
- * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */
-
-#include "arch/aa64/internal.h"
-#include "cfree/config.h"
-#include "core/slice.h"
-
-/* ============================================================
- * Data movement
- * ============================================================ */
-
-static RelocKind ldst_lo12_reloc_for(u32 nbytes);
-
-static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) {
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, reg_num(dst), imm);
-}
-
-static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
- AAImpl* a = impl_of(t);
- if (dst.cls != RC_FP) {
- compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1");
- }
-
- Sym ro_name = pool_intern_slice(t->c->global, SLICE_LIT(".rodata"));
- ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u);
-
- u32 cur_section = t->mc->section_id;
- t->mc->set_section(t->mc, ro);
- u32 ro_off = obj_align_to(t->obj, ro, cb.align ? cb.align : 4);
- t->mc->emit_bytes(t->mc, cb.bytes, cb.size);
-
- char namebuf[64];
- static u32 lit_seq = 0;
- int len = 0;
- {
- const char* prefix = ".LCFP";
- for (; prefix[len]; ++len) namebuf[len] = prefix[len];
- u32 v = lit_seq++;
- char tmp[16];
- int tn = 0;
- if (v == 0)
- tmp[tn++] = '0';
- else {
- while (v) {
- tmp[tn++] = '0' + (char)(v % 10);
- v /= 10;
- }
- }
- for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i];
- namebuf[len] = 0;
- }
- Sym sname = pool_intern_slice(t->c->global, slice_from_cstr(namebuf));
- ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off,
- (u64)cb.size);
-
- t->mc->set_section(t->mc, cur_section);
-
- u32 adrp_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0));
- t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21,
- sym, 0, 0, 0);
-
- u32 ldr_pos = t->mc->pos(t->mc);
- u32 sidx = size_idx_for_bytes(cb.size);
- if (cb.size == 16)
- aa64_emit32(t->mc, aa64_ldr_q_uimm(reg_num(dst), AA_TMP0, 0));
- else
- aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0));
- RelocKind lo12 = ldst_lo12_reloc_for(cb.size);
- t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0);
-}
-
-static void aa_copy(CGTarget* t, Operand dst, Operand src) {
- if (dst.cls == RC_FP && src.cls == RC_INT) {
- u32 sz = type_byte_size(dst.type);
- aa64_emit32(t->mc, sz == 8 ? aa64_fmov_d_x(reg_num(dst), reg_num(src))
- : aa64_fmov_s_w(reg_num(dst), reg_num(src)));
- return;
- }
- if (dst.cls == RC_INT && src.cls == RC_FP) {
- u32 sz = type_byte_size(src.type);
- aa64_emit32(t->mc, sz == 8 ? aa64_fmov_x_d(reg_num(dst), reg_num(src))
- : aa64_fmov_w_s(reg_num(dst), reg_num(src)));
- return;
- }
- if (dst.cls == RC_FP || src.cls == RC_FP) {
- if (type_byte_size(dst.type) == 16) {
- aa64_emit32(t->mc, aa64_mov_v16b(reg_num(dst), reg_num(src)));
- } else {
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src)));
- }
- return;
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
-}
-
-/* ============================================================
- * Load / store
- * ============================================================ */
-
-static RelocKind ldst_lo12_reloc_for(u32 nbytes) {
- switch (nbytes) {
- case 1:
- return R_AARCH64_LDST8_ABS_LO12_NC;
- case 2:
- return R_AARCH64_LDST16_ABS_LO12_NC;
- case 4:
- return R_AARCH64_LDST32_ABS_LO12_NC;
- case 8:
- return R_AARCH64_LDST64_ABS_LO12_NC;
- case 16:
- return R_AARCH64_LDST128_ABS_LO12_NC;
- default:
- return R_AARCH64_LDST64_ABS_LO12_NC;
- }
-}
-
-static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
- i32 off) {
- if (off < -256 || off > 255) {
- aa64_emit_addr_adjust(mc, AA_TMP0, rn, off);
- rn = AA_TMP0;
- off = 0;
- }
- if (sidx == 4)
- aa64_emit32(mc, aa64_ldur_q(rt, rn, off));
- else
- aa64_emit32(mc, aa64_ldur_fp(sidx, rt, rn, off));
-}
-
-static void aa_emit_str_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
- i32 off) {
- if (off < -256 || off > 255) {
- aa64_emit_addr_adjust(mc, AA_TMP0, rn, off);
- rn = AA_TMP0;
- off = 0;
- }
- if (sidx == 4)
- aa64_emit32(mc, aa64_stur_q(rt, rn, off));
- else
- aa64_emit32(mc, aa64_stur_fp(sidx, rt, rn, off));
-}
-
-static void aa_emit_ldr_fp_uimm_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
- u32 off) {
- if (sidx == 4)
- aa64_emit32(mc, aa64_ldr_q_uimm(rt, rn, off));
- else
- aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, rt, rn, off));
-}
-
-static void aa_emit_str_fp_uimm_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
- u32 off) {
- if (sidx == 4)
- aa64_emit32(mc, aa64_str_q_uimm(rt, rn, off));
- else
- aa64_emit32(mc, aa64_str_fp_uimm(sidx, rt, rn, off));
-}
-
-static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
- return obj_symbol_extern_via_got(t->c, t->obj, sym);
-}
-
-/* Effective-address descriptor produced by addr_mode. Mirrors the
- * Operand.v.ind shape after any required fixups (offset folded into a
- * scratch register when out of range, GLOBAL materialized into a register).
- * `index == REG_NONE` means plain base+offset; otherwise the indexed
- * register-offset form should be used and ofs is always 0. */
-typedef struct AAAddrMode {
- u32 base; /* physical register holding the base */
- u32 index; /* physical register holding the index, or REG_NONE */
- u32 log2_scale; /* 0..3 — only valid when index != REG_NONE */
- i32 ofs; /* signed displacement; 0 when index != REG_NONE */
-} AAAddrMode;
-
-/* Resolve an Operand addressing form to an AAAddrMode usable by the
- * load/store emitters. Handles all base kinds (LOCAL, INDIRECT, GLOBAL)
- * and folds out-of-range offsets through `tmp_reg` via
- * aa64_emit_addr_adjust, matching the prior addr_base contract.
- *
- * When the input INDIRECT carries an index, this routine preserves it in
- * the result. If a nonzero displacement is also present, it is added to
- * the base via the temp register so the indexed register-offset
- * instruction (which encodes no displacement) can use {tmp, index, 0}. */
-static AAAddrMode addr_mode(CGTarget* t, Operand addr, u32 tmp_reg) {
- AAImpl* a = impl_of(t);
- AAAddrMode m;
- m.base = 0u;
- m.index = REG_NONE;
- m.log2_scale = 0u;
- m.ofs = 0;
-
- if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_mode: bad slot");
- i32 off = -(i32)s->off;
- if (off >= -256 && off <= 255) {
- m.base = 29u;
- m.ofs = off;
- } else {
- aa64_emit_addr_adjust(t->mc, tmp_reg, 29u, off);
- m.base = tmp_reg;
- m.ofs = 0;
- }
- return m;
- }
- if (addr.kind == OPK_INDIRECT) {
- i32 off = addr.v.ind.ofs;
- u32 base = addr.v.ind.base & 0x1fu;
- Reg idx = addr.v.ind.index;
- if (idx == REG_NONE) {
- if (off >= -256 && off <= 255) {
- m.base = base;
- m.ofs = off;
- } else {
- aa64_emit_addr_adjust(t->mc, tmp_reg, base, off);
- m.base = tmp_reg;
- m.ofs = 0;
- }
- return m;
- }
- /* Indexed: fold any displacement into the base so the indexed
- * register-offset instruction can encode just {base, index, scale}. */
- if (off != 0) {
- aa64_emit_addr_adjust(t->mc, tmp_reg, base, off);
- m.base = tmp_reg;
- } else {
- m.base = base;
- }
- m.index = (u32)idx & 0x1fu;
- m.log2_scale = addr.v.ind.log2_scale & 0x3u;
- m.ofs = 0;
- return m;
- }
- if (addr.kind == OPK_GLOBAL) {
- emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend);
- m.base = tmp_reg;
- m.ofs = 0;
- return m;
- }
- compiler_panic(t->c, a->loc, "aarch64 addr_mode: unsupported kind %d",
- (int)addr.kind);
-}
-
-/* Assert that an Operand consumed by a non-load/store path carries no
- * EA index. Per doc/INDIRECT.md the cg layer never routes an indexed
- * OPK_INDIRECT to spill/reload, bitfield, atomics, copy_bytes/set_bytes,
- * inline asm, or addr_of; the assert catches upstream misrouting before
- * it silently produces incorrect addressing. */
-static inline void aa_assert_no_index(CGTarget* t, Operand addr,
- const char* where) {
- if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 %.*s: OPK_INDIRECT with index unexpected",
- SLICE_ARG(slice_from_cstr(where)));
- }
-}
-
-/* LDR (register), 32-bit option=LSL. Encodes
- * LDR<size> Wt|Xt, [Xn, Xm{, LSL #amt}] (integer)
- * where size in {0..3} selects byte/half/word/double; opc=01 (load).
- * S=0 -> no shift (amt=0); S=1 -> shift by `size` (amt=size).
- * The aarch64 register-offset addressing mode supports only those two
- * shift amounts (other values must be lowered upstream). */
-static inline u32 aa64_ldr_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x38606800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-static inline u32 aa64_str_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x38206800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-static inline u32 aa64_ldr_fp_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x3C606800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-static inline u32 aa64_str_fp_reg(u32 size, u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x3C206800u | (size << 30) | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-/* 128-bit Q register-offset variants (size encoded as size=00, opc bit
- * pattern 11 selects 128b). */
-static inline u32 aa64_ldr_q_reg(u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x3CE06800u | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-static inline u32 aa64_str_q_reg(u32 Rt, u32 Rn, u32 Rm, u32 S) {
- return 0x3CA06800u | ((Rm & 0x1fu) << 16) | ((S & 1u) << 12) |
- ((Rn & 0x1fu) << 5) | (Rt & 0x1fu);
-}
-
-/* True if `log2_scale` is legal for the aarch64 register-offset form at
- * a given access size index (sidx). The encoding supports S=0 (LSL #0)
- * and S=1 (LSL #sidx) — any other scale must be lowered by adding
- * `index << log2_scale` into the base via arch_lower_indexed before the
- * load/store. */
-static inline int aa_indexed_scale_legal(u32 sidx, u32 log2_scale, u32* S_out) {
- if (log2_scale == 0u) {
- *S_out = 0u;
- return 1;
- }
- if (log2_scale == sidx) {
- *S_out = 1u;
- return 1;
- }
- return 0;
-}
-
-void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
- u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
- u32 sidx = size_idx_for_bytes(sz);
-
- if (addr.kind == OPK_GLOBAL) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- ObjSymId sym = addr.v.global.sym;
- i64 add = addr.v.global.addend;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, AA_TMP0, sym);
- if (dst.cls == RC_FP) {
- aa_emit_ldr_fp_any(mc, sidx, reg_num(dst), AA_TMP0, (i32)add);
- } else {
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add));
- }
- return;
- }
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(AA_TMP0));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
- 0, 0);
- u32 ld_pos = mc->pos(mc);
- if (dst.cls == RC_FP) {
- aa_emit_ldr_fp_uimm_any(mc, sidx, reg_num(dst), AA_TMP0, 0);
- } else {
- aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0));
- }
- mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
- return;
- }
-
- /* Indexed register-offset form: emit `LDR Rt, [Rn, Rm{, LSL #s}]` when
- * the EA carries an index and the scale matches the encoding (S=0 →
- * LSL #0, S=1 → LSL #sidx). Otherwise fall back to
- * arch_lower_indexed, which materializes base+(index<<scale) into a
- * scratch and gives us a plain base+disp shape. */
- if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) {
- u32 S;
- if (aa_indexed_scale_legal(sidx, addr.v.ind.log2_scale & 0x3u, &S)) {
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (dst.cls == RC_FP) {
- if (sidx == 4u)
- aa64_emit32(t->mc, aa64_ldr_q_reg(reg_num(dst), m.base, m.index, S));
- else
- aa64_emit32(t->mc,
- aa64_ldr_fp_reg(sidx, reg_num(dst), m.base, m.index, S));
- } else {
- aa64_emit32(t->mc,
- aa64_ldr_reg(sidx, reg_num(dst), m.base, m.index, S));
- }
- return;
- }
- addr = arch_lower_indexed(t, addr, AA_TMP0);
- }
-
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (dst.cls == RC_FP) {
- aa_emit_ldr_fp_any(t->mc, sidx, reg_num(dst), m.base, m.ofs);
- } else {
- aa64_emit32(t->mc, aa64_ldur(sidx, reg_num(dst), m.base, m.ofs));
- }
-}
-
-void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
- u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
- u32 sidx = size_idx_for_bytes(sz);
-
- if (addr.kind == OPK_GLOBAL) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- ObjSymId sym = addr.v.global.sym;
- i64 add = addr.v.global.addend;
-
- u32 src_reg;
- u32 src_is_fp = 0;
- /* Zero immediate stores use wzr/xzr directly (reg 31). Avoids a
- * separate `mov wN, #0` and frees AA_TMP0 for the address base. */
- int src_imm_zero =
- (src.kind == OPK_IMM && src.v.imm == 0 && src.cls != RC_FP);
- if (src_imm_zero) {
- src_reg = 31u;
- } else if (src.kind == OPK_IMM) {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm);
- src_reg = AA_TMP0;
- } else if (src.cls == RC_FP) {
- src_reg = reg_num(src);
- src_is_fp = 1;
- } else {
- src_reg = reg_num(src);
- }
- u32 base = (src.kind == OPK_IMM && !src_imm_zero) ? AA_TMP1 : AA_TMP0;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, base, sym);
- if (src_is_fp) {
- aa_emit_str_fp_any(mc, sidx, src_reg, base, (i32)add);
- } else {
- aa64_emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add));
- }
- return;
- }
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(base));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
- 0, 0);
- u32 st_pos = mc->pos(mc);
- if (src_is_fp) {
- aa_emit_str_fp_uimm_any(mc, sidx, src_reg, base, 0);
- } else {
- aa64_emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0));
- }
- mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
- return;
- }
-
- /* Zero immediate stores use wzr/xzr directly (reg 31). */
- int src_imm_zero =
- (src.kind == OPK_IMM && src.v.imm == 0 && src.cls != RC_FP);
- u32 addr_tmp = (src.kind == OPK_IMM && !src_imm_zero) ? AA_TMP1 : AA_TMP0;
-
- /* Indexed register-offset form for STR when the EA's scale is legal.
- * Falls back to arch_lower_indexed when LSL doesn't fit the
- * instruction encoding (e.g. byte access with log2_scale=3). */
- if (addr.kind == OPK_INDIRECT && addr.v.ind.index != REG_NONE) {
- u32 S;
- if (aa_indexed_scale_legal(sidx, addr.v.ind.log2_scale & 0x3u, &S)) {
- AAAddrMode m = addr_mode(t, addr, addr_tmp);
- u32 src_reg;
- if (src_imm_zero) {
- src_reg = 31u;
- } else if (src.kind == OPK_IMM) {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm);
- src_reg = AA_TMP0;
- } else {
- src_reg = reg_num(src);
- }
- if (src.cls == RC_FP && !src_imm_zero) {
- if (sidx == 4u)
- aa64_emit32(t->mc, aa64_str_q_reg(src_reg, m.base, m.index, S));
- else
- aa64_emit32(t->mc,
- aa64_str_fp_reg(sidx, src_reg, m.base, m.index, S));
- } else {
- aa64_emit32(t->mc, aa64_str_reg(sidx, src_reg, m.base, m.index, S));
- }
- return;
- }
- addr = arch_lower_indexed(t, addr, addr_tmp);
- }
-
- AAAddrMode m = addr_mode(t, addr, addr_tmp);
-
- if (src_imm_zero) {
- aa64_emit32(t->mc, aa64_stur(sidx, 31u, m.base, m.ofs));
- return;
- }
- if (src.kind == OPK_IMM) {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm);
- aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, m.base, m.ofs));
- return;
- }
- if (src.cls == RC_FP) {
- aa_emit_str_fp_any(t->mc, sidx, reg_num(src), m.base, m.ofs);
- } else {
- aa64_emit32(t->mc, aa64_stur(sidx, reg_num(src), m.base, m.ofs));
- }
-}
-
-static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) {
- AAImpl* a = impl_of(t);
- if (lv.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, lv.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
- aa64_emit_addr_adjust(t->mc, reg_num(dst), 29, -(i32)s->off);
- return;
- }
- if (lv.kind == OPK_INDIRECT) {
- aa_assert_no_index(t, lv, "addr_of");
- i32 ofs = lv.v.ind.ofs;
- u32 base = lv.v.ind.base & 0x1f;
- aa64_emit_addr_adjust(t->mc, reg_num(dst), base, ofs);
- return;
- }
- if (lv.kind == OPK_GLOBAL) {
- u32 rd = reg_num(dst);
- ObjSymId sym = lv.v.global.sym;
- i64 addend = lv.v.global.addend;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, rd, sym);
- if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend);
- return;
- }
- u32 sec = t->mc->section_id;
- u32 adrp_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_adrp_base(rd));
- t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym,
- addend, 0, 0);
- u32 add_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
- t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym,
- addend, 0, 0);
- return;
- }
- compiler_panic(t->c, impl_of(t)->loc, "aarch64: addr_of not implemented");
-}
-
-static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- u32 rd = reg_num(dst);
-
- if (obj_format_tls_via_descriptor(t->c)) {
- /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's
- * ABI is custom — x0 in/out as descriptor → TLV addr, all other
- * regs preserved — so we materialize via x0 and copy to `dst` only
- * when they differ. x0/x1 are scratch here (the regalloc only hands
- * out x19-x28), and x30 was saved at the prologue.
- *
- * adrp x0, sym@TLVPPAGE ; R_AARCH64_TLVP_LOAD_PAGE21
- * ldr x0, [x0, sym@TLVPPAGEOFF] ; R_AARCH64_TLVP_LOAD_PAGEOFF12
- * ldr x1, [x0] ; descriptor[0] = thunk pointer
- * blr x1 ; x0 in/out
- * mov xdst, x0 ; only if dst != x0
- *
- * TLVP relocs do not carry an addend; nonzero addends are applied
- * after the call as a follow-on ADD/SUB on `dst`. */
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0,
- 0);
- u32 ldr_pos = mc->pos(mc);
- aa64_emit32(mc,
- aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0));
- mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0,
- 0, 0);
- aa64_emit32(mc,
- aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0));
- aa64_emit32(mc, aa64_blr(/*Rn=*/1));
- if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0));
- if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend);
- return;
- }
-
- /* Windows-on-ARM64 TLS Local-Exec.
- *
- * ldr xd, [x18, #0x58] ; xd = TEB->TlsSlots (TLS array)
- * adrp x16, _tls_index ; ADR_PREL_PG_HI21
- * ldr w16, [x16, :lo12:_tls_index] ; LDST32_ABS_LO12_NC
- * add xd, xd, x16, lsl #3 ; xd += index*8
- * ldr xd, [xd] ; xd = per-image TLS block base
- * add xd, xd, #:secrel_hi12:sym, lsl#12 ; SECREL_HIGH12A
- * add xd, xd, #:secrel_lo12:sym ; SECREL_LOW12A
- *
- * x16 (IP0) is a caller-saved intra-procedure-call scratch reg,
- * always safe to clobber inside a function body. The two ADD-imm12
- * SECREL fixups assume the merged .tls section is < 16 MiB; cfree
- * panics with a clear diagnostic at link time if that ever fails. */
- if (t->c->target.os == CFREE_OS_WINDOWS) {
- Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index"));
- ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name);
- if (idx_sym == 0) {
- idx_sym =
- obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
- }
- /* Windows ARM64 reserves x18 as the TEB pointer. Do not read
- * TPIDR_EL0 here; Wine and real Windows expose the TLS slots via
- * x18 + 0x58, matching clang/llvm-mingw codegen. */
- aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, rd, /*Rn=*/18,
- /*byte_off=*/0x58));
-
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(/*Rd=*/16));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, idx_sym, 0,
- 0, 0);
- u32 ldr_pos = mc->pos(mc);
- aa64_emit32(
- mc, aa64_ldr_uimm(/*size=*/2, /*Rt=*/16, /*Rn=*/16, /*byte_off=*/0));
- mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LDST32_ABS_LO12_NC, idx_sym,
- 0, 0, 0);
-
- /* add xd, xd, x16, LSL #3:
- * 0x8B000000 | (Rm << 16) | (3 << 10) | (Rn << 5) | Rd
- * sf=1, shift=LSL (00), Rm=16. */
- u32 add_shr = 0x8B000000u | (16u << 16) | (3u << 10) | ((rd & 0x1fu) << 5) |
- (rd & 0x1fu);
- aa64_emit32(mc, add_shr);
- aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, rd, rd, /*byte_off=*/0));
-
- /* add xd, xd, #(0 << 12), then patch HIGH12A: sh=1 in the encoding. */
- u32 hi_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, rd, /*imm12=*/0, /*sh=*/1));
- mc->emit_reloc_at(mc, sec, hi_pos, R_COFF_AARCH64_SECREL_HIGH12A, sym,
- addend, 0, 0);
- u32 lo_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, rd, /*imm12=*/0, /*sh=*/0));
- mc->emit_reloc_at(mc, sec, lo_pos, R_COFF_AARCH64_SECREL_LOW12A, sym,
- addend, 0, 0);
- return;
- }
-
- aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0));
-
- u32 hi_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1));
- mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym,
- addend, 0, 0);
-
- u32 lo_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0));
- mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym,
- addend, 0, 0);
-}
-
-/* ============================================================
- * Aggregate helpers
- * ============================================================ */
-
-static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) {
- if (op.kind == OPK_REG) return reg_num(op);
- if (op.kind == OPK_LOCAL) {
- AAImpl* a = impl_of(t);
- AASlot* s = aa64_slot_get(a, op.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot");
- aa64_emit_addr_adjust(t->mc, scratch, 29, -(i32)s->off);
- return scratch;
- }
- if (op.kind == OPK_GLOBAL) {
- emit_global_addr(t, scratch, op.v.global.sym, op.v.global.addend);
- return scratch;
- }
- if (op.kind == OPK_INDIRECT) {
- /* Aggregate helpers (copy_bytes/set_bytes, bitfield_*) take plain
- * pointer addresses; the cg contract guarantees no EA index here. */
- aa_assert_no_index(t, op, "agg address");
- u32 base = op.v.ind.base & 0x1fu;
- i32 ofs = op.v.ind.ofs;
- if (ofs == 0) return base;
- aa64_emit_addr_adjust(t->mc, scratch, base, ofs);
- return scratch;
- }
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 agg: address kind %d unsupported", (int)op.kind);
-}
-
-static void aa_emit_load_at(MCEmitter* mc, u32 size, u32 rt, u32 rn, u32 off) {
- if (off <= 255u)
- aa64_emit32(mc, aa64_ldur(size, rt, rn, (i32)off));
- else
- aa64_emit32(mc, aa64_ldr_uimm(size, rt, rn, off));
-}
-
-static void aa_emit_store_at(MCEmitter* mc, u32 size, u32 rt, u32 rn, u32 off) {
- if (off <= 255u)
- aa64_emit32(mc, aa64_stur(size, rt, rn, (i32)off));
- else
- aa64_emit32(mc, aa64_str_uimm(size, rt, rn, off));
-}
-
-static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr,
- AggregateAccess agg) {
- MCEmitter* mc = t->mc;
- u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
- u32 sr = agg_addr_reg(t, src_addr, (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1);
- u32 nbytes = agg.size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa_emit_load_at(mc, 3, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 3, AA_TMP2, dr, i);
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa_emit_load_at(mc, 2, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 2, AA_TMP2, dr, i);
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa_emit_load_at(mc, 1, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 1, AA_TMP2, dr, i);
- i += 2;
- }
- while (i < nbytes) {
- aa_emit_load_at(mc, 0, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 0, AA_TMP2, dr, i);
- i += 1;
- }
-}
-
-static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value,
- AggregateAccess agg) {
- MCEmitter* mc = t->mc;
- u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
-
- u32 byte;
- if (byte_value.kind == OPK_IMM) {
- byte = (u32)(byte_value.v.imm & 0xffu);
- } else {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 set_bytes: REG byte not yet supported");
- }
- u32 nbytes = agg.size;
-
- if (byte == 0) {
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa_emit_store_at(mc, 3, 31, dr, i);
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa_emit_store_at(mc, 2, 31, dr, i);
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa_emit_store_at(mc, 1, 31, dr, i);
- i += 2;
- }
- while (i < nbytes) {
- aa_emit_store_at(mc, 0, 31, dr, i);
- i += 1;
- }
- return;
- }
-
- u64 b64 = byte;
- b64 |= b64 << 8;
- b64 |= b64 << 16;
- b64 |= b64 << 32;
- aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64);
-
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa_emit_store_at(mc, 3, AA_TMP1, dr, i);
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa_emit_store_at(mc, 2, AA_TMP1, dr, i);
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa_emit_store_at(mc, 1, AA_TMP1, dr, i);
- i += 2;
- }
- while (i < nbytes) {
- aa_emit_store_at(mc, 0, AA_TMP1, dr, i);
- i += 1;
- }
-}
-
-/* ============================================================
- * Bitfields
- * ============================================================ */
-
-static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr,
- BitFieldAccess bf) {
- MCEmitter* mc = t->mc;
- u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 sf = (storage_bytes == 8u) ? 1u : 0u;
- u32 sidx = size_idx_for_bytes(storage_bytes);
- u32 rd = reg_num(dst);
-
- aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset));
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- u32 imms = lsb + width - 1u;
- if (bf.signed_) {
- aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms));
- } else {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms));
- }
-}
-
-static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src,
- BitFieldAccess bf) {
- MCEmitter* mc = t->mc;
- u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 sf = (storage_bytes == 8u) ? 1u : 0u;
- u32 sidx = size_idx_for_bytes(storage_bytes);
-
- aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
-
- u32 src_reg;
- if (src.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm);
- src_reg = AA_TMP2;
- } else if (src.kind == OPK_REG) {
- src_reg = reg_num(src);
- } else {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 bitfield_store: src kind %d unsupported",
- (int)src.kind);
- }
-
- u32 reg_size = sf ? 64u : 32u;
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- u32 immr = (reg_size - lsb) % reg_size;
- u32 imms = width - 1u;
- aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms));
-
- aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
-}
-
-/* ============================================================
- * Arithmetic helpers
- * ============================================================ */
-
-u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) {
- if (op.kind == OPK_REG) return reg_num(op);
- if (op.kind == OPK_IMM) {
- aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm);
- return scratch;
- }
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 binop: operand kind %d unsupported", (int)op.kind);
-}
-
-static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
- Operand b_op) {
- MCEmitter* mc = t->mc;
-
- if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) {
- if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 binop: FP op requires REG operands");
- }
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
- u32 rn = reg_num(a_op);
- u32 rm = reg_num(b_op);
- u32 w;
- switch (op) {
- case BO_FADD:
- w = aa64_fadd(type, rd, rn, rm);
- break;
- case BO_FSUB:
- w = aa64_fsub(type, rd, rn, rm);
- break;
- case BO_FMUL:
- w = aa64_fmul(type, rd, rn, rm);
- break;
- case BO_FDIV:
- w = aa64_fdiv(type, rd, rn, rm);
- break;
- default:
- w = 0;
- break;
- }
- aa64_emit32(mc, w);
- return;
- }
-
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
-
- switch (op) {
- case BO_IADD:
- case BO_AND:
- case BO_OR:
- case BO_XOR: {
- if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
- Operand t_op = a_op;
- a_op = b_op;
- b_op = t_op;
- }
- break;
- }
- default:
- break;
- }
-
- if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
- u32 rn_reg = reg_num(a_op);
- i64 imm = b_op.v.imm;
- u32 imm12, sh, N, immr, imms;
- switch (op) {
- case BO_IADD:
- if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
- aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh));
- return;
- }
- break;
- case BO_ISUB:
- if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
- aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh));
- return;
- }
- break;
- case BO_AND:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_OR:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_XOR:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_SHL: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- case BO_SHR_U: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- case BO_SHR_S: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- default:
- break;
- }
- }
-
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);
-
- u32 word;
- switch (op) {
- case BO_IADD:
- word = aa64_add(sf, rd, rn, rm);
- break;
- case BO_ISUB:
- word = aa64_sub(sf, rd, rn, rm);
- break;
- case BO_IMUL:
- word = aa64_mul(sf, rd, rn, rm);
- break;
- case BO_AND:
- word = aa64_and(sf, rd, rn, rm);
- break;
- case BO_OR:
- word = aa64_orr(sf, rd, rn, rm);
- break;
- case BO_XOR:
- word = aa64_eor(sf, rd, rn, rm);
- break;
- case BO_SHL:
- word = aa64_lslv(sf, rd, rn, rm);
- break;
- case BO_SHR_U:
- word = aa64_lsrv(sf, rd, rn, rm);
- break;
- case BO_SHR_S:
- word = aa64_asrv(sf, rd, rn, rm);
- break;
- case BO_UDIV:
- word = aa64_udiv(sf, rd, rn, rm);
- break;
- case BO_SDIV:
- word = aa64_sdiv(sf, rd, rn, rm);
- break;
- case BO_SREM:
- aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm));
- word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
- break;
- case BO_UREM:
- aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm));
- word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
- break;
- case BO_FADD:
- case BO_FSUB:
- case BO_FMUL:
- case BO_FDIV:
- default:
- compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl",
- (int)op);
- }
- aa64_emit32(mc, word);
-}
-
-static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
- MCEmitter* mc = t->mc;
- u32 rd = reg_num(dst);
- u32 word;
-
- if (op == UO_FNEG) {
- if (dst.cls != RC_FP || a_op.kind != OPK_REG || a_op.cls != RC_FP) {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 unop: FP neg requires FP REG operand");
- }
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fneg(type, rd, reg_num(a_op)));
- return;
- }
-
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- switch (op) {
- case UO_NEG:
- word = aa64_neg(sf, rd, rn);
- break;
- case UO_BNOT:
- word = aa64_mvn(sf, rd, rn);
- break;
- case UO_NOT:
- aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
- word = aa64_cset_eq(sf, rd);
- break;
- default:
- compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl",
- (int)op);
- }
- aa64_emit32(mc, word);
-}
-
-static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 rd = reg_num(dst);
- u32 rn = reg_num(src);
-
- switch (k) {
- case CV_SEXT: {
- if (src.cls != RC_INT || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes");
- }
- u32 src_bits = type_byte_size(src.type) * 8u;
- u32 dst_bits = type_byte_size(dst.type) * 8u;
- u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
- if (src_bits >= dst_bits) {
- aa64_emit32(mc, aa64_mov_reg(sf_dst, rd, rn));
- return;
- }
- aa64_emit32(
- mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
- return;
- }
- case CV_ZEXT: {
- if (src.cls != RC_INT || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes");
- }
- u32 src_bits = type_byte_size(src.type) * 8u;
- u32 dst_bits = type_byte_size(dst.type) * 8u;
- u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
- if (src_bits >= dst_bits || src_bits == 32u) {
- aa64_emit32(mc, aa64_mov_reg(src_bits == 32u ? 0u : sf_dst, rd, rn));
- } else {
- aa64_emit32(
- mc, aa64_ubfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
- }
- return;
- }
- case CV_TRUNC: {
- aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
- return;
- }
- case CV_ITOF_S: {
- u32 sf_src = type_is_64(src.type) ? 1u : 0u;
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn));
- return;
- }
- case CV_ITOF_U: {
- u32 sf_src = type_is_64(src.type) ? 1u : 0u;
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn));
- return;
- }
- case CV_FTOI_S: {
- if (src.cls != RC_FP || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 type = type_is_fp_double(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn));
- return;
- }
- case CV_FTOI_U: {
- if (src.cls != RC_FP || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes");
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 type = type_is_fp_double(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn));
- return;
- }
- case CV_FEXT: {
- aa64_emit32(mc, aa64_fcvt_d_s(rd, rn));
- return;
- }
- case CV_FTRUNC: {
- aa64_emit32(mc, aa64_fcvt_s_d(rd, rn));
- return;
- }
- case CV_BITCAST: {
- if (src.cls == RC_INT && dst.cls == RC_FP) {
- u32 sz = type_byte_size(dst.type);
- aa64_emit32(mc,
- sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn));
- } else if (src.cls == RC_FP && dst.cls == RC_INT) {
- u32 sz = type_byte_size(src.type);
- aa64_emit32(mc,
- sz == 8 ? aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 convert BITCAST: same-class not yet supported");
- }
- return;
- }
- default:
- compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
- }
-}
-
-/* ============================================================
- * Calls
- * ============================================================ */
-
-static Operand aa_call_stack_arg_addr(CGTarget* t, u32 stack_offset, int tail) {
- AAImpl* a = impl_of(t);
- Operand addr;
- memset(&addr, 0, sizeof addr);
- addr.kind = OPK_INDIRECT;
- addr.cls = RC_INT;
- addr.v.ind.base = tail && !a->omit_frame ? 29u : 31u;
- addr.v.ind.index = REG_NONE;
- addr.v.ind.ofs = (i32)stack_offset;
- if (tail && !a->omit_frame) addr.v.ind.ofs += 16;
- return addr;
-}
-
-static void aa_check_tail_stack_args(CGTarget* t, u32 stack_size) {
- AAImpl* a = impl_of(t);
- if (stack_size > a->next_param_stack) {
- compiler_panic(t->c, a->loc,
- "aarch64 tail call: stack argument area too small");
- }
-}
-
-static u32 aa_call_plan_stack_raw_size(const CGCallPlan* p) {
- u32 size = 0;
- for (u32 i = 0; i < p->nargs; ++i) {
- const CGCallPlanMove* m = &p->args[i];
- if (m->dst_kind == CG_CALL_PLAN_STACK ||
- m->dst_kind == CG_CALL_PLAN_TAIL_STACK) {
- u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u);
- if (end > size) size = end;
- }
- }
- return size;
-}
-
-static void aa_store_stack_reg(CGTarget* t, u32 reg, RegClass cls,
- CfreeCgTypeId type, u32 size, u32 stack_offset,
- int tail) {
- Operand addr = aa_call_stack_arg_addr(t, stack_offset, tail);
- Operand src;
- MemAccess ma;
- memset(&src, 0, sizeof src);
- memset(&ma, 0, sizeof ma);
- src.kind = OPK_REG;
- src.cls = (u8)cls;
- src.type = type;
- src.v.reg = reg;
- addr.type = type;
- ma.type = type;
- ma.size = size;
- ma.align = size ? size : 1u;
- aa_store(t, addr, src, ma);
-}
-
-static int aa_windows_fp_vararg(const CGTarget* t, const CGABIValue* av) {
- return t->c->target.os == CFREE_OS_WINDOWS && av && av->abi == NULL &&
- av->storage.cls == RC_FP;
-}
-
-static void aa_move_fp_to_int_reg(MCEmitter* mc, u32 dst_reg, Operand src,
- u32 size) {
- if (size == 8)
- aa64_emit32(mc, aa64_fmov_x_d(dst_reg, reg_num(src)));
- else
- aa64_emit32(mc, aa64_fmov_w_s(dst_reg, reg_num(src)));
-}
-
-static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
- const CGABIValue* av, u32* next_int, u32* next_fp,
- u32* stack_off, int tail) {
- AAImpl* a = impl_of(t);
- ABIArgInfo va_ai;
- ABIArgPart va_pt;
- const ABIArgInfo* ai = av->abi;
- if (!ai) {
- u32 sz = type_byte_size(av->type);
- memset(&va_ai, 0, sizeof va_ai);
- memset(&va_pt, 0, sizeof va_pt);
- va_ai.kind = ABI_ARG_DIRECT;
- va_ai.parts = &va_pt;
- va_ai.nparts = 1;
- va_pt.cls =
- aa_windows_fp_vararg(t, av)
- ? ABI_CLASS_INT
- : ((av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT);
- va_pt.size = sz;
- va_pt.align = sz;
- va_pt.src_offset = 0;
- ai = &va_ai;
- if (fi && fi->vararg_on_stack) {
- *next_int = 8;
- *next_fp = 8;
- }
- }
- if (ai->kind == ABI_ARG_IGNORE) return;
-
- if (ai->kind == ABI_ARG_INDIRECT) {
- u32 dst_reg;
- int to_stack = (*next_int >= 8);
- if (!to_stack)
- dst_reg = (*next_int)++;
- else
- dst_reg = AA_TMP0;
- if (av->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot");
- aa64_emit_addr_adjust(t->mc, dst_reg, 29, -(i32)s->off);
- } else if (av->storage.kind == OPK_INDIRECT) {
- aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f,
- av->storage.v.ind.ofs);
- } else if (av->storage.kind == OPK_GLOBAL) {
- emit_global_addr(t, dst_reg, av->storage.v.global.sym,
- av->storage.v.global.addend);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 call: INDIRECT arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- if (to_stack) {
- aa_store_stack_reg(t, dst_reg, RC_INT, av->type, 8, *stack_off, tail);
- *stack_off += 8;
- }
- return;
- }
-
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- u32 sz = pt->size;
- u32 sidx = size_idx_for_bytes(sz);
-
- if (pt->cls == ABI_CLASS_INT) {
- int to_stack = (*next_int >= 8);
- u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++;
- switch (av->storage.kind) {
- case OPK_IMM: {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm);
- break;
- }
- case OPK_REG: {
- u32 sf = (sz == 8) ? 1u : 0u;
- if (av->storage.cls == RC_FP)
- aa_move_fp_to_int_reg(t->mc, dst_reg, av->storage, sz);
- else
- aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage)));
- break;
- }
- case OPK_LOCAL: {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
- i32 off = -(i32)s->off + (i32)pt->src_offset;
- aa64_emit_ldur_off(t->mc, sidx, dst_reg, 29, off, dst_reg);
- break;
- }
- case OPK_INDIRECT: {
- aa_assert_no_index(t, av->storage, "call INT arg storage");
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.index = REG_NONE;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- AAAddrMode m = addr_mode(t, src, AA_TMP0);
- aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, m.base, m.ofs));
- break;
- }
- default:
- compiler_panic(t->c, a->loc,
- "aarch64 call: arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- if (to_stack) {
- aa_store_stack_reg(t, dst_reg, RC_INT, av->type, 8, *stack_off, tail);
- *stack_off += 8;
- }
- } else if (pt->cls == ABI_CLASS_FP) {
- int to_stack = (*next_fp >= 8);
- if (!to_stack) {
- u32 dst_reg = (*next_fp)++;
- switch (av->storage.kind) {
- case OPK_REG: {
- if (sz == 16)
- aa64_emit32(t->mc, aa64_mov_v16b(dst_reg, reg_num(av->storage)));
- else {
- u32 type = (sz == 8) ? 1u : 0u;
- aa64_emit32(t->mc,
- aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
- }
- break;
- }
- case OPK_LOCAL: {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s)
- compiler_panic(t->c, a->loc, "aarch64 call: bad FP arg slot");
- i32 off = -(i32)s->off + (i32)pt->src_offset;
- aa_emit_ldr_fp_any(t->mc, sidx, dst_reg, 29, off);
- break;
- }
- case OPK_INDIRECT: {
- aa_assert_no_index(t, av->storage, "call FP arg storage");
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.index = REG_NONE;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- AAAddrMode m = addr_mode(t, src, AA_TMP0);
- aa_emit_ldr_fp_any(t->mc, sidx, dst_reg, m.base, m.ofs);
- break;
- }
- default:
- compiler_panic(t->c, a->loc,
- "aarch64 call: FP arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- } else {
- switch (av->storage.kind) {
- case OPK_REG:
- aa_store_stack_reg(t, reg_num(av->storage), RC_FP, av->type, sz,
- *stack_off, tail);
- break;
- case OPK_LOCAL: {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s)
- compiler_panic(t->c, a->loc, "aarch64 call: bad FP arg slot");
- i32 off = -(i32)s->off + (i32)pt->src_offset;
- aa_emit_ldr_fp_any(t->mc, sidx, AA_FP_TMP0, 29, off);
- aa_store_stack_reg(t, AA_FP_TMP0, RC_FP, av->type, sz, *stack_off,
- tail);
- break;
- }
- case OPK_INDIRECT: {
- aa_assert_no_index(t, av->storage, "call FP stack-arg storage");
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.index = REG_NONE;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- AAAddrMode m = addr_mode(t, src, AA_TMP0);
- aa_emit_ldr_fp_any(t->mc, sidx, AA_FP_TMP0, m.base, m.ofs);
- aa_store_stack_reg(t, AA_FP_TMP0, RC_FP, av->type, sz, *stack_off,
- tail);
- break;
- }
- default:
- compiler_panic(
- t->c, a->loc,
- "aarch64 call: FP stack-arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- *stack_off += sz > 8 ? sz : 8;
- }
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl",
- (int)pt->cls);
- }
- }
-}
-
-static void count_arg_stack(CGTarget* t, const ABIFuncInfo* fi,
- const CGABIValue* av, u32* next_int, u32* next_fp,
- u32* stack_off) {
- ABIArgInfo va_ai;
- ABIArgPart va_pt;
- const ABIArgInfo* ai = av->abi;
- if (!ai) {
- u32 sz = type_byte_size(av->type);
- memset(&va_ai, 0, sizeof va_ai);
- memset(&va_pt, 0, sizeof va_pt);
- va_ai.kind = ABI_ARG_DIRECT;
- va_ai.parts = &va_pt;
- va_ai.nparts = 1;
- va_pt.cls =
- aa_windows_fp_vararg(t, av)
- ? ABI_CLASS_INT
- : ((av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT);
- va_pt.size = sz;
- va_pt.align = sz;
- va_pt.src_offset = 0;
- ai = &va_ai;
- if (fi && fi->vararg_on_stack) {
- *next_int = 8;
- *next_fp = 8;
- }
- }
- if (ai->kind == ABI_ARG_IGNORE) return;
- if (ai->kind == ABI_ARG_INDIRECT) {
- if (*next_int < 8)
- ++*next_int;
- else
- *stack_off += 8;
- return;
- }
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- if (pt->cls == ABI_CLASS_INT) {
- if (*next_int < 8)
- ++*next_int;
- else
- *stack_off += 8;
- } else if (pt->cls == ABI_CLASS_FP) {
- if (*next_fp < 8)
- ++*next_fp;
- else
- *stack_off += pt->size > 8 ? pt->size : 8;
- }
- }
-}
-
-static u32 aa_call_stack_size(CGTarget* t, const CGCallDesc* d) {
- (void)t;
- u32 next_int = 0, next_fp = 0, stack_off = 0;
- for (u32 i = 0; i < d->nargs; ++i)
- count_arg_stack(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off);
- return (stack_off + 15u) & ~15u;
-}
-
-/* Realizability of a sibling call (see CGTarget.tail_call_unrealizable_reason).
- * The callee's outgoing stack arguments must fit the area this function itself
- * received (next_param_stack); the tail prologue restore reuses those slots.
- * Variadic callees need no special handling — their arguments are placed by
- * the ordinary register/stack rules and the same fit check covers them. sret
- * callees are realizable too: aa_call forwards this function's own incoming
- * sret pointer (the return-shape precondition guarantees it matches). */
-static const char* aa_tail_call_unrealizable_reason(CGTarget* t,
- const CGCallDesc* d) {
- AAImpl* a = impl_of(t);
- u32 next_int = 0, next_fp = 0, stack_off = 0;
- for (u32 i = 0; i < d->nargs; ++i)
- count_arg_stack(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off);
- if (stack_off > a->next_param_stack)
- return "tail call stack arguments exceed the caller's parameter area";
- return NULL;
-}
-
-static u32 aa_collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) {
- u32 n = 0;
- for (u32 r = first; r <= last; ++r) {
- if (mask & (1u << r)) out[n++] = r;
- }
- return n;
-}
-
-static void aa_tail_restore_frame(CGTarget* t) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 int_regs[10];
- u32 fp_regs[8];
- u32 n_int_saves =
- aa_collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
- u32 n_fp_saves = aa_collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
- u32 int_save_off = a->max_outgoing;
- u32 fp_save_off = int_save_off + n_int_saves * 8u;
- u32 locals_off = fp_save_off + n_fp_saves * 8u;
- u32 fp_lr_off = locals_off + a->cum_off;
- u32 frame_size = (fp_lr_off + 16u + 15u) & ~15u;
- fp_lr_off = frame_size - 16u;
-
- if (a->omit_frame) return;
- if (a->has_alloca) {
- if (fp_lr_off <= 0xfff) {
- aa64_emit32(mc, aa64_sub_imm(1, 31, 29, fp_lr_off, 0));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 tail call: fp/lr offset too large");
- }
- }
- for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
- aa64_emit32(mc,
- aa64_ldr_fp_uimm(3, fp_regs[i], 31, fp_save_off + (u32)i * 8u));
- }
- for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
- aa64_emit32(mc,
- aa64_ldr_uimm(3, int_regs[i], 31, int_save_off + (u32)i * 8u));
- }
- if (fp_lr_off <= 504u) {
- aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
- } else {
- aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off));
- aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u));
- }
- emit_sp_add(mc, frame_size);
-}
-
-static void aa_tail_branch(CGTarget* t, Operand callee) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- if (callee.kind == OPK_REG) {
- if (reg_num(callee) != AA_TMP0)
- aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(callee)));
- aa_tail_restore_frame(t);
- aa64_emit32(mc, aa64_br(AA_TMP0));
- } else if (callee.kind == OPK_GLOBAL) {
- aa_tail_restore_frame(t);
- u32 b_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_b_base());
- mc->emit_reloc_at(mc, mc->section_id, b_pos, R_AARCH64_JUMP26,
- callee.v.global.sym, callee.v.global.addend, 0, 0);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 tail call: callee kind %d unsupported",
- (int)callee.kind);
- }
-}
-
-static void aa_call(CGTarget* t, const CGCallDesc* d) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- u32 next_int = 0, next_fp = 0, stack_off = 0;
-
- /* Ordinary sret call: point x8 at the caller-provided destination local.
- * A tail call instead forwards this function's own incoming sret pointer
- * (handled below), so skip this here. */
- if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) {
- if (d->ret.storage.kind != OPK_LOCAL) {
- compiler_panic(t->c, a->loc,
- "aarch64 call: sret destination must be LOCAL");
- }
- AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot");
- aa64_emit_addr_adjust(mc, 8, 29, -(i32)s->off);
- }
-
- for (u32 i = 0; i < d->nargs; ++i) {
- emit_arg_value(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off,
- (d->flags & CG_CALL_TAIL) != 0);
- }
-
- u32 needed = (stack_off + 15u) & ~15u;
- if ((d->flags & CG_CALL_TAIL) == 0 && needed > a->max_outgoing) {
- if (a->known_frame) {
- compiler_panic(t->c, a->loc,
- "aarch64 call: known frame outgoing area too small");
- }
- a->max_outgoing = needed;
- }
-
- if (d->flags & CG_CALL_TAIL) {
- if (d->abi && d->abi->has_sret) {
- /* Forward this function's own incoming sret pointer (spilled to
- * sret_ptr_slot at entry) into x8 for the callee. The return-shape
- * precondition guarantees the callee writes the same type, so the
- * forwarded pointer is correct. Load while x29 still addresses this
- * frame, before aa_tail_branch tears it down; x8 is untouched by the
- * frame restore and any indirect-callee move (AA_TMP0 = x9). */
- AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE)
- ? aa64_slot_get(a, a->sret_ptr_slot)
- : NULL;
- if (!s)
- compiler_panic(t->c, a->loc,
- "aarch64 tail call: missing incoming sret slot");
- aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off));
- }
- aa_check_tail_stack_args(t, stack_off);
- aa_tail_branch(t, d->callee);
- return;
- }
-
- if (d->callee.kind == OPK_GLOBAL) {
- u32 bl_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_bl_base());
- mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26,
- d->callee.v.global.sym, d->callee.v.global.addend, 0, 0);
- } else if (d->callee.kind == OPK_REG) {
- aa64_emit32(mc, aa64_blr(reg_num(d->callee)));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported",
- (int)d->callee.kind);
- }
-
- const ABIArgInfo* ri = &d->abi->ret;
- if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) {
- return;
- }
- if (ri->nparts == 0) return;
-
- Operand rs = d->ret.storage;
- u32 next_int_ret = 0, next_fp_ret = 0;
- for (u16 i = 0; i < ri->nparts; ++i) {
- const ABIArgPart* p = &ri->parts[i];
- u32 src_reg;
- if (p->cls == ABI_CLASS_INT) {
- src_reg = next_int_ret++;
- } else if (p->cls == ABI_CLASS_FP) {
- src_reg = next_fp_ret++;
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl",
- (int)p->cls);
- }
-
- if (rs.kind == OPK_REG) {
- if (ri->nparts != 1) {
- compiler_panic(t->c, a->loc,
- "aarch64 call: REG ret_storage with %u parts",
- (unsigned)ri->nparts);
- }
- if (p->cls == ABI_CLASS_INT) {
- u32 sf = (p->size == 8) ? 1u : 0u;
- aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg));
- } else {
- if (p->size == 16)
- aa64_emit32(mc, aa64_mov_v16b(reg_num(rs), src_reg));
- else {
- u32 type = (p->size == 8) ? 1u : 0u;
- aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg));
- }
- }
- } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) {
- u32 base_reg;
- i32 base_off;
- if (rs.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, rs.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
- base_reg = 29;
- base_off = -(i32)s->off;
- } else {
- base_reg = rs.v.ind.base & 0x1f;
- base_off = rs.v.ind.ofs;
- }
- u32 sidx = size_idx_for_bytes(p->size);
- i32 off = base_off + (i32)p->src_offset;
- if (p->cls == ABI_CLASS_INT) {
- aa64_emit_stur_off(mc, sidx, src_reg, base_reg, off, AA_TMP0);
- } else {
- aa_emit_str_fp_any(mc, sidx, src_reg, base_reg, off);
- }
- } else if (rs.kind == OPK_IMM &&
- rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) {
- /* void return placeholder */
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 call: ret_storage kind %d unsupported",
- (int)rs.kind);
- }
- }
-}
-
-static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (p->flags & CG_CALL_TAIL) {
- if (p->has_sret) {
- /* Forward the function's own incoming sret pointer into x8 (see the
- * O0 path in aa_call). Load before aa_tail_branch tears the frame
- * down; x8 survives the restore and any indirect-callee move. */
- AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE)
- ? aa64_slot_get(a, a->sret_ptr_slot)
- : NULL;
- if (!s)
- compiler_panic(t->c, a->loc,
- "aarch64 tail call: missing incoming sret slot");
- aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off));
- }
- aa_check_tail_stack_args(t, aa_call_plan_stack_raw_size(p));
- aa_tail_branch(t, p->callee);
- return;
- }
-
- {
- u32 needed = (aa_call_plan_stack_raw_size(p) + 15u) & ~15u;
- if (needed > a->max_outgoing) {
- if (a->known_frame)
- compiler_panic(
- t->c, a->loc,
- "aarch64 call plan: known frame outgoing area too small");
- a->max_outgoing = needed;
- }
- }
-
- if (p->callee.kind == OPK_GLOBAL) {
- u32 bl_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_bl_base());
- mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26,
- p->callee.v.global.sym, p->callee.v.global.addend, 0, 0);
- } else if (p->callee.kind == OPK_REG) {
- aa64_emit32(mc, aa64_blr(reg_num(p->callee)));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 emit_call_plan: callee kind %d unsupported",
- (int)p->callee.kind);
- }
-}
-
-static Operand aa_call_plan_offset_operand(CGTarget* t, Operand op,
- u32 offset) {
- if (!offset) return op;
- if (op.kind == OPK_INDIRECT) {
- aa_assert_no_index(t, op, "call plan offset operand");
- op.v.ind.ofs += (i32)offset;
- } else if (op.kind == OPK_LOCAL) {
- AAImpl* a = impl_of(t);
- AASlot* s = aa64_slot_get(a, op.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call plan: bad slot");
- op.kind = OPK_INDIRECT;
- op.v.ind.base = 29;
- op.v.ind.index = REG_NONE;
- op.v.ind.log2_scale = 0;
- op.v.ind.ofs = -(i32)s->off + (i32)offset;
- }
- return op;
-}
-
-static void aa_load_call_arg(CGTarget* t, Operand dst,
- const CGCallPlanMove* m) {
- Operand src = aa_call_plan_offset_operand(t, m->src, m->src_offset);
- if (m->src_kind == CG_CALL_PLAN_SRC_ADDR) {
- aa_addr_of(t, dst, src);
- return;
- }
- if (src.kind == OPK_GLOBAL) {
- aa_addr_of(t, dst, src);
- return;
- }
- aa_load(t, dst, src, m->mem);
-}
-
-static void aa_store_call_ret(CGTarget* t, const CGCallPlanRet* r,
- Operand src) {
- Operand dst = aa_call_plan_offset_operand(t, r->dst, r->dst_offset);
- aa_store(t, dst, src, r->mem);
-}
-
-static void aa_store_call_arg(CGTarget* t, const CGCallPlanMove* m) {
- Operand addr;
- addr = aa_call_stack_arg_addr(t, m->stack_offset,
- m->dst_kind == CG_CALL_PLAN_TAIL_STACK);
- addr.type = m->mem.type;
-
- if (m->src_kind == CG_CALL_PLAN_SRC_ADDR) {
- Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type};
- tmp.v.reg = AA_TMP0;
- aa_load_call_arg(t, tmp, m);
- aa_store(t, addr, tmp, m->mem);
- return;
- }
-
- if (m->src.kind == OPK_REG || m->src.kind == OPK_IMM) {
- aa_store(t, addr, m->src, m->mem);
- return;
- }
- if (m->src.kind == OPK_GLOBAL) {
- Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type};
- tmp.v.reg = AA_TMP0;
- aa_load_call_arg(t, tmp, m);
- aa_store(t, addr, tmp, m->mem);
- return;
- }
- if (m->src.kind == OPK_LOCAL || m->src.kind == OPK_INDIRECT) {
- Operand tmp = {.kind = OPK_REG, .cls = m->cls, .type = m->mem.type};
- tmp.v.reg = m->cls == RC_FP ? AA_FP_TMP0 : AA_TMP0;
- aa_load_call_arg(t, tmp, m);
- aa_store(t, addr, tmp, m->mem);
- return;
- }
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 store_call_arg: source kind %d unsupported",
- (int)m->src.kind);
-}
-
-static void aa_ret(CGTarget* t, const CGABIValue* val) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (val) {
- const ABIArgInfo* ri = val->abi;
- if (ri && ri->kind == ABI_ARG_INDIRECT) {
- if (val->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot");
- if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot);
- if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
- }
- u32 nbytes = s->size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit_ldur_off(mc, 3, AA_TMP0, 29, -(i32)s->off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit_ldur_off(mc, 2, AA_TMP0, 29, -(i32)s->off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit_ldur_off(mc, 1, AA_TMP0, 29, -(i32)s->off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit_ldur_off(mc, 0, AA_TMP0, 29, -(i32)s->off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
- i += 1;
- }
- } else if (val->storage.kind == OPK_INDIRECT) {
- u32 nbytes = val->size;
- if (!nbytes) {
- compiler_panic(t->c, a->loc,
- "aarch64 ret indirect: missing aggregate size");
- }
- if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot);
- if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
- }
- u32 base_reg = val->storage.v.ind.base & 0x1f;
- i32 base_off = val->storage.v.ind.ofs;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit_ldur_off(mc, 3, AA_TMP0, base_reg, base_off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit_ldur_off(mc, 2, AA_TMP0, base_reg, base_off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit_ldur_off(mc, 1, AA_TMP0, base_reg, base_off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit_ldur_off(mc, 0, AA_TMP0, base_reg, base_off + (i32)i,
- AA_TMP0);
- aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
- i += 1;
- }
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 ret indirect: storage kind %d unsupported",
- (int)val->storage.kind);
- }
- } else if (val->storage.kind == OPK_REG) {
- if (val->storage.cls == RC_FP) {
- if (type_byte_size(val->storage.type) == 16) {
- if (reg_num(val->storage) != 0)
- aa64_emit32(mc, aa64_mov_v16b(/*Rd=*/0, reg_num(val->storage)));
- } else {
- u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
- if (reg_num(val->storage) != 0)
- aa64_emit32(mc,
- aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
- }
- } else {
- u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- if (reg_num(val->storage) != 0)
- aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
- }
- } else if (val->storage.kind == OPK_IMM) {
- u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm);
- } else if (val->storage.kind == OPK_LOCAL ||
- val->storage.kind == OPK_INDIRECT) {
- u32 base_reg;
- i32 base_off;
- if (val->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot");
- base_reg = 29;
- base_off = -(i32)s->off;
- } else {
- base_reg = val->storage.v.ind.base & 0x1f;
- base_off = val->storage.v.ind.ofs;
- }
- const ABIArgInfo* ri2 = val->abi;
- u16 nparts = ri2 ? ri2->nparts : 0;
- /* INT parts load into x0..x{n-1}. If the base address sits in one of
- * those registers, loading that part clobbers the base before later
- * parts are read (e.g. `ldur x0,[x0]; ldur w1,[x0,#8]`). Park the base
- * in a scratch (x10, never a return reg) when an earlier INT part would
- * overwrite it. FP parts target v-regs and never alias the int base. */
- u32 load_base = base_reg;
- for (u16 i = 0; i + 1u < nparts; ++i) {
- if (ri2->parts[i].cls == ABI_CLASS_INT && (u32)i == base_reg) {
- aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, AA_TMP1, base_reg));
- load_base = AA_TMP1;
- break;
- }
- }
- for (u16 i = 0; i < nparts; ++i) {
- const ABIArgPart* pt = &ri2->parts[i];
- u32 sidx = size_idx_for_bytes(pt->size);
- i32 off = base_off + (i32)pt->src_offset;
- if (pt->cls == ABI_CLASS_INT) {
- aa64_emit_ldur_off(mc, sidx, /*Rt=*/i, load_base, off, AA_TMP0);
- } else if (pt->cls == ABI_CLASS_FP) {
- aa_emit_ldr_fp_any(mc, sidx, /*Rt=*/i, load_base, off);
- } else {
- compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl",
- (int)pt->cls);
- }
- }
- }
- }
- if (a->omit_frame) {
- aa64_emit32(mc, aa64_ret(AA64_LR));
- return;
- }
- u32 bpos = mc->pos(mc);
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
- (void)bpos;
-}
-
-/* ============================================================
- * alloca
- * ============================================================ */
-
-static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (d.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG");
- }
- if (align > 16) {
- compiler_panic(t->c, a->loc,
- "aarch64 alloca: align %u > 16 not yet supported", align);
- }
-
- if (sz.kind == OPK_IMM) {
- i64 v = sz.v.imm;
- if (v < 0) {
- compiler_panic(t->c, a->loc, "aarch64 alloca: negative size");
- }
- u64 aligned = ((u64)v + 15u) & ~(u64)15u;
- if (aligned == 0) aligned = 16;
- if (aligned > 0xfffu) {
- compiler_panic(t->c, a->loc,
- "aarch64 alloca: const size %llu too large for v1",
- (unsigned long long)aligned);
- }
- aa64_emit32(mc,
- aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0));
- } else if (sz.kind == OPK_REG) {
- u32 sz_reg = reg_num(sz);
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0));
- aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63));
- aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59));
- aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported",
- (int)sz.kind);
- }
-
- if (a->nadd_patches == a->add_patches_cap) {
- u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4;
- struct AAAllocaPatch* nb =
- arena_array(t->c->tu, struct AAAllocaPatch, ncap);
- if (a->add_patches)
- memcpy(nb, a->add_patches, sizeof(*nb) * a->nadd_patches);
- a->add_patches = nb;
- a->add_patches_cap = ncap;
- }
- u32 dst_reg = reg_num(d);
- a->add_patches[a->nadd_patches].pos = mc->pos(mc);
- a->add_patches[a->nadd_patches].dst_reg = dst_reg;
- a->nadd_patches++;
- aa64_emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/ 31, 0, 0));
- a->has_alloca = 1;
-}
-
-/* ============================================================
- * Varargs
- * ============================================================ */
-
-static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) {
- if (ofs == 0)
- aa64_emit32(mc, aa64_mov_reg(1, dst, 29));
- else if (ofs > 0 && (u32)ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0));
- else if (ofs < 0 && (u32)(-ofs) <= 0xfff)
- aa64_emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0));
- else {
- aa64_emit_load_imm(mc, 1, dst, ofs);
- aa64_emit32(mc, aa64_add(1, dst, 29, dst));
- }
-}
-
-static void aa_va_start_(CGTarget* t, Operand ap_op) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- if (!a->is_variadic) {
- compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic");
- }
- u32 ap = reg_num(ap_op);
- if (t->c->target.os == CFREE_OS_MACOS) {
- u32 ofs = 16u + a->next_param_stack;
- if (ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0));
- else {
- aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs);
- aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0));
- }
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0));
- return;
- }
- if (t->c->target.os == CFREE_OS_WINDOWS) {
- if (a->next_param_int < 8) {
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)(a->next_param_int * 8u));
- } else {
- u32 ofs = 16u + a->next_param_stack;
- if (ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0));
- else {
- aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs);
- aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0));
- }
- }
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0));
- return;
- }
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
-
- {
- u32 ofs = 16u + a->next_param_stack;
- if (ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0));
- else {
- aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs);
- aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0));
- }
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0));
- }
- emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8));
- emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16));
- aa64_emit_load_imm(mc, 0, AA_TMP0, (i64)((i32)(a->next_param_int * 8u) - 64));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24));
- aa64_emit_load_imm(mc, 0, AA_TMP0,
- (i64)((i32)(a->next_param_fp * 16u) - 128));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28));
-}
-
-static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op,
- CfreeCgTypeId ty) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 ap = reg_num(ap_op);
- int is_fp = (dst.cls == RC_FP);
- u32 offs_field = is_fp ? 28u : 24u;
- u32 top_field = is_fp ? 16u : 8u;
- u32 stride_reg = is_fp ? 16u : 8u;
- u32 sz = type_byte_size(ty);
- u32 sidx = size_idx_for_bytes(sz);
-
- if (t->c->target.os == CFREE_OS_MACOS) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0));
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0));
- aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0));
- return;
- }
- if (t->c->target.os == CFREE_OS_WINDOWS) {
- MCLabel L_store = mc->label_new(mc);
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0));
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0));
-
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- if (gs) {
- emit_fp_off(mc, AA_TMP2, -(i32)gs->off + 64);
- aa64_emit32(mc, aa64_subs_reg(1, 31u, AA_TMP1, AA_TMP2));
- aa64_emit32(mc, aa64_b_cond(0x1 /*NE*/));
- mc->emit_label_ref(mc, L_store, R_AARCH64_CONDBR19, 4, 0);
- u32 ofs = 16u + a->next_param_stack;
- if (ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, 29, ofs, 0));
- else {
- aa64_emit_load_imm(mc, 1, AA_TMP1, (i64)ofs);
- aa64_emit32(mc, aa64_add(1, AA_TMP1, 29, AA_TMP1));
- }
- }
- mc->label_place(mc, L_store);
- aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0));
- return;
- }
-
- MCLabel L_stack = mc->label_new(mc);
- MCLabel L_done = mc->label_new(mc);
-
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field));
- aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0));
- aa64_emit32(mc, aa64_b_cond(0xa /*GE*/));
- mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0);
-
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field));
- aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31));
- aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0));
- aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0));
- aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field));
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
-
- mc->label_place(mc, L_stack);
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0));
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0));
- aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0));
-
- mc->label_place(mc, L_done);
-}
-
-static void aa_va_end_(CGTarget* t, Operand a) {
- (void)t;
- (void)a;
-}
-
-static void aa_va_copy_(CGTarget* t, Operand d, Operand s) {
- MCEmitter* mc = t->mc;
- u32 dr = reg_num(d);
- u32 sr = reg_num(s);
- if (t->c->target.os == CFREE_OS_MACOS) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, 0));
- aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, 0));
- return;
- }
- if (t->c->target.os == CFREE_OS_WINDOWS) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, 0));
- aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, 0));
- return;
- }
- for (u32 i = 0; i < 32u; i += 8u) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i));
- }
-}
-
-/* ============================================================
- * Atomics
- * ============================================================ */
-
-static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC8007C00u : 0x88007C00u) | ((Rs & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC800FC00u : 0x8800FC00u) | ((Rs & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_cbnz(u32 sf64, u32 Rt) {
- return 0x35000000u | (sf64 << 31) | (Rt & 0x1f);
-}
-
-static int mem_order_is_acquire(MemOrder o) {
- return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST ||
- o == MO_CONSUME;
-}
-static int mem_order_is_release(MemOrder o) {
- return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST;
-}
-
-static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma,
- MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- aa_assert_no_index(t, addr, "atomic_load");
- u32 base;
- if (addr.kind == OPK_REG) {
- base = reg_num(addr);
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot");
- base = AA_TMP0;
- aa64_emit_addr_adjust(mc, base, 29, -(i32)s->off);
- } else if (addr.kind == OPK_INDIRECT) {
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (m.ofs != 0) {
- aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs);
- base = AA_TMP0;
- } else {
- base = m.base;
- }
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_load: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (mem_order_is_acquire(ord)) {
- aa64_emit32(mc, aa64_ldar(sf, reg_num(dst), base));
- } else {
- u32 sidx = size_idx_for_bytes(ma.size);
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0));
- }
-}
-
-static void aa_atomic_store(CGTarget* t, Operand addr, Operand src,
- MemAccess ma, MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- u32 src_reg;
- if (src.kind == OPK_IMM) {
- src_reg = AA_TMP1;
- aa64_emit_load_imm(mc, sf, src_reg, src.v.imm);
- } else if (src.kind == OPK_REG) {
- src_reg = reg_num(src);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_store: src kind %d unsupported",
- (int)src.kind);
- }
- aa_assert_no_index(t, addr, "atomic_store");
- u32 base;
- if (addr.kind == OPK_REG) {
- base = reg_num(addr);
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot");
- base = AA_TMP0;
- aa64_emit_addr_adjust(mc, base, 29, -(i32)s->off);
- } else if (addr.kind == OPK_INDIRECT) {
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (m.ofs != 0) {
- aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs);
- base = AA_TMP0;
- } else {
- base = m.base;
- }
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_store: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (mem_order_is_release(ord)) {
- aa64_emit32(mc, aa64_stlr(sf, src_reg, base));
- } else {
- u32 sidx = size_idx_for_bytes(ma.size);
- aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0));
- }
-}
-
-static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new,
- u32 prior, u32 val) {
- switch (op) {
- case AO_XCHG:
- aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val));
- break;
- case AO_ADD:
- aa64_emit32(mc, aa64_add(sf, dst_new, prior, val));
- break;
- case AO_SUB:
- aa64_emit32(mc, aa64_sub(sf, dst_new, prior, val));
- break;
- case AO_AND:
- aa64_emit32(mc, aa64_and(sf, dst_new, prior, val));
- break;
- case AO_OR:
- aa64_emit32(mc, aa64_orr(sf, dst_new, prior, val));
- break;
- case AO_XOR:
- aa64_emit32(mc, aa64_eor(sf, dst_new, prior, val));
- break;
- case AO_NAND:
- aa64_emit32(mc, aa64_and(sf, dst_new, prior, val));
- aa64_emit32(mc, aa64_mvn(sf, dst_new, dst_new));
- break;
- default:
- aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val));
- break;
- }
-}
-
-static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
- Operand val, MemAccess ma, MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- aa_assert_no_index(t, addr, "atomic_rmw");
- u32 base = AA_TMP0;
- if (addr.kind == OPK_REG) {
- aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot");
- aa64_emit_addr_adjust(mc, AA_TMP0, 29, -(i32)s->off);
- } else if (addr.kind == OPK_INDIRECT) {
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (m.base != AA_TMP0 || m.ofs != 0)
- aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs);
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported",
- (int)addr.kind);
- }
- u32 vreg = AA_TMP1;
- if (val.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, vreg, val.v.imm);
- } else if (val.kind == OPK_REG) {
- aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val)));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported",
- (int)val.kind);
- }
-
- int do_acq = mem_order_is_acquire(ord);
- int do_rel = mem_order_is_release(ord);
-
- MCLabel L_retry = mc->label_new(mc);
- mc->label_place(mc, L_retry);
-
- if (do_acq)
- aa64_emit32(mc, aa64_ldaxr(sf, reg_num(dst), base));
- else
- aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base));
-
- emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg);
-
- if (do_rel)
- aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base));
- else
- aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base));
-
- u32 cbnz_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_cbnz(0, vreg));
- mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
- (void)cbnz_pos;
-}
-
-static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
- Operand expected, Operand desired, MemAccess ma,
- MemOrder succ, MemOrder fail) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
- (void)fail;
-
- aa_assert_no_index(t, addr, "atomic_cas");
- u32 base = AA_TMP0;
- if (addr.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
- else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot");
- aa64_emit_addr_adjust(mc, AA_TMP0, 29, -(i32)s->off);
- } else if (addr.kind == OPK_INDIRECT) {
- AAAddrMode m = addr_mode(t, addr, AA_TMP0);
- if (m.base != AA_TMP0 || m.ofs != 0)
- aa64_emit_addr_adjust(mc, AA_TMP0, m.base, m.ofs);
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (expected.kind == OPK_IMM)
- aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm);
- else if (expected.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected)));
- else
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported",
- (int)expected.kind);
- if (desired.kind == OPK_IMM)
- aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm);
- else if (desired.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired)));
- else
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported",
- (int)desired.kind);
-
- int do_acq = mem_order_is_acquire(succ);
- int do_rel = mem_order_is_release(succ);
-
- MCLabel L_retry = mc->label_new(mc);
- MCLabel L_fail = mc->label_new(mc);
- MCLabel L_done = mc->label_new(mc);
-
- mc->label_place(mc, L_retry);
- if (do_acq)
- aa64_emit32(mc, aa64_ldaxr(sf, reg_num(prior), base));
- else
- aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base));
-
- aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1));
- aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/));
- mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0);
-
- if (do_rel)
- aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base));
- else
- aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base));
- aa64_emit32(mc, aa64_cbnz(0, AA_TMP1));
- mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
-
- aa64_emit_load_imm(mc, 0, reg_num(ok), 1);
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
-
- mc->label_place(mc, L_fail);
- aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY));
- aa64_emit_load_imm(mc, 0, reg_num(ok), 0);
-
- mc->label_place(mc, L_done);
-}
-
-static void aa_fence(CGTarget* t, MemOrder o) {
- (void)o;
- if (o == MO_RELAXED) return;
- aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
-}
-
-/* ============================================================
- * Intrinsics
- * ============================================================ */
-
-static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) {
- return 0x5AC00400u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rev_w(u32 Rd, u32 Rn) {
- return 0x5AC00800u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rev_x(u32 Rd, u32 Rn) {
- return 0xDAC00C00u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) {
- return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) {
- return (sf64 ? 0xDAC01000u : 0x5AC01000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) {
- return 0x0E205800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f);
-}
-static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) {
- return 0x0E31B800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f);
-}
-static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
- return 0x2B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
- return aa64_dp3_pack((AA64DP3){
- .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd});
-}
-static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) {
- return aa64_smaddl(Rd, Rn, Rm, AA64_ZR);
-}
-static inline u32 aa64_umaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
- return aa64_dp3_pack((AA64DP3){
- .sf = 1, .op31 = 5, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd});
-}
-static inline u32 aa64_umull(u32 Rd, u32 Rn, u32 Rm) {
- return aa64_umaddl(Rd, Rn, Rm, AA64_ZR);
-}
-static inline u32 aa64_smulh(u32 Rd, u32 Rn, u32 Rm) {
- return 0x9B407C00u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_umulh(u32 Rd, u32 Rn, u32 Rm) {
- return 0x9BC07C00u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) {
- return 0xEB200000u | ((Rm & 0x1f) << 16) | (6u << 13) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
- const Operand* args, u32 na) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- (void)nd;
-
- switch (kind) {
- case INTRIN_POPCOUNT: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sz_in = type_byte_size(src.type);
- if (sz_in == 8)
- aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src)));
- else
- aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src)));
- aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0));
- aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0));
- aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0));
- return;
- }
- case INTRIN_CLZ: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src)));
- return;
- }
- case INTRIN_CTZ: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src)));
- aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst)));
- return;
- }
- case INTRIN_BSWAP16: {
- aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_BSWAP32: {
- aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_BSWAP64: {
- aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_MEMCPY:
- case INTRIN_MEMMOVE: {
- Operand da = args[0], sa = args[1], nb = args[2];
- if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
- compiler_panic(
- t->c, a->loc,
- "aarch64 intrinsic: %.*s with non-const n or non-REG ptr",
- SLICE_ARG(
- slice_from_cstr(kind == INTRIN_MEMCPY ? "memcpy" : "memmove")));
- }
- u32 dr = reg_num(da);
- u32 sr = reg_num(sa);
- u32 n = (u32)nb.v.imm;
- if (kind == INTRIN_MEMCPY) {
- u32 i = 0;
- while (i + 8 <= n) {
- aa_emit_load_at(mc, 3, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 3, AA_TMP2, dr, i);
- i += 8;
- }
- while (i + 4 <= n) {
- aa_emit_load_at(mc, 2, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 2, AA_TMP2, dr, i);
- i += 4;
- }
- while (i + 2 <= n) {
- aa_emit_load_at(mc, 1, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 1, AA_TMP2, dr, i);
- i += 2;
- }
- while (i < n) {
- aa_emit_load_at(mc, 0, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 0, AA_TMP2, dr, i);
- i += 1;
- }
- } else {
- u32 i = n;
- while (i >= 8) {
- i -= 8;
- aa_emit_load_at(mc, 3, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 3, AA_TMP2, dr, i);
- }
- while (i >= 4) {
- i -= 4;
- aa_emit_load_at(mc, 2, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 2, AA_TMP2, dr, i);
- }
- while (i >= 2) {
- i -= 2;
- aa_emit_load_at(mc, 1, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 1, AA_TMP2, dr, i);
- }
- while (i >= 1) {
- i -= 1;
- aa_emit_load_at(mc, 0, AA_TMP2, sr, i);
- aa_emit_store_at(mc, 0, AA_TMP2, dr, i);
- }
- }
- return;
- }
- case INTRIN_MEMSET: {
- Operand da = args[0], bv = args[1], nb = args[2];
- if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
- compiler_panic(
- t->c, a->loc,
- "aarch64 intrinsic: memset with non-const n / non-REG ptr");
- }
- u32 dr = reg_num(da);
- u32 n = (u32)nb.v.imm;
- u32 byte;
- u32 src_reg;
- if (bv.kind == OPK_IMM) {
- byte = (u32)(bv.v.imm & 0xffu);
- if (byte == 0) {
- src_reg = 31u;
- } else {
- u64 b64 = byte;
- b64 |= b64 << 8;
- b64 |= b64 << 16;
- b64 |= b64 << 32;
- aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64);
- src_reg = AA_TMP2;
- }
- } else if (bv.kind == OPK_REG) {
- aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll);
- aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR));
- src_reg = AA_TMP2;
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 intrinsic: memset byte kind %d unsupported",
- (int)bv.kind);
- }
- u32 i = 0;
- while (i + 8 <= n) {
- aa_emit_store_at(mc, 3, src_reg, dr, i);
- i += 8;
- }
- while (i + 4 <= n) {
- aa_emit_store_at(mc, 2, src_reg, dr, i);
- i += 4;
- }
- while (i + 2 <= n) {
- aa_emit_store_at(mc, 1, src_reg, dr, i);
- i += 2;
- }
- while (i < n) {
- aa_emit_store_at(mc, 0, src_reg, dr, i);
- i += 1;
- }
- return;
- }
- case INTRIN_PREFETCH:
- (void)args;
- (void)na;
- return;
- case INTRIN_ASSUME_ALIGNED: {
- Operand src = args[0];
- Operand dst = dsts[0];
- if (reg_num(src) != reg_num(dst)) {
- aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src)));
- }
- return;
- }
- case INTRIN_EXPECT: {
- Operand val = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- if (val.kind == OPK_REG) {
- if (reg_num(val) != reg_num(dst)) {
- aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val)));
- }
- } else if (val.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 intrinsic: expect val kind %d unsupported",
- (int)val.kind);
- }
- return;
- }
- case INTRIN_UNREACHABLE:
- case INTRIN_TRAP:
- aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u));
- return;
- case INTRIN_SADD_OVERFLOW:
- case INTRIN_UADD_OVERFLOW:
- case INTRIN_SSUB_OVERFLOW:
- case INTRIN_USUB_OVERFLOW: {
- Operand a_op = args[0], b_op = args[1];
- Operand dval = dsts[0], dovf = dsts[1];
- u32 sf = type_is_64(dval.type) ? 1u : 0u;
- u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rb =
- aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- u32 word = (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW)
- ? aa64_adds_reg(sf, reg_num(dval), ra, rb)
- : aa64_subs_reg(sf, reg_num(dval), ra, rb);
- u32 cond = (kind == INTRIN_UADD_OVERFLOW) ? 0x2u /*CS*/
- : (kind == INTRIN_USUB_OVERFLOW) ? 0x3u /*CC*/
- : 0x6u /*VS*/;
- aa64_emit32(mc, word);
- aa64_emit32(mc, aa64_cset(0, reg_num(dovf), cond));
- return;
- }
- case INTRIN_SMUL_OVERFLOW: {
- Operand a_op = args[0], b_op = args[1];
- Operand dval = dsts[0], dovf = dsts[1];
- u32 sf = type_is_64(dval.type) ? 1u : 0u;
- u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rb =
- aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- if (sf) {
- aa64_emit32(mc, aa64_mul(1, reg_num(dval), ra, rb));
- aa64_emit32(mc, aa64_smulh(reg_num(dovf), ra, rb));
- aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, reg_num(dval), 63, 63));
- aa64_emit32(mc, aa64_subs_reg(1, 31u, reg_num(dovf), AA_TMP2));
- } else {
- aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb));
- aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2));
- aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2));
- }
- aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
- return;
- }
- case INTRIN_UMUL_OVERFLOW: {
- Operand a_op = args[0], b_op = args[1];
- Operand dval = dsts[0], dovf = dsts[1];
- u32 sf = type_is_64(dval.type) ? 1u : 0u;
- u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rb =
- aa64_force_reg_int(t, b_op, sf, (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- if (sf) {
- aa64_emit32(mc, aa64_mul(1, reg_num(dval), ra, rb));
- aa64_emit32(mc, aa64_umulh(reg_num(dovf), ra, rb));
- } else {
- aa64_emit32(mc, aa64_umull(AA_TMP2, ra, rb));
- aa64_emit32(mc, aa64_ubfm(1, reg_num(dovf), AA_TMP2, 32, 63));
- aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2));
- }
- aa64_emit32(mc, aa64_subs_imm(1, 31u, reg_num(dovf), 0));
- aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
- return;
- }
- default:
- compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported",
- (int)kind);
- }
-}
-
-/* ============================================================
- * Inline asm block
- * ============================================================ */
-
-static void aa_asm_block(CGTarget* t, const char* tmpl,
- const AsmConstraint* outs, u32 no, Operand* oo,
- const AsmConstraint* ins, u32 ni, const Operand* io,
- const Sym* clobs, u32 nc) {
- AAImpl* a_impl = impl_of(t);
- for (u32 i = 0; i < nc; ++i) {
- Reg phys;
- RegClass cls;
- if (t->resolve_reg_name(t, clobs[i], &phys, &cls) != 0) continue;
- if (cls == RC_INT) {
- if (phys >= 19u && phys <= 28u) a_impl->used_cs_int_mask |= 1u << phys;
- } else if (cls == RC_FP) {
- if (phys >= 8u && phys <= 15u) a_impl->used_cs_fp_mask |= 1u << phys;
- }
- }
- AA64Asm* a = aa64_asm_open(t->c);
- aa64_inline_bind(a, outs, no, oo, ins, ni, io, clobs, nc);
- aa64_asm_run_template(a, t->mc, tmpl);
- aa64_asm_close(a);
-}
-
-/* ============================================================
- * Lifecycle / vtable constructor
- * ============================================================ */
-
-static void aa_set_loc(CGTarget* t, SrcLoc loc) {
- impl_of(t)->loc = loc;
- t->mc->set_loc(t->mc, loc);
-}
-
-static void aa_finalize(CGTarget* t) { (void)t; }
-
-static void aa_destroy(CGTarget* t) { (void)t; }
-
-static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); }
-
-CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
- AAImpl* a = arena_new(c->tu, AAImpl);
- memset(a, 0, sizeof *a);
-
- CGTarget* t = &a->base;
- t->c = c;
- t->obj = o;
- t->mc = m;
-
- t->func_begin = aa_func_begin;
- t->func_begin_known_frame = aa_func_begin_known_frame;
- t->func_end = aa_func_end;
- t->frame_slot = aa_frame_slot;
- t->param = aa_param;
-
- t->load_imm = aa_load_imm;
- t->load_const = aa_load_const;
- t->copy = aa_copy;
- t->load = aa_load;
- t->store = aa_store;
- t->addr_of = aa_addr_of;
- t->tls_addr_of = aa_tls_addr_of;
- t->copy_bytes = aa_copy_bytes;
- t->set_bytes = aa_set_bytes;
- t->bitfield_load = aa_bitfield_load;
- t->bitfield_store = aa_bitfield_store;
-
- t->binop = aa_binop;
- t->unop = aa_unop;
- t->convert = aa_convert;
-
- t->call = aa_call;
- t->load_call_arg = aa_load_call_arg;
- t->emit_call_plan = aa_emit_call_plan;
- t->store_call_arg = aa_store_call_arg;
- t->store_call_ret = aa_store_call_ret;
- t->call_stack_size = aa_call_stack_size;
- t->tail_call_unrealizable_reason = aa_tail_call_unrealizable_reason;
- t->ret = aa_ret;
-
- t->alloca_ = aa_alloca_;
- t->va_start_ = aa_va_start_;
- t->va_arg_ = aa_va_arg_;
- t->va_end_ = aa_va_end_;
- t->va_copy_ = aa_va_copy_;
-
- t->atomic_load = aa_atomic_load;
- t->atomic_store = aa_atomic_store;
- t->atomic_rmw = aa_atomic_rmw;
- t->atomic_cas = aa_atomic_cas;
- t->fence = aa_fence;
-
- t->intrinsic = aa_intrinsic;
- t->asm_block = aa_asm_block;
-
- t->set_loc = aa_set_loc;
- t->finalize = aa_finalize;
- t->destroy = aa_destroy;
-
- /* alloc/label/scope vtable entries */
- aa_alloc_vtable_init(t);
-#if CFREE_OPT_ENABLED
- aa_coord_vtable_init(t);
-#endif
-
- /* Suppress unused warning. */
- (void)type_is_signed;
-
- compiler_defer(c, cgt_cleanup, t);
- return t;
-}
diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c
@@ -1,373 +0,0 @@
-/* aarch64/opt_coord.c — opt/backend register coordination hooks.
- * Static arrays so opt_machinize can query the backend instead of
- * hard-coding arch knowledge. */
-
-#include "arch/aa64/internal.h"
-
-/* ============================================================
- * Static register tables reported to caller-owned allocators. */
-
-static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23, 24, 25, 26, 27, 28};
-static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23};
-
-static const Reg aa_int_scratch[] = {16, 17};
-static const Reg aa_fp_scratch[] = {24, 25};
-
-static const CGPhysRegInfo aa_int_phys[] = {
- {0, RC_INT, 0,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {1, RC_INT, 1,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {2, RC_INT, 2, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {3, RC_INT, 3, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {4, RC_INT, 4, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {5, RC_INT, 5, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {6, RC_INT, 6, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {7, RC_INT, 7, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0},
- {8, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0,
- 0},
- {12, RC_INT, 0xff,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0},
- {13, RC_INT, 0xff,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0},
- {14, RC_INT, 0xff,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0},
- {15, RC_INT, 0xff,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_TEMP_PREFERRED, 0, 0},
- {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {20, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {21, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {22, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {23, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {24, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {25, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {26, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {27, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {28, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
-};
-static const CGPhysRegInfo aa_fp_phys[] = {
- {0, RC_FP, 0,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {1, RC_FP, 1,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {2, RC_FP, 2,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {3, RC_FP, 3,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {4, RC_FP, 4,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {5, RC_FP, 5,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {6, RC_FP, 6,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {7, RC_FP, 7,
- CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG | CG_REG_RET, 0, 0},
- {8, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {9, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {10, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {11, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {12, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {13, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {14, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {15, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4},
- {16, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {17, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {18, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {19, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {20, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {21, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {22, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {23, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {26, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {27, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {28, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {29, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
- {30, RC_FP, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0},
-};
-
-/* ============================================================
- * Vtable methods */
-
-static void aa_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out,
- u32* nregs) {
- (void)t;
- switch (cls) {
- case RC_INT:
- *out = aa_int_allocable;
- *nregs = sizeof aa_int_allocable / sizeof aa_int_allocable[0];
- break;
- case RC_FP:
- *out = aa_fp_allocable;
- *nregs = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0];
- break;
- default:
- *out = NULL;
- *nregs = 0;
- break;
- }
-}
-
-static void aa_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out,
- u32* nregs) {
- (void)t;
- switch (cls) {
- case RC_INT:
- *out = aa_int_scratch;
- *nregs = sizeof aa_int_scratch / sizeof aa_int_scratch[0];
- break;
- case RC_FP:
- *out = aa_fp_scratch;
- *nregs = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0];
- break;
- default:
- *out = NULL;
- *nregs = 0;
- break;
- }
-}
-
-static void aa_get_phys_regs(CGTarget* t, RegClass cls,
- const CGPhysRegInfo** out, u32* nregs) {
- (void)t;
- switch (cls) {
- case RC_INT:
- *out = aa_int_phys;
- *nregs = sizeof aa_int_phys / sizeof aa_int_phys[0];
- break;
- case RC_FP:
- *out = aa_fp_phys;
- *nregs = sizeof aa_fp_phys / sizeof aa_fp_phys[0];
- break;
- default:
- *out = NULL;
- *nregs = 0;
- break;
- }
-}
-
-static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) {
- (void)t;
- switch (cls) {
- case RC_INT:
- /* AAPCS64 caller-saved: x0-x18, x30 */
- return reg <= 18 || reg == 30;
- case RC_FP:
- /* AAPCS64 caller-saved: v0-v7, v16-v31 */
- return reg <= 7 || reg >= 16;
- default:
- return 0;
- }
-}
-
-static u32 aa_call_clobber_mask(CGTarget* t, const CGCallDesc* d,
- RegClass cls) {
- (void)t;
- (void)d;
- if (cls == RC_INT) return ((1u << 19) - 1u) | (1u << 30);
- if (cls == RC_FP) return 0xFFFF00FFu;
- return 0;
-}
-
-static u32 aa_callee_save_mask(CGTarget* t, RegClass cls) {
- (void)t;
- if (cls == RC_INT) {
- u32 mask = 0;
- for (u32 r = 19; r <= 28; ++r) mask |= 1u << r;
- return mask;
- }
- if (cls == RC_FP) return 0x0000FF00u;
- return 0;
-}
-
-static u32 aa_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi,
- RegClass cls) {
- (void)t;
- if (!abi || abi->ret.kind == ABI_ARG_IGNORE ||
- abi->ret.kind == ABI_ARG_INDIRECT)
- return 0;
- u32 mask = 0, ni = 0, nf = 0;
- for (u16 i = 0; i < abi->ret.nparts; ++i) {
- const ABIArgPart* p = &abi->ret.parts[i];
- if (cls == RC_INT && p->cls == ABI_CLASS_INT)
- mask |= 1u << ni++;
- else if (cls == RC_FP && p->cls == ABI_CLASS_FP)
- mask |= 1u << nf++;
- }
- return mask;
-}
-
-static int aa_windows_fp_vararg_plan(CGTarget* t, const CGABIValue* av) {
- return t->c->target.os == CFREE_OS_WINDOWS && av && av->abi == NULL &&
- av->storage.cls == RC_FP;
-}
-
-static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
- memset(out, 0, sizeof *out);
- out->callee = d->callee;
- out->flags = d->flags;
- out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
- out->has_sret = d->abi && d->abi->has_sret;
- out->is_variadic = d->abi && d->abi->variadic;
- for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
- out->clobber_mask[c] = aa_call_clobber_mask(t, d, (RegClass)c);
- out->return_mask[c] = aa_return_reg_mask(t, d->abi, (RegClass)c);
- }
- u32 cap = d->nargs * 2u + 2u;
- out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? cap : 1u);
- out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4);
- u32 next_int = 0, next_fp = 0, stack = 0;
- /* Ordinary sret call: pass the destination address in x8. A tail call
- * instead forwards the function's own incoming sret pointer (handled in
- * aa_emit_call_plan), and ret.storage is the void sentinel, so skip it. */
- if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) {
- CGCallPlanMove* m = &out->args[out->nargs++];
- m->src = d->ret.storage;
- m->src_kind = CG_CALL_PLAN_SRC_ADDR;
- m->dst_kind = CG_CALL_PLAN_REG;
- m->cls = RC_INT;
- m->dst_reg = 8;
- m->mem.type = d->ret.type;
- m->mem.size = 8;
- m->mem.align = 8;
- }
- for (u32 a = 0; a < d->nargs; ++a) {
- const CGABIValue* av = &d->args[a];
- const ABIArgInfo* ai = av->abi;
- ABIArgInfo vai;
- ABIArgPart vap;
- if (!ai) {
- memset(&vai, 0, sizeof vai);
- memset(&vap, 0, sizeof vap);
- vap.cls = aa_windows_fp_vararg_plan(t, av)
- ? ABI_CLASS_INT
- : (av->storage.cls == RC_FP ? ABI_CLASS_FP : ABI_CLASS_INT);
- vap.size = type_byte_size(av->type);
- vai.kind = ABI_ARG_DIRECT;
- vai.nparts = 1;
- vai.parts = &vap;
- ai = &vai;
- if (d->abi && d->abi->vararg_on_stack) next_int = next_fp = 8;
- }
- if (ai->kind == ABI_ARG_IGNORE) continue;
- if (ai->kind == ABI_ARG_INDIRECT) {
- CGCallPlanMove* m = &out->args[out->nargs++];
- m->src = av->storage;
- m->src_kind = CG_CALL_PLAN_SRC_ADDR;
- m->cls = RC_INT;
- if (next_int < 8) {
- m->dst_kind = CG_CALL_PLAN_REG;
- m->dst_reg = next_int++;
- } else {
- m->dst_kind = CG_CALL_PLAN_STACK;
- m->stack_offset = stack;
- stack += 8;
- }
- m->mem.type = av->type;
- m->mem.size = 8;
- m->mem.align = 8;
- continue;
- }
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* p = &ai->parts[i];
- CGCallPlanMove* m = &out->args[out->nargs++];
- m->src = av->nparts ? av->parts[i].op : av->storage;
- m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset;
- m->mem.type = av->type;
- m->mem.size = p->size;
- m->mem.align = p->align ? p->align : p->size;
- if (p->cls == ABI_CLASS_FP) {
- m->cls = RC_FP;
- if (next_fp < 8) {
- m->dst_kind = CG_CALL_PLAN_REG;
- m->dst_reg = next_fp++;
- } else {
- m->dst_kind = CG_CALL_PLAN_STACK;
- m->stack_offset = stack;
- stack += p->size > 8 ? p->size : 8;
- }
- } else {
- m->cls = RC_INT;
- if (next_int < 8) {
- m->dst_kind = CG_CALL_PLAN_REG;
- m->dst_reg = next_int++;
- } else {
- m->dst_kind = CG_CALL_PLAN_STACK;
- m->stack_offset = stack;
- stack += 8;
- }
- }
- }
- }
- if ((d->flags & CG_CALL_TAIL) == 0 && d->abi &&
- d->abi->ret.kind != ABI_ARG_IGNORE &&
- d->abi->ret.kind != ABI_ARG_INDIRECT) {
- u32 ni = 0, nf = 0;
- for (u16 i = 0; i < d->abi->ret.nparts; ++i) {
- const ABIArgPart* p = &d->abi->ret.parts[i];
- CGCallPlanRet* r = &out->rets[out->nrets++];
- r->dst = d->ret.storage;
- r->dst_offset = p->src_offset;
- r->mem.type = d->ret.type;
- r->mem.size = p->size;
- r->mem.align = p->align ? p->align : p->size;
- if (p->cls == ABI_CLASS_FP) {
- r->cls = RC_FP;
- r->src_reg = nf++;
- } else {
- r->cls = RC_INT;
- r->src_reg = ni++;
- }
- }
- }
-}
-
-static void aa_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs,
- u32 n) {
- AAImpl* a = impl_of(t);
- for (u32 i = 0; i < n; ++i) {
- Reg r = regs[i];
- switch (cls) {
- case RC_INT:
- if (r >= 19u && r <= 28u) a->used_cs_int_mask |= 1u << r;
- break;
- case RC_FP:
- if (r >= 8u && r <= 15u) a->used_cs_fp_mask |= 1u << r;
- break;
- default:
- break;
- }
- }
-}
-
-static void aa_plan_hard_regs(CGTarget* t, RegClass cls, const Reg* regs,
- u32 n) {
- AAImpl* a = impl_of(t);
- a->has_planned_regs = 1;
- for (u32 i = 0; i < n; ++i) {
- Reg r = regs[i];
- switch (cls) {
- case RC_INT:
- if (r >= 19u && r <= 28u) a->planned_cs_int_mask |= 1u << r;
- break;
- case RC_FP:
- if (r >= 8u && r <= 15u) a->planned_cs_fp_mask |= 1u << r;
- break;
- default:
- break;
- }
- }
-}
-
-void aa_coord_vtable_init(CGTarget* t) {
- t->get_allocable_regs = aa_get_allocable_regs;
- t->get_phys_regs = aa_get_phys_regs;
- t->get_scratch_regs = aa_get_scratch_regs;
- t->is_caller_saved = aa_is_caller_saved;
- t->call_clobber_mask = aa_call_clobber_mask;
- t->return_reg_mask = aa_return_reg_mask;
- t->callee_save_mask = aa_callee_save_mask;
- t->plan_call = aa_plan_call;
- t->plan_hard_regs = aa_plan_hard_regs;
- t->reserve_hard_regs = aa_reserve_hard_regs;
-}