kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

native.c (210050B)


      1 /* aa64 NativeTarget production-readiness checklist:
      2  * - ABI completeness: finish AAPCS64/Linux va_list and register-save-area
      3  *   lowering, verify Apple/AAPCS64/Windows arm64 differences, handle all
      4  *   homogeneous aggregates, indirect/byval/sret corner cases, small aggregate
      5  *   splitting, multi-register returns, stack alignment, and ABI diagnostics.
      6  * - Calls and returns: replace call-plus-return tail handling with true direct
      7  *   and indirect sibling calls, preserve musttail ABI guarantees, support stack
      8  *   argument reshuffling without clobbering live inputs, and cover all sret,
      9  *   variadic, FP, aggregate, and many-argument combinations.
     10  * - Frame lowering: implement known-frame/prologue integration for optimized
     11  *   emission, spill/reload hooks, callee-save tracking for integer and FP/SIMD
     12  *   registers, large-frame probing/materialization as needed by each platform,
     13  *   dynamic alloca restoration, and unwind/debug frame metadata.
     14  * - Operations and intrinsics: fill remaining scalar, FP, conversion, rounding,
     15  *   overflow, bit, vector/SIMD, trap, prefetch, and target-specific intrinsics;
     16  *   validate NaN/ordered/unordered FP compare semantics and integer narrowing
     17  *   behavior for every supported width.
     18  * - Aggregates and memory: support large constants, overlap-safe memmove,
     19  *   optimized bulk copy/set selection, bitfield load/store, packed/unaligned
     20  *   accesses, volatile access constraints, and record/slice edge cases across
     21  *   direct and optimized lowering.
     22  * - Atomics: replace ordinary load/store RMW/CAS sequences with correct LL/SC
     23  *   or LSE loops, implement acquire/release/seq_cst mappings precisely, handle
     24  *   failure ordering, byte/halfword/word/dword widths, and retry/clobber rules.
     25  * - Inline and file-scope asm: complete register/memory/immediate constraints,
     26  *   named operands, tied operands, early-clobber and clobber validation, hard
     27  *   register conflicts, memory barriers, outputs for aggregates/FP values, and
     28  *   file-scope asm integration. */
     29 
     30 #include <string.h>
     31 
     32 #include "abi/abi.h"
     33 #include "arch/aa64/aa64.h"
     34 #include "arch/aa64/asm.h"
     35 #include "arch/aa64/isa.h"
     36 #include "arch/aa64/regs.h"
     37 #include "asm/asm.h"
     38 #include "asm/asm_lex.h"
     39 #include "cg/native_argmove.h"
     40 #include "cg/native_asm.h"
     41 #include "cg/native_direct_target.h"
     42 #include "cg/native_frame.h"
     43 #include "cg/type.h"
     44 #include "core/arena.h"
     45 #include "core/bytes.h"
     46 #include "core/core.h"
     47 #include "core/pool.h"
     48 #include "core/slice.h"
     49 #include "obj/obj.h"
     50 
/* Attribute for static helpers that may be unreferenced: on GCC/Clang it
 * expands to __attribute__((unused)); on any other compiler it expands to
 * nothing. */
#if defined(__GNUC__) || defined(__clang__)
#define AA_UNUSED_FN __attribute__((unused))
#else
#define AA_UNUSED_FN
#endif
     56 
/* Fixed AArch64 register numbers plus the frame-record and patch-region sizes
 * used throughout this backend. */
enum {
  AA_X8 = 8u, /* indirect-result (sret) register; usable as a copy base that
                 aa_copy_bytes (which scratches only x16/x17) never clobbers */
  AA_TMP0 = 16u, /* x16, first scratch register */
  AA_TMP1 = 17u, /* x17, second scratch register */
  AA_FP = 29u,   /* x29, frame pointer */
  AA_LR = 30u,   /* x30, link register */
  AA_SP = 31u,   /* register number 31, used here to name sp */
  AA_FRAME_SAVE_SIZE = 16u, /* bytes occupied by the saved x29/x30 pair */
  /* Worst-case reserved prologue region (NDT single-pass path patches it in
   * place; the optimizer path reserves exactly what it emits). Sized to hold
   * the fat prologue plus the Windows large-frame stack probe (≤7 words, see
   * aa_words_stack_probe). */
  AA_PROLOGUE_WORDS = 32u,
  AA_TAIL_WORDS = 32u, /* words reserved for each AA_PATCH_TAIL region */
};
     73 
     74 /* Windows/AArch64 TLS Local-Exec. The TEB pointer lives in the reserved
     75  * platform register x18 (never allocated; see AA_PHYS_INT_RESERVED(18)), and
     76  * the thread's TLS-array pointer (TEB.ThreadLocalStoragePointer) sits at
     77  * TEB+0x58 — same offset as on Win64/x86-64. */
enum {
  AA_WIN_TEB_REG = 18u,           /* x18: holds the TEB pointer */
  AA_WIN_TEB_TLS_PTR_OFF = 0x58u, /* byte offset of the TLS-array pointer
                                   * (ThreadLocalStoragePointer) in the TEB */
};
     82 
     83 /* ============================================================================
     84  * AAPCS64 frame layout
     85  *
     86  * Two layouts. Every fp- or sp-relative offset in this file is computed via one
     87  * of the aa_fp_off_ / aa_sp_off_ helpers below — no site does bare arithmetic
     88  * on AA_FP / AA_SP, and no site outside those helpers branches on the layout.
     89  *
     90  * TOP-RECORD (default — single-pass -O0, fat frames, and out_stack>0 small
     91  * frames). fp anchors at the caller's saved-pair address near the top; sp at
     92  * the bottom of the outgoing-arg area. Offsets are frame-size-independent.
     93  *
     94  *   high addr   caller's stack frame
     95  *               +------------------------------+
     96  *               | incoming stack args          |  aa_fp_off_in_arg(a,i) = 16+i
     97  *               +------------------------------+
     98  *      fp  -->  | saved x29 (prev fp)          |  aa_fp_off_saved_fp() = 0
     99  *               | saved x30 (prev lr)          |  aa_fp_off_saved_lr() = 8
    100  *               +------------------------------+
    101  *               | frame slots                  |  aa_fp_off_slot(a,off) = -off
    102  *               |   (callee-saves + locals     |
    103  *               |    + spills + sret/variadic) |
    104  *               +------------------------------+
    105  *               | outgoing args                |  aa_sp_off_out_arg(i)
    106  *      sp  -->  +------------------------------+
    107  *   low addr                                       CFA = fp + 16
    108  *
    109  * BOTTOM-RECORD (fp_at_bottom — known-frame -O1 small frames with
    110  * callee-saves/locals and out_stack==0). The record moves to the bottom so the
    111  * sp adjustment folds into a pre/post-indexed stp/ldp (−2 insns/call). fp = sp;
    112  * slots/callee-saves stack ABOVE the record at positive offsets. Offsets depend
    113  * on frame_size (hence known-frame only, where the frame is final before body).
    114  *
    115  *   high addr   caller's stack frame
    116  *               +------------------------------+
    117  *               | incoming stack args          |  aa_fp_off_in_arg(a,i) = N+i
    118  *               +------------------------------+   <- caller's sp = CFA = fp+N
    119  *               | frame slots (+ align pad)    |  aa_fp_off_slot(a,off)=N-off
    120  *               |   (callee-saves + locals …)  |     (in [16, N), above record)
    121  *               +------------------------------+
    122  *  fp = sp -->  | saved x29 (prev fp)          |  aa_fp_off_saved_fp() = 0
    123  *               | saved x30 (prev lr)          |  aa_fp_off_saved_lr() = 8
    124  *   low addr    +------------------------------+   (N = frame_size;
    125  *                                                   out_stack==0)
    126  *
    127  * frame_size (N) = align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes +
    128  * out_stack). Tail calls write outgoing args into the caller's incoming-args
    129  * window — physically the same address, expressed via aa_fp_off_tail_out_arg.
    130  * ========================================================================== */
    131 
/* Forward declaration: aa_build_layout uses this before its definition. */
static u32 align_up_u32(u32 v, u32 align);
    133 
/* Byte sizes of the regions that make up one function's stack frame, together
 * with the resulting 16-byte-aligned total. Filled in by aa_build_layout. */
typedef struct AAFrameLayout {
  u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals
                   * + spills + sret/variadic) */
  u32 out_stack;  /* max outgoing-arg bytes across all calls in this function */
  u32 top_home;   /* Windows-variadic GP register home area, reserved between
                   * the saved pair and the incoming stack args so the
                   * plain-pointer va_list walks register then stack varargs as
                   * one contiguous block (0 on every other ABI). */
  u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes +
                   * out_stack) */
} AAFrameLayout;
    145 
    146 static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack,
    147                                             u32 top_home) {
    148   AAFrameLayout L;
    149   L.slot_bytes = slot_bytes;
    150   L.out_stack = out_stack;
    151   L.top_home = top_home;
    152   L.frame_size =
    153       align_up_u32(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + out_stack, 16u);
    154   return L;
    155 }
    156 
    157 /* FP-relative byte offsets. The saved-pair is at [fp]/[fp+8] in both the
    158  * top-record and bottom-record (fp_at_bottom) layouts, so these two are
    159  * layout-independent. The frame-size-dependent helpers — aa_fp_off_in_arg,
    160  * aa_fp_off_slot, aa_fp_off_tail_out_arg — branch on a->fp_at_bottom and are
    161  * defined after AANativeTarget (see aa_fp_off_* below aa_of). */
    162 static inline i32 aa_fp_off_saved_fp(void) { return 0; }
    163 static inline i32 aa_fp_off_saved_lr(void) { return 8; }
    164 
    165 /* SP-relative byte offsets. */
    166 static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; }
    167 static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) {
    168   return L->frame_size - AA_FRAME_SAVE_SIZE - L->top_home;
    169 }
    170 
    171 /* Frame slots and callee-save records are owned by the shared NativeFrame
    172  * bookkeeping (cg/native_frame.h); these aliases keep the aa64-local spellings.
    173  */
/* aa64-local name for the shared frame-slot entry type. */
typedef NativeFrameSlotEntry AANativeSlot;
    175 
    176 /* Deferred in-function patches, all resolved in aa_func_end once the frame
    177  * layout (max_outgoing, callee-saves) is final. One growable list carries both
    178  * kinds; each entry patches a disjoint, fixed code position, so insertion order
    179  * is irrelevant. The prologue region is patched separately (exactly one per
    180  * function, fixed position) and is not a list entry. */
/* Discriminator for AAPatch: selects which deferred fix-up an entry describes
 * and which member of AAPatch.u is meaningful. */
typedef enum AAPatchKind {
  AA_PATCH_ALLOCA, /* single instr: add dst, sp, #max_outgoing */
  AA_PATCH_TAIL,   /* AA_TAIL_WORDS region: callee restores + frame + br/b */
} AAPatchKind;
    185 
/* One deferred in-function patch. `kind` selects the active member of `u`. */
typedef struct AAPatch {
  AAPatchKind kind;
  u32 pos; /* code position this entry patches */
  union {
    u32 dst_reg;      /* AA_PATCH_ALLOCA */
    NativeLoc callee; /* AA_PATCH_TAIL */
  } u;
} AAPatch;
    194 
/* aa64-local name for the shared callee-save record type. */
typedef NativeFrameCalleeSave AACalleeSave;
    196 
/* State of the aa64 native backend. `base` must remain the first member:
 * aa_of() recovers this struct by casting a NativeTarget pointer. */
typedef struct AANativeTarget {
  NativeTarget base;
  SrcLoc loc; /* source location reported by aa_panic */
  const CGFuncDesc* func;

  /* Shared frame bookkeeping: slot table, cumulative offset, max-outgoing,
   * callee-save set, and the known_frame / has_alloca / frame_final flags. */
  NativeFrame frame;
  /* Final frame size, set once in aa_func_begin_known_frame when fp_at_bottom
   * is decided. Read by the fp-relative offset helpers in the bottom-record
   * layout (where slot/incoming-arg offsets depend on frame_size); meaningless
   * and unread on the single-pass path, which never sets fp_at_bottom. */
  u32 frame_size_final;
  u32 incoming_stack_size;
  /* Windows-variadic GP register home area size (gp_reg_count * gp_slot_size,
   * 64 today; 0 on every other ABI). When nonzero the function takes the fat
   * top-record layout and homes x0..x7 into [fp + AA_FRAME_SAVE_SIZE ..] so the
   * plain-pointer va_list can walk register then stack varargs contiguously. */
  u32 top_home_bytes;
  /* NOTE(review): presumably cursors over the incoming integer registers, FP
   * registers and stack bytes while parameters are assigned — confirm against
   * the parameter-lowering code. */
  u32 next_param_int;
  u32 next_param_fp;
  u32 next_param_stack;
  NativeFrameSlot sret_ptr_slot;
  NativeFrameSlot saved_tmp_slot;
  NativeFrameSlot va_gr_slot;
  NativeFrameSlot va_vr_slot;

  /* Growable list of deferred patches (see AAPatch): `npatches` entries in
   * use, `patches_cap` allocated. */
  AAPatch* patches;
  u32 npatches;
  u32 patches_cap;
  u32 nalloca; /* count of AA_PATCH_ALLOCA entries; gates slim prologue/frame */

  u32 func_start;
  u32 prologue_pos;
  u32 minimal_prologue_words; /* opt path: exact prologue length, else 0 */
  MCLabel epilogue_label;

  /* Set at func_end when this function qualifies for the slim prologue/epilogue
   * (Tier A: no body locals/spills, no callee-saves, no alloca, no outgoing
   * stack args, no sret/variadic). When set, the prologue patch and epilogue
   * emit a 2-insn `stp x29,x30,[sp,#-16]! ; mov x29,sp` and matching `ldp
   * x29,x30,[sp],#16 ; ret` instead of the fat 4+3-insn FP-frame form. */
  u8 slim_prologue;
  /* Set at func_end when frame_size - 16 fits stp's signed 7-bit scaled
   * immediate (frame_size <= 520). Skips the `add x17, sp, #(N-16)` scratch
   * materialization in the prologue (stp x29,x30,[sp,#N-16] instead) and
   * the matching `add x10, fp, #0` in the epilogue (ldp x29,x30,[sp,#N-16]
   * + add sp,sp,#N). Mutually exclusive with `slim_prologue` (Tier A wins
   * when both would apply) and `fp_at_bottom` (which wins for out_stack==0).
   * Now only reached for small frames with outgoing stack args (out_stack>0),
   * where the record cannot move to the bottom. Keeps the top-record layout. */
  u8 slim_small_frame;
  /* Set by aa_func_begin_known_frame for a small frame with callee-saves/locals
   * and no outgoing stack args: the frame record moves to the bottom of the
   * frame (fp = sp, `mov x29,sp`) so the sp adjustment folds into a pre-indexed
   * `stp x29,x30,[sp,#-N]!` entry and post-indexed `ldp x29,x30,[sp],#N` exit
   * (−2 insns/call vs slim_small_frame). Slots and callee-saves stack ABOVE the
   * record at positive fp offsets; incoming args sit at fp+frame_size; CFA =
   * fp+frame_size. The frame-size-dependent offsets are the reason this is only
   * available on the known-frame path (frame final before the body). Mutually
   * exclusive with slim_prologue (Tier A) and slim_small_frame; gated on
   * out_stack==0 && !has_alloca && frame_size <= 504. */
  u8 fp_at_bottom;
} AANativeTarget;
    261 
    262 static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; }
    263 
    264 /* Layout-aware FP-relative offsets. Every frame use site goes through these;
    265  * the fp_at_bottom test lives here and nowhere else.
    266  *
    267  *   top-record (default):  record near the top, fp anchored at the saved pair.
    268  *     incoming args at fp+16+b, slots below fp at -off.  CFA = fp+16.
    269  *   bottom-record (fp_at_bottom): record at the bottom, fp = sp.
    270  *     incoming args at fp+frame_size+b, slots above the record at
    271  *     frame_size-off (in [16, frame_size), never overlapping the 16-byte
    272  *     record since frame_size = align16(16+cum_off) >= 16+cum_off).
    273  *     CFA = fp+frame_size. */
    274 static inline i32 aa_fp_off_in_arg(const AANativeTarget* a, u32 byte_off) {
    275   /* top-record incoming args sit above the saved pair and the (usually empty)
    276    * Windows-variadic GP home area; bottom-record never carries a home area. */
    277   u32 base = a->fp_at_bottom ? a->frame_size_final
    278                              : AA_FRAME_SAVE_SIZE + a->top_home_bytes;
    279   return (i32)(base + byte_off);
    280 }
    281 static inline i32 aa_fp_off_slot(const AANativeTarget* a, u32 slot_off) {
    282   return a->fp_at_bottom ? (i32)a->frame_size_final - (i32)slot_off
    283                          : -(i32)slot_off;
    284 }
    285 /* Outgoing stack args on a tail call land in the caller's incoming-arg window —
    286  * the same physical address the tail-callee will read via aa_fp_off_in_arg.
    287  * Same helper, distinct name for site-side intent. */
    288 static inline i32 aa_fp_off_tail_out_arg(const AANativeTarget* a,
    289                                          u32 byte_off) {
    290   return aa_fp_off_in_arg(a, byte_off);
    291 }
    292 /* CFA = caller's sp, expressed as an fp-relative offset (fp+16 top-record,
    293  * fp+frame_size bottom-record). Named so the CFI emit site stays layout-blind.
    294  */
    295 static inline i32 aa_cfa_off(const AANativeTarget* a) {
    296   return a->fp_at_bottom
    297              ? (i32)a->frame_size_final
    298              : (i32)(AA_FRAME_SAVE_SIZE + a->top_home_bytes);
    299 }
    300 
    301 /* fp-relative offset of GP home slot `i` (Windows variadic only). The home area
    302  * sits just above the saved pair and just below the incoming stack args, so
    303  * slot gp_reg_count coincides with incoming-arg byte 0 (top-record only — a
    304  * function with a home area never takes a slim/bottom layout). */
    305 static inline i32 aa_fp_off_home_slot(u32 i) {
    306   return (i32)(AA_FRAME_SAVE_SIZE + i * 8u);
    307 }
    308 
    309 static void aa_panic(AANativeTarget* a, const char* msg) {
    310   compiler_panic(a->base.c, a->loc, "aarch64 native target: %s", msg);
    311 }
    312 
    313 /* Declared locally rather than pulling in debug/debug.h, keeping the
    314  * backend's dependency on the Debug producer to this one entry point —
    315  * same pattern as the x64/rv64 emit TUs (see arch/mc.h). */
/* Called by aa_emit32 with the section offset at which an instruction starts
 * and the emitter's current source location. */
extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
/* NOTE(review): presumably records a function's [begin_ofs, end_ofs) code
 * range within text_section — confirm against the Debug producer. */
extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
                                u32 end_ofs);
    319 
    320 static void aa_emit32(MCEmitter* mc, u32 word) {
    321   u8 b[4];
    322   u32 ofs = obj_pos(mc->obj, mc->section_id);
    323   wr_u32_le(b, word);
    324   mc->emit_bytes(mc, b, sizeof b);
    325   /* Record one line-table row per instruction start (no-op when not -g). */
    326   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    327 }
    328 
    329 static void aa_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
    330   u8 b[4];
    331   wr_u32_le(b, word);
    332   obj_patch(obj, sec, off, b, sizeof b);
    333 }
    334 
    335 static u32 align_up_u32(u32 v, u32 align) {
    336   u32 mask = align ? align - 1u : 0u;
    337   return (v + mask) & ~mask;
    338 }
    339 
    340 static u32 type_size32(NativeTarget* t, KitCgTypeId type) {
    341   u64 n = type ? cg_type_size(t->c, type) : 8u;
    342   if (n == 0) n = 8u;
    343   if (n > 16u)
    344     compiler_panic(t->c, (SrcLoc){0, 0, 0},
    345                    "aarch64 native target: scalar too large");
    346   return (u32)n;
    347 }
    348 
    349 static u32 type_align32(NativeTarget* t, KitCgTypeId type) {
    350   u64 n = type ? cg_type_align(t->c, type) : 8u;
    351   if (n == 0) n = 1u;
    352   if (n > 16u) n = 16u;
    353   return (u32)n;
    354 }
    355 
    356 static u32 size_idx(u32 n) {
    357   if (n <= 1u) return 0u;
    358   if (n <= 2u) return 1u;
    359   if (n <= 4u) return 2u;
    360   return 3u;
    361 }
    362 
    363 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; }
    364 
    365 static int loc_is_64(NativeTarget* t, NativeLoc loc) {
    366   return type_size32(t, loc.type) == 8u || cg_type_is_ptr(t->c, loc.type);
    367 }
    368 
    369 /* native_loc_is_fp is shared in native_target.h. */
    370 
    371 static __attribute__((unused)) int aa_use_got_for_sym(NativeTarget* t,
    372                                                       ObjSymId sym) {
    373   return obj_symbol_extern_via_got(t->c, t->obj, sym);
    374 }
    375 
    376 static __attribute__((unused)) RelocKind aa_ldst_reloc_for_size(u32 size) {
    377   switch (size) {
    378     case 0:
    379       return R_AARCH64_LDST8_ABS_LO12_NC;
    380     case 1:
    381       return R_AARCH64_LDST16_ABS_LO12_NC;
    382     case 2:
    383       return R_AARCH64_LDST32_ABS_LO12_NC;
    384     case 3:
    385       return R_AARCH64_LDST64_ABS_LO12_NC;
    386     default:
    387       return R_AARCH64_LDST64_ABS_LO12_NC;
    388   }
    389 }
    390 
    391 static u32 aa_load_imm_words(u32* out, u32 cap, u32 sf, u32 rd, i64 imm) {
    392   u64 v = (u64)imm;
    393   u32 words = sf ? 4u : 2u;
    394   u32 n = 0;
    395   for (u32 i = 0; i < words; ++i) {
    396     u32 part = (u32)((v >> (i * 16u)) & 0xffffu);
    397     if (!part && n) continue;
    398     if (n >= cap) return 0;
    399     out[n] = n ? aa64_movk(sf, rd, part, i) : aa64_movz(sf, rd, part, i);
    400     ++n;
    401   }
    402   if (!n) {
    403     if (!cap) return 0;
    404     out[n++] = aa64_movz(sf, rd, 0, 0);
    405   }
    406   return n;
    407 }
    408 
    409 static void aa_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) {
    410   u32 words[4];
    411   u32 n = aa_load_imm_words(words, 4u, sf, rd, imm);
    412   for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]);
    413 }
    414 
    415 static void aa_emit_add_imm(AANativeTarget* a, u32 rd, u32 rn, i32 off) {
    416   u32 imm12, sh;
    417   MCEmitter* mc = a->base.mc;
    418   if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) {
    419     aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh));
    420     return;
    421   }
    422   if (off < 0 && aa64_addsub_imm_fits(-(i64)off, &imm12, &sh)) {
    423     aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh));
    424     return;
    425   }
    426   aa_emit_load_imm(mc, 1, rd, off);
    427   aa_emit32(mc, aa64_add(1, rd, rn, rd));
    428 }
    429 
    430 static __attribute__((unused)) void aa_emit_add_i64(AANativeTarget* a, u32 rd,
    431                                                     u32 rn, i64 off) {
    432   u32 imm12, sh;
    433   MCEmitter* mc = a->base.mc;
    434   if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) {
    435     aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh));
    436     return;
    437   }
    438   if (off < 0 && aa64_addsub_imm_fits(-off, &imm12, &sh)) {
    439     aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh));
    440     return;
    441   }
    442   aa_emit_load_imm(mc, 1, rd, off);
    443   aa_emit32(mc, aa64_add(1, rd, rn, rd));
    444 }
    445 
    446 static u32 aa_ldur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) {
    447   return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size,
    448                                               .V = v,
    449                                               .opc = AA64_LDST_OPC_LDR,
    450                                               .imm9 = (u32)simm9 & 0x1ffu,
    451                                               .Rn = rn,
    452                                               .Rt = rt});
    453 }
    454 
    455 static u32 aa_stur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) {
    456   return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size,
    457                                               .V = v,
    458                                               .opc = AA64_LDST_OPC_STR,
    459                                               .imm9 = (u32)simm9 & 0x1ffu,
    460                                               .Rn = rn,
    461                                               .Rt = rt});
    462 }
    463 
    464 static u32 aa_ldr_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) {
    465   u32 sc = byte_off >> size;
    466   return aa64_ldst_uimm_pack((AA64LdStUimm){.size = size,
    467                                             .V = v,
    468                                             .opc = AA64_LDST_OPC_LDR,
    469                                             .imm12 = sc,
    470                                             .Rn = rn,
    471                                             .Rt = rt});
    472 }
    473 
    474 static u32 aa_str_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) {
    475   u32 sc = byte_off >> size;
    476   return aa64_ldst_uimm_pack((AA64LdStUimm){.size = size,
    477                                             .V = v,
    478                                             .opc = AA64_LDST_OPC_STR,
    479                                             .imm12 = sc,
    480                                             .Rn = rn,
    481                                             .Rt = rt});
    482 }
    483 
    484 static u32 aa_ldr_uimm(u32 size, u32 rt, u32 rn, u32 byte_off) {
    485   return aa_ldr_uimm_v(size, 0, rt, rn, byte_off);
    486 }
    487 
    488 static __attribute__((unused)) u32 aa_str_uimm(u32 size, u32 rt, u32 rn,
    489                                                u32 byte_off) {
    490   return aa_str_uimm_v(size, 0, rt, rn, byte_off);
    491 }
    492 
    493 static __attribute__((unused)) u32 aa_ldst_regoff_v(u32 size, u32 v, u32 load,
    494                                                     u32 rt, u32 rn, u32 rm,
    495                                                     u32 scaled) {
    496   return ((size & 3u) << 30) | 0x38200800u | ((v & 1u) << 26) |
    497          ((load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR) << 22) |
    498          ((rm & 0x1fu) << 16) | (3u << 13) | ((scaled & 1u) << 12) |
    499          ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    500 }
    501 
    502 static __attribute__((unused)) u32 aa_ldr_lit64(u32 rt, u32 imm19) {
    503   return 0x58000000u | ((imm19 & 0x7ffffu) << 5) | (rt & 0x1fu);
    504 }
    505 
    506 static __attribute__((unused)) u32 aa_mrs_tpidr_el0(u32 rt) {
    507   return 0xd53bd040u | (rt & 0x1fu);
    508 }
    509 
    510 static u32 aa_fp_bin(u32 op, u32 is_double, u32 rd, u32 rn, u32 rm) {
    511   return (is_double ? 0x1e600000u : 0x1e200000u) | op | ((rm & 0x1fu) << 16) |
    512          ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    513 }
    514 
    515 static u32 aa_fcmp(u32 is_double, u32 rn, u32 rm) {
    516   return (is_double ? 0x1e602000u : 0x1e202000u) | ((rm & 0x1fu) << 16) |
    517          ((rn & 0x1fu) << 5);
    518 }
    519 
    520 static u32 aa_fneg(u32 is_double, u32 rd, u32 rn) {
    521   return (is_double ? 0x1e614000u : 0x1e214000u) | ((rn & 0x1fu) << 5) |
    522          (rd & 0x1fu);
    523 }
    524 
    525 static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) {
    526   return (is_double ? 0x1e604000u : 0x1e204000u) | ((rn & 0x1fu) << 5) |
    527          (rd & 0x1fu);
    528 }
    529 
    530 /* MOV Vd.16B, Vn.16B (alias of ORR Vd.16B, Vn.16B, Vn.16B): a full 128-bit
    531  * SIMD register copy. Used to move binary128 / long double values, which fmov
    532  * (scalar, max 64-bit) would truncate. */
    533 static u32 aa_mov_vec16(u32 rd, u32 rn) {
    534   return 0x4ea01c00u | ((rn & 0x1fu) << 16) | ((rn & 0x1fu) << 5) |
    535          (rd & 0x1fu);
    536 }
    537 
    538 static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
    539   return (is64_src ? 0x9e220000u : 0x1e220000u) |
    540          (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu);
    541 }
    542 
    543 static u32 aa_ucvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
    544   return (is64_src ? 0x9e230000u : 0x1e230000u) |
    545          (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu);
    546 }
    547 
    548 static u32 aa_fcvtzs(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
    549   return (is64_dst ? 0x9e380000u : 0x1e380000u) |
    550          (is_double_src ? 0x00400000u : 0) | ((fn & 0x1fu) << 5) | (rd & 0x1fu);
    551 }
    552 
    553 static u32 aa_fcvtzu(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) {
    554   return (is64_dst ? 0x9e390000u : 0x1e390000u) |
    555          (is_double_src ? 0x00400000u : 0) | ((fn & 0x1fu) << 5) | (rd & 0x1fu);
    556 }
    557 
    558 static u32 aa_fcvt_d_s(u32 rd, u32 rn) {
    559   return 0x1e22c000u | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    560 }
    561 
    562 static u32 aa_fcvt_s_d(u32 rd, u32 rn) {
    563   return 0x1e624000u | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    564 }
    565 
    566 static u32 aa_fmov_gpr_to_fp(u32 is64, u32 fd, u32 rn) {
    567   return (is64 ? 0x9e670000u : 0x1e270000u) | ((rn & 0x1fu) << 5) |
    568          (fd & 0x1fu);
    569 }
    570 
    571 static u32 aa_fmov_fp_to_gpr(u32 is64, u32 rd, u32 fn) {
    572   return (is64 ? 0x9e660000u : 0x1e260000u) | ((fn & 0x1fu) << 5) |
    573          (rd & 0x1fu);
    574 }
    575 
    576 static u32 aa_clz(u32 sf, u32 rd, u32 rn) {
    577   return (sf ? 0xdac01000u : 0x5ac01000u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    578 }
    579 
    580 static u32 aa_rbit(u32 sf, u32 rd, u32 rn) {
    581   return (sf ? 0xdac00000u : 0x5ac00000u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    582 }
    583 
    584 static u32 aa_rev(u32 sf, u32 rd, u32 rn) {
    585   return (sf ? 0xdac00c00u : 0x5ac00800u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    586 }
    587 
    588 static u32 aa_sbfm(u32 sf, u32 rd, u32 rn, u32 immr, u32 imms) {
    589   return (sf ? 0x93400000u : 0x13000000u) | ((immr & 0x3fu) << 16) |
    590          ((imms & 0x3fu) << 10) | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    591 }
    592 
    593 static __attribute__((unused)) u32 aa_ubfm(u32 sf, u32 rd, u32 rn, u32 immr,
    594                                            u32 imms) {
    595   return (sf ? 0xd3400000u : 0x53000000u) | ((immr & 0x3fu) << 16) |
    596          ((imms & 0x3fu) << 10) | ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    597 }
    598 
    599 static __attribute__((unused)) u32 aa_ldaxr(u32 size, u32 rt, u32 rn) {
    600   return (size << 30) | 0x085ffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    601 }
    602 
    603 static __attribute__((unused)) u32 aa_ldxr(u32 size, u32 rt, u32 rn) {
    604   return (size << 30) | 0x085f7c00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    605 }
    606 
    607 static __attribute__((unused)) u32 aa_stlxr(u32 size, u32 rs, u32 rt, u32 rn) {
    608   return (size << 30) | 0x0800fc00u | ((rs & 0x1fu) << 16) |
    609          ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    610 }
    611 
    612 static __attribute__((unused)) u32 aa_stxr(u32 size, u32 rs, u32 rt, u32 rn) {
    613   return (size << 30) | 0x08007c00u | ((rs & 0x1fu) << 16) |
    614          ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    615 }
    616 
    617 static __attribute__((unused)) u32 aa_ldar(u32 size, u32 rt, u32 rn) {
    618   return (size << 30) | 0x08dffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    619 }
    620 
    621 static __attribute__((unused)) u32 aa_stlr(u32 size, u32 rt, u32 rn) {
    622   return (size << 30) | 0x089ffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu);
    623 }
    624 
    625 static u32 aa_umaddl(u32 rd, u32 rn, u32 rm, u32 ra) {
    626   return 0x9ba00000u | ((rm & 0x1fu) << 16) | ((ra & 0x1fu) << 10) |
    627          ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    628 }
    629 
    630 static u32 aa_smaddl(u32 rd, u32 rn, u32 rm, u32 ra) {
    631   return 0x9b200000u | ((rm & 0x1fu) << 16) | ((ra & 0x1fu) << 10) |
    632          ((rn & 0x1fu) << 5) | (rd & 0x1fu);
    633 }
    634 
    635 static u32 aa_smulh(u32 rd, u32 rn, u32 rm) {
    636   return 0x9b407c00u | ((rm & 0x1fu) << 16) | ((rn & 0x1fu) << 5) |
    637          (rd & 0x1fu);
    638 }
    639 
    640 static u32 aa_umulh(u32 rd, u32 rn, u32 rm) {
    641   return 0x9bc07c00u | ((rm & 0x1fu) << 16) | ((rn & 0x1fu) << 5) |
    642          (rd & 0x1fu);
    643 }
    644 
    645 static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) {
    646   return aa64_addsubsr_pack(
    647       (AA64AddSubSR){.sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd});
    648 }
    649 
    650 static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, NativeLoc rhs);
    651 
    652 static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) {
    653   return aa64_addsubsr_pack((AA64AddSubSR){.sf = 1,
    654                                            .op = 0,
    655                                            .S = 0,
    656                                            .shift = 0,
    657                                            .Rm = rm,
    658                                            .imm6 = shift,
    659                                            .Rn = rn,
    660                                            .Rd = rd});
    661 }
    662 
    663 static u32 aa_cset(u32 sf, u32 rd, u32 cond) {
    664   return aa64_csinc_enc(sf, rd, AA64_ZR, AA64_ZR, cond ^ 1u);
    665 }
    666 
    667 static u32 cmp_cond(CmpOp op) {
    668   switch (op) {
    669     case CMP_EQ:
    670       return 0x0u;
    671     case CMP_NE:
    672       return 0x1u;
    673     case CMP_LT_U:
    674       return 0x3u;
    675     case CMP_LE_U:
    676       return 0x9u;
    677     case CMP_GT_U:
    678       return 0x8u;
    679     case CMP_GE_U:
    680       return 0x2u;
    681     case CMP_LT_S:
    682       return 0xbu;
    683     case CMP_LE_S:
    684       return 0xdu;
    685     case CMP_GT_S:
    686       return 0xcu;
    687     case CMP_GE_S:
    688       return 0xau;
    689     /* FP predicates after FCMP set NZCV as: a<b -> N; a==b -> Z,C; a>b -> C;
    690      * unordered -> C,V. Each maps to a single condition except CMP_ONE_F /
    691      * CMP_UEQ_F (synthesized with two instructions in aa_cmp/aa_cmp_branch,
    692      * which intercept them before calling cmp_cond). */
    693     case CMP_OEQ_F:
    694       return 0x0u; /* EQ */
    695     case CMP_OLT_F:
    696       return 0x4u; /* MI */
    697     case CMP_OLE_F:
    698       return 0x9u; /* LS */
    699     case CMP_OGT_F:
    700       return 0xcu; /* GT */
    701     case CMP_OGE_F:
    702       return 0xau; /* GE */
    703     case CMP_UNE_F:
    704       return 0x1u; /* NE  (unordered or not-equal) */
    705     case CMP_ULT_F:
    706       return 0xbu; /* LT  (unordered or less-than) */
    707     case CMP_ULE_F:
    708       return 0xdu; /* LE  (unordered or less-or-equal) */
    709     case CMP_UGT_F:
    710       return 0x8u; /* HI  (unordered or greater-than) */
    711     case CMP_UGE_F:
    712       return 0x2u; /* CS  (unordered or greater-or-equal) */
    713     default:
    714       return 0x0u;
    715   }
    716 }
    717 
    718 static AANativeSlot* aa_slot(AANativeTarget* a, NativeFrameSlot slot) {
    719   return native_frame_slot_at(&a->frame, slot);
    720 }
    721 
    722 static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out,
    723                          i32* off_out) {
    724   *base_out = AA_TMP0;
    725   *off_out = addr.offset;
    726   switch ((NativeAddrBaseKind)addr.base_kind) {
    727     case NATIVE_ADDR_BASE_REG:
    728       *base_out = addr.base.reg;
    729       return;
    730     case NATIVE_ADDR_BASE_FRAME: {
    731       AANativeSlot* s = aa_slot(a, addr.base.frame);
    732       *base_out = AA_FP;
    733       *off_out = aa_fp_off_slot(a, s->off) + addr.offset;
    734       return;
    735     }
    736     case NATIVE_ADDR_BASE_GLOBAL: {
    737       NativeLoc tmp;
    738       memset(&tmp, 0, sizeof tmp);
    739       tmp.kind = NATIVE_LOC_REG;
    740       tmp.cls = NATIVE_REG_INT;
    741       tmp.type = builtin_id(KIT_CG_BUILTIN_I64);
    742       tmp.v.reg = AA_TMP0;
    743       a->base.load_addr(&a->base, tmp, addr);
    744       *base_out = AA_TMP0;
    745       *off_out = 0;
    746       return;
    747     }
    748     default:
    749       aa_panic(a, "unsupported address base");
    750   }
    751 }
    752 
    753 static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off);
    754 static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off);
    755 
    756 static void aa_emit_mem_q(AANativeTarget* a, int load, NativeLoc reg,
    757                           NativeAddr addr) {
    758   u32 base, rt;
    759   i32 off;
    760   MCEmitter* mc = a->base.mc;
    761   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
    762     aa_panic(a, "unsupported q-register indexed memory access");
    763   aa_addr_base(a, addr, &base, &off);
    764   rt = loc_reg(reg);
    765   if (off >= 0 && (((u32)off & 15u) == 0) && ((u32)off >> 4) <= 0xfffu) {
    766     aa_emit32(mc, aa_ldst_q_uimm(load, rt, base, (u32)off));
    767     return;
    768   }
    769   if (off >= -256 && off <= 255) {
    770     aa_emit32(mc, aa_ldst_q_simm9(load, rt, base, off));
    771     return;
    772   }
    773   aa_emit_add_imm(a, AA_TMP1, base, off);
    774   aa_emit32(mc, aa_ldst_q_uimm(load, rt, AA_TMP1, 0));
    775 }
    776 
    777 static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg,
    778                         NativeAddr addr, MemAccess mem) {
    779   u32 base, rt, sz;
    780   i32 off;
    781   MCEmitter* mc = a->base.mc;
    782   rt = loc_reg(reg);
    783   sz = size_idx(mem.size
    784                     ? mem.size
    785                     : type_size32(&a->base, reg.type ? reg.type : mem.type));
    786   if (native_loc_is_fp(reg) &&
    787       (mem.size
    788            ? mem.size
    789            : type_size32(&a->base, reg.type ? reg.type : mem.type)) == 16u) {
    790     aa_emit_mem_q(a, load, reg, addr);
    791     return;
    792   }
    793   if (native_loc_is_fp(reg) && sz < 2u) sz = 2u;
    794   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
    795       addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
    796     i64 addend = addr.base.global.addend + (i64)addr.offset;
    797     u32 scratch = (!load && rt == AA_TMP0) ? AA_TMP1 : AA_TMP0;
    798     u32 pos = mc->pos(mc);
    799     if (aa_use_got_for_sym(&a->base, addr.base.global.sym)) {
    800       aa_emit32(mc, aa64_adrp(scratch, 0, 0));
    801       mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_GOT_PAGE,
    802                         addr.base.global.sym, 0, 0, 0);
    803       pos = mc->pos(mc);
    804       aa_emit32(mc, aa_ldr_uimm(3, scratch, scratch, 0));
    805       mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_LD64_GOT_LO12_NC,
    806                         addr.base.global.sym, 0, 0, 0);
    807       if (addend) aa_emit_add_i64(a, scratch, scratch, addend);
    808       aa_emit32(mc, load
    809                         ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, scratch, 0)
    810                         : aa_stur_v(sz, native_loc_is_fp(reg), rt, scratch, 0));
    811       return;
    812     }
    813     aa_emit32(mc, aa64_adrp(scratch, 0, 0));
    814     mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21,
    815                       addr.base.global.sym, addend, 0, 0);
    816     pos = mc->pos(mc);
    817     aa_emit32(mc,
    818               load ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0)
    819                    : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0));
    820     mc->emit_reloc_at(mc, mc->section_id, pos, aa_ldst_reloc_for_size(sz),
    821                       addr.base.global.sym, addend, 0, 0);
    822     return;
    823   }
    824   aa_addr_base(a, addr, &base, &off);
    825   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) {
    826     u32 use_base = base;
    827     u32 scaled = 0;
    828     if (addr.index_kind != NATIVE_ADDR_INDEX_REG)
    829       aa_panic(a, "unsupported address index");
    830     if (off) {
    831       use_base = AA_TMP1;
    832       aa_emit_add_imm(a, use_base, base, off);
    833     }
    834     if (addr.log2_scale == 0) {
    835       scaled = 0;
    836     } else if (addr.log2_scale == sz) {
    837       scaled = 1;
    838     } else {
    839       aa_panic(a, "unsupported memory address scale");
    840     }
    841     aa_emit32(mc, aa_ldst_regoff_v(sz, native_loc_is_fp(reg), load, rt,
    842                                    use_base, addr.index.reg, scaled));
    843     return;
    844   }
    845   if (off >= 0 && (((u32)off & ((1u << sz) - 1u)) == 0) &&
    846       ((u32)off >> sz) <= 0xfffu) {
    847     aa_emit32(
    848         mc, load
    849                 ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off)
    850                 : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off));
    851     return;
    852   }
    853   if (off >= -256 && off <= 255) {
    854     aa_emit32(mc, load ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, base, off)
    855                        : aa_stur_v(sz, native_loc_is_fp(reg), rt, base, off));
    856     return;
    857   }
    858   aa_emit_add_imm(a, AA_TMP1, base, off);
    859   aa_emit32(mc, load ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, AA_TMP1, 0)
    860                      : aa_stur_v(sz, native_loc_is_fp(reg), rt, AA_TMP1, 0));
    861 }
    862 
    863 static NativeAllocClass aa_class_for_type(NativeTarget* t, KitCgTypeId type) {
    864   if (type && cg_type_is_float(t->c, type) && cg_type_size(t->c, type) <= 8u)
    865     return NATIVE_REG_FP;
    866   return NATIVE_REG_INT;
    867 }
    868 
    869 static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr,
    870                          MemAccess mem) {
    871   u32 sz;
    872   (void)t;
    873   if (!addr) return 0;
    874   if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return 1;
    875   if (addr->index_kind != NATIVE_ADDR_INDEX_REG) return 0;
    876   if (addr->log2_scale == 0) return 1;
    877   sz = size_idx(mem.size ? mem.size : 8u);
    878   return addr->log2_scale == sz;
    879 }
    880 
    881 /* True if `mul Rd, Rn, #c` can be replaced by a single non-mul aarch64
    882  * instruction using only Rn as a source (no extra scratch reg). Constants
    883  * that match: 0, 1, -1, +/-2^k, 2^k+1, 1-2^k for k in [1..width-1]. The
    884  * shift exponent must fit imm6 for the operand width (width = 32 if !sf
    885  * else 64). The emit side is aa_emit_mul_const_imm. */
    886 static int aa64_imul_strength_reducible(u32 sf, i64 imm) {
    887   u32 max_sh = sf ? 63u : 31u;
    888   u64 a;
    889   if (imm == 0 || imm == 1 || imm == -1) return 1;
    890   /* +2^k */
    891   a = (u64)imm;
    892   if (imm > 0 && (a & (a - 1u)) == 0u) {
    893     u32 k = (u32)__builtin_ctzll(a);
    894     return k <= max_sh;
    895   }
    896   /* -2^k */
    897   if (imm < 0) {
    898     a = (u64)(-imm);
    899     if (a && (a & (a - 1u)) == 0u) {
    900       u32 k = (u32)__builtin_ctzll(a);
    901       return k >= 1u && k <= max_sh;
    902     }
    903   }
    904   /* 2^k + 1 (k >= 1, so c >= 3) */
    905   if (imm >= 3) {
    906     u64 m = (u64)(imm - 1);
    907     if ((m & (m - 1u)) == 0u) {
    908       u32 k = (u32)__builtin_ctzll(m);
    909       return k >= 1u && k <= max_sh;
    910     }
    911   }
    912   /* 1 - 2^k (k >= 1, so c <= -1) */
    913   if (imm <= -1) {
    914     u64 m = (u64)(1 - imm);
    915     if (m && (m & (m - 1u)) == 0u) {
    916       u32 k = (u32)__builtin_ctzll(m);
    917       return k >= 1u && k <= max_sh;
    918     }
    919   }
    920   return 0;
    921 }
    922 
    923 /* Which constant operands the backend can fold directly into an instruction
    924  * (so the optimizer can leave them as immediates instead of materializing a
    925  * register). Currently: add/sub/cmp 12-bit immediates (optionally <<12),
    926  * any value for a plain register move (movz/movk synthesizes it), and
    927  * strength-reducible mul constants (handled in aa_binop via shift / shifted
    928  * add or sub). */
    929 static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
    930                         KitCgTypeId type, i64 imm) {
    931   u32 imm12, sh;
    932   switch (use) {
    933     case NATIVE_IMM_BINOP:
    934       if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB)
    935         return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
    936       if ((BinOp)op == BO_IMUL) {
    937         u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
    938         return aa64_imul_strength_reducible(sf, imm);
    939       }
    940       /* LSL/LSR/ASR #imm via the UBFM/SBFM aliases: shift count in range. */
    941       if ((BinOp)op == BO_SHL || (BinOp)op == BO_SHR_S ||
    942           (BinOp)op == BO_SHR_U) {
    943         u32 bits = type_size32(t, type) == 8u ? 64u : 32u;
    944         return imm >= 0 && (u64)imm < (u64)bits;
    945       }
    946       /* AND/ORR/EOR #bitmask: encodable as an AArch64 logical immediate. */
    947       if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) {
    948         u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
    949         u32 N, immr, imms;
    950         return aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms);
    951       }
    952       return 0;
    953     case NATIVE_IMM_CMP:
    954       /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a
    955        * non-negative immediate. */
    956       return imm >= 0 && aa64_addsub_imm_fits(imm, &imm12, &sh);
    957     case NATIVE_IMM_ADDR_OFFSET:
    958       return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
    959     case NATIVE_IMM_MOVE:
    960       return 1;
    961   }
    962   return 0;
    963 }
    964 
    965 static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) {
    966   if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return;
    967   if (addr->index_kind != NATIVE_ADDR_INDEX_REG)
    968     aa_panic(a, "unsupported address index");
    969   if (addr->log2_scale > 4u) aa_panic(a, "unsupported address scale");
    970   aa_emit32(a->base.mc, aa_add_lsl(rd, rd, addr->index.reg, addr->log2_scale));
    971 }
    972 
    973 static void aa_materialize_frame_index(AANativeTarget* a, NativeAddr* addr,
    974                                        u32 avoid_reg) {
    975   NativeAddr load;
    976   NativeLoc idx;
    977   MemAccess mem;
    978   u32 reg;
    979   if (addr->index_kind != NATIVE_ADDR_INDEX_FRAME_VALUE) return;
    980   reg = avoid_reg == AA_TMP1 ? AA_TMP0 : AA_TMP1;
    981   memset(&load, 0, sizeof load);
    982   load.base_kind = NATIVE_ADDR_BASE_FRAME;
    983   load.base.frame = addr->index.frame;
    984   load.base_type =
    985       addr->index_type ? addr->index_type : builtin_id(KIT_CG_BUILTIN_I64);
    986   memset(&idx, 0, sizeof idx);
    987   idx.kind = NATIVE_LOC_REG;
    988   idx.cls = NATIVE_REG_INT;
    989   idx.type = load.base_type;
    990   idx.v.reg = reg;
    991   memset(&mem, 0, sizeof mem);
    992   mem.type = load.base_type;
    993   mem.size = 8;
    994   mem.align = 8;
    995   aa_emit_mem(a, 1, idx, load, mem);
    996   addr->index_kind = NATIVE_ADDR_INDEX_REG;
    997   addr->index.reg = reg;
    998 }
    999 
   1000 static NativeLoc native_loc_reg(KitCgTypeId type, NativeAllocClass cls,
   1001                                 Reg reg);
   1002 
   1003 static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off) {
   1004   return aa64_ldst_uimm_pack((AA64LdStUimm){.size = 0,
   1005                                             .V = 1,
   1006                                             .opc = load ? 3u : 2u,
   1007                                             .imm12 = byte_off >> 4,
   1008                                             .Rn = rn,
   1009                                             .Rt = rt});
   1010 }
   1011 
   1012 static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) {
   1013   return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = 0,
   1014                                               .V = 1,
   1015                                               .opc = load ? 3u : 2u,
   1016                                               .imm9 = (u32)byte_off & 0x1ffu,
   1017                                               .Rn = rn,
   1018                                               .Rt = rt});
   1019 }
   1020 
   1021 static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg,
   1022                             NativeFrameSlot slot, u32 offset) {
   1023   AANativeSlot* s = aa_slot(a, slot);
   1024   i32 off = aa_fp_off_slot(a, s->off) + (i32)offset;
   1025   MCEmitter* mc = a->base.mc;
   1026   if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) {
   1027     aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off));
   1028     return;
   1029   }
   1030   if (off >= -256 && off <= 255) {
   1031     aa_emit32(mc, aa_ldst_q_simm9(load, qreg, AA_FP, off));
   1032     return;
   1033   }
   1034   aa_emit_add_imm(a, AA_TMP1, AA_FP, off);
   1035   aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_TMP1, 0));
   1036 }
   1037 
   1038 /* Reserve the variadic register-save-area frame slots (gp then fp). Split from
   1039  * the store emission so the known-frame path can fix the full frame — including
   1040  * these slots — before the prologue, then emit the stores after it. */
   1041 static void aa_reserve_variadic_reg_saves(AANativeTarget* a) {
   1042   NativeFrameSlotDesc sd;
   1043   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   1044   ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi);
   1045   if (vai.kind != ABI_VA_LIST_AAPCS64) return;
   1046   memset(&sd, 0, sizeof sd);
   1047   sd.type = i64;
   1048   sd.size = vai.gp_reg_count * vai.gp_slot_size;
   1049   sd.align = 8;
   1050   sd.kind = NATIVE_FRAME_SLOT_SAVE;
   1051   a->va_gr_slot = a->base.frame_slot(&a->base, &sd);
   1052   sd.size = vai.fp_reg_count * vai.fp_slot_size;
   1053   sd.align = 16;
   1054   a->va_vr_slot = a->base.frame_slot(&a->base, &sd);
   1055 }
   1056 
   1057 /* Emit the stores into the variadic register-save area. For AAPCS64 these land
   1058  * in the reserved gr/vr frame slots (aa_reserve_variadic_reg_saves); for the
   1059  * Windows GP home area they land in [fp + AA_FRAME_SAVE_SIZE ..], the
   1060  * top-of-frame block contiguous with the incoming stack args. */
   1061 static void aa_emit_variadic_reg_save_stores(AANativeTarget* a) {
   1062   NativeAddr addr;
   1063   MemAccess mem;
   1064   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   1065   ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi);
   1066   if (vai.kind == ABI_VA_LIST_POINTER && a->top_home_bytes) {
   1067     /* Windows: home x0..x{gp_reg_count-1} so the plain-pointer va_list walks
   1068      * register then stack varargs as one block. The named leading registers are
   1069      * homed too (harmless): va_start skips past them. */
   1070     memset(&mem, 0, sizeof mem);
   1071     mem.type = i64;
   1072     mem.size = 8;
   1073     mem.align = 8;
   1074     memset(&addr, 0, sizeof addr);
   1075     addr.base_kind = NATIVE_ADDR_BASE_REG;
   1076     addr.base.reg = AA_FP;
   1077     addr.base_type = i64;
   1078     for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) {
   1079       NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r);
   1080       addr.offset = aa_fp_off_home_slot(r);
   1081       aa_emit_mem(a, 0, src, addr, mem);
   1082     }
   1083     return;
   1084   }
   1085   if (vai.kind != ABI_VA_LIST_AAPCS64) return;
   1086   memset(&mem, 0, sizeof mem);
   1087   mem.type = i64;
   1088   mem.size = 8;
   1089   mem.align = 8;
   1090   memset(&addr, 0, sizeof addr);
   1091   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1092   addr.base.frame = a->va_gr_slot;
   1093   addr.base_type = i64;
   1094   for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) {
   1095     NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r);
   1096     addr.offset = (i32)(r * vai.gp_slot_size);
   1097     aa_emit_mem(a, 0, src, addr, mem);
   1098   }
   1099   for (u32 r = 0; r < vai.fp_reg_count && r < 8u; ++r)
   1100     aa_emit_q_frame(a, 0, r, a->va_vr_slot, r * vai.fp_slot_size);
   1101 }
   1102 
   1103 static void aa_emit_entry_saves(AANativeTarget* a);
   1104 
   1105 /* Per-function state reset + function-symbol / cfi / prologue-anchor setup
   1106  * shared by both entry points (aa_func_begin for the single-pass path,
   1107  * aa_func_begin_known_frame for the optimizer path). Emits no prologue. */
   1108 static void aa_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   1109   AANativeTarget* a = aa_of(t);
   1110   MCEmitter* mc = t->mc;
   1111   a->func = fd;
   1112   /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
   1113    * callee-save set, and known_frame/has_alloca/frame_final. cum_off counts
   1114    * frame-slot bytes below fp; the saved fp/lr pair (16 bytes at [fp, fp+8]) is
   1115    * *not* part of it — aa_build_layout adds it in aa_func_end. */
   1116   native_frame_reset(&a->frame);
   1117   a->incoming_stack_size = 0;
   1118   a->next_param_int = 0;
   1119   a->next_param_fp = 0;
   1120   /* 0-based byte cursor for incoming stack args (also reported as the
   1121    * caller's incoming_stack_size for tail-call realizability). bind_param
   1122    * forms its fp-relative address via aa_fp_off_in_arg(next_param_stack),
   1123    * which adds the saved-pair offset. */
   1124   a->next_param_stack = 0;
   1125   a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
   1126   a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
   1127   a->va_gr_slot = NATIVE_FRAME_SLOT_NONE;
   1128   a->va_vr_slot = NATIVE_FRAME_SLOT_NONE;
   1129   a->npatches = 0;
   1130   a->nalloca = 0;
   1131   a->slim_prologue = 0;
   1132   a->slim_small_frame = 0;
   1133   a->fp_at_bottom = 0;
   1134   a->frame_size_final = 0;
   1135   /* Windows variadic functions reserve a GP register home area at the top of
   1136    * the frame (just below the incoming stack args). The plain-pointer va_list
   1137    * then walks register-passed then stack-passed varargs as one block. Other
   1138    * ABIs leave gp_reg_count 0 here: Apple ARM64 routes all varargs to the
   1139    * stack, AAPCS64 uses a struct va_list with separate reg-save pointers. */
   1140   {
   1141     const ABIFuncInfo* fi = abi_cg_func_info(t->c->abi, fd->fn_type);
   1142     ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   1143     a->top_home_bytes = (fi && fi->variadic && vai.kind == ABI_VA_LIST_POINTER)
   1144                             ? vai.gp_reg_count * vai.gp_slot_size
   1145                             : 0u;
   1146   }
   1147   mc->set_section(mc, fd->text_section_id);
   1148   mc->emit_align(mc, 4, 0);
   1149   a->func_start = mc->pos(mc);
   1150   mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
   1151   if (mc->cfi_startproc) mc->cfi_startproc(mc);
   1152   a->prologue_pos = mc->pos(mc);
   1153   a->minimal_prologue_words = 0;
   1154   a->epilogue_label = mc->label_new(mc);
   1155 }
   1156 
   1157 /* Single-pass (NativeDirectTarget) entry point: the frame is not known up
   1158  * front, so reserve a worst-case prologue region (patched in aa_func_end once
   1159  * max_outgoing / callee-saves are final) and emit the entry saves now. */
   1160 static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   1161   AANativeTarget* a = aa_of(t);
   1162   MCEmitter* mc = t->mc;
   1163   aa_func_begin_common(t, fd);
   1164   for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu);
   1165   aa_emit_entry_saves(a);
   1166 }
   1167 
   1168 /* Reserve the entry-save frame slots: the sret-pointer home (x8) and, for
   1169  * variadic functions, the argument register-save area. Reserving is split from
   1170  * emitting so the known-frame path can fix the full frame before the prologue;
   1171  * the single-pass path runs both back to back via aa_emit_entry_saves. */
   1172 static void aa_reserve_entry_saves(AANativeTarget* a) {
   1173   NativeTarget* t = &a->base;
   1174   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   1175   if (abi && abi->has_sret) {
   1176     NativeFrameSlotDesc sd;
   1177     memset(&sd, 0, sizeof sd);
   1178     sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1179     sd.size = 8;
   1180     sd.align = 8;
   1181     sd.kind = NATIVE_FRAME_SLOT_SAVE;
   1182     a->sret_ptr_slot = t->frame_slot(t, &sd);
   1183   }
   1184   if (abi && abi->variadic) aa_reserve_variadic_reg_saves(a);
   1185 }
   1186 
   1187 /* Emit the entry-save stores (x8 → sret slot, then the variadic reg-save area).
   1188  * Slots must already be reserved (aa_reserve_entry_saves). */
   1189 static void aa_emit_entry_save_stores(AANativeTarget* a) {
   1190   NativeTarget* t = &a->base;
   1191   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   1192   if (abi && abi->has_sret) {
   1193     NativeAddr addr;
   1194     NativeLoc src;
   1195     MemAccess mem;
   1196     KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   1197     memset(&addr, 0, sizeof addr);
   1198     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1199     addr.base.frame = a->sret_ptr_slot;
   1200     addr.base_type = i64;
   1201     memset(&src, 0, sizeof src);
   1202     src.kind = NATIVE_LOC_REG;
   1203     src.cls = NATIVE_REG_INT;
   1204     src.type = i64;
   1205     src.v.reg = 8u;
   1206     memset(&mem, 0, sizeof mem);
   1207     mem.type = i64;
   1208     mem.size = 8;
   1209     mem.align = 8;
   1210     aa_emit_mem(a, 0, src, addr, mem);
   1211   }
   1212   if (abi && abi->variadic) aa_emit_variadic_reg_save_stores(a);
   1213 }
   1214 
   1215 /* Reserve + emit the entry saves back to back. Single-pass (NativeDirectTarget)
   1216  * path, where the prologue region is a reserved worst-case block and slot
   1217  * offsets need not be final before it. */
   1218 static void aa_emit_entry_saves(AANativeTarget* a) {
   1219   aa_reserve_entry_saves(a);
   1220   aa_emit_entry_save_stores(a);
   1221 }
   1222 
   1223 static void aa_note_frame_state(NativeTarget* t,
   1224                                 const NativeFramePatchState* state) {
   1225   AANativeTarget* a = aa_of(t);
   1226   if (state && state->max_outgoing > a->frame.max_outgoing)
   1227     a->frame.max_outgoing = state->max_outgoing;
   1228 }
   1229 
   1230 /* Reserve a save slot for each callee-saved register the allocator used. Runs
   1231  * before frame-slot mapping so these slots get the lowest offsets, keeping the
   1232  * prologue stores within stur's signed-9-bit range. The prologue/epilogue
   1233  * save/restore is emitted from this list in aa_patch_prologue / aa_func_end. */
   1234 static void aa_reserve_callee_saves(NativeTarget* t, const u32* used,
   1235                                     u32 nclasses) {
   1236   AANativeTarget* a = aa_of(t);
   1237   /* aa64 homes each callee-save in its own 8-byte frame slot (reserved before
   1238    * the body slots so they sit nearest fp, in stur range), so alloc_slots=1.
   1239    * Adjacent integer slots are later paired into stp/ldp. */
   1240   NativeFrameSaveSpec spec[NATIVE_REG_VEC + 1];
   1241   memset(spec, 0, sizeof spec);
   1242   spec[NATIVE_REG_INT].size = 8;
   1243   spec[NATIVE_REG_INT].align = 8;
   1244   spec[NATIVE_REG_INT].type = builtin_id(KIT_CG_BUILTIN_I64);
   1245   spec[NATIVE_REG_FP].size = 8;
   1246   spec[NATIVE_REG_FP].align = 8;
   1247   spec[NATIVE_REG_FP].type = builtin_id(KIT_CG_BUILTIN_F64);
   1248   native_frame_set_callee_saves(&a->frame, used, nclasses, spec,
   1249                                 NATIVE_REG_VEC + 1, 1);
   1250 }
   1251 
   1252 static MemAccess aa_mem_for_type(NativeTarget* t, KitCgTypeId type, u32 size);
   1253 static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
   1254                                   u32 cap, u32* n);
   1255 
   1256 static void aa_emit_callee_restores(AANativeTarget* a) {
   1257   u32 words[AA_PROLOGUE_WORDS];
   1258   u32 n = 0;
   1259   aa_words_callee_saves(a, 0, words, AA_PROLOGUE_WORDS, &n);
   1260   for (u32 i = 0; i < n; ++i) aa_emit32(a->base.mc, words[i]);
   1261 }
   1262 
   1263 static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n,
   1264                               u32 rd, i64 imm) {
   1265   u32 tmp[4];
   1266   u32 m = aa_load_imm_words(tmp, 4u, 1, rd, imm);
   1267   if (!m || *n + m > cap) aa_panic(a, "instruction patch too small");
   1268   for (u32 i = 0; i < m; ++i) words[(*n)++] = tmp[i];
   1269 }
   1270 
   1271 /* Windows large-frame stack probe. kit's prologue reserves the whole frame in
   1272  * one `sub sp, sp, #N`, but Windows grows a thread stack one guard page at a
   1273  * time: a sub that jumps SP more than a page past the guard page leaves the
   1274  * skipped pages uncommitted, and the first store into them faults (and, since
   1275  * SP itself is then in uncommitted memory, the fault can't even be delivered).
   1276  * Touch every page the frame spans, top-down, so each guard page commits in
   1277  * turn before the sub. Inlined (no external __chkstk symbol / no reloc in the
   1278  * patched prologue region); mirrors the linker's aa64_coff_chkstk body. Only
   1279  * x16/x17 are clobbered — the following sub-sp / saved-pair material re-derives
   1280  * both. Emitted only when frame_size > interval (one page). */
   1281 static void aa_words_stack_probe(AANativeTarget* a, u32* words, u32 cap, u32* n,
   1282                                  u32 frame_size, u32 interval) {
   1283   u32 imm12, sh;
   1284   if (!aa64_addsub_imm_fits(interval, &imm12, &sh))
   1285     aa_panic(a, "stack-probe interval not an addsub immediate");
   1286   /* x16 = frame_size ; x17 = sp */
   1287   aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
   1288   if (*n + 5u > cap) aa_panic(a, "instruction patch too small");
   1289   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); /* mov x17, sp */
   1290   /* loop: x17 -= page ; x16 -= page (sets flags) ; touch [x17] ; b.gt loop */
   1291   words[(*n)++] = aa64_sub_imm(1, AA_TMP1, AA_TMP1, imm12, sh);
   1292   words[(*n)++] = aa64_subs_imm12(1, AA_TMP0, AA_TMP0, imm12, sh);
   1293   words[(*n)++] = aa64_ldr64_uimm12(31, AA_TMP1, 0); /* ldr xzr, [x17] */
   1294   /* branch back to the `sub x17` three words above while x16 stays positive */
   1295   words[(*n)++] =
   1296       aa64_brcond_pack((AA64BrCond){.imm19 = (u32)(-3), .cond = 0xcu /* GT */});
   1297 }
   1298 
   1299 static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap,
   1300                                   u32* n, u32 frame_size) {
   1301   u32 imm12, sh;
   1302   if (aa64_addsub_imm_fits(frame_size, &imm12, &sh)) {
   1303     if (*n >= cap) aa_panic(a, "instruction patch too small");
   1304     words[(*n)++] = aa64_sub_imm(1, AA_SP, AA_SP, imm12, sh);
   1305     return;
   1306   }
   1307   aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
   1308   if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
   1309   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
   1310   words[(*n)++] = aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0);
   1311   words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0);
   1312 }
   1313 
   1314 /* Anchor fp at the AAPCS64 saved-pair address (= sp + saved-pair offset).
   1315  * The slim_prologue path achieves the same anchor in a single insn via
   1316  * `add x29, sp, #0` after the pre-decrement stp moves sp to the saved-pair. */
   1317 static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap,
   1318                                        u32* n, const AAFrameLayout* L) {
   1319   u32 imm12, sh;
   1320   u32 anchor = aa_sp_off_saved_pair(L);
   1321   if (aa64_addsub_imm_fits(anchor, &imm12, &sh)) {
   1322     if (*n >= cap) aa_panic(a, "instruction patch too small");
   1323     words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, imm12, sh);
   1324     return;
   1325   }
   1326   aa_words_load_imm(a, words, cap, n, AA_TMP0, anchor);
   1327   if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
   1328   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
   1329   words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0);
   1330 }
   1331 
   1332 /* x17 = address of the saved-pair slot (= sp + saved-pair offset). Used by
   1333  * the fat prologue to materialize the stp destination when the offset
   1334  * doesn't fit stp's signed-7-bit-scaled immediate. */
   1335 static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
   1336                                      u32* n, const AAFrameLayout* L) {
   1337   u32 save_off = aa_sp_off_saved_pair(L);
   1338   u32 imm12, sh;
   1339   if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) {
   1340     if (*n >= cap) aa_panic(a, "instruction patch too small");
   1341     words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, imm12, sh);
   1342     return;
   1343   }
   1344   aa_words_load_imm(a, words, cap, n, AA_TMP0, save_off);
   1345   if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
   1346   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
   1347   words[(*n)++] = aa64_add(1, AA_TMP1, AA_TMP1, AA_TMP0);
   1348 }
   1349 
   1350 static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
   1351                                    u32* n, const AAFrameLayout* L) {
   1352   if (!L->frame_size) return;
   1353   if (a->slim_prologue) {
   1354     if (*n + 1u > cap) aa_panic(a, "instruction patch too small");
   1355     /* `ldp x29, x30, [sp], #16` — pop saved pair, restore sp. */
   1356     words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2);
   1357     return;
   1358   }
   1359   if (a->fp_at_bottom) {
   1360     /* Bottom-record fold: `ldp x29,x30,[sp],#N` reloads the pair from the
   1361      * bottom AND releases the whole frame in one insn. Callee-saves were
   1362      * already restored by aa_emit_callee_restores. -1 insn vs slim_small_frame
   1363      * (which needs a separate `add sp`). N <= 504 holds the post-index imm. */
   1364     if (*n + 1u > cap) aa_panic(a, "instruction patch too small");
   1365     words[(*n)++] =
   1366         aa64_ldp64_post(AA_FP, AA_LR, AA_SP, (i32)(L->frame_size / 8u));
   1367     return;
   1368   }
   1369   if (a->slim_small_frame) {
   1370     /* `ldp x29,x30,[sp,#saved_pair] ; add sp,sp,#frame_size` — load through
   1371      * sp avoids the fat path's `add x10, fp, #0` scratch, and the subsequent
   1372      * `add sp` unwinds without depending on the (now-clobbered) old fp. */
   1373     u32 save_off = aa_sp_off_saved_pair(L);
   1374     u32 imm12, sh;
   1375     if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
   1376     words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
   1377     if (!aa64_addsub_imm_fits(L->frame_size, &imm12, &sh))
   1378       aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
   1379     words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh);
   1380     return;
   1381   }
   1382   if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
   1383   /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then restore
   1384    * sp to fp + CFA-offset (= caller's original sp = CFA). The CFA offset is
   1385    * AA_FRAME_SAVE_SIZE normally, plus the Windows-variadic GP home area when
   1386    * present. */
   1387   words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
   1388   words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0);
   1389   words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, (u32)aa_cfa_off(a), 0);
   1390 }
   1391 
   1392 /* Emit callee-save store (save=1) or restore (save=0) words into `words`,
   1393  * pairing adjacent integer registers into a single stp/ldp.
   1394  * reserve_callee_saves allocates consecutive 8-byte slots in order, so
   1395  * callee_saves[i] sits 8 bytes above callee_saves[i+1]; for an int pair the
   1396  * lower-addressed reg[i+1] is the stp's Rt and reg[i] is Rt2. FP registers (and
   1397  * an unpaired trailing int) use the single-register stur/ldur form. */
   1398 static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
   1399                                   u32 cap, u32* n) {
   1400   for (u32 i = 0; i < a->frame.ncallee_saves;) {
   1401     const AACalleeSave* cs = &a->frame.callee_saves[i];
   1402     i32 off = aa_fp_off_slot(a, aa_slot(a, cs->slot)->off);
   1403     if (i + 1u < a->frame.ncallee_saves && cs->cls == (u8)NATIVE_REG_INT &&
   1404         a->frame.callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) {
   1405       const AACalleeSave* cs2 = &a->frame.callee_saves[i + 1u];
   1406       i32 off2 = aa_fp_off_slot(a, aa_slot(a, cs2->slot)->off);
   1407       /* cs2 is reserved after cs (larger slot.off), so it is the lower address
   1408        * in both layouts (off2 = off - 8): stp's Rt = cs2, Rt2 = cs, base off2.
   1409        * stp/ldp's signed-7-bit scaled immediate reaches ±504. */
   1410       if (off2 < -512 || off2 > 504)
   1411         aa_panic(a, "callee-save pair offset out of prologue range");
   1412       if (*n >= cap) aa_panic(a, "prologue too large");
   1413       words[(*n)++] = save
   1414                           ? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8)
   1415                           : aa64_ldp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8);
   1416       i += 2u;
   1417     } else {
   1418       u32 v = cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u;
   1419       if (*n >= cap) aa_panic(a, "prologue too large");
   1420       if (a->fp_at_bottom) {
   1421         /* Positive, 8-aligned offset above the record (up to frame_size-8 ≤
   1422          * 496): the unscaled stur (±256) can't reach it, so use the scaled
   1423          * unsigned-imm str/ldr. */
   1424         if (off < 0 || (u32)off > 0x7ff8u)
   1425           aa_panic(a, "callee-save offset out of prologue range");
   1426         words[(*n)++] = save ? aa_str_uimm_v(3, v, cs->reg, AA_FP, (u32)off)
   1427                              : aa_ldr_uimm_v(3, v, cs->reg, AA_FP, (u32)off);
   1428       } else {
   1429         if (off < -256 || off > 255)
   1430           aa_panic(a, "callee-save offset out of prologue range");
   1431         words[(*n)++] = save ? aa_stur_v(3, v, cs->reg, AA_FP, off)
   1432                              : aa_ldur_v(3, v, cs->reg, AA_FP, off);
   1433       }
   1434       i += 1u;
   1435     }
   1436   }
   1437 }
   1438 
   1439 /* Build the prologue instruction words for `L` into `words` (capacity `cap`),
   1440  * returning the count. Shared by the NativeDirectTarget patch path (reserves
   1441  * a fixed worst-case region, then patches it here) and the optimizer path
   1442  * (aa_func_begin_known_frame emits exactly these words up front).
   1443  *
   1444  * All variants establish a post-prologue state defined by L: saved x29/x30 at
   1445  * [fp]/[fp+8], callee-saves at aa_fp_off_slot of each. The top-record variants
   1446  * leave fp = sp + aa_sp_off_saved_pair(L) (saved-pair near the top); the
   1447  * bottom-record variant leaves fp = sp (saved-pair at the bottom). */
   1448 static u32 aa_build_prologue_words(AANativeTarget* a, const AAFrameLayout* L,
   1449                                    u32* words, u32 cap) {
   1450   u32 n = 0;
   1451   if (!L->frame_size) return 0;
   1452   if (a->slim_prologue) {
   1453     if (cap < 2u) aa_panic(a, "prologue too large");
   1454     /* `stp x29, x30, [sp, #-16]!; add x29, sp, #0` — the pre-decrement stp
   1455      * moves sp down to the saved-pair address, so a no-op add anchors fp
   1456      * there directly. AAPCS64 frame record. */
   1457     words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2);
   1458     words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
   1459     return n;
   1460   }
   1461   if (a->fp_at_bottom) {
   1462     /* Bottom-record fold: `stp x29,x30,[sp,#-N]!` decrements sp by the whole
   1463      * frame AND saves the pair at the new bottom in one insn; `mov x29,sp`
   1464      * (add #0) anchors fp there. Callee-saves then stack above the record at
   1465      * positive offsets. -2 insns/call vs the top-record slim_small_frame. */
   1466     if (n + 2u > cap) aa_panic(a, "prologue too large");
   1467     words[n++] =
   1468         aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -(i32)(L->frame_size / 8u));
   1469     words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
   1470     aa_words_callee_saves(a, 1, words, cap, &n);
   1471     return n;
   1472   }
   1473   /* On targets that don't auto-grow the stack (Windows), probe each page the
   1474    * frame spans before the single large `sub sp` jumps past the guard page.
   1475    * slim_prologue/fp_at_bottom returned above — their frames are bounded to
   1476    * one page (≤16 / ≤504 bytes), so only this path can exceed `interval`. */
   1477   {
   1478     u32 interval = abi_stack_probe_interval(a->base.c->abi);
   1479     if (interval && L->frame_size > interval)
   1480       aa_words_stack_probe(a, words, cap, &n, L->frame_size, interval);
   1481   }
   1482   aa_words_sub_sp_frame(a, words, cap, &n, L->frame_size);
   1483   if (a->slim_small_frame) {
   1484     /* `stp x29, x30, [sp, #saved_pair_off]` — skip the `add x17, sp, #...`
   1485      * scratch the fat path needs. Valid when the offset fits stp's
   1486      * signed-7-bit scaled immediate (saved_pair_off <= 504). */
   1487     u32 save_off = aa_sp_off_saved_pair(L);
   1488     if (n >= cap) aa_panic(a, "prologue too large");
   1489     words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
   1490   } else {
   1491     aa_words_saved_pair_addr(a, words, cap, &n, L);
   1492     if (n >= cap) aa_panic(a, "prologue too large");
   1493     words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
   1494   }
   1495   aa_words_frame_ptr_from_sp(a, words, cap, &n, L);
   1496   /* Save callee-saved registers the allocator used (fp-relative; their slots
   1497    * were reserved first by aa_reserve_callee_saves so offsets fit stur). */
   1498   aa_words_callee_saves(a, 1, words, cap, &n);
   1499   return n;
   1500 }
   1501 
   1502 /* Patch the reserved prologue region (`region` words at prologue_pos) with the
   1503  * real prologue for `L`. Used by the NativeDirectTarget single-pass path,
   1504  * which reserves AA_PROLOGUE_WORDS up front before the frame is known. The
   1505  * optimizer path reserves exactly the words it needs, so `region` equals
   1506  * the real prologue length and no tail remains. */
   1507 static void aa_patch_prologue(AANativeTarget* a, const AAFrameLayout* L,
   1508                               u32 region) {
   1509   u32 words[AA_PROLOGUE_WORDS];
   1510   u32 n;
   1511   ObjSecId sec = a->func->text_section_id;
   1512   if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large");
   1513   memset(words, 0, sizeof words);
   1514   n = aa_build_prologue_words(a, L, words, region);
   1515   /* If the real prologue is shorter than the reserved region (the worst-case
   1516    * NDT reservation), branch straight to the body rather than leaving the
   1517    * trailing slots as NOPs that fall through and execute on every call. */
   1518   if (n < region) {
   1519     words[n] = aa64_b(region - n);
   1520     for (u32 i = n + 1u; i < region; ++i) words[i] = 0xd503201fu;
   1521   }
   1522   for (u32 i = 0; i < region; ++i)
   1523     aa_patch32(a->base.obj, sec, a->prologue_pos + i * 4u, words[i]);
   1524 }
   1525 
   1526 static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) {
   1527   MCEmitter* mc = a->base.mc;
   1528   u32 words[AA_PROLOGUE_WORDS];
   1529   u32 n = 0;
   1530   if (!L->frame_size) return;
   1531   aa_words_restore_frame(a, words, AA_PROLOGUE_WORDS, &n, L);
   1532   for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]);
   1533 }
   1534 
   1535 /* Reserve one entry in the deferred-patch list, growing (arena-doubling) as
   1536  * needed. The returned pointer is stable until the next aa_patch_alloc. */
   1537 static AAPatch* aa_patch_alloc(AANativeTarget* a) {
   1538   if (a->npatches == a->patches_cap) {
   1539     u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
   1540     AAPatch* nb = arena_zarray(a->base.c->tu, AAPatch, cap);
   1541     if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
   1542     a->patches = nb;
   1543     a->patches_cap = cap;
   1544   }
   1545   return &a->patches[a->npatches++];
   1546 }
   1547 
   1548 /* Append FP-relative loads that restore the saved callee registers (stp/ldp
   1549  * paired, same as the prologue saves). Shared by the tail-call patch; the
   1550  * function epilogue uses aa_emit_callee_restores. */
   1551 static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
   1552                                      u32* n) {
   1553   aa_words_callee_saves(a, 0, words, cap, n);
   1554 }
   1555 
   1556 /* Drain the deferred-patch list. Each entry targets a disjoint, fixed code
   1557  * position, so insertion order does not affect output. */
   1558 static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) {
   1559   ObjSecId sec = a->func->text_section_id;
   1560   for (u32 i = 0; i < a->npatches; ++i) {
   1561     AAPatch* p = &a->patches[i];
   1562     if (p->kind == AA_PATCH_ALLOCA) {
   1563       u32 imm12, sh;
   1564       if (!aa64_addsub_imm_fits(a->frame.max_outgoing, &imm12, &sh))
   1565         aa_panic(a, "outgoing area too large for alloca result");
   1566       aa_patch32(a->base.obj, sec, p->pos,
   1567                  aa64_add_imm(1, p->u.dst_reg, AA_SP, imm12, sh));
   1568     } else { /* AA_PATCH_TAIL */
   1569       NativeLoc callee = p->u.callee;
   1570       u32 words[AA_TAIL_WORDS];
   1571       u32 n = 0;
   1572       memset(words, 0, sizeof words);
   1573       aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
   1574       aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
   1575       if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
   1576       if (callee.kind == NATIVE_LOC_REG) {
   1577         words[n++] = aa64_br(loc_reg(callee));
   1578       } else if (callee.kind == NATIVE_LOC_GLOBAL) {
   1579         while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
   1580         words[n++] = aa64_b(0);
   1581       } else {
   1582         aa_panic(a, "unsupported tail target");
   1583       }
   1584       while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
   1585       for (u32 w = 0; w < AA_TAIL_WORDS; ++w)
   1586         aa_patch32(a->base.obj, sec, p->pos + w * 4u, words[w]);
   1587     }
   1588   }
   1589 }
   1590 
   1591 static void aa_func_end(NativeTarget* t) {
   1592   AANativeTarget* a = aa_of(t);
   1593   MCEmitter* mc = t->mc;
   1594   AAFrameLayout L =
   1595       aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
   1596   /* known_frame (optimizer): prologue, allocas, and tail epilogues were emitted
   1597    * final and slim eligibility was settled in aa_func_begin_known_frame — there
   1598    * is nothing to patch. Single-pass (NDT): a worst-case prologue region was
   1599    * reserved and the deferred patches recorded; resolve them now that the frame
   1600    * is final. The NDT path always uses the fat prologue/epilogue (slim_* left 0
   1601    * by aa_func_begin_common, since its reserved region is much larger). */
   1602   u32 prologue_region =
   1603       a->frame.known_frame ? a->minimal_prologue_words : AA_PROLOGUE_WORDS;
   1604   mc->label_place(mc, a->epilogue_label);
   1605   aa_emit_callee_restores(a);
   1606   aa_emit_restore_frame(a, &L);
   1607   aa_emit32(mc, aa64_ret(AA_LR));
   1608   if (a->frame.known_frame) {
   1609     /* The frame-planning pre-pass plus final prologue/alloca/tail emission must
   1610      * leave nothing deferred; a stray patch would mean a body-time frame change
   1611      * the final prologue never saw. */
   1612     if (a->npatches != 0) aa_panic(a, "known-frame path left deferred patches");
   1613   } else {
   1614     aa_patch_prologue(a, &L, prologue_region);
   1615     aa_apply_patches(a, &L);
   1616   }
   1617   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
   1618     i32 cfa = aa_cfa_off(a);
   1619     mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
   1620     /* CFA = caller's sp, an fp-relative offset that depends on the layout:
   1621      * fp+16 (top-record) or fp+frame_size (bottom-record). saved fp/lr live at
   1622      * [fp]/[fp+8] in both, hence at CFA-cfa / CFA-cfa+8. */
   1623     mc->cfi_def_cfa(mc, AA_FP, cfa);
   1624     mc->cfi_offset(mc, AA_FP, aa_fp_off_saved_fp() - cfa);
   1625     mc->cfi_offset(mc, AA_LR, aa_fp_off_saved_lr() - cfa);
   1626   }
   1627   obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id,
   1628                     a->func_start, mc->pos(mc) - a->func_start);
   1629   if (a->func->atomize) {
   1630     obj_atom_define(t->obj, a->func->text_section_id, a->func_start,
   1631                     mc->pos(mc) - a->func_start, a->func->sym, 0);
   1632   }
   1633   /* Hand the function's PC range to the Debug producer so its line program
   1634    * (and DW_AT_low_pc/high_pc) cover this function — emit_section_line skips
   1635    * functions without a recorded range. */
   1636   if (mc->debug)
   1637     debug_func_pc_range(mc->debug, a->func->text_section_id, a->func_start,
   1638                         mc->pos(mc));
   1639   if (mc->cfi_endproc) mc->cfi_endproc(mc);
   1640   mc_end_function(mc);
   1641   a->func = NULL;
   1642 }
   1643 
   1644 static NativeFrameSlot aa_frame_slot(NativeTarget* t,
   1645                                      const NativeFrameSlotDesc* d) {
   1646   return native_frame_slot_alloc(&aa_of(t)->frame, d);
   1647 }
   1648 
   1649 static int aa_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
   1650                                    CGDebugLoc* out) {
   1651   AANativeTarget* a = aa_of(t);
   1652   AANativeSlot* s;
   1653   i32 fp_off;
   1654   if (!out) return 0;
   1655   memset(out, 0, sizeof *out);
   1656   if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
   1657   s = aa_slot(a, slot);
   1658   fp_off = aa_fp_off_slot(a, s->off);
   1659   out->kind = CG_DEBUG_LOC_FRAME;
   1660   /* The hosted dbg stop snapshot currently carries x29/fp as the frame base
   1661    * for variable materialization, so report the same FP-relative slot offset
   1662    * used by native memory operands. */
   1663   out->v.frame_ofs = fp_off;
   1664   return 1;
   1665 }
   1666 
   1667 static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   1668                                  u32 nclob, u32* int_mask, u32* fp_mask);
   1669 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks
   1670  * (cg/native_asm.h); it reads the masks from t->regs->classes. */
   1671 
   1672 /* Build the callee-saved set the prologue must preserve: the allocator-assigned
   1673  * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
   1674  * block clobbers. The latter are opaque to the optimizer's operand scan, so it
   1675  * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
   1676  * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
   1677  * and keep only the callee-saved ones, per AAPCS64: x19..x28 and the low 64
   1678  * bits of v8..v15 (x29/x30 are the frame pointer and link register, handled by
   1679  * the prologue head, not as ordinary callee-saves). This is the same register
   1680  * selection the per-block spill used, hoisted into the prologue. */
   1681 static u32 aa_known_callee_saves(NativeTarget* t,
   1682                                  const NativeKnownFrameDesc* frame, u32* out,
   1683                                  u32 cap) {
   1684   u32 ncls = frame->ncallee_classes;
   1685   u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
   1686   if (ncls > cap) ncls = cap;
   1687   for (u32 c = 0; c < ncls; ++c)
   1688     out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
   1689   if (frame->asm_clobbers && frame->nasm_clobbers) {
   1690     AANativeTarget* a = aa_of(t);
   1691     SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   1692     aa_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
   1693                          &clob_int, &clob_fp);
   1694   }
   1695   native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
   1696                                &abi_fp);
   1697   clob_int |= abi_int;
   1698   clob_fp |= abi_fp;
   1699   for (Reg r = 0; r < 32u; ++r) {
   1700     if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && r >= 19u && r <= 28u)
   1701       out[NATIVE_REG_INT] |= 1u << r;
   1702     if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && r >= 8u && r <= 15u)
   1703       out[NATIVE_REG_FP] |= 1u << r;
   1704   }
   1705   return ncls;
   1706 }
   1707 
   1708 /* Optimizer entry point: the full frame is supplied up front, so the prologue,
   1709  * entry saves, slim-form eligibility, allocas, and tail epilogues are all final
   1710  * the moment they are emitted — no back-patching (aa_func_end skips the patch
   1711  * passes when a->frame.known_frame). Slot creation order matches the
   1712  * single-pass path (callee-saves first for stur range, then the static slots,
   1713  * then sret/variadic entry saves) so offsets are identical to what the patch
   1714  * path would produce. */
   1715 static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   1716                                       const NativeKnownFrameDesc* frame,
   1717                                       NativeFrameSlot* out_slots) {
   1718   AANativeTarget* a = aa_of(t);
   1719   AAFrameLayout L;
   1720   u32 words[AA_PROLOGUE_WORDS];
   1721   u32 n;
   1722   aa_func_begin_common(t, fd);
   1723   a->frame.known_frame = 1;
   1724   if (frame) {
   1725     u32 cs[NATIVE_CALL_PLAN_CLASSES];
   1726     u32 ncs = aa_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
   1727     a->frame.has_alloca = frame->has_alloca;
   1728     if (ncs) aa_reserve_callee_saves(t, cs, ncs);
   1729     for (u32 i = 0; i < frame->nslots; ++i) {
   1730       NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]);
   1731       if (out_slots) out_slots[i] = slot;
   1732     }
   1733     aa_reserve_entry_saves(a);
   1734     /* Reserve the atomic-RMW scratch spill last (matching its lazy position in
   1735      * the single-pass path), so aa_saved_tmp_spill reuses it instead of growing
   1736      * the frame mid-body. */
   1737     if (frame->needs_scratch_spill) {
   1738       NativeFrameSlotDesc sd;
   1739       memset(&sd, 0, sizeof sd);
   1740       sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1741       sd.size = 8;
   1742       sd.align = 8;
   1743       sd.kind = NATIVE_FRAME_SLOT_SPILL;
   1744       a->saved_tmp_slot = a->base.frame_slot(&a->base, &sd);
   1745     }
   1746     if (frame->max_outgoing > a->frame.max_outgoing)
   1747       a->frame.max_outgoing = frame->max_outgoing;
   1748   }
   1749   /* Frame is final: slot_bytes (cum_off) and out_stack (max_outgoing) are both
   1750    * known, so the prologue immediates and slim-form choice are settled here.
   1751    * frame_size_final must be set before aa_build_prologue_words / entry saves,
   1752    * since the bottom-record offset helpers read it. */
   1753   L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing,
   1754                       a->top_home_bytes);
   1755   a->frame_size_final = L.frame_size;
   1756   /* Slim Tier A: no callee-saves, no alloca, no body slots, no outgoing stack
   1757    * args — the whole frame is the 16-byte record. fp_at_bottom: a small frame
   1758    * with callee-saves/locals and no outgoing stack args; the record moves to
   1759    * the bottom (fp = sp) so sp adjustment folds into the pre/post-indexed
   1760    * stp/ldp (frame_size <= 504 keeps the post-index ldp imm in range).
   1761    * Otherwise slim_small_frame keeps the top-record layout but skips the
   1762    * x17/x10 scratch (out_stack>0 small frames land here). A Windows-variadic
   1763    * home area forces the fat top-record layout: it lives above the saved pair,
   1764    * which neither the slim forms (saved pair at the very top) nor the
   1765    * bottom-record (saved pair at the very bottom) leave room for. (See
   1766    * aa_func_end for the single-pass path, which never takes any slim form.) */
   1767   a->slim_prologue = a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
   1768                      L.slot_bytes == 0 && L.out_stack == 0 && !a->top_home_bytes;
   1769   a->fp_at_bottom = !a->slim_prologue && !a->frame.has_alloca &&
   1770                     L.out_stack == 0 && L.frame_size <= 504u &&
   1771                     !a->top_home_bytes;
   1772   a->slim_small_frame = !a->slim_prologue && !a->fp_at_bottom &&
   1773                         !a->frame.has_alloca && !a->top_home_bytes &&
   1774                         aa_sp_off_saved_pair(&L) <= 504u;
   1775   n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
   1776   for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
   1777   a->minimal_prologue_words = n;
   1778   a->frame.frame_final = 1;
   1779   aa_emit_entry_save_stores(a);
   1780 }
   1781 
   1782 static void aa_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
   1783                      MemAccess mem) {
   1784   NativeAddr addr;
   1785   memset(&addr, 0, sizeof addr);
   1786   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1787   addr.base.frame = slot;
   1788   addr.base_type = src.type;
   1789   aa_emit_mem(aa_of(t), 0, src, addr, mem);
   1790 }
   1791 
   1792 static void aa_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
   1793                       MemAccess mem) {
   1794   NativeAddr addr;
   1795   memset(&addr, 0, sizeof addr);
   1796   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1797   addr.base.frame = slot;
   1798   addr.base_type = dst.type;
   1799   aa_emit_mem(aa_of(t), 1, dst, addr, mem);
   1800 }
   1801 
   1802 static MCLabel aa_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }
   1803 
   1804 static void aa_label_place(NativeTarget* t, MCLabel label) {
   1805   t->mc->label_place(t->mc, label);
   1806 }
   1807 
   1808 static void aa_jump(NativeTarget* t, MCLabel label) {
   1809   aa_emit32(t->mc, aa64_b(0));
   1810   t->mc->emit_label_ref(t->mc, label, R_AARCH64_JUMP26, 4, 0);
   1811 }
   1812 
   1813 static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs,
   1814                           NativeLoc rhs, MCLabel label) {
   1815   aa_emit_cmp_to_flags(t, lhs, rhs);
   1816   /* CMP_ONE_F / CMP_UEQ_F have no single FP condition: take the branch from a
   1817    * pair of conditional branches to the same label (no scratch register). */
   1818   if (op == CMP_ONE_F) {
   1819     /* ordered & !=: branch if a<b (MI) or a>b (GT). */
   1820     aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x4u})); /* MI */
   1821     t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
   1822     aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0xcu})); /* GT */
   1823     t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
   1824     return;
   1825   }
   1826   if (op == CMP_UEQ_F) {
   1827     /* unordered | ==: branch if a==b (EQ) or unordered (VS). */
   1828     aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x0u})); /* EQ */
   1829     t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
   1830     aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x6u})); /* VS */
   1831     t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
   1832     return;
   1833   }
   1834   aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)}));
   1835   t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
   1836 }
   1837 
   1838 static void aa_indirect_branch(NativeTarget* t, NativeLoc addr,
   1839                                const MCLabel* valid_targets, u32 ntargets) {
   1840   (void)valid_targets;
   1841   (void)ntargets;
   1842   aa_emit32(t->mc, aa64_br(loc_reg(addr)));
   1843 }
   1844 
   1845 static void aa_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel target) {
   1846   /* `&&label` address-take: adrp/add with the ADR_PREL_PG_HI21 +
   1847    * ADD_ABS_LO12_NC relocation pair against the label's per-block local symbol
   1848    * — the same form used to address a global — so the reference is genuinely
   1849    * relocatable (reaches ±4 GiB) and any assembler resolves it from the symbol.
   1850    * Replaces the old 16-byte INTRA-label sequence with a baked offset. */
   1851   MCEmitter* mc = t->mc;
   1852   u32 rd = loc_reg(dst);
   1853   ObjSymId sym = mc_label_symbol(mc, target);
   1854   u32 pos = mc->pos(mc);
   1855   aa_emit32(mc, aa64_adrp(rd, 0, 0));
   1856   mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21, sym, 0,
   1857                     0, 0);
   1858   pos = mc->pos(mc);
   1859   aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
   1860   mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADD_ABS_LO12_NC, sym, 0,
   1861                     0, 0);
   1862 }
   1863 
   1864 static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   1865   /* Identity move elision: same-class same-reg is a no-op on aarch64
   1866    * regardless of width (mov xN,xN and mov wN,wN both leave the low bits
   1867    * untouched). Catches no-op IR_CONVERT (BITCAST, ZEXT/SEXT with
   1868    * src_bits>=dst_bits, FEXT/FTRUNC across-class) when the allocator put
   1869    * dst and src in the same hard reg — common post #2.5 return-reg
   1870    * coalescing, e.g. `convert opnds=[v0,v0]` after a pointer-returning call
   1871    * was emitting `mov x0,x0`. Cross-class (fp<->gpr) bitcasts are not
   1872    * elided here even when the reg numbers match — the register files are
   1873    * disjoint. */
   1874   if (dst.kind == NATIVE_LOC_REG && src.kind == NATIVE_LOC_REG &&
   1875       native_loc_is_fp(dst) == native_loc_is_fp(src) && dst.v.reg == src.v.reg)
   1876     return;
   1877   if (native_loc_is_fp(dst) && native_loc_is_fp(src)) {
   1878     if (type_size32(t, dst.type) == 16u)
   1879       aa_emit32(t->mc, aa_mov_vec16(loc_reg(dst), loc_reg(src)));
   1880     else
   1881       aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
   1882                                   loc_reg(src)));
   1883   } else if (native_loc_is_fp(dst)) {
   1884     aa_emit32(t->mc,
   1885               aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
   1886   } else if (native_loc_is_fp(src)) {
   1887     aa_emit32(t->mc,
   1888               aa_fmov_fp_to_gpr(loc_is_64(t, dst), loc_reg(dst), loc_reg(src)));
   1889   } else {
   1890     aa_emit32(t->mc,
   1891               aa64_mov_reg(loc_is_64(t, dst), loc_reg(dst), loc_reg(src)));
   1892   }
   1893 }
   1894 
   1895 static NativeLoc aa_tmp_loc(KitCgTypeId type, Reg reg);
   1896 
   1897 static void aa_load_imm_native(NativeTarget* t, NativeLoc dst, i64 imm) {
   1898   aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), imm);
   1899 }
   1900 
   1901 static void aa_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cbytes) {
   1902   u64 v = 0;
   1903   if (cbytes.size > 8u)
   1904     compiler_panic(t->c, ((AANativeTarget*)t)->loc,
   1905                    "aarch64 native target: byte constant too large");
   1906   for (u32 i = 0; i < cbytes.size; ++i) v |= (u64)cbytes.bytes[i] << (i * 8u);
   1907   if (native_loc_is_fp(dst)) {
   1908     NativeLoc tmp = aa_tmp_loc(cbytes.type, AA_TMP0);
   1909     aa_emit_load_imm(t->mc, cbytes.size == 8u, AA_TMP0, (i64)v);
   1910     aa_move(t, dst, tmp);
   1911   } else {
   1912     aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), (i64)v);
   1913   }
   1914 }
   1915 
   1916 static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
   1917   AANativeTarget* a = aa_of(t);
   1918   u32 rd = loc_reg(dst);
   1919   aa_materialize_frame_index(a, &addr, rd);
   1920   switch ((NativeAddrBaseKind)addr.base_kind) {
   1921     case NATIVE_ADDR_BASE_FRAME: {
   1922       AANativeSlot* s = aa_slot(a, addr.base.frame);
   1923       aa_emit_add_imm(a, rd, AA_FP, aa_fp_off_slot(a, s->off) + addr.offset);
   1924       aa_apply_index(a, rd, &addr);
   1925       return;
   1926     }
   1927     case NATIVE_ADDR_BASE_FRAME_VALUE: {
   1928       NativeAddr load;
   1929       MemAccess mem;
   1930       memset(&load, 0, sizeof load);
   1931       load.base_kind = NATIVE_ADDR_BASE_FRAME;
   1932       load.base.frame = addr.base.frame;
   1933       load.base_type =
   1934           addr.base_type ? addr.base_type : builtin_id(KIT_CG_BUILTIN_I64);
   1935       memset(&mem, 0, sizeof mem);
   1936       mem.type = load.base_type;
   1937       mem.size = 8;
   1938       mem.align = 8;
   1939       aa_emit_mem(a, 1, dst, load, mem);
   1940       if (addr.offset) aa_emit_add_imm(a, rd, rd, addr.offset);
   1941       aa_apply_index(a, rd, &addr);
   1942       return;
   1943     }
   1944     case NATIVE_ADDR_BASE_REG:
   1945       aa_emit_add_imm(a, rd, addr.base.reg, addr.offset);
   1946       aa_apply_index(a, rd, &addr);
   1947       return;
   1948     case NATIVE_ADDR_BASE_GLOBAL: {
   1949       i64 addend = addr.base.global.addend + (i64)addr.offset;
   1950       u32 pos = t->mc->pos(t->mc);
   1951       if (aa_use_got_for_sym(t, addr.base.global.sym)) {
   1952         aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
   1953         t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
   1954                              R_AARCH64_ADR_GOT_PAGE, addr.base.global.sym, 0, 0,
   1955                              0);
   1956         pos = t->mc->pos(t->mc);
   1957         aa_emit32(t->mc, aa_ldr_uimm(3, rd, rd, 0));
   1958         t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
   1959                              R_AARCH64_LD64_GOT_LO12_NC, addr.base.global.sym,
   1960                              0, 0, 0);
   1961         if (addend) aa_emit_add_i64(a, rd, rd, addend);
   1962         aa_apply_index(a, rd, &addr);
   1963         return;
   1964       }
   1965       aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
   1966       t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
   1967                            R_AARCH64_ADR_PREL_PG_HI21, addr.base.global.sym,
   1968                            addend, 0, 0);
   1969       pos = t->mc->pos(t->mc);
   1970       aa_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
   1971       t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
   1972                            R_AARCH64_ADD_ABS_LO12_NC, addr.base.global.sym,
   1973                            addend, 0, 0);
   1974       aa_apply_index(a, rd, &addr);
   1975       return;
   1976     }
   1977     default:
   1978       aa_panic(a, "unsupported load_addr");
   1979   }
   1980 }
   1981 
   1982 static void aa_load_native(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   1983                            MemAccess mem) {
   1984   aa_emit_mem(aa_of(t), 1, dst, addr, mem);
   1985 }
   1986 
   1987 static void aa_store_native(NativeTarget* t, NativeAddr addr, NativeLoc src,
   1988                             MemAccess mem) {
   1989   aa_emit_mem(aa_of(t), 0, src, addr, mem);
   1990 }
   1991 
   1992 /* Windows/AArch64 TLS Local-Exec (PE-COFF). Mirrors x64_tls_addr_of_win64:
   1993  *   ldr  rd,  [x18, #0x58]            ; TEB.ThreadLocalStoragePointer
   1994  *   adrp x16, _tls_index              ; PAGEBASE_REL21
   1995  *   add  x16, x16, :lo12:_tls_index   ; PAGEOFFSET_12A
   1996  *   ldr  w16, [x16]                   ; module's TLS index
   1997  *   ldr  rd,  [rd, x16, lsl #3]       ; this module's TLS block base
   1998  *   add  rd,  rd, #:secrel_hi12:sym   ; SECREL_HIGH12A (sh=1)
   1999  *   add  rd,  rd, #:secrel_lo12:sym   ; SECREL_LOW12A  (sh=0)
   2000  * We materialize &_tls_index via ADRP+ADD (not LDR :lo12:) on purpose: the
   2001  * COFF reader collapses LDST32→LDST64 width, so an LDR :lo12: form would be
   2002  * mis-scaled at link time; ADD_ABS_LO12_NC carries no width and round-trips
   2003  * cleanly. AA_TMP0 (x16) is the reserved scratch; rd is an allocated reg
   2004  * distinct from x16/x17/x18. */
   2005 static void aa_tls_addr_of_win(NativeTarget* t, NativeLoc dst, ObjSymId sym,
   2006                                i64 addend) {
   2007   MCEmitter* mc = t->mc;
   2008   u32 sec = mc->section_id;
   2009   u32 rd = loc_reg(dst);
   2010   u32 pos;
   2011   Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index"));
   2012   ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name);
   2013   if (idx_sym == 0)
   2014     idx_sym =
   2015         obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
   2016   /* (1) rd = TEB.ThreadLocalStoragePointer. */
   2017   aa_emit32(mc, aa_ldr_uimm(3, rd, AA_WIN_TEB_REG, AA_WIN_TEB_TLS_PTR_OFF));
   2018   /* (2)+(3) x16 = &_tls_index via ADRP + ADD. */
   2019   pos = mc->pos(mc);
   2020   aa_emit32(mc, aa64_adrp(AA_TMP0, 0, 0));
   2021   mc->emit_reloc_at(mc, sec, pos, R_AARCH64_ADR_PREL_PG_HI21, idx_sym, 0, 0, 0);
   2022   pos = mc->pos(mc);
   2023   aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_TMP0, 0, 0));
   2024   mc->emit_reloc_at(mc, sec, pos, R_AARCH64_ADD_ABS_LO12_NC, idx_sym, 0, 0, 0);
   2025   /* (4) w16 = _tls_index (the loaded value). */
   2026   aa_emit32(mc, aa_ldr_uimm(2, AA_TMP0, AA_TMP0, 0));
   2027   /* (5) rd = TLS array slot for this module: ldr rd, [rd, x16, lsl #3]. */
   2028   aa_emit32(mc, aa_ldst_regoff_v(3, 0, 1, rd, rd, AA_TMP0, 1));
   2029   /* (6) rd += :secrel_hi12:sym  (ADD with sh=1; linker patches imm12). */
   2030   pos = mc->pos(mc);
   2031   aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1));
   2032   mc->emit_reloc_at(mc, sec, pos, R_COFF_AARCH64_SECREL_HIGH12A, sym, addend, 1,
   2033                     0);
   2034   /* (7) rd += :secrel_lo12:sym  (ADD with sh=0). */
   2035   pos = mc->pos(mc);
   2036   aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
   2037   mc->emit_reloc_at(mc, sec, pos, R_COFF_AARCH64_SECREL_LOW12A, sym, addend, 1,
   2038                     0);
   2039 }
   2040 
   2041 static void aa_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
   2042                            i64 addend) {
   2043   AANativeTarget* a = aa_of(t);
   2044   MCEmitter* mc = t->mc;
   2045   u32 rd = loc_reg(dst);
   2046   u32 pos;
   2047   if (obj_format_tls_via_descriptor(t->c)) {
   2048     aa_emit32(mc, aa64_adrp(0, 0, 0));
   2049     pos = mc->pos(mc) - 4u;
   2050     mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGE21, sym,
   2051                       0, 0, 0);
   2052     aa_emit32(mc, aa_ldr_uimm(3, 0, 0, 0));
   2053     pos = mc->pos(mc) - 4u;
   2054     mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGEOFF12,
   2055                       sym, 0, 0, 0);
   2056     aa_emit32(mc, aa_ldr_uimm(3, AA_TMP0, 0, 0));
   2057     aa_emit32(mc, aa64_blr(AA_TMP0));
   2058     if (addend) aa_emit_add_i64(a, 0, 0, addend);
   2059     if (rd != 0) aa_emit32(mc, aa64_mov_reg(1, rd, 0));
   2060     return;
   2061   }
   2062   if (obj_format_tls_model(t->c) == OBJ_TLS_WINDOWS_TEB) {
   2063     aa_tls_addr_of_win(t, dst, sym, addend);
   2064     return;
   2065   }
   2066   if (t->c->target.obj != KIT_OBJ_ELF) {
   2067     aa_panic(a, "unsupported TLS object format");
   2068   }
   2069   aa_emit32(mc, aa_mrs_tpidr_el0(rd));
   2070   pos = mc->pos(mc);
   2071   aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1));
   2072   mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_HI12,
   2073                     sym, addend, 0, 0);
   2074   pos = mc->pos(mc);
   2075   aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
   2076   mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC,
   2077                     sym, addend, 0, 0);
   2078 }
   2079 
   2080 static NativeLoc aa_tmp_loc(KitCgTypeId type, Reg reg) {
   2081   NativeLoc loc;
   2082   memset(&loc, 0, sizeof loc);
   2083   loc.kind = NATIVE_LOC_REG;
   2084   loc.cls = NATIVE_REG_INT;
   2085   loc.type = type;
   2086   loc.v.reg = reg;
   2087   return loc;
   2088 }
   2089 
   2090 static NativeAddr aa_addr_plus(NativeAddr addr, u32 off) {
   2091   addr.offset += (i32)off;
   2092   return addr;
   2093 }
   2094 
   2095 static void aa_copy_bytes_dir(NativeTarget* t, NativeAddr dst, NativeAddr src,
   2096                               AggregateAccess access, int backward) {
   2097   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   2098   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   2099   KitCgTypeId i16 = builtin_id(KIT_CG_BUILTIN_I16);
   2100   KitCgTypeId i8 = builtin_id(KIT_CG_BUILTIN_I8);
   2101   NativeLoc tmp = aa_tmp_loc(i64, AA_TMP0);
   2102   u32 off = 0;
   2103   while (off < access.size) {
   2104     u32 rem = access.size - off;
   2105     u32 pos;
   2106     MemAccess mem = access.mem;
   2107     if (rem >= 8u) {
   2108       mem.type = i64;
   2109       mem.size = 8u;
   2110     } else if (rem >= 4u) {
   2111       mem.type = i32;
   2112       mem.size = 4u;
   2113       tmp.type = i32;
   2114     } else if (rem >= 2u) {
   2115       mem.type = i16;
   2116       mem.size = 2u;
   2117       tmp.type = i16;
   2118     } else {
   2119       mem.type = i8;
   2120       mem.size = 1u;
   2121       tmp.type = i8;
   2122     }
   2123     mem.align = mem.size;
   2124     pos = backward ? access.size - off - mem.size : off;
   2125     aa_load_native(t, tmp, aa_addr_plus(src, pos), mem);
   2126     aa_store_native(t, aa_addr_plus(dst, pos), tmp, mem);
   2127     off += mem.size;
   2128     tmp.type = i64;
   2129   }
   2130 }
   2131 
   2132 static void aa_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
   2133                           AggregateAccess access) {
   2134   aa_copy_bytes_dir(t, dst, src, access, 0);
   2135 }
   2136 
   2137 static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
   2138                          AggregateAccess access) {
   2139   KitCgTypeId i8 = builtin_id(KIT_CG_BUILTIN_I8);
   2140   NativeLoc byte = byte_value;
   2141   MemAccess mem = access.mem;
   2142   mem.type = i8;
   2143   mem.size = 1u;
   2144   mem.align = 1u;
   2145   byte.type = i8;
   2146   for (u32 off = 0; off < access.size; ++off)
   2147     aa_store_native(t, aa_addr_plus(dst, off), byte, mem);
   2148 }
   2149 
   2150 static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
   2151 static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
   2152 static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
   2153 
   2154 /* Strength-reduce `mul rd, rn, #imm` for the constants accepted by
   2155  * aa64_imul_strength_reducible into a single non-mul instruction. Callers
   2156  * must gate on aa64_imul_strength_reducible(sf, imm) — this routine panics
   2157  * on unhandled constants. */
   2158 static void aa_emit_mul_const_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn,
   2159                                   i64 imm) {
   2160   u64 a;
   2161   if (imm == 0) {
   2162     aa_emit32(t->mc, aa64_mov_reg(sf, rd, AA64_ZR));
   2163     return;
   2164   }
   2165   if (imm == 1) {
   2166     if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
   2167     return;
   2168   }
   2169   if (imm == -1) {
   2170     aa_emit32(t->mc, aa64_neg(sf, rd, rn));
   2171     return;
   2172   }
   2173   /* +2^k: lsl rd, rn, #k */
   2174   a = (u64)imm;
   2175   if (imm > 0 && (a & (a - 1u)) == 0u) {
   2176     u32 k = (u32)__builtin_ctzll(a);
   2177     aa_lsl_imm(t, sf, rd, rn, k);
   2178     return;
   2179   }
   2180   /* -2^k: sub rd, xzr, rn, lsl #k */
   2181   if (imm < 0) {
   2182     a = (u64)(-imm);
   2183     if (a && (a & (a - 1u)) == 0u) {
   2184       u32 k = (u32)__builtin_ctzll(a);
   2185       aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
   2186                                                          .op = 1u,
   2187                                                          .S = 0u,
   2188                                                          .shift = 0u,
   2189                                                          .Rm = rn,
   2190                                                          .imm6 = k,
   2191                                                          .Rn = AA64_ZR,
   2192                                                          .Rd = rd}));
   2193       return;
   2194     }
   2195   }
   2196   /* 2^k + 1: add rd, rn, rn, lsl #k */
   2197   if (imm >= 3) {
   2198     u64 m = (u64)(imm - 1);
   2199     if ((m & (m - 1u)) == 0u) {
   2200       u32 k = (u32)__builtin_ctzll(m);
   2201       aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
   2202                                                          .op = 0u,
   2203                                                          .S = 0u,
   2204                                                          .shift = 0u,
   2205                                                          .Rm = rn,
   2206                                                          .imm6 = k,
   2207                                                          .Rn = rn,
   2208                                                          .Rd = rd}));
   2209       return;
   2210     }
   2211   }
   2212   /* 1 - 2^k: sub rd, rn, rn, lsl #k */
   2213   if (imm <= -1) {
   2214     u64 m = (u64)(1 - imm);
   2215     if (m && (m & (m - 1u)) == 0u) {
   2216       u32 k = (u32)__builtin_ctzll(m);
   2217       aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
   2218                                                          .op = 1u,
   2219                                                          .S = 0u,
   2220                                                          .shift = 0u,
   2221                                                          .Rm = rn,
   2222                                                          .imm6 = k,
   2223                                                          .Rn = rn,
   2224                                                          .Rd = rd}));
   2225       return;
   2226     }
   2227   }
   2228   aa_panic(aa_of(t), "aa_emit_mul_const_imm: unhandled constant");
   2229 }
   2230 
   2231 static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
   2232                      NativeLoc rhs) {
   2233   u32 sf = loc_is_64(t, dst) ? 1u : 0u;
   2234   u32 rd = loc_reg(dst), rn = loc_reg(lhs), rm = loc_reg(rhs);
   2235   if (native_loc_is_fp(dst)) {
   2236     u32 d = type_size32(t, dst.type) == 8u;
   2237     switch (op) {
   2238       case BO_FADD:
   2239         aa_emit32(t->mc, aa_fp_bin(0x002800u, d, rd, rn, rm));
   2240         return;
   2241       case BO_FSUB:
   2242         aa_emit32(t->mc, aa_fp_bin(0x003800u, d, rd, rn, rm));
   2243         return;
   2244       case BO_FMUL:
   2245         aa_emit32(t->mc, aa_fp_bin(0x000800u, d, rd, rn, rm));
   2246         return;
   2247       case BO_FDIV:
   2248         aa_emit32(t->mc, aa_fp_bin(0x001800u, d, rd, rn, rm));
   2249         return;
   2250       default:
   2251         aa_panic(aa_of(t), "unsupported floating binary op");
   2252     }
   2253   }
   2254   if (rhs.kind == NATIVE_LOC_IMM && (op == BO_IADD || op == BO_ISUB)) {
   2255     i64 imm = rhs.v.imm;
   2256     int is_add = (op == BO_IADD);
   2257     u32 imm12, sh;
   2258     if (imm < 0) {
   2259       is_add = !is_add;
   2260       imm = -imm;
   2261     }
   2262     if (!aa64_addsub_imm_fits(imm, &imm12, &sh))
   2263       aa_panic(aa_of(t), "binop immediate not encodable");
   2264     aa_emit32(t->mc, is_add ? aa64_add_imm(sf, rd, rn, imm12, sh)
   2265                             : aa64_sub_imm(sf, rd, rn, imm12, sh));
   2266     return;
   2267   }
   2268   if (rhs.kind == NATIVE_LOC_IMM && op == BO_IMUL) {
   2269     aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm);
   2270     return;
   2271   }
   2272   if (rhs.kind == NATIVE_LOC_IMM &&
   2273       (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S)) {
   2274     u32 shamt = (u32)rhs.v.imm; /* imm_legal guarantees 0 <= imm < datasize */
   2275     if (op == BO_SHL)
   2276       aa_lsl_imm(t, sf, rd, rn, shamt);
   2277     else if (op == BO_SHR_U)
   2278       aa_lsr_imm(t, sf, rd, rn, shamt);
   2279     else
   2280       aa_asr_imm(t, sf, rd, rn, shamt);
   2281     return;
   2282   }
   2283   if (rhs.kind == NATIVE_LOC_IMM &&
   2284       (op == BO_AND || op == BO_OR || op == BO_XOR)) {
   2285     u32 N, immr, imms;
   2286     if (!aa64_logimm_encode((u64)rhs.v.imm, sf, &N, &immr, &imms))
   2287       aa_panic(aa_of(t), "logical immediate not encodable");
   2288     if (op == BO_AND)
   2289       aa_emit32(t->mc, aa64_and_imm(sf, rd, rn, N, immr, imms));
   2290     else if (op == BO_OR)
   2291       aa_emit32(t->mc, aa64_orr_imm(sf, rd, rn, N, immr, imms));
   2292     else
   2293       aa_emit32(t->mc, aa64_eor_imm(sf, rd, rn, N, immr, imms));
   2294     return;
   2295   }
   2296   switch (op) {
   2297     case BO_IADD:
   2298       aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
   2299       return;
   2300     case BO_ISUB:
   2301       aa_emit32(t->mc, aa64_sub(sf, rd, rn, rm));
   2302       return;
   2303     case BO_IMUL:
   2304       aa_emit32(t->mc, aa64_mul(sf, rd, rn, rm));
   2305       return;
   2306     case BO_SDIV:
   2307       aa_emit32(t->mc, aa64_sdiv(sf, rd, rn, rm));
   2308       return;
   2309     case BO_UDIV:
   2310       aa_emit32(t->mc, aa64_udiv(sf, rd, rn, rm));
   2311       return;
   2312     case BO_SREM:
   2313       aa_emit32(t->mc, aa64_sdiv(sf, AA_TMP0, rn, rm));
   2314       aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm));
   2315       aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0));
   2316       return;
   2317     case BO_UREM:
   2318       aa_emit32(t->mc, aa64_udiv(sf, AA_TMP0, rn, rm));
   2319       aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm));
   2320       aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0));
   2321       return;
   2322     case BO_AND:
   2323       aa_emit32(t->mc, aa64_and(sf, rd, rn, rm));
   2324       return;
   2325     case BO_OR:
   2326       aa_emit32(t->mc, aa64_orr(sf, rd, rn, rm));
   2327       return;
   2328     case BO_XOR:
   2329       aa_emit32(t->mc, aa64_eor(sf, rd, rn, rm));
   2330       return;
   2331     case BO_SHL:
   2332       aa_emit32(t->mc, aa64_lslv(sf, rd, rn, rm));
   2333       return;
   2334     case BO_SHR_U:
   2335       aa_emit32(t->mc, aa64_lsrv(sf, rd, rn, rm));
   2336       return;
   2337     case BO_SHR_S:
   2338       aa_emit32(t->mc, aa64_asrv(sf, rd, rn, rm));
   2339       return;
   2340     default:
   2341       aa_panic(aa_of(t), "unsupported binary op");
   2342   }
   2343 }
   2344 
   2345 static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
   2346   u32 sf = loc_is_64(t, dst) ? 1u : 0u;
   2347   if (native_loc_is_fp(dst)) {
   2348     switch (op) {
   2349       case UO_FNEG:
   2350       case UO_NEG:
   2351         aa_emit32(t->mc, aa_fneg(type_size32(t, dst.type) == 8u, loc_reg(dst),
   2352                                  loc_reg(src)));
   2353         return;
   2354       default:
   2355         aa_panic(aa_of(t), "unsupported floating unary op");
   2356     }
   2357   }
   2358   switch (op) {
   2359     case UO_NEG:
   2360       aa_emit32(t->mc, aa64_neg(sf, loc_reg(dst), loc_reg(src)));
   2361       return;
   2362     case UO_BNOT:
   2363       aa_emit32(t->mc, aa64_mvn(sf, loc_reg(dst), loc_reg(src)));
   2364       return;
   2365     case UO_NOT:
   2366       aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(src), 0, 0));
   2367       aa_emit32(t->mc, aa_cset(sf, loc_reg(dst), 0x0u));
   2368       return;
   2369     default:
   2370       aa_panic(aa_of(t), "unsupported unary op");
   2371   }
   2372 }
   2373 
   2374 static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs,
   2375                                  NativeLoc rhs) {
   2376   if (native_loc_is_fp(lhs)) {
   2377     aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
   2378                              loc_reg(rhs)));
   2379     return;
   2380   }
   2381   {
   2382     u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
   2383     if (rhs.kind == NATIVE_LOC_IMM) {
   2384       u32 imm12 = 0, sh = 0;
   2385       if (rhs.v.imm < 0 || !aa64_addsub_imm_fits(rhs.v.imm, &imm12, &sh))
   2386         aa_panic(aa_of(t), "cmp immediate not encodable");
   2387       aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(lhs), imm12, sh));
   2388       return;
   2389     }
   2390     aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
   2391   }
   2392 }
   2393 
   2394 static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
   2395                    NativeLoc rhs) {
   2396   u32 sf = loc_is_64(t, dst);
   2397   u32 rd = loc_reg(dst);
   2398   aa_emit_cmp_to_flags(t, lhs, rhs);
   2399   /* CMP_ONE_F (ordered & !=) and CMP_UEQ_F (unordered | ==) have no single
   2400    * AArch64 FP condition. After FCMP, unordered sets V (and Z=0), so VC
   2401    * (V==0) selects "ordered". */
   2402   if (op == CMP_ONE_F) {
   2403     /* ordered & not-equal: NE masked to the ordered case. */
   2404     aa_emit32(t->mc, aa_cset(sf, rd, 0x1u)); /* cset  rd, NE */
   2405     aa_emit32(t->mc,
   2406               aa64_csel_enc(sf, rd, rd, AA64_ZR, 0x7u)); /* csel  rd,rd,zr,VC */
   2407     return;
   2408   }
   2409   if (op == CMP_UEQ_F) {
   2410     /* equal, or forced to 1 when unordered. */
   2411     aa_emit32(t->mc, aa_cset(sf, rd, 0x0u)); /* cset  rd, EQ */
   2412     aa_emit32(t->mc, aa64_csinc_enc(sf, rd, rd, AA64_ZR,
   2413                                     0x7u)); /* csinc rd,rd,zr,VC */
   2414     return;
   2415   }
   2416   aa_emit32(t->mc, aa_cset(sf, rd, cmp_cond(op)));
   2417 }
   2418 
   2419 static void aa_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
   2420                        NativeLoc src) {
   2421   int dst_fp = native_loc_is_fp(dst);
   2422   int src_fp = native_loc_is_fp(src);
   2423   switch (op) {
   2424     case CV_TRUNC:
   2425     case CV_BITCAST:
   2426       aa_move(t, dst, src);
   2427       return;
   2428     case CV_ZEXT: {
   2429       u32 src_bits = type_size32(t, src.type) * 8u;
   2430       u32 dst_bits = type_size32(t, dst.type) * 8u;
   2431       u32 sf = dst_bits > 32u;
   2432       if (src_bits >= dst_bits) {
   2433         aa_move(t, dst, src);
   2434       } else if (src_bits >= 32u) {
   2435         aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dst), loc_reg(src)));
   2436       } else {
   2437         aa_emit32(t->mc,
   2438                   aa_ubfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 1u));
   2439       }
   2440       return;
   2441     }
   2442     case CV_SEXT: {
   2443       u32 src_bits = type_size32(t, src.type) * 8u;
   2444       u32 dst_bits = type_size32(t, dst.type) * 8u;
   2445       u32 sf = dst_bits > 32u;
   2446       if (src_bits >= dst_bits) {
   2447         aa_move(t, dst, src);
   2448       } else {
   2449         aa_emit32(t->mc,
   2450                   aa_sbfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 1u));
   2451       }
   2452       return;
   2453     }
   2454     case CV_ITOF_S:
   2455       aa_emit32(t->mc, aa_scvtf(type_size32(t, dst.type) == 8u,
   2456                                 loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
   2457       return;
   2458     case CV_ITOF_U:
   2459       aa_emit32(t->mc, aa_ucvtf(type_size32(t, dst.type) == 8u,
   2460                                 loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
   2461       return;
   2462     case CV_FTOI_S:
   2463       aa_emit32(t->mc,
   2464                 aa_fcvtzs(loc_is_64(t, dst), type_size32(t, src.type) == 8u,
   2465                           loc_reg(dst), loc_reg(src)));
   2466       return;
   2467     case CV_FTOI_U:
   2468       aa_emit32(t->mc,
   2469                 aa_fcvtzu(loc_is_64(t, dst), type_size32(t, src.type) == 8u,
   2470                           loc_reg(dst), loc_reg(src)));
   2471       return;
   2472     case CV_FEXT:
   2473       if (dst_fp && src_fp)
   2474         aa_emit32(t->mc, aa_fcvt_d_s(loc_reg(dst), loc_reg(src)));
   2475       else
   2476         aa_move(t, dst, src);
   2477       return;
   2478     case CV_FTRUNC:
   2479       if (dst_fp && src_fp)
   2480         aa_emit32(t->mc, aa_fcvt_s_d(loc_reg(dst), loc_reg(src)));
   2481       else
   2482         aa_move(t, dst, src);
   2483       return;
   2484     default:
   2485       aa_panic(aa_of(t), "unsupported conversion");
   2486   }
   2487 }
   2488 
   2489 static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
   2490                       u32 align) {
   2491   AANativeTarget* a = aa_of(t);
   2492   u32 use_align = align < 16u ? 16u : align;
   2493   if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2");
   2494   aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u));
   2495   aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align);
   2496   aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1));
   2497   aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0));
   2498   aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0));
   2499   aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0));
   2500   /* The alloca result is sp + outgoing-area bytes. On the known-frame path
   2501    * max_outgoing is already final, so emit the final `add dst, sp, #N` here; on
   2502    * the single-pass path it is not known yet, so record a patch. */
   2503   if (a->frame.known_frame) {
   2504     u32 imm12, sh;
   2505     if (!aa64_addsub_imm_fits(a->frame.max_outgoing, &imm12, &sh))
   2506       aa_panic(a, "outgoing area too large for alloca result");
   2507     aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, imm12, sh));
   2508   } else {
   2509     AAPatch* p = aa_patch_alloc(a);
   2510     p->kind = AA_PATCH_ALLOCA;
   2511     p->pos = t->mc->pos(t->mc);
   2512     p->u.dst_reg = loc_reg(dst);
   2513     a->nalloca++;
   2514     aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0));
   2515   }
   2516 }
   2517 
   2518 static MemAccess aa_mem_for_type(NativeTarget* t, KitCgTypeId type, u32 size) {
   2519   MemAccess mem;
   2520   memset(&mem, 0, sizeof mem);
   2521   mem.type = type;
   2522   mem.size = size ? size : type_size32(t, type);
   2523   mem.align = type_align32(t, type);
   2524   if (mem.align > mem.size && mem.size) mem.align = mem.size;
   2525   return mem;
   2526 }
   2527 
   2528 /* native_loc_reg / native_loc_stack are shared in native_target.h. */
   2529 
   2530 static NativeAddr aa_loc_addr(AANativeTarget* a, NativeLoc loc, u32 offset) {
   2531   NativeAddr addr;
   2532   memset(&addr, 0, sizeof addr);
   2533   switch ((NativeLocKind)loc.kind) {
   2534     case NATIVE_LOC_FRAME:
   2535       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2536       addr.base.frame = loc.v.frame;
   2537       addr.base_type = loc.type;
   2538       addr.offset = (i32)offset;
   2539       return addr;
   2540     case NATIVE_LOC_STACK:
   2541       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2542       addr.base.frame = loc.v.stack.slot;
   2543       addr.base_type = loc.type;
   2544       addr.offset = loc.v.stack.offset + (i32)offset;
   2545       return addr;
   2546     case NATIVE_LOC_ADDR:
   2547       addr = loc.v.addr;
   2548       addr.offset += (i32)offset;
   2549       return addr;
   2550     default:
   2551       aa_panic(a, "location is not addressable");
   2552   }
   2553   return addr;
   2554 }
   2555 
   2556 static void aa_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   2557   AANativeTarget* a = aa_of(t);
   2558   NativeAddr addr = aa_loc_addr(a, src, 0);
   2559   aa_load_addr(t, dst, addr);
   2560 }
   2561 
   2562 static void aa_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2563                          u32 offset, u32 size) {
   2564   AANativeTarget* a = aa_of(t);
   2565   MemAccess mem = aa_mem_for_type(t, dst.type, size);
   2566   if (src.kind == NATIVE_LOC_REG) {
   2567     aa_move(t, dst, src);
   2568     return;
   2569   }
   2570   if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
   2571       src.kind == NATIVE_LOC_ADDR) {
   2572     NativeAddr addr = aa_loc_addr(a, src, offset);
   2573     addr.base_type = dst.type;
   2574     aa_emit_mem(a, 1, dst, addr, mem);
   2575     return;
   2576   }
   2577   if (src.kind == NATIVE_LOC_IMM) {
   2578     aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), src.v.imm);
   2579     return;
   2580   }
   2581   aa_panic(a, "unsupported call argument source");
   2582 }
   2583 
   2584 static void aa_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2585                           u32 offset, u32 size) {
   2586   AANativeTarget* a = aa_of(t);
   2587   MemAccess mem = aa_mem_for_type(t, src.type, size);
   2588   if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
   2589       dst.kind == NATIVE_LOC_ADDR) {
   2590     NativeAddr addr = aa_loc_addr(a, dst, offset);
   2591     addr.base_type = src.type;
   2592     aa_emit_mem(a, 0, src, addr, mem);
   2593     return;
   2594   }
   2595   if (dst.kind == NATIVE_LOC_REG) {
   2596     aa_move(t, dst, src);
   2597     return;
   2598   }
   2599   aa_panic(a, "unsupported call return destination");
   2600 }
   2601 
   2602 static void aa_store_outgoing_part(NativeTarget* t, int tail_call,
   2603                                    u32 stack_off, NativeLoc src, u32 size) {
   2604   NativeAddr addr;
   2605   MemAccess mem = aa_mem_for_type(t, src.type, size);
   2606   memset(&addr, 0, sizeof addr);
   2607   addr.base_kind = NATIVE_ADDR_BASE_REG;
   2608   addr.base.reg = tail_call ? AA_FP : AA_SP;
   2609   addr.base_type = src.type;
   2610   /* Tail calls write outgoing args into the caller's incoming-args window
   2611    * (= [fp + 16 + off], same address the tail-callee will read via
   2612    * aa_fp_off_in_arg). Non-tail calls write to the sp-anchored outgoing
   2613    * area at the bottom of the caller's frame. */
   2614   addr.offset = tail_call ? aa_fp_off_tail_out_arg(aa_of(t), stack_off)
   2615                           : aa_sp_off_out_arg(stack_off);
   2616   aa_emit_mem(aa_of(t), 0, src, addr, mem);
   2617 }
   2618 
   2619 static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
   2620                                       const NativeCallDesc* desc, u32 i,
   2621                                       ABIArgInfo* scratch) {
   2622   if (abi && i < abi->nparams) return &abi->params[i];
   2623   memset(scratch, 0, sizeof *scratch);
   2624   scratch->kind = ABI_ARG_DIRECT;
   2625   scratch->flags = ABI_AF_NONE;
   2626   scratch->nparts = 1;
   2627   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
   2628   /* Windows ARM64 routes variadic floating-point arguments through the integer
   2629    * registers/stack (the classifier's remap_fp_parts_to_int does the same for
   2630    * the *named* params of a variadic function); the value's bit pattern moves
   2631    * via fmov x,d. Every other ABI keeps the `...` FP args in v registers. */
   2632   ((ABIArgPart*)scratch->parts)[0].cls =
   2633       (cg_type_is_float(t->c, desc->args[i].type) &&
   2634        !(abi && abi->vararg_fp_via_int))
   2635           ? ABI_CLASS_FP
   2636           : ABI_CLASS_INT;
   2637   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
   2638   ((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type);
   2639   ((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type);
   2640   ((ABIArgPart*)scratch->parts)[0].src_offset = 0;
   2641   return scratch;
   2642 }
   2643 
   2644 /* Stack footprint of a single argument part. AAPCS64 uses 8-byte slots. Apple
   2645  * ARM64 uses compact 4-byte slots for fixed stack-passed int32-sized values,
   2646  * but its forced stack variadics still use 8-byte slots. */
   2647 static u32 aa_stack_arg_min_align(const ABIFuncInfo* abi) {
   2648   return (abi && abi->stack_arg_min_align) ? abi->stack_arg_min_align : 8u;
   2649 }
   2650 
   2651 static u32 aa_vararg_stack_arg_min_align(const ABIFuncInfo* abi) {
   2652   if (abi && abi->vararg_stack_arg_min_align)
   2653     return abi->vararg_stack_arg_min_align;
   2654   return aa_stack_arg_min_align(abi);
   2655 }
   2656 
   2657 static u32 aa_vararg_stack_start(const ABIFuncInfo* abi, u32 cursor) {
   2658   return align_up_u32(cursor, aa_vararg_stack_arg_min_align(abi));
   2659 }
   2660 
   2661 /* Natural stack alignment of a part, capped at 16 (binary128). */
   2662 static u32 aa_part_stack_align_min(u32 min_align, const ABIArgPart* part) {
   2663   u32 al = part->align ? part->align : 8u;
   2664   if (al < min_align) al = min_align;
   2665   if (al > 16u) al = 16u;
   2666   return al;
   2667 }
   2668 
   2669 static u32 aa_part_stack_align(const ABIFuncInfo* abi,
   2670                                const ABIArgPart* part) {
   2671   return aa_part_stack_align_min(aa_stack_arg_min_align(abi), part);
   2672 }
   2673 
   2674 static u32 aa_part_vararg_stack_align(const ABIFuncInfo* abi,
   2675                                       const ABIArgPart* part) {
   2676   return aa_part_stack_align_min(aa_vararg_stack_arg_min_align(abi), part);
   2677 }
   2678 
   2679 static u32 aa_part_stack_size(const ABIFuncInfo* abi,
   2680                               const ABIArgPart* part) {
   2681   return align_up_u32(part->size ? part->size : 8u,
   2682                       aa_part_stack_align(abi, part));
   2683 }
   2684 
   2685 static u32 aa_part_vararg_stack_size(const ABIFuncInfo* abi,
   2686                                      const ABIArgPart* part) {
   2687   return align_up_u32(part->size ? part->size : 8u,
   2688                       aa_part_vararg_stack_align(abi, part));
   2689 }
   2690 
   2691 /* The scalar type used to move one ABI part through a register. Aggregate
   2692  * args/results are split into parts; each part must move at its own width, not
   2693  * the (possibly >8-byte) aggregate width. */
   2694 static KitCgTypeId aa_part_scalar_type(const ABIArgPart* part) {
   2695   if (part->cls == ABI_CLASS_FP) {
   2696     if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32);
   2697     if (part->size <= 8u) return builtin_id(KIT_CG_BUILTIN_F64);
   2698     return builtin_id(KIT_CG_BUILTIN_F128);
   2699   }
   2700   switch (part->size) {
   2701     case 1u:
   2702       return builtin_id(KIT_CG_BUILTIN_I8);
   2703     case 2u:
   2704       return builtin_id(KIT_CG_BUILTIN_I16);
   2705     case 4u:
   2706       return builtin_id(KIT_CG_BUILTIN_I32);
   2707     default:
   2708       return builtin_id(KIT_CG_BUILTIN_I64);
   2709   }
   2710 }
   2711 
   2712 static u32 aa_class_vararg_stack_size(const ABIFuncInfo* abi,
   2713                                       const ABIArgInfo* ai) {
   2714   u32 total = 0;
   2715   u32 min_align = aa_vararg_stack_arg_min_align(abi);
   2716   if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
   2717   if (ai->kind == ABI_ARG_INDIRECT) return 8u;
   2718   for (u32 p = 0; p < ai->nparts; ++p) {
   2719     total = align_up_u32(total, aa_part_vararg_stack_align(abi, &ai->parts[p]));
   2720     total += aa_part_vararg_stack_size(abi, &ai->parts[p]);
   2721   }
   2722   return align_up_u32(total ? total : min_align, min_align);
   2723 }
   2724 
   2725 static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
   2726   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2727   u32 next_int = 0, next_fp = 0, stack = 0;
   2728   for (u32 i = 0; i < desc->nargs; ++i) {
   2729     ABIArgInfo tmp;
   2730     const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp);
   2731     int force_stack =
   2732         abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   2733     if (ai->kind == ABI_ARG_IGNORE) continue;
   2734     if (force_stack) {
   2735       stack = aa_vararg_stack_start(abi, stack);
   2736       stack += aa_class_vararg_stack_size(abi, ai);
   2737       continue;
   2738     }
   2739     if (ai->kind == ABI_ARG_INDIRECT) {
   2740       if (next_int < 8u)
   2741         next_int++;
   2742       else
   2743         stack += 8u;
   2744       continue;
   2745     }
   2746     for (u32 p = 0; p < ai->nparts; ++p) {
   2747       const ABIArgPart* part = &ai->parts[p];
   2748       if (part->cls == ABI_CLASS_FP) {
   2749         if (next_fp < 8u)
   2750           next_fp++;
   2751         else {
   2752           stack = align_up_u32(stack, aa_part_stack_align(abi, part));
   2753           stack += aa_part_stack_size(abi, part);
   2754         }
   2755       } else {
   2756         if (next_int < 8u)
   2757           next_int++;
   2758         else {
   2759           stack = align_up_u32(stack, aa_part_stack_align(abi, part));
   2760           stack += aa_part_stack_size(abi, part);
   2761         }
   2762       }
   2763     }
   2764   }
   2765   return align_up_u32(stack, 16u);
   2766 }
   2767 
   2768 /* Stack-argument bytes a call with `fn_type`'s fixed parameters uses. Reuses
   2769  * aa_call_stack_size by routing the declared params through it (their ABI
   2770  * classification is independent of the actual operand locations, which
   2771  * aa_call_stack_size ignores for register/stack placement). */
   2772 static u32 aa_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
   2773                                     int* variadic, u32* nparams) {
   2774   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
   2775   NativeCallDesc d;
   2776   if (variadic) *variadic = abi ? (int)abi->variadic : 0;
   2777   if (nparams) *nparams = abi ? abi->nparams : 0u;
   2778   memset(&d, 0, sizeof d);
   2779   d.fn_type = fn_type;
   2780   d.nargs = abi ? abi->nparams : 0u;
   2781   if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
   2782   return aa_call_stack_size(t, &d);
   2783 }
   2784 
   2785 /* Pure NativeTarget.call_stack_bytes: outgoing stack bytes for a full call
   2786  * descriptor (handles variadic stack args, unlike signature_stack_bytes which
   2787  * sees only the fixed params). aa_call_stack_size reads only fn_type and each
   2788  * args[i].type, so the frame-planning pre-pass can call this before emitting.
   2789  */
   2790 static u32 aa_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
   2791   return aa_call_stack_size(t, desc);
   2792 }
   2793 
/* One register-passed call argument: write `src` (or its address, when
 * `is_addr` is set) into the argument register `dst`. Collected during
 * planning and emitted as a batch so the backend can order them as a parallel
 * copy (see aa_emit_reg_arg_moves). This is simply the shared NativeArgMove
 * under a backend-local name. */
typedef NativeArgMove AAArgMove;

/* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots, so
 * 16 is the capacity of the move batch built by aa_plan_call. */
#define AA_MAX_REG_ARG_MOVES 16u
   2801 
   2802 static void aa_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   2803   if (m->is_addr)
   2804     aa_addr_of_loc(t, m->dst, m->src);
   2805   else
   2806     aa_load_part(t, m->dst, m->src, m->src_offset, m->size);
   2807 }
   2808 
   2809 /* Emit register-argument moves as a parallel copy via the shared scheduler:
   2810  * every register is read by all moves that source it before any move overwrites
   2811  * it; a true cycle is broken through a scratch. The allocator usually arranges
   2812  * a conflict-free order, but not always (notably variadic args, where it can
   2813  * leave a prior call's result in x0 even though x0 is this call's first arg
   2814  * register), so the backend must not assume a safe order. Cycle scratch is
   2815  * AA_TMP1 (x17) for int and v16 for fp — distinct from x16 (AA_TMP0), which may
   2816  * hold a stashed indirect callee. */
   2817 static void aa_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
   2818                                   u32 n) {
   2819   NativeArgShuffle s;
   2820   if (n > AA_MAX_REG_ARG_MOVES)
   2821     aa_panic(aa_of(t), "too many register arguments");
   2822   memset(&s, 0, sizeof s);
   2823   s.t = t;
   2824   s.emit_one = aa_emit_one_arg_move;
   2825   s.reg_move = aa_move;
   2826   s.scratch[NATIVE_REG_INT] = AA_TMP1;
   2827   s.scratch[NATIVE_REG_FP] = 16u;
   2828   native_arg_shuffle(&s, moves, n);
   2829 }
   2830 
   2831 static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   2832                          NativeCallPlan* plan) {
   2833   NativeCallPlanRet* rets;
   2834   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2835   memset(plan, 0, sizeof *plan);
   2836   rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
   2837   plan->callee = desc->callee;
   2838   plan->rets = rets;
   2839   plan->flags = desc->flags;
   2840   plan->has_sret = abi && abi->has_sret;
   2841   plan->is_variadic = abi && abi->variadic;
   2842   plan->stack_arg_size = aa_call_stack_size(t, desc);
   2843   native_frame_note_outgoing(&aa_of(t)->frame, plan->stack_arg_size);
   2844   /* Indirect call whose callee lives in x0..x7: the upcoming arg-load loop
   2845    * writes those same registers and would clobber the function pointer
   2846    * before blr reads it. Stash callee into AA_TMP0 (x16) up front and
   2847    * retarget the call. (AA_TMP0 is a backend scratch, never an arg reg.) */
   2848   if (plan->callee.kind == NATIVE_LOC_REG &&
   2849       (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
   2850       plan->callee.v.reg < 8u) {
   2851     NativeLoc scratch =
   2852         native_loc_reg(plan->callee.type, NATIVE_REG_INT, AA_TMP0);
   2853     aa_move(t, scratch, plan->callee);
   2854     plan->callee = scratch;
   2855   }
   2856   {
   2857     u32 next_int = 0, next_fp = 0, stack = 0, nmoves = 0;
   2858     int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
   2859     AAArgMove moves[AA_MAX_REG_ARG_MOVES];
   2860     /* Stack-passed arguments are stored inline as we walk, *before* any
   2861      * argument register is written, so a stack-arg source that the allocator
   2862      * left in an arg register (e.g. a prior call's result still in x0, consumed
   2863      * as a variadic stack arg) is read while it is still live. Stack stores
   2864      * only touch memory and the AA_TMP0/v16 scratch, never an arg-register
   2865      * source, so emitting them first cannot clobber a register-arg source.
   2866      * Register-passed arguments are collected and emitted afterward as a
   2867      * parallel copy (aa_emit_reg_arg_moves) so they likewise never overwrite a
   2868      * register another argument still needs to read. */
   2869     for (u32 i = 0; i < desc->nargs; ++i) {
   2870       ABIArgInfo tmp;
   2871       const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp);
   2872       int force_stack =
   2873           abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   2874       if (ai->kind == ABI_ARG_IGNORE) continue;
   2875       if (force_stack) {
   2876         NativeLoc tmpreg =
   2877             native_loc_reg(desc->args[i].type, NATIVE_REG_INT, AA_TMP0);
   2878         u32 n = aa_class_vararg_stack_size(abi, ai);
   2879         u32 off = 0;
   2880         stack = aa_vararg_stack_start(abi, stack);
   2881         while (off < n) {
   2882           u32 chunk = (n - off > 8u) ? 8u : (n - off);
   2883           aa_load_part(t, tmpreg, desc->args[i], off, chunk);
   2884           aa_store_outgoing_part(t, tail_call, stack + off, tmpreg, chunk);
   2885           off += chunk;
   2886         }
   2887         stack += n;
   2888         continue;
   2889       }
   2890       if (ai->kind == ABI_ARG_INDIRECT) {
   2891         if (next_int < 8u) {
   2892           AAArgMove* m = &moves[nmoves++];
   2893           m->dst = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64),
   2894                                   NATIVE_REG_INT, next_int++);
   2895           m->src = desc->args[i];
   2896           m->src_offset = 0;
   2897           m->size = 8;
   2898           m->is_addr = 1;
   2899         } else {
   2900           NativeLoc ptr = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64),
   2901                                          NATIVE_REG_INT, AA_TMP0);
   2902           aa_addr_of_loc(t, ptr, desc->args[i]);
   2903           aa_store_outgoing_part(t, tail_call, stack, ptr, 8);
   2904           stack += 8u;
   2905         }
   2906         continue;
   2907       }
   2908       for (u32 p = 0; p < ai->nparts; ++p) {
   2909         const ABIArgPart* part = &ai->parts[p];
   2910         NativeAllocClass cls =
   2911             part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2912         if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
   2913             (cls == NATIVE_REG_INT && next_int < 8u)) {
   2914           AAArgMove* m = &moves[nmoves++];
   2915           m->dst =
   2916               native_loc_reg(desc->args[i].type, cls,
   2917                              cls == NATIVE_REG_FP ? next_fp++ : next_int++);
   2918           m->src = desc->args[i];
   2919           m->src_offset = part->src_offset;
   2920           m->size = part->size;
   2921           m->is_addr = 0;
   2922         } else {
   2923           NativeLoc tmpreg = native_loc_reg(
   2924               desc->args[i].type, cls, cls == NATIVE_REG_FP ? 16u : AA_TMP0);
   2925           aa_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
   2926           stack = align_up_u32(stack, aa_part_stack_align(abi, part));
   2927           aa_store_outgoing_part(t, tail_call, stack, tmpreg, part->size);
   2928           stack += aa_part_stack_size(abi, part);
   2929         }
   2930       }
   2931     }
   2932     aa_emit_reg_arg_moves(t, moves, nmoves);
   2933     /* Set the indirect-result register (x8) *after* the argument loads: an
   2934      * argument source may have been allocated to x8, and the sret pointer load
   2935      * would otherwise clobber it before it is moved into its argument
   2936      * register. */
   2937     if (abi && abi->has_sret) {
   2938       NativeLoc x8 =
   2939           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 8u);
   2940       if (desc->flags & CG_CALL_TAIL) {
   2941         AANativeTarget* a = aa_of(t);
   2942         NativeLoc saved = native_loc_stack(x8.type, a->sret_ptr_slot, 0);
   2943         aa_load_part(t, x8, saved, 0, 8);
   2944       } else if (desc->nresults) {
   2945         aa_addr_of_loc(t, x8, desc->results[0]);
   2946       }
   2947     }
   2948   }
   2949   if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
   2950     u32 nr = 0, ni = 0, nf = 0;
   2951     for (u32 p = 0; p < abi->ret.nparts; ++p) {
   2952       const ABIArgPart* part = &abi->ret.parts[p];
   2953       NativeAllocClass cls =
   2954           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2955       KitCgTypeId pty = aa_part_scalar_type(part);
   2956       rets[nr].src =
   2957           native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
   2958       rets[nr].dst = desc->results[0];
   2959       if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
   2960         rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
   2961                                         (i32)part->src_offset);
   2962       else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
   2963         rets[nr].dst.v.stack.offset += (i32)part->src_offset;
   2964         rets[nr].dst.type = pty;
   2965       } else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) {
   2966         rets[nr].dst.v.addr.offset += (i32)part->src_offset;
   2967         rets[nr].dst.type = pty;
   2968       }
   2969       rets[nr].mem = aa_mem_for_type(t, pty, part->size);
   2970       nr++;
   2971     }
   2972     plan->nrets = nr;
   2973   } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
   2974     plan->nrets = 0;
   2975   } else if (!abi && desc->nresults) {
   2976     rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, 0);
   2977     rets[0].dst = desc->results[0];
   2978     rets[0].mem = aa_mem_for_type(t, desc->results[0].type, 0);
   2979     plan->nrets = 1;
   2980   }
   2981 }
   2982 
/* Forward declaration; aa_ret is defined further down in this file. */
static void aa_ret(NativeTarget* t);
   2984 
/* Emit the epilogue-plus-branch sequence that ends a tail call to `callee`.
 *
 * With a known (final) frame the callee-save restores, the frame restore and
 * the branch are emitted directly: `br` for a register callee, `b` with an
 * R_AARCH64_JUMP26 relocation for a global one. Otherwise AA_TAIL_WORDS NOPs
 * are reserved and an AA_PATCH_TAIL patch is recorded so the real sequence
 * can be filled in once the frame layout is final.
 *
 * NOTE(review): a register callee has to survive the callee-save restores
 * emitted before the branch. Confirm the planner never leaves it in a
 * callee-saved register. */
static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) {
  AANativeTarget* a = aa_of(t);
  if (a->frame.known_frame) {
    /* Frame is final: emit the tail epilogue (callee restores + frame restore +
     * branch) directly, exactly the words aa_apply_patches would patch in but
     * without the reserved NOP padding. */
    AAFrameLayout L =
      aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
    u32 words[AA_TAIL_WORDS];
    u32 n = 0;
    aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
    aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, &L);
    /* The branch still needs a word of its own, so a completely full buffer
     * is already too large. */
    if (n >= AA_TAIL_WORDS) aa_panic(a, "tail epilogue too large");
    for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
    if (callee.kind == NATIVE_LOC_REG) {
      aa_emit32(t->mc, aa64_br(loc_reg(callee)));
    } else if (callee.kind == NATIVE_LOC_GLOBAL) {
      /* Emit `b` with a zero displacement and relocate it against the symbol. */
      u32 pos = t->mc->pos(t->mc);
      aa_emit32(t->mc, aa64_b(0));
      t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, R_AARCH64_JUMP26,
                           callee.v.global.sym, callee.v.global.addend, 0, 0);
    } else {
      aa_panic(a, "unsupported tail target");
    }
    return;
  }
  /* Single-pass: reserve a worst-case region and record a patch; the callee
   * restores and frame restore depend on the not-yet-final frame layout. */
  AAPatch* p = aa_patch_alloc(a);
  p->kind = AA_PATCH_TAIL;
  p->pos = t->mc->pos(t->mc);
  p->u.callee = callee;
  /* 0xd503201f is the AArch64 NOP encoding. */
  for (u32 i = 0; i < AA_TAIL_WORDS; ++i) aa_emit32(t->mc, 0xd503201fu);
  if (callee.kind == NATIVE_LOC_GLOBAL) {
    /* The relocation targets the last reserved word, which is where the
     * branch is expected to sit once the patch is applied. */
    t->mc->emit_reloc_at(t->mc, t->mc->section_id,
                         p->pos + (AA_TAIL_WORDS - 1u) * 4u, R_AARCH64_JUMP26,
                         callee.v.global.sym, callee.v.global.addend, 0, 0);
  }
}
   3024 
   3025 static void aa_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   3026   int is_tail = (plan->flags & CG_CALL_TAIL) != 0;
   3027   if (is_tail) {
   3028     if (plan->callee.kind != NATIVE_LOC_GLOBAL &&
   3029         plan->callee.kind != NATIVE_LOC_REG)
   3030       aa_panic(aa_of(t), "unsupported tail target");
   3031     aa_emit_tail_site(t, plan->callee);
   3032     return;
   3033   }
   3034   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
   3035     aa_emit32(t->mc, aa64_bl(0));
   3036     t->mc->emit_reloc_at(t->mc, t->mc->section_id, t->mc->pos(t->mc) - 4u,
   3037                          R_AARCH64_CALL26, plan->callee.v.global.sym,
   3038                          plan->callee.v.global.addend, 0, 0);
   3039     return;
   3040   }
   3041   if (plan->callee.kind == NATIVE_LOC_REG) {
   3042     aa_emit32(t->mc, aa64_blr(loc_reg(plan->callee)));
   3043     return;
   3044   }
   3045   aa_panic(aa_of(t), "unsupported call target");
   3046 }
   3047 
/* Plan how the function described by `fd` returns `value`.
 *
 * - No value: *out_rets is NULL and *out_nrets is 0.
 * - Indirect (sret) return: the copy into the caller-provided buffer is
 *   emitted here, and no return moves are reported.
 * - Direct return: one move per ABI part, from the part's offset inside
 *   `value` into the next integer/FP return register of its class.
 * - Otherwise (no ABI info, or a return kind not handled above): a single
 *   move of `value` into integer register 0.
 * The move array has room for four entries and comes from the
 * translation-unit arena. */
static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
                        const NativeLoc* value,
                        NativeCallPlanRet** out_rets, u32* out_nrets) {
  const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
  NativeCallPlanRet* rets = NULL;
  u32 nr = 0;
  if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
  if (value && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
    AANativeTarget* a = aa_of(t);
    /* Hold the sret destination pointer in x8, not AA_TMP1: aa_copy_bytes
     * materializes out-of-range source/dest frame offsets into AA_TMP1, which
     * would clobber the destination base mid-copy (only triggered once a frame
     * is large enough that the source offset escapes stur's signed-9 range). */
    NativeLoc dstp =
        native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_X8);
    NativeLoc saved = native_loc_stack(dstp.type, a->sret_ptr_slot, 0);
    NativeAddr dst_addr, src_addr;
    AggregateAccess access;
    /* Reload the sret pointer this function saved in sret_ptr_slot. */
    aa_load_part(t, dstp, saved, 0, 8);
    memset(&dst_addr, 0, sizeof dst_addr);
    dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
    dst_addr.base.reg = AA_X8;
    dst_addr.base_type = value->type;
    src_addr = aa_loc_addr(a, *value, 0);
    src_addr.base_type = value->type;
    /* Copy the whole value, sized and aligned by its own type. */
    memset(&access, 0, sizeof access);
    access.type = value->type;
    access.size = (u32)cg_type_size(t->c, value->type);
    access.align = type_align32(t, value->type);
    aa_copy_bytes(t, dst_addr, src_addr, access);
    *out_rets = NULL;
    *out_nrets = 0;
    return;
  }
  if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) {
    /* ni/nf count the integer and FP return registers handed out so far. */
    u32 ni = 0, nf = 0;
    for (u32 p = 0; p < abi->ret.nparts; ++p) {
      const ABIArgPart* part = &abi->ret.parts[p];
      NativeAllocClass cls =
          part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
      KitCgTypeId pty = aa_part_scalar_type(part);
      rets[nr].src = *value;
      /* Memory sources are narrowed to this part: offset by src_offset and
       * retyped to the part's scalar type. Other source kinds are used as
       * they are. */
      if (rets[nr].src.kind == NATIVE_LOC_FRAME)
        rets[nr].src =
            native_loc_stack(pty, value->v.frame, (i32)part->src_offset);
      else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
        rets[nr].src.v.stack.offset += (i32)part->src_offset;
        rets[nr].src.type = pty;
      } else if (rets[nr].src.kind == NATIVE_LOC_ADDR) {
        rets[nr].src.v.addr.offset += (i32)part->src_offset;
        rets[nr].src.type = pty;
      }
      rets[nr].dst =
          native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
      rets[nr].mem = aa_mem_for_type(t, pty, part->size);
      nr++;
    }
  } else if (value) {
    /* Fallback: return the value whole in integer register 0. */
    rets[0].src = *value;
    rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, 0);
    rets[0].mem = aa_mem_for_type(t, value->type, 0);
    nr = 1;
  }
  *out_rets = rets;
  *out_nrets = nr;
}
   3114 
   3115 static void aa_ret(NativeTarget* t) {
   3116   AANativeTarget* a = aa_of(t);
   3117   aa_jump(t, a->epilogue_label);
   3118 }
   3119 
   3120 static u32 aa_bit_storage_reg_bits(u32 storage_bytes) {
   3121   return storage_bytes == 8u ? 64u : 32u;
   3122 }
   3123 
   3124 static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
   3125   u32 bits = sf ? 64u : 32u;
   3126   if (!sh) {
   3127     if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
   3128     return;
   3129   }
   3130   aa_emit32(t->mc, aa_ubfm(sf, rd, rn, bits - sh, bits - 1u - sh));
   3131 }
   3132 
   3133 static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
   3134   if (!sh) {
   3135     if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
   3136     return;
   3137   }
   3138   aa_emit32(t->mc, aa_ubfm(sf, rd, rn, sh, sf ? 63u : 31u));
   3139 }
   3140 
   3141 static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) {
   3142   if (!sh) {
   3143     if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
   3144     return;
   3145   }
   3146   aa_emit32(t->mc, aa_sbfm(sf, rd, rn, sh, sf ? 63u : 31u));
   3147 }
   3148 
   3149 static void aa_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   3150                              BitFieldAccess bf) {
   3151   u32 storage = bf.storage.size ? bf.storage.size : 4u;
   3152   u32 bits = aa_bit_storage_reg_bits(storage);
   3153   u32 width = bf.bit_width ? bf.bit_width : 1u;
   3154   u32 sf = bits == 64u;
   3155   NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset);
   3156   NativeLoc tmp = dst;
   3157   tmp.type = bf.storage.type ? bf.storage.type : dst.type;
   3158   aa_load_native(t, tmp, saddr, bf.storage);
   3159   aa_lsl_imm(t, sf, loc_reg(dst), loc_reg(dst),
   3160              bits - (u32)bf.bit_offset - width);
   3161   if (bf.signed_)
   3162     aa_asr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width);
   3163   else
   3164     aa_lsr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width);
   3165 }
   3166 
/* Store the low `bit_width` bits of register `src` into a bit-field as a
 * plain (non-atomic) read-modify-write of its storage unit:
 *   AA_TMP0 = storage unit with the field's bits cleared,
 *   AA_TMP1 = src's low bits shifted to the field position,
 *   storage unit = AA_TMP0 | AA_TMP1.
 * Unset storage size and bit width default to 4 bytes and 1 bit. Both backend
 * scratch registers (AA_TMP0, AA_TMP1) are overwritten. */
static void aa_bitfield_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
                              BitFieldAccess bf) {
  u32 storage = bf.storage.size ? bf.storage.size : 4u;
  u32 bits = aa_bit_storage_reg_bits(storage);
  u32 width = bf.bit_width ? bf.bit_width : 1u;
  u32 sf = bits == 64u;
  /* `ones` has the low `width` bits set; the >= 64 case avoids an
   * out-of-range shift. */
  u64 ones = width >= 64u ? ~(u64)0 : ((1ull << width) - 1ull);
  u64 field_mask = ones << bf.bit_offset;
  NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset);
  NativeLoc word =
      aa_tmp_loc(bf.storage.type ? bf.storage.type : src.type, AA_TMP0);
  aa_load_native(t, word, saddr, bf.storage);
  /* Clear the field inside the loaded unit.
   * NOTE(review): for a 32-bit unit the inverted 64-bit mask still has its
   * upper 32 bits set; presumably aa_emit_load_imm truncates when sf == 0.
   * Confirm. */
  aa_emit_load_imm(t->mc, sf, AA_TMP1, (i64)~field_mask);
  aa_emit32(t->mc, aa64_and(sf, AA_TMP0, AA_TMP0, AA_TMP1));
  /* UBFM with immr = 0, imms = width - 1 keeps only src's low `width` bits. */
  aa_emit32(t->mc, aa_ubfm(sf, AA_TMP1, loc_reg(src), 0, width - 1u));
  aa_lsl_imm(t, sf, AA_TMP1, AA_TMP1, bf.bit_offset);
  aa_emit32(t->mc, aa64_orr(sf, AA_TMP0, AA_TMP0, AA_TMP1));
  aa_store_native(t, saddr, word, bf.storage);
}
   3186 
/* Forward declaration; the definition lives elsewhere in this translation
 * unit. */
static void aa_trap(NativeTarget* t);
   3188 
   3189 static int aa_order_acquire(KitCgMemOrder order) {
   3190   return order == KIT_CG_MO_CONSUME || order == KIT_CG_MO_ACQUIRE ||
   3191          order == KIT_CG_MO_ACQ_REL || order == KIT_CG_MO_SEQ_CST;
   3192 }
   3193 
   3194 static int aa_order_release(KitCgMemOrder order) {
   3195   return order == KIT_CG_MO_RELEASE || order == KIT_CG_MO_ACQ_REL ||
   3196          order == KIT_CG_MO_SEQ_CST;
   3197 }
   3198 
   3199 static NativeLoc aa_i64_reg_loc(u32 reg) {
   3200   return native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
   3201 }
   3202 
   3203 static void aa_atomic_addr_reg(NativeTarget* t, NativeAddr addr, u32 reg) {
   3204   NativeLoc dst = aa_i64_reg_loc(reg);
   3205   t->load_addr(t, dst, addr);
   3206 }
   3207 
   3208 static u32 aa_saved_tmp_pick(u32 a, u32 b, u32 c) {
   3209   static const u32 regs[] = {11u, 12u, 13u, 14u, 15u};
   3210   for (u32 i = 0; i < sizeof regs / sizeof regs[0]; ++i) {
   3211     if (regs[i] != a && regs[i] != b && regs[i] != c) return regs[i];
   3212   }
   3213   return 15u;
   3214 }
   3215 
   3216 static void aa_saved_tmp_spill(AANativeTarget* a, u32 reg) {
   3217   NativeFrameSlotDesc sd;
   3218   NativeAddr addr;
   3219   MemAccess mem;
   3220   memset(&sd, 0, sizeof sd);
   3221   if (a->saved_tmp_slot == NATIVE_FRAME_SLOT_NONE) {
   3222     sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   3223     sd.size = 8;
   3224     sd.align = 8;
   3225     sd.kind = NATIVE_FRAME_SLOT_SPILL;
   3226     a->saved_tmp_slot = a->base.frame_slot(&a->base, &sd);
   3227   }
   3228   memset(&addr, 0, sizeof addr);
   3229   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3230   addr.base.frame = a->saved_tmp_slot;
   3231   addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
   3232   mem = aa_mem_for_type(&a->base, addr.base_type, 8);
   3233   aa_store_native(&a->base, addr, aa_i64_reg_loc(reg), mem);
   3234 }
   3235 
   3236 static void aa_saved_tmp_restore(AANativeTarget* a, u32 reg) {
   3237   NativeAddr addr;
   3238   MemAccess mem;
   3239   memset(&addr, 0, sizeof addr);
   3240   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3241   addr.base.frame = a->saved_tmp_slot;
   3242   addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
   3243   mem = aa_mem_for_type(&a->base, addr.base_type, 8);
   3244   aa_load_native(&a->base, aa_i64_reg_loc(reg), addr, mem);
   3245 }
   3246 
   3247 static void aa_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   3248                            MemAccess mem, KitCgMemOrder order) {
   3249   u32 base = AA_TMP0;
   3250   u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type));
   3251   aa_atomic_addr_reg(t, addr, base);
   3252   aa_emit32(t->mc, aa_order_acquire(order)
   3253                        ? aa_ldar(sz, loc_reg(dst), base)
   3254                        : aa_ldr_uimm(sz, loc_reg(dst), base, 0));
   3255   if (order == KIT_CG_MO_SEQ_CST)
   3256     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3257 }
   3258 
   3259 static void aa_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
   3260                             MemAccess mem, KitCgMemOrder order) {
   3261   u32 base = AA_TMP0;
   3262   u32 sz = size_idx(mem.size ? mem.size : type_size32(t, src.type));
   3263   if (order == KIT_CG_MO_SEQ_CST)
   3264     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3265   aa_atomic_addr_reg(t, addr, base);
   3266   aa_emit32(t->mc, aa_order_release(order)
   3267                        ? aa_stlr(sz, loc_reg(src), base)
   3268                        : aa_str_uimm(sz, loc_reg(src), base, 0));
   3269   if (order == KIT_CG_MO_SEQ_CST)
   3270     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3271 }
   3272 
/* Atomic read-modify-write as a load-exclusive/store-exclusive retry loop.
 *
 * `dst` receives the value that was in memory before the update, and the
 * value `op(old, val)` is written back. Register roles:
 *   AA_TMP0 - the address, materialized once before the loop
 *   AA_TMP1 - the new value to store
 *   status  - store-exclusive status flag; a register from x11..x15 that is
 *             not dst, val or the base, spilled before and restored after
 *             the loop.
 * Acquire orders use LDAXR, release orders STLXR; seq_cst is additionally
 * bracketed by `dmb ish`. An unknown `op` is reported through aa_panic.
 *
 * NOTE(review): `dst` is overwritten by every exclusive load while `val` is
 * re-read on each iteration, so the two presumably must not share a register.
 * Confirm the allocator guarantees this. */
static void aa_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
                          NativeAddr addr, NativeLoc val, MemAccess mem,
                          KitCgMemOrder order) {
  AANativeTarget* a = aa_of(t);
  u32 base = AA_TMP0;
  u32 next_reg = AA_TMP1;
  u32 status = aa_saved_tmp_pick(loc_reg(dst), loc_reg(val), base);
  NativeLoc next = aa_tmp_loc(dst.type, next_reg);
  MCLabel retry = t->mc->label_new(t->mc);
  u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type));
  if (order == KIT_CG_MO_SEQ_CST)
    aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
  /* Preserve whatever lives in the status register across the loop. */
  aa_saved_tmp_spill(a, status);
  aa_atomic_addr_reg(t, addr, base);
  t->mc->label_place(t->mc, retry);
  /* Exclusive load of the current memory value into dst. */
  aa_emit32(t->mc, aa_order_acquire(order) ? aa_ldaxr(sz, loc_reg(dst), base)
                                           : aa_ldxr(sz, loc_reg(dst), base));
  /* Compute the replacement value into `next` (AA_TMP1). */
  switch (op) {
    case KIT_CG_ATOMIC_XCHG:
      aa_move(t, next, val);
      break;
    case KIT_CG_ATOMIC_ADD:
      aa_binop(t, BO_IADD, next, dst, val);
      break;
    case KIT_CG_ATOMIC_SUB:
      aa_binop(t, BO_ISUB, next, dst, val);
      break;
    case KIT_CG_ATOMIC_AND:
      aa_binop(t, BO_AND, next, dst, val);
      break;
    case KIT_CG_ATOMIC_OR:
      aa_binop(t, BO_OR, next, dst, val);
      break;
    case KIT_CG_ATOMIC_XOR:
      aa_binop(t, BO_XOR, next, dst, val);
      break;
    case KIT_CG_ATOMIC_NAND:
      /* NAND = ~(old & val). */
      aa_binop(t, BO_AND, next, dst, val);
      aa_unop(t, UO_BNOT, next, next);
      break;
    default:
      aa_panic(a, "unsupported atomic rmw op");
  }
  /* Exclusive store; `status` becomes non-zero when exclusivity was lost. */
  aa_emit32(t->mc, aa_order_release(order)
                       ? aa_stlxr(sz, status, next_reg, base)
                       : aa_stxr(sz, status, next_reg, base));
  /* Retry while the store failed: CBNZ on the 32-bit status register, with
   * the branch target supplied by the label reference recorded right after. */
  aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0));
  t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0);
  aa_saved_tmp_restore(a, status);
  if (order == KIT_CG_MO_SEQ_CST)
    aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
}
   3325 
   3326 static void aa_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
   3327                           NativeAddr addr, NativeLoc expected,
   3328                           NativeLoc desired, MemAccess mem,
   3329                           KitCgMemOrder success, KitCgMemOrder failure) {
   3330   u32 base = AA_TMP0;
   3331   u32 status = AA_TMP1;
   3332   u32 sz = size_idx(mem.size ? mem.size : type_size32(t, prior.type));
   3333   u32 sf = sz == 3u;
   3334   int acquire = aa_order_acquire(success) || aa_order_acquire(failure);
   3335   int release = aa_order_release(success);
   3336   MCLabel retry = t->mc->label_new(t->mc);
   3337   MCLabel fail = t->mc->label_new(t->mc);
   3338   MCLabel done = t->mc->label_new(t->mc);
   3339   if (success == KIT_CG_MO_SEQ_CST || failure == KIT_CG_MO_SEQ_CST)
   3340     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3341   aa_atomic_addr_reg(t, addr, base);
   3342   t->mc->label_place(t->mc, retry);
   3343   aa_emit32(t->mc, acquire ? aa_ldaxr(sz, loc_reg(prior), base)
   3344                            : aa_ldxr(sz, loc_reg(prior), base));
   3345   aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(prior), loc_reg(expected)));
   3346   aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_NE)}));
   3347   t->mc->emit_label_ref(t->mc, fail, R_AARCH64_CONDBR19, 4, 0);
   3348   aa_emit32(t->mc, release ? aa_stlxr(sz, status, loc_reg(desired), base)
   3349                            : aa_stxr(sz, status, loc_reg(desired), base));
   3350   aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0));
   3351   t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0);
   3352   aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 1);
   3353   aa_jump(t, done);
   3354   t->mc->label_place(t->mc, fail);
   3355   aa_emit32(t->mc, aa64_clrex(AA64_BARRIER_OPT_SY));
   3356   aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 0);
   3357   t->mc->label_place(t->mc, done);
   3358   if (success == KIT_CG_MO_SEQ_CST || failure == KIT_CG_MO_SEQ_CST)
   3359     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3360 }
   3361 
   3362 static void aa_fence(NativeTarget* t, KitCgMemOrder order) {
   3363   if (order != KIT_CG_MO_RELAXED)
   3364     aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
   3365 }
   3366 
   3367 /* Map a KitCgBarrierScope (passed as an immediate arg to DMB/DSB) onto an
   3368  * AArch64 barrier domain option. Defaults to full-system (SY) when the scope
   3369  * is absent or unrecognized. */
   3370 static u32 aa_barrier_opt(const NativeLoc* args, u32 narg) {
   3371   if (narg < 1u || args[0].kind != NATIVE_LOC_IMM) return AA64_BARRIER_OPT_SY;
   3372   switch ((KitCgBarrierScope)args[0].v.imm) {
   3373     case KIT_CG_BARRIER_FULL:
   3374       return AA64_BARRIER_OPT_SY;
   3375     case KIT_CG_BARRIER_INNER:
   3376       return AA64_BARRIER_OPT_ISH;
   3377     case KIT_CG_BARRIER_INNER_STORE:
   3378       return AA64_BARRIER_OPT_ISHST;
   3379     case KIT_CG_BARRIER_OUTER:
   3380       return AA64_BARRIER_OPT_OSH;
   3381     case KIT_CG_BARRIER_OUTER_STORE:
   3382       return AA64_BARRIER_OPT_OSHST;
   3383     case KIT_CG_BARRIER_NON_SHARE:
   3384       return AA64_BARRIER_OPT_NSH;
   3385   }
   3386   return AA64_BARRIER_OPT_SY;
   3387 }
   3388 
   3389 static void aa_intrinsic(NativeTarget* t, IntrinKind kind,
   3390                          const NativeLoc* dsts, u32 ndst, const NativeLoc* args,
   3391                          u32 narg) {
   3392   AggregateAccess access;
   3393   NativeAddr dst_addr;
   3394   NativeAddr src_addr;
   3395   memset(&access, 0, sizeof access);
   3396   memset(&dst_addr, 0, sizeof dst_addr);
   3397   memset(&src_addr, 0, sizeof src_addr);
   3398   switch (kind) {
   3399     case INTRIN_NONE:
   3400       if (ndst == 1u && narg == 3u && native_loc_is_fp(dsts[0])) {
   3401         u32 d = type_size32(t, dsts[0].type) == 8u;
   3402         aa_emit32(t->mc, aa_fp_bin(0x000800u, d, loc_reg(dsts[0]),
   3403                                    loc_reg(args[0]), loc_reg(args[1])));
   3404         aa_emit32(t->mc, aa_fp_bin(0x002800u, d, loc_reg(dsts[0]),
   3405                                    loc_reg(dsts[0]), loc_reg(args[2])));
   3406         return;
   3407       }
   3408       break;
   3409     case INTRIN_CLZ:
   3410       if (ndst == 1u && narg == 1u) {
   3411         aa_emit32(t->mc, aa_clz(loc_is_64(t, args[0]), loc_reg(dsts[0]),
   3412                                 loc_reg(args[0])));
   3413         return;
   3414       }
   3415       break;
   3416     case INTRIN_CTZ:
   3417       if (ndst == 1u && narg == 1u) {
   3418         u32 sf = loc_is_64(t, args[0]);
   3419         aa_emit32(t->mc, aa_rbit(sf, loc_reg(dsts[0]), loc_reg(args[0])));
   3420         aa_emit32(t->mc, aa_clz(sf, loc_reg(dsts[0]), loc_reg(dsts[0])));
   3421         return;
   3422       }
   3423       break;
   3424     case INTRIN_POPCOUNT:
   3425       if (ndst == 1u && narg == 1u) {
   3426         u32 sf = loc_is_64(t, args[0]);
   3427         u32 rd = loc_reg(dsts[0]);
   3428         u32 rn = loc_reg(args[0]);
   3429         MCLabel loop = t->mc->label_new(t->mc);
   3430         MCLabel done = t->mc->label_new(t->mc);
   3431         aa_emit_load_imm(t->mc, sf, rd, 0);
   3432         aa_emit32(t->mc, aa64_mov_reg(sf, AA_TMP0, rn));
   3433         t->mc->label_place(t->mc, loop);
   3434         aa_emit32(t->mc, aa64_cbz(sf, AA_TMP0, 0));
   3435         t->mc->emit_label_ref(t->mc, done, R_AARCH64_CONDBR19, 4, 0);
   3436         aa_emit_load_imm(t->mc, sf, AA_TMP1, 1);
   3437         aa_emit32(t->mc, aa64_and(sf, AA_TMP1, AA_TMP0, AA_TMP1));
   3438         aa_emit32(t->mc, aa64_add(sf, rd, rd, AA_TMP1));
   3439         aa_emit_load_imm(t->mc, sf, AA_TMP1, 1);
   3440         aa_emit32(t->mc, aa64_lsrv(sf, AA_TMP0, AA_TMP0, AA_TMP1));
   3441         aa_jump(t, loop);
   3442         t->mc->label_place(t->mc, done);
   3443         return;
   3444       }
   3445       break;
   3446     case INTRIN_BSWAP:
   3447       if (ndst == 1u && narg == 1u) {
   3448         u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
   3449         switch (width) {
   3450           case 2: {
   3451             u32 sf = 0;
   3452             aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
   3453             aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
   3454             aa_emit32(t->mc, aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]),
   3455                                        AA_TMP0));
   3456             return;
   3457           }
   3458           case 4: {
   3459             u32 sf = 0;
   3460             aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
   3461             return;
   3462           }
   3463           case 8: {
   3464             u32 sf = 1;
   3465             aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
   3466             return;
   3467           }
   3468           default:
   3469             break;
   3470         }
   3471       }
   3472       break;
   3473     case INTRIN_SADD_OVERFLOW:
   3474     case INTRIN_UADD_OVERFLOW:
   3475     case INTRIN_SSUB_OVERFLOW:
   3476     case INTRIN_USUB_OVERFLOW:
   3477       if (ndst == 2u && narg == 2u) {
   3478         u32 sf = loc_is_64(t, dsts[0]);
   3479         u32 rd = loc_reg(dsts[0]);
   3480         if (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW)
   3481           aa_emit32(t->mc,
   3482                     aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
   3483                                                       .op = 0,
   3484                                                       .S = 1,
   3485                                                       .Rm = loc_reg(args[1]),
   3486                                                       .Rn = loc_reg(args[0]),
   3487                                                       .Rd = rd}));
   3488         else
   3489           aa_emit32(t->mc,
   3490                     aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
   3491                                                       .op = 1,
   3492                                                       .S = 1,
   3493                                                       .Rm = loc_reg(args[1]),
   3494                                                       .Rn = loc_reg(args[0]),
   3495                                                       .Rd = rd}));
   3496         aa_emit32(t->mc,
   3497                   aa_cset(loc_is_64(t, dsts[1]), loc_reg(dsts[1]),
   3498                           (kind == INTRIN_SADD_OVERFLOW ||
   3499                            kind == INTRIN_SSUB_OVERFLOW)
   3500                               ? 0x6u
   3501                               : (kind == INTRIN_UADD_OVERFLOW ? 0x2u : 0x3u)));
   3502         return;
   3503       }
   3504       break;
   3505     case INTRIN_SMUL_OVERFLOW:
   3506     case INTRIN_UMUL_OVERFLOW:
   3507       if (ndst == 2u && narg == 2u) {
   3508         u32 sf = loc_is_64(t, dsts[0]);
   3509         if (sf) {
   3510           if (kind == INTRIN_SMUL_OVERFLOW) {
   3511             aa_emit32(t->mc,
   3512                       aa_smulh(AA_TMP0, loc_reg(args[0]), loc_reg(args[1])));
   3513             aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]),
   3514                                       loc_reg(args[1])));
   3515             aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 63, 63));
   3516             aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1));
   3517             aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
   3518           } else {
   3519             aa_emit32(t->mc,
   3520                       aa_umulh(AA_TMP0, loc_reg(args[0]), loc_reg(args[1])));
   3521             aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]),
   3522                                       loc_reg(args[1])));
   3523             aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA64_ZR));
   3524             aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
   3525           }
   3526         } else if (kind == INTRIN_SMUL_OVERFLOW) {
   3527           aa_emit32(t->mc, aa_smaddl(AA_TMP0, loc_reg(args[0]),
   3528                                      loc_reg(args[1]), AA64_ZR));
   3529           aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0));
   3530           aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 0, 31));
   3531           aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1));
   3532           aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
   3533         } else {
   3534           aa_emit32(t->mc, aa_umaddl(AA_TMP0, loc_reg(args[0]),
   3535                                      loc_reg(args[1]), AA64_ZR));
   3536           aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0));
   3537           aa_emit_load_imm(t->mc, 1, AA_TMP1, 32);
   3538           aa_emit32(t->mc, aa64_lsrv(1, AA_TMP1, AA_TMP0, AA_TMP1));
   3539           aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP1, AA64_ZR));
   3540           aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE)));
   3541         }
   3542         return;
   3543       }
   3544       break;
   3545     case INTRIN_MEMMOVE: {
   3546       MCLabel forward = t->mc->label_new(t->mc);
   3547       MCLabel done = t->mc->label_new(t->mc);
   3548       if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
   3549           args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
   3550         aa_panic(aa_of(t), "unsupported memory intrinsic operands");
   3551       if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
   3552         aa_panic(aa_of(t), "unsupported memory intrinsic size");
   3553       access.size = (u32)args[2].v.imm;
   3554       access.align = 1u;
   3555       dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   3556       dst_addr.base.reg = args[0].v.reg;
   3557       src_addr.base_kind = NATIVE_ADDR_BASE_REG;
   3558       src_addr.base.reg = args[1].v.reg;
   3559       aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, args[0].v.reg, args[1].v.reg));
   3560       aa_emit32(t->mc,
   3561                 aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_LT_U)}));
   3562       t->mc->emit_label_ref(t->mc, forward, R_AARCH64_CONDBR19, 4, 0);
   3563       aa_copy_bytes_dir(t, dst_addr, src_addr, access, 1);
   3564       aa_jump(t, done);
   3565       t->mc->label_place(t->mc, forward);
   3566       aa_copy_bytes_dir(t, dst_addr, src_addr, access, 0);
   3567       t->mc->label_place(t->mc, done);
   3568       return;
   3569     }
   3570     case INTRIN_EXPECT:
   3571     case INTRIN_ASSUME_ALIGNED:
   3572       if (ndst == 1u && narg >= 1u) {
   3573         if (args[0].kind == NATIVE_LOC_IMM)
   3574           aa_load_imm_native(t, dsts[0], args[0].v.imm);
   3575         else
   3576           aa_move(t, dsts[0], args[0]);
   3577       }
   3578       return;
   3579     case INTRIN_PREFETCH:
   3580       return;
   3581     case INTRIN_TRAP:
   3582       aa_trap(t);
   3583       return;
   3584     case INTRIN_SYSCALL:
   3585       if (ndst == 1u && narg >= 1u && narg <= 7u) {
   3586         static const u32 syscall_regs[7] = {AA_X8, 0u, 1u, 2u, 3u, 4u, 5u};
   3587         AAArgMove moves[7];
   3588         for (u32 i = 0; i < narg; ++i) {
   3589           AAArgMove* m = &moves[i];
   3590           memset(m, 0, sizeof *m);
   3591           m->dst =
   3592               native_loc_reg(dsts[0].type, NATIVE_REG_INT, syscall_regs[i]);
   3593           m->src = args[i];
   3594           m->size = t->c->target.ptr_size;
   3595         }
   3596         aa_emit_reg_arg_moves(t, moves, narg);
   3597         aa_emit32(t->mc, aa64_svc(0));
   3598         aa_move(t, dsts[0], native_loc_reg(dsts[0].type, NATIVE_REG_INT, 0));
   3599       }
   3600       return;
   3601     case INTRIN_CPU_NOP:
   3602       aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_NOP));
   3603       return;
   3604     case INTRIN_CPU_YIELD:
   3605       aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_YIELD));
   3606       return;
   3607     case INTRIN_WFI:
   3608       aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_WFI));
   3609       return;
   3610     case INTRIN_WFE:
   3611       aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_WFE));
   3612       return;
   3613     case INTRIN_SEV:
   3614       aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_SEV));
   3615       return;
   3616     case INTRIN_ISB:
   3617       aa_emit32(t->mc, aa64_isb(AA64_BARRIER_OPT_SY));
   3618       return;
   3619     case INTRIN_DMB:
   3620       aa_emit32(t->mc, aa64_dmb(aa_barrier_opt(args, narg)));
   3621       return;
   3622     case INTRIN_DSB:
   3623       aa_emit32(t->mc, aa64_dsb(aa_barrier_opt(args, narg)));
   3624       return;
   3625     case INTRIN_IRQ_SAVE:
   3626       /* Read the interrupt-mask state, then mask D,A,I,F. */
   3627       if (ndst == 1u) {
   3628         aa_emit32(t->mc, aa64_mrs_daif(loc_reg(dsts[0])));
   3629         aa_emit32(t->mc, aa64_msr_daifset(AA64_DAIF_ALL));
   3630       }
   3631       return;
   3632     case INTRIN_IRQ_RESTORE:
   3633       if (narg == 1u) aa_emit32(t->mc, aa64_msr_daif(loc_reg(args[0])));
   3634       return;
   3635     case INTRIN_IRQ_DISABLE:
   3636       aa_emit32(t->mc, aa64_msr_daifset(AA64_DAIF_ALL));
   3637       return;
   3638     case INTRIN_IRQ_ENABLE:
   3639       aa_emit32(t->mc, aa64_msr_daifclr(AA64_DAIF_ALL));
   3640       return;
   3641     case INTRIN_FRAME_ADDRESS:
   3642     case INTRIN_RETURN_ADDRESS:
   3643       /* Walk the AAPCS64 frame-record chain. Every kit prologue stores
   3644        * {x29, x30} and anchors x29 at the record: [x29] = caller's x29,
   3645        * [x29 + 8] = saved x30 (this frame's return address). The level is a
   3646        * compile-time constant, so the walk unrolls to `level` dependent loads. */
   3647       if (ndst == 1u) {
   3648         u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM)
   3649                         ? (u32)args[0].v.imm
   3650                         : 0u;
   3651         u32 rd = loc_reg(dsts[0]);
   3652         aa_emit32(t->mc, aa64_mov_reg(1, rd, AA_FP));
   3653         for (u32 i = 0; i < level; ++i)
   3654           aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 0)); /* rd = *(rd) */
   3655         if (kind == INTRIN_RETURN_ADDRESS)
   3656           aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 1)); /* rd = *(rd + 8) */
   3657       }
   3658       return;
   3659     default:
   3660       aa_panic(aa_of(t), "unsupported compiler intrinsic");
   3661   }
   3662 }
   3663 
   3664 static void aa_trap(NativeTarget* t) { aa_emit32(t->mc, aa64_brk(0)); }
   3665 
   3666 /* file_scope_asm + finalize are shared (cg/native_asm.h). */
   3667 
   3668 static int aa_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
   3669                                   u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
   3670   mask[0] = mask[1] = mask[2] = 0;
   3671   if ((NativeMachineOpKind)op->kind == NATIVE_MOP_TLS_ADDR) {
   3672     /* ELF Local-Exec materializes the address using only the destination
   3673      * register (mrs tpidr_el0 + add/add into rd) — no extra clobbers. The
   3674      * Mach-O TLV sequence loads the descriptor into x0 and calls the resolver
   3675      * thunk through x16, clobbering x0/x16/x17 and the link register; the JIT
   3676      * relaxation of that same sequence keeps the x0/x16/x17 footprint. Model
   3677      * the descriptor-model clobbers so a value live across a TLS access is not
   3678      * left in one of these registers. */
   3679     if (!obj_format_tls_via_descriptor(t->c)) return 0;
   3680     mask[NATIVE_REG_INT] =
   3681         (1u << 0) | (1u << 16) | (1u << 17) | (1u << AA_LR);
   3682     return 1;
   3683   }
   3684   if ((NativeMachineOpKind)op->kind != NATIVE_MOP_INTRINSIC ||
   3685       (IntrinKind)op->intrin != INTRIN_SYSCALL)
   3686     return 0;
   3687   mask[NATIVE_REG_INT] = (1u << 0) | (1u << 1) | (1u << 2) | (1u << 3) |
   3688                          (1u << 4) | (1u << 5) | (1u << AA_X8);
   3689   return 1;
   3690 }
   3691 
   3692 static void aa_set_loc(NativeTarget* t, SrcLoc loc) {
   3693   AANativeTarget* a = aa_of(t);
   3694   a->loc = loc;
   3695   if (t->mc && t->mc->set_loc) t->mc->set_loc(t->mc, loc);
   3696 }
   3697 
   3698 static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
   3699                                  NativeLoc dst);
   3700 
   3701 /* Caller-saved allocables come first so the allocator prefers them (lower
   3702  * spill_cost); callee-saved x19..x28 / v8..v15 are appended and only chosen
   3703  * under register pressure, after which the prologue saves/restores them. */
   3704 static const Reg aa_int_allocable[] = {8u,  11u, 12u, 13u, 14u, 15u, 19u, 20u,
   3705                                        21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u};
   3706 /* Three int scratch registers, not two: a 3-operand op (e.g. `binop dst, a, b`
   3707  * or `store [base+index], value`) whose dst/sources all spill needs three
   3708  * distinct scratch regs at emit time — the IR spill-rewrite round-robins
   3709  * operands across this pool and the native emitter materializes each into one.
   3710  * With only two, an immediate operand of an all-spilled binop had nowhere to
   3711  * land. x9/x10/x11 are all caller-saved temporaries reserved out of the
   3712  * allocable set below. */
   3713 static const Reg aa_int_scratch[] = {9u, 10u, 11u};
   3714 static const Reg aa_fp_allocable[] = {18u, 19u, 8u,  9u,  10u,
   3715                                       11u, 12u, 13u, 14u, 15u};
   3716 static const Reg aa_fp_scratch[] = {20u, 21u};
   3717 
/* Initializer macros for one NativePhysRegInfo entry of the integer class.
 * Each expands to a designated initializer for register number `r`:
 *   ALLOC    allocable + caller-saved, spill_cost 1
 *   CALLER   caller-saved but not allocable (not used by aa_int_phys below)
 *   ARG      caller-saved argument register with abi_index = r; r < 2 is also
 *            flagged as a return register
 *   CALLEE   allocable + callee-saved, spill_cost 4
 *   RESERVED never allocated; costs are 0
 * Non-argument entries use abi_index 0xff. */
#define AA_PHYS_INT_ALLOC(r)                                \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_INT,                                   \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                                        \
   .copy_cost = 1u}
#define AA_PHYS_INT_CALLER(r)        \
  {.reg = (r),                       \
   .cls = NATIVE_REG_INT,            \
   .abi_index = 0xffu,               \
   .flags = NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                 \
   .copy_cost = 1u}
#define AA_PHYS_INT_ARG(r)                             \
  {.reg = (r),                                         \
   .cls = NATIVE_REG_INT,                              \
   .abi_index = (r),                                   \
   .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
            ((r) < 2u ? NATIVE_REG_RET : 0),           \
   .spill_cost = 1u,                                   \
   .copy_cost = 1u}
#define AA_PHYS_INT_CALLEE(r)                               \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_INT,                                   \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
   .spill_cost = 4u,                                        \
   .copy_cost = 1u}
#define AA_PHYS_INT_RESERVED(r)  \
  {.reg = (r),                   \
   .cls = NATIVE_REG_INT,        \
   .abi_index = 0xffu,           \
   .flags = NATIVE_REG_RESERVED, \
   .spill_cost = 0u,             \
   .copy_cost = 0u}

/* Descriptors for integer registers 0..31, listed in register-number order:
 * 0..7 arguments, 8 and 12..15 caller-saved allocable, 19..28 callee-saved
 * allocable, and 9..11, 16..18, 29..31 reserved. */
static const NativePhysRegInfo aa_int_phys[] = {
    AA_PHYS_INT_ARG(0u),       AA_PHYS_INT_ARG(1u),
    AA_PHYS_INT_ARG(2u),       AA_PHYS_INT_ARG(3u),
    AA_PHYS_INT_ARG(4u),       AA_PHYS_INT_ARG(5u),
    AA_PHYS_INT_ARG(6u),       AA_PHYS_INT_ARG(7u),
    AA_PHYS_INT_ALLOC(8u),     AA_PHYS_INT_RESERVED(9u),
    AA_PHYS_INT_RESERVED(10u), AA_PHYS_INT_RESERVED(11u),
    AA_PHYS_INT_ALLOC(12u),    AA_PHYS_INT_ALLOC(13u),
    AA_PHYS_INT_ALLOC(14u),    AA_PHYS_INT_ALLOC(15u),
    AA_PHYS_INT_RESERVED(16u), AA_PHYS_INT_RESERVED(17u),
    AA_PHYS_INT_RESERVED(18u), AA_PHYS_INT_CALLEE(19u),
    AA_PHYS_INT_CALLEE(20u),   AA_PHYS_INT_CALLEE(21u),
    AA_PHYS_INT_CALLEE(22u),   AA_PHYS_INT_CALLEE(23u),
    AA_PHYS_INT_CALLEE(24u),   AA_PHYS_INT_CALLEE(25u),
    AA_PHYS_INT_CALLEE(26u),   AA_PHYS_INT_CALLEE(27u),
    AA_PHYS_INT_CALLEE(28u),   AA_PHYS_INT_RESERVED(29u),
    AA_PHYS_INT_RESERVED(30u), AA_PHYS_INT_RESERVED(31u),
};
   3773 
/* Initializer macros for one NativePhysRegInfo entry of the FP class. They
 * mirror the AA_PHYS_INT_* family, except that an ARG register is also
 * flagged as a return register when r < 4 (rather than r < 2). */
#define AA_PHYS_FP_ALLOC(r)                                 \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_FP,                                    \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                                        \
   .copy_cost = 1u}
#define AA_PHYS_FP_CALLER(r)         \
  {.reg = (r),                       \
   .cls = NATIVE_REG_FP,             \
   .abi_index = 0xffu,               \
   .flags = NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                 \
   .copy_cost = 1u}
#define AA_PHYS_FP_ARG(r)                              \
  {.reg = (r),                                         \
   .cls = NATIVE_REG_FP,                               \
   .abi_index = (r),                                   \
   .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
            ((r) < 4u ? NATIVE_REG_RET : 0),           \
   .spill_cost = 1u,                                   \
   .copy_cost = 1u}
#define AA_PHYS_FP_CALLEE(r)                                \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_FP,                                    \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
   .spill_cost = 4u,                                        \
   .copy_cost = 1u}
#define AA_PHYS_FP_RESERVED(r)   \
  {.reg = (r),                   \
   .cls = NATIVE_REG_FP,         \
   .abi_index = 0xffu,           \
   .flags = NATIVE_REG_RESERVED, \
   .spill_cost = 0u,             \
   .copy_cost = 0u}

/* Descriptors for FP registers 0..31, listed in register-number order:
 * 0..7 arguments, 8..15 callee-saved allocable, 18..19 caller-saved
 * allocable, 20..21 reserved, and 16..17 / 22..31 caller-saved but not
 * allocable. */
static const NativePhysRegInfo aa_fp_phys[] = {
    AA_PHYS_FP_ARG(0u),       AA_PHYS_FP_ARG(1u),     AA_PHYS_FP_ARG(2u),
    AA_PHYS_FP_ARG(3u),       AA_PHYS_FP_ARG(4u),     AA_PHYS_FP_ARG(5u),
    AA_PHYS_FP_ARG(6u),       AA_PHYS_FP_ARG(7u),     AA_PHYS_FP_CALLEE(8u),
    AA_PHYS_FP_CALLEE(9u),    AA_PHYS_FP_CALLEE(10u), AA_PHYS_FP_CALLEE(11u),
    AA_PHYS_FP_CALLEE(12u),   AA_PHYS_FP_CALLEE(13u), AA_PHYS_FP_CALLEE(14u),
    AA_PHYS_FP_CALLEE(15u),   AA_PHYS_FP_CALLER(16u), AA_PHYS_FP_CALLER(17u),
    AA_PHYS_FP_ALLOC(18u),    AA_PHYS_FP_ALLOC(19u),  AA_PHYS_FP_RESERVED(20u),
    AA_PHYS_FP_RESERVED(21u), AA_PHYS_FP_CALLER(22u), AA_PHYS_FP_CALLER(23u),
    AA_PHYS_FP_CALLER(24u),   AA_PHYS_FP_CALLER(25u), AA_PHYS_FP_CALLER(26u),
    AA_PHYS_FP_CALLER(27u),   AA_PHYS_FP_CALLER(28u), AA_PHYS_FP_CALLER(29u),
    AA_PHYS_FP_CALLER(30u),   AA_PHYS_FP_CALLER(31u),
};
   3824 
/* Per-class allocation info for the integer and FP register classes. Each
 * entry ties together the allocable list, the scratch pool, the per-register
 * descriptor table, and bitmasks in which bit N stands for register N. */
static const NativeAllocClassInfo aa_classes[] = {
    {.cls = NATIVE_REG_INT,
     .allocable = aa_int_allocable,
     .nallocable = sizeof aa_int_allocable / sizeof aa_int_allocable[0],
     .scratch = aa_int_scratch,
     .nscratch = sizeof aa_int_scratch / sizeof aa_int_scratch[0],
     .phys = aa_int_phys,
     .nphys = sizeof aa_int_phys / sizeof aa_int_phys[0],
     /* Bits 0..18 caller-saved, bits 19..28 callee-saved, bits 0..7
      * arguments, bits 0..1 return values. */
     .caller_saved_mask = 0x0007ffffu,
     .callee_saved_mask = 0x1ff80000u,
     .arg_mask = 0x000000ffu,
     .ret_mask = 0x00000003u,
     .reserved_mask =
         (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << AA_FP) | (1u << AA_LR)},
    {.cls = NATIVE_REG_FP,
     .allocable = aa_fp_allocable,
     .nallocable = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0],
     .scratch = aa_fp_scratch,
     .nscratch = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0],
     .phys = aa_fp_phys,
     .nphys = sizeof aa_fp_phys / sizeof aa_fp_phys[0],
     /* v8..v15 are callee-saved (low 64 bits per AAPCS64); the rest are
      * caller-saved. */
     .caller_saved_mask = 0xffff00ffu,
     .callee_saved_mask = 0x0000ff00u,
     /* Bits 0..7 arguments, bits 0..3 return values. No reserved_mask is
      * given for this class, so it is zero-initialized. */
     .arg_mask = 0x000000ffu,
     .ret_mask = 0x0000000fu},
};
   3853 
   3854 /* Resolve a register name ("x8", "v3", ...) to its (class, Reg). Powers the
   3855  * optimizer's inline-asm clobber masks and explicit hard-register operands
   3856  * ("{x8}" from a GNU local register variable). x0..x30 are DWARF 0..30; the
   3857  * SIMD/FP bank v0..v31 is DWARF 64..95. Returns non-zero for a non-register
   3858  * name (cc/memory/unknown), which the caller skips. */
   3859 static int aa_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
   3860                            NativeAllocClass* cls_out) {
   3861   char buf[16];
   3862   uint32_t dwarf;
   3863   (void)ri;
   3864   if (!name.s || !name.len || name.len >= sizeof buf) return 1;
   3865   memcpy(buf, name.s, name.len);
   3866   buf[name.len] = '\0';
   3867   if (aa64_register_index(buf, &dwarf) != 0) return 1;
   3868   if (dwarf <= 30u) {
   3869     *cls_out = NATIVE_REG_INT;
   3870     *out = (Reg)dwarf;
   3871     return 0;
   3872   }
   3873   if (dwarf >= 64u && dwarf <= 95u) {
   3874     *cls_out = NATIVE_REG_FP;
   3875     *out = (Reg)(dwarf - 64u);
   3876     return 0;
   3877   }
   3878   return 1;
   3879 }
   3880 
   3881 static int aa_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
   3882                                  Reg reg) {
   3883   (void)ri;
   3884   if (cls == NATIVE_REG_INT) {
   3885     if (reg <= 8u) return 1;
   3886     if (reg >= 12u && reg <= 15u) return 1;
   3887     if (reg >= 19u && reg <= 28u) return 1;
   3888     return 0;
   3889   }
   3890   if (cls == NATIVE_REG_FP) {
   3891     if (reg <= 19u) return 1;
   3892     if (reg >= 22u && reg <= 31u) return 1;
   3893   }
   3894   return 0;
   3895 }
   3896 
   3897 static int aa_asm_constraint_reg(const NativeRegInfo* ri, const char* body,
   3898                                  NativeAllocClass* cls_out, Reg* fixed_out,
   3899                                  u32* allowed_mask_out) {
   3900   (void)ri;
   3901   if (!body || !body[0] || body[1]) return 0;
   3902   if (fixed_out) *fixed_out = REG_NONE;
   3903   if (allowed_mask_out) *allowed_mask_out = 0;
   3904   switch (body[0]) {
   3905     case 'r':
   3906       if (cls_out) *cls_out = NATIVE_REG_INT;
   3907       return 1;
   3908     case 'w':
   3909       if (cls_out) *cls_out = NATIVE_REG_FP;
   3910       return 1;
   3911     case 'x':
   3912       if (cls_out) *cls_out = NATIVE_REG_FP;
   3913       if (allowed_mask_out) *allowed_mask_out = 0x0000ffffu; /* v0..v15 */
   3914       return 1;
   3915     case 'y':
   3916       if (cls_out) *cls_out = NATIVE_REG_FP;
   3917       if (allowed_mask_out) *allowed_mask_out = 0x000000ffu; /* v0..v7 */
   3918       return 1;
   3919     default:
   3920       return 0;
   3921   }
   3922 }
   3923 
/* Register description handed to the target: the two allocation classes plus
 * the name-resolution and inline-asm constraint callbacks defined above. */
static const NativeRegInfo aa_reg_info = {
    .classes = aa_classes,
    .nclasses = sizeof aa_classes / sizeof aa_classes[0],
    .resolve_name = aa_resolve_name,
    .asm_operand_reg_ok = aa_asm_operand_reg_ok,
    .asm_constraint_reg = aa_asm_constraint_reg,
};
   3931 
/* Forward declarations of the NativeTarget hooks that are defined further
 * down but installed by aa64_native_target_new(). */
static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr);
static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
                             KitCgTypeId type);
static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr);
static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr,
                              NativeLoc src_ap_ptr);
static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
                                const AsmConstraint* outs, u32 nout,
                                NativeLoc* out_locs, const AsmConstraint* ins,
                                u32 nin, const NativeLoc* in_locs,
                                const Sym* clobbers, u32 nclob);
   3943 
   3944 NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
   3945                                      MCEmitter* mc) {
   3946   AANativeTarget* a = arena_znew(c->tu, AANativeTarget);
   3947   NativeTarget* t;
   3948   if (!a) return NULL;
   3949   t = &a->base;
   3950   t->c = c;
   3951   t->obj = obj;
   3952   t->mc = mc;
   3953   native_frame_init(&a->frame, c);
   3954   t->regs = &aa_reg_info;
   3955   t->class_for_type = aa_class_for_type;
   3956   t->imm_legal = aa_imm_legal;
   3957   t->addr_legal = aa_addr_legal;
   3958   t->machine_op_clobbers = aa_machine_op_clobbers;
   3959   t->func_begin = aa_func_begin;
   3960   t->func_begin_known_frame = aa_func_begin_known_frame;
   3961   t->note_frame_state = aa_note_frame_state;
   3962   t->reserve_callee_saves = aa_reserve_callee_saves;
   3963   t->signature_stack_bytes = aa_signature_stack_bytes;
   3964   t->call_stack_bytes = aa_call_stack_bytes;
   3965   t->has_store_zero_reg = 1;
   3966   t->store_zero_reg = 31u; /* wzr/xzr in the Rt position of a store */
   3967   t->func_end = aa_func_end;
   3968   t->frame_slot = aa_frame_slot;
   3969   t->frame_slot_debug_loc = aa_frame_slot_debug_loc;
   3970   t->bind_param = aa_bind_native_param;
   3971   t->label_new = aa_label_new;
   3972   t->label_place = aa_label_place;
   3973   t->jump = aa_jump;
   3974   t->cmp_branch = aa_cmp_branch;
   3975   t->indirect_branch = aa_indirect_branch;
   3976   t->load_label_addr = aa_load_label_addr;
   3977   t->move = aa_move;
   3978   t->load_imm = aa_load_imm_native;
   3979   t->load_const = aa_load_const;
   3980   t->load_addr = aa_load_addr;
   3981   t->load = aa_load_native;
   3982   t->store = aa_store_native;
   3983   t->tls_addr_of = aa_tls_addr_of;
   3984   t->copy_bytes = aa_copy_bytes;
   3985   t->set_bytes = aa_set_bytes;
   3986   t->bitfield_load = aa_bitfield_load;
   3987   t->bitfield_store = aa_bitfield_store;
   3988   t->binop = aa_binop;
   3989   t->unop = aa_unop;
   3990   t->cmp = aa_cmp;
   3991   t->convert = aa_convert;
   3992   t->alloca_ = aa_alloca;
   3993   t->spill = aa_spill;
   3994   t->reload = aa_reload;
   3995   t->plan_call = aa_plan_call;
   3996   t->emit_call = aa_emit_call;
   3997   t->plan_ret = aa_plan_ret;
   3998   t->ret = aa_ret;
   3999   t->atomic_load = aa_atomic_load;
   4000   t->atomic_store = aa_atomic_store;
   4001   t->atomic_rmw = aa_atomic_rmw;
   4002   t->atomic_cas = aa_atomic_cas;
   4003   t->fence = aa_fence;
   4004   t->va_start_ = aa_va_start_native;
   4005   t->va_arg_ = aa_va_arg_native;
   4006   t->va_end_ = aa_va_end_native;
   4007   t->va_copy_ = aa_va_copy_native;
   4008   t->intrinsic = aa_intrinsic;
   4009   t->asm_block = aa_asm_block_native;
   4010   t->file_scope_asm = native_file_scope_asm;
   4011   t->trap = aa_trap;
   4012   t->set_loc = aa_set_loc;
   4013   t->finalize = native_finalize;
   4014   return t;
   4015 }
   4016 
   4017 /* Place the incoming parameter into `dst`: a hard register (the common
   4018  * register-allocated scalar case -> a single arg-reg move, or a stack load
   4019  * straight into the register), a frame slot (address-taken / aggregate /
   4020  * spilled), or nowhere (unused). Incoming arg registers are never allocable,
   4021  * so a register dst never aliases an incoming arg register. */
   4022 static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
   4023                                  NativeLoc dst) {
   4024   AANativeTarget* a = aa_of(t);
   4025   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   4026   const ABIArgInfo* ai =
   4027       p->index < abi->nparams ? &abi->params[p->index] : NULL;
   4028   int to_reg = dst.kind == NATIVE_LOC_REG;
   4029   if (!ai || ai->kind == ABI_ARG_IGNORE) return;
   4030   if (ai->kind == ABI_ARG_INDIRECT) {
   4031     NativeAddr d_addr, from;
   4032     AggregateAccess access;
   4033     NativeLoc src =
   4034         native_loc_reg(p->type, NATIVE_REG_INT,
   4035                        a->next_param_int < 8u ? a->next_param_int++ : AA_TMP0);
   4036     if (src.v.reg == AA_TMP0) {
   4037       NativeAddr saddr;
   4038       memset(&saddr, 0, sizeof saddr);
   4039       saddr.base_kind = NATIVE_ADDR_BASE_REG;
   4040       saddr.base.reg = AA_FP;
   4041       saddr.offset = aa_fp_off_in_arg(a, a->next_param_stack);
   4042       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8));
   4043       a->next_param_stack += 8u;
   4044     }
   4045     if (dst.kind != NATIVE_LOC_FRAME)
   4046       aa_panic(a, "indirect parameter requires a frame destination");
   4047     memset(&d_addr, 0, sizeof d_addr);
   4048     d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4049     d_addr.base.frame = dst.v.frame;
   4050     d_addr.base_type = p->type;
   4051     memset(&from, 0, sizeof from);
   4052     from.base_kind = NATIVE_ADDR_BASE_REG;
   4053     from.base.reg = src.v.reg;
   4054     from.base_type = p->type;
   4055     memset(&access, 0, sizeof access);
   4056     access.type = p->type;
   4057     access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
   4058     access.align = p->align ? p->align : type_align32(t, p->type);
   4059     aa_copy_bytes(t, d_addr, from, access);
   4060     return;
   4061   }
   4062   for (u32 i = 0; i < ai->nparts; ++i) {
   4063     const ABIArgPart* part = &ai->parts[i];
   4064     NativeAllocClass cls =
   4065         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   4066     int reg_dst = to_reg && (NativeAllocClass)dst.cls == cls;
   4067     NativeLoc src;
   4068     if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
   4069       src = native_loc_reg(p->type, cls, a->next_param_fp++);
   4070     } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
   4071       src = native_loc_reg(p->type, cls, a->next_param_int++);
   4072     } else {
   4073       /* Stack-passed part: load straight into the dst register when possible,
   4074        * otherwise a scratch for the store-to-frame path. */
   4075       Reg tmp =
   4076           reg_dst ? (Reg)dst.v.reg : (cls == NATIVE_REG_FP ? 16u : AA_TMP0);
   4077       NativeAddr saddr;
   4078       src = native_loc_reg(p->type, cls, tmp);
   4079       a->next_param_stack =
   4080           align_up_u32(a->next_param_stack, aa_part_stack_align(abi, part));
   4081       memset(&saddr, 0, sizeof saddr);
   4082       saddr.base_kind = NATIVE_ADDR_BASE_REG;
   4083       saddr.base.reg = AA_FP;
   4084       saddr.base_type = p->type;
   4085       saddr.offset = aa_fp_off_in_arg(a, a->next_param_stack);
   4086       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size));
   4087       a->next_param_stack += aa_part_stack_size(abi, part);
   4088     }
   4089     if (dst.kind == NATIVE_LOC_NONE) {
   4090       /* Unused parameter: only the ABI cursor advances. */
   4091     } else if (to_reg) {
   4092       NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type,
   4093                                    (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
   4094       if (!(src.kind == NATIVE_LOC_REG && src.v.reg == d.v.reg &&
   4095             (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
   4096         aa_move(t, d, src);
   4097     } else {
   4098       aa_store_part(
   4099           t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
   4100           0, part->size);
   4101     }
   4102   }
   4103   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
   4104 }
   4105 
   4106 static void aa_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   4107                           CGLocal local, NativeDirectLocal* l) {
   4108   NativeLoc dst;
   4109   (void)local;
   4110   memset(&dst, 0, sizeof dst);
   4111   dst.kind = NATIVE_LOC_FRAME;
   4112   dst.type = p->type;
   4113   dst.v.frame = l->home;
   4114   aa_bind_native_param(d->native, p, dst);
   4115 }
   4116 
   4117 static const char* aa_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
   4118   NativeCallDesc nd;
   4119   NativeLoc* args = NULL;
   4120   NativeLoc* results = NULL;
   4121   u32 stack;
   4122   memset(&nd, 0, sizeof nd);
   4123   u32 nresults = call->result != CG_LOCAL_NONE ? 1u : 0u;
   4124   if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
   4125   if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults);
   4126   for (u32 i = 0; i < call->nargs; ++i) {
   4127     args[i].kind = NATIVE_LOC_FRAME;
   4128     args[i].type = d->locals[call->args[i] - 1u].type;
   4129     args[i].cls = d->locals[call->args[i] - 1u].cls;
   4130     args[i].v.frame = d->locals[call->args[i] - 1u].home;
   4131   }
   4132   if (nresults) {
   4133     results[0].kind = NATIVE_LOC_FRAME;
   4134     results[0].type = d->locals[call->result - 1u].type;
   4135     results[0].cls = d->locals[call->result - 1u].cls;
   4136     results[0].v.frame = d->locals[call->result - 1u].home;
   4137   }
   4138   nd.fn_type = call->fn_type;
   4139   nd.args = args;
   4140   nd.results = results;
   4141   nd.nargs = call->nargs;
   4142   nd.nresults = nresults;
   4143   stack = aa_call_stack_size(d->native, &nd);
   4144   if (stack > aa_of(d->native)->incoming_stack_size)
   4145     return "aarch64 tail call: stack argument area too small";
   4146   return NULL;
   4147 }
   4148 
   4149 static NativeAddr aa_direct_addr(NativeDirectTarget* d, Operand op) {
   4150   NativeAddr addr;
   4151   memset(&addr, 0, sizeof addr);
   4152   switch ((OpKind)op.kind) {
   4153     case OPK_LOCAL:
   4154       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4155       addr.base.frame = d->locals[op.v.local - 1u].home;
   4156       addr.base_type = op.type;
   4157       return addr;
   4158     case OPK_INDIRECT:
   4159       addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
   4160       addr.base.frame = d->locals[op.v.ind.base - 1u].home;
   4161       addr.cls = d->locals[op.v.ind.base - 1u].cls;
   4162       addr.base_type = d->locals[op.v.ind.base - 1u].type;
   4163       addr.offset = op.v.ind.ofs;
   4164       return addr;
   4165     default:
   4166       compiler_panic(d->base.c, d->loc,
   4167                      "aarch64 native target: operand is not addressable");
   4168   }
   4169 }
   4170 
   4171 static NativeAddr aa_direct_materialize_addr(NativeDirectTarget* d,
   4172                                              Operand op) {
   4173   NativeAddr addr = aa_direct_addr(d, op);
   4174   if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   4175     NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, AA_TMP1);
   4176     NativeAddr load;
   4177     memset(&load, 0, sizeof load);
   4178     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   4179     load.base.frame = addr.base.frame;
   4180     load.base_type = addr.base_type;
   4181     aa_emit_mem(aa_of(d->native), 1, base, load,
   4182                 aa_mem_for_type(d->native, addr.base_type, 8));
   4183     addr.base_kind = NATIVE_ADDR_BASE_REG;
   4184     addr.base.reg = AA_TMP1;
   4185   }
   4186   return addr;
   4187 }
   4188 
   4189 static NativeAddr aa_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
   4190   NativeAddr addr;
   4191   memset(&addr, 0, sizeof addr);
   4192   if (op.kind == OPK_LOCAL) {
   4193     NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, AA_TMP1);
   4194     NativeAddr load;
   4195     memset(&load, 0, sizeof load);
   4196     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   4197     load.base.frame = d->locals[op.v.local - 1u].home;
   4198     load.base_type = op.type;
   4199     aa_emit_mem(aa_of(d->native), 1, base, load,
   4200                 aa_mem_for_type(d->native, op.type, 8));
   4201     addr.base_kind = NATIVE_ADDR_BASE_REG;
   4202     addr.base.reg = AA_TMP1;
   4203     addr.base_type = op.type;
   4204     return addr;
   4205   }
   4206   return aa_direct_materialize_addr(d, op);
   4207 }
   4208 
   4209 static NativeAddr aa_reg_addr(KitCgTypeId type, u32 reg, i32 offset) {
   4210   NativeAddr addr;
   4211   memset(&addr, 0, sizeof addr);
   4212   addr.base_kind = NATIVE_ADDR_BASE_REG;
   4213   addr.base.reg = reg;
   4214   addr.base_type = type;
   4215   addr.offset = offset;
   4216   return addr;
   4217 }
   4218 
   4219 static void aa_load_ap_addr(NativeDirectTarget* d, Operand ap_addr,
   4220                             u32 dst_reg) {
   4221   NativeLoc dst =
   4222       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, dst_reg);
   4223   NativeAddr ap = aa_direct_pointer_addr(d, ap_addr);
   4224   d->native->load_addr(d->native, dst, ap);
   4225 }
   4226 
   4227 /* The va cores use only non-allocable registers for their temporaries
   4228  * (scratch x9/x10, reserved x16=TMP0 / x17=TMP1, vector v16) so they never
   4229  * clobber a value the optimizer's register allocator may hold live across the
   4230  * op. The va_list base register is supplied by the caller (ap.base.reg), which
   4231  * the optimizer materializes into a safe register before the call. */
   4232 static u32 aa_va_base_reg(AANativeTarget* a, NativeAddr ap) {
   4233   if (ap.base_kind != NATIVE_ADDR_BASE_REG)
   4234     compiler_panic(a->base.c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
   4235                    "aarch64 native target: va_list pointer not in register");
   4236   return ap.base.reg;
   4237 }
   4238 
/* va_list layout is queried from the ABI; the optimizer/direct callers pass the
 * va_list pointer opaquely. `ap` addresses the va_list object itself.
 *
 * Emits the va_start initialization for the current function:
 *  - ABI_VA_LIST_POINTER: the va_list is a single pointer; it is set to the
 *    address of the first unnamed argument (two variants, depending on
 *    whether the frame has a top home area).
 *  - ABI_VA_LIST_AAPCS64: the va_list is a record; its stack pointer, the two
 *    register-save-area "top" pointers and the two 32-bit offsets are stored
 *    at the field offsets reported by the ABI. This variant requires `ap` to
 *    be register-based.
 * Any other layout is a compiler panic. AA_TMP0 and AA_TMP1 are used as
 * scratch. */
static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
  NativeTarget* t = &a->base;
  const ABIFuncInfo* abi =
      a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL;
  ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
  /* 64-bit view of AA_TMP0: holds each computed address before it is stored. */
  NativeLoc ptr =
      native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
  if (vai.kind == ABI_VA_LIST_POINTER) {
    if (a->top_home_bytes) {
      /* Windows: `va_list = &<first vararg>` inside the contiguous
       * [GP home area | incoming stack args] block. Named args consume the
       * leading slots; next_param_int (FP params remapped to GP included) plus
       * next_param_stack locate the first unnamed slot. Home slot
       * gp_reg_count coincides with incoming-arg byte 0, so a single formula
       * spans both regions. */
      i32 off =
          aa_fp_off_home_slot(a->next_param_int) + (i32)a->next_param_stack;
      aa_emit_add_imm(a, AA_TMP0, AA_FP, off);
      aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
      return;
    }
    /* `va_list = &<first vararg>`. Variadic stack args follow the fixed
     * incoming params in the same caller window. Apple ARM64 compact fixed
     * stack args may leave this cursor at +4, while the first variadic slot
     * starts at the next 8-byte boundary. */
    u32 stack = aa_vararg_stack_start(abi, a->next_param_stack);
    aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a, stack));
    aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
    return;
  }
  if (vai.kind == ABI_VA_LIST_AAPCS64) {
    KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32);
    /* 32-bit view of AA_TMP1: holds each offset value before it is stored. */
    NativeLoc i32tmp = native_loc_reg(i32_ty, NATIVE_REG_INT, AA_TMP1);
    MemAccess ptr_mem = aa_mem_for_type(t, ptr.type, 8);
    MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4);
    /* Frame slots holding the saved general and vector argument registers. */
    AANativeSlot* gr = aa_slot(a, a->va_gr_slot);
    AANativeSlot* vr = aa_slot(a, a->va_vr_slot);
    u32 base = aa_va_base_reg(a, ap);
    /* Argument registers already consumed by named parameters, clamped to the
     * number of registers the save areas cover. */
    u32 used_gr = a->next_param_int < vai.gp_reg_count ? a->next_param_int
                                                       : vai.gp_reg_count;
    u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp
                                                      : vai.fp_reg_count;
    /* __stack points at the incoming stack args, which sit above the saved
     * fp/lr pair — the same address bind_param uses (aa_fp_off_in_arg), not the
     * raw next_param_stack cursor. */
    aa_emit_add_imm(a, AA_TMP0, AA_FP,
                    aa_fp_off_in_arg(a, a->next_param_stack));
    aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset),
                ptr_mem);
    /* GR top: one past the end of the general-register save slot. */
    aa_emit_add_imm(a, AA_TMP0, AA_FP,
                    aa_fp_off_slot(a, gr->off) +
                        (i32)(vai.gp_reg_count * vai.gp_slot_size));
    aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.gr_top_offset),
                ptr_mem);
    /* VR top: one past the end of the vector-register save slot. */
    aa_emit_add_imm(a, AA_TMP0, AA_FP,
                    aa_fp_off_slot(a, vr->off) +
                        (i32)(vai.fp_reg_count * vai.fp_slot_size));
    aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.vr_top_offset),
                ptr_mem);
    /* GR offset: negative byte count of the still-unconsumed GP save slots
     * (0 when named parameters used them all). */
    aa_emit_load_imm(t->mc, 0, AA_TMP1,
                     -(i32)((vai.gp_reg_count - used_gr) * vai.gp_slot_size));
    aa_emit_mem(a, 0, i32tmp,
                aa_reg_addr(i32_ty, base, (i32)vai.gr_offs_offset), i32_mem);
    /* VR offset: same, for the vector save slots. */
    aa_emit_load_imm(t->mc, 0, AA_TMP1,
                     -(i32)((vai.fp_reg_count - used_vr) * vai.fp_slot_size));
    aa_emit_mem(a, 0, i32tmp,
                aa_reg_addr(i32_ty, base, (i32)vai.vr_offs_offset), i32_mem);
    return;
  }
  compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
                 "aarch64 native target: unsupported va_list layout");
}
   4313 
   4314 static void aa_va_arg_core(AANativeTarget* a, NativeLoc dst, NativeAddr ap,
   4315                            KitCgTypeId type) {
   4316   NativeTarget* t = &a->base;
   4317   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   4318   NativeLoc cur =
   4319       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
   4320   /* The fetched value is written directly into the caller-provided register
   4321    * `dst`, which the caller guarantees is distinct from the va_list base
   4322    * register. Only TMP0/TMP1 are used as private scratch. */
   4323   NativeLoc val = dst;
   4324   NativeAddr src;
   4325   MemAccess ptr_mem = aa_mem_for_type(t, cur.type, 8);
   4326   MemAccess val_mem = aa_mem_for_type(t, type, type_size32(t, type));
   4327   if (dst.kind != NATIVE_LOC_REG)
   4328     compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
   4329                    "aarch64 native target: va_arg destination must be a "
   4330                    "register");
   4331   if (vai.kind == ABI_VA_LIST_POINTER) {
   4332     aa_emit_mem(a, 1, cur, ap, ptr_mem);
   4333     src = aa_reg_addr(type, AA_TMP0, 0);
   4334     {
   4335       const ABIFuncInfo* abi =
   4336           a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL;
   4337       ABIArgPart part;
   4338       memset(&part, 0, sizeof part);
   4339       part.cls = cg_type_is_float(t->c, type) ? ABI_CLASS_FP : ABI_CLASS_INT;
   4340       part.size = type_size32(t, type);
   4341       part.align = type_align32(t, type);
   4342       aa_emit_add_imm(a, AA_TMP1, AA_TMP0,
   4343                       (i32)aa_part_vararg_stack_size(abi, &part));
   4344     }
   4345     aa_emit_mem(a, 0, native_loc_reg(cur.type, NATIVE_REG_INT, AA_TMP1), ap,
   4346                 ptr_mem);
   4347     aa_emit_mem(a, 1, val, src, val_mem);
   4348     return;
   4349   }
   4350   if (vai.kind == ABI_VA_LIST_AAPCS64) {
   4351     KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32);
   4352     NativeLoc off = native_loc_reg(i32_ty, NATIVE_REG_INT, AA_TMP1);
   4353     MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4);
   4354     int is_fp = cg_type_is_float(t->c, type);
   4355     u32 base = aa_va_base_reg(a, ap);
   4356     u32 offs_field = is_fp ? vai.vr_offs_offset : vai.gr_offs_offset;
   4357     u32 top_field = is_fp ? vai.vr_top_offset : vai.gr_top_offset;
   4358     u32 slot_size = is_fp ? vai.fp_slot_size : vai.gp_slot_size;
   4359     MCLabel stack_label = t->mc->label_new(t->mc);
   4360     MCLabel done_label = t->mc->label_new(t->mc);
   4361     aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem);
   4362     aa_emit32(t->mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0));
   4363     aa_emit32(t->mc,
   4364               aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)}));
   4365     t->mc->emit_label_ref(t->mc, stack_label, R_AARCH64_CONDBR19, 4, 0);
   4366     aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)top_field),
   4367                 ptr_mem);
   4368     aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31));
   4369     aa_emit32(t->mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1));
   4370     aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
   4371     aa_emit_add_imm(a, AA_TMP1, AA_TMP1, (i32)slot_size);
   4372     aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem);
   4373     aa_emit32(t->mc, aa64_b(0));
   4374     t->mc->emit_label_ref(t->mc, done_label, R_AARCH64_JUMP26, 4, 0);
   4375     t->mc->label_place(t->mc, stack_label);
   4376     aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset),
   4377                 ptr_mem);
   4378     aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
   4379     aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8);
   4380     aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset),
   4381                 ptr_mem);
   4382     t->mc->label_place(t->mc, done_label);
   4383     return;
   4384   }
   4385   compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
   4386                  "aarch64 native target: unsupported va_list layout");
   4387 }
   4388 
   4389 static void aa_va_copy_core(AANativeTarget* a, NativeAddr dst_ap,
   4390                             NativeAddr src_ap) {
   4391   NativeTarget* t = &a->base;
   4392   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   4393   NativeLoc tmp =
   4394       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
   4395   MemAccess mem = aa_mem_for_type(t, tmp.type, 8);
   4396   if (vai.kind == ABI_VA_LIST_POINTER) {
   4397     aa_emit_mem(a, 1, tmp, src_ap, mem);
   4398     aa_emit_mem(a, 0, tmp, dst_ap, mem);
   4399     return;
   4400   }
   4401   if (vai.kind == ABI_VA_LIST_AAPCS64) {
   4402     u32 sb = aa_va_base_reg(a, src_ap);
   4403     u32 db = aa_va_base_reg(a, dst_ap);
   4404     for (u32 off = 0; off < vai.type.size; off += 8u) {
   4405       aa_emit_mem(a, 1, tmp, aa_reg_addr(tmp.type, sb, (i32)off), mem);
   4406       aa_emit_mem(a, 0, tmp, aa_reg_addr(tmp.type, db, (i32)off), mem);
   4407     }
   4408     return;
   4409   }
   4410   compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
   4411                  "aarch64 native target: unsupported va_list layout");
   4412 }
   4413 
   4414 /* ---- Direct-path (NativeDirectTarget) wrappers: convert semantic operands to
   4415  * NativeAddr/NativeLoc, then call the shared cores above. ---- */
   4416 
   4417 /* The cores reserve x16/x17 (TMP0/TMP1) as private scratch and require the
   4418  * va_list base register(s) to be distinct from those. aa_direct_pointer_addr
   4419  * returns the pointer in TMP1, so the direct wrappers first relocate it into
   4420  * x9/x10 before calling the cores. */
   4421 static NativeAddr aa_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
   4422                                     u32 reg) {
   4423   aa_load_ap_addr(d, ap_addr, reg);
   4424   return aa_reg_addr(builtin_id(KIT_CG_BUILTIN_I64), reg, 0);
   4425 }
   4426 
   4427 static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) {
   4428   aa_va_start_core(aa_of(d->native), aa_direct_va_base(d, ap_addr, 10u));
   4429 }
   4430 
   4431 static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr,
   4432                        KitCgTypeId type) {
   4433   AANativeTarget* a = aa_of(d->native);
   4434   int is_fp = cg_type_is_float(d->base.c, type);
   4435   NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
   4436                                  is_fp ? 16u : 9u);
   4437   MemAccess val_mem =
   4438       aa_mem_for_type(d->native, type, type_size32(d->native, type));
   4439   NativeAddr dst;
   4440   aa_va_arg_core(a, res, aa_direct_va_base(d, ap_addr, 10u), type);
   4441   dst = aa_direct_materialize_addr(d, dst_op);
   4442   aa_emit_mem(a, 0, res, dst, val_mem);
   4443 }
   4444 
/* Direct-path va_end: intentionally emits nothing; both arguments are
 * ignored. */
static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) {
  (void)d;
  (void)ap_addr;
}
   4449 
   4450 static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr,
   4451                         Operand src_ap_addr) {
   4452   AANativeTarget* a = aa_of(d->native);
   4453   NativeAddr src = aa_direct_va_base(d, src_ap_addr, 9u);
   4454   NativeAddr dst = aa_direct_va_base(d, dst_ap_addr, 10u);
   4455   aa_va_copy_core(a, dst, src);
   4456 }
   4457 
   4458 /* ---- NativeTarget (optimizer) hooks: the optimizer passes the va_list
   4459  * pointer as a materialized register; layout is resolved inside the cores. ----
   4460  */
   4461 
   4462 static NativeAddr aa_va_addr_from_ptr(NativeLoc ap_ptr) {
   4463   NativeAddr addr;
   4464   memset(&addr, 0, sizeof addr);
   4465   addr.base_kind = NATIVE_ADDR_BASE_REG;
   4466   addr.cls = NATIVE_REG_INT;
   4467   addr.base.reg = ap_ptr.v.reg;
   4468   addr.base_type = ap_ptr.type;
   4469   return addr;
   4470 }
   4471 
   4472 static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
   4473   aa_va_start_core(aa_of(t), aa_va_addr_from_ptr(ap_ptr));
   4474 }
   4475 
   4476 static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
   4477                              KitCgTypeId type) {
   4478   aa_va_arg_core(aa_of(t), dst, aa_va_addr_from_ptr(ap_ptr), type);
   4479 }
   4480 
/* NativeTarget va_end hook: intentionally emits nothing; both arguments are
 * ignored. */
static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
  (void)t;
  (void)ap_ptr;
}
   4485 
   4486 static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr,
   4487                               NativeLoc src_ap_ptr) {
   4488   aa_va_copy_core(aa_of(t), aa_va_addr_from_ptr(dst_ap_ptr),
   4489                   aa_va_addr_from_ptr(src_ap_ptr));
   4490 }
   4491 
   4492 /* constraint_body / constraint_early / match_index are shared
   4493  * (cg/native_asm.h). */
   4494 
/* Abort compilation at `loc` with `msg` prefixed by "aarch64 inline asm: ".
 * Does not return. */
_Noreturn static void aa_asm_panic_at(Compiler* c, SrcLoc loc,
                                      const char* msg) {
  compiler_panic(c, loc, "aarch64 inline asm: %s", msg);
}
   4499 
/* Direct-path convenience wrapper: panic with `msg` at the target's current
 * source location (d->loc). Does not return. */
_Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) {
  aa_asm_panic_at(d->base.c, d->loc, msg);
}
   4503 
   4504 AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, KitCgTypeId type,
   4505                                           NativeAllocClass cls, Reg reg) {
   4506   memset(out, 0, sizeof *out);
   4507   out->kind = AA64_INLINE_OPK_REG;
   4508   out->pad[0] =
   4509       (cls == NATIVE_REG_FP) ? AA64_INLINE_OPCLS_FP : AA64_INLINE_OPCLS_INT;
   4510   out->type = type;
   4511   out->v.local = (CGLocal)reg;
   4512 }
   4513 
   4514 AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, KitCgTypeId type,
   4515                                           Reg base) {
   4516   memset(out, 0, sizeof *out);
   4517   out->kind = OPK_INDIRECT;
   4518   out->type = type;
   4519   out->v.ind.base = (CGLocal)base;
   4520   out->v.ind.index = CG_LOCAL_NONE;
   4521 }
   4522 
   4523 static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
   4524                                     NativeAllocClass* cls_out, Reg* reg_out) {
   4525   Slice s = pool_slice(c->global, name);
   4526   char buf[16];
   4527   uint32_t dwarf;
   4528   if (!s.s || !s.len) return 0;
   4529   if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
   4530   if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
   4531   if (s.len >= sizeof buf) aa_asm_panic_at(c, loc, "clobber name is too long");
   4532   memcpy(buf, s.s, s.len);
   4533   buf[s.len] = '\0';
   4534   if (aa64_register_index(buf, &dwarf) != 0)
   4535     aa_asm_panic_at(c, loc, "unknown clobber register");
   4536   if (dwarf <= 30u) {
   4537     *cls_out = NATIVE_REG_INT;
   4538     *reg_out = (Reg)dwarf;
   4539     return 1;
   4540   }
   4541   if (dwarf >= 64u && dwarf <= 95u) {
   4542     *cls_out = NATIVE_REG_FP;
   4543     *reg_out = (Reg)(dwarf - 64u);
   4544     return 1;
   4545   }
   4546   aa_asm_panic_at(c, loc, "unsupported clobber register");
   4547   return 0;
   4548 }
   4549 
   4550 static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   4551                                  u32 nclob, u32* int_mask, u32* fp_mask) {
   4552   *int_mask = 0;
   4553   *fp_mask = 0;
   4554   for (u32 i = 0; i < nclob; ++i) {
   4555     NativeAllocClass cls;
   4556     Reg reg;
   4557     if (!aa_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue;
   4558     if (cls == NATIVE_REG_INT)
   4559       *int_mask |= 1u << reg;
   4560     else if (cls == NATIVE_REG_FP)
   4561       *fp_mask |= 1u << reg;
   4562   }
   4563 }
   4564 
   4565 AA_UNUSED_FN static Reg aa_asm_alloc_reg(NativeDirectTarget* d,
   4566                                          NativeAllocClass cls,
   4567                                          u32 allowed_mask, u32* used_int,
   4568                                          u32* used_fp) {
   4569   static const Reg int_pool[] = {0u, 1u, 2u,  3u,  4u,  5u,  6u,
   4570                                  7u, 8u, 11u, 12u, 13u, 14u, 15u};
   4571   static const Reg fp_pool[] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,
   4572                                 16u, 17u, 18u, 19u, 22u, 23u, 24u, 25u,
   4573                                 26u, 27u, 28u, 29u, 30u, 31u};
   4574   const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
   4575   u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
   4576                                : (u32)(sizeof int_pool / sizeof int_pool[0]);
   4577   u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
   4578   for (u32 i = 0; i < n; ++i) {
   4579     Reg r = pool[i];
   4580     if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue;
   4581     if ((*used & (1u << r)) != 0) continue;
   4582     *used |= 1u << r;
   4583     return r;
   4584   }
   4585   aa_asm_panic(d, "out of registers for asm operands");
   4586   return REG_NONE;
   4587 }
   4588 
   4589 static int aa_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
   4590                                        const char* constraint,
   4591                                        NativeAsmRegPin* pin) {
   4592   NativeAsmRegPinStatus st =
   4593       native_asm_resolve_pin(d->native, reg, constraint, pin);
   4594   if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
   4595   if (st != NATIVE_ASM_REG_PIN_OK)
   4596     aa_asm_panic(d, native_asm_pin_status_message(st));
   4597   return 1;
   4598 }
   4599 
   4600 AA_UNUSED_FN static void aa_direct_load_operand_to_reg(NativeDirectTarget* d,
   4601                                                        Operand op,
   4602                                                        NativeLoc dst) {
   4603   NativeAddr addr;
   4604   memset(&addr, 0, sizeof addr);
   4605   switch ((OpKind)op.kind) {
   4606     case OPK_IMM:
   4607       if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
   4608         aa_asm_panic(d, "floating-point immediate asm input is unsupported");
   4609       d->native->load_imm(d->native, dst, op.v.imm);
   4610       return;
   4611     case OPK_LOCAL:
   4612       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4613       addr.base.frame = d->locals[op.v.local - 1u].home;
   4614       addr.base_type = op.type;
   4615       aa_emit_mem(aa_of(d->native), 1, dst, addr,
   4616                   aa_mem_for_type(d->native, op.type, 0));
   4617       return;
   4618     case OPK_GLOBAL:
   4619       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   4620       addr.base.global.sym = op.v.global.sym;
   4621       addr.base.global.addend = op.v.global.addend;
   4622       addr.base_type = op.type;
   4623       d->native->load_addr(d->native, dst, addr);
   4624       return;
   4625     case OPK_INDIRECT:
   4626       addr = aa_direct_materialize_addr(d, op);
   4627       aa_emit_mem(aa_of(d->native), 1, dst, addr,
   4628                   aa_mem_for_type(d->native, op.type, 0));
   4629       return;
   4630   }
   4631   aa_asm_panic(d, "unsupported asm input operand");
   4632 }
   4633 
   4634 AA_UNUSED_FN static void aa_direct_load_address_to_reg(NativeDirectTarget* d,
   4635                                                        Operand op,
   4636                                                        NativeLoc dst) {
   4637   NativeAddr addr = aa_direct_addr(d, op);
   4638   d->native->load_addr(d->native, dst, addr);
   4639 }
   4640 
   4641 AA_UNUSED_FN static void aa_direct_store_reg_to_operand(NativeDirectTarget* d,
   4642                                                         Operand op,
   4643                                                         NativeLoc src) {
   4644   NativeAddr addr;
   4645   memset(&addr, 0, sizeof addr);
   4646   if (op.kind == OPK_LOCAL) {
   4647     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4648     addr.base.frame = d->locals[op.v.local - 1u].home;
   4649     addr.base_type = op.type;
   4650   } else {
   4651     addr = aa_direct_materialize_addr(d, op);
   4652   }
   4653   aa_emit_mem(aa_of(d->native), 0, src, addr,
   4654               aa_mem_for_type(d->native, op.type, 0));
   4655 }
   4656 
   4657 typedef struct AAAsmSavedClobber {
   4658   NativeFrameSlot slot;
   4659   NativeAllocClass cls;
   4660   Reg reg;
   4661   KitCgTypeId type;
   4662 } AAAsmSavedClobber;
   4663 
   4664 static void aa_asm_save_one(AANativeTarget* a, AAAsmSavedClobber* s) {
   4665   NativeFrameSlotDesc desc;
   4666   NativeAddr addr;
   4667   NativeLoc reg;
   4668   memset(&desc, 0, sizeof desc);
   4669   desc.type = s->type;
   4670   desc.size = 8;
   4671   desc.align = 8;
   4672   desc.kind = NATIVE_FRAME_SLOT_SAVE;
   4673   s->slot = a->base.frame_slot(&a->base, &desc);
   4674   memset(&addr, 0, sizeof addr);
   4675   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4676   addr.base.frame = s->slot;
   4677   addr.base_type = s->type;
   4678   reg = native_loc_reg(s->type, s->cls, s->reg);
   4679   aa_emit_mem(a, 0, reg, addr, aa_mem_for_type(&a->base, s->type, 8));
   4680 }
   4681 
   4682 AA_UNUSED_FN static void aa_asm_restore_one(AANativeTarget* a,
   4683                                             const AAAsmSavedClobber* s) {
   4684   NativeAddr addr;
   4685   NativeLoc reg = native_loc_reg(s->type, s->cls, s->reg);
   4686   memset(&addr, 0, sizeof addr);
   4687   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4688   addr.base.frame = s->slot;
   4689   addr.base_type = s->type;
   4690   aa_emit_mem(a, 1, reg, addr, aa_mem_for_type(&a->base, s->type, 8));
   4691 }
   4692 
   4693 AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers(
   4694     AANativeTarget* a, u32 int_mask, u32 fp_mask, u32* nsaved_out) {
   4695   AAAsmSavedClobber* saved =
   4696       arena_zarray(a->base.c->tu, AAAsmSavedClobber, 20u);
   4697   u32 n = 0;
   4698   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   4699   KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   4700   for (Reg r = 19u; r <= 28u; ++r) {
   4701     if ((int_mask & (1u << r)) == 0) continue;
   4702     saved[n].cls = NATIVE_REG_INT;
   4703     saved[n].reg = r;
   4704     saved[n].type = i64;
   4705     aa_asm_save_one(a, &saved[n++]);
   4706   }
   4707   for (Reg r = 8u; r <= 15u; ++r) {
   4708     if ((fp_mask & (1u << r)) == 0) continue;
   4709     saved[n].cls = NATIVE_REG_FP;
   4710     saved[n].reg = r;
   4711     saved[n].type = f64;
   4712     aa_asm_save_one(a, &saved[n++]);
   4713   }
   4714   *nsaved_out = n;
   4715   return saved;
   4716 }
   4717 
   4718 static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
   4719                                 const AsmConstraint* outs, u32 nout,
   4720                                 Operand* out_ops, const AsmConstraint* ins,
   4721                                 u32 nin, const Operand* in_ops,
   4722                                 const Sym* clobbers, u32 nclob,
   4723                                 u32 clobber_abi_sets) {
   4724   Operand* bound_outs =
   4725       nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL;
   4726   Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL;
   4727   u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   4728   AAAsmSavedClobber* saved;
   4729   u32 nsaved;
   4730   AA64Asm* a;
   4731 
   4732   aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
   4733   native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
   4734   clob_int |= abi_int;
   4735   clob_fp |= abi_fp;
   4736   used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) |
   4737              (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP);
   4738   used_fp = clob_fp | (1u << 20u) | (1u << 21u);
   4739 
   4740   for (u32 i = 0; i < nout; ++i) {
   4741     const char* body = native_asm_constraint_body(outs[i].str);
   4742     NativeAsmRegPin pin;
   4743     if (aa_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
   4744       /* GNU local register variable: pin to the named hard register. */
   4745       KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   4746       if (pin.cls == NATIVE_REG_FP) {
   4747         used_fp |= 1u << pin.reg;
   4748         clob_fp |= 1u << pin.reg;
   4749       } else {
   4750         used_int |= 1u << pin.reg;
   4751         clob_int |= 1u << pin.reg;
   4752       }
   4753       aa_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
   4754     } else {
   4755       NativeAsmConstraintInfo info;
   4756       if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) {
   4757         Reg reg = info.fixed_reg != REG_NONE
   4758                       ? info.fixed_reg
   4759                       : aa_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4760                                          &used_int, &used_fp);
   4761         KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   4762         if (info.cls == NATIVE_REG_FP) {
   4763           used_fp |= 1u << reg;
   4764           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4765         } else {
   4766           used_int |= 1u << reg;
   4767           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4768         }
   4769         aa_asm_bound_reg(&bound_outs[i], type, info.cls, reg);
   4770       } else if (body[0] == 'm') {
   4771         Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4772         KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   4773         aa_asm_bound_mem(&bound_outs[i], type, reg);
   4774       } else {
   4775         aa_asm_panic(d, "unsupported output constraint");
   4776       }
   4777     }
   4778   }
   4779 
   4780   for (u32 i = 0; i < nin; ++i) {
   4781     const char* body = native_asm_constraint_body(ins[i].str);
   4782     int matched = native_asm_match_index(body);
   4783     if (matched >= 0) {
   4784       if ((u32)matched >= nout)
   4785         aa_asm_panic(d, "matching constraint out of range");
   4786       if (native_asm_constraint_early(outs[matched].str))
   4787         aa_asm_panic(d, "matching input names early-clobber output");
   4788       if (bound_outs[matched].kind != AA64_INLINE_OPK_REG)
   4789         aa_asm_panic(d, "matching constraint requires register output");
   4790       bound_ins[i] = bound_outs[matched];
   4791       continue;
   4792     }
   4793     NativeAsmRegPin pin;
   4794     if (aa_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
   4795       /* GNU local register variable: pin to the named hard register. */
   4796       KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   4797       if (pin.cls == NATIVE_REG_FP) {
   4798         used_fp |= 1u << pin.reg;
   4799         clob_fp |= 1u << pin.reg;
   4800       } else {
   4801         used_int |= 1u << pin.reg;
   4802         clob_int |= 1u << pin.reg;
   4803       }
   4804       aa_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
   4805     } else {
   4806       NativeAsmConstraintInfo info;
   4807       if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) {
   4808         Reg reg = info.fixed_reg != REG_NONE
   4809                       ? info.fixed_reg
   4810                       : aa_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4811                                          &used_int, &used_fp);
   4812         KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   4813         if (info.cls == NATIVE_REG_FP) {
   4814           used_fp |= 1u << reg;
   4815           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4816         } else {
   4817           used_int |= 1u << reg;
   4818           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4819         }
   4820         aa_asm_bound_reg(&bound_ins[i], type, info.cls, reg);
   4821       } else if (body[0] == 'i') {
   4822         if (in_ops[i].kind != OPK_IMM)
   4823           aa_asm_panic(d, "immediate constraint requires immediate operand");
   4824         bound_ins[i] = in_ops[i];
   4825       } else if (body[0] == 'm') {
   4826         Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4827         KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   4828         aa_asm_bound_mem(&bound_ins[i], type, reg);
   4829       } else {
   4830         aa_asm_panic(d, "unsupported input constraint");
   4831       }
   4832     }
   4833   }
   4834 
   4835   saved =
   4836       aa_asm_save_callee_clobbers(aa_of(d->native), clob_int, clob_fp, &nsaved);
   4837   for (u32 i = 0; i < nout; ++i) {
   4838     if (bound_outs[i].kind == AA64_INLINE_OPK_REG) {
   4839       NativeAllocClass cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP
   4840                                  ? NATIVE_REG_FP
   4841                                  : NATIVE_REG_INT;
   4842       if (outs[i].dir == KIT_CG_ASM_INOUT) {
   4843         aa_direct_load_operand_to_reg(
   4844             d, out_ops[i],
   4845             native_loc_reg(bound_outs[i].type, cls,
   4846                            (Reg)bound_outs[i].v.local));
   4847       }
   4848     } else if (bound_outs[i].kind == OPK_INDIRECT) {
   4849       NativeLoc loc =
   4850           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4851                          (Reg)bound_outs[i].v.ind.base);
   4852       aa_direct_load_address_to_reg(d, out_ops[i], loc);
   4853     }
   4854   }
   4855   for (u32 i = 0; i < nin; ++i) {
   4856     if (bound_ins[i].kind == AA64_INLINE_OPK_REG) {
   4857       NativeAllocClass cls = bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP
   4858                                  ? NATIVE_REG_FP
   4859                                  : NATIVE_REG_INT;
   4860       aa_direct_load_operand_to_reg(
   4861           d, in_ops[i],
   4862           native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
   4863     } else if (bound_ins[i].kind == OPK_INDIRECT) {
   4864       NativeLoc loc =
   4865           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4866                          (Reg)bound_ins[i].v.ind.base);
   4867       aa_direct_load_address_to_reg(d, in_ops[i], loc);
   4868     }
   4869   }
   4870   a = aa64_asm_open(d->base.c);
   4871   aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   4872                    nclob);
   4873   aa64_asm_run_template(a, d->native->mc, tmpl);
   4874   aa64_asm_close(a);
   4875 
   4876   for (u32 i = 0; i < nout; ++i) {
   4877     NativeAllocClass cls;
   4878     NativeLoc src;
   4879     if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue;
   4880     cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   4881                                                        : NATIVE_REG_INT;
   4882     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   4883     aa_direct_store_reg_to_operand(d, out_ops[i], src);
   4884   }
   4885   for (u32 i = nsaved; i > 0; --i)
   4886     aa_asm_restore_one(aa_of(d->native), &saved[i - 1u]);
   4887 }
   4888 
   4889 /* ---- NativeTarget (optimizer) asm hook ----
   4890  *
   4891  * The optimizer has already allocated every operand register and arranged the
   4892  * surrounding data flow (inputs are live in their registers on entry, outputs
   4893  * are consumed from their registers on exit; the asm's clobber_mask kept the
   4894  * allocator from holding live values in clobbered registers). So unlike the
   4895  * direct path this hook does NOT self-allocate registers and does NOT load
   4896  * inputs / store outputs -- it only binds the pre-allocated registers to the
   4897  * template, materializing memory-operand base addresses into the reserved
   4898  * scratch registers and saving/restoring callee-saved registers the asm
   4899  * clobbers (the only ABI obligation the allocator cannot discharge itself). */
   4900 
   4901 static NativeAddr aa_asm_loc_to_addr(AANativeTarget* a, SrcLoc loc,
   4902                                      NativeLoc src) {
   4903   NativeAddr addr;
   4904   memset(&addr, 0, sizeof addr);
   4905   addr.base_type = src.type;
   4906   switch ((NativeLocKind)src.kind) {
   4907     case NATIVE_LOC_FRAME:
   4908       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   4909       addr.base.frame = src.v.frame;
   4910       return addr;
   4911     case NATIVE_LOC_ADDR:
   4912       return src.v.addr;
   4913     case NATIVE_LOC_GLOBAL:
   4914       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   4915       addr.base.global.sym = src.v.global.sym;
   4916       addr.base.global.addend = src.v.global.addend;
   4917       return addr;
   4918     case NATIVE_LOC_REG:
   4919       addr.base_kind = NATIVE_ADDR_BASE_REG;
   4920       addr.cls = NATIVE_REG_INT;
   4921       addr.base.reg = src.v.reg;
   4922       return addr;
   4923     default:
   4924       aa_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
   4925   }
   4926 }
   4927 
   4928 /* Resolve a memory-constraint operand to a single base register with zero
   4929  * offset, folding any frame/global/offset into a scratch register. At most the
   4930  * two reserved scratch registers are used across one asm block. */
   4931 static Reg aa_asm_native_mem_base(AANativeTarget* a, SrcLoc loc, NativeLoc src,
   4932                                   u32* ntmp) {
   4933   NativeAddr addr = aa_asm_loc_to_addr(a, loc, src);
   4934   u32 base;
   4935   i32 off;
   4936   Reg dst;
   4937   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
   4938     aa_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
   4939   aa_addr_base(a, addr, &base, &off);
   4940   if (off == 0) return (Reg)base;
   4941   if (*ntmp >= 2u)
   4942     aa_asm_panic_at(a->base.c, loc, "too many memory asm operands");
   4943   dst = (*ntmp == 0u) ? AA_TMP0 : AA_TMP1;
   4944   (*ntmp)++;
   4945   aa_emit_add_imm(a, dst, base, off);
   4946   return dst;
   4947 }
   4948 
   4949 static void aa_asm_load_loc_to_reg(AANativeTarget* a, SrcLoc loc, NativeLoc src,
   4950                                    NativeLoc dst) {
   4951   NativeTarget* t = &a->base;
   4952   NativeAllocClass cls = (NativeAllocClass)dst.cls;
   4953   if (src.kind == NATIVE_LOC_REG) {
   4954     if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src);
   4955     return;
   4956   }
   4957   if (src.kind == NATIVE_LOC_IMM) {
   4958     if (cls != NATIVE_REG_INT)
   4959       aa_asm_panic_at(t->c, loc,
   4960                       "floating-point immediate asm input is unsupported");
   4961     t->load_imm(t, dst, src.v.imm);
   4962     return;
   4963   }
   4964   aa_emit_mem(a, 1, dst, aa_asm_loc_to_addr(a, loc, src),
   4965               aa_mem_for_type(t, dst.type, type_size32(t, dst.type)));
   4966 }
   4967 
   4968 static void aa_asm_store_reg_to_loc(AANativeTarget* a, SrcLoc loc,
   4969                                     NativeLoc dst, NativeLoc src) {
   4970   NativeTarget* t = &a->base;
   4971   if (dst.kind == NATIVE_LOC_REG) {
   4972     if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src);
   4973     return;
   4974   }
   4975   aa_emit_mem(a, 0, src, aa_asm_loc_to_addr(a, loc, dst),
   4976               aa_mem_for_type(t, src.type, type_size32(t, src.type)));
   4977 }
   4978 
   4979 static void aa_asm_bind_native(AANativeTarget* a, SrcLoc loc, Operand* out,
   4980                                const char* constraint, KitCgTypeId type,
   4981                                NativeLoc src, u32* ntmp) {
   4982   const char* body = native_asm_constraint_body(constraint);
   4983   NativeAsmConstraintInfo info;
   4984   if (native_asm_constraint_reg_info(&a->base, constraint, &info)) {
   4985     if (src.kind != NATIVE_LOC_REG)
   4986       aa_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
   4987     if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg)
   4988       aa_asm_panic_at(a->base.c, loc,
   4989                       "fixed-register asm operand in wrong register");
   4990     if (info.allowed_mask &&
   4991         ((Reg)src.v.reg >= 32 ||
   4992          (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0))
   4993       compiler_panic(
   4994           a->base.c, loc,
   4995           "aarch64 inline asm: constraint %s got cls%u reg%u outside %08x",
   4996           constraint, (unsigned)info.cls, (unsigned)src.v.reg,
   4997           (unsigned)info.allowed_mask);
   4998     aa_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg);
   4999   } else if (body[0] == 'i') {
   5000     if (src.kind != NATIVE_LOC_IMM)
   5001       aa_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
   5002     memset(out, 0, sizeof *out);
   5003     out->kind = OPK_IMM;
   5004     out->type = type;
   5005     out->v.imm = src.v.imm;
   5006   } else if (body[0] == 'm') {
   5007     aa_asm_bound_mem(out, type, aa_asm_native_mem_base(a, loc, src, ntmp));
   5008   } else {
   5009     aa_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
   5010   }
   5011 }
   5012 
   5013 static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
   5014                                 const AsmConstraint* outs, u32 nout,
   5015                                 NativeLoc* out_locs, const AsmConstraint* ins,
   5016                                 u32 nin, const NativeLoc* in_locs,
   5017                                 const Sym* clobbers, u32 nclob) {
   5018   AANativeTarget* a = aa_of(t);
   5019   Compiler* c = t->c;
   5020   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   5021   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   5022   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   5023   u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL;
   5024   u32 ntmp = 0;
   5025   AA64Asm* asmh;
   5026 
   5027   for (u32 i = 0; i < nout; ++i) {
   5028     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
   5029     NativeLoc outloc = out_locs[i];
   5030     NativeAsmPinnedLoc pinned =
   5031         native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
   5032     if (pinned.has_pin) {
   5033       if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
   5034         aa_asm_panic_at(c, loc,
   5035                         native_asm_pin_status_message(pinned.pin_status));
   5036       if (pinned.wrong_reg)
   5037         aa_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
   5038       outloc = pinned.loc;
   5039       if (pinned.needs_stage) {
   5040         staged_outs[i] = 1u;
   5041         if (outs[i].dir == KIT_CG_ASM_INOUT)
   5042           aa_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
   5043       }
   5044     }
   5045     aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, &ntmp);
   5046   }
   5047   for (u32 i = 0; i < nin; ++i) {
   5048     const char* body = native_asm_constraint_body(ins[i].str);
   5049     int matched = native_asm_match_index(body);
   5050     KitCgTypeId type;
   5051     if (matched >= 0) {
   5052       if ((u32)matched >= nout)
   5053         aa_asm_panic_at(c, loc, "matching constraint out of range");
   5054       bound_ins[i] = bound_outs[matched];
   5055       continue;
   5056     }
   5057     type = ins[i].type ? ins[i].type : in_locs[i].type;
   5058     {
   5059       const char* in_body = native_asm_constraint_body(ins[i].str);
   5060       NativeAsmConstraintInfo info;
   5061       NativeLoc inloc = in_locs[i];
   5062       NativeAsmPinnedLoc pinned =
   5063           native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
   5064       /* A register-constrained input whose value is an address-taken local
   5065        * arrives in a frame slot: the optimizer cannot keep an address-taken
   5066        * local live in a register across the block, so the "inputs are already
   5067        * in registers" contract does not hold for it. Load it into a reserved
   5068        * scratch register (as the direct path does) before binding. With no
   5069        * hard pin, only unrestricted integer constraints can use this scratch;
   5070        * restricted register sets must already arrive in an allowed hard
   5071        * register. */
   5072       if (pinned.has_pin) {
   5073         if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
   5074           aa_asm_panic_at(c, loc,
   5075                           native_asm_pin_status_message(pinned.pin_status));
   5076         if (pinned.wrong_reg)
   5077           aa_asm_panic_at(c, loc,
   5078                           "hard-register asm operand in wrong register");
   5079         inloc = pinned.loc;
   5080         if (pinned.needs_stage)
   5081           aa_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
   5082       } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
   5083                  info.cls == NATIVE_REG_INT && info.allowed_mask == 0 &&
   5084                  inloc.kind != NATIVE_LOC_REG) {
   5085         Reg r;
   5086         if (ntmp >= 2u) aa_asm_panic_at(c, loc, "too many memory asm operands");
   5087         r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1;
   5088         ntmp++;
   5089         inloc = native_loc_reg(type, NATIVE_REG_INT, r);
   5090         aa_emit_mem(a, 1, inloc, aa_asm_loc_to_addr(a, loc, in_locs[i]),
   5091                     aa_mem_for_type(t, type, type_size32(t, type)));
   5092       }
   5093       (void)in_body;
   5094       aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   5095     }
   5096   }
   5097 
   5098   /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
   5099    * masks and aa_known_callee_saves folded the callee-saved ones into the
   5100    * function's saved set, so the prologue/epilogue already preserve them. */
   5101   asmh = aa64_asm_open(c);
   5102   aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   5103                    nclob);
   5104   aa64_asm_run_template(asmh, t->mc, tmpl);
   5105   aa64_asm_close(asmh);
   5106 
   5107   for (u32 i = 0; i < nout; ++i) {
   5108     NativeAllocClass cls;
   5109     NativeLoc src;
   5110     if (!staged_outs || !staged_outs[i]) continue;
   5111     if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue;
   5112     cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   5113                                                        : NATIVE_REG_INT;
   5114     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   5115     aa_asm_store_reg_to_loc(a, loc, out_locs[i], src);
   5116   }
   5117 }
   5118 
   5119 static const NativeOps aa_direct_ops = {
   5120     .bind_param = aa_bind_param,
   5121     .tail_call_unrealizable_reason = aa_no_tail,
   5122     .va_start_ = aa_va_start_,
   5123     .va_arg_ = aa_va_arg_,
   5124     .va_end_ = aa_va_end_,
   5125     .va_copy_ = aa_va_copy_,
   5126     .asm_block = aa_direct_asm_block,
   5127 };
   5128 
   5129 const NativeOps* aa64_native_direct_ops(void) { return &aa_direct_ops; }