native.c (210050B)
1 /* aa64 NativeTarget production-readiness checklist: 2 * - ABI completeness: finish AAPCS64/Linux va_list and register-save-area 3 * lowering, verify Apple/AAPCS64/Windows arm64 differences, handle all 4 * homogeneous aggregates, indirect/byval/sret corner cases, small aggregate 5 * splitting, multi-register returns, stack alignment, and ABI diagnostics. 6 * - Calls and returns: replace call-plus-return tail handling with true direct 7 * and indirect sibling calls, preserve musttail ABI guarantees, support stack 8 * argument reshuffling without clobbering live inputs, and cover all sret, 9 * variadic, FP, aggregate, and many-argument combinations. 10 * - Frame lowering: implement known-frame/prologue integration for optimized 11 * emission, spill/reload hooks, callee-save tracking for integer and FP/SIMD 12 * registers, large-frame probing/materialization as needed by each platform, 13 * dynamic alloca restoration, and unwind/debug frame metadata. 14 * - Operations and intrinsics: fill remaining scalar, FP, conversion, rounding, 15 * overflow, bit, vector/SIMD, trap, prefetch, and target-specific intrinsics; 16 * validate NaN/ordered/unordered FP compare semantics and integer narrowing 17 * behavior for every supported width. 18 * - Aggregates and memory: support large constants, overlap-safe memmove, 19 * optimized bulk copy/set selection, bitfield load/store, packed/unaligned 20 * accesses, volatile access constraints, and record/slice edge cases across 21 * direct and optimized lowering. 22 * - Atomics: replace ordinary load/store RMW/CAS sequences with correct LL/SC 23 * or LSE loops, implement acquire/release/seq_cst mappings precisely, handle 24 * failure ordering, byte/halfword/word/dword widths, and retry/clobber rules. 
25 * - Inline and file-scope asm: complete register/memory/immediate constraints, 26 * named operands, tied operands, early-clobber and clobber validation, hard 27 * register conflicts, memory barriers, outputs for aggregates/FP values, and 28 * file-scope asm integration. */ 29 30 #include <string.h> 31 32 #include "abi/abi.h" 33 #include "arch/aa64/aa64.h" 34 #include "arch/aa64/asm.h" 35 #include "arch/aa64/isa.h" 36 #include "arch/aa64/regs.h" 37 #include "asm/asm.h" 38 #include "asm/asm_lex.h" 39 #include "cg/native_argmove.h" 40 #include "cg/native_asm.h" 41 #include "cg/native_direct_target.h" 42 #include "cg/native_frame.h" 43 #include "cg/type.h" 44 #include "core/arena.h" 45 #include "core/bytes.h" 46 #include "core/core.h" 47 #include "core/pool.h" 48 #include "core/slice.h" 49 #include "obj/obj.h" 50 51 #if defined(__GNUC__) || defined(__clang__) 52 #define AA_UNUSED_FN __attribute__((unused)) 53 #else 54 #define AA_UNUSED_FN 55 #endif 56 57 enum { 58 AA_X8 = 8u, /* indirect-result (sret) register; usable as a copy base that 59 aa_copy_bytes (which scratches only x16/x17) never clobbers */ 60 AA_TMP0 = 16u, 61 AA_TMP1 = 17u, 62 AA_FP = 29u, 63 AA_LR = 30u, 64 AA_SP = 31u, 65 AA_FRAME_SAVE_SIZE = 16u, 66 /* Worst-case reserved prologue region (NDT single-pass path patches it in 67 * place; the optimizer path reserves exactly what it emits). Sized to hold 68 * the fat prologue plus the Windows large-frame stack probe (≤7 words, see 69 * aa_words_stack_probe). */ 70 AA_PROLOGUE_WORDS = 32u, 71 AA_TAIL_WORDS = 32u, 72 }; 73 74 /* Windows/AArch64 TLS Local-Exec. The TEB pointer lives in the reserved 75 * platform register x18 (never allocated; see AA_PHYS_INT_RESERVED(18)), and 76 * the thread's TLS-array pointer (TEB.ThreadLocalStoragePointer) sits at 77 * TEB+0x58 — same offset as on Win64/x86-64. 
*/ 78 enum { 79 AA_WIN_TEB_REG = 18u, 80 AA_WIN_TEB_TLS_PTR_OFF = 0x58u, 81 }; 82 83 /* ============================================================================ 84 * AAPCS64 frame layout 85 * 86 * Two layouts. Every fp- or sp-relative offset in this file is computed via one 87 * of the aa_fp_off_ / aa_sp_off_ helpers below — no site does bare arithmetic 88 * on AA_FP / AA_SP, and no site outside those helpers branches on the layout. 89 * 90 * TOP-RECORD (default — single-pass -O0, fat frames, and out_stack>0 small 91 * frames). fp anchors at the caller's saved-pair address near the top; sp at 92 * the bottom of the outgoing-arg area. Offsets are frame-size-independent. 93 * 94 * high addr caller's stack frame 95 * +------------------------------+ 96 * | incoming stack args | aa_fp_off_in_arg(a,i) = 16+i 97 * +------------------------------+ 98 * fp --> | saved x29 (prev fp) | aa_fp_off_saved_fp() = 0 99 * | saved x30 (prev lr) | aa_fp_off_saved_lr() = 8 100 * +------------------------------+ 101 * | frame slots | aa_fp_off_slot(a,off) = -off 102 * | (callee-saves + locals | 103 * | + spills + sret/variadic) | 104 * +------------------------------+ 105 * | outgoing args | aa_sp_off_out_arg(i) 106 * sp --> +------------------------------+ 107 * low addr CFA = fp + 16 108 * 109 * BOTTOM-RECORD (fp_at_bottom — known-frame -O1 small frames with 110 * callee-saves/locals and out_stack==0). The record moves to the bottom so the 111 * sp adjustment folds into a pre/post-indexed stp/ldp (−2 insns/call). fp = sp; 112 * slots/callee-saves stack ABOVE the record at positive offsets. Offsets depend 113 * on frame_size (hence known-frame only, where the frame is final before body). 
114 * 115 * high addr caller's stack frame 116 * +------------------------------+ 117 * | incoming stack args | aa_fp_off_in_arg(a,i) = N+i 118 * +------------------------------+ <- caller's sp = CFA = fp + 119 * N | frame slots (+ align pad) | aa_fp_off_slot(a,off)=N-off | 120 * (callee-saves + locals …) | (in [16, N), above record) 121 * +------------------------------+ 122 * fp = sp --> | saved x29 (prev fp) | aa_fp_off_saved_fp() = 0 123 * | saved x30 (prev lr) | aa_fp_off_saved_lr() = 8 124 * low addr +------------------------------+ (N = frame_size; 125 * out_stack==0) 126 * 127 * frame_size (N) = align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack). 128 * Tail calls write outgoing args into the caller's incoming-args window — 129 * physically the same address, expressed via aa_fp_off_tail_out_arg. 130 * ========================================================================== */ 131 132 static u32 align_up_u32(u32 v, u32 align); 133 134 typedef struct AAFrameLayout { 135 u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals 136 * + spills + sret/variadic) */ 137 u32 out_stack; /* max outgoing-arg bytes across all calls in this function */ 138 u32 top_home; /* Windows-variadic GP register home area, reserved between 139 * the saved pair and the incoming stack args so the 140 * plain-pointer va_list walks register then stack varargs as 141 * one contiguous block (0 on every other ABI). */ 142 u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + 143 * out_stack) */ 144 } AAFrameLayout; 145 146 static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack, 147 u32 top_home) { 148 AAFrameLayout L; 149 L.slot_bytes = slot_bytes; 150 L.out_stack = out_stack; 151 L.top_home = top_home; 152 L.frame_size = 153 align_up_u32(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + out_stack, 16u); 154 return L; 155 } 156 157 /* FP-relative byte offsets. 
The saved-pair is at [fp]/[fp+8] in both the 158 * top-record and bottom-record (fp_at_bottom) layouts, so these two are 159 * layout-independent. The frame-size-dependent helpers — aa_fp_off_in_arg, 160 * aa_fp_off_slot, aa_fp_off_tail_out_arg — branch on a->fp_at_bottom and are 161 * defined after AANativeTarget (see aa_fp_off_* below aa_of). */ 162 static inline i32 aa_fp_off_saved_fp(void) { return 0; } 163 static inline i32 aa_fp_off_saved_lr(void) { return 8; } 164 165 /* SP-relative byte offsets. */ 166 static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; } 167 static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) { 168 return L->frame_size - AA_FRAME_SAVE_SIZE - L->top_home; 169 } 170 171 /* Frame slots and callee-save records are owned by the shared NativeFrame 172 * bookkeeping (cg/native_frame.h); these aliases keep the aa64-local spellings. 173 */ 174 typedef NativeFrameSlotEntry AANativeSlot; 175 176 /* Deferred in-function patches, all resolved in aa_func_end once the frame 177 * layout (max_outgoing, callee-saves) is final. One growable list carries both 178 * kinds; each entry patches a disjoint, fixed code position, so insertion order 179 * is irrelevant. The prologue region is patched separately (exactly one per 180 * function, fixed position) and is not a list entry. 
*/ 181 typedef enum AAPatchKind { 182 AA_PATCH_ALLOCA, /* single instr: add dst, sp, #max_outgoing */ 183 AA_PATCH_TAIL, /* AA_TAIL_WORDS region: callee restores + frame + br/b */ 184 } AAPatchKind; 185 186 typedef struct AAPatch { 187 AAPatchKind kind; 188 u32 pos; 189 union { 190 u32 dst_reg; /* AA_PATCH_ALLOCA */ 191 NativeLoc callee; /* AA_PATCH_TAIL */ 192 } u; 193 } AAPatch; 194 195 typedef NativeFrameCalleeSave AACalleeSave; 196 197 typedef struct AANativeTarget { 198 NativeTarget base; 199 SrcLoc loc; 200 const CGFuncDesc* func; 201 202 /* Shared frame bookkeeping: slot table, cumulative offset, max-outgoing, 203 * callee-save set, and the known_frame / has_alloca / frame_final flags. */ 204 NativeFrame frame; 205 /* Final frame size, set once in aa_func_begin_known_frame when fp_at_bottom 206 * is decided. Read by the fp-relative offset helpers in the bottom-record 207 * layout (where slot/incoming-arg offsets depend on frame_size); meaningless 208 * and unread on the single-pass path, which never sets fp_at_bottom. */ 209 u32 frame_size_final; 210 u32 incoming_stack_size; 211 /* Windows-variadic GP register home area size (gp_reg_count * gp_slot_size, 212 * 64 today; 0 on every other ABI). When nonzero the function takes the fat 213 * top-record layout and homes x0..x7 into [fp + AA_FRAME_SAVE_SIZE ..] so the 214 * plain-pointer va_list can walk register then stack varargs contiguously. 
*/ 215 u32 top_home_bytes; 216 u32 next_param_int; 217 u32 next_param_fp; 218 u32 next_param_stack; 219 NativeFrameSlot sret_ptr_slot; 220 NativeFrameSlot saved_tmp_slot; 221 NativeFrameSlot va_gr_slot; 222 NativeFrameSlot va_vr_slot; 223 224 AAPatch* patches; 225 u32 npatches; 226 u32 patches_cap; 227 u32 nalloca; /* count of AA_PATCH_ALLOCA entries; gates slim prologue/frame */ 228 229 u32 func_start; 230 u32 prologue_pos; 231 u32 minimal_prologue_words; /* opt path: exact prologue length, else 0 */ 232 MCLabel epilogue_label; 233 234 /* Set at func_end when this function qualifies for the slim prologue/epilogue 235 * (Tier A: no body locals/spills, no callee-saves, no alloca, no outgoing 236 * stack args, no sret/variadic). When set, the prologue patch and epilogue 237 * emit a 2-insn `stp x29,x30,[sp,#-16]! ; mov x29,sp` and matching `ldp 238 * x29,x30,[sp],#16 ; ret` instead of the fat 4+3-insn FP-frame form. */ 239 u8 slim_prologue; 240 /* Set at func_end when frame_size - 16 fits stp's signed 7-bit scaled 241 * immediate (frame_size <= 520). Skips the `add x17, sp, #(N-16)` scratch 242 * materialization in the prologue (stp x29,x30,[sp,#N-16] instead) and 243 * the matching `add x10, fp, #0` in the epilogue (ldp x29,x30,[sp,#N-16] 244 * + add sp,sp,#N). Mutually exclusive with `slim_prologue` (Tier A wins 245 * when both would apply) and `fp_at_bottom` (which wins for out_stack==0). 246 * Now only reached for small frames with outgoing stack args (out_stack>0), 247 * where the record cannot move to the bottom. Keeps the top-record layout. */ 248 u8 slim_small_frame; 249 /* Set by aa_func_begin_known_frame for a small frame with callee-saves/locals 250 * and no outgoing stack args: the frame record moves to the bottom of the 251 * frame (fp = sp, `mov x29,sp`) so the sp adjustment folds into a pre-indexed 252 * `stp x29,x30,[sp,#-N]!` entry and post-indexed `ldp x29,x30,[sp],#N` exit 253 * (−2 insns/call vs slim_small_frame). 
Slots and callee-saves stack ABOVE the 254 * record at positive fp offsets; incoming args sit at fp+frame_size; CFA = 255 * fp+frame_size. The frame-size-dependent offsets are the reason this is only 256 * available on the known-frame path (frame final before the body). Mutually 257 * exclusive with slim_prologue (Tier A) and slim_small_frame; gated on 258 * out_stack==0 && !has_alloca && frame_size <= 504. */ 259 u8 fp_at_bottom; 260 } AANativeTarget; 261 262 static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; } 263 264 /* Layout-aware FP-relative offsets. Every frame use site goes through these; 265 * the fp_at_bottom test lives here and nowhere else. 266 * 267 * top-record (default): record near the top, fp anchored at the saved pair. 268 * incoming args at fp+16+b, slots below fp at -off. CFA = fp+16. 269 * bottom-record (fp_at_bottom): record at the bottom, fp = sp. 270 * incoming args at fp+frame_size+b, slots above the record at 271 * frame_size-off (in [16, frame_size), never overlapping the 16-byte 272 * record since frame_size = align16(16+cum_off) >= 16+cum_off). 273 * CFA = fp+frame_size. */ 274 static inline i32 aa_fp_off_in_arg(const AANativeTarget* a, u32 byte_off) { 275 /* top-record incoming args sit above the saved pair and the (usually empty) 276 * Windows-variadic GP home area; bottom-record never carries a home area. */ 277 u32 base = a->fp_at_bottom ? a->frame_size_final 278 : AA_FRAME_SAVE_SIZE + a->top_home_bytes; 279 return (i32)(base + byte_off); 280 } 281 static inline i32 aa_fp_off_slot(const AANativeTarget* a, u32 slot_off) { 282 return a->fp_at_bottom ? (i32)a->frame_size_final - (i32)slot_off 283 : -(i32)slot_off; 284 } 285 /* Outgoing stack args on a tail call land in the caller's incoming-arg window — 286 * the same physical address the tail-callee will read via aa_fp_off_in_arg. 287 * Same helper, distinct name for site-side intent. 
*/ 288 static inline i32 aa_fp_off_tail_out_arg(const AANativeTarget* a, 289 u32 byte_off) { 290 return aa_fp_off_in_arg(a, byte_off); 291 } 292 /* CFA = caller's sp, expressed as an fp-relative offset (fp+16 top-record, 293 * fp+frame_size bottom-record). Named so the CFI emit site stays layout-blind. 294 */ 295 static inline i32 aa_cfa_off(const AANativeTarget* a) { 296 return a->fp_at_bottom 297 ? (i32)a->frame_size_final 298 : (i32)(AA_FRAME_SAVE_SIZE + a->top_home_bytes); 299 } 300 301 /* fp-relative offset of GP home slot `i` (Windows variadic only). The home area 302 * sits just above the saved pair and just below the incoming stack args, so 303 * slot gp_reg_count coincides with incoming-arg byte 0 (top-record only — a 304 * function with a home area never takes a slim/bottom layout). */ 305 static inline i32 aa_fp_off_home_slot(u32 i) { 306 return (i32)(AA_FRAME_SAVE_SIZE + i * 8u); 307 } 308 309 static void aa_panic(AANativeTarget* a, const char* msg) { 310 compiler_panic(a->base.c, a->loc, "aarch64 native target: %s", msg); 311 } 312 313 /* Declared locally rather than pulling in debug/debug.h, keeping the 314 * backend's dependency on the Debug producer to this one entry point — 315 * same pattern as the x64/rv64 emit TUs (see arch/mc.h). */ 316 extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); 317 extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs, 318 u32 end_ofs); 319 320 static void aa_emit32(MCEmitter* mc, u32 word) { 321 u8 b[4]; 322 u32 ofs = obj_pos(mc->obj, mc->section_id); 323 wr_u32_le(b, word); 324 mc->emit_bytes(mc, b, sizeof b); 325 /* Record one line-table row per instruction start (no-op when not -g). 
*/ 326 if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); 327 } 328 329 static void aa_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) { 330 u8 b[4]; 331 wr_u32_le(b, word); 332 obj_patch(obj, sec, off, b, sizeof b); 333 } 334 335 static u32 align_up_u32(u32 v, u32 align) { 336 u32 mask = align ? align - 1u : 0u; 337 return (v + mask) & ~mask; 338 } 339 340 static u32 type_size32(NativeTarget* t, KitCgTypeId type) { 341 u64 n = type ? cg_type_size(t->c, type) : 8u; 342 if (n == 0) n = 8u; 343 if (n > 16u) 344 compiler_panic(t->c, (SrcLoc){0, 0, 0}, 345 "aarch64 native target: scalar too large"); 346 return (u32)n; 347 } 348 349 static u32 type_align32(NativeTarget* t, KitCgTypeId type) { 350 u64 n = type ? cg_type_align(t->c, type) : 8u; 351 if (n == 0) n = 1u; 352 if (n > 16u) n = 16u; 353 return (u32)n; 354 } 355 356 static u32 size_idx(u32 n) { 357 if (n <= 1u) return 0u; 358 if (n <= 2u) return 1u; 359 if (n <= 4u) return 2u; 360 return 3u; 361 } 362 363 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; } 364 365 static int loc_is_64(NativeTarget* t, NativeLoc loc) { 366 return type_size32(t, loc.type) == 8u || cg_type_is_ptr(t->c, loc.type); 367 } 368 369 /* native_loc_is_fp is shared in native_target.h. */ 370 371 static __attribute__((unused)) int aa_use_got_for_sym(NativeTarget* t, 372 ObjSymId sym) { 373 return obj_symbol_extern_via_got(t->c, t->obj, sym); 374 } 375 376 static __attribute__((unused)) RelocKind aa_ldst_reloc_for_size(u32 size) { 377 switch (size) { 378 case 0: 379 return R_AARCH64_LDST8_ABS_LO12_NC; 380 case 1: 381 return R_AARCH64_LDST16_ABS_LO12_NC; 382 case 2: 383 return R_AARCH64_LDST32_ABS_LO12_NC; 384 case 3: 385 return R_AARCH64_LDST64_ABS_LO12_NC; 386 default: 387 return R_AARCH64_LDST64_ABS_LO12_NC; 388 } 389 } 390 391 static u32 aa_load_imm_words(u32* out, u32 cap, u32 sf, u32 rd, i64 imm) { 392 u64 v = (u64)imm; 393 u32 words = sf ? 
4u : 2u; 394 u32 n = 0; 395 for (u32 i = 0; i < words; ++i) { 396 u32 part = (u32)((v >> (i * 16u)) & 0xffffu); 397 if (!part && n) continue; 398 if (n >= cap) return 0; 399 out[n] = n ? aa64_movk(sf, rd, part, i) : aa64_movz(sf, rd, part, i); 400 ++n; 401 } 402 if (!n) { 403 if (!cap) return 0; 404 out[n++] = aa64_movz(sf, rd, 0, 0); 405 } 406 return n; 407 } 408 409 static void aa_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) { 410 u32 words[4]; 411 u32 n = aa_load_imm_words(words, 4u, sf, rd, imm); 412 for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]); 413 } 414 415 static void aa_emit_add_imm(AANativeTarget* a, u32 rd, u32 rn, i32 off) { 416 u32 imm12, sh; 417 MCEmitter* mc = a->base.mc; 418 if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) { 419 aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh)); 420 return; 421 } 422 if (off < 0 && aa64_addsub_imm_fits(-(i64)off, &imm12, &sh)) { 423 aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh)); 424 return; 425 } 426 aa_emit_load_imm(mc, 1, rd, off); 427 aa_emit32(mc, aa64_add(1, rd, rn, rd)); 428 } 429 430 static __attribute__((unused)) void aa_emit_add_i64(AANativeTarget* a, u32 rd, 431 u32 rn, i64 off) { 432 u32 imm12, sh; 433 MCEmitter* mc = a->base.mc; 434 if (off >= 0 && aa64_addsub_imm_fits(off, &imm12, &sh)) { 435 aa_emit32(mc, aa64_add_imm(1, rd, rn, imm12, sh)); 436 return; 437 } 438 if (off < 0 && aa64_addsub_imm_fits(-off, &imm12, &sh)) { 439 aa_emit32(mc, aa64_sub_imm(1, rd, rn, imm12, sh)); 440 return; 441 } 442 aa_emit_load_imm(mc, 1, rd, off); 443 aa_emit32(mc, aa64_add(1, rd, rn, rd)); 444 } 445 446 static u32 aa_ldur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) { 447 return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size, 448 .V = v, 449 .opc = AA64_LDST_OPC_LDR, 450 .imm9 = (u32)simm9 & 0x1ffu, 451 .Rn = rn, 452 .Rt = rt}); 453 } 454 455 static u32 aa_stur_v(u32 size, u32 v, u32 rt, u32 rn, i32 simm9) { 456 return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size, 457 .V = v, 458 .opc 
= AA64_LDST_OPC_STR, 459 .imm9 = (u32)simm9 & 0x1ffu, 460 .Rn = rn, 461 .Rt = rt}); 462 } 463 464 static u32 aa_ldr_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) { 465 u32 sc = byte_off >> size; 466 return aa64_ldst_uimm_pack((AA64LdStUimm){.size = size, 467 .V = v, 468 .opc = AA64_LDST_OPC_LDR, 469 .imm12 = sc, 470 .Rn = rn, 471 .Rt = rt}); 472 } 473 474 static u32 aa_str_uimm_v(u32 size, u32 v, u32 rt, u32 rn, u32 byte_off) { 475 u32 sc = byte_off >> size; 476 return aa64_ldst_uimm_pack((AA64LdStUimm){.size = size, 477 .V = v, 478 .opc = AA64_LDST_OPC_STR, 479 .imm12 = sc, 480 .Rn = rn, 481 .Rt = rt}); 482 } 483 484 static u32 aa_ldr_uimm(u32 size, u32 rt, u32 rn, u32 byte_off) { 485 return aa_ldr_uimm_v(size, 0, rt, rn, byte_off); 486 } 487 488 static __attribute__((unused)) u32 aa_str_uimm(u32 size, u32 rt, u32 rn, 489 u32 byte_off) { 490 return aa_str_uimm_v(size, 0, rt, rn, byte_off); 491 } 492 493 static __attribute__((unused)) u32 aa_ldst_regoff_v(u32 size, u32 v, u32 load, 494 u32 rt, u32 rn, u32 rm, 495 u32 scaled) { 496 return ((size & 3u) << 30) | 0x38200800u | ((v & 1u) << 26) | 497 ((load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR) << 22) | 498 ((rm & 0x1fu) << 16) | (3u << 13) | ((scaled & 1u) << 12) | 499 ((rn & 0x1fu) << 5) | (rt & 0x1fu); 500 } 501 502 static __attribute__((unused)) u32 aa_ldr_lit64(u32 rt, u32 imm19) { 503 return 0x58000000u | ((imm19 & 0x7ffffu) << 5) | (rt & 0x1fu); 504 } 505 506 static __attribute__((unused)) u32 aa_mrs_tpidr_el0(u32 rt) { 507 return 0xd53bd040u | (rt & 0x1fu); 508 } 509 510 static u32 aa_fp_bin(u32 op, u32 is_double, u32 rd, u32 rn, u32 rm) { 511 return (is_double ? 0x1e600000u : 0x1e200000u) | op | ((rm & 0x1fu) << 16) | 512 ((rn & 0x1fu) << 5) | (rd & 0x1fu); 513 } 514 515 static u32 aa_fcmp(u32 is_double, u32 rn, u32 rm) { 516 return (is_double ? 
0x1e602000u : 0x1e202000u) | ((rm & 0x1fu) << 16) | 517 ((rn & 0x1fu) << 5); 518 } 519 520 static u32 aa_fneg(u32 is_double, u32 rd, u32 rn) { 521 return (is_double ? 0x1e614000u : 0x1e214000u) | ((rn & 0x1fu) << 5) | 522 (rd & 0x1fu); 523 } 524 525 static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) { 526 return (is_double ? 0x1e604000u : 0x1e204000u) | ((rn & 0x1fu) << 5) | 527 (rd & 0x1fu); 528 } 529 530 /* MOV Vd.16B, Vn.16B (alias of ORR Vd.16B, Vn.16B, Vn.16B): a full 128-bit 531 * SIMD register copy. Used to move binary128 / long double values, which fmov 532 * (scalar, max 64-bit) would truncate. */ 533 static u32 aa_mov_vec16(u32 rd, u32 rn) { 534 return 0x4ea01c00u | ((rn & 0x1fu) << 16) | ((rn & 0x1fu) << 5) | 535 (rd & 0x1fu); 536 } 537 538 static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) { 539 return (is64_src ? 0x9e220000u : 0x1e220000u) | 540 (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu); 541 } 542 543 static u32 aa_ucvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) { 544 return (is64_src ? 0x9e230000u : 0x1e230000u) | 545 (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu); 546 } 547 548 static u32 aa_fcvtzs(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) { 549 return (is64_dst ? 0x9e380000u : 0x1e380000u) | 550 (is_double_src ? 0x00400000u : 0) | ((fn & 0x1fu) << 5) | (rd & 0x1fu); 551 } 552 553 static u32 aa_fcvtzu(u32 is64_dst, u32 is_double_src, u32 rd, u32 fn) { 554 return (is64_dst ? 0x9e390000u : 0x1e390000u) | 555 (is_double_src ? 0x00400000u : 0) | ((fn & 0x1fu) << 5) | (rd & 0x1fu); 556 } 557 558 static u32 aa_fcvt_d_s(u32 rd, u32 rn) { 559 return 0x1e22c000u | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 560 } 561 562 static u32 aa_fcvt_s_d(u32 rd, u32 rn) { 563 return 0x1e624000u | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 564 } 565 566 static u32 aa_fmov_gpr_to_fp(u32 is64, u32 fd, u32 rn) { 567 return (is64 ? 
0x9e670000u : 0x1e270000u) | ((rn & 0x1fu) << 5) | 568 (fd & 0x1fu); 569 } 570 571 static u32 aa_fmov_fp_to_gpr(u32 is64, u32 rd, u32 fn) { 572 return (is64 ? 0x9e660000u : 0x1e260000u) | ((fn & 0x1fu) << 5) | 573 (rd & 0x1fu); 574 } 575 576 static u32 aa_clz(u32 sf, u32 rd, u32 rn) { 577 return (sf ? 0xdac01000u : 0x5ac01000u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 578 } 579 580 static u32 aa_rbit(u32 sf, u32 rd, u32 rn) { 581 return (sf ? 0xdac00000u : 0x5ac00000u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 582 } 583 584 static u32 aa_rev(u32 sf, u32 rd, u32 rn) { 585 return (sf ? 0xdac00c00u : 0x5ac00800u) | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 586 } 587 588 static u32 aa_sbfm(u32 sf, u32 rd, u32 rn, u32 immr, u32 imms) { 589 return (sf ? 0x93400000u : 0x13000000u) | ((immr & 0x3fu) << 16) | 590 ((imms & 0x3fu) << 10) | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 591 } 592 593 static __attribute__((unused)) u32 aa_ubfm(u32 sf, u32 rd, u32 rn, u32 immr, 594 u32 imms) { 595 return (sf ? 0xd3400000u : 0x53000000u) | ((immr & 0x3fu) << 16) | 596 ((imms & 0x3fu) << 10) | ((rn & 0x1fu) << 5) | (rd & 0x1fu); 597 } 598 599 static __attribute__((unused)) u32 aa_ldaxr(u32 size, u32 rt, u32 rn) { 600 return (size << 30) | 0x085ffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu); 601 } 602 603 static __attribute__((unused)) u32 aa_ldxr(u32 size, u32 rt, u32 rn) { 604 return (size << 30) | 0x085f7c00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu); 605 } 606 607 static __attribute__((unused)) u32 aa_stlxr(u32 size, u32 rs, u32 rt, u32 rn) { 608 return (size << 30) | 0x0800fc00u | ((rs & 0x1fu) << 16) | 609 ((rn & 0x1fu) << 5) | (rt & 0x1fu); 610 } 611 612 static __attribute__((unused)) u32 aa_stxr(u32 size, u32 rs, u32 rt, u32 rn) { 613 return (size << 30) | 0x08007c00u | ((rs & 0x1fu) << 16) | 614 ((rn & 0x1fu) << 5) | (rt & 0x1fu); 615 } 616 617 static __attribute__((unused)) u32 aa_ldar(u32 size, u32 rt, u32 rn) { 618 return (size << 30) | 0x08dffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu); 619 } 620 621 
static __attribute__((unused)) u32 aa_stlr(u32 size, u32 rt, u32 rn) { 622 return (size << 30) | 0x089ffc00u | ((rn & 0x1fu) << 5) | (rt & 0x1fu); 623 } 624 625 static u32 aa_umaddl(u32 rd, u32 rn, u32 rm, u32 ra) { 626 return 0x9ba00000u | ((rm & 0x1fu) << 16) | ((ra & 0x1fu) << 10) | 627 ((rn & 0x1fu) << 5) | (rd & 0x1fu); 628 } 629 630 static u32 aa_smaddl(u32 rd, u32 rn, u32 rm, u32 ra) { 631 return 0x9b200000u | ((rm & 0x1fu) << 16) | ((ra & 0x1fu) << 10) | 632 ((rn & 0x1fu) << 5) | (rd & 0x1fu); 633 } 634 635 static u32 aa_smulh(u32 rd, u32 rn, u32 rm) { 636 return 0x9b407c00u | ((rm & 0x1fu) << 16) | ((rn & 0x1fu) << 5) | 637 (rd & 0x1fu); 638 } 639 640 static u32 aa_umulh(u32 rd, u32 rn, u32 rm) { 641 return 0x9bc07c00u | ((rm & 0x1fu) << 16) | ((rn & 0x1fu) << 5) | 642 (rd & 0x1fu); 643 } 644 645 static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) { 646 return aa64_addsubsr_pack( 647 (AA64AddSubSR){.sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd}); 648 } 649 650 static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, NativeLoc rhs); 651 652 static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) { 653 return aa64_addsubsr_pack((AA64AddSubSR){.sf = 1, 654 .op = 0, 655 .S = 0, 656 .shift = 0, 657 .Rm = rm, 658 .imm6 = shift, 659 .Rn = rn, 660 .Rd = rd}); 661 } 662 663 static u32 aa_cset(u32 sf, u32 rd, u32 cond) { 664 return aa64_csinc_enc(sf, rd, AA64_ZR, AA64_ZR, cond ^ 1u); 665 } 666 667 static u32 cmp_cond(CmpOp op) { 668 switch (op) { 669 case CMP_EQ: 670 return 0x0u; 671 case CMP_NE: 672 return 0x1u; 673 case CMP_LT_U: 674 return 0x3u; 675 case CMP_LE_U: 676 return 0x9u; 677 case CMP_GT_U: 678 return 0x8u; 679 case CMP_GE_U: 680 return 0x2u; 681 case CMP_LT_S: 682 return 0xbu; 683 case CMP_LE_S: 684 return 0xdu; 685 case CMP_GT_S: 686 return 0xcu; 687 case CMP_GE_S: 688 return 0xau; 689 /* FP predicates after FCMP set NZCV as: a<b -> N; a==b -> Z,C; a>b -> C; 690 * unordered -> C,V. 
Each maps to a single condition except CMP_ONE_F / 691 * CMP_UEQ_F (synthesized with two instructions in aa_cmp/aa_cmp_branch, 692 * which intercept them before calling cmp_cond). */ 693 case CMP_OEQ_F: 694 return 0x0u; /* EQ */ 695 case CMP_OLT_F: 696 return 0x4u; /* MI */ 697 case CMP_OLE_F: 698 return 0x9u; /* LS */ 699 case CMP_OGT_F: 700 return 0xcu; /* GT */ 701 case CMP_OGE_F: 702 return 0xau; /* GE */ 703 case CMP_UNE_F: 704 return 0x1u; /* NE (unordered or not-equal) */ 705 case CMP_ULT_F: 706 return 0xbu; /* LT (unordered or less-than) */ 707 case CMP_ULE_F: 708 return 0xdu; /* LE (unordered or less-or-equal) */ 709 case CMP_UGT_F: 710 return 0x8u; /* HI (unordered or greater-than) */ 711 case CMP_UGE_F: 712 return 0x2u; /* CS (unordered or greater-or-equal) */ 713 default: 714 return 0x0u; 715 } 716 } 717 718 static AANativeSlot* aa_slot(AANativeTarget* a, NativeFrameSlot slot) { 719 return native_frame_slot_at(&a->frame, slot); 720 } 721 722 static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out, 723 i32* off_out) { 724 *base_out = AA_TMP0; 725 *off_out = addr.offset; 726 switch ((NativeAddrBaseKind)addr.base_kind) { 727 case NATIVE_ADDR_BASE_REG: 728 *base_out = addr.base.reg; 729 return; 730 case NATIVE_ADDR_BASE_FRAME: { 731 AANativeSlot* s = aa_slot(a, addr.base.frame); 732 *base_out = AA_FP; 733 *off_out = aa_fp_off_slot(a, s->off) + addr.offset; 734 return; 735 } 736 case NATIVE_ADDR_BASE_GLOBAL: { 737 NativeLoc tmp; 738 memset(&tmp, 0, sizeof tmp); 739 tmp.kind = NATIVE_LOC_REG; 740 tmp.cls = NATIVE_REG_INT; 741 tmp.type = builtin_id(KIT_CG_BUILTIN_I64); 742 tmp.v.reg = AA_TMP0; 743 a->base.load_addr(&a->base, tmp, addr); 744 *base_out = AA_TMP0; 745 *off_out = 0; 746 return; 747 } 748 default: 749 aa_panic(a, "unsupported address base"); 750 } 751 } 752 753 static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off); 754 static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off); 755 756 static void 
aa_emit_mem_q(AANativeTarget* a, int load, NativeLoc reg, 757 NativeAddr addr) { 758 u32 base, rt; 759 i32 off; 760 MCEmitter* mc = a->base.mc; 761 if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) 762 aa_panic(a, "unsupported q-register indexed memory access"); 763 aa_addr_base(a, addr, &base, &off); 764 rt = loc_reg(reg); 765 if (off >= 0 && (((u32)off & 15u) == 0) && ((u32)off >> 4) <= 0xfffu) { 766 aa_emit32(mc, aa_ldst_q_uimm(load, rt, base, (u32)off)); 767 return; 768 } 769 if (off >= -256 && off <= 255) { 770 aa_emit32(mc, aa_ldst_q_simm9(load, rt, base, off)); 771 return; 772 } 773 aa_emit_add_imm(a, AA_TMP1, base, off); 774 aa_emit32(mc, aa_ldst_q_uimm(load, rt, AA_TMP1, 0)); 775 } 776 777 static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg, 778 NativeAddr addr, MemAccess mem) { 779 u32 base, rt, sz; 780 i32 off; 781 MCEmitter* mc = a->base.mc; 782 rt = loc_reg(reg); 783 sz = size_idx(mem.size 784 ? mem.size 785 : type_size32(&a->base, reg.type ? reg.type : mem.type)); 786 if (native_loc_is_fp(reg) && 787 (mem.size 788 ? mem.size 789 : type_size32(&a->base, reg.type ? reg.type : mem.type)) == 16u) { 790 aa_emit_mem_q(a, load, reg, addr); 791 return; 792 } 793 if (native_loc_is_fp(reg) && sz < 2u) sz = 2u; 794 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL && 795 addr.index_kind == NATIVE_ADDR_INDEX_NONE) { 796 i64 addend = addr.base.global.addend + (i64)addr.offset; 797 u32 scratch = (!load && rt == AA_TMP0) ? AA_TMP1 : AA_TMP0; 798 u32 pos = mc->pos(mc); 799 if (aa_use_got_for_sym(&a->base, addr.base.global.sym)) { 800 aa_emit32(mc, aa64_adrp(scratch, 0, 0)); 801 mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_GOT_PAGE, 802 addr.base.global.sym, 0, 0, 0); 803 pos = mc->pos(mc); 804 aa_emit32(mc, aa_ldr_uimm(3, scratch, scratch, 0)); 805 mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_LD64_GOT_LO12_NC, 806 addr.base.global.sym, 0, 0, 0); 807 if (addend) aa_emit_add_i64(a, scratch, scratch, addend); 808 aa_emit32(mc, load 809 ? 
aa_ldur_v(sz, native_loc_is_fp(reg), rt, scratch, 0) 810 : aa_stur_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); 811 return; 812 } 813 aa_emit32(mc, aa64_adrp(scratch, 0, 0)); 814 mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21, 815 addr.base.global.sym, addend, 0, 0); 816 pos = mc->pos(mc); 817 aa_emit32(mc, 818 load ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0) 819 : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); 820 mc->emit_reloc_at(mc, mc->section_id, pos, aa_ldst_reloc_for_size(sz), 821 addr.base.global.sym, addend, 0, 0); 822 return; 823 } 824 aa_addr_base(a, addr, &base, &off); 825 if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) { 826 u32 use_base = base; 827 u32 scaled = 0; 828 if (addr.index_kind != NATIVE_ADDR_INDEX_REG) 829 aa_panic(a, "unsupported address index"); 830 if (off) { 831 use_base = AA_TMP1; 832 aa_emit_add_imm(a, use_base, base, off); 833 } 834 if (addr.log2_scale == 0) { 835 scaled = 0; 836 } else if (addr.log2_scale == sz) { 837 scaled = 1; 838 } else { 839 aa_panic(a, "unsupported memory address scale"); 840 } 841 aa_emit32(mc, aa_ldst_regoff_v(sz, native_loc_is_fp(reg), load, rt, 842 use_base, addr.index.reg, scaled)); 843 return; 844 } 845 if (off >= 0 && (((u32)off & ((1u << sz) - 1u)) == 0) && 846 ((u32)off >> sz) <= 0xfffu) { 847 aa_emit32( 848 mc, load 849 ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off) 850 : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off)); 851 return; 852 } 853 if (off >= -256 && off <= 255) { 854 aa_emit32(mc, load ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, base, off) 855 : aa_stur_v(sz, native_loc_is_fp(reg), rt, base, off)); 856 return; 857 } 858 aa_emit_add_imm(a, AA_TMP1, base, off); 859 aa_emit32(mc, load ? 
aa_ldur_v(sz, native_loc_is_fp(reg), rt, AA_TMP1, 0) 860 : aa_stur_v(sz, native_loc_is_fp(reg), rt, AA_TMP1, 0)); 861 } 862 863 static NativeAllocClass aa_class_for_type(NativeTarget* t, KitCgTypeId type) { 864 if (type && cg_type_is_float(t->c, type) && cg_type_size(t->c, type) <= 8u) 865 return NATIVE_REG_FP; 866 return NATIVE_REG_INT; 867 } 868 869 static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr, 870 MemAccess mem) { 871 u32 sz; 872 (void)t; 873 if (!addr) return 0; 874 if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return 1; 875 if (addr->index_kind != NATIVE_ADDR_INDEX_REG) return 0; 876 if (addr->log2_scale == 0) return 1; 877 sz = size_idx(mem.size ? mem.size : 8u); 878 return addr->log2_scale == sz; 879 } 880 881 /* True if `mul Rd, Rn, #c` can be replaced by a single non-mul aarch64 882 * instruction using only Rn as a source (no extra scratch reg). Constants 883 * that match: 0, 1, -1, +/-2^k, 2^k+1, 1-2^k for k in [1..width-1]. The 884 * shift exponent must fit imm6 for the operand width (width = 32 if !sf 885 * else 64). The emit side is aa_emit_mul_const_imm. */ 886 static int aa64_imul_strength_reducible(u32 sf, i64 imm) { 887 u32 max_sh = sf ? 
63u : 31u; 888 u64 a; 889 if (imm == 0 || imm == 1 || imm == -1) return 1; 890 /* +2^k */ 891 a = (u64)imm; 892 if (imm > 0 && (a & (a - 1u)) == 0u) { 893 u32 k = (u32)__builtin_ctzll(a); 894 return k <= max_sh; 895 } 896 /* -2^k */ 897 if (imm < 0) { 898 a = (u64)(-imm); 899 if (a && (a & (a - 1u)) == 0u) { 900 u32 k = (u32)__builtin_ctzll(a); 901 return k >= 1u && k <= max_sh; 902 } 903 } 904 /* 2^k + 1 (k >= 1, so c >= 3) */ 905 if (imm >= 3) { 906 u64 m = (u64)(imm - 1); 907 if ((m & (m - 1u)) == 0u) { 908 u32 k = (u32)__builtin_ctzll(m); 909 return k >= 1u && k <= max_sh; 910 } 911 } 912 /* 1 - 2^k (k >= 1, so c <= -1) */ 913 if (imm <= -1) { 914 u64 m = (u64)(1 - imm); 915 if (m && (m & (m - 1u)) == 0u) { 916 u32 k = (u32)__builtin_ctzll(m); 917 return k >= 1u && k <= max_sh; 918 } 919 } 920 return 0; 921 } 922 923 /* Which constant operands the backend can fold directly into an instruction 924 * (so the optimizer can leave them as immediates instead of materializing a 925 * register). Currently: add/sub/cmp 12-bit immediates (optionally <<12), 926 * any value for a plain register move (movz/movk synthesizes it), and 927 * strength-reducible mul constants (handled in aa_binop via shift / shifted 928 * add or sub). */ 929 static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, 930 KitCgTypeId type, i64 imm) { 931 u32 imm12, sh; 932 switch (use) { 933 case NATIVE_IMM_BINOP: 934 if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB) 935 return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh); 936 if ((BinOp)op == BO_IMUL) { 937 u32 sf = type_size32(t, type) == 8u ? 1u : 0u; 938 return aa64_imul_strength_reducible(sf, imm); 939 } 940 /* LSL/LSR/ASR #imm via the UBFM/SBFM aliases: shift count in range. */ 941 if ((BinOp)op == BO_SHL || (BinOp)op == BO_SHR_S || 942 (BinOp)op == BO_SHR_U) { 943 u32 bits = type_size32(t, type) == 8u ? 
64u : 32u; 944 return imm >= 0 && (u64)imm < (u64)bits; 945 } 946 /* AND/ORR/EOR #bitmask: encodable as an AArch64 logical immediate. */ 947 if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) { 948 u32 sf = type_size32(t, type) == 8u ? 1u : 0u; 949 u32 N, immr, imms; 950 return aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms); 951 } 952 return 0; 953 case NATIVE_IMM_CMP: 954 /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a 955 * non-negative immediate. */ 956 return imm >= 0 && aa64_addsub_imm_fits(imm, &imm12, &sh); 957 case NATIVE_IMM_ADDR_OFFSET: 958 return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh); 959 case NATIVE_IMM_MOVE: 960 return 1; 961 } 962 return 0; 963 } 964 965 static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) { 966 if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return; 967 if (addr->index_kind != NATIVE_ADDR_INDEX_REG) 968 aa_panic(a, "unsupported address index"); 969 if (addr->log2_scale > 4u) aa_panic(a, "unsupported address scale"); 970 aa_emit32(a->base.mc, aa_add_lsl(rd, rd, addr->index.reg, addr->log2_scale)); 971 } 972 973 static void aa_materialize_frame_index(AANativeTarget* a, NativeAddr* addr, 974 u32 avoid_reg) { 975 NativeAddr load; 976 NativeLoc idx; 977 MemAccess mem; 978 u32 reg; 979 if (addr->index_kind != NATIVE_ADDR_INDEX_FRAME_VALUE) return; 980 reg = avoid_reg == AA_TMP1 ? AA_TMP0 : AA_TMP1; 981 memset(&load, 0, sizeof load); 982 load.base_kind = NATIVE_ADDR_BASE_FRAME; 983 load.base.frame = addr->index.frame; 984 load.base_type = 985 addr->index_type ? 
addr->index_type : builtin_id(KIT_CG_BUILTIN_I64); 986 memset(&idx, 0, sizeof idx); 987 idx.kind = NATIVE_LOC_REG; 988 idx.cls = NATIVE_REG_INT; 989 idx.type = load.base_type; 990 idx.v.reg = reg; 991 memset(&mem, 0, sizeof mem); 992 mem.type = load.base_type; 993 mem.size = 8; 994 mem.align = 8; 995 aa_emit_mem(a, 1, idx, load, mem); 996 addr->index_kind = NATIVE_ADDR_INDEX_REG; 997 addr->index.reg = reg; 998 } 999 1000 static NativeLoc native_loc_reg(KitCgTypeId type, NativeAllocClass cls, 1001 Reg reg); 1002 1003 static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off) { 1004 return aa64_ldst_uimm_pack((AA64LdStUimm){.size = 0, 1005 .V = 1, 1006 .opc = load ? 3u : 2u, 1007 .imm12 = byte_off >> 4, 1008 .Rn = rn, 1009 .Rt = rt}); 1010 } 1011 1012 static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) { 1013 return aa64_ldst_simm9_pack((AA64LdStSimm9){.size = 0, 1014 .V = 1, 1015 .opc = load ? 3u : 2u, 1016 .imm9 = (u32)byte_off & 0x1ffu, 1017 .Rn = rn, 1018 .Rt = rt}); 1019 } 1020 1021 static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg, 1022 NativeFrameSlot slot, u32 offset) { 1023 AANativeSlot* s = aa_slot(a, slot); 1024 i32 off = aa_fp_off_slot(a, s->off) + (i32)offset; 1025 MCEmitter* mc = a->base.mc; 1026 if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) { 1027 aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off)); 1028 return; 1029 } 1030 if (off >= -256 && off <= 255) { 1031 aa_emit32(mc, aa_ldst_q_simm9(load, qreg, AA_FP, off)); 1032 return; 1033 } 1034 aa_emit_add_imm(a, AA_TMP1, AA_FP, off); 1035 aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_TMP1, 0)); 1036 } 1037 1038 /* Reserve the variadic register-save-area frame slots (gp then fp). Split from 1039 * the store emission so the known-frame path can fix the full frame — including 1040 * these slots — before the prologue, then emit the stores after it. 
*/ 1041 static void aa_reserve_variadic_reg_saves(AANativeTarget* a) { 1042 NativeFrameSlotDesc sd; 1043 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 1044 ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi); 1045 if (vai.kind != ABI_VA_LIST_AAPCS64) return; 1046 memset(&sd, 0, sizeof sd); 1047 sd.type = i64; 1048 sd.size = vai.gp_reg_count * vai.gp_slot_size; 1049 sd.align = 8; 1050 sd.kind = NATIVE_FRAME_SLOT_SAVE; 1051 a->va_gr_slot = a->base.frame_slot(&a->base, &sd); 1052 sd.size = vai.fp_reg_count * vai.fp_slot_size; 1053 sd.align = 16; 1054 a->va_vr_slot = a->base.frame_slot(&a->base, &sd); 1055 } 1056 1057 /* Emit the stores into the variadic register-save area. For AAPCS64 these land 1058 * in the reserved gr/vr frame slots (aa_reserve_variadic_reg_saves); for the 1059 * Windows GP home area they land in [fp + AA_FRAME_SAVE_SIZE ..], the 1060 * top-of-frame block contiguous with the incoming stack args. */ 1061 static void aa_emit_variadic_reg_save_stores(AANativeTarget* a) { 1062 NativeAddr addr; 1063 MemAccess mem; 1064 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 1065 ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi); 1066 if (vai.kind == ABI_VA_LIST_POINTER && a->top_home_bytes) { 1067 /* Windows: home x0..x{gp_reg_count-1} so the plain-pointer va_list walks 1068 * register then stack varargs as one block. The named leading registers are 1069 * homed too (harmless): va_start skips past them. 
*/ 1070 memset(&mem, 0, sizeof mem); 1071 mem.type = i64; 1072 mem.size = 8; 1073 mem.align = 8; 1074 memset(&addr, 0, sizeof addr); 1075 addr.base_kind = NATIVE_ADDR_BASE_REG; 1076 addr.base.reg = AA_FP; 1077 addr.base_type = i64; 1078 for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) { 1079 NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r); 1080 addr.offset = aa_fp_off_home_slot(r); 1081 aa_emit_mem(a, 0, src, addr, mem); 1082 } 1083 return; 1084 } 1085 if (vai.kind != ABI_VA_LIST_AAPCS64) return; 1086 memset(&mem, 0, sizeof mem); 1087 mem.type = i64; 1088 mem.size = 8; 1089 mem.align = 8; 1090 memset(&addr, 0, sizeof addr); 1091 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1092 addr.base.frame = a->va_gr_slot; 1093 addr.base_type = i64; 1094 for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) { 1095 NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r); 1096 addr.offset = (i32)(r * vai.gp_slot_size); 1097 aa_emit_mem(a, 0, src, addr, mem); 1098 } 1099 for (u32 r = 0; r < vai.fp_reg_count && r < 8u; ++r) 1100 aa_emit_q_frame(a, 0, r, a->va_vr_slot, r * vai.fp_slot_size); 1101 } 1102 1103 static void aa_emit_entry_saves(AANativeTarget* a); 1104 1105 /* Per-function state reset + function-symbol / cfi / prologue-anchor setup 1106 * shared by both entry points (aa_func_begin for the single-pass path, 1107 * aa_func_begin_known_frame for the optimizer path). Emits no prologue. */ 1108 static void aa_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { 1109 AANativeTarget* a = aa_of(t); 1110 MCEmitter* mc = t->mc; 1111 a->func = fd; 1112 /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing, 1113 * callee-save set, and known_frame/has_alloca/frame_final. cum_off counts 1114 * frame-slot bytes below fp; the saved fp/lr pair (16 bytes at [fp, fp+8]) is 1115 * *not* part of it — aa_build_layout adds it in aa_func_end. 
*/ 1116 native_frame_reset(&a->frame); 1117 a->incoming_stack_size = 0; 1118 a->next_param_int = 0; 1119 a->next_param_fp = 0; 1120 /* 0-based byte cursor for incoming stack args (also reported as the 1121 * caller's incoming_stack_size for tail-call realizability). bind_param 1122 * forms its fp-relative address via aa_fp_off_in_arg(next_param_stack), 1123 * which adds the saved-pair offset. */ 1124 a->next_param_stack = 0; 1125 a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; 1126 a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE; 1127 a->va_gr_slot = NATIVE_FRAME_SLOT_NONE; 1128 a->va_vr_slot = NATIVE_FRAME_SLOT_NONE; 1129 a->npatches = 0; 1130 a->nalloca = 0; 1131 a->slim_prologue = 0; 1132 a->slim_small_frame = 0; 1133 a->fp_at_bottom = 0; 1134 a->frame_size_final = 0; 1135 /* Windows variadic functions reserve a GP register home area at the top of 1136 * the frame (just below the incoming stack args). The plain-pointer va_list 1137 * then walks register-passed then stack-passed varargs as one block. Other 1138 * ABIs leave gp_reg_count 0 here: Apple ARM64 routes all varargs to the 1139 * stack, AAPCS64 uses a struct va_list with separate reg-save pointers. */ 1140 { 1141 const ABIFuncInfo* fi = abi_cg_func_info(t->c->abi, fd->fn_type); 1142 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 1143 a->top_home_bytes = (fi && fi->variadic && vai.kind == ABI_VA_LIST_POINTER) 1144 ? 
vai.gp_reg_count * vai.gp_slot_size 1145 : 0u; 1146 } 1147 mc->set_section(mc, fd->text_section_id); 1148 mc->emit_align(mc, 4, 0); 1149 a->func_start = mc->pos(mc); 1150 mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); 1151 if (mc->cfi_startproc) mc->cfi_startproc(mc); 1152 a->prologue_pos = mc->pos(mc); 1153 a->minimal_prologue_words = 0; 1154 a->epilogue_label = mc->label_new(mc); 1155 } 1156 1157 /* Single-pass (NativeDirectTarget) entry point: the frame is not known up 1158 * front, so reserve a worst-case prologue region (patched in aa_func_end once 1159 * max_outgoing / callee-saves are final) and emit the entry saves now. */ 1160 static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { 1161 AANativeTarget* a = aa_of(t); 1162 MCEmitter* mc = t->mc; 1163 aa_func_begin_common(t, fd); 1164 for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu); 1165 aa_emit_entry_saves(a); 1166 } 1167 1168 /* Reserve the entry-save frame slots: the sret-pointer home (x8) and, for 1169 * variadic functions, the argument register-save area. Reserving is split from 1170 * emitting so the known-frame path can fix the full frame before the prologue; 1171 * the single-pass path runs both back to back via aa_emit_entry_saves. */ 1172 static void aa_reserve_entry_saves(AANativeTarget* a) { 1173 NativeTarget* t = &a->base; 1174 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 1175 if (abi && abi->has_sret) { 1176 NativeFrameSlotDesc sd; 1177 memset(&sd, 0, sizeof sd); 1178 sd.type = builtin_id(KIT_CG_BUILTIN_I64); 1179 sd.size = 8; 1180 sd.align = 8; 1181 sd.kind = NATIVE_FRAME_SLOT_SAVE; 1182 a->sret_ptr_slot = t->frame_slot(t, &sd); 1183 } 1184 if (abi && abi->variadic) aa_reserve_variadic_reg_saves(a); 1185 } 1186 1187 /* Emit the entry-save stores (x8 → sret slot, then the variadic reg-save area). 1188 * Slots must already be reserved (aa_reserve_entry_saves). 
*/ 1189 static void aa_emit_entry_save_stores(AANativeTarget* a) { 1190 NativeTarget* t = &a->base; 1191 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 1192 if (abi && abi->has_sret) { 1193 NativeAddr addr; 1194 NativeLoc src; 1195 MemAccess mem; 1196 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 1197 memset(&addr, 0, sizeof addr); 1198 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1199 addr.base.frame = a->sret_ptr_slot; 1200 addr.base_type = i64; 1201 memset(&src, 0, sizeof src); 1202 src.kind = NATIVE_LOC_REG; 1203 src.cls = NATIVE_REG_INT; 1204 src.type = i64; 1205 src.v.reg = 8u; 1206 memset(&mem, 0, sizeof mem); 1207 mem.type = i64; 1208 mem.size = 8; 1209 mem.align = 8; 1210 aa_emit_mem(a, 0, src, addr, mem); 1211 } 1212 if (abi && abi->variadic) aa_emit_variadic_reg_save_stores(a); 1213 } 1214 1215 /* Reserve + emit the entry saves back to back. Single-pass (NativeDirectTarget) 1216 * path, where the prologue region is a reserved worst-case block and slot 1217 * offsets need not be final before it. */ 1218 static void aa_emit_entry_saves(AANativeTarget* a) { 1219 aa_reserve_entry_saves(a); 1220 aa_emit_entry_save_stores(a); 1221 } 1222 1223 static void aa_note_frame_state(NativeTarget* t, 1224 const NativeFramePatchState* state) { 1225 AANativeTarget* a = aa_of(t); 1226 if (state && state->max_outgoing > a->frame.max_outgoing) 1227 a->frame.max_outgoing = state->max_outgoing; 1228 } 1229 1230 /* Reserve a save slot for each callee-saved register the allocator used. Runs 1231 * before frame-slot mapping so these slots get the lowest offsets, keeping the 1232 * prologue stores within stur's signed-9-bit range. The prologue/epilogue 1233 * save/restore is emitted from this list in aa_patch_prologue / aa_func_end. 
*/ 1234 static void aa_reserve_callee_saves(NativeTarget* t, const u32* used, 1235 u32 nclasses) { 1236 AANativeTarget* a = aa_of(t); 1237 /* aa64 homes each callee-save in its own 8-byte frame slot (reserved before 1238 * the body slots so they sit nearest fp, in stur range), so alloc_slots=1. 1239 * Adjacent integer slots are later paired into stp/ldp. */ 1240 NativeFrameSaveSpec spec[NATIVE_REG_VEC + 1]; 1241 memset(spec, 0, sizeof spec); 1242 spec[NATIVE_REG_INT].size = 8; 1243 spec[NATIVE_REG_INT].align = 8; 1244 spec[NATIVE_REG_INT].type = builtin_id(KIT_CG_BUILTIN_I64); 1245 spec[NATIVE_REG_FP].size = 8; 1246 spec[NATIVE_REG_FP].align = 8; 1247 spec[NATIVE_REG_FP].type = builtin_id(KIT_CG_BUILTIN_F64); 1248 native_frame_set_callee_saves(&a->frame, used, nclasses, spec, 1249 NATIVE_REG_VEC + 1, 1); 1250 } 1251 1252 static MemAccess aa_mem_for_type(NativeTarget* t, KitCgTypeId type, u32 size); 1253 static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words, 1254 u32 cap, u32* n); 1255 1256 static void aa_emit_callee_restores(AANativeTarget* a) { 1257 u32 words[AA_PROLOGUE_WORDS]; 1258 u32 n = 0; 1259 aa_words_callee_saves(a, 0, words, AA_PROLOGUE_WORDS, &n); 1260 for (u32 i = 0; i < n; ++i) aa_emit32(a->base.mc, words[i]); 1261 } 1262 1263 static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n, 1264 u32 rd, i64 imm) { 1265 u32 tmp[4]; 1266 u32 m = aa_load_imm_words(tmp, 4u, 1, rd, imm); 1267 if (!m || *n + m > cap) aa_panic(a, "instruction patch too small"); 1268 for (u32 i = 0; i < m; ++i) words[(*n)++] = tmp[i]; 1269 } 1270 1271 /* Windows large-frame stack probe. 
kit's prologue reserves the whole frame in 1272 * one `sub sp, sp, #N`, but Windows grows a thread stack one guard page at a 1273 * time: a sub that jumps SP more than a page past the guard page leaves the 1274 * skipped pages uncommitted, and the first store into them faults (and, since 1275 * SP itself is then in uncommitted memory, the fault can't even be delivered). 1276 * Touch every page the frame spans, top-down, so each guard page commits in 1277 * turn before the sub. Inlined (no external __chkstk symbol / no reloc in the 1278 * patched prologue region); mirrors the linker's aa64_coff_chkstk body. Only 1279 * x16/x17 are clobbered — the following sub-sp / saved-pair material re-derives 1280 * both. Emitted only when frame_size > interval (one page). */ 1281 static void aa_words_stack_probe(AANativeTarget* a, u32* words, u32 cap, u32* n, 1282 u32 frame_size, u32 interval) { 1283 u32 imm12, sh; 1284 if (!aa64_addsub_imm_fits(interval, &imm12, &sh)) 1285 aa_panic(a, "stack-probe interval not an addsub immediate"); 1286 /* x16 = frame_size ; x17 = sp */ 1287 aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size); 1288 if (*n + 5u > cap) aa_panic(a, "instruction patch too small"); 1289 words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); /* mov x17, sp */ 1290 /* loop: x17 -= page ; x16 -= page (sets flags) ; touch [x17] ; b.gt loop */ 1291 words[(*n)++] = aa64_sub_imm(1, AA_TMP1, AA_TMP1, imm12, sh); 1292 words[(*n)++] = aa64_subs_imm12(1, AA_TMP0, AA_TMP0, imm12, sh); 1293 words[(*n)++] = aa64_ldr64_uimm12(31, AA_TMP1, 0); /* ldr xzr, [x17] */ 1294 /* branch back to the `sub x17` three words above while x16 stays positive */ 1295 words[(*n)++] = 1296 aa64_brcond_pack((AA64BrCond){.imm19 = (u32)(-3), .cond = 0xcu /* GT */}); 1297 } 1298 1299 static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap, 1300 u32* n, u32 frame_size) { 1301 u32 imm12, sh; 1302 if (aa64_addsub_imm_fits(frame_size, &imm12, &sh)) { 1303 if (*n >= cap) aa_panic(a, 
"instruction patch too small"); 1304 words[(*n)++] = aa64_sub_imm(1, AA_SP, AA_SP, imm12, sh); 1305 return; 1306 } 1307 aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size); 1308 if (*n + 3u > cap) aa_panic(a, "instruction patch too small"); 1309 words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); 1310 words[(*n)++] = aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0); 1311 words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0); 1312 } 1313 1314 /* Anchor fp at the AAPCS64 saved-pair address (= sp + saved-pair offset). 1315 * The slim_prologue path achieves the same anchor in a single insn via 1316 * `add x29, sp, #0` after the pre-decrement stp moves sp to the saved-pair. */ 1317 static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap, 1318 u32* n, const AAFrameLayout* L) { 1319 u32 imm12, sh; 1320 u32 anchor = aa_sp_off_saved_pair(L); 1321 if (aa64_addsub_imm_fits(anchor, &imm12, &sh)) { 1322 if (*n >= cap) aa_panic(a, "instruction patch too small"); 1323 words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, imm12, sh); 1324 return; 1325 } 1326 aa_words_load_imm(a, words, cap, n, AA_TMP0, anchor); 1327 if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); 1328 words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); 1329 words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0); 1330 } 1331 1332 /* x17 = address of the saved-pair slot (= sp + saved-pair offset). Used by 1333 * the fat prologue to materialize the stp destination when the offset 1334 * doesn't fit stp's signed-7-bit-scaled immediate. 
*/ 1335 static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap, 1336 u32* n, const AAFrameLayout* L) { 1337 u32 save_off = aa_sp_off_saved_pair(L); 1338 u32 imm12, sh; 1339 if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) { 1340 if (*n >= cap) aa_panic(a, "instruction patch too small"); 1341 words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, imm12, sh); 1342 return; 1343 } 1344 aa_words_load_imm(a, words, cap, n, AA_TMP0, save_off); 1345 if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); 1346 words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); 1347 words[(*n)++] = aa64_add(1, AA_TMP1, AA_TMP1, AA_TMP0); 1348 } 1349 1350 static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap, 1351 u32* n, const AAFrameLayout* L) { 1352 if (!L->frame_size) return; 1353 if (a->slim_prologue) { 1354 if (*n + 1u > cap) aa_panic(a, "instruction patch too small"); 1355 /* `ldp x29, x30, [sp], #16` — pop saved pair, restore sp. */ 1356 words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2); 1357 return; 1358 } 1359 if (a->fp_at_bottom) { 1360 /* Bottom-record fold: `ldp x29,x30,[sp],#N` reloads the pair from the 1361 * bottom AND releases the whole frame in one insn. Callee-saves were 1362 * already restored by aa_emit_callee_restores. -1 insn vs slim_small_frame 1363 * (which needs a separate `add sp`). N <= 504 holds the post-index imm. */ 1364 if (*n + 1u > cap) aa_panic(a, "instruction patch too small"); 1365 words[(*n)++] = 1366 aa64_ldp64_post(AA_FP, AA_LR, AA_SP, (i32)(L->frame_size / 8u)); 1367 return; 1368 } 1369 if (a->slim_small_frame) { 1370 /* `ldp x29,x30,[sp,#saved_pair] ; add sp,sp,#frame_size` — load through 1371 * sp avoids the fat path's `add x10, fp, #0` scratch, and the subsequent 1372 * `add sp` unwinds without depending on the (now-clobbered) old fp. 
*/ 1373 u32 save_off = aa_sp_off_saved_pair(L); 1374 u32 imm12, sh; 1375 if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); 1376 words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)); 1377 if (!aa64_addsub_imm_fits(L->frame_size, &imm12, &sh)) 1378 aa_panic(a, "slim_small_frame: frame_size out of addsub imm range"); 1379 words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh); 1380 return; 1381 } 1382 if (*n + 3u > cap) aa_panic(a, "instruction patch too small"); 1383 /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then restore 1384 * sp to fp + CFA-offset (= caller's original sp = CFA). The CFA offset is 1385 * AA_FRAME_SAVE_SIZE normally, plus the Windows-variadic GP home area when 1386 * present. */ 1387 words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0); 1388 words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0); 1389 words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, (u32)aa_cfa_off(a), 0); 1390 } 1391 1392 /* Emit callee-save store (save=1) or restore (save=0) words into `words`, 1393 * pairing adjacent integer registers into a single stp/ldp. 1394 * reserve_callee_saves allocates consecutive 8-byte slots in order, so 1395 * callee_saves[i] sits 8 bytes above callee_saves[i+1]; for an int pair the 1396 * lower-addressed reg[i+1] is the stp's Rt and reg[i] is Rt2. FP registers (and 1397 * an unpaired trailing int) use the single-register stur/ldur form. 
 */
static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
                                  u32 cap, u32* n) {
  /* `i` advances by 2 when a pair is emitted and by 1 otherwise, so the loop
   * header carries no increment. */
  for (u32 i = 0; i < a->frame.ncallee_saves;) {
    const AACalleeSave* cs = &a->frame.callee_saves[i];
    i32 off = aa_fp_off_slot(a, aa_slot(a, cs->slot)->off);
    if (i + 1u < a->frame.ncallee_saves && cs->cls == (u8)NATIVE_REG_INT &&
        a->frame.callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) {
      const AACalleeSave* cs2 = &a->frame.callee_saves[i + 1u];
      i32 off2 = aa_fp_off_slot(a, aa_slot(a, cs2->slot)->off);
      /* cs2 is reserved after cs (larger slot.off), so it is the lower address
       * in both layouts (off2 = off - 8): stp's Rt = cs2, Rt2 = cs, base off2.
       * stp/ldp's signed-7-bit immediate is scaled by 8, giving a byte range
       * of [-512, 504]; the immediate passed below is off2 / 8. */
      if (off2 < -512 || off2 > 504)
        aa_panic(a, "callee-save pair offset out of prologue range");
      if (*n >= cap) aa_panic(a, "prologue too large");
      words[(*n)++] = save
                          ? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8)
                          : aa64_ldp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8);
      i += 2u;
    } else {
      /* v selects the FP/SIMD register file for the single-register form. */
      u32 v = cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u;
      if (*n >= cap) aa_panic(a, "prologue too large");
      if (a->fp_at_bottom) {
        /* Positive, 8-aligned offset above the record (up to frame_size-8 ≤
         * 496): the unscaled stur (±256) can't reach it, so use the scaled
         * unsigned-imm str/ldr. 0x7ff8 is the largest offset the check
         * below accepts. */
        if (off < 0 || (u32)off > 0x7ff8u)
          aa_panic(a, "callee-save offset out of prologue range");
        words[(*n)++] = save ? aa_str_uimm_v(3, v, cs->reg, AA_FP, (u32)off)
                             : aa_ldr_uimm_v(3, v, cs->reg, AA_FP, (u32)off);
      } else {
        /* Top-record layouts: slot offsets must fit stur/ldur's signed
         * 9-bit unscaled displacement. */
        if (off < -256 || off > 255)
          aa_panic(a, "callee-save offset out of prologue range");
        words[(*n)++] = save ? aa_stur_v(3, v, cs->reg, AA_FP, off)
                             : aa_ldur_v(3, v, cs->reg, AA_FP, off);
      }
      i += 1u;
    }
  }
}

/* Build the prologue instruction words for `L` into `words` (capacity `cap`),
 * returning the count. Shared by the NativeDirectTarget patch path (reserves
 * a fixed worst-case region, then patches it here) and the optimizer path
 * (aa_func_begin_known_frame emits exactly these words up front).
 *
 * All variants establish a post-prologue state defined by L: saved x29/x30 at
 * [fp]/[fp+8], callee-saves at aa_fp_off_slot of each. The top-record variants
 * leave fp = sp + aa_sp_off_saved_pair(L) (saved-pair near the top); the
 * bottom-record variant leaves fp = sp (saved-pair at the bottom).
 *
 * A zero-size frame yields no words. The variant is chosen from the flags on
 * `a`, checked in this order: slim_prologue, fp_at_bottom, then the
 * `sub sp` forms (slim_small_frame or fat). */
static u32 aa_build_prologue_words(AANativeTarget* a, const AAFrameLayout* L,
                                   u32* words, u32 cap) {
  u32 n = 0;
  if (!L->frame_size) return 0;
  if (a->slim_prologue) {
    if (cap < 2u) aa_panic(a, "prologue too large");
    /* `stp x29, x30, [sp, #-16]!; add x29, sp, #0` — the pre-decrement stp
     * moves sp down to the saved-pair address, so a no-op add anchors fp
     * there directly. AAPCS64 frame record. */
    words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2);
    words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
    return n;
  }
  if (a->fp_at_bottom) {
    /* Bottom-record fold: `stp x29,x30,[sp,#-N]!` decrements sp by the whole
     * frame AND saves the pair at the new bottom in one insn; `mov x29,sp`
     * (add #0) anchors fp there. Callee-saves then stack above the record at
     * positive offsets. -2 insns/call vs the top-record slim_small_frame. */
    if (n + 2u > cap) aa_panic(a, "prologue too large");
    words[n++] =
        aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -(i32)(L->frame_size / 8u));
    words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
    aa_words_callee_saves(a, 1, words, cap, &n);
    return n;
  }
  /* On targets that don't auto-grow the stack (Windows), probe each page the
   * frame spans before the single large `sub sp` jumps past the guard page.
   * slim_prologue/fp_at_bottom returned above — their frames are bounded to
   * one page (≤16 / ≤504 bytes), so only this path can exceed `interval`. */
  {
    u32 interval = abi_stack_probe_interval(a->base.c->abi);
    if (interval && L->frame_size > interval)
      aa_words_stack_probe(a, words, cap, &n, L->frame_size, interval);
  }
  aa_words_sub_sp_frame(a, words, cap, &n, L->frame_size);
  if (a->slim_small_frame) {
    /* `stp x29, x30, [sp, #saved_pair_off]` — skip the `add x17, sp, #...`
     * scratch the fat path needs. Valid when the offset fits stp's
     * signed-7-bit scaled immediate (saved_pair_off <= 504). */
    u32 save_off = aa_sp_off_saved_pair(L);
    if (n >= cap) aa_panic(a, "prologue too large");
    words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
  } else {
    aa_words_saved_pair_addr(a, words, cap, &n, L);
    if (n >= cap) aa_panic(a, "prologue too large");
    words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
  }
  aa_words_frame_ptr_from_sp(a, words, cap, &n, L);
  /* Save callee-saved registers the allocator used (fp-relative; their slots
   * were reserved first by aa_reserve_callee_saves so offsets fit stur). */
  aa_words_callee_saves(a, 1, words, cap, &n);
  return n;
}

/* Patch the reserved prologue region (`region` words at prologue_pos) with the
 * real prologue for `L`.
Used by the NativeDirectTarget single-pass path, 1504 * which reserves AA_PROLOGUE_WORDS up front before the frame is known. The 1505 * optimizer path reserves exactly the words it needs, so `region` equals 1506 * the real prologue length and no tail remains. */ 1507 static void aa_patch_prologue(AANativeTarget* a, const AAFrameLayout* L, 1508 u32 region) { 1509 u32 words[AA_PROLOGUE_WORDS]; 1510 u32 n; 1511 ObjSecId sec = a->func->text_section_id; 1512 if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large"); 1513 memset(words, 0, sizeof words); 1514 n = aa_build_prologue_words(a, L, words, region); 1515 /* If the real prologue is shorter than the reserved region (the worst-case 1516 * NDT reservation), branch straight to the body rather than leaving the 1517 * trailing slots as NOPs that fall through and execute on every call. */ 1518 if (n < region) { 1519 words[n] = aa64_b(region - n); 1520 for (u32 i = n + 1u; i < region; ++i) words[i] = 0xd503201fu; 1521 } 1522 for (u32 i = 0; i < region; ++i) 1523 aa_patch32(a->base.obj, sec, a->prologue_pos + i * 4u, words[i]); 1524 } 1525 1526 static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) { 1527 MCEmitter* mc = a->base.mc; 1528 u32 words[AA_PROLOGUE_WORDS]; 1529 u32 n = 0; 1530 if (!L->frame_size) return; 1531 aa_words_restore_frame(a, words, AA_PROLOGUE_WORDS, &n, L); 1532 for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]); 1533 } 1534 1535 /* Reserve one entry in the deferred-patch list, growing (arena-doubling) as 1536 * needed. The returned pointer is stable until the next aa_patch_alloc. */ 1537 static AAPatch* aa_patch_alloc(AANativeTarget* a) { 1538 if (a->npatches == a->patches_cap) { 1539 u32 cap = a->patches_cap ? 
a->patches_cap * 2u : 8u;
    AAPatch* nb = arena_zarray(a->base.c->tu, AAPatch, cap);
    if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
    a->patches = nb;
    a->patches_cap = cap;
  }
  return &a->patches[a->npatches++];
}

/* Append FP-relative loads that restore the saved callee registers (stp/ldp
 * paired, same as the prologue saves). Shared by the tail-call patch; the
 * function epilogue uses aa_emit_callee_restores.
 *
 * Thin wrapper: forwards to aa_words_callee_saves with its second argument
 * set to 0, appending into `words` (capacity `cap`) and advancing `*n`. */
static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
                                     u32* n) {
  aa_words_callee_saves(a, 0, words, cap, n);
}

/* Drain the deferred-patch list. Each entry targets a disjoint, fixed code
 * position, so insertion order does not affect output.
 *
 * Two patch kinds are handled:
 *  - AA_PATCH_ALLOCA: rewrites the single word at p->pos to
 *    `add dst, sp, #max_outgoing`, now that the outgoing-argument area size
 *    is final. Panics if that size is not an add/sub immediate.
 *  - anything else (AA_PATCH_TAIL): rewrites a fixed AA_TAIL_WORDS-word region
 *    with callee restores + frame teardown followed by the branch to the
 *    callee, padded with NOPs.
 *
 * `L` is the final frame layout, passed through to aa_words_restore_frame. */
static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) {
  ObjSecId sec = a->func->text_section_id;
  for (u32 i = 0; i < a->npatches; ++i) {
    AAPatch* p = &a->patches[i];
    if (p->kind == AA_PATCH_ALLOCA) {
      u32 imm12, sh;
      if (!aa64_addsub_imm_fits(a->frame.max_outgoing, &imm12, &sh))
        aa_panic(a, "outgoing area too large for alloca result");
      aa_patch32(a->base.obj, sec, p->pos,
                 aa64_add_imm(1, p->u.dst_reg, AA_SP, imm12, sh));
    } else { /* AA_PATCH_TAIL */
      NativeLoc callee = p->u.callee;
      u32 words[AA_TAIL_WORDS];
      u32 n = 0;
      memset(words, 0, sizeof words);
      aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
      aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
      /* At least one word must remain for the branch itself. */
      if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
      if (callee.kind == NATIVE_LOC_REG) {
        words[n++] = aa64_br(loc_reg(callee));
      } else if (callee.kind == NATIVE_LOC_GLOBAL) {
        /* Pad with NOPs (0xd503201f) *before* the branch so `b` always lands
         * in the last word of the region.
         * NOTE(review): presumably the relocation for the direct tail call
         * was recorded against that fixed last word when the region was
         * reserved — confirm against the tail-call emitter. */
        while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
        words[n++] = aa64_b(0);
      } else {
        aa_panic(a, "unsupported tail target");
      }
      /* Register-indirect case: fill the rest of the region with NOPs. */
      while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
      for (u32 w = 0; w < AA_TAIL_WORDS; ++w)
        aa_patch32(a->base.obj, sec, p->pos + w * 4u, words[w]);
    }
  }
}

/* Finish the current function: place the shared epilogue label, emit the
 * epilogue (callee restores, frame teardown, `ret`), resolve or verify
 * deferred patches, emit CFI for the frame record when the emitter provides
 * the CFI hooks, define the function symbol (and atom, when requested), report
 * the PC range to the debug producer, and clear a->func. */
static void aa_func_end(NativeTarget* t) {
  AANativeTarget* a = aa_of(t);
  MCEmitter* mc = t->mc;
  AAFrameLayout L =
      aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
  /* known_frame (optimizer): prologue, allocas, and tail epilogues were emitted
   * final and slim eligibility was settled in aa_func_begin_known_frame — there
   * is nothing to patch. Single-pass (NDT): a worst-case prologue region was
   * reserved and the deferred patches recorded; resolve them now that the frame
   * is final. The NDT path always uses the fat prologue/epilogue (slim_* left 0
   * by aa_func_begin_common, since its reserved region is much larger). */
  u32 prologue_region =
      a->frame.known_frame ? a->minimal_prologue_words : AA_PROLOGUE_WORDS;
  mc->label_place(mc, a->epilogue_label);
  aa_emit_callee_restores(a);
  aa_emit_restore_frame(a, &L);
  aa_emit32(mc, aa64_ret(AA_LR));
  if (a->frame.known_frame) {
    /* The frame-planning pre-pass plus final prologue/alloca/tail emission must
     * leave nothing deferred; a stray patch would mean a body-time frame change
     * the final prologue never saw. */
    if (a->npatches != 0) aa_panic(a, "known-frame path left deferred patches");
  } else {
    aa_patch_prologue(a, &L, prologue_region);
    aa_apply_patches(a, &L);
  }
  /* CFI is emitted only when all three hooks used below are present. */
  if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
    i32 cfa = aa_cfa_off(a);
    /* The CFI below is attached at the byte offset just past the prologue
     * region (prologue_region words of 4 bytes each). */
    mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
    /* CFA = caller's sp, an fp-relative offset that depends on the layout:
     * fp+16 (top-record) or fp+frame_size (bottom-record). saved fp/lr live at
     * [fp]/[fp+8] in both, hence at CFA-cfa / CFA-cfa+8. */
    mc->cfi_def_cfa(mc, AA_FP, cfa);
    mc->cfi_offset(mc, AA_FP, aa_fp_off_saved_fp() - cfa);
    mc->cfi_offset(mc, AA_LR, aa_fp_off_saved_lr() - cfa);
  }
  /* Symbol (and atom) cover [func_start, current position). */
  obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id,
                    a->func_start, mc->pos(mc) - a->func_start);
  if (a->func->atomize) {
    obj_atom_define(t->obj, a->func->text_section_id, a->func_start,
                    mc->pos(mc) - a->func_start, a->func->sym, 0);
  }
  /* Hand the function's PC range to the Debug producer so its line program
   * (and DW_AT_low_pc/high_pc) cover this function — emit_section_line skips
   * functions without a recorded range. */
  if (mc->debug)
    debug_func_pc_range(mc->debug, a->func->text_section_id, a->func_start,
                        mc->pos(mc));
  if (mc->cfi_endproc) mc->cfi_endproc(mc);
  mc_end_function(mc);
  a->func = NULL;
}

/* NativeTarget hook: allocate a frame slot described by `d` in this
 * function's frame by delegating to native_frame_slot_alloc. */
static NativeFrameSlot aa_frame_slot(NativeTarget* t,
                                     const NativeFrameSlotDesc* d) {
  return native_frame_slot_alloc(&aa_of(t)->frame, d);
}

/* Describe where frame slot `slot` lives for the debugger.
 *
 * On success fills `*out` with a CG_DEBUG_LOC_FRAME location carrying the
 * slot's FP-relative offset and returns 1. Returns 0 when `out` is NULL, or
 * (after zeroing `*out`) when `slot` is NATIVE_FRAME_SLOT_NONE or greater
 * than the number of allocated slots. */
static int aa_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
                                   CGDebugLoc* out) {
  AANativeTarget* a = aa_of(t);
  AANativeSlot* s;
  i32 fp_off;
  if (!out) return 0;
  memset(out, 0, sizeof *out);
  if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
  s = aa_slot(a, slot);
  fp_off = aa_fp_off_slot(a, s->off);
  out->kind = CG_DEBUG_LOC_FRAME;
  /* The hosted dbg stop snapshot currently carries x29/fp as the frame base
   * for variable materialization, so report the same FP-relative slot offset
   * used by native memory operands. */
  out->v.frame_ofs = fp_off;
  return 1;
}

/* Forward declaration; the definition is not in this part of the file. */
static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
                                 u32 nclob, u32* int_mask, u32* fp_mask);
/* abi_clobber_masks is shared as native_asm_abi_clobber_masks
 * (cg/native_asm.h); it reads the masks from t->regs->classes.
*/

/* Compute the per-class callee-saved register masks the prologue has to
 * preserve on the known-frame path.
 *
 * The result starts from the allocator's own report
 * (frame->callee_saved_used, or all-zero when that array is absent) and is
 * then widened by whatever inline asm may clobber. Inline-asm clobbers are
 * invisible to the optimizer's operand scan, so they arrive here in two raw
 * forms — clobber names (frame->asm_clobbers) and arch-neutral clobber-ABI
 * sets (frame->asm_clobber_abi_sets) — which are resolved to register masks
 * and then restricted to the AAPCS64 callee-saved ranges: x19..x28 for the
 * integer class and v8..v15 (low 64 bits) for the FP class. x29/x30 are the
 * frame pointer and link register; the prologue head saves them, so they are
 * never reported as ordinary callee-saves.
 *
 * Writes min(frame->ncallee_classes, cap) entries to `out` and returns that
 * count. */
static u32 aa_known_callee_saves(NativeTarget* t,
                                 const NativeKnownFrameDesc* frame, u32* out,
                                 u32 cap) {
  /* Bit i set <=> register number i is callee-saved under AAPCS64. */
  const u32 int_callee_saved = 0x1ff80000u; /* x19..x28 */
  const u32 fp_callee_saved = 0x0000ff00u;  /* v8..v15 */
  u32 nclasses = frame->ncallee_classes < cap ? frame->ncallee_classes : cap;
  u32 asm_int = 0;
  u32 asm_fp = 0;
  u32 abi_int;
  u32 abi_fp;
  u32 cls;

  /* Seed every reported class with the allocator-assigned registers. */
  for (cls = 0; cls < nclasses; ++cls) {
    if (frame->callee_saved_used)
      out[cls] = frame->callee_saved_used[cls];
    else
      out[cls] = 0u;
  }

  /* Named inline-asm clobbers, resolved against the function's source
   * location when one is available. */
  if (frame->asm_clobbers && frame->nasm_clobbers) {
    AANativeTarget* a = aa_of(t);
    SrcLoc loc = (SrcLoc){0, 0, 0};
    if (a->func) loc = a->func->loc;
    aa_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
                         &asm_int, &asm_fp);
  }

  /* Clobber-ABI sets are always folded in. */
  native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
                               &abi_fp);
  asm_int |= abi_int;
  asm_fp |= abi_fp;

  /* Keep only the callee-saved part of each clobber mask. */
  if (NATIVE_REG_INT < nclasses)
    out[NATIVE_REG_INT] |= asm_int & int_callee_saved;
  if (NATIVE_REG_FP < nclasses)
    out[NATIVE_REG_FP] |= asm_fp & fp_callee_saved;
  return nclasses;
}

/* Optimizer entry point: the full frame is supplied up front, so the prologue,
 * entry saves, slim-form eligibility, allocas, and tail epilogues are all final
 * the moment they are emitted — no back-patching (aa_func_end skips the patch
 * passes when a->frame.known_frame). Slot creation order matches the
 * single-pass path (callee-saves first for stur range, then the static slots,
 * then sret/variadic entry saves) so offsets are identical to what the patch
 * path would produce.
 */
static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
                                      const NativeKnownFrameDesc* frame,
                                      NativeFrameSlot* out_slots) {
  AANativeTarget* a = aa_of(t);
  AAFrameLayout L;
  u32 words[AA_PROLOGUE_WORDS];
  u32 n;
  aa_func_begin_common(t, fd);
  a->frame.known_frame = 1;
  /* `frame` may be NULL; then only the state set up by aa_func_begin_common
   * feeds the layout below. */
  if (frame) {
    u32 cs[NATIVE_CALL_PLAN_CLASSES];
    u32 ncs = aa_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
    a->frame.has_alloca = frame->has_alloca;
    if (ncs) aa_reserve_callee_saves(t, cs, ncs);
    /* Static slots, in caller order; the allocated handles are reported back
     * through `out_slots` when the caller asked for them. */
    for (u32 i = 0; i < frame->nslots; ++i) {
      NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]);
      if (out_slots) out_slots[i] = slot;
    }
    aa_reserve_entry_saves(a);
    /* Reserve the atomic-RMW scratch spill last (matching its lazy position in
     * the single-pass path), so aa_saved_tmp_spill reuses it instead of growing
     * the frame mid-body. */
    if (frame->needs_scratch_spill) {
      NativeFrameSlotDesc sd;
      memset(&sd, 0, sizeof sd);
      sd.type = builtin_id(KIT_CG_BUILTIN_I64);
      sd.size = 8;
      sd.align = 8;
      sd.kind = NATIVE_FRAME_SLOT_SPILL;
      a->saved_tmp_slot = a->base.frame_slot(&a->base, &sd);
    }
    /* Only ever grow the outgoing-argument area. */
    if (frame->max_outgoing > a->frame.max_outgoing)
      a->frame.max_outgoing = frame->max_outgoing;
  }
  /* Frame is final: slot_bytes (cum_off) and out_stack (max_outgoing) are both
   * known, so the prologue immediates and slim-form choice are settled here.
   * frame_size_final must be set before aa_build_prologue_words / entry saves,
   * since the bottom-record offset helpers read it. */
  L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing,
                      a->top_home_bytes);
  a->frame_size_final = L.frame_size;
  /* Slim Tier A: no callee-saves, no alloca, no body slots, no outgoing stack
   * args — the whole frame is the 16-byte record. fp_at_bottom: a small frame
   * with callee-saves/locals and no outgoing stack args; the record moves to
   * the bottom (fp = sp) so sp adjustment folds into the pre/post-indexed
   * stp/ldp (frame_size <= 504 keeps the post-index ldp imm in range).
   * Otherwise slim_small_frame keeps the top-record layout but skips the
   * x17/x10 scratch (out_stack>0 small frames land here). A Windows-variadic
   * home area forces the fat top-record layout: it lives above the saved pair,
   * which neither the slim forms (saved pair at the very top) nor the
   * bottom-record (saved pair at the very bottom) leave room for. (See
   * aa_func_end for the single-pass path, which never takes any slim form.) */
  a->slim_prologue = a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
                     L.slot_bytes == 0 && L.out_stack == 0 && !a->top_home_bytes;
  a->fp_at_bottom = !a->slim_prologue && !a->frame.has_alloca &&
                    L.out_stack == 0 && L.frame_size <= 504u &&
                    !a->top_home_bytes;
  a->slim_small_frame = !a->slim_prologue && !a->fp_at_bottom &&
                        !a->frame.has_alloca && !a->top_home_bytes &&
                        aa_sp_off_saved_pair(&L) <= 504u;
  /* Emit the final prologue directly; its length is remembered so aa_func_end
   * can position the CFI just past it. */
  n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
  for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
  a->minimal_prologue_words = n;
  a->frame.frame_final = 1;
  aa_emit_entry_save_stores(a);
}

/* NativeTarget hook: store register `src` into frame slot `slot`.
 * Builds a frame-based address typed like `src` and emits the access through
 * aa_emit_mem in store mode (second argument 0). */
static void aa_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
                     MemAccess mem) {
  NativeAddr addr;
  memset(&addr, 0, sizeof addr);
  addr.base_kind = NATIVE_ADDR_BASE_FRAME;
  addr.base.frame = slot;
  addr.base_type = src.type;
  aa_emit_mem(aa_of(t), 0, src, addr, mem);
}

/* NativeTarget hook: load frame slot `slot` into register `dst`.
 * Mirror image of aa_spill: same address construction, aa_emit_mem in load
 * mode (second argument 1). */
static void aa_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
                      MemAccess mem) {
  NativeAddr addr;
  memset(&addr, 0, sizeof addr);
  addr.base_kind = NATIVE_ADDR_BASE_FRAME;
  addr.base.frame = slot;
  addr.base_type = dst.type;
  aa_emit_mem(aa_of(t), 1, dst, addr, mem);
}

/* NativeTarget hook: create a fresh label via the machine-code emitter. */
static MCLabel aa_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }

/* NativeTarget hook: bind `label` to the current emission position. */
static void aa_label_place(NativeTarget* t, MCLabel label) {
  t->mc->label_place(t->mc, label);
}

/* Unconditional jump to `label`: emits `b` with a zero displacement and
 * records an R_AARCH64_JUMP26 label reference for it.
 * NOTE(review): the trailing (4, 0) arguments of emit_label_ref are not
 * defined in this part of the file — presumably the fixup width and addend;
 * confirm against the MCEmitter interface. */
static void aa_jump(NativeTarget* t, MCLabel label) {
  aa_emit32(t->mc, aa64_b(0));
  t->mc->emit_label_ref(t->mc, label, R_AARCH64_JUMP26, 4, 0);
}

/* Compare `lhs` with `rhs` and branch to `label` when `op` holds.
 * Sets the flags with aa_emit_cmp_to_flags, then emits one conditional branch
 * (condition from cmp_cond), or two for the FP predicates that need a
 * disjunction of conditions. Each branch carries an R_AARCH64_CONDBR19 label
 * reference. */
static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs,
                          NativeLoc rhs, MCLabel label) {
  aa_emit_cmp_to_flags(t, lhs, rhs);
  /* CMP_ONE_F / CMP_UEQ_F have no single FP condition: take the branch from a
   * pair of conditional branches to the same label (no scratch register). */
  if (op == CMP_ONE_F) {
    /* ordered & !=: branch if a<b (MI) or a>b (GT). */
    aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x4u})); /* MI */
    t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
    aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0xcu})); /* GT */
    t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
    return;
  }
  if (op == CMP_UEQ_F) {
    /* unordered | ==: branch if a==b (EQ) or unordered (VS). */
    aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x0u})); /* EQ */
    t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
    aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = 0x6u})); /* VS */
    t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
    return;
  }
  aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)}));
  t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
}

/* Indirect branch through the register holding `addr` (`br xN`).
 * The list of valid targets is accepted for interface compatibility but is
 * not used by this backend. */
static void aa_indirect_branch(NativeTarget* t, NativeLoc addr,
                               const MCLabel* valid_targets, u32 ntargets) {
  (void)valid_targets;
  (void)ntargets;
  aa_emit32(t->mc, aa64_br(loc_reg(addr)));
}

/* Materialize the address of `target` into register `dst`. */
static void aa_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel target) {
  /* `&&label` address-take: adrp/add with the ADR_PREL_PG_HI21 +
   * ADD_ABS_LO12_NC relocation pair against the label's per-block local symbol
   * — the same form used to address a global — so the reference is genuinely
   * relocatable (reaches ±4 GiB) and any assembler resolves it from the symbol.
   * Replaces the old 16-byte INTRA-label sequence with a baked offset. */
  MCEmitter* mc = t->mc;
  u32 rd = loc_reg(dst);
  ObjSymId sym = mc_label_symbol(mc, target);
  /* `pos` is sampled before each instruction so the relocation is attached to
   * the word about to be emitted. */
  u32 pos = mc->pos(mc);
  aa_emit32(mc, aa64_adrp(rd, 0, 0));
  mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21, sym, 0,
                    0, 0);
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
  mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADD_ABS_LO12_NC, sym, 0,
                    0, 0);
}

/* Register-to-register move covering all four class combinations:
 * fp<-fp (16-byte vector move or scalar fmov sized by dst), fp<-gpr and
 * gpr<-fp (fmov between files, width taken from the integer side), and
 * gpr<-gpr (mov, width taken from dst). */
static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
  /* Identity move elision: same-class same-reg is a no-op on aarch64
   * regardless of width (mov xN,xN and mov wN,wN both leave the low bits
   * untouched). Catches no-op IR_CONVERT (BITCAST, ZEXT/SEXT with
   * src_bits>=dst_bits, FEXT/FTRUNC across-class) when the allocator put
   * dst and src in the same hard reg — common post #2.5 return-reg
   * coalescing, e.g. `convert opnds=[v0,v0]` after a pointer-returning call
   * was emitting `mov x0,x0`. Cross-class (fp<->gpr) bitcasts are not
   * elided here even when the reg numbers match — the register files are
   * disjoint. */
  if (dst.kind == NATIVE_LOC_REG && src.kind == NATIVE_LOC_REG &&
      native_loc_is_fp(dst) == native_loc_is_fp(src) && dst.v.reg == src.v.reg)
    return;
  if (native_loc_is_fp(dst) && native_loc_is_fp(src)) {
    if (type_size32(t, dst.type) == 16u)
      aa_emit32(t->mc, aa_mov_vec16(loc_reg(dst), loc_reg(src)));
    else
      aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
                                  loc_reg(src)));
  } else if (native_loc_is_fp(dst)) {
    aa_emit32(t->mc,
              aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
  } else if (native_loc_is_fp(src)) {
    aa_emit32(t->mc,
              aa_fmov_fp_to_gpr(loc_is_64(t, dst), loc_reg(dst), loc_reg(src)));
  } else {
    aa_emit32(t->mc,
              aa64_mov_reg(loc_is_64(t, dst), loc_reg(dst), loc_reg(src)));
  }
}

/* Forward declaration; defined further down in this file. */
static NativeLoc aa_tmp_loc(KitCgTypeId type, Reg reg);

/* NativeTarget hook: load integer immediate `imm` into `dst`, using a 64-bit
 * or 32-bit materialization according to dst's width. */
static void aa_load_imm_native(NativeTarget* t, NativeLoc dst, i64 imm) {
  aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), imm);
}

/* Load a byte constant of at most 8 bytes into `dst`.
 * The bytes are assembled little-endian into a 64-bit value. An FP
 * destination is reached through the integer scratch AA_TMP0 followed by
 * aa_move; an integer destination is loaded directly. Panics for constants
 * larger than 8 bytes. */
static void aa_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cbytes) {
  u64 v = 0;
  if (cbytes.size > 8u)
    compiler_panic(t->c, ((AANativeTarget*)t)->loc,
                   "aarch64 native target: byte constant too large");
  /* Byte i lands in bits [8i, 8i+8): little-endian assembly. */
  for (u32 i = 0; i < cbytes.size; ++i) v |= (u64)cbytes.bytes[i] << (i * 8u);
  if (native_loc_is_fp(dst)) {
    NativeLoc tmp = aa_tmp_loc(cbytes.type, AA_TMP0);
    aa_emit_load_imm(t->mc, cbytes.size == 8u, AA_TMP0, (i64)v);
    aa_move(t, dst, tmp);
  } else {
    aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), (i64)v);
  }
}

/* Compute the effective address described by `addr` into register `dst`.
 *
 * Supported bases:
 *  - FRAME:       fp + slot offset + addr.offset.
 *  - FRAME_VALUE: the 8-byte value stored in the slot, plus addr.offset.
 *  - REG:         base register + addr.offset.
 *  - GLOBAL:      symbol address, via the GOT (adrp + ldr) when
 *                 aa_use_got_for_sym says so, otherwise adrp + add with the
 *                 addend carried by the relocations.
 * Every path finishes with aa_apply_index to fold in the index component of
 * `addr`. Any other base kind panics. */
static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
  AANativeTarget* a = aa_of(t);
  u32 rd = loc_reg(dst);
  aa_materialize_frame_index(a, &addr, rd);
  switch ((NativeAddrBaseKind)addr.base_kind) {
    case NATIVE_ADDR_BASE_FRAME: {
      AANativeSlot* s = aa_slot(a, addr.base.frame);
      aa_emit_add_imm(a, rd, AA_FP, aa_fp_off_slot(a, s->off) + addr.offset);
      aa_apply_index(a, rd, &addr);
      return;
    }
    case NATIVE_ADDR_BASE_FRAME_VALUE: {
      /* The slot holds a pointer: load it, then apply the offset. */
      NativeAddr load;
      MemAccess mem;
      memset(&load, 0, sizeof load);
      load.base_kind = NATIVE_ADDR_BASE_FRAME;
      load.base.frame = addr.base.frame;
      load.base_type =
          addr.base_type ? addr.base_type : builtin_id(KIT_CG_BUILTIN_I64);
      memset(&mem, 0, sizeof mem);
      mem.type = load.base_type;
      mem.size = 8;
      mem.align = 8;
      aa_emit_mem(a, 1, dst, load, mem);
      if (addr.offset) aa_emit_add_imm(a, rd, rd, addr.offset);
      aa_apply_index(a, rd, &addr);
      return;
    }
    case NATIVE_ADDR_BASE_REG:
      aa_emit_add_imm(a, rd, addr.base.reg, addr.offset);
      aa_apply_index(a, rd, &addr);
      return;
    case NATIVE_ADDR_BASE_GLOBAL: {
      i64 addend = addr.base.global.addend + (i64)addr.offset;
      u32 pos = t->mc->pos(t->mc);
      if (aa_use_got_for_sym(t, addr.base.global.sym)) {
        /* GOT relocations are emitted with a zero addend; the addend is
         * applied with an explicit add after the GOT entry is loaded. */
        aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
        t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
                             R_AARCH64_ADR_GOT_PAGE, addr.base.global.sym, 0, 0,
                             0);
        pos = t->mc->pos(t->mc);
        aa_emit32(t->mc, aa_ldr_uimm(3, rd, rd, 0));
        t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
                             R_AARCH64_LD64_GOT_LO12_NC, addr.base.global.sym,
                             0, 0, 0);
        if (addend) aa_emit_add_i64(a, rd, rd, addend);
        aa_apply_index(a, rd, &addr);
        return;
      }
      /* Direct form: both relocations carry the same addend. */
      aa_emit32(t->mc, aa64_adrp(rd, 0, 0));
      t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
                           R_AARCH64_ADR_PREL_PG_HI21, addr.base.global.sym,
                           addend, 0, 0);
      pos = t->mc->pos(t->mc);
      aa_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
      t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos,
                           R_AARCH64_ADD_ABS_LO12_NC, addr.base.global.sym,
                           addend, 0, 0);
      aa_apply_index(a, rd, &addr);
      return;
    }
    default:
      aa_panic(a, "unsupported load_addr");
  }
}

/* NativeTarget hook: load from `addr` into `dst` (aa_emit_mem, load mode). */
static void aa_load_native(NativeTarget* t, NativeLoc dst, NativeAddr addr,
                           MemAccess mem) {
  aa_emit_mem(aa_of(t), 1, dst, addr, mem);
}

/* NativeTarget hook: store `src` to `addr` (aa_emit_mem, store mode). */
static void aa_store_native(NativeTarget* t, NativeAddr addr, NativeLoc src,
                            MemAccess mem) {
  aa_emit_mem(aa_of(t), 0, src, addr, mem);
}

/* Windows/AArch64 TLS Local-Exec (PE-COFF). Mirrors x64_tls_addr_of_win64:
 *   ldr  rd, [x18, #0x58]            ; TEB.ThreadLocalStoragePointer
 *   adrp x16, _tls_index             ; PAGEBASE_REL21
 *   add  x16, x16, :lo12:_tls_index  ; PAGEOFFSET_12A
 *   ldr  w16, [x16]                  ; module's TLS index
 *   ldr  rd, [rd, x16, lsl #3]       ; this module's TLS block base
 *   add  rd, rd, #:secrel_hi12:sym   ; SECREL_HIGH12A (sh=1)
 *   add  rd, rd, #:secrel_lo12:sym   ; SECREL_LOW12A  (sh=0)
 * We materialize &_tls_index via ADRP+ADD (not LDR :lo12:) on purpose: the
 * COFF reader collapses LDST32→LDST64 width, so an LDR :lo12: form would be
 * mis-scaled at link time; ADD_ABS_LO12_NC carries no width and round-trips
 * cleanly. AA_TMP0 (x16) is the reserved scratch; rd is an allocated reg
 * distinct from x16/x17/x18.
 */
static void aa_tls_addr_of_win(NativeTarget* t, NativeLoc dst, ObjSymId sym,
                               i64 addend) {
  MCEmitter* mc = t->mc;
  u32 sec = mc->section_id;
  u32 rd = loc_reg(dst);
  u32 pos;
  /* Look up `_tls_index`, declaring it as an undefined global symbol the
   * first time it is needed. */
  Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index"));
  ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name);
  if (idx_sym == 0)
    idx_sym =
        obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
  /* (1) rd = TEB.ThreadLocalStoragePointer. */
  aa_emit32(mc, aa_ldr_uimm(3, rd, AA_WIN_TEB_REG, AA_WIN_TEB_TLS_PTR_OFF));
  /* (2)+(3) x16 = &_tls_index via ADRP + ADD. */
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_adrp(AA_TMP0, 0, 0));
  mc->emit_reloc_at(mc, sec, pos, R_AARCH64_ADR_PREL_PG_HI21, idx_sym, 0, 0, 0);
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_TMP0, 0, 0));
  mc->emit_reloc_at(mc, sec, pos, R_AARCH64_ADD_ABS_LO12_NC, idx_sym, 0, 0, 0);
  /* (4) w16 = _tls_index (the loaded value). */
  aa_emit32(mc, aa_ldr_uimm(2, AA_TMP0, AA_TMP0, 0));
  /* (5) rd = TLS array slot for this module: ldr rd, [rd, x16, lsl #3]. */
  aa_emit32(mc, aa_ldst_regoff_v(3, 0, 1, rd, rd, AA_TMP0, 1));
  /* (6) rd += :secrel_hi12:sym (ADD with sh=1; linker patches imm12). */
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1));
  mc->emit_reloc_at(mc, sec, pos, R_COFF_AARCH64_SECREL_HIGH12A, sym, addend, 1,
                    0);
  /* (7) rd += :secrel_lo12:sym (ADD with sh=0). */
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
  mc->emit_reloc_at(mc, sec, pos, R_COFF_AARCH64_SECREL_LOW12A, sym, addend, 1,
                    0);
}

/* Materialize the address of thread-local `sym` (+ `addend`) into `dst`.
 *
 * Three strategies, chosen from the object format:
 *  - descriptor-based TLS (obj_format_tls_via_descriptor): load the
 *    descriptor address into x0 with the TLVP page/pageoff relocation pair,
 *    load its first word into AA_TMP0 and call it with `blr`; the result is
 *    taken from x0, the addend is added, and the value is moved to rd when rd
 *    is not x0. This path writes x0 and, through `blr`, the link register.
 *    NOTE(review): presumably the caller treats this hook as a call with
 *    respect to live registers — confirm.
 *  - Windows TEB model: delegated to aa_tls_addr_of_win.
 *  - ELF: thread pointer (TPIDR_EL0) plus the TPREL hi12/lo12 relocation pair,
 *    i.e. a local-exec style sequence.
 * Any other object format panics. */
static void aa_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
                           i64 addend) {
  AANativeTarget* a = aa_of(t);
  MCEmitter* mc = t->mc;
  u32 rd = loc_reg(dst);
  u32 pos;
  if (obj_format_tls_via_descriptor(t->c)) {
    /* Here `pos` is derived after the emit (current position - 4), unlike the
     * other paths which sample it before; both name the word just emitted. */
    aa_emit32(mc, aa64_adrp(0, 0, 0));
    pos = mc->pos(mc) - 4u;
    mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGE21, sym,
                      0, 0, 0);
    aa_emit32(mc, aa_ldr_uimm(3, 0, 0, 0));
    pos = mc->pos(mc) - 4u;
    mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLVP_LOAD_PAGEOFF12,
                      sym, 0, 0, 0);
    aa_emit32(mc, aa_ldr_uimm(3, AA_TMP0, 0, 0));
    aa_emit32(mc, aa64_blr(AA_TMP0));
    if (addend) aa_emit_add_i64(a, 0, 0, addend);
    if (rd != 0) aa_emit32(mc, aa64_mov_reg(1, rd, 0));
    return;
  }
  if (obj_format_tls_model(t->c) == OBJ_TLS_WINDOWS_TEB) {
    aa_tls_addr_of_win(t, dst, sym, addend);
    return;
  }
  if (t->c->target.obj != KIT_OBJ_ELF) {
    aa_panic(a, "unsupported TLS object format");
  }
  /* rd = thread pointer, then add the symbol's TP-relative offset in two
   * halves (hi12 with sh=1, lo12 with sh=0). */
  aa_emit32(mc, aa_mrs_tpidr_el0(rd));
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 1));
  mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_HI12,
                    sym, addend, 0, 0);
  pos = mc->pos(mc);
  aa_emit32(mc, aa64_add_imm(1, rd, rd, 0, 0));
  mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC,
                    sym, addend, 0, 0);
}

/* Build a NativeLoc naming integer-class register `reg` with value type
 * `type`; all other fields are zero. */
static NativeLoc aa_tmp_loc(KitCgTypeId type, Reg reg) {
  NativeLoc loc;
  memset(&loc, 0, sizeof loc);
  loc.kind = NATIVE_LOC_REG;
  loc.cls = NATIVE_REG_INT;
  loc.type = type;
  loc.v.reg = reg;
  return loc;
}

/* Return a copy of `addr` with its byte offset advanced by `off`. */
static NativeAddr aa_addr_plus(NativeAddr addr, u32 off) {
  addr.offset += (i32)off;
  return addr;
}

/* Copy access.size bytes from `src` to `dst` as an unrolled sequence of
 * load/store pairs through the scratch register AA_TMP0.
 *
 * Each step moves the widest chunk that still fits in the remaining byte
 * count (8, then 4, then 2, then 1). With `backward` == 0 chunks are placed
 * at ascending offsets starting from 0; with `backward` != 0 the same chunk
 * sequence is placed from the end of the region towards its start, so the
 * highest bytes are copied first.
 *
 * NOTE(review): the locals `i64`/`i32`/`i16`/`i8` shadow the integer typedefs
 * of the same names used elsewhere in this file (e.g. `i64 imm`, `(i32)off`),
 * so those type names cannot be used inside this function. */
static void aa_copy_bytes_dir(NativeTarget* t, NativeAddr dst, NativeAddr src,
                              AggregateAccess access, int backward) {
  KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
  KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
  KitCgTypeId i16 = builtin_id(KIT_CG_BUILTIN_I16);
  KitCgTypeId i8 = builtin_id(KIT_CG_BUILTIN_I8);
  NativeLoc tmp = aa_tmp_loc(i64, AA_TMP0);
  u32 off = 0;
  while (off < access.size) {
    u32 rem = access.size - off;
    u32 pos;
    /* Start from the caller's access attributes, then narrow type/size. */
    MemAccess mem = access.mem;
    if (rem >= 8u) {
      mem.type = i64;
      mem.size = 8u;
    } else if (rem >= 4u) {
      mem.type = i32;
      mem.size = 4u;
      tmp.type = i32;
    } else if (rem >= 2u) {
      mem.type = i16;
      mem.size = 2u;
      tmp.type = i16;
    } else {
      mem.type = i8;
      mem.size = 1u;
      tmp.type = i8;
    }
    mem.align = mem.size;
    /* Backward mode mirrors the chunk position around the region's end. */
    pos = backward ? access.size - off - mem.size : off;
    aa_load_native(t, tmp, aa_addr_plus(src, pos), mem);
    aa_store_native(t, aa_addr_plus(dst, pos), tmp, mem);
    off += mem.size;
    /* Reset the scratch type for the next iteration's 8-byte case. */
    tmp.type = i64;
  }
}

/* NativeTarget hook: forward (ascending-offset) byte copy. */
static void aa_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
                          AggregateAccess access) {
  aa_copy_bytes_dir(t, dst, src, access, 0);
}

/* Fill access.size bytes at `dst` with the low byte of `byte_value`.
 * Emits one 1-byte store per byte of the region; the value location is
 * retyped to i8 for the stores. */
static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
                         AggregateAccess access) {
  KitCgTypeId i8 = builtin_id(KIT_CG_BUILTIN_I8);
  NativeLoc byte = byte_value;
  MemAccess mem = access.mem;
  mem.type = i8;
  mem.size = 1u;
  mem.align = 1u;
  byte.type = i8;
  for (u32 off = 0; off < access.size; ++off)
    aa_store_native(t, aa_addr_plus(dst, off), byte, mem);
}

/* Immediate-shift emitters; defined later in this file. */
static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);

/*
Strength-reduce `mul rd, rn, #imm` for the constants accepted by 2155 * aa64_imul_strength_reducible into a single non-mul instruction. Callers 2156 * must gate on aa64_imul_strength_reducible(sf, imm) — this routine panics 2157 * on unhandled constants. */ 2158 static void aa_emit_mul_const_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, 2159 i64 imm) { 2160 u64 a; 2161 if (imm == 0) { 2162 aa_emit32(t->mc, aa64_mov_reg(sf, rd, AA64_ZR)); 2163 return; 2164 } 2165 if (imm == 1) { 2166 if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); 2167 return; 2168 } 2169 if (imm == -1) { 2170 aa_emit32(t->mc, aa64_neg(sf, rd, rn)); 2171 return; 2172 } 2173 /* +2^k: lsl rd, rn, #k */ 2174 a = (u64)imm; 2175 if (imm > 0 && (a & (a - 1u)) == 0u) { 2176 u32 k = (u32)__builtin_ctzll(a); 2177 aa_lsl_imm(t, sf, rd, rn, k); 2178 return; 2179 } 2180 /* -2^k: sub rd, xzr, rn, lsl #k */ 2181 if (imm < 0) { 2182 a = (u64)(-imm); 2183 if (a && (a & (a - 1u)) == 0u) { 2184 u32 k = (u32)__builtin_ctzll(a); 2185 aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf, 2186 .op = 1u, 2187 .S = 0u, 2188 .shift = 0u, 2189 .Rm = rn, 2190 .imm6 = k, 2191 .Rn = AA64_ZR, 2192 .Rd = rd})); 2193 return; 2194 } 2195 } 2196 /* 2^k + 1: add rd, rn, rn, lsl #k */ 2197 if (imm >= 3) { 2198 u64 m = (u64)(imm - 1); 2199 if ((m & (m - 1u)) == 0u) { 2200 u32 k = (u32)__builtin_ctzll(m); 2201 aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf, 2202 .op = 0u, 2203 .S = 0u, 2204 .shift = 0u, 2205 .Rm = rn, 2206 .imm6 = k, 2207 .Rn = rn, 2208 .Rd = rd})); 2209 return; 2210 } 2211 } 2212 /* 1 - 2^k: sub rd, rn, rn, lsl #k */ 2213 if (imm <= -1) { 2214 u64 m = (u64)(1 - imm); 2215 if (m && (m & (m - 1u)) == 0u) { 2216 u32 k = (u32)__builtin_ctzll(m); 2217 aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf, 2218 .op = 1u, 2219 .S = 0u, 2220 .shift = 0u, 2221 .Rm = rn, 2222 .imm6 = k, 2223 .Rn = rn, 2224 .Rd = rd})); 2225 return; 2226 } 2227 } 2228 aa_panic(aa_of(t), "aa_emit_mul_const_imm: 
unhandled constant"); 2229 } 2230 2231 static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs, 2232 NativeLoc rhs) { 2233 u32 sf = loc_is_64(t, dst) ? 1u : 0u; 2234 u32 rd = loc_reg(dst), rn = loc_reg(lhs), rm = loc_reg(rhs); 2235 if (native_loc_is_fp(dst)) { 2236 u32 d = type_size32(t, dst.type) == 8u; 2237 switch (op) { 2238 case BO_FADD: 2239 aa_emit32(t->mc, aa_fp_bin(0x002800u, d, rd, rn, rm)); 2240 return; 2241 case BO_FSUB: 2242 aa_emit32(t->mc, aa_fp_bin(0x003800u, d, rd, rn, rm)); 2243 return; 2244 case BO_FMUL: 2245 aa_emit32(t->mc, aa_fp_bin(0x000800u, d, rd, rn, rm)); 2246 return; 2247 case BO_FDIV: 2248 aa_emit32(t->mc, aa_fp_bin(0x001800u, d, rd, rn, rm)); 2249 return; 2250 default: 2251 aa_panic(aa_of(t), "unsupported floating binary op"); 2252 } 2253 } 2254 if (rhs.kind == NATIVE_LOC_IMM && (op == BO_IADD || op == BO_ISUB)) { 2255 i64 imm = rhs.v.imm; 2256 int is_add = (op == BO_IADD); 2257 u32 imm12, sh; 2258 if (imm < 0) { 2259 is_add = !is_add; 2260 imm = -imm; 2261 } 2262 if (!aa64_addsub_imm_fits(imm, &imm12, &sh)) 2263 aa_panic(aa_of(t), "binop immediate not encodable"); 2264 aa_emit32(t->mc, is_add ? 
aa64_add_imm(sf, rd, rn, imm12, sh) 2265 : aa64_sub_imm(sf, rd, rn, imm12, sh)); 2266 return; 2267 } 2268 if (rhs.kind == NATIVE_LOC_IMM && op == BO_IMUL) { 2269 aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm); 2270 return; 2271 } 2272 if (rhs.kind == NATIVE_LOC_IMM && 2273 (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S)) { 2274 u32 shamt = (u32)rhs.v.imm; /* imm_legal guarantees 0 <= imm < datasize */ 2275 if (op == BO_SHL) 2276 aa_lsl_imm(t, sf, rd, rn, shamt); 2277 else if (op == BO_SHR_U) 2278 aa_lsr_imm(t, sf, rd, rn, shamt); 2279 else 2280 aa_asr_imm(t, sf, rd, rn, shamt); 2281 return; 2282 } 2283 if (rhs.kind == NATIVE_LOC_IMM && 2284 (op == BO_AND || op == BO_OR || op == BO_XOR)) { 2285 u32 N, immr, imms; 2286 if (!aa64_logimm_encode((u64)rhs.v.imm, sf, &N, &immr, &imms)) 2287 aa_panic(aa_of(t), "logical immediate not encodable"); 2288 if (op == BO_AND) 2289 aa_emit32(t->mc, aa64_and_imm(sf, rd, rn, N, immr, imms)); 2290 else if (op == BO_OR) 2291 aa_emit32(t->mc, aa64_orr_imm(sf, rd, rn, N, immr, imms)); 2292 else 2293 aa_emit32(t->mc, aa64_eor_imm(sf, rd, rn, N, immr, imms)); 2294 return; 2295 } 2296 switch (op) { 2297 case BO_IADD: 2298 aa_emit32(t->mc, aa64_add(sf, rd, rn, rm)); 2299 return; 2300 case BO_ISUB: 2301 aa_emit32(t->mc, aa64_sub(sf, rd, rn, rm)); 2302 return; 2303 case BO_IMUL: 2304 aa_emit32(t->mc, aa64_mul(sf, rd, rn, rm)); 2305 return; 2306 case BO_SDIV: 2307 aa_emit32(t->mc, aa64_sdiv(sf, rd, rn, rm)); 2308 return; 2309 case BO_UDIV: 2310 aa_emit32(t->mc, aa64_udiv(sf, rd, rn, rm)); 2311 return; 2312 case BO_SREM: 2313 aa_emit32(t->mc, aa64_sdiv(sf, AA_TMP0, rn, rm)); 2314 aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm)); 2315 aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0)); 2316 return; 2317 case BO_UREM: 2318 aa_emit32(t->mc, aa64_udiv(sf, AA_TMP0, rn, rm)); 2319 aa_emit32(t->mc, aa64_mul(sf, AA_TMP0, AA_TMP0, rm)); 2320 aa_emit32(t->mc, aa64_sub(sf, rd, rn, AA_TMP0)); 2321 return; 2322 case BO_AND: 2323 aa_emit32(t->mc, 
aa64_and(sf, rd, rn, rm)); 2324 return; 2325 case BO_OR: 2326 aa_emit32(t->mc, aa64_orr(sf, rd, rn, rm)); 2327 return; 2328 case BO_XOR: 2329 aa_emit32(t->mc, aa64_eor(sf, rd, rn, rm)); 2330 return; 2331 case BO_SHL: 2332 aa_emit32(t->mc, aa64_lslv(sf, rd, rn, rm)); 2333 return; 2334 case BO_SHR_U: 2335 aa_emit32(t->mc, aa64_lsrv(sf, rd, rn, rm)); 2336 return; 2337 case BO_SHR_S: 2338 aa_emit32(t->mc, aa64_asrv(sf, rd, rn, rm)); 2339 return; 2340 default: 2341 aa_panic(aa_of(t), "unsupported binary op"); 2342 } 2343 } 2344 2345 static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { 2346 u32 sf = loc_is_64(t, dst) ? 1u : 0u; 2347 if (native_loc_is_fp(dst)) { 2348 switch (op) { 2349 case UO_FNEG: 2350 case UO_NEG: 2351 aa_emit32(t->mc, aa_fneg(type_size32(t, dst.type) == 8u, loc_reg(dst), 2352 loc_reg(src))); 2353 return; 2354 default: 2355 aa_panic(aa_of(t), "unsupported floating unary op"); 2356 } 2357 } 2358 switch (op) { 2359 case UO_NEG: 2360 aa_emit32(t->mc, aa64_neg(sf, loc_reg(dst), loc_reg(src))); 2361 return; 2362 case UO_BNOT: 2363 aa_emit32(t->mc, aa64_mvn(sf, loc_reg(dst), loc_reg(src))); 2364 return; 2365 case UO_NOT: 2366 aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(src), 0, 0)); 2367 aa_emit32(t->mc, aa_cset(sf, loc_reg(dst), 0x0u)); 2368 return; 2369 default: 2370 aa_panic(aa_of(t), "unsupported unary op"); 2371 } 2372 } 2373 2374 static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, 2375 NativeLoc rhs) { 2376 if (native_loc_is_fp(lhs)) { 2377 aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), 2378 loc_reg(rhs))); 2379 return; 2380 } 2381 { 2382 u32 sf = loc_is_64(t, lhs) ? 
1u : 0u; 2383 if (rhs.kind == NATIVE_LOC_IMM) { 2384 u32 imm12 = 0, sh = 0; 2385 if (rhs.v.imm < 0 || !aa64_addsub_imm_fits(rhs.v.imm, &imm12, &sh)) 2386 aa_panic(aa_of(t), "cmp immediate not encodable"); 2387 aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(lhs), imm12, sh)); 2388 return; 2389 } 2390 aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs))); 2391 } 2392 } 2393 2394 static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs, 2395 NativeLoc rhs) { 2396 u32 sf = loc_is_64(t, dst); 2397 u32 rd = loc_reg(dst); 2398 aa_emit_cmp_to_flags(t, lhs, rhs); 2399 /* CMP_ONE_F (ordered & !=) and CMP_UEQ_F (unordered | ==) have no single 2400 * AArch64 FP condition. After FCMP, unordered sets V (and Z=0), so VC 2401 * (V==0) selects "ordered". */ 2402 if (op == CMP_ONE_F) { 2403 /* ordered & not-equal: NE masked to the ordered case. */ 2404 aa_emit32(t->mc, aa_cset(sf, rd, 0x1u)); /* cset rd, NE */ 2405 aa_emit32(t->mc, 2406 aa64_csel_enc(sf, rd, rd, AA64_ZR, 0x7u)); /* csel rd,rd,zr,VC */ 2407 return; 2408 } 2409 if (op == CMP_UEQ_F) { 2410 /* equal, or forced to 1 when unordered. 
*/ 2411 aa_emit32(t->mc, aa_cset(sf, rd, 0x0u)); /* cset rd, EQ */ 2412 aa_emit32(t->mc, aa64_csinc_enc(sf, rd, rd, AA64_ZR, 2413 0x7u)); /* csinc rd,rd,zr,VC */ 2414 return; 2415 } 2416 aa_emit32(t->mc, aa_cset(sf, rd, cmp_cond(op))); 2417 } 2418 2419 static void aa_convert(NativeTarget* t, ConvKind op, NativeLoc dst, 2420 NativeLoc src) { 2421 int dst_fp = native_loc_is_fp(dst); 2422 int src_fp = native_loc_is_fp(src); 2423 switch (op) { 2424 case CV_TRUNC: 2425 case CV_BITCAST: 2426 aa_move(t, dst, src); 2427 return; 2428 case CV_ZEXT: { 2429 u32 src_bits = type_size32(t, src.type) * 8u; 2430 u32 dst_bits = type_size32(t, dst.type) * 8u; 2431 u32 sf = dst_bits > 32u; 2432 if (src_bits >= dst_bits) { 2433 aa_move(t, dst, src); 2434 } else if (src_bits >= 32u) { 2435 aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dst), loc_reg(src))); 2436 } else { 2437 aa_emit32(t->mc, 2438 aa_ubfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 1u)); 2439 } 2440 return; 2441 } 2442 case CV_SEXT: { 2443 u32 src_bits = type_size32(t, src.type) * 8u; 2444 u32 dst_bits = type_size32(t, dst.type) * 8u; 2445 u32 sf = dst_bits > 32u; 2446 if (src_bits >= dst_bits) { 2447 aa_move(t, dst, src); 2448 } else { 2449 aa_emit32(t->mc, 2450 aa_sbfm(sf, loc_reg(dst), loc_reg(src), 0, src_bits - 1u)); 2451 } 2452 return; 2453 } 2454 case CV_ITOF_S: 2455 aa_emit32(t->mc, aa_scvtf(type_size32(t, dst.type) == 8u, 2456 loc_is_64(t, src), loc_reg(dst), loc_reg(src))); 2457 return; 2458 case CV_ITOF_U: 2459 aa_emit32(t->mc, aa_ucvtf(type_size32(t, dst.type) == 8u, 2460 loc_is_64(t, src), loc_reg(dst), loc_reg(src))); 2461 return; 2462 case CV_FTOI_S: 2463 aa_emit32(t->mc, 2464 aa_fcvtzs(loc_is_64(t, dst), type_size32(t, src.type) == 8u, 2465 loc_reg(dst), loc_reg(src))); 2466 return; 2467 case CV_FTOI_U: 2468 aa_emit32(t->mc, 2469 aa_fcvtzu(loc_is_64(t, dst), type_size32(t, src.type) == 8u, 2470 loc_reg(dst), loc_reg(src))); 2471 return; 2472 case CV_FEXT: 2473 if (dst_fp && src_fp) 2474 aa_emit32(t->mc, 
aa_fcvt_d_s(loc_reg(dst), loc_reg(src))); 2475 else 2476 aa_move(t, dst, src); 2477 return; 2478 case CV_FTRUNC: 2479 if (dst_fp && src_fp) 2480 aa_emit32(t->mc, aa_fcvt_s_d(loc_reg(dst), loc_reg(src))); 2481 else 2482 aa_move(t, dst, src); 2483 return; 2484 default: 2485 aa_panic(aa_of(t), "unsupported conversion"); 2486 } 2487 } 2488 2489 static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, 2490 u32 align) { 2491 AANativeTarget* a = aa_of(t); 2492 u32 use_align = align < 16u ? 16u : align; 2493 if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2"); 2494 aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u)); 2495 aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align); 2496 aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1)); 2497 aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0)); 2498 aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0)); 2499 aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0)); 2500 /* The alloca result is sp + outgoing-area bytes. On the known-frame path 2501 * max_outgoing is already final, so emit the final `add dst, sp, #N` here; on 2502 * the single-pass path it is not known yet, so record a patch. */ 2503 if (a->frame.known_frame) { 2504 u32 imm12, sh; 2505 if (!aa64_addsub_imm_fits(a->frame.max_outgoing, &imm12, &sh)) 2506 aa_panic(a, "outgoing area too large for alloca result"); 2507 aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, imm12, sh)); 2508 } else { 2509 AAPatch* p = aa_patch_alloc(a); 2510 p->kind = AA_PATCH_ALLOCA; 2511 p->pos = t->mc->pos(t->mc); 2512 p->u.dst_reg = loc_reg(dst); 2513 a->nalloca++; 2514 aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0)); 2515 } 2516 } 2517 2518 static MemAccess aa_mem_for_type(NativeTarget* t, KitCgTypeId type, u32 size) { 2519 MemAccess mem; 2520 memset(&mem, 0, sizeof mem); 2521 mem.type = type; 2522 mem.size = size ? 
size : type_size32(t, type); 2523 mem.align = type_align32(t, type); 2524 if (mem.align > mem.size && mem.size) mem.align = mem.size; 2525 return mem; 2526 } 2527 2528 /* native_loc_reg / native_loc_stack are shared in native_target.h. */ 2529 2530 static NativeAddr aa_loc_addr(AANativeTarget* a, NativeLoc loc, u32 offset) { 2531 NativeAddr addr; 2532 memset(&addr, 0, sizeof addr); 2533 switch ((NativeLocKind)loc.kind) { 2534 case NATIVE_LOC_FRAME: 2535 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2536 addr.base.frame = loc.v.frame; 2537 addr.base_type = loc.type; 2538 addr.offset = (i32)offset; 2539 return addr; 2540 case NATIVE_LOC_STACK: 2541 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2542 addr.base.frame = loc.v.stack.slot; 2543 addr.base_type = loc.type; 2544 addr.offset = loc.v.stack.offset + (i32)offset; 2545 return addr; 2546 case NATIVE_LOC_ADDR: 2547 addr = loc.v.addr; 2548 addr.offset += (i32)offset; 2549 return addr; 2550 default: 2551 aa_panic(a, "location is not addressable"); 2552 } 2553 return addr; 2554 } 2555 2556 static void aa_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { 2557 AANativeTarget* a = aa_of(t); 2558 NativeAddr addr = aa_loc_addr(a, src, 0); 2559 aa_load_addr(t, dst, addr); 2560 } 2561 2562 static void aa_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 2563 u32 offset, u32 size) { 2564 AANativeTarget* a = aa_of(t); 2565 MemAccess mem = aa_mem_for_type(t, dst.type, size); 2566 if (src.kind == NATIVE_LOC_REG) { 2567 aa_move(t, dst, src); 2568 return; 2569 } 2570 if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK || 2571 src.kind == NATIVE_LOC_ADDR) { 2572 NativeAddr addr = aa_loc_addr(a, src, offset); 2573 addr.base_type = dst.type; 2574 aa_emit_mem(a, 1, dst, addr, mem); 2575 return; 2576 } 2577 if (src.kind == NATIVE_LOC_IMM) { 2578 aa_emit_load_imm(t->mc, loc_is_64(t, dst), loc_reg(dst), src.v.imm); 2579 return; 2580 } 2581 aa_panic(a, "unsupported call argument source"); 2582 } 2583 2584 static void 
/* Write the register `src` to `offset` within `dst` using a `size`-byte
 * access. A memory destination is stored to; a register destination receives
 * a plain move (offset is not applied). Anything else panics. */
static void aa_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
                          u32 offset, u32 size) {
  AANativeTarget* a = aa_of(t);
  MemAccess mem = aa_mem_for_type(t, src.type, size);

  switch ((NativeLocKind)dst.kind) {
    case NATIVE_LOC_FRAME:
    case NATIVE_LOC_STACK:
    case NATIVE_LOC_ADDR: {
      NativeAddr to = aa_loc_addr(a, dst, offset);
      to.base_type = src.type;
      aa_emit_mem(a, 0, src, to, mem);
      return;
    }
    case NATIVE_LOC_REG:
      aa_move(t, dst, src);
      return;
    default:
      break;
  }
  aa_panic(a, "unsupported call return destination");
}

/* Store the register `src` into the outgoing stack-argument slot at
 * `stack_off`.
 *
 * A tail call addresses the slot relative to fp (offset supplied by
 * aa_fp_off_tail_out_arg), i.e. it writes into the caller's own incoming
 * argument window, which is what the tail-callee will read. A normal call
 * addresses the sp-anchored outgoing area (aa_sp_off_out_arg). */
static void aa_store_outgoing_part(NativeTarget* t, int tail_call,
                                   u32 stack_off, NativeLoc src, u32 size) {
  NativeAddr slot;
  MemAccess mem = aa_mem_for_type(t, src.type, size);

  memset(&slot, 0, sizeof slot);
  slot.base_kind = NATIVE_ADDR_BASE_REG;
  slot.base_type = src.type;
  if (tail_call) {
    slot.base.reg = AA_FP;
    slot.offset = aa_fp_off_tail_out_arg(aa_of(t), stack_off);
  } else {
    slot.base.reg = AA_SP;
    slot.offset = aa_sp_off_out_arg(stack_off);
  }
  aa_emit_mem(aa_of(t), 0, src, slot, mem);
}
aa_fp_off_tail_out_arg(aa_of(t), stack_off) 2615 : aa_sp_off_out_arg(stack_off); 2616 aa_emit_mem(aa_of(t), 0, src, addr, mem); 2617 } 2618 2619 static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi, 2620 const NativeCallDesc* desc, u32 i, 2621 ABIArgInfo* scratch) { 2622 if (abi && i < abi->nparams) return &abi->params[i]; 2623 memset(scratch, 0, sizeof *scratch); 2624 scratch->kind = ABI_ARG_DIRECT; 2625 scratch->flags = ABI_AF_NONE; 2626 scratch->nparts = 1; 2627 scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); 2628 /* Windows ARM64 routes variadic floating-point arguments through the integer 2629 * registers/stack (the classifier's remap_fp_parts_to_int does the same for 2630 * the *named* params of a variadic function); the value's bit pattern moves 2631 * via fmov x,d. Every other ABI keeps the `...` FP args in v registers. */ 2632 ((ABIArgPart*)scratch->parts)[0].cls = 2633 (cg_type_is_float(t->c, desc->args[i].type) && 2634 !(abi && abi->vararg_fp_via_int)) 2635 ? ABI_CLASS_FP 2636 : ABI_CLASS_INT; 2637 ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; 2638 ((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type); 2639 ((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type); 2640 ((ABIArgPart*)scratch->parts)[0].src_offset = 0; 2641 return scratch; 2642 } 2643 2644 /* Stack footprint of a single argument part. AAPCS64 uses 8-byte slots. Apple 2645 * ARM64 uses compact 4-byte slots for fixed stack-passed int32-sized values, 2646 * but its forced stack variadics still use 8-byte slots. */ 2647 static u32 aa_stack_arg_min_align(const ABIFuncInfo* abi) { 2648 return (abi && abi->stack_arg_min_align) ? 
abi->stack_arg_min_align : 8u; 2649 } 2650 2651 static u32 aa_vararg_stack_arg_min_align(const ABIFuncInfo* abi) { 2652 if (abi && abi->vararg_stack_arg_min_align) 2653 return abi->vararg_stack_arg_min_align; 2654 return aa_stack_arg_min_align(abi); 2655 } 2656 2657 static u32 aa_vararg_stack_start(const ABIFuncInfo* abi, u32 cursor) { 2658 return align_up_u32(cursor, aa_vararg_stack_arg_min_align(abi)); 2659 } 2660 2661 /* Natural stack alignment of a part, capped at 16 (binary128). */ 2662 static u32 aa_part_stack_align_min(u32 min_align, const ABIArgPart* part) { 2663 u32 al = part->align ? part->align : 8u; 2664 if (al < min_align) al = min_align; 2665 if (al > 16u) al = 16u; 2666 return al; 2667 } 2668 2669 static u32 aa_part_stack_align(const ABIFuncInfo* abi, 2670 const ABIArgPart* part) { 2671 return aa_part_stack_align_min(aa_stack_arg_min_align(abi), part); 2672 } 2673 2674 static u32 aa_part_vararg_stack_align(const ABIFuncInfo* abi, 2675 const ABIArgPart* part) { 2676 return aa_part_stack_align_min(aa_vararg_stack_arg_min_align(abi), part); 2677 } 2678 2679 static u32 aa_part_stack_size(const ABIFuncInfo* abi, 2680 const ABIArgPart* part) { 2681 return align_up_u32(part->size ? part->size : 8u, 2682 aa_part_stack_align(abi, part)); 2683 } 2684 2685 static u32 aa_part_vararg_stack_size(const ABIFuncInfo* abi, 2686 const ABIArgPart* part) { 2687 return align_up_u32(part->size ? part->size : 8u, 2688 aa_part_vararg_stack_align(abi, part)); 2689 } 2690 2691 /* The scalar type used to move one ABI part through a register. Aggregate 2692 * args/results are split into parts; each part must move at its own width, not 2693 * the (possibly >8-byte) aggregate width. 
*/ 2694 static KitCgTypeId aa_part_scalar_type(const ABIArgPart* part) { 2695 if (part->cls == ABI_CLASS_FP) { 2696 if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32); 2697 if (part->size <= 8u) return builtin_id(KIT_CG_BUILTIN_F64); 2698 return builtin_id(KIT_CG_BUILTIN_F128); 2699 } 2700 switch (part->size) { 2701 case 1u: 2702 return builtin_id(KIT_CG_BUILTIN_I8); 2703 case 2u: 2704 return builtin_id(KIT_CG_BUILTIN_I16); 2705 case 4u: 2706 return builtin_id(KIT_CG_BUILTIN_I32); 2707 default: 2708 return builtin_id(KIT_CG_BUILTIN_I64); 2709 } 2710 } 2711 2712 static u32 aa_class_vararg_stack_size(const ABIFuncInfo* abi, 2713 const ABIArgInfo* ai) { 2714 u32 total = 0; 2715 u32 min_align = aa_vararg_stack_arg_min_align(abi); 2716 if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; 2717 if (ai->kind == ABI_ARG_INDIRECT) return 8u; 2718 for (u32 p = 0; p < ai->nparts; ++p) { 2719 total = align_up_u32(total, aa_part_vararg_stack_align(abi, &ai->parts[p])); 2720 total += aa_part_vararg_stack_size(abi, &ai->parts[p]); 2721 } 2722 return align_up_u32(total ? 
total : min_align, min_align); 2723 } 2724 2725 static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { 2726 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2727 u32 next_int = 0, next_fp = 0, stack = 0; 2728 for (u32 i = 0; i < desc->nargs; ++i) { 2729 ABIArgInfo tmp; 2730 const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp); 2731 int force_stack = 2732 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 2733 if (ai->kind == ABI_ARG_IGNORE) continue; 2734 if (force_stack) { 2735 stack = aa_vararg_stack_start(abi, stack); 2736 stack += aa_class_vararg_stack_size(abi, ai); 2737 continue; 2738 } 2739 if (ai->kind == ABI_ARG_INDIRECT) { 2740 if (next_int < 8u) 2741 next_int++; 2742 else 2743 stack += 8u; 2744 continue; 2745 } 2746 for (u32 p = 0; p < ai->nparts; ++p) { 2747 const ABIArgPart* part = &ai->parts[p]; 2748 if (part->cls == ABI_CLASS_FP) { 2749 if (next_fp < 8u) 2750 next_fp++; 2751 else { 2752 stack = align_up_u32(stack, aa_part_stack_align(abi, part)); 2753 stack += aa_part_stack_size(abi, part); 2754 } 2755 } else { 2756 if (next_int < 8u) 2757 next_int++; 2758 else { 2759 stack = align_up_u32(stack, aa_part_stack_align(abi, part)); 2760 stack += aa_part_stack_size(abi, part); 2761 } 2762 } 2763 } 2764 } 2765 return align_up_u32(stack, 16u); 2766 } 2767 2768 /* Stack-argument bytes a call with `fn_type`'s fixed parameters uses. Reuses 2769 * aa_call_stack_size by routing the declared params through it (their ABI 2770 * classification is independent of the actual operand locations, which 2771 * aa_call_stack_size ignores for register/stack placement). */ 2772 static u32 aa_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 2773 int* variadic, u32* nparams) { 2774 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); 2775 NativeCallDesc d; 2776 if (variadic) *variadic = abi ? (int)abi->variadic : 0; 2777 if (nparams) *nparams = abi ? 
abi->nparams : 0u; 2778 memset(&d, 0, sizeof d); 2779 d.fn_type = fn_type; 2780 d.nargs = abi ? abi->nparams : 0u; 2781 if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); 2782 return aa_call_stack_size(t, &d); 2783 } 2784 2785 /* Pure NativeTarget.call_stack_bytes: outgoing stack bytes for a full call 2786 * descriptor (handles variadic stack args, unlike signature_stack_bytes which 2787 * sees only the fixed params). aa_call_stack_size reads only fn_type and each 2788 * args[i].type, so the frame-planning pre-pass can call this before emitting. 2789 */ 2790 static u32 aa_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { 2791 return aa_call_stack_size(t, desc); 2792 } 2793 2794 /* One register-passed call argument: write `src` (or its address) into the 2795 * argument register `dst`. Collected during planning and emitted as a batch so 2796 * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */ 2797 typedef NativeArgMove AAArgMove; 2798 2799 /* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */ 2800 #define AA_MAX_REG_ARG_MOVES 16u 2801 2802 static void aa_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { 2803 if (m->is_addr) 2804 aa_addr_of_loc(t, m->dst, m->src); 2805 else 2806 aa_load_part(t, m->dst, m->src, m->src_offset, m->size); 2807 } 2808 2809 /* Emit register-argument moves as a parallel copy via the shared scheduler: 2810 * every register is read by all moves that source it before any move overwrites 2811 * it; a true cycle is broken through a scratch. The allocator usually arranges 2812 * a conflict-free order, but not always (notably variadic args, where it can 2813 * leave a prior call's result in x0 even though x0 is this call's first arg 2814 * register), so the backend must not assume a safe order. Cycle scratch is 2815 * AA_TMP1 (x17) for int and v16 for fp — distinct from x16 (AA_TMP0), which may 2816 * hold a stashed indirect callee. 
*/ 2817 static void aa_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, 2818 u32 n) { 2819 NativeArgShuffle s; 2820 if (n > AA_MAX_REG_ARG_MOVES) 2821 aa_panic(aa_of(t), "too many register arguments"); 2822 memset(&s, 0, sizeof s); 2823 s.t = t; 2824 s.emit_one = aa_emit_one_arg_move; 2825 s.reg_move = aa_move; 2826 s.scratch[NATIVE_REG_INT] = AA_TMP1; 2827 s.scratch[NATIVE_REG_FP] = 16u; 2828 native_arg_shuffle(&s, moves, n); 2829 } 2830 2831 static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, 2832 NativeCallPlan* plan) { 2833 NativeCallPlanRet* rets; 2834 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2835 memset(plan, 0, sizeof *plan); 2836 rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL; 2837 plan->callee = desc->callee; 2838 plan->rets = rets; 2839 plan->flags = desc->flags; 2840 plan->has_sret = abi && abi->has_sret; 2841 plan->is_variadic = abi && abi->variadic; 2842 plan->stack_arg_size = aa_call_stack_size(t, desc); 2843 native_frame_note_outgoing(&aa_of(t)->frame, plan->stack_arg_size); 2844 /* Indirect call whose callee lives in x0..x7: the upcoming arg-load loop 2845 * writes those same registers and would clobber the function pointer 2846 * before blr reads it. Stash callee into AA_TMP0 (x16) up front and 2847 * retarget the call. (AA_TMP0 is a backend scratch, never an arg reg.) 
*/ 2848 if (plan->callee.kind == NATIVE_LOC_REG && 2849 (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && 2850 plan->callee.v.reg < 8u) { 2851 NativeLoc scratch = 2852 native_loc_reg(plan->callee.type, NATIVE_REG_INT, AA_TMP0); 2853 aa_move(t, scratch, plan->callee); 2854 plan->callee = scratch; 2855 } 2856 { 2857 u32 next_int = 0, next_fp = 0, stack = 0, nmoves = 0; 2858 int tail_call = (desc->flags & CG_CALL_TAIL) != 0; 2859 AAArgMove moves[AA_MAX_REG_ARG_MOVES]; 2860 /* Stack-passed arguments are stored inline as we walk, *before* any 2861 * argument register is written, so a stack-arg source that the allocator 2862 * left in an arg register (e.g. a prior call's result still in x0, consumed 2863 * as a variadic stack arg) is read while it is still live. Stack stores 2864 * only touch memory and the AA_TMP0/v16 scratch, never an arg-register 2865 * source, so emitting them first cannot clobber a register-arg source. 2866 * Register-passed arguments are collected and emitted afterward as a 2867 * parallel copy (aa_emit_reg_arg_moves) so they likewise never overwrite a 2868 * register another argument still needs to read. */ 2869 for (u32 i = 0; i < desc->nargs; ++i) { 2870 ABIArgInfo tmp; 2871 const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp); 2872 int force_stack = 2873 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 2874 if (ai->kind == ABI_ARG_IGNORE) continue; 2875 if (force_stack) { 2876 NativeLoc tmpreg = 2877 native_loc_reg(desc->args[i].type, NATIVE_REG_INT, AA_TMP0); 2878 u32 n = aa_class_vararg_stack_size(abi, ai); 2879 u32 off = 0; 2880 stack = aa_vararg_stack_start(abi, stack); 2881 while (off < n) { 2882 u32 chunk = (n - off > 8u) ? 
8u : (n - off); 2883 aa_load_part(t, tmpreg, desc->args[i], off, chunk); 2884 aa_store_outgoing_part(t, tail_call, stack + off, tmpreg, chunk); 2885 off += chunk; 2886 } 2887 stack += n; 2888 continue; 2889 } 2890 if (ai->kind == ABI_ARG_INDIRECT) { 2891 if (next_int < 8u) { 2892 AAArgMove* m = &moves[nmoves++]; 2893 m->dst = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), 2894 NATIVE_REG_INT, next_int++); 2895 m->src = desc->args[i]; 2896 m->src_offset = 0; 2897 m->size = 8; 2898 m->is_addr = 1; 2899 } else { 2900 NativeLoc ptr = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), 2901 NATIVE_REG_INT, AA_TMP0); 2902 aa_addr_of_loc(t, ptr, desc->args[i]); 2903 aa_store_outgoing_part(t, tail_call, stack, ptr, 8); 2904 stack += 8u; 2905 } 2906 continue; 2907 } 2908 for (u32 p = 0; p < ai->nparts; ++p) { 2909 const ABIArgPart* part = &ai->parts[p]; 2910 NativeAllocClass cls = 2911 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2912 if ((cls == NATIVE_REG_FP && next_fp < 8u) || 2913 (cls == NATIVE_REG_INT && next_int < 8u)) { 2914 AAArgMove* m = &moves[nmoves++]; 2915 m->dst = 2916 native_loc_reg(desc->args[i].type, cls, 2917 cls == NATIVE_REG_FP ? next_fp++ : next_int++); 2918 m->src = desc->args[i]; 2919 m->src_offset = part->src_offset; 2920 m->size = part->size; 2921 m->is_addr = 0; 2922 } else { 2923 NativeLoc tmpreg = native_loc_reg( 2924 desc->args[i].type, cls, cls == NATIVE_REG_FP ? 
16u : AA_TMP0); 2925 aa_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); 2926 stack = align_up_u32(stack, aa_part_stack_align(abi, part)); 2927 aa_store_outgoing_part(t, tail_call, stack, tmpreg, part->size); 2928 stack += aa_part_stack_size(abi, part); 2929 } 2930 } 2931 } 2932 aa_emit_reg_arg_moves(t, moves, nmoves); 2933 /* Set the indirect-result register (x8) *after* the argument loads: an 2934 * argument source may have been allocated to x8, and the sret pointer load 2935 * would otherwise clobber it before it is moved into its argument 2936 * register. */ 2937 if (abi && abi->has_sret) { 2938 NativeLoc x8 = 2939 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 8u); 2940 if (desc->flags & CG_CALL_TAIL) { 2941 AANativeTarget* a = aa_of(t); 2942 NativeLoc saved = native_loc_stack(x8.type, a->sret_ptr_slot, 0); 2943 aa_load_part(t, x8, saved, 0, 8); 2944 } else if (desc->nresults) { 2945 aa_addr_of_loc(t, x8, desc->results[0]); 2946 } 2947 } 2948 } 2949 if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { 2950 u32 nr = 0, ni = 0, nf = 0; 2951 for (u32 p = 0; p < abi->ret.nparts; ++p) { 2952 const ABIArgPart* part = &abi->ret.parts[p]; 2953 NativeAllocClass cls = 2954 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2955 KitCgTypeId pty = aa_part_scalar_type(part); 2956 rets[nr].src = 2957 native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? 
nf++ : ni++); 2958 rets[nr].dst = desc->results[0]; 2959 if (rets[nr].dst.kind == NATIVE_LOC_FRAME) 2960 rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, 2961 (i32)part->src_offset); 2962 else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { 2963 rets[nr].dst.v.stack.offset += (i32)part->src_offset; 2964 rets[nr].dst.type = pty; 2965 } else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) { 2966 rets[nr].dst.v.addr.offset += (i32)part->src_offset; 2967 rets[nr].dst.type = pty; 2968 } 2969 rets[nr].mem = aa_mem_for_type(t, pty, part->size); 2970 nr++; 2971 } 2972 plan->nrets = nr; 2973 } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { 2974 plan->nrets = 0; 2975 } else if (!abi && desc->nresults) { 2976 rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, 0); 2977 rets[0].dst = desc->results[0]; 2978 rets[0].mem = aa_mem_for_type(t, desc->results[0].type, 0); 2979 plan->nrets = 1; 2980 } 2981 } 2982 2983 static void aa_ret(NativeTarget* t); 2984 2985 static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) { 2986 AANativeTarget* a = aa_of(t); 2987 if (a->frame.known_frame) { 2988 /* Frame is final: emit the tail epilogue (callee restores + frame restore + 2989 * branch) directly, exactly the words aa_apply_patches would patch in but 2990 * without the reserved NOP padding. 
*/ 2991 AAFrameLayout L = 2992 aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes); 2993 u32 words[AA_TAIL_WORDS]; 2994 u32 n = 0; 2995 aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); 2996 aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, &L); 2997 if (n >= AA_TAIL_WORDS) aa_panic(a, "tail epilogue too large"); 2998 for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]); 2999 if (callee.kind == NATIVE_LOC_REG) { 3000 aa_emit32(t->mc, aa64_br(loc_reg(callee))); 3001 } else if (callee.kind == NATIVE_LOC_GLOBAL) { 3002 u32 pos = t->mc->pos(t->mc); 3003 aa_emit32(t->mc, aa64_b(0)); 3004 t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, R_AARCH64_JUMP26, 3005 callee.v.global.sym, callee.v.global.addend, 0, 0); 3006 } else { 3007 aa_panic(a, "unsupported tail target"); 3008 } 3009 return; 3010 } 3011 /* Single-pass: reserve a worst-case region and record a patch; the callee 3012 * restores and frame restore depend on the not-yet-final frame layout. */ 3013 AAPatch* p = aa_patch_alloc(a); 3014 p->kind = AA_PATCH_TAIL; 3015 p->pos = t->mc->pos(t->mc); 3016 p->u.callee = callee; 3017 for (u32 i = 0; i < AA_TAIL_WORDS; ++i) aa_emit32(t->mc, 0xd503201fu); 3018 if (callee.kind == NATIVE_LOC_GLOBAL) { 3019 t->mc->emit_reloc_at(t->mc, t->mc->section_id, 3020 p->pos + (AA_TAIL_WORDS - 1u) * 4u, R_AARCH64_JUMP26, 3021 callee.v.global.sym, callee.v.global.addend, 0, 0); 3022 } 3023 } 3024 3025 static void aa_emit_call(NativeTarget* t, const NativeCallPlan* plan) { 3026 int is_tail = (plan->flags & CG_CALL_TAIL) != 0; 3027 if (is_tail) { 3028 if (plan->callee.kind != NATIVE_LOC_GLOBAL && 3029 plan->callee.kind != NATIVE_LOC_REG) 3030 aa_panic(aa_of(t), "unsupported tail target"); 3031 aa_emit_tail_site(t, plan->callee); 3032 return; 3033 } 3034 if (plan->callee.kind == NATIVE_LOC_GLOBAL) { 3035 aa_emit32(t->mc, aa64_bl(0)); 3036 t->mc->emit_reloc_at(t->mc, t->mc->section_id, t->mc->pos(t->mc) - 4u, 3037 R_AARCH64_CALL26, plan->callee.v.global.sym, 
3038 plan->callee.v.global.addend, 0, 0); 3039 return; 3040 } 3041 if (plan->callee.kind == NATIVE_LOC_REG) { 3042 aa_emit32(t->mc, aa64_blr(loc_reg(plan->callee))); 3043 return; 3044 } 3045 aa_panic(aa_of(t), "unsupported call target"); 3046 } 3047 3048 static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd, 3049 const NativeLoc* value, 3050 NativeCallPlanRet** out_rets, u32* out_nrets) { 3051 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 3052 NativeCallPlanRet* rets = NULL; 3053 u32 nr = 0; 3054 if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4); 3055 if (value && abi && abi->ret.kind == ABI_ARG_INDIRECT) { 3056 AANativeTarget* a = aa_of(t); 3057 /* Hold the sret destination pointer in x8, not AA_TMP1: aa_copy_bytes 3058 * materializes out-of-range source/dest frame offsets into AA_TMP1, which 3059 * would clobber the destination base mid-copy (only triggered once a frame 3060 * is large enough that the source offset escapes stur's signed-9 range). 
*/ 3061 NativeLoc dstp = 3062 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_X8); 3063 NativeLoc saved = native_loc_stack(dstp.type, a->sret_ptr_slot, 0); 3064 NativeAddr dst_addr, src_addr; 3065 AggregateAccess access; 3066 aa_load_part(t, dstp, saved, 0, 8); 3067 memset(&dst_addr, 0, sizeof dst_addr); 3068 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 3069 dst_addr.base.reg = AA_X8; 3070 dst_addr.base_type = value->type; 3071 src_addr = aa_loc_addr(a, *value, 0); 3072 src_addr.base_type = value->type; 3073 memset(&access, 0, sizeof access); 3074 access.type = value->type; 3075 access.size = (u32)cg_type_size(t->c, value->type); 3076 access.align = type_align32(t, value->type); 3077 aa_copy_bytes(t, dst_addr, src_addr, access); 3078 *out_rets = NULL; 3079 *out_nrets = 0; 3080 return; 3081 } 3082 if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) { 3083 u32 ni = 0, nf = 0; 3084 for (u32 p = 0; p < abi->ret.nparts; ++p) { 3085 const ABIArgPart* part = &abi->ret.parts[p]; 3086 NativeAllocClass cls = 3087 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 3088 KitCgTypeId pty = aa_part_scalar_type(part); 3089 rets[nr].src = *value; 3090 if (rets[nr].src.kind == NATIVE_LOC_FRAME) 3091 rets[nr].src = 3092 native_loc_stack(pty, value->v.frame, (i32)part->src_offset); 3093 else if (rets[nr].src.kind == NATIVE_LOC_STACK) { 3094 rets[nr].src.v.stack.offset += (i32)part->src_offset; 3095 rets[nr].src.type = pty; 3096 } else if (rets[nr].src.kind == NATIVE_LOC_ADDR) { 3097 rets[nr].src.v.addr.offset += (i32)part->src_offset; 3098 rets[nr].src.type = pty; 3099 } 3100 rets[nr].dst = 3101 native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? 
nf++ : ni++); 3102 rets[nr].mem = aa_mem_for_type(t, pty, part->size); 3103 nr++; 3104 } 3105 } else if (value) { 3106 rets[0].src = *value; 3107 rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, 0); 3108 rets[0].mem = aa_mem_for_type(t, value->type, 0); 3109 nr = 1; 3110 } 3111 *out_rets = rets; 3112 *out_nrets = nr; 3113 } 3114 3115 static void aa_ret(NativeTarget* t) { 3116 AANativeTarget* a = aa_of(t); 3117 aa_jump(t, a->epilogue_label); 3118 } 3119 3120 static u32 aa_bit_storage_reg_bits(u32 storage_bytes) { 3121 return storage_bytes == 8u ? 64u : 32u; 3122 } 3123 3124 static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { 3125 u32 bits = sf ? 64u : 32u; 3126 if (!sh) { 3127 if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); 3128 return; 3129 } 3130 aa_emit32(t->mc, aa_ubfm(sf, rd, rn, bits - sh, bits - 1u - sh)); 3131 } 3132 3133 static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { 3134 if (!sh) { 3135 if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); 3136 return; 3137 } 3138 aa_emit32(t->mc, aa_ubfm(sf, rd, rn, sh, sf ? 63u : 31u)); 3139 } 3140 3141 static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh) { 3142 if (!sh) { 3143 if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn)); 3144 return; 3145 } 3146 aa_emit32(t->mc, aa_sbfm(sf, rd, rn, sh, sf ? 63u : 31u)); 3147 } 3148 3149 static void aa_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 3150 BitFieldAccess bf) { 3151 u32 storage = bf.storage.size ? bf.storage.size : 4u; 3152 u32 bits = aa_bit_storage_reg_bits(storage); 3153 u32 width = bf.bit_width ? bf.bit_width : 1u; 3154 u32 sf = bits == 64u; 3155 NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset); 3156 NativeLoc tmp = dst; 3157 tmp.type = bf.storage.type ? 
bf.storage.type : dst.type; 3158 aa_load_native(t, tmp, saddr, bf.storage); 3159 aa_lsl_imm(t, sf, loc_reg(dst), loc_reg(dst), 3160 bits - (u32)bf.bit_offset - width); 3161 if (bf.signed_) 3162 aa_asr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width); 3163 else 3164 aa_lsr_imm(t, sf, loc_reg(dst), loc_reg(dst), bits - width); 3165 } 3166 3167 static void aa_bitfield_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 3168 BitFieldAccess bf) { 3169 u32 storage = bf.storage.size ? bf.storage.size : 4u; 3170 u32 bits = aa_bit_storage_reg_bits(storage); 3171 u32 width = bf.bit_width ? bf.bit_width : 1u; 3172 u32 sf = bits == 64u; 3173 u64 ones = width >= 64u ? ~(u64)0 : ((1ull << width) - 1ull); 3174 u64 field_mask = ones << bf.bit_offset; 3175 NativeAddr saddr = aa_addr_plus(addr, bf.storage_offset); 3176 NativeLoc word = 3177 aa_tmp_loc(bf.storage.type ? bf.storage.type : src.type, AA_TMP0); 3178 aa_load_native(t, word, saddr, bf.storage); 3179 aa_emit_load_imm(t->mc, sf, AA_TMP1, (i64)~field_mask); 3180 aa_emit32(t->mc, aa64_and(sf, AA_TMP0, AA_TMP0, AA_TMP1)); 3181 aa_emit32(t->mc, aa_ubfm(sf, AA_TMP1, loc_reg(src), 0, width - 1u)); 3182 aa_lsl_imm(t, sf, AA_TMP1, AA_TMP1, bf.bit_offset); 3183 aa_emit32(t->mc, aa64_orr(sf, AA_TMP0, AA_TMP0, AA_TMP1)); 3184 aa_store_native(t, saddr, word, bf.storage); 3185 } 3186 3187 static void aa_trap(NativeTarget* t); 3188 3189 static int aa_order_acquire(KitCgMemOrder order) { 3190 return order == KIT_CG_MO_CONSUME || order == KIT_CG_MO_ACQUIRE || 3191 order == KIT_CG_MO_ACQ_REL || order == KIT_CG_MO_SEQ_CST; 3192 } 3193 3194 static int aa_order_release(KitCgMemOrder order) { 3195 return order == KIT_CG_MO_RELEASE || order == KIT_CG_MO_ACQ_REL || 3196 order == KIT_CG_MO_SEQ_CST; 3197 } 3198 3199 static NativeLoc aa_i64_reg_loc(u32 reg) { 3200 return native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); 3201 } 3202 3203 static void aa_atomic_addr_reg(NativeTarget* t, NativeAddr addr, u32 reg) { 3204 
NativeLoc dst = aa_i64_reg_loc(reg); 3205 t->load_addr(t, dst, addr); 3206 } 3207 3208 static u32 aa_saved_tmp_pick(u32 a, u32 b, u32 c) { 3209 static const u32 regs[] = {11u, 12u, 13u, 14u, 15u}; 3210 for (u32 i = 0; i < sizeof regs / sizeof regs[0]; ++i) { 3211 if (regs[i] != a && regs[i] != b && regs[i] != c) return regs[i]; 3212 } 3213 return 15u; 3214 } 3215 3216 static void aa_saved_tmp_spill(AANativeTarget* a, u32 reg) { 3217 NativeFrameSlotDesc sd; 3218 NativeAddr addr; 3219 MemAccess mem; 3220 memset(&sd, 0, sizeof sd); 3221 if (a->saved_tmp_slot == NATIVE_FRAME_SLOT_NONE) { 3222 sd.type = builtin_id(KIT_CG_BUILTIN_I64); 3223 sd.size = 8; 3224 sd.align = 8; 3225 sd.kind = NATIVE_FRAME_SLOT_SPILL; 3226 a->saved_tmp_slot = a->base.frame_slot(&a->base, &sd); 3227 } 3228 memset(&addr, 0, sizeof addr); 3229 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3230 addr.base.frame = a->saved_tmp_slot; 3231 addr.base_type = builtin_id(KIT_CG_BUILTIN_I64); 3232 mem = aa_mem_for_type(&a->base, addr.base_type, 8); 3233 aa_store_native(&a->base, addr, aa_i64_reg_loc(reg), mem); 3234 } 3235 3236 static void aa_saved_tmp_restore(AANativeTarget* a, u32 reg) { 3237 NativeAddr addr; 3238 MemAccess mem; 3239 memset(&addr, 0, sizeof addr); 3240 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3241 addr.base.frame = a->saved_tmp_slot; 3242 addr.base_type = builtin_id(KIT_CG_BUILTIN_I64); 3243 mem = aa_mem_for_type(&a->base, addr.base_type, 8); 3244 aa_load_native(&a->base, aa_i64_reg_loc(reg), addr, mem); 3245 } 3246 3247 static void aa_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 3248 MemAccess mem, KitCgMemOrder order) { 3249 u32 base = AA_TMP0; 3250 u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type)); 3251 aa_atomic_addr_reg(t, addr, base); 3252 aa_emit32(t->mc, aa_order_acquire(order) 3253 ? 
aa_ldar(sz, loc_reg(dst), base) 3254 : aa_ldr_uimm(sz, loc_reg(dst), base, 0)); 3255 if (order == KIT_CG_MO_SEQ_CST) 3256 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3257 } 3258 3259 static void aa_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 3260 MemAccess mem, KitCgMemOrder order) { 3261 u32 base = AA_TMP0; 3262 u32 sz = size_idx(mem.size ? mem.size : type_size32(t, src.type)); 3263 if (order == KIT_CG_MO_SEQ_CST) 3264 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3265 aa_atomic_addr_reg(t, addr, base); 3266 aa_emit32(t->mc, aa_order_release(order) 3267 ? aa_stlr(sz, loc_reg(src), base) 3268 : aa_str_uimm(sz, loc_reg(src), base, 0)); 3269 if (order == KIT_CG_MO_SEQ_CST) 3270 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3271 } 3272 3273 static void aa_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst, 3274 NativeAddr addr, NativeLoc val, MemAccess mem, 3275 KitCgMemOrder order) { 3276 AANativeTarget* a = aa_of(t); 3277 u32 base = AA_TMP0; 3278 u32 next_reg = AA_TMP1; 3279 u32 status = aa_saved_tmp_pick(loc_reg(dst), loc_reg(val), base); 3280 NativeLoc next = aa_tmp_loc(dst.type, next_reg); 3281 MCLabel retry = t->mc->label_new(t->mc); 3282 u32 sz = size_idx(mem.size ? mem.size : type_size32(t, dst.type)); 3283 if (order == KIT_CG_MO_SEQ_CST) 3284 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3285 aa_saved_tmp_spill(a, status); 3286 aa_atomic_addr_reg(t, addr, base); 3287 t->mc->label_place(t->mc, retry); 3288 aa_emit32(t->mc, aa_order_acquire(order) ? 
aa_ldaxr(sz, loc_reg(dst), base) 3289 : aa_ldxr(sz, loc_reg(dst), base)); 3290 switch (op) { 3291 case KIT_CG_ATOMIC_XCHG: 3292 aa_move(t, next, val); 3293 break; 3294 case KIT_CG_ATOMIC_ADD: 3295 aa_binop(t, BO_IADD, next, dst, val); 3296 break; 3297 case KIT_CG_ATOMIC_SUB: 3298 aa_binop(t, BO_ISUB, next, dst, val); 3299 break; 3300 case KIT_CG_ATOMIC_AND: 3301 aa_binop(t, BO_AND, next, dst, val); 3302 break; 3303 case KIT_CG_ATOMIC_OR: 3304 aa_binop(t, BO_OR, next, dst, val); 3305 break; 3306 case KIT_CG_ATOMIC_XOR: 3307 aa_binop(t, BO_XOR, next, dst, val); 3308 break; 3309 case KIT_CG_ATOMIC_NAND: 3310 aa_binop(t, BO_AND, next, dst, val); 3311 aa_unop(t, UO_BNOT, next, next); 3312 break; 3313 default: 3314 aa_panic(a, "unsupported atomic rmw op"); 3315 } 3316 aa_emit32(t->mc, aa_order_release(order) 3317 ? aa_stlxr(sz, status, next_reg, base) 3318 : aa_stxr(sz, status, next_reg, base)); 3319 aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0)); 3320 t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0); 3321 aa_saved_tmp_restore(a, status); 3322 if (order == KIT_CG_MO_SEQ_CST) 3323 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3324 } 3325 3326 static void aa_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, 3327 NativeAddr addr, NativeLoc expected, 3328 NativeLoc desired, MemAccess mem, 3329 KitCgMemOrder success, KitCgMemOrder failure) { 3330 u32 base = AA_TMP0; 3331 u32 status = AA_TMP1; 3332 u32 sz = size_idx(mem.size ? 
mem.size : type_size32(t, prior.type)); 3333 u32 sf = sz == 3u; 3334 int acquire = aa_order_acquire(success) || aa_order_acquire(failure); 3335 int release = aa_order_release(success); 3336 MCLabel retry = t->mc->label_new(t->mc); 3337 MCLabel fail = t->mc->label_new(t->mc); 3338 MCLabel done = t->mc->label_new(t->mc); 3339 if (success == KIT_CG_MO_SEQ_CST || failure == KIT_CG_MO_SEQ_CST) 3340 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3341 aa_atomic_addr_reg(t, addr, base); 3342 t->mc->label_place(t->mc, retry); 3343 aa_emit32(t->mc, acquire ? aa_ldaxr(sz, loc_reg(prior), base) 3344 : aa_ldxr(sz, loc_reg(prior), base)); 3345 aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(prior), loc_reg(expected))); 3346 aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_NE)})); 3347 t->mc->emit_label_ref(t->mc, fail, R_AARCH64_CONDBR19, 4, 0); 3348 aa_emit32(t->mc, release ? aa_stlxr(sz, status, loc_reg(desired), base) 3349 : aa_stxr(sz, status, loc_reg(desired), base)); 3350 aa_emit32(t->mc, aa64_cbnz_imm(0, status, 0)); 3351 t->mc->emit_label_ref(t->mc, retry, R_AARCH64_CONDBR19, 4, 0); 3352 aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 1); 3353 aa_jump(t, done); 3354 t->mc->label_place(t->mc, fail); 3355 aa_emit32(t->mc, aa64_clrex(AA64_BARRIER_OPT_SY)); 3356 aa_emit_load_imm(t->mc, loc_is_64(t, ok), loc_reg(ok), 0); 3357 t->mc->label_place(t->mc, done); 3358 if (success == KIT_CG_MO_SEQ_CST || failure == KIT_CG_MO_SEQ_CST) 3359 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3360 } 3361 3362 static void aa_fence(NativeTarget* t, KitCgMemOrder order) { 3363 if (order != KIT_CG_MO_RELAXED) 3364 aa_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); 3365 } 3366 3367 /* Map a KitCgBarrierScope (passed as an immediate arg to DMB/DSB) onto an 3368 * AArch64 barrier domain option. Defaults to full-system (SY) when the scope 3369 * is absent or unrecognized. 
*/ 3370 static u32 aa_barrier_opt(const NativeLoc* args, u32 narg) { 3371 if (narg < 1u || args[0].kind != NATIVE_LOC_IMM) return AA64_BARRIER_OPT_SY; 3372 switch ((KitCgBarrierScope)args[0].v.imm) { 3373 case KIT_CG_BARRIER_FULL: 3374 return AA64_BARRIER_OPT_SY; 3375 case KIT_CG_BARRIER_INNER: 3376 return AA64_BARRIER_OPT_ISH; 3377 case KIT_CG_BARRIER_INNER_STORE: 3378 return AA64_BARRIER_OPT_ISHST; 3379 case KIT_CG_BARRIER_OUTER: 3380 return AA64_BARRIER_OPT_OSH; 3381 case KIT_CG_BARRIER_OUTER_STORE: 3382 return AA64_BARRIER_OPT_OSHST; 3383 case KIT_CG_BARRIER_NON_SHARE: 3384 return AA64_BARRIER_OPT_NSH; 3385 } 3386 return AA64_BARRIER_OPT_SY; 3387 } 3388 3389 static void aa_intrinsic(NativeTarget* t, IntrinKind kind, 3390 const NativeLoc* dsts, u32 ndst, const NativeLoc* args, 3391 u32 narg) { 3392 AggregateAccess access; 3393 NativeAddr dst_addr; 3394 NativeAddr src_addr; 3395 memset(&access, 0, sizeof access); 3396 memset(&dst_addr, 0, sizeof dst_addr); 3397 memset(&src_addr, 0, sizeof src_addr); 3398 switch (kind) { 3399 case INTRIN_NONE: 3400 if (ndst == 1u && narg == 3u && native_loc_is_fp(dsts[0])) { 3401 u32 d = type_size32(t, dsts[0].type) == 8u; 3402 aa_emit32(t->mc, aa_fp_bin(0x000800u, d, loc_reg(dsts[0]), 3403 loc_reg(args[0]), loc_reg(args[1]))); 3404 aa_emit32(t->mc, aa_fp_bin(0x002800u, d, loc_reg(dsts[0]), 3405 loc_reg(dsts[0]), loc_reg(args[2]))); 3406 return; 3407 } 3408 break; 3409 case INTRIN_CLZ: 3410 if (ndst == 1u && narg == 1u) { 3411 aa_emit32(t->mc, aa_clz(loc_is_64(t, args[0]), loc_reg(dsts[0]), 3412 loc_reg(args[0]))); 3413 return; 3414 } 3415 break; 3416 case INTRIN_CTZ: 3417 if (ndst == 1u && narg == 1u) { 3418 u32 sf = loc_is_64(t, args[0]); 3419 aa_emit32(t->mc, aa_rbit(sf, loc_reg(dsts[0]), loc_reg(args[0]))); 3420 aa_emit32(t->mc, aa_clz(sf, loc_reg(dsts[0]), loc_reg(dsts[0]))); 3421 return; 3422 } 3423 break; 3424 case INTRIN_POPCOUNT: 3425 if (ndst == 1u && narg == 1u) { 3426 u32 sf = loc_is_64(t, args[0]); 3427 u32 rd = 
loc_reg(dsts[0]); 3428 u32 rn = loc_reg(args[0]); 3429 MCLabel loop = t->mc->label_new(t->mc); 3430 MCLabel done = t->mc->label_new(t->mc); 3431 aa_emit_load_imm(t->mc, sf, rd, 0); 3432 aa_emit32(t->mc, aa64_mov_reg(sf, AA_TMP0, rn)); 3433 t->mc->label_place(t->mc, loop); 3434 aa_emit32(t->mc, aa64_cbz(sf, AA_TMP0, 0)); 3435 t->mc->emit_label_ref(t->mc, done, R_AARCH64_CONDBR19, 4, 0); 3436 aa_emit_load_imm(t->mc, sf, AA_TMP1, 1); 3437 aa_emit32(t->mc, aa64_and(sf, AA_TMP1, AA_TMP0, AA_TMP1)); 3438 aa_emit32(t->mc, aa64_add(sf, rd, rd, AA_TMP1)); 3439 aa_emit_load_imm(t->mc, sf, AA_TMP1, 1); 3440 aa_emit32(t->mc, aa64_lsrv(sf, AA_TMP0, AA_TMP0, AA_TMP1)); 3441 aa_jump(t, loop); 3442 t->mc->label_place(t->mc, done); 3443 return; 3444 } 3445 break; 3446 case INTRIN_BSWAP: 3447 if (ndst == 1u && narg == 1u) { 3448 u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); 3449 switch (width) { 3450 case 2: { 3451 u32 sf = 0; 3452 aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); 3453 aa_emit_load_imm(t->mc, 0, AA_TMP0, 16); 3454 aa_emit32(t->mc, aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), 3455 AA_TMP0)); 3456 return; 3457 } 3458 case 4: { 3459 u32 sf = 0; 3460 aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); 3461 return; 3462 } 3463 case 8: { 3464 u32 sf = 1; 3465 aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); 3466 return; 3467 } 3468 default: 3469 break; 3470 } 3471 } 3472 break; 3473 case INTRIN_SADD_OVERFLOW: 3474 case INTRIN_UADD_OVERFLOW: 3475 case INTRIN_SSUB_OVERFLOW: 3476 case INTRIN_USUB_OVERFLOW: 3477 if (ndst == 2u && narg == 2u) { 3478 u32 sf = loc_is_64(t, dsts[0]); 3479 u32 rd = loc_reg(dsts[0]); 3480 if (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW) 3481 aa_emit32(t->mc, 3482 aa64_addsubsr_pack((AA64AddSubSR){.sf = sf, 3483 .op = 0, 3484 .S = 1, 3485 .Rm = loc_reg(args[1]), 3486 .Rn = loc_reg(args[0]), 3487 .Rd = rd})); 3488 else 3489 aa_emit32(t->mc, 3490 
aa64_addsubsr_pack((AA64AddSubSR){.sf = sf, 3491 .op = 1, 3492 .S = 1, 3493 .Rm = loc_reg(args[1]), 3494 .Rn = loc_reg(args[0]), 3495 .Rd = rd})); 3496 aa_emit32(t->mc, 3497 aa_cset(loc_is_64(t, dsts[1]), loc_reg(dsts[1]), 3498 (kind == INTRIN_SADD_OVERFLOW || 3499 kind == INTRIN_SSUB_OVERFLOW) 3500 ? 0x6u 3501 : (kind == INTRIN_UADD_OVERFLOW ? 0x2u : 0x3u))); 3502 return; 3503 } 3504 break; 3505 case INTRIN_SMUL_OVERFLOW: 3506 case INTRIN_UMUL_OVERFLOW: 3507 if (ndst == 2u && narg == 2u) { 3508 u32 sf = loc_is_64(t, dsts[0]); 3509 if (sf) { 3510 if (kind == INTRIN_SMUL_OVERFLOW) { 3511 aa_emit32(t->mc, 3512 aa_smulh(AA_TMP0, loc_reg(args[0]), loc_reg(args[1]))); 3513 aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]), 3514 loc_reg(args[1]))); 3515 aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 63, 63)); 3516 aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1)); 3517 aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); 3518 } else { 3519 aa_emit32(t->mc, 3520 aa_umulh(AA_TMP0, loc_reg(args[0]), loc_reg(args[1]))); 3521 aa_emit32(t->mc, aa64_mul(1, loc_reg(dsts[0]), loc_reg(args[0]), 3522 loc_reg(args[1]))); 3523 aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA64_ZR)); 3524 aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); 3525 } 3526 } else if (kind == INTRIN_SMUL_OVERFLOW) { 3527 aa_emit32(t->mc, aa_smaddl(AA_TMP0, loc_reg(args[0]), 3528 loc_reg(args[1]), AA64_ZR)); 3529 aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0)); 3530 aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, loc_reg(dsts[0]), 0, 31)); 3531 aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP0, AA_TMP1)); 3532 aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); 3533 } else { 3534 aa_emit32(t->mc, aa_umaddl(AA_TMP0, loc_reg(args[0]), 3535 loc_reg(args[1]), AA64_ZR)); 3536 aa_emit32(t->mc, aa64_mov_reg(0, loc_reg(dsts[0]), AA_TMP0)); 3537 aa_emit_load_imm(t->mc, 1, AA_TMP1, 32); 3538 aa_emit32(t->mc, aa64_lsrv(1, AA_TMP1, AA_TMP0, 
AA_TMP1)); 3539 aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, AA_TMP1, AA64_ZR)); 3540 aa_emit32(t->mc, aa_cset(0, loc_reg(dsts[1]), cmp_cond(CMP_NE))); 3541 } 3542 return; 3543 } 3544 break; 3545 case INTRIN_MEMMOVE: { 3546 MCLabel forward = t->mc->label_new(t->mc); 3547 MCLabel done = t->mc->label_new(t->mc); 3548 if (narg != 3u || args[0].kind != NATIVE_LOC_REG || 3549 args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) 3550 aa_panic(aa_of(t), "unsupported memory intrinsic operands"); 3551 if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) 3552 aa_panic(aa_of(t), "unsupported memory intrinsic size"); 3553 access.size = (u32)args[2].v.imm; 3554 access.align = 1u; 3555 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 3556 dst_addr.base.reg = args[0].v.reg; 3557 src_addr.base_kind = NATIVE_ADDR_BASE_REG; 3558 src_addr.base.reg = args[1].v.reg; 3559 aa_emit32(t->mc, aa_subs_reg(1, AA64_ZR, args[0].v.reg, args[1].v.reg)); 3560 aa_emit32(t->mc, 3561 aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_LT_U)})); 3562 t->mc->emit_label_ref(t->mc, forward, R_AARCH64_CONDBR19, 4, 0); 3563 aa_copy_bytes_dir(t, dst_addr, src_addr, access, 1); 3564 aa_jump(t, done); 3565 t->mc->label_place(t->mc, forward); 3566 aa_copy_bytes_dir(t, dst_addr, src_addr, access, 0); 3567 t->mc->label_place(t->mc, done); 3568 return; 3569 } 3570 case INTRIN_EXPECT: 3571 case INTRIN_ASSUME_ALIGNED: 3572 if (ndst == 1u && narg >= 1u) { 3573 if (args[0].kind == NATIVE_LOC_IMM) 3574 aa_load_imm_native(t, dsts[0], args[0].v.imm); 3575 else 3576 aa_move(t, dsts[0], args[0]); 3577 } 3578 return; 3579 case INTRIN_PREFETCH: 3580 return; 3581 case INTRIN_TRAP: 3582 aa_trap(t); 3583 return; 3584 case INTRIN_SYSCALL: 3585 if (ndst == 1u && narg >= 1u && narg <= 7u) { 3586 static const u32 syscall_regs[7] = {AA_X8, 0u, 1u, 2u, 3u, 4u, 5u}; 3587 AAArgMove moves[7]; 3588 for (u32 i = 0; i < narg; ++i) { 3589 AAArgMove* m = &moves[i]; 3590 memset(m, 0, sizeof *m); 3591 m->dst = 3592 
native_loc_reg(dsts[0].type, NATIVE_REG_INT, syscall_regs[i]); 3593 m->src = args[i]; 3594 m->size = t->c->target.ptr_size; 3595 } 3596 aa_emit_reg_arg_moves(t, moves, narg); 3597 aa_emit32(t->mc, aa64_svc(0)); 3598 aa_move(t, dsts[0], native_loc_reg(dsts[0].type, NATIVE_REG_INT, 0)); 3599 } 3600 return; 3601 case INTRIN_CPU_NOP: 3602 aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_NOP)); 3603 return; 3604 case INTRIN_CPU_YIELD: 3605 aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_YIELD)); 3606 return; 3607 case INTRIN_WFI: 3608 aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_WFI)); 3609 return; 3610 case INTRIN_WFE: 3611 aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_WFE)); 3612 return; 3613 case INTRIN_SEV: 3614 aa_emit32(t->mc, aa64_hint(AA64_HINT_OP_SEV)); 3615 return; 3616 case INTRIN_ISB: 3617 aa_emit32(t->mc, aa64_isb(AA64_BARRIER_OPT_SY)); 3618 return; 3619 case INTRIN_DMB: 3620 aa_emit32(t->mc, aa64_dmb(aa_barrier_opt(args, narg))); 3621 return; 3622 case INTRIN_DSB: 3623 aa_emit32(t->mc, aa64_dsb(aa_barrier_opt(args, narg))); 3624 return; 3625 case INTRIN_IRQ_SAVE: 3626 /* Read the interrupt-mask state, then mask D,A,I,F. */ 3627 if (ndst == 1u) { 3628 aa_emit32(t->mc, aa64_mrs_daif(loc_reg(dsts[0]))); 3629 aa_emit32(t->mc, aa64_msr_daifset(AA64_DAIF_ALL)); 3630 } 3631 return; 3632 case INTRIN_IRQ_RESTORE: 3633 if (narg == 1u) aa_emit32(t->mc, aa64_msr_daif(loc_reg(args[0]))); 3634 return; 3635 case INTRIN_IRQ_DISABLE: 3636 aa_emit32(t->mc, aa64_msr_daifset(AA64_DAIF_ALL)); 3637 return; 3638 case INTRIN_IRQ_ENABLE: 3639 aa_emit32(t->mc, aa64_msr_daifclr(AA64_DAIF_ALL)); 3640 return; 3641 case INTRIN_FRAME_ADDRESS: 3642 case INTRIN_RETURN_ADDRESS: 3643 /* Walk the AAPCS64 frame-record chain. Every kit prologue stores 3644 * {x29, x30} and anchors x29 at the record: [x29] = caller's x29, 3645 * [x29 + 8] = saved x30 (this frame's return address). The level is a 3646 * compile-time constant, so the walk unrolls to `level` dependent loads. 
*/ 3647 if (ndst == 1u) { 3648 u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) 3649 ? (u32)args[0].v.imm 3650 : 0u; 3651 u32 rd = loc_reg(dsts[0]); 3652 aa_emit32(t->mc, aa64_mov_reg(1, rd, AA_FP)); 3653 for (u32 i = 0; i < level; ++i) 3654 aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 0)); /* rd = *(rd) */ 3655 if (kind == INTRIN_RETURN_ADDRESS) 3656 aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 1)); /* rd = *(rd + 8) */ 3657 } 3658 return; 3659 default: 3660 aa_panic(aa_of(t), "unsupported compiler intrinsic"); 3661 } 3662 } 3663 3664 static void aa_trap(NativeTarget* t) { aa_emit32(t->mc, aa64_brk(0)); } 3665 3666 /* file_scope_asm + finalize are shared (cg/native_asm.h). */ 3667 3668 static int aa_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, 3669 u32 mask[NATIVE_CALL_PLAN_CLASSES]) { 3670 mask[0] = mask[1] = mask[2] = 0; 3671 if ((NativeMachineOpKind)op->kind == NATIVE_MOP_TLS_ADDR) { 3672 /* ELF Local-Exec materializes the address using only the destination 3673 * register (mrs tpidr_el0 + add/add into rd) — no extra clobbers. The 3674 * Mach-O TLV sequence loads the descriptor into x0 and calls the resolver 3675 * thunk through x16, clobbering x0/x16/x17 and the link register; the JIT 3676 * relaxation of that same sequence keeps the x0/x16/x17 footprint. Model 3677 * the descriptor-model clobbers so a value live across a TLS access is not 3678 * left in one of these registers. 
*/ 3679 if (!obj_format_tls_via_descriptor(t->c)) return 0; 3680 mask[NATIVE_REG_INT] = 3681 (1u << 0) | (1u << 16) | (1u << 17) | (1u << AA_LR); 3682 return 1; 3683 } 3684 if ((NativeMachineOpKind)op->kind != NATIVE_MOP_INTRINSIC || 3685 (IntrinKind)op->intrin != INTRIN_SYSCALL) 3686 return 0; 3687 mask[NATIVE_REG_INT] = (1u << 0) | (1u << 1) | (1u << 2) | (1u << 3) | 3688 (1u << 4) | (1u << 5) | (1u << AA_X8); 3689 return 1; 3690 } 3691 3692 static void aa_set_loc(NativeTarget* t, SrcLoc loc) { 3693 AANativeTarget* a = aa_of(t); 3694 a->loc = loc; 3695 if (t->mc && t->mc->set_loc) t->mc->set_loc(t->mc, loc); 3696 } 3697 3698 static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, 3699 NativeLoc dst); 3700 3701 /* Caller-saved allocables come first so the allocator prefers them (lower 3702 * spill_cost); callee-saved x19..x28 / v8..v15 are appended and only chosen 3703 * under register pressure, after which the prologue saves/restores them. */ 3704 static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u, 19u, 20u, 3705 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u}; 3706 /* Three int scratch registers, not two: a 3-operand op (e.g. `binop dst, a, b` 3707 * or `store [base+index], value`) whose dst/sources all spill needs three 3708 * distinct scratch regs at emit time — the IR spill-rewrite round-robins 3709 * operands across this pool and the native emitter materializes each into one. 3710 * With only two, an immediate operand of an all-spilled binop had nowhere to 3711 * land. x9/x10/x11 are all caller-saved temporaries reserved out of the 3712 * allocable set below. 
*/ 3713 static const Reg aa_int_scratch[] = {9u, 10u, 11u}; 3714 static const Reg aa_fp_allocable[] = {18u, 19u, 8u, 9u, 10u, 3715 11u, 12u, 13u, 14u, 15u}; 3716 static const Reg aa_fp_scratch[] = {20u, 21u}; 3717 3718 #define AA_PHYS_INT_ALLOC(r) \ 3719 {.reg = (r), \ 3720 .cls = NATIVE_REG_INT, \ 3721 .abi_index = 0xffu, \ 3722 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 3723 .spill_cost = 1u, \ 3724 .copy_cost = 1u} 3725 #define AA_PHYS_INT_CALLER(r) \ 3726 {.reg = (r), \ 3727 .cls = NATIVE_REG_INT, \ 3728 .abi_index = 0xffu, \ 3729 .flags = NATIVE_REG_CALLER_SAVED, \ 3730 .spill_cost = 1u, \ 3731 .copy_cost = 1u} 3732 #define AA_PHYS_INT_ARG(r) \ 3733 {.reg = (r), \ 3734 .cls = NATIVE_REG_INT, \ 3735 .abi_index = (r), \ 3736 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 3737 ((r) < 2u ? NATIVE_REG_RET : 0), \ 3738 .spill_cost = 1u, \ 3739 .copy_cost = 1u} 3740 #define AA_PHYS_INT_CALLEE(r) \ 3741 {.reg = (r), \ 3742 .cls = NATIVE_REG_INT, \ 3743 .abi_index = 0xffu, \ 3744 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 3745 .spill_cost = 4u, \ 3746 .copy_cost = 1u} 3747 #define AA_PHYS_INT_RESERVED(r) \ 3748 {.reg = (r), \ 3749 .cls = NATIVE_REG_INT, \ 3750 .abi_index = 0xffu, \ 3751 .flags = NATIVE_REG_RESERVED, \ 3752 .spill_cost = 0u, \ 3753 .copy_cost = 0u} 3754 3755 static const NativePhysRegInfo aa_int_phys[] = { 3756 AA_PHYS_INT_ARG(0u), AA_PHYS_INT_ARG(1u), 3757 AA_PHYS_INT_ARG(2u), AA_PHYS_INT_ARG(3u), 3758 AA_PHYS_INT_ARG(4u), AA_PHYS_INT_ARG(5u), 3759 AA_PHYS_INT_ARG(6u), AA_PHYS_INT_ARG(7u), 3760 AA_PHYS_INT_ALLOC(8u), AA_PHYS_INT_RESERVED(9u), 3761 AA_PHYS_INT_RESERVED(10u), AA_PHYS_INT_RESERVED(11u), 3762 AA_PHYS_INT_ALLOC(12u), AA_PHYS_INT_ALLOC(13u), 3763 AA_PHYS_INT_ALLOC(14u), AA_PHYS_INT_ALLOC(15u), 3764 AA_PHYS_INT_RESERVED(16u), AA_PHYS_INT_RESERVED(17u), 3765 AA_PHYS_INT_RESERVED(18u), AA_PHYS_INT_CALLEE(19u), 3766 AA_PHYS_INT_CALLEE(20u), AA_PHYS_INT_CALLEE(21u), 3767 AA_PHYS_INT_CALLEE(22u), 
AA_PHYS_INT_CALLEE(23u), 3768 AA_PHYS_INT_CALLEE(24u), AA_PHYS_INT_CALLEE(25u), 3769 AA_PHYS_INT_CALLEE(26u), AA_PHYS_INT_CALLEE(27u), 3770 AA_PHYS_INT_CALLEE(28u), AA_PHYS_INT_RESERVED(29u), 3771 AA_PHYS_INT_RESERVED(30u), AA_PHYS_INT_RESERVED(31u), 3772 }; 3773 3774 #define AA_PHYS_FP_ALLOC(r) \ 3775 {.reg = (r), \ 3776 .cls = NATIVE_REG_FP, \ 3777 .abi_index = 0xffu, \ 3778 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 3779 .spill_cost = 1u, \ 3780 .copy_cost = 1u} 3781 #define AA_PHYS_FP_CALLER(r) \ 3782 {.reg = (r), \ 3783 .cls = NATIVE_REG_FP, \ 3784 .abi_index = 0xffu, \ 3785 .flags = NATIVE_REG_CALLER_SAVED, \ 3786 .spill_cost = 1u, \ 3787 .copy_cost = 1u} 3788 #define AA_PHYS_FP_ARG(r) \ 3789 {.reg = (r), \ 3790 .cls = NATIVE_REG_FP, \ 3791 .abi_index = (r), \ 3792 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 3793 ((r) < 4u ? NATIVE_REG_RET : 0), \ 3794 .spill_cost = 1u, \ 3795 .copy_cost = 1u} 3796 #define AA_PHYS_FP_CALLEE(r) \ 3797 {.reg = (r), \ 3798 .cls = NATIVE_REG_FP, \ 3799 .abi_index = 0xffu, \ 3800 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 3801 .spill_cost = 4u, \ 3802 .copy_cost = 1u} 3803 #define AA_PHYS_FP_RESERVED(r) \ 3804 {.reg = (r), \ 3805 .cls = NATIVE_REG_FP, \ 3806 .abi_index = 0xffu, \ 3807 .flags = NATIVE_REG_RESERVED, \ 3808 .spill_cost = 0u, \ 3809 .copy_cost = 0u} 3810 3811 static const NativePhysRegInfo aa_fp_phys[] = { 3812 AA_PHYS_FP_ARG(0u), AA_PHYS_FP_ARG(1u), AA_PHYS_FP_ARG(2u), 3813 AA_PHYS_FP_ARG(3u), AA_PHYS_FP_ARG(4u), AA_PHYS_FP_ARG(5u), 3814 AA_PHYS_FP_ARG(6u), AA_PHYS_FP_ARG(7u), AA_PHYS_FP_CALLEE(8u), 3815 AA_PHYS_FP_CALLEE(9u), AA_PHYS_FP_CALLEE(10u), AA_PHYS_FP_CALLEE(11u), 3816 AA_PHYS_FP_CALLEE(12u), AA_PHYS_FP_CALLEE(13u), AA_PHYS_FP_CALLEE(14u), 3817 AA_PHYS_FP_CALLEE(15u), AA_PHYS_FP_CALLER(16u), AA_PHYS_FP_CALLER(17u), 3818 AA_PHYS_FP_ALLOC(18u), AA_PHYS_FP_ALLOC(19u), AA_PHYS_FP_RESERVED(20u), 3819 AA_PHYS_FP_RESERVED(21u), AA_PHYS_FP_CALLER(22u), 
AA_PHYS_FP_CALLER(23u), 3820 AA_PHYS_FP_CALLER(24u), AA_PHYS_FP_CALLER(25u), AA_PHYS_FP_CALLER(26u), 3821 AA_PHYS_FP_CALLER(27u), AA_PHYS_FP_CALLER(28u), AA_PHYS_FP_CALLER(29u), 3822 AA_PHYS_FP_CALLER(30u), AA_PHYS_FP_CALLER(31u), 3823 }; 3824 3825 static const NativeAllocClassInfo aa_classes[] = { 3826 {.cls = NATIVE_REG_INT, 3827 .allocable = aa_int_allocable, 3828 .nallocable = sizeof aa_int_allocable / sizeof aa_int_allocable[0], 3829 .scratch = aa_int_scratch, 3830 .nscratch = sizeof aa_int_scratch / sizeof aa_int_scratch[0], 3831 .phys = aa_int_phys, 3832 .nphys = sizeof aa_int_phys / sizeof aa_int_phys[0], 3833 .caller_saved_mask = 0x0007ffffu, 3834 .callee_saved_mask = 0x1ff80000u, 3835 .arg_mask = 0x000000ffu, 3836 .ret_mask = 0x00000003u, 3837 .reserved_mask = 3838 (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << AA_FP) | (1u << AA_LR)}, 3839 {.cls = NATIVE_REG_FP, 3840 .allocable = aa_fp_allocable, 3841 .nallocable = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0], 3842 .scratch = aa_fp_scratch, 3843 .nscratch = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0], 3844 .phys = aa_fp_phys, 3845 .nphys = sizeof aa_fp_phys / sizeof aa_fp_phys[0], 3846 /* v8..v15 are callee-saved (low 64 bits per AAPCS64); the rest are 3847 * caller-saved. */ 3848 .caller_saved_mask = 0xffff00ffu, 3849 .callee_saved_mask = 0x0000ff00u, 3850 .arg_mask = 0x000000ffu, 3851 .ret_mask = 0x0000000fu}, 3852 }; 3853 3854 /* Resolve a register name ("x8", "v3", ...) to its (class, Reg). Powers the 3855 * optimizer's inline-asm clobber masks and explicit hard-register operands 3856 * ("{x8}" from a GNU local register variable). x0..x30 are DWARF 0..30; the 3857 * SIMD/FP bank v0..v31 is DWARF 64..95. Returns non-zero for a non-register 3858 * name (cc/memory/unknown), which the caller skips. 
*/ 3859 static int aa_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, 3860 NativeAllocClass* cls_out) { 3861 char buf[16]; 3862 uint32_t dwarf; 3863 (void)ri; 3864 if (!name.s || !name.len || name.len >= sizeof buf) return 1; 3865 memcpy(buf, name.s, name.len); 3866 buf[name.len] = '\0'; 3867 if (aa64_register_index(buf, &dwarf) != 0) return 1; 3868 if (dwarf <= 30u) { 3869 *cls_out = NATIVE_REG_INT; 3870 *out = (Reg)dwarf; 3871 return 0; 3872 } 3873 if (dwarf >= 64u && dwarf <= 95u) { 3874 *cls_out = NATIVE_REG_FP; 3875 *out = (Reg)(dwarf - 64u); 3876 return 0; 3877 } 3878 return 1; 3879 } 3880 3881 static int aa_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, 3882 Reg reg) { 3883 (void)ri; 3884 if (cls == NATIVE_REG_INT) { 3885 if (reg <= 8u) return 1; 3886 if (reg >= 12u && reg <= 15u) return 1; 3887 if (reg >= 19u && reg <= 28u) return 1; 3888 return 0; 3889 } 3890 if (cls == NATIVE_REG_FP) { 3891 if (reg <= 19u) return 1; 3892 if (reg >= 22u && reg <= 31u) return 1; 3893 } 3894 return 0; 3895 } 3896 3897 static int aa_asm_constraint_reg(const NativeRegInfo* ri, const char* body, 3898 NativeAllocClass* cls_out, Reg* fixed_out, 3899 u32* allowed_mask_out) { 3900 (void)ri; 3901 if (!body || !body[0] || body[1]) return 0; 3902 if (fixed_out) *fixed_out = REG_NONE; 3903 if (allowed_mask_out) *allowed_mask_out = 0; 3904 switch (body[0]) { 3905 case 'r': 3906 if (cls_out) *cls_out = NATIVE_REG_INT; 3907 return 1; 3908 case 'w': 3909 if (cls_out) *cls_out = NATIVE_REG_FP; 3910 return 1; 3911 case 'x': 3912 if (cls_out) *cls_out = NATIVE_REG_FP; 3913 if (allowed_mask_out) *allowed_mask_out = 0x0000ffffu; /* v0..v15 */ 3914 return 1; 3915 case 'y': 3916 if (cls_out) *cls_out = NATIVE_REG_FP; 3917 if (allowed_mask_out) *allowed_mask_out = 0x000000ffu; /* v0..v7 */ 3918 return 1; 3919 default: 3920 return 0; 3921 } 3922 } 3923 3924 static const NativeRegInfo aa_reg_info = { 3925 .classes = aa_classes, 3926 .nclasses = sizeof aa_classes / 
sizeof aa_classes[0], 3927 .resolve_name = aa_resolve_name, 3928 .asm_operand_reg_ok = aa_asm_operand_reg_ok, 3929 .asm_constraint_reg = aa_asm_constraint_reg, 3930 }; 3931 3932 static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr); 3933 static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, 3934 KitCgTypeId type); 3935 static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr); 3936 static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr, 3937 NativeLoc src_ap_ptr); 3938 static void aa_asm_block_native(NativeTarget* t, const char* tmpl, 3939 const AsmConstraint* outs, u32 nout, 3940 NativeLoc* out_locs, const AsmConstraint* ins, 3941 u32 nin, const NativeLoc* in_locs, 3942 const Sym* clobbers, u32 nclob); 3943 3944 NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, 3945 MCEmitter* mc) { 3946 AANativeTarget* a = arena_znew(c->tu, AANativeTarget); 3947 NativeTarget* t; 3948 if (!a) return NULL; 3949 t = &a->base; 3950 t->c = c; 3951 t->obj = obj; 3952 t->mc = mc; 3953 native_frame_init(&a->frame, c); 3954 t->regs = &aa_reg_info; 3955 t->class_for_type = aa_class_for_type; 3956 t->imm_legal = aa_imm_legal; 3957 t->addr_legal = aa_addr_legal; 3958 t->machine_op_clobbers = aa_machine_op_clobbers; 3959 t->func_begin = aa_func_begin; 3960 t->func_begin_known_frame = aa_func_begin_known_frame; 3961 t->note_frame_state = aa_note_frame_state; 3962 t->reserve_callee_saves = aa_reserve_callee_saves; 3963 t->signature_stack_bytes = aa_signature_stack_bytes; 3964 t->call_stack_bytes = aa_call_stack_bytes; 3965 t->has_store_zero_reg = 1; 3966 t->store_zero_reg = 31u; /* wzr/xzr in the Rt position of a store */ 3967 t->func_end = aa_func_end; 3968 t->frame_slot = aa_frame_slot; 3969 t->frame_slot_debug_loc = aa_frame_slot_debug_loc; 3970 t->bind_param = aa_bind_native_param; 3971 t->label_new = aa_label_new; 3972 t->label_place = aa_label_place; 3973 t->jump = aa_jump; 3974 t->cmp_branch = aa_cmp_branch; 3975 
t->indirect_branch = aa_indirect_branch; 3976 t->load_label_addr = aa_load_label_addr; 3977 t->move = aa_move; 3978 t->load_imm = aa_load_imm_native; 3979 t->load_const = aa_load_const; 3980 t->load_addr = aa_load_addr; 3981 t->load = aa_load_native; 3982 t->store = aa_store_native; 3983 t->tls_addr_of = aa_tls_addr_of; 3984 t->copy_bytes = aa_copy_bytes; 3985 t->set_bytes = aa_set_bytes; 3986 t->bitfield_load = aa_bitfield_load; 3987 t->bitfield_store = aa_bitfield_store; 3988 t->binop = aa_binop; 3989 t->unop = aa_unop; 3990 t->cmp = aa_cmp; 3991 t->convert = aa_convert; 3992 t->alloca_ = aa_alloca; 3993 t->spill = aa_spill; 3994 t->reload = aa_reload; 3995 t->plan_call = aa_plan_call; 3996 t->emit_call = aa_emit_call; 3997 t->plan_ret = aa_plan_ret; 3998 t->ret = aa_ret; 3999 t->atomic_load = aa_atomic_load; 4000 t->atomic_store = aa_atomic_store; 4001 t->atomic_rmw = aa_atomic_rmw; 4002 t->atomic_cas = aa_atomic_cas; 4003 t->fence = aa_fence; 4004 t->va_start_ = aa_va_start_native; 4005 t->va_arg_ = aa_va_arg_native; 4006 t->va_end_ = aa_va_end_native; 4007 t->va_copy_ = aa_va_copy_native; 4008 t->intrinsic = aa_intrinsic; 4009 t->asm_block = aa_asm_block_native; 4010 t->file_scope_asm = native_file_scope_asm; 4011 t->trap = aa_trap; 4012 t->set_loc = aa_set_loc; 4013 t->finalize = native_finalize; 4014 return t; 4015 } 4016 4017 /* Place the incoming parameter into `dst`: a hard register (the common 4018 * register-allocated scalar case -> a single arg-reg move, or a stack load 4019 * straight into the register), a frame slot (address-taken / aggregate / 4020 * spilled), or nowhere (unused). Incoming arg registers are never allocable, 4021 * so a register dst never aliases an incoming arg register. 
*/ 4022 static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, 4023 NativeLoc dst) { 4024 AANativeTarget* a = aa_of(t); 4025 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 4026 const ABIArgInfo* ai = 4027 p->index < abi->nparams ? &abi->params[p->index] : NULL; 4028 int to_reg = dst.kind == NATIVE_LOC_REG; 4029 if (!ai || ai->kind == ABI_ARG_IGNORE) return; 4030 if (ai->kind == ABI_ARG_INDIRECT) { 4031 NativeAddr d_addr, from; 4032 AggregateAccess access; 4033 NativeLoc src = 4034 native_loc_reg(p->type, NATIVE_REG_INT, 4035 a->next_param_int < 8u ? a->next_param_int++ : AA_TMP0); 4036 if (src.v.reg == AA_TMP0) { 4037 NativeAddr saddr; 4038 memset(&saddr, 0, sizeof saddr); 4039 saddr.base_kind = NATIVE_ADDR_BASE_REG; 4040 saddr.base.reg = AA_FP; 4041 saddr.offset = aa_fp_off_in_arg(a, a->next_param_stack); 4042 aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8)); 4043 a->next_param_stack += 8u; 4044 } 4045 if (dst.kind != NATIVE_LOC_FRAME) 4046 aa_panic(a, "indirect parameter requires a frame destination"); 4047 memset(&d_addr, 0, sizeof d_addr); 4048 d_addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4049 d_addr.base.frame = dst.v.frame; 4050 d_addr.base_type = p->type; 4051 memset(&from, 0, sizeof from); 4052 from.base_kind = NATIVE_ADDR_BASE_REG; 4053 from.base.reg = src.v.reg; 4054 from.base_type = p->type; 4055 memset(&access, 0, sizeof access); 4056 access.type = p->type; 4057 access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); 4058 access.align = p->align ? p->align : type_align32(t, p->type); 4059 aa_copy_bytes(t, d_addr, from, access); 4060 return; 4061 } 4062 for (u32 i = 0; i < ai->nparts; ++i) { 4063 const ABIArgPart* part = &ai->parts[i]; 4064 NativeAllocClass cls = 4065 part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; 4066 int reg_dst = to_reg && (NativeAllocClass)dst.cls == cls; 4067 NativeLoc src; 4068 if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) { 4069 src = native_loc_reg(p->type, cls, a->next_param_fp++); 4070 } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) { 4071 src = native_loc_reg(p->type, cls, a->next_param_int++); 4072 } else { 4073 /* Stack-passed part: load straight into the dst register when possible, 4074 * otherwise a scratch for the store-to-frame path. */ 4075 Reg tmp = 4076 reg_dst ? (Reg)dst.v.reg : (cls == NATIVE_REG_FP ? 16u : AA_TMP0); 4077 NativeAddr saddr; 4078 src = native_loc_reg(p->type, cls, tmp); 4079 a->next_param_stack = 4080 align_up_u32(a->next_param_stack, aa_part_stack_align(abi, part)); 4081 memset(&saddr, 0, sizeof saddr); 4082 saddr.base_kind = NATIVE_ADDR_BASE_REG; 4083 saddr.base.reg = AA_FP; 4084 saddr.base_type = p->type; 4085 saddr.offset = aa_fp_off_in_arg(a, a->next_param_stack); 4086 aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size)); 4087 a->next_param_stack += aa_part_stack_size(abi, part); 4088 } 4089 if (dst.kind == NATIVE_LOC_NONE) { 4090 /* Unused parameter: only the ABI cursor advances. */ 4091 } else if (to_reg) { 4092 NativeLoc d = native_loc_reg(dst.type ? 
dst.type : p->type, 4093 (NativeAllocClass)dst.cls, (Reg)dst.v.reg); 4094 if (!(src.kind == NATIVE_LOC_REG && src.v.reg == d.v.reg && 4095 (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) 4096 aa_move(t, d, src); 4097 } else { 4098 aa_store_part( 4099 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, 4100 0, part->size); 4101 } 4102 } 4103 a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); 4104 } 4105 4106 static void aa_bind_param(NativeDirectTarget* d, const CGParamDesc* p, 4107 CGLocal local, NativeDirectLocal* l) { 4108 NativeLoc dst; 4109 (void)local; 4110 memset(&dst, 0, sizeof dst); 4111 dst.kind = NATIVE_LOC_FRAME; 4112 dst.type = p->type; 4113 dst.v.frame = l->home; 4114 aa_bind_native_param(d->native, p, dst); 4115 } 4116 4117 static const char* aa_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { 4118 NativeCallDesc nd; 4119 NativeLoc* args = NULL; 4120 NativeLoc* results = NULL; 4121 u32 stack; 4122 memset(&nd, 0, sizeof nd); 4123 u32 nresults = call->result != CG_LOCAL_NONE ? 
1u : 0u; 4124 if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); 4125 if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults); 4126 for (u32 i = 0; i < call->nargs; ++i) { 4127 args[i].kind = NATIVE_LOC_FRAME; 4128 args[i].type = d->locals[call->args[i] - 1u].type; 4129 args[i].cls = d->locals[call->args[i] - 1u].cls; 4130 args[i].v.frame = d->locals[call->args[i] - 1u].home; 4131 } 4132 if (nresults) { 4133 results[0].kind = NATIVE_LOC_FRAME; 4134 results[0].type = d->locals[call->result - 1u].type; 4135 results[0].cls = d->locals[call->result - 1u].cls; 4136 results[0].v.frame = d->locals[call->result - 1u].home; 4137 } 4138 nd.fn_type = call->fn_type; 4139 nd.args = args; 4140 nd.results = results; 4141 nd.nargs = call->nargs; 4142 nd.nresults = nresults; 4143 stack = aa_call_stack_size(d->native, &nd); 4144 if (stack > aa_of(d->native)->incoming_stack_size) 4145 return "aarch64 tail call: stack argument area too small"; 4146 return NULL; 4147 } 4148 4149 static NativeAddr aa_direct_addr(NativeDirectTarget* d, Operand op) { 4150 NativeAddr addr; 4151 memset(&addr, 0, sizeof addr); 4152 switch ((OpKind)op.kind) { 4153 case OPK_LOCAL: 4154 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4155 addr.base.frame = d->locals[op.v.local - 1u].home; 4156 addr.base_type = op.type; 4157 return addr; 4158 case OPK_INDIRECT: 4159 addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; 4160 addr.base.frame = d->locals[op.v.ind.base - 1u].home; 4161 addr.cls = d->locals[op.v.ind.base - 1u].cls; 4162 addr.base_type = d->locals[op.v.ind.base - 1u].type; 4163 addr.offset = op.v.ind.ofs; 4164 return addr; 4165 default: 4166 compiler_panic(d->base.c, d->loc, 4167 "aarch64 native target: operand is not addressable"); 4168 } 4169 } 4170 4171 static NativeAddr aa_direct_materialize_addr(NativeDirectTarget* d, 4172 Operand op) { 4173 NativeAddr addr = aa_direct_addr(d, op); 4174 if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 4175 NativeLoc base = 
native_loc_reg(addr.base_type, NATIVE_REG_INT, AA_TMP1); 4176 NativeAddr load; 4177 memset(&load, 0, sizeof load); 4178 load.base_kind = NATIVE_ADDR_BASE_FRAME; 4179 load.base.frame = addr.base.frame; 4180 load.base_type = addr.base_type; 4181 aa_emit_mem(aa_of(d->native), 1, base, load, 4182 aa_mem_for_type(d->native, addr.base_type, 8)); 4183 addr.base_kind = NATIVE_ADDR_BASE_REG; 4184 addr.base.reg = AA_TMP1; 4185 } 4186 return addr; 4187 } 4188 4189 static NativeAddr aa_direct_pointer_addr(NativeDirectTarget* d, Operand op) { 4190 NativeAddr addr; 4191 memset(&addr, 0, sizeof addr); 4192 if (op.kind == OPK_LOCAL) { 4193 NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, AA_TMP1); 4194 NativeAddr load; 4195 memset(&load, 0, sizeof load); 4196 load.base_kind = NATIVE_ADDR_BASE_FRAME; 4197 load.base.frame = d->locals[op.v.local - 1u].home; 4198 load.base_type = op.type; 4199 aa_emit_mem(aa_of(d->native), 1, base, load, 4200 aa_mem_for_type(d->native, op.type, 8)); 4201 addr.base_kind = NATIVE_ADDR_BASE_REG; 4202 addr.base.reg = AA_TMP1; 4203 addr.base_type = op.type; 4204 return addr; 4205 } 4206 return aa_direct_materialize_addr(d, op); 4207 } 4208 4209 static NativeAddr aa_reg_addr(KitCgTypeId type, u32 reg, i32 offset) { 4210 NativeAddr addr; 4211 memset(&addr, 0, sizeof addr); 4212 addr.base_kind = NATIVE_ADDR_BASE_REG; 4213 addr.base.reg = reg; 4214 addr.base_type = type; 4215 addr.offset = offset; 4216 return addr; 4217 } 4218 4219 static void aa_load_ap_addr(NativeDirectTarget* d, Operand ap_addr, 4220 u32 dst_reg) { 4221 NativeLoc dst = 4222 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, dst_reg); 4223 NativeAddr ap = aa_direct_pointer_addr(d, ap_addr); 4224 d->native->load_addr(d->native, dst, ap); 4225 } 4226 4227 /* The va cores use only non-allocable registers for their temporaries 4228 * (scratch x9/x10, reserved x16=TMP0 / x17=TMP1, vector v16) so they never 4229 * clobber a value the optimizer's register allocator may hold live 
across the 4230 * op. The va_list base register is supplied by the caller (ap.base.reg), which 4231 * the optimizer materializes into a safe register before the call. */ 4232 static u32 aa_va_base_reg(AANativeTarget* a, NativeAddr ap) { 4233 if (ap.base_kind != NATIVE_ADDR_BASE_REG) 4234 compiler_panic(a->base.c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, 4235 "aarch64 native target: va_list pointer not in register"); 4236 return ap.base.reg; 4237 } 4238 4239 /* va_list layout is queried from the ABI; the optimizer/direct callers pass the 4240 * va_list pointer opaquely. `ap` addresses the va_list object itself. */ 4241 static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) { 4242 NativeTarget* t = &a->base; 4243 const ABIFuncInfo* abi = 4244 a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL; 4245 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 4246 NativeLoc ptr = 4247 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); 4248 if (vai.kind == ABI_VA_LIST_POINTER) { 4249 if (a->top_home_bytes) { 4250 /* Windows: `va_list = &<first vararg>` inside the contiguous 4251 * [GP home area | incoming stack args] block. Named args consume the 4252 * leading slots; next_param_int (FP params remapped to GP included) plus 4253 * next_param_stack locate the first unnamed slot. Home slot 4254 * gp_reg_count coincides with incoming-arg byte 0, so a single formula 4255 * spans both regions. */ 4256 i32 off = 4257 aa_fp_off_home_slot(a->next_param_int) + (i32)a->next_param_stack; 4258 aa_emit_add_imm(a, AA_TMP0, AA_FP, off); 4259 aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8)); 4260 return; 4261 } 4262 /* `va_list = &<first vararg>`. Variadic stack args follow the fixed 4263 * incoming params in the same caller window. Apple ARM64 compact fixed 4264 * stack args may leave this cursor at +4, while the first variadic slot 4265 * starts at the next 8-byte boundary. 
*/ 4266 u32 stack = aa_vararg_stack_start(abi, a->next_param_stack); 4267 aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a, stack)); 4268 aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8)); 4269 return; 4270 } 4271 if (vai.kind == ABI_VA_LIST_AAPCS64) { 4272 KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32); 4273 NativeLoc i32tmp = native_loc_reg(i32_ty, NATIVE_REG_INT, AA_TMP1); 4274 MemAccess ptr_mem = aa_mem_for_type(t, ptr.type, 8); 4275 MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4); 4276 AANativeSlot* gr = aa_slot(a, a->va_gr_slot); 4277 AANativeSlot* vr = aa_slot(a, a->va_vr_slot); 4278 u32 base = aa_va_base_reg(a, ap); 4279 u32 used_gr = a->next_param_int < vai.gp_reg_count ? a->next_param_int 4280 : vai.gp_reg_count; 4281 u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp 4282 : vai.fp_reg_count; 4283 /* __stack points at the incoming stack args, which sit above the saved 4284 * fp/lr pair — the same address bind_param uses (aa_fp_off_in_arg), not the 4285 * raw next_param_stack cursor. 
*/ 4286 aa_emit_add_imm(a, AA_TMP0, AA_FP, 4287 aa_fp_off_in_arg(a, a->next_param_stack)); 4288 aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset), 4289 ptr_mem); 4290 aa_emit_add_imm(a, AA_TMP0, AA_FP, 4291 aa_fp_off_slot(a, gr->off) + 4292 (i32)(vai.gp_reg_count * vai.gp_slot_size)); 4293 aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.gr_top_offset), 4294 ptr_mem); 4295 aa_emit_add_imm(a, AA_TMP0, AA_FP, 4296 aa_fp_off_slot(a, vr->off) + 4297 (i32)(vai.fp_reg_count * vai.fp_slot_size)); 4298 aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.vr_top_offset), 4299 ptr_mem); 4300 aa_emit_load_imm(t->mc, 0, AA_TMP1, 4301 -(i32)((vai.gp_reg_count - used_gr) * vai.gp_slot_size)); 4302 aa_emit_mem(a, 0, i32tmp, 4303 aa_reg_addr(i32_ty, base, (i32)vai.gr_offs_offset), i32_mem); 4304 aa_emit_load_imm(t->mc, 0, AA_TMP1, 4305 -(i32)((vai.fp_reg_count - used_vr) * vai.fp_slot_size)); 4306 aa_emit_mem(a, 0, i32tmp, 4307 aa_reg_addr(i32_ty, base, (i32)vai.vr_offs_offset), i32_mem); 4308 return; 4309 } 4310 compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, 4311 "aarch64 native target: unsupported va_list layout"); 4312 } 4313 4314 static void aa_va_arg_core(AANativeTarget* a, NativeLoc dst, NativeAddr ap, 4315 KitCgTypeId type) { 4316 NativeTarget* t = &a->base; 4317 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 4318 NativeLoc cur = 4319 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); 4320 /* The fetched value is written directly into the caller-provided register 4321 * `dst`, which the caller guarantees is distinct from the va_list base 4322 * register. Only TMP0/TMP1 are used as private scratch. */ 4323 NativeLoc val = dst; 4324 NativeAddr src; 4325 MemAccess ptr_mem = aa_mem_for_type(t, cur.type, 8); 4326 MemAccess val_mem = aa_mem_for_type(t, type, type_size32(t, type)); 4327 if (dst.kind != NATIVE_LOC_REG) 4328 compiler_panic(t->c, a->func ? 
a->func->loc : (SrcLoc){0, 0, 0}, 4329 "aarch64 native target: va_arg destination must be a " 4330 "register"); 4331 if (vai.kind == ABI_VA_LIST_POINTER) { 4332 aa_emit_mem(a, 1, cur, ap, ptr_mem); 4333 src = aa_reg_addr(type, AA_TMP0, 0); 4334 { 4335 const ABIFuncInfo* abi = 4336 a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL; 4337 ABIArgPart part; 4338 memset(&part, 0, sizeof part); 4339 part.cls = cg_type_is_float(t->c, type) ? ABI_CLASS_FP : ABI_CLASS_INT; 4340 part.size = type_size32(t, type); 4341 part.align = type_align32(t, type); 4342 aa_emit_add_imm(a, AA_TMP1, AA_TMP0, 4343 (i32)aa_part_vararg_stack_size(abi, &part)); 4344 } 4345 aa_emit_mem(a, 0, native_loc_reg(cur.type, NATIVE_REG_INT, AA_TMP1), ap, 4346 ptr_mem); 4347 aa_emit_mem(a, 1, val, src, val_mem); 4348 return; 4349 } 4350 if (vai.kind == ABI_VA_LIST_AAPCS64) { 4351 KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32); 4352 NativeLoc off = native_loc_reg(i32_ty, NATIVE_REG_INT, AA_TMP1); 4353 MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4); 4354 int is_fp = cg_type_is_float(t->c, type); 4355 u32 base = aa_va_base_reg(a, ap); 4356 u32 offs_field = is_fp ? vai.vr_offs_offset : vai.gr_offs_offset; 4357 u32 top_field = is_fp ? vai.vr_top_offset : vai.gr_top_offset; 4358 u32 slot_size = is_fp ? 
vai.fp_slot_size : vai.gp_slot_size; 4359 MCLabel stack_label = t->mc->label_new(t->mc); 4360 MCLabel done_label = t->mc->label_new(t->mc); 4361 aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem); 4362 aa_emit32(t->mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0)); 4363 aa_emit32(t->mc, 4364 aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)})); 4365 t->mc->emit_label_ref(t->mc, stack_label, R_AARCH64_CONDBR19, 4, 0); 4366 aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)top_field), 4367 ptr_mem); 4368 aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31)); 4369 aa_emit32(t->mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1)); 4370 aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem); 4371 aa_emit_add_imm(a, AA_TMP1, AA_TMP1, (i32)slot_size); 4372 aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem); 4373 aa_emit32(t->mc, aa64_b(0)); 4374 t->mc->emit_label_ref(t->mc, done_label, R_AARCH64_JUMP26, 4, 0); 4375 t->mc->label_place(t->mc, stack_label); 4376 aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset), 4377 ptr_mem); 4378 aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem); 4379 aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8); 4380 aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset), 4381 ptr_mem); 4382 t->mc->label_place(t->mc, done_label); 4383 return; 4384 } 4385 compiler_panic(t->c, a->func ? 
a->func->loc : (SrcLoc){0, 0, 0}, 4386 "aarch64 native target: unsupported va_list layout"); 4387 } 4388 4389 static void aa_va_copy_core(AANativeTarget* a, NativeAddr dst_ap, 4390 NativeAddr src_ap) { 4391 NativeTarget* t = &a->base; 4392 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 4393 NativeLoc tmp = 4394 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); 4395 MemAccess mem = aa_mem_for_type(t, tmp.type, 8); 4396 if (vai.kind == ABI_VA_LIST_POINTER) { 4397 aa_emit_mem(a, 1, tmp, src_ap, mem); 4398 aa_emit_mem(a, 0, tmp, dst_ap, mem); 4399 return; 4400 } 4401 if (vai.kind == ABI_VA_LIST_AAPCS64) { 4402 u32 sb = aa_va_base_reg(a, src_ap); 4403 u32 db = aa_va_base_reg(a, dst_ap); 4404 for (u32 off = 0; off < vai.type.size; off += 8u) { 4405 aa_emit_mem(a, 1, tmp, aa_reg_addr(tmp.type, sb, (i32)off), mem); 4406 aa_emit_mem(a, 0, tmp, aa_reg_addr(tmp.type, db, (i32)off), mem); 4407 } 4408 return; 4409 } 4410 compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, 4411 "aarch64 native target: unsupported va_list layout"); 4412 } 4413 4414 /* ---- Direct-path (NativeDirectTarget) wrappers: convert semantic operands to 4415 * NativeAddr/NativeLoc, then call the shared cores above. ---- */ 4416 4417 /* The cores reserve x16/x17 (TMP0/TMP1) as private scratch and require the 4418 * va_list base register(s) to be distinct from those. aa_direct_pointer_addr 4419 * returns the pointer in TMP1, so the direct wrappers first relocate it into 4420 * x9/x10 before calling the cores. 
*/ 4421 static NativeAddr aa_direct_va_base(NativeDirectTarget* d, Operand ap_addr, 4422 u32 reg) { 4423 aa_load_ap_addr(d, ap_addr, reg); 4424 return aa_reg_addr(builtin_id(KIT_CG_BUILTIN_I64), reg, 0); 4425 } 4426 4427 static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) { 4428 aa_va_start_core(aa_of(d->native), aa_direct_va_base(d, ap_addr, 10u)); 4429 } 4430 4431 static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr, 4432 KitCgTypeId type) { 4433 AANativeTarget* a = aa_of(d->native); 4434 int is_fp = cg_type_is_float(d->base.c, type); 4435 NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, 4436 is_fp ? 16u : 9u); 4437 MemAccess val_mem = 4438 aa_mem_for_type(d->native, type, type_size32(d->native, type)); 4439 NativeAddr dst; 4440 aa_va_arg_core(a, res, aa_direct_va_base(d, ap_addr, 10u), type); 4441 dst = aa_direct_materialize_addr(d, dst_op); 4442 aa_emit_mem(a, 0, res, dst, val_mem); 4443 } 4444 4445 static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) { 4446 (void)d; 4447 (void)ap_addr; 4448 } 4449 4450 static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr, 4451 Operand src_ap_addr) { 4452 AANativeTarget* a = aa_of(d->native); 4453 NativeAddr src = aa_direct_va_base(d, src_ap_addr, 9u); 4454 NativeAddr dst = aa_direct_va_base(d, dst_ap_addr, 10u); 4455 aa_va_copy_core(a, dst, src); 4456 } 4457 4458 /* ---- NativeTarget (optimizer) hooks: the optimizer passes the va_list 4459 * pointer as a materialized register; layout is resolved inside the cores. 
---- 4460 */ 4461 4462 static NativeAddr aa_va_addr_from_ptr(NativeLoc ap_ptr) { 4463 NativeAddr addr; 4464 memset(&addr, 0, sizeof addr); 4465 addr.base_kind = NATIVE_ADDR_BASE_REG; 4466 addr.cls = NATIVE_REG_INT; 4467 addr.base.reg = ap_ptr.v.reg; 4468 addr.base_type = ap_ptr.type; 4469 return addr; 4470 } 4471 4472 static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr) { 4473 aa_va_start_core(aa_of(t), aa_va_addr_from_ptr(ap_ptr)); 4474 } 4475 4476 static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, 4477 KitCgTypeId type) { 4478 aa_va_arg_core(aa_of(t), dst, aa_va_addr_from_ptr(ap_ptr), type); 4479 } 4480 4481 static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr) { 4482 (void)t; 4483 (void)ap_ptr; 4484 } 4485 4486 static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr, 4487 NativeLoc src_ap_ptr) { 4488 aa_va_copy_core(aa_of(t), aa_va_addr_from_ptr(dst_ap_ptr), 4489 aa_va_addr_from_ptr(src_ap_ptr)); 4490 } 4491 4492 /* constraint_body / constraint_early / match_index are shared 4493 * (cg/native_asm.h). */ 4494 4495 _Noreturn static void aa_asm_panic_at(Compiler* c, SrcLoc loc, 4496 const char* msg) { 4497 compiler_panic(c, loc, "aarch64 inline asm: %s", msg); 4498 } 4499 4500 _Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) { 4501 aa_asm_panic_at(d->base.c, d->loc, msg); 4502 } 4503 4504 AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, KitCgTypeId type, 4505 NativeAllocClass cls, Reg reg) { 4506 memset(out, 0, sizeof *out); 4507 out->kind = AA64_INLINE_OPK_REG; 4508 out->pad[0] = 4509 (cls == NATIVE_REG_FP) ? 
AA64_INLINE_OPCLS_FP : AA64_INLINE_OPCLS_INT; 4510 out->type = type; 4511 out->v.local = (CGLocal)reg; 4512 } 4513 4514 AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, KitCgTypeId type, 4515 Reg base) { 4516 memset(out, 0, sizeof *out); 4517 out->kind = OPK_INDIRECT; 4518 out->type = type; 4519 out->v.ind.base = (CGLocal)base; 4520 out->v.ind.index = CG_LOCAL_NONE; 4521 } 4522 4523 static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, 4524 NativeAllocClass* cls_out, Reg* reg_out) { 4525 Slice s = pool_slice(c->global, name); 4526 char buf[16]; 4527 uint32_t dwarf; 4528 if (!s.s || !s.len) return 0; 4529 if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; 4530 if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0; 4531 if (s.len >= sizeof buf) aa_asm_panic_at(c, loc, "clobber name is too long"); 4532 memcpy(buf, s.s, s.len); 4533 buf[s.len] = '\0'; 4534 if (aa64_register_index(buf, &dwarf) != 0) 4535 aa_asm_panic_at(c, loc, "unknown clobber register"); 4536 if (dwarf <= 30u) { 4537 *cls_out = NATIVE_REG_INT; 4538 *reg_out = (Reg)dwarf; 4539 return 1; 4540 } 4541 if (dwarf >= 64u && dwarf <= 95u) { 4542 *cls_out = NATIVE_REG_FP; 4543 *reg_out = (Reg)(dwarf - 64u); 4544 return 1; 4545 } 4546 aa_asm_panic_at(c, loc, "unsupported clobber register"); 4547 return 0; 4548 } 4549 4550 static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 4551 u32 nclob, u32* int_mask, u32* fp_mask) { 4552 *int_mask = 0; 4553 *fp_mask = 0; 4554 for (u32 i = 0; i < nclob; ++i) { 4555 NativeAllocClass cls; 4556 Reg reg; 4557 if (!aa_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue; 4558 if (cls == NATIVE_REG_INT) 4559 *int_mask |= 1u << reg; 4560 else if (cls == NATIVE_REG_FP) 4561 *fp_mask |= 1u << reg; 4562 } 4563 } 4564 4565 AA_UNUSED_FN static Reg aa_asm_alloc_reg(NativeDirectTarget* d, 4566 NativeAllocClass cls, 4567 u32 allowed_mask, u32* used_int, 4568 u32* used_fp) { 4569 static const Reg int_pool[] = {0u, 1u, 
2u, 3u, 4u, 5u, 6u, 4570 7u, 8u, 11u, 12u, 13u, 14u, 15u}; 4571 static const Reg fp_pool[] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 4572 16u, 17u, 18u, 19u, 22u, 23u, 24u, 25u, 4573 26u, 27u, 28u, 29u, 30u, 31u}; 4574 const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool; 4575 u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0]) 4576 : (u32)(sizeof int_pool / sizeof int_pool[0]); 4577 u32* used = cls == NATIVE_REG_FP ? used_fp : used_int; 4578 for (u32 i = 0; i < n; ++i) { 4579 Reg r = pool[i]; 4580 if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue; 4581 if ((*used & (1u << r)) != 0) continue; 4582 *used |= 1u << r; 4583 return r; 4584 } 4585 aa_asm_panic(d, "out of registers for asm operands"); 4586 return REG_NONE; 4587 } 4588 4589 static int aa_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, 4590 const char* constraint, 4591 NativeAsmRegPin* pin) { 4592 NativeAsmRegPinStatus st = 4593 native_asm_resolve_pin(d->native, reg, constraint, pin); 4594 if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; 4595 if (st != NATIVE_ASM_REG_PIN_OK) 4596 aa_asm_panic(d, native_asm_pin_status_message(st)); 4597 return 1; 4598 } 4599 4600 AA_UNUSED_FN static void aa_direct_load_operand_to_reg(NativeDirectTarget* d, 4601 Operand op, 4602 NativeLoc dst) { 4603 NativeAddr addr; 4604 memset(&addr, 0, sizeof addr); 4605 switch ((OpKind)op.kind) { 4606 case OPK_IMM: 4607 if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) 4608 aa_asm_panic(d, "floating-point immediate asm input is unsupported"); 4609 d->native->load_imm(d->native, dst, op.v.imm); 4610 return; 4611 case OPK_LOCAL: 4612 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4613 addr.base.frame = d->locals[op.v.local - 1u].home; 4614 addr.base_type = op.type; 4615 aa_emit_mem(aa_of(d->native), 1, dst, addr, 4616 aa_mem_for_type(d->native, op.type, 0)); 4617 return; 4618 case OPK_GLOBAL: 4619 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 4620 addr.base.global.sym = op.v.global.sym; 4621 
addr.base.global.addend = op.v.global.addend; 4622 addr.base_type = op.type; 4623 d->native->load_addr(d->native, dst, addr); 4624 return; 4625 case OPK_INDIRECT: 4626 addr = aa_direct_materialize_addr(d, op); 4627 aa_emit_mem(aa_of(d->native), 1, dst, addr, 4628 aa_mem_for_type(d->native, op.type, 0)); 4629 return; 4630 } 4631 aa_asm_panic(d, "unsupported asm input operand"); 4632 } 4633 4634 AA_UNUSED_FN static void aa_direct_load_address_to_reg(NativeDirectTarget* d, 4635 Operand op, 4636 NativeLoc dst) { 4637 NativeAddr addr = aa_direct_addr(d, op); 4638 d->native->load_addr(d->native, dst, addr); 4639 } 4640 4641 AA_UNUSED_FN static void aa_direct_store_reg_to_operand(NativeDirectTarget* d, 4642 Operand op, 4643 NativeLoc src) { 4644 NativeAddr addr; 4645 memset(&addr, 0, sizeof addr); 4646 if (op.kind == OPK_LOCAL) { 4647 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4648 addr.base.frame = d->locals[op.v.local - 1u].home; 4649 addr.base_type = op.type; 4650 } else { 4651 addr = aa_direct_materialize_addr(d, op); 4652 } 4653 aa_emit_mem(aa_of(d->native), 0, src, addr, 4654 aa_mem_for_type(d->native, op.type, 0)); 4655 } 4656 4657 typedef struct AAAsmSavedClobber { 4658 NativeFrameSlot slot; 4659 NativeAllocClass cls; 4660 Reg reg; 4661 KitCgTypeId type; 4662 } AAAsmSavedClobber; 4663 4664 static void aa_asm_save_one(AANativeTarget* a, AAAsmSavedClobber* s) { 4665 NativeFrameSlotDesc desc; 4666 NativeAddr addr; 4667 NativeLoc reg; 4668 memset(&desc, 0, sizeof desc); 4669 desc.type = s->type; 4670 desc.size = 8; 4671 desc.align = 8; 4672 desc.kind = NATIVE_FRAME_SLOT_SAVE; 4673 s->slot = a->base.frame_slot(&a->base, &desc); 4674 memset(&addr, 0, sizeof addr); 4675 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4676 addr.base.frame = s->slot; 4677 addr.base_type = s->type; 4678 reg = native_loc_reg(s->type, s->cls, s->reg); 4679 aa_emit_mem(a, 0, reg, addr, aa_mem_for_type(&a->base, s->type, 8)); 4680 } 4681 4682 AA_UNUSED_FN static void aa_asm_restore_one(AANativeTarget* 
a, 4683 const AAAsmSavedClobber* s) { 4684 NativeAddr addr; 4685 NativeLoc reg = native_loc_reg(s->type, s->cls, s->reg); 4686 memset(&addr, 0, sizeof addr); 4687 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4688 addr.base.frame = s->slot; 4689 addr.base_type = s->type; 4690 aa_emit_mem(a, 1, reg, addr, aa_mem_for_type(&a->base, s->type, 8)); 4691 } 4692 4693 AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers( 4694 AANativeTarget* a, u32 int_mask, u32 fp_mask, u32* nsaved_out) { 4695 AAAsmSavedClobber* saved = 4696 arena_zarray(a->base.c->tu, AAAsmSavedClobber, 20u); 4697 u32 n = 0; 4698 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 4699 KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64); 4700 for (Reg r = 19u; r <= 28u; ++r) { 4701 if ((int_mask & (1u << r)) == 0) continue; 4702 saved[n].cls = NATIVE_REG_INT; 4703 saved[n].reg = r; 4704 saved[n].type = i64; 4705 aa_asm_save_one(a, &saved[n++]); 4706 } 4707 for (Reg r = 8u; r <= 15u; ++r) { 4708 if ((fp_mask & (1u << r)) == 0) continue; 4709 saved[n].cls = NATIVE_REG_FP; 4710 saved[n].reg = r; 4711 saved[n].type = f64; 4712 aa_asm_save_one(a, &saved[n++]); 4713 } 4714 *nsaved_out = n; 4715 return saved; 4716 } 4717 4718 static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, 4719 const AsmConstraint* outs, u32 nout, 4720 Operand* out_ops, const AsmConstraint* ins, 4721 u32 nin, const Operand* in_ops, 4722 const Sym* clobbers, u32 nclob, 4723 u32 clobber_abi_sets) { 4724 Operand* bound_outs = 4725 nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL; 4726 Operand* bound_ins = nin ? 
arena_zarray(d->base.c->tu, Operand, nin) : NULL; 4727 u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; 4728 AAAsmSavedClobber* saved; 4729 u32 nsaved; 4730 AA64Asm* a; 4731 4732 aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp); 4733 native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); 4734 clob_int |= abi_int; 4735 clob_fp |= abi_fp; 4736 used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) | 4737 (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP); 4738 used_fp = clob_fp | (1u << 20u) | (1u << 21u); 4739 4740 for (u32 i = 0; i < nout; ++i) { 4741 const char* body = native_asm_constraint_body(outs[i].str); 4742 NativeAsmRegPin pin; 4743 if (aa_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { 4744 /* GNU local register variable: pin to the named hard register. */ 4745 KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; 4746 if (pin.cls == NATIVE_REG_FP) { 4747 used_fp |= 1u << pin.reg; 4748 clob_fp |= 1u << pin.reg; 4749 } else { 4750 used_int |= 1u << pin.reg; 4751 clob_int |= 1u << pin.reg; 4752 } 4753 aa_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); 4754 } else { 4755 NativeAsmConstraintInfo info; 4756 if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) { 4757 Reg reg = info.fixed_reg != REG_NONE 4758 ? info.fixed_reg 4759 : aa_asm_alloc_reg(d, info.cls, info.allowed_mask, 4760 &used_int, &used_fp); 4761 KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; 4762 if (info.cls == NATIVE_REG_FP) { 4763 used_fp |= 1u << reg; 4764 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4765 } else { 4766 used_int |= 1u << reg; 4767 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4768 } 4769 aa_asm_bound_reg(&bound_outs[i], type, info.cls, reg); 4770 } else if (body[0] == 'm') { 4771 Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4772 KitCgTypeId type = outs[i].type ? 
outs[i].type : out_ops[i].type; 4773 aa_asm_bound_mem(&bound_outs[i], type, reg); 4774 } else { 4775 aa_asm_panic(d, "unsupported output constraint"); 4776 } 4777 } 4778 } 4779 4780 for (u32 i = 0; i < nin; ++i) { 4781 const char* body = native_asm_constraint_body(ins[i].str); 4782 int matched = native_asm_match_index(body); 4783 if (matched >= 0) { 4784 if ((u32)matched >= nout) 4785 aa_asm_panic(d, "matching constraint out of range"); 4786 if (native_asm_constraint_early(outs[matched].str)) 4787 aa_asm_panic(d, "matching input names early-clobber output"); 4788 if (bound_outs[matched].kind != AA64_INLINE_OPK_REG) 4789 aa_asm_panic(d, "matching constraint requires register output"); 4790 bound_ins[i] = bound_outs[matched]; 4791 continue; 4792 } 4793 NativeAsmRegPin pin; 4794 if (aa_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { 4795 /* GNU local register variable: pin to the named hard register. */ 4796 KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; 4797 if (pin.cls == NATIVE_REG_FP) { 4798 used_fp |= 1u << pin.reg; 4799 clob_fp |= 1u << pin.reg; 4800 } else { 4801 used_int |= 1u << pin.reg; 4802 clob_int |= 1u << pin.reg; 4803 } 4804 aa_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); 4805 } else { 4806 NativeAsmConstraintInfo info; 4807 if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) { 4808 Reg reg = info.fixed_reg != REG_NONE 4809 ? info.fixed_reg 4810 : aa_asm_alloc_reg(d, info.cls, info.allowed_mask, 4811 &used_int, &used_fp); 4812 KitCgTypeId type = ins[i].type ? 
ins[i].type : in_ops[i].type; 4813 if (info.cls == NATIVE_REG_FP) { 4814 used_fp |= 1u << reg; 4815 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4816 } else { 4817 used_int |= 1u << reg; 4818 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4819 } 4820 aa_asm_bound_reg(&bound_ins[i], type, info.cls, reg); 4821 } else if (body[0] == 'i') { 4822 if (in_ops[i].kind != OPK_IMM) 4823 aa_asm_panic(d, "immediate constraint requires immediate operand"); 4824 bound_ins[i] = in_ops[i]; 4825 } else if (body[0] == 'm') { 4826 Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4827 KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; 4828 aa_asm_bound_mem(&bound_ins[i], type, reg); 4829 } else { 4830 aa_asm_panic(d, "unsupported input constraint"); 4831 } 4832 } 4833 } 4834 4835 saved = 4836 aa_asm_save_callee_clobbers(aa_of(d->native), clob_int, clob_fp, &nsaved); 4837 for (u32 i = 0; i < nout; ++i) { 4838 if (bound_outs[i].kind == AA64_INLINE_OPK_REG) { 4839 NativeAllocClass cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP 4840 ? NATIVE_REG_FP 4841 : NATIVE_REG_INT; 4842 if (outs[i].dir == KIT_CG_ASM_INOUT) { 4843 aa_direct_load_operand_to_reg( 4844 d, out_ops[i], 4845 native_loc_reg(bound_outs[i].type, cls, 4846 (Reg)bound_outs[i].v.local)); 4847 } 4848 } else if (bound_outs[i].kind == OPK_INDIRECT) { 4849 NativeLoc loc = 4850 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 4851 (Reg)bound_outs[i].v.ind.base); 4852 aa_direct_load_address_to_reg(d, out_ops[i], loc); 4853 } 4854 } 4855 for (u32 i = 0; i < nin; ++i) { 4856 if (bound_ins[i].kind == AA64_INLINE_OPK_REG) { 4857 NativeAllocClass cls = bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP 4858 ? 
NATIVE_REG_FP 4859 : NATIVE_REG_INT; 4860 aa_direct_load_operand_to_reg( 4861 d, in_ops[i], 4862 native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); 4863 } else if (bound_ins[i].kind == OPK_INDIRECT) { 4864 NativeLoc loc = 4865 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 4866 (Reg)bound_ins[i].v.ind.base); 4867 aa_direct_load_address_to_reg(d, in_ops[i], loc); 4868 } 4869 } 4870 a = aa64_asm_open(d->base.c); 4871 aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 4872 nclob); 4873 aa64_asm_run_template(a, d->native->mc, tmpl); 4874 aa64_asm_close(a); 4875 4876 for (u32 i = 0; i < nout; ++i) { 4877 NativeAllocClass cls; 4878 NativeLoc src; 4879 if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue; 4880 cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP 4881 : NATIVE_REG_INT; 4882 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 4883 aa_direct_store_reg_to_operand(d, out_ops[i], src); 4884 } 4885 for (u32 i = nsaved; i > 0; --i) 4886 aa_asm_restore_one(aa_of(d->native), &saved[i - 1u]); 4887 } 4888 4889 /* ---- NativeTarget (optimizer) asm hook ---- 4890 * 4891 * The optimizer has already allocated every operand register and arranged the 4892 * surrounding data flow (inputs are live in their registers on entry, outputs 4893 * are consumed from their registers on exit; the asm's clobber_mask kept the 4894 * allocator from holding live values in clobbered registers). So unlike the 4895 * direct path this hook does NOT self-allocate registers and does NOT load 4896 * inputs / store outputs -- it only binds the pre-allocated registers to the 4897 * template, materializing memory-operand base addresses into the reserved 4898 * scratch registers and saving/restoring callee-saved registers the asm 4899 * clobbers (the only ABI obligation the allocator cannot discharge itself). 
*/ 4900 4901 static NativeAddr aa_asm_loc_to_addr(AANativeTarget* a, SrcLoc loc, 4902 NativeLoc src) { 4903 NativeAddr addr; 4904 memset(&addr, 0, sizeof addr); 4905 addr.base_type = src.type; 4906 switch ((NativeLocKind)src.kind) { 4907 case NATIVE_LOC_FRAME: 4908 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 4909 addr.base.frame = src.v.frame; 4910 return addr; 4911 case NATIVE_LOC_ADDR: 4912 return src.v.addr; 4913 case NATIVE_LOC_GLOBAL: 4914 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 4915 addr.base.global.sym = src.v.global.sym; 4916 addr.base.global.addend = src.v.global.addend; 4917 return addr; 4918 case NATIVE_LOC_REG: 4919 addr.base_kind = NATIVE_ADDR_BASE_REG; 4920 addr.cls = NATIVE_REG_INT; 4921 addr.base.reg = src.v.reg; 4922 return addr; 4923 default: 4924 aa_asm_panic_at(a->base.c, loc, "unsupported memory asm operand"); 4925 } 4926 } 4927 4928 /* Resolve a memory-constraint operand to a single base register with zero 4929 * offset, folding any frame/global/offset into a scratch register. At most the 4930 * two reserved scratch registers are used across one asm block. */ 4931 static Reg aa_asm_native_mem_base(AANativeTarget* a, SrcLoc loc, NativeLoc src, 4932 u32* ntmp) { 4933 NativeAddr addr = aa_asm_loc_to_addr(a, loc, src); 4934 u32 base; 4935 i32 off; 4936 Reg dst; 4937 if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) 4938 aa_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported"); 4939 aa_addr_base(a, addr, &base, &off); 4940 if (off == 0) return (Reg)base; 4941 if (*ntmp >= 2u) 4942 aa_asm_panic_at(a->base.c, loc, "too many memory asm operands"); 4943 dst = (*ntmp == 0u) ? 
AA_TMP0 : AA_TMP1; 4944 (*ntmp)++; 4945 aa_emit_add_imm(a, dst, base, off); 4946 return dst; 4947 } 4948 4949 static void aa_asm_load_loc_to_reg(AANativeTarget* a, SrcLoc loc, NativeLoc src, 4950 NativeLoc dst) { 4951 NativeTarget* t = &a->base; 4952 NativeAllocClass cls = (NativeAllocClass)dst.cls; 4953 if (src.kind == NATIVE_LOC_REG) { 4954 if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src); 4955 return; 4956 } 4957 if (src.kind == NATIVE_LOC_IMM) { 4958 if (cls != NATIVE_REG_INT) 4959 aa_asm_panic_at(t->c, loc, 4960 "floating-point immediate asm input is unsupported"); 4961 t->load_imm(t, dst, src.v.imm); 4962 return; 4963 } 4964 aa_emit_mem(a, 1, dst, aa_asm_loc_to_addr(a, loc, src), 4965 aa_mem_for_type(t, dst.type, type_size32(t, dst.type))); 4966 } 4967 4968 static void aa_asm_store_reg_to_loc(AANativeTarget* a, SrcLoc loc, 4969 NativeLoc dst, NativeLoc src) { 4970 NativeTarget* t = &a->base; 4971 if (dst.kind == NATIVE_LOC_REG) { 4972 if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src); 4973 return; 4974 } 4975 aa_emit_mem(a, 0, src, aa_asm_loc_to_addr(a, loc, dst), 4976 aa_mem_for_type(t, src.type, type_size32(t, src.type))); 4977 } 4978 4979 static void aa_asm_bind_native(AANativeTarget* a, SrcLoc loc, Operand* out, 4980 const char* constraint, KitCgTypeId type, 4981 NativeLoc src, u32* ntmp) { 4982 const char* body = native_asm_constraint_body(constraint); 4983 NativeAsmConstraintInfo info; 4984 if (native_asm_constraint_reg_info(&a->base, constraint, &info)) { 4985 if (src.kind != NATIVE_LOC_REG) 4986 aa_asm_panic_at(a->base.c, loc, "register asm operand not in a register"); 4987 if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg) 4988 aa_asm_panic_at(a->base.c, loc, 4989 "fixed-register asm operand in wrong register"); 4990 if (info.allowed_mask && 4991 ((Reg)src.v.reg >= 32 || 4992 (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0)) 4993 compiler_panic( 4994 a->base.c, loc, 4995 "aarch64 inline 
asm: constraint %s got cls%u reg%u outside %08x", 4996 constraint, (unsigned)info.cls, (unsigned)src.v.reg, 4997 (unsigned)info.allowed_mask); 4998 aa_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg); 4999 } else if (body[0] == 'i') { 5000 if (src.kind != NATIVE_LOC_IMM) 5001 aa_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate"); 5002 memset(out, 0, sizeof *out); 5003 out->kind = OPK_IMM; 5004 out->type = type; 5005 out->v.imm = src.v.imm; 5006 } else if (body[0] == 'm') { 5007 aa_asm_bound_mem(out, type, aa_asm_native_mem_base(a, loc, src, ntmp)); 5008 } else { 5009 aa_asm_panic_at(a->base.c, loc, "unsupported asm constraint"); 5010 } 5011 } 5012 5013 static void aa_asm_block_native(NativeTarget* t, const char* tmpl, 5014 const AsmConstraint* outs, u32 nout, 5015 NativeLoc* out_locs, const AsmConstraint* ins, 5016 u32 nin, const NativeLoc* in_locs, 5017 const Sym* clobbers, u32 nclob) { 5018 AANativeTarget* a = aa_of(t); 5019 Compiler* c = t->c; 5020 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 5021 Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; 5022 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 5023 u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL; 5024 u32 ntmp = 0; 5025 AA64Asm* asmh; 5026 5027 for (u32 i = 0; i < nout; ++i) { 5028 KitCgTypeId type = outs[i].type ? 
outs[i].type : out_locs[i].type; 5029 NativeLoc outloc = out_locs[i]; 5030 NativeAsmPinnedLoc pinned = 5031 native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc); 5032 if (pinned.has_pin) { 5033 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 5034 aa_asm_panic_at(c, loc, 5035 native_asm_pin_status_message(pinned.pin_status)); 5036 if (pinned.wrong_reg) 5037 aa_asm_panic_at(c, loc, "hard-register asm operand in wrong register"); 5038 outloc = pinned.loc; 5039 if (pinned.needs_stage) { 5040 staged_outs[i] = 1u; 5041 if (outs[i].dir == KIT_CG_ASM_INOUT) 5042 aa_asm_load_loc_to_reg(a, loc, out_locs[i], outloc); 5043 } 5044 } 5045 aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, &ntmp); 5046 } 5047 for (u32 i = 0; i < nin; ++i) { 5048 const char* body = native_asm_constraint_body(ins[i].str); 5049 int matched = native_asm_match_index(body); 5050 KitCgTypeId type; 5051 if (matched >= 0) { 5052 if ((u32)matched >= nout) 5053 aa_asm_panic_at(c, loc, "matching constraint out of range"); 5054 bound_ins[i] = bound_outs[matched]; 5055 continue; 5056 } 5057 type = ins[i].type ? ins[i].type : in_locs[i].type; 5058 { 5059 const char* in_body = native_asm_constraint_body(ins[i].str); 5060 NativeAsmConstraintInfo info; 5061 NativeLoc inloc = in_locs[i]; 5062 NativeAsmPinnedLoc pinned = 5063 native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc); 5064 /* A register-constrained input whose value is an address-taken local 5065 * arrives in a frame slot: the optimizer cannot keep an address-taken 5066 * local live in a register across the block, so the "inputs are already 5067 * in registers" contract does not hold for it. Load it into a reserved 5068 * scratch register (as the direct path does) before binding. With no 5069 * hard pin, only unrestricted integer constraints can use this scratch; 5070 * restricted register sets must already arrive in an allowed hard 5071 * register. 
*/ 5072 if (pinned.has_pin) { 5073 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 5074 aa_asm_panic_at(c, loc, 5075 native_asm_pin_status_message(pinned.pin_status)); 5076 if (pinned.wrong_reg) 5077 aa_asm_panic_at(c, loc, 5078 "hard-register asm operand in wrong register"); 5079 inloc = pinned.loc; 5080 if (pinned.needs_stage) 5081 aa_asm_load_loc_to_reg(a, loc, in_locs[i], inloc); 5082 } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) && 5083 info.cls == NATIVE_REG_INT && info.allowed_mask == 0 && 5084 inloc.kind != NATIVE_LOC_REG) { 5085 Reg r; 5086 if (ntmp >= 2u) aa_asm_panic_at(c, loc, "too many memory asm operands"); 5087 r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1; 5088 ntmp++; 5089 inloc = native_loc_reg(type, NATIVE_REG_INT, r); 5090 aa_emit_mem(a, 1, inloc, aa_asm_loc_to_addr(a, loc, in_locs[i]), 5091 aa_mem_for_type(t, type, type_size32(t, type))); 5092 } 5093 (void)in_body; 5094 aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); 5095 } 5096 } 5097 5098 /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber 5099 * masks and aa_known_callee_saves folded the callee-saved ones into the 5100 * function's saved set, so the prologue/epilogue already preserve them. */ 5101 asmh = aa64_asm_open(c); 5102 aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 5103 nclob); 5104 aa64_asm_run_template(asmh, t->mc, tmpl); 5105 aa64_asm_close(asmh); 5106 5107 for (u32 i = 0; i < nout; ++i) { 5108 NativeAllocClass cls; 5109 NativeLoc src; 5110 if (!staged_outs || !staged_outs[i]) continue; 5111 if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue; 5112 cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? 
NATIVE_REG_FP 5113 : NATIVE_REG_INT; 5114 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 5115 aa_asm_store_reg_to_loc(a, loc, out_locs[i], src); 5116 } 5117 } 5118 5119 static const NativeOps aa_direct_ops = { 5120 .bind_param = aa_bind_param, 5121 .tail_call_unrealizable_reason = aa_no_tail, 5122 .va_start_ = aa_va_start_, 5123 .va_arg_ = aa_va_arg_, 5124 .va_end_ = aa_va_end_, 5125 .va_copy_ = aa_va_copy_, 5126 .asm_block = aa_direct_asm_block, 5127 }; 5128 5129 const NativeOps* aa64_native_direct_ops(void) { return &aa_direct_ops; }