native.c - kit

native.c (171676B)
      1 /* src/arch/x64/native.c — x86-64 (SysV / Win64) NativeTarget implementation.
      2  *
      3  * Mirrors the rv64 reference (src/arch/rv64/native.c): a physical-emission
      4  * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by
      5  * the optimizer emit path. ABI decisions route through abi/ and the per-OS
      6  * X64ABIRegs (x64_abi_for_os); this file owns ISA emission and the x64 frame
      7  * layout.
      8  *
      9  * Frame model (single, rbp-anchored): the prologue does `push rbp; mov rbp,rsp;
     10  * sub rsp,frame_size`. Local/spill slots live below rbp at positive byte
     11  * offsets `off` (address = rbp - off). Incoming stack args sit above the saved
     12  * return address at [rbp + 16 + shadow_space + ...]. Callee-saved GPRs (and, on
     13  * Win64, XMMs) are saved below the locals; outgoing args sit at [rsp + 0..].
     14  * The single-pass (-O0) prologue reserves a NOP placeholder patched in func_end
     15  * once max_outgoing and callee-saves are known.
     16  *
     17  * Register model. INT scratch (never allocable, never driver scratch): R10 and
     18  * R11 — the emit paths' fixed temporaries. FP scratch: XMM14 and XMM15. RSP/RBP
     19  * are reserved (stack/frame pointers). RAX is reserved too (return value, the
     20  * div/mul implicit operand), but it is NOT an emit temp, so inline asm may pin
     21  * an operand to it (the Linux syscall idiom) — see x64_asm_operand_reg_ok.
     22  * The driver scratch pool is R8/R9 (int) and XMM4/XMM5 (fp), caller-saved on
     23  * both SysV and Win64 and disjoint from the emit temps so a hook never clobbers
     24  * an operand parked there. Scratch registers are reserved from allocation.
     25  * Callee-saved set is resolved per-OS via x64_abi_for_os at runtime (the
     26  * legality masks below are SysV's, the conservative superset that both ABIs'
     27  * allocators respect — Win64's extra callee-saves RDI/RSI/xmm6-15 only shrink
     28  * the allocable pool, never grow it). */
     29 
     30 #include <string.h>
     31 
     32 #include "abi/abi.h"
     33 #include "arch/x64/asm.h"
     34 #include "arch/x64/emit.h"
     35 #include "arch/x64/isa.h"
     36 #include "arch/x64/regs.h"
     37 #include "arch/x64/x64.h"
     38 #include "asm/asm.h"
     39 #include "asm/asm_lex.h"
     40 #include "cg/native_argmove.h"
     41 #include "cg/native_asm.h"
     42 #include "cg/native_direct_target.h"
     43 #include "cg/native_frame.h"
     44 #include "cg/type.h"
     45 #include "core/arena.h"
     46 #include "core/bytes.h"
     47 #include "core/pool.h"
     48 #include "core/slice.h"
     49 #include "obj/obj.h"
     50 
/* File-local constants: the fixed emit temporaries (two int, two fp) and the
 * capacities of the fixed-size tables used by this target. */
enum {
  X64_TMP_INT = X64_R10,      /* emit-internal int scratch (reserved) */
  X64_TMP_INT2 = X64_R11,     /* emit-internal int scratch (reserved) */
  X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved): xmm14 */
  X64_TMP_FP2 = X64_XMM15,    /* emit-internal fp scratch (reserved) */
  /* NOTE(review): presumably bounds the register arg-move list built for a
   * call; its users are outside this part of the file — confirm. */
  X64_MAX_REG_ARG_MOVES = 16u,
  /* Deferred entry register-binds (-O1): bounded by simultaneously-live
   * register-homed param parts, i.e. the allocable register count. Sizes
   * X64NativeTarget.bind_moves. */
  X64_MAX_BIND_MOVES = 32u,
  X64_MAX_CS_FP_REGS = 10u, /* Win64 xmm6..xmm15 */
};
     62 
     63 /* ============================ target state ============================ */
     64 
     65 /* Frame slots and callee-save records live in the shared NativeFrame
     66  * bookkeeping (cg/native_frame.h); these aliases keep the x64-local spellings.
     67  * x64 reads only .reg/.cls of a callee-save (it computes save offsets below the
     68  * locals rather than homing them in frame slots, so .slot/.type stay unused).
     69  */
/* x64-local names for the shared frame-slot and callee-save record types. */
typedef NativeFrameSlotEntry X64NativeSlot;
typedef NativeFrameCalleeSave X64CalleeSave;

/* Kinds of deferred code patches. Only one kind exists today.
 * NOTE(review): presumably an alloca-related displacement that is fixed up
 * once the frame is final — the patch application code is outside this part
 * of the file; confirm there. */
typedef enum X64PatchKind { X64_PATCH_ALLOCA } X64PatchKind;

/* One recorded patch site: what to patch and where in the emitted code. */
typedef struct X64Patch {
  u8 kind; /* X64PatchKind */
  u32 pos; /* byte offset of the disp32 to patch */
} X64Patch;
     79 
/* Per-target state of the x64 NativeTarget implementation. `base` is the first
 * member; x64_of() casts a NativeTarget* back to this type. */
typedef struct X64NativeTarget {
  NativeTarget base;
  SrcLoc loc; /* source location reported by x64_panic */
  const CGFuncDesc* func;

  /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
   * set, and the known_frame / has_alloca / frame_final flags. */
  NativeFrame frame;
  u32 frame_size_final;

  u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
  u32 next_param_int;
  u32 next_param_fp;
  u32 next_param_stack;
  u8 has_sret;
  u8 is_variadic;
  NativeFrameSlot sret_ptr_slot;
  NativeFrameSlot reg_save_slot; /* SysV variadic 176B __va_list_tag area */

  /* Recorded patch sites (see X64Patch): array, used count, capacity. */
  X64Patch* patches;
  u32 npatches;
  u32 patches_cap;
  u32 nalloca;

  u32 func_start;
  u32 prologue_pos;
  u32 prologue_nbytes;
  MCLabel epilogue_label;

  /* Known-frame (-O1) prologue cost-model tiers, settled in
   * x64_func_begin_known_frame; both 0 on the single-pass path (which can't
   * know the frame up front). Either one suppresses the `sub rsp` reservation;
   * the rbp frame record (push rbp; mov rbp,rsp) and every rbp-relative offset
   * stay unchanged, so the epilogue (`leave`), CFI (CFA = rbp+16), and debug
   * locs are identical to the fat shape.
   *
   *   slim_frame   - empty frame (no callee-saves/locals/outgoing/alloca): the
   *                  `sub rsp` reserved nothing, so it is simply dropped. Safe
   *                  for non-leaves (push rbp keeps rsp 16-aligned for calls,
   *                  and nothing lives below rsp). SysV + Win64.
   *   redzone_leaf - SysV leaf with a small frame (<= 128B, no alloca, no
   *                  outgoing args): locals/callee-saves stay at their
   *                  rbp-relative offsets, which now land in the 128-byte red
   *                  zone instead of a reserved region. Leaf-only — a call
   *                  would clobber the red zone. */
  u8 slim_frame;
  u8 redzone_leaf;

  /* Optimizer (-O1) entry binds: register-destination param binds are deferred
   * here and resolved as a parallel copy in x64_bind_params_end, since the
   * allocator may rotate params across the incoming arg registers — a
   * permutation the naive per-param move order would clobber. */
  NativeArgMove bind_moves[X64_MAX_BIND_MOVES];
  u32 nbind_moves;

  /* Per-OS ABI register description (see x64_abi_for_os in the file header). */
  const X64ABIRegs* abi;
} X64NativeTarget;
    134 
    135 static X64NativeTarget* x64_of(NativeTarget* t) { return (X64NativeTarget*)t; }
    136 
/* Abort compilation with an "x64 native target: <msg>" diagnostic attributed
 * to the target's current source location (a->loc). Never returns. */
static _Noreturn void x64_panic(X64NativeTarget* a, const char* msg) {
  compiler_panic(a->base.c, a->loc, "x64 native target: %s", msg);
}
    140 
    141 static X64NativeSlot* x64_slot_get(X64NativeTarget* a, NativeFrameSlot fs) {
    142   return native_frame_slot_at(&a->frame, fs);
    143 }
    144 
    145 static u32 align_up_u32(u32 v, u32 align) {
    146   u32 mask = align ? align - 1u : 0u;
    147   return (v + mask) & ~mask;
    148 }
    149 
    150 /* ============================ type helpers ============================ */
    151 
    152 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h
    153  * (native_type_size, native_type_align, native_mem_for_type,
    154  * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack,
    155  * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */
    156 
    157 /* A scalar value occupies a 64-bit register when it is pointer-sized or wider
    158  * (drives REX.W selection). */
    159 static int x64_is_64(NativeTarget* t, KitCgTypeId type) {
    160   return native_type_size(t, type) >= 8u || cg_type_is_ptr(t->c, type);
    161 }
    162 
    163 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0xfu; }
    164 
    165 /* SSE scalar prefix: F2 (double / 8-byte) vs F3 (single / 4-byte). */
    166 static u8 sse_scalar_prefix(u32 size) { return size == 8u ? 0xF2u : 0xF3u; }
    167 
    168 /* Forward decls for the rel32 branch emitters (used by convert before the
    169  * control-flow section defines them). */
    170 static void emit_jmp_rel32(MCEmitter* mc, MCLabel l);
    171 static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l);
    172 
    173 /* ============================ register tables ============================ */
    174 
    175 #define X64_PHYS_INT_ARG(r)                                                  \
    176   {.reg = (r),                                                               \
    177    .cls = NATIVE_REG_INT,                                                    \
    178    .abi_index = 0xffu,                                                       \
    179    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \
    180    .spill_cost = 1u,                                                         \
    181    .copy_cost = 1u}
    182 #define X64_PHYS_INT_ARG_RESERVED(r)                                        \
    183   {.reg = (r),                                                              \
    184    .cls = NATIVE_REG_INT,                                                   \
    185    .abi_index = 0xffu,                                                      \
    186    .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \
    187    .spill_cost = 0u,                                                        \
    188    .copy_cost = 0u}
    189 #define X64_PHYS_INT_RET_ARG(r)                                               \
    190   {.reg = (r),                                                                \
    191    .cls = NATIVE_REG_INT,                                                     \
    192    .abi_index = 0xffu,                                                        \
    193    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
    194             NATIVE_REG_RET,                                                   \
    195    .spill_cost = 1u,                                                          \
    196    .copy_cost = 1u}
    197 #define X64_PHYS_INT_CALLER(r)                              \
    198   {.reg = (r),                                              \
    199    .cls = NATIVE_REG_INT,                                   \
    200    .abi_index = 0xffu,                                      \
    201    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
    202    .spill_cost = 1u,                                        \
    203    .copy_cost = 1u}
    204 #define X64_PHYS_INT_CALLEE(r)                              \
    205   {.reg = (r),                                              \
    206    .cls = NATIVE_REG_INT,                                   \
    207    .abi_index = 0xffu,                                      \
    208    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
    209    .spill_cost = 4u,                                        \
    210    .copy_cost = 1u}
    211 #define X64_PHYS_INT_RESERVED(r) \
    212   {.reg = (r),                   \
    213    .cls = NATIVE_REG_INT,        \
    214    .abi_index = 0xffu,           \
    215    .flags = NATIVE_REG_RESERVED, \
    216    .spill_cost = 0u,             \
    217    .copy_cost = 0u}
    218 
/* Allocable int pool, opt's spill/reload set: only r13/r14/r15. R8/R9 are the
 * driver scratch pool; R10/R11 are emit scratch (reserved); RAX is reserved
 * (return / div-mul, asm-pinnable). */
static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15};
static const Reg x64_int_scratch[] = {X64_R8, X64_R9};

/* Per-register properties for all 16 GPRs, one entry per register (listed in
 * what appears to be hardware-encoding order — confirm against regs.h).
 * RBX and R12 are marked reserved here even though x64_classes lists them in
 * the callee-saved mask. */
static const NativePhysRegInfo x64_int_phys[] = {
    X64_PHYS_INT_RESERVED(X64_RAX), /* return / div-mul (asm-pinnable) */
    X64_PHYS_INT_ARG(X64_RCX),
    X64_PHYS_INT_RET_ARG(X64_RDX),
    X64_PHYS_INT_RESERVED(X64_RBX),
    X64_PHYS_INT_RESERVED(X64_RSP), /* stack pointer */
    X64_PHYS_INT_RESERVED(X64_RBP), /* frame pointer */
    X64_PHYS_INT_ARG(X64_RSI),
    X64_PHYS_INT_ARG(X64_RDI),
    X64_PHYS_INT_ARG_RESERVED(X64_R8), /* driver scratch */
    X64_PHYS_INT_ARG_RESERVED(X64_R9), /* driver scratch */
    X64_PHYS_INT_RESERVED(X64_R10),    /* emit scratch */
    X64_PHYS_INT_RESERVED(X64_R11),    /* emit scratch */
    X64_PHYS_INT_RESERVED(X64_R12),
    X64_PHYS_INT_CALLEE(X64_R13),
    X64_PHYS_INT_CALLEE(X64_R14),
    X64_PHYS_INT_CALLEE(X64_R15),
};
    243 
    244 #define X64_PHYS_FP_ARG_RET(r)                                                \
    245   {.reg = (r),                                                                \
    246    .cls = NATIVE_REG_FP,                                                      \
    247    .abi_index = 0xffu,                                                        \
    248    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
    249             NATIVE_REG_RET,                                                   \
    250    .spill_cost = 1u,                                                          \
    251    .copy_cost = 1u}
    252 #define X64_PHYS_FP_ARG(r)                                                   \
    253   {.reg = (r),                                                               \
    254    .cls = NATIVE_REG_FP,                                                     \
    255    .abi_index = 0xffu,                                                       \
    256    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \
    257    .spill_cost = 1u,                                                         \
    258    .copy_cost = 1u}
    259 #define X64_PHYS_FP_ARG_RESERVED(r)                                         \
    260   {.reg = (r),                                                              \
    261    .cls = NATIVE_REG_FP,                                                    \
    262    .abi_index = 0xffu,                                                      \
    263    .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \
    264    .spill_cost = 0u,                                                        \
    265    .copy_cost = 0u}
    266 #define X64_PHYS_FP_CALLER(r)                               \
    267   {.reg = (r),                                              \
    268    .cls = NATIVE_REG_FP,                                    \
    269    .abi_index = 0xffu,                                      \
    270    .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
    271    .spill_cost = 1u,                                        \
    272    .copy_cost = 1u}
    273 #define X64_PHYS_FP_RESERVED(r)  \
    274   {.reg = (r),                   \
    275    .cls = NATIVE_REG_FP,         \
    276    .abi_index = 0xffu,           \
    277    .flags = NATIVE_REG_RESERVED, \
    278    .spill_cost = 0u,             \
    279    .copy_cost = 0u}
    280 
/* Allocable FP pool: xmm6..xmm11 (keep arg/ret xmm0..5 clear). xmm4/xmm5 are
 * driver scratch; xmm14/xmm15 are emit scratch.
 * NOTE(review): x64_classes' FP arg_mask is 0xff (xmm0..xmm7), so xmm6/xmm7
 * are in both the allocable pool and the argument mask, while their phys
 * entries below carry no NATIVE_REG_ARG flag — confirm this is intended. */
static const Reg x64_fp_allocable[] = {
    X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11};
static const Reg x64_fp_scratch[] = {X64_XMM4, X64_XMM5};

/* Per-register properties for xmm0..xmm15, one entry per register in
 * ascending order. xmm12/xmm13 are reserved in addition to the emit scratch
 * pair xmm14/xmm15. */
static const NativePhysRegInfo x64_fp_phys[] = {
    X64_PHYS_FP_ARG_RET(X64_XMM0),       X64_PHYS_FP_ARG_RET(X64_XMM1),
    X64_PHYS_FP_ARG(X64_XMM2),           X64_PHYS_FP_ARG(X64_XMM3),
    X64_PHYS_FP_ARG_RESERVED(X64_XMM4),  X64_PHYS_FP_ARG_RESERVED(X64_XMM5),
    X64_PHYS_FP_CALLER(X64_XMM6),        X64_PHYS_FP_CALLER(X64_XMM7),
    X64_PHYS_FP_CALLER(X64_XMM8),        X64_PHYS_FP_CALLER(X64_XMM0 + 9),
    X64_PHYS_FP_CALLER(X64_XMM0 + 10),   X64_PHYS_FP_CALLER(X64_XMM0 + 11),
    X64_PHYS_FP_RESERVED(X64_XMM0 + 12), X64_PHYS_FP_RESERVED(X64_XMM0 + 13),
    X64_PHYS_FP_RESERVED(X64_XMM0 + 14), /* emit scratch */
    X64_PHYS_FP_RESERVED(X64_XMM15),     /* emit scratch */
};
    298 
/* Allocation-class descriptors handed to the allocator through x64_reg_info:
 * one entry for the integer class and one for the FP class. Each ties together
 * the allocable/scratch pools, the per-register table, and the SysV
 * caller/callee/arg/ret/reserved bit masks (bit n = register number n). */
static const NativeAllocClassInfo x64_classes[] = {
    {.cls = NATIVE_REG_INT,
     .allocable = x64_int_allocable,
     .nallocable = sizeof x64_int_allocable / sizeof x64_int_allocable[0],
     .scratch = x64_int_scratch,
     .nscratch = sizeof x64_int_scratch / sizeof x64_int_scratch[0],
     .phys = x64_int_phys,
     .nphys = sizeof x64_int_phys / sizeof x64_int_phys[0],
     /* caller-saved: rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 (SysV) */
     .caller_saved_mask = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) |
                          (1u << X64_RSI) | (1u << X64_RDI) | (1u << X64_R8) |
                          (1u << X64_R9) | (1u << X64_R10) | (1u << X64_R11),
     /* callee-saved: rbx,r12,r13,r14,r15 (rbp handled by prologue head) */
     .callee_saved_mask = (1u << X64_RBX) | (1u << X64_R12) | (1u << X64_R13) |
                          (1u << X64_R14) | (1u << X64_R15),
     /* SysV arg regs rdi,rsi,rdx,rcx,r8,r9 */
     .arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) |
                 (1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9),
     .ret_mask = (1u << X64_RAX) | (1u << X64_RDX),
     /* rax, rsp, rbp reserved; r8/r9 driver scratch; r10/r11 emit scratch;
      * rbx and r12 are reserved as well (matching x64_int_phys). */
     .reserved_mask = (1u << X64_RAX) | (1u << X64_RSP) | (1u << X64_RBP) |
                      (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10) |
                      (1u << X64_R11) | (1u << X64_RBX) | (1u << X64_R12)},
    {.cls = NATIVE_REG_FP,
     .allocable = x64_fp_allocable,
     .nallocable = sizeof x64_fp_allocable / sizeof x64_fp_allocable[0],
     .scratch = x64_fp_scratch,
     .nscratch = sizeof x64_fp_scratch / sizeof x64_fp_scratch[0],
     .phys = x64_fp_phys,
     .nphys = sizeof x64_fp_phys / sizeof x64_fp_phys[0],
     /* All xmm caller-saved on SysV. */
     .caller_saved_mask = 0xffffu,
     .callee_saved_mask = 0u,
     .arg_mask = 0xffu, /* xmm0..xmm7 */
     .ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1),
     /* xmm4/xmm5 driver scratch; xmm14/xmm15 emit scratch; xmm12/xmm13 are
      * reserved as well (matching x64_fp_phys). */
     .reserved_mask = (1u << X64_XMM4) | (1u << X64_XMM5) |
                      (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) |
                      (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15)},
};
    339 
    340 /* Resolve a register name ("r10", "xmm3", ...) to its (class, Reg). Powers the
    341  * optimizer's inline-asm clobber masks and explicit hard-register operands
    342  * ("{r10}" from a GNU local register variable). GPR names map through the HW
    343  * encoding; xmm names through the DWARF index table. Returns non-zero for a
    344  * non-register name (cc/memory/unknown), which the caller skips. */
    345 static int x64_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
    346                             NativeAllocClass* cls_out) {
    347   char buf[16];
    348   uint32_t idx;
    349   (void)ri;
    350   if (!name.s || !name.len || name.len >= sizeof buf) return 1;
    351   memcpy(buf, name.s, name.len);
    352   buf[name.len] = '\0';
    353   if (x64_register_hw_index(buf, &idx) == 0 && idx <= 15u) {
    354     *cls_out = NATIVE_REG_INT;
    355     *out = (Reg)idx;
    356     return 0;
    357   }
    358   if (x64_register_index(buf, &idx) == 0 && idx >= 17u && idx <= 32u) {
    359     *cls_out = NATIVE_REG_FP;
    360     *out = (Reg)(idx - 17u);
    361     return 0;
    362   }
    363   return 1;
    364 }
    365 
    366 static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
    367                                   Reg reg) {
    368   (void)ri;
    369   if (cls == NATIVE_REG_INT) {
    370     switch (reg) {
    371       /* RAX is reserved but not an emit temp, so it is a legal asm pin (the
    372        * Linux syscall number/return register). R8/R9 are driver scratch and
    373        * R10/R11 are emit scratch, so those stay excluded. */
    374       case X64_RAX:
    375       case X64_RBX:
    376       case X64_RCX:
    377       case X64_RDX:
    378       case X64_RSI:
    379       case X64_RDI:
    380       case X64_R12:
    381       case X64_R13:
    382       case X64_R14:
    383       case X64_R15:
    384         return 1;
    385       default:
    386         return 0;
    387     }
    388   }
    389   if (cls == NATIVE_REG_FP)
    390     return reg <= X64_XMM0 + 13u && reg != X64_XMM4 && reg != X64_XMM5;
    391   return 0;
    392 }
    393 
    394 static int x64_asm_constraint_reg(const NativeRegInfo* ri, const char* body,
    395                                   NativeAllocClass* cls_out, Reg* fixed_out,
    396                                   u32* allowed_mask_out) {
    397   (void)ri;
    398   if (!body || !body[0] || body[1]) return 0;
    399   if (fixed_out) *fixed_out = REG_NONE;
    400   if (allowed_mask_out) *allowed_mask_out = 0;
    401   switch (body[0]) {
    402     case 'r':
    403     case 'q':
    404       if (cls_out) *cls_out = NATIVE_REG_INT;
    405       return 1;
    406     case 'a':
    407       if (cls_out) *cls_out = NATIVE_REG_INT;
    408       if (fixed_out) *fixed_out = X64_RAX;
    409       return 1;
    410     case 'b':
    411       if (cls_out) *cls_out = NATIVE_REG_INT;
    412       if (fixed_out) *fixed_out = X64_RBX;
    413       return 1;
    414     case 'c':
    415       if (cls_out) *cls_out = NATIVE_REG_INT;
    416       if (fixed_out) *fixed_out = X64_RCX;
    417       return 1;
    418     case 'd':
    419       if (cls_out) *cls_out = NATIVE_REG_INT;
    420       if (fixed_out) *fixed_out = X64_RDX;
    421       return 1;
    422     case 'S':
    423       if (cls_out) *cls_out = NATIVE_REG_INT;
    424       if (fixed_out) *fixed_out = X64_RSI;
    425       return 1;
    426     case 'D':
    427       if (cls_out) *cls_out = NATIVE_REG_INT;
    428       if (fixed_out) *fixed_out = X64_RDI;
    429       return 1;
    430     case 'x':
    431     case 'v':
    432       if (cls_out) *cls_out = NATIVE_REG_FP;
    433       return 1;
    434     default:
    435       return 0;
    436   }
    437 }
    438 
/* Register description for x64: the two allocation classes (int, fp) plus the
 * inline-asm hooks for name resolution, operand-pin legality, and constraint
 * letters. */
static const NativeRegInfo x64_reg_info = {
    .classes = x64_classes,
    .nclasses = sizeof x64_classes / sizeof x64_classes[0],
    .resolve_name = x64_resolve_name,
    .asm_operand_reg_ok = x64_asm_operand_reg_ok,
    .asm_constraint_reg = x64_asm_constraint_reg,
};
    446 
    447 /* ============================ legality ============================ */
    448 
    449 static int x64_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
    450                          KitCgTypeId type, i64 imm) {
    451   (void)t;
    452   (void)type;
    453   switch (use) {
    454     case NATIVE_IMM_MOVE:
    455       return 1;
    456     case NATIVE_IMM_BINOP:
    457       switch ((BinOp)op) {
    458         case BO_IADD:
    459         case BO_ISUB:
    460         case BO_AND:
    461         case BO_OR:
    462         case BO_XOR:
    463         case BO_IMUL:
    464           return imm_fits_i32(imm);
    465         case BO_SHL:
    466         case BO_SHR_S:
    467         case BO_SHR_U:
    468           return imm >= 0 && imm <= 63;
    469         default:
    470           return 0;
    471       }
    472     case NATIVE_IMM_CMP:
    473       return imm_fits_i32(imm);
    474     case NATIVE_IMM_ADDR_OFFSET:
    475       return imm_fits_i32(imm);
    476   }
    477   return 0;
    478 }
    479 
    480 static int x64_addr_legal(NativeTarget* t, const NativeAddr* addr,
    481                           MemAccess mem) {
    482   (void)t;
    483   (void)mem;
    484   if (!addr) return 0;
    485   if (addr->base_kind != NATIVE_ADDR_BASE_REG &&
    486       addr->base_kind != NATIVE_ADDR_BASE_FRAME)
    487     return 0;
    488   /* x64 supports [base + index*scale + disp32]; index must be a register. */
    489   if (addr->index_kind != NATIVE_ADDR_INDEX_NONE &&
    490       addr->index_kind != NATIVE_ADDR_INDEX_REG)
    491     return 0;
    492   return imm_fits_i32(addr->offset);
    493 }
    494 
    495 /* ============================ globals / addresses ============================
    496  */
    497 
/* Non-zero when a reference to `sym` must be routed through the GOT rather
 * than addressed rip-relative directly. Defers entirely to
 * obj_symbol_extern_via_got. */
static int x64_use_got_for_sym(NativeTarget* t, ObjSymId sym) {
  return obj_symbol_extern_via_got(t->c, t->obj, sym);
}
    501 
    502 /* PC-relative reloc kind for a non-GOT &sym reference. Functions use PLT32 so
    503  * the linker can route through a PLT; data uses plain PC32. */
    504 static u32 x64_pcrel_reloc_for_sym(NativeTarget* t, ObjSymId sym) {
    505   const ObjSym* s = obj_symbol_get(t->obj, sym);
    506   if (s && (s->kind == SK_FUNC || s->kind == SK_IFUNC)) return R_X64_PLT32;
    507   return R_PC32;
    508 }
    509 
    510 /* Materialize &sym + addend into dst_reg. Local/static-link symbols use
    511  * `lea rd, [rip + disp32]`; GOT-routed externs use `mov rd, [rip + GOT]` then
    512  * add any nonzero addend. */
    513 static void x64_emit_global_lea(NativeTarget* t, u32 dst_reg, ObjSymId sym,
    514                                 i64 addend) {
    515   MCEmitter* mc = t->mc;
    516   u32 sec = mc->section_id;
    517   if (x64_use_got_for_sym(t, sym)) {
    518     u8 op;
    519     u32 disp_pos;
    520     emit_rex(mc, 1, dst_reg, 0, 0);
    521     op = X64_OPC_MOV_R_RM;
    522     mc->emit_bytes(mc, &op, 1);
    523     {
    524       u8 mr = modrm(0u, dst_reg & 7u, 5u); /* [rip + disp32] */
    525       mc->emit_bytes(mc, &mr, 1);
    526     }
    527     disp_pos = mc->pos(mc);
    528     emit_u32le(mc, 0);
    529     mc->emit_reloc_at(mc, sec, disp_pos, R_X64_REX_GOTPCRELX, sym, -4, 1, 0);
    530     if (addend) {
    531       i32 a = (i32)addend;
    532       emit_rex(mc, 1, 0, 0, dst_reg);
    533       if (imm_fits_i8(a)) {
    534         u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, dst_reg & 7u),
    535                      (u8)a};
    536         mc->emit_bytes(mc, buf, 3);
    537       } else {
    538         u8 buf[2] = {X64_OPC_ALU_IMM32,
    539                      modrm(3u, X64_ALU_SUB_ADD, dst_reg & 7u)};
    540         mc->emit_bytes(mc, buf, 2);
    541         emit_u32le(mc, (u32)a);
    542       }
    543     }
    544     return;
    545   }
    546   {
    547     u8 op = X64_OPC_LEA;
    548     u32 disp_pos;
    549     emit_rex(mc, 1, dst_reg, 0, 0);
    550     mc->emit_bytes(mc, &op, 1);
    551     {
    552       u8 mr = modrm(0u, dst_reg & 7u, 5u); /* [rip + disp32] */
    553       mc->emit_bytes(mc, &mr, 1);
    554     }
    555     disp_pos = mc->pos(mc);
    556     emit_u32le(mc, 0);
    557     mc->emit_reloc_at(mc, sec, disp_pos, x64_pcrel_reloc_for_sym(t, sym), sym,
    558                       addend - 4, 1, 0);
    559   }
    560 }
    561 
    562 /* Resolve a NativeAddr to (base, index, log2_scale, off). Materializes
    563  * FRAME/FRAME_VALUE/GLOBAL bases into the supplied scratch register. */
    564 static u32 x64_resolve_addr(X64NativeTarget* a, const NativeAddr* addr,
    565                             u32 scratch, u32* idx_out, u32* scale_out,
    566                             i32* off_out) {
    567   NativeTarget* t = &a->base;
    568   u32 base;
    569   i32 off;
    570   switch (addr->base_kind) {
    571     case NATIVE_ADDR_BASE_REG:
    572       base = addr->base.reg & 0xfu;
    573       off = addr->offset;
    574       break;
    575     case NATIVE_ADDR_BASE_FRAME: {
    576       X64NativeSlot* s = x64_slot_get(a, addr->base.frame);
    577       base = X64_RBP;
    578       off = -(i32)s->off + addr->offset;
    579       break;
    580     }
    581     case NATIVE_ADDR_BASE_FRAME_VALUE: {
    582       X64NativeSlot* s = x64_slot_get(a, addr->base.frame);
    583       emit_mov_load(t->mc, 8, 0, scratch, X64_RBP, -(i32)s->off);
    584       base = scratch;
    585       off = addr->offset;
    586       break;
    587     }
    588     case NATIVE_ADDR_BASE_GLOBAL:
    589       x64_emit_global_lea(t, scratch, addr->base.global.sym,
    590                           addr->base.global.addend);
    591       base = scratch;
    592       off = addr->offset;
    593       break;
    594     default:
    595       x64_panic(a, "unsupported address base");
    596   }
    597   if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
    598     *idx_out = addr->index.reg & 0xfu;
    599     *scale_out = addr->log2_scale;
    600   } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) {
    601     X64NativeSlot* s = x64_slot_get(a, addr->index.frame);
    602     emit_mov_load(t->mc, 8, 0, X64_TMP_INT2, X64_RBP, -(i32)s->off);
    603     *idx_out = X64_TMP_INT2;
    604     *scale_out = addr->log2_scale;
    605   } else {
    606     *idx_out = REG_NONE;
    607     *scale_out = 0;
    608   }
    609   *off_out = off;
    610   return base;
    611 }
    612 
    613 /* ============================ memory ============================ */
    614 
    615 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem.
    616  * Materializes the address through X64_TMP_INT2 (r11) for non-reg bases. */
    617 static void x64_emit_mem(X64NativeTarget* a, int is_load, NativeLoc reg,
    618                          NativeAddr addr, MemAccess mem) {
    619   NativeTarget* t = &a->base;
    620   MCEmitter* mc = t->mc;
    621   u32 r = loc_reg(reg);
    622   int fp = native_loc_is_fp(reg);
    623   u32 sz = mem.size ? mem.size : native_type_size(t, reg.type);
    624   u32 base, idx, scale;
    625   i32 off;
    626 
    627   /* Global base: fold into a single rip-relative access when local. */
    628   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
    629       addr.index_kind == NATIVE_ADDR_INDEX_NONE &&
    630       !x64_use_got_for_sym(t, addr.base.global.sym)) {
    631     ObjSymId sym = addr.base.global.sym;
    632     i64 ad = addr.base.global.addend + addr.offset;
    633     u32 sec = mc->section_id;
    634     u32 disp_pos;
    635     if (fp) {
    636       u8 prefix = sse_scalar_prefix(sz);
    637       mc->emit_bytes(mc, &prefix, 1);
    638       emit_rex(mc, 0, r, 0, 0);
    639       {
    640         u8 op2[2] = {X64_OPC_TWOBYTE, (u8)(is_load ? 0x10u : 0x11u)};
    641         mc->emit_bytes(mc, op2, 2);
    642       }
    643     } else if (sz == 8 || sz == 4) {
    644       emit_rex(mc, sz == 8, r, 0, 0);
    645       {
    646         u8 op = is_load ? X64_OPC_MOV_R_RM : X64_OPC_MOV_RM_R;
    647         mc->emit_bytes(mc, &op, 1);
    648       }
    649     } else if (sz == 2) {
    650       if (is_load) {
    651         emit_rex(mc, 0, r, 0, 0);
    652         {
    653           u8 op2[2] = {X64_OPC_TWOBYTE, X64_OPC_MOVZX_W};
    654           mc->emit_bytes(mc, op2, 2);
    655         }
    656       } else {
    657         u8 p = X64_OPSIZE_PFX;
    658         mc->emit_bytes(mc, &p, 1);
    659         emit_rex(mc, 0, r, 0, 0);
    660         {
    661           u8 op = X64_OPC_MOV_RM_R;
    662           mc->emit_bytes(mc, &op, 1);
    663         }
    664       }
    665     } else { /* size 1 */
    666       if (is_load) {
    667         emit_rex(mc, 0, r, 0, 0);
    668         {
    669           u8 op2[2] = {X64_OPC_TWOBYTE, X64_OPC_MOVZX_B};
    670           mc->emit_bytes(mc, op2, 2);
    671         }
    672       } else {
    673         emit_rex_force(mc, 0, r, 0, 0);
    674         {
    675           u8 op = X64_OPC_MOV_RM_R8;
    676           mc->emit_bytes(mc, &op, 1);
    677         }
    678       }
    679     }
    680     {
    681       u8 mr = modrm(0u, r & 7u, 5u);
    682       mc->emit_bytes(mc, &mr, 1);
    683     }
    684     disp_pos = mc->pos(mc);
    685     emit_u32le(mc, 0);
    686     mc->emit_reloc_at(mc, sec, disp_pos, x64_pcrel_reloc_for_sym(t, sym), sym,
    687                       ad - 4, 1, 0);
    688     return;
    689   }
    690 
    691   base = x64_resolve_addr(a, &addr, X64_TMP_INT2, &idx, &scale, &off);
    692   if (fp) {
    693     u8 prefix = sse_scalar_prefix(sz);
    694     if (is_load)
    695       emit_sse_load_idx(mc, prefix, 0x10, r, base, idx, scale, off);
    696     else
    697       emit_sse_store_idx(mc, prefix, 0x11, r, base, idx, scale, off);
    698   } else if (is_load) {
    699     /* Loads narrower than 4 bytes zero-extend (sign-extension is applied by a
    700      * later CV_SEXT). */
    701     emit_mov_load_idx(mc, sz, 0, r, base, idx, scale, off);
    702   } else {
    703     emit_mov_store_idx(mc, sz, r, base, idx, scale, off);
    704   }
    705 }
    706 
    707 /* ============================ moves / data ============================ */
    708 
    709 static void x64_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
    710   MCEmitter* mc = t->mc;
    711   int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src);
    712   u32 rd = loc_reg(dst), rs = loc_reg(src);
    713   if (dfp && sfp) {
    714     if (rd == rs) return;
    715     emit_sse_rr(mc, sse_scalar_prefix(native_type_size(t, dst.type)), 0x10, rd,
    716                 rs);
    717     return;
    718   }
    719   if (dfp && !sfp) { /* movd/movq gpr -> xmm: 66 0F 6E /r */
    720     int w = native_type_size(t, dst.type) == 8u;
    721     emit_sse_rr_w(mc, 0x66, 0x6E, w, rd, rs);
    722     return;
    723   }
    724   if (!dfp && sfp) { /* movd/movq xmm -> gpr: 66 0F 7E /r (xmm is reg field) */
    725     int w = native_type_size(t, src.type) == 8u;
    726     emit_sse_rr_w(mc, 0x66, 0x7E, w, rs, rd);
    727     return;
    728   }
    729   if (rd == rs) return;
    730   emit_mov_rr(mc, x64_is_64(t, dst.type) ? 1 : 0, rd, rs);
    731 }
    732 
    733 static void x64_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) {
    734   x64_emit_load_imm(t->mc, x64_is_64(t, dst.type) ? 1 : 0, loc_reg(dst), imm);
    735 }
    736 
    737 /* FP constant: materialize the bit pattern in a GPR scratch, then movd/movq
    738  * into the FPR. Integer constant: plain load_imm. */
    739 static void x64_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) {
    740   u64 v = 0;
    741   u32 i;
    742   for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
    743   if (!native_loc_is_fp(dst)) {
    744     x64_load_imm(t, dst, (i64)v);
    745     return;
    746   }
    747   x64_emit_load_imm(t->mc, cb.size == 8u, X64_TMP_INT, (i64)v);
    748   emit_sse_rr_w(t->mc, 0x66, 0x6E, cb.size == 8u, loc_reg(dst), X64_TMP_INT);
    749 }
    750 
    751 static void x64_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
    752   X64NativeTarget* a = x64_of(t);
    753   MCEmitter* mc = t->mc;
    754   u32 rd = loc_reg(dst);
    755   u32 base, idx, scale;
    756   i32 off;
    757   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
    758       addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
    759     x64_emit_global_lea(t, rd, addr.base.global.sym,
    760                         addr.base.global.addend + addr.offset);
    761     return;
    762   }
    763   base = x64_resolve_addr(a, &addr, rd, &idx, &scale, &off);
    764   if (idx == REG_NONE) {
    765     if (base == rd && off == 0) return; /* already &slot in rd */
    766     emit_lea(mc, rd, base, off);
    767     return;
    768   }
    769   /* lea rd, [base + idx*scale + off] */
    770   {
    771     u8 buf[16];
    772     u32 n = 0;
    773     n += x64_pack_rex(buf + n, 1, rd, idx, base);
    774     buf[n++] = X64_OPC_LEA;
    775     n += x64_pack_mem_sib(buf + n, rd, base, idx, scale, off);
    776     mc->emit_bytes(mc, buf, n);
    777   }
    778 }
    779 
    780 static void x64_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
    781                      MemAccess mem) {
    782   x64_emit_mem(x64_of(t), 1, dst, addr, mem);
    783 }
    784 static void x64_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
    785                       MemAccess mem) {
    786   x64_emit_mem(x64_of(t), 0, src, addr, mem);
    787 }
    788 
    789 /* Resolve an addressable NativeAddr to a bare base register (no index, off 0)
    790  * by emitting an lea into `scratch` when needed. */
    791 static u32 x64_addr_to_base_reg(X64NativeTarget* a, NativeAddr addr,
    792                                 u32 scratch) {
    793   MCEmitter* mc = a->base.mc;
    794   u32 base, idx, scale;
    795   i32 off;
    796   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL &&
    797       addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
    798     x64_emit_global_lea(&a->base, scratch, addr.base.global.sym,
    799                         addr.base.global.addend + addr.offset);
    800     return scratch;
    801   }
    802   base = x64_resolve_addr(a, &addr, scratch, &idx, &scale, &off);
    803   if (idx == REG_NONE && off == 0) return base;
    804   if (idx == REG_NONE) {
    805     emit_lea(mc, scratch, base, off);
    806     return scratch;
    807   }
    808   {
    809     u8 buf[16];
    810     u32 n = 0;
    811     n += x64_pack_rex(buf + n, 1, scratch, idx, base);
    812     buf[n++] = X64_OPC_LEA;
    813     n += x64_pack_mem_sib(buf + n, scratch, base, idx, scale, off);
    814     mc->emit_bytes(mc, buf, n);
    815   }
    816   return scratch;
    817 }
    818 
    819 /* copy_bytes: resolve dst into r11 and src into rax (both bare pointers), then
    820  * unrolled granule copy through rdx. dst is resolved first (its base may live
    821  * in r11 from a FRAME_VALUE load) and src second so the two never alias. */
    822 static void x64_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
    823                            AggregateAccess access) {
    824   X64NativeTarget* a = x64_of(t);
    825   /* Copy chunk by chunk (8/4/2/1) through the value scratch rax, letting
    826    * x64_emit_mem resolve each address with its own scratch (r11). Uses only the
    827    * reserved emit scratch (rax/r11) — no ad-hoc allocable temp (previously
    828    * rdx), which the optimizer may have live across the copy. */
    829   KitCgTypeId tys[4];
    830   u32 n = access.size, i = 0;
    831   tys[0] = builtin_id(KIT_CG_BUILTIN_I64);
    832   tys[1] = builtin_id(KIT_CG_BUILTIN_I32);
    833   tys[2] = builtin_id(KIT_CG_BUILTIN_I16);
    834   tys[3] = builtin_id(KIT_CG_BUILTIN_I8);
    835   while (i < n) {
    836     u32 rem = n - i, s;
    837     KitCgTypeId ty;
    838     NativeAddr sa = src, da = dst;
    839     NativeLoc val;
    840     MemAccess mem;
    841     if (rem >= 8u) {
    842       s = 8u;
    843       ty = tys[0];
    844     } else if (rem >= 4u) {
    845       s = 4u;
    846       ty = tys[1];
    847     } else if (rem >= 2u) {
    848       s = 2u;
    849       ty = tys[2];
    850     } else {
    851       s = 1u;
    852       ty = tys[3];
    853     }
    854     sa.offset += (i32)i;
    855     sa.base_type = ty;
    856     da.offset += (i32)i;
    857     da.base_type = ty;
    858     val = native_loc_reg(ty, NATIVE_REG_INT, X64_TMP_INT);
    859     memset(&mem, 0, sizeof mem);
    860     mem.type = ty;
    861     mem.size = s;
    862     mem.align = s;
    863     x64_emit_mem(a, 1, val, sa, mem); /* rax = [src + i] */
    864     x64_emit_mem(a, 0, val, da, mem); /* [dst + i] = rax */
    865     i += s;
    866   }
    867 }
    868 
    869 static void x64_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
    870                           AggregateAccess access) {
    871   X64NativeTarget* a = x64_of(t);
    872   MCEmitter* mc = t->mc;
    873   u32 dr = x64_addr_to_base_reg(a, dst, X64_TMP_INT2);
    874   u32 n = access.size, i = 0;
    875   /* Broadcast the byte across 8 bytes into rax. */
    876   if (byte_value.kind == NATIVE_LOC_IMM) {
    877     u8 b = (u8)(byte_value.v.imm & 0xffu);
    878     u64 b64 = b;
    879     b64 |= b64 << 8;
    880     b64 |= b64 << 16;
    881     b64 |= b64 << 32;
    882     x64_emit_load_imm(mc, 1, X64_RAX, (i64)b64);
    883   } else {
    884     /* Replicate the low byte of a register via multiply by 0x0101..01. */
    885     x64_emit_load_imm(mc, 1, X64_R11, (i64)0x0101010101010101ll);
    886     emit_mov_rr(mc, 1, X64_RAX, loc_reg(byte_value));
    887     emit_imul_rr(mc, 1, X64_RAX, X64_R11);
    888   }
    889   while (i + 8u <= n) {
    890     emit_mov_store(mc, 8, X64_RAX, dr, (i32)i);
    891     i += 8u;
    892   }
    893   while (i + 4u <= n) {
    894     emit_mov_store(mc, 4, X64_RAX, dr, (i32)i);
    895     i += 4u;
    896   }
    897   while (i + 2u <= n) {
    898     emit_mov_store(mc, 2, X64_RAX, dr, (i32)i);
    899     i += 2u;
    900   }
    901   while (i < n) {
    902     emit_mov_store(mc, 1, X64_RAX, dr, (i32)i);
    903     i += 1u;
    904   }
    905 }
    906 
    907 /* ============================ bitfields ============================ */
    908 
    909 static void x64_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra,
    910                               BitFieldAccess bf) {
    911   X64NativeTarget* a = x64_of(t);
    912   MCEmitter* mc = t->mc;
    913   u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
    914   int w = storage_bytes == 8u ? 1 : 0;
    915   u32 reg_size = w ? 64u : 32u;
    916   u32 lsb = bf.bit_offset;
    917   u32 width = bf.bit_width ? bf.bit_width : 1u;
    918   u32 rd = loc_reg(dst);
    919   u32 base;
    920   ra.offset += (i32)bf.storage_offset;
    921   base = x64_addr_to_base_reg(a, ra, X64_TMP_INT2);
    922   emit_mov_load(mc, storage_bytes, 0, rd, base, 0);
    923   {
    924     u8 left = (u8)(reg_size - lsb - width);
    925     u8 right = (u8)(reg_size - width);
    926     if (left) emit_shift_imm(mc, w, X64_SHIFT_SUB_SHL, rd, left);
    927     if (right)
    928       emit_shift_imm(mc, w, bf.signed_ ? X64_SHIFT_SUB_SAR : X64_SHIFT_SUB_SHR,
    929                      rd, right);
    930   }
    931 }
    932 
    933 static void x64_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src,
    934                                BitFieldAccess bf) {
    935   X64NativeTarget* a = x64_of(t);
    936   MCEmitter* mc = t->mc;
    937   u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
    938   int w = storage_bytes == 8u ? 1 : 0;
    939   u32 lsb = bf.bit_offset;
    940   u32 width = bf.bit_width ? bf.bit_width : 1u;
    941   u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
    942   u64 mask = ones << lsb;
    943   u32 src_reg = loc_reg(src);
    944   u32 base;
    945   ra.offset += (i32)bf.storage_offset;
    946   /* Stabilize the base into r11 before consuming rax/rcx/rdx scratch. */
    947   base = x64_addr_to_base_reg(a, ra, X64_TMP_INT2);
    948   /* rax = storage; rax &= ~mask. */
    949   emit_mov_load(mc, storage_bytes, 0, X64_RAX, base, 0);
    950   x64_emit_load_imm(mc, w, X64_RCX, (i64)~mask);
    951   emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RAX, X64_RCX);
    952   /* rcx = (src & ones) << lsb. */
    953   emit_mov_rr(mc, w, X64_RCX, src_reg);
    954   x64_emit_load_imm(mc, w, X64_RDX, (i64)ones);
    955   emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX);
    956   if (lsb) emit_shift_imm(mc, w, X64_SHIFT_SUB_SHL, X64_RCX, (u8)lsb);
    957   emit_alu_rr(mc, w, X64_OPC_ALU_OR, X64_RAX, X64_RCX);
    958   emit_mov_store(mc, storage_bytes, X64_RAX, base, 0);
    959 }
    960 
    961 /* ============================ arithmetic ============================ */
    962 
    963 static void x64_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
    964                       NativeLoc bop) {
    965   X64NativeTarget* a = x64_of(t);
    966   MCEmitter* mc = t->mc;
    967   u32 rd = loc_reg(dst);
    968 
    969   /* FP binops: two-address. dst = aop op bop. */
    970   if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) {
    971     u32 ra = loc_reg(aop), rb = loc_reg(bop);
    972     u8 prefix = sse_scalar_prefix(native_type_size(t, dst.type));
    973     u8 opcode;
    974     switch (op) {
    975       case BO_FADD:
    976         opcode = 0x58;
    977         break;
    978       case BO_FSUB:
    979         opcode = 0x5C;
    980         break;
    981       case BO_FMUL:
    982         opcode = 0x59;
    983         break;
    984       default:
    985         opcode = 0x5E;
    986         break; /* BO_FDIV */
    987     }
    988     if (rd == rb && rd != ra) {
    989       if (op == BO_FADD || op == BO_FMUL) { /* commutative */
    990         emit_sse_rr(mc, prefix, opcode, rd, ra);
    991         return;
    992       }
    993       /* non-commutative dst==rb: stage rb in fp scratch. */
    994       emit_sse_rr(mc, prefix, 0x10, X64_TMP_FP2, rb);
    995       emit_sse_rr(mc, prefix, 0x10, rd, ra);
    996       emit_sse_rr(mc, prefix, opcode, rd, X64_TMP_FP2);
    997       return;
    998     }
    999     if (rd != ra) emit_sse_rr(mc, prefix, 0x10, rd, ra);
   1000     emit_sse_rr(mc, prefix, opcode, rd, rb);
   1001     return;
   1002   }
   1003 
   1004   {
   1005     int w = x64_is_64(t, dst.type) ? 1 : 0;
   1006     int b_imm = bop.kind == NATIVE_LOC_IMM;
   1007     i64 imm = b_imm ? bop.v.imm : 0;
   1008     u32 ra = loc_reg(aop);
   1009 
   1010     /* Division: rax/rdx implicit; divisor must avoid rax/rdx. */
   1011     if (op == BO_SDIV || op == BO_UDIV || op == BO_SREM || op == BO_UREM) {
   1012       u32 rb;
   1013       if (ra != X64_RAX) emit_mov_rr(mc, w, X64_RAX, ra);
   1014       if (b_imm) {
   1015         x64_emit_load_imm(mc, w, X64_R11, imm);
   1016         rb = X64_R11;
   1017       } else {
   1018         rb = loc_reg(bop);
   1019         if (rb == X64_RAX || rb == X64_RDX) {
   1020           emit_mov_rr(mc, w, X64_R11, rb);
   1021           rb = X64_R11;
   1022         }
   1023       }
   1024       if (op == BO_SDIV || op == BO_SREM) {
   1025         emit_cqo_or_cdq(mc, w);
   1026         emit_f7_rm(mc, w, X64_F7_SUB_IDIV, rb);
   1027       } else {
   1028         emit_xor_self(mc, w, X64_RDX);
   1029         emit_f7_rm(mc, w, X64_F7_SUB_DIV, rb);
   1030       }
   1031       {
   1032         u32 result = (op == BO_SREM || op == BO_UREM) ? X64_RDX : X64_RAX;
   1033         if (rd != result) emit_mov_rr(mc, w, rd, result);
   1034       }
   1035       return;
   1036     }
   1037 
   1038     /* Shifts: count in CL or imm8. */
   1039     if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) {
   1040       u32 sub = (op == BO_SHL)     ? X64_SHIFT_SUB_SHL
   1041                 : (op == BO_SHR_U) ? X64_SHIFT_SUB_SHR
   1042                                    : X64_SHIFT_SUB_SAR;
   1043       if (b_imm) {
   1044         u32 wbits = w ? 64u : 32u;
   1045         if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   1046         emit_shift_imm(mc, w, sub, rd, (u8)((u64)imm & (wbits - 1u)));
   1047         return;
   1048       }
   1049       {
   1050         u32 rb = loc_reg(bop);
   1051         /* Place the count in cl and the value in dst. Stage the count through
   1052          * r11 first so neither move clobbers the other when the value already
   1053          * sits in rcx or the count sits in dst. (The optimizer additionally
   1054          * keeps values live across the shift out of rcx — see
   1055          * x64_machine_op_clobbers.) */
   1056         if (rb != X64_RCX) {
   1057           emit_mov_rr(mc, 0, X64_TMP_INT2, rb);
   1058           if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   1059           emit_mov_rr(mc, 0, X64_RCX, X64_TMP_INT2);
   1060         } else if (rd != ra) {
   1061           emit_mov_rr(mc, w, rd, ra);
   1062         }
   1063       }
   1064       emit_shift_cl(mc, w, sub, rd);
   1065       return;
   1066     }
   1067 
   1068     /* IMM-form fast paths (b_imm guaranteed legal by imm_legal: imm32). */
   1069     if (b_imm && (op == BO_IADD || op == BO_ISUB || op == BO_AND ||
   1070                   op == BO_OR || op == BO_XOR || op == BO_IMUL)) {
   1071       if (op == BO_IMUL) {
   1072         if (imm_fits_i8(imm)) {
   1073           emit_imul_imm8(mc, w, rd, ra, (i8)imm);
   1074           return;
   1075         }
   1076         emit_imul_imm32(mc, w, rd, ra, (i32)imm);
   1077         return;
   1078       }
   1079       {
   1080         u32 sub;
   1081         switch (op) {
   1082           case BO_IADD:
   1083             sub = X64_ALU_SUB_ADD;
   1084             break;
   1085           case BO_OR:
   1086             sub = X64_ALU_SUB_OR;
   1087             break;
   1088           case BO_AND:
   1089             sub = X64_ALU_SUB_AND;
   1090             break;
   1091           case BO_ISUB:
   1092             sub = X64_ALU_SUB_SUB;
   1093             break;
   1094           default:
   1095             sub = X64_ALU_SUB_XOR;
   1096             break; /* BO_XOR */
   1097         }
   1098         if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   1099         if (imm_fits_i8(imm))
   1100           emit_alu_imm8(mc, w, sub, rd, (i8)imm);
   1101         else
   1102           emit_alu_imm32(mc, w, sub, rd, (i32)imm);
   1103         return;
   1104       }
   1105     }
   1106 
   1107     /* Generic 2-operand ALU: dst = ra op rb. Preserve rb if dst == rb. */
   1108     {
   1109       u32 rb = loc_reg(bop);
   1110       if (rd == rb && rd != ra) {
   1111         switch (op) {
   1112           case BO_IADD:
   1113             emit_alu_rr(mc, w, X64_OPC_ALU_ADD, rd, ra);
   1114             return;
   1115           case BO_AND:
   1116             emit_alu_rr(mc, w, X64_OPC_ALU_AND, rd, ra);
   1117             return;
   1118           case BO_OR:
   1119             emit_alu_rr(mc, w, X64_OPC_ALU_OR, rd, ra);
   1120             return;
   1121           case BO_XOR:
   1122             emit_alu_rr(mc, w, X64_OPC_ALU_XOR, rd, ra);
   1123             return;
   1124           case BO_IMUL:
   1125             emit_imul_rr(mc, w, rd, ra);
   1126             return;
   1127           default:
   1128             break; /* ISUB falls through: stage rb */
   1129         }
   1130         emit_mov_rr(mc, w, X64_R11, rb);
   1131         rb = X64_R11;
   1132       }
   1133       if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   1134       switch (op) {
   1135         case BO_IADD:
   1136           emit_alu_rr(mc, w, X64_OPC_ALU_ADD, rd, rb);
   1137           break;
   1138         case BO_ISUB:
   1139           emit_alu_rr(mc, w, X64_OPC_ALU_SUB, rd, rb);
   1140           break;
   1141         case BO_AND:
   1142           emit_alu_rr(mc, w, X64_OPC_ALU_AND, rd, rb);
   1143           break;
   1144         case BO_OR:
   1145           emit_alu_rr(mc, w, X64_OPC_ALU_OR, rd, rb);
   1146           break;
   1147         case BO_XOR:
   1148           emit_alu_rr(mc, w, X64_OPC_ALU_XOR, rd, rb);
   1149           break;
   1150         case BO_IMUL:
   1151           emit_imul_rr(mc, w, rd, rb);
   1152           break;
   1153         default:
   1154           x64_panic(a, "unsupported binop");
   1155       }
   1156     }
   1157   }
   1158 }
   1159 
   1160 /* FP sign-mask constant materialized in fp scratch for FNEG. */
   1161 static void x64_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
   1162   X64NativeTarget* a = x64_of(t);
   1163   MCEmitter* mc = t->mc;
   1164   u32 rd = loc_reg(dst), rs = loc_reg(src);
   1165   if (op == UO_FNEG) {
   1166     int dbl = native_type_size(t, dst.type) == 8u;
   1167     if (rd != rs)
   1168       emit_sse_rr(mc, sse_scalar_prefix(dbl ? 8u : 4u), 0x10, rd, rs);
   1169     /* sign mask into fp scratch via gpr, then XORPS/XORPD. */
   1170     x64_emit_load_imm(mc, dbl, X64_TMP_INT,
   1171                       dbl ? (i64)0x8000000000000000ull : (i64)0x80000000ull);
   1172     emit_sse_rr_w(mc, 0x66, 0x6E, dbl, X64_TMP_FP2, X64_TMP_INT);
   1173     emit_sse_rr(mc, dbl ? 0x66 : 0, 0x57, rd, X64_TMP_FP2);
   1174     return;
   1175   }
   1176   {
   1177     int w = x64_is_64(t, dst.type) ? 1 : 0;
   1178     switch (op) {
   1179       case UO_NEG:
   1180         if (rd != rs) emit_mov_rr(mc, w, rd, rs);
   1181         emit_f7_rm(mc, w, X64_F7_SUB_NEG, rd);
   1182         return;
   1183       case UO_BNOT:
   1184         if (rd != rs) emit_mov_rr(mc, w, rd, rs);
   1185         emit_f7_rm(mc, w, X64_F7_SUB_NOT, rd);
   1186         return;
   1187       case UO_NOT:
   1188         /* !x -> (x == 0) as 0/1. */
   1189         emit_test_self(mc, w, rs);
   1190         emit_setcc(mc, X64_CC_E, rd);
   1191         emit_movzx_r32_r8(mc, rd, rd);
   1192         return;
   1193       default:
   1194         x64_panic(a, "unsupported unop");
   1195     }
   1196   }
   1197 }
   1198 
   1199 /* ============================ compares ============================ */
   1200 
   1201 static u32 cmp_to_cc(CmpOp op) {
   1202   switch (op) {
   1203     case CMP_EQ:
   1204       return X64_CC_E;
   1205     case CMP_NE:
   1206       return X64_CC_NE;
   1207     case CMP_LT_U:
   1208       return X64_CC_B;
   1209     case CMP_LE_U:
   1210       return X64_CC_BE;
   1211     case CMP_GT_U:
   1212       return X64_CC_A;
   1213     case CMP_GE_U:
   1214       return X64_CC_AE;
   1215     case CMP_LT_S:
   1216       return X64_CC_L;
   1217     case CMP_LE_S:
   1218       return X64_CC_LE;
   1219     case CMP_GT_S:
   1220       return X64_CC_G;
   1221     case CMP_GE_S:
   1222       return X64_CC_GE;
   1223     default:
   1224       return X64_CC_E;
   1225   }
   1226 }
   1227 
   1228 static int cmp_is_fp(CmpOp op, NativeLoc aop) {
   1229   /* FP-ness is self-describing from the opcode; FP eq/ne are distinct opcodes
   1230    * (CMP_OEQ_F/CMP_UNE_F), so no operand-class sniffing is needed. */
   1231   (void)aop;
   1232   return op >= CMP_OEQ_F;
   1233 }
   1234 
   1235 /* Emit `cmp ra, rb` (or ucomis[sd] for FP), setting flags from ra - rb. */
   1236 static void x64_emit_cmp_flags(NativeTarget* t, NativeLoc aop, NativeLoc bop,
   1237                                int fp) {
   1238   X64NativeTarget* a = x64_of(t);
   1239   MCEmitter* mc = t->mc;
   1240   if (fp) {
   1241     u8 prefix = native_type_size(t, aop.type) == 8u ? 0x66u : 0u;
   1242     emit_sse_rr(mc, prefix, 0x2E, loc_reg(aop), loc_reg(bop)); /* ucomis */
   1243     return;
   1244   }
   1245   {
   1246     int w = x64_is_64(t, aop.type) ? 1 : 0;
   1247     u32 ra = loc_reg(aop);
   1248     if (bop.kind == NATIVE_LOC_IMM) {
   1249       i64 imm = bop.v.imm;
   1250       if (imm_fits_i8(imm))
   1251         emit_alu_imm8(mc, w, X64_ALU_SUB_CMP, ra, (i8)imm);
   1252       else
   1253         emit_alu_imm32(mc, w, X64_ALU_SUB_CMP, ra, (i32)imm);
   1254       return;
   1255     }
   1256     emit_alu_rr(mc, w, X64_OPC_ALU_CMP, ra, loc_reg(bop));
   1257     (void)a;
   1258   }
   1259 }
   1260 
   1261 /* FP ordered setcc: result = (primary cc) && !unordered (NP). */
   1262 static void x64_fp_setcc_ordered(NativeTarget* t, u32 primary, u32 dst) {
   1263   MCEmitter* mc = t->mc;
   1264   emit_setcc(mc, primary, dst);
   1265   emit_movzx_r32_r8(mc, dst, dst);
   1266   emit_setcc(mc, X64_CC_NP, X64_R11);
   1267   emit_movzx_r32_r8(mc, X64_R11, X64_R11);
   1268   emit_alu_rr(mc, 0, X64_OPC_ALU_AND, dst, X64_R11);
   1269 }
   1270 
   1271 /* FP unordered predicate: result = (primary cc) || unordered (P). */
   1272 static void x64_fp_setcc_unord(NativeTarget* t, u32 primary, u32 dst) {
   1273   MCEmitter* mc = t->mc;
   1274   emit_setcc(mc, primary, dst);
   1275   emit_movzx_r32_r8(mc, dst, dst);
   1276   emit_setcc(mc, X64_CC_P, X64_R11);
   1277   emit_movzx_r32_r8(mc, X64_R11, X64_R11);
   1278   emit_alu_rr(mc, 0, X64_OPC_ALU_OR, dst, X64_R11);
   1279 }
   1280 
   1281 static void x64_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop,
   1282                     NativeLoc bop) {
   1283   MCEmitter* mc = t->mc;
   1284   u32 d = loc_reg(dst);
   1285   int fp = cmp_is_fp(op, aop);
   1286   x64_emit_cmp_flags(t, aop, bop, fp);
   1287   if (fp) {
   1288     /* ucomis sets ZF/CF and, when unordered (NaN), also PF. Each predicate's
   1289      * flag formula is built explicitly (NOT blindly as !(opposite)):
   1290      *   ordered:    E/B/BE alias {==,<,<=} only when also NP (not-parity);
   1291      *               NE/A/AE already exclude unordered, so they stand alone.
   1292      *   unordered:  E/B/BE already include the unordered case (ZF/CF set on
   1293      *               NaN), so they stand alone; NE/A/AE need an OR with P. */
   1294     switch (op) {
   1295       /* ordered: require not-unordered (NP) on the equality-flag cases */
   1296       case CMP_OEQ_F:
   1297         x64_fp_setcc_ordered(t, X64_CC_E, d);
   1298         return;
   1299       case CMP_OLT_F:
   1300         x64_fp_setcc_ordered(t, X64_CC_B, d);
   1301         return;
   1302       case CMP_OLE_F:
   1303         x64_fp_setcc_ordered(t, X64_CC_BE, d);
   1304         return;
   1305       case CMP_ONE_F:
   1306         emit_setcc(mc, X64_CC_NE, d);
   1307         break;
   1308       case CMP_OGT_F:
   1309         emit_setcc(mc, X64_CC_A, d);
   1310         break;
   1311       case CMP_OGE_F:
   1312         emit_setcc(mc, X64_CC_AE, d);
   1313         break;
   1314       /* unordered: OR-with-P on the cases that exclude unordered */
   1315       case CMP_UEQ_F:
   1316         emit_setcc(mc, X64_CC_E, d);
   1317         break;
   1318       case CMP_ULT_F:
   1319         emit_setcc(mc, X64_CC_B, d);
   1320         break;
   1321       case CMP_ULE_F:
   1322         emit_setcc(mc, X64_CC_BE, d);
   1323         break;
   1324       case CMP_UNE_F:
   1325         x64_fp_setcc_unord(t, X64_CC_NE, d);
   1326         return;
   1327       case CMP_UGT_F:
   1328         x64_fp_setcc_unord(t, X64_CC_A, d);
   1329         return;
   1330       case CMP_UGE_F:
   1331         x64_fp_setcc_unord(t, X64_CC_AE, d);
   1332         return;
   1333       default:
   1334         emit_setcc(mc, cmp_to_cc(op), d);
   1335         break;
   1336     }
   1337     emit_movzx_r32_r8(mc, d, d);
   1338     return;
   1339   }
   1340   emit_setcc(mc, cmp_to_cc(op), d);
   1341   emit_movzx_r32_r8(mc, d, d);
   1342 }
   1343 
   1344 /* ============================ converts ============================ */
   1345 
   1346 static void x64_convert(NativeTarget* t, ConvKind k, NativeLoc dst,
   1347                         NativeLoc src) {
   1348   X64NativeTarget* a = x64_of(t);
   1349   MCEmitter* mc = t->mc;
   1350   u32 rd = loc_reg(dst), rs = loc_reg(src);
   1351   switch (k) {
   1352     case CV_SEXT: {
   1353       u32 src_sz = native_type_size(t, src.type);
   1354       int w = x64_is_64(t, dst.type) ? 1 : 0;
   1355       emit_extend_rr(mc, w, 1, src_sz, rd, rs);
   1356       return;
   1357     }
   1358     case CV_ZEXT: {
   1359       u32 src_sz = native_type_size(t, src.type);
   1360       int w = x64_is_64(t, dst.type) ? 1 : 0;
   1361       emit_extend_rr(mc, w, 0, src_sz, rd, rs);
   1362       return;
   1363     }
   1364     case CV_TRUNC:
   1365       emit_mov_rr(mc, 0, rd, rs); /* low 32 bits; clears high */
   1366       return;
   1367     case CV_ITOF_S:
   1368     case CV_ITOF_U: {
   1369       int w_src = x64_is_64(t, src.type) ? 1 : 0;
   1370       u8 prefix = sse_scalar_prefix(native_type_size(t, dst.type));
   1371       if (k == CV_ITOF_U && w_src == 1) {
   1372         MCLabel L_high = mc->label_new(mc);
   1373         MCLabel L_done = mc->label_new(mc);
   1374         emit_test_self(mc, 1, rs);
   1375         emit_jcc_rel32(mc, X64_CC_S, L_high);
   1376         emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, rs);
   1377         emit_jmp_rel32(mc, L_done);
   1378         mc->label_place(mc, L_high);
   1379         emit_mov_rr(mc, 1, X64_R11, rs);
   1380         emit_mov_rr(mc, 1, X64_RAX, rs);
   1381         emit_alu_imm8(mc, 1, X64_ALU_SUB_AND, X64_RAX, 1);
   1382         emit_shift_imm(mc, 1, X64_SHIFT_SUB_SHR, X64_R11, 1);
   1383         emit_alu_rr(mc, 1, X64_OPC_ALU_OR, X64_R11, X64_RAX);
   1384         emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, X64_R11);
   1385         emit_sse_rr(mc, prefix, 0x58, rd, rd);
   1386         mc->label_place(mc, L_done);
   1387         return;
   1388       }
   1389       if (k == CV_ITOF_U) {
   1390         emit_extend_rr(mc, 0, 0, 4, X64_R11, rs); /* zext u32 -> 64 */
   1391         rs = X64_R11;
   1392         w_src = 1;
   1393       }
   1394       emit_sse_rr_w(mc, prefix, 0x2A, w_src, rd, rs);
   1395       return;
   1396     }
   1397     case CV_FTOI_S:
   1398     case CV_FTOI_U: {
   1399       int w_dst = x64_is_64(t, dst.type) ? 1 : 0;
   1400       u8 prefix = sse_scalar_prefix(native_type_size(t, src.type));
   1401       /* Unsigned 64-bit FTOI needs the 2^63 bias dance; otherwise cvtt
   1402        * (with the destination widened to 64 for u32) is exact. */
   1403       if (k == CV_FTOI_U && w_dst == 1) {
   1404         int dbl = native_type_size(t, src.type) == 8u;
   1405         MCLabel L_small = mc->label_new(mc);
   1406         MCLabel L_done = mc->label_new(mc);
   1407         /* limit = 2^63 in fp scratch. */
   1408         x64_emit_load_imm(
   1409             mc, 1, X64_R11,
   1410             dbl ? (i64)0x43E0000000000000ull : (i64)0x5F000000ull);
   1411         emit_sse_rr_w(mc, 0x66, 0x6E, dbl, X64_TMP_FP2, X64_R11);
   1412         emit_sse_rr(mc, dbl ? 0x66 : 0, 0x2E, rs, X64_TMP_FP2); /* ucomis */
   1413         emit_jcc_rel32(mc, X64_CC_B, L_small);
   1414         emit_sse_rr(mc, prefix, 0x10, X64_TMP_FP, rs);
   1415         emit_sse_rr(mc, prefix, 0x5C, X64_TMP_FP, X64_TMP_FP2); /* sub bias */
   1416         emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, X64_TMP_FP);
   1417         x64_emit_load_imm(mc, 1, X64_R11, (i64)0x8000000000000000ull);
   1418         emit_alu_rr(mc, 1, X64_OPC_ALU_XOR, rd, X64_R11);
   1419         emit_jmp_rel32(mc, L_done);
   1420         mc->label_place(mc, L_small);
   1421         emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, rs);
   1422         mc->label_place(mc, L_done);
   1423         return;
   1424       }
   1425       if (k == CV_FTOI_U) w_dst = 1; /* widen u32 result */
   1426       emit_sse_rr_w(mc, prefix, 0x2C, w_dst, rd, rs);
   1427       return;
   1428     }
   1429     case CV_FEXT:
   1430       emit_sse_rr(mc, 0xF3, 0x5A, rd, rs); /* cvtss2sd */
   1431       return;
   1432     case CV_FTRUNC:
   1433       emit_sse_rr(mc, 0xF2, 0x5A, rd, rs); /* cvtsd2ss */
   1434       return;
   1435     case CV_BITCAST:
   1436       if (!native_loc_is_fp(src) && native_loc_is_fp(dst)) {
   1437         emit_sse_rr_w(mc, 0x66, 0x6E, x64_is_64(t, dst.type), rd, rs);
   1438       } else if (native_loc_is_fp(src) && !native_loc_is_fp(dst)) {
   1439         emit_sse_rr_w(mc, 0x66, 0x7E, x64_is_64(t, src.type), rs, rd);
   1440       } else {
   1441         x64_move(t, dst, src);
   1442       }
   1443       return;
   1444     default:
   1445       x64_panic(a, "unsupported convert");
   1446   }
   1447 }
   1448 
   1449 /* ============================ spill / reload ============================ */
   1450 
   1451 static void x64_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
   1452                       MemAccess mem) {
   1453   NativeAddr addr;
   1454   memset(&addr, 0, sizeof addr);
   1455   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1456   addr.base.frame = slot;
   1457   addr.base_type = src.type;
   1458   x64_emit_mem(x64_of(t), 0, src, addr, mem);
   1459 }
   1460 static void x64_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
   1461                        MemAccess mem) {
   1462   NativeAddr addr;
   1463   memset(&addr, 0, sizeof addr);
   1464   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1465   addr.base.frame = slot;
   1466   addr.base_type = dst.type;
   1467   x64_emit_mem(x64_of(t), 1, dst, addr, mem);
   1468 }
   1469 
   1470 /* ============================ control flow ============================ */
   1471 
   1472 static void emit_jmp_rel32(MCEmitter* mc, MCLabel l) {
   1473   u8 op = X64_OPC_JMP_REL32;
   1474   mc->emit_bytes(mc, &op, 1);
   1475   emit_u32le(mc, 0);
   1476   mc->emit_label_ref(mc, l, R_PC32, 4, -4);
   1477 }
   1478 static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l) {
   1479   u8 op[2] = {X64_OPC_TWOBYTE, (u8)(X64_OPC_JCC_BASE | (cc & 0xfu))};
   1480   mc->emit_bytes(mc, op, 2);
   1481   emit_u32le(mc, 0);
   1482   mc->emit_label_ref(mc, l, R_PC32, 4, -4);
   1483 }
   1484 
   1485 static MCLabel x64_label_new(NativeTarget* t) {
   1486   return t->mc->label_new(t->mc);
   1487 }
   1488 static void x64_label_place(NativeTarget* t, MCLabel l) {
   1489   t->mc->label_place(t->mc, l);
   1490 }
   1491 static void x64_jump(NativeTarget* t, MCLabel l) { emit_jmp_rel32(t->mc, l); }
   1492 
   1493 static void x64_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop,
   1494                            NativeLoc bop, MCLabel l) {
   1495   MCEmitter* mc = t->mc;
   1496   int fp = cmp_is_fp(op, aop);
   1497   if (fp) {
   1498     /* Materialize the 0/1 result, then branch on nonzero. */
   1499     NativeLoc tmp =
   1500         native_loc_reg(builtin_id(KIT_CG_BUILTIN_I32), NATIVE_REG_INT, X64_RAX);
   1501     x64_cmp(t, op, tmp, aop, bop);
   1502     emit_test_self(mc, 0, X64_RAX);
   1503     emit_jcc_rel32(mc, X64_CC_NE, l);
   1504     return;
   1505   }
   1506   x64_emit_cmp_flags(t, aop, bop, 0);
   1507   emit_jcc_rel32(mc, cmp_to_cc(op), l);
   1508 }
   1509 
   1510 static void x64_indirect_branch(NativeTarget* t, NativeLoc addr,
   1511                                 const MCLabel* valid_targets, u32 ntargets) {
   1512   MCEmitter* mc = t->mc;
   1513   u32 r = loc_reg(addr);
   1514   (void)valid_targets;
   1515   (void)ntargets;
   1516   if (r & 8u) {
   1517     u8 rex = X64_REX_BASE | X64_REX_B;
   1518     mc->emit_bytes(mc, &rex, 1);
   1519   }
   1520   {
   1521     u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)};
   1522     mc->emit_bytes(mc, buf, 2);
   1523   }
   1524 }
   1525 
   1526 static void x64_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
   1527   /* `&&label` address-take: `leaq sym(%rip), rd` with an R_PC32 relocation
   1528    * against the label's per-block local symbol — same form as a global
   1529    * address-take, so a re-encoding assembler recomputes the displacement.
   1530    * (A baked disp32 with no reloc would break once clang re-lays-out the
   1531    * function.) */
   1532   MCEmitter* mc = t->mc;
   1533   u32 rd = loc_reg(dst);
   1534   ObjSymId sym = mc_label_symbol(mc, l);
   1535   u32 disp_pos;
   1536   emit_rex(mc, 1, rd, 0, 0);
   1537   {
   1538     u8 op = X64_OPC_LEA;
   1539     mc->emit_bytes(mc, &op, 1);
   1540   }
   1541   {
   1542     u8 mr = modrm(0u, rd & 7u, 5u); /* [rip + disp32] */
   1543     mc->emit_bytes(mc, &mr, 1);
   1544   }
   1545   disp_pos = mc->pos(mc);
   1546   emit_u32le(mc, 0);
   1547   mc->emit_reloc_at(mc, mc->section_id, disp_pos, R_PC32, sym, -4, 1, 0);
   1548 }
   1549 
   1550 /* ============================ frame / lifecycle ============================
   1551  */
   1552 
   1553 static NativeFrameSlot x64_frame_slot(NativeTarget* t,
   1554                                       const NativeFrameSlotDesc* d) {
   1555   return native_frame_slot_alloc(&x64_of(t)->frame, d);
   1556 }
   1557 
   1558 static int x64_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
   1559                                     CGDebugLoc* out) {
   1560   X64NativeTarget* a = x64_of(t);
   1561   X64NativeSlot* s;
   1562   if (!out) return 0;
   1563   memset(out, 0, sizeof *out);
   1564   if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
   1565   s = x64_slot_get(a, slot);
   1566   out->kind = CG_DEBUG_LOC_FRAME;
   1567   /* x64 slots live at RBP - off (exactly how the memory-operand path addresses
   1568    * them). The hosted dbg snapshot seeds the frame base with RBP, so report
   1569    * the RBP-relative offset — mirroring aa64's FP-relative convention. */
   1570   out->v.frame_ofs = -(i32)s->off;
   1571   return 1;
   1572 }
   1573 
   1574 /* xmm save area base (rbp-relative). XMM saves are 16-aligned. */
   1575 static u32 x64_xmm_base(const X64NativeTarget* a, u32 cs_fp) {
   1576   if (cs_fp == 0) return a->frame.cum_off;
   1577   return align_up_u32(a->frame.cum_off, 16u);
   1578 }
   1579 
   1580 static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int,
   1581                                   u32 cs_fp) {
   1582   u32 xmm_base = x64_xmm_base(a, cs_fp);
   1583   u32 raw = a->frame.max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base;
   1584   u32 fs = align_up_u32(raw, 16u);
   1585   return fs ? fs : 16u;
   1586 }
   1587 
   1588 /* Collect the callee-saves the body actually used. */
   1589 static u32 x64_collect_int_saves(X64NativeTarget* a, Reg* regs) {
   1590   u32 n = 0, i;
   1591   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1592     if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
   1593       regs[n++] = a->frame.callee_saves[i].reg;
   1594   return n;
   1595 }
   1596 static u32 x64_collect_fp_saves(X64NativeTarget* a, Reg* regs) {
   1597   u32 n = 0, i;
   1598   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1599     if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
   1600       regs[n++] = a->frame.callee_saves[i].reg;
   1601   return n;
   1602 }
   1603 
   1604 static ObjSymId x64_chkstk_sym(NativeTarget* t) {
   1605   Sym name = pool_intern_slice(t->c->global, SLICE_LIT("__chkstk"));
   1606   ObjSymId s = obj_symbol_find(t->obj, name);
   1607   if (s != 0) return s;
   1608   return obj_symbol(t->obj, name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
   1609 }
   1610 
/* Build the prologue byte sequence into buf. Returns bytes written and, when
 * the chkstk path fires, the disp32 offset of the call site. When `skip_sub` is
 * set (the known-frame slim / red-zone tiers), the `sub rsp` reservation is
 * omitted entirely: the frame record is established but no stack is reserved,
 * either because the frame is empty (slim) or because the locals/saves live in
 * the SysV red zone (redzone_leaf). Callers must only set it when the frame
 * needs no reserved region (no alloca, no outgoing args, and — for the red
 * zone — a leaf frame <= 128 bytes).
 *
 * Parameters:
 *   buf/cap              output buffer and its capacity in bytes; every
 *                        section checks a worst-case size against `cap` and
 *                        panics with "prologue placeholder overflow".
 *   frame_size           bytes to reserve with `sub rsp` (or via __chkstk).
 *   cs_int/n_int         integer callee-saves to spill, in slot order.
 *   cs_fp/n_fp           XMM callee-saves to spill, in slot order.
 *   chkstk_disp_pos_out  must be non-NULL; set to (u32)-1 unless the chkstk
 *                        path is taken, in which case it receives the offset
 *                        within `buf` of the call's zeroed disp32 so the
 *                        caller can attach a relocation. */
static u32 x64_build_prologue(X64NativeTarget* a, u8* buf, u32 cap,
                              u32 frame_size, const Reg* cs_int, u32 n_int,
                              const Reg* cs_fp, u32 n_fp, int skip_sub,
                              u32* chkstk_disp_pos_out) {
  u32 wi = 0; /* write index into buf */
  u32 xmm_base = x64_xmm_base(a, n_fp);
  u32 i;
  /* Page granularity for Windows large-frame probing (0 = no probe needed).
   * Win64 reserves >1-page frames through __chkstk; the same ABI capability
   * the aarch64 backend reads for its inline probe. */
  u32 probe = abi_stack_probe_interval(a->base.c->abi);
  *chkstk_disp_pos_out = (u32)-1;
  if (cap < X64_PROLOGUE_BASE_BYTES)
    x64_panic(a, "prologue placeholder overflow");
  /* push rbp; mov rbp, rsp. */
  buf[wi++] = (u8)(X64_OPC_PUSH_R | (X64_RBP & 7u));
  buf[wi++] = X64_REX_BASE | X64_REX_W;
  buf[wi++] = X64_OPC_MOV_RM_R;
  buf[wi++] = modrm(3u, X64_RSP, X64_RBP);
  /* sub rsp, frame_size (or chkstk on Win64 large frame); skipped by the slim /
   * red-zone tiers, which reserve no stack. */
  if (skip_sub) {
    /* no reservation */
  } else if (probe && frame_size > probe) {
    /* 13 bytes: mov eax,imm32 (5) + call rel32 (5) + sub rsp,rax (3). */
    if (wi + 13u > cap) x64_panic(a, "prologue placeholder overflow");
    buf[wi++] = (u8)(X64_OPC_MOV_RI | (X64_RAX & 7u)); /* mov eax, imm32 */
    wr_u32_le(buf + wi, frame_size);
    wi += 4;
    buf[wi++] = X64_OPC_CALL_REL32;
    *chkstk_disp_pos_out = wi; /* caller relocates this disp32 to __chkstk */
    wr_u32_le(buf + wi, 0);
    wi += 4;
    buf[wi++] = X64_REX_BASE | X64_REX_W; /* sub rsp, rax */
    buf[wi++] = X64_OPC_ALU_SUB;
    buf[wi++] = modrm(3u, X64_RAX, X64_RSP);
  } else {
    /* 7 bytes: REX.W + opcode + modrm + imm32. */
    if (wi + 7u > cap) x64_panic(a, "prologue placeholder overflow");
    buf[wi++] = X64_REX_BASE | X64_REX_W;
    buf[wi++] = X64_OPC_ALU_IMM32;
    buf[wi++] = modrm(3u, X64_ALU_SUB_SUB, X64_RSP);
    wr_u32_le(buf + wi, frame_size);
    wi += 4;
  }
  /* sret: spill the first int arg reg (destination pointer) into its slot.
   * Use the minimal disp encoding (x64_pack_mem) so it matches the body's
   * frame stores and the matching epilogue restore — the `cc -S | as`
   * round-trip can then reproduce these bytes exactly. The -O0 placeholder is
   * NOP-padded to a fixed width, so a shorter prologue is harmless. */
  if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
    X64NativeSlot* s = x64_slot_get(a, a->sret_ptr_slot);
    u32 sret_reg = a->abi->int_args[0];
    i32 off = -(i32)s->off; /* slot lives at rbp - off */
    if (wi + 8u > cap) x64_panic(a, "prologue placeholder overflow");
    buf[wi++] =
        (u8)(X64_REX_BASE | X64_REX_W | ((sret_reg & 8u) ? X64_REX_R : 0u));
    buf[wi++] = X64_OPC_MOV_RM_R;
    wi += x64_pack_mem(buf + wi, sret_reg & 7u, X64_RBP, off);
  }
  /* Spill callee-saved GPRs. They sit below the XMM save area: the i-th GPR
   * is stored at rbp - xmm_base - n_fp*16 - (i+1)*8. */
  for (i = 0; i < n_int; ++i) {
    u32 reg = cs_int[i];
    i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1u) * 8;
    if (wi + 8u > cap) x64_panic(a, "prologue placeholder overflow");
    buf[wi++] = (u8)(X64_REX_BASE | X64_REX_W | ((reg & 8u) ? X64_REX_R : 0u));
    buf[wi++] = X64_OPC_MOV_RM_R;
    wi += x64_pack_mem(buf + wi, reg & 7u, X64_RBP, off);
  }
  /* Spill callee-saved XMMs (Win64). movaps [rbp+disp], xmm. The i-th XMM is
   * stored at rbp - xmm_base - (i+1)*16; a REX prefix is emitted only for
   * xmm8..xmm15. */
  for (i = 0; i < n_fp; ++i) {
    u32 xmm = cs_fp[i];
    i32 off = -(i32)xmm_base - (i32)(i + 1u) * 16;
    u8 rex = (u8)((xmm & 8u) ? (X64_REX_BASE | X64_REX_R) : 0u);
    u32 need = rex ? 9u : 8u;
    if (wi + need > cap) x64_panic(a, "prologue placeholder overflow");
    if (rex) buf[wi++] = rex;
    buf[wi++] = X64_OPC_TWOBYTE;
    buf[wi++] = 0x29; /* MOVAPS r/m128, xmm */
    wi += x64_pack_mem(buf + wi, xmm & 7u, X64_RBP, off);
  }
  return wi;
}
   1700 
   1701 static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   1702   X64NativeTarget* a = x64_of(t);
   1703   MCEmitter* mc = t->mc;
   1704   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   1705   a->func = fd;
   1706   a->loc = fd->loc;
   1707   a->abi = x64_abi_for_os(t->c->target.os);
   1708   /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
   1709    * callee-save set, and known_frame/has_alloca/frame_final. */
   1710   native_frame_reset(&a->frame);
   1711   a->incoming_stack_size = 0;
   1712   a->next_param_int = 0;
   1713   a->next_param_fp = 0;
   1714   a->next_param_stack = 0;
   1715   a->has_sret = (abi && abi->has_sret) ? 1u : 0u;
   1716   a->is_variadic = (abi && abi->variadic) ? 1u : 0u;
   1717   a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
   1718   a->reg_save_slot = NATIVE_FRAME_SLOT_NONE;
   1719   a->npatches = 0;
   1720   a->nalloca = 0;
   1721   a->nbind_moves = 0;
   1722   a->slim_frame = 0;
   1723   a->redzone_leaf = 0;
   1724   a->prologue_nbytes =
   1725       a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES;
   1726 
   1727   mc->set_section(mc, fd->text_section_id);
   1728   mc->emit_align(mc, 16, X64_NOP1);
   1729   a->func_start = mc->pos(mc);
   1730   mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
   1731   if (mc->cfi_startproc) mc->cfi_startproc(mc);
   1732   a->epilogue_label = mc->label_new(mc);
   1733 }
   1734 
   1735 /* Reserve the sret-pointer slot and (SysV) the 176-byte variadic reg-save
   1736  * area. Advances next_param_int past the sret pointer (a0). */
   1737 static void x64_reserve_entry_saves(X64NativeTarget* a) {
   1738   NativeTarget* t = &a->base;
   1739   if (a->has_sret) {
   1740     NativeFrameSlotDesc sd;
   1741     memset(&sd, 0, sizeof sd);
   1742     sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1743     sd.size = 8;
   1744     sd.align = 8;
   1745     sd.kind = NATIVE_FRAME_SLOT_SAVE;
   1746     a->sret_ptr_slot = t->frame_slot(t, &sd);
   1747     a->next_param_int = 1;
   1748   }
   1749   if (a->is_variadic && a->abi->emit_sysv_vararg_save) {
   1750     NativeFrameSlotDesc rd;
   1751     memset(&rd, 0, sizeof rd);
   1752     rd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1753     rd.size = 176;
   1754     rd.align = 8;
   1755     rd.kind = NATIVE_FRAME_SLOT_SAVE;
   1756     a->reg_save_slot = t->frame_slot(t, &rd);
   1757   }
   1758 }
   1759 
   1760 static void x64_emit_variadic_reg_saves(X64NativeTarget* a) {
   1761   NativeTarget* t = &a->base;
   1762   MCEmitter* mc = t->mc;
   1763   if (!a->is_variadic) return;
   1764   if (a->abi->emit_sysv_vararg_save) {
   1765     X64NativeSlot* rs = x64_slot_get(a, a->reg_save_slot);
   1766     static const u32 gprs[6] = {X64_RDI, X64_RSI, X64_RDX,
   1767                                 X64_RCX, X64_R8,  X64_R9};
   1768     u32 i;
   1769     for (i = 0; i < 6u; ++i)
   1770       emit_mov_store(mc, 8, gprs[i], X64_RBP, -(i32)rs->off + (i32)(i * 8u));
   1771     for (i = 0; i < 8u; ++i)
   1772       emit_sse_store(mc, 0xF2, 0x11, (u32)(X64_XMM0 + i), X64_RBP,
   1773                      -(i32)rs->off + (i32)(48u + i * 16u));
   1774     return;
   1775   }
   1776   /* Win64 variadic: spill the 4 GPR arg slots to the home space. */
   1777   emit_mov_store(mc, 8, X64_RCX, X64_RBP, 16);
   1778   emit_mov_store(mc, 8, X64_RDX, X64_RBP, 24);
   1779   emit_mov_store(mc, 8, X64_R8, X64_RBP, 32);
   1780   emit_mov_store(mc, 8, X64_R9, X64_RBP, 40);
   1781 }
   1782 
   1783 static void x64_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   1784   X64NativeTarget* a = x64_of(t);
   1785   MCEmitter* mc = t->mc;
   1786   u32 i;
   1787   x64_func_begin_common(t, fd);
   1788   a->prologue_pos = mc->pos(mc);
   1789   for (i = 0; i < a->prologue_nbytes; ++i) emit1(mc, X64_NOP1);
   1790   x64_reserve_entry_saves(a);
   1791   x64_emit_variadic_reg_saves(a);
   1792 }
   1793 
   1794 /* x64 homes callee-saves below the locals (offsets computed in
   1795  * x64_compute_frame_size / x64_build_prologue), not in frame slots, so
   1796  * alloc_slots=0: native_frame just records the {reg,cls} set from the masks. */
   1797 static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
   1798                                      u32 nclasses) {
   1799   native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0);
   1800 }
   1801 
   1802 static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r);
   1803 static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
   1804 
   1805 static u32 x64_live_callee_saved_mask(NativeTarget* t,
   1806                                       NativeAllocClass cls) {
   1807   X64NativeTarget* a = x64_of(t);
   1808   const X64ABIRegs* abi = a->abi ? a->abi : x64_abi_for_os(t->c->target.os);
   1809   u32 mask = 0;
   1810   for (Reg r = 0; r < 16u; ++r) {
   1811     if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(abi, r))
   1812       mask |= 1u << r;
   1813     if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(abi, r))
   1814       mask |= 1u << r;
   1815   }
   1816   return mask;
   1817 }
   1818 
   1819 static u32 x64_live_caller_saved_mask(NativeTarget* t,
   1820                                       NativeAllocClass cls) {
   1821   const NativeAllocClassInfo* ci = native_target_class_info(t, cls);
   1822   if (!ci) return 0;
   1823   return ci->caller_saved_mask & ~x64_live_callee_saved_mask(t, cls);
   1824 }
   1825 
   1826 static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   1827                                   u32 nclob, u32* int_mask, u32* fp_mask);
   1828 
   1829 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks
   1830  * (cg/native_asm.h); it reads the target's live ABI masks. */
   1831 
   1832 /* Build the callee-saved set the prologue must preserve: the allocator-assigned
   1833  * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
   1834  * block clobbers. The latter are opaque to the optimizer's operand scan, so it
   1835  * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
   1836  * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
   1837  * and keep only the callee-saved ones. x64_reg_is_callee_* follow the live ABI:
   1838  * they exclude rbp (handled by the prologue head) and keep the
   1839  * reserved-but-callee- saved scratch rbx/r12 (which the caller still expects
   1840  * preserved). This is the same register selection the per-block spill used,
   1841  * hoisted into the prologue. */
   1842 static u32 x64_known_callee_saves(NativeTarget* t, const X64ABIRegs* abi,
   1843                                   const NativeKnownFrameDesc* frame, u32* out,
   1844                                   u32 cap) {
   1845   u32 ncls = frame->ncallee_classes;
   1846   u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
   1847   if (ncls > cap) ncls = cap;
   1848   for (u32 c = 0; c < ncls; ++c)
   1849     out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
   1850   if (frame->asm_clobbers && frame->nasm_clobbers) {
   1851     X64NativeTarget* a = x64_of(t);
   1852     SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   1853     x64_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
   1854                           &clob_int, &clob_fp);
   1855   }
   1856   native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
   1857                                &abi_fp);
   1858   clob_int |= abi_int;
   1859   clob_fp |= abi_fp;
   1860   for (Reg r = 0; r < 16u; ++r) {
   1861     if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
   1862         x64_reg_is_callee_int(abi, r))
   1863       out[NATIVE_REG_INT] |= 1u << r;
   1864     if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) &&
   1865         x64_reg_is_callee_fp(abi, r))
   1866       out[NATIVE_REG_FP] |= 1u << r;
   1867   }
   1868   return ncls;
   1869 }
   1870 
   1871 /* Optimizer entry point: the full frame is supplied up front, so the prologue
   1872  * is emitted final the moment it is built — no NOP region, no func_end patch
   1873  * (x64_func_end skips patching when known_frame). x64_build_prologue emits the
   1874  * push rbp / sub rsp / sret spill / callee-save spills; the variadic
   1875  * register-save stores are emitted separately, as on the single-pass path. */
   1876 static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   1877                                        const NativeKnownFrameDesc* frame,
   1878                                        NativeFrameSlot* out_slots) {
   1879   X64NativeTarget* a = x64_of(t);
   1880   MCEmitter* mc = t->mc;
   1881   Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
   1882   u32 n_int, n_fp, frame_size, nbytes, chkstk_disp_pos, i;
   1883   u8 buf[X64_PROLOGUE_BYTES_WIN64];
   1884   x64_func_begin_common(t, fd);
   1885   a->frame.known_frame = 1;
   1886   if (frame) {
   1887     u32 cs[NATIVE_CALL_PLAN_CLASSES];
   1888     u32 ncs =
   1889         x64_known_callee_saves(t, a->abi, frame, cs, NATIVE_CALL_PLAN_CLASSES);
   1890     a->frame.has_alloca = frame->has_alloca;
   1891     if (ncs) x64_reserve_callee_saves(t, cs, ncs);
   1892     for (i = 0; i < frame->nslots; ++i) {
   1893       NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]);
   1894       if (out_slots) out_slots[i] = slot;
   1895     }
   1896     x64_reserve_entry_saves(a);
   1897     native_frame_note_outgoing(&a->frame, frame->max_outgoing);
   1898   }
   1899   /* Frame is final: size and offsets are settled, so emit the exact prologue.
   1900    */
   1901   n_int = x64_collect_int_saves(a, cs_int);
   1902   n_fp = x64_collect_fp_saves(a, cs_fp);
   1903   frame_size = x64_compute_frame_size(a, n_int, n_fp);
   1904   a->frame_size_final = frame_size;
   1905   /* Cost-model tier selection (mirrors aa64's aa_func_begin_known_frame): with
   1906    * the frame final before the body, choose the cheapest valid prologue shape.
   1907    * Both tiers keep the rbp record and only drop the `sub rsp`, so the
   1908    * epilogue/CFI/offset helpers are untouched. x64 needs no
   1909    * `fp_at_bottom`-style fold: `push rbp` already folds the sp-move into the
   1910    * store. */
   1911   a->slim_frame = a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
   1912                   a->frame.cum_off == 0 && a->frame.max_outgoing == 0;
   1913   /* redzone keeps locals below rsp in the red zone; exclude inline asm, which
   1914    * may issue a `call` (clobbering the red zone) the optimizer can't see. slim
   1915    * needs no such guard: it has no locals there and the return address lives on
   1916    * the stack at [rbp+8], not in a clobberable register. */
   1917   a->redzone_leaf = !a->slim_frame && a->abi->shadow_space == 0 && frame &&
   1918                     frame->is_leaf && !frame->has_asm && !a->frame.has_alloca &&
   1919                     a->frame.max_outgoing == 0 && frame_size <= 128u;
   1920   a->prologue_pos = mc->pos(mc);
   1921   nbytes = x64_build_prologue(a, buf, sizeof buf, frame_size, cs_int, n_int,
   1922                               cs_fp, n_fp, a->slim_frame || a->redzone_leaf,
   1923                               &chkstk_disp_pos);
   1924   mc->emit_bytes(mc, buf, nbytes);
   1925   if (chkstk_disp_pos != (u32)-1) {
   1926     ObjSymId chk = x64_chkstk_sym(t);
   1927     mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos,
   1928                       R_X64_PLT32, chk, -4, 1, 0);
   1929   }
   1930   a->prologue_nbytes = nbytes; /* exact length: used for the CFI post offset */
   1931   x64_emit_variadic_reg_saves(a);
   1932   native_frame_set_final(&a->frame);
   1933 }
   1934 
   1935 static void x64_func_end(NativeTarget* t) {
   1936   X64NativeTarget* a = x64_of(t);
   1937   MCEmitter* mc = t->mc;
   1938   ObjBuilder* obj = t->obj;
   1939   ObjSecId sec = a->func->text_section_id;
   1940   Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
   1941   u32 n_int = x64_collect_int_saves(a, cs_int);
   1942   u32 n_fp = x64_collect_fp_saves(a, cs_fp);
   1943   u32 frame_size = x64_compute_frame_size(a, n_int, n_fp);
   1944   u32 xmm_base = x64_xmm_base(a, n_fp);
   1945   u32 end;
   1946   i32 i;
   1947   a->frame_size_final = frame_size;
   1948 
   1949   /* Epilogue. */
   1950   mc->label_place(mc, a->epilogue_label);
   1951   for (i = (i32)n_fp - 1; i >= 0; --i) {
   1952     i32 off = -(i32)xmm_base - (i32)(i + 1) * 16;
   1953     emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP, off); /* movaps */
   1954   }
   1955   for (i = (i32)n_int - 1; i >= 0; --i) {
   1956     i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8;
   1957     emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP, off);
   1958   }
   1959   emit_leave(mc);
   1960   emit_ret(mc);
   1961 
   1962   /* Patch the single-pass prologue placeholder. */
   1963   if (!a->frame.known_frame) {
   1964     u8 buf[X64_PROLOGUE_BYTES_WIN64];
   1965     u32 chkstk_disp_pos;
   1966     u32 nbytes;
   1967     u32 k;
   1968     for (k = 0; k < a->prologue_nbytes; ++k) buf[k] = X64_NOP1;
   1969     /* Single-pass path never selects a slim/red-zone tier (it cannot know the
   1970      * frame up front), so it always emits the full reservation. */
   1971     nbytes = x64_build_prologue(a, buf, a->prologue_nbytes, frame_size, cs_int,
   1972                                 n_int, cs_fp, n_fp, 0, &chkstk_disp_pos);
   1973     (void)nbytes;
   1974     obj_patch(obj, sec, a->prologue_pos, buf, a->prologue_nbytes);
   1975     if (chkstk_disp_pos != (u32)-1) {
   1976       ObjSymId chk = x64_chkstk_sym(t);
   1977       mc->emit_reloc_at(mc, sec, a->prologue_pos + chkstk_disp_pos, R_X64_PLT32,
   1978                         chk, -4, 1, 0);
   1979     }
   1980   }
   1981 
   1982   /* Patch alloca disp32s: lea dst, [rsp + max_outgoing]. */
   1983   {
   1984     u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
   1985     u32 k;
   1986     for (k = 0; k < a->npatches; ++k) {
   1987       u8 dbuf[4];
   1988       wr_u32_le(dbuf, mo);
   1989       obj_patch(obj, sec, a->patches[k].pos, dbuf, 4);
   1990     }
   1991   }
   1992 
   1993   /* CFI: after the prologue, CFA = rbp + 16; rbp at cfa-16, ra at cfa-8. */
   1994   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
   1995     /* Body starts past the prologue. prologue_nbytes is the reserved NOP-region
   1996      * size on the single-pass path and the exact prologue length on the
   1997      * known-frame path (set in x64_func_begin_known_frame). */
   1998     u32 post = a->prologue_pos + a->prologue_nbytes;
   1999     u32 k;
   2000     mc->cfi_set_next_pc_offset(mc, post - a->func_start);
   2001     /* CFI register operands are DWARF numbers, which differ from the x86-64
   2002      * hardware encoding for rbp/rsp/rsi/rdi/rcx/rdx (e.g. rbp is HW 5 but
   2003      * DWARF 6). Map every hardware GPR through x64_dwarf_from_hw_gpr; rip's
   2004      * DWARF number (16) is already correct. */
   2005     mc->cfi_def_cfa(mc, x64_dwarf_from_hw_gpr(X64_RBP), 16);
   2006     mc->cfi_offset(mc, x64_dwarf_from_hw_gpr(X64_RBP), -16);
   2007     mc->cfi_offset(mc, 16u /* rip */, -8);
   2008     for (k = 0; k < n_int; ++k) {
   2009       i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(k + 1u) * 8;
   2010       mc->cfi_offset(mc, x64_dwarf_from_hw_gpr(cs_int[k]), off);
   2011     }
   2012   }
   2013 
   2014   end = mc->pos(mc);
   2015   obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start,
   2016                     (u64)(end - a->func_start));
   2017   if (a->func->atomize)
   2018     obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym,
   2019                     0);
   2020   if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end);
   2021   if (mc->cfi_endproc) mc->cfi_endproc(mc);
   2022   mc_end_function(mc);
   2023   a->func = NULL;
   2024 }
   2025 
   2026 /* ============================ params / ABI helpers
   2027  * ============================
   2028  */
   2029 
   2030 /* Win64 shares one arg-slot index across int and FP. Keep cursors in lockstep.
   2031  */
   2032 static void x64_sync_slot(const X64ABIRegs* abi, u32* next_int, u32* next_fp) {
   2033   u32 m;
   2034   if (!abi->slot_shared_int_fp) return;
   2035   m = *next_int > *next_fp ? *next_int : *next_fp;
   2036   *next_int = m;
   2037   *next_fp = m;
   2038 }
   2039 
   2040 static const ABIArgInfo* x64_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
   2041                                        const NativeCallDesc* desc, u32 i,
   2042                                        ABIArgInfo* scratch) {
   2043   int variadic = abi && i >= abi->nparams;
   2044   if (abi && i < abi->nparams) return &abi->params[i];
   2045   (void)variadic;
   2046   memset(scratch, 0, sizeof *scratch);
   2047   scratch->kind = ABI_ARG_DIRECT;
   2048   scratch->nparts = 1;
   2049   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
   2050   ((ABIArgPart*)scratch->parts)[0].cls =
   2051       cg_type_is_float(t->c, desc->args[i].type) ? ABI_CLASS_FP : ABI_CLASS_INT;
   2052   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
   2053   ((ABIArgPart*)scratch->parts)[0].size =
   2054       native_type_size(t, desc->args[i].type);
   2055   ((ABIArgPart*)scratch->parts)[0].align =
   2056       native_type_align(t, desc->args[i].type);
   2057   return scratch;
   2058 }
   2059 
   2060 static KitCgTypeId x64_part_scalar_type(const ABIArgPart* part) {
   2061   if (part->cls == ABI_CLASS_FP)
   2062     return part->size <= 4u ? builtin_id(KIT_CG_BUILTIN_F32)
   2063                             : builtin_id(KIT_CG_BUILTIN_F64);
   2064   switch (part->size) {
   2065     case 1u:
   2066       return builtin_id(KIT_CG_BUILTIN_I8);
   2067     case 2u:
   2068       return builtin_id(KIT_CG_BUILTIN_I16);
   2069     case 4u:
   2070       return builtin_id(KIT_CG_BUILTIN_I32);
   2071     default:
   2072       return builtin_id(KIT_CG_BUILTIN_I64);
   2073   }
   2074 }
   2075 
   2076 /* Is the whole DIRECT arg forced to the stack (not enough reg slots)? */
   2077 static int x64_direct_to_stack(const X64ABIRegs* abi, const ABIArgInfo* ai,
   2078                                u32 next_int, u32 next_fp) {
   2079   u32 need_int, need_fp;
   2080   x64_abi_direct_reg_need(ai, &need_int, &need_fp);
   2081   return next_int + need_int > abi->n_int_args ||
   2082          next_fp + need_fp > abi->n_fp_args;
   2083 }
   2084 
   2085 /* Outgoing stack bytes a call uses (16-aligned), per the ABI. */
   2086 static u32 x64_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
   2087   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2088   const X64ABIRegs* aregs = x64_abi_for_os(t->c->target.os);
   2089   u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   2090   u32 next_fp = 0;
   2091   u32 stack = aregs->shadow_space;
   2092   u32 i;
   2093   x64_sync_slot(aregs, &next_int, &next_fp);
   2094   for (i = 0; i < desc->nargs; ++i) {
   2095     ABIArgInfo tmp;
   2096     const ABIArgInfo* ai = x64_param_abi(t, abi, desc, i, &tmp);
   2097     u16 p;
   2098     if (ai->kind == ABI_ARG_IGNORE) continue;
   2099     if (ai->kind == ABI_ARG_INDIRECT) {
   2100       if (next_int < aregs->n_int_args)
   2101         ++next_int;
   2102       else
   2103         stack += 8u;
   2104       x64_sync_slot(aregs, &next_int, &next_fp);
   2105       continue;
   2106     }
   2107     if (ai->kind == ABI_ARG_DIRECT &&
   2108         x64_direct_to_stack(aregs, ai, next_int, next_fp)) {
   2109       stack += (u32)ai->nparts * 8u;
   2110       continue;
   2111     }
   2112     for (p = 0; p < ai->nparts; ++p) {
   2113       const ABIArgPart* part = &ai->parts[p];
   2114       if (part->cls == ABI_CLASS_FP) {
   2115         if (next_fp < aregs->n_fp_args)
   2116           ++next_fp;
   2117         else
   2118           stack += 8u;
   2119       } else {
   2120         if (next_int < aregs->n_int_args)
   2121           ++next_int;
   2122         else
   2123           stack += 8u;
   2124       }
   2125       x64_sync_slot(aregs, &next_int, &next_fp);
   2126     }
   2127   }
   2128   return align_up_u32(stack, 16u);
   2129 }
   2130 
   2131 static u32 x64_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
   2132   return x64_call_stack_size(t, desc);
   2133 }
   2134 
   2135 static u32 x64_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
   2136                                      int* variadic, u32* nparams) {
   2137   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
   2138   NativeCallDesc d;
   2139   if (variadic) *variadic = abi ? (int)abi->variadic : 0;
   2140   if (nparams) *nparams = abi ? abi->nparams : 0u;
   2141   memset(&d, 0, sizeof d);
   2142   d.fn_type = fn_type;
   2143   d.nargs = abi ? abi->nparams : 0u;
   2144   if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
   2145   return x64_call_stack_size(t, &d);
   2146 }
   2147 
   2148 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */
   2149 static NativeAddr x64_loc_addr(X64NativeTarget* a, NativeLoc loc, u32 offset) {
   2150   NativeAddr addr;
   2151   memset(&addr, 0, sizeof addr);
   2152   switch ((NativeLocKind)loc.kind) {
   2153     case NATIVE_LOC_FRAME:
   2154       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2155       addr.base.frame = loc.v.frame;
   2156       addr.base_type = loc.type;
   2157       addr.offset = (i32)offset;
   2158       return addr;
   2159     case NATIVE_LOC_STACK:
   2160       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2161       addr.base.frame = loc.v.stack.slot;
   2162       addr.base_type = loc.type;
   2163       addr.offset = loc.v.stack.offset + (i32)offset;
   2164       return addr;
   2165     case NATIVE_LOC_ADDR:
   2166       addr = loc.v.addr;
   2167       addr.offset += (i32)offset;
   2168       return addr;
   2169     default:
   2170       x64_panic(a, "location is not addressable");
   2171   }
   2172   return addr;
   2173 }
   2174 
   2175 static void x64_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2176                           u32 offset, u32 size) {
   2177   X64NativeTarget* a = x64_of(t);
   2178   if (src.kind == NATIVE_LOC_REG) {
   2179     x64_move(t, dst, src);
   2180     return;
   2181   }
   2182   if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
   2183       src.kind == NATIVE_LOC_ADDR) {
   2184     NativeAddr addr = x64_loc_addr(a, src, offset);
   2185     addr.base_type = dst.type;
   2186     x64_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size));
   2187     return;
   2188   }
   2189   if (src.kind == NATIVE_LOC_IMM) {
   2190     x64_emit_load_imm(t->mc, x64_is_64(t, dst.type) ? 1 : 0, loc_reg(dst),
   2191                       src.v.imm);
   2192     return;
   2193   }
   2194   x64_panic(a, "unsupported part source");
   2195 }
   2196 
   2197 static void x64_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2198                            u32 offset, u32 size) {
   2199   X64NativeTarget* a = x64_of(t);
   2200   if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
   2201       dst.kind == NATIVE_LOC_ADDR) {
   2202     NativeAddr addr = x64_loc_addr(a, dst, offset);
   2203     addr.base_type = src.type;
   2204     x64_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
   2205     return;
   2206   }
   2207   if (dst.kind == NATIVE_LOC_REG) {
   2208     x64_move(t, dst, src);
   2209     return;
   2210   }
   2211   x64_panic(a, "unsupported part destination");
   2212 }
   2213 
   2214 static void x64_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   2215   NativeAddr addr = x64_loc_addr(x64_of(t), src, 0);
   2216   x64_load_addr(t, dst, addr);
   2217 }
   2218 
   2219 static void x64_store_outgoing_part(NativeTarget* t, int tail_call,
   2220                                     u32 stack_off, NativeLoc src, u32 size) {
   2221   X64NativeTarget* a = x64_of(t);
   2222   NativeAddr addr;
   2223   memset(&addr, 0, sizeof addr);
   2224   addr.base_kind = NATIVE_ADDR_BASE_REG;
   2225   addr.base_type = src.type;
   2226   if (tail_call) {
   2227     /* A sibling call reuses the caller's frame: its outgoing stack args land in
   2228      * the caller's incoming-arg window. `stack_off` already includes the
   2229      * shadow-space prefix (the outgoing cursor starts at shadow_space), so the
   2230      * window address is [rbp + 16 + stack_off] — the same bytes the tail-callee
   2231      * reads once `leave` has restored rsp to the return address. */
   2232     addr.base.reg = X64_RBP;
   2233     addr.offset = (i32)(16u + stack_off);
   2234   } else {
   2235     addr.base.reg = X64_RSP;
   2236     addr.offset = (i32)stack_off;
   2237   }
   2238   x64_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
   2239 }
   2240 
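/* NativeTarget bind_param (routing an incoming param from its ABI location
 * into dst) is implemented by the helpers below. Forward declaration first:
 * x64_bind_params_end needs the parallel-copy emitter that is defined later
 * with the call-lowering code. */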
   2242 static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
   2243                                    Reg int_scratch);
   2244 
   2245 /* Defer a register-destination param bind for the parallel-copy flush in
   2246  * x64_bind_params_end. `src` is the incoming location (an arg register, or a
   2247  * NATIVE_LOC_ADDR for an incoming stack slot). */
   2248 static void x64_defer_reg_bind(X64NativeTarget* a, NativeLoc dst, NativeLoc src,
   2249                                u32 size) {
   2250   NativeArgMove* m;
   2251   if (a->nbind_moves >= X64_MAX_BIND_MOVES)
   2252     x64_panic(a, "too many register parameter binds");
   2253   m = &a->bind_moves[a->nbind_moves++];
   2254   memset(m, 0, sizeof *m);
   2255   m->dst = dst;
   2256   m->src = src;
   2257   m->size = size;
   2258 }
   2259 
   2260 /* Incoming stack-arg source as a NATIVE_LOC_ADDR ([rbp + bias + stack_off]). */
   2261 static NativeLoc x64_incoming_stack_loc(KitCgTypeId type, NativeAllocClass cls,
   2262                                         i32 off) {
   2263   NativeLoc l;
   2264   memset(&l, 0, sizeof l);
   2265   l.kind = NATIVE_LOC_ADDR;
   2266   l.cls = (u8)cls;
   2267   l.type = type;
   2268   l.v.addr.base_kind = NATIVE_ADDR_BASE_REG;
   2269   l.v.addr.base.reg = X64_RBP;
   2270   l.v.addr.base_type = type;
   2271   l.v.addr.offset = off;
   2272   return l;
   2273 }
   2274 
   2275 static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
   2276                                   NativeLoc dst) {
   2277   X64NativeTarget* a = x64_of(t);
   2278   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   2279   const ABIArgInfo* ai =
   2280       p->index < abi->nparams ? &abi->params[p->index] : NULL;
   2281   int to_reg = dst.kind == NATIVE_LOC_REG;
   2282   /* Incoming stack args sit above the saved rbp + return addr (+16); Win64
   2283    * additionally reserves 32B of home space. */
   2284   i32 incoming_bias = (i32)(16u + a->abi->shadow_space);
   2285   u16 i;
   2286   if (!ai || ai->kind == ABI_ARG_IGNORE) return;
   2287 
   2288   if (ai->kind == ABI_ARG_INDIRECT) {
   2289     /* Incoming pointer to a byval copy: load pointer, memcpy into dst frame. */
   2290     u32 ptr_reg;
   2291     NativeAddr d_addr, from;
   2292     AggregateAccess access;
   2293     if (a->next_param_int < a->abi->n_int_args) {
   2294       ptr_reg = a->abi->int_args[a->next_param_int++];
   2295     } else {
   2296       ptr_reg = X64_R11;
   2297       emit_mov_load(t->mc, 8, 0, ptr_reg, X64_RBP,
   2298                     incoming_bias + (i32)a->next_param_stack);
   2299       a->next_param_stack += 8u;
   2300     }
   2301     x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp);
   2302     if (dst.kind != NATIVE_LOC_FRAME)
   2303       x64_panic(a, "indirect parameter requires a frame destination");
   2304     memset(&d_addr, 0, sizeof d_addr);
   2305     d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2306     d_addr.base.frame = dst.v.frame;
   2307     d_addr.base_type = p->type;
   2308     memset(&from, 0, sizeof from);
   2309     from.base_kind = NATIVE_ADDR_BASE_REG;
   2310     from.base.reg = ptr_reg;
   2311     from.base_type = p->type;
   2312     memset(&access, 0, sizeof access);
   2313     access.type = p->type;
   2314     access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
   2315     access.align = p->align ? p->align : native_type_align(t, p->type);
   2316     x64_copy_bytes(t, d_addr, from, access);
   2317     return;
   2318   }
   2319 
   2320   if (ai->kind == ABI_ARG_DIRECT &&
   2321       x64_direct_to_stack(a->abi, ai, a->next_param_int, a->next_param_fp)) {
   2322     /* Whole arg on the stack. */
   2323     for (i = 0; i < ai->nparts; ++i) {
   2324       const ABIArgPart* part = &ai->parts[i];
   2325       NativeAllocClass cls =
   2326           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2327       NativeLoc isrc = x64_incoming_stack_loc(
   2328           p->type, cls, incoming_bias + (i32)a->next_param_stack);
   2329       a->next_param_stack += 8u;
   2330       if (dst.kind == NATIVE_LOC_NONE) {
   2331         /* unused */
   2332       } else if (to_reg) {
   2333         /* Defer: a register dst may be another param's incoming reg. */
   2334         x64_defer_reg_bind(
   2335             a,
   2336             native_loc_reg(dst.type ? dst.type : p->type,
   2337                            (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
   2338             isrc, part->size);
   2339       } else {
   2340         /* Frame dst: load to scratch then store (memory dst is never a cycle
   2341          * source, so emit eagerly — it only reads the incoming slot). */
   2342         Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
   2343         NativeLoc tloc = native_loc_reg(p->type, cls, tmp);
   2344         x64_load_part(t, tloc, isrc, 0, part->size);
   2345         x64_store_part(
   2346             t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset),
   2347             tloc, 0, part->size);
   2348       }
   2349     }
   2350     return;
   2351   }
   2352 
   2353   for (i = 0; i < ai->nparts; ++i) {
   2354     const ABIArgPart* part = &ai->parts[i];
   2355     NativeAllocClass cls =
   2356         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2357     NativeLoc
   2358         src; /* incoming: arg register, or NATIVE_LOC_ADDR for a stack arg */
   2359     if (cls == NATIVE_REG_FP && a->next_param_fp < a->abi->n_fp_args) {
   2360       src = native_loc_reg(p->type, cls, (Reg)(X64_XMM0 + a->next_param_fp++));
   2361     } else if (cls == NATIVE_REG_INT &&
   2362                a->next_param_int < a->abi->n_int_args) {
   2363       src = native_loc_reg(p->type, cls, a->abi->int_args[a->next_param_int++]);
   2364     } else {
   2365       src = x64_incoming_stack_loc(p->type, cls,
   2366                                    incoming_bias + (i32)a->next_param_stack);
   2367       a->next_param_stack += 8u;
   2368     }
   2369     x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp);
   2370     if (dst.kind == NATIVE_LOC_NONE) {
   2371       /* unused parameter; cursors advanced */
   2372     } else if (to_reg) {
   2373       /* Defer the register bind: the allocator may rotate params across the
   2374        * incoming arg registers, so a per-param move could clobber a register
   2375        * another bind still needs. x64_bind_params_end resolves them together as
   2376        * a parallel copy. */
   2377       x64_defer_reg_bind(
   2378           a,
   2379           native_loc_reg(dst.type ? dst.type : p->type,
   2380                          (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
   2381           src, part->size);
   2382     } else if (src.kind == NATIVE_LOC_REG) {
   2383       x64_store_part(
   2384           t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
   2385           0, part->size);
   2386     } else {
   2387       /* Stack source -> frame dst: load to scratch, then store. */
   2388       Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
   2389       NativeLoc tloc = native_loc_reg(p->type, cls, tmp);
   2390       x64_load_part(t, tloc, src, 0, part->size);
   2391       x64_store_part(
   2392           t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset),
   2393           tloc, 0, part->size);
   2394     }
   2395   }
   2396   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
   2397 }
   2398 
   2399 /* Flush the deferred register-destination param binds as a parallel copy (the
   2400  * shared scheduler breaks any cycle the allocator's rotation created through
   2401  * the int/fp emit scratch). Frame-dst and indirect binds were emitted eagerly
   2402  * in bind_param — they only read incoming registers, so they precede this. */
   2403 static void x64_bind_params_end(NativeTarget* t) {
   2404   X64NativeTarget* a = x64_of(t);
   2405   /* No callee is staged during entry binds, so r11 is free as the cycle
   2406    * scratch. */
   2407   if (a->nbind_moves)
   2408     x64_emit_reg_arg_moves(t, a->bind_moves, a->nbind_moves, X64_TMP_INT2);
   2409   a->nbind_moves = 0;
   2410 }
   2411 
   2412 /* ============================ calls / returns ============================ */
   2413 
   2414 typedef NativeArgMove X64ArgMove;
   2415 
   2416 static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   2417   if (m->is_addr) {
   2418     x64_addr_of_loc(t, m->dst, m->src);
   2419   } else {
   2420     x64_load_part(t, m->dst, m->src, m->src_offset, m->size);
   2421   }
   2422   if (m->dup_to_gpr) {
   2423     /* movq gpr, xmm: 66 REX.W 0F 7E /r (xmm in reg field). */
   2424     emit_sse_rr_w(t->mc, 0x66, 0x7E, 1, loc_reg(m->dst), m->dup_gpr);
   2425   }
   2426 }
   2427 
   2428 /* Parallel-copy register arg moves via the shared scheduler. `int_scratch` is
   2429  * the register used to break an integer cycle: normally r11, but rax when an
   2430  * indirect callee is staged in r11 (rax is never a SysV int arg register and
   2431  * the variadic AL count is written only after the moves). */
   2432 static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
   2433                                    Reg int_scratch) {
   2434   NativeArgShuffle s;
   2435   if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args");
   2436   memset(&s, 0, sizeof s);
   2437   s.t = t;
   2438   s.emit_one = x64_emit_one_arg_move;
   2439   s.reg_move = x64_move;
   2440   s.scratch[NATIVE_REG_INT] = int_scratch;
   2441   s.scratch[NATIVE_REG_FP] = X64_TMP_FP;
   2442   native_arg_shuffle(&s, moves, n);
   2443 }
   2444 
   2445 /* Clobber masks: per-call all caller-saved regs are clobbered. */
   2446 static u32 x64_clobber_mask(const X64ABIRegs* abi, NativeAllocClass cls) {
   2447   u32 mask = 0, r;
   2448   if (cls == NATIVE_REG_INT) {
   2449     for (r = 0; r < 16u; ++r) {
   2450       if (r == X64_RSP || r == X64_RBP) continue;
   2451       if ((abi->cs_int_mask & (1ull << r)) == 0) mask |= 1u << r;
   2452     }
   2453   } else if (cls == NATIVE_REG_FP) {
   2454     for (r = 0; r < 16u; ++r)
   2455       if ((abi->cs_fp_mask & (1ull << r)) == 0) mask |= 1u << r;
   2456   }
   2457   return mask;
   2458 }
   2459 
   2460 static u32 x64_return_mask(const ABIFuncInfo* abi, NativeAllocClass cls) {
   2461   u32 mask = 0, ni = 0, nf = 0;
   2462   static const u32 iregs[2] = {X64_RAX, X64_RDX};
   2463   u16 i;
   2464   if (!abi || abi->ret.kind == ABI_ARG_IGNORE ||
   2465       abi->ret.kind == ABI_ARG_INDIRECT)
   2466     return 0;
   2467   for (i = 0; i < abi->ret.nparts; ++i) {
   2468     const ABIArgPart* p = &abi->ret.parts[i];
   2469     if (cls == NATIVE_REG_INT && p->cls == ABI_CLASS_INT && ni < 2)
   2470       mask |= 1u << iregs[ni++];
   2471     else if (cls == NATIVE_REG_FP && p->cls == ABI_CLASS_FP && nf < 2)
   2472       mask |= 1u << (X64_XMM0 + nf++);
   2473   }
   2474   return mask;
   2475 }
   2476 
   2477 static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   2478                           NativeCallPlan* plan) {
   2479   X64NativeTarget* a = x64_of(t);
   2480   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2481   const X64ABIRegs* aregs = a->abi ? a->abi : x64_abi_for_os(t->c->target.os);
   2482   NativeCallPlanRet* rets;
   2483   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2484   u32 c;
   2485   memset(plan, 0, sizeof *plan);
   2486   rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
   2487   plan->callee = desc->callee;
   2488   plan->rets = rets;
   2489   plan->flags = desc->flags;
   2490   plan->has_sret = abi && abi->has_sret;
   2491   plan->is_variadic = abi && abi->variadic;
   2492   plan->stack_arg_size = x64_call_stack_size(t, desc);
   2493   if (plan->stack_arg_size > a->frame.max_outgoing)
   2494     a->frame.max_outgoing = plan->stack_arg_size;
   2495   for (c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) {
   2496     plan->clobber_mask[c] = x64_clobber_mask(aregs, (NativeAllocClass)c);
   2497     plan->return_mask[c] = x64_return_mask(abi, (NativeAllocClass)c);
   2498   }
   2499   /* Indirect callee in a clobbered/arg register would be lost; stage in r11. */
   2500   if (plan->callee.kind == NATIVE_LOC_REG &&
   2501       (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
   2502       plan->callee.v.reg != X64_R11) {
   2503     NativeLoc scratch =
   2504         native_loc_reg(plan->callee.type, NATIVE_REG_INT, X64_R11);
   2505     x64_move(t, scratch, plan->callee);
   2506     plan->callee = scratch;
   2507   }
   2508   {
   2509     u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   2510     u32 next_fp = 0, stack = aregs->shadow_space, nmoves = 0, i;
   2511     int tail = (desc->flags & CG_CALL_TAIL) != 0;
   2512     u16 p;
   2513     X64ArgMove moves[X64_MAX_REG_ARG_MOVES];
   2514     x64_sync_slot(aregs, &next_int, &next_fp);
   2515     for (i = 0; i < desc->nargs; ++i) {
   2516       ABIArgInfo tmp;
   2517       const ABIArgInfo* ai = x64_param_abi(t, abi, desc, i, &tmp);
   2518       int variadic_arg = abi && i >= abi->nparams;
   2519       if (ai->kind == ABI_ARG_IGNORE) continue;
   2520       if (ai->kind == ABI_ARG_INDIRECT) {
   2521         if (next_int < aregs->n_int_args) {
   2522           X64ArgMove* m = &moves[nmoves++];
   2523           memset(m, 0, sizeof *m);
   2524           m->dst =
   2525               native_loc_reg(i64t, NATIVE_REG_INT, aregs->int_args[next_int++]);
   2526           m->src = desc->args[i];
   2527           m->size = 8;
   2528           m->is_addr = 1;
   2529         } else {
   2530           NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX);
   2531           x64_addr_of_loc(t, ptr, desc->args[i]);
   2532           x64_store_outgoing_part(t, tail, stack, ptr, 8);
   2533           stack += 8u;
   2534         }
   2535         x64_sync_slot(aregs, &next_int, &next_fp);
   2536         continue;
   2537       }
   2538       if (ai->kind == ABI_ARG_DIRECT &&
   2539           x64_direct_to_stack(aregs, ai, next_int, next_fp)) {
   2540         for (p = 0; p < ai->nparts; ++p) {
   2541           const ABIArgPart* part = &ai->parts[p];
   2542           NativeAllocClass cls =
   2543               part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2544           Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
   2545           NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
   2546           x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
   2547           x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
   2548           stack += 8u;
   2549         }
   2550         continue;
   2551       }
   2552       for (p = 0; p < ai->nparts; ++p) {
   2553         const ABIArgPart* part = &ai->parts[p];
   2554         NativeAllocClass cls =
   2555             part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2556         if (cls == NATIVE_REG_FP && next_fp < aregs->n_fp_args) {
   2557           X64ArgMove* m = &moves[nmoves++];
   2558           u32 slot = next_fp;
   2559           memset(m, 0, sizeof *m);
   2560           m->dst = native_loc_reg(desc->args[i].type, cls,
   2561                                   (Reg)(X64_XMM0 + next_fp++));
   2562           m->src = desc->args[i];
   2563           m->src_offset = part->src_offset;
   2564           m->size = part->size;
   2565           if (aregs->vararg_fp_dup_to_gpr && variadic_arg &&
   2566               slot < aregs->n_int_args) {
   2567             m->dup_to_gpr = 1;
   2568             m->dup_gpr = aregs->int_args[slot];
   2569           }
   2570           x64_sync_slot(aregs, &next_int, &next_fp);
   2571         } else if (cls == NATIVE_REG_INT && next_int < aregs->n_int_args) {
   2572           X64ArgMove* m = &moves[nmoves++];
   2573           memset(m, 0, sizeof *m);
   2574           m->dst = native_loc_reg(desc->args[i].type, cls,
   2575                                   aregs->int_args[next_int++]);
   2576           m->src = desc->args[i];
   2577           m->src_offset = part->src_offset;
   2578           m->size = part->size;
   2579           x64_sync_slot(aregs, &next_int, &next_fp);
   2580         } else {
   2581           Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
   2582           NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
   2583           x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
   2584           x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
   2585           stack += 8u;
   2586           x64_sync_slot(aregs, &next_int, &next_fp);
   2587         }
   2588       }
   2589     }
   2590     /* If an indirect callee was staged in r11 above, the cycle scratch must
   2591      * avoid it; rax is free here (not an int arg reg; AL count comes later). */
   2592     x64_emit_reg_arg_moves(
   2593         t, moves, nmoves,
   2594         (plan->callee.kind == NATIVE_LOC_REG && plan->callee.v.reg == X64_R11)
   2595             ? X64_TMP_INT
   2596             : X64_TMP_INT2);
   2597     if (abi && abi->has_sret) {
   2598       /* sret pointer in the first int-arg reg. A tail call forwards the
   2599        * caller's own incoming sret pointer (spilled at entry); otherwise pass
   2600        * the address of this call's result slot. */
   2601       NativeLoc sret = native_loc_reg(i64t, NATIVE_REG_INT, aregs->int_args[0]);
   2602       if (tail)
   2603         x64_load_part(t, sret, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0,
   2604                       8);
   2605       else if (desc->nresults)
   2606         x64_addr_of_loc(t, sret, desc->results[0]);
   2607     }
   2608     /* Variadic call: AL = number of vector regs used. */
   2609     if (abi && abi->variadic)
   2610       x64_emit_load_imm(t->mc, 0, X64_RAX, (i64)next_fp);
   2611   }
   2612   /* Return value receipt. */
   2613   if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
   2614     u32 nr = 0, ni = 0, nf = 0;
   2615     static const u32 ret_int_regs[2] = {X64_RAX, X64_RDX};
   2616     u16 p;
   2617     for (p = 0; p < abi->ret.nparts; ++p) {
   2618       const ABIArgPart* part = &abi->ret.parts[p];
   2619       NativeAllocClass cls =
   2620           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2621       KitCgTypeId pty = x64_part_scalar_type(part);
   2622       Reg rreg = cls == NATIVE_REG_FP ? (Reg)(X64_XMM0 + nf++)
   2623                                       : (Reg)ret_int_regs[ni++];
   2624       rets[nr].src = native_loc_reg(pty, cls, rreg);
   2625       rets[nr].dst = desc->results[0];
   2626       if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
   2627         rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
   2628                                         (i32)part->src_offset);
   2629       else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
   2630         rets[nr].dst.v.stack.offset += (i32)part->src_offset;
   2631         rets[nr].dst.type = pty;
   2632       }
   2633       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2634       nr++;
   2635     }
   2636     plan->nrets = nr;
   2637   } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
   2638     plan->nrets = 0;
   2639   } else if (!abi && desc->nresults) {
   2640     rets[0].src =
   2641         native_loc_reg(desc->results[0].type, NATIVE_REG_INT, X64_RAX);
   2642     rets[0].dst = desc->results[0];
   2643     rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0);
   2644     plan->nrets = 1;
   2645   }
   2646 }
   2647 
   2648 /* Emit a sibling (tail) call: tear the frame down and jump (no call) to the
   2649  * callee. Outgoing args are already in arg regs / the caller's incoming-arg
   2650  * window. `leave` (mov rsp,rbp; pop rbp) restores the caller's rbp and leaves
   2651  * rsp at the return address — frame_size-independent, so no func_end patch. */
   2652 static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
   2653   X64NativeTarget* a = x64_of(t);
   2654   MCEmitter* mc = t->mc;
   2655   ObjSecId sec = mc->section_id;
   2656   /* Restore callee-saves before the frame teardown (O1 path; none at -O0).
   2657    * Their rbp-relative offsets are frame-size-independent, and the indirect
   2658    * callee was staged in r11 by plan_call — a caller-saved scratch — so these
   2659    * restores never clobber it. Mirrors the x64_func_end epilogue. */
   2660   Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
   2661   u32 n_int = x64_collect_int_saves(a, cs_int);
   2662   u32 n_fp = x64_collect_fp_saves(a, cs_fp);
   2663   u32 xmm_base = x64_xmm_base(a, n_fp);
   2664   i32 i;
   2665   for (i = (i32)n_fp - 1; i >= 0; --i)
   2666     emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP,
   2667                   -(i32)xmm_base - (i32)(i + 1) * 16); /* movaps */
   2668   for (i = (i32)n_int - 1; i >= 0; --i)
   2669     emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP,
   2670                   -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8);
   2671   emit_leave(mc);
   2672   if (callee.kind == NATIVE_LOC_GLOBAL) {
   2673     u8 op = X64_OPC_JMP_REL32;
   2674     u32 disp_pos;
   2675     mc->emit_bytes(mc, &op, 1);
   2676     disp_pos = mc->pos(mc);
   2677     emit_u32le(mc, 0);
   2678     mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, callee.v.global.sym,
   2679                       callee.v.global.addend - 4, 1, 0);
   2680   } else if (callee.kind == NATIVE_LOC_REG) {
   2681     u32 r =
   2682         loc_reg(callee); /* indirect callee was staged in r11 by plan_call */
   2683     if (r & 8u) {
   2684       u8 rex = X64_REX_BASE | X64_REX_B;
   2685       mc->emit_bytes(mc, &rex, 1);
   2686     }
   2687     {
   2688       u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; /* jmp r/m, /4 */
   2689       mc->emit_bytes(mc, buf, 2);
   2690     }
   2691   } else {
   2692     x64_panic(a, "unsupported tail call target");
   2693   }
   2694 }
   2695 
   2696 static void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   2697   MCEmitter* mc = t->mc;
   2698   ObjSecId sec = mc->section_id;
   2699   if (plan->flags & CG_CALL_TAIL) {
   2700     x64_emit_tail_site(t, plan->callee);
   2701     return;
   2702   }
   2703   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
   2704     u8 op = X64_OPC_CALL_REL32;
   2705     u32 disp_pos;
   2706     mc->emit_bytes(mc, &op, 1);
   2707     disp_pos = mc->pos(mc);
   2708     emit_u32le(mc, 0);
   2709     mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, plan->callee.v.global.sym,
   2710                       plan->callee.v.global.addend - 4, 1, 0);
   2711     return;
   2712   }
   2713   if (plan->callee.kind == NATIVE_LOC_REG) {
   2714     u32 r = loc_reg(plan->callee);
   2715     if (r & 8u) {
   2716       u8 rex = X64_REX_BASE | X64_REX_B;
   2717       mc->emit_bytes(mc, &rex, 1);
   2718     }
   2719     {
   2720       u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 2u, r & 7u)}; /* call r/m, /2 */
   2721       mc->emit_bytes(mc, buf, 2);
   2722     }
   2723     return;
   2724   }
   2725   x64_panic(x64_of(t), "unsupported call target");
   2726 }
   2727 
   2728 static void x64_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
   2729                          const NativeLoc* value,
   2730                          NativeCallPlanRet** out_rets, u32* out_nrets) {
   2731   X64NativeTarget* a = x64_of(t);
   2732   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   2733   NativeCallPlanRet* rets = NULL;
   2734   u32 nr = 0;
   2735   if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
   2736   if (value && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
   2737     /* sret: reload destination pointer (spilled at entry) into r11, memcpy the
   2738      * source aggregate into [r11], and convention-return the pointer in rax. */
   2739     KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2740     NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, X64_R11);
   2741     NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
   2742     NativeAddr dst_addr, src_addr;
   2743     AggregateAccess access;
   2744     x64_load_part(t, dstp, saved, 0, 8);
   2745     memset(&dst_addr, 0, sizeof dst_addr);
   2746     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   2747     dst_addr.base.reg = X64_R11;
   2748     dst_addr.base_type = value->type;
   2749     src_addr = x64_loc_addr(a, *value, 0);
   2750     src_addr.base_type = value->type;
   2751     memset(&access, 0, sizeof access);
   2752     access.type = value->type;
   2753     access.size = (u32)cg_type_size(t->c, value->type);
   2754     access.align = native_type_align(t, value->type);
   2755     x64_copy_bytes(t, dst_addr, src_addr, access);
   2756     /* rax = sret pointer. Reload it (copy_bytes clobbered r11/rax). */
   2757     x64_load_part(t, native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX), saved, 0,
   2758                   8);
   2759     *out_rets = NULL;
   2760     *out_nrets = 0;
   2761     return;
   2762   }
   2763   if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) {
   2764     u32 ni = 0, nf = 0;
   2765     static const u32 ret_int_regs[2] = {X64_RAX, X64_RDX};
   2766     u16 p;
   2767     for (p = 0; p < abi->ret.nparts; ++p) {
   2768       const ABIArgPart* part = &abi->ret.parts[p];
   2769       NativeAllocClass cls =
   2770           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2771       KitCgTypeId pty = x64_part_scalar_type(part);
   2772       Reg rreg = cls == NATIVE_REG_FP ? (Reg)(X64_XMM0 + nf++)
   2773                                       : (Reg)ret_int_regs[ni++];
   2774       rets[nr].src = *value;
   2775       if (rets[nr].src.kind == NATIVE_LOC_FRAME)
   2776         rets[nr].src =
   2777             native_loc_stack(pty, value->v.frame, (i32)part->src_offset);
   2778       else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
   2779         rets[nr].src.v.stack.offset += (i32)part->src_offset;
   2780         rets[nr].src.type = pty;
   2781       }
   2782       rets[nr].dst = native_loc_reg(pty, cls, rreg);
   2783       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2784       nr++;
   2785     }
   2786   } else if (value) {
   2787     rets[0].src = *value;
   2788     rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, X64_RAX);
   2789     rets[0].mem = native_mem_for_type(t, value->type, 0);
   2790     nr = 1;
   2791   }
   2792   *out_rets = rets;
   2793   *out_nrets = nr;
   2794 }
   2795 
   2796 static void x64_ret(NativeTarget* t) {
   2797   X64NativeTarget* a = x64_of(t);
   2798   x64_jump(t, a->epilogue_label);
   2799 }
   2800 
   2801 /* ============================ alloca ============================ */
   2802 
   2803 static void x64_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
   2804                        u32 align) {
   2805   X64NativeTarget* a = x64_of(t);
   2806   MCEmitter* mc = t->mc;
   2807   u32 rsz = loc_reg(size);
   2808   u32 rd = loc_reg(dst);
   2809   u32 al = align ? align : 16u;
   2810   if (al < 16u) al = 16u;
   2811   if (al > 16u) x64_panic(a, "alloca align > 16 not supported");
   2812   if (size.kind == NATIVE_LOC_IMM) {
   2813     u64 aligned = ((u64)size.v.imm + 15u) & ~(u64)15u;
   2814     if (aligned == 0) aligned = 16;
   2815     /* sub rsp, imm32. */
   2816     emit_rex(mc, 1, 0, 0, X64_RSP);
   2817     {
   2818       u8 buf[2] = {X64_OPC_ALU_IMM32, modrm(3u, X64_ALU_SUB_SUB, X64_RSP)};
   2819       mc->emit_bytes(mc, buf, 2);
   2820     }
   2821     emit_u32le(mc, (u32)aligned);
   2822   } else {
   2823     /* rax = (size + 15) & ~15; sub rsp, rax. */
   2824     emit_lea(mc, X64_RAX, rsz, 15);
   2825     emit_rex(mc, 1, 0, 0, X64_RAX);
   2826     {
   2827       u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_AND, X64_RAX), 0xF0};
   2828       mc->emit_bytes(mc, buf, 3);
   2829     }
   2830     emit_alu_rr(mc, 1, X64_OPC_ALU_SUB, X64_RSP, X64_RAX);
   2831   }
   2832   a->frame.has_alloca = 1;
   2833   /* lea dst, [rsp + max_outgoing] — disp32 patched in func_end. */
   2834   if (a->npatches == a->patches_cap) {
   2835     u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
   2836     X64Patch* nb = arena_zarray(t->c->tu, X64Patch, cap);
   2837     if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
   2838     a->patches = nb;
   2839     a->patches_cap = cap;
   2840   }
   2841   emit_rex(mc, 1, rd, 0, X64_RSP);
   2842   {
   2843     u8 op = X64_OPC_LEA;
   2844     mc->emit_bytes(mc, &op, 1);
   2845   }
   2846   {
   2847     u8 mr = modrm(2u, rd & 7u, 4u);
   2848     mc->emit_bytes(mc, &mr, 1);
   2849   }
   2850   {
   2851     u8 s = sib(0u, 4u, X64_RSP);
   2852     mc->emit_bytes(mc, &s, 1);
   2853   }
   2854   a->patches[a->npatches].kind = X64_PATCH_ALLOCA;
   2855   a->patches[a->npatches].pos = mc->pos(mc);
   2856   a->npatches++;
   2857   a->nalloca++;
   2858   emit_u32le(mc, 0); /* placeholder disp32 */
   2859 }
   2860 
   2861 /* ============================ TLS ============================ */
   2862 
/* Win64 TLS Local-Exec (PE-COFF): TEB pointer -> _tls_index -> TLS block ->
 * lea &sym@SECREL. R11 is scratch.
 *
 * Emits four instructions that leave the address of thread-local `sym`
 * (displaced by `addend`) in the register named by `dst`:
 *   (1) mov rd, gs:[0x58]
 *   (2) mov r11d, [rip + _tls_index]
 *   (3) mov rd, [rd + r11*8]
 *   (4) lea rd, [rd + sym@SECREL]
 * `_tls_index` is looked up in the object's symbol table and, when missing,
 * created as an undefined global. Both relocations are recorded against the
 * emitter's current section at the position of their disp32 field. */
static void x64_tls_addr_of_win64(NativeTarget* t, NativeLoc dst, ObjSymId sym,
                                  i64 addend) {
  MCEmitter* mc = t->mc;
  u32 sec = mc->section_id;
  u32 rd = loc_reg(dst);
  /* (1) mov rd, gs:[0x58]. */
  {
    u8 gs = 0x65; /* GS segment-override prefix */
    mc->emit_bytes(mc, &gs, 1);
    emit_rex(mc, 1, rd, 0, 0);
    {
      u8 op = X64_OPC_MOV_R_RM;
      mc->emit_bytes(mc, &op, 1);
    }
    {
      /* rm=4 selects a SIB byte. */
      u8 mr = modrm(0u, rd & 7u, 4u);
      mc->emit_bytes(mc, &mr, 1);
    }
    {
      /* index=4 (none), base=5 with mod=0: absolute disp32, no registers. */
      u8 s = sib(0u, 4u, 5u);
      mc->emit_bytes(mc, &s, 1);
    }
    emit_u32le(mc, 0x58u);
  }
  /* (2) mov r11d, [rip + _tls_index]. */
  {
    Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index"));
    ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name);
    u8 rex_r, op, mr;
    u32 disp_pos;
    /* A zero id is treated as "not found": declare the symbol as undefined. */
    if (idx_sym == 0)
      idx_sym =
          obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
    /* REX.R without REX.W: 32-bit load into r11 (reg field 3 + REX.R). */
    rex_r = X64_REX_BASE | X64_REX_R;
    mc->emit_bytes(mc, &rex_r, 1);
    op = X64_OPC_MOV_R_RM;
    mc->emit_bytes(mc, &op, 1);
    mr = modrm(0u, 3u, 5u); /* r11&7, rip-rel */
    mc->emit_bytes(mc, &mr, 1);
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0);
    /* Addend -4: presumably because rip-relative displacements are measured
     * from the end of the 4-byte field — confirm against emit_reloc_at. */
    mc->emit_reloc_at(mc, sec, disp_pos, R_PC32, idx_sym, -4, 1, 0);
  }
  /* (3) mov rd, [rd + r11*8]. */
  {
    /* REX.X extends the index (r11); rd is both destination and base, so it
     * needs REX.R and REX.B together when it is r8..r15. */
    u8 rex = X64_REX_BASE | X64_REX_W | X64_REX_X;
    u8 op;
    if (rd & 8u) rex |= X64_REX_R | X64_REX_B;
    mc->emit_bytes(mc, &rex, 1);
    op = X64_OPC_MOV_R_RM;
    mc->emit_bytes(mc, &op, 1);
    if ((rd & 7u) == 5u) {
      /* Low bits 5 (rbp/r13) as SIB base with mod=0 would mean "no base,
       * disp32", so use mod=1 with an explicit zero disp8 instead. */
      u8 mr = modrm(1u, rd & 7u, 4u);
      u8 s = sib(3u, 3u, rd & 7u);
      u8 zero = 0;
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
      mc->emit_bytes(mc, &zero, 1);
    } else {
      u8 mr = modrm(0u, rd & 7u, 4u);
      u8 s = sib(3u, 3u, rd & 7u); /* scale 8, index r11&7, base rd&7 */
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
    }
  }
  /* (4) lea rd, [rd + sym@SECREL]. */
  {
    u8 rex = X64_REX_BASE | X64_REX_W;
    u8 op;
    u32 disp_pos;
    if (rd & 8u) rex |= X64_REX_R | X64_REX_B;
    mc->emit_bytes(mc, &rex, 1);
    op = X64_OPC_LEA;
    mc->emit_bytes(mc, &op, 1);
    if ((rd & 7u) == 4u) {
      /* Low bits 4 (rsp/r12) in modrm.rm mean "SIB follows", so the base has
       * to be expressed through a SIB byte with no index. */
      u8 mr = modrm(2u, rd & 7u, 4u);
      u8 s = sib(0u, 4u, rd & 7u);
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
    } else {
      u8 mr = modrm(2u, rd & 7u, rd & 7u);
      mc->emit_bytes(mc, &mr, 1);
    }
    /* disp32 placeholder; the section-relative offset of sym (+ addend) is
     * supplied through the R_COFF_SECREL relocation. */
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0);
    mc->emit_reloc_at(mc, sec, disp_pos, R_COFF_SECREL, sym, addend, 1, 0);
  }
}
   2953 
   2954 /* x86-64 TLS Local-Exec: mov rd, fs:0; lea rd, [rd + sym@tpoff]. */
   2955 static void x64_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
   2956                             i64 addend) {
   2957   MCEmitter* mc = t->mc;
   2958   u32 sec = mc->section_id;
   2959   u32 rd = loc_reg(dst);
   2960   u32 disp_pos;
   2961   if (obj_format_tls_model(t->c) == OBJ_TLS_WINDOWS_TEB) {
   2962     x64_tls_addr_of_win64(t, dst, sym, addend);
   2963     return;
   2964   }
   2965   /* mov rd, fs:[0]. */
   2966   {
   2967     u8 fs = 0x64;
   2968     mc->emit_bytes(mc, &fs, 1);
   2969     emit_rex(mc, 1, rd, 0, 0);
   2970     {
   2971       u8 op = X64_OPC_MOV_R_RM;
   2972       mc->emit_bytes(mc, &op, 1);
   2973     }
   2974     {
   2975       u8 mr = modrm(0u, rd & 7u, 4u);
   2976       mc->emit_bytes(mc, &mr, 1);
   2977     }
   2978     {
   2979       u8 s = sib(0u, 4u, 5u);
   2980       mc->emit_bytes(mc, &s, 1);
   2981     }
   2982     emit_u32le(mc, 0);
   2983   }
   2984   /* lea rd, [rd + disp32@tpoff]. */
   2985   emit_rex(mc, 1, rd, 0, rd);
   2986   {
   2987     u8 op = X64_OPC_LEA;
   2988     mc->emit_bytes(mc, &op, 1);
   2989   }
   2990   if ((rd & 7u) == 4u) {
   2991     u8 mr = modrm(2u, rd & 7u, 4u);
   2992     u8 s = sib(0u, 4u, rd & 7u);
   2993     mc->emit_bytes(mc, &mr, 1);
   2994     mc->emit_bytes(mc, &s, 1);
   2995   } else {
   2996     u8 mr = modrm(2u, rd & 7u, rd & 7u);
   2997     mc->emit_bytes(mc, &mr, 1);
   2998   }
   2999   disp_pos = mc->pos(mc);
   3000   emit_u32le(mc, 0);
   3001   mc->emit_reloc_at(mc, sec, disp_pos, R_X64_TPOFF32, sym, addend, 0, 0);
   3002 }
   3003 
   3004 /* ============================ atomics ============================ */
   3005 
   3006 static void emit_lock_prefix(MCEmitter* mc) {
   3007   u8 b = 0xF0;
   3008   mc->emit_bytes(mc, &b, 1);
   3009 }
   3010 static void emit_mfence(MCEmitter* mc) {
   3011   u8 b[3] = {0x0F, 0xAE, 0xF0};
   3012   mc->emit_bytes(mc, b, 3);
   3013 }
   3014 
   3015 /* Resolve an atomic addr to a bare base register (r11) + disp 0. */
   3016 static u32 x64_atomic_base(X64NativeTarget* a, NativeAddr addr) {
   3017   return x64_addr_to_base_reg(a, addr, X64_TMP_INT2);
   3018 }
   3019 
   3020 static void x64_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   3021                             MemAccess mem, KitCgMemOrder mo) {
   3022   X64NativeTarget* a = x64_of(t);
   3023   u32 sz = mem.size ? mem.size : native_type_size(t, dst.type);
   3024   u32 base;
   3025   (void)mo; /* x86 plain MOV is an acquire load. */
   3026   base = x64_atomic_base(a, addr);
   3027   emit_mov_load(t->mc, sz, 0, loc_reg(dst), base, 0);
   3028 }
   3029 
   3030 static void x64_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
   3031                              MemAccess mem, KitCgMemOrder mo) {
   3032   X64NativeTarget* a = x64_of(t);
   3033   MCEmitter* mc = t->mc;
   3034   u32 sz = mem.size ? mem.size : native_type_size(t, src.type);
   3035   int w = sz == 8u ? 1 : 0;
   3036   u32 base = x64_atomic_base(a, addr);
   3037   u32 sr = loc_reg(src);
   3038   if (mo == KIT_CG_MO_SEQ_CST) {
   3039     /* xchg [mem], r11 implicitly fences. Stage src in rax (r11 holds base). */
   3040     if (sr != X64_RAX) emit_mov_rr(mc, w, X64_RAX, sr);
   3041     emit_lock_prefix(mc);
   3042     emit_rex(mc, w, X64_RAX, 0, base);
   3043     {
   3044       u8 op = 0x87; /* xchg r/m, r */
   3045       mc->emit_bytes(mc, &op, 1);
   3046     }
   3047     emit_mem_operand(mc, X64_RAX, base, 0);
   3048     return;
   3049   }
   3050   emit_mov_store(mc, sz, sr, base, 0);
   3051 }
   3052 
/* Emit an atomic read-modify-write on the memory at `addr`, leaving the value
 * that was in memory before the operation in `dst`.
 *
 *   op       - which operation:
 *                ADD / SUB           lock xadd (SUB negates the staged value)
 *                XCHG                xchg with memory
 *                AND / OR / XOR /
 *                NAND                load + cmpxchg retry loop
 *                anything else       panic
 *   val      - register operand of the operation.
 *   mem.size - access width in bytes; 0 means "use the size of dst.type".
 *   mo       - ignored; every path uses a LOCK-prefixed instruction.
 *
 * Fixed registers: RDX always receives `val`; the cmpxchg loop additionally
 * uses RAX (prior value) and RCX (new value). X64_TMP_INT2 is used to hold the
 * address if it arrived in RAX, RCX or RDX.
 *
 * NOTE(review): the locked instructions are emitted in their 64-bit form when
 * the size is 8 and in their 32-bit form otherwise — confirm that 1- and
 * 2-byte atomics never reach this function. */
static void x64_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
                           NativeAddr addr, NativeLoc val, MemAccess mem,
                           KitCgMemOrder mo) {
  X64NativeTarget* a = x64_of(t);
  MCEmitter* mc = t->mc;
  u32 sz = mem.size ? mem.size : native_type_size(t, dst.type);
  int w = sz == 8u ? 1 : 0;
  u32 base = x64_atomic_base(a, addr);
  u32 dr = loc_reg(dst);
  u32 vr = loc_reg(val);
  (void)mo; /* LOCK ops are full barriers. */
  /* The rmw uses fixed rax (prior), rcx (new), rdx (val); the optimizer may
   * have materialized the address into one of them, so keep it out (r11 is the
   * int emit scratch, never an allocated operand). Stage before rdx is loaded.
   */
  if (base == X64_RAX || base == X64_RCX || base == X64_RDX) {
    emit_mov_rr(mc, 1, X64_TMP_INT2, base);
    base = X64_TMP_INT2;
  }
  /* val staged in rdx (rax/rcx used by the cmpxchg loop). */
  emit_mov_rr(mc, w, X64_RDX, vr);
  if (op == KIT_CG_ATOMIC_ADD || op == KIT_CG_ATOMIC_SUB) {
    /* Subtraction is an xadd of the negated operand. */
    if (op == KIT_CG_ATOMIC_SUB) emit_f7_rm(mc, w, X64_F7_SUB_NEG, X64_RDX);
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RDX, 0, base);
    {
      u8 op2[2] = {X64_OPC_TWOBYTE, 0xC1}; /* xadd */
      mc->emit_bytes(mc, op2, 2);
    }
    emit_mem_operand(mc, X64_RDX, base, 0);
    /* xadd left the previous memory value in rdx. */
    if (dr != X64_RDX) emit_mov_rr(mc, w, dr, X64_RDX);
    return;
  }
  if (op == KIT_CG_ATOMIC_XCHG) {
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RDX, 0, base);
    {
      u8 op2 = 0x87; /* xchg */
      mc->emit_bytes(mc, &op2, 1);
    }
    emit_mem_operand(mc, X64_RDX, base, 0);
    /* xchg left the previous memory value in rdx. */
    if (dr != X64_RDX) emit_mov_rr(mc, w, dr, X64_RDX);
    return;
  }
  /* AND/OR/XOR/NAND: cmpxchg retry loop. rax=prior, rcx=new, rdx=val. */
  {
    MCLabel retry = mc->label_new(mc);
    /* The initial load sits outside the loop: a failed cmpxchg reloads rax
     * with the current memory value itself. */
    emit_mov_load(mc, sz, 0, X64_RAX, base, 0);
    mc->label_place(mc, retry);
    emit_mov_rr(mc, w, X64_RCX, X64_RAX);
    switch (op) {
      case KIT_CG_ATOMIC_AND:
        emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_OR:
        emit_alu_rr(mc, w, X64_OPC_ALU_OR, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_XOR:
        emit_alu_rr(mc, w, X64_OPC_ALU_XOR, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_NAND:
        /* new = ~(prior & val) */
        emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX);
        emit_f7_rm(mc, w, X64_F7_SUB_NOT, X64_RCX);
        break;
      default:
        x64_panic(a, "unsupported atomic rmw op");
    }
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RCX, 0, base);
    {
      u8 op2[2] = {X64_OPC_TWOBYTE, 0xB1}; /* cmpxchg */
      mc->emit_bytes(mc, op2, 2);
    }
    emit_mem_operand(mc, X64_RCX, base, 0);
    /* Memory changed under us: go round again with the refreshed rax. */
    emit_jcc_rel32(mc, X64_CC_NE, retry);
    if (dr != X64_RAX) emit_mov_rr(mc, w, dr, X64_RAX);
  }
}
   3131 
/* Emit an atomic compare-and-swap on the memory at `addr` using
 * `lock cmpxchg [base], rcx`.
 *
 *   prior    - receives RAX after the cmpxchg (the value the instruction
 *              observed in memory).
 *   ok       - receives 1 if the exchange happened, 0 otherwise (setcc on the
 *              equal flag, then zero-extended).
 *   expected - value compared against memory; moved to RAX.
 *   desired  - value stored on success; moved to RCX.
 *   mem.size - access width in bytes; 0 means "use the size of prior.type".
 *   success / failure - ignored; the same LOCK-prefixed instruction is emitted
 *              for every ordering.
 *
 * Clobbers RAX and RCX, plus X64_TMP_INT2 when the address arrived in RAX or
 * RCX.
 *
 * NOTE(review): the cmpxchg is 64-bit when the size is 8 and 32-bit otherwise
 * — confirm that 1- and 2-byte atomics never reach this function. */
static void x64_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
                           NativeAddr addr, NativeLoc expected,
                           NativeLoc desired, MemAccess mem,
                           KitCgMemOrder success, KitCgMemOrder failure) {
  X64NativeTarget* a = x64_of(t);
  MCEmitter* mc = t->mc;
  u32 sz = mem.size ? mem.size : native_type_size(t, prior.type);
  int w = sz == 8u ? 1 : 0;
  u32 base = x64_atomic_base(a, addr);
  u32 rprior = loc_reg(prior);
  u32 rok = loc_reg(ok);
  u32 rexp = loc_reg(expected);
  u32 rdes = loc_reg(desired);
  (void)success;
  (void)failure;
  /* cmpxchg uses fixed rax (expected) and rcx (desired). The optimizer may have
   * materialized the address into either; keep it out of both (r11 is the int
   * emit scratch, never an allocated operand). */
  if (base == X64_RAX || base == X64_RCX) {
    emit_mov_rr(mc, 1, X64_TMP_INT2, base);
    base = X64_TMP_INT2;
  }
  /* Place expected -> rax and desired -> rcx as a parallel copy: the allocator
   * may have them in each other's target register (full swap) or desired in rax
   * (expected's target), either of which a naive two-move order would clobber.
   */
  if (rexp == X64_RCX && rdes == X64_RAX) {
    /* Swap rax <-> rcx (xchg needs no temp; base is not rax/rcx here). */
    emit_rex(mc, w, X64_RCX, 0, X64_RAX);
    {
      u8 xchg[2] = {0x87, modrm(3u, X64_RCX, X64_RAX)};
      mc->emit_bytes(mc, xchg, 2);
    }
  } else if (rdes == X64_RAX) {
    /* desired sits in rax; move it to rcx before rax is overwritten. */
    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
  } else {
    /* No hazard on rax: fill rax first, then rcx (this also covers expected
     * starting out in rcx, which is read before rcx is overwritten). */
    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
  }
  emit_lock_prefix(mc);
  emit_rex(mc, w, X64_RCX, 0, base);
  {
    u8 op2[2] = {X64_OPC_TWOBYTE, 0xB1}; /* cmpxchg [base], rcx */
    mc->emit_bytes(mc, op2, 2);
  }
  emit_mem_operand(mc, X64_RCX, base, 0);
  /* The success flag is captured right after the cmpxchg, before the copy of
   * rax into `prior`.
   * NOTE(review): if `ok` could ever be rax, the setcc would overwrite the
   * prior value before it is copied out — confirm rax is never assigned. */
  emit_setcc(mc, X64_CC_E, rok);
  emit_movzx_r32_r8(mc, rok, rok);
  if (rprior != X64_RAX) emit_mov_rr(mc, w, rprior, X64_RAX);
}
   3184 
   3185 static void x64_fence(NativeTarget* t, KitCgMemOrder mo) {
   3186   if (mo == KIT_CG_MO_SEQ_CST) emit_mfence(t->mc);
   3187 }
   3188 
   3189 /* ============================ variadics ============================
   3190  * SysV: __va_list_tag (gp_offset@0, fp_offset@4, overflow@8, reg_save@16). The
   3191  * prologue filled the 176B reg-save area. Win64: va_list is a single pointer
   3192  * to the next 8-byte slot in the home/overflow area; FP varargs are duplicated
   3193  * into the matching GPR slot at the call site. `ap` addresses the va_list
   3194  * object. */
   3195 
   3196 /* Resolve a va_list address into `scratch`, materializing it there if it is not
   3197  * already, so the va field-value scratch registers (rax / r10 / rdx) never
   3198  * alias it. At -O1 the optimizer may place the va_list pointer in any register
   3199  * — including those — and the va code would then clobber the pointer
   3200  * mid-sequence. */
   3201 static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
   3202   u32 base = x64_addr_to_base_reg(a, ap, scratch);
   3203   if (base != scratch) {
   3204     emit_mov_rr(a->base.mc, 1, scratch, base);
   3205     base = scratch;
   3206   }
   3207   return base;
   3208 }
   3209 
   3210 /* add r/m, imm8 (group-1 /0) directly to a memory field — advances a va_list
   3211  * offset/pointer in place without consuming a register. w selects 64- vs
   3212  * 32-bit. */
   3213 static void x64_add_mem_imm(MCEmitter* mc, int w, u32 base, i32 disp, i8 imm) {
   3214   u8 op = X64_OPC_ALU_IMM8;
   3215   u8 b;
   3216   emit_rex(mc, w, 0, 0, base);
   3217   mc->emit_bytes(mc, &op, 1);
   3218   emit_mem_operand(mc, X64_ALU_SUB_ADD, base, disp); /* modrm.reg = /0 (ADD) */
   3219   b = (u8)imm;
   3220   mc->emit_bytes(mc, &b, 1);
   3221 }
   3222 
   3223 /* add r64, [base+disp] (0x03 /r). */
   3224 static void x64_add_reg_mem(MCEmitter* mc, u32 dst, u32 base, i32 disp) {
   3225   u8 op = 0x03;
   3226   emit_rex(mc, 1, dst, 0, base);
   3227   mc->emit_bytes(mc, &op, 1);
   3228   emit_mem_operand(mc, dst, base, disp);
   3229 }
   3230 
   3231 static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
   3232   NativeTarget* t = &a->base;
   3233   MCEmitter* mc = t->mc;
   3234   u32 ap_base;
   3235   if (!a->is_variadic) x64_panic(a, "va_start: function not variadic");
   3236   ap_base = x64_va_base(a, ap, X64_TMP_INT2);
   3237   if (a->abi->shadow_space) {
   3238     /* Win64: *ap = rbp + 16 + named_int*8 + named_stack. */
   3239     u32 first = 16u + a->next_param_int * 8u + a->next_param_stack;
   3240     emit_lea(mc, X64_RAX, X64_RBP, (i32)first);
   3241     emit_mov_store(mc, 8, X64_RAX, ap_base, 0);
   3242     return;
   3243   }
   3244   {
   3245     X64NativeSlot* rs = x64_slot_get(a, a->reg_save_slot);
   3246     /* gp_offset = next_param_int * 8 */
   3247     x64_emit_load_imm(mc, 0, X64_RAX, (i64)(a->next_param_int * 8u));
   3248     emit_mov_store(mc, 4, X64_RAX, ap_base, 0);
   3249     /* fp_offset = 48 + next_param_fp * 16 */
   3250     x64_emit_load_imm(mc, 0, X64_RAX, (i64)(48u + a->next_param_fp * 16u));
   3251     emit_mov_store(mc, 4, X64_RAX, ap_base, 4);
   3252     /* overflow_arg_area = rbp + 16 + next_param_stack */
   3253     emit_lea(mc, X64_RAX, X64_RBP, (i32)(16u + a->next_param_stack));
   3254     emit_mov_store(mc, 8, X64_RAX, ap_base, 8);
   3255     /* reg_save_area = rbp - reg_save_slot.off */
   3256     emit_lea(mc, X64_RAX, X64_RBP, -(i32)rs->off);
   3257     emit_mov_store(mc, 8, X64_RAX, ap_base, 16);
   3258   }
   3259 }
   3260 
/* Emit va_arg: fetch the next variadic argument of `type` from the va_list
 * addressed by `ap` into `dst`, and advance the va_list.
 *
 *   dst  - destination register; an FP location selects the FP path (SSE
 *          scalar load, fp_offset field), anything else the integer path.
 *   ap   - address of the va_list object; pinned in X64_TMP_INT2 first.
 *   type - type of the argument; only its size is used here.
 *
 * Shadow-space ABI (Win64): load through the single pointer, then add 8 to it.
 * Otherwise (SysV): compare the relevant offset field against its limit (48
 * for integers, 176 for FP); below the limit the value comes from the
 * register-save area and the offset advances by 8 / 16, otherwise it comes
 * from the overflow area and that pointer advances by 8. */
static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
                            KitCgTypeId type) {
  NativeTarget* t = &a->base;
  MCEmitter* mc = t->mc;
  u32 sz = native_type_size(t, type);
  int is_fp = native_loc_is_fp(dst);
  u32 dr = loc_reg(dst);
  u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
  /* GPR scratch for the offset/address arithmetic. For integer results the
   * destination is itself a (throwaway) scratch GPR — pass_native_emit fetches
   * va_arg into a scratch and copies it to the real destination afterward — so
   * we reuse `dr` and touch no allocable register at all. FP results keep their
   * value in an XMM register, so they borrow the reserved RAX emit scratch.
   * Either way only r11 (ap_base) and `gp` are used: the va_list fields are
   * advanced in memory (x64_add_mem_imm) and the reg-save base is folded in
   * with x64_add_reg_mem, so no third register is needed. */
  u32 gp = is_fp ? X64_RAX : dr;
  if (a->abi->shadow_space) {
    /* Win64: gp = *ap; load dr from [gp]; *ap += 8. */
    emit_mov_load(mc, 8, 0, gp, ap_base, 0);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    x64_add_mem_imm(mc, 1, ap_base, 0, 8);
    return;
  }
  {
    /* Field offset inside the va_list, its exhaustion limit, and the
     * per-argument step for the selected register class. */
    u32 offs_field = is_fp ? 4u : 0u;
    u32 max_offs = is_fp ? 176u : 48u;
    i8 stride = is_fp ? 16 : 8;
    MCLabel L_stack = mc->label_new(mc);
    MCLabel L_done = mc->label_new(mc);
    /* gp32 = ap[offs]; cmp gp32, max; jae L_stack. Use the imm8 form when the
     * threshold fits (gp_offset max 48) so the encoding is canonical and the
     * `cc -S | as` round-trip reproduces it; fp_offset max 176 needs imm32. */
    emit_mov_load(mc, 4, 0, gp, ap_base, (i32)offs_field);
    if (imm_fits_i8((i64)max_offs))
      emit_alu_imm8(mc, 0, X64_ALU_SUB_CMP, gp, (i8)max_offs);
    else
      emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, gp, (i32)max_offs);
    emit_jcc_rel32(mc, X64_CC_AE, L_stack);
    /* reg path: ap[offs] += stride; gp = reg_save_area(ap[16]) + offset; load.
     * (The memory increment leaves gp holding the old offset.) */
    x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
    x64_add_reg_mem(mc, gp, ap_base, 16);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    emit_jmp_rel32(mc, L_done);
    /* stack path: gp = ap[8] (overflow area); load; ap[8] += 8. */
    mc->label_place(mc, L_stack);
    emit_mov_load(mc, 8, 0, gp, ap_base, 8);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    x64_add_mem_imm(mc, 1, ap_base, 8, 8);
    mc->label_place(mc, L_done);
  }
}
   3323 
   3324 static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
   3325                              NativeAddr src_ap) {
   3326   NativeTarget* t = &a->base;
   3327   MCEmitter* mc = t->mc;
   3328   /* Resolve dst into r11, src into rax, and copy each qword through the fp emit
   3329    * scratch xmm14. Uses only reserved emit scratch (r11/rax/xmm14), so the
   3330    * optimizer's register choice for a va_list pointer can never be clobbered
   3331    * and no allocable GPR (previously rdx) is consumed. */
   3332   u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
   3333   u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
   3334   u32 n = a->abi->shadow_space ? 8u : 24u, i;
   3335   for (i = 0; i < n; i += 8u) {
   3336     emit_sse_load(mc, 0xF2, 0x10, X64_TMP_FP, src_base, (i32)i);  /* movsd */
   3337     emit_sse_store(mc, 0xF2, 0x11, X64_TMP_FP, dst_base, (i32)i); /* movsd */
   3338   }
   3339 }
   3340 
   3341 static NativeAddr x64_va_addr_from_ptr(NativeLoc ap_ptr) {
   3342   NativeAddr addr;
   3343   memset(&addr, 0, sizeof addr);
   3344   addr.base_kind = NATIVE_ADDR_BASE_REG;
   3345   addr.cls = NATIVE_REG_INT;
   3346   addr.base.reg = ap_ptr.v.reg;
   3347   addr.base_type = ap_ptr.type;
   3348   return addr;
   3349 }
   3350 
   3351 static void x64_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
   3352   x64_va_start_core(x64_of(t), x64_va_addr_from_ptr(ap_ptr));
   3353 }
   3354 static void x64_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
   3355                               KitCgTypeId type) {
   3356   x64_va_arg_core(x64_of(t), dst, x64_va_addr_from_ptr(ap_ptr), type);
   3357 }
   3358 static void x64_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
   3359   (void)t;
   3360   (void)ap_ptr;
   3361 }
   3362 static void x64_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   3363   x64_va_copy_core(x64_of(t), x64_va_addr_from_ptr(dst),
   3364                    x64_va_addr_from_ptr(src));
   3365 }
   3366 
   3367 /* ============================ intrinsics ============================ */
   3368 
   3369 static void emit_popcnt(MCEmitter* mc, int w, u32 dst, u32 src) {
   3370   u8 p = 0xF3;
   3371   mc->emit_bytes(mc, &p, 1);
   3372   emit_rex(mc, w, dst, 0, src);
   3373   {
   3374     u8 op[2] = {X64_OPC_TWOBYTE, 0xB8};
   3375     mc->emit_bytes(mc, op, 2);
   3376   }
   3377   emit_rm_reg(mc, dst, src);
   3378 }
   3379 static void emit_bs(MCEmitter* mc, int w, u8 opcode2, u32 dst, u32 src) {
   3380   emit_rex(mc, w, dst, 0, src);
   3381   {
   3382     u8 op[2] = {X64_OPC_TWOBYTE, opcode2};
   3383     mc->emit_bytes(mc, op, 2);
   3384   }
   3385   emit_rm_reg(mc, dst, src);
   3386 }
   3387 static void emit_bswap(MCEmitter* mc, int w, u32 reg) {
   3388   emit_rex(mc, w, 0, 0, reg);
   3389   {
   3390     u8 op[2] = {X64_OPC_TWOBYTE, (u8)(0xC8 + (reg & 7u))};
   3391     mc->emit_bytes(mc, op, 2);
   3392   }
   3393 }
   3394 static void emit_rol16_imm8(MCEmitter* mc, u32 reg, u8 imm) {
   3395   u8 p = X64_OPSIZE_PFX;
   3396   mc->emit_bytes(mc, &p, 1);
   3397   emit_rex(mc, 0, 0, 0, reg);
   3398   {
   3399     u8 buf[3] = {X64_OPC_SHIFT_IMM, modrm(3u, 0u, reg & 7u), imm};
   3400     mc->emit_bytes(mc, buf, 3);
   3401   }
   3402 }
   3403 static void emit_ud2(MCEmitter* mc) {
   3404   u8 b[2] = {0x0F, 0x0B};
   3405   mc->emit_bytes(mc, b, 2);
   3406 }
   3407 
   3408 static void emit_syscall(MCEmitter* mc) {
   3409   u8 b[2] = {0x0F, 0x05};
   3410   mc->emit_bytes(mc, b, 2);
   3411 }
   3412 
   3413 static void x64_intrinsic(NativeTarget* t, IntrinKind kind,
   3414                           const NativeLoc* dsts, u32 ndst,
   3415                           const NativeLoc* args, u32 narg) {
   3416   X64NativeTarget* a = x64_of(t);
   3417   MCEmitter* mc = t->mc;
   3418   (void)ndst;
   3419   switch (kind) {
   3420     case INTRIN_NONE:
   3421       break;
   3422     case INTRIN_EXPECT:
   3423     case INTRIN_ASSUME_ALIGNED:
   3424       if (args[0].kind == NATIVE_LOC_IMM)
   3425         x64_emit_load_imm(mc, x64_is_64(t, dsts[0].type) ? 1 : 0,
   3426                           loc_reg(dsts[0]), args[0].v.imm);
   3427       else
   3428         x64_move(t, dsts[0], args[0]);
   3429       return;
   3430     case INTRIN_PREFETCH:
   3431       return;
   3432     case INTRIN_TRAP:
   3433       emit_ud2(mc);
   3434       return;
   3435     case INTRIN_SYSCALL:
   3436       if (ndst == 1u && narg >= 1u && narg <= 7u) {
   3437         static const u32 syscall_regs[7] = {
   3438             X64_RAX, X64_RDI, X64_RSI, X64_RDX, X64_R10, X64_R8, X64_R9};
   3439         X64ArgMove moves[7];
   3440         for (u32 i = 0; i < narg; ++i) {
   3441           X64ArgMove* m = &moves[i];
   3442           memset(m, 0, sizeof *m);
   3443           m->dst = native_loc_reg(dsts[0].type, NATIVE_REG_INT,
   3444                                   syscall_regs[i]);
   3445           m->src = args[i];
   3446           m->size = t->c->target.ptr_size;
   3447         }
   3448         x64_emit_reg_arg_moves(t, moves, narg, X64_TMP_INT2);
   3449         emit_syscall(mc);
   3450         x64_move(t, dsts[0],
   3451                  native_loc_reg(dsts[0].type, NATIVE_REG_INT, X64_RAX));
   3452       }
   3453       return;
   3454     case INTRIN_POPCOUNT:
   3455       emit_popcnt(mc, x64_is_64(t, args[0].type) ? 1 : 0, loc_reg(dsts[0]),
   3456                   loc_reg(args[0]));
   3457       return;
   3458     case INTRIN_CTZ:
   3459       emit_bs(mc, x64_is_64(t, args[0].type) ? 1 : 0, 0xBC /* bsf */,
   3460               loc_reg(dsts[0]), loc_reg(args[0]));
   3461       return;
   3462     case INTRIN_CLZ: {
   3463       int w = x64_is_64(t, args[0].type) ? 1 : 0;
   3464       u32 dr = loc_reg(dsts[0]);
   3465       emit_bs(mc, w, 0xBD /* bsr */, dr, loc_reg(args[0]));
   3466       /* clz = (bits-1) - bsr, computed via xor with bits-1. The mask (31/63)
   3467        * fits in imm8, so use the compact 0x83 form to match the canonical
   3468        * encoding (and the assembler's `cc -S | as` round-trip). */
   3469       emit_alu_imm8(mc, w, X64_ALU_SUB_XOR, dr, w ? 63 : 31);
   3470       return;
   3471     }
   3472     case INTRIN_BSWAP: {
   3473       u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
   3474       switch (width) {
   3475         case 2: {
   3476           u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
   3477           if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
   3478           emit_rol16_imm8(mc, dr, 8);
   3479           return;
   3480         }
   3481         case 4: {
   3482           u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
   3483           if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
   3484           emit_bswap(mc, 0, dr);
   3485           return;
   3486         }
   3487         case 8: {
   3488           u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
   3489           if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
   3490           emit_bswap(mc, 1, dr);
   3491           return;
   3492         }
   3493         default:
   3494           break;
   3495       }
   3496       return;
   3497     }
   3498     case INTRIN_SADD_OVERFLOW:
   3499     case INTRIN_UADD_OVERFLOW:
   3500     case INTRIN_SSUB_OVERFLOW:
   3501     case INTRIN_USUB_OVERFLOW: {
   3502       int w = x64_is_64(t, dsts[0].type) ? 1 : 0;
   3503       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3504       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3505       u8 op = (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW)
   3506                   ? X64_OPC_ALU_ADD
   3507                   : X64_OPC_ALU_SUB;
   3508       u32 cc = (kind == INTRIN_UADD_OVERFLOW || kind == INTRIN_USUB_OVERFLOW)
   3509                    ? X64_CC_B
   3510                    : X64_CC_O;
   3511       if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   3512       emit_alu_rr(mc, w, op, rd, rb);
   3513       emit_setcc(mc, cc, rovf);
   3514       emit_movzx_r32_r8(mc, rovf, rovf);
   3515       return;
   3516     }
   3517     case INTRIN_SMUL_OVERFLOW: {
   3518       int w = x64_is_64(t, dsts[0].type) ? 1 : 0;
   3519       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3520       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3521       if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   3522       emit_imul_rr(mc, w, rd, rb);
   3523       emit_setcc(mc, X64_CC_O, rovf);
   3524       emit_movzx_r32_r8(mc, rovf, rovf);
   3525       return;
   3526     }
   3527     case INTRIN_UMUL_OVERFLOW: {
   3528       int w = x64_is_64(t, dsts[0].type) ? 1 : 0;
   3529       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3530       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3531       if (rb == X64_RAX || rb == X64_RDX) {
   3532         emit_mov_rr(mc, w, X64_R11, rb);
   3533         rb = X64_R11;
   3534       }
   3535       if (ra != X64_RAX) emit_mov_rr(mc, w, X64_RAX, ra);
   3536       emit_f7_rm(mc, w, X64_F7_SUB_MUL, rb); /* MUL: rdx:rax = rax * rb */
   3537       if (rd != X64_RAX) emit_mov_rr(mc, w, rd, X64_RAX);
   3538       emit_setcc(mc, X64_CC_O, rovf);
   3539       emit_movzx_r32_r8(mc, rovf, rovf);
   3540       return;
   3541     }
   3542     case INTRIN_MEMMOVE: {
   3543       u32 dr, sr, n, i;
   3544       if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
   3545           args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
   3546         x64_panic(a, "unsupported memory intrinsic operands");
   3547       if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
   3548         x64_panic(a, "unsupported memory intrinsic size");
   3549       dr = loc_reg(args[0]);
   3550       sr = loc_reg(args[1]);
   3551       n = (u32)args[2].v.imm;
   3552       i = n; /* copy high-to-low so an overlapping dst > src is safe */
   3553       while (i >= 8u) {
   3554         i -= 8u;
   3555         emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i);
   3556         emit_mov_store(mc, 8, X64_RAX, dr, (i32)i);
   3557       }
   3558       while (i >= 4u) {
   3559         i -= 4u;
   3560         emit_mov_load(mc, 4, 0, X64_RAX, sr, (i32)i);
   3561         emit_mov_store(mc, 4, X64_RAX, dr, (i32)i);
   3562       }
   3563       while (i >= 2u) {
   3564         i -= 2u;
   3565         emit_mov_load(mc, 2, 0, X64_RAX, sr, (i32)i);
   3566         emit_mov_store(mc, 2, X64_RAX, dr, (i32)i);
   3567       }
   3568       while (i >= 1u) {
   3569         i -= 1u;
   3570         emit_mov_load(mc, 1, 0, X64_RAX, sr, (i32)i);
   3571         emit_mov_store(mc, 1, X64_RAX, dr, (i32)i);
   3572       }
   3573       return;
   3574     }
   3575     case INTRIN_CPU_NOP: {
   3576       u8 b = 0x90; /* NOP */
   3577       mc->emit_bytes(mc, &b, 1);
   3578       return;
   3579     }
   3580     case INTRIN_CPU_YIELD: {
   3581       u8 b[2] = {0xF3, 0x90}; /* PAUSE */
   3582       mc->emit_bytes(mc, b, 2);
   3583       return;
   3584     }
   3585     case INTRIN_DMB:
   3586     case INTRIN_DSB: {
   3587       u8 b[3] = {0x0F, 0xAE, 0xF0}; /* MFENCE: full-system memory barrier */
   3588       mc->emit_bytes(mc, b, 3);
   3589       return;
   3590     }
   3591     case INTRIN_IRQ_DISABLE: {
   3592       u8 b = 0xFA; /* CLI (privileged) */
   3593       mc->emit_bytes(mc, &b, 1);
   3594       return;
   3595     }
   3596     case INTRIN_IRQ_ENABLE: {
   3597       u8 b = 0xFB; /* STI (privileged) */
   3598       mc->emit_bytes(mc, &b, 1);
   3599       return;
   3600     }
   3601     case INTRIN_FRAME_ADDRESS:
   3602     case INTRIN_RETURN_ADDRESS:
   3603       /* Walk the rbp frame-record chain. Every kit prologue keeps the rbp
   3604        * record: [rbp] = caller's rbp, [rbp + 8] = return address pushed by the
   3605        * `call`. The level is a compile-time constant, so the walk unrolls to
   3606        * `level` dependent loads. */
   3607       if (ndst == 1u) {
   3608         u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM)
   3609                         ? (u32)args[0].v.imm
   3610                         : 0u;
   3611         u32 rd = loc_reg(dsts[0]);
   3612         emit_mov_rr(mc, 1, rd, X64_RBP);
   3613         for (u32 i = 0; i < level; ++i)
   3614           emit_mov_load(mc, 8, 0, rd, rd, 0); /* rd = *(rd) */
   3615         if (kind == INTRIN_RETURN_ADDRESS)
   3616           emit_mov_load(mc, 8, 0, rd, rd, 8); /* rd = *(rd + 8) */
   3617       }
   3618       return;
   3619     default:
   3620       break;
   3621   }
   3622   x64_panic(a, "unsupported compiler intrinsic");
   3623 }
   3624 
   3625 /* ============================ inline asm ============================ */
   3626 
/* Abort compilation with an inline-asm diagnostic at `loc`.
 *
 * Forwards to compiler_panic with `msg` prefixed by "x64 inline asm: ". The
 * function is declared _Noreturn, so callers may treat it as a terminator. */
_Noreturn static void x64_asm_panic_at(Compiler* c, SrcLoc loc,
                                       const char* msg) {
  compiler_panic(c, loc, "x64 inline asm: %s", msg);
}
/* Direct-path form of x64_asm_panic_at: the compiler comes from d->base.c and
 * the source location from d->loc. Never returns. */
_Noreturn static void x64_asm_panic(NativeDirectTarget* d, const char* msg) {
  x64_asm_panic_at(d->base.c, d->loc, msg);
}
   3634 
   3635 /* constraint_body / constraint_early / match_index are shared
   3636  * (cg/native_asm.h). */
   3637 
   3638 static void x64_asm_bound_reg(Operand* out, KitCgTypeId type,
   3639                               NativeAllocClass cls, Reg reg) {
   3640   memset(out, 0, sizeof *out);
   3641   out->kind = X64_INLINE_OPK_REG;
   3642   out->pad[0] =
   3643       (cls == NATIVE_REG_FP) ? X64_INLINE_OPCLS_FP : X64_INLINE_OPCLS_INT;
   3644   out->type = type;
   3645   out->v.local = (CGLocal)reg;
   3646 }
   3647 static void x64_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) {
   3648   memset(out, 0, sizeof *out);
   3649   out->kind = OPK_INDIRECT;
   3650   out->type = type;
   3651   out->v.ind.base = (CGLocal)base;
   3652   out->v.ind.index = CG_LOCAL_NONE;
   3653   out->v.ind.ofs = 0;
   3654 }
   3655 
   3656 /* Parse a clobber register name into (class, reg). Returns 0 for cc/memory.
   3657  * GPR names map to HW encoding via x64_register_hw_index; xmm names map via the
   3658  * DWARF table (xmm0..15 = dwarf 17..32). */
   3659 static int x64_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
   3660                                      NativeAllocClass* cls_out, Reg* reg_out) {
   3661   Slice s = pool_slice(c->global, name);
   3662   char buf[16];
   3663   uint32_t idx;
   3664   if (!s.s || !s.len) return 0;
   3665   if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
   3666   if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
   3667   if (s.len >= sizeof buf) x64_asm_panic_at(c, loc, "clobber name is too long");
   3668   memcpy(buf, s.s, s.len);
   3669   buf[s.len] = '\0';
   3670   if (x64_register_hw_index(buf, &idx) == 0 && idx <= 15u) {
   3671     *cls_out = NATIVE_REG_INT;
   3672     *reg_out = (Reg)idx;
   3673     return 1;
   3674   }
   3675   if (x64_register_index(buf, &idx) == 0 && idx >= 17u && idx <= 32u) {
   3676     *cls_out = NATIVE_REG_FP;
   3677     *reg_out = (Reg)(idx - 17u);
   3678     return 1;
   3679   }
   3680   x64_asm_panic_at(c, loc, "unknown clobber register");
   3681   return 0;
   3682 }
   3683 
   3684 static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   3685                                   u32 nclob, u32* int_mask, u32* fp_mask) {
   3686   u32 i;
   3687   *int_mask = 0;
   3688   *fp_mask = 0;
   3689   for (i = 0; i < nclob; ++i) {
   3690     NativeAllocClass cls;
   3691     Reg reg;
   3692     if (!x64_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue;
   3693     if (cls == NATIVE_REG_INT)
   3694       *int_mask |= 1u << reg;
   3695     else
   3696       *fp_mask |= 1u << reg;
   3697   }
   3698 }
   3699 
   3700 static int x64_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
   3701                                         const char* constraint,
   3702                                         NativeAsmRegPin* pin) {
   3703   NativeAsmRegPinStatus st =
   3704       native_asm_resolve_pin(d->native, reg, constraint, pin);
   3705   if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
   3706   if (st != NATIVE_ASM_REG_PIN_OK)
   3707     x64_asm_panic(d, native_asm_pin_status_message(st));
   3708   return 1;
   3709 }
   3710 
   3711 /* Pick a free register from caller-saved allocable pools for an asm operand the
   3712  * direct path self-allocates. */
   3713 static Reg x64_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls,
   3714                              u32 allowed_mask, u32* used_int, u32* used_fp) {
   3715   static const Reg int_pool[] = {X64_RDI, X64_RSI, X64_RDX,
   3716                                  X64_RCX, X64_R8,  X64_R9};
   3717   static const Reg fp_pool[] = {
   3718       X64_XMM0, X64_XMM1, X64_XMM2, X64_XMM3,     X64_XMM4,      X64_XMM5,
   3719       X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11};
   3720   const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
   3721   u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
   3722                                : (u32)(sizeof int_pool / sizeof int_pool[0]);
   3723   u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
   3724   u32 i;
   3725   for (i = 0; i < n; ++i) {
   3726     Reg r = pool[i];
   3727     if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue;
   3728     if ((*used & (1u << r)) != 0) continue;
   3729     *used |= 1u << r;
   3730     return r;
   3731   }
   3732   x64_asm_panic(d, "out of registers for asm operands");
   3733   return REG_NONE;
   3734 }
   3735 
   3736 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */
   3737 static NativeAddr x64_direct_addr(NativeDirectTarget* d, Operand op) {
   3738   NativeAddr addr;
   3739   memset(&addr, 0, sizeof addr);
   3740   switch ((OpKind)op.kind) {
   3741     case OPK_LOCAL:
   3742       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3743       addr.base.frame = d->locals[op.v.local - 1u].home;
   3744       addr.base_type = op.type;
   3745       return addr;
   3746     case OPK_INDIRECT:
   3747       addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
   3748       addr.base.frame = d->locals[op.v.ind.base - 1u].home;
   3749       addr.cls = d->locals[op.v.ind.base - 1u].cls;
   3750       addr.base_type = d->locals[op.v.ind.base - 1u].type;
   3751       addr.offset = op.v.ind.ofs;
   3752       return addr;
   3753     default:
   3754       x64_asm_panic(d, "operand is not addressable");
   3755   }
   3756 }
   3757 
   3758 static NativeAddr x64_direct_materialize_addr(NativeDirectTarget* d,
   3759                                               Operand op) {
   3760   X64NativeTarget* a = x64_of(d->native);
   3761   NativeAddr addr = x64_direct_addr(d, op);
   3762   if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   3763     NativeAddr load;
   3764     memset(&load, 0, sizeof load);
   3765     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   3766     load.base.frame = addr.base.frame;
   3767     load.base_type = addr.base_type;
   3768     emit_mov_load(a->base.mc, 8, 0, X64_TMP_INT2, X64_RBP,
   3769                   -(i32)x64_slot_get(a, addr.base.frame)->off);
   3770     addr.base_kind = NATIVE_ADDR_BASE_REG;
   3771     addr.base.reg = X64_TMP_INT2;
   3772   }
   3773   return addr;
   3774 }
   3775 
   3776 static void x64_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op,
   3777                                            NativeLoc dst) {
   3778   X64NativeTarget* a = x64_of(d->native);
   3779   NativeAddr addr;
   3780   memset(&addr, 0, sizeof addr);
   3781   switch ((OpKind)op.kind) {
   3782     case OPK_IMM:
   3783       if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
   3784         x64_asm_panic(d, "floating-point immediate asm input is unsupported");
   3785       d->native->load_imm(d->native, dst, op.v.imm);
   3786       return;
   3787     case OPK_LOCAL:
   3788       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3789       addr.base.frame = d->locals[op.v.local - 1u].home;
   3790       addr.base_type = op.type;
   3791       x64_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3792       return;
   3793     case OPK_GLOBAL:
   3794       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3795       addr.base.global.sym = op.v.global.sym;
   3796       addr.base.global.addend = op.v.global.addend;
   3797       addr.base_type = op.type;
   3798       d->native->load_addr(d->native, dst, addr);
   3799       return;
   3800     case OPK_INDIRECT:
   3801       addr = x64_direct_materialize_addr(d, op);
   3802       x64_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3803       return;
   3804   }
   3805   x64_asm_panic(d, "unsupported asm input operand");
   3806 }
   3807 
   3808 static void x64_direct_load_address_to_reg(NativeDirectTarget* d, Operand op,
   3809                                            NativeLoc dst) {
   3810   d->native->load_addr(d->native, dst, x64_direct_addr(d, op));
   3811 }
   3812 
   3813 static void x64_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op,
   3814                                             NativeLoc src) {
   3815   X64NativeTarget* a = x64_of(d->native);
   3816   NativeAddr addr;
   3817   memset(&addr, 0, sizeof addr);
   3818   if (op.kind == OPK_LOCAL) {
   3819     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3820     addr.base.frame = d->locals[op.v.local - 1u].home;
   3821     addr.base_type = op.type;
   3822   } else {
   3823     addr = x64_direct_materialize_addr(d, op);
   3824   }
   3825   x64_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0));
   3826 }
   3827 
/* Callee-saved registers an asm block clobbers must be saved around the block.
 * One record describes a single such register and where it was parked; the
 * records are filled by x64_asm_save_callee_clobbers / x64_asm_save_one and
 * read back by x64_asm_restore_one.
 */
typedef struct X64AsmSavedClobber {
  NativeFrameSlot slot;  /* frame slot holding the value (set by save_one) */
  NativeAllocClass cls;  /* register class; FP uses a 16-byte slot, else 8 */
  Reg reg;               /* register number within its class */
  KitCgTypeId type;      /* value type used for the save/restore access */
} X64AsmSavedClobber;
   3836 
   3837 static void x64_asm_save_one(X64NativeTarget* a, X64AsmSavedClobber* s) {
   3838   NativeFrameSlotDesc desc;
   3839   NativeAddr addr;
   3840   memset(&desc, 0, sizeof desc);
   3841   desc.type = s->type;
   3842   desc.size = s->cls == NATIVE_REG_FP ? 16u : 8u;
   3843   desc.align = desc.size;
   3844   desc.kind = NATIVE_FRAME_SLOT_SAVE;
   3845   s->slot = a->base.frame_slot(&a->base, &desc);
   3846   memset(&addr, 0, sizeof addr);
   3847   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3848   addr.base.frame = s->slot;
   3849   addr.base_type = s->type;
   3850   x64_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr,
   3851                native_mem_for_type(&a->base, s->type, desc.size));
   3852 }
   3853 static void x64_asm_restore_one(X64NativeTarget* a,
   3854                                 const X64AsmSavedClobber* s) {
   3855   NativeAddr addr;
   3856   memset(&addr, 0, sizeof addr);
   3857   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3858   addr.base.frame = s->slot;
   3859   addr.base_type = s->type;
   3860   x64_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr,
   3861                native_mem_for_type(&a->base, s->type,
   3862                                    s->cls == NATIVE_REG_FP ? 16u : 8u));
   3863 }
   3864 
   3865 /* SysV callee-saved: int rbx,r12-r15; no fp. Win64 adds rdi,rsi + xmm6-15. */
   3866 static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r) {
   3867   if (r == X64_RBP) return 0; /* prologue head handles rbp */
   3868   return (abi->cs_int_mask & (1ull << r)) != 0;
   3869 }
/* Nonzero when FP register `r` has its bit set in the ABI's callee-saved FP
 * mask (abi->cs_fp_mask). */
static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r) {
  return (abi->cs_fp_mask & (1ull << r)) != 0;
}
   3873 
   3874 static X64AsmSavedClobber* x64_asm_save_callee_clobbers(X64NativeTarget* a,
   3875                                                         u32 int_mask,
   3876                                                         u32 fp_mask,
   3877                                                         u32* nsaved_out) {
   3878   X64AsmSavedClobber* saved =
   3879       arena_zarray(a->base.c->tu, X64AsmSavedClobber, 32u);
   3880   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   3881   KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   3882   u32 n = 0;
   3883   Reg r;
   3884   for (r = 0; r <= 15u; ++r) {
   3885     if ((int_mask & (1u << r)) == 0 || !x64_reg_is_callee_int(a->abi, r))
   3886       continue;
   3887     saved[n].cls = NATIVE_REG_INT;
   3888     saved[n].reg = r;
   3889     saved[n].type = i64;
   3890     x64_asm_save_one(a, &saved[n++]);
   3891   }
   3892   for (r = 0; r <= 15u; ++r) {
   3893     if ((fp_mask & (1u << r)) == 0 || !x64_reg_is_callee_fp(a->abi, r))
   3894       continue;
   3895     saved[n].cls = NATIVE_REG_FP;
   3896     saved[n].reg = r;
   3897     saved[n].type = f64;
   3898     x64_asm_save_one(a, &saved[n++]);
   3899   }
   3900   *nsaved_out = n;
   3901   return saved;
   3902 }
   3903 
   3904 /* ---- NativeTarget (optimizer) asm hook ---- */
   3905 
   3906 static NativeAddr x64_asm_loc_to_addr(X64NativeTarget* a, SrcLoc loc,
   3907                                       NativeLoc src) {
   3908   NativeAddr addr;
   3909   memset(&addr, 0, sizeof addr);
   3910   addr.base_type = src.type;
   3911   switch ((NativeLocKind)src.kind) {
   3912     case NATIVE_LOC_FRAME:
   3913       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3914       addr.base.frame = src.v.frame;
   3915       return addr;
   3916     case NATIVE_LOC_ADDR:
   3917       return src.v.addr;
   3918     case NATIVE_LOC_GLOBAL:
   3919       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3920       addr.base.global.sym = src.v.global.sym;
   3921       addr.base.global.addend = src.v.global.addend;
   3922       return addr;
   3923     case NATIVE_LOC_REG:
   3924       addr.base_kind = NATIVE_ADDR_BASE_REG;
   3925       addr.cls = NATIVE_REG_INT;
   3926       addr.base.reg = src.v.reg;
   3927       return addr;
   3928     default:
   3929       x64_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
   3930   }
   3931 }
   3932 
   3933 static Reg x64_asm_native_mem_base(X64NativeTarget* a, SrcLoc loc,
   3934                                    NativeLoc src, u32* ntmp) {
   3935   NativeAddr addr = x64_asm_loc_to_addr(a, loc, src);
   3936   Reg dst;
   3937   if (addr.base_kind == NATIVE_ADDR_BASE_REG && addr.offset == 0 &&
   3938       addr.index_kind == NATIVE_ADDR_INDEX_NONE) {
   3939     if ((addr.base.reg & 0xfu) != X64_TMP_INT &&
   3940         (addr.base.reg & 0xfu) != X64_TMP_INT2)
   3941       return (Reg)(addr.base.reg & 0xfu);
   3942   }
   3943   if (*ntmp >= 2u)
   3944     x64_asm_panic_at(a->base.c, loc, "too many memory asm operands");
   3945   dst = (*ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2;
   3946   (*ntmp)++;
   3947   x64_addr_to_base_reg(a, addr, dst);
   3948   return dst;
   3949 }
   3950 
   3951 static void x64_asm_load_loc_to_reg(X64NativeTarget* a, SrcLoc loc,
   3952                                     NativeLoc src, NativeLoc dst) {
   3953   NativeTarget* t = &a->base;
   3954   NativeAllocClass cls = (NativeAllocClass)dst.cls;
   3955   if (src.kind == NATIVE_LOC_REG) {
   3956     if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src);
   3957     return;
   3958   }
   3959   if (src.kind == NATIVE_LOC_IMM) {
   3960     if (cls != NATIVE_REG_INT)
   3961       x64_asm_panic_at(t->c, loc,
   3962                        "floating-point immediate asm input is unsupported");
   3963     t->load_imm(t, dst, src.v.imm);
   3964     return;
   3965   }
   3966   x64_emit_mem(a, 1, dst, x64_asm_loc_to_addr(a, loc, src),
   3967                native_mem_for_type(t, dst.type, native_type_size(t, dst.type)));
   3968 }
   3969 
   3970 static void x64_asm_store_reg_to_loc(X64NativeTarget* a, SrcLoc loc,
   3971                                      NativeLoc dst, NativeLoc src) {
   3972   NativeTarget* t = &a->base;
   3973   if (dst.kind == NATIVE_LOC_REG) {
   3974     if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src);
   3975     return;
   3976   }
   3977   x64_emit_mem(a, 0, src, x64_asm_loc_to_addr(a, loc, dst),
   3978                native_mem_for_type(t, src.type, native_type_size(t, src.type)));
   3979 }
   3980 
   3981 static void x64_asm_bind_native(X64NativeTarget* a, SrcLoc loc, Operand* out,
   3982                                 const char* constraint, KitCgTypeId type,
   3983                                 NativeLoc src, u32* ntmp) {
   3984   const char* body = native_asm_constraint_body(constraint);
   3985   NativeAsmConstraintInfo info;
   3986   if (native_asm_constraint_reg_info(&a->base, constraint, &info)) {
   3987     if (src.kind != NATIVE_LOC_REG)
   3988       x64_asm_panic_at(a->base.c, loc,
   3989                        "register asm operand not in a register");
   3990     if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg)
   3991       x64_asm_panic_at(a->base.c, loc,
   3992                        "fixed-register asm operand in wrong register");
   3993     if (info.allowed_mask &&
   3994         ((Reg)src.v.reg >= 32 ||
   3995          (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0))
   3996       x64_asm_panic_at(a->base.c, loc,
   3997                        "register asm operand violates constraint register set");
   3998     x64_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg);
   3999   } else if (body[0] == 'i') {
   4000     if (src.kind != NATIVE_LOC_IMM)
   4001       x64_asm_panic_at(a->base.c, loc,
   4002                        "immediate asm operand is not immediate");
   4003     memset(out, 0, sizeof *out);
   4004     out->kind = OPK_IMM;
   4005     out->type = type;
   4006     out->v.imm = src.v.imm;
   4007   } else if (body[0] == 'm') {
   4008     x64_asm_bound_mem(out, type, x64_asm_native_mem_base(a, loc, src, ntmp));
   4009   } else {
   4010     x64_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
   4011   }
   4012 }
   4013 
   4014 static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
   4015                                  const AsmConstraint* outs, u32 nout,
   4016                                  NativeLoc* out_locs, const AsmConstraint* ins,
   4017                                  u32 nin, const NativeLoc* in_locs,
   4018                                  const Sym* clobbers, u32 nclob) {
   4019   X64NativeTarget* a = x64_of(t);
   4020   Compiler* c = t->c;
   4021   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   4022   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   4023   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   4024   u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL;
   4025   u32 ntmp = 0, i;
   4026   X64Asm* asmh;
   4027 
   4028   for (i = 0; i < nout; ++i) {
   4029     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
   4030     NativeLoc outloc = out_locs[i];
   4031     NativeAsmPinnedLoc pinned =
   4032         native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
   4033     if (pinned.has_pin) {
   4034       if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
   4035         x64_asm_panic_at(c, loc,
   4036                          native_asm_pin_status_message(pinned.pin_status));
   4037       if (pinned.wrong_reg)
   4038         x64_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
   4039       outloc = pinned.loc;
   4040       if (pinned.needs_stage) {
   4041         staged_outs[i] = 1u;
   4042         if (outs[i].dir == KIT_CG_ASM_INOUT)
   4043           x64_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
   4044       }
   4045     }
   4046     x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc,
   4047                         &ntmp);
   4048   }
   4049   for (i = 0; i < nin; ++i) {
   4050     const char* body = native_asm_constraint_body(ins[i].str);
   4051     int matched = native_asm_match_index(body);
   4052     KitCgTypeId type;
   4053     NativeLoc inloc;
   4054     if (matched >= 0) {
   4055       if ((u32)matched >= nout)
   4056         x64_asm_panic_at(c, loc, "matching constraint out of range");
   4057       bound_ins[i] = bound_outs[matched];
   4058       continue;
   4059     }
   4060     type = ins[i].type ? ins[i].type : in_locs[i].type;
   4061     inloc = in_locs[i];
   4062     {
   4063       NativeAsmPinnedLoc pinned =
   4064           native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
   4065       if (pinned.has_pin) {
   4066         if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
   4067           x64_asm_panic_at(c, loc,
   4068                            native_asm_pin_status_message(pinned.pin_status));
   4069         if (pinned.wrong_reg)
   4070           x64_asm_panic_at(c, loc,
   4071                            "hard-register asm operand in wrong register");
   4072         inloc = pinned.loc;
   4073         if (pinned.needs_stage)
   4074           x64_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
   4075       } else if ((body[0] == 'r') && inloc.kind != NATIVE_LOC_REG) {
   4076         Reg r;
   4077         if (ntmp >= 2u) x64_asm_panic_at(c, loc, "too many memory asm operands");
   4078         r = (ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2;
   4079         ntmp++;
   4080         inloc = native_loc_reg(type, NATIVE_REG_INT, r);
   4081         x64_emit_mem(a, 1, inloc, x64_asm_loc_to_addr(a, loc, in_locs[i]),
   4082                      native_mem_for_type(t, type, native_type_size(t, type)));
   4083       }
   4084     }
   4085     x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   4086   }
   4087 
   4088   /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
   4089    * masks and x64_known_callee_saves folded the callee-saved ones into the
   4090    * function's saved set, so the prologue/epilogue already preserve them. */
   4091   asmh = x64_asm_open(c);
   4092   x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   4093                   nclob);
   4094   x64_asm_run_template(asmh, t->mc, tmpl);
   4095   x64_asm_close(asmh);
   4096 
   4097   for (i = 0; i < nout; ++i) {
   4098     NativeAllocClass cls;
   4099     NativeLoc src;
   4100     if (!staged_outs || !staged_outs[i]) continue;
   4101     if (bound_outs[i].kind != X64_INLINE_OPK_REG) continue;
   4102     cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   4103                                                        : NATIVE_REG_INT;
   4104     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   4105     x64_asm_store_reg_to_loc(a, loc, out_locs[i], src);
   4106   }
   4107 }
   4108 
   4109 /* file_scope_asm + finalize are shared (cg/native_asm.h). */
   4110 
/* NativeTarget trap hook: emits a UD2 instruction into the target's emitter. */
static void x64_trap(NativeTarget* t) { emit_ud2(t->mc); }
   4112 static void x64_set_loc(NativeTarget* t, SrcLoc loc) {
   4113   x64_of(t)->loc = loc;
   4114   if (t->mc->set_loc) t->mc->set_loc(t->mc, loc);
   4115 }
   4116 
   4117 /* Physical registers each x86-64 instruction's encoding clobbers as a side
   4118  * effect, so the optimizer keeps values live across them out of those registers
   4119  * (the backend is then free to use them). idiv/div write rax (quotient) and rdx
   4120  * (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use
   4121  * rax/rcx/rdx; an FP va_arg borrows rax for the gp/fp offset (an integer va_arg
   4122  * does the offset arithmetic in its own destination register, so it clobbers
   4123  * nothing). */
   4124 static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
   4125                                    u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
   4126   (void)t;
   4127   mask[0] = mask[1] = mask[2] = 0;
   4128   switch ((NativeMachineOpKind)op->kind) {
   4129     case NATIVE_MOP_BINOP:
   4130       switch ((BinOp)op->binop) {
   4131         case BO_SDIV:
   4132         case BO_UDIV:
   4133         case BO_SREM:
   4134         case BO_UREM:
   4135           mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RDX);
   4136           return 1;
   4137         case BO_SHL:
   4138         case BO_SHR_S:
   4139         case BO_SHR_U:
   4140           if (op->second_is_reg) {
   4141             mask[NATIVE_REG_INT] = (1u << X64_RCX);
   4142             return 1;
   4143           }
   4144           return 0;
   4145         default:
   4146           return 0;
   4147       }
   4148     case NATIVE_MOP_ATOMIC_CAS:
   4149     case NATIVE_MOP_ATOMIC_RMW:
   4150       mask[NATIVE_REG_INT] =
   4151           (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX);
   4152       return 1;
   4153     case NATIVE_MOP_VA_START:
   4154       /* x64_va_start_core materializes the va_list field values through RAX
   4155        * (the ap pointer itself lands in the reserved r11 scratch). RAX is the
   4156        * return register, so the allocator may otherwise keep a live value there
   4157        * across the op. */
   4158       mask[NATIVE_REG_INT] = (1u << X64_RAX);
   4159       return 1;
   4160     case NATIVE_MOP_VA_ARG:
   4161       if (!op->result_is_fp) return 0;
   4162       mask[NATIVE_REG_INT] = (1u << X64_RAX);
   4163       return 1;
   4164     case NATIVE_MOP_INTRINSIC:
   4165       /* The unsigned multiply-overflow intrinsic emits a one-operand MUL, whose
   4166        * rdx:rax product clobbers both registers. The signed variant uses a
   4167        * two-operand IMUL (no fixed-register clobber). Linux syscall writes rax
   4168        * and the CPU instruction itself clobbers rcx/r11; the kernel ABI treats
   4169        * the integer caller-saved syscall registers as volatile. */
   4170       if ((IntrinKind)op->intrin == INTRIN_UMUL_OVERFLOW) {
   4171         mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RDX);
   4172         return 1;
   4173       }
   4174       if ((IntrinKind)op->intrin == INTRIN_SYSCALL) {
   4175         mask[NATIVE_REG_INT] =
   4176             (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) |
   4177             (1u << X64_RSI) | (1u << X64_RDI) | (1u << X64_R8) |
   4178             (1u << X64_R9) | (1u << X64_R10) | (1u << X64_R11);
   4179         return 1;
   4180       }
   4181       return 0;
   4182     default:
   4183       return 0;
   4184   }
   4185 }
   4186 
   4187 /* ============================ construction ============================ */
   4188 
   4189 NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   4190                                     MCEmitter* mc) {
   4191   X64NativeTarget* a = arena_znew(c->tu, X64NativeTarget);
   4192   NativeTarget* t;
   4193   if (!a) return NULL;
   4194   t = &a->base;
   4195   t->c = c;
   4196   t->obj = obj;
   4197   t->mc = mc;
   4198   native_frame_init(&a->frame, c);
   4199   t->regs = &x64_reg_info;
   4200   t->class_for_type = native_class_for_type_fp_le8;
   4201   t->imm_legal = x64_imm_legal;
   4202   t->addr_legal = x64_addr_legal;
   4203   t->machine_op_clobbers = x64_machine_op_clobbers;
   4204   t->func_begin = x64_func_begin;
   4205   t->func_begin_known_frame = x64_func_begin_known_frame;
   4206   t->bind_params_end = x64_bind_params_end;
   4207   t->note_frame_state = NULL;
   4208   /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
   4209    * set; x64_func_begin_known_frame derives the records from the masks. */
   4210   t->reserve_callee_saves = x64_reserve_callee_saves;
   4211   t->caller_saved_mask = x64_live_caller_saved_mask;
   4212   t->callee_saved_mask = x64_live_callee_saved_mask;
   4213   t->signature_stack_bytes = x64_signature_stack_bytes;
   4214   t->call_stack_bytes = x64_call_stack_bytes;
   4215   t->has_store_zero_reg = 0;
   4216   t->func_end = x64_func_end;
   4217   t->frame_slot = x64_frame_slot;
   4218   t->frame_slot_debug_loc = x64_frame_slot_debug_loc;
   4219   t->bind_param = x64_bind_native_param;
   4220   t->label_new = x64_label_new;
   4221   t->label_place = x64_label_place;
   4222   t->jump = x64_jump;
   4223   t->cmp_branch = x64_cmp_branch;
   4224   t->indirect_branch = x64_indirect_branch;
   4225   t->load_label_addr = x64_load_label_addr;
   4226   t->move = x64_move;
   4227   t->load_imm = x64_load_imm;
   4228   t->load_const = x64_load_const;
   4229   t->load_addr = x64_load_addr;
   4230   t->load = x64_load;
   4231   t->store = x64_store;
   4232   t->tls_addr_of = x64_tls_addr_of;
   4233   t->copy_bytes = x64_copy_bytes;
   4234   t->set_bytes = x64_set_bytes;
   4235   t->bitfield_load = x64_bitfield_load;
   4236   t->bitfield_store = x64_bitfield_store;
   4237   t->binop = x64_binop;
   4238   t->unop = x64_unop;
   4239   t->cmp = x64_cmp;
   4240   t->convert = x64_convert;
   4241   t->alloca_ = x64_alloca;
   4242   t->spill = x64_spill;
   4243   t->reload = x64_reload;
   4244   t->plan_call = x64_plan_call;
   4245   t->emit_call = x64_emit_call;
   4246   t->plan_ret = x64_plan_ret;
   4247   t->ret = x64_ret;
   4248   t->atomic_load = x64_atomic_load;
   4249   t->atomic_store = x64_atomic_store;
   4250   t->atomic_rmw = x64_atomic_rmw;
   4251   t->atomic_cas = x64_atomic_cas;
   4252   t->fence = x64_fence;
   4253   t->va_start_ = x64_va_start_native;
   4254   t->va_arg_ = x64_va_arg_native;
   4255   t->va_end_ = x64_va_end_native;
   4256   t->va_copy_ = x64_va_copy_native;
   4257   t->intrinsic = x64_intrinsic;
   4258   t->asm_block = x64_asm_block_native;
   4259   t->file_scope_asm = native_file_scope_asm;
   4260   t->trap = x64_trap;
   4261   t->set_loc = x64_set_loc;
   4262   t->finalize = native_finalize;
   4263   return t;
   4264 }
   4265 
   4266 /* ============================ NativeOps (-O0) ============================ */
   4267 
   4268 static void x64_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   4269                            CGLocal local, NativeDirectLocal* l) {
   4270   NativeLoc dst;
   4271   (void)local;
   4272   memset(&dst, 0, sizeof dst);
   4273   dst.kind = NATIVE_LOC_FRAME;
   4274   dst.type = p->type;
   4275   dst.v.frame = l->home;
   4276   x64_bind_native_param(d->native, p, dst);
   4277 }
   4278 
   4279 /* A sibling call is realizable when its outgoing stack-argument area fits the
   4280  * window the caller itself received. Register-only calls always qualify. */
   4281 static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
   4282   X64NativeTarget* a = x64_of(d->native);
   4283   NativeCallDesc nd;
   4284   NativeLoc* args = NULL;
   4285   NativeLoc* results = NULL;
   4286   u32 i, stack;
   4287   if (a->frame.ncallee_saves)
   4288     return "x64 tail call: callee-saved registers in use";
   4289   memset(&nd, 0, sizeof nd);
   4290   u32 nresults = call->result != CG_LOCAL_NONE ? 1u : 0u;
   4291   if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
   4292   if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults);
   4293   for (i = 0; i < call->nargs; ++i) {
   4294     args[i].kind = NATIVE_LOC_FRAME;
   4295     args[i].type = d->locals[call->args[i] - 1u].type;
   4296     args[i].cls = d->locals[call->args[i] - 1u].cls;
   4297     args[i].v.frame = d->locals[call->args[i] - 1u].home;
   4298   }
   4299   if (nresults) {
   4300     results[0].kind = NATIVE_LOC_FRAME;
   4301     results[0].type = d->locals[call->result - 1u].type;
   4302     results[0].cls = d->locals[call->result - 1u].cls;
   4303     results[0].v.frame = d->locals[call->result - 1u].home;
   4304   }
   4305   nd.fn_type = call->fn_type;
   4306   nd.args = args;
   4307   nd.results = results;
   4308   nd.nargs = call->nargs;
   4309   nd.nresults = nresults;
   4310   stack = x64_call_stack_size(d->native, &nd);
   4311   /* x64_call_stack_size includes the shadow-space prefix; the caller's incoming
   4312    * window has the same prefix, so compare against incoming_stack_size + it. */
   4313   if (stack > a->incoming_stack_size + a->abi->shadow_space)
   4314     return "x64 tail call: stack argument area too small";
   4315   return NULL;
   4316 }
   4317 
   4318 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`,
   4319  * returning a register-based NativeAddr. */
   4320 static NativeAddr x64_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
   4321   X64NativeTarget* a = x64_of(d->native);
   4322   NativeAddr addr;
   4323   memset(&addr, 0, sizeof addr);
   4324   if (op.kind == OPK_LOCAL) {
   4325     emit_mov_load(a->base.mc, 8, 0, X64_R11, X64_RBP,
   4326                   -(i32)x64_slot_get(a, d->locals[op.v.local - 1u].home)->off);
   4327     addr.base_kind = NATIVE_ADDR_BASE_REG;
   4328     addr.base.reg = X64_R11;
   4329     addr.base_type = op.type;
   4330     return addr;
   4331   }
   4332   return x64_direct_materialize_addr(d, op);
   4333 }
   4334 
   4335 static NativeAddr x64_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
   4336                                      Reg reg) {
   4337   NativeLoc dst =
   4338       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
   4339   NativeAddr addr;
   4340   d->native->load_addr(d->native, dst, x64_direct_pointer_addr(d, ap_addr));
   4341   memset(&addr, 0, sizeof addr);
   4342   addr.base_kind = NATIVE_ADDR_BASE_REG;
   4343   addr.cls = NATIVE_REG_INT;
   4344   addr.base.reg = reg;
   4345   addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
   4346   return addr;
   4347 }
   4348 
   4349 static void x64_va_start_(NativeDirectTarget* d, Operand ap_addr) {
   4350   /* Hold the va_list base in R11, not RAX: x64_va_start_core materializes the
   4351    * gp/fp_offset and overflow/reg-save-area field values through RAX, which
   4352    * would otherwise clobber the base before the field stores. */
   4353   x64_va_start_core(x64_of(d->native), x64_direct_va_base(d, ap_addr, X64_R11));
   4354 }
   4355 static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
   4356                         KitCgTypeId type) {
   4357   X64NativeTarget* a = x64_of(d->native);
   4358   int is_fp = cg_type_is_float(d->base.c, type);
   4359   NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
   4360                                  is_fp ? X64_TMP_FP : (Reg)X64_RDX);
   4361   NativeAddr dst_addr;
   4362   /* Base in R11: the core advances/loads through R11 plus one GPR scratch (the
   4363    * integer result reg itself, or RAX for FP results), so R11 must not be RAX.
   4364    */
   4365   x64_va_arg_core(a, res, x64_direct_va_base(d, ap_addr, X64_R11), type);
   4366   dst_addr = x64_direct_addr(d, dst);
   4367   if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   4368     emit_mov_load(a->base.mc, 8, 0, X64_R11, X64_RBP,
   4369                   -(i32)x64_slot_get(a, dst_addr.base.frame)->off);
   4370     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   4371     dst_addr.base.reg = X64_R11;
   4372   }
   4373   x64_emit_mem(
   4374       a, 0, res, dst_addr,
   4375       native_mem_for_type(d->native, type, native_type_size(d->native, type)));
   4376 }
   4377 static void x64_va_end_(NativeDirectTarget* d, Operand ap_addr) {
   4378   (void)d;
   4379   (void)ap_addr;
   4380 }
   4381 static void x64_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) {
   4382   X64NativeTarget* a = x64_of(d->native);
   4383   NativeAddr src_ap = x64_direct_va_base(d, src, X64_RAX);
   4384   NativeAddr dst_ap = x64_direct_va_base(d, dst, X64_R11);
   4385   x64_va_copy_core(a, dst_ap, src_ap);
   4386 }
   4387 
   4388 static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
   4389                                  const AsmConstraint* outs, u32 nout,
   4390                                  Operand* out_ops, const AsmConstraint* ins,
   4391                                  u32 nin, const Operand* in_ops,
   4392                                  const Sym* clobbers, u32 nclob,
   4393                                  u32 clobber_abi_sets) {
   4394   X64NativeTarget* a = x64_of(d->native);
   4395   Compiler* c = d->base.c;
   4396   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   4397   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   4398   u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   4399   X64AsmSavedClobber* saved;
   4400   u32 nsaved, i;
   4401   X64Asm* asmh;
   4402 
   4403   x64_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
   4404   native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
   4405   clob_int |= abi_int;
   4406   clob_fp |= abi_fp;
   4407   /* Reserve emit scratch (r10,r11), driver scratch (r8,r9), rax (reserved;
   4408    * only self-allocated here when explicitly pinned), sp/bp, and clobbers. */
   4409   used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) |
   4410              (1u << X64_RBP) | (1u << X64_R8) | (1u << X64_R9) |
   4411              (1u << X64_R10);
   4412   used_fp = clob_fp | (1u << X64_XMM4) | (1u << X64_XMM5) |
   4413             (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15);
   4414 
   4415   for (i = 0; i < nout; ++i) {
   4416     const char* body = native_asm_constraint_body(outs[i].str);
   4417     KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   4418     NativeAsmRegPin pin;
   4419     if (x64_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
   4420       /* GNU local register variable: pin to the named hard register. */
   4421       if (pin.cls == NATIVE_REG_FP) {
   4422         used_fp |= 1u << pin.reg;
   4423         clob_fp |= 1u << pin.reg;
   4424       } else {
   4425         used_int |= 1u << pin.reg;
   4426         clob_int |= 1u << pin.reg;
   4427       }
   4428       x64_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
   4429     } else {
   4430       NativeAsmConstraintInfo info;
   4431       if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) {
   4432         Reg reg = info.fixed_reg != REG_NONE
   4433                       ? info.fixed_reg
   4434                       : x64_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4435                                           &used_int, &used_fp);
   4436         if (info.cls == NATIVE_REG_FP) {
   4437           used_fp |= 1u << reg;
   4438           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4439         } else {
   4440           used_int |= 1u << reg;
   4441           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4442         }
   4443         x64_asm_bound_reg(&bound_outs[i], type, info.cls, reg);
   4444       } else if (body[0] == 'm') {
   4445         Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4446         x64_asm_bound_mem(&bound_outs[i], type, reg);
   4447       } else {
   4448         x64_asm_panic(d, "unsupported output constraint");
   4449       }
   4450     }
   4451   }
   4452 
   4453   for (i = 0; i < nin; ++i) {
   4454     const char* body = native_asm_constraint_body(ins[i].str);
   4455     int matched = native_asm_match_index(body);
   4456     KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   4457     if (matched >= 0) {
   4458       if ((u32)matched >= nout)
   4459         x64_asm_panic(d, "matching constraint out of range");
   4460       if (native_asm_constraint_early(outs[matched].str))
   4461         x64_asm_panic(d, "matching input names early-clobber output");
   4462       if (bound_outs[matched].kind != X64_INLINE_OPK_REG)
   4463         x64_asm_panic(d, "matching constraint requires register output");
   4464       bound_ins[i] = bound_outs[matched];
   4465       continue;
   4466     }
   4467     NativeAsmRegPin pin;
   4468     if (x64_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
   4469       /* GNU local register variable: pin to the named hard register. */
   4470       if (pin.cls == NATIVE_REG_FP) {
   4471         used_fp |= 1u << pin.reg;
   4472         clob_fp |= 1u << pin.reg;
   4473       } else {
   4474         used_int |= 1u << pin.reg;
   4475         clob_int |= 1u << pin.reg;
   4476       }
   4477       x64_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
   4478     } else {
   4479       NativeAsmConstraintInfo info;
   4480       if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) {
   4481         Reg reg = info.fixed_reg != REG_NONE
   4482                       ? info.fixed_reg
   4483                       : x64_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4484                                           &used_int, &used_fp);
   4485         if (info.cls == NATIVE_REG_FP) {
   4486           used_fp |= 1u << reg;
   4487           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4488         } else {
   4489           used_int |= 1u << reg;
   4490           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4491         }
   4492         x64_asm_bound_reg(&bound_ins[i], type, info.cls, reg);
   4493       } else if (body[0] == 'i') {
   4494         if (in_ops[i].kind != OPK_IMM)
   4495           x64_asm_panic(d, "immediate constraint requires immediate operand");
   4496         bound_ins[i] = in_ops[i];
   4497       } else if (body[0] == 'm') {
   4498         Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4499         x64_asm_bound_mem(&bound_ins[i], type, reg);
   4500       } else {
   4501         x64_asm_panic(d, "unsupported input constraint");
   4502       }
   4503     }
   4504   }
   4505 
   4506   saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
   4507   for (i = 0; i < nout; ++i) {
   4508     if (bound_outs[i].kind == X64_INLINE_OPK_REG) {
   4509       NativeAllocClass cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP
   4510                                  ? NATIVE_REG_FP
   4511                                  : NATIVE_REG_INT;
   4512       if (outs[i].dir == KIT_CG_ASM_INOUT) {
   4513         x64_direct_load_operand_to_reg(
   4514             d, out_ops[i],
   4515             native_loc_reg(bound_outs[i].type, cls,
   4516                            (Reg)bound_outs[i].v.local));
   4517       }
   4518     } else if (bound_outs[i].kind == OPK_INDIRECT) {
   4519       NativeLoc loc =
   4520           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4521                          (Reg)bound_outs[i].v.ind.base);
   4522       x64_direct_load_address_to_reg(d, out_ops[i], loc);
   4523     }
   4524   }
   4525   for (i = 0; i < nin; ++i) {
   4526     if (bound_ins[i].kind == X64_INLINE_OPK_REG) {
   4527       NativeAllocClass cls = bound_ins[i].pad[0] == X64_INLINE_OPCLS_FP
   4528                                  ? NATIVE_REG_FP
   4529                                  : NATIVE_REG_INT;
   4530       x64_direct_load_operand_to_reg(
   4531           d, in_ops[i],
   4532           native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
   4533     } else if (bound_ins[i].kind == OPK_INDIRECT) {
   4534       NativeLoc loc =
   4535           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4536                          (Reg)bound_ins[i].v.ind.base);
   4537       x64_direct_load_address_to_reg(d, in_ops[i], loc);
   4538     }
   4539   }
   4540   asmh = x64_asm_open(c);
   4541   x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   4542                   nclob);
   4543   x64_asm_run_template(asmh, d->native->mc, tmpl);
   4544   x64_asm_close(asmh);
   4545 
   4546   for (i = 0; i < nout; ++i) {
   4547     NativeAllocClass cls;
   4548     NativeLoc src;
   4549     if (bound_outs[i].kind != X64_INLINE_OPK_REG) continue;
   4550     cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   4551                                                       : NATIVE_REG_INT;
   4552     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   4553     x64_direct_store_reg_to_operand(d, out_ops[i], src);
   4554   }
   4555   for (i = nsaved; i > 0; --i) x64_asm_restore_one(a, &saved[i - 1u]);
   4556 }
   4557 
   4558 static const NativeOps x64_direct_ops = {
   4559     .bind_param = x64_bind_param,
   4560     .tail_call_unrealizable_reason = x64_no_tail,
   4561     .va_start_ = x64_va_start_,
   4562     .va_arg_ = x64_va_arg_,
   4563     .va_end_ = x64_va_end_,
   4564     .va_copy_ = x64_va_copy_,
   4565     .asm_block = x64_direct_asm_block,
   4566 };
   4567 
   4568 const NativeOps* x64_native_direct_ops(void) { return &x64_direct_ops; }
	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README