native.c (171676B)
1 /* src/arch/x64/native.c — x86-64 (SysV / Win64) NativeTarget implementation. 2 * 3 * Mirrors the rv64 reference (src/arch/rv64/native.c): a physical-emission 4 * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by 5 * the optimizer emit path. ABI decisions route through abi/ and the per-OS 6 * X64ABIRegs (x64_abi_for_os); this file owns ISA emission and the x64 frame 7 * layout. 8 * 9 * Frame model (single, rbp-anchored): the prologue does `push rbp; mov rbp,rsp; 10 * sub rsp,frame_size`. Local/spill slots live below rbp at positive byte 11 * offsets `off` (address = rbp - off). Incoming stack args sit above the saved 12 * return address at [rbp + 16 + shadow_space + ...]. Callee-saved GPRs (and, on 13 * Win64, XMMs) are saved below the locals; outgoing args sit at [rsp + 0..]. 14 * The single-pass (-O0) prologue reserves a NOP placeholder patched in func_end 15 * once max_outgoing and callee-saves are known. 16 * 17 * Register model. INT scratch (never allocable, never driver scratch): R10 and 18 * R11 — the emit paths' fixed temporaries. FP scratch: XMM14 and XMM15. RSP/RBP 19 * are reserved (stack/frame pointers). RAX is reserved too (return value, the 20 * div/mul implicit operand), but it is NOT an emit temp, so inline asm may pin 21 * an operand to it (the Linux syscall idiom) — see x64_asm_operand_reg_ok. 22 * The driver scratch pool is R8/R9 (int) and XMM4/XMM5 (fp), caller-saved on 23 * both SysV and Win64 and disjoint from the emit temps so a hook never clobbers 24 * an operand parked there. Scratch registers are reserved from allocation. 25 * Callee-saved set is resolved per-OS via x64_abi_for_os at runtime (the 26 * legality masks below are SysV's, the conservative superset that both ABIs' 27 * allocators respect — Win64's extra callee-saves RDI/RSI/xmm6-15 only shrink 28 * the allocable pool, never grow it). 
*/ 29 30 #include <string.h> 31 32 #include "abi/abi.h" 33 #include "arch/x64/asm.h" 34 #include "arch/x64/emit.h" 35 #include "arch/x64/isa.h" 36 #include "arch/x64/regs.h" 37 #include "arch/x64/x64.h" 38 #include "asm/asm.h" 39 #include "asm/asm_lex.h" 40 #include "cg/native_argmove.h" 41 #include "cg/native_asm.h" 42 #include "cg/native_direct_target.h" 43 #include "cg/native_frame.h" 44 #include "cg/type.h" 45 #include "core/arena.h" 46 #include "core/bytes.h" 47 #include "core/pool.h" 48 #include "core/slice.h" 49 #include "obj/obj.h" 50 51 enum { 52 X64_TMP_INT = X64_R10, /* emit-internal int scratch (reserved) */ 53 X64_TMP_INT2 = X64_R11, /* emit-internal int scratch (reserved) */ 54 X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved) */ 55 X64_TMP_FP2 = X64_XMM15, /* emit-internal fp scratch (reserved) */ 56 X64_MAX_REG_ARG_MOVES = 16u, 57 /* Deferred entry register-binds (-O1): bounded by simultaneously-live 58 * register-homed param parts, i.e. the allocable register count. */ 59 X64_MAX_BIND_MOVES = 32u, 60 X64_MAX_CS_FP_REGS = 10u, /* Win64 xmm6..xmm15 */ 61 }; 62 63 /* ============================ target state ============================ */ 64 65 /* Frame slots and callee-save records live in the shared NativeFrame 66 * bookkeeping (cg/native_frame.h); these aliases keep the x64-local spellings. 67 * x64 reads only .reg/.cls of a callee-save (it computes save offsets below the 68 * locals rather than homing them in frame slots, so .slot/.type stay unused). 
69 */ 70 typedef NativeFrameSlotEntry X64NativeSlot; 71 typedef NativeFrameCalleeSave X64CalleeSave; 72 73 typedef enum X64PatchKind { X64_PATCH_ALLOCA } X64PatchKind; 74 75 typedef struct X64Patch { 76 u8 kind; /* X64PatchKind */ 77 u32 pos; /* byte offset of the disp32 to patch */ 78 } X64Patch; 79 80 typedef struct X64NativeTarget { 81 NativeTarget base; 82 SrcLoc loc; 83 const CGFuncDesc* func; 84 85 /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save 86 * set, and the known_frame / has_alloca / frame_final flags. */ 87 NativeFrame frame; 88 u32 frame_size_final; 89 90 u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */ 91 u32 next_param_int; 92 u32 next_param_fp; 93 u32 next_param_stack; 94 u8 has_sret; 95 u8 is_variadic; 96 NativeFrameSlot sret_ptr_slot; 97 NativeFrameSlot reg_save_slot; /* SysV variadic 176B __va_list_tag area */ 98 99 X64Patch* patches; 100 u32 npatches; 101 u32 patches_cap; 102 u32 nalloca; 103 104 u32 func_start; 105 u32 prologue_pos; 106 u32 prologue_nbytes; 107 MCLabel epilogue_label; 108 109 /* Known-frame (-O1) prologue cost-model tiers, settled in 110 * x64_func_begin_known_frame; both 0 on the single-pass path (which can't 111 * know the frame up front). Either one suppresses the `sub rsp` reservation; 112 * the rbp frame record (push rbp; mov rbp,rsp) and every rbp-relative offset 113 * stay unchanged, so the epilogue (`leave`), CFI (CFA = rbp+16), and debug 114 * locs are identical to the fat shape. slim_frame - empty frame (no 115 * callee-saves/locals/outgoing/alloca): the `sub rsp` reserved nothing, so it 116 * is simply dropped. Safe for non-leaves (push rbp keeps rsp 16-aligned for 117 * calls, and nothing lives below rsp). SysV + Win64. redzone_leaf - SysV leaf 118 * with a small frame (<= 128B, no alloca, no outgoing args): 119 * locals/callee-saves stay at their rbp-relative offsets, which now land in 120 * the 128-byte red zone instead of a reserved region. 
Leaf-only — a call 121 * would clobber the red zone. */ 122 u8 slim_frame; 123 u8 redzone_leaf; 124 125 /* Optimizer (-O1) entry binds: register-destination param binds are deferred 126 * here and resolved as a parallel copy in x64_bind_params_end, since the 127 * allocator may rotate params across the incoming arg registers — a 128 * permutation the naive per-param move order would clobber. */ 129 NativeArgMove bind_moves[X64_MAX_BIND_MOVES]; 130 u32 nbind_moves; 131 132 const X64ABIRegs* abi; 133 } X64NativeTarget; 134 135 static X64NativeTarget* x64_of(NativeTarget* t) { return (X64NativeTarget*)t; } 136 137 static _Noreturn void x64_panic(X64NativeTarget* a, const char* msg) { 138 compiler_panic(a->base.c, a->loc, "x64 native target: %s", msg); 139 } 140 141 static X64NativeSlot* x64_slot_get(X64NativeTarget* a, NativeFrameSlot fs) { 142 return native_frame_slot_at(&a->frame, fs); 143 } 144 145 static u32 align_up_u32(u32 v, u32 align) { 146 u32 mask = align ? align - 1u : 0u; 147 return (v + mask) & ~mask; 148 } 149 150 /* ============================ type helpers ============================ */ 151 152 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h 153 * (native_type_size, native_type_align, native_mem_for_type, 154 * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack, 155 * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */ 156 157 /* A scalar value occupies a 64-bit register when it is pointer-sized or wider 158 * (drives REX.W selection). */ 159 static int x64_is_64(NativeTarget* t, KitCgTypeId type) { 160 return native_type_size(t, type) >= 8u || cg_type_is_ptr(t->c, type); 161 } 162 163 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0xfu; } 164 165 /* SSE scalar prefix: F2 (double / 8-byte) vs F3 (single / 4-byte). */ 166 static u8 sse_scalar_prefix(u32 size) { return size == 8u ? 
0xF2u : 0xF3u; } 167 168 /* Forward decls for the rel32 branch emitters (used by convert before the 169 * control-flow section defines them). */ 170 static void emit_jmp_rel32(MCEmitter* mc, MCLabel l); 171 static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l); 172 173 /* ============================ register tables ============================ */ 174 175 #define X64_PHYS_INT_ARG(r) \ 176 {.reg = (r), \ 177 .cls = NATIVE_REG_INT, \ 178 .abi_index = 0xffu, \ 179 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \ 180 .spill_cost = 1u, \ 181 .copy_cost = 1u} 182 #define X64_PHYS_INT_ARG_RESERVED(r) \ 183 {.reg = (r), \ 184 .cls = NATIVE_REG_INT, \ 185 .abi_index = 0xffu, \ 186 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \ 187 .spill_cost = 0u, \ 188 .copy_cost = 0u} 189 #define X64_PHYS_INT_RET_ARG(r) \ 190 {.reg = (r), \ 191 .cls = NATIVE_REG_INT, \ 192 .abi_index = 0xffu, \ 193 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 194 NATIVE_REG_RET, \ 195 .spill_cost = 1u, \ 196 .copy_cost = 1u} 197 #define X64_PHYS_INT_CALLER(r) \ 198 {.reg = (r), \ 199 .cls = NATIVE_REG_INT, \ 200 .abi_index = 0xffu, \ 201 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 202 .spill_cost = 1u, \ 203 .copy_cost = 1u} 204 #define X64_PHYS_INT_CALLEE(r) \ 205 {.reg = (r), \ 206 .cls = NATIVE_REG_INT, \ 207 .abi_index = 0xffu, \ 208 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 209 .spill_cost = 4u, \ 210 .copy_cost = 1u} 211 #define X64_PHYS_INT_RESERVED(r) \ 212 {.reg = (r), \ 213 .cls = NATIVE_REG_INT, \ 214 .abi_index = 0xffu, \ 215 .flags = NATIVE_REG_RESERVED, \ 216 .spill_cost = 0u, \ 217 .copy_cost = 0u} 218 219 /* Allocable int pool, opt's spill/reload set. R8/R9 are the driver scratch 220 * pool; R10/R11 are emit scratch (reserved); RAX is reserved (return / div-mul, 221 * asm-pinnable). 
*/ 222 static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15}; 223 static const Reg x64_int_scratch[] = {X64_R8, X64_R9}; 224 225 static const NativePhysRegInfo x64_int_phys[] = { 226 X64_PHYS_INT_RESERVED(X64_RAX), /* return / div-mul (asm-pinnable) */ 227 X64_PHYS_INT_ARG(X64_RCX), 228 X64_PHYS_INT_RET_ARG(X64_RDX), 229 X64_PHYS_INT_RESERVED(X64_RBX), 230 X64_PHYS_INT_RESERVED(X64_RSP), /* stack pointer */ 231 X64_PHYS_INT_RESERVED(X64_RBP), /* frame pointer */ 232 X64_PHYS_INT_ARG(X64_RSI), 233 X64_PHYS_INT_ARG(X64_RDI), 234 X64_PHYS_INT_ARG_RESERVED(X64_R8), /* driver scratch */ 235 X64_PHYS_INT_ARG_RESERVED(X64_R9), /* driver scratch */ 236 X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */ 237 X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */ 238 X64_PHYS_INT_RESERVED(X64_R12), 239 X64_PHYS_INT_CALLEE(X64_R13), 240 X64_PHYS_INT_CALLEE(X64_R14), 241 X64_PHYS_INT_CALLEE(X64_R15), 242 }; 243 244 #define X64_PHYS_FP_ARG_RET(r) \ 245 {.reg = (r), \ 246 .cls = NATIVE_REG_FP, \ 247 .abi_index = 0xffu, \ 248 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 249 NATIVE_REG_RET, \ 250 .spill_cost = 1u, \ 251 .copy_cost = 1u} 252 #define X64_PHYS_FP_ARG(r) \ 253 {.reg = (r), \ 254 .cls = NATIVE_REG_FP, \ 255 .abi_index = 0xffu, \ 256 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \ 257 .spill_cost = 1u, \ 258 .copy_cost = 1u} 259 #define X64_PHYS_FP_ARG_RESERVED(r) \ 260 {.reg = (r), \ 261 .cls = NATIVE_REG_FP, \ 262 .abi_index = 0xffu, \ 263 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \ 264 .spill_cost = 0u, \ 265 .copy_cost = 0u} 266 #define X64_PHYS_FP_CALLER(r) \ 267 {.reg = (r), \ 268 .cls = NATIVE_REG_FP, \ 269 .abi_index = 0xffu, \ 270 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 271 .spill_cost = 1u, \ 272 .copy_cost = 1u} 273 #define X64_PHYS_FP_RESERVED(r) \ 274 {.reg = (r), \ 275 .cls = NATIVE_REG_FP, \ 276 .abi_index = 0xffu, \ 277 .flags = 
/* Per-class register descriptions, exposed through x64_reg_info.classes. Each
 * entry bundles the allocable pool, the driver scratch pool, the per-register
 * table and the ABI bit masks for one class. In every mask, bit N stands for
 * register number N of that class (the masks are built as `1u << X64_...`). */
static const NativeAllocClassInfo x64_classes[] = {
    {.cls = NATIVE_REG_INT,
     .allocable = x64_int_allocable,
     .nallocable = sizeof x64_int_allocable / sizeof x64_int_allocable[0],
     .scratch = x64_int_scratch,
     .nscratch = sizeof x64_int_scratch / sizeof x64_int_scratch[0],
     .phys = x64_int_phys,
     .nphys = sizeof x64_int_phys / sizeof x64_int_phys[0],
     /* caller-saved: rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 (SysV) */
     .caller_saved_mask = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) |
                          (1u << X64_RSI) | (1u << X64_RDI) | (1u << X64_R8) |
                          (1u << X64_R9) | (1u << X64_R10) | (1u << X64_R11),
     /* callee-saved: rbx,r12,r13,r14,r15 (rbp handled by prologue head) */
     .callee_saved_mask = (1u << X64_RBX) | (1u << X64_R12) | (1u << X64_R13) |
                          (1u << X64_R14) | (1u << X64_R15),
     /* SysV arg regs rdi,rsi,rdx,rcx,r8,r9 */
     .arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) |
                 (1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9),
     .ret_mask = (1u << X64_RAX) | (1u << X64_RDX),
     /* rax, rsp, rbp reserved; r8/r9 driver scratch; r10/r11 emit scratch.
      * rbx and r12 are withheld as well, matching their RESERVED entries in
      * x64_int_phys. */
     .reserved_mask = (1u << X64_RAX) | (1u << X64_RSP) | (1u << X64_RBP) |
                      (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10) |
                      (1u << X64_R11) | (1u << X64_RBX) | (1u << X64_R12)},
    {.cls = NATIVE_REG_FP,
     .allocable = x64_fp_allocable,
     .nallocable = sizeof x64_fp_allocable / sizeof x64_fp_allocable[0],
     .scratch = x64_fp_scratch,
     .nscratch = sizeof x64_fp_scratch / sizeof x64_fp_scratch[0],
     .phys = x64_fp_phys,
     .nphys = sizeof x64_fp_phys / sizeof x64_fp_phys[0],
     /* All xmm caller-saved on SysV. */
     .caller_saved_mask = 0xffffu,
     .callee_saved_mask = 0u,
     /* NOTE(review): this mask covers xmm0..xmm7, but x64_fp_phys tags only
      * xmm0..xmm5 with NATIVE_REG_ARG (xmm6/xmm7 use X64_PHYS_FP_CALLER).
      * Confirm which of the two the consumers rely on. */
     .arg_mask = 0xffu, /* xmm0..xmm7 */
     .ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1),
     /* xmm4/xmm5 driver scratch; xmm14/xmm15 emit scratch. xmm12/xmm13 are
      * withheld as well, matching their RESERVED entries in x64_fp_phys. */
     .reserved_mask = (1u << X64_XMM4) | (1u << X64_XMM5) |
                      (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) |
                      (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15)},
};
*/ 345 static int x64_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, 346 NativeAllocClass* cls_out) { 347 char buf[16]; 348 uint32_t idx; 349 (void)ri; 350 if (!name.s || !name.len || name.len >= sizeof buf) return 1; 351 memcpy(buf, name.s, name.len); 352 buf[name.len] = '\0'; 353 if (x64_register_hw_index(buf, &idx) == 0 && idx <= 15u) { 354 *cls_out = NATIVE_REG_INT; 355 *out = (Reg)idx; 356 return 0; 357 } 358 if (x64_register_index(buf, &idx) == 0 && idx >= 17u && idx <= 32u) { 359 *cls_out = NATIVE_REG_FP; 360 *out = (Reg)(idx - 17u); 361 return 0; 362 } 363 return 1; 364 } 365 366 static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, 367 Reg reg) { 368 (void)ri; 369 if (cls == NATIVE_REG_INT) { 370 switch (reg) { 371 /* RAX is reserved but not an emit temp, so it is a legal asm pin (the 372 * Linux syscall number/return register). R8/R9 are driver scratch and 373 * R10/R11 are emit scratch, so those stay excluded. */ 374 case X64_RAX: 375 case X64_RBX: 376 case X64_RCX: 377 case X64_RDX: 378 case X64_RSI: 379 case X64_RDI: 380 case X64_R12: 381 case X64_R13: 382 case X64_R14: 383 case X64_R15: 384 return 1; 385 default: 386 return 0; 387 } 388 } 389 if (cls == NATIVE_REG_FP) 390 return reg <= X64_XMM0 + 13u && reg != X64_XMM4 && reg != X64_XMM5; 391 return 0; 392 } 393 394 static int x64_asm_constraint_reg(const NativeRegInfo* ri, const char* body, 395 NativeAllocClass* cls_out, Reg* fixed_out, 396 u32* allowed_mask_out) { 397 (void)ri; 398 if (!body || !body[0] || body[1]) return 0; 399 if (fixed_out) *fixed_out = REG_NONE; 400 if (allowed_mask_out) *allowed_mask_out = 0; 401 switch (body[0]) { 402 case 'r': 403 case 'q': 404 if (cls_out) *cls_out = NATIVE_REG_INT; 405 return 1; 406 case 'a': 407 if (cls_out) *cls_out = NATIVE_REG_INT; 408 if (fixed_out) *fixed_out = X64_RAX; 409 return 1; 410 case 'b': 411 if (cls_out) *cls_out = NATIVE_REG_INT; 412 if (fixed_out) *fixed_out = X64_RBX; 413 return 1; 414 case 'c': 415 
if (cls_out) *cls_out = NATIVE_REG_INT; 416 if (fixed_out) *fixed_out = X64_RCX; 417 return 1; 418 case 'd': 419 if (cls_out) *cls_out = NATIVE_REG_INT; 420 if (fixed_out) *fixed_out = X64_RDX; 421 return 1; 422 case 'S': 423 if (cls_out) *cls_out = NATIVE_REG_INT; 424 if (fixed_out) *fixed_out = X64_RSI; 425 return 1; 426 case 'D': 427 if (cls_out) *cls_out = NATIVE_REG_INT; 428 if (fixed_out) *fixed_out = X64_RDI; 429 return 1; 430 case 'x': 431 case 'v': 432 if (cls_out) *cls_out = NATIVE_REG_FP; 433 return 1; 434 default: 435 return 0; 436 } 437 } 438 439 static const NativeRegInfo x64_reg_info = { 440 .classes = x64_classes, 441 .nclasses = sizeof x64_classes / sizeof x64_classes[0], 442 .resolve_name = x64_resolve_name, 443 .asm_operand_reg_ok = x64_asm_operand_reg_ok, 444 .asm_constraint_reg = x64_asm_constraint_reg, 445 }; 446 447 /* ============================ legality ============================ */ 448 449 static int x64_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, 450 KitCgTypeId type, i64 imm) { 451 (void)t; 452 (void)type; 453 switch (use) { 454 case NATIVE_IMM_MOVE: 455 return 1; 456 case NATIVE_IMM_BINOP: 457 switch ((BinOp)op) { 458 case BO_IADD: 459 case BO_ISUB: 460 case BO_AND: 461 case BO_OR: 462 case BO_XOR: 463 case BO_IMUL: 464 return imm_fits_i32(imm); 465 case BO_SHL: 466 case BO_SHR_S: 467 case BO_SHR_U: 468 return imm >= 0 && imm <= 63; 469 default: 470 return 0; 471 } 472 case NATIVE_IMM_CMP: 473 return imm_fits_i32(imm); 474 case NATIVE_IMM_ADDR_OFFSET: 475 return imm_fits_i32(imm); 476 } 477 return 0; 478 } 479 480 static int x64_addr_legal(NativeTarget* t, const NativeAddr* addr, 481 MemAccess mem) { 482 (void)t; 483 (void)mem; 484 if (!addr) return 0; 485 if (addr->base_kind != NATIVE_ADDR_BASE_REG && 486 addr->base_kind != NATIVE_ADDR_BASE_FRAME) 487 return 0; 488 /* x64 supports [base + index*scale + disp32]; index must be a register. 
*/ 489 if (addr->index_kind != NATIVE_ADDR_INDEX_NONE && 490 addr->index_kind != NATIVE_ADDR_INDEX_REG) 491 return 0; 492 return imm_fits_i32(addr->offset); 493 } 494 495 /* ============================ globals / addresses ============================ 496 */ 497 498 static int x64_use_got_for_sym(NativeTarget* t, ObjSymId sym) { 499 return obj_symbol_extern_via_got(t->c, t->obj, sym); 500 } 501 502 /* PC-relative reloc kind for a non-GOT &sym reference. Functions use PLT32 so 503 * the linker can route through a PLT; data uses plain PC32. */ 504 static u32 x64_pcrel_reloc_for_sym(NativeTarget* t, ObjSymId sym) { 505 const ObjSym* s = obj_symbol_get(t->obj, sym); 506 if (s && (s->kind == SK_FUNC || s->kind == SK_IFUNC)) return R_X64_PLT32; 507 return R_PC32; 508 } 509 510 /* Materialize &sym + addend into dst_reg. Local/static-link symbols use 511 * `lea rd, [rip + disp32]`; GOT-routed externs use `mov rd, [rip + GOT]` then 512 * add any nonzero addend. */ 513 static void x64_emit_global_lea(NativeTarget* t, u32 dst_reg, ObjSymId sym, 514 i64 addend) { 515 MCEmitter* mc = t->mc; 516 u32 sec = mc->section_id; 517 if (x64_use_got_for_sym(t, sym)) { 518 u8 op; 519 u32 disp_pos; 520 emit_rex(mc, 1, dst_reg, 0, 0); 521 op = X64_OPC_MOV_R_RM; 522 mc->emit_bytes(mc, &op, 1); 523 { 524 u8 mr = modrm(0u, dst_reg & 7u, 5u); /* [rip + disp32] */ 525 mc->emit_bytes(mc, &mr, 1); 526 } 527 disp_pos = mc->pos(mc); 528 emit_u32le(mc, 0); 529 mc->emit_reloc_at(mc, sec, disp_pos, R_X64_REX_GOTPCRELX, sym, -4, 1, 0); 530 if (addend) { 531 i32 a = (i32)addend; 532 emit_rex(mc, 1, 0, 0, dst_reg); 533 if (imm_fits_i8(a)) { 534 u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, dst_reg & 7u), 535 (u8)a}; 536 mc->emit_bytes(mc, buf, 3); 537 } else { 538 u8 buf[2] = {X64_OPC_ALU_IMM32, 539 modrm(3u, X64_ALU_SUB_ADD, dst_reg & 7u)}; 540 mc->emit_bytes(mc, buf, 2); 541 emit_u32le(mc, (u32)a); 542 } 543 } 544 return; 545 } 546 { 547 u8 op = X64_OPC_LEA; 548 u32 disp_pos; 549 
emit_rex(mc, 1, dst_reg, 0, 0); 550 mc->emit_bytes(mc, &op, 1); 551 { 552 u8 mr = modrm(0u, dst_reg & 7u, 5u); /* [rip + disp32] */ 553 mc->emit_bytes(mc, &mr, 1); 554 } 555 disp_pos = mc->pos(mc); 556 emit_u32le(mc, 0); 557 mc->emit_reloc_at(mc, sec, disp_pos, x64_pcrel_reloc_for_sym(t, sym), sym, 558 addend - 4, 1, 0); 559 } 560 } 561 562 /* Resolve a NativeAddr to (base, index, log2_scale, off). Materializes 563 * FRAME/FRAME_VALUE/GLOBAL bases into the supplied scratch register. */ 564 static u32 x64_resolve_addr(X64NativeTarget* a, const NativeAddr* addr, 565 u32 scratch, u32* idx_out, u32* scale_out, 566 i32* off_out) { 567 NativeTarget* t = &a->base; 568 u32 base; 569 i32 off; 570 switch (addr->base_kind) { 571 case NATIVE_ADDR_BASE_REG: 572 base = addr->base.reg & 0xfu; 573 off = addr->offset; 574 break; 575 case NATIVE_ADDR_BASE_FRAME: { 576 X64NativeSlot* s = x64_slot_get(a, addr->base.frame); 577 base = X64_RBP; 578 off = -(i32)s->off + addr->offset; 579 break; 580 } 581 case NATIVE_ADDR_BASE_FRAME_VALUE: { 582 X64NativeSlot* s = x64_slot_get(a, addr->base.frame); 583 emit_mov_load(t->mc, 8, 0, scratch, X64_RBP, -(i32)s->off); 584 base = scratch; 585 off = addr->offset; 586 break; 587 } 588 case NATIVE_ADDR_BASE_GLOBAL: 589 x64_emit_global_lea(t, scratch, addr->base.global.sym, 590 addr->base.global.addend); 591 base = scratch; 592 off = addr->offset; 593 break; 594 default: 595 x64_panic(a, "unsupported address base"); 596 } 597 if (addr->index_kind == NATIVE_ADDR_INDEX_REG) { 598 *idx_out = addr->index.reg & 0xfu; 599 *scale_out = addr->log2_scale; 600 } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) { 601 X64NativeSlot* s = x64_slot_get(a, addr->index.frame); 602 emit_mov_load(t->mc, 8, 0, X64_TMP_INT2, X64_RBP, -(i32)s->off); 603 *idx_out = X64_TMP_INT2; 604 *scale_out = addr->log2_scale; 605 } else { 606 *idx_out = REG_NONE; 607 *scale_out = 0; 608 } 609 *off_out = off; 610 return base; 611 } 612 613 /* ============================ 
memory ============================ */ 614 615 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem. 616 * Materializes the address through X64_TMP_INT2 (r11) for non-reg bases. */ 617 static void x64_emit_mem(X64NativeTarget* a, int is_load, NativeLoc reg, 618 NativeAddr addr, MemAccess mem) { 619 NativeTarget* t = &a->base; 620 MCEmitter* mc = t->mc; 621 u32 r = loc_reg(reg); 622 int fp = native_loc_is_fp(reg); 623 u32 sz = mem.size ? mem.size : native_type_size(t, reg.type); 624 u32 base, idx, scale; 625 i32 off; 626 627 /* Global base: fold into a single rip-relative access when local. */ 628 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL && 629 addr.index_kind == NATIVE_ADDR_INDEX_NONE && 630 !x64_use_got_for_sym(t, addr.base.global.sym)) { 631 ObjSymId sym = addr.base.global.sym; 632 i64 ad = addr.base.global.addend + addr.offset; 633 u32 sec = mc->section_id; 634 u32 disp_pos; 635 if (fp) { 636 u8 prefix = sse_scalar_prefix(sz); 637 mc->emit_bytes(mc, &prefix, 1); 638 emit_rex(mc, 0, r, 0, 0); 639 { 640 u8 op2[2] = {X64_OPC_TWOBYTE, (u8)(is_load ? 0x10u : 0x11u)}; 641 mc->emit_bytes(mc, op2, 2); 642 } 643 } else if (sz == 8 || sz == 4) { 644 emit_rex(mc, sz == 8, r, 0, 0); 645 { 646 u8 op = is_load ? 
X64_OPC_MOV_R_RM : X64_OPC_MOV_RM_R; 647 mc->emit_bytes(mc, &op, 1); 648 } 649 } else if (sz == 2) { 650 if (is_load) { 651 emit_rex(mc, 0, r, 0, 0); 652 { 653 u8 op2[2] = {X64_OPC_TWOBYTE, X64_OPC_MOVZX_W}; 654 mc->emit_bytes(mc, op2, 2); 655 } 656 } else { 657 u8 p = X64_OPSIZE_PFX; 658 mc->emit_bytes(mc, &p, 1); 659 emit_rex(mc, 0, r, 0, 0); 660 { 661 u8 op = X64_OPC_MOV_RM_R; 662 mc->emit_bytes(mc, &op, 1); 663 } 664 } 665 } else { /* size 1 */ 666 if (is_load) { 667 emit_rex(mc, 0, r, 0, 0); 668 { 669 u8 op2[2] = {X64_OPC_TWOBYTE, X64_OPC_MOVZX_B}; 670 mc->emit_bytes(mc, op2, 2); 671 } 672 } else { 673 emit_rex_force(mc, 0, r, 0, 0); 674 { 675 u8 op = X64_OPC_MOV_RM_R8; 676 mc->emit_bytes(mc, &op, 1); 677 } 678 } 679 } 680 { 681 u8 mr = modrm(0u, r & 7u, 5u); 682 mc->emit_bytes(mc, &mr, 1); 683 } 684 disp_pos = mc->pos(mc); 685 emit_u32le(mc, 0); 686 mc->emit_reloc_at(mc, sec, disp_pos, x64_pcrel_reloc_for_sym(t, sym), sym, 687 ad - 4, 1, 0); 688 return; 689 } 690 691 base = x64_resolve_addr(a, &addr, X64_TMP_INT2, &idx, &scale, &off); 692 if (fp) { 693 u8 prefix = sse_scalar_prefix(sz); 694 if (is_load) 695 emit_sse_load_idx(mc, prefix, 0x10, r, base, idx, scale, off); 696 else 697 emit_sse_store_idx(mc, prefix, 0x11, r, base, idx, scale, off); 698 } else if (is_load) { 699 /* Loads narrower than 4 bytes zero-extend (sign-extension is applied by a 700 * later CV_SEXT). 
*/ 701 emit_mov_load_idx(mc, sz, 0, r, base, idx, scale, off); 702 } else { 703 emit_mov_store_idx(mc, sz, r, base, idx, scale, off); 704 } 705 } 706 707 /* ============================ moves / data ============================ */ 708 709 static void x64_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { 710 MCEmitter* mc = t->mc; 711 int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src); 712 u32 rd = loc_reg(dst), rs = loc_reg(src); 713 if (dfp && sfp) { 714 if (rd == rs) return; 715 emit_sse_rr(mc, sse_scalar_prefix(native_type_size(t, dst.type)), 0x10, rd, 716 rs); 717 return; 718 } 719 if (dfp && !sfp) { /* movd/movq gpr -> xmm: 66 0F 6E /r */ 720 int w = native_type_size(t, dst.type) == 8u; 721 emit_sse_rr_w(mc, 0x66, 0x6E, w, rd, rs); 722 return; 723 } 724 if (!dfp && sfp) { /* movd/movq xmm -> gpr: 66 0F 7E /r (xmm is reg field) */ 725 int w = native_type_size(t, src.type) == 8u; 726 emit_sse_rr_w(mc, 0x66, 0x7E, w, rs, rd); 727 return; 728 } 729 if (rd == rs) return; 730 emit_mov_rr(mc, x64_is_64(t, dst.type) ? 1 : 0, rd, rs); 731 } 732 733 static void x64_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) { 734 x64_emit_load_imm(t->mc, x64_is_64(t, dst.type) ? 1 : 0, loc_reg(dst), imm); 735 } 736 737 /* FP constant: materialize the bit pattern in a GPR scratch, then movd/movq 738 * into the FPR. Integer constant: plain load_imm. 
*/ 739 static void x64_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) { 740 u64 v = 0; 741 u32 i; 742 for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u); 743 if (!native_loc_is_fp(dst)) { 744 x64_load_imm(t, dst, (i64)v); 745 return; 746 } 747 x64_emit_load_imm(t->mc, cb.size == 8u, X64_TMP_INT, (i64)v); 748 emit_sse_rr_w(t->mc, 0x66, 0x6E, cb.size == 8u, loc_reg(dst), X64_TMP_INT); 749 } 750 751 static void x64_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { 752 X64NativeTarget* a = x64_of(t); 753 MCEmitter* mc = t->mc; 754 u32 rd = loc_reg(dst); 755 u32 base, idx, scale; 756 i32 off; 757 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL && 758 addr.index_kind == NATIVE_ADDR_INDEX_NONE) { 759 x64_emit_global_lea(t, rd, addr.base.global.sym, 760 addr.base.global.addend + addr.offset); 761 return; 762 } 763 base = x64_resolve_addr(a, &addr, rd, &idx, &scale, &off); 764 if (idx == REG_NONE) { 765 if (base == rd && off == 0) return; /* already &slot in rd */ 766 emit_lea(mc, rd, base, off); 767 return; 768 } 769 /* lea rd, [base + idx*scale + off] */ 770 { 771 u8 buf[16]; 772 u32 n = 0; 773 n += x64_pack_rex(buf + n, 1, rd, idx, base); 774 buf[n++] = X64_OPC_LEA; 775 n += x64_pack_mem_sib(buf + n, rd, base, idx, scale, off); 776 mc->emit_bytes(mc, buf, n); 777 } 778 } 779 780 static void x64_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 781 MemAccess mem) { 782 x64_emit_mem(x64_of(t), 1, dst, addr, mem); 783 } 784 static void x64_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 785 MemAccess mem) { 786 x64_emit_mem(x64_of(t), 0, src, addr, mem); 787 } 788 789 /* Resolve an addressable NativeAddr to a bare base register (no index, off 0) 790 * by emitting an lea into `scratch` when needed. 
*/ 791 static u32 x64_addr_to_base_reg(X64NativeTarget* a, NativeAddr addr, 792 u32 scratch) { 793 MCEmitter* mc = a->base.mc; 794 u32 base, idx, scale; 795 i32 off; 796 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL && 797 addr.index_kind == NATIVE_ADDR_INDEX_NONE) { 798 x64_emit_global_lea(&a->base, scratch, addr.base.global.sym, 799 addr.base.global.addend + addr.offset); 800 return scratch; 801 } 802 base = x64_resolve_addr(a, &addr, scratch, &idx, &scale, &off); 803 if (idx == REG_NONE && off == 0) return base; 804 if (idx == REG_NONE) { 805 emit_lea(mc, scratch, base, off); 806 return scratch; 807 } 808 { 809 u8 buf[16]; 810 u32 n = 0; 811 n += x64_pack_rex(buf + n, 1, scratch, idx, base); 812 buf[n++] = X64_OPC_LEA; 813 n += x64_pack_mem_sib(buf + n, scratch, base, idx, scale, off); 814 mc->emit_bytes(mc, buf, n); 815 } 816 return scratch; 817 } 818 819 /* copy_bytes: resolve dst into r11 and src into rax (both bare pointers), then 820 * unrolled granule copy through rdx. dst is resolved first (its base may live 821 * in r11 from a FRAME_VALUE load) and src second so the two never alias. */ 822 static void x64_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, 823 AggregateAccess access) { 824 X64NativeTarget* a = x64_of(t); 825 /* Copy chunk by chunk (8/4/2/1) through the value scratch rax, letting 826 * x64_emit_mem resolve each address with its own scratch (r11). Uses only the 827 * reserved emit scratch (rax/r11) — no ad-hoc allocable temp (previously 828 * rdx), which the optimizer may have live across the copy. 
*/ 829 KitCgTypeId tys[4]; 830 u32 n = access.size, i = 0; 831 tys[0] = builtin_id(KIT_CG_BUILTIN_I64); 832 tys[1] = builtin_id(KIT_CG_BUILTIN_I32); 833 tys[2] = builtin_id(KIT_CG_BUILTIN_I16); 834 tys[3] = builtin_id(KIT_CG_BUILTIN_I8); 835 while (i < n) { 836 u32 rem = n - i, s; 837 KitCgTypeId ty; 838 NativeAddr sa = src, da = dst; 839 NativeLoc val; 840 MemAccess mem; 841 if (rem >= 8u) { 842 s = 8u; 843 ty = tys[0]; 844 } else if (rem >= 4u) { 845 s = 4u; 846 ty = tys[1]; 847 } else if (rem >= 2u) { 848 s = 2u; 849 ty = tys[2]; 850 } else { 851 s = 1u; 852 ty = tys[3]; 853 } 854 sa.offset += (i32)i; 855 sa.base_type = ty; 856 da.offset += (i32)i; 857 da.base_type = ty; 858 val = native_loc_reg(ty, NATIVE_REG_INT, X64_TMP_INT); 859 memset(&mem, 0, sizeof mem); 860 mem.type = ty; 861 mem.size = s; 862 mem.align = s; 863 x64_emit_mem(a, 1, val, sa, mem); /* rax = [src + i] */ 864 x64_emit_mem(a, 0, val, da, mem); /* [dst + i] = rax */ 865 i += s; 866 } 867 } 868 869 static void x64_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, 870 AggregateAccess access) { 871 X64NativeTarget* a = x64_of(t); 872 MCEmitter* mc = t->mc; 873 u32 dr = x64_addr_to_base_reg(a, dst, X64_TMP_INT2); 874 u32 n = access.size, i = 0; 875 /* Broadcast the byte across 8 bytes into rax. */ 876 if (byte_value.kind == NATIVE_LOC_IMM) { 877 u8 b = (u8)(byte_value.v.imm & 0xffu); 878 u64 b64 = b; 879 b64 |= b64 << 8; 880 b64 |= b64 << 16; 881 b64 |= b64 << 32; 882 x64_emit_load_imm(mc, 1, X64_RAX, (i64)b64); 883 } else { 884 /* Replicate the low byte of a register via multiply by 0x0101..01. 
*/ 885 x64_emit_load_imm(mc, 1, X64_R11, (i64)0x0101010101010101ll); 886 emit_mov_rr(mc, 1, X64_RAX, loc_reg(byte_value)); 887 emit_imul_rr(mc, 1, X64_RAX, X64_R11); 888 } 889 while (i + 8u <= n) { 890 emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); 891 i += 8u; 892 } 893 while (i + 4u <= n) { 894 emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); 895 i += 4u; 896 } 897 while (i + 2u <= n) { 898 emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); 899 i += 2u; 900 } 901 while (i < n) { 902 emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); 903 i += 1u; 904 } 905 } 906 907 /* ============================ bitfields ============================ */ 908 909 static void x64_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra, 910 BitFieldAccess bf) { 911 X64NativeTarget* a = x64_of(t); 912 MCEmitter* mc = t->mc; 913 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 914 int w = storage_bytes == 8u ? 1 : 0; 915 u32 reg_size = w ? 64u : 32u; 916 u32 lsb = bf.bit_offset; 917 u32 width = bf.bit_width ? bf.bit_width : 1u; 918 u32 rd = loc_reg(dst); 919 u32 base; 920 ra.offset += (i32)bf.storage_offset; 921 base = x64_addr_to_base_reg(a, ra, X64_TMP_INT2); 922 emit_mov_load(mc, storage_bytes, 0, rd, base, 0); 923 { 924 u8 left = (u8)(reg_size - lsb - width); 925 u8 right = (u8)(reg_size - width); 926 if (left) emit_shift_imm(mc, w, X64_SHIFT_SUB_SHL, rd, left); 927 if (right) 928 emit_shift_imm(mc, w, bf.signed_ ? X64_SHIFT_SUB_SAR : X64_SHIFT_SUB_SHR, 929 rd, right); 930 } 931 } 932 933 static void x64_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src, 934 BitFieldAccess bf) { 935 X64NativeTarget* a = x64_of(t); 936 MCEmitter* mc = t->mc; 937 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 938 int w = storage_bytes == 8u ? 1 : 0; 939 u32 lsb = bf.bit_offset; 940 u32 width = bf.bit_width ? bf.bit_width : 1u; 941 u64 ones = width >= 64u ? 
~(u64)0 : (((u64)1 << width) - 1u); 942 u64 mask = ones << lsb; 943 u32 src_reg = loc_reg(src); 944 u32 base; 945 ra.offset += (i32)bf.storage_offset; 946 /* Stabilize the base into r11 before consuming rax/rcx/rdx scratch. */ 947 base = x64_addr_to_base_reg(a, ra, X64_TMP_INT2); 948 /* rax = storage; rax &= ~mask. */ 949 emit_mov_load(mc, storage_bytes, 0, X64_RAX, base, 0); 950 x64_emit_load_imm(mc, w, X64_RCX, (i64)~mask); 951 emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RAX, X64_RCX); 952 /* rcx = (src & ones) << lsb. */ 953 emit_mov_rr(mc, w, X64_RCX, src_reg); 954 x64_emit_load_imm(mc, w, X64_RDX, (i64)ones); 955 emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX); 956 if (lsb) emit_shift_imm(mc, w, X64_SHIFT_SUB_SHL, X64_RCX, (u8)lsb); 957 emit_alu_rr(mc, w, X64_OPC_ALU_OR, X64_RAX, X64_RCX); 958 emit_mov_store(mc, storage_bytes, X64_RAX, base, 0); 959 } 960 961 /* ============================ arithmetic ============================ */ 962 963 static void x64_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop, 964 NativeLoc bop) { 965 X64NativeTarget* a = x64_of(t); 966 MCEmitter* mc = t->mc; 967 u32 rd = loc_reg(dst); 968 969 /* FP binops: two-address. dst = aop op bop. */ 970 if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { 971 u32 ra = loc_reg(aop), rb = loc_reg(bop); 972 u8 prefix = sse_scalar_prefix(native_type_size(t, dst.type)); 973 u8 opcode; 974 switch (op) { 975 case BO_FADD: 976 opcode = 0x58; 977 break; 978 case BO_FSUB: 979 opcode = 0x5C; 980 break; 981 case BO_FMUL: 982 opcode = 0x59; 983 break; 984 default: 985 opcode = 0x5E; 986 break; /* BO_FDIV */ 987 } 988 if (rd == rb && rd != ra) { 989 if (op == BO_FADD || op == BO_FMUL) { /* commutative */ 990 emit_sse_rr(mc, prefix, opcode, rd, ra); 991 return; 992 } 993 /* non-commutative dst==rb: stage rb in fp scratch. 
*/ 994 emit_sse_rr(mc, prefix, 0x10, X64_TMP_FP2, rb); 995 emit_sse_rr(mc, prefix, 0x10, rd, ra); 996 emit_sse_rr(mc, prefix, opcode, rd, X64_TMP_FP2); 997 return; 998 } 999 if (rd != ra) emit_sse_rr(mc, prefix, 0x10, rd, ra); 1000 emit_sse_rr(mc, prefix, opcode, rd, rb); 1001 return; 1002 } 1003 1004 { 1005 int w = x64_is_64(t, dst.type) ? 1 : 0; 1006 int b_imm = bop.kind == NATIVE_LOC_IMM; 1007 i64 imm = b_imm ? bop.v.imm : 0; 1008 u32 ra = loc_reg(aop); 1009 1010 /* Division: rax/rdx implicit; divisor must avoid rax/rdx. */ 1011 if (op == BO_SDIV || op == BO_UDIV || op == BO_SREM || op == BO_UREM) { 1012 u32 rb; 1013 if (ra != X64_RAX) emit_mov_rr(mc, w, X64_RAX, ra); 1014 if (b_imm) { 1015 x64_emit_load_imm(mc, w, X64_R11, imm); 1016 rb = X64_R11; 1017 } else { 1018 rb = loc_reg(bop); 1019 if (rb == X64_RAX || rb == X64_RDX) { 1020 emit_mov_rr(mc, w, X64_R11, rb); 1021 rb = X64_R11; 1022 } 1023 } 1024 if (op == BO_SDIV || op == BO_SREM) { 1025 emit_cqo_or_cdq(mc, w); 1026 emit_f7_rm(mc, w, X64_F7_SUB_IDIV, rb); 1027 } else { 1028 emit_xor_self(mc, w, X64_RDX); 1029 emit_f7_rm(mc, w, X64_F7_SUB_DIV, rb); 1030 } 1031 { 1032 u32 result = (op == BO_SREM || op == BO_UREM) ? X64_RDX : X64_RAX; 1033 if (rd != result) emit_mov_rr(mc, w, rd, result); 1034 } 1035 return; 1036 } 1037 1038 /* Shifts: count in CL or imm8. */ 1039 if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) { 1040 u32 sub = (op == BO_SHL) ? X64_SHIFT_SUB_SHL 1041 : (op == BO_SHR_U) ? X64_SHIFT_SUB_SHR 1042 : X64_SHIFT_SUB_SAR; 1043 if (b_imm) { 1044 u32 wbits = w ? 64u : 32u; 1045 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 1046 emit_shift_imm(mc, w, sub, rd, (u8)((u64)imm & (wbits - 1u))); 1047 return; 1048 } 1049 { 1050 u32 rb = loc_reg(bop); 1051 /* Place the count in cl and the value in dst. Stage the count through 1052 * r11 first so neither move clobbers the other when the value already 1053 * sits in rcx or the count sits in dst. 
(The optimizer additionally 1054 * keeps values live across the shift out of rcx — see 1055 * x64_machine_op_clobbers.) */ 1056 if (rb != X64_RCX) { 1057 emit_mov_rr(mc, 0, X64_TMP_INT2, rb); 1058 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 1059 emit_mov_rr(mc, 0, X64_RCX, X64_TMP_INT2); 1060 } else if (rd != ra) { 1061 emit_mov_rr(mc, w, rd, ra); 1062 } 1063 } 1064 emit_shift_cl(mc, w, sub, rd); 1065 return; 1066 } 1067 1068 /* IMM-form fast paths (b_imm guaranteed legal by imm_legal: imm32). */ 1069 if (b_imm && (op == BO_IADD || op == BO_ISUB || op == BO_AND || 1070 op == BO_OR || op == BO_XOR || op == BO_IMUL)) { 1071 if (op == BO_IMUL) { 1072 if (imm_fits_i8(imm)) { 1073 emit_imul_imm8(mc, w, rd, ra, (i8)imm); 1074 return; 1075 } 1076 emit_imul_imm32(mc, w, rd, ra, (i32)imm); 1077 return; 1078 } 1079 { 1080 u32 sub; 1081 switch (op) { 1082 case BO_IADD: 1083 sub = X64_ALU_SUB_ADD; 1084 break; 1085 case BO_OR: 1086 sub = X64_ALU_SUB_OR; 1087 break; 1088 case BO_AND: 1089 sub = X64_ALU_SUB_AND; 1090 break; 1091 case BO_ISUB: 1092 sub = X64_ALU_SUB_SUB; 1093 break; 1094 default: 1095 sub = X64_ALU_SUB_XOR; 1096 break; /* BO_XOR */ 1097 } 1098 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 1099 if (imm_fits_i8(imm)) 1100 emit_alu_imm8(mc, w, sub, rd, (i8)imm); 1101 else 1102 emit_alu_imm32(mc, w, sub, rd, (i32)imm); 1103 return; 1104 } 1105 } 1106 1107 /* Generic 2-operand ALU: dst = ra op rb. Preserve rb if dst == rb. 
*/ 1108 { 1109 u32 rb = loc_reg(bop); 1110 if (rd == rb && rd != ra) { 1111 switch (op) { 1112 case BO_IADD: 1113 emit_alu_rr(mc, w, X64_OPC_ALU_ADD, rd, ra); 1114 return; 1115 case BO_AND: 1116 emit_alu_rr(mc, w, X64_OPC_ALU_AND, rd, ra); 1117 return; 1118 case BO_OR: 1119 emit_alu_rr(mc, w, X64_OPC_ALU_OR, rd, ra); 1120 return; 1121 case BO_XOR: 1122 emit_alu_rr(mc, w, X64_OPC_ALU_XOR, rd, ra); 1123 return; 1124 case BO_IMUL: 1125 emit_imul_rr(mc, w, rd, ra); 1126 return; 1127 default: 1128 break; /* ISUB falls through: stage rb */ 1129 } 1130 emit_mov_rr(mc, w, X64_R11, rb); 1131 rb = X64_R11; 1132 } 1133 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 1134 switch (op) { 1135 case BO_IADD: 1136 emit_alu_rr(mc, w, X64_OPC_ALU_ADD, rd, rb); 1137 break; 1138 case BO_ISUB: 1139 emit_alu_rr(mc, w, X64_OPC_ALU_SUB, rd, rb); 1140 break; 1141 case BO_AND: 1142 emit_alu_rr(mc, w, X64_OPC_ALU_AND, rd, rb); 1143 break; 1144 case BO_OR: 1145 emit_alu_rr(mc, w, X64_OPC_ALU_OR, rd, rb); 1146 break; 1147 case BO_XOR: 1148 emit_alu_rr(mc, w, X64_OPC_ALU_XOR, rd, rb); 1149 break; 1150 case BO_IMUL: 1151 emit_imul_rr(mc, w, rd, rb); 1152 break; 1153 default: 1154 x64_panic(a, "unsupported binop"); 1155 } 1156 } 1157 } 1158 } 1159 1160 /* FP sign-mask constant materialized in fp scratch for FNEG. */ 1161 static void x64_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { 1162 X64NativeTarget* a = x64_of(t); 1163 MCEmitter* mc = t->mc; 1164 u32 rd = loc_reg(dst), rs = loc_reg(src); 1165 if (op == UO_FNEG) { 1166 int dbl = native_type_size(t, dst.type) == 8u; 1167 if (rd != rs) 1168 emit_sse_rr(mc, sse_scalar_prefix(dbl ? 8u : 4u), 0x10, rd, rs); 1169 /* sign mask into fp scratch via gpr, then XORPS/XORPD. */ 1170 x64_emit_load_imm(mc, dbl, X64_TMP_INT, 1171 dbl ? (i64)0x8000000000000000ull : (i64)0x80000000ull); 1172 emit_sse_rr_w(mc, 0x66, 0x6E, dbl, X64_TMP_FP2, X64_TMP_INT); 1173 emit_sse_rr(mc, dbl ? 
0x66 : 0, 0x57, rd, X64_TMP_FP2); 1174 return; 1175 } 1176 { 1177 int w = x64_is_64(t, dst.type) ? 1 : 0; 1178 switch (op) { 1179 case UO_NEG: 1180 if (rd != rs) emit_mov_rr(mc, w, rd, rs); 1181 emit_f7_rm(mc, w, X64_F7_SUB_NEG, rd); 1182 return; 1183 case UO_BNOT: 1184 if (rd != rs) emit_mov_rr(mc, w, rd, rs); 1185 emit_f7_rm(mc, w, X64_F7_SUB_NOT, rd); 1186 return; 1187 case UO_NOT: 1188 /* !x -> (x == 0) as 0/1. */ 1189 emit_test_self(mc, w, rs); 1190 emit_setcc(mc, X64_CC_E, rd); 1191 emit_movzx_r32_r8(mc, rd, rd); 1192 return; 1193 default: 1194 x64_panic(a, "unsupported unop"); 1195 } 1196 } 1197 } 1198 1199 /* ============================ compares ============================ */ 1200 1201 static u32 cmp_to_cc(CmpOp op) { 1202 switch (op) { 1203 case CMP_EQ: 1204 return X64_CC_E; 1205 case CMP_NE: 1206 return X64_CC_NE; 1207 case CMP_LT_U: 1208 return X64_CC_B; 1209 case CMP_LE_U: 1210 return X64_CC_BE; 1211 case CMP_GT_U: 1212 return X64_CC_A; 1213 case CMP_GE_U: 1214 return X64_CC_AE; 1215 case CMP_LT_S: 1216 return X64_CC_L; 1217 case CMP_LE_S: 1218 return X64_CC_LE; 1219 case CMP_GT_S: 1220 return X64_CC_G; 1221 case CMP_GE_S: 1222 return X64_CC_GE; 1223 default: 1224 return X64_CC_E; 1225 } 1226 } 1227 1228 static int cmp_is_fp(CmpOp op, NativeLoc aop) { 1229 /* FP-ness is self-describing from the opcode; FP eq/ne are distinct opcodes 1230 * (CMP_OEQ_F/CMP_UNE_F), so no operand-class sniffing is needed. */ 1231 (void)aop; 1232 return op >= CMP_OEQ_F; 1233 } 1234 1235 /* Emit `cmp ra, rb` (or ucomis[sd] for FP), setting flags from ra - rb. */ 1236 static void x64_emit_cmp_flags(NativeTarget* t, NativeLoc aop, NativeLoc bop, 1237 int fp) { 1238 X64NativeTarget* a = x64_of(t); 1239 MCEmitter* mc = t->mc; 1240 if (fp) { 1241 u8 prefix = native_type_size(t, aop.type) == 8u ? 0x66u : 0u; 1242 emit_sse_rr(mc, prefix, 0x2E, loc_reg(aop), loc_reg(bop)); /* ucomis */ 1243 return; 1244 } 1245 { 1246 int w = x64_is_64(t, aop.type) ? 
1 : 0; 1247 u32 ra = loc_reg(aop); 1248 if (bop.kind == NATIVE_LOC_IMM) { 1249 i64 imm = bop.v.imm; 1250 if (imm_fits_i8(imm)) 1251 emit_alu_imm8(mc, w, X64_ALU_SUB_CMP, ra, (i8)imm); 1252 else 1253 emit_alu_imm32(mc, w, X64_ALU_SUB_CMP, ra, (i32)imm); 1254 return; 1255 } 1256 emit_alu_rr(mc, w, X64_OPC_ALU_CMP, ra, loc_reg(bop)); 1257 (void)a; 1258 } 1259 } 1260 1261 /* FP ordered setcc: result = (primary cc) && !unordered (NP). */ 1262 static void x64_fp_setcc_ordered(NativeTarget* t, u32 primary, u32 dst) { 1263 MCEmitter* mc = t->mc; 1264 emit_setcc(mc, primary, dst); 1265 emit_movzx_r32_r8(mc, dst, dst); 1266 emit_setcc(mc, X64_CC_NP, X64_R11); 1267 emit_movzx_r32_r8(mc, X64_R11, X64_R11); 1268 emit_alu_rr(mc, 0, X64_OPC_ALU_AND, dst, X64_R11); 1269 } 1270 1271 /* FP unordered predicate: result = (primary cc) || unordered (P). */ 1272 static void x64_fp_setcc_unord(NativeTarget* t, u32 primary, u32 dst) { 1273 MCEmitter* mc = t->mc; 1274 emit_setcc(mc, primary, dst); 1275 emit_movzx_r32_r8(mc, dst, dst); 1276 emit_setcc(mc, X64_CC_P, X64_R11); 1277 emit_movzx_r32_r8(mc, X64_R11, X64_R11); 1278 emit_alu_rr(mc, 0, X64_OPC_ALU_OR, dst, X64_R11); 1279 } 1280 1281 static void x64_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop, 1282 NativeLoc bop) { 1283 MCEmitter* mc = t->mc; 1284 u32 d = loc_reg(dst); 1285 int fp = cmp_is_fp(op, aop); 1286 x64_emit_cmp_flags(t, aop, bop, fp); 1287 if (fp) { 1288 /* ucomis sets ZF/CF and, when unordered (NaN), also PF. Each predicate's 1289 * flag formula is built explicitly (NOT blindly as !(opposite)): 1290 * ordered: E/B/BE alias {==,<,<=} only when also NP (not-parity); 1291 * NE/A/AE already exclude unordered, so they stand alone. 1292 * unordered: E/B/BE already include the unordered case (ZF/CF set on 1293 * NaN), so they stand alone; NE/A/AE need an OR with P. 
*/ 1294 switch (op) { 1295 /* ordered: require not-unordered (NP) on the equality-flag cases */ 1296 case CMP_OEQ_F: 1297 x64_fp_setcc_ordered(t, X64_CC_E, d); 1298 return; 1299 case CMP_OLT_F: 1300 x64_fp_setcc_ordered(t, X64_CC_B, d); 1301 return; 1302 case CMP_OLE_F: 1303 x64_fp_setcc_ordered(t, X64_CC_BE, d); 1304 return; 1305 case CMP_ONE_F: 1306 emit_setcc(mc, X64_CC_NE, d); 1307 break; 1308 case CMP_OGT_F: 1309 emit_setcc(mc, X64_CC_A, d); 1310 break; 1311 case CMP_OGE_F: 1312 emit_setcc(mc, X64_CC_AE, d); 1313 break; 1314 /* unordered: OR-with-P on the cases that exclude unordered */ 1315 case CMP_UEQ_F: 1316 emit_setcc(mc, X64_CC_E, d); 1317 break; 1318 case CMP_ULT_F: 1319 emit_setcc(mc, X64_CC_B, d); 1320 break; 1321 case CMP_ULE_F: 1322 emit_setcc(mc, X64_CC_BE, d); 1323 break; 1324 case CMP_UNE_F: 1325 x64_fp_setcc_unord(t, X64_CC_NE, d); 1326 return; 1327 case CMP_UGT_F: 1328 x64_fp_setcc_unord(t, X64_CC_A, d); 1329 return; 1330 case CMP_UGE_F: 1331 x64_fp_setcc_unord(t, X64_CC_AE, d); 1332 return; 1333 default: 1334 emit_setcc(mc, cmp_to_cc(op), d); 1335 break; 1336 } 1337 emit_movzx_r32_r8(mc, d, d); 1338 return; 1339 } 1340 emit_setcc(mc, cmp_to_cc(op), d); 1341 emit_movzx_r32_r8(mc, d, d); 1342 } 1343 1344 /* ============================ converts ============================ */ 1345 1346 static void x64_convert(NativeTarget* t, ConvKind k, NativeLoc dst, 1347 NativeLoc src) { 1348 X64NativeTarget* a = x64_of(t); 1349 MCEmitter* mc = t->mc; 1350 u32 rd = loc_reg(dst), rs = loc_reg(src); 1351 switch (k) { 1352 case CV_SEXT: { 1353 u32 src_sz = native_type_size(t, src.type); 1354 int w = x64_is_64(t, dst.type) ? 1 : 0; 1355 emit_extend_rr(mc, w, 1, src_sz, rd, rs); 1356 return; 1357 } 1358 case CV_ZEXT: { 1359 u32 src_sz = native_type_size(t, src.type); 1360 int w = x64_is_64(t, dst.type) ? 
1 : 0; 1361 emit_extend_rr(mc, w, 0, src_sz, rd, rs); 1362 return; 1363 } 1364 case CV_TRUNC: 1365 emit_mov_rr(mc, 0, rd, rs); /* low 32 bits; clears high */ 1366 return; 1367 case CV_ITOF_S: 1368 case CV_ITOF_U: { 1369 int w_src = x64_is_64(t, src.type) ? 1 : 0; 1370 u8 prefix = sse_scalar_prefix(native_type_size(t, dst.type)); 1371 if (k == CV_ITOF_U && w_src == 1) { 1372 MCLabel L_high = mc->label_new(mc); 1373 MCLabel L_done = mc->label_new(mc); 1374 emit_test_self(mc, 1, rs); 1375 emit_jcc_rel32(mc, X64_CC_S, L_high); 1376 emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, rs); 1377 emit_jmp_rel32(mc, L_done); 1378 mc->label_place(mc, L_high); 1379 emit_mov_rr(mc, 1, X64_R11, rs); 1380 emit_mov_rr(mc, 1, X64_RAX, rs); 1381 emit_alu_imm8(mc, 1, X64_ALU_SUB_AND, X64_RAX, 1); 1382 emit_shift_imm(mc, 1, X64_SHIFT_SUB_SHR, X64_R11, 1); 1383 emit_alu_rr(mc, 1, X64_OPC_ALU_OR, X64_R11, X64_RAX); 1384 emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, X64_R11); 1385 emit_sse_rr(mc, prefix, 0x58, rd, rd); 1386 mc->label_place(mc, L_done); 1387 return; 1388 } 1389 if (k == CV_ITOF_U) { 1390 emit_extend_rr(mc, 0, 0, 4, X64_R11, rs); /* zext u32 -> 64 */ 1391 rs = X64_R11; 1392 w_src = 1; 1393 } 1394 emit_sse_rr_w(mc, prefix, 0x2A, w_src, rd, rs); 1395 return; 1396 } 1397 case CV_FTOI_S: 1398 case CV_FTOI_U: { 1399 int w_dst = x64_is_64(t, dst.type) ? 1 : 0; 1400 u8 prefix = sse_scalar_prefix(native_type_size(t, src.type)); 1401 /* Unsigned 64-bit FTOI needs the 2^63 bias dance; otherwise cvtt 1402 * (with the destination widened to 64 for u32) is exact. */ 1403 if (k == CV_FTOI_U && w_dst == 1) { 1404 int dbl = native_type_size(t, src.type) == 8u; 1405 MCLabel L_small = mc->label_new(mc); 1406 MCLabel L_done = mc->label_new(mc); 1407 /* limit = 2^63 in fp scratch. */ 1408 x64_emit_load_imm( 1409 mc, 1, X64_R11, 1410 dbl ? (i64)0x43E0000000000000ull : (i64)0x5F000000ull); 1411 emit_sse_rr_w(mc, 0x66, 0x6E, dbl, X64_TMP_FP2, X64_R11); 1412 emit_sse_rr(mc, dbl ? 
0x66 : 0, 0x2E, rs, X64_TMP_FP2); /* ucomis */ 1413 emit_jcc_rel32(mc, X64_CC_B, L_small); 1414 emit_sse_rr(mc, prefix, 0x10, X64_TMP_FP, rs); 1415 emit_sse_rr(mc, prefix, 0x5C, X64_TMP_FP, X64_TMP_FP2); /* sub bias */ 1416 emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, X64_TMP_FP); 1417 x64_emit_load_imm(mc, 1, X64_R11, (i64)0x8000000000000000ull); 1418 emit_alu_rr(mc, 1, X64_OPC_ALU_XOR, rd, X64_R11); 1419 emit_jmp_rel32(mc, L_done); 1420 mc->label_place(mc, L_small); 1421 emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, rs); 1422 mc->label_place(mc, L_done); 1423 return; 1424 } 1425 if (k == CV_FTOI_U) w_dst = 1; /* widen u32 result */ 1426 emit_sse_rr_w(mc, prefix, 0x2C, w_dst, rd, rs); 1427 return; 1428 } 1429 case CV_FEXT: 1430 emit_sse_rr(mc, 0xF3, 0x5A, rd, rs); /* cvtss2sd */ 1431 return; 1432 case CV_FTRUNC: 1433 emit_sse_rr(mc, 0xF2, 0x5A, rd, rs); /* cvtsd2ss */ 1434 return; 1435 case CV_BITCAST: 1436 if (!native_loc_is_fp(src) && native_loc_is_fp(dst)) { 1437 emit_sse_rr_w(mc, 0x66, 0x6E, x64_is_64(t, dst.type), rd, rs); 1438 } else if (native_loc_is_fp(src) && !native_loc_is_fp(dst)) { 1439 emit_sse_rr_w(mc, 0x66, 0x7E, x64_is_64(t, src.type), rs, rd); 1440 } else { 1441 x64_move(t, dst, src); 1442 } 1443 return; 1444 default: 1445 x64_panic(a, "unsupported convert"); 1446 } 1447 } 1448 1449 /* ============================ spill / reload ============================ */ 1450 1451 static void x64_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot, 1452 MemAccess mem) { 1453 NativeAddr addr; 1454 memset(&addr, 0, sizeof addr); 1455 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1456 addr.base.frame = slot; 1457 addr.base_type = src.type; 1458 x64_emit_mem(x64_of(t), 0, src, addr, mem); 1459 } 1460 static void x64_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot, 1461 MemAccess mem) { 1462 NativeAddr addr; 1463 memset(&addr, 0, sizeof addr); 1464 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1465 addr.base.frame = slot; 1466 addr.base_type = dst.type; 1467 
x64_emit_mem(x64_of(t), 1, dst, addr, mem); 1468 } 1469 1470 /* ============================ control flow ============================ */ 1471 1472 static void emit_jmp_rel32(MCEmitter* mc, MCLabel l) { 1473 u8 op = X64_OPC_JMP_REL32; 1474 mc->emit_bytes(mc, &op, 1); 1475 emit_u32le(mc, 0); 1476 mc->emit_label_ref(mc, l, R_PC32, 4, -4); 1477 } 1478 static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l) { 1479 u8 op[2] = {X64_OPC_TWOBYTE, (u8)(X64_OPC_JCC_BASE | (cc & 0xfu))}; 1480 mc->emit_bytes(mc, op, 2); 1481 emit_u32le(mc, 0); 1482 mc->emit_label_ref(mc, l, R_PC32, 4, -4); 1483 } 1484 1485 static MCLabel x64_label_new(NativeTarget* t) { 1486 return t->mc->label_new(t->mc); 1487 } 1488 static void x64_label_place(NativeTarget* t, MCLabel l) { 1489 t->mc->label_place(t->mc, l); 1490 } 1491 static void x64_jump(NativeTarget* t, MCLabel l) { emit_jmp_rel32(t->mc, l); } 1492 1493 static void x64_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop, 1494 NativeLoc bop, MCLabel l) { 1495 MCEmitter* mc = t->mc; 1496 int fp = cmp_is_fp(op, aop); 1497 if (fp) { 1498 /* Materialize the 0/1 result, then branch on nonzero. 
*/ 1499 NativeLoc tmp = 1500 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I32), NATIVE_REG_INT, X64_RAX); 1501 x64_cmp(t, op, tmp, aop, bop); 1502 emit_test_self(mc, 0, X64_RAX); 1503 emit_jcc_rel32(mc, X64_CC_NE, l); 1504 return; 1505 } 1506 x64_emit_cmp_flags(t, aop, bop, 0); 1507 emit_jcc_rel32(mc, cmp_to_cc(op), l); 1508 } 1509 1510 static void x64_indirect_branch(NativeTarget* t, NativeLoc addr, 1511 const MCLabel* valid_targets, u32 ntargets) { 1512 MCEmitter* mc = t->mc; 1513 u32 r = loc_reg(addr); 1514 (void)valid_targets; 1515 (void)ntargets; 1516 if (r & 8u) { 1517 u8 rex = X64_REX_BASE | X64_REX_B; 1518 mc->emit_bytes(mc, &rex, 1); 1519 } 1520 { 1521 u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; 1522 mc->emit_bytes(mc, buf, 2); 1523 } 1524 } 1525 1526 static void x64_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) { 1527 /* `&&label` address-take: `leaq sym(%rip), rd` with an R_PC32 relocation 1528 * against the label's per-block local symbol — same form as a global 1529 * address-take, so a re-encoding assembler recomputes the displacement. 1530 * (A baked disp32 with no reloc would break once clang re-lays-out the 1531 * function.) 
*/ 1532 MCEmitter* mc = t->mc; 1533 u32 rd = loc_reg(dst); 1534 ObjSymId sym = mc_label_symbol(mc, l); 1535 u32 disp_pos; 1536 emit_rex(mc, 1, rd, 0, 0); 1537 { 1538 u8 op = X64_OPC_LEA; 1539 mc->emit_bytes(mc, &op, 1); 1540 } 1541 { 1542 u8 mr = modrm(0u, rd & 7u, 5u); /* [rip + disp32] */ 1543 mc->emit_bytes(mc, &mr, 1); 1544 } 1545 disp_pos = mc->pos(mc); 1546 emit_u32le(mc, 0); 1547 mc->emit_reloc_at(mc, mc->section_id, disp_pos, R_PC32, sym, -4, 1, 0); 1548 } 1549 1550 /* ============================ frame / lifecycle ============================ 1551 */ 1552 1553 static NativeFrameSlot x64_frame_slot(NativeTarget* t, 1554 const NativeFrameSlotDesc* d) { 1555 return native_frame_slot_alloc(&x64_of(t)->frame, d); 1556 } 1557 1558 static int x64_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot, 1559 CGDebugLoc* out) { 1560 X64NativeTarget* a = x64_of(t); 1561 X64NativeSlot* s; 1562 if (!out) return 0; 1563 memset(out, 0, sizeof *out); 1564 if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0; 1565 s = x64_slot_get(a, slot); 1566 out->kind = CG_DEBUG_LOC_FRAME; 1567 /* x64 slots live at RBP - off (exactly how the memory-operand path addresses 1568 * them). The hosted dbg snapshot seeds the frame base with RBP, so report 1569 * the RBP-relative offset — mirroring aa64's FP-relative convention. */ 1570 out->v.frame_ofs = -(i32)s->off; 1571 return 1; 1572 } 1573 1574 /* xmm save area base (rbp-relative). XMM saves are 16-aligned. */ 1575 static u32 x64_xmm_base(const X64NativeTarget* a, u32 cs_fp) { 1576 if (cs_fp == 0) return a->frame.cum_off; 1577 return align_up_u32(a->frame.cum_off, 16u); 1578 } 1579 1580 static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int, 1581 u32 cs_fp) { 1582 u32 xmm_base = x64_xmm_base(a, cs_fp); 1583 u32 raw = a->frame.max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base; 1584 u32 fs = align_up_u32(raw, 16u); 1585 return fs ? 
fs : 16u; 1586 } 1587 1588 /* Collect the callee-saves the body actually used. */ 1589 static u32 x64_collect_int_saves(X64NativeTarget* a, Reg* regs) { 1590 u32 n = 0, i; 1591 for (i = 0; i < a->frame.ncallee_saves; ++i) 1592 if (a->frame.callee_saves[i].cls == NATIVE_REG_INT) 1593 regs[n++] = a->frame.callee_saves[i].reg; 1594 return n; 1595 } 1596 static u32 x64_collect_fp_saves(X64NativeTarget* a, Reg* regs) { 1597 u32 n = 0, i; 1598 for (i = 0; i < a->frame.ncallee_saves; ++i) 1599 if (a->frame.callee_saves[i].cls == NATIVE_REG_FP) 1600 regs[n++] = a->frame.callee_saves[i].reg; 1601 return n; 1602 } 1603 1604 static ObjSymId x64_chkstk_sym(NativeTarget* t) { 1605 Sym name = pool_intern_slice(t->c->global, SLICE_LIT("__chkstk")); 1606 ObjSymId s = obj_symbol_find(t->obj, name); 1607 if (s != 0) return s; 1608 return obj_symbol(t->obj, name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); 1609 } 1610 1611 /* Build the prologue byte sequence into buf. Returns bytes written and, when 1612 * the chkstk path fires, the disp32 offset of the call site. When `skip_sub` is 1613 * set (the known-frame slim / red-zone tiers), the `sub rsp` reservation is 1614 * omitted entirely: the frame record is established but no stack is reserved, 1615 * either because the frame is empty (slim) or because the locals/saves live in 1616 * the SysV red zone (redzone_leaf). Callers must only set it when the frame 1617 * needs no reserved region (no alloca, no outgoing args, and — for the red 1618 * zone — a leaf frame <= 128 bytes). */ 1619 static u32 x64_build_prologue(X64NativeTarget* a, u8* buf, u32 cap, 1620 u32 frame_size, const Reg* cs_int, u32 n_int, 1621 const Reg* cs_fp, u32 n_fp, int skip_sub, 1622 u32* chkstk_disp_pos_out) { 1623 u32 wi = 0; 1624 u32 xmm_base = x64_xmm_base(a, n_fp); 1625 u32 i; 1626 /* Page granularity for Windows large-frame probing (0 = no probe needed). 
1627 * Win64 reserves >1-page frames through __chkstk; the same ABI capability 1628 * the aarch64 backend reads for its inline probe. */ 1629 u32 probe = abi_stack_probe_interval(a->base.c->abi); 1630 *chkstk_disp_pos_out = (u32)-1; 1631 if (cap < X64_PROLOGUE_BASE_BYTES) 1632 x64_panic(a, "prologue placeholder overflow"); 1633 /* push rbp; mov rbp, rsp. */ 1634 buf[wi++] = (u8)(X64_OPC_PUSH_R | (X64_RBP & 7u)); 1635 buf[wi++] = X64_REX_BASE | X64_REX_W; 1636 buf[wi++] = X64_OPC_MOV_RM_R; 1637 buf[wi++] = modrm(3u, X64_RSP, X64_RBP); 1638 /* sub rsp, frame_size (or chkstk on Win64 large frame); skipped by the slim / 1639 * red-zone tiers, which reserve no stack. */ 1640 if (skip_sub) { 1641 /* no reservation */ 1642 } else if (probe && frame_size > probe) { 1643 if (wi + 13u > cap) x64_panic(a, "prologue placeholder overflow"); 1644 buf[wi++] = (u8)(X64_OPC_MOV_RI | (X64_RAX & 7u)); /* mov eax, imm32 */ 1645 wr_u32_le(buf + wi, frame_size); 1646 wi += 4; 1647 buf[wi++] = X64_OPC_CALL_REL32; 1648 *chkstk_disp_pos_out = wi; 1649 wr_u32_le(buf + wi, 0); 1650 wi += 4; 1651 buf[wi++] = X64_REX_BASE | X64_REX_W; /* sub rsp, rax */ 1652 buf[wi++] = X64_OPC_ALU_SUB; 1653 buf[wi++] = modrm(3u, X64_RAX, X64_RSP); 1654 } else { 1655 if (wi + 7u > cap) x64_panic(a, "prologue placeholder overflow"); 1656 buf[wi++] = X64_REX_BASE | X64_REX_W; 1657 buf[wi++] = X64_OPC_ALU_IMM32; 1658 buf[wi++] = modrm(3u, X64_ALU_SUB_SUB, X64_RSP); 1659 wr_u32_le(buf + wi, frame_size); 1660 wi += 4; 1661 } 1662 /* sret: spill the first int arg reg (destination pointer) into its slot. 1663 * Use the minimal disp encoding (x64_pack_mem) so it matches the body's 1664 * frame stores and the matching epilogue restore — the `cc -S | as` 1665 * round-trip can then reproduce these bytes exactly. The -O0 placeholder is 1666 * NOP-padded to a fixed width, so a shorter prologue is harmless. 
*/ 1667 if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { 1668 X64NativeSlot* s = x64_slot_get(a, a->sret_ptr_slot); 1669 u32 sret_reg = a->abi->int_args[0]; 1670 i32 off = -(i32)s->off; 1671 if (wi + 8u > cap) x64_panic(a, "prologue placeholder overflow"); 1672 buf[wi++] = 1673 (u8)(X64_REX_BASE | X64_REX_W | ((sret_reg & 8u) ? X64_REX_R : 0u)); 1674 buf[wi++] = X64_OPC_MOV_RM_R; 1675 wi += x64_pack_mem(buf + wi, sret_reg & 7u, X64_RBP, off); 1676 } 1677 /* Spill callee-saved GPRs. */ 1678 for (i = 0; i < n_int; ++i) { 1679 u32 reg = cs_int[i]; 1680 i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1u) * 8; 1681 if (wi + 8u > cap) x64_panic(a, "prologue placeholder overflow"); 1682 buf[wi++] = (u8)(X64_REX_BASE | X64_REX_W | ((reg & 8u) ? X64_REX_R : 0u)); 1683 buf[wi++] = X64_OPC_MOV_RM_R; 1684 wi += x64_pack_mem(buf + wi, reg & 7u, X64_RBP, off); 1685 } 1686 /* Spill callee-saved XMMs (Win64). movaps [rbp+disp], xmm. */ 1687 for (i = 0; i < n_fp; ++i) { 1688 u32 xmm = cs_fp[i]; 1689 i32 off = -(i32)xmm_base - (i32)(i + 1u) * 16; 1690 u8 rex = (u8)((xmm & 8u) ? (X64_REX_BASE | X64_REX_R) : 0u); 1691 u32 need = rex ? 9u : 8u; 1692 if (wi + need > cap) x64_panic(a, "prologue placeholder overflow"); 1693 if (rex) buf[wi++] = rex; 1694 buf[wi++] = X64_OPC_TWOBYTE; 1695 buf[wi++] = 0x29; /* MOVAPS r/m128, xmm */ 1696 wi += x64_pack_mem(buf + wi, xmm & 7u, X64_RBP, off); 1697 } 1698 return wi; 1699 } 1700 1701 static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { 1702 X64NativeTarget* a = x64_of(t); 1703 MCEmitter* mc = t->mc; 1704 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 1705 a->func = fd; 1706 a->loc = fd->loc; 1707 a->abi = x64_abi_for_os(t->c->target.os); 1708 /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing, 1709 * callee-save set, and known_frame/has_alloca/frame_final. 
*/ 1710 native_frame_reset(&a->frame); 1711 a->incoming_stack_size = 0; 1712 a->next_param_int = 0; 1713 a->next_param_fp = 0; 1714 a->next_param_stack = 0; 1715 a->has_sret = (abi && abi->has_sret) ? 1u : 0u; 1716 a->is_variadic = (abi && abi->variadic) ? 1u : 0u; 1717 a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; 1718 a->reg_save_slot = NATIVE_FRAME_SLOT_NONE; 1719 a->npatches = 0; 1720 a->nalloca = 0; 1721 a->nbind_moves = 0; 1722 a->slim_frame = 0; 1723 a->redzone_leaf = 0; 1724 a->prologue_nbytes = 1725 a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES; 1726 1727 mc->set_section(mc, fd->text_section_id); 1728 mc->emit_align(mc, 16, X64_NOP1); 1729 a->func_start = mc->pos(mc); 1730 mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); 1731 if (mc->cfi_startproc) mc->cfi_startproc(mc); 1732 a->epilogue_label = mc->label_new(mc); 1733 } 1734 1735 /* Reserve the sret-pointer slot and (SysV) the 176-byte variadic reg-save 1736 * area. Advances next_param_int past the sret pointer (a0). 
*/ 1737 static void x64_reserve_entry_saves(X64NativeTarget* a) { 1738 NativeTarget* t = &a->base; 1739 if (a->has_sret) { 1740 NativeFrameSlotDesc sd; 1741 memset(&sd, 0, sizeof sd); 1742 sd.type = builtin_id(KIT_CG_BUILTIN_I64); 1743 sd.size = 8; 1744 sd.align = 8; 1745 sd.kind = NATIVE_FRAME_SLOT_SAVE; 1746 a->sret_ptr_slot = t->frame_slot(t, &sd); 1747 a->next_param_int = 1; 1748 } 1749 if (a->is_variadic && a->abi->emit_sysv_vararg_save) { 1750 NativeFrameSlotDesc rd; 1751 memset(&rd, 0, sizeof rd); 1752 rd.type = builtin_id(KIT_CG_BUILTIN_I64); 1753 rd.size = 176; 1754 rd.align = 8; 1755 rd.kind = NATIVE_FRAME_SLOT_SAVE; 1756 a->reg_save_slot = t->frame_slot(t, &rd); 1757 } 1758 } 1759 1760 static void x64_emit_variadic_reg_saves(X64NativeTarget* a) { 1761 NativeTarget* t = &a->base; 1762 MCEmitter* mc = t->mc; 1763 if (!a->is_variadic) return; 1764 if (a->abi->emit_sysv_vararg_save) { 1765 X64NativeSlot* rs = x64_slot_get(a, a->reg_save_slot); 1766 static const u32 gprs[6] = {X64_RDI, X64_RSI, X64_RDX, 1767 X64_RCX, X64_R8, X64_R9}; 1768 u32 i; 1769 for (i = 0; i < 6u; ++i) 1770 emit_mov_store(mc, 8, gprs[i], X64_RBP, -(i32)rs->off + (i32)(i * 8u)); 1771 for (i = 0; i < 8u; ++i) 1772 emit_sse_store(mc, 0xF2, 0x11, (u32)(X64_XMM0 + i), X64_RBP, 1773 -(i32)rs->off + (i32)(48u + i * 16u)); 1774 return; 1775 } 1776 /* Win64 variadic: spill the 4 GPR arg slots to the home space. 
*/ 1777 emit_mov_store(mc, 8, X64_RCX, X64_RBP, 16); 1778 emit_mov_store(mc, 8, X64_RDX, X64_RBP, 24); 1779 emit_mov_store(mc, 8, X64_R8, X64_RBP, 32); 1780 emit_mov_store(mc, 8, X64_R9, X64_RBP, 40); 1781 } 1782 1783 static void x64_func_begin(NativeTarget* t, const CGFuncDesc* fd) { 1784 X64NativeTarget* a = x64_of(t); 1785 MCEmitter* mc = t->mc; 1786 u32 i; 1787 x64_func_begin_common(t, fd); 1788 a->prologue_pos = mc->pos(mc); 1789 for (i = 0; i < a->prologue_nbytes; ++i) emit1(mc, X64_NOP1); 1790 x64_reserve_entry_saves(a); 1791 x64_emit_variadic_reg_saves(a); 1792 } 1793 1794 /* x64 homes callee-saves below the locals (offsets computed in 1795 * x64_compute_frame_size / x64_build_prologue), not in frame slots, so 1796 * alloc_slots=0: native_frame just records the {reg,cls} set from the masks. */ 1797 static void x64_reserve_callee_saves(NativeTarget* t, const u32* used, 1798 u32 nclasses) { 1799 native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0); 1800 } 1801 1802 static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r); 1803 static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r); 1804 1805 static u32 x64_live_callee_saved_mask(NativeTarget* t, 1806 NativeAllocClass cls) { 1807 X64NativeTarget* a = x64_of(t); 1808 const X64ABIRegs* abi = a->abi ? 
a->abi : x64_abi_for_os(t->c->target.os); 1809 u32 mask = 0; 1810 for (Reg r = 0; r < 16u; ++r) { 1811 if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(abi, r)) 1812 mask |= 1u << r; 1813 if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(abi, r)) 1814 mask |= 1u << r; 1815 } 1816 return mask; 1817 } 1818 1819 static u32 x64_live_caller_saved_mask(NativeTarget* t, 1820 NativeAllocClass cls) { 1821 const NativeAllocClassInfo* ci = native_target_class_info(t, cls); 1822 if (!ci) return 0; 1823 return ci->caller_saved_mask & ~x64_live_callee_saved_mask(t, cls); 1824 } 1825 1826 static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 1827 u32 nclob, u32* int_mask, u32* fp_mask); 1828 1829 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks 1830 * (cg/native_asm.h); it reads the target's live ABI masks. */ 1831 1832 /* Build the callee-saved set the prologue must preserve: the allocator-assigned 1833 * callee-saved registers (frame->callee_saved_used) plus any an inline-asm 1834 * block clobbers. The latter are opaque to the optimizer's operand scan, so it 1835 * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral 1836 * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks 1837 * and keep only the callee-saved ones. x64_reg_is_callee_* follow the live ABI: 1838 * they exclude rbp (handled by the prologue head) and keep the 1839 * reserved-but-callee- saved scratch rbx/r12 (which the caller still expects 1840 * preserved). This is the same register selection the per-block spill used, 1841 * hoisted into the prologue. */ 1842 static u32 x64_known_callee_saves(NativeTarget* t, const X64ABIRegs* abi, 1843 const NativeKnownFrameDesc* frame, u32* out, 1844 u32 cap) { 1845 u32 ncls = frame->ncallee_classes; 1846 u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; 1847 if (ncls > cap) ncls = cap; 1848 for (u32 c = 0; c < ncls; ++c) 1849 out[c] = frame->callee_saved_used ? 
frame->callee_saved_used[c] : 0u; 1850 if (frame->asm_clobbers && frame->nasm_clobbers) { 1851 X64NativeTarget* a = x64_of(t); 1852 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 1853 x64_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, 1854 &clob_int, &clob_fp); 1855 } 1856 native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int, 1857 &abi_fp); 1858 clob_int |= abi_int; 1859 clob_fp |= abi_fp; 1860 for (Reg r = 0; r < 16u; ++r) { 1861 if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && 1862 x64_reg_is_callee_int(abi, r)) 1863 out[NATIVE_REG_INT] |= 1u << r; 1864 if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && 1865 x64_reg_is_callee_fp(abi, r)) 1866 out[NATIVE_REG_FP] |= 1u << r; 1867 } 1868 return ncls; 1869 } 1870 1871 /* Optimizer entry point: the full frame is supplied up front, so the prologue 1872 * is emitted final the moment it is built — no NOP region, no func_end patch 1873 * (x64_func_end skips patching when known_frame). x64_build_prologue emits the 1874 * push rbp / sub rsp / sret spill / callee-save spills; the variadic 1875 * register-save stores are emitted separately, as on the single-pass path. 
*/ 1876 static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, 1877 const NativeKnownFrameDesc* frame, 1878 NativeFrameSlot* out_slots) { 1879 X64NativeTarget* a = x64_of(t); 1880 MCEmitter* mc = t->mc; 1881 Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS]; 1882 u32 n_int, n_fp, frame_size, nbytes, chkstk_disp_pos, i; 1883 u8 buf[X64_PROLOGUE_BYTES_WIN64]; 1884 x64_func_begin_common(t, fd); 1885 a->frame.known_frame = 1; 1886 if (frame) { 1887 u32 cs[NATIVE_CALL_PLAN_CLASSES]; 1888 u32 ncs = 1889 x64_known_callee_saves(t, a->abi, frame, cs, NATIVE_CALL_PLAN_CLASSES); 1890 a->frame.has_alloca = frame->has_alloca; 1891 if (ncs) x64_reserve_callee_saves(t, cs, ncs); 1892 for (i = 0; i < frame->nslots; ++i) { 1893 NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]); 1894 if (out_slots) out_slots[i] = slot; 1895 } 1896 x64_reserve_entry_saves(a); 1897 native_frame_note_outgoing(&a->frame, frame->max_outgoing); 1898 } 1899 /* Frame is final: size and offsets are settled, so emit the exact prologue. 1900 */ 1901 n_int = x64_collect_int_saves(a, cs_int); 1902 n_fp = x64_collect_fp_saves(a, cs_fp); 1903 frame_size = x64_compute_frame_size(a, n_int, n_fp); 1904 a->frame_size_final = frame_size; 1905 /* Cost-model tier selection (mirrors aa64's aa_func_begin_known_frame): with 1906 * the frame final before the body, choose the cheapest valid prologue shape. 1907 * Both tiers keep the rbp record and only drop the `sub rsp`, so the 1908 * epilogue/CFI/offset helpers are untouched. x64 needs no 1909 * `fp_at_bottom`-style fold: `push rbp` already folds the sp-move into the 1910 * store. */ 1911 a->slim_frame = a->frame.ncallee_saves == 0 && !a->frame.has_alloca && 1912 a->frame.cum_off == 0 && a->frame.max_outgoing == 0; 1913 /* redzone keeps locals below rsp in the red zone; exclude inline asm, which 1914 * may issue a `call` (clobbering the red zone) the optimizer can't see. 
slim 1915 * needs no such guard: it has no locals there and the return address lives on 1916 * the stack at [rbp+8], not in a clobberable register. */ 1917 a->redzone_leaf = !a->slim_frame && a->abi->shadow_space == 0 && frame && 1918 frame->is_leaf && !frame->has_asm && !a->frame.has_alloca && 1919 a->frame.max_outgoing == 0 && frame_size <= 128u; 1920 a->prologue_pos = mc->pos(mc); 1921 nbytes = x64_build_prologue(a, buf, sizeof buf, frame_size, cs_int, n_int, 1922 cs_fp, n_fp, a->slim_frame || a->redzone_leaf, 1923 &chkstk_disp_pos); 1924 mc->emit_bytes(mc, buf, nbytes); 1925 if (chkstk_disp_pos != (u32)-1) { 1926 ObjSymId chk = x64_chkstk_sym(t); 1927 mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos, 1928 R_X64_PLT32, chk, -4, 1, 0); 1929 } 1930 a->prologue_nbytes = nbytes; /* exact length: used for the CFI post offset */ 1931 x64_emit_variadic_reg_saves(a); 1932 native_frame_set_final(&a->frame); 1933 } 1934 1935 static void x64_func_end(NativeTarget* t) { 1936 X64NativeTarget* a = x64_of(t); 1937 MCEmitter* mc = t->mc; 1938 ObjBuilder* obj = t->obj; 1939 ObjSecId sec = a->func->text_section_id; 1940 Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS]; 1941 u32 n_int = x64_collect_int_saves(a, cs_int); 1942 u32 n_fp = x64_collect_fp_saves(a, cs_fp); 1943 u32 frame_size = x64_compute_frame_size(a, n_int, n_fp); 1944 u32 xmm_base = x64_xmm_base(a, n_fp); 1945 u32 end; 1946 i32 i; 1947 a->frame_size_final = frame_size; 1948 1949 /* Epilogue. */ 1950 mc->label_place(mc, a->epilogue_label); 1951 for (i = (i32)n_fp - 1; i >= 0; --i) { 1952 i32 off = -(i32)xmm_base - (i32)(i + 1) * 16; 1953 emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP, off); /* movaps */ 1954 } 1955 for (i = (i32)n_int - 1; i >= 0; --i) { 1956 i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8; 1957 emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP, off); 1958 } 1959 emit_leave(mc); 1960 emit_ret(mc); 1961 1962 /* Patch the single-pass prologue placeholder. 
*/ 1963 if (!a->frame.known_frame) { 1964 u8 buf[X64_PROLOGUE_BYTES_WIN64]; 1965 u32 chkstk_disp_pos; 1966 u32 nbytes; 1967 u32 k; 1968 for (k = 0; k < a->prologue_nbytes; ++k) buf[k] = X64_NOP1; 1969 /* Single-pass path never selects a slim/red-zone tier (it cannot know the 1970 * frame up front), so it always emits the full reservation. */ 1971 nbytes = x64_build_prologue(a, buf, a->prologue_nbytes, frame_size, cs_int, 1972 n_int, cs_fp, n_fp, 0, &chkstk_disp_pos); 1973 (void)nbytes; 1974 obj_patch(obj, sec, a->prologue_pos, buf, a->prologue_nbytes); 1975 if (chkstk_disp_pos != (u32)-1) { 1976 ObjSymId chk = x64_chkstk_sym(t); 1977 mc->emit_reloc_at(mc, sec, a->prologue_pos + chkstk_disp_pos, R_X64_PLT32, 1978 chk, -4, 1, 0); 1979 } 1980 } 1981 1982 /* Patch alloca disp32s: lea dst, [rsp + max_outgoing]. */ 1983 { 1984 u32 mo = align_up_u32(a->frame.max_outgoing, 16u); 1985 u32 k; 1986 for (k = 0; k < a->npatches; ++k) { 1987 u8 dbuf[4]; 1988 wr_u32_le(dbuf, mo); 1989 obj_patch(obj, sec, a->patches[k].pos, dbuf, 4); 1990 } 1991 } 1992 1993 /* CFI: after the prologue, CFA = rbp + 16; rbp at cfa-16, ra at cfa-8. */ 1994 if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { 1995 /* Body starts past the prologue. prologue_nbytes is the reserved NOP-region 1996 * size on the single-pass path and the exact prologue length on the 1997 * known-frame path (set in x64_func_begin_known_frame). */ 1998 u32 post = a->prologue_pos + a->prologue_nbytes; 1999 u32 k; 2000 mc->cfi_set_next_pc_offset(mc, post - a->func_start); 2001 /* CFI register operands are DWARF numbers, which differ from the x86-64 2002 * hardware encoding for rbp/rsp/rsi/rdi/rcx/rdx (e.g. rbp is HW 5 but 2003 * DWARF 6). Map every hardware GPR through x64_dwarf_from_hw_gpr; rip's 2004 * DWARF number (16) is already correct. 
*/ 2005 mc->cfi_def_cfa(mc, x64_dwarf_from_hw_gpr(X64_RBP), 16); 2006 mc->cfi_offset(mc, x64_dwarf_from_hw_gpr(X64_RBP), -16); 2007 mc->cfi_offset(mc, 16u /* rip */, -8); 2008 for (k = 0; k < n_int; ++k) { 2009 i32 off = -(i32)xmm_base - (i32)n_fp * 16 - (i32)(k + 1u) * 8; 2010 mc->cfi_offset(mc, x64_dwarf_from_hw_gpr(cs_int[k]), off); 2011 } 2012 } 2013 2014 end = mc->pos(mc); 2015 obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start, 2016 (u64)(end - a->func_start)); 2017 if (a->func->atomize) 2018 obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym, 2019 0); 2020 if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end); 2021 if (mc->cfi_endproc) mc->cfi_endproc(mc); 2022 mc_end_function(mc); 2023 a->func = NULL; 2024 } 2025 2026 /* ============================ params / ABI helpers 2027 * ============================ 2028 */ 2029 2030 /* Win64 shares one arg-slot index across int and FP. Keep cursors in lockstep. 2031 */ 2032 static void x64_sync_slot(const X64ABIRegs* abi, u32* next_int, u32* next_fp) { 2033 u32 m; 2034 if (!abi->slot_shared_int_fp) return; 2035 m = *next_int > *next_fp ? *next_int : *next_fp; 2036 *next_int = m; 2037 *next_fp = m; 2038 } 2039 2040 static const ABIArgInfo* x64_param_abi(NativeTarget* t, const ABIFuncInfo* abi, 2041 const NativeCallDesc* desc, u32 i, 2042 ABIArgInfo* scratch) { 2043 int variadic = abi && i >= abi->nparams; 2044 if (abi && i < abi->nparams) return &abi->params[i]; 2045 (void)variadic; 2046 memset(scratch, 0, sizeof *scratch); 2047 scratch->kind = ABI_ARG_DIRECT; 2048 scratch->nparts = 1; 2049 scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); 2050 ((ABIArgPart*)scratch->parts)[0].cls = 2051 cg_type_is_float(t->c, desc->args[i].type) ? 
ABI_CLASS_FP : ABI_CLASS_INT; 2052 ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; 2053 ((ABIArgPart*)scratch->parts)[0].size = 2054 native_type_size(t, desc->args[i].type); 2055 ((ABIArgPart*)scratch->parts)[0].align = 2056 native_type_align(t, desc->args[i].type); 2057 return scratch; 2058 } 2059 2060 static KitCgTypeId x64_part_scalar_type(const ABIArgPart* part) { 2061 if (part->cls == ABI_CLASS_FP) 2062 return part->size <= 4u ? builtin_id(KIT_CG_BUILTIN_F32) 2063 : builtin_id(KIT_CG_BUILTIN_F64); 2064 switch (part->size) { 2065 case 1u: 2066 return builtin_id(KIT_CG_BUILTIN_I8); 2067 case 2u: 2068 return builtin_id(KIT_CG_BUILTIN_I16); 2069 case 4u: 2070 return builtin_id(KIT_CG_BUILTIN_I32); 2071 default: 2072 return builtin_id(KIT_CG_BUILTIN_I64); 2073 } 2074 } 2075 2076 /* Is the whole DIRECT arg forced to the stack (not enough reg slots)? */ 2077 static int x64_direct_to_stack(const X64ABIRegs* abi, const ABIArgInfo* ai, 2078 u32 next_int, u32 next_fp) { 2079 u32 need_int, need_fp; 2080 x64_abi_direct_reg_need(ai, &need_int, &need_fp); 2081 return next_int + need_int > abi->n_int_args || 2082 next_fp + need_fp > abi->n_fp_args; 2083 } 2084 2085 /* Outgoing stack bytes a call uses (16-aligned), per the ABI. */ 2086 static u32 x64_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { 2087 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2088 const X64ABIRegs* aregs = x64_abi_for_os(t->c->target.os); 2089 u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; 2090 u32 next_fp = 0; 2091 u32 stack = aregs->shadow_space; 2092 u32 i; 2093 x64_sync_slot(aregs, &next_int, &next_fp); 2094 for (i = 0; i < desc->nargs; ++i) { 2095 ABIArgInfo tmp; 2096 const ABIArgInfo* ai = x64_param_abi(t, abi, desc, i, &tmp); 2097 u16 p; 2098 if (ai->kind == ABI_ARG_IGNORE) continue; 2099 if (ai->kind == ABI_ARG_INDIRECT) { 2100 if (next_int < aregs->n_int_args) 2101 ++next_int; 2102 else 2103 stack += 8u; 2104 x64_sync_slot(aregs, &next_int, &next_fp); 2105 continue; 2106 } 2107 if (ai->kind == ABI_ARG_DIRECT && 2108 x64_direct_to_stack(aregs, ai, next_int, next_fp)) { 2109 stack += (u32)ai->nparts * 8u; 2110 continue; 2111 } 2112 for (p = 0; p < ai->nparts; ++p) { 2113 const ABIArgPart* part = &ai->parts[p]; 2114 if (part->cls == ABI_CLASS_FP) { 2115 if (next_fp < aregs->n_fp_args) 2116 ++next_fp; 2117 else 2118 stack += 8u; 2119 } else { 2120 if (next_int < aregs->n_int_args) 2121 ++next_int; 2122 else 2123 stack += 8u; 2124 } 2125 x64_sync_slot(aregs, &next_int, &next_fp); 2126 } 2127 } 2128 return align_up_u32(stack, 16u); 2129 } 2130 2131 static u32 x64_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { 2132 return x64_call_stack_size(t, desc); 2133 } 2134 2135 static u32 x64_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 2136 int* variadic, u32* nparams) { 2137 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); 2138 NativeCallDesc d; 2139 if (variadic) *variadic = abi ? (int)abi->variadic : 0; 2140 if (nparams) *nparams = abi ? abi->nparams : 0u; 2141 memset(&d, 0, sizeof d); 2142 d.fn_type = fn_type; 2143 d.nargs = abi ? abi->nparams : 0u; 2144 if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); 2145 return x64_call_stack_size(t, &d); 2146 } 2147 2148 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). 
*/ 2149 static NativeAddr x64_loc_addr(X64NativeTarget* a, NativeLoc loc, u32 offset) { 2150 NativeAddr addr; 2151 memset(&addr, 0, sizeof addr); 2152 switch ((NativeLocKind)loc.kind) { 2153 case NATIVE_LOC_FRAME: 2154 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2155 addr.base.frame = loc.v.frame; 2156 addr.base_type = loc.type; 2157 addr.offset = (i32)offset; 2158 return addr; 2159 case NATIVE_LOC_STACK: 2160 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2161 addr.base.frame = loc.v.stack.slot; 2162 addr.base_type = loc.type; 2163 addr.offset = loc.v.stack.offset + (i32)offset; 2164 return addr; 2165 case NATIVE_LOC_ADDR: 2166 addr = loc.v.addr; 2167 addr.offset += (i32)offset; 2168 return addr; 2169 default: 2170 x64_panic(a, "location is not addressable"); 2171 } 2172 return addr; 2173 } 2174 2175 static void x64_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 2176 u32 offset, u32 size) { 2177 X64NativeTarget* a = x64_of(t); 2178 if (src.kind == NATIVE_LOC_REG) { 2179 x64_move(t, dst, src); 2180 return; 2181 } 2182 if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK || 2183 src.kind == NATIVE_LOC_ADDR) { 2184 NativeAddr addr = x64_loc_addr(a, src, offset); 2185 addr.base_type = dst.type; 2186 x64_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size)); 2187 return; 2188 } 2189 if (src.kind == NATIVE_LOC_IMM) { 2190 x64_emit_load_imm(t->mc, x64_is_64(t, dst.type) ? 
1 : 0, loc_reg(dst), 2191 src.v.imm); 2192 return; 2193 } 2194 x64_panic(a, "unsupported part source"); 2195 } 2196 2197 static void x64_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 2198 u32 offset, u32 size) { 2199 X64NativeTarget* a = x64_of(t); 2200 if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK || 2201 dst.kind == NATIVE_LOC_ADDR) { 2202 NativeAddr addr = x64_loc_addr(a, dst, offset); 2203 addr.base_type = src.type; 2204 x64_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size)); 2205 return; 2206 } 2207 if (dst.kind == NATIVE_LOC_REG) { 2208 x64_move(t, dst, src); 2209 return; 2210 } 2211 x64_panic(a, "unsupported part destination"); 2212 } 2213 2214 static void x64_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { 2215 NativeAddr addr = x64_loc_addr(x64_of(t), src, 0); 2216 x64_load_addr(t, dst, addr); 2217 } 2218 2219 static void x64_store_outgoing_part(NativeTarget* t, int tail_call, 2220 u32 stack_off, NativeLoc src, u32 size) { 2221 X64NativeTarget* a = x64_of(t); 2222 NativeAddr addr; 2223 memset(&addr, 0, sizeof addr); 2224 addr.base_kind = NATIVE_ADDR_BASE_REG; 2225 addr.base_type = src.type; 2226 if (tail_call) { 2227 /* A sibling call reuses the caller's frame: its outgoing stack args land in 2228 * the caller's incoming-arg window. `stack_off` already includes the 2229 * shadow-space prefix (the outgoing cursor starts at shadow_space), so the 2230 * window address is [rbp + 16 + stack_off] — the same bytes the tail-callee 2231 * reads once `leave` has restored rsp to the return address. */ 2232 addr.base.reg = X64_RBP; 2233 addr.offset = (i32)(16u + stack_off); 2234 } else { 2235 addr.base.reg = X64_RSP; 2236 addr.offset = (i32)stack_off; 2237 } 2238 x64_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size)); 2239 } 2240 2241 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. 
*/ 2242 static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n, 2243 Reg int_scratch); 2244 2245 /* Defer a register-destination param bind for the parallel-copy flush in 2246 * x64_bind_params_end. `src` is the incoming location (an arg register, or a 2247 * NATIVE_LOC_ADDR for an incoming stack slot). */ 2248 static void x64_defer_reg_bind(X64NativeTarget* a, NativeLoc dst, NativeLoc src, 2249 u32 size) { 2250 NativeArgMove* m; 2251 if (a->nbind_moves >= X64_MAX_BIND_MOVES) 2252 x64_panic(a, "too many register parameter binds"); 2253 m = &a->bind_moves[a->nbind_moves++]; 2254 memset(m, 0, sizeof *m); 2255 m->dst = dst; 2256 m->src = src; 2257 m->size = size; 2258 } 2259 2260 /* Incoming stack-arg source as a NATIVE_LOC_ADDR ([rbp + bias + stack_off]). */ 2261 static NativeLoc x64_incoming_stack_loc(KitCgTypeId type, NativeAllocClass cls, 2262 i32 off) { 2263 NativeLoc l; 2264 memset(&l, 0, sizeof l); 2265 l.kind = NATIVE_LOC_ADDR; 2266 l.cls = (u8)cls; 2267 l.type = type; 2268 l.v.addr.base_kind = NATIVE_ADDR_BASE_REG; 2269 l.v.addr.base.reg = X64_RBP; 2270 l.v.addr.base_type = type; 2271 l.v.addr.offset = off; 2272 return l; 2273 } 2274 2275 static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, 2276 NativeLoc dst) { 2277 X64NativeTarget* a = x64_of(t); 2278 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 2279 const ABIArgInfo* ai = 2280 p->index < abi->nparams ? &abi->params[p->index] : NULL; 2281 int to_reg = dst.kind == NATIVE_LOC_REG; 2282 /* Incoming stack args sit above the saved rbp + return addr (+16); Win64 2283 * additionally reserves 32B of home space. */ 2284 i32 incoming_bias = (i32)(16u + a->abi->shadow_space); 2285 u16 i; 2286 if (!ai || ai->kind == ABI_ARG_IGNORE) return; 2287 2288 if (ai->kind == ABI_ARG_INDIRECT) { 2289 /* Incoming pointer to a byval copy: load pointer, memcpy into dst frame. 
*/ 2290 u32 ptr_reg; 2291 NativeAddr d_addr, from; 2292 AggregateAccess access; 2293 if (a->next_param_int < a->abi->n_int_args) { 2294 ptr_reg = a->abi->int_args[a->next_param_int++]; 2295 } else { 2296 ptr_reg = X64_R11; 2297 emit_mov_load(t->mc, 8, 0, ptr_reg, X64_RBP, 2298 incoming_bias + (i32)a->next_param_stack); 2299 a->next_param_stack += 8u; 2300 } 2301 x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp); 2302 if (dst.kind != NATIVE_LOC_FRAME) 2303 x64_panic(a, "indirect parameter requires a frame destination"); 2304 memset(&d_addr, 0, sizeof d_addr); 2305 d_addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2306 d_addr.base.frame = dst.v.frame; 2307 d_addr.base_type = p->type; 2308 memset(&from, 0, sizeof from); 2309 from.base_kind = NATIVE_ADDR_BASE_REG; 2310 from.base.reg = ptr_reg; 2311 from.base_type = p->type; 2312 memset(&access, 0, sizeof access); 2313 access.type = p->type; 2314 access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); 2315 access.align = p->align ? p->align : native_type_align(t, p->type); 2316 x64_copy_bytes(t, d_addr, from, access); 2317 return; 2318 } 2319 2320 if (ai->kind == ABI_ARG_DIRECT && 2321 x64_direct_to_stack(a->abi, ai, a->next_param_int, a->next_param_fp)) { 2322 /* Whole arg on the stack. */ 2323 for (i = 0; i < ai->nparts; ++i) { 2324 const ABIArgPart* part = &ai->parts[i]; 2325 NativeAllocClass cls = 2326 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2327 NativeLoc isrc = x64_incoming_stack_loc( 2328 p->type, cls, incoming_bias + (i32)a->next_param_stack); 2329 a->next_param_stack += 8u; 2330 if (dst.kind == NATIVE_LOC_NONE) { 2331 /* unused */ 2332 } else if (to_reg) { 2333 /* Defer: a register dst may be another param's incoming reg. */ 2334 x64_defer_reg_bind( 2335 a, 2336 native_loc_reg(dst.type ? 
dst.type : p->type, 2337 (NativeAllocClass)dst.cls, (Reg)dst.v.reg), 2338 isrc, part->size); 2339 } else { 2340 /* Frame dst: load to scratch then store (memory dst is never a cycle 2341 * source, so emit eagerly — it only reads the incoming slot). */ 2342 Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; 2343 NativeLoc tloc = native_loc_reg(p->type, cls, tmp); 2344 x64_load_part(t, tloc, isrc, 0, part->size); 2345 x64_store_part( 2346 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), 2347 tloc, 0, part->size); 2348 } 2349 } 2350 return; 2351 } 2352 2353 for (i = 0; i < ai->nparts; ++i) { 2354 const ABIArgPart* part = &ai->parts[i]; 2355 NativeAllocClass cls = 2356 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2357 NativeLoc 2358 src; /* incoming: arg register, or NATIVE_LOC_ADDR for a stack arg */ 2359 if (cls == NATIVE_REG_FP && a->next_param_fp < a->abi->n_fp_args) { 2360 src = native_loc_reg(p->type, cls, (Reg)(X64_XMM0 + a->next_param_fp++)); 2361 } else if (cls == NATIVE_REG_INT && 2362 a->next_param_int < a->abi->n_int_args) { 2363 src = native_loc_reg(p->type, cls, a->abi->int_args[a->next_param_int++]); 2364 } else { 2365 src = x64_incoming_stack_loc(p->type, cls, 2366 incoming_bias + (i32)a->next_param_stack); 2367 a->next_param_stack += 8u; 2368 } 2369 x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp); 2370 if (dst.kind == NATIVE_LOC_NONE) { 2371 /* unused parameter; cursors advanced */ 2372 } else if (to_reg) { 2373 /* Defer the register bind: the allocator may rotate params across the 2374 * incoming arg registers, so a per-param move could clobber a register 2375 * another bind still needs. x64_bind_params_end resolves them together as 2376 * a parallel copy. */ 2377 x64_defer_reg_bind( 2378 a, 2379 native_loc_reg(dst.type ? 
dst.type : p->type, 2380 (NativeAllocClass)dst.cls, (Reg)dst.v.reg), 2381 src, part->size); 2382 } else if (src.kind == NATIVE_LOC_REG) { 2383 x64_store_part( 2384 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, 2385 0, part->size); 2386 } else { 2387 /* Stack source -> frame dst: load to scratch, then store. */ 2388 Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; 2389 NativeLoc tloc = native_loc_reg(p->type, cls, tmp); 2390 x64_load_part(t, tloc, src, 0, part->size); 2391 x64_store_part( 2392 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), 2393 tloc, 0, part->size); 2394 } 2395 } 2396 a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); 2397 } 2398 2399 /* Flush the deferred register-destination param binds as a parallel copy (the 2400 * shared scheduler breaks any cycle the allocator's rotation created through 2401 * the int/fp emit scratch). Frame-dst and indirect binds were emitted eagerly 2402 * in bind_param — they only read incoming registers, so they precede this. */ 2403 static void x64_bind_params_end(NativeTarget* t) { 2404 X64NativeTarget* a = x64_of(t); 2405 /* No callee is staged during entry binds, so r11 is free as the cycle 2406 * scratch. */ 2407 if (a->nbind_moves) 2408 x64_emit_reg_arg_moves(t, a->bind_moves, a->nbind_moves, X64_TMP_INT2); 2409 a->nbind_moves = 0; 2410 } 2411 2412 /* ============================ calls / returns ============================ */ 2413 2414 typedef NativeArgMove X64ArgMove; 2415 2416 static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { 2417 if (m->is_addr) { 2418 x64_addr_of_loc(t, m->dst, m->src); 2419 } else { 2420 x64_load_part(t, m->dst, m->src, m->src_offset, m->size); 2421 } 2422 if (m->dup_to_gpr) { 2423 /* movq gpr, xmm: 66 REX.W 0F 7E /r (xmm in reg field). */ 2424 emit_sse_rr_w(t->mc, 0x66, 0x7E, 1, loc_reg(m->dst), m->dup_gpr); 2425 } 2426 } 2427 2428 /* Parallel-copy register arg moves via the shared scheduler. 
`int_scratch` is 2429 * the register used to break an integer cycle: normally r11, but rax when an 2430 * indirect callee is staged in r11 (rax is never a SysV int arg register and 2431 * the variadic AL count is written only after the moves). */ 2432 static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n, 2433 Reg int_scratch) { 2434 NativeArgShuffle s; 2435 if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args"); 2436 memset(&s, 0, sizeof s); 2437 s.t = t; 2438 s.emit_one = x64_emit_one_arg_move; 2439 s.reg_move = x64_move; 2440 s.scratch[NATIVE_REG_INT] = int_scratch; 2441 s.scratch[NATIVE_REG_FP] = X64_TMP_FP; 2442 native_arg_shuffle(&s, moves, n); 2443 } 2444 2445 /* Clobber masks: per-call all caller-saved regs are clobbered. */ 2446 static u32 x64_clobber_mask(const X64ABIRegs* abi, NativeAllocClass cls) { 2447 u32 mask = 0, r; 2448 if (cls == NATIVE_REG_INT) { 2449 for (r = 0; r < 16u; ++r) { 2450 if (r == X64_RSP || r == X64_RBP) continue; 2451 if ((abi->cs_int_mask & (1ull << r)) == 0) mask |= 1u << r; 2452 } 2453 } else if (cls == NATIVE_REG_FP) { 2454 for (r = 0; r < 16u; ++r) 2455 if ((abi->cs_fp_mask & (1ull << r)) == 0) mask |= 1u << r; 2456 } 2457 return mask; 2458 } 2459 2460 static u32 x64_return_mask(const ABIFuncInfo* abi, NativeAllocClass cls) { 2461 u32 mask = 0, ni = 0, nf = 0; 2462 static const u32 iregs[2] = {X64_RAX, X64_RDX}; 2463 u16 i; 2464 if (!abi || abi->ret.kind == ABI_ARG_IGNORE || 2465 abi->ret.kind == ABI_ARG_INDIRECT) 2466 return 0; 2467 for (i = 0; i < abi->ret.nparts; ++i) { 2468 const ABIArgPart* p = &abi->ret.parts[i]; 2469 if (cls == NATIVE_REG_INT && p->cls == ABI_CLASS_INT && ni < 2) 2470 mask |= 1u << iregs[ni++]; 2471 else if (cls == NATIVE_REG_FP && p->cls == ABI_CLASS_FP && nf < 2) 2472 mask |= 1u << (X64_XMM0 + nf++); 2473 } 2474 return mask; 2475 } 2476 2477 static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, 2478 NativeCallPlan* plan) { 2479 
X64NativeTarget* a = x64_of(t); 2480 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2481 const X64ABIRegs* aregs = a->abi ? a->abi : x64_abi_for_os(t->c->target.os); 2482 NativeCallPlanRet* rets; 2483 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2484 u32 c; 2485 memset(plan, 0, sizeof *plan); 2486 rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL; 2487 plan->callee = desc->callee; 2488 plan->rets = rets; 2489 plan->flags = desc->flags; 2490 plan->has_sret = abi && abi->has_sret; 2491 plan->is_variadic = abi && abi->variadic; 2492 plan->stack_arg_size = x64_call_stack_size(t, desc); 2493 if (plan->stack_arg_size > a->frame.max_outgoing) 2494 a->frame.max_outgoing = plan->stack_arg_size; 2495 for (c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) { 2496 plan->clobber_mask[c] = x64_clobber_mask(aregs, (NativeAllocClass)c); 2497 plan->return_mask[c] = x64_return_mask(abi, (NativeAllocClass)c); 2498 } 2499 /* Indirect callee in a clobbered/arg register would be lost; stage in r11. */ 2500 if (plan->callee.kind == NATIVE_LOC_REG && 2501 (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && 2502 plan->callee.v.reg != X64_R11) { 2503 NativeLoc scratch = 2504 native_loc_reg(plan->callee.type, NATIVE_REG_INT, X64_R11); 2505 x64_move(t, scratch, plan->callee); 2506 plan->callee = scratch; 2507 } 2508 { 2509 u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; 2510 u32 next_fp = 0, stack = aregs->shadow_space, nmoves = 0, i; 2511 int tail = (desc->flags & CG_CALL_TAIL) != 0; 2512 u16 p; 2513 X64ArgMove moves[X64_MAX_REG_ARG_MOVES]; 2514 x64_sync_slot(aregs, &next_int, &next_fp); 2515 for (i = 0; i < desc->nargs; ++i) { 2516 ABIArgInfo tmp; 2517 const ABIArgInfo* ai = x64_param_abi(t, abi, desc, i, &tmp); 2518 int variadic_arg = abi && i >= abi->nparams; 2519 if (ai->kind == ABI_ARG_IGNORE) continue; 2520 if (ai->kind == ABI_ARG_INDIRECT) { 2521 if (next_int < aregs->n_int_args) { 2522 X64ArgMove* m = &moves[nmoves++]; 2523 memset(m, 0, sizeof *m); 2524 m->dst = 2525 native_loc_reg(i64t, NATIVE_REG_INT, aregs->int_args[next_int++]); 2526 m->src = desc->args[i]; 2527 m->size = 8; 2528 m->is_addr = 1; 2529 } else { 2530 NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX); 2531 x64_addr_of_loc(t, ptr, desc->args[i]); 2532 x64_store_outgoing_part(t, tail, stack, ptr, 8); 2533 stack += 8u; 2534 } 2535 x64_sync_slot(aregs, &next_int, &next_fp); 2536 continue; 2537 } 2538 if (ai->kind == ABI_ARG_DIRECT && 2539 x64_direct_to_stack(aregs, ai, next_int, next_fp)) { 2540 for (p = 0; p < ai->nparts; ++p) { 2541 const ABIArgPart* part = &ai->parts[p]; 2542 NativeAllocClass cls = 2543 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2544 Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; 2545 NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp); 2546 x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); 2547 x64_store_outgoing_part(t, tail, stack, tmpreg, part->size); 2548 stack += 8u; 2549 } 2550 continue; 2551 } 2552 for (p = 0; p < ai->nparts; ++p) { 2553 const ABIArgPart* part = &ai->parts[p]; 2554 NativeAllocClass cls = 2555 part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; 2556 if (cls == NATIVE_REG_FP && next_fp < aregs->n_fp_args) { 2557 X64ArgMove* m = &moves[nmoves++]; 2558 u32 slot = next_fp; 2559 memset(m, 0, sizeof *m); 2560 m->dst = native_loc_reg(desc->args[i].type, cls, 2561 (Reg)(X64_XMM0 + next_fp++)); 2562 m->src = desc->args[i]; 2563 m->src_offset = part->src_offset; 2564 m->size = part->size; 2565 if (aregs->vararg_fp_dup_to_gpr && variadic_arg && 2566 slot < aregs->n_int_args) { 2567 m->dup_to_gpr = 1; 2568 m->dup_gpr = aregs->int_args[slot]; 2569 } 2570 x64_sync_slot(aregs, &next_int, &next_fp); 2571 } else if (cls == NATIVE_REG_INT && next_int < aregs->n_int_args) { 2572 X64ArgMove* m = &moves[nmoves++]; 2573 memset(m, 0, sizeof *m); 2574 m->dst = native_loc_reg(desc->args[i].type, cls, 2575 aregs->int_args[next_int++]); 2576 m->src = desc->args[i]; 2577 m->src_offset = part->src_offset; 2578 m->size = part->size; 2579 x64_sync_slot(aregs, &next_int, &next_fp); 2580 } else { 2581 Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; 2582 NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp); 2583 x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); 2584 x64_store_outgoing_part(t, tail, stack, tmpreg, part->size); 2585 stack += 8u; 2586 x64_sync_slot(aregs, &next_int, &next_fp); 2587 } 2588 } 2589 } 2590 /* If an indirect callee was staged in r11 above, the cycle scratch must 2591 * avoid it; rax is free here (not an int arg reg; AL count comes later). */ 2592 x64_emit_reg_arg_moves( 2593 t, moves, nmoves, 2594 (plan->callee.kind == NATIVE_LOC_REG && plan->callee.v.reg == X64_R11) 2595 ? X64_TMP_INT 2596 : X64_TMP_INT2); 2597 if (abi && abi->has_sret) { 2598 /* sret pointer in the first int-arg reg. A tail call forwards the 2599 * caller's own incoming sret pointer (spilled at entry); otherwise pass 2600 * the address of this call's result slot. 
*/ 2601 NativeLoc sret = native_loc_reg(i64t, NATIVE_REG_INT, aregs->int_args[0]); 2602 if (tail) 2603 x64_load_part(t, sret, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 2604 8); 2605 else if (desc->nresults) 2606 x64_addr_of_loc(t, sret, desc->results[0]); 2607 } 2608 /* Variadic call: AL = number of vector regs used. */ 2609 if (abi && abi->variadic) 2610 x64_emit_load_imm(t->mc, 0, X64_RAX, (i64)next_fp); 2611 } 2612 /* Return value receipt. */ 2613 if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { 2614 u32 nr = 0, ni = 0, nf = 0; 2615 static const u32 ret_int_regs[2] = {X64_RAX, X64_RDX}; 2616 u16 p; 2617 for (p = 0; p < abi->ret.nparts; ++p) { 2618 const ABIArgPart* part = &abi->ret.parts[p]; 2619 NativeAllocClass cls = 2620 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2621 KitCgTypeId pty = x64_part_scalar_type(part); 2622 Reg rreg = cls == NATIVE_REG_FP ? (Reg)(X64_XMM0 + nf++) 2623 : (Reg)ret_int_regs[ni++]; 2624 rets[nr].src = native_loc_reg(pty, cls, rreg); 2625 rets[nr].dst = desc->results[0]; 2626 if (rets[nr].dst.kind == NATIVE_LOC_FRAME) 2627 rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, 2628 (i32)part->src_offset); 2629 else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { 2630 rets[nr].dst.v.stack.offset += (i32)part->src_offset; 2631 rets[nr].dst.type = pty; 2632 } 2633 rets[nr].mem = native_mem_for_type(t, pty, part->size); 2634 nr++; 2635 } 2636 plan->nrets = nr; 2637 } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { 2638 plan->nrets = 0; 2639 } else if (!abi && desc->nresults) { 2640 rets[0].src = 2641 native_loc_reg(desc->results[0].type, NATIVE_REG_INT, X64_RAX); 2642 rets[0].dst = desc->results[0]; 2643 rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0); 2644 plan->nrets = 1; 2645 } 2646 } 2647 2648 /* Emit a sibling (tail) call: tear the frame down and jump (no call) to the 2649 * callee. Outgoing args are already in arg regs / the caller's incoming-arg 2650 * window. 
`leave` (mov rsp,rbp; pop rbp) restores the caller's rbp and leaves
 * rsp at the return address — frame_size-independent, so no func_end patch.
 *
 * t:      target whose emitter (t->mc) receives the bytes.
 * callee: NATIVE_LOC_GLOBAL (jmp rel32 + R_X64_PLT32 relocation) or
 *         NATIVE_LOC_REG (indirect jmp through that register); any other
 *         location kind panics. */
static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
  X64NativeTarget* a = x64_of(t);
  MCEmitter* mc = t->mc;
  ObjSecId sec = mc->section_id;
  /* Restore callee-saves before the frame teardown (O1 path; none at -O0).
   * Their rbp-relative offsets are frame-size-independent, and the indirect
   * callee was staged in r11 by plan_call — a caller-saved scratch — so these
   * restores never clobber it. Mirrors the x64_func_end epilogue. */
  Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
  u32 n_int = x64_collect_int_saves(a, cs_int);
  u32 n_fp = x64_collect_fp_saves(a, cs_fp);
  u32 xmm_base = x64_xmm_base(a, n_fp);
  i32 i;
  /* FP saves occupy 16-byte slots directly below xmm_base... */
  for (i = (i32)n_fp - 1; i >= 0; --i)
    emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP,
                  -(i32)xmm_base - (i32)(i + 1) * 16); /* movaps */
  /* ...and the integer saves occupy 8-byte slots below the FP block. */
  for (i = (i32)n_int - 1; i >= 0; --i)
    emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP,
                  -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8);
  emit_leave(mc);
  if (callee.kind == NATIVE_LOC_GLOBAL) {
    /* jmp rel32: the displacement is a zero placeholder resolved by the
     * relocation recorded at disp_pos. */
    u8 op = X64_OPC_JMP_REL32;
    u32 disp_pos;
    mc->emit_bytes(mc, &op, 1);
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0);
    mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, callee.v.global.sym,
                      callee.v.global.addend - 4, 1, 0);
  } else if (callee.kind == NATIVE_LOC_REG) {
    u32 r =
        loc_reg(callee); /* indirect callee was staged in r11 by plan_call */
    /* Registers 8..15 need REX.B to extend the modrm r/m field. */
    if (r & 8u) {
      u8 rex = X64_REX_BASE | X64_REX_B;
      mc->emit_bytes(mc, &rex, 1);
    }
    {
      u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; /* jmp r/m, /4 */
      mc->emit_bytes(mc, buf, 2);
    }
  } else {
    x64_panic(a, "unsupported tail call target");
  }
}

/* Emit the transfer instruction for a planned call site.
 *
 * A plan flagged CG_CALL_TAIL is handed to x64_emit_tail_site. Otherwise a
 * NATIVE_LOC_GLOBAL callee becomes `call rel32` with an R_X64_PLT32
 * relocation on the displacement, and a NATIVE_LOC_REG callee becomes an
 * indirect `call r/m64`. Any other callee kind panics. */
static void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
  MCEmitter* mc = t->mc;
  ObjSecId sec = mc->section_id;
  if (plan->flags & CG_CALL_TAIL) {
    x64_emit_tail_site(t, plan->callee);
    return;
  }
  if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
    u8 op = X64_OPC_CALL_REL32;
    u32 disp_pos;
    mc->emit_bytes(mc, &op, 1);
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0); /* placeholder disp32, filled via the relocation */
    mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, plan->callee.v.global.sym,
                      plan->callee.v.global.addend - 4, 1, 0);
    return;
  }
  if (plan->callee.kind == NATIVE_LOC_REG) {
    u32 r = loc_reg(plan->callee);
    if (r & 8u) {
      u8 rex = X64_REX_BASE | X64_REX_B;
      mc->emit_bytes(mc, &rex, 1);
    }
    {
      /* NOTE(review): the opcode constant is named for JMP but is used here
       * with modrm.reg = /2 (call); presumably it is the group-5 opcode byte
       * that jmp and call share — confirm against arch/x64/isa.h. */
      u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 2u, r & 7u)}; /* call r/m, /2 */
      mc->emit_bytes(mc, buf, 2);
    }
    return;
  }
  x64_panic(x64_of(t), "unsupported call target");
}

/* Plan how a function's return value reaches its ABI return location.
 *
 * fd:        descriptor of the function being compiled (fn_type selects the
 *            ABI info).
 * value:     location of the value being returned, or NULL for no value.
 * out_rets / out_nrets: receive an arena-allocated array of src->dst moves
 *            and its length (NULL / 0 when nothing needs moving).
 *
 * Three shapes are handled:
 *  - ABI_ARG_INDIRECT (sret): the copy into the caller's buffer is emitted
 *    right here and no moves are returned.
 *  - ABI_ARG_DIRECT: one move per ABI part, integer parts into rax/rdx and FP
 *    parts into xmm0/xmm1, in part order.
 *  - anything else with a value: a single move into rax. */
static void x64_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
                         const NativeLoc* value,
                         NativeCallPlanRet** out_rets, u32* out_nrets) {
  X64NativeTarget* a = x64_of(t);
  const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
  NativeCallPlanRet* rets = NULL;
  u32 nr = 0;
  if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
  if (value && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
    /* sret: reload destination pointer (spilled at entry) into r11, memcpy the
     * source aggregate into [r11], and convention-return the pointer in rax. */
    KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
    NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, X64_R11);
    NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
    NativeAddr dst_addr, src_addr;
    AggregateAccess access;
    x64_load_part(t, dstp, saved, 0, 8);
    memset(&dst_addr, 0, sizeof dst_addr);
    dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
    dst_addr.base.reg = X64_R11;
    dst_addr.base_type = value->type;
    src_addr = x64_loc_addr(a, *value, 0);
    src_addr.base_type = value->type;
    memset(&access, 0, sizeof access);
    access.type = value->type;
    access.size = (u32)cg_type_size(t->c, value->type);
    access.align = native_type_align(t, value->type);
    x64_copy_bytes(t, dst_addr, src_addr, access);
    /* rax = sret pointer. Reload it (copy_bytes clobbered r11/rax). */
    x64_load_part(t, native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX), saved, 0,
                  8);
    *out_rets = NULL;
    *out_nrets = 0;
    return;
  }
  if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) {
    u32 ni = 0, nf = 0;
    static const u32 ret_int_regs[2] = {X64_RAX, X64_RDX};
    u16 p;
    for (p = 0; p < abi->ret.nparts; ++p) {
      const ABIArgPart* part = &abi->ret.parts[p];
      NativeAllocClass cls =
          part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
      KitCgTypeId pty = x64_part_scalar_type(part);
      Reg rreg = cls == NATIVE_REG_FP ? (Reg)(X64_XMM0 + nf++)
                                      : (Reg)ret_int_regs[ni++];
      rets[nr].src = *value;
      /* Memory-resident values are re-addressed per part: the part's byte
       * offset within the aggregate selects the slice to load. */
      if (rets[nr].src.kind == NATIVE_LOC_FRAME)
        rets[nr].src =
            native_loc_stack(pty, value->v.frame, (i32)part->src_offset);
      else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
        rets[nr].src.v.stack.offset += (i32)part->src_offset;
        rets[nr].src.type = pty;
      }
      rets[nr].dst = native_loc_reg(pty, cls, rreg);
      rets[nr].mem = native_mem_for_type(t, pty, part->size);
      nr++;
    }
  } else if (value) {
    /* No usable ABI classification: return the value whole in rax. */
    rets[0].src = *value;
    rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, X64_RAX);
    rets[0].mem = native_mem_for_type(t, value->type, 0);
    nr = 1;
  }
  *out_rets = rets;
  *out_nrets = nr;
}

/* Return from the current function by jumping to its epilogue label. */
static void x64_ret(NativeTarget* t) {
  X64NativeTarget* a = x64_of(t);
  x64_jump(t, a->epilogue_label);
}

/* ============================ alloca ============================ */

/* Dynamic stack allocation.
 *
 * dst:   register that receives the address of the new block.
 * size:  byte count, either an immediate or a register.
 * align: requested alignment; 0 and anything below 16 are treated as 16, and
 *        anything above 16 panics.
 *
 * rsp is lowered by the size rounded up to a multiple of 16 (an immediate
 * size of 0 still reserves 16 bytes), then dst is set with a
 * `lea dst, [rsp + disp32]` whose displacement is recorded as an
 * X64_PATCH_ALLOCA patch. The register-size path uses rax as a temporary. */
static void x64_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
                       u32 align) {
  X64NativeTarget* a = x64_of(t);
  MCEmitter* mc = t->mc;
  u32 rsz = loc_reg(size);
  u32 rd = loc_reg(dst);
  u32 al = align ? align : 16u;
  if (al < 16u) al = 16u;
  if (al > 16u) x64_panic(a, "alloca align > 16 not supported");
  if (size.kind == NATIVE_LOC_IMM) {
    u64 aligned = ((u64)size.v.imm + 15u) & ~(u64)15u;
    if (aligned == 0) aligned = 16;
    /* sub rsp, imm32. */
    emit_rex(mc, 1, 0, 0, X64_RSP);
    {
      u8 buf[2] = {X64_OPC_ALU_IMM32, modrm(3u, X64_ALU_SUB_SUB, X64_RSP)};
      mc->emit_bytes(mc, buf, 2);
    }
    /* NOTE(review): the rounded size is truncated to 32 bits here; sizes that
     * do not fit an imm32 are not diagnosed. */
    emit_u32le(mc, (u32)aligned);
  } else {
    /* rax = (size + 15) & ~15; sub rsp, rax.
*/
    emit_lea(mc, X64_RAX, rsz, 15);
    emit_rex(mc, 1, 0, 0, X64_RAX);
    {
      /* and rax, imm8: the byte 0xF0 is the low byte of the ~15 mask. */
      u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_AND, X64_RAX), 0xF0};
      mc->emit_bytes(mc, buf, 3);
    }
    emit_alu_rr(mc, 1, X64_OPC_ALU_SUB, X64_RSP, X64_RAX);
  }
  a->frame.has_alloca = 1;
  /* lea dst, [rsp + max_outgoing] — disp32 patched in func_end. */
  /* Grow the patch list first (doubling, starting at 8 entries) so the record
   * written below always has room. */
  if (a->npatches == a->patches_cap) {
    u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
    X64Patch* nb = arena_zarray(t->c->tu, X64Patch, cap);
    if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
    a->patches = nb;
    a->patches_cap = cap;
  }
  emit_rex(mc, 1, rd, 0, X64_RSP);
  {
    u8 op = X64_OPC_LEA;
    mc->emit_bytes(mc, &op, 1);
  }
  {
    /* mod=2 (disp32), r/m=4: an rsp base is only encodable through a SIB. */
    u8 mr = modrm(2u, rd & 7u, 4u);
    mc->emit_bytes(mc, &mr, 1);
  }
  {
    u8 s = sib(0u, 4u, X64_RSP);
    mc->emit_bytes(mc, &s, 1);
  }
  /* Record where the disp32 lives before emitting its placeholder. */
  a->patches[a->npatches].kind = X64_PATCH_ALLOCA;
  a->patches[a->npatches].pos = mc->pos(mc);
  a->npatches++;
  a->nalloca++;
  emit_u32le(mc, 0); /* placeholder disp32 */
}

/* ============================ TLS ============================ */

/* Win64 TLS Local-Exec (PE-COFF): TEB pointer -> _tls_index -> TLS block ->
 * lea &sym@SECREL. R11 is scratch.
 *
 * dst:    integer register that receives the variable's address.
 * sym:    the thread-local symbol.
 * addend: byte offset applied through the SECREL relocation.
 *
 * The `_tls_index` symbol is looked up in the object and created as an
 * undefined global if it does not exist yet. */
static void x64_tls_addr_of_win64(NativeTarget* t, NativeLoc dst, ObjSymId sym,
                                  i64 addend) {
  MCEmitter* mc = t->mc;
  u32 sec = mc->section_id;
  u32 rd = loc_reg(dst);
  /* (1) mov rd, gs:[0x58]. */
  {
    u8 gs = 0x65; /* GS segment-override prefix */
    mc->emit_bytes(mc, &gs, 1);
    emit_rex(mc, 1, rd, 0, 0);
    {
      u8 op = X64_OPC_MOV_R_RM;
      mc->emit_bytes(mc, &op, 1);
    }
    {
      u8 mr = modrm(0u, rd & 7u, 4u);
      mc->emit_bytes(mc, &mr, 1);
    }
    {
      /* SIB with no index and base=5 under mod=0: absolute disp32. */
      u8 s = sib(0u, 4u, 5u);
      mc->emit_bytes(mc, &s, 1);
    }
    emit_u32le(mc, 0x58u);
  }
  /* (2) mov r11d, [rip + _tls_index]. */
  {
    Sym idx_name = pool_intern_slice(t->c->global, SLICE_LIT("_tls_index"));
    ObjSymId idx_sym = obj_symbol_find(t->obj, idx_name);
    u8 rex_r, op, mr;
    u32 disp_pos;
    if (idx_sym == 0)
      idx_sym =
          obj_symbol(t->obj, idx_name, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
    rex_r = X64_REX_BASE | X64_REX_R; /* REX.R selects r11; no REX.W: 32-bit */
    mc->emit_bytes(mc, &rex_r, 1);
    op = X64_OPC_MOV_R_RM;
    mc->emit_bytes(mc, &op, 1);
    mr = modrm(0u, 3u, 5u); /* r11&7, rip-rel */
    mc->emit_bytes(mc, &mr, 1);
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0);
    mc->emit_reloc_at(mc, sec, disp_pos, R_PC32, idx_sym, -4, 1, 0);
  }
  /* (3) mov rd, [rd + r11*8]. */
  {
    u8 rex = X64_REX_BASE | X64_REX_W | X64_REX_X; /* REX.X: index is r11 */
    u8 op;
    if (rd & 8u) rex |= X64_REX_R | X64_REX_B; /* rd is both reg and base */
    mc->emit_bytes(mc, &rex, 1);
    op = X64_OPC_MOV_R_RM;
    mc->emit_bytes(mc, &op, 1);
    if ((rd & 7u) == 5u) {
      /* A base whose low bits are 5 (rbp/r13) cannot be encoded with mod=0 —
       * that form means "disp32, no base" — so use mod=1 with a zero disp8. */
      u8 mr = modrm(1u, rd & 7u, 4u);
      u8 s = sib(3u, 3u, rd & 7u);
      u8 zero = 0;
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
      mc->emit_bytes(mc, &zero, 1);
    } else {
      u8 mr = modrm(0u, rd & 7u, 4u);
      u8 s = sib(3u, 3u, rd & 7u);
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
    }
  }
  /* (4) lea rd, [rd + sym@SECREL]. */
  {
    u8 rex = X64_REX_BASE | X64_REX_W;
    u8 op;
    u32 disp_pos;
    if (rd & 8u) rex |= X64_REX_R | X64_REX_B;
    mc->emit_bytes(mc, &rex, 1);
    op = X64_OPC_LEA;
    mc->emit_bytes(mc, &op, 1);
    if ((rd & 7u) == 4u) {
      /* A base whose low bits are 4 (rsp/r12) must go through a SIB byte. */
      u8 mr = modrm(2u, rd & 7u, 4u);
      u8 s = sib(0u, 4u, rd & 7u);
      mc->emit_bytes(mc, &mr, 1);
      mc->emit_bytes(mc, &s, 1);
    } else {
      u8 mr = modrm(2u, rd & 7u, rd & 7u);
      mc->emit_bytes(mc, &mr, 1);
    }
    disp_pos = mc->pos(mc);
    emit_u32le(mc, 0);
    mc->emit_reloc_at(mc, sec, disp_pos, R_COFF_SECREL, sym, addend, 1, 0);
  }
}

/* x86-64 TLS Local-Exec: mov rd, fs:0; lea rd, [rd + sym@tpoff].
 *
 * When the object format's TLS model is OBJ_TLS_WINDOWS_TEB the request is
 * forwarded to x64_tls_addr_of_win64 instead. dst receives the address of
 * sym + addend; the offset is filled in through an R_X64_TPOFF32 relocation
 * on the lea displacement. */
static void x64_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
                            i64 addend) {
  MCEmitter* mc = t->mc;
  u32 sec = mc->section_id;
  u32 rd = loc_reg(dst);
  u32 disp_pos;
  if (obj_format_tls_model(t->c) == OBJ_TLS_WINDOWS_TEB) {
    x64_tls_addr_of_win64(t, dst, sym, addend);
    return;
  }
  /* mov rd, fs:[0]. */
  {
    u8 fs = 0x64; /* FS segment-override prefix */
    mc->emit_bytes(mc, &fs, 1);
    emit_rex(mc, 1, rd, 0, 0);
    {
      u8 op = X64_OPC_MOV_R_RM;
      mc->emit_bytes(mc, &op, 1);
    }
    {
      u8 mr = modrm(0u, rd & 7u, 4u);
      mc->emit_bytes(mc, &mr, 1);
    }
    {
      u8 s = sib(0u, 4u, 5u);
      mc->emit_bytes(mc, &s, 1);
    }
    emit_u32le(mc, 0);
  }
  /* lea rd, [rd + disp32@tpoff].
*/ 2985 emit_rex(mc, 1, rd, 0, rd); 2986 { 2987 u8 op = X64_OPC_LEA; 2988 mc->emit_bytes(mc, &op, 1); 2989 } 2990 if ((rd & 7u) == 4u) { 2991 u8 mr = modrm(2u, rd & 7u, 4u); 2992 u8 s = sib(0u, 4u, rd & 7u); 2993 mc->emit_bytes(mc, &mr, 1); 2994 mc->emit_bytes(mc, &s, 1); 2995 } else { 2996 u8 mr = modrm(2u, rd & 7u, rd & 7u); 2997 mc->emit_bytes(mc, &mr, 1); 2998 } 2999 disp_pos = mc->pos(mc); 3000 emit_u32le(mc, 0); 3001 mc->emit_reloc_at(mc, sec, disp_pos, R_X64_TPOFF32, sym, addend, 0, 0); 3002 } 3003 3004 /* ============================ atomics ============================ */ 3005 3006 static void emit_lock_prefix(MCEmitter* mc) { 3007 u8 b = 0xF0; 3008 mc->emit_bytes(mc, &b, 1); 3009 } 3010 static void emit_mfence(MCEmitter* mc) { 3011 u8 b[3] = {0x0F, 0xAE, 0xF0}; 3012 mc->emit_bytes(mc, b, 3); 3013 } 3014 3015 /* Resolve an atomic addr to a bare base register (r11) + disp 0. */ 3016 static u32 x64_atomic_base(X64NativeTarget* a, NativeAddr addr) { 3017 return x64_addr_to_base_reg(a, addr, X64_TMP_INT2); 3018 } 3019 3020 static void x64_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 3021 MemAccess mem, KitCgMemOrder mo) { 3022 X64NativeTarget* a = x64_of(t); 3023 u32 sz = mem.size ? mem.size : native_type_size(t, dst.type); 3024 u32 base; 3025 (void)mo; /* x86 plain MOV is an acquire load. */ 3026 base = x64_atomic_base(a, addr); 3027 emit_mov_load(t->mc, sz, 0, loc_reg(dst), base, 0); 3028 } 3029 3030 static void x64_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 3031 MemAccess mem, KitCgMemOrder mo) { 3032 X64NativeTarget* a = x64_of(t); 3033 MCEmitter* mc = t->mc; 3034 u32 sz = mem.size ? mem.size : native_type_size(t, src.type); 3035 int w = sz == 8u ? 1 : 0; 3036 u32 base = x64_atomic_base(a, addr); 3037 u32 sr = loc_reg(src); 3038 if (mo == KIT_CG_MO_SEQ_CST) { 3039 /* xchg [mem], r11 implicitly fences. Stage src in rax (r11 holds base). 
*/ 3040 if (sr != X64_RAX) emit_mov_rr(mc, w, X64_RAX, sr); 3041 emit_lock_prefix(mc); 3042 emit_rex(mc, w, X64_RAX, 0, base); 3043 { 3044 u8 op = 0x87; /* xchg r/m, r */ 3045 mc->emit_bytes(mc, &op, 1); 3046 } 3047 emit_mem_operand(mc, X64_RAX, base, 0); 3048 return; 3049 } 3050 emit_mov_store(mc, sz, sr, base, 0); 3051 } 3052 3053 static void x64_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst, 3054 NativeAddr addr, NativeLoc val, MemAccess mem, 3055 KitCgMemOrder mo) { 3056 X64NativeTarget* a = x64_of(t); 3057 MCEmitter* mc = t->mc; 3058 u32 sz = mem.size ? mem.size : native_type_size(t, dst.type); 3059 int w = sz == 8u ? 1 : 0; 3060 u32 base = x64_atomic_base(a, addr); 3061 u32 dr = loc_reg(dst); 3062 u32 vr = loc_reg(val); 3063 (void)mo; /* LOCK ops are full barriers. */ 3064 /* The rmw uses fixed rax (prior), rcx (new), rdx (val); the optimizer may 3065 * have materialized the address into one of them, so keep it out (r11 is the 3066 * int emit scratch, never an allocated operand). Stage before rdx is loaded. 3067 */ 3068 if (base == X64_RAX || base == X64_RCX || base == X64_RDX) { 3069 emit_mov_rr(mc, 1, X64_TMP_INT2, base); 3070 base = X64_TMP_INT2; 3071 } 3072 /* val staged in rdx (rax/rcx used by the cmpxchg loop). 
*/
  emit_mov_rr(mc, w, X64_RDX, vr);
  if (op == KIT_CG_ATOMIC_ADD || op == KIT_CG_ATOMIC_SUB) {
    /* SUB is an xadd of the negated operand. */
    if (op == KIT_CG_ATOMIC_SUB) emit_f7_rm(mc, w, X64_F7_SUB_NEG, X64_RDX);
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RDX, 0, base);
    {
      u8 op2[2] = {X64_OPC_TWOBYTE, 0xC1}; /* xadd */
      mc->emit_bytes(mc, op2, 2);
    }
    emit_mem_operand(mc, X64_RDX, base, 0);
    /* xadd leaves the previous memory value in rdx. */
    if (dr != X64_RDX) emit_mov_rr(mc, w, dr, X64_RDX);
    return;
  }
  if (op == KIT_CG_ATOMIC_XCHG) {
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RDX, 0, base);
    {
      u8 op2 = 0x87; /* xchg */
      mc->emit_bytes(mc, &op2, 1);
    }
    emit_mem_operand(mc, X64_RDX, base, 0);
    if (dr != X64_RDX) emit_mov_rr(mc, w, dr, X64_RDX);
    return;
  }
  /* AND/OR/XOR/NAND: cmpxchg retry loop. rax=prior, rcx=new, rdx=val. */
  {
    MCLabel retry = mc->label_new(mc);
    /* The initial load sits outside the loop: a failed cmpxchg reloads rax
     * with the current memory value itself. */
    emit_mov_load(mc, sz, 0, X64_RAX, base, 0);
    mc->label_place(mc, retry);
    emit_mov_rr(mc, w, X64_RCX, X64_RAX);
    switch (op) {
      case KIT_CG_ATOMIC_AND:
        emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_OR:
        emit_alu_rr(mc, w, X64_OPC_ALU_OR, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_XOR:
        emit_alu_rr(mc, w, X64_OPC_ALU_XOR, X64_RCX, X64_RDX);
        break;
      case KIT_CG_ATOMIC_NAND:
        /* nand = not (prior and val) */
        emit_alu_rr(mc, w, X64_OPC_ALU_AND, X64_RCX, X64_RDX);
        emit_f7_rm(mc, w, X64_F7_SUB_NOT, X64_RCX);
        break;
      default:
        x64_panic(a, "unsupported atomic rmw op");
    }
    emit_lock_prefix(mc);
    emit_rex(mc, w, X64_RCX, 0, base);
    {
      u8 op2[2] = {X64_OPC_TWOBYTE, 0xB1}; /* cmpxchg */
      mc->emit_bytes(mc, op2, 2);
    }
    emit_mem_operand(mc, X64_RCX, base, 0);
    emit_jcc_rel32(mc, X64_CC_NE, retry);
    if (dr != X64_RAX) emit_mov_rr(mc, w, dr, X64_RAX);
  }
}

/* Atomic compare-and-swap.
 *
 * prior:    receives the value found in memory.
 * ok:       receives 1 if the exchange happened, 0 otherwise (zero-extended).
 * addr:     memory operand.
 * expected / desired: comparison value and replacement value.
 * mem:      access width (falls back to the size of prior's type).
 * success / failure: memory orders; both ignored, a lock cmpxchg is always
 *           emitted.
 *
 * NOTE(review): `ok` is written (setcc + movzx) before rax is copied to
 * `prior`; if `ok` could ever be assigned rax the prior value would be lost —
 * confirm the allocator never does that. */
static void x64_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
                           NativeAddr addr, NativeLoc expected,
                           NativeLoc desired, MemAccess mem,
                           KitCgMemOrder success, KitCgMemOrder failure) {
  X64NativeTarget* a = x64_of(t);
  MCEmitter* mc = t->mc;
  u32 sz = mem.size ? mem.size : native_type_size(t, prior.type);
  int w = sz == 8u ? 1 : 0;
  u32 base = x64_atomic_base(a, addr);
  u32 rprior = loc_reg(prior);
  u32 rok = loc_reg(ok);
  u32 rexp = loc_reg(expected);
  u32 rdes = loc_reg(desired);
  (void)success;
  (void)failure;
  /* cmpxchg uses fixed rax (expected) and rcx (desired). The optimizer may have
   * materialized the address into either; keep it out of both (r11 is the int
   * emit scratch, never an allocated operand). */
  if (base == X64_RAX || base == X64_RCX) {
    emit_mov_rr(mc, 1, X64_TMP_INT2, base);
    base = X64_TMP_INT2;
  }
  /* Place expected -> rax and desired -> rcx as a parallel copy: the allocator
   * may have them in each other's target register (full swap) or desired in rax
   * (expected's target), either of which a naive two-move order would clobber.
   */
  if (rexp == X64_RCX && rdes == X64_RAX) {
    /* Swap rax <-> rcx (xchg needs no temp; base is not rax/rcx here). */
    emit_rex(mc, w, X64_RCX, 0, X64_RAX);
    {
      u8 xchg[2] = {0x87, modrm(3u, X64_RCX, X64_RAX)};
      mc->emit_bytes(mc, xchg, 2);
    }
  } else if (rdes == X64_RAX) {
    /* desired sits in rax; move it to rcx before rax is overwritten. */
    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
  } else {
    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
  }
  emit_lock_prefix(mc);
  emit_rex(mc, w, X64_RCX, 0, base);
  {
    u8 op2[2] = {X64_OPC_TWOBYTE, 0xB1}; /* cmpxchg [base], rcx */
    mc->emit_bytes(mc, op2, 2);
  }
  emit_mem_operand(mc, X64_RCX, base, 0);
  /* ZF is set on success; materialize it as a 0/1 integer. */
  emit_setcc(mc, X64_CC_E, rok);
  emit_movzx_r32_r8(mc, rok, rok);
  if (rprior != X64_RAX) emit_mov_rr(mc, w, rprior, X64_RAX);
}

/* Memory fence: emits MFENCE for KIT_CG_MO_SEQ_CST and nothing for any other
 * order. */
static void x64_fence(NativeTarget* t, KitCgMemOrder mo) {
  if (mo == KIT_CG_MO_SEQ_CST) emit_mfence(t->mc);
}

/* ============================ variadics ============================
 * SysV: __va_list_tag (gp_offset@0, fp_offset@4, overflow@8, reg_save@16). The
 * prologue filled the 176B reg-save area. Win64: va_list is a single pointer
 * to the next 8-byte slot in the home/overflow area; FP varargs are duplicated
 * into the matching GPR slot at the call site. `ap` addresses the va_list
 * object. */

/* Resolve a va_list address into `scratch`, materializing it there if it is not
 * already, so the va field-value scratch registers (rax / r10 / rdx) never
 * alias it. At -O1 the optimizer may place the va_list pointer in any register
 * — including those — and the va code would then clobber the pointer
 * mid-sequence. Returns `scratch`. */
static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
  u32 base = x64_addr_to_base_reg(a, ap, scratch);
  if (base != scratch) {
    emit_mov_rr(a->base.mc, 1, scratch, base);
    base = scratch;
  }
  return base;
}

/* add r/m, imm8 (group-1 /0) directly to a memory field — advances a va_list
 * offset/pointer in place without consuming a register. w selects 64- vs
 * 32-bit.
*/
static void x64_add_mem_imm(MCEmitter* mc, int w, u32 base, i32 disp, i8 imm) {
  u8 op = X64_OPC_ALU_IMM8;
  u8 b;
  emit_rex(mc, w, 0, 0, base);
  mc->emit_bytes(mc, &op, 1);
  emit_mem_operand(mc, X64_ALU_SUB_ADD, base, disp); /* modrm.reg = /0 (ADD) */
  b = (u8)imm;
  mc->emit_bytes(mc, &b, 1);
}

/* add r64, [base+disp] (0x03 /r). */
static void x64_add_reg_mem(MCEmitter* mc, u32 dst, u32 base, i32 disp) {
  u8 op = 0x03;
  emit_rex(mc, 1, dst, 0, base);
  mc->emit_bytes(mc, &op, 1);
  emit_mem_operand(mc, dst, base, disp);
}

/* va_start: initialise the va_list object addressed by `ap`.
 *
 * Panics if the current function is not variadic. With a shadow-space ABI
 * (Win64) the va_list is one pointer to the first unnamed argument slot;
 * otherwise the four SysV __va_list_tag fields are filled in. The va_list
 * address is held in r11 and rax carries each field value. */
static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
  NativeTarget* t = &a->base;
  MCEmitter* mc = t->mc;
  u32 ap_base;
  if (!a->is_variadic) x64_panic(a, "va_start: function not variadic");
  ap_base = x64_va_base(a, ap, X64_TMP_INT2);
  if (a->abi->shadow_space) {
    /* Win64: *ap = rbp + 16 + named_int*8 + named_stack. */
    u32 first = 16u + a->next_param_int * 8u + a->next_param_stack;
    emit_lea(mc, X64_RAX, X64_RBP, (i32)first);
    emit_mov_store(mc, 8, X64_RAX, ap_base, 0);
    return;
  }
  {
    X64NativeSlot* rs = x64_slot_get(a, a->reg_save_slot);
    /* gp_offset = next_param_int * 8 */
    x64_emit_load_imm(mc, 0, X64_RAX, (i64)(a->next_param_int * 8u));
    emit_mov_store(mc, 4, X64_RAX, ap_base, 0);
    /* fp_offset = 48 + next_param_fp * 16 */
    x64_emit_load_imm(mc, 0, X64_RAX, (i64)(48u + a->next_param_fp * 16u));
    emit_mov_store(mc, 4, X64_RAX, ap_base, 4);
    /* overflow_arg_area = rbp + 16 + next_param_stack */
    emit_lea(mc, X64_RAX, X64_RBP, (i32)(16u + a->next_param_stack));
    emit_mov_store(mc, 8, X64_RAX, ap_base, 8);
    /* reg_save_area = rbp - reg_save_slot.off */
    emit_lea(mc, X64_RAX, X64_RBP, -(i32)rs->off);
    emit_mov_store(mc, 8, X64_RAX, ap_base, 16);
  }
}

/* va_arg: fetch the next variadic argument of `type` from the va_list
 * addressed by `ap` into `dst` and advance the va_list.
 *
 * dst may be an integer or an FP register (decided by native_loc_is_fp). On
 * the SysV path the argument comes from the register-save area while the
 * relevant offset is below its limit (48 for GPRs, 176 for FP) and from the
 * overflow area afterwards. */
static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
                            KitCgTypeId type) {
  NativeTarget* t = &a->base;
  MCEmitter* mc = t->mc;
  u32 sz = native_type_size(t, type);
  int is_fp = native_loc_is_fp(dst);
  u32 dr = loc_reg(dst);
  u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
  /* GPR scratch for the offset/address arithmetic. For integer results the
   * destination is itself a (throwaway) scratch GPR — pass_native_emit fetches
   * va_arg into a scratch and copies it to the real destination afterward — so
   * we reuse `dr` and touch no allocable register at all. FP results keep their
   * value in an XMM register, so they borrow the reserved RAX emit scratch.
   * Either way only r11 (ap_base) and `gp` are used: the va_list fields are
   * advanced in memory (x64_add_mem_imm) and the reg-save base is folded in
   * with x64_add_reg_mem, so no third register is needed. */
  u32 gp = is_fp ? X64_RAX : dr;
  if (a->abi->shadow_space) {
    /* Win64: gp = *ap; load dr from [gp]; *ap += 8. */
    emit_mov_load(mc, 8, 0, gp, ap_base, 0);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    x64_add_mem_imm(mc, 1, ap_base, 0, 8);
    return;
  }
  {
    u32 offs_field = is_fp ? 4u : 0u; /* fp_offset@4 / gp_offset@0 */
    u32 max_offs = is_fp ? 176u : 48u;
    i8 stride = is_fp ? 16 : 8;
    MCLabel L_stack = mc->label_new(mc);
    MCLabel L_done = mc->label_new(mc);
    /* gp32 = ap[offs]; cmp gp32, max; jae L_stack. Use the imm8 form when the
     * threshold fits (gp_offset max 48) so the encoding is canonical and the
     * `cc -S | as` round-trip reproduces it; fp_offset max 176 needs imm32. */
    emit_mov_load(mc, 4, 0, gp, ap_base, (i32)offs_field);
    if (imm_fits_i8((i64)max_offs))
      emit_alu_imm8(mc, 0, X64_ALU_SUB_CMP, gp, (i8)max_offs);
    else
      emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, gp, (i32)max_offs);
    emit_jcc_rel32(mc, X64_CC_AE, L_stack);
    /* reg path: ap[offs] += stride; gp = reg_save_area(ap[16]) + offset; load.
     * (The memory increment leaves gp holding the old offset.) */
    x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
    x64_add_reg_mem(mc, gp, ap_base, 16);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    emit_jmp_rel32(mc, L_done);
    /* stack path: gp = ap[8] (overflow area); load; ap[8] += 8. */
    mc->label_place(mc, L_stack);
    emit_mov_load(mc, 8, 0, gp, ap_base, 8);
    if (is_fp)
      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
    else
      emit_mov_load(mc, sz, 0, dr, gp, 0);
    x64_add_mem_imm(mc, 1, ap_base, 8, 8);
    mc->label_place(mc, L_done);
  }
}

/* va_copy: copy the va_list object at src_ap to dst_ap — 8 bytes with a
 * shadow-space ABI (Win64, a single pointer), 24 bytes otherwise (the SysV
 * __va_list_tag). */
static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
                             NativeAddr src_ap) {
  NativeTarget* t = &a->base;
  MCEmitter* mc = t->mc;
  /* Resolve dst into r11, src into r10, and copy each qword through the fp emit
   * scratch xmm14. Uses only reserved emit scratch (r11/r10/xmm14), so the
   * optimizer's register choice for a va_list pointer can never be clobbered
   * and no allocable GPR (previously rdx) is consumed. */
  u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
  u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
  u32 n = a->abi->shadow_space ? 8u : 24u, i;
  for (i = 0; i < n; i += 8u) {
    emit_sse_load(mc, 0xF2, 0x10, X64_TMP_FP, src_base, (i32)i);   /* movsd */
    emit_sse_store(mc, 0xF2, 0x11, X64_TMP_FP, dst_base, (i32)i);  /* movsd */
  }
}

/* Wrap a va_list pointer held in an integer register as a register-based
 * NativeAddr (all other address fields zeroed). */
static NativeAddr x64_va_addr_from_ptr(NativeLoc ap_ptr) {
  NativeAddr addr;
  memset(&addr, 0, sizeof addr);
  addr.base_kind = NATIVE_ADDR_BASE_REG;
  addr.cls = NATIVE_REG_INT;
  addr.base.reg = ap_ptr.v.reg;
  addr.base_type = ap_ptr.type;
  return addr;
}

/* NativeTarget-facing va_* entry points: each converts its pointer
 * register(s) with x64_va_addr_from_ptr and forwards to the matching *_core
 * routine. va_end emits nothing. */
static void x64_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
  x64_va_start_core(x64_of(t), x64_va_addr_from_ptr(ap_ptr));
}
static void x64_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
                              KitCgTypeId type) {
  x64_va_arg_core(x64_of(t), dst, x64_va_addr_from_ptr(ap_ptr), type);
}
static void x64_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
  (void)t;
  (void)ap_ptr;
}
static void x64_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
  x64_va_copy_core(x64_of(t), x64_va_addr_from_ptr(dst),
                   x64_va_addr_from_ptr(src));
}

/* ============================ intrinsics ============================ */

/* popcnt dst, src (F3 [REX] 0F B8 /r); w selects the 64-bit form. */
static void emit_popcnt(MCEmitter* mc, int w, u32 dst, u32 src) {
  u8 p = 0xF3;
  mc->emit_bytes(mc, &p, 1);
  emit_rex(mc, w, dst, 0, src);
  {
    u8 op[2] = {X64_OPC_TWOBYTE, 0xB8};
    mc->emit_bytes(mc, op, 2);
  }
  emit_rm_reg(mc, dst, src);
}
/* Two-byte-opcode register/register instruction `<opcode2> dst, src`; w
 * selects the 64-bit form. */
static void emit_bs(MCEmitter* mc, int w, u8 opcode2, u32 dst, u32 src) {
  emit_rex(mc, w, dst, 0, src);
  {
    u8 op[2] = {X64_OPC_TWOBYTE, opcode2};
    mc->emit_bytes(mc, op, 2);
  }
  emit_rm_reg(mc, dst, src);
}
/* bswap reg ([REX] 0F C8+r); w selects the 64-bit form. */
static void emit_bswap(MCEmitter* mc, int w, u32 reg) {
  emit_rex(mc, w, 0, 0, reg);
  {
    u8 op[2] = {X64_OPC_TWOBYTE, (u8)(0xC8 + (reg & 7u))};
    mc->emit_bytes(mc, op, 2);
  }
}
static void
emit_rol16_imm8(MCEmitter* mc, u32 reg, u8 imm) { 3395 u8 p = X64_OPSIZE_PFX; 3396 mc->emit_bytes(mc, &p, 1); 3397 emit_rex(mc, 0, 0, 0, reg); 3398 { 3399 u8 buf[3] = {X64_OPC_SHIFT_IMM, modrm(3u, 0u, reg & 7u), imm}; 3400 mc->emit_bytes(mc, buf, 3); 3401 } 3402 } 3403 static void emit_ud2(MCEmitter* mc) { 3404 u8 b[2] = {0x0F, 0x0B}; 3405 mc->emit_bytes(mc, b, 2); 3406 } 3407 3408 static void emit_syscall(MCEmitter* mc) { 3409 u8 b[2] = {0x0F, 0x05}; 3410 mc->emit_bytes(mc, b, 2); 3411 } 3412 3413 static void x64_intrinsic(NativeTarget* t, IntrinKind kind, 3414 const NativeLoc* dsts, u32 ndst, 3415 const NativeLoc* args, u32 narg) { 3416 X64NativeTarget* a = x64_of(t); 3417 MCEmitter* mc = t->mc; 3418 (void)ndst; 3419 switch (kind) { 3420 case INTRIN_NONE: 3421 break; 3422 case INTRIN_EXPECT: 3423 case INTRIN_ASSUME_ALIGNED: 3424 if (args[0].kind == NATIVE_LOC_IMM) 3425 x64_emit_load_imm(mc, x64_is_64(t, dsts[0].type) ? 1 : 0, 3426 loc_reg(dsts[0]), args[0].v.imm); 3427 else 3428 x64_move(t, dsts[0], args[0]); 3429 return; 3430 case INTRIN_PREFETCH: 3431 return; 3432 case INTRIN_TRAP: 3433 emit_ud2(mc); 3434 return; 3435 case INTRIN_SYSCALL: 3436 if (ndst == 1u && narg >= 1u && narg <= 7u) { 3437 static const u32 syscall_regs[7] = { 3438 X64_RAX, X64_RDI, X64_RSI, X64_RDX, X64_R10, X64_R8, X64_R9}; 3439 X64ArgMove moves[7]; 3440 for (u32 i = 0; i < narg; ++i) { 3441 X64ArgMove* m = &moves[i]; 3442 memset(m, 0, sizeof *m); 3443 m->dst = native_loc_reg(dsts[0].type, NATIVE_REG_INT, 3444 syscall_regs[i]); 3445 m->src = args[i]; 3446 m->size = t->c->target.ptr_size; 3447 } 3448 x64_emit_reg_arg_moves(t, moves, narg, X64_TMP_INT2); 3449 emit_syscall(mc); 3450 x64_move(t, dsts[0], 3451 native_loc_reg(dsts[0].type, NATIVE_REG_INT, X64_RAX)); 3452 } 3453 return; 3454 case INTRIN_POPCOUNT: 3455 emit_popcnt(mc, x64_is_64(t, args[0].type) ? 1 : 0, loc_reg(dsts[0]), 3456 loc_reg(args[0])); 3457 return; 3458 case INTRIN_CTZ: 3459 emit_bs(mc, x64_is_64(t, args[0].type) ? 
1 : 0, 0xBC /* bsf */, 3460 loc_reg(dsts[0]), loc_reg(args[0])); 3461 return; 3462 case INTRIN_CLZ: { 3463 int w = x64_is_64(t, args[0].type) ? 1 : 0; 3464 u32 dr = loc_reg(dsts[0]); 3465 emit_bs(mc, w, 0xBD /* bsr */, dr, loc_reg(args[0])); 3466 /* clz = (bits-1) - bsr, computed via xor with bits-1. The mask (31/63) 3467 * fits in imm8, so use the compact 0x83 form to match the canonical 3468 * encoding (and the assembler's `cc -S | as` round-trip). */ 3469 emit_alu_imm8(mc, w, X64_ALU_SUB_XOR, dr, w ? 63 : 31); 3470 return; 3471 } 3472 case INTRIN_BSWAP: { 3473 u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); 3474 switch (width) { 3475 case 2: { 3476 u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); 3477 if (dr != sr) emit_mov_rr(mc, 0, dr, sr); 3478 emit_rol16_imm8(mc, dr, 8); 3479 return; 3480 } 3481 case 4: { 3482 u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); 3483 if (dr != sr) emit_mov_rr(mc, 0, dr, sr); 3484 emit_bswap(mc, 0, dr); 3485 return; 3486 } 3487 case 8: { 3488 u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); 3489 if (dr != sr) emit_mov_rr(mc, 1, dr, sr); 3490 emit_bswap(mc, 1, dr); 3491 return; 3492 } 3493 default: 3494 break; 3495 } 3496 return; 3497 } 3498 case INTRIN_SADD_OVERFLOW: 3499 case INTRIN_UADD_OVERFLOW: 3500 case INTRIN_SSUB_OVERFLOW: 3501 case INTRIN_USUB_OVERFLOW: { 3502 int w = x64_is_64(t, dsts[0].type) ? 1 : 0; 3503 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3504 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3505 u8 op = (kind == INTRIN_SADD_OVERFLOW || kind == INTRIN_UADD_OVERFLOW) 3506 ? X64_OPC_ALU_ADD 3507 : X64_OPC_ALU_SUB; 3508 u32 cc = (kind == INTRIN_UADD_OVERFLOW || kind == INTRIN_USUB_OVERFLOW) 3509 ? X64_CC_B 3510 : X64_CC_O; 3511 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 3512 emit_alu_rr(mc, w, op, rd, rb); 3513 emit_setcc(mc, cc, rovf); 3514 emit_movzx_r32_r8(mc, rovf, rovf); 3515 return; 3516 } 3517 case INTRIN_SMUL_OVERFLOW: { 3518 int w = x64_is_64(t, dsts[0].type) ? 
1 : 0; 3519 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3520 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3521 if (rd != ra) emit_mov_rr(mc, w, rd, ra); 3522 emit_imul_rr(mc, w, rd, rb); 3523 emit_setcc(mc, X64_CC_O, rovf); 3524 emit_movzx_r32_r8(mc, rovf, rovf); 3525 return; 3526 } 3527 case INTRIN_UMUL_OVERFLOW: { 3528 int w = x64_is_64(t, dsts[0].type) ? 1 : 0; 3529 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3530 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3531 if (rb == X64_RAX || rb == X64_RDX) { 3532 emit_mov_rr(mc, w, X64_R11, rb); 3533 rb = X64_R11; 3534 } 3535 if (ra != X64_RAX) emit_mov_rr(mc, w, X64_RAX, ra); 3536 emit_f7_rm(mc, w, X64_F7_SUB_MUL, rb); /* MUL: rdx:rax = rax * rb */ 3537 if (rd != X64_RAX) emit_mov_rr(mc, w, rd, X64_RAX); 3538 emit_setcc(mc, X64_CC_O, rovf); 3539 emit_movzx_r32_r8(mc, rovf, rovf); 3540 return; 3541 } 3542 case INTRIN_MEMMOVE: { 3543 u32 dr, sr, n, i; 3544 if (narg != 3u || args[0].kind != NATIVE_LOC_REG || 3545 args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) 3546 x64_panic(a, "unsupported memory intrinsic operands"); 3547 if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) 3548 x64_panic(a, "unsupported memory intrinsic size"); 3549 dr = loc_reg(args[0]); 3550 sr = loc_reg(args[1]); 3551 n = (u32)args[2].v.imm; 3552 i = n; /* copy high-to-low so an overlapping dst > src is safe */ 3553 while (i >= 8u) { 3554 i -= 8u; 3555 emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i); 3556 emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); 3557 } 3558 while (i >= 4u) { 3559 i -= 4u; 3560 emit_mov_load(mc, 4, 0, X64_RAX, sr, (i32)i); 3561 emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); 3562 } 3563 while (i >= 2u) { 3564 i -= 2u; 3565 emit_mov_load(mc, 2, 0, X64_RAX, sr, (i32)i); 3566 emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); 3567 } 3568 while (i >= 1u) { 3569 i -= 1u; 3570 emit_mov_load(mc, 1, 0, X64_RAX, sr, (i32)i); 3571 emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); 3572 } 3573 return; 3574 } 3575 
case INTRIN_CPU_NOP: { 3576 u8 b = 0x90; /* NOP */ 3577 mc->emit_bytes(mc, &b, 1); 3578 return; 3579 } 3580 case INTRIN_CPU_YIELD: { 3581 u8 b[2] = {0xF3, 0x90}; /* PAUSE */ 3582 mc->emit_bytes(mc, b, 2); 3583 return; 3584 } 3585 case INTRIN_DMB: 3586 case INTRIN_DSB: { 3587 u8 b[3] = {0x0F, 0xAE, 0xF0}; /* MFENCE: full-system memory barrier */ 3588 mc->emit_bytes(mc, b, 3); 3589 return; 3590 } 3591 case INTRIN_IRQ_DISABLE: { 3592 u8 b = 0xFA; /* CLI (privileged) */ 3593 mc->emit_bytes(mc, &b, 1); 3594 return; 3595 } 3596 case INTRIN_IRQ_ENABLE: { 3597 u8 b = 0xFB; /* STI (privileged) */ 3598 mc->emit_bytes(mc, &b, 1); 3599 return; 3600 } 3601 case INTRIN_FRAME_ADDRESS: 3602 case INTRIN_RETURN_ADDRESS: 3603 /* Walk the rbp frame-record chain. Every kit prologue keeps the rbp 3604 * record: [rbp] = caller's rbp, [rbp + 8] = return address pushed by the 3605 * `call`. The level is a compile-time constant, so the walk unrolls to 3606 * `level` dependent loads. */ 3607 if (ndst == 1u) { 3608 u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) 3609 ? (u32)args[0].v.imm 3610 : 0u; 3611 u32 rd = loc_reg(dsts[0]); 3612 emit_mov_rr(mc, 1, rd, X64_RBP); 3613 for (u32 i = 0; i < level; ++i) 3614 emit_mov_load(mc, 8, 0, rd, rd, 0); /* rd = *(rd) */ 3615 if (kind == INTRIN_RETURN_ADDRESS) 3616 emit_mov_load(mc, 8, 0, rd, rd, 8); /* rd = *(rd + 8) */ 3617 } 3618 return; 3619 default: 3620 break; 3621 } 3622 x64_panic(a, "unsupported compiler intrinsic"); 3623 } 3624 3625 /* ============================ inline asm ============================ */ 3626 3627 _Noreturn static void x64_asm_panic_at(Compiler* c, SrcLoc loc, 3628 const char* msg) { 3629 compiler_panic(c, loc, "x64 inline asm: %s", msg); 3630 } 3631 _Noreturn static void x64_asm_panic(NativeDirectTarget* d, const char* msg) { 3632 x64_asm_panic_at(d->base.c, d->loc, msg); 3633 } 3634 3635 /* constraint_body / constraint_early / match_index are shared 3636 * (cg/native_asm.h). 
*/ 3637 3638 static void x64_asm_bound_reg(Operand* out, KitCgTypeId type, 3639 NativeAllocClass cls, Reg reg) { 3640 memset(out, 0, sizeof *out); 3641 out->kind = X64_INLINE_OPK_REG; 3642 out->pad[0] = 3643 (cls == NATIVE_REG_FP) ? X64_INLINE_OPCLS_FP : X64_INLINE_OPCLS_INT; 3644 out->type = type; 3645 out->v.local = (CGLocal)reg; 3646 } 3647 static void x64_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) { 3648 memset(out, 0, sizeof *out); 3649 out->kind = OPK_INDIRECT; 3650 out->type = type; 3651 out->v.ind.base = (CGLocal)base; 3652 out->v.ind.index = CG_LOCAL_NONE; 3653 out->v.ind.ofs = 0; 3654 } 3655 3656 /* Parse a clobber register name into (class, reg). Returns 0 for cc/memory. 3657 * GPR names map to HW encoding via x64_register_hw_index; xmm names map via the 3658 * DWARF table (xmm0..15 = dwarf 17..32). */ 3659 static int x64_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, 3660 NativeAllocClass* cls_out, Reg* reg_out) { 3661 Slice s = pool_slice(c->global, name); 3662 char buf[16]; 3663 uint32_t idx; 3664 if (!s.s || !s.len) return 0; 3665 if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; 3666 if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0; 3667 if (s.len >= sizeof buf) x64_asm_panic_at(c, loc, "clobber name is too long"); 3668 memcpy(buf, s.s, s.len); 3669 buf[s.len] = '\0'; 3670 if (x64_register_hw_index(buf, &idx) == 0 && idx <= 15u) { 3671 *cls_out = NATIVE_REG_INT; 3672 *reg_out = (Reg)idx; 3673 return 1; 3674 } 3675 if (x64_register_index(buf, &idx) == 0 && idx >= 17u && idx <= 32u) { 3676 *cls_out = NATIVE_REG_FP; 3677 *reg_out = (Reg)(idx - 17u); 3678 return 1; 3679 } 3680 x64_asm_panic_at(c, loc, "unknown clobber register"); 3681 return 0; 3682 } 3683 3684 static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 3685 u32 nclob, u32* int_mask, u32* fp_mask) { 3686 u32 i; 3687 *int_mask = 0; 3688 *fp_mask = 0; 3689 for (i = 0; i < nclob; ++i) { 3690 NativeAllocClass cls; 3691 Reg reg; 
3692 if (!x64_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue; 3693 if (cls == NATIVE_REG_INT) 3694 *int_mask |= 1u << reg; 3695 else 3696 *fp_mask |= 1u << reg; 3697 } 3698 } 3699 3700 static int x64_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, 3701 const char* constraint, 3702 NativeAsmRegPin* pin) { 3703 NativeAsmRegPinStatus st = 3704 native_asm_resolve_pin(d->native, reg, constraint, pin); 3705 if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; 3706 if (st != NATIVE_ASM_REG_PIN_OK) 3707 x64_asm_panic(d, native_asm_pin_status_message(st)); 3708 return 1; 3709 } 3710 3711 /* Pick a free register from caller-saved allocable pools for an asm operand the 3712 * direct path self-allocates. */ 3713 static Reg x64_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, 3714 u32 allowed_mask, u32* used_int, u32* used_fp) { 3715 static const Reg int_pool[] = {X64_RDI, X64_RSI, X64_RDX, 3716 X64_RCX, X64_R8, X64_R9}; 3717 static const Reg fp_pool[] = { 3718 X64_XMM0, X64_XMM1, X64_XMM2, X64_XMM3, X64_XMM4, X64_XMM5, 3719 X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11}; 3720 const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool; 3721 u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0]) 3722 : (u32)(sizeof int_pool / sizeof int_pool[0]); 3723 u32* used = cls == NATIVE_REG_FP ? used_fp : used_int; 3724 u32 i; 3725 for (i = 0; i < n; ++i) { 3726 Reg r = pool[i]; 3727 if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue; 3728 if ((*used & (1u << r)) != 0) continue; 3729 *used |= 1u << r; 3730 return r; 3731 } 3732 x64_asm_panic(d, "out of registers for asm operands"); 3733 return REG_NONE; 3734 } 3735 3736 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. 
*/ 3737 static NativeAddr x64_direct_addr(NativeDirectTarget* d, Operand op) { 3738 NativeAddr addr; 3739 memset(&addr, 0, sizeof addr); 3740 switch ((OpKind)op.kind) { 3741 case OPK_LOCAL: 3742 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3743 addr.base.frame = d->locals[op.v.local - 1u].home; 3744 addr.base_type = op.type; 3745 return addr; 3746 case OPK_INDIRECT: 3747 addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; 3748 addr.base.frame = d->locals[op.v.ind.base - 1u].home; 3749 addr.cls = d->locals[op.v.ind.base - 1u].cls; 3750 addr.base_type = d->locals[op.v.ind.base - 1u].type; 3751 addr.offset = op.v.ind.ofs; 3752 return addr; 3753 default: 3754 x64_asm_panic(d, "operand is not addressable"); 3755 } 3756 } 3757 3758 static NativeAddr x64_direct_materialize_addr(NativeDirectTarget* d, 3759 Operand op) { 3760 X64NativeTarget* a = x64_of(d->native); 3761 NativeAddr addr = x64_direct_addr(d, op); 3762 if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 3763 NativeAddr load; 3764 memset(&load, 0, sizeof load); 3765 load.base_kind = NATIVE_ADDR_BASE_FRAME; 3766 load.base.frame = addr.base.frame; 3767 load.base_type = addr.base_type; 3768 emit_mov_load(a->base.mc, 8, 0, X64_TMP_INT2, X64_RBP, 3769 -(i32)x64_slot_get(a, addr.base.frame)->off); 3770 addr.base_kind = NATIVE_ADDR_BASE_REG; 3771 addr.base.reg = X64_TMP_INT2; 3772 } 3773 return addr; 3774 } 3775 3776 static void x64_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op, 3777 NativeLoc dst) { 3778 X64NativeTarget* a = x64_of(d->native); 3779 NativeAddr addr; 3780 memset(&addr, 0, sizeof addr); 3781 switch ((OpKind)op.kind) { 3782 case OPK_IMM: 3783 if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) 3784 x64_asm_panic(d, "floating-point immediate asm input is unsupported"); 3785 d->native->load_imm(d->native, dst, op.v.imm); 3786 return; 3787 case OPK_LOCAL: 3788 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3789 addr.base.frame = d->locals[op.v.local - 1u].home; 3790 addr.base_type = op.type; 3791 
x64_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3792 return; 3793 case OPK_GLOBAL: 3794 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3795 addr.base.global.sym = op.v.global.sym; 3796 addr.base.global.addend = op.v.global.addend; 3797 addr.base_type = op.type; 3798 d->native->load_addr(d->native, dst, addr); 3799 return; 3800 case OPK_INDIRECT: 3801 addr = x64_direct_materialize_addr(d, op); 3802 x64_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3803 return; 3804 } 3805 x64_asm_panic(d, "unsupported asm input operand"); 3806 } 3807 3808 static void x64_direct_load_address_to_reg(NativeDirectTarget* d, Operand op, 3809 NativeLoc dst) { 3810 d->native->load_addr(d->native, dst, x64_direct_addr(d, op)); 3811 } 3812 3813 static void x64_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op, 3814 NativeLoc src) { 3815 X64NativeTarget* a = x64_of(d->native); 3816 NativeAddr addr; 3817 memset(&addr, 0, sizeof addr); 3818 if (op.kind == OPK_LOCAL) { 3819 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3820 addr.base.frame = d->locals[op.v.local - 1u].home; 3821 addr.base_type = op.type; 3822 } else { 3823 addr = x64_direct_materialize_addr(d, op); 3824 } 3825 x64_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0)); 3826 } 3827 3828 /* Callee-saved registers an asm block clobbers must be saved around the block. 3829 */ 3830 typedef struct X64AsmSavedClobber { 3831 NativeFrameSlot slot; 3832 NativeAllocClass cls; 3833 Reg reg; 3834 KitCgTypeId type; 3835 } X64AsmSavedClobber; 3836 3837 static void x64_asm_save_one(X64NativeTarget* a, X64AsmSavedClobber* s) { 3838 NativeFrameSlotDesc desc; 3839 NativeAddr addr; 3840 memset(&desc, 0, sizeof desc); 3841 desc.type = s->type; 3842 desc.size = s->cls == NATIVE_REG_FP ? 
16u : 8u; 3843 desc.align = desc.size; 3844 desc.kind = NATIVE_FRAME_SLOT_SAVE; 3845 s->slot = a->base.frame_slot(&a->base, &desc); 3846 memset(&addr, 0, sizeof addr); 3847 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3848 addr.base.frame = s->slot; 3849 addr.base_type = s->type; 3850 x64_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr, 3851 native_mem_for_type(&a->base, s->type, desc.size)); 3852 } 3853 static void x64_asm_restore_one(X64NativeTarget* a, 3854 const X64AsmSavedClobber* s) { 3855 NativeAddr addr; 3856 memset(&addr, 0, sizeof addr); 3857 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3858 addr.base.frame = s->slot; 3859 addr.base_type = s->type; 3860 x64_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr, 3861 native_mem_for_type(&a->base, s->type, 3862 s->cls == NATIVE_REG_FP ? 16u : 8u)); 3863 } 3864 3865 /* SysV callee-saved: int rbx,r12-r15; no fp. Win64 adds rdi,rsi + xmm6-15. */ 3866 static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r) { 3867 if (r == X64_RBP) return 0; /* prologue head handles rbp */ 3868 return (abi->cs_int_mask & (1ull << r)) != 0; 3869 } 3870 static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r) { 3871 return (abi->cs_fp_mask & (1ull << r)) != 0; 3872 } 3873 3874 static X64AsmSavedClobber* x64_asm_save_callee_clobbers(X64NativeTarget* a, 3875 u32 int_mask, 3876 u32 fp_mask, 3877 u32* nsaved_out) { 3878 X64AsmSavedClobber* saved = 3879 arena_zarray(a->base.c->tu, X64AsmSavedClobber, 32u); 3880 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 3881 KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64); 3882 u32 n = 0; 3883 Reg r; 3884 for (r = 0; r <= 15u; ++r) { 3885 if ((int_mask & (1u << r)) == 0 || !x64_reg_is_callee_int(a->abi, r)) 3886 continue; 3887 saved[n].cls = NATIVE_REG_INT; 3888 saved[n].reg = r; 3889 saved[n].type = i64; 3890 x64_asm_save_one(a, &saved[n++]); 3891 } 3892 for (r = 0; r <= 15u; ++r) { 3893 if ((fp_mask & (1u << r)) == 0 || !x64_reg_is_callee_fp(a->abi, r)) 3894 continue; 
3895 saved[n].cls = NATIVE_REG_FP; 3896 saved[n].reg = r; 3897 saved[n].type = f64; 3898 x64_asm_save_one(a, &saved[n++]); 3899 } 3900 *nsaved_out = n; 3901 return saved; 3902 } 3903 3904 /* ---- NativeTarget (optimizer) asm hook ---- */ 3905 3906 static NativeAddr x64_asm_loc_to_addr(X64NativeTarget* a, SrcLoc loc, 3907 NativeLoc src) { 3908 NativeAddr addr; 3909 memset(&addr, 0, sizeof addr); 3910 addr.base_type = src.type; 3911 switch ((NativeLocKind)src.kind) { 3912 case NATIVE_LOC_FRAME: 3913 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3914 addr.base.frame = src.v.frame; 3915 return addr; 3916 case NATIVE_LOC_ADDR: 3917 return src.v.addr; 3918 case NATIVE_LOC_GLOBAL: 3919 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3920 addr.base.global.sym = src.v.global.sym; 3921 addr.base.global.addend = src.v.global.addend; 3922 return addr; 3923 case NATIVE_LOC_REG: 3924 addr.base_kind = NATIVE_ADDR_BASE_REG; 3925 addr.cls = NATIVE_REG_INT; 3926 addr.base.reg = src.v.reg; 3927 return addr; 3928 default: 3929 x64_asm_panic_at(a->base.c, loc, "unsupported memory asm operand"); 3930 } 3931 } 3932 3933 static Reg x64_asm_native_mem_base(X64NativeTarget* a, SrcLoc loc, 3934 NativeLoc src, u32* ntmp) { 3935 NativeAddr addr = x64_asm_loc_to_addr(a, loc, src); 3936 Reg dst; 3937 if (addr.base_kind == NATIVE_ADDR_BASE_REG && addr.offset == 0 && 3938 addr.index_kind == NATIVE_ADDR_INDEX_NONE) { 3939 if ((addr.base.reg & 0xfu) != X64_TMP_INT && 3940 (addr.base.reg & 0xfu) != X64_TMP_INT2) 3941 return (Reg)(addr.base.reg & 0xfu); 3942 } 3943 if (*ntmp >= 2u) 3944 x64_asm_panic_at(a->base.c, loc, "too many memory asm operands"); 3945 dst = (*ntmp == 0u) ? 
(Reg)X64_TMP_INT : (Reg)X64_TMP_INT2; 3946 (*ntmp)++; 3947 x64_addr_to_base_reg(a, addr, dst); 3948 return dst; 3949 } 3950 3951 static void x64_asm_load_loc_to_reg(X64NativeTarget* a, SrcLoc loc, 3952 NativeLoc src, NativeLoc dst) { 3953 NativeTarget* t = &a->base; 3954 NativeAllocClass cls = (NativeAllocClass)dst.cls; 3955 if (src.kind == NATIVE_LOC_REG) { 3956 if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src); 3957 return; 3958 } 3959 if (src.kind == NATIVE_LOC_IMM) { 3960 if (cls != NATIVE_REG_INT) 3961 x64_asm_panic_at(t->c, loc, 3962 "floating-point immediate asm input is unsupported"); 3963 t->load_imm(t, dst, src.v.imm); 3964 return; 3965 } 3966 x64_emit_mem(a, 1, dst, x64_asm_loc_to_addr(a, loc, src), 3967 native_mem_for_type(t, dst.type, native_type_size(t, dst.type))); 3968 } 3969 3970 static void x64_asm_store_reg_to_loc(X64NativeTarget* a, SrcLoc loc, 3971 NativeLoc dst, NativeLoc src) { 3972 NativeTarget* t = &a->base; 3973 if (dst.kind == NATIVE_LOC_REG) { 3974 if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src); 3975 return; 3976 } 3977 x64_emit_mem(a, 0, src, x64_asm_loc_to_addr(a, loc, dst), 3978 native_mem_for_type(t, src.type, native_type_size(t, src.type))); 3979 } 3980 3981 static void x64_asm_bind_native(X64NativeTarget* a, SrcLoc loc, Operand* out, 3982 const char* constraint, KitCgTypeId type, 3983 NativeLoc src, u32* ntmp) { 3984 const char* body = native_asm_constraint_body(constraint); 3985 NativeAsmConstraintInfo info; 3986 if (native_asm_constraint_reg_info(&a->base, constraint, &info)) { 3987 if (src.kind != NATIVE_LOC_REG) 3988 x64_asm_panic_at(a->base.c, loc, 3989 "register asm operand not in a register"); 3990 if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg) 3991 x64_asm_panic_at(a->base.c, loc, 3992 "fixed-register asm operand in wrong register"); 3993 if (info.allowed_mask && 3994 ((Reg)src.v.reg >= 32 || 3995 (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0)) 3996 
x64_asm_panic_at(a->base.c, loc, 3997 "register asm operand violates constraint register set"); 3998 x64_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg); 3999 } else if (body[0] == 'i') { 4000 if (src.kind != NATIVE_LOC_IMM) 4001 x64_asm_panic_at(a->base.c, loc, 4002 "immediate asm operand is not immediate"); 4003 memset(out, 0, sizeof *out); 4004 out->kind = OPK_IMM; 4005 out->type = type; 4006 out->v.imm = src.v.imm; 4007 } else if (body[0] == 'm') { 4008 x64_asm_bound_mem(out, type, x64_asm_native_mem_base(a, loc, src, ntmp)); 4009 } else { 4010 x64_asm_panic_at(a->base.c, loc, "unsupported asm constraint"); 4011 } 4012 } 4013 4014 static void x64_asm_block_native(NativeTarget* t, const char* tmpl, 4015 const AsmConstraint* outs, u32 nout, 4016 NativeLoc* out_locs, const AsmConstraint* ins, 4017 u32 nin, const NativeLoc* in_locs, 4018 const Sym* clobbers, u32 nclob) { 4019 X64NativeTarget* a = x64_of(t); 4020 Compiler* c = t->c; 4021 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 4022 Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; 4023 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 4024 u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL; 4025 u32 ntmp = 0, i; 4026 X64Asm* asmh; 4027 4028 for (i = 0; i < nout; ++i) { 4029 KitCgTypeId type = outs[i].type ? 
outs[i].type : out_locs[i].type; 4030 NativeLoc outloc = out_locs[i]; 4031 NativeAsmPinnedLoc pinned = 4032 native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc); 4033 if (pinned.has_pin) { 4034 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 4035 x64_asm_panic_at(c, loc, 4036 native_asm_pin_status_message(pinned.pin_status)); 4037 if (pinned.wrong_reg) 4038 x64_asm_panic_at(c, loc, "hard-register asm operand in wrong register"); 4039 outloc = pinned.loc; 4040 if (pinned.needs_stage) { 4041 staged_outs[i] = 1u; 4042 if (outs[i].dir == KIT_CG_ASM_INOUT) 4043 x64_asm_load_loc_to_reg(a, loc, out_locs[i], outloc); 4044 } 4045 } 4046 x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, 4047 &ntmp); 4048 } 4049 for (i = 0; i < nin; ++i) { 4050 const char* body = native_asm_constraint_body(ins[i].str); 4051 int matched = native_asm_match_index(body); 4052 KitCgTypeId type; 4053 NativeLoc inloc; 4054 if (matched >= 0) { 4055 if ((u32)matched >= nout) 4056 x64_asm_panic_at(c, loc, "matching constraint out of range"); 4057 bound_ins[i] = bound_outs[matched]; 4058 continue; 4059 } 4060 type = ins[i].type ? ins[i].type : in_locs[i].type; 4061 inloc = in_locs[i]; 4062 { 4063 NativeAsmPinnedLoc pinned = 4064 native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc); 4065 if (pinned.has_pin) { 4066 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 4067 x64_asm_panic_at(c, loc, 4068 native_asm_pin_status_message(pinned.pin_status)); 4069 if (pinned.wrong_reg) 4070 x64_asm_panic_at(c, loc, 4071 "hard-register asm operand in wrong register"); 4072 inloc = pinned.loc; 4073 if (pinned.needs_stage) 4074 x64_asm_load_loc_to_reg(a, loc, in_locs[i], inloc); 4075 } else if ((body[0] == 'r') && inloc.kind != NATIVE_LOC_REG) { 4076 Reg r; 4077 if (ntmp >= 2u) x64_asm_panic_at(c, loc, "too many memory asm operands"); 4078 r = (ntmp == 0u) ? 
(Reg)X64_TMP_INT : (Reg)X64_TMP_INT2; 4079 ntmp++; 4080 inloc = native_loc_reg(type, NATIVE_REG_INT, r); 4081 x64_emit_mem(a, 1, inloc, x64_asm_loc_to_addr(a, loc, in_locs[i]), 4082 native_mem_for_type(t, type, native_type_size(t, type))); 4083 } 4084 } 4085 x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); 4086 } 4087 4088 /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber 4089 * masks and x64_known_callee_saves folded the callee-saved ones into the 4090 * function's saved set, so the prologue/epilogue already preserve them. */ 4091 asmh = x64_asm_open(c); 4092 x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 4093 nclob); 4094 x64_asm_run_template(asmh, t->mc, tmpl); 4095 x64_asm_close(asmh); 4096 4097 for (i = 0; i < nout; ++i) { 4098 NativeAllocClass cls; 4099 NativeLoc src; 4100 if (!staged_outs || !staged_outs[i]) continue; 4101 if (bound_outs[i].kind != X64_INLINE_OPK_REG) continue; 4102 cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP ? NATIVE_REG_FP 4103 : NATIVE_REG_INT; 4104 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 4105 x64_asm_store_reg_to_loc(a, loc, out_locs[i], src); 4106 } 4107 } 4108 4109 /* file_scope_asm + finalize are shared (cg/native_asm.h). */ 4110 4111 static void x64_trap(NativeTarget* t) { emit_ud2(t->mc); } 4112 static void x64_set_loc(NativeTarget* t, SrcLoc loc) { 4113 x64_of(t)->loc = loc; 4114 if (t->mc->set_loc) t->mc->set_loc(t->mc, loc); 4115 } 4116 4117 /* Physical registers each x86-64 instruction's encoding clobbers as a side 4118 * effect, so the optimizer keeps values live across them out of those registers 4119 * (the backend is then free to use them). 
idiv/div write rax (quotient) and rdx 4120 * (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use 4121 * rax/rcx/rdx; an FP va_arg borrows rax for the gp/fp offset (an integer va_arg 4122 * does the offset arithmetic in its own destination register, so it clobbers 4123 * nothing). */ 4124 static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, 4125 u32 mask[NATIVE_CALL_PLAN_CLASSES]) { 4126 (void)t; 4127 mask[0] = mask[1] = mask[2] = 0; 4128 switch ((NativeMachineOpKind)op->kind) { 4129 case NATIVE_MOP_BINOP: 4130 switch ((BinOp)op->binop) { 4131 case BO_SDIV: 4132 case BO_UDIV: 4133 case BO_SREM: 4134 case BO_UREM: 4135 mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RDX); 4136 return 1; 4137 case BO_SHL: 4138 case BO_SHR_S: 4139 case BO_SHR_U: 4140 if (op->second_is_reg) { 4141 mask[NATIVE_REG_INT] = (1u << X64_RCX); 4142 return 1; 4143 } 4144 return 0; 4145 default: 4146 return 0; 4147 } 4148 case NATIVE_MOP_ATOMIC_CAS: 4149 case NATIVE_MOP_ATOMIC_RMW: 4150 mask[NATIVE_REG_INT] = 4151 (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX); 4152 return 1; 4153 case NATIVE_MOP_VA_START: 4154 /* x64_va_start_core materializes the va_list field values through RAX 4155 * (the ap pointer itself lands in the reserved r11 scratch). RAX is the 4156 * return register, so the allocator may otherwise keep a live value there 4157 * across the op. */ 4158 mask[NATIVE_REG_INT] = (1u << X64_RAX); 4159 return 1; 4160 case NATIVE_MOP_VA_ARG: 4161 if (!op->result_is_fp) return 0; 4162 mask[NATIVE_REG_INT] = (1u << X64_RAX); 4163 return 1; 4164 case NATIVE_MOP_INTRINSIC: 4165 /* The unsigned multiply-overflow intrinsic emits a one-operand MUL, whose 4166 * rdx:rax product clobbers both registers. The signed variant uses a 4167 * two-operand IMUL (no fixed-register clobber). Linux syscall writes rax 4168 * and the CPU instruction itself clobbers rcx/r11; the kernel ABI treats 4169 * the integer caller-saved syscall registers as volatile. 
*/ 4170 if ((IntrinKind)op->intrin == INTRIN_UMUL_OVERFLOW) { 4171 mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RDX); 4172 return 1; 4173 } 4174 if ((IntrinKind)op->intrin == INTRIN_SYSCALL) { 4175 mask[NATIVE_REG_INT] = 4176 (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) | 4177 (1u << X64_RSI) | (1u << X64_RDI) | (1u << X64_R8) | 4178 (1u << X64_R9) | (1u << X64_R10) | (1u << X64_R11); 4179 return 1; 4180 } 4181 return 0; 4182 default: 4183 return 0; 4184 } 4185 } 4186 4187 /* ============================ construction ============================ */ 4188 4189 NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, 4190 MCEmitter* mc) { 4191 X64NativeTarget* a = arena_znew(c->tu, X64NativeTarget); 4192 NativeTarget* t; 4193 if (!a) return NULL; 4194 t = &a->base; 4195 t->c = c; 4196 t->obj = obj; 4197 t->mc = mc; 4198 native_frame_init(&a->frame, c); 4199 t->regs = &x64_reg_info; 4200 t->class_for_type = native_class_for_type_fp_le8; 4201 t->imm_legal = x64_imm_legal; 4202 t->addr_legal = x64_addr_legal; 4203 t->machine_op_clobbers = x64_machine_op_clobbers; 4204 t->func_begin = x64_func_begin; 4205 t->func_begin_known_frame = x64_func_begin_known_frame; 4206 t->bind_params_end = x64_bind_params_end; 4207 t->note_frame_state = NULL; 4208 /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved 4209 * set; x64_func_begin_known_frame derives the records from the masks. 
*/ 4210 t->reserve_callee_saves = x64_reserve_callee_saves; 4211 t->caller_saved_mask = x64_live_caller_saved_mask; 4212 t->callee_saved_mask = x64_live_callee_saved_mask; 4213 t->signature_stack_bytes = x64_signature_stack_bytes; 4214 t->call_stack_bytes = x64_call_stack_bytes; 4215 t->has_store_zero_reg = 0; 4216 t->func_end = x64_func_end; 4217 t->frame_slot = x64_frame_slot; 4218 t->frame_slot_debug_loc = x64_frame_slot_debug_loc; 4219 t->bind_param = x64_bind_native_param; 4220 t->label_new = x64_label_new; 4221 t->label_place = x64_label_place; 4222 t->jump = x64_jump; 4223 t->cmp_branch = x64_cmp_branch; 4224 t->indirect_branch = x64_indirect_branch; 4225 t->load_label_addr = x64_load_label_addr; 4226 t->move = x64_move; 4227 t->load_imm = x64_load_imm; 4228 t->load_const = x64_load_const; 4229 t->load_addr = x64_load_addr; 4230 t->load = x64_load; 4231 t->store = x64_store; 4232 t->tls_addr_of = x64_tls_addr_of; 4233 t->copy_bytes = x64_copy_bytes; 4234 t->set_bytes = x64_set_bytes; 4235 t->bitfield_load = x64_bitfield_load; 4236 t->bitfield_store = x64_bitfield_store; 4237 t->binop = x64_binop; 4238 t->unop = x64_unop; 4239 t->cmp = x64_cmp; 4240 t->convert = x64_convert; 4241 t->alloca_ = x64_alloca; 4242 t->spill = x64_spill; 4243 t->reload = x64_reload; 4244 t->plan_call = x64_plan_call; 4245 t->emit_call = x64_emit_call; 4246 t->plan_ret = x64_plan_ret; 4247 t->ret = x64_ret; 4248 t->atomic_load = x64_atomic_load; 4249 t->atomic_store = x64_atomic_store; 4250 t->atomic_rmw = x64_atomic_rmw; 4251 t->atomic_cas = x64_atomic_cas; 4252 t->fence = x64_fence; 4253 t->va_start_ = x64_va_start_native; 4254 t->va_arg_ = x64_va_arg_native; 4255 t->va_end_ = x64_va_end_native; 4256 t->va_copy_ = x64_va_copy_native; 4257 t->intrinsic = x64_intrinsic; 4258 t->asm_block = x64_asm_block_native; 4259 t->file_scope_asm = native_file_scope_asm; 4260 t->trap = x64_trap; 4261 t->set_loc = x64_set_loc; 4262 t->finalize = native_finalize; 4263 return t; 4264 } 4265 4266 /* 
============================ NativeOps (-O0) ============================ */ 4267 4268 static void x64_bind_param(NativeDirectTarget* d, const CGParamDesc* p, 4269 CGLocal local, NativeDirectLocal* l) { 4270 NativeLoc dst; 4271 (void)local; 4272 memset(&dst, 0, sizeof dst); 4273 dst.kind = NATIVE_LOC_FRAME; 4274 dst.type = p->type; 4275 dst.v.frame = l->home; 4276 x64_bind_native_param(d->native, p, dst); 4277 } 4278 4279 /* A sibling call is realizable when its outgoing stack-argument area fits the 4280 * window the caller itself received. Register-only calls always qualify. */ 4281 static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { 4282 X64NativeTarget* a = x64_of(d->native); 4283 NativeCallDesc nd; 4284 NativeLoc* args = NULL; 4285 NativeLoc* results = NULL; 4286 u32 i, stack; 4287 if (a->frame.ncallee_saves) 4288 return "x64 tail call: callee-saved registers in use"; 4289 memset(&nd, 0, sizeof nd); 4290 u32 nresults = call->result != CG_LOCAL_NONE ? 1u : 0u; 4291 if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); 4292 if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults); 4293 for (i = 0; i < call->nargs; ++i) { 4294 args[i].kind = NATIVE_LOC_FRAME; 4295 args[i].type = d->locals[call->args[i] - 1u].type; 4296 args[i].cls = d->locals[call->args[i] - 1u].cls; 4297 args[i].v.frame = d->locals[call->args[i] - 1u].home; 4298 } 4299 if (nresults) { 4300 results[0].kind = NATIVE_LOC_FRAME; 4301 results[0].type = d->locals[call->result - 1u].type; 4302 results[0].cls = d->locals[call->result - 1u].cls; 4303 results[0].v.frame = d->locals[call->result - 1u].home; 4304 } 4305 nd.fn_type = call->fn_type; 4306 nd.args = args; 4307 nd.results = results; 4308 nd.nargs = call->nargs; 4309 nd.nresults = nresults; 4310 stack = x64_call_stack_size(d->native, &nd); 4311 /* x64_call_stack_size includes the shadow-space prefix; the caller's incoming 4312 * window has the same prefix, so compare against 
incoming_stack_size + it. */ 4313 if (stack > a->incoming_stack_size + a->abi->shadow_space) 4314 return "x64 tail call: stack argument area too small"; 4315 return NULL; 4316 } 4317 4318 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`, 4319 * returning a register-based NativeAddr. */ 4320 static NativeAddr x64_direct_pointer_addr(NativeDirectTarget* d, Operand op) { 4321 X64NativeTarget* a = x64_of(d->native); 4322 NativeAddr addr; 4323 memset(&addr, 0, sizeof addr); 4324 if (op.kind == OPK_LOCAL) { 4325 emit_mov_load(a->base.mc, 8, 0, X64_R11, X64_RBP, 4326 -(i32)x64_slot_get(a, d->locals[op.v.local - 1u].home)->off); 4327 addr.base_kind = NATIVE_ADDR_BASE_REG; 4328 addr.base.reg = X64_R11; 4329 addr.base_type = op.type; 4330 return addr; 4331 } 4332 return x64_direct_materialize_addr(d, op); 4333 } 4334 4335 static NativeAddr x64_direct_va_base(NativeDirectTarget* d, Operand ap_addr, 4336 Reg reg) { 4337 NativeLoc dst = 4338 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); 4339 NativeAddr addr; 4340 d->native->load_addr(d->native, dst, x64_direct_pointer_addr(d, ap_addr)); 4341 memset(&addr, 0, sizeof addr); 4342 addr.base_kind = NATIVE_ADDR_BASE_REG; 4343 addr.cls = NATIVE_REG_INT; 4344 addr.base.reg = reg; 4345 addr.base_type = builtin_id(KIT_CG_BUILTIN_I64); 4346 return addr; 4347 } 4348 4349 static void x64_va_start_(NativeDirectTarget* d, Operand ap_addr) { 4350 /* Hold the va_list base in R11, not RAX: x64_va_start_core materializes the 4351 * gp/fp_offset and overflow/reg-save-area field values through RAX, which 4352 * would otherwise clobber the base before the field stores. 
*/ 4353 x64_va_start_core(x64_of(d->native), x64_direct_va_base(d, ap_addr, X64_R11)); 4354 } 4355 static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, 4356 KitCgTypeId type) { 4357 X64NativeTarget* a = x64_of(d->native); 4358 int is_fp = cg_type_is_float(d->base.c, type); 4359 NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, 4360 is_fp ? X64_TMP_FP : (Reg)X64_RDX); 4361 NativeAddr dst_addr; 4362 /* Base in R11: the core advances/loads through R11 plus one GPR scratch (the 4363 * integer result reg itself, or RAX for FP results), so R11 must not be RAX. 4364 */ 4365 x64_va_arg_core(a, res, x64_direct_va_base(d, ap_addr, X64_R11), type); 4366 dst_addr = x64_direct_addr(d, dst); 4367 if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 4368 emit_mov_load(a->base.mc, 8, 0, X64_R11, X64_RBP, 4369 -(i32)x64_slot_get(a, dst_addr.base.frame)->off); 4370 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 4371 dst_addr.base.reg = X64_R11; 4372 } 4373 x64_emit_mem( 4374 a, 0, res, dst_addr, 4375 native_mem_for_type(d->native, type, native_type_size(d->native, type))); 4376 } 4377 static void x64_va_end_(NativeDirectTarget* d, Operand ap_addr) { 4378 (void)d; 4379 (void)ap_addr; 4380 } 4381 static void x64_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) { 4382 X64NativeTarget* a = x64_of(d->native); 4383 NativeAddr src_ap = x64_direct_va_base(d, src, X64_RAX); 4384 NativeAddr dst_ap = x64_direct_va_base(d, dst, X64_R11); 4385 x64_va_copy_core(a, dst_ap, src_ap); 4386 } 4387 4388 static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, 4389 const AsmConstraint* outs, u32 nout, 4390 Operand* out_ops, const AsmConstraint* ins, 4391 u32 nin, const Operand* in_ops, 4392 const Sym* clobbers, u32 nclob, 4393 u32 clobber_abi_sets) { 4394 X64NativeTarget* a = x64_of(d->native); 4395 Compiler* c = d->base.c; 4396 Operand* bound_outs = nout ? 
arena_zarray(c->tu, Operand, nout) : NULL; 4397 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 4398 u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; 4399 X64AsmSavedClobber* saved; 4400 u32 nsaved, i; 4401 X64Asm* asmh; 4402 4403 x64_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp); 4404 native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); 4405 clob_int |= abi_int; 4406 clob_fp |= abi_fp; 4407 /* Reserve emit scratch (r10,r11), driver scratch (r8,r9), rax (reserved; 4408 * only self-allocated here when explicitly pinned), sp/bp, and clobbers. */ 4409 used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) | 4410 (1u << X64_RBP) | (1u << X64_R8) | (1u << X64_R9) | 4411 (1u << X64_R10); 4412 used_fp = clob_fp | (1u << X64_XMM4) | (1u << X64_XMM5) | 4413 (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15); 4414 4415 for (i = 0; i < nout; ++i) { 4416 const char* body = native_asm_constraint_body(outs[i].str); 4417 KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; 4418 NativeAsmRegPin pin; 4419 if (x64_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { 4420 /* GNU local register variable: pin to the named hard register. */ 4421 if (pin.cls == NATIVE_REG_FP) { 4422 used_fp |= 1u << pin.reg; 4423 clob_fp |= 1u << pin.reg; 4424 } else { 4425 used_int |= 1u << pin.reg; 4426 clob_int |= 1u << pin.reg; 4427 } 4428 x64_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); 4429 } else { 4430 NativeAsmConstraintInfo info; 4431 if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) { 4432 Reg reg = info.fixed_reg != REG_NONE 4433 ? 
info.fixed_reg 4434 : x64_asm_alloc_reg(d, info.cls, info.allowed_mask, 4435 &used_int, &used_fp); 4436 if (info.cls == NATIVE_REG_FP) { 4437 used_fp |= 1u << reg; 4438 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4439 } else { 4440 used_int |= 1u << reg; 4441 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4442 } 4443 x64_asm_bound_reg(&bound_outs[i], type, info.cls, reg); 4444 } else if (body[0] == 'm') { 4445 Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4446 x64_asm_bound_mem(&bound_outs[i], type, reg); 4447 } else { 4448 x64_asm_panic(d, "unsupported output constraint"); 4449 } 4450 } 4451 } 4452 4453 for (i = 0; i < nin; ++i) { 4454 const char* body = native_asm_constraint_body(ins[i].str); 4455 int matched = native_asm_match_index(body); 4456 KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; 4457 if (matched >= 0) { 4458 if ((u32)matched >= nout) 4459 x64_asm_panic(d, "matching constraint out of range"); 4460 if (native_asm_constraint_early(outs[matched].str)) 4461 x64_asm_panic(d, "matching input names early-clobber output"); 4462 if (bound_outs[matched].kind != X64_INLINE_OPK_REG) 4463 x64_asm_panic(d, "matching constraint requires register output"); 4464 bound_ins[i] = bound_outs[matched]; 4465 continue; 4466 } 4467 NativeAsmRegPin pin; 4468 if (x64_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { 4469 /* GNU local register variable: pin to the named hard register. */ 4470 if (pin.cls == NATIVE_REG_FP) { 4471 used_fp |= 1u << pin.reg; 4472 clob_fp |= 1u << pin.reg; 4473 } else { 4474 used_int |= 1u << pin.reg; 4475 clob_int |= 1u << pin.reg; 4476 } 4477 x64_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); 4478 } else { 4479 NativeAsmConstraintInfo info; 4480 if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) { 4481 Reg reg = info.fixed_reg != REG_NONE 4482 ? 
info.fixed_reg 4483 : x64_asm_alloc_reg(d, info.cls, info.allowed_mask, 4484 &used_int, &used_fp); 4485 if (info.cls == NATIVE_REG_FP) { 4486 used_fp |= 1u << reg; 4487 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4488 } else { 4489 used_int |= 1u << reg; 4490 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4491 } 4492 x64_asm_bound_reg(&bound_ins[i], type, info.cls, reg); 4493 } else if (body[0] == 'i') { 4494 if (in_ops[i].kind != OPK_IMM) 4495 x64_asm_panic(d, "immediate constraint requires immediate operand"); 4496 bound_ins[i] = in_ops[i]; 4497 } else if (body[0] == 'm') { 4498 Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4499 x64_asm_bound_mem(&bound_ins[i], type, reg); 4500 } else { 4501 x64_asm_panic(d, "unsupported input constraint"); 4502 } 4503 } 4504 } 4505 4506 saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); 4507 for (i = 0; i < nout; ++i) { 4508 if (bound_outs[i].kind == X64_INLINE_OPK_REG) { 4509 NativeAllocClass cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP 4510 ? NATIVE_REG_FP 4511 : NATIVE_REG_INT; 4512 if (outs[i].dir == KIT_CG_ASM_INOUT) { 4513 x64_direct_load_operand_to_reg( 4514 d, out_ops[i], 4515 native_loc_reg(bound_outs[i].type, cls, 4516 (Reg)bound_outs[i].v.local)); 4517 } 4518 } else if (bound_outs[i].kind == OPK_INDIRECT) { 4519 NativeLoc loc = 4520 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 4521 (Reg)bound_outs[i].v.ind.base); 4522 x64_direct_load_address_to_reg(d, out_ops[i], loc); 4523 } 4524 } 4525 for (i = 0; i < nin; ++i) { 4526 if (bound_ins[i].kind == X64_INLINE_OPK_REG) { 4527 NativeAllocClass cls = bound_ins[i].pad[0] == X64_INLINE_OPCLS_FP 4528 ? 
NATIVE_REG_FP
4529                                     : NATIVE_REG_INT;
           /* NOTE(review): everything down to the closing brace at listing line
            * 4556 is the tail of a function whose header is above this chunk
            * (presumably x64_direct_asm_block, the name wired into .asm_block
            * below -- confirm against the full file). */
           /* Register-bound input: load the source operand into the register
            * picked for it. The class comes from pad[0] and the register
            * number from v.local. */
4530       x64_direct_load_operand_to_reg(
4531           d, in_ops[i],
4532           native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
4533     } else if (bound_ins[i].kind == OPK_INDIRECT) {
           /* Indirect-bound input: the base register (v.ind.base) is described
            * as an I64 integer location and handed to
            * x64_direct_load_address_to_reg together with the operand. */
4534       NativeLoc loc =
4535           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
4536                          (Reg)bound_ins[i].v.ind.base);
4537       x64_direct_load_address_to_reg(d, in_ops[i], loc);
4538     }
4539   }
         /* Open an assembler handle, hand it the output/input bindings and the
          * clobber list, run the template against d->native->mc, then close
          * the handle. */
4540   asmh = x64_asm_open(c);
4541   x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
4542                   nclob);
4543   x64_asm_run_template(asmh, d->native->mc, tmpl);
4544   x64_asm_close(asmh);
4545
         /* Copy results back: only outputs bound to a register are stored to
          * their destination operand; every other binding kind is skipped by
          * the `continue`. */
4546   for (i = 0; i < nout; ++i) {
4547     NativeAllocClass cls;
4548     NativeLoc src;
4549     if (bound_outs[i].kind != X64_INLINE_OPK_REG) continue;
4550     cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP ? NATIVE_REG_FP
4551                                                       : NATIVE_REG_INT;
4552     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
4553     x64_direct_store_reg_to_operand(d, out_ops[i], src);
4554   }
         /* Undo the saves recorded by x64_asm_save_callee_clobbers, walking
          * saved[] from the last entry to the first. `i` is unsigned, so the
          * loop counts nsaved..1 and indexes i - 1 instead of testing i >= 0.
          * This runs after the output stores above; presumably so an output
          * living in a saved register is read before it is overwritten --
          * TODO confirm. */
4555   for (i = nsaved; i > 0; --i) x64_asm_restore_one(a, &saved[i - 1u]);
4556 }
4557
/* x64 hook table returned by x64_native_direct_ops(). It wires the parameter
 * binding, tail-call check, va_start/va_arg/va_end/va_copy and inline-asm
 * callbacks to the x64_* implementations named here. Any NativeOps member not
 * named in the initializer is zero-initialized (C designated-initializer
 * rule). */
4558 static const NativeOps x64_direct_ops = {
4559     .bind_param = x64_bind_param,
4560     .tail_call_unrealizable_reason = x64_no_tail,
4561     .va_start_ = x64_va_start_,
4562     .va_arg_ = x64_va_arg_,
4563     .va_end_ = x64_va_end_,
4564     .va_copy_ = x64_va_copy_,
4565     .asm_block = x64_direct_asm_block,
4566 };
4567
/* Returns a pointer to the file-static, read-only x64_direct_ops table. The
 * table itself is `static`; this function is how other translation units
 * reach it. */
4568 const NativeOps* x64_native_direct_ops(void) { return &x64_direct_ops; }