kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

emit.c (22546B)


      1 /* arch/x64/emit.c — byte-level emit helpers, function prologue/epilogue.
      2  *
      3  * Covers: REX, ModR/M, SIB, all emit_* primitives, x_func_begin,
      4  * x_func_end, and the per-ABI int_args tables exposed via X64ABIRegs. */
      5 
      6 #include "arch/x64/emit.h"
      7 
      8 #include <string.h>
      9 
     10 #include "arch/mc.h"
     11 #include "arch/x64/isa.h"
     12 #include "core/slice.h"
     13 #include "obj/obj.h"
     14 
     15 /* ============================================================
     16  * Shared constant tables. */
     17 
     18 static const u32 g_int_arg_regs_sysv[6] = {X64_RDI, X64_RSI, X64_RDX,
     19                                            X64_RCX, X64_R8,  X64_R9};
     20 static const u32 g_int_arg_regs_win64[4] = {X64_RCX, X64_RDX, X64_R8, X64_R9};
     21 
     22 static const X64ABIRegs g_x64_abi_sysv = {
     23     .int_args = g_int_arg_regs_sysv,
     24     .n_int_args = 6,
     25     .n_fp_args = 8,
     26     .slot_shared_int_fp = 0,
     27     .shadow_space = 0,
     28     .emit_sysv_vararg_save = 1,
     29     .vararg_fp_dup_to_gpr = 0,
     30     .cs_int_mask = (1ull << X64_RBX) | (1ull << X64_RBP) | (1ull << X64_R12) |
     31                    (1ull << X64_R13) | (1ull << X64_R14) | (1ull << X64_R15),
     32     .cs_fp_mask = 0,
     33 };
     34 
     35 static const X64ABIRegs g_x64_abi_win64 = {
     36     .int_args = g_int_arg_regs_win64,
     37     .n_int_args = 4,
     38     .n_fp_args = 4,
     39     .slot_shared_int_fp = 1,
     40     .shadow_space = X64_WIN64_SHADOW_SPACE,
     41     .emit_sysv_vararg_save = 0,
     42     .vararg_fp_dup_to_gpr = 1,
     43     .cs_int_mask = (1ull << X64_RBX) | (1ull << X64_RBP) | (1ull << X64_R12) |
     44                    (1ull << X64_R13) | (1ull << X64_R14) | (1ull << X64_R15) |
     45                    (1ull << X64_RDI) | (1ull << X64_RSI),
     46     .cs_fp_mask = (1ull << X64_XMM6) | (1ull << X64_XMM7) | (1ull << X64_XMM8) |
     47                   (1ull << (X64_XMM0 + 9)) | (1ull << (X64_XMM0 + 10)) |
     48                   (1ull << (X64_XMM0 + 11)) | (1ull << (X64_XMM0 + 12)) |
     49                   (1ull << (X64_XMM0 + 13)) | (1ull << (X64_XMM0 + 14)) |
     50                   (1ull << X64_XMM15),
     51 };
     52 
     53 const X64ABIRegs* x64_abi_for_os(KitOSKind os) {
     54   return (os == KIT_OS_WINDOWS) ? &g_x64_abi_win64 : &g_x64_abi_sysv;
     55 }
     56 
     57 /* ============================================================
     58  * Byte-level emit helpers.
     59  *
     60  * x64 instructions are variable length: optional legacy prefix(es),
     61  * optional REX, 1-3 byte opcode, ModR/M, optional SIB, optional
     62  * displacement, optional immediate. Helpers below build sequences
     63  * into the active MCEmitter section, recording one Debug row per
     64  * instruction-start. */
     65 void emit1(MCEmitter* mc, u8 b) {
     66   u32 ofs = obj_pos(mc->obj, mc->section_id);
     67   mc->emit_bytes(mc, &b, 1);
     68   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
     69 }
     70 void emit_u32le(MCEmitter* mc, u32 v) {
     71   u8 b[4];
     72   b[0] = (u8)v;
     73   b[1] = (u8)(v >> 8);
     74   b[2] = (u8)(v >> 16);
     75   b[3] = (u8)(v >> 24);
     76   mc->emit_bytes(mc, b, 4);
     77 }
     78 static u8 make_rex(int w, u32 reg, u32 index, u32 rm) {
     79   u8 r = 0;
     80   if (w) r |= X64_REX_W;
     81   if (reg & 8) r |= X64_REX_R;
     82   if (index & 8) r |= X64_REX_X;
     83   if (rm & 8) r |= X64_REX_B;
     84   return r ? (u8)(X64_REX_BASE | r) : 0;
     85 }
     86 void emit_rex(MCEmitter* mc, int w, u32 reg, u32 index, u32 rm) {
     87   u8 r = make_rex(w, reg, index, rm);
     88   if (r) mc->emit_bytes(mc, &r, 1);
     89 }
     90 /* Force REX (even REX=0x40) — required for byte-reg encodings that
     91  * promote SIL/DIL/etc. */
     92 void emit_rex_force(MCEmitter* mc, int w, u32 reg, u32 index, u32 rm) {
     93   u8 r = (u8)(X64_REX_BASE | (w ? X64_REX_W : 0) | ((reg & 8) ? X64_REX_R : 0) |
     94               ((index & 8) ? X64_REX_X : 0) | ((rm & 8) ? X64_REX_B : 0));
     95   mc->emit_bytes(mc, &r, 1);
     96 }
     97 
     98 u8 modrm(u32 mod, u32 reg, u32 rm) {
     99   return (u8)(((mod & 3u) << 6) | ((reg & 7u) << 3) | (rm & 7u));
    100 }
    101 u8 sib(u32 scale, u32 index, u32 base) {
    102   return (u8)(((scale & 3u) << 6) | ((index & 7u) << 3) | (base & 7u));
    103 }
    104 
    105 static u32 disp_mod(u32 base, i32 disp) {
    106   if (disp == 0 && (base & 7u) != 5u) return 0u; /* [base] */
    107   if (disp >= -128 && disp <= 127) return 1u;    /* [base + disp8] */
    108   return 2u;                                     /* [base + disp32] */
    109 }
    110 
    111 void emit_mem_operand(MCEmitter* mc, u32 reg, u32 base, i32 disp) {
    112   u32 m = disp_mod(base, disp);
    113   if ((base & 7u) == 4u) {
    114     /* SIB byte required: index=4 (none), base=base. */
    115     u8 mr = modrm(m, reg, 4u);
    116     mc->emit_bytes(mc, &mr, 1);
    117     u8 s = sib(0, 4u, base);
    118     mc->emit_bytes(mc, &s, 1);
    119   } else {
    120     u8 mr = modrm(m, reg, base);
    121     mc->emit_bytes(mc, &mr, 1);
    122   }
    123   if (m == 1u) {
    124     u8 d = (u8)(i8)disp;
    125     mc->emit_bytes(mc, &d, 1);
    126   } else if (m == 2u) {
    127     emit_u32le(mc, (u32)disp);
    128   }
    129 }
    130 void emit_rm_reg(MCEmitter* mc, u32 reg, u32 rm) {
    131   u8 mr = modrm(3u, reg, rm);
    132   mc->emit_bytes(mc, &mr, 1);
    133 }
    134 
    135 /* ---- specific instruction emitters ---- */
    136 
    137 /* mov rd, rs (64-bit if w, else 32-bit). */
    138 void emit_mov_rr(MCEmitter* mc, int w, u32 dst, u32 src) {
    139   u32 ofs = obj_pos(mc->obj, mc->section_id);
    140   u8 buf[16];
    141   u32 n = x64_alu_rr_pack(
    142       (X64AluRR){.w = w, .op = X64_OPC_MOV_RM_R, .dst = dst, .src = src}, buf);
    143   mc->emit_bytes(mc, buf, n);
    144   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    145 }
    146 
    147 /* mov reg, [base + disp]; size 1/2/4/8. */
    148 void emit_mov_load(MCEmitter* mc, u32 size, int signed_ext, u32 dst, u32 base,
    149                    i32 disp) {
    150   u32 ofs = obj_pos(mc->obj, mc->section_id);
    151   u8 buf[16];
    152   u32 n = 0;
    153   if (size == 8) {
    154     n = x64_mov_rm_load_pack((X64MovRMLoad){.w = 1,
    155                                             .opc0 = X64_OPC_MOV_R_RM,
    156                                             .dst = dst,
    157                                             .base = base,
    158                                             .disp = disp},
    159                              buf);
    160   } else if (size == 4) {
    161     n = x64_mov_rm_load_pack((X64MovRMLoad){.w = 0,
    162                                             .opc0 = X64_OPC_MOV_R_RM,
    163                                             .dst = dst,
    164                                             .base = base,
    165                                             .disp = disp},
    166                              buf);
    167   } else if (size == 2) {
    168     n = x64_mov_rm_load_pack(
    169         (X64MovRMLoad){.w = 0,
    170                        .opc1 = signed_ext ? X64_OPC_MOVSX_W : X64_OPC_MOVZX_W,
    171                        .dst = dst,
    172                        .base = base,
    173                        .disp = disp},
    174         buf);
    175   } else if (size == 1) {
    176     n = x64_mov_rm_load_pack(
    177         (X64MovRMLoad){.w = 0,
    178                        .opc1 = signed_ext ? X64_OPC_MOVSX_B : X64_OPC_MOVZX_B,
    179                        .dst = dst,
    180                        .base = base,
    181                        .disp = disp},
    182         buf);
    183   }
    184   if (n) mc->emit_bytes(mc, buf, n);
    185   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    186 }
    187 
    188 /* mov [base + disp], src; size 1/2/4/8. */
    189 void emit_mov_store(MCEmitter* mc, u32 size, u32 src, u32 base, i32 disp) {
    190   u32 ofs = obj_pos(mc->obj, mc->section_id);
    191   u8 buf[16];
    192   u32 n = 0;
    193   if (size == 8) {
    194     n = x64_alu_rm_pack((X64AluRM){.w = 1,
    195                                    .op = X64_OPC_MOV_RM_R,
    196                                    .src = src,
    197                                    .base = base,
    198                                    .disp = disp},
    199                         buf);
    200   } else if (size == 4) {
    201     n = x64_alu_rm_pack((X64AluRM){.w = 0,
    202                                    .op = X64_OPC_MOV_RM_R,
    203                                    .src = src,
    204                                    .base = base,
    205                                    .disp = disp},
    206                         buf);
    207   } else if (size == 2) {
    208     n = x64_alu_rm_pack((X64AluRM){.prefix = X64_OPSIZE_PFX,
    209                                    .w = 0,
    210                                    .op = X64_OPC_MOV_RM_R,
    211                                    .src = src,
    212                                    .base = base,
    213                                    .disp = disp},
    214                         buf);
    215   } else if (size == 1) {
    216     /* Force REX so SIL/DIL/etc are addressable as byte regs. */
    217     n = x64_alu_rm_pack((X64AluRM){.w = 0,
    218                                    .op = X64_OPC_MOV_RM_R8,
    219                                    .force_rex = 1,
    220                                    .src = src,
    221                                    .base = base,
    222                                    .disp = disp},
    223                         buf);
    224   }
    225   if (n) mc->emit_bytes(mc, buf, n);
    226   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    227 }
    228 
    229 void emit_lea(MCEmitter* mc, u32 dst, u32 base, i32 disp) {
    230   u32 ofs = obj_pos(mc->obj, mc->section_id);
    231   u8 buf[16];
    232   u32 n = x64_mov_rm_load_pack(
    233       (X64MovRMLoad){
    234           .w = 1, .opc0 = X64_OPC_LEA, .dst = dst, .base = base, .disp = disp},
    235       buf);
    236   mc->emit_bytes(mc, buf, n);
    237   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    238 }
    239 
    240 /* Common low-level emit for `[base + index<<log2_scale + disp]` GPR/SSE
    241  * memory operands. Builds REX with index, opcode(s), and SIB by hand
    242  * (the existing `*_pack` helpers route through `x64_pack_mem`, which
    243  * forces SIB index = 4 (none)). */
    244 static void emit_mem_idx_op(MCEmitter* mc, u8 prefix, int w, int force_rex,
    245                             u8 opc0, u8 opc1, u32 reg, u32 base, u32 index,
    246                             u32 log2_scale, i32 disp) {
    247   u8 buf[16];
    248   u32 n = 0;
    249   if (prefix) buf[n++] = prefix;
    250   if (force_rex)
    251     n += x64_pack_rex_force(buf + n, w, reg, index, base);
    252   else
    253     n += x64_pack_rex(buf + n, w, reg, index, base);
    254   if (opc1) {
    255     buf[n++] = X64_OPC_TWOBYTE;
    256     buf[n++] = opc1;
    257   } else {
    258     buf[n++] = opc0;
    259   }
    260   n += x64_pack_mem_sib(buf + n, reg, base, index, log2_scale, disp);
    261   mc->emit_bytes(mc, buf, n);
    262 }
    263 
    264 /* mov reg, [base + index<<log2_scale + disp]; size 1/2/4/8. */
    265 void emit_mov_load_idx(MCEmitter* mc, u32 size, int signed_ext, u32 dst,
    266                        u32 base, u32 index, u32 log2_scale, i32 disp) {
    267   if (index == REG_NONE) {
    268     emit_mov_load(mc, size, signed_ext, dst, base, disp);
    269     return;
    270   }
    271   u32 ofs = obj_pos(mc->obj, mc->section_id);
    272   if (size == 8) {
    273     emit_mem_idx_op(mc, 0, 1, 0, X64_OPC_MOV_R_RM, 0, dst, base, index & 0xFu,
    274                     log2_scale, disp);
    275   } else if (size == 4) {
    276     emit_mem_idx_op(mc, 0, 0, 0, X64_OPC_MOV_R_RM, 0, dst, base, index & 0xFu,
    277                     log2_scale, disp);
    278   } else if (size == 2) {
    279     emit_mem_idx_op(mc, 0, 0, 0, 0,
    280                     signed_ext ? X64_OPC_MOVSX_W : X64_OPC_MOVZX_W, dst, base,
    281                     index & 0xFu, log2_scale, disp);
    282   } else if (size == 1) {
    283     emit_mem_idx_op(mc, 0, 0, 0, 0,
    284                     signed_ext ? X64_OPC_MOVSX_B : X64_OPC_MOVZX_B, dst, base,
    285                     index & 0xFu, log2_scale, disp);
    286   }
    287   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    288 }
    289 
    290 /* mov [base + index<<log2_scale + disp], src; size 1/2/4/8. */
    291 void emit_mov_store_idx(MCEmitter* mc, u32 size, u32 src, u32 base, u32 index,
    292                         u32 log2_scale, i32 disp) {
    293   if (index == REG_NONE) {
    294     emit_mov_store(mc, size, src, base, disp);
    295     return;
    296   }
    297   u32 ofs = obj_pos(mc->obj, mc->section_id);
    298   if (size == 8) {
    299     emit_mem_idx_op(mc, 0, 1, 0, X64_OPC_MOV_RM_R, 0, src, base, index & 0xFu,
    300                     log2_scale, disp);
    301   } else if (size == 4) {
    302     emit_mem_idx_op(mc, 0, 0, 0, X64_OPC_MOV_RM_R, 0, src, base, index & 0xFu,
    303                     log2_scale, disp);
    304   } else if (size == 2) {
    305     emit_mem_idx_op(mc, X64_OPSIZE_PFX, 0, 0, X64_OPC_MOV_RM_R, 0, src, base,
    306                     index & 0xFu, log2_scale, disp);
    307   } else if (size == 1) {
    308     /* Force REX so SIL/DIL/etc are addressable as byte regs. */
    309     emit_mem_idx_op(mc, 0, 0, 1, X64_OPC_MOV_RM_R8, 0, src, base, index & 0xFu,
    310                     log2_scale, disp);
    311   }
    312   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    313 }
    314 
    315 /* movabs reg, imm64 (REX.W + B8+r imm64) for is64; mov r32, imm32 (B8+r
    316  * imm32) for !is64. Both 10/5 bytes. */
    317 void x64_emit_load_imm(MCEmitter* mc, int is64, u32 dst, i64 imm) {
    318   u32 ofs = obj_pos(mc->obj, mc->section_id);
    319   u8 buf[16];
    320   u32 n =
    321       x64_mov_ri_pack((X64MovRI){.is64 = is64, .dst = dst, .imm = imm}, buf);
    322   mc->emit_bytes(mc, buf, n);
    323   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    324 }
    325 
    326 /* Two-operand ALU r/m, r. op picks ADD(01)/SUB(29)/AND(21)/OR(09)/XOR(31)/
    327  * CMP(39)/MOV(89)/TEST(85). */
    328 void emit_alu_rr(MCEmitter* mc, int w, u8 op, u32 dst, u32 src) {
    329   u32 ofs = obj_pos(mc->obj, mc->section_id);
    330   u8 buf[16];
    331   u32 n = x64_alu_rr_pack((X64AluRR){.w = w, .op = op, .dst = dst, .src = src},
    332                           buf);
    333   mc->emit_bytes(mc, buf, n);
    334   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    335 }
    336 
    337 void emit_imul_rr(MCEmitter* mc, int w, u32 dst, u32 src) {
    338   u32 ofs = obj_pos(mc->obj, mc->section_id);
    339   u8 buf[16];
    340   u32 n = x64_imul_rr_pack((X64ImulRR){.w = w, .dst = dst, .src = src}, buf);
    341   mc->emit_bytes(mc, buf, n);
    342   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    343 }
    344 
    345 void emit_f7_rm(MCEmitter* mc, int w, u32 sub, u32 reg) {
    346   u32 ofs = obj_pos(mc->obj, mc->section_id);
    347   u8 buf[16];
    348   u32 n = x64_f7_rm_pack((X64F7RM){.w = w, .sub = sub, .reg = reg}, buf);
    349   mc->emit_bytes(mc, buf, n);
    350   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    351 }
    352 
    353 void emit_shift_cl(MCEmitter* mc, int w, u32 sub, u32 reg) {
    354   u32 ofs = obj_pos(mc->obj, mc->section_id);
    355   u8 buf[16];
    356   u32 n = x64_shift_cl_pack((X64ShiftCL){.w = w, .sub = sub, .reg = reg}, buf);
    357   mc->emit_bytes(mc, buf, n);
    358   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    359 }
    360 
    361 /* Shift r/m by imm8: opcode C1 /sub ib. sub: SHL=4, SHR=5, SAR=7. */
    362 void emit_shift_imm(MCEmitter* mc, int w, u32 sub, u32 reg, u8 imm) {
    363   u32 ofs = obj_pos(mc->obj, mc->section_id);
    364   u8 buf[16];
    365   u32 n = x64_shift_imm_pack(
    366       (X64ShiftImm){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf);
    367   mc->emit_bytes(mc, buf, n);
    368   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    369 }
    370 
    371 void emit_cqo_or_cdq(MCEmitter* mc, int w) {
    372   u8 buf[16];
    373   u32 n = x64_nullary_pack((X64Nullary){.w = w, .opc0 = X64_OPC_CDQ_CQO}, buf);
    374   mc->emit_bytes(mc, buf, n);
    375 }
    376 
    377 void emit_xor_self(MCEmitter* mc, int w, u32 r) {
    378   emit_alu_rr(mc, w, X64_OPC_ALU_XOR, r, r);
    379 }
    380 
    381 /* cmp r/m, imm8 (0x83 /7). */
    382 void emit_cmp_imm8(MCEmitter* mc, int w, u32 reg, i8 imm) {
    383   u32 ofs = obj_pos(mc->obj, mc->section_id);
    384   u8 buf[16];
    385   u32 n = x64_alu_imm8_pack(
    386       (X64AluRmImm8){.w = w, .sub = X64_ALU_SUB_CMP, .reg = reg, .imm = imm},
    387       buf);
    388   mc->emit_bytes(mc, buf, n);
    389   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    390 }
    391 
    392 /* ALU r/m, imm8: opcode 0x83 /sub ib (sign-extended). sub: ADD=0,
    393  * OR=1, ADC=2, SBB=3, AND=4, SUB=5, XOR=6, CMP=7. */
    394 void emit_alu_imm8(MCEmitter* mc, int w, u32 sub, u32 reg, i8 imm) {
    395   u32 ofs = obj_pos(mc->obj, mc->section_id);
    396   u8 buf[16];
    397   u32 n = x64_alu_imm8_pack(
    398       (X64AluRmImm8){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf);
    399   mc->emit_bytes(mc, buf, n);
    400   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    401 }
    402 
    403 /* ALU r/m, imm32: opcode 0x81 /sub id (sign-extended for w=1). */
    404 void emit_alu_imm32(MCEmitter* mc, int w, u32 sub, u32 reg, i32 imm) {
    405   u32 ofs = obj_pos(mc->obj, mc->section_id);
    406   u8 buf[16];
    407   u32 n = x64_alu_imm32_pack(
    408       (X64AluRmImm32){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf);
    409   mc->emit_bytes(mc, buf, n);
    410   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    411 }
    412 
    413 /* IMUL r, r/m, imm: 0x6B /r ib (imm8 sext) or 0x69 /r id (imm32 sext).
    414  * Both forms write the result back to the same `dst` register so the
    415  * caller doesn't need an explicit copy beforehand — unlike the ALU
    416  * forms which read-modify-write a single operand. */
    417 void emit_imul_imm8(MCEmitter* mc, int w, u32 dst, u32 src, i8 imm) {
    418   u32 ofs = obj_pos(mc->obj, mc->section_id);
    419   u8 buf[16];
    420   u32 n = x64_imul_rri_pack(
    421       (X64ImulRRI){.w = w, .imm32 = 0, .dst = dst, .src = src, .imm = imm},
    422       buf);
    423   mc->emit_bytes(mc, buf, n);
    424   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    425 }
    426 void emit_imul_imm32(MCEmitter* mc, int w, u32 dst, u32 src, i32 imm) {
    427   u32 ofs = obj_pos(mc->obj, mc->section_id);
    428   u8 buf[16];
    429   u32 n = x64_imul_rri_pack(
    430       (X64ImulRRI){.w = w, .imm32 = 1, .dst = dst, .src = src, .imm = imm},
    431       buf);
    432   mc->emit_bytes(mc, buf, n);
    433   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    434 }
    435 
    436 /* Width predicate: does `imm` fit in an i8 (used by the 0x83/0x6B
    437  * imm8-sign-extended forms)? */
    438 int imm_fits_i8(i64 imm) { return imm >= -128 && imm <= 127; }
    439 /* Width predicate: does `imm` fit in a signed 32-bit value (the 0x81/
    440  * 0x69 imm32-sign-extended forms; for w=1 the imm is sign-extended to
    441  * 64). Returns 0 for values outside [INT32_MIN, INT32_MAX] — those
    442  * require a full materialization through x64_emit_load_imm. */
    443 int imm_fits_i32(i64 imm) {
    444   return imm >= -2147483648LL && imm <= 2147483647LL;
    445 }
    446 
    447 void emit_test_self(MCEmitter* mc, int w, u32 reg) {
    448   emit_alu_rr(mc, w, X64_OPC_ALU_TEST, reg, reg);
    449 }
    450 
    451 void emit_setcc(MCEmitter* mc, u32 cc, u32 reg) {
    452   u32 ofs = obj_pos(mc->obj, mc->section_id);
    453   u8 buf[16];
    454   u32 n = x64_setcc_pack((X64Setcc){.cc = cc, .reg = reg}, buf);
    455   mc->emit_bytes(mc, buf, n);
    456   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    457 }
    458 
    459 void emit_movzx_r32_r8(MCEmitter* mc, u32 dst, u32 src) {
    460   u32 ofs = obj_pos(mc->obj, mc->section_id);
    461   u8 buf[16];
    462   u32 n = x64_movzx_rr_pack((X64MovzxRR){.w = 0,
    463                                          .opc1 = X64_OPC_MOVZX_B,
    464                                          .force_rex = 1,
    465                                          .dst = dst,
    466                                          .src = src},
    467                             buf);
    468   mc->emit_bytes(mc, buf, n);
    469   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    470 }
    471 
    472 /* movzx/movsx r→r. src_size is source byte width. */
    473 void emit_extend_rr(MCEmitter* mc, int w, int signed_ext, u32 src_size, u32 dst,
    474                     u32 src) {
    475   u32 ofs = obj_pos(mc->obj, mc->section_id);
    476   u8 buf[16];
    477   u32 n = 0;
    478   if (src_size == 4 && signed_ext) {
    479     /* movsxd r64, r32: REX.W 0x63 ModRM */
    480     n = x64_movsxd_pack((X64Movsxd){.dst = dst, .src = src}, buf);
    481   } else if (src_size == 4 && !signed_ext) {
    482     /* zext 32→64 is `mov r32, r32` (clears high 32). */
    483     n = x64_alu_rr_pack(
    484         (X64AluRR){.w = 0, .op = X64_OPC_MOV_RM_R, .dst = dst, .src = src},
    485         buf);
    486   } else if (src_size == 1) {
    487     n = x64_movzx_rr_pack(
    488         (X64MovzxRR){.w = w,
    489                      .opc1 = signed_ext ? X64_OPC_MOVSX_B : X64_OPC_MOVZX_B,
    490                      .force_rex = 1,
    491                      .dst = dst,
    492                      .src = src},
    493         buf);
    494   } else if (src_size == 2) {
    495     n = x64_movzx_rr_pack(
    496         (X64MovzxRR){.w = w,
    497                      .opc1 = signed_ext ? X64_OPC_MOVSX_W : X64_OPC_MOVZX_W,
    498                      .force_rex = 0,
    499                      .dst = dst,
    500                      .src = src},
    501         buf);
    502   } else {
    503     /* No extension to perform (src already at least as wide as dst, e.g.
    504      * 64→64 zext/sext). Still need a reg-to-reg move when dst != src so the
    505      * destination holds the value. */
    506     if (dst != src) emit_mov_rr(mc, w, dst, src);
    507   }
    508   if (n) mc->emit_bytes(mc, buf, n);
    509   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    510 }
    511 
    512 void emit_ret(MCEmitter* mc) {
    513   u8 op = X64_OPC_RET;
    514   mc->emit_bytes(mc, &op, 1);
    515 }
    516 void emit_leave(MCEmitter* mc) {
    517   u8 op = X64_OPC_LEAVE;
    518   mc->emit_bytes(mc, &op, 1);
    519 }
    520 
    521 /* ---- SSE scalar FP encoders ---- */
    522 void emit_sse_rr(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 src) {
    523   u32 ofs = obj_pos(mc->obj, mc->section_id);
    524   u8 buf[16];
    525   u32 n = x64_sse_rr_pack(
    526       (X64SseRR){
    527           .prefix = prefix, .opcode = opcode, .w = 0, .dst = dst, .src = src},
    528       buf);
    529   mc->emit_bytes(mc, buf, n);
    530   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    531 }
    532 void emit_sse_load(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 base,
    533                    i32 disp) {
    534   u32 ofs = obj_pos(mc->obj, mc->section_id);
    535   u8 buf[16];
    536   u32 n = x64_sse_mem_pack((X64SseMem){.prefix = prefix,
    537                                        .opcode = opcode,
    538                                        .reg = dst,
    539                                        .base = base,
    540                                        .disp = disp},
    541                            buf);
    542   mc->emit_bytes(mc, buf, n);
    543   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    544 }
    545 void emit_sse_store(MCEmitter* mc, u8 prefix, u8 opcode, u32 src, u32 base,
    546                     i32 disp) {
    547   u32 ofs = obj_pos(mc->obj, mc->section_id);
    548   u8 buf[16];
    549   u32 n = x64_sse_mem_pack((X64SseMem){.prefix = prefix,
    550                                        .opcode = opcode,
    551                                        .reg = src,
    552                                        .base = base,
    553                                        .disp = disp},
    554                            buf);
    555   mc->emit_bytes(mc, buf, n);
    556   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    557 }
    558 void emit_sse_load_idx(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 base,
    559                        u32 index, u32 log2_scale, i32 disp) {
    560   if (index == REG_NONE) {
    561     emit_sse_load(mc, prefix, opcode, dst, base, disp);
    562     return;
    563   }
    564   u32 ofs = obj_pos(mc->obj, mc->section_id);
    565   emit_mem_idx_op(mc, prefix, 0, 0, 0, opcode, dst, base, index & 0xFu,
    566                   log2_scale, disp);
    567   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    568 }
    569 void emit_sse_store_idx(MCEmitter* mc, u8 prefix, u8 opcode, u32 src, u32 base,
    570                         u32 index, u32 log2_scale, i32 disp) {
    571   if (index == REG_NONE) {
    572     emit_sse_store(mc, prefix, opcode, src, base, disp);
    573     return;
    574   }
    575   u32 ofs = obj_pos(mc->obj, mc->section_id);
    576   emit_mem_idx_op(mc, prefix, 0, 0, 0, opcode, src, base, index & 0xFu,
    577                   log2_scale, disp);
    578   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    579 }
    580 void emit_sse_rr_w(MCEmitter* mc, u8 prefix, u8 opcode, int w, u32 dst,
    581                    u32 src) {
    582   u32 ofs = obj_pos(mc->obj, mc->section_id);
    583   u8 buf[16];
    584   u32 n = x64_sse_rr_pack(
    585       (X64SseRR){
    586           .prefix = prefix, .opcode = opcode, .w = w, .dst = dst, .src = src},
    587       buf);
    588   mc->emit_bytes(mc, buf, n);
    589   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
    590 }