kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

isa.h (26954B)


      1 /* x86_64 ISA descriptors — single source of truth for every instruction
      2  * the encoder and decoder need to agree on.
      3  *
      4  * Unlike aarch64 (fixed 32-bit insns, identified by `(word & mask) == match`),
      5  * x86_64 instructions are variable-length 1..15 bytes:
      6  *
      7  *   [ legacy_pfx? ][ REX? ][ opcode 1..3B ][ ModR/M? ][ SIB? ]
      8  *   [ disp 0/1/4B ][ imm 0/1/2/4/8B ]
      9  *
     10  * So the table is keyed piecewise:
     11  *   - leg_pfx          one of {none, 0x66, 0xF2, 0xF3}
     12  *   - opc[1..3]        opcode bytes (the last byte may have a low-bit mask
     13  *                      for embed-reg forms like PUSH r64 = 50+rd)
     14  *   - modrm_reg        0..7 for `/digit` opcode extension, 0xFF for `/r`
     15  *                      or no-ModR/M
     16  *   - rex_w_req        X64_W_REQ_ANY / X64_W_REQ_1 / X64_W_REQ_0
     17  *   - fmt              X64Format enum (operand shape)
     18  *
     19  * Disasm flow:
     20  *   X64DecodeCtx ctx;  x64_decode_prefixes(bytes, len, &ctx);
     21  *   const X64InsnDesc* d = x64_disasm_find(bytes, len, &ctx);
     22  *   x64_print_operands(sb, d, bytes, len, &ctx, vaddr) → total byte length.
     23  *
     24  * Encoder migration is staged separately: phase 1 ships the descriptors and
     25  * decode side; phase 2 swaps each emit_* function body to use the per-format
     26  * pack helpers below; phase 3 refactors asm.c around the same table. */
     27 
     28 #ifndef KIT_X64_ISA_H
     29 #define KIT_X64_ISA_H
     30 
     31 #include "core/bytes.h"
     32 #include "core/core.h"
     33 #include "core/slice.h"
     34 #include "core/strbuf.h"
     35 
/* ---- GPR numbering (hardware encoding 0..15) ----
 *
 * Each constant is the register's 4-bit hardware number: the low three
 * bits go into a ModR/M / SIB / embed-reg opcode field and bit 3 is carried
 * by the matching REX bit (R, X or B).
 *
 * NOTE(review): the System V x86-64 DWARF register numbers are NOT the
 * same as the hardware encoding for the low eight registers (DWARF orders
 * them rax, rdx, rcx, rbx, rsi, rdi, rbp, rsp); only r8..r15 coincide.
 * Do not use these values as DWARF register numbers without a mapping. */
enum {
  X64_RAX = 0,
  X64_RCX = 1,
  X64_RDX = 2,
  X64_RBX = 3,
  X64_RSP = 4, /* low bits 100: as a memory base it requires a SIB byte */
  X64_RBP = 5, /* low bits 101: as a memory base it cannot use mod=00 */
  X64_RSI = 6,
  X64_RDI = 7,
  X64_R8 = 8, /* r8..r15 need a REX prefix bit to be addressable */
  X64_R9 = 9,
  X64_R10 = 10,
  X64_R11 = 11,
  X64_R12 = 12, /* same low bits as RSP: SIB byte required as a base */
  X64_R13 = 13, /* same low bits as RBP: needs disp8/disp32 as a base */
  X64_R14 = 14,
  X64_R15 = 15,
};
     55 
     56 /* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. */
     57 enum {
     58   X64_XMM0 = 0,
     59   X64_XMM1 = 1,
     60   X64_XMM2 = 2,
     61   X64_XMM3 = 3,
     62   X64_XMM4 = 4,
     63   X64_XMM5 = 5,
     64   X64_XMM6 = 6,
     65   X64_XMM7 = 7,
     66   X64_XMM8 = 8,
     67   X64_XMM15 = 15,
     68 };
     69 
/* Condition codes for Jcc / SETcc / CMOVcc. Encoded in the low nibble of
 * the condition-carrying opcode byte (see X64_OPC_JCC_BASE,
 * X64_OPC_SETCC_BASE, X64_OPC_CMOVCC_BASE).  Codes come in pairs: flipping
 * bit 0 negates the condition.  The `→ CMP_*` notes name the comparison
 * each code is paired with. */
enum {
  X64_CC_O = 0x0,  /* overflow / OF=1 */
  X64_CC_NO = 0x1, /* no overflow / OF=0 */
  X64_CC_B = 0x2,  /* below / CF=1                      → CMP_LT_U */
  X64_CC_AE = 0x3, /* above-or-equal / CF=0             → CMP_GE_U */
  X64_CC_E = 0x4,  /* equal / ZF=1                      → CMP_EQ   */
  X64_CC_NE = 0x5, /* not equal / ZF=0                  → CMP_NE   */
  X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1     → CMP_LE_U */
  X64_CC_A = 0x7,  /* above / CF=0 and ZF=0             → CMP_GT_U */
  X64_CC_S = 0x8,  /* sign / SF=1 */
  X64_CC_NS = 0x9, /* no sign / SF=0 */
  X64_CC_P = 0xA,  /* parity even / PF=1 */
  X64_CC_NP = 0xB, /* parity odd / PF=0 */
  X64_CC_L = 0xC,  /* less (signed) / SF!=OF            → CMP_LT_S */
  X64_CC_GE = 0xD, /* greater-or-equal (signed) / SF=OF → CMP_GE_S */
  X64_CC_LE = 0xE, /* less-or-equal (signed) / ZF=1 or SF!=OF → CMP_LE_S */
  X64_CC_G = 0xF,  /* greater (signed) / ZF=0 and SF=OF → CMP_GT_S */
};
     89 
/* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B.
 *   W — 64-bit operand size
 *   R — bit 3 of the ModR/M.reg register
 *   X — bit 3 of the SIB.index register
 *   B — bit 3 of the ModR/M.rm / SIB.base / embed-reg register
 * The masks below are OR-ed onto X64_REX_BASE (see x64_make_rex). */
#define X64_REX_BASE 0x40u
#define X64_REX_W 0x08u
#define X64_REX_R 0x04u
#define X64_REX_X 0x02u
#define X64_REX_B 0x01u
     96 
/* ---- Branch / NOP encoding constants ----
 *
 * Used by the linker to emit PLT entries and IPLT stubs without
 * sprinkling raw hex into src/arch/x64/link.c.  The shape is always the
 * same RIP-relative indirect JMP plus padding NOPs. */

/* JMP r/m64 — opcode FF /4.  ModR/M for the RIP+disp32 form is
 * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25.  The full instruction is
 * FF 25 followed by a little-endian disp32 (X64_JMP_RIPREL_SIZE bytes). */
#define X64_OP_JMP_RM64 0xFFu
#define X64_MODRM_JMP_RIPREL 0x25u

/* Single-byte NOP. */
#define X64_NOP1 0x90u

/* Intel multi-byte ("long") NOP forms.  The 6-byte form is the
 * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)).
 * Byte by byte: 66 (operand-size prefix), 0F 1F (NOP r/m opcode),
 * 44 (ModR/M: mod=01, reg=0, r/m=100 → SIB + disp8), 00 (SIB), 00 (disp8). */
#define X64_NOP6_BYTE0 0x66u
#define X64_NOP6_BYTE1 0x0Fu
#define X64_NOP6_BYTE2 0x1Fu
#define X64_NOP6_BYTE3 0x44u
#define X64_NOP6_BYTE4 0x00u
#define X64_NOP6_BYTE5 0x00u

/* Sizes (in bytes) of the encoded forms above: what x64_write_jmp_riprel
 * and x64_write_nop6 each write. */
#define X64_JMP_RIPREL_SIZE 6u
#define X64_NOP6_SIZE 6u
    123 
    124 /* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */
    125 static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) {
    126   dst[0] = X64_OP_JMP_RM64;
    127   dst[1] = X64_MODRM_JMP_RIPREL;
    128   wr_u32_le(dst + 2, (u32)disp32);
    129 }
    130 
    131 /* Fill nbytes at dst with single-byte NOPs (0x90).  Matches the
    132  * existing memset-then-patch pattern used to pad PLT entries to 16. */
    133 static inline void x64_write_nop_pad(u8* dst, u32 nbytes) {
    134   u32 i;
    135   for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1;
    136 }
    137 
    138 /* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at
    139  * dst.  Used to pad the IPLT stub from 6 → 12 bytes. */
    140 static inline void x64_write_nop6(u8* dst) {
    141   dst[0] = X64_NOP6_BYTE0;
    142   dst[1] = X64_NOP6_BYTE1;
    143   dst[2] = X64_NOP6_BYTE2;
    144   dst[3] = X64_NOP6_BYTE3;
    145   dst[4] = X64_NOP6_BYTE4;
    146   dst[5] = X64_NOP6_BYTE5;
    147 }
    148 
    149 /* ====================================================================
    150  * Decode context — fills as we walk the prefix bytes.  Disasm and
    151  * (eventually) inline-asm parsers share this so prefix accounting lives
    152  * in one place.
    153  * ==================================================================== */
    154 
/* Legacy-prefix selector values: the prefix byte itself, or 0 for none.
 * Used for X64DecodeCtx.leg_pfx and X64InsnDesc.leg_pfx. */
#define X64_PFX_NONE 0u
#define X64_PFX_66 0x66u /* operand-size override (16-bit) */
#define X64_PFX_F2 0xF2u /* SSE scalar double / REPNE */
#define X64_PFX_F3 0xF3u /* SSE scalar single / REP */

/* REX.W requirement of a descriptor row (X64InsnDesc.rex_w_req). */
#define X64_W_REQ_ANY 0u /* row matches either REX.W value */
#define X64_W_REQ_1 1u   /* row requires REX.W = 1 (64-bit form) */
#define X64_W_REQ_0 2u   /* row requires REX.W = 0 (force 32-bit form) */
    163 
/* Prefix state gathered while decoding one instruction.  Filled by
 * x64_decode_prefixes and then passed to x64_disasm_find,
 * x64_print_operands and x64_size_suffix_for. */
typedef struct X64DecodeCtx {
  u8 leg_pfx; /* 0 / 0x66 / 0xF2 / 0xF3 (last seen wins) */
  u8 has_lock; /* presumably nonzero when a 0xF0 LOCK prefix was seen —
                  confirm against isa.c */
  u8 has_rex; /* presumably nonzero when a REX byte (0x40..0x4F) was seen —
                 confirm against isa.c */
  u8 rex_w, rex_r, rex_x, rex_b; /* presumably the individual REX.W/R/X/B
                                    bits as 0/1 — confirm against isa.c */
  u32 opc_off; /* offset of first opcode byte inside the instruction */
} X64DecodeCtx;
    171 
/* Walk legacy prefix bytes (0x66 / 0xF2 / 0xF3 / 0xF0 LOCK) followed by an
 * optional REX byte (0x40..0x4F). Fills `ctx` and returns the offset of the
 * first non-prefix byte.
 *
 *   bytes — start of the instruction
 *   len   — number of readable bytes at `bytes`
 *   ctx   — output; prefix state for the later decode steps
 *
 * This is the first step of the disasm flow; the filled `ctx` is then
 * handed to x64_disasm_find. */
u32 x64_decode_prefixes(const u8* bytes, u32 len, X64DecodeCtx* ctx);
    176 
/* ====================================================================
 * Opcode constants used by both the descriptor table and pack helpers.
 *
 * Naming: X64_OPC_<class>_<mnemonic>.  We promote the bytes the encoder
 * emits (not every byte the decoder might see — alias rows in isa.c
 * still spell their own opcode bytes inline).
 *
 * X64_*_SUB_* constants are `/digit` opcode extensions: they go into the
 * ModR/M.reg field of the instruction rather than naming a register. */

/* ALU r/m, r — opcode picks the operation.  These are the values passed
 * as the `op` byte of X64AluRR / X64AluRM. */
#define X64_OPC_ALU_ADD 0x01u
#define X64_OPC_ALU_OR 0x09u
#define X64_OPC_ALU_AND 0x21u
#define X64_OPC_ALU_SUB 0x29u
#define X64_OPC_ALU_XOR 0x31u
#define X64_OPC_ALU_CMP 0x39u
#define X64_OPC_ALU_TEST 0x85u
#define X64_OPC_MOV_RM_R 0x89u  /* MOV r/m, r */
#define X64_OPC_MOV_RM_R8 0x88u /* MOV r/m8, r8 */
#define X64_OPC_MOV_R_RM 0x8Bu  /* MOV r, r/m */
#define X64_OPC_LEA 0x8Du
#define X64_OPC_MOVSXD 0x63u

/* ALU r/m, imm — /sub picks op. */
#define X64_OPC_ALU_IMM8 0x83u  /* sign-extended imm8 */
#define X64_OPC_ALU_IMM32 0x81u /* imm32 */
#define X64_ALU_SUB_ADD 0u
#define X64_ALU_SUB_OR 1u
#define X64_ALU_SUB_AND 4u
#define X64_ALU_SUB_SUB 5u
#define X64_ALU_SUB_XOR 6u
#define X64_ALU_SUB_CMP 7u

/* MOV r, imm — B8+rd (register number in the low 3 opcode bits). */
#define X64_OPC_MOV_RI 0xB8u

/* MOV r/m, imm — C6 /0 (byte) and C7 /0 (32/64 sign-extended imm32). */
#define X64_OPC_MOV_RM_IMM8 0xC6u
#define X64_OPC_MOV_RM_IMM32 0xC7u
#define X64_MOV_RM_IMM_SUB 0u

/* IMUL r, r/m (two-byte) and IMUL r, r/m, imm. */
#define X64_OPC_IMUL_2B 0xAFu /* preceded by 0x0F */
#define X64_OPC_IMUL_IMM8 0x6Bu
#define X64_OPC_IMUL_IMM32 0x69u

/* F7 /sub family. */
#define X64_OPC_F7 0xF7u
#define X64_F7_SUB_NOT 2u
#define X64_F7_SUB_NEG 3u
#define X64_F7_SUB_MUL 4u
#define X64_F7_SUB_IMUL 5u
#define X64_F7_SUB_DIV 6u
#define X64_F7_SUB_IDIV 7u

/* Shifts — /sub picks SHL/SHR/SAR. */
#define X64_OPC_SHIFT_IMM 0xC1u /* shift r/m by imm8 */
#define X64_OPC_SHIFT_CL 0xD3u  /* shift r/m by %cl */
#define X64_SHIFT_SUB_SHL 4u
#define X64_SHIFT_SUB_SHR 5u
#define X64_SHIFT_SUB_SAR 7u

/* MOVZX / MOVSX (preceded by 0x0F).  _B = byte source, _W = word source. */
#define X64_OPC_MOVZX_B 0xB6u
#define X64_OPC_MOVZX_W 0xB7u
#define X64_OPC_MOVSX_B 0xBEu
#define X64_OPC_MOVSX_W 0xBFu

/* SETcc base, CMOVcc base (preceded by 0x0F, low nibble = cc). */
#define X64_OPC_SETCC_BASE 0x90u
#define X64_OPC_CMOVCC_BASE 0x40u

/* Branches. */
#define X64_OPC_JMP_REL32 0xE9u
#define X64_OPC_CALL_REL32 0xE8u
#define X64_OPC_JCC_BASE 0x80u /* preceded by 0x0F, low nibble = cc */

/* Stack — register number in the low 3 opcode bits (50+rd / 58+rd). */
#define X64_OPC_PUSH_R 0x50u
#define X64_OPC_POP_R 0x58u

/* Misc. */
#define X64_OPC_RET 0xC3u
#define X64_OPC_LEAVE 0xC9u
#define X64_OPC_CDQ_CQO 0x99u /* CDQ; CQO when REX.W is set */
#define X64_OPC_TWOBYTE 0x0Fu /* escape byte introducing two-byte opcodes */

/* 0x66 operand-size override, used to force 16-bit forms. */
#define X64_OPSIZE_PFX 0x66u
    264 
/* ====================================================================
 * Format kinds — one per "encoding shape" kit's emit.c produces.
 * The format determines how operands are recovered from the byte stream
 * after the opcode bytes and how they print in AT&T syntax.
 *
 * Values are assigned implicitly from 0 in declaration order and are
 * stored in the one-byte X64InsnDesc.fmt field, so the enum must stay
 * below 256 entries.
 * ==================================================================== */

typedef enum X64Format {
  X64_FMT_NULLARY,      /* no operands: RET, NOP, UD2, LEAVE, CDQ/CQO */
  X64_FMT_NOP_MULTI,    /* multi-byte NOP family (66 0F 1F ...) */
  X64_FMT_PUSH_POP,     /* 50+rd / 58+rd — register in low 3 bits */
  X64_FMT_MOV_RI,       /* B8+rd imm{32,64} — width via REX.W */
  X64_FMT_ALU_RR,       /* op r/m, r  — ADD/OR/AND/SUB/XOR/CMP/MOV/TEST */
  X64_FMT_MOV_RM_LOAD,  /* 8B /r  — MOV r, r/m (also LEA via 8D /r) */
  X64_FMT_MOVZX_MOVSX,  /* 0F B6/B7/BE/BF /r  — width-extending loads */
  X64_FMT_MOVSXD,       /* REX.W 63 /r  — MOVSXD r64, r/m32 */
  X64_FMT_ALU_RM_IMM8,  /* 83 /sub ib  — ADD/OR/AND/SUB/XOR/CMP r/m, imm8 */
  X64_FMT_ALU_RM_IMM32, /* 81 /sub id  — same family, imm32 */
  X64_FMT_IMUL_RR,      /* 0F AF /r  — IMUL r, r/m */
  X64_FMT_IMUL_RRI,     /* 69/6B /r i{8,32}  — IMUL r, r/m, imm */
  X64_FMT_F7_RM,        /* F7 /sub  — NOT/NEG/MUL/IMUL/DIV/IDIV */
  X64_FMT_SHIFT_IMM,    /* C1 /sub ib  — SHL/SHR/SAR r/m, imm8 */
  X64_FMT_SHIFT_CL,     /* D3 /sub  — SHL/SHR/SAR r/m, %cl */
  X64_FMT_JCC_REL32,    /* 0F 8x rel32  — Jcc near */
  X64_FMT_JMP_REL32,    /* E9 rel32 */
  X64_FMT_CALL_REL32,   /* E8 rel32 */
  X64_FMT_BR_RM,        /* FF /2 or /4  — call/jmp indirect r/m */
  X64_FMT_SETCC_RM,     /* 0F 9x /0 r/m8  — SETcc */
  X64_FMT_CMOVCC_RR,    /* 0F 4x /r  — CMOVcc r, r/m */
  X64_FMT_SSE_RR,       /* {F2|F3|66}? 0F xx /r  — scalar FP reg-reg */
  X64_FMT_SSE_LOAD,     /* same, dst <- [base+disp] */
  X64_FMT_SSE_STORE,    /* same, [base+disp] <- src */
  X64_FMT_BSWAP,        /* 0F C8+rd */
  X64_FMT_BS,           /* 0F BC/BD /r  — BSF/BSR */
  X64_FMT_POPCNT,       /* F3 0F B8 /r */
  X64_FMT_XADD_MEM,     /* LOCK 0F C1 /r  — XADD m, r */
  X64_FMT_XCHG_MEM,     /* 87 /r  — XCHG r, m (LOCK implicit on mem dst) */
  X64_FMT_CMPXCHG_MEM,  /* LOCK 0F B1 /r  — CMPXCHG m, r */
  X64_FMT_RAW_BYTE,     /* sentinel: render as `.byte 0xNN` (no match) */
} X64Format;
    304 
    305 #define X64_ASMFL_ALIAS                                \
    306   0x01u /* row is an alias spelling (prefer-on-decode) \
    307          */
    308 #define X64_ASMFL_W_FROM_REX 0x02u /* fmt picks width from ctx->rex_w */
    309 #define X64_ASMFL_FORCE_W64 0x04u  /* fmt always 64-bit regardless of REX.W */
    310 #define X64_ASMFL_BYTE 0x08u       /* fixed-byte operand (movb, setcc) */
    311 #define X64_ASMFL_W16 0x10u        /* fixed 16-bit (via 0x66 prefix override) */
    312 
/* ====================================================================
 * Descriptor table row.
 *
 * One row describes one instruction encoding: the match key (legacy
 * prefix, opcode bytes with a mask on the last byte, optional /digit,
 * REX.W requirement) plus the operand format and printing flags.
 * ==================================================================== */

typedef struct X64InsnDesc {
  Slice mnemonic;   /* AT&T mnemonic without size suffix; printer adds
                       a size letter (b/w/l/q) based on fmt + ctx. */
  u8 leg_pfx;       /* X64_PFX_NONE / 0x66 / 0xF2 / 0xF3 */
  u8 opc_len;       /* 1..3 — number of valid entries in opc[] */
  u8 opc[3];        /* opcode bytes */
  u8 opc_last_mask; /* 0xFF for exact match on opc[opc_len-1];
                       0xF8 for embed-reg in low 3 bits;
                       0xF0 for Jcc / SETcc / CMOVcc condition nibble */
  u8 modrm_reg;     /* 0..7 if /digit, 0xFF otherwise */
  u8 rex_w_req;     /* X64_W_REQ_* */
  u8 fmt;           /* X64Format */
  u8 flags;         /* X64_ASMFL_* */
} X64InsnDesc;
    331 
/* The descriptor table and its size; both are defined outside this
 * header.  NOTE(review): x64_insn_table_n is presumably the number of
 * rows in x64_insn_table — confirm against the definition. */
extern const X64InsnDesc x64_insn_table[];
extern const u32 x64_insn_table_n;
    334 
/* Linear scan after prefix decode. Sets `ctx->opc_off` to where opcode
 * starts. Returns the matching descriptor, or NULL on no match (caller
 * should emit a `.byte` fallback). On success, opc_off is unchanged;
 * the caller can derive opc_end as opc_off + desc->opc_len.
 *
 * NOTE(review): "sets opc_off" and "opc_off is unchanged" read as
 * contradictory; presumably opc_off already holds the value produced by
 * x64_decode_prefixes and is left as-is on a match — confirm in isa.c. */
const X64InsnDesc* x64_disasm_find(const u8* bytes, u32 len, X64DecodeCtx* ctx);
    340 
/* Render operand text for a matched descriptor into `sb` and return the
 * total instruction length in bytes (from bytes[0], including any
 * prefixes/ModR/M/SIB/disp/imm). Returns 0 if the encoding is truncated
 * (caller falls back to a single-byte `.byte` rendering). `vaddr` is the
 * instruction's virtual address for PC-relative formats; pass 0 if not
 * known. The mnemonic itself is *not* written — caller emits desc->mnemonic
 * (plus any size suffix it derives via x64_size_suffix_for).
 *
 *   sb    — output string buffer for the operand text
 *   desc  — descriptor returned by x64_disasm_find
 *   bytes — start of the instruction (prefixes included)
 *   len   — number of readable bytes at `bytes`
 *   ctx   — prefix state from x64_decode_prefixes
 *   vaddr — address of the instruction, or 0 */
u32 x64_print_operands(StrBuf* sb, const X64InsnDesc* desc, const u8* bytes,
                       u32 len, const X64DecodeCtx* ctx, u64 vaddr);
    350 
/* Returns the AT&T size suffix character ('b','w','l','q') the printer
 * appends to mnemonics that depend on operand width.  Returns 0 if the
 * mnemonic carries its own width (Jcc, SETcc, MOVZX/MOVSX, SSE, etc.).
 * A zero return means "append nothing", not a NUL to be written. */
char x64_size_suffix_for(const X64InsnDesc* desc, const X64DecodeCtx* ctx);
    355 
/* Translate a condition nibble (low 4 bits of the second opcode byte for
 * Jcc/SETcc/CMOVcc) to its AT&T suffix: "e", "ne", "ge", ...
 * The nibble values are the X64_CC_* constants. */
const char* x64_cc_name(u8 cc);
    359 
    360 /* ====================================================================
    361  * Pack helpers — encode-side counterpart of the decode dispatch above.
    362  *
    363  * Each helper builds one instruction into a caller-provided buffer and
    364  * returns the number of bytes written.  Callers must reserve at least
    365  * 16 bytes; no single x86_64 instruction we emit exceeds 15.
    366  *
    367  * REX rules (shared by every reg/mem helper):
    368  *   - Emitted only when needed: W=1 or any of R/X/B nonzero.
    369  *   - Force-REX variants (suffix `_force_rex`) always emit a REX byte —
    370  *     required for byte-reg encodings that promote SIL/DIL/etc.
    371  *
    372  * ModR/M memory rules (handled by x64_pack_mem):
    373  *   - mod=0 for disp=0 unless (base & 7) == 5 (RBP/R13 — needs disp8=0).
    374  *   - mod=1 for disp in [-128,127].
    375  *   - mod=2 for full disp32.
    376  *   - SIB byte required when (base & 7) == 4 (RSP/R12); index=4 (none).
    377  * ==================================================================== */
    378 
    379 /* REX prefix byte builder. Returns 0 if no REX needed. */
    380 static inline u8 x64_make_rex(int w, u32 reg, u32 index, u32 rm) {
    381   u8 r = 0;
    382   if (w) r |= X64_REX_W;
    383   if (reg & 8u) r |= X64_REX_R;
    384   if (index & 8u) r |= X64_REX_X;
    385   if (rm & 8u) r |= X64_REX_B;
    386   return r ? (u8)(X64_REX_BASE | r) : 0u;
    387 }
    388 
    389 /* ModR/M byte builder. */
    390 static inline u8 x64_modrm(u32 mod, u32 reg, u32 rm) {
    391   return (u8)(((mod & 3u) << 6) | ((reg & 7u) << 3) | (rm & 7u));
    392 }
    393 
    394 /* SIB byte builder. */
    395 static inline u8 x64_sib(u32 scale, u32 index, u32 base) {
    396   return (u8)(((scale & 3u) << 6) | ((index & 7u) << 3) | (base & 7u));
    397 }
    398 
/* ModR/M r/m encodings with special meaning:
 *   rm=100 → SIB byte follows.
 *   rm=101 with mod=00 → RIP-relative (disp32) or, in SIB.base, disp32-only.
 * These are 3-bit field values, compared against (reg & 7). */
#define X64_MODRM_RM_SIB 4u
#define X64_MODRM_RM_RIP_DISP32 5u

/* SIB.index=100 means "no index". */
#define X64_SIB_NO_INDEX 4u
/* SIB.base=101 with mod=00 means "no base" (disp32 only). */
#define X64_SIB_NO_BASE 5u
    409 
    410 /* Pick ModR/M.mod from a (base,disp) memory operand:
    411  *   0 → [base]            (only if disp==0 and (base&7)!=5)
    412  *   1 → [base + disp8]
    413  *   2 → [base + disp32] */
    414 static inline u32 x64_disp_mod(u32 base, i32 disp) {
    415   if (disp == 0 && (base & 7u) != 5u) return 0u;
    416   if (disp >= -128 && disp <= 127) return 1u;
    417   return 2u;
    418 }
    419 
    420 /* Append `n` little-endian bytes of `v` to out, return n. */
    421 static inline u32 x64_put_u32le(u8* out, u32 v) {
    422   out[0] = (u8)v;
    423   out[1] = (u8)(v >> 8);
    424   out[2] = (u8)(v >> 16);
    425   out[3] = (u8)(v >> 24);
    426   return 4u;
    427 }
    428 static inline u32 x64_put_u64le(u8* out, u64 v) {
    429   for (u32 i = 0; i < 8u; ++i) out[i] = (u8)(v >> (i * 8u));
    430   return 8u;
    431 }
    432 
    433 /* Pack a bare RIP-relative memory operand `[rip + disp32]` (no symbol).
    434  * ModR/M mod=00, rm=101, followed by disp32; no SIB. */
    435 static inline u32 x64_pack_mem_rip(u8* out, u32 reg, i32 disp) {
    436   out[0] = x64_modrm(0u, reg, X64_MODRM_RM_RIP_DISP32);
    437   return 1u + x64_put_u32le(out + 1, (u32)disp);
    438 }
    439 
    440 /* Pack a memory operand (ModR/M + optional SIB + optional disp) for the
    441  * `reg` operand and `[base + disp]` r/m operand. Returns bytes written. */
    442 static inline u32 x64_pack_mem(u8* out, u32 reg, u32 base, i32 disp) {
    443   u32 m = x64_disp_mod(base, disp);
    444   u32 n = 0;
    445   if ((base & 7u) == 4u) {
    446     out[n++] = x64_modrm(m, reg, 4u);
    447     out[n++] = x64_sib(0u, 4u, base);
    448   } else {
    449     out[n++] = x64_modrm(m, reg, base);
    450   }
    451   if (m == 1u) {
    452     out[n++] = (u8)(i8)disp;
    453   } else if (m == 2u) {
    454     n += x64_put_u32le(out + n, (u32)disp);
    455   }
    456   return n;
    457 }
    458 
    459 /* Pack a SIB-form memory operand `[base + index*scale + disp]`. Emits SIB
    460  * unconditionally; pass index = 4 (RSP) for the no-index case (the SIB
    461  * "no index" encoding). `log2_scale` ∈ {0,1,2,3} for byte scale 1/2/4/8.
    462  *
    463  * RBP/R13 base needs at least disp8 even when disp == 0 (mod=00 with
    464  * SIB base=5 means "no base, disp32 only"). RSP/R12 base requires SIB
    465  * regardless — which is what this helper provides. */
    466 static inline u32 x64_pack_mem_sib(u8* out, u32 reg, u32 base, u32 index,
    467                                    u32 log2_scale, i32 disp) {
    468   /* For SIB base encoding, base=5 (RBP/R13) cannot use mod=0; force
    469    * disp8/disp32. Other bases can use the standard mod selection. */
    470   u32 m;
    471   if ((base & 7u) == 5u && disp == 0) {
    472     m = 1u; /* disp8 = 0 */
    473   } else if (disp == 0) {
    474     m = 0u;
    475   } else if (disp >= -128 && disp <= 127) {
    476     m = 1u;
    477   } else {
    478     m = 2u;
    479   }
    480   u32 n = 0;
    481   out[n++] = x64_modrm(m, reg, 4u); /* r/m = 4 → SIB follows */
    482   out[n++] = x64_sib(log2_scale & 3u, index, base);
    483   if (m == 1u) {
    484     out[n++] = (u8)(i8)disp;
    485   } else if (m == 2u) {
    486     n += x64_put_u32le(out + n, (u32)disp);
    487   }
    488   return n;
    489 }
    490 
    491 /* Pack a reg-form ModR/M (mod=3) — one byte. */
    492 static inline u32 x64_pack_rm_reg(u8* out, u32 reg, u32 rm) {
    493   out[0] = x64_modrm(3u, reg, rm);
    494   return 1u;
    495 }
    496 
    497 /* Emit an optional REX (only if needed) and return bytes written (0 or 1). */
    498 static inline u32 x64_pack_rex(u8* out, int w, u32 reg, u32 index, u32 rm) {
    499   u8 r = x64_make_rex(w, reg, index, rm);
    500   if (!r) return 0u;
    501   out[0] = r;
    502   return 1u;
    503 }
    504 /* Always emit a REX byte (force form). */
    505 static inline u32 x64_pack_rex_force(u8* out, int w, u32 reg, u32 index,
    506                                      u32 rm) {
    507   out[0] =
    508       (u8)(X64_REX_BASE | (w ? X64_REX_W : 0u) | ((reg & 8u) ? X64_REX_R : 0u) |
    509            ((index & 8u) ? X64_REX_X : 0u) | ((rm & 8u) ? X64_REX_B : 0u));
    510   return 1u;
    511 }
    512 
    513 /* ---- X64_FMT_NULLARY: one or two opcode bytes, no operands. ---- */
    514 typedef struct X64Nullary {
    515   u8 prefix; /* legacy prefix or 0 */
    516   int w;     /* if nonzero, force REX.W (used by CQO) */
    517   u8 opc0;
    518   u8 opc1; /* 0 if unused */
    519 } X64Nullary;
    520 static inline u32 x64_nullary_pack(X64Nullary f, u8* out) {
    521   u32 n = 0;
    522   if (f.prefix) out[n++] = f.prefix;
    523   if (f.w) out[n++] = (u8)(X64_REX_BASE | X64_REX_W);
    524   out[n++] = f.opc0;
    525   if (f.opc1) out[n++] = f.opc1;
    526   return n;
    527 }
    528 
    529 /* ---- X64_FMT_ALU_RR: op r/m, r (reg-reg form). ----
    530  *   REX(w, src, 0, dst) | op | ModR/M(3, src, dst)
    531  *
    532  * `op` selects the operation (ADD/OR/AND/SUB/XOR/CMP/MOV/TEST). */
    533 typedef struct X64AluRR {
    534   int w;
    535   u8 op;
    536   u32 dst; /* r/m */
    537   u32 src; /* reg */
    538 } X64AluRR;
    539 static inline u32 x64_alu_rr_pack(X64AluRR f, u8* out) {
    540   u32 n = x64_pack_rex(out, f.w, f.src, 0, f.dst);
    541   out[n++] = f.op;
    542   n += x64_pack_rm_reg(out + n, f.src, f.dst);
    543   return n;
    544 }
    545 
    546 /* ---- X64_FMT_ALU_RR memory form: op [base+disp], r ----
    547  *   Optional 0x66 | REX(w, src, 0, base) | op | mem(src, base, disp)
    548  *   `force_rex` matches emit_mov_store size=1 (byte-reg promotion). */
    549 typedef struct X64AluRM {
    550   u8 prefix; /* 0 or 0x66 */
    551   int w;
    552   u8 op;
    553   int force_rex; /* 1 → always emit REX (byte-reg form) */
    554   u32 src;       /* reg operand */
    555   u32 base;      /* memory base */
    556   i32 disp;
    557 } X64AluRM;
    558 static inline u32 x64_alu_rm_pack(X64AluRM f, u8* out) {
    559   u32 n = 0;
    560   if (f.prefix) out[n++] = f.prefix;
    561   if (f.force_rex)
    562     n += x64_pack_rex_force(out + n, f.w, f.src, 0, f.base);
    563   else
    564     n += x64_pack_rex(out + n, f.w, f.src, 0, f.base);
    565   out[n++] = f.op;
    566   n += x64_pack_mem(out + n, f.src, f.base, f.disp);
    567   return n;
    568 }
    569 
    570 /* ---- X64_FMT_MOV_RI: MOV r, imm — opcode B8+rd ---- */
    571 typedef struct X64MovRI {
    572   int is64;
    573   u32 dst;
    574   i64 imm;
    575 } X64MovRI;
    576 static inline u32 x64_mov_ri_pack(X64MovRI f, u8* out) {
    577   u32 n = x64_pack_rex(out, f.is64 ? 1 : 0, 0, 0, f.dst);
    578   out[n++] = (u8)(X64_OPC_MOV_RI | (f.dst & 7u));
    579   if (f.is64)
    580     n += x64_put_u64le(out + n, (u64)f.imm);
    581   else
    582     n += x64_put_u32le(out + n, (u32)f.imm);
    583   return n;
    584 }
    585 
    586 /* ---- X64_FMT_MOV_RM_LOAD (8B /r) and LEA (8D /r) — register dst, memory src.
    587  * Also covers MOVZX/MOVSX with memory source (two-byte opcode). ---- */
    588 typedef struct X64MovRMLoad {
    589   int w;
    590   u8 opc0;  /* primary opcode byte */
    591   u8 opc1;  /* 0 for one-byte opcode; nonzero = 0F xx form */
    592   u32 dst;  /* reg */
    593   u32 base; /* mem base */
    594   i32 disp;
    595 } X64MovRMLoad;
    596 static inline u32 x64_mov_rm_load_pack(X64MovRMLoad f, u8* out) {
    597   u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.base);
    598   if (f.opc1) {
    599     out[n++] = X64_OPC_TWOBYTE;
    600     out[n++] = f.opc1;
    601   } else {
    602     out[n++] = f.opc0;
    603   }
    604   n += x64_pack_mem(out + n, f.dst, f.base, f.disp);
    605   return n;
    606 }
    607 
    608 /* ---- X64_FMT_MOVZX_MOVSX reg-reg form: 0F xx /r ----
    609  * `force_rex` distinguishes byte-source forms (need REX even when no high
    610  * regs) from the word-source form. */
    611 typedef struct X64MovzxRR {
    612   int w;
    613   u8 opc1; /* B6 / B7 / BE / BF */
    614   int force_rex;
    615   u32 dst;
    616   u32 src;
    617 } X64MovzxRR;
    618 static inline u32 x64_movzx_rr_pack(X64MovzxRR f, u8* out) {
    619   u32 n;
    620   if (f.force_rex)
    621     n = x64_pack_rex_force(out, f.w, f.dst, 0, f.src);
    622   else
    623     n = x64_pack_rex(out, f.w, f.dst, 0, f.src);
    624   out[n++] = X64_OPC_TWOBYTE;
    625   out[n++] = f.opc1;
    626   n += x64_pack_rm_reg(out + n, f.dst, f.src);
    627   return n;
    628 }
    629 
    630 /* ---- X64_FMT_MOVSXD: REX.W 63 /r — MOVSXD r64, r32 ---- */
    631 typedef struct X64Movsxd {
    632   u32 dst;
    633   u32 src;
    634 } X64Movsxd;
    635 static inline u32 x64_movsxd_pack(X64Movsxd f, u8* out) {
    636   u32 n = x64_pack_rex(out, 1, f.dst, 0, f.src);
    637   out[n++] = X64_OPC_MOVSXD;
    638   n += x64_pack_rm_reg(out + n, f.dst, f.src);
    639   return n;
    640 }
    641 
    642 /* ---- X64_FMT_ALU_RM_IMM8: 83 /sub ib (sign-extended) — reg-form. ---- */
    643 typedef struct X64AluRmImm8 {
    644   int w;
    645   u32 sub;
    646   u32 reg;
    647   i8 imm;
    648 } X64AluRmImm8;
    649 static inline u32 x64_alu_imm8_pack(X64AluRmImm8 f, u8* out) {
    650   u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg);
    651   out[n++] = X64_OPC_ALU_IMM8;
    652   out[n++] = x64_modrm(3u, f.sub, f.reg);
    653   out[n++] = (u8)f.imm;
    654   return n;
    655 }
    656 
    657 /* ---- X64_FMT_ALU_RM_IMM32: 81 /sub id (sign-extended for w=1). ---- */
    658 typedef struct X64AluRmImm32 {
    659   int w;
    660   u32 sub;
    661   u32 reg;
    662   i32 imm;
    663 } X64AluRmImm32;
    664 static inline u32 x64_alu_imm32_pack(X64AluRmImm32 f, u8* out) {
    665   u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg);
    666   out[n++] = X64_OPC_ALU_IMM32;
    667   out[n++] = x64_modrm(3u, f.sub, f.reg);
    668   n += x64_put_u32le(out + n, (u32)f.imm);
    669   return n;
    670 }
    671 
    672 /* ---- X64_FMT_IMUL_RR: 0F AF /r — IMUL r, r/m ---- */
    673 typedef struct X64ImulRR {
    674   int w;
    675   u32 dst;
    676   u32 src;
    677 } X64ImulRR;
    678 static inline u32 x64_imul_rr_pack(X64ImulRR f, u8* out) {
    679   u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src);
    680   out[n++] = X64_OPC_TWOBYTE;
    681   out[n++] = X64_OPC_IMUL_2B;
    682   n += x64_pack_rm_reg(out + n, f.dst, f.src);
    683   return n;
    684 }
    685 
    686 /* ---- X64_FMT_IMUL_RRI: 6B /r ib (imm8) or 69 /r id (imm32). ---- */
    687 typedef struct X64ImulRRI {
    688   int w;
    689   int imm32; /* 1 → 0x69 with imm32; 0 → 0x6B with imm8 */
    690   u32 dst;
    691   u32 src;
    692   i32 imm; /* sign-extended; for imm32=0, only low byte used */
    693 } X64ImulRRI;
    694 static inline u32 x64_imul_rri_pack(X64ImulRRI f, u8* out) {
    695   u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src);
    696   out[n++] = f.imm32 ? X64_OPC_IMUL_IMM32 : X64_OPC_IMUL_IMM8;
    697   out[n++] = x64_modrm(3u, f.dst, f.src);
    698   if (f.imm32)
    699     n += x64_put_u32le(out + n, (u32)f.imm);
    700   else
    701     out[n++] = (u8)(i8)f.imm;
    702   return n;
    703 }
    704 
    705 /* ---- X64_FMT_F7_RM: F7 /sub — NOT/NEG/MUL/IMUL/DIV/IDIV (reg). ---- */
    706 typedef struct X64F7RM {
    707   int w;
    708   u32 sub;
    709   u32 reg;
    710 } X64F7RM;
    711 static inline u32 x64_f7_rm_pack(X64F7RM f, u8* out) {
    712   u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg);
    713   out[n++] = X64_OPC_F7;
    714   n += x64_pack_rm_reg(out + n, f.sub, f.reg);
    715   return n;
    716 }
    717 
    718 /* ---- X64_FMT_SHIFT_IMM: C1 /sub ib (reg). ---- */
    719 typedef struct X64ShiftImm {
    720   int w;
    721   u32 sub;
    722   u32 reg;
    723   u8 imm;
    724 } X64ShiftImm;
    725 static inline u32 x64_shift_imm_pack(X64ShiftImm f, u8* out) {
    726   u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg);
    727   out[n++] = X64_OPC_SHIFT_IMM;
    728   out[n++] = x64_modrm(3u, f.sub, f.reg);
    729   out[n++] = f.imm;
    730   return n;
    731 }
    732 
    733 /* ---- X64_FMT_SHIFT_CL: D3 /sub (reg, %cl). ---- */
    734 typedef struct X64ShiftCL {
    735   int w;
    736   u32 sub;
    737   u32 reg;
    738 } X64ShiftCL;
    739 static inline u32 x64_shift_cl_pack(X64ShiftCL f, u8* out) {
    740   u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg);
    741   out[n++] = X64_OPC_SHIFT_CL;
    742   n += x64_pack_rm_reg(out + n, f.sub, f.reg);
    743   return n;
    744 }
    745 
    746 /* ---- X64_FMT_SETCC_RM: 0F 9x /0 r/m8 — force REX so byte-reg works. ---- */
    747 typedef struct X64Setcc {
    748   u32 cc;
    749   u32 reg;
    750 } X64Setcc;
    751 static inline u32 x64_setcc_pack(X64Setcc f, u8* out) {
    752   u32 n = x64_pack_rex_force(out, 0, 0, 0, f.reg);
    753   out[n++] = X64_OPC_TWOBYTE;
    754   out[n++] = (u8)(X64_OPC_SETCC_BASE | (f.cc & 0xFu));
    755   n += x64_pack_rm_reg(out + n, 0u, f.reg);
    756   return n;
    757 }
    758 
    759 /* ---- SSE scalar reg-reg / load / store: {pfx?} 0F xx /r. ---- */
    760 typedef struct X64SseRR {
    761   u8 prefix; /* 0 / 0x66 / 0xF2 / 0xF3 */
    762   u8 opcode;
    763   int w; /* REX.W for 64-bit CVTSI2 / CVTT2SI forms */
    764   u32 dst;
    765   u32 src;
    766 } X64SseRR;
    767 static inline u32 x64_sse_rr_pack(X64SseRR f, u8* out) {
    768   u32 n = 0;
    769   if (f.prefix) out[n++] = f.prefix;
    770   n += x64_pack_rex(out + n, f.w, f.dst, 0, f.src);
    771   out[n++] = X64_OPC_TWOBYTE;
    772   out[n++] = f.opcode;
    773   n += x64_pack_rm_reg(out + n, f.dst, f.src);
    774   return n;
    775 }
    776 
    777 typedef struct X64SseMem {
    778   u8 prefix;
    779   u8 opcode;
    780   u32 reg;
    781   u32 base;
    782   i32 disp;
    783 } X64SseMem;
    784 static inline u32 x64_sse_mem_pack(X64SseMem f, u8* out) {
    785   u32 n = 0;
    786   if (f.prefix) out[n++] = f.prefix;
    787   n += x64_pack_rex(out + n, 0, f.reg, 0, f.base);
    788   out[n++] = X64_OPC_TWOBYTE;
    789   out[n++] = f.opcode;
    790   n += x64_pack_mem(out + n, f.reg, f.base, f.disp);
    791   return n;
    792 }
    793 
    794 #endif