isa.h (26954B)
1 /* x86_64 ISA descriptors — single source of truth for every instruction 2 * the encoder and decoder need to agree on. 3 * 4 * Unlike aarch64 (fixed 32-bit insns, identified by `(word & mask) == match`), 5 * x86_64 instructions are variable-length 1..15 bytes: 6 * 7 * [ legacy_pfx? ][ REX? ][ opcode 1..3B ][ ModR/M? ][ SIB? ] 8 * [ disp 0/1/4B ][ imm 0/1/2/4/8B ] 9 * 10 * So the table is keyed piecewise: 11 * - leg_pfx one of {none, 0x66, 0xF2, 0xF3} 12 * - opc[1..3] opcode bytes (the last byte may have a low-bit mask 13 * for embed-reg forms like PUSH r64 = 50+rd) 14 * - modrm_reg 0..7 for `/digit` opcode extension, 0xFF for `/r` 15 * or no-ModR/M 16 * - rex_w_req WIDTH_ANY / WIDTH_1 / WIDTH_0 17 * - fmt X64Format enum (operand shape) 18 * 19 * Disasm flow: 20 * X64DecodeCtx ctx; x64_decode_prefixes(bytes, len, &ctx); 21 * const X64InsnDesc* d = x64_disasm_find(bytes, len, &ctx); 22 * x64_print_operands(sb, d, bytes, len, &ctx, vaddr) → total byte length. 23 * 24 * Encoder migration is staged separately: phase 1 ships the descriptors and 25 * decode side; phase 2 swaps each emit_* function body to use the per-format 26 * pack helpers below; phase 3 refactors asm.c around the same table. */ 27 28 #ifndef KIT_X64_ISA_H 29 #define KIT_X64_ISA_H 30 31 #include "core/bytes.h" 32 #include "core/core.h" 33 #include "core/slice.h" 34 #include "core/strbuf.h" 35 36 /* ---- GPR numbering (DWARF / ABI matches HW encoding 0..15) ---- */ 37 enum { 38 X64_RAX = 0, 39 X64_RCX = 1, 40 X64_RDX = 2, 41 X64_RBX = 3, 42 X64_RSP = 4, 43 X64_RBP = 5, 44 X64_RSI = 6, 45 X64_RDI = 7, 46 X64_R8 = 8, 47 X64_R9 = 9, 48 X64_R10 = 10, 49 X64_R11 = 11, 50 X64_R12 = 12, 51 X64_R13 = 13, 52 X64_R14 = 14, 53 X64_R15 = 15, 54 }; 55 56 /* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. */ 57 enum { 58 X64_XMM0 = 0, 59 X64_XMM1 = 1, 60 X64_XMM2 = 2, 61 X64_XMM3 = 3, 62 X64_XMM4 = 4, 63 X64_XMM5 = 5, 64 X64_XMM6 = 6, 65 X64_XMM7 = 7, 66 X64_XMM8 = 8, 67 X64_XMM15 = 15, 68 }; 69 70 /* Condition codes for Jcc / SETcc / CMOVcc. Encoded in the low nibble. */ 71 enum { 72 X64_CC_O = 0x0, 73 X64_CC_NO = 0x1, 74 X64_CC_B = 0x2, /* below / CF=1 → CMP_LT_U */ 75 X64_CC_AE = 0x3, /* above-or-equal / CF=0 → CMP_GE_U */ 76 X64_CC_E = 0x4, /* equal / ZF=1 → CMP_EQ */ 77 X64_CC_NE = 0x5, /* → CMP_NE */ 78 X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1 → CMP_LE_U */ 79 X64_CC_A = 0x7, /* above / CF=0 and ZF=0 → CMP_GT_U */ 80 X64_CC_S = 0x8, 81 X64_CC_NS = 0x9, 82 X64_CC_P = 0xA, 83 X64_CC_NP = 0xB, 84 X64_CC_L = 0xC, /* less (signed) / SF!=OF → CMP_LT_S */ 85 X64_CC_GE = 0xD, /* → CMP_GE_S */ 86 X64_CC_LE = 0xE, /* less-or-equal (signed) → CMP_LE_S */ 87 X64_CC_G = 0xF, /* greater → CMP_GT_S */ 88 }; 89 90 /* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. */ 91 #define X64_REX_BASE 0x40u 92 #define X64_REX_W 0x08u 93 #define X64_REX_R 0x04u 94 #define X64_REX_X 0x02u 95 #define X64_REX_B 0x01u 96 97 /* ---- Branch / NOP encoding constants ---- 98 * 99 * Used by the linker to emit PLT entries and IPLT stubs without 100 * sprinkling raw hex into src/arch/x64/link.c. The shape is always the 101 * same RIP-relative indirect JMP plus padding NOPs. */ 102 103 /* JMP r/m64 — opcode FF /4. ModR/M for the RIP+disp32 form is 104 * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25. */ 105 #define X64_OP_JMP_RM64 0xFFu 106 #define X64_MODRM_JMP_RIPREL 0x25u 107 108 /* Single-byte NOP. */ 109 #define X64_NOP1 0x90u 110 111 /* Intel multi-byte ("long") NOP forms. The 6-byte form is the 112 * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)). */ 113 #define X64_NOP6_BYTE0 0x66u 114 #define X64_NOP6_BYTE1 0x0Fu 115 #define X64_NOP6_BYTE2 0x1Fu 116 #define X64_NOP6_BYTE3 0x44u 117 #define X64_NOP6_BYTE4 0x00u 118 #define X64_NOP6_BYTE5 0x00u 119 120 /* Sizes of the encoded forms above. */ 121 #define X64_JMP_RIPREL_SIZE 6u 122 #define X64_NOP6_SIZE 6u 123 124 /* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */ 125 static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) { 126 dst[0] = X64_OP_JMP_RM64; 127 dst[1] = X64_MODRM_JMP_RIPREL; 128 wr_u32_le(dst + 2, (u32)disp32); 129 } 130 131 /* Fill nbytes at dst with single-byte NOPs (0x90). Matches the 132 * existing memset-then-patch pattern used to pad PLT entries to 16. */ 133 static inline void x64_write_nop_pad(u8* dst, u32 nbytes) { 134 u32 i; 135 for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1; 136 } 137 138 /* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at 139 * dst. Used to pad the IPLT stub from 6 → 12 bytes. */ 140 static inline void x64_write_nop6(u8* dst) { 141 dst[0] = X64_NOP6_BYTE0; 142 dst[1] = X64_NOP6_BYTE1; 143 dst[2] = X64_NOP6_BYTE2; 144 dst[3] = X64_NOP6_BYTE3; 145 dst[4] = X64_NOP6_BYTE4; 146 dst[5] = X64_NOP6_BYTE5; 147 } 148 149 /* ==================================================================== 150 * Decode context — fills as we walk the prefix bytes. Disasm and 151 * (eventually) inline-asm parsers share this so prefix accounting lives 152 * in one place. 153 * ==================================================================== */ 154 155 #define X64_PFX_NONE 0u 156 #define X64_PFX_66 0x66u /* operand-size override (16-bit) */ 157 #define X64_PFX_F2 0xF2u /* SSE scalar double / REPNE */ 158 #define X64_PFX_F3 0xF3u /* SSE scalar single / REP */ 159 160 #define X64_W_REQ_ANY 0u /* row matches either REX.W value */ 161 #define X64_W_REQ_1 1u /* row requires REX.W = 1 (64-bit form) */ 162 #define X64_W_REQ_0 2u /* row requires REX.W = 0 (force 32-bit form) */ 163 164 typedef struct X64DecodeCtx { 165 u8 leg_pfx; /* 0 / 0x66 / 0xF2 / 0xF3 (last seen wins) */ 166 u8 has_lock; 167 u8 has_rex; 168 u8 rex_w, rex_r, rex_x, rex_b; 169 u32 opc_off; /* offset of first opcode byte inside the instruction */ 170 } X64DecodeCtx; 171 172 /* Walk legacy prefix bytes (0x66 / 0xF2 / 0xF3 / 0xF0 LOCK) followed by an 173 * optional REX byte (0x40..0x4F). Fills `ctx` and returns the offset of the 174 * first non-prefix byte. */ 175 u32 x64_decode_prefixes(const u8* bytes, u32 len, X64DecodeCtx* ctx); 176 177 /* ==================================================================== 178 * Opcode constants used by both the descriptor table and pack helpers. 179 * 180 * Naming: X64_OPC_<class>_<mnemonic>. We promote the bytes the encoder 181 * emits (not every byte the decoder might see — alias rows in isa.c 182 * still spell their own opcode bytes inline). */ 183 184 /* ALU r/m, r — opcode picks the operation. */ 185 #define X64_OPC_ALU_ADD 0x01u 186 #define X64_OPC_ALU_OR 0x09u 187 #define X64_OPC_ALU_AND 0x21u 188 #define X64_OPC_ALU_SUB 0x29u 189 #define X64_OPC_ALU_XOR 0x31u 190 #define X64_OPC_ALU_CMP 0x39u 191 #define X64_OPC_ALU_TEST 0x85u 192 #define X64_OPC_MOV_RM_R 0x89u /* MOV r/m, r */ 193 #define X64_OPC_MOV_RM_R8 0x88u /* MOV r/m8, r8 */ 194 #define X64_OPC_MOV_R_RM 0x8Bu /* MOV r, r/m */ 195 #define X64_OPC_LEA 0x8Du 196 #define X64_OPC_MOVSXD 0x63u 197 198 /* ALU r/m, imm — /sub picks op. */ 199 #define X64_OPC_ALU_IMM8 0x83u 200 #define X64_OPC_ALU_IMM32 0x81u 201 #define X64_ALU_SUB_ADD 0u 202 #define X64_ALU_SUB_OR 1u 203 #define X64_ALU_SUB_AND 4u 204 #define X64_ALU_SUB_SUB 5u 205 #define X64_ALU_SUB_XOR 6u 206 #define X64_ALU_SUB_CMP 7u 207 208 /* MOV r, imm — B8+rd. */ 209 #define X64_OPC_MOV_RI 0xB8u 210 211 /* MOV r/m, imm — C6 /0 (byte) and C7 /0 (32/64 sign-extended imm32). */ 212 #define X64_OPC_MOV_RM_IMM8 0xC6u 213 #define X64_OPC_MOV_RM_IMM32 0xC7u 214 #define X64_MOV_RM_IMM_SUB 0u 215 216 /* IMUL r, r/m (two-byte) and IMUL r, r/m, imm. */ 217 #define X64_OPC_IMUL_2B 0xAFu /* preceded by 0x0F */ 218 #define X64_OPC_IMUL_IMM8 0x6Bu 219 #define X64_OPC_IMUL_IMM32 0x69u 220 221 /* F7 /sub family. */ 222 #define X64_OPC_F7 0xF7u 223 #define X64_F7_SUB_NOT 2u 224 #define X64_F7_SUB_NEG 3u 225 #define X64_F7_SUB_MUL 4u 226 #define X64_F7_SUB_IMUL 5u 227 #define X64_F7_SUB_DIV 6u 228 #define X64_F7_SUB_IDIV 7u 229 230 /* Shifts. */ 231 #define X64_OPC_SHIFT_IMM 0xC1u 232 #define X64_OPC_SHIFT_CL 0xD3u 233 #define X64_SHIFT_SUB_SHL 4u 234 #define X64_SHIFT_SUB_SHR 5u 235 #define X64_SHIFT_SUB_SAR 7u 236 237 /* MOVZX / MOVSX (preceded by 0x0F). */ 238 #define X64_OPC_MOVZX_B 0xB6u 239 #define X64_OPC_MOVZX_W 0xB7u 240 #define X64_OPC_MOVSX_B 0xBEu 241 #define X64_OPC_MOVSX_W 0xBFu 242 243 /* SETcc base, CMOVcc base (preceded by 0x0F, low nibble = cc). */ 244 #define X64_OPC_SETCC_BASE 0x90u 245 #define X64_OPC_CMOVCC_BASE 0x40u 246 247 /* Branches. */ 248 #define X64_OPC_JMP_REL32 0xE9u 249 #define X64_OPC_CALL_REL32 0xE8u 250 #define X64_OPC_JCC_BASE 0x80u /* preceded by 0x0F, low nibble = cc */ 251 252 /* Stack. */ 253 #define X64_OPC_PUSH_R 0x50u 254 #define X64_OPC_POP_R 0x58u 255 256 /* Misc. */ 257 #define X64_OPC_RET 0xC3u 258 #define X64_OPC_LEAVE 0xC9u 259 #define X64_OPC_CDQ_CQO 0x99u 260 #define X64_OPC_TWOBYTE 0x0Fu 261 262 /* 0x66 operand-size override, used to force 16-bit forms. */ 263 #define X64_OPSIZE_PFX 0x66u 264 265 /* ==================================================================== 266 * Format kinds — one per "encoding shape" kit's emit.c produces. 267 * The format determines how operands are recovered from the byte stream 268 * after the opcode bytes and how they print in AT&T syntax. 269 * ==================================================================== */ 270 271 typedef enum X64Format { 272 X64_FMT_NULLARY, /* no operands: RET, NOP, UD2, LEAVE, CDQ/CQO */ 273 X64_FMT_NOP_MULTI, /* multi-byte NOP family (66 0F 1F ...) */ 274 X64_FMT_PUSH_POP, /* 50+rd / 58+rd — register in low 3 bits */ 275 X64_FMT_MOV_RI, /* B8+rd imm{32,64} — width via REX.W */ 276 X64_FMT_ALU_RR, /* op r/m, r — ADD/OR/AND/SUB/XOR/CMP/MOV/TEST */ 277 X64_FMT_MOV_RM_LOAD, /* 8B /r — MOV r, r/m (also LEA via 8D /r) */ 278 X64_FMT_MOVZX_MOVSX, /* 0F B6/B7/BE/BF /r — width-extending loads */ 279 X64_FMT_MOVSXD, /* REX.W 63 /r — MOVSXD r64, r/m32 */ 280 X64_FMT_ALU_RM_IMM8, /* 83 /sub ib — ADD/OR/AND/SUB/XOR/CMP r/m, imm8 */ 281 X64_FMT_ALU_RM_IMM32, /* 81 /sub id — same family, imm32 */ 282 X64_FMT_IMUL_RR, /* 0F AF /r — IMUL r, r/m */ 283 X64_FMT_IMUL_RRI, /* 69/6B /r i{8,32} — IMUL r, r/m, imm */ 284 X64_FMT_F7_RM, /* F7 /sub — NOT/NEG/MUL/IMUL/DIV/IDIV */ 285 X64_FMT_SHIFT_IMM, /* C1 /sub ib — SHL/SHR/SAR r/m, imm8 */ 286 X64_FMT_SHIFT_CL, /* D3 /sub — SHL/SHR/SAR r/m, %cl */ 287 X64_FMT_JCC_REL32, /* 0F 8x rel32 — Jcc near */ 288 X64_FMT_JMP_REL32, /* E9 rel32 */ 289 X64_FMT_CALL_REL32, /* E8 rel32 */ 290 X64_FMT_BR_RM, /* FF /2 or /4 — call/jmp indirect r/m */ 291 X64_FMT_SETCC_RM, /* 0F 9x /0 r/m8 — SETcc */ 292 X64_FMT_CMOVCC_RR, /* 0F 4x /r — CMOVcc r, r/m */ 293 X64_FMT_SSE_RR, /* {F2|F3|66}? 0F xx /r — scalar FP reg-reg */ 294 X64_FMT_SSE_LOAD, /* same, dst <- [base+disp] */ 295 X64_FMT_SSE_STORE, /* same, [base+disp] <- src */ 296 X64_FMT_BSWAP, /* 0F C8+rd */ 297 X64_FMT_BS, /* 0F BC/BD /r — BSF/BSR */ 298 X64_FMT_POPCNT, /* F3 0F B8 /r */ 299 X64_FMT_XADD_MEM, /* LOCK 0F C1 /r — XADD m, r */ 300 X64_FMT_XCHG_MEM, /* 87 /r — XCHG r, m (LOCK implicit on mem dst) */ 301 X64_FMT_CMPXCHG_MEM, /* LOCK 0F B1 /r — CMPXCHG m, r */ 302 X64_FMT_RAW_BYTE, /* sentinel: render as `.byte 0xNN` (no match) */ 303 } X64Format; 304 305 #define X64_ASMFL_ALIAS \ 306 0x01u /* row is an alias spelling (prefer-on-decode) \ 307 */ 308 #define X64_ASMFL_W_FROM_REX 0x02u /* fmt picks width from ctx->rex_w */ 309 #define X64_ASMFL_FORCE_W64 0x04u /* fmt always 64-bit regardless of REX.W */ 310 #define X64_ASMFL_BYTE 0x08u /* fixed-byte operand (movb, setcc) */ 311 #define X64_ASMFL_W16 0x10u /* fixed 16-bit (via 0x66 prefix override) */ 312 313 /* ==================================================================== 314 * Descriptor table row. 315 * ==================================================================== */ 316 317 typedef struct X64InsnDesc { 318 Slice mnemonic; /* AT&T mnemonic without size suffix; printer adds 319 a size letter (b/w/l/q) based on fmt + ctx. */ 320 u8 leg_pfx; /* X64_PFX_NONE / 0x66 / 0xF2 / 0xF3 */ 321 u8 opc_len; /* 1..3 */ 322 u8 opc[3]; /* opcode bytes */ 323 u8 opc_last_mask; /* 0xFF for exact match on opc[opc_len-1]; 324 0xF8 for embed-reg in low 3 bits; 325 0xF0 for Jcc / SETcc / CMOVcc condition nibble */ 326 u8 modrm_reg; /* 0..7 if /digit, 0xFF otherwise */ 327 u8 rex_w_req; /* X64_W_REQ_* */ 328 u8 fmt; /* X64Format */ 329 u8 flags; /* X64_ASMFL_* */ 330 } X64InsnDesc; 331 332 extern const X64InsnDesc x64_insn_table[]; 333 extern const u32 x64_insn_table_n; 334 335 /* Linear scan after prefix decode. Sets `ctx->opc_off` to where opcode 336 * starts. Returns the matching descriptor, or NULL on no match (caller 337 * should emit a `.byte` fallback). On success, opc_off is unchanged; 338 * the caller can derive opc_end as opc_off + desc->opc_len. */ 339 const X64InsnDesc* x64_disasm_find(const u8* bytes, u32 len, X64DecodeCtx* ctx); 340 341 /* Render operand text for a matched descriptor into `sb` and return the 342 * total instruction length in bytes (from bytes[0], including any 343 * prefixes/ModR/M/SIB/disp/imm). Returns 0 if the encoding is truncated 344 * (caller falls back to a single-byte `.byte` rendering). `vaddr` is the 345 * instruction's virtual address for PC-relative formats; pass 0 if not 346 * known. The mnemonic itself is *not* written — caller emits desc->mnemonic 347 * (plus any size suffix it derives via x64_size_suffix_for). */ 348 u32 x64_print_operands(StrBuf* sb, const X64InsnDesc* desc, const u8* bytes, 349 u32 len, const X64DecodeCtx* ctx, u64 vaddr); 350 351 /* Returns the AT&T size suffix character ('b','w','l','q') the printer 352 * appends to mnemonics that depend on operand width. Returns 0 if the 353 * mnemonic carries its own width (Jcc, SETcc, MOVZX/MOVSX, SSE, etc.). */ 354 char x64_size_suffix_for(const X64InsnDesc* desc, const X64DecodeCtx* ctx); 355 356 /* Translate a condition nibble (low 4 bits of the second opcode byte for 357 * Jcc/SETcc/CMOVcc) to its AT&T suffix: "e", "ne", "ge", ... */ 358 const char* x64_cc_name(u8 cc); 359 360 /* ==================================================================== 361 * Pack helpers — encode-side counterpart of the decode dispatch above. 362 * 363 * Each helper builds one instruction into a caller-provided buffer and 364 * returns the number of bytes written. Callers must reserve at least 365 * 16 bytes; no single x86_64 instruction we emit exceeds 15. 366 * 367 * REX rules (shared by every reg/mem helper): 368 * - Emitted only when needed: W=1 or any of R/X/B nonzero. 369 * - Force-REX variants (suffix `_force_rex`) always emit a REX byte — 370 * required for byte-reg encodings that promote SIL/DIL/etc. 371 * 372 * ModR/M memory rules (handled by x64_pack_mem): 373 * - mod=0 for disp=0 unless (base & 7) == 5 (RBP/R13 — needs disp8=0). 374 * - mod=1 for disp in [-128,127]. 375 * - mod=2 for full disp32. 376 * - SIB byte required when (base & 7) == 4 (RSP/R12); index=4 (none). 377 * ==================================================================== */ 378 379 /* REX prefix byte builder. Returns 0 if no REX needed. */ 380 static inline u8 x64_make_rex(int w, u32 reg, u32 index, u32 rm) { 381 u8 r = 0; 382 if (w) r |= X64_REX_W; 383 if (reg & 8u) r |= X64_REX_R; 384 if (index & 8u) r |= X64_REX_X; 385 if (rm & 8u) r |= X64_REX_B; 386 return r ? (u8)(X64_REX_BASE | r) : 0u; 387 } 388 389 /* ModR/M byte builder. */ 390 static inline u8 x64_modrm(u32 mod, u32 reg, u32 rm) { 391 return (u8)(((mod & 3u) << 6) | ((reg & 7u) << 3) | (rm & 7u)); 392 } 393 394 /* SIB byte builder. */ 395 static inline u8 x64_sib(u32 scale, u32 index, u32 base) { 396 return (u8)(((scale & 3u) << 6) | ((index & 7u) << 3) | (base & 7u)); 397 } 398 399 /* ModR/M r/m encodings with special meaning: 400 * rm=100 → SIB byte follows. 401 * rm=101 with mod=00 → RIP-relative (disp32) or, in SIB.base, disp32-only. */ 402 #define X64_MODRM_RM_SIB 4u 403 #define X64_MODRM_RM_RIP_DISP32 5u 404 405 /* SIB.index=100 means "no index". */ 406 #define X64_SIB_NO_INDEX 4u 407 /* SIB.base=101 with mod=00 means "no base" (disp32 only). */ 408 #define X64_SIB_NO_BASE 5u 409 410 /* Pick ModR/M.mod from a (base,disp) memory operand: 411 * 0 → [base] (only if disp==0 and (base&7)!=5) 412 * 1 → [base + disp8] 413 * 2 → [base + disp32] */ 414 static inline u32 x64_disp_mod(u32 base, i32 disp) { 415 if (disp == 0 && (base & 7u) != 5u) return 0u; 416 if (disp >= -128 && disp <= 127) return 1u; 417 return 2u; 418 } 419 420 /* Append `n` little-endian bytes of `v` to out, return n. */ 421 static inline u32 x64_put_u32le(u8* out, u32 v) { 422 out[0] = (u8)v; 423 out[1] = (u8)(v >> 8); 424 out[2] = (u8)(v >> 16); 425 out[3] = (u8)(v >> 24); 426 return 4u; 427 } 428 static inline u32 x64_put_u64le(u8* out, u64 v) { 429 for (u32 i = 0; i < 8u; ++i) out[i] = (u8)(v >> (i * 8u)); 430 return 8u; 431 } 432 433 /* Pack a bare RIP-relative memory operand `[rip + disp32]` (no symbol). 434 * ModR/M mod=00, rm=101, followed by disp32; no SIB. */ 435 static inline u32 x64_pack_mem_rip(u8* out, u32 reg, i32 disp) { 436 out[0] = x64_modrm(0u, reg, X64_MODRM_RM_RIP_DISP32); 437 return 1u + x64_put_u32le(out + 1, (u32)disp); 438 } 439 440 /* Pack a memory operand (ModR/M + optional SIB + optional disp) for the 441 * `reg` operand and `[base + disp]` r/m operand. Returns bytes written. */ 442 static inline u32 x64_pack_mem(u8* out, u32 reg, u32 base, i32 disp) { 443 u32 m = x64_disp_mod(base, disp); 444 u32 n = 0; 445 if ((base & 7u) == 4u) { 446 out[n++] = x64_modrm(m, reg, 4u); 447 out[n++] = x64_sib(0u, 4u, base); 448 } else { 449 out[n++] = x64_modrm(m, reg, base); 450 } 451 if (m == 1u) { 452 out[n++] = (u8)(i8)disp; 453 } else if (m == 2u) { 454 n += x64_put_u32le(out + n, (u32)disp); 455 } 456 return n; 457 } 458 459 /* Pack a SIB-form memory operand `[base + index*scale + disp]`. Emits SIB 460 * unconditionally; pass index = 4 (RSP) for the no-index case (the SIB 461 * "no index" encoding). `log2_scale` ∈ {0,1,2,3} for byte scale 1/2/4/8. 462 * 463 * RBP/R13 base needs at least disp8 even when disp == 0 (mod=00 with 464 * SIB base=5 means "no base, disp32 only"). RSP/R12 base requires SIB 465 * regardless — which is what this helper provides. */ 466 static inline u32 x64_pack_mem_sib(u8* out, u32 reg, u32 base, u32 index, 467 u32 log2_scale, i32 disp) { 468 /* For SIB base encoding, base=5 (RBP/R13) cannot use mod=0; force 469 * disp8/disp32. Other bases can use the standard mod selection. */ 470 u32 m; 471 if ((base & 7u) == 5u && disp == 0) { 472 m = 1u; /* disp8 = 0 */ 473 } else if (disp == 0) { 474 m = 0u; 475 } else if (disp >= -128 && disp <= 127) { 476 m = 1u; 477 } else { 478 m = 2u; 479 } 480 u32 n = 0; 481 out[n++] = x64_modrm(m, reg, 4u); /* r/m = 4 → SIB follows */ 482 out[n++] = x64_sib(log2_scale & 3u, index, base); 483 if (m == 1u) { 484 out[n++] = (u8)(i8)disp; 485 } else if (m == 2u) { 486 n += x64_put_u32le(out + n, (u32)disp); 487 } 488 return n; 489 } 490 491 /* Pack a reg-form ModR/M (mod=3) — one byte. */ 492 static inline u32 x64_pack_rm_reg(u8* out, u32 reg, u32 rm) { 493 out[0] = x64_modrm(3u, reg, rm); 494 return 1u; 495 } 496 497 /* Emit an optional REX (only if needed) and return bytes written (0 or 1). */ 498 static inline u32 x64_pack_rex(u8* out, int w, u32 reg, u32 index, u32 rm) { 499 u8 r = x64_make_rex(w, reg, index, rm); 500 if (!r) return 0u; 501 out[0] = r; 502 return 1u; 503 } 504 /* Always emit a REX byte (force form). */ 505 static inline u32 x64_pack_rex_force(u8* out, int w, u32 reg, u32 index, 506 u32 rm) { 507 out[0] = 508 (u8)(X64_REX_BASE | (w ? X64_REX_W : 0u) | ((reg & 8u) ? X64_REX_R : 0u) | 509 ((index & 8u) ? X64_REX_X : 0u) | ((rm & 8u) ? X64_REX_B : 0u)); 510 return 1u; 511 } 512 513 /* ---- X64_FMT_NULLARY: one or two opcode bytes, no operands. ---- */ 514 typedef struct X64Nullary { 515 u8 prefix; /* legacy prefix or 0 */ 516 int w; /* if nonzero, force REX.W (used by CQO) */ 517 u8 opc0; 518 u8 opc1; /* 0 if unused */ 519 } X64Nullary; 520 static inline u32 x64_nullary_pack(X64Nullary f, u8* out) { 521 u32 n = 0; 522 if (f.prefix) out[n++] = f.prefix; 523 if (f.w) out[n++] = (u8)(X64_REX_BASE | X64_REX_W); 524 out[n++] = f.opc0; 525 if (f.opc1) out[n++] = f.opc1; 526 return n; 527 } 528 529 /* ---- X64_FMT_ALU_RR: op r/m, r (reg-reg form). ---- 530 * REX(w, src, 0, dst) | op | ModR/M(3, src, dst) 531 * 532 * `op` selects the operation (ADD/OR/AND/SUB/XOR/CMP/MOV/TEST). */ 533 typedef struct X64AluRR { 534 int w; 535 u8 op; 536 u32 dst; /* r/m */ 537 u32 src; /* reg */ 538 } X64AluRR; 539 static inline u32 x64_alu_rr_pack(X64AluRR f, u8* out) { 540 u32 n = x64_pack_rex(out, f.w, f.src, 0, f.dst); 541 out[n++] = f.op; 542 n += x64_pack_rm_reg(out + n, f.src, f.dst); 543 return n; 544 } 545 546 /* ---- X64_FMT_ALU_RR memory form: op [base+disp], r ---- 547 * Optional 0x66 | REX(w, src, 0, base) | op | mem(src, base, disp) 548 * `force_rex` matches emit_mov_store size=1 (byte-reg promotion). */ 549 typedef struct X64AluRM { 550 u8 prefix; /* 0 or 0x66 */ 551 int w; 552 u8 op; 553 int force_rex; /* 1 → always emit REX (byte-reg form) */ 554 u32 src; /* reg operand */ 555 u32 base; /* memory base */ 556 i32 disp; 557 } X64AluRM; 558 static inline u32 x64_alu_rm_pack(X64AluRM f, u8* out) { 559 u32 n = 0; 560 if (f.prefix) out[n++] = f.prefix; 561 if (f.force_rex) 562 n += x64_pack_rex_force(out + n, f.w, f.src, 0, f.base); 563 else 564 n += x64_pack_rex(out + n, f.w, f.src, 0, f.base); 565 out[n++] = f.op; 566 n += x64_pack_mem(out + n, f.src, f.base, f.disp); 567 return n; 568 } 569 570 /* ---- X64_FMT_MOV_RI: MOV r, imm — opcode B8+rd ---- */ 571 typedef struct X64MovRI { 572 int is64; 573 u32 dst; 574 i64 imm; 575 } X64MovRI; 576 static inline u32 x64_mov_ri_pack(X64MovRI f, u8* out) { 577 u32 n = x64_pack_rex(out, f.is64 ? 1 : 0, 0, 0, f.dst); 578 out[n++] = (u8)(X64_OPC_MOV_RI | (f.dst & 7u)); 579 if (f.is64) 580 n += x64_put_u64le(out + n, (u64)f.imm); 581 else 582 n += x64_put_u32le(out + n, (u32)f.imm); 583 return n; 584 } 585 586 /* ---- X64_FMT_MOV_RM_LOAD (8B /r) and LEA (8D /r) — register dst, memory src. 587 * Also covers MOVZX/MOVSX with memory source (two-byte opcode). ---- */ 588 typedef struct X64MovRMLoad { 589 int w; 590 u8 opc0; /* primary opcode byte */ 591 u8 opc1; /* 0 for one-byte opcode; nonzero = 0F xx form */ 592 u32 dst; /* reg */ 593 u32 base; /* mem base */ 594 i32 disp; 595 } X64MovRMLoad; 596 static inline u32 x64_mov_rm_load_pack(X64MovRMLoad f, u8* out) { 597 u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.base); 598 if (f.opc1) { 599 out[n++] = X64_OPC_TWOBYTE; 600 out[n++] = f.opc1; 601 } else { 602 out[n++] = f.opc0; 603 } 604 n += x64_pack_mem(out + n, f.dst, f.base, f.disp); 605 return n; 606 } 607 608 /* ---- X64_FMT_MOVZX_MOVSX reg-reg form: 0F xx /r ---- 609 * `force_rex` distinguishes byte-source forms (need REX even when no high 610 * regs) from the word-source form. */ 611 typedef struct X64MovzxRR { 612 int w; 613 u8 opc1; /* B6 / B7 / BE / BF */ 614 int force_rex; 615 u32 dst; 616 u32 src; 617 } X64MovzxRR; 618 static inline u32 x64_movzx_rr_pack(X64MovzxRR f, u8* out) { 619 u32 n; 620 if (f.force_rex) 621 n = x64_pack_rex_force(out, f.w, f.dst, 0, f.src); 622 else 623 n = x64_pack_rex(out, f.w, f.dst, 0, f.src); 624 out[n++] = X64_OPC_TWOBYTE; 625 out[n++] = f.opc1; 626 n += x64_pack_rm_reg(out + n, f.dst, f.src); 627 return n; 628 } 629 630 /* ---- X64_FMT_MOVSXD: REX.W 63 /r — MOVSXD r64, r32 ---- */ 631 typedef struct X64Movsxd { 632 u32 dst; 633 u32 src; 634 } X64Movsxd; 635 static inline u32 x64_movsxd_pack(X64Movsxd f, u8* out) { 636 u32 n = x64_pack_rex(out, 1, f.dst, 0, f.src); 637 out[n++] = X64_OPC_MOVSXD; 638 n += x64_pack_rm_reg(out + n, f.dst, f.src); 639 return n; 640 } 641 642 /* ---- X64_FMT_ALU_RM_IMM8: 83 /sub ib (sign-extended) — reg-form. ---- */ 643 typedef struct X64AluRmImm8 { 644 int w; 645 u32 sub; 646 u32 reg; 647 i8 imm; 648 } X64AluRmImm8; 649 static inline u32 x64_alu_imm8_pack(X64AluRmImm8 f, u8* out) { 650 u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); 651 out[n++] = X64_OPC_ALU_IMM8; 652 out[n++] = x64_modrm(3u, f.sub, f.reg); 653 out[n++] = (u8)f.imm; 654 return n; 655 } 656 657 /* ---- X64_FMT_ALU_RM_IMM32: 81 /sub id (sign-extended for w=1). ---- */ 658 typedef struct X64AluRmImm32 { 659 int w; 660 u32 sub; 661 u32 reg; 662 i32 imm; 663 } X64AluRmImm32; 664 static inline u32 x64_alu_imm32_pack(X64AluRmImm32 f, u8* out) { 665 u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); 666 out[n++] = X64_OPC_ALU_IMM32; 667 out[n++] = x64_modrm(3u, f.sub, f.reg); 668 n += x64_put_u32le(out + n, (u32)f.imm); 669 return n; 670 } 671 672 /* ---- X64_FMT_IMUL_RR: 0F AF /r — IMUL r, r/m ---- */ 673 typedef struct X64ImulRR { 674 int w; 675 u32 dst; 676 u32 src; 677 } X64ImulRR; 678 static inline u32 x64_imul_rr_pack(X64ImulRR f, u8* out) { 679 u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src); 680 out[n++] = X64_OPC_TWOBYTE; 681 out[n++] = X64_OPC_IMUL_2B; 682 n += x64_pack_rm_reg(out + n, f.dst, f.src); 683 return n; 684 } 685 686 /* ---- X64_FMT_IMUL_RRI: 6B /r ib (imm8) or 69 /r id (imm32). ---- */ 687 typedef struct X64ImulRRI { 688 int w; 689 int imm32; /* 1 → 0x69 with imm32; 0 → 0x6B with imm8 */ 690 u32 dst; 691 u32 src; 692 i32 imm; /* sign-extended; for imm32=0, only low byte used */ 693 } X64ImulRRI; 694 static inline u32 x64_imul_rri_pack(X64ImulRRI f, u8* out) { 695 u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src); 696 out[n++] = f.imm32 ? X64_OPC_IMUL_IMM32 : X64_OPC_IMUL_IMM8; 697 out[n++] = x64_modrm(3u, f.dst, f.src); 698 if (f.imm32) 699 n += x64_put_u32le(out + n, (u32)f.imm); 700 else 701 out[n++] = (u8)(i8)f.imm; 702 return n; 703 } 704 705 /* ---- X64_FMT_F7_RM: F7 /sub — NOT/NEG/MUL/IMUL/DIV/IDIV (reg). ---- */ 706 typedef struct X64F7RM { 707 int w; 708 u32 sub; 709 u32 reg; 710 } X64F7RM; 711 static inline u32 x64_f7_rm_pack(X64F7RM f, u8* out) { 712 u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); 713 out[n++] = X64_OPC_F7; 714 n += x64_pack_rm_reg(out + n, f.sub, f.reg); 715 return n; 716 } 717 718 /* ---- X64_FMT_SHIFT_IMM: C1 /sub ib (reg). ---- */ 719 typedef struct X64ShiftImm { 720 int w; 721 u32 sub; 722 u32 reg; 723 u8 imm; 724 } X64ShiftImm; 725 static inline u32 x64_shift_imm_pack(X64ShiftImm f, u8* out) { 726 u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); 727 out[n++] = X64_OPC_SHIFT_IMM; 728 out[n++] = x64_modrm(3u, f.sub, f.reg); 729 out[n++] = f.imm; 730 return n; 731 } 732 733 /* ---- X64_FMT_SHIFT_CL: D3 /sub (reg, %cl). ---- */ 734 typedef struct X64ShiftCL { 735 int w; 736 u32 sub; 737 u32 reg; 738 } X64ShiftCL; 739 static inline u32 x64_shift_cl_pack(X64ShiftCL f, u8* out) { 740 u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); 741 out[n++] = X64_OPC_SHIFT_CL; 742 n += x64_pack_rm_reg(out + n, f.sub, f.reg); 743 return n; 744 } 745 746 /* ---- X64_FMT_SETCC_RM: 0F 9x /0 r/m8 — force REX so byte-reg works. ---- */ 747 typedef struct X64Setcc { 748 u32 cc; 749 u32 reg; 750 } X64Setcc; 751 static inline u32 x64_setcc_pack(X64Setcc f, u8* out) { 752 u32 n = x64_pack_rex_force(out, 0, 0, 0, f.reg); 753 out[n++] = X64_OPC_TWOBYTE; 754 out[n++] = (u8)(X64_OPC_SETCC_BASE | (f.cc & 0xFu)); 755 n += x64_pack_rm_reg(out + n, 0u, f.reg); 756 return n; 757 } 758 759 /* ---- SSE scalar reg-reg / load / store: {pfx?} 0F xx /r. ---- */ 760 typedef struct X64SseRR { 761 u8 prefix; /* 0 / 0x66 / 0xF2 / 0xF3 */ 762 u8 opcode; 763 int w; /* REX.W for 64-bit CVTSI2 / CVTT2SI forms */ 764 u32 dst; 765 u32 src; 766 } X64SseRR; 767 static inline u32 x64_sse_rr_pack(X64SseRR f, u8* out) { 768 u32 n = 0; 769 if (f.prefix) out[n++] = f.prefix; 770 n += x64_pack_rex(out + n, f.w, f.dst, 0, f.src); 771 out[n++] = X64_OPC_TWOBYTE; 772 out[n++] = f.opcode; 773 n += x64_pack_rm_reg(out + n, f.dst, f.src); 774 return n; 775 } 776 777 typedef struct X64SseMem { 778 u8 prefix; 779 u8 opcode; 780 u32 reg; 781 u32 base; 782 i32 disp; 783 } X64SseMem; 784 static inline u32 x64_sse_mem_pack(X64SseMem f, u8* out) { 785 u32 n = 0; 786 if (f.prefix) out[n++] = f.prefix; 787 n += x64_pack_rex(out + n, 0, f.reg, 0, f.base); 788 out[n++] = X64_OPC_TWOBYTE; 789 out[n++] = f.opcode; 790 n += x64_pack_mem(out + n, f.reg, f.base, f.disp); 791 return n; 792 } 793 794 #endif