kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

isa.c (44498B)


      1 /* x86_64 instruction descriptor table + operand print/decode dispatch.
      2  *
      3  * The table mirrors every encoding `src/arch/x64/emit.c` produces, plus a
      4  * handful that show up via direct byte writes in arch/x64/{alloc,link,ops}.c
      5  * (CALL/JMP rel32, PUSH/POP r64, multi-byte NOP, atomic prefixes).  Each
      6  * row pins down (leg_pfx, opcode bytes, /digit) so the disassembler can
      7  * identify a raw byte stream with one linear pass and then dispatch on
      8  * the format to render operands.
      9  *
     10  * Row ordering: first-match wins.  Aliases (rows with X64_ASMFL_ALIAS)
     11  * sit BEFORE the canonical row they alias so the disassembler prefers
     12  * the alias spelling on output.  We keep aliases narrow today (e.g.,
     13  * SSE-prefixed forms naturally precede their no-prefix neighbours) — we
     14  * can add `xor %eax,%eax` zeroing-idiom aliases later if disasm output
     15  * needs them. */
     16 
     17 #include "arch/x64/isa.h"
     18 
     19 #include <stddef.h>
     20 #include <string.h>
     21 
     22 #include "core/bytes.h"
     23 
     24 /* ====================================================================
     25  * Table. Mnemonics are AT&T-style, lower-case, no size suffix; the
     26  * printer derives the size letter (b/w/l/q) from the fmt + REX.W where
     27  * appropriate.
     28  * ==================================================================== */
     29 
     30 #define ROW(mn, lp, ol, b0, b1, b2, lm, mr, wr, f, fl) \
     31   {{{(mn)}, sizeof(mn) - 1}, lp, ol, {b0, b1, b2}, lm, mr, wr, f, fl}
     32 #define NO_MODRM 0xFFu
     33 
     34 const X64InsnDesc x64_insn_table[] = {
     35     /* ---- single-byte nullary ---- */
     36     ROW("nop", X64_PFX_NONE, 1, 0x90, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     37         X64_FMT_NULLARY, 0),
     38     ROW("ret", X64_PFX_NONE, 1, 0xC3, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     39         X64_FMT_NULLARY, 0),
     40     ROW("leave", X64_PFX_NONE, 1, 0xC9, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     41         X64_FMT_NULLARY, 0),
     42     ROW("cltd", X64_PFX_NONE, 1, 0x99, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
     43         X64_FMT_NULLARY, 0),
     44     ROW("cqto", X64_PFX_NONE, 1, 0x99, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
     45         X64_FMT_NULLARY, 0),
     46 
     47     /* ---- two-byte UD2 ---- */
     48     ROW("ud2", X64_PFX_NONE, 2, 0x0F, 0x0B, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     49         X64_FMT_NULLARY, 0),
     50     /* ---- SYSCALL (0F 05): fast system call ---- */
     51     ROW("syscall", X64_PFX_NONE, 2, 0x0F, 0x05, 0, 0xFF, NO_MODRM,
     52         X64_W_REQ_ANY, X64_FMT_NULLARY, 0),
     53     ROW("mfence", X64_PFX_NONE, 3, 0x0F, 0xAE, 0xF0, 0xFF, NO_MODRM,
     54         X64_W_REQ_ANY, X64_FMT_NULLARY, 0),
     55 
     56     /* ---- multi-byte NOP: 66 0F 1F /0 ----
     57      * Matches the 6-byte canonical "NOPW 0(%rax,%rax,1)" kit emits to pad
     58      * the IPLT stub. The mod/rm bytes (and any disp) are consumed by the
     59      * NOP_MULTI printer. */
     60     ROW("nopw", X64_PFX_66, 2, 0x0F, 0x1F, 0, 0xFF, 0, X64_W_REQ_ANY,
     61         X64_FMT_NOP_MULTI, 0),
     62     ROW("nopl", X64_PFX_NONE, 2, 0x0F, 0x1F, 0, 0xFF, 0, X64_W_REQ_ANY,
     63         X64_FMT_NOP_MULTI, 0),
     64 
     65     /* ---- PUSH/POP r64 (embed-reg in low 3 bits) ---- */
     66     ROW("push", X64_PFX_NONE, 1, 0x50, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY,
     67         X64_FMT_PUSH_POP, X64_ASMFL_FORCE_W64),
     68     ROW("pop", X64_PFX_NONE, 1, 0x58, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY,
     69         X64_FMT_PUSH_POP, X64_ASMFL_FORCE_W64),
     70 
     71     /* ---- MOV r, imm — B8+rd; width via REX.W ----
     72      * imm32 form (no REX.W) and imm64 movabs form (REX.W=1) share the
     73      * same row; the printer reads ctx->rex_w to pick the imm width. */
     74     ROW("mov", X64_PFX_NONE, 1, 0xB8, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY,
     75         X64_FMT_MOV_RI, X64_ASMFL_W_FROM_REX),
     76 
     77     /* ---- ALU r/m, r — opcode picks op ---- */
     78     ROW("add", X64_PFX_NONE, 1, 0x01, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     79         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     80     ROW("or", X64_PFX_NONE, 1, 0x09, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     81         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     82     ROW("and", X64_PFX_NONE, 1, 0x21, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     83         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     84     ROW("sub", X64_PFX_NONE, 1, 0x29, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     85         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     86     ROW("xor", X64_PFX_NONE, 1, 0x31, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     87         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     88     ROW("cmp", X64_PFX_NONE, 1, 0x39, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     89         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     90     ROW("test", X64_PFX_NONE, 1, 0x85, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     91         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     92     ROW("mov", X64_PFX_NONE, 1, 0x89, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     93         X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX),
     94     /* Byte form: MOV r/m8, r8 — opcode 88 forces 1-byte operands. */
     95     ROW("mov", X64_PFX_NONE, 1, 0x88, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     96         X64_FMT_ALU_RR, X64_ASMFL_BYTE),
     97     /* 16-bit form: 0x66 prefix forces 2-byte operands. */
     98     ROW("mov", X64_PFX_66, 1, 0x89, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
     99         X64_FMT_ALU_RR, X64_ASMFL_W16),
    100 
    101     /* ---- MOV r, r/m  (load and reg-reg share opcode 8B) ----
    102      * 8B /r matches both r,r and r,[base+disp]; the printer dispatches on
    103      * ModR/M.mod. LEA is 8D /r — register-only ModR/M.mod=11 is illegal,
    104      * so we use a separate row keyed on the opcode. */
    105     ROW("mov", X64_PFX_NONE, 1, 0x8B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    106         X64_FMT_MOV_RM_LOAD, X64_ASMFL_W_FROM_REX),
    107     /* 16-bit r←r/m via 0x66 prefix. */
    108     ROW("mov", X64_PFX_66, 1, 0x8B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    109         X64_FMT_MOV_RM_LOAD, X64_ASMFL_W16),
    110     ROW("lea", X64_PFX_NONE, 1, 0x8D, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    111         X64_FMT_MOV_RM_LOAD, X64_ASMFL_W_FROM_REX),
    112 
    113     /* ---- MOVZX / MOVSX r{32,64}, r/m{8,16} ----
    114      * The destination width is the *l* (32-bit) form without REX.W and the *q*
    115      * (64-bit) form with it; split by W so the disassembler emits a mnemonic
    116      * whose size letter matches the printed register width (clang rejects a
    117      * `movsbl` with a 64-bit destination). Same opcodes; W disambiguates,
    118      * exactly like cltd/cqto (0x99). */
    119     ROW("movzbl", X64_PFX_NONE, 2, 0x0F, 0xB6, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    120         X64_FMT_MOVZX_MOVSX, 0),
    121     ROW("movzbq", X64_PFX_NONE, 2, 0x0F, 0xB6, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    122         X64_FMT_MOVZX_MOVSX, 0),
    123     ROW("movzwl", X64_PFX_NONE, 2, 0x0F, 0xB7, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    124         X64_FMT_MOVZX_MOVSX, 0),
    125     ROW("movzwq", X64_PFX_NONE, 2, 0x0F, 0xB7, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    126         X64_FMT_MOVZX_MOVSX, 0),
    127     ROW("movsbl", X64_PFX_NONE, 2, 0x0F, 0xBE, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    128         X64_FMT_MOVZX_MOVSX, 0),
    129     ROW("movsbq", X64_PFX_NONE, 2, 0x0F, 0xBE, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    130         X64_FMT_MOVZX_MOVSX, 0),
    131     ROW("movswl", X64_PFX_NONE, 2, 0x0F, 0xBF, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    132         X64_FMT_MOVZX_MOVSX, 0),
    133     ROW("movswq", X64_PFX_NONE, 2, 0x0F, 0xBF, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    134         X64_FMT_MOVZX_MOVSX, 0),
    135 
    136     /* ---- MOVSXD r64, r/m32 ---- */
    137     ROW("movslq", X64_PFX_NONE, 1, 0x63, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    138         X64_FMT_MOVSXD, 0),
    139 
    140     /* ---- ALU r/m, imm — /digit picks operation ----
    141      * 83 (imm8 sign-extended), 81 (imm32 sign-extended). One row per
    142      * (opcode, /digit) pair.
    143      *   /0 ADD  /1 OR   /4 AND  /5 SUB  /6 XOR  /7 CMP
    144      * (/2 ADC and /3 SBB are also valid in the Intel manual but kit
    145      *  doesn't emit them; they can land later as additional rows.) */
    146     ROW("add", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 0, X64_W_REQ_ANY,
    147         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    148     ROW("or", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 1, X64_W_REQ_ANY,
    149         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    150     ROW("and", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    151         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    152     ROW("sub", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 5, X64_W_REQ_ANY,
    153         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    154     ROW("xor", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 6, X64_W_REQ_ANY,
    155         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    156     ROW("cmp", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 7, X64_W_REQ_ANY,
    157         X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX),
    158     ROW("add", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 0, X64_W_REQ_ANY,
    159         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    160     ROW("or", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 1, X64_W_REQ_ANY,
    161         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    162     ROW("and", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    163         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    164     ROW("sub", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 5, X64_W_REQ_ANY,
    165         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    166     ROW("xor", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 6, X64_W_REQ_ANY,
    167         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    168     ROW("cmp", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 7, X64_W_REQ_ANY,
    169         X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX),
    170 
    171     /* ---- IMUL r, r/m (0F AF) / IMUL r, r/m, imm (69 / 6B) ---- */
    172     ROW("imul", X64_PFX_NONE, 2, 0x0F, 0xAF, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    173         X64_FMT_IMUL_RR, X64_ASMFL_W_FROM_REX),
    174     ROW("imul", X64_PFX_NONE, 1, 0x6B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    175         X64_FMT_IMUL_RRI, X64_ASMFL_W_FROM_REX),
    176     ROW("imul", X64_PFX_NONE, 1, 0x69, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    177         X64_FMT_IMUL_RRI, X64_ASMFL_W_FROM_REX | 0x80u /* imm32 */),
    178 
    179     /* ---- F7 /sub family (no immediate read except for /0 /1 which we
    180      *      don't emit) ---- */
    181     ROW("not", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 2, X64_W_REQ_ANY,
    182         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    183     ROW("neg", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 3, X64_W_REQ_ANY,
    184         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    185     ROW("mul", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    186         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    187     ROW("imul", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 5, X64_W_REQ_ANY,
    188         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    189     ROW("div", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 6, X64_W_REQ_ANY,
    190         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    191     ROW("idiv", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 7, X64_W_REQ_ANY,
    192         X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX),
    193 
    194     /* ---- Shifts ---- */
    195     ROW("rol", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 0, X64_W_REQ_ANY,
    196         X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX),
    197     ROW("ror", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 1, X64_W_REQ_ANY,
    198         X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX),
    199     ROW("shl", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    200         X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX),
    201     ROW("shr", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 5, X64_W_REQ_ANY,
    202         X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX),
    203     ROW("sar", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 7, X64_W_REQ_ANY,
    204         X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX),
    205     /* 16-bit ROL imm8 via 0x66 + C1 /0 — used by emit_rol16_imm8. */
    206     ROW("rol", X64_PFX_66, 1, 0xC1, 0, 0, 0xFF, 0, X64_W_REQ_ANY,
    207         X64_FMT_SHIFT_IMM, X64_ASMFL_W16),
    208     ROW("shl", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    209         X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX),
    210     ROW("shr", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 5, X64_W_REQ_ANY,
    211         X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX),
    212     ROW("sar", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 7, X64_W_REQ_ANY,
    213         X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX),
    214 
    215     /* ---- Branches ---- */
    216     /* Jcc near: 0F 80..8F rel32; condition in low 4 bits. The printer
    217      * picks the mnemonic from a per-condition table. */
    218     ROW("j", X64_PFX_NONE, 2, 0x0F, 0x80, 0, 0xF0, NO_MODRM, X64_W_REQ_ANY,
    219         X64_FMT_JCC_REL32, 0),
    220     ROW("jmp", X64_PFX_NONE, 1, 0xE9, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    221         X64_FMT_JMP_REL32, 0),
    222     ROW("callq", X64_PFX_NONE, 1, 0xE8, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    223         X64_FMT_CALL_REL32, 0),
    224     /* Indirect jmp / call via FF /4 or /2. */
    225     ROW("callq", X64_PFX_NONE, 1, 0xFF, 0, 0, 0xFF, 2, X64_W_REQ_ANY,
    226         X64_FMT_BR_RM, 0),
    227     ROW("jmpq", X64_PFX_NONE, 1, 0xFF, 0, 0, 0xFF, 4, X64_W_REQ_ANY,
    228         X64_FMT_BR_RM, 0),
    229 
    230     /* ---- SETcc / CMOVcc ----
    231      * SETcc condition in low 4 bits of 2nd opcode byte (0F 90..9F).
    232      * CMOVcc same encoding around 0F 40..4F. */
    233     ROW("set", X64_PFX_NONE, 2, 0x0F, 0x90, 0, 0xF0, 0, X64_W_REQ_ANY,
    234         X64_FMT_SETCC_RM, 0),
    235     ROW("cmov", X64_PFX_NONE, 2, 0x0F, 0x40, 0, 0xF0, NO_MODRM, X64_W_REQ_ANY,
    236         X64_FMT_CMOVCC_RR, X64_ASMFL_W_FROM_REX),
    237 
    238     /* ---- BSWAP r — 0F C8+rd ---- */
    239     ROW("bswap", X64_PFX_NONE, 2, 0x0F, 0xC8, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY,
    240         X64_FMT_BSWAP, X64_ASMFL_W_FROM_REX),
    241 
    242     /* ---- Bit scan: BSF / BSR ---- */
    243     ROW("bsf", X64_PFX_NONE, 2, 0x0F, 0xBC, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    244         X64_FMT_BS, X64_ASMFL_W_FROM_REX),
    245     ROW("bsr", X64_PFX_NONE, 2, 0x0F, 0xBD, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    246         X64_FMT_BS, X64_ASMFL_W_FROM_REX),
    247 
    248     /* ---- POPCNT — F3 0F B8 /r (note: F3 prefix is REQUIRED) ---- */
    249     ROW("popcnt", X64_PFX_F3, 2, 0x0F, 0xB8, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    250         X64_FMT_POPCNT, X64_ASMFL_W_FROM_REX),
    251 
    252     /* ---- Atomic primitives ---- */
    253     /* XADD m, r — 0F C1 /r (LOCK prefix is decoded separately) */
    254     ROW("xadd", X64_PFX_NONE, 2, 0x0F, 0xC1, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    255         X64_FMT_XADD_MEM, X64_ASMFL_W_FROM_REX),
    256     /* XCHG r, r/m — 0x87 /r */
    257     ROW("xchg", X64_PFX_NONE, 1, 0x87, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    258         X64_FMT_XCHG_MEM, X64_ASMFL_W_FROM_REX),
    259     /* CMPXCHG m, r — 0F B1 /r */
    260     ROW("cmpxchg", X64_PFX_NONE, 2, 0x0F, 0xB1, 0, 0xFF, NO_MODRM,
    261         X64_W_REQ_ANY, X64_FMT_CMPXCHG_MEM, X64_ASMFL_W_FROM_REX),
    262 
    263     /* ---- SSE scalar FP — F2/F3 0F xx /r ----
    264      * Three opcodes per (sd, ss) pair: arith / mov / cmp.  Each row pairs
    265      * the legacy prefix (selects sd vs ss) with the 0F xx /r opcode. */
    266     /* MOVSS / MOVSD */
    267     ROW("movsd", X64_PFX_F2, 2, 0x0F, 0x10, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    268         X64_FMT_SSE_RR, 0),
    269     ROW("movsd", X64_PFX_F2, 2, 0x0F, 0x11, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    270         X64_FMT_SSE_RR, X64_ASMFL_ALIAS),
    271     ROW("movss", X64_PFX_F3, 2, 0x0F, 0x10, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    272         X64_FMT_SSE_RR, 0),
    273     ROW("movss", X64_PFX_F3, 2, 0x0F, 0x11, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    274         X64_FMT_SSE_RR, X64_ASMFL_ALIAS),
    275     /* MOVAPS */
    276     ROW("movaps", X64_PFX_NONE, 2, 0x0F, 0x28, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    277         X64_FMT_SSE_RR, 0),
    278     ROW("movaps", X64_PFX_NONE, 2, 0x0F, 0x29, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    279         X64_FMT_SSE_RR, X64_ASMFL_ALIAS),
    280     /* ADD/SUB/MUL/DIV — opcodes 58/5C/59/5E (same byte for ss and sd;
    281      * prefix picks). */
    282     ROW("addsd", X64_PFX_F2, 2, 0x0F, 0x58, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    283         X64_FMT_SSE_RR, 0),
    284     ROW("addss", X64_PFX_F3, 2, 0x0F, 0x58, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    285         X64_FMT_SSE_RR, 0),
    286     ROW("mulsd", X64_PFX_F2, 2, 0x0F, 0x59, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    287         X64_FMT_SSE_RR, 0),
    288     ROW("mulss", X64_PFX_F3, 2, 0x0F, 0x59, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    289         X64_FMT_SSE_RR, 0),
    290     ROW("subsd", X64_PFX_F2, 2, 0x0F, 0x5C, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    291         X64_FMT_SSE_RR, 0),
    292     ROW("subss", X64_PFX_F3, 2, 0x0F, 0x5C, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    293         X64_FMT_SSE_RR, 0),
    294     ROW("divsd", X64_PFX_F2, 2, 0x0F, 0x5E, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    295         X64_FMT_SSE_RR, 0),
    296     ROW("divss", X64_PFX_F3, 2, 0x0F, 0x5E, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    297         X64_FMT_SSE_RR, 0),
    298     /* Compare scalar (UCOMISS / UCOMISD) */
    299     ROW("ucomisd", X64_PFX_66, 2, 0x0F, 0x2E, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    300         X64_FMT_SSE_RR, 0),
    301     ROW("ucomiss", X64_PFX_NONE, 2, 0x0F, 0x2E, 0, 0xFF, NO_MODRM,
    302         X64_W_REQ_ANY, X64_FMT_SSE_RR, 0),
    303     /* Conversions touched by FP↔int paths: CVTSI2SS/SD, CVTTSS/SD2SI. */
    304     ROW("cvtsi2sd", X64_PFX_F2, 2, 0x0F, 0x2A, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    305         X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX),
    306     ROW("cvtsi2ss", X64_PFX_F3, 2, 0x0F, 0x2A, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    307         X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX),
    308     ROW("cvttsd2si", X64_PFX_F2, 2, 0x0F, 0x2C, 0, 0xFF, NO_MODRM,
    309         X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX),
    310     ROW("cvttss2si", X64_PFX_F3, 2, 0x0F, 0x2C, 0, 0xFF, NO_MODRM,
    311         X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX),
    312     ROW("cvtsd2ss", X64_PFX_F2, 2, 0x0F, 0x5A, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    313         X64_FMT_SSE_RR, 0),
    314     ROW("cvtss2sd", X64_PFX_F3, 2, 0x0F, 0x5A, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    315         X64_FMT_SSE_RR, 0),
    316     /* MOVD/MOVQ between GPR and XMM. 66 0F 6E /r is gpr->xmm, 66 0F 7E /r is
    317      * xmm->gpr (note the reversed operand order, handled in print_xmm_rr).
    318      * REX.W picks movq (64-bit GPR) vs movd (32-bit), and since the *mnemonic*
    319      * itself changes we split into W_REQ_0 / W_REQ_1 rows rather than a width
    320      * suffix. The backend emits these for int<->FP bitcasts (emit_sse_rr_w). */
    321     ROW("movd", X64_PFX_66, 2, 0x0F, 0x6E, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    322         X64_FMT_SSE_RR, 0),
    323     ROW("movq", X64_PFX_66, 2, 0x0F, 0x6E, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    324         X64_FMT_SSE_RR, 0),
    325     ROW("movd", X64_PFX_66, 2, 0x0F, 0x7E, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
    326         X64_FMT_SSE_RR, 0),
    327     ROW("movq", X64_PFX_66, 2, 0x0F, 0x7E, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
    328         X64_FMT_SSE_RR, 0),
    329     /* XORPS / XORPD (0F 57, prefix selects packed-single vs -double). The
    330      * backend uses these to clear/negate FP registers. Both operands xmm. */
    331     ROW("xorps", X64_PFX_NONE, 2, 0x0F, 0x57, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    332         X64_FMT_SSE_RR, 0),
    333     ROW("xorpd", X64_PFX_66, 2, 0x0F, 0x57, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
    334         X64_FMT_SSE_RR, 0),
    335 };
    336 
    337 const u32 x64_insn_table_n =
    338     (u32)(sizeof x64_insn_table / sizeof x64_insn_table[0]);
    339 
    340 /* ====================================================================
    341  * Prefix decode.
    342  * ==================================================================== */
    343 
    344 u32 x64_decode_prefixes(const u8* bytes, u32 len, X64DecodeCtx* ctx) {
    345   u32 off = 0;
    346   memset(ctx, 0, sizeof *ctx);
    347   while (off < len) {
    348     u8 b = bytes[off];
    349     if (b == 0x66u || b == 0xF2u || b == 0xF3u) {
    350       ctx->leg_pfx = b;
    351       ++off;
    352       continue;
    353     }
    354     if (b == 0xF0u) {
    355       /* LOCK — ignored for opcode lookup but consumed so the
    356        * subsequent opcode aligns. The printer adds a "lock " prefix
    357        * separately when annotating, but kit's emit.c currently emits
    358        * LOCK only before XADD / XCHG / CMPXCHG. */
    359       ctx->has_lock = 1;
    360       ++off;
    361       continue;
    362     }
    363     break;
    364   }
    365   if (off < len && bytes[off] >= 0x40u && bytes[off] <= 0x4Fu) {
    366     u8 r = bytes[off];
    367     ctx->has_rex = 1;
    368     ctx->rex_w = (r >> 3) & 1u;
    369     ctx->rex_r = (r >> 2) & 1u;
    370     ctx->rex_x = (r >> 1) & 1u;
    371     ctx->rex_b = r & 1u;
    372     ++off;
    373   }
    374   ctx->opc_off = off;
    375   return off;
    376 }
    377 
    378 /* ====================================================================
    379  * Disassembler row lookup.
    380  * ==================================================================== */
    381 
    382 const X64InsnDesc* x64_disasm_find(const u8* bytes, u32 len,
    383                                    X64DecodeCtx* ctx) {
    384   if (ctx->opc_off >= len) return NULL;
    385   for (u32 i = 0; i < x64_insn_table_n; ++i) {
    386     const X64InsnDesc* d = &x64_insn_table[i];
    387     if (d->leg_pfx != ctx->leg_pfx) continue;
    388     if (d->rex_w_req == X64_W_REQ_1 && !ctx->rex_w) continue;
    389     if (d->rex_w_req == X64_W_REQ_0 && ctx->rex_w) continue;
    390     if (ctx->opc_off + d->opc_len > len) continue;
    391     /* Opcode bytes match exactly except the LAST byte, which may use
    392      * a low-bit mask (embed-reg or condition nibble). */
    393     int ok = 1;
    394     for (u32 j = 0; j + 1u < d->opc_len; ++j) {
    395       if (bytes[ctx->opc_off + j] != d->opc[j]) {
    396         ok = 0;
    397         break;
    398       }
    399     }
    400     if (!ok) continue;
    401     {
    402       u8 last_act = bytes[ctx->opc_off + d->opc_len - 1u] & d->opc_last_mask;
    403       u8 last_exp = d->opc[d->opc_len - 1u] & d->opc_last_mask;
    404       if (last_act != last_exp) continue;
    405     }
    406     /* /digit constraint reads ModR/M.reg. */
    407     if (d->modrm_reg != NO_MODRM) {
    408       u32 mrm_off = ctx->opc_off + d->opc_len;
    409       if (mrm_off >= len) continue;
    410       u8 mrm = bytes[mrm_off];
    411       if (((mrm >> 3) & 7u) != d->modrm_reg) continue;
    412     }
    413     return d;
    414   }
    415   return NULL;
    416 }
    417 
    418 /* ====================================================================
    419  * Operand printers.
    420  * ==================================================================== */
    421 
/* Register number one past the 0..15 GPR range.
 * NOTE(review): presumably a pseudo-register standing for %rip; it is not
 * referenced in this part of the file, so confirm against its users. */
#define X64_REG_RIP 16u

/* Condition-code mnemonic suffixes, one per 4-bit condition value
 * (0x0 = "o" ... 0xF = "g"). Presumably indexed by the low nibble of the
 * Jcc / SETcc / CMOVcc opcode and appended to the "j" / "set" / "cmov"
 * stems from the table; the consumers are outside this view. */
static const char* g_cc_name[16] = {
    "o", "no", "b", "ae", "e", "ne", "be", "a",
    "s", "ns", "p", "np", "l", "ge", "le", "g",
};
    428 
    429 /* AT&T register names by width. Index 0..15 covers RAX..R15. */
    430 static const char* reg_name(u32 reg, u32 width_bytes, int has_rex) {
    431   static const char* r64[16] = {
    432       "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
    433       "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
    434   };
    435   static const char* r32[16] = {
    436       "eax", "ecx", "edx",  "ebx",  "esp",  "ebp",  "esi",  "edi",
    437       "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
    438   };
    439   static const char* r16[16] = {
    440       "ax",  "cx",  "dx",   "bx",   "sp",   "bp",   "si",   "di",
    441       "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w",
    442   };
    443   static const char* r8[16] = {
    444       "al",  "cl",  "dl",   "bl",   "spl",  "bpl",  "sil",  "dil",
    445       "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b",
    446   };
    447   static const char* rh8[4] = {"ah", "ch", "dh", "bh"};
    448   reg &= 15u;
    449   if (width_bytes == 8) return r64[reg];
    450   if (width_bytes == 4) return r32[reg];
    451   if (width_bytes == 2) return r16[reg];
    452   if (!has_rex && reg >= 4u && reg <= 7u) return rh8[reg - 4u];
    453   return r8[reg];
    454 }
    455 
    456 static const char* xmm_name(u32 reg) {
    457   static const char* x[16] = {
    458       "xmm0", "xmm1", "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
    459       "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
    460   };
    461   return x[reg & 15u];
    462 }
    463 
    464 static void put_reg(StrBuf* sb, u32 reg, u32 width) {
    465   strbuf_putc(sb, '%');
    466   strbuf_puts(sb, reg_name(reg, width, 1));
    467 }
    468 static void put_reg_ctx(StrBuf* sb, u32 reg, u32 width, int has_rex) {
    469   strbuf_putc(sb, '%');
    470   strbuf_puts(sb, reg_name(reg, width, has_rex));
    471 }
    472 static void put_xmm(StrBuf* sb, u32 reg) {
    473   strbuf_putc(sb, '%');
    474   strbuf_puts(sb, xmm_name(reg));
    475 }
    476 static void put_imm(StrBuf* sb, i64 imm) {
    477   strbuf_putc(sb, '$');
    478   strbuf_put_i64(sb, imm);
    479 }
    480 
    481 /* Read a signed displacement of n bytes (1 or 4). Returns 1 on success. */
    482 static int read_disp(const u8* bytes, u32 len, u32 off, u32 n, i32* out) {
    483   if (off + n > len) return 0;
    484   if (n == 1u) {
    485     *out = (i32)(i8)bytes[off];
    486   } else if (n == 4u) {
    487     *out = (i32)rd_u32_le(bytes + off);
    488   } else {
    489     *out = 0;
    490   }
    491   return 1;
    492 }
    493 
/* Description of a decoded ModR/M memory operand (mod != 3), as produced by
 * decode_mem() and printed by put_mem(). decode_mem() is called with `off`
 * pointing at the first byte AFTER the ModR/M byte (the caller has already
 * split out mod and r/m) and returns the number of bytes consumed from
 * there, i.e. SIB (if any) plus displacement (if any), or (u32)-1 on
 * truncation. */
typedef struct DecodedMem {
  u32 base;  /* base register number with REX.B folded in; meaningful when
              * has_base is set and rip_relative is not */
  u32 index; /* SIB index register (valid when has_index) */
  u32 scale; /* SIB scale as the literal 1/2/4/8 (valid when has_index) */
  i32 disp;  /* sign-extended displacement; 0 when the form has none */
  int has_base;  /* 0 only for the SIB "disp32, no base" form; left at 1
                  * for RIP-relative operands, so test rip_relative first */
  int has_index; /* a SIB index register is present */
  int rip_relative; /* mod=00, r/m=101 without SIB: disp32 is %rip-relative */
  u32 bytes_used;   /* SIB + displacement bytes after the ModR/M byte */
} DecodedMem;
    508 
    509 static u32 decode_mem(const u8* bytes, u32 len, u32 off, X64DecodeCtx ctx,
    510                       u32 mod, u32 rm_low, DecodedMem* out) {
    511   out->base = 0;
    512   out->index = 0;
    513   out->scale = 1;
    514   out->disp = 0;
    515   out->has_base = 1;
    516   out->has_index = 0;
    517   out->rip_relative = 0;
    518   out->bytes_used = 0;
    519   if (mod == 3u) return 0; /* caller handles reg-form */
    520   /* SIB-required form: r/m=100. */
    521   if (rm_low == 4u) {
    522     if (off >= len) return (u32)-1;
    523     u8 s = bytes[off];
    524     u32 sib_base = (s & 7u) | ((u32)ctx.rex_b << 3);
    525     u32 sib_index = ((s >> 3) & 7u) | ((u32)ctx.rex_x << 3);
    526     u32 used = 1;
    527     /* SIB index = 4 (RSP) with REX.X=0 encodes "no index". */
    528     if (sib_index != 4u) {
    529       out->has_index = 1;
    530       out->index = sib_index;
    531       out->scale = 1u << (s >> 6);
    532     }
    533     if (mod == 0u && (s & 7u) == 5u) {
    534       /* mod=00, base=101: disp32 with no base — either a label-table
    535        * disp32 (no index) or an indexed `[index*scale + disp32]`. */
    536       i32 d = 0;
    537       if (!read_disp(bytes, len, off + used, 4, &d)) return (u32)-1;
    538       used += 4;
    539       out->disp = d;
    540       out->has_base = 0;
    541       out->bytes_used = used;
    542       return used;
    543     }
    544     if (mod == 1u) {
    545       i32 d = 0;
    546       if (!read_disp(bytes, len, off + used, 1, &d)) return (u32)-1;
    547       used += 1;
    548       out->disp = d;
    549     } else if (mod == 2u) {
    550       i32 d = 0;
    551       if (!read_disp(bytes, len, off + used, 4, &d)) return (u32)-1;
    552       used += 4;
    553       out->disp = d;
    554     }
    555     out->base = sib_base;
    556     out->bytes_used = used;
    557     return used;
    558   }
    559   /* Non-SIB form. */
    560   if (mod == 0u && rm_low == 5u) {
    561     /* RIP-relative disp32. */
    562     i32 d = 0;
    563     if (!read_disp(bytes, len, off, 4, &d)) return (u32)-1;
    564     out->disp = d;
    565     out->rip_relative = 1;
    566     out->bytes_used = 4;
    567     return 4;
    568   }
    569   u32 base = rm_low | ((u32)ctx.rex_b << 3);
    570   out->base = base;
    571   if (mod == 1u) {
    572     i32 d = 0;
    573     if (!read_disp(bytes, len, off, 1, &d)) return (u32)-1;
    574     out->disp = d;
    575     out->bytes_used = 1;
    576     return 1;
    577   }
    578   if (mod == 2u) {
    579     i32 d = 0;
    580     if (!read_disp(bytes, len, off, 4, &d)) return (u32)-1;
    581     out->disp = d;
    582     out->bytes_used = 4;
    583     return 4;
    584   }
    585   /* mod == 0u with rm != 5,4 → [reg], no disp. */
    586   return 0;
    587 }
    588 
    589 static void put_mem(StrBuf* sb, const DecodedMem* m) {
    590   if (m->disp != 0 || (!m->has_base && !m->rip_relative)) {
    591     strbuf_put_i64(sb, (i64)m->disp);
    592   }
    593   if (m->rip_relative) {
    594     strbuf_puts(sb, "(%rip)");
    595   } else if (m->has_base || m->has_index) {
    596     /* `(base)`, `(base,index,scale)`, or the base-less `(,index,scale)`. */
    597     strbuf_putc(sb, '(');
    598     if (m->has_base) put_reg(sb, m->base, 8);
    599     if (m->has_index) {
    600       strbuf_putc(sb, ',');
    601       put_reg(sb, m->index, 8);
    602       strbuf_putc(sb, ',');
    603       strbuf_put_i64(sb, (i64)m->scale);
    604     }
    605     strbuf_putc(sb, ')');
    606   }
    607 }
    608 
    609 /* ====================================================================
    610  * Width derivation.
    611  * ==================================================================== */
    612 
    613 static u32 width_for(const X64InsnDesc* d, const X64DecodeCtx* ctx) {
    614   if (d->flags & X64_ASMFL_FORCE_W64) return 8u;
    615   if (d->flags & X64_ASMFL_BYTE) return 1u;
    616   if (d->flags & X64_ASMFL_W16) return 2u;
    617   if (d->flags & X64_ASMFL_W_FROM_REX) return ctx->rex_w ? 8u : 4u;
    618   if (d->leg_pfx == X64_PFX_66) return 2u;
    619   return 4u;
    620 }
    621 
    622 char x64_size_suffix_for(const X64InsnDesc* desc, const X64DecodeCtx* ctx) {
    623   switch ((X64Format)desc->fmt) {
    624     case X64_FMT_ALU_RR:
    625     case X64_FMT_MOV_RM_LOAD:
    626     case X64_FMT_ALU_RM_IMM8:
    627     case X64_FMT_ALU_RM_IMM32:
    628     case X64_FMT_IMUL_RR:
    629     case X64_FMT_IMUL_RRI:
    630     case X64_FMT_F7_RM:
    631     case X64_FMT_SHIFT_IMM:
    632     case X64_FMT_SHIFT_CL:
    633     case X64_FMT_BSWAP:
    634     case X64_FMT_BS:
    635     case X64_FMT_POPCNT:
    636     case X64_FMT_XADD_MEM:
    637     case X64_FMT_XCHG_MEM:
    638     case X64_FMT_CMPXCHG_MEM:
    639     case X64_FMT_MOV_RI:
    640       switch (width_for(desc, ctx)) {
    641         case 1:
    642           return 'b';
    643         case 2:
    644           return 'w';
    645         case 4:
    646           return 'l';
    647         case 8:
    648           return 'q';
    649       }
    650       return 0;
    651     default:
    652       return 0;
    653   }
    654 }
    655 
    656 /* ====================================================================
    657  * Per-format printers.
    658  * ==================================================================== */
    659 
/* Decoded ModR/M byte together with whatever SIB/displacement follows it.
 * Filled in by read_modrm(); the printers then add 1 (the ModR/M byte
 * itself) plus `bytes_after_modrm` to find the end of the operand. */
typedef struct RegRm {
  u32 reg;    /* ModR/M.reg, with REX.R already folded in as bit 3 */
  u32 rm_low; /* low 3 bits of ModR/M.rm; REX.B is applied when printing */
  u32 mod;    /* ModR/M.mod (0..3); 3 means r/m names a register */
  u32 bytes_after_modrm; /* SIB/disp bytes */
  DecodedMem mem;        /* valid iff mod != 3 (zeroed when mod == 3) */
} RegRm;
    669 
    670 static int read_modrm(const u8* bytes, u32 len, u32 off, X64DecodeCtx ctx,
    671                       RegRm* rr) {
    672   if (off >= len) return 0;
    673   u8 mr = bytes[off];
    674   rr->mod = (mr >> 6) & 3u;
    675   rr->reg = ((mr >> 3) & 7u) | ((u32)ctx.rex_r << 3);
    676   rr->rm_low = mr & 7u;
    677   if (rr->mod == 3u) {
    678     rr->bytes_after_modrm = 0;
    679     memset(&rr->mem, 0, sizeof rr->mem);
    680     return 1;
    681   }
    682   u32 used =
    683       decode_mem(bytes, len, off + 1u, ctx, rr->mod, rr->rm_low, &rr->mem);
    684   if (used == (u32)-1) return 0;
    685   rr->bytes_after_modrm = used;
    686   return 1;
    687 }
    688 
    689 /* Print a ModR/M r/m operand at width `w`. */
    690 static void put_rm(StrBuf* sb, const RegRm* rr, X64DecodeCtx ctx, u32 w) {
    691   if (rr->mod == 3u) {
    692     u32 rm = rr->rm_low | ((u32)ctx.rex_b << 3);
    693     put_reg_ctx(sb, rm, w, ctx.has_rex);
    694   } else {
    695     put_mem(sb, &rr->mem);
    696   }
    697 }
    698 static void put_rm_xmm(StrBuf* sb, const RegRm* rr, X64DecodeCtx ctx) {
    699   if (rr->mod == 3u) {
    700     u32 rm = rr->rm_low | ((u32)ctx.rex_b << 3);
    701     put_xmm(sb, rm);
    702   } else {
    703     put_mem(sb, &rr->mem);
    704   }
    705 }
    706 
    707 static u32 print_nullary(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    708                          u32 len, const X64DecodeCtx* ctx) {
    709   (void)sb;
    710   (void)d;
    711   (void)bytes;
    712   (void)len;
    713   return ctx->opc_off + d->opc_len;
    714 }
    715 
    716 static u32 print_push_pop(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    717                           u32 len, const X64DecodeCtx* ctx) {
    718   (void)len;
    719   u32 reg = (bytes[ctx->opc_off] & 7u) | ((u32)ctx->rex_b << 3);
    720   put_reg(sb, reg, 8);
    721   (void)d;
    722   return ctx->opc_off + 1u;
    723 }
    724 
    725 static u32 print_mov_ri(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    726                         u32 len, const X64DecodeCtx* ctx) {
    727   (void)d;
    728   u32 reg = (bytes[ctx->opc_off] & 7u) | ((u32)ctx->rex_b << 3);
    729   u32 off = ctx->opc_off + 1u;
    730   if (ctx->rex_w) {
    731     if (off + 8u > len) return 0;
    732     put_imm(sb, (i64)rd_u64_le(bytes + off));
    733     off += 8u;
    734     strbuf_puts(sb, ", ");
    735     put_reg(sb, reg, 8);
    736   } else {
    737     if (off + 4u > len) return 0;
    738     put_imm(sb, (i64)(i32)rd_u32_le(bytes + off));
    739     off += 4u;
    740     strbuf_puts(sb, ", ");
    741     put_reg(sb, reg, 4);
    742   }
    743   return off;
    744 }
    745 
    746 static u32 print_alu_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    747                         u32 len, const X64DecodeCtx* ctx) {
    748   /* op r/m, r (reg is the source). Width comes from width_for, which
    749    * honours the BYTE / W16 / W_FROM_REX flags on the descriptor. */
    750   u32 off = ctx->opc_off + d->opc_len;
    751   RegRm rr;
    752   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    753   u32 w = width_for(d, ctx);
    754   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    755   strbuf_puts(sb, ", ");
    756   put_rm(sb, &rr, *ctx, w);
    757   return off + 1u + rr.bytes_after_modrm;
    758 }
    759 
    760 static u32 print_mov_rm_load(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    761                              u32 len, const X64DecodeCtx* ctx) {
    762   /* op r, r/m. */
    763   u32 off = ctx->opc_off + d->opc_len;
    764   RegRm rr;
    765   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    766   u32 w = width_for(d, ctx);
    767   if (d->opc[0] == 0x8Du) w = 8u; /* LEA always loads a 64-bit address */
    768   put_rm(sb, &rr, *ctx, w);
    769   strbuf_puts(sb, ", ");
    770   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    771   return off + 1u + rr.bytes_after_modrm;
    772 }
    773 
    774 static u32 print_movzx_movsx(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    775                              u32 len, const X64DecodeCtx* ctx) {
    776   u32 off = ctx->opc_off + d->opc_len;
    777   RegRm rr;
    778   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    779   /* Source width = 1 for B6/BE, 2 for B7/BF. Destination width = 4 unless
    780    * REX.W (then 8). */
    781   u32 src_w = (d->opc[1] == 0xB7u || d->opc[1] == 0xBFu) ? 2u : 1u;
    782   u32 dst_w = ctx->rex_w ? 8u : 4u;
    783   put_rm(sb, &rr, *ctx, src_w);
    784   strbuf_puts(sb, ", ");
    785   put_reg_ctx(sb, rr.reg, dst_w, ctx->has_rex);
    786   return off + 1u + rr.bytes_after_modrm;
    787 }
    788 
    789 static u32 print_movsxd(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    790                         u32 len, const X64DecodeCtx* ctx) {
    791   u32 off = ctx->opc_off + d->opc_len;
    792   RegRm rr;
    793   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    794   put_rm(sb, &rr, *ctx, 4u);
    795   strbuf_puts(sb, ", ");
    796   put_reg_ctx(sb, rr.reg, 8u, ctx->has_rex);
    797   return off + 1u + rr.bytes_after_modrm;
    798 }
    799 
    800 static u32 print_alu_rm_imm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    801                             u32 len, const X64DecodeCtx* ctx) {
    802   u32 off = ctx->opc_off + d->opc_len;
    803   RegRm rr;
    804   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    805   u32 used = 1u + rr.bytes_after_modrm;
    806   i64 imm = 0;
    807   if (d->fmt == X64_FMT_ALU_RM_IMM8) {
    808     if (off + used >= len) return 0;
    809     imm = (i64)(i8)bytes[off + used];
    810     used += 1u;
    811   } else {
    812     if (off + used + 3u >= len) return 0;
    813     imm = (i64)(i32)rd_u32_le(bytes + off + used);
    814     used += 4u;
    815   }
    816   u32 w = width_for(d, ctx);
    817   put_imm(sb, imm);
    818   strbuf_puts(sb, ", ");
    819   put_rm(sb, &rr, *ctx, w);
    820   return off + used;
    821 }
    822 
    823 static u32 print_imul_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    824                          u32 len, const X64DecodeCtx* ctx) {
    825   u32 off = ctx->opc_off + d->opc_len;
    826   RegRm rr;
    827   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    828   u32 w = width_for(d, ctx);
    829   put_rm(sb, &rr, *ctx, w);
    830   strbuf_puts(sb, ", ");
    831   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    832   return off + 1u + rr.bytes_after_modrm;
    833 }
    834 
    835 static u32 print_imul_rri(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    836                           u32 len, const X64DecodeCtx* ctx) {
    837   /* 69 /r imm32 (full) or 6B /r imm8 (sign-extended). */
    838   u32 off = ctx->opc_off + d->opc_len;
    839   RegRm rr;
    840   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    841   u32 used = 1u + rr.bytes_after_modrm;
    842   i64 imm = 0;
    843   u8 op = d->opc[0];
    844   if (op == 0x6Bu) {
    845     if (off + used >= len) return 0;
    846     imm = (i64)(i8)bytes[off + used];
    847     used += 1u;
    848   } else {
    849     if (off + used + 3u >= len) return 0;
    850     imm = (i64)(i32)rd_u32_le(bytes + off + used);
    851     used += 4u;
    852   }
    853   u32 w = width_for(d, ctx);
    854   put_imm(sb, imm);
    855   strbuf_puts(sb, ", ");
    856   put_rm(sb, &rr, *ctx, w);
    857   strbuf_puts(sb, ", ");
    858   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    859   return off + used;
    860 }
    861 
    862 static u32 print_f7_rm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    863                        u32 len, const X64DecodeCtx* ctx) {
    864   u32 off = ctx->opc_off + d->opc_len;
    865   RegRm rr;
    866   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    867   u32 w = width_for(d, ctx);
    868   put_rm(sb, &rr, *ctx, w);
    869   return off + 1u + rr.bytes_after_modrm;
    870 }
    871 
    872 static u32 print_shift_imm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    873                            u32 len, const X64DecodeCtx* ctx) {
    874   u32 off = ctx->opc_off + d->opc_len;
    875   RegRm rr;
    876   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    877   u32 used = 1u + rr.bytes_after_modrm;
    878   if (off + used >= len) return 0;
    879   u8 imm = bytes[off + used];
    880   ++used;
    881   u32 w = width_for(d, ctx);
    882   put_imm(sb, (i64)imm);
    883   strbuf_puts(sb, ", ");
    884   put_rm(sb, &rr, *ctx, w);
    885   return off + used;
    886 }
    887 
    888 static u32 print_shift_cl(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    889                           u32 len, const X64DecodeCtx* ctx) {
    890   u32 off = ctx->opc_off + d->opc_len;
    891   RegRm rr;
    892   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    893   u32 w = width_for(d, ctx);
    894   strbuf_puts(sb, "%cl, ");
    895   put_rm(sb, &rr, *ctx, w);
    896   return off + 1u + rr.bytes_after_modrm;
    897 }
    898 
    899 static u32 print_jcc_rel32(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    900                            u32 len, const X64DecodeCtx* ctx, u64 vaddr) {
    901   u32 off = ctx->opc_off + d->opc_len;
    902   if (off + 4u > len) return 0;
    903   i32 rel = (i32)rd_u32_le(bytes + off);
    904   u64 tgt = vaddr + (u64)(off + 4u) + (u64)rel;
    905   /* Mnemonic suffix from condition nibble: caller wrote "j"; we append. */
    906   strbuf_putc(sb, ' ');
    907   strbuf_put_hex_u64(sb, tgt);
    908   return off + 4u;
    909 }
    910 
    911 static u32 print_jmp_call_rel32(StrBuf* sb, const X64InsnDesc* d,
    912                                 const u8* bytes, u32 len,
    913                                 const X64DecodeCtx* ctx, u64 vaddr) {
    914   u32 off = ctx->opc_off + d->opc_len;
    915   if (off + 4u > len) return 0;
    916   i32 rel = (i32)rd_u32_le(bytes + off);
    917   u64 tgt = vaddr + (u64)(off + 4u) + (u64)rel;
    918   strbuf_put_hex_u64(sb, tgt);
    919   return off + 4u;
    920 }
    921 
    922 static u32 print_br_rm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    923                        u32 len, const X64DecodeCtx* ctx) {
    924   u32 off = ctx->opc_off + d->opc_len;
    925   RegRm rr;
    926   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    927   strbuf_putc(sb, '*');
    928   put_rm(sb, &rr, *ctx, 8u);
    929   return off + 1u + rr.bytes_after_modrm;
    930 }
    931 
    932 static u32 print_setcc(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    933                        u32 len, const X64DecodeCtx* ctx) {
    934   u32 off = ctx->opc_off + d->opc_len;
    935   RegRm rr;
    936   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    937   put_rm(sb, &rr, *ctx, 1u);
    938   (void)d;
    939   return off + 1u + rr.bytes_after_modrm;
    940 }
    941 
    942 static u32 print_cmovcc_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    943                            u32 len, const X64DecodeCtx* ctx) {
    944   u32 off = ctx->opc_off + d->opc_len;
    945   RegRm rr;
    946   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    947   u32 w = width_for(d, ctx);
    948   put_rm(sb, &rr, *ctx, w);
    949   strbuf_puts(sb, ", ");
    950   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    951   return off + 1u + rr.bytes_after_modrm;
    952 }
    953 
    954 static u32 print_bswap(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    955                        u32 len, const X64DecodeCtx* ctx) {
    956   (void)d;
    957   (void)len;
    958   u32 reg = (bytes[ctx->opc_off + 1u] & 7u) | ((u32)ctx->rex_b << 3);
    959   u32 w = ctx->rex_w ? 8u : 4u;
    960   put_reg_ctx(sb, reg, w, ctx->has_rex);
    961   return ctx->opc_off + 2u;
    962 }
    963 
    964 static u32 print_bs(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, u32 len,
    965                     const X64DecodeCtx* ctx) {
    966   /* dst = bsr/bsf(src). Operand order in AT&T is "src, dst". */
    967   u32 off = ctx->opc_off + d->opc_len;
    968   RegRm rr;
    969   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    970   u32 w = width_for(d, ctx);
    971   put_rm(sb, &rr, *ctx, w);
    972   strbuf_puts(sb, ", ");
    973   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
    974   return off + 1u + rr.bytes_after_modrm;
    975 }
    976 
    977 static u32 print_xmm_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
    978                         u32 len, const X64DecodeCtx* ctx) {
    979   u32 off = ctx->opc_off + d->opc_len;
    980   RegRm rr;
    981   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
    982   /* Operand classes/order by opcode (AT&T src, dst):
    983    *   2A CVTSI2*  : rm=GP(src),  reg=xmm(dst)  -> "rm_gp, reg_xmm"
    984    *   6E MOVD/Q   : rm=GP(src),  reg=xmm(dst)  -> "rm_gp, reg_xmm" (gpr->xmm)
    985    *   2C CVTT*2SI : rm=xmm(src), reg=GP(dst)   -> "rm_xmm, reg_gp"
    986    *   7E MOVD/Q   : reg=xmm(src), rm=GP(dst)   -> "reg_xmm, rm_gp" (reversed!)
    987    *   others      : both xmm                   -> "rm_xmm, reg_xmm"
    988    * GP width comes from REX.W (movd vs movq / 32- vs 64-bit operands). */
    989   u8 op = d->opc[1];
    990   u32 gp_w = ctx->rex_w ? 8u : 4u;
    991   if (op == 0x7Eu) {
    992     /* xmm -> r/m GPR: source is the reg-field xmm, dest is the r/m GPR. */
    993     put_xmm(sb, rr.reg);
    994     strbuf_puts(sb, ", ");
    995     put_rm(sb, &rr, *ctx, gp_w);
    996     return off + 1u + rr.bytes_after_modrm;
    997   }
    998   /* Store-direction XMM moves (MOVSD/MOVSS/MOVUPS 0x11, MOVAPS 0x29): the
    999    * reg-field xmm is the SOURCE and the r/m (memory or xmm) is the
   1000    * DESTINATION — AT&T order `reg_xmm, rm`. Without this the disassembler
   1001    * prints them in load order, so re-assembly flips the data direction. */
   1002   if (op == 0x11u || op == 0x29u) {
   1003     put_xmm(sb, rr.reg);
   1004     strbuf_puts(sb, ", ");
   1005     put_rm_xmm(sb, &rr, *ctx);
   1006     return off + 1u + rr.bytes_after_modrm;
   1007   }
   1008   {
   1009     int dst_is_gp = (op == 0x2Cu);                /* CVTTSD/SS2SI */
   1010     int src_is_gp = (op == 0x2Au || op == 0x6Eu); /* CVTSI2*, MOVD/Q g->x */
   1011     if (src_is_gp) {
   1012       put_rm(sb, &rr, *ctx, gp_w);
   1013     } else {
   1014       put_rm_xmm(sb, &rr, *ctx);
   1015     }
   1016     strbuf_puts(sb, ", ");
   1017     if (dst_is_gp) {
   1018       put_reg_ctx(sb, rr.reg, gp_w, ctx->has_rex);
   1019     } else {
   1020       put_xmm(sb, rr.reg);
   1021     }
   1022   }
   1023   return off + 1u + rr.bytes_after_modrm;
   1024 }
   1025 
   1026 static u32 print_xadd_mem(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   1027                           u32 len, const X64DecodeCtx* ctx) {
   1028   /* XADD r/m, r — source is the reg, destination is r/m. */
   1029   u32 off = ctx->opc_off + d->opc_len;
   1030   RegRm rr;
   1031   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
   1032   u32 w = width_for(d, ctx);
   1033   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
   1034   strbuf_puts(sb, ", ");
   1035   put_rm(sb, &rr, *ctx, w);
   1036   return off + 1u + rr.bytes_after_modrm;
   1037 }
   1038 
   1039 static u32 print_xchg_mem(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   1040                           u32 len, const X64DecodeCtx* ctx) {
   1041   u32 off = ctx->opc_off + d->opc_len;
   1042   RegRm rr;
   1043   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
   1044   u32 w = width_for(d, ctx);
   1045   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
   1046   strbuf_puts(sb, ", ");
   1047   put_rm(sb, &rr, *ctx, w);
   1048   return off + 1u + rr.bytes_after_modrm;
   1049 }
   1050 
   1051 static u32 print_cmpxchg_mem(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   1052                              u32 len, const X64DecodeCtx* ctx) {
   1053   /* CMPXCHG r/m, r — implicit RAX is the comparand; not shown. */
   1054   u32 off = ctx->opc_off + d->opc_len;
   1055   RegRm rr;
   1056   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
   1057   u32 w = width_for(d, ctx);
   1058   put_reg_ctx(sb, rr.reg, w, ctx->has_rex);
   1059   strbuf_puts(sb, ", ");
   1060   put_rm(sb, &rr, *ctx, w);
   1061   return off + 1u + rr.bytes_after_modrm;
   1062 }
   1063 
   1064 static u32 print_nop_multi(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   1065                            u32 len, const X64DecodeCtx* ctx) {
   1066   (void)sb;
   1067   u32 off = ctx->opc_off + d->opc_len;
   1068   RegRm rr;
   1069   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
   1070   return off + 1u + rr.bytes_after_modrm;
   1071 }
   1072 
   1073 /* ====================================================================
   1074  * Dispatch.
   1075  * ==================================================================== */
   1076 
   1077 u32 x64_print_operands(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   1078                        u32 len, const X64DecodeCtx* ctx, u64 vaddr) {
   1079   switch ((X64Format)d->fmt) {
   1080     case X64_FMT_NULLARY:
   1081       return print_nullary(sb, d, bytes, len, ctx);
   1082     case X64_FMT_NOP_MULTI:
   1083       return print_nop_multi(sb, d, bytes, len, ctx);
   1084     case X64_FMT_PUSH_POP:
   1085       return print_push_pop(sb, d, bytes, len, ctx);
   1086     case X64_FMT_MOV_RI:
   1087       return print_mov_ri(sb, d, bytes, len, ctx);
   1088     case X64_FMT_ALU_RR:
   1089       return print_alu_rr(sb, d, bytes, len, ctx);
   1090     case X64_FMT_MOV_RM_LOAD:
   1091       return print_mov_rm_load(sb, d, bytes, len, ctx);
   1092     case X64_FMT_MOVZX_MOVSX:
   1093       return print_movzx_movsx(sb, d, bytes, len, ctx);
   1094     case X64_FMT_MOVSXD:
   1095       return print_movsxd(sb, d, bytes, len, ctx);
   1096     case X64_FMT_ALU_RM_IMM8:
   1097     case X64_FMT_ALU_RM_IMM32:
   1098       return print_alu_rm_imm(sb, d, bytes, len, ctx);
   1099     case X64_FMT_IMUL_RR:
   1100       return print_imul_rr(sb, d, bytes, len, ctx);
   1101     case X64_FMT_IMUL_RRI:
   1102       return print_imul_rri(sb, d, bytes, len, ctx);
   1103     case X64_FMT_F7_RM:
   1104       return print_f7_rm(sb, d, bytes, len, ctx);
   1105     case X64_FMT_SHIFT_IMM:
   1106       return print_shift_imm(sb, d, bytes, len, ctx);
   1107     case X64_FMT_SHIFT_CL:
   1108       return print_shift_cl(sb, d, bytes, len, ctx);
   1109     case X64_FMT_JCC_REL32:
   1110       return print_jcc_rel32(sb, d, bytes, len, ctx, vaddr);
   1111     case X64_FMT_JMP_REL32:
   1112     case X64_FMT_CALL_REL32:
   1113       return print_jmp_call_rel32(sb, d, bytes, len, ctx, vaddr);
   1114     case X64_FMT_BR_RM:
   1115       return print_br_rm(sb, d, bytes, len, ctx);
   1116     case X64_FMT_SETCC_RM:
   1117       return print_setcc(sb, d, bytes, len, ctx);
   1118     case X64_FMT_CMOVCC_RR:
   1119       return print_cmovcc_rr(sb, d, bytes, len, ctx);
   1120     case X64_FMT_BSWAP:
   1121       return print_bswap(sb, d, bytes, len, ctx);
   1122     case X64_FMT_BS:
   1123       return print_bs(sb, d, bytes, len, ctx);
   1124     case X64_FMT_POPCNT:
   1125       return print_bs(sb, d, bytes, len, ctx); /* same shape */
   1126     case X64_FMT_SSE_RR:
   1127     case X64_FMT_SSE_LOAD:
   1128     case X64_FMT_SSE_STORE:
   1129       return print_xmm_rr(sb, d, bytes, len, ctx);
   1130     case X64_FMT_XADD_MEM:
   1131       return print_xadd_mem(sb, d, bytes, len, ctx);
   1132     case X64_FMT_XCHG_MEM:
   1133       return print_xchg_mem(sb, d, bytes, len, ctx);
   1134     case X64_FMT_CMPXCHG_MEM:
   1135       return print_cmpxchg_mem(sb, d, bytes, len, ctx);
   1136     case X64_FMT_RAW_BYTE:
   1137       return 0;
   1138   }
   1139   return 0;
   1140 }
   1141 
   1142 /* Resolve the condition nibble for Jcc/SETcc/CMOVcc to its AT&T mnemonic
   1143  * suffix. Used by the disassembler to spell j → "je", set → "sete", etc. */
   1144 const char* x64_cc_name(u8 cc) { return g_cc_name[cc & 0xFu]; }