x64 disasm: decode movd/movq (66 0F 6E/7E) and xorps/xorpd (0F 57) - kit

commit 85b11149946e7dc2f7f1a039aa3e4c389be4c962
parent 898aaa7475961d5f8545fffbcc3aba525a4f181b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 13:32:45 -0700

x64 disasm: decode movd/movq (66 0F 6E/7E) and xorps/xorpd (0F 57)

The x64 backend emits these for int<->FP bitcasts and FP negation, but they
were absent from x64_insn_table, so the disassembler fell back to a 1-byte
.byte and desynced the entire following instruction stream (a 5-byte
movq %rax,%xmm15 became five .byte lines, then the next opcode mis-decoded).
This corrupted objdump output and JIT-debugger disassembly for any x64
function touching float/double.

Add table rows split on REX.W (W_REQ_0 -> movd, W_REQ_1 -> movq) so the
mnemonic tracks operand width, plus xorps (no prefix) / xorpd (66). Teach
print_xmm_rr the 66 0F 7E direction (xmm in reg field is the source, GPR r/m
is the dest -> reversed AT&T order); 6E reuses the existing gpr-source path.

Verified byte-for-byte against llvm-objdump for all six forms, and a real
program using doubles no longer produces any .inst/.byte desync. Adds an x64
decode corpus case (runs under the x64 decode lane).

Diffstat:
M src/arch/x64/isa.c  | 58 +++++++++++++++++++++++++++++++++++++++++++++-------------
A test/asm/decode/x64_sse_movd_movq.expected.txt  | 6 ++++++
A test/asm/decode/x64_sse_movd_movq.hex  | 2 ++
A test/asm/decode/x64_sse_movd_movq.targets  | 2 ++

4 files changed, 55 insertions(+), 13 deletions(-)
diff --git a/src/arch/x64/isa.c b/src/arch/x64/isa.c
@@ -300,6 +300,25 @@ const X64InsnDesc x64_insn_table[] = {
         X64_FMT_SSE_RR, 0),
     ROW("cvtss2sd", X64_PFX_F3, 2, 0x0F, 0x5A, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
         X64_FMT_SSE_RR, 0),
+    /* MOVD/MOVQ between GPR and XMM. 66 0F 6E /r is gpr->xmm, 66 0F 7E /r is
+     * xmm->gpr (note the reversed operand order, handled in print_xmm_rr).
+     * REX.W picks movq (64-bit GPR) vs movd (32-bit), and since the *mnemonic*
+     * itself changes we split into W_REQ_0 / W_REQ_1 rows rather than a width
+     * suffix. The backend emits these for int<->FP bitcasts (emit_sse_rr_w). */
+    ROW("movd", X64_PFX_66, 2, 0x0F, 0x6E, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
+        X64_FMT_SSE_RR, 0),
+    ROW("movq", X64_PFX_66, 2, 0x0F, 0x6E, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
+        X64_FMT_SSE_RR, 0),
+    ROW("movd", X64_PFX_66, 2, 0x0F, 0x7E, 0, 0xFF, NO_MODRM, X64_W_REQ_0,
+        X64_FMT_SSE_RR, 0),
+    ROW("movq", X64_PFX_66, 2, 0x0F, 0x7E, 0, 0xFF, NO_MODRM, X64_W_REQ_1,
+        X64_FMT_SSE_RR, 0),
+    /* XORPS / XORPD (0F 57, prefix selects packed-single vs -double). The
+     * backend uses these to clear/negate FP registers. Both operands xmm. */
+    ROW("xorps", X64_PFX_NONE, 2, 0x0F, 0x57, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
+        X64_FMT_SSE_RR, 0),
+    ROW("xorpd", X64_PFX_66, 2, 0x0F, 0x57, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY,
+        X64_FMT_SSE_RR, 0),
 };
 
 const u32 x64_insn_table_n =
@@ -927,23 +946,36 @@ static u32 print_xmm_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes,
   u32 off = ctx->opc_off + d->opc_len;
   RegRm rr;
   if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0;
-  /* CVTSI2*: source is GP reg (size from REX.W), dst is xmm.
-   * CVTT*2SI: source is xmm, dst is GP reg.
-   * Other SSE arith / mov / cmp: both xmm. */
+  /* Operand classes/order by opcode (AT&T src, dst):
+   *   2A CVTSI2*  : rm=GP(src),  reg=xmm(dst)  -> "rm_gp, reg_xmm"
+   *   6E MOVD/Q   : rm=GP(src),  reg=xmm(dst)  -> "rm_gp, reg_xmm" (gpr->xmm)
+   *   2C CVTT*2SI : rm=xmm(src), reg=GP(dst)   -> "rm_xmm, reg_gp"
+   *   7E MOVD/Q   : reg=xmm(src), rm=GP(dst)   -> "reg_xmm, rm_gp" (reversed!)
+   *   others      : both xmm                   -> "rm_xmm, reg_xmm"
+   * GP width comes from REX.W (movd vs movq / 32- vs 64-bit operands). */
   u8 op = d->opc[1];
-  int dst_is_gp = (op == 0x2Cu); /* CVTTSD/SS2SI */
-  int src_is_gp = (op == 0x2Au); /* CVTSI2SD/SS */
   u32 gp_w = ctx->rex_w ? 8u : 4u;
-  if (src_is_gp) {
+  if (op == 0x7Eu) {
+    /* xmm -> r/m GPR: source is the reg-field xmm, dest is the r/m GPR. */
+    put_xmm(sb, rr.reg);
+    strbuf_puts(sb, ", ");
     put_rm(sb, &rr, *ctx, gp_w);
-  } else {
-    put_rm_xmm(sb, &rr, *ctx);
+    return off + 1u + rr.bytes_after_modrm;
   }
-  strbuf_puts(sb, ", ");
-  if (dst_is_gp) {
-    put_reg_ctx(sb, rr.reg, gp_w, ctx->has_rex);
-  } else {
-    put_xmm(sb, rr.reg);
+  {
+    int dst_is_gp = (op == 0x2Cu);                    /* CVTTSD/SS2SI */
+    int src_is_gp = (op == 0x2Au || op == 0x6Eu);     /* CVTSI2*, MOVD/Q g->x */
+    if (src_is_gp) {
+      put_rm(sb, &rr, *ctx, gp_w);
+    } else {
+      put_rm_xmm(sb, &rr, *ctx);
+    }
+    strbuf_puts(sb, ", ");
+    if (dst_is_gp) {
+      put_reg_ctx(sb, rr.reg, gp_w, ctx->has_rex);
+    } else {
+      put_xmm(sb, rr.reg);
+    }
   }
   return off + 1u + rr.bytes_after_modrm;
 }
diff --git a/test/asm/decode/x64_sse_movd_movq.expected.txt b/test/asm/decode/x64_sse_movd_movq.expected.txt
@@ -0,0 +1,6 @@
+0:	movq	%xmm0, %rax
+5:	movd	%xmm1, %ecx
+9:	movq	%rdx, %xmm2
+e:	movd	%esi, %xmm3
+12:	xorpd	%xmm4, %xmm5
+16:	xorps	%xmm6, %xmm7
diff --git a/test/asm/decode/x64_sse_movd_movq.hex b/test/asm/decode/x64_sse_movd_movq.hex
@@ -0,0 +1 @@
+66480f7ec0660f7ec966480f6ed2660f6ede660f57ec0f57fe
+\ No newline at end of file
diff --git a/test/asm/decode/x64_sse_movd_movq.targets b/test/asm/decode/x64_sse_movd_movq.targets
@@ -0,0 +1 @@
+x64
+\ No newline at end of file

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/arch/x64/isa.c	|	58	+++++++++++++++++++++++++++++++++++++++++++++-------------
A	test/asm/decode/x64_sse_movd_movq.expected.txt	|	6	++++++
A	test/asm/decode/x64_sse_movd_movq.hex	|	2	++
A	test/asm/decode/x64_sse_movd_movq.targets	|	2	++