x64 -O1: use only reserved emit scratch in va_arg/va_copy/copy_bytes - kit

commit 6d9c6b8fea4710d515208ecd332168a7327d04e5
parent 51b7bafacd32b879a65c5cb63ef0d8efa335e41b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 10:31:16 -0700

x64 -O1: use only reserved emit scratch in va_arg/va_copy/copy_bytes

Several backend emit helpers used allocable registers (r10, rdx) as ad-hoc
scratch. At -O1 the register allocator can place live values in those
registers, so the helpers clobbered them — e.g. 132's bswap roundtrip failed
because copy_bytes clobbered rdx, which held the live bs32_round value. Rework
the heavy helpers to draw only from the reserved emit scratch (rax/r11, and
xmm14 for fp copies), so no allocable register is ever consumed and the
allocable pool is maximized:

- va_arg: advance the gp/fp offset and overflow pointer in memory (add r/m,imm)
  and fold the reg-save base into rax (add r64,[mem]) — only rax+r11, no r10.
- va_copy: copy each qword through the fp scratch xmm14 — only rax/r11/xmm14, no rdx.
- copy_bytes: copy each 8/4/2/1 chunk through rax via x64_emit_mem (which resolves
  the address with its own r11 scratch) — only rax/r11, no rdx.

r10 and rdx are now never backend scratch, so they stay allocable. x64 toy X-O1
is now 156/0 (was 155/1). No regressions: x64 O0 156/0, rv64 O0/O1 156/156,
aa64 1034/0.

Diffstat:
M src/arch/x64/native.c  | 143 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------

1 file changed, 87 insertions(+), 56 deletions(-)
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -740,29 +740,47 @@ static u32 x64_addr_to_base_reg(X64NativeTarget* a, NativeAddr addr,
 static void x64_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
                            AggregateAccess access) {
   X64NativeTarget* a = x64_of(t);
-  MCEmitter* mc = t->mc;
-  u32 dr = x64_addr_to_base_reg(a, dst, X64_TMP_INT2);
-  u32 sr = x64_addr_to_base_reg(a, src, X64_TMP_INT);
+  /* Copy chunk by chunk (8/4/2/1) through the value scratch rax, letting
+   * x64_emit_mem resolve each address with its own scratch (r11). Uses only the
+   * reserved emit scratch (rax/r11) — no ad-hoc allocable temp (previously rdx),
+   * which the optimizer may have live across the copy. */
+  CfreeCgTypeId tys[4];
   u32 n = access.size, i = 0;
-  while (i + 8u <= n) {
-    emit_mov_load(mc, 8, 0, X64_RDX, sr, (i32)i);
-    emit_mov_store(mc, 8, X64_RDX, dr, (i32)i);
-    i += 8u;
-  }
-  while (i + 4u <= n) {
-    emit_mov_load(mc, 4, 0, X64_RDX, sr, (i32)i);
-    emit_mov_store(mc, 4, X64_RDX, dr, (i32)i);
-    i += 4u;
-  }
-  while (i + 2u <= n) {
-    emit_mov_load(mc, 2, 0, X64_RDX, sr, (i32)i);
-    emit_mov_store(mc, 2, X64_RDX, dr, (i32)i);
-    i += 2u;
-  }
+  tys[0] = builtin_id(CFREE_CG_BUILTIN_I64);
+  tys[1] = builtin_id(CFREE_CG_BUILTIN_I32);
+  tys[2] = builtin_id(CFREE_CG_BUILTIN_I16);
+  tys[3] = builtin_id(CFREE_CG_BUILTIN_I8);
   while (i < n) {
-    emit_mov_load(mc, 1, 0, X64_RDX, sr, (i32)i);
-    emit_mov_store(mc, 1, X64_RDX, dr, (i32)i);
-    i += 1u;
+    u32 rem = n - i, s;
+    CfreeCgTypeId ty;
+    NativeAddr sa = src, da = dst;
+    NativeLoc val;
+    MemAccess mem;
+    if (rem >= 8u) {
+      s = 8u;
+      ty = tys[0];
+    } else if (rem >= 4u) {
+      s = 4u;
+      ty = tys[1];
+    } else if (rem >= 2u) {
+      s = 2u;
+      ty = tys[2];
+    } else {
+      s = 1u;
+      ty = tys[3];
+    }
+    sa.offset += (i32)i;
+    sa.base_type = ty;
+    da.offset += (i32)i;
+    da.base_type = ty;
+    val = x64_reg_loc(ty, NATIVE_REG_INT, X64_TMP_INT);
+    memset(&mem, 0, sizeof mem);
+    mem.type = ty;
+    mem.size = s;
+    mem.align = s;
+    x64_emit_mem(a, 1, val, sa, mem); /* rax = [src + i] */
+    x64_emit_mem(a, 0, val, da, mem); /* [dst + i] = rax */
+    i += s;
   }
 }
 
@@ -2841,6 +2859,26 @@ static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
   return base;
 }
 
+/* add r/m, imm8 (group-1 /0) directly to a memory field — advances a va_list
+ * offset/pointer in place without consuming a register. w selects 64- vs 32-bit. */
+static void x64_add_mem_imm(MCEmitter* mc, int w, u32 base, i32 disp, i8 imm) {
+  u8 op = X64_OPC_ALU_IMM8;
+  u8 b;
+  emit_rex(mc, w, 0, 0, base);
+  mc->emit_bytes(mc, &op, 1);
+  emit_mem_operand(mc, X64_ALU_SUB_ADD, base, disp); /* modrm.reg = /0 (ADD) */
+  b = (u8)imm;
+  mc->emit_bytes(mc, &b, 1);
+}
+
+/* add r64, [base+disp] (0x03 /r). */
+static void x64_add_reg_mem(MCEmitter* mc, u32 dst, u32 base, i32 disp) {
+  u8 op = 0x03;
+  emit_rex(mc, 1, dst, 0, base);
+  mc->emit_bytes(mc, &op, 1);
+  emit_mem_operand(mc, dst, base, disp);
+}
+
 static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
   NativeTarget* t = &a->base;
   MCEmitter* mc = t->mc;
@@ -2878,56 +2916,48 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
   u32 sz = x64_type_size(t, type);
   int is_fp = loc_is_fp(dst);
   u32 dr = loc_reg(dst);
-  u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2);
+  u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
+  /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list
+   * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the
+   * reg-save base is folded into rax (x64_add_reg_mem), so no third register is
+   * needed — leaving r10 (and rdx) free for the allocator. */
   if (a->abi->shadow_space) {
-    /* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) */
-    emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0);
+    /* Win64: rax = *ap; load dr from [rax]; *ap += 8. */
+    emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
-    /* add r10, 8; *ap = r10. */
-    emit_rex(mc, 1, 0, 0, X64_R10);
-    {
-      u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8};
-      mc->emit_bytes(mc, buf, 3);
-    }
-    emit_mov_store(mc, 8, X64_R10, ap_base, 0);
+      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+    x64_add_mem_imm(mc, 1, ap_base, 0, 8);
     return;
   }
   {
     u32 offs_field = is_fp ? 4u : 0u;
     u32 max_offs = is_fp ? 176u : 48u;
-    u32 stride = is_fp ? 16u : 8u;
+    i8 stride = is_fp ? 16 : 8;
     MCLabel L_stack = mc->label_new(mc);
     MCLabel L_done = mc->label_new(mc);
     /* eax = ap[offs]; cmp eax, max; jae L_stack. */
     emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field);
     emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs);
     emit_jcc_rel32(mc, X64_CC_AE, L_stack);
-    /* reg path: r10 = ap[16] + rax; load; eax += stride; ap[offs] = eax. */
-    emit_mov_load(mc, 8, 0, X64_R10, ap_base, 16);
-    emit_alu_rr(mc, 1, X64_OPC_ALU_ADD, X64_R10, X64_RAX);
+    /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load.
+     * (The memory increment leaves rax holding the old offset.) */
+    x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
+    x64_add_reg_mem(mc, X64_RAX, ap_base, 16);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
-    emit_alu_imm8(mc, 0, X64_ALU_SUB_ADD, X64_RAX, (i8)stride);
-    emit_mov_store(mc, 4, X64_RAX, ap_base, (i32)offs_field);
+      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
     emit_jmp_rel32(mc, L_done);
-    /* stack path: r10 = ap[8]; load; r10 += 8; ap[8] = r10. */
+    /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. */
     mc->label_place(mc, L_stack);
-    emit_mov_load(mc, 8, 0, X64_R10, ap_base, 8);
+    emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
-    emit_rex(mc, 1, 0, 0, X64_R10);
-    {
-      u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8};
-      mc->emit_bytes(mc, buf, 3);
-    }
-    emit_mov_store(mc, 8, X64_R10, ap_base, 8);
+      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+    x64_add_mem_imm(mc, 1, ap_base, 8, 8);
     mc->label_place(mc, L_done);
   }
 }
@@ -2936,15 +2966,16 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
                              NativeAddr src_ap) {
   NativeTarget* t = &a->base;
   MCEmitter* mc = t->mc;
-  /* Resolve dst into r11, src into rax (disjoint from each other and from the
-   * rdx copy scratch); force both so the optimizer's register choice for a
-   * va_list pointer can't alias the copy scratch. */
+  /* Resolve dst into r11, src into rax, and copy each qword through the fp emit
+   * scratch xmm14. Uses only reserved emit scratch (r11/rax/xmm14), so the
+   * optimizer's register choice for a va_list pointer can never be clobbered and
+   * no allocable GPR (previously rdx) is consumed. */
   u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
   u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
   u32 n = a->abi->shadow_space ? 8u : 24u, i;
   for (i = 0; i < n; i += 8u) {
-    emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i);
-    emit_mov_store(mc, 8, X64_RDX, dst_base, (i32)i);
+    emit_sse_load(mc, 0xF2, 0x10, X64_TMP_FP, src_base, (i32)i);  /* movsd */
+    emit_sse_store(mc, 0xF2, 0x11, X64_TMP_FP, dst_base, (i32)i); /* movsd */
   }
 }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README