commit 6d9c6b8fea4710d515208ecd332168a7327d04e5
parent 51b7bafacd32b879a65c5cb63ef0d8efa335e41b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 10:31:16 -0700
x64 -O1: use only reserved emit scratch in va_arg/va_copy/copy_bytes
Several backend emit helpers used allocable registers (r10, rdx) as ad-hoc
scratch. At -O1 the register allocator places live values there, so the helper
clobbered them — e.g. 132's bswap roundtrip actually failed because copy_bytes
clobbered rdx, where the live bs32_round value sat. Rework the heavy helpers to
draw only from the reserved emit scratch (rax/r11, and xmm14 for fp copies), so
no allocable register is ever consumed and the allocable pool is maximized:
- va_arg: advance the gp/fp offset and overflow pointer in memory (add r/m,imm)
and fold the reg-save base into rax (add r64,[mem]) — only rax+r11, no r10.
- va_copy: copy each qword through the fp scratch xmm14 — only rax/r11/xmm14, no rdx.
- copy_bytes: copy each 8/4/2/1 chunk through rax via x64_emit_mem (which resolves
the address with its own r11 scratch) — only rax/r11, no rdx.
r10 and rdx are now never backend scratch, so they stay allocable. x64 toy X-O1
is now 156/0 (was 155/1). No regressions: x64 O0 156/0, rv64 O0/O1 156/156,
aa64 1034/0.
Diffstat:
| M | src/arch/x64/native.c | | | 143 | ++++++++++++++++++++++++++++++++++++++++++++++++------------------------------- |
1 file changed, 87 insertions(+), 56 deletions(-)
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -740,29 +740,47 @@ static u32 x64_addr_to_base_reg(X64NativeTarget* a, NativeAddr addr,
static void x64_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
AggregateAccess access) {
X64NativeTarget* a = x64_of(t);
- MCEmitter* mc = t->mc;
- u32 dr = x64_addr_to_base_reg(a, dst, X64_TMP_INT2);
- u32 sr = x64_addr_to_base_reg(a, src, X64_TMP_INT);
+ /* Emit code that copies access.size bytes from src to dst.
+ *
+ * Copy chunk by chunk (8/4/2/1) through the value scratch rax, letting
+ * x64_emit_mem resolve each address with its own scratch (r11). Uses only the
+ * reserved emit scratch (rax/r11) — no ad-hoc allocable temp (previously rdx),
+ * which the optimizer may have live across the copy.
+ *
+ * Each iteration takes the widest chunk that still fits in the remaining
+ * byte count, so the emitted sequence is all the 8-byte moves first, then at
+ * most one 4-byte, one 2-byte and one 1-byte move: the same chunking as the
+ * removed while-loops. tys[] holds the integer type id that matches each
+ * chunk width (index 0 = 8 bytes ... index 3 = 1 byte). */
+ CfreeCgTypeId tys[4];
u32 n = access.size, i = 0;
- while (i + 8u <= n) {
- emit_mov_load(mc, 8, 0, X64_RDX, sr, (i32)i);
- emit_mov_store(mc, 8, X64_RDX, dr, (i32)i);
- i += 8u;
- }
- while (i + 4u <= n) {
- emit_mov_load(mc, 4, 0, X64_RDX, sr, (i32)i);
- emit_mov_store(mc, 4, X64_RDX, dr, (i32)i);
- i += 4u;
- }
- while (i + 2u <= n) {
- emit_mov_load(mc, 2, 0, X64_RDX, sr, (i32)i);
- emit_mov_store(mc, 2, X64_RDX, dr, (i32)i);
- i += 2u;
- }
+ tys[0] = builtin_id(CFREE_CG_BUILTIN_I64);
+ tys[1] = builtin_id(CFREE_CG_BUILTIN_I32);
+ tys[2] = builtin_id(CFREE_CG_BUILTIN_I16);
+ tys[3] = builtin_id(CFREE_CG_BUILTIN_I8);
while (i < n) {
- emit_mov_load(mc, 1, 0, X64_RDX, sr, (i32)i);
- emit_mov_store(mc, 1, X64_RDX, dr, (i32)i);
- i += 1u;
+ u32 rem = n - i, s; /* rem = bytes still to copy; s = width of this chunk */
+ CfreeCgTypeId ty;
+ NativeAddr sa = src, da = dst; /* per-chunk copies; src/dst themselves stay unmodified */
+ NativeLoc val;
+ MemAccess mem;
+ if (rem >= 8u) {
+ s = 8u;
+ ty = tys[0];
+ } else if (rem >= 4u) {
+ s = 4u;
+ ty = tys[1];
+ } else if (rem >= 2u) {
+ s = 2u;
+ ty = tys[2];
+ } else {
+ s = 1u;
+ ty = tys[3];
+ }
+ sa.offset += (i32)i; /* address of this chunk within the source */
+ sa.base_type = ty;
+ da.offset += (i32)i; /* address of this chunk within the destination */
+ da.base_type = ty;
+ val = x64_reg_loc(ty, NATIVE_REG_INT, X64_TMP_INT); /* chunk-typed view of the int value scratch */
+ memset(&mem, 0, sizeof mem); /* zero every MemAccess field not set below */
+ mem.type = ty;
+ mem.size = s;
+ mem.align = s; /* NOTE(review): claims chunk-size alignment even when the aggregate may be less aligned; confirm x64_emit_mem does not rely on it */
+ x64_emit_mem(a, 1, val, sa, mem); /* rax = [src + i] */
+ x64_emit_mem(a, 0, val, da, mem); /* [dst + i] = rax */
+ i += s;
}
}
@@ -2841,6 +2859,26 @@ static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
return base;
}
+/* add r/m, imm8 (group-1 /0) directly to a memory field — advances a va_list
+ * offset/pointer in place without consuming a register. w selects 64- vs 32-bit.
+ *
+ * mc:   emitter that receives the encoded bytes.
+ * w:    forwarded to emit_rex as the operand-width flag; the va_arg callers
+ *       pass 1 for the 8-byte pointer fields and 0 for the 4-byte offset
+ *       fields.
+ * base: base register of the memory operand.
+ * disp: byte displacement of the field from base.
+ * imm:  amount to add; emitted as a single immediate byte.
+ *
+ * Bytes are emitted in this order: whatever prefix emit_rex produces, the
+ * X64_OPC_ALU_IMM8 opcode, the ModRM/displacement bytes produced by
+ * emit_mem_operand, then the immediate byte. */
+static void x64_add_mem_imm(MCEmitter* mc, int w, u32 base, i32 disp, i8 imm) {
+ u8 op = X64_OPC_ALU_IMM8;
+ u8 b;
+ emit_rex(mc, w, 0, 0, base);
+ mc->emit_bytes(mc, &op, 1);
+ emit_mem_operand(mc, X64_ALU_SUB_ADD, base, disp); /* modrm.reg = /0 (ADD) */
+ b = (u8)imm; /* reinterpret the signed immediate as the raw byte to emit */
+ mc->emit_bytes(mc, &b, 1);
+}
+
+/* add r64, [base+disp] (0x03 /r): dst += the qword stored at [base+disp].
+ * Always 64-bit (emit_rex is called with w = 1). dst is passed both to emit_rex
+ * and as the ModRM reg field; base/disp form the memory operand.
+ * x64_va_arg_core uses this to add the reg-save-area pointer to the offset
+ * it already holds in rax, so no extra register is needed for the sum. */
+static void x64_add_reg_mem(MCEmitter* mc, u32 dst, u32 base, i32 disp) {
+ u8 op = 0x03;
+ emit_rex(mc, 1, dst, 0, base);
+ mc->emit_bytes(mc, &op, 1);
+ emit_mem_operand(mc, dst, base, disp);
+}
+
static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
NativeTarget* t = &a->base;
MCEmitter* mc = t->mc;
@@ -2878,56 +2916,48 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
u32 sz = x64_type_size(t, type);
int is_fp = loc_is_fp(dst);
u32 dr = loc_reg(dst);
- u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2);
+ u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
+ /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list
+ * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the
+ * reg-save base is folded into rax (x64_add_reg_mem), so no third register is
+ * needed — leaving r10 (and rdx) free for the allocator. */
if (a->abi->shadow_space) {
- /* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) */
- emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0);
+ /* Win64: rax = *ap; load dr from [rax]; *ap += 8. */
+ emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
- /* add r10, 8; *ap = r10. */
- emit_rex(mc, 1, 0, 0, X64_R10);
- {
- u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8};
- mc->emit_bytes(mc, buf, 3);
- }
- emit_mov_store(mc, 8, X64_R10, ap_base, 0);
+ emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+ x64_add_mem_imm(mc, 1, ap_base, 0, 8);
return;
}
{
u32 offs_field = is_fp ? 4u : 0u;
u32 max_offs = is_fp ? 176u : 48u;
- u32 stride = is_fp ? 16u : 8u;
+ i8 stride = is_fp ? 16 : 8;
MCLabel L_stack = mc->label_new(mc);
MCLabel L_done = mc->label_new(mc);
/* eax = ap[offs]; cmp eax, max; jae L_stack. */
emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field);
emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs);
emit_jcc_rel32(mc, X64_CC_AE, L_stack);
- /* reg path: r10 = ap[16] + rax; load; eax += stride; ap[offs] = eax. */
- emit_mov_load(mc, 8, 0, X64_R10, ap_base, 16);
- emit_alu_rr(mc, 1, X64_OPC_ALU_ADD, X64_R10, X64_RAX);
+ /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load.
+ * (The memory increment leaves rax holding the old offset.) */
+ x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
+ x64_add_reg_mem(mc, X64_RAX, ap_base, 16);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
- emit_alu_imm8(mc, 0, X64_ALU_SUB_ADD, X64_RAX, (i8)stride);
- emit_mov_store(mc, 4, X64_RAX, ap_base, (i32)offs_field);
+ emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
emit_jmp_rel32(mc, L_done);
- /* stack path: r10 = ap[8]; load; r10 += 8; ap[8] = r10. */
+ /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. */
mc->label_place(mc, L_stack);
- emit_mov_load(mc, 8, 0, X64_R10, ap_base, 8);
+ emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_R10, 0);
- emit_rex(mc, 1, 0, 0, X64_R10);
- {
- u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8};
- mc->emit_bytes(mc, buf, 3);
- }
- emit_mov_store(mc, 8, X64_R10, ap_base, 8);
+ emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+ x64_add_mem_imm(mc, 1, ap_base, 8, 8);
mc->label_place(mc, L_done);
}
}
@@ -2936,15 +2966,16 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
NativeAddr src_ap) {
NativeTarget* t = &a->base;
MCEmitter* mc = t->mc;
- /* Resolve dst into r11, src into rax (disjoint from each other and from the
- * rdx copy scratch); force both so the optimizer's register choice for a
- * va_list pointer can't alias the copy scratch. */
+ /* Resolve dst into r11, src into rax, and copy each qword through the fp emit
+ * scratch xmm14. Uses only reserved emit scratch (r11/rax/xmm14), so the
+ * optimizer's register choice for a va_list pointer can never be clobbered and
+ * no allocable GPR (previously rdx) is consumed.
+ *
+ * The copy covers n bytes in 8-byte steps: n is 8 when the ABI has shadow
+ * space and 24 otherwise. Each step is a movsd load/store pair used purely
+ * as a 64-bit move through the fp scratch; no floating-point arithmetic is
+ * performed on the data. */
u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
u32 n = a->abi->shadow_space ? 8u : 24u, i;
for (i = 0; i < n; i += 8u) {
- emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i);
- emit_mov_store(mc, 8, X64_RDX, dst_base, (i32)i);
+ emit_sse_load(mc, 0xF2, 0x10, X64_TMP_FP, src_base, (i32)i); /* movsd fp-scratch, [src + i] */
+ emit_sse_store(mc, 0xF2, 0x11, X64_TMP_FP, dst_base, (i32)i); /* movsd [dst + i], fp-scratch */
}
}