kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 6d9c6b8fea4710d515208ecd332168a7327d04e5
parent 51b7bafacd32b879a65c5cb63ef0d8efa335e41b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 10:31:16 -0700

x64 -O1: use only reserved emit scratch in va_arg/va_copy/copy_bytes

Several backend emit helpers used allocable registers (r10, rdx) as ad-hoc
scratch. At -O1 the register allocator places live values there, so the helper
clobbered them — e.g. test 132's bswap roundtrip failed because copy_bytes
clobbered rdx, where the live bs32_round value sat. Rework the heavy helpers to
draw only from the reserved emit scratch (rax/r11, and xmm14 for fp copies), so
no allocable register is ever consumed and the allocable pool is maximized:

- va_arg: advance the gp/fp offset and overflow pointer in memory (add r/m,imm)
  and fold the reg-save base into rax (add r64,[mem]) — only rax+r11, no r10.
- va_copy: copy each qword through the fp scratch xmm14 — only rax/r11/xmm14, no rdx.
- copy_bytes: copy each 8/4/2/1 chunk through rax via x64_emit_mem (which resolves
  the address with its own r11 scratch) — only rax/r11, no rdx.

r10 and rdx are now never backend scratch, so they stay allocable. x64 toy X-O1
is now 156/0 (was 155/1). No regressions: x64 O0 156/0, rv64 O0/O1 156/156,
aa64 1034/0.

Diffstat:
M src/arch/x64/native.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
1 file changed, 87 insertions(+), 56 deletions(-)

diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -740,29 +740,47 @@ static u32 x64_addr_to_base_reg(X64NativeTarget* a, NativeAddr addr, static void x64_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, AggregateAccess access) { X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - u32 dr = x64_addr_to_base_reg(a, dst, X64_TMP_INT2); - u32 sr = x64_addr_to_base_reg(a, src, X64_TMP_INT); + /* Copy chunk by chunk (8/4/2/1) through the value scratch rax, letting + * x64_emit_mem resolve each address with its own scratch (r11). Uses only the + * reserved emit scratch (rax/r11) — no ad-hoc allocable temp (previously rdx), + * which the optimizer may have live across the copy. */ + CfreeCgTypeId tys[4]; u32 n = access.size, i = 0; - while (i + 8u <= n) { - emit_mov_load(mc, 8, 0, X64_RDX, sr, (i32)i); - emit_mov_store(mc, 8, X64_RDX, dr, (i32)i); - i += 8u; - } - while (i + 4u <= n) { - emit_mov_load(mc, 4, 0, X64_RDX, sr, (i32)i); - emit_mov_store(mc, 4, X64_RDX, dr, (i32)i); - i += 4u; - } - while (i + 2u <= n) { - emit_mov_load(mc, 2, 0, X64_RDX, sr, (i32)i); - emit_mov_store(mc, 2, X64_RDX, dr, (i32)i); - i += 2u; - } + tys[0] = builtin_id(CFREE_CG_BUILTIN_I64); + tys[1] = builtin_id(CFREE_CG_BUILTIN_I32); + tys[2] = builtin_id(CFREE_CG_BUILTIN_I16); + tys[3] = builtin_id(CFREE_CG_BUILTIN_I8); while (i < n) { - emit_mov_load(mc, 1, 0, X64_RDX, sr, (i32)i); - emit_mov_store(mc, 1, X64_RDX, dr, (i32)i); - i += 1u; + u32 rem = n - i, s; + CfreeCgTypeId ty; + NativeAddr sa = src, da = dst; + NativeLoc val; + MemAccess mem; + if (rem >= 8u) { + s = 8u; + ty = tys[0]; + } else if (rem >= 4u) { + s = 4u; + ty = tys[1]; + } else if (rem >= 2u) { + s = 2u; + ty = tys[2]; + } else { + s = 1u; + ty = tys[3]; + } + sa.offset += (i32)i; + sa.base_type = ty; + da.offset += (i32)i; + da.base_type = ty; + val = x64_reg_loc(ty, NATIVE_REG_INT, X64_TMP_INT); + memset(&mem, 0, sizeof mem); + mem.type = ty; + mem.size = s; + mem.align = s; + x64_emit_mem(a, 
1, val, sa, mem); /* rax = [src + i] */ + x64_emit_mem(a, 0, val, da, mem); /* [dst + i] = rax */ + i += s; } } @@ -2841,6 +2859,26 @@ static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) { return base; } +/* add r/m, imm8 (group-1 /0) directly to a memory field — advances a va_list + * offset/pointer in place without consuming a register. w selects 64- vs 32-bit. */ +static void x64_add_mem_imm(MCEmitter* mc, int w, u32 base, i32 disp, i8 imm) { + u8 op = X64_OPC_ALU_IMM8; + u8 b; + emit_rex(mc, w, 0, 0, base); + mc->emit_bytes(mc, &op, 1); + emit_mem_operand(mc, X64_ALU_SUB_ADD, base, disp); /* modrm.reg = /0 (ADD) */ + b = (u8)imm; + mc->emit_bytes(mc, &b, 1); +} + +/* add r64, [base+disp] (0x03 /r). */ +static void x64_add_reg_mem(MCEmitter* mc, u32 dst, u32 base, i32 disp) { + u8 op = 0x03; + emit_rex(mc, 1, dst, 0, base); + mc->emit_bytes(mc, &op, 1); + emit_mem_operand(mc, dst, base, disp); +} + static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) { NativeTarget* t = &a->base; MCEmitter* mc = t->mc; @@ -2878,56 +2916,48 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap, u32 sz = x64_type_size(t, type); int is_fp = loc_is_fp(dst); u32 dr = loc_reg(dst); - u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); + u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */ + /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list + * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the + * reg-save base is folded into rax (x64_add_reg_mem), so no third register is + * needed — leaving r10 (and rdx) free for the allocator. */ if (a->abi->shadow_space) { - /* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) */ - emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0); + /* Win64: rax = *ap; load dr from [rax]; *ap += 8. 
*/ + emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); else - emit_mov_load(mc, sz, 0, dr, X64_R10, 0); - /* add r10, 8; *ap = r10. */ - emit_rex(mc, 1, 0, 0, X64_R10); - { - u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8}; - mc->emit_bytes(mc, buf, 3); - } - emit_mov_store(mc, 8, X64_R10, ap_base, 0); + emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); + x64_add_mem_imm(mc, 1, ap_base, 0, 8); return; } { u32 offs_field = is_fp ? 4u : 0u; u32 max_offs = is_fp ? 176u : 48u; - u32 stride = is_fp ? 16u : 8u; + i8 stride = is_fp ? 16 : 8; MCLabel L_stack = mc->label_new(mc); MCLabel L_done = mc->label_new(mc); /* eax = ap[offs]; cmp eax, max; jae L_stack. */ emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field); emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs); emit_jcc_rel32(mc, X64_CC_AE, L_stack); - /* reg path: r10 = ap[16] + rax; load; eax += stride; ap[offs] = eax. */ - emit_mov_load(mc, 8, 0, X64_R10, ap_base, 16); - emit_alu_rr(mc, 1, X64_OPC_ALU_ADD, X64_R10, X64_RAX); + /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load. + * (The memory increment leaves rax holding the old offset.) */ + x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride); + x64_add_reg_mem(mc, X64_RAX, ap_base, 16); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); else - emit_mov_load(mc, sz, 0, dr, X64_R10, 0); - emit_alu_imm8(mc, 0, X64_ALU_SUB_ADD, X64_RAX, (i8)stride); - emit_mov_store(mc, 4, X64_RAX, ap_base, (i32)offs_field); + emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); emit_jmp_rel32(mc, L_done); - /* stack path: r10 = ap[8]; load; r10 += 8; ap[8] = r10. */ + /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. 
*/ mc->label_place(mc, L_stack); - emit_mov_load(mc, 8, 0, X64_R10, ap_base, 8); + emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_R10, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); else - emit_mov_load(mc, sz, 0, dr, X64_R10, 0); - emit_rex(mc, 1, 0, 0, X64_R10); - { - u8 buf[3] = {X64_OPC_ALU_IMM8, modrm(3u, X64_ALU_SUB_ADD, X64_R10 & 7u), 8}; - mc->emit_bytes(mc, buf, 3); - } - emit_mov_store(mc, 8, X64_R10, ap_base, 8); + emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); + x64_add_mem_imm(mc, 1, ap_base, 8, 8); mc->label_place(mc, L_done); } } @@ -2936,15 +2966,16 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap, NativeAddr src_ap) { NativeTarget* t = &a->base; MCEmitter* mc = t->mc; - /* Resolve dst into r11, src into rax (disjoint from each other and from the - * rdx copy scratch); force both so the optimizer's register choice for a - * va_list pointer can't alias the copy scratch. */ + /* Resolve dst into r11, src into rax, and copy each qword through the fp emit + * scratch xmm14. Uses only reserved emit scratch (r11/rax/xmm14), so the + * optimizer's register choice for a va_list pointer can never be clobbered and + * no allocable GPR (previously rdx) is consumed. */ u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2); u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT); u32 n = a->abi->shadow_space ? 8u : 24u, i; for (i = 0; i < n; i += 8u) { - emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i); - emit_mov_store(mc, 8, X64_RDX, dst_base, (i32)i); + emit_sse_load(mc, 0xF2, 0x10, X64_TMP_FP, src_base, (i32)i); /* movsd */ + emit_sse_store(mc, 0xF2, 0x11, X64_TMP_FP, dst_base, (i32)i); /* movsd */ } }