commit 6cedc8f3b379c3369f243742bc1e3e23a5474e84
parent dc46a3135f40231846840a807f477f5d082525c3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 09:50:15 -0700
x64: make -O1 entry param-binds cycle-safe via the shared shuffle
At -O1 the allocator may place a tail function's params in a rotated register
set (pass_lower skips arg-reg hinting for tail functions). x64_bind_native_param
emitted the incoming->home moves one param at a time, so a rotation clobbered a
register a later bind still needed — e.g. caller(x,y,z) tail-calling
target(z,x,y) collapsed all three args to one value.
Defer register-destination binds into a pending list (incoming arg reg, or a
NATIVE_LOC_ADDR for an incoming stack slot) and resolve them together as a
parallel copy in a new bind_params_end hook, via the shared native_arg_shuffle.
Frame-destination and indirect binds stay eager: they only read incoming
registers, so they correctly precede the shuffle. New optional
NativeTarget.bind_params_end is called by the optimizer emit path after the last
bind; aa64/rv64 leave it NULL (their allocator doesn't rotate, so they were
already correct).
x64 toy X-O1: 137 -> 148 pass (all 9 tail cases + many-params fixed); 8 remain
(atomics, varargs, a couple others). No regressions: x64 O0 156/0, rv64 O1
156/0, aa64 1034/0.
x64: keep atomic cmpxchg/rmw address out of the fixed operand registers
At -O1 the optimizer can materialize the atomic operand address into rax/rcx
(cmpxchg) or rax/rcx/rdx (rmw) — exactly the registers x64_atomic_cas/_rmw
hardcode for expected/desired/prior/new/val. x64_addr_to_base_reg returns the
existing base register as-is when there is no index/displacement, so the address
landed in one of those and the operand setup then clobbered it: e.g. in
`leaq -8(%rbp),%rcx; movq %r12,%rcx; lock cmpxchgq %rcx,(%rcx)` the desired
value (42) overwrote the address in rcx, so the cmpxchg dereferenced 42.
Stage the address into r11 (the reserved int emit scratch, never an allocated
operand) whenever it occupies a fixed operand register, before any operand move.
Also place expected->rax / desired->rcx as a parallel copy (xchg on a full swap;
when only desired sits in rax, move it to rcx before rax is overwritten) rather
than two naive moves. x64 toy X-O1: 148 -> 151
(all 3 atomic cmpxchg/rmw cases). No -O0 change (the address is r11 there).
x64 -O1: fix va-list base + indirect-callee scratch register clobbers
Two -O1 register-aliasing bugs, both because the optimizer may place a value in
a register the codegen hardcodes:
- va_start/va_arg/va_copy use rax/r10/rdx for field values, but x64_addr_to_base_reg
returns the va_list pointer's own register when it's a simple base — which at
-O1 can be rax/rdx, so the field stores clobbered the pointer and wrote to
garbage. New x64_va_base() forces the va_list base into a reserved scratch
(r11, and rax for va_copy's source) that the field values never alias.
- For an indirect call the callee is staged in r11, but x64_emit_reg_arg_moves
used r11 as its integer cycle-break scratch, clobbering the callee with an arg
value (SIGSEGV on call *r11). The shuffle's int scratch is now a parameter:
rax when the callee occupies r11 (rax is never a SysV int arg reg and the
variadic AL count is written after the moves), r11 otherwise.
x64 toy X-O1: 151 -> 154 (fixes 112 indirect-many-args, 19 variadic+asm, 123
spec_demo). Remaining: 132 (bswap roundtrip), 133 (varargs mixed types). No
regressions: x64 O0 156/0, rv64 O0/O1 156/156.
Diffstat:
4 files changed, 170 insertions(+), 46 deletions(-)
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -342,6 +342,13 @@ struct NativeTarget {
* registers are never allocable, so reg destinations never alias an incoming
* arg register and ordering across params is unconstrained. */
void (*bind_param)(NativeTarget*, const CGParamDesc*, NativeLoc dst);
+ /* Optional. Called once by the optimizer emit path after the last bind_param,
+ * before the body. The allocator may rotate params across the incoming arg
+ * registers — a permutation that a naive per-param move order cannot
+ * realize — so a backend may defer its register-destination param binds and
+ * resolve them here as one parallel copy. Backends that bind eagerly leave
+ * this NULL. */
+ void (*bind_params_end)(NativeTarget*);
MCLabel (*label_new)(NativeTarget*);
void (*label_place)(NativeTarget*, MCLabel);
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -50,6 +50,9 @@ enum {
X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved) */
X64_TMP_FP2 = X64_XMM15, /* emit-internal fp scratch (reserved) */
X64_MAX_REG_ARG_MOVES = 16u,
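+ /* Capacity of the deferred entry register-bind list (-O1): bounded by the
+ * simultaneously-live register-homed param parts, i.e. the allocable register
+ * count. The flush goes through x64_emit_reg_arg_moves, which rejects more
+ * than X64_MAX_REG_ARG_MOVES moves. */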
+ X64_MAX_BIND_MOVES = 32u,
X64_MAX_CS_FP_REGS = 10u, /* Win64 xmm6..xmm15 */
};
@@ -98,6 +101,13 @@ typedef struct X64NativeTarget {
u32 prologue_nbytes;
MCLabel epilogue_label;
+ /* Optimizer (-O1) entry binds: register-destination param binds are deferred
+ * here and resolved as a parallel copy in x64_bind_params_end, since the
+ * allocator may rotate params across the incoming arg registers — a
+ * permutation the naive per-param move order would clobber. */
+ NativeArgMove bind_moves[X64_MAX_BIND_MOVES];
+ u32 nbind_moves;
+
const X64ABIRegs* abi;
} X64NativeTarget;
@@ -1466,6 +1476,7 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
a->reg_save_slot = NATIVE_FRAME_SLOT_NONE;
a->npatches = 0;
a->nalloca = 0;
+ a->nbind_moves = 0;
a->prologue_nbytes =
a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES;
@@ -1883,6 +1894,39 @@ static void x64_store_outgoing_part(NativeTarget* t, int tail_call,
}
/* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
+ Reg int_scratch);
+
+/* Queue one register-destination param bind for the parallel-copy flush in
+ * x64_bind_params_end instead of emitting the move now.
+ *   a    - target state; the move is appended to a->bind_moves and
+ *          a->nbind_moves is incremented.
+ *   dst  - destination location; the callers in this file pass a register loc.
+ *   src  - incoming location (an arg register, or a NATIVE_LOC_ADDR for an
+ *          incoming stack slot).
+ *   size - byte size recorded on the move; callers pass the ABI part size.
+ * Calls x64_panic when the list is already full (X64_MAX_BIND_MOVES entries).
+ * Every NativeArgMove field other than dst/src/size is left zeroed by the
+ * memset. */
+static void x64_defer_reg_bind(X64NativeTarget* a, NativeLoc dst, NativeLoc src,
+ u32 size) {
+ NativeArgMove* m;
+ if (a->nbind_moves >= X64_MAX_BIND_MOVES)
+ x64_panic(a, "too many register parameter binds");
+ m = &a->bind_moves[a->nbind_moves++];
+ memset(m, 0, sizeof *m);
+ m->dst = dst;
+ m->src = src;
+ m->size = size;
+}
+
+/* Incoming stack-arg source as a NATIVE_LOC_ADDR ([rbp + bias + stack_off]).
+ *   type - recorded as both the loc type and the address base_type.
+ *   cls  - allocation class stored in the loc (narrowed to u8).
+ *   off  - rbp-relative byte offset; callers pass incoming_bias plus the
+ *          running stack-arg cursor.
+ * Emits no code: the returned loc only names the slot. Fields not assigned
+ * here stay zero from the memset. */
+static NativeLoc x64_incoming_stack_loc(CfreeCgTypeId type, NativeAllocClass cls,
+ i32 off) {
+ NativeLoc l;
+ memset(&l, 0, sizeof l);
+ l.kind = NATIVE_LOC_ADDR;
+ l.cls = (u8)cls;
+ l.type = type;
+ l.v.addr.base_kind = NATIVE_ADDR_BASE_REG;
+ l.v.addr.base.reg = X64_RBP;
+ l.v.addr.base_type = type;
+ l.v.addr.offset = off;
+ return l;
+}
+
static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
NativeLoc dst) {
X64NativeTarget* a = x64_of(t);
@@ -1934,26 +1978,26 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
const ABIArgPart* part = &ai->parts[i];
NativeAllocClass cls =
part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
- NativeLoc src = x64_reg_loc(p->type, cls, tmp);
- NativeAddr sa;
- memset(&sa, 0, sizeof sa);
- sa.base_kind = NATIVE_ADDR_BASE_REG;
- sa.base.reg = X64_RBP;
- sa.base_type = p->type;
- sa.offset = incoming_bias + (i32)a->next_param_stack;
+ NativeLoc isrc = x64_incoming_stack_loc(
+ p->type, cls, incoming_bias + (i32)a->next_param_stack);
a->next_param_stack += 8u;
- x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size));
if (dst.kind == NATIVE_LOC_NONE) {
/* unused */
} else if (to_reg) {
- x64_move(t, x64_reg_loc(dst.type ? dst.type : p->type,
- (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
- src);
+ /* Defer: a register dst may be another param's incoming reg. */
+ x64_defer_reg_bind(a,
+ x64_reg_loc(dst.type ? dst.type : p->type,
+ (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
+ isrc, part->size);
} else {
+ /* Frame dst: load to scratch then store (memory dst is never a cycle
+ * source, so emit eagerly — it only reads the incoming slot). */
+ Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
+ NativeLoc tloc = x64_reg_loc(p->type, cls, tmp);
+ x64_load_part(t, tloc, isrc, 0, part->size);
x64_store_part(t,
x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
- src, 0, part->size);
+ tloc, 0, part->size);
}
}
return;
@@ -1963,42 +2007,58 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
const ABIArgPart* part = &ai->parts[i];
NativeAllocClass cls =
part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- NativeLoc src;
+ NativeLoc src; /* incoming: arg register, or NATIVE_LOC_ADDR for a stack arg */
if (cls == NATIVE_REG_FP && a->next_param_fp < a->abi->n_fp_args) {
src = x64_reg_loc(p->type, cls, (Reg)(X64_XMM0 + a->next_param_fp++));
} else if (cls == NATIVE_REG_INT &&
a->next_param_int < a->abi->n_int_args) {
src = x64_reg_loc(p->type, cls, a->abi->int_args[a->next_param_int++]);
} else {
- Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
- NativeAddr sa;
- src = x64_reg_loc(p->type, cls, tmp);
- memset(&sa, 0, sizeof sa);
- sa.base_kind = NATIVE_ADDR_BASE_REG;
- sa.base.reg = X64_RBP;
- sa.base_type = p->type;
- sa.offset = incoming_bias + (i32)a->next_param_stack;
- x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size));
+ src = x64_incoming_stack_loc(p->type, cls,
+ incoming_bias + (i32)a->next_param_stack);
a->next_param_stack += 8u;
}
x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp);
if (dst.kind == NATIVE_LOC_NONE) {
/* unused parameter; cursors advanced */
} else if (to_reg) {
- NativeLoc d = x64_reg_loc(dst.type ? dst.type : p->type,
- (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
- if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
- (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
- x64_move(t, d, src);
- } else {
+ /* Defer the register bind: the allocator may rotate params across the
+ * incoming arg registers, so a per-param move could clobber a register
+ * another bind still needs. x64_bind_params_end resolves them together as
+ * a parallel copy. */
+ x64_defer_reg_bind(a,
+ x64_reg_loc(dst.type ? dst.type : p->type,
+ (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
+ src, part->size);
+ } else if (src.kind == NATIVE_LOC_REG) {
x64_store_part(t,
x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
src, 0, part->size);
+ } else {
+ /* Stack source -> frame dst: load to scratch, then store. */
+ Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
+ NativeLoc tloc = x64_reg_loc(p->type, cls, tmp);
+ x64_load_part(t, tloc, src, 0, part->size);
+ x64_store_part(t,
+ x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
+ tloc, 0, part->size);
}
}
a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
}
+/* NativeTarget bind_params_end hook. Flush the deferred register-destination
+ * param binds as a parallel copy (the shared scheduler breaks any cycle the
+ * allocator's rotation created through the int/fp emit scratch). Frame-dst and
+ * indirect binds were emitted eagerly in bind_param — they only read incoming
+ * registers, so they precede this. Emits nothing when no bind was deferred,
+ * and always leaves the deferred list empty (nbind_moves = 0).
+ * NOTE(review): the list accepts up to X64_MAX_BIND_MOVES entries, but
+ * x64_emit_reg_arg_moves panics ("too many register args") when n exceeds
+ * X64_MAX_REG_ARG_MOVES, so the effective cap here is the smaller of the two
+ * — confirm that is intended. */
+static void x64_bind_params_end(NativeTarget* t) {
+ X64NativeTarget* a = x64_of(t);
+ /* No callee is staged during entry binds, so r11 is free as the cycle scratch. */
+ if (a->nbind_moves)
+ x64_emit_reg_arg_moves(t, a->bind_moves, a->nbind_moves, X64_TMP_INT2);
+ a->nbind_moves = 0;
+}
+
/* ============================ calls / returns ============================ */
typedef NativeArgMove X64ArgMove;
@@ -2015,17 +2075,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
}
}
-/* Parallel-copy register arg moves via the shared scheduler; cycles break
- * through the int/fp emit scratch (r11 / xmm14). */
-static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
- u32 n) {
+/* Parallel-copy register arg moves via the shared scheduler
+ * (native_arg_shuffle), emitting the `n` entries of `moves`. `int_scratch` is
+ * the register used to break an integer cycle: normally r11, but rax when an
+ * indirect callee is staged in r11 (rax is never a SysV int arg register and
+ * the variadic AL count is written only after the moves). The fp cycle scratch
+ * is always X64_TMP_FP. Calls x64_panic when n exceeds
+ * X64_MAX_REG_ARG_MOVES. */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
+ Reg int_scratch) {
NativeArgShuffle s;
if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args");
memset(&s, 0, sizeof s);
s.t = t;
s.emit_one = x64_emit_one_arg_move;
s.reg_move = x64_move;
- s.scratch[NATIVE_REG_INT] = X64_TMP_INT2;
+ s.scratch[NATIVE_REG_INT] = int_scratch;
s.scratch[NATIVE_REG_FP] = X64_TMP_FP;
native_arg_shuffle(&s, moves, n);
}
@@ -2173,7 +2235,13 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
}
}
}
- x64_emit_reg_arg_moves(t, moves, nmoves);
+ /* If an indirect callee was staged in r11 above, the cycle scratch must
+ * avoid it; rax is free here (not an int arg reg; AL count comes later). */
+ x64_emit_reg_arg_moves(
+ t, moves, nmoves,
+ (plan->callee.kind == NATIVE_LOC_REG && plan->callee.v.reg == X64_R11)
+ ? X64_TMP_INT
+ : X64_TMP_INT2);
if (abi && abi->has_sret && desc->nresults) {
/* sret pointer in the first int-arg reg. A tail call forwards the
* caller's own incoming sret pointer (spilled at entry); otherwise pass
@@ -2634,11 +2702,18 @@ static void x64_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst,
MCEmitter* mc = t->mc;
u32 sz = mem.size ? mem.size : x64_type_size(t, dst.type);
int w = sz == 8u ? 1 : 0;
- u32 base = x64_atomic_base(a, addr); /* r11 */
+ u32 base = x64_atomic_base(a, addr);
u32 dr = loc_reg(dst);
u32 vr = loc_reg(val);
(void)mo; /* LOCK ops are full barriers. */
- /* val staged in rdx (base owns r11; rax/rcx used by the cmpxchg loop). */
+ /* The rmw uses fixed rax (prior), rcx (new), rdx (val); the optimizer may have
+ * materialized the address into one of them, so keep it out (r11 is the int
+ * emit scratch, never an allocated operand). Stage before rdx is loaded. */
+ if (base == X64_RAX || base == X64_RCX || base == X64_RDX) {
+ emit_mov_rr(mc, 1, X64_TMP_INT2, base);
+ base = X64_TMP_INT2;
+ }
+ /* val staged in rdx (rax/rcx used by the cmpxchg loop). */
emit_mov_rr(mc, w, X64_RDX, vr);
if (op == AO_ADD || op == AO_SUB) {
if (op == AO_SUB) emit_f7_rm(mc, w, X64_F7_SUB_NEG, X64_RDX);
@@ -2698,16 +2773,38 @@ static void x64_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
MCEmitter* mc = t->mc;
u32 sz = mem.size ? mem.size : x64_type_size(t, prior.type);
int w = sz == 8u ? 1 : 0;
- u32 base = x64_atomic_base(a, addr); /* r11 */
+ u32 base = x64_atomic_base(a, addr);
u32 rprior = loc_reg(prior);
u32 rok = loc_reg(ok);
u32 rexp = loc_reg(expected);
u32 rdes = loc_reg(desired);
(void)success;
(void)failure;
- /* rax = expected; rcx = desired. */
- if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
- if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+ /* cmpxchg uses fixed rax (expected) and rcx (desired). The optimizer may have
+ * materialized the address into either; keep it out of both (r11 is the int
+ * emit scratch, never an allocated operand). */
+ if (base == X64_RAX || base == X64_RCX) {
+ emit_mov_rr(mc, 1, X64_TMP_INT2, base);
+ base = X64_TMP_INT2;
+ }
+ /* Place expected -> rax and desired -> rcx as a parallel copy: the allocator
+ * may have them in each other's target register (full swap) or desired in rax
+ * (expected's target), either of which a naive two-move order would clobber. */
+ if (rexp == X64_RCX && rdes == X64_RAX) {
+ /* Swap rax <-> rcx (xchg needs no temp; base is not rax/rcx here). */
+ emit_rex(mc, w, X64_RCX, 0, X64_RAX);
+ {
+ u8 xchg[2] = {0x87, modrm(3u, X64_RCX, X64_RAX)};
+ mc->emit_bytes(mc, xchg, 2);
+ }
+ } else if (rdes == X64_RAX) {
+ /* desired sits in rax; move it to rcx before rax is overwritten. */
+ if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+ if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
+ } else {
+ if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
+ if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+ }
emit_lock_prefix(mc);
emit_rex(mc, w, X64_RCX, 0, base);
{
@@ -2731,12 +2828,25 @@ static void x64_fence(NativeTarget* t, MemOrder mo) {
* into the matching GPR slot at the call site. `ap` addresses the va_list
* object. */
+/* Resolve a va_list address into `scratch`, materializing it there if it is not
+ * already, so the va field-value scratch registers (rax / r10 / rdx) never alias
+ * it. At -O1 the optimizer may place the va_list pointer in any register —
+ * including those — and the va code would then clobber the pointer mid-sequence.
+ *   ap      - address of the va_list object.
+ *   scratch - register that must hold the base on return.
+ * Returns `scratch` in every case: when x64_addr_to_base_reg hands back a
+ * different register, one mov copies it into `scratch` first. That mov uses
+ * width flag 1, the value the atomic helpers in this file pass for 8-byte
+ * operands. */
+static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
+ u32 base = x64_addr_to_base_reg(a, ap, scratch);
+ if (base != scratch) {
+ emit_mov_rr(a->base.mc, 1, scratch, base);
+ base = scratch;
+ }
+ return base;
+}
+
static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
NativeTarget* t = &a->base;
MCEmitter* mc = t->mc;
u32 ap_base;
if (!a->is_variadic) x64_panic(a, "va_start: function not variadic");
- ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2);
+ ap_base = x64_va_base(a, ap, X64_TMP_INT2);
if (a->abi->shadow_space) {
/* Win64: *ap = rbp + 16 + named_int*8 + named_stack. */
u32 first = 16u + a->next_param_int * 8u + a->next_param_stack;
@@ -2768,7 +2878,7 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
u32 sz = x64_type_size(t, type);
int is_fp = loc_is_fp(dst);
u32 dr = loc_reg(dst);
- u32 ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2);
+ u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2);
if (a->abi->shadow_space) {
/* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) */
emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0);
@@ -2826,9 +2936,11 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
NativeAddr src_ap) {
NativeTarget* t = &a->base;
MCEmitter* mc = t->mc;
- /* Resolve dst into r11 first, src into rax (disjoint). */
- u32 dst_base = x64_addr_to_base_reg(a, dst_ap, X64_TMP_INT2);
- u32 src_base = x64_addr_to_base_reg(a, src_ap, X64_TMP_INT);
+ /* Resolve dst into r11, src into rax (disjoint from each other and from the
+ * rdx copy scratch); force both so the optimizer's register choice for a
+ * va_list pointer can't alias the copy scratch. */
+ u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
+ u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
u32 n = a->abi->shadow_space ? 8u : 24u, i;
for (i = 0; i < n; i += 8u) {
emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i);
@@ -3521,6 +3633,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
t->addr_legal = x64_addr_legal;
t->func_begin = x64_func_begin;
t->func_begin_known_frame = x64_func_begin_known_frame;
+ t->bind_params_end = x64_bind_params_end;
t->note_frame_state = NULL;
/* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
* set; x64_func_begin_known_frame derives the records from the masks. */
diff --git a/src/cg/native_frame.h b/src/cg/native_frame.h
@@ -1,3 +1,4 @@
+/* see also: cg/native_argmove.h (shared parallel-copy register shuffle) */
#ifndef CFREE_CG_NATIVE_FRAME_H
#define CFREE_CG_NATIVE_FRAME_H
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -596,6 +596,9 @@ static void bind_params(NativeEmitCtx* e) {
map_slot(e, p->storage.v.frame_slot, p->loc));
if (e->target->bind_param) e->target->bind_param(e->target, &sd, dst);
}
+ /* Let a backend that defers register-destination binds resolve them now (as a
+ * parallel copy), once every param's incoming location has been read. */
+ if (e->target->bind_params_end) e->target->bind_params_end(e->target);
}
/* The parameter value is placed into its allocated location by bind_param at