kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 6cedc8f3b379c3369f243742bc1e3e23a5474e84
parent dc46a3135f40231846840a807f477f5d082525c3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 09:50:15 -0700

x64: make -O1 entry param-binds cycle-safe via the shared shuffle

At -O1 the allocator may place a tail function's params in a rotated register
set (pass_lower skips arg-reg hinting for tail functions). x64_bind_native_param
emitted the incoming->home moves one param at a time, so a rotation clobbered a
register a later bind still needed — e.g. caller(x,y,z) tail-calling
target(z,x,y) collapsed all three args to one value.

Defer register-destination binds into a pending list (incoming arg reg, or a
NATIVE_LOC_ADDR for an incoming stack slot) and resolve them together as a
parallel copy in a new bind_params_end hook, via the shared native_arg_shuffle.
Frame-destination and indirect binds stay eager: they only read incoming
registers, so they correctly precede the shuffle. New optional
NativeTarget.bind_params_end is called by the optimizer emit path after the last
bind; aa64/rv64 leave it NULL (their allocator doesn't rotate, so they were
already correct).

x64 toy X-O1: 137 -> 148 pass (all 9 tail cases + many-params fixed); 8 remain
(atomics, varargs, a couple others). No regressions: x64 O0 156/0, rv64 O1
156/0, aa64 1034/0.

x64: keep atomic cmpxchg/rmw address out of the fixed operand registers

At -O1 the optimizer can materialize the atomic operand address into rax/rcx
(cmpxchg) or rax/rcx/rdx (rmw) — exactly the registers x64_atomic_cas/_rmw
hardcode for expected/desired/prior/new/val. x64_addr_to_base_reg returns the
existing base register as-is when there is no index/displacement, so the address
landed in one of those and the operand setup then clobbered it: e.g.
`leaq -8(%rbp),%rcx; movq %r12,%rcx; lock cmpxchgq %rcx,(%rcx)` dereferenced 42.

Stage the address into r11 (the reserved int emit scratch, never an allocated
operand) whenever it occupies a fixed operand register, before any operand move.
Also place expected->rax / desired->rcx as a parallel copy (an xchg on a full
swap; desired moved to rcx first when it sits in rax) rather than two naive
moves. x64 toy X-O1: 148 -> 151 (all 3 atomic cmpxchg/rmw cases). No -O0 change
(the address is already in r11 there).

x64 -O1: fix va-list base + indirect-callee scratch register clobbers

Two -O1 register-aliasing bugs, both because the optimizer may place a value in
a register the codegen hardcodes:

- va_start/va_arg/va_copy use rax/r10/rdx for field values, but x64_addr_to_base_reg
  returns the va_list pointer's own register when it's a simple base — which at
  -O1 can be rax/rdx, so the field stores clobbered the pointer and wrote to
  garbage. New x64_va_base() forces the va_list base into a reserved scratch
  (r11, and rax for va_copy's source) that the field values never alias.

- For an indirect call the callee is staged in r11, but x64_emit_reg_arg_moves
  used r11 as its integer cycle-break scratch, clobbering the callee with an arg
  value (SIGSEGV on call *r11). The shuffle's int scratch is now a parameter:
  rax when the callee occupies r11 (rax is never a SysV int arg reg and the
  variadic AL count is written after the moves), r11 otherwise.

x64 toy X-O1: 151 -> 154 (fixes 112 indirect-many-args, 19 variadic+asm, 123
spec_demo). Remaining: 132 (bswap roundtrip), 133 (varargs mixed types). No
regressions: x64 O0 156/0, rv64 O0/O1 156/156.

Diffstat:
M src/arch/native_target.h | 7 +++++++
M src/arch/x64/native.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M src/cg/native_frame.h | 1 +
M src/opt/pass_native_emit.c | 3 +++
4 files changed, 170 insertions(+), 46 deletions(-)

diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -342,6 +342,13 @@ struct NativeTarget { * registers are never allocable, so reg destinations never alias an incoming * arg register and ordering across params is unconstrained. */ void (*bind_param)(NativeTarget*, const CGParamDesc*, NativeLoc dst); + /* Optional. Called once by the optimizer emit path after the last bind_param, + * before the body. Lets a backend that defers register-destination param + * binds (to resolve them as a parallel copy, since the allocator may rotate + * params across the incoming arg registers — a permutation the naive + * per-param move order cannot realize) flush them now. Backends that bind + * eagerly leave this NULL. */ + void (*bind_params_end)(NativeTarget*); MCLabel (*label_new)(NativeTarget*); void (*label_place)(NativeTarget*, MCLabel); diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -50,6 +50,9 @@ enum { X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved) */ X64_TMP_FP2 = X64_XMM15, /* emit-internal fp scratch (reserved) */ X64_MAX_REG_ARG_MOVES = 16u, + /* Deferred entry register-binds (-O1): bounded by simultaneously-live + * register-homed param parts, i.e. the allocable register count. */ + X64_MAX_BIND_MOVES = 32u, X64_MAX_CS_FP_REGS = 10u, /* Win64 xmm6..xmm15 */ }; @@ -98,6 +101,13 @@ typedef struct X64NativeTarget { u32 prologue_nbytes; MCLabel epilogue_label; + /* Optimizer (-O1) entry binds: register-destination param binds are deferred + * here and resolved as a parallel copy in x64_bind_params_end, since the + * allocator may rotate params across the incoming arg registers — a + * permutation the naive per-param move order would clobber. 
*/ + NativeArgMove bind_moves[X64_MAX_BIND_MOVES]; + u32 nbind_moves; + const X64ABIRegs* abi; } X64NativeTarget; @@ -1466,6 +1476,7 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { a->reg_save_slot = NATIVE_FRAME_SLOT_NONE; a->npatches = 0; a->nalloca = 0; + a->nbind_moves = 0; a->prologue_nbytes = a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES; @@ -1883,6 +1894,39 @@ static void x64_store_outgoing_part(NativeTarget* t, int tail_call, } /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */ +static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n, + Reg int_scratch); + +/* Defer a register-destination param bind for the parallel-copy flush in + * x64_bind_params_end. `src` is the incoming location (an arg register, or a + * NATIVE_LOC_ADDR for an incoming stack slot). */ +static void x64_defer_reg_bind(X64NativeTarget* a, NativeLoc dst, NativeLoc src, + u32 size) { + NativeArgMove* m; + if (a->nbind_moves >= X64_MAX_BIND_MOVES) + x64_panic(a, "too many register parameter binds"); + m = &a->bind_moves[a->nbind_moves++]; + memset(m, 0, sizeof *m); + m->dst = dst; + m->src = src; + m->size = size; +} + +/* Incoming stack-arg source as a NATIVE_LOC_ADDR ([rbp + bias + stack_off]). */ +static NativeLoc x64_incoming_stack_loc(CfreeCgTypeId type, NativeAllocClass cls, + i32 off) { + NativeLoc l; + memset(&l, 0, sizeof l); + l.kind = NATIVE_LOC_ADDR; + l.cls = (u8)cls; + l.type = type; + l.v.addr.base_kind = NATIVE_ADDR_BASE_REG; + l.v.addr.base.reg = X64_RBP; + l.v.addr.base_type = type; + l.v.addr.offset = off; + return l; +} + static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, NativeLoc dst) { X64NativeTarget* a = x64_of(t); @@ -1934,26 +1978,26 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, const ABIArgPart* part = &ai->parts[i]; NativeAllocClass cls = part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; - Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; - NativeLoc src = x64_reg_loc(p->type, cls, tmp); - NativeAddr sa; - memset(&sa, 0, sizeof sa); - sa.base_kind = NATIVE_ADDR_BASE_REG; - sa.base.reg = X64_RBP; - sa.base_type = p->type; - sa.offset = incoming_bias + (i32)a->next_param_stack; + NativeLoc isrc = x64_incoming_stack_loc( + p->type, cls, incoming_bias + (i32)a->next_param_stack); a->next_param_stack += 8u; - x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size)); if (dst.kind == NATIVE_LOC_NONE) { /* unused */ } else if (to_reg) { - x64_move(t, x64_reg_loc(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg), - src); + /* Defer: a register dst may be another param's incoming reg. */ + x64_defer_reg_bind(a, + x64_reg_loc(dst.type ? dst.type : p->type, + (NativeAllocClass)dst.cls, (Reg)dst.v.reg), + isrc, part->size); } else { + /* Frame dst: load to scratch then store (memory dst is never a cycle + * source, so emit eagerly — it only reads the incoming slot). */ + Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; + NativeLoc tloc = x64_reg_loc(p->type, cls, tmp); + x64_load_part(t, tloc, isrc, 0, part->size); x64_store_part(t, x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset), - src, 0, part->size); + tloc, 0, part->size); } } return; @@ -1963,42 +2007,58 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, const ABIArgPart* part = &ai->parts[i]; NativeAllocClass cls = part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; - NativeLoc src; + NativeLoc src; /* incoming: arg register, or NATIVE_LOC_ADDR for a stack arg */ if (cls == NATIVE_REG_FP && a->next_param_fp < a->abi->n_fp_args) { src = x64_reg_loc(p->type, cls, (Reg)(X64_XMM0 + a->next_param_fp++)); } else if (cls == NATIVE_REG_INT && a->next_param_int < a->abi->n_int_args) { src = x64_reg_loc(p->type, cls, a->abi->int_args[a->next_param_int++]); } else { - Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; - NativeAddr sa; - src = x64_reg_loc(p->type, cls, tmp); - memset(&sa, 0, sizeof sa); - sa.base_kind = NATIVE_ADDR_BASE_REG; - sa.base.reg = X64_RBP; - sa.base_type = p->type; - sa.offset = incoming_bias + (i32)a->next_param_stack; - x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size)); + src = x64_incoming_stack_loc(p->type, cls, + incoming_bias + (i32)a->next_param_stack); a->next_param_stack += 8u; } x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp); if (dst.kind == NATIVE_LOC_NONE) { /* unused parameter; cursors advanced */ } else if (to_reg) { - NativeLoc d = x64_reg_loc(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg); - if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) && - (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) - x64_move(t, d, src); - } else { + /* Defer the register bind: the allocator may rotate params across the + * incoming arg registers, so a per-param move could clobber a register + * another bind still needs. x64_bind_params_end resolves them together as + * a parallel copy. */ + x64_defer_reg_bind(a, + x64_reg_loc(dst.type ? dst.type : p->type, + (NativeAllocClass)dst.cls, (Reg)dst.v.reg), + src, part->size); + } else if (src.kind == NATIVE_LOC_REG) { x64_store_part(t, x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset), src, 0, part->size); + } else { + /* Stack source -> frame dst: load to scratch, then store. */ + Reg tmp = cls == NATIVE_REG_FP ? 
X64_TMP_FP : X64_TMP_INT; + NativeLoc tloc = x64_reg_loc(p->type, cls, tmp); + x64_load_part(t, tloc, src, 0, part->size); + x64_store_part(t, + x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset), + tloc, 0, part->size); } } a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); } +/* Flush the deferred register-destination param binds as a parallel copy (the + * shared scheduler breaks any cycle the allocator's rotation created through + * the int/fp emit scratch). Frame-dst and indirect binds were emitted eagerly + * in bind_param — they only read incoming registers, so they precede this. */ +static void x64_bind_params_end(NativeTarget* t) { + X64NativeTarget* a = x64_of(t); + /* No callee is staged during entry binds, so r11 is free as the cycle scratch. */ + if (a->nbind_moves) + x64_emit_reg_arg_moves(t, a->bind_moves, a->nbind_moves, X64_TMP_INT2); + a->nbind_moves = 0; +} + /* ============================ calls / returns ============================ */ typedef NativeArgMove X64ArgMove; @@ -2015,17 +2075,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { } } -/* Parallel-copy register arg moves via the shared scheduler; cycles break - * through the int/fp emit scratch (r11 / xmm14). */ -static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, - u32 n) { +/* Parallel-copy register arg moves via the shared scheduler. `int_scratch` is + * the register used to break an integer cycle: normally r11, but rax when an + * indirect callee is staged in r11 (rax is never a SysV int arg register and + * the variadic AL count is written only after the moves). 
*/ +static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n, + Reg int_scratch) { NativeArgShuffle s; if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args"); memset(&s, 0, sizeof s); s.t = t; s.emit_one = x64_emit_one_arg_move; s.reg_move = x64_move; - s.scratch[NATIVE_REG_INT] = X64_TMP_INT2; + s.scratch[NATIVE_REG_INT] = int_scratch; s.scratch[NATIVE_REG_FP] = X64_TMP_FP; native_arg_shuffle(&s, moves, n); } @@ -2173,7 +2235,13 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } } } - x64_emit_reg_arg_moves(t, moves, nmoves); + /* If an indirect callee was staged in r11 above, the cycle scratch must + * avoid it; rax is free here (not an int arg reg; AL count comes later). */ + x64_emit_reg_arg_moves( + t, moves, nmoves, + (plan->callee.kind == NATIVE_LOC_REG && plan->callee.v.reg == X64_R11) + ? X64_TMP_INT + : X64_TMP_INT2); if (abi && abi->has_sret && desc->nresults) { /* sret pointer in the first int-arg reg. A tail call forwards the * caller's own incoming sret pointer (spilled at entry); otherwise pass @@ -2634,11 +2702,18 @@ static void x64_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst, MCEmitter* mc = t->mc; u32 sz = mem.size ? mem.size : x64_type_size(t, dst.type); int w = sz == 8u ? 1 : 0; - u32 base = x64_atomic_base(a, addr); /* r11 */ + u32 base = x64_atomic_base(a, addr); u32 dr = loc_reg(dst); u32 vr = loc_reg(val); (void)mo; /* LOCK ops are full barriers. */ - /* val staged in rdx (base owns r11; rax/rcx used by the cmpxchg loop). */ + /* The rmw uses fixed rax (prior), rcx (new), rdx (val); the optimizer may have + * materialized the address into one of them, so keep it out (r11 is the int + * emit scratch, never an allocated operand). Stage before rdx is loaded. */ + if (base == X64_RAX || base == X64_RCX || base == X64_RDX) { + emit_mov_rr(mc, 1, X64_TMP_INT2, base); + base = X64_TMP_INT2; + } + /* val staged in rdx (rax/rcx used by the cmpxchg loop). 
*/ emit_mov_rr(mc, w, X64_RDX, vr); if (op == AO_ADD || op == AO_SUB) { if (op == AO_SUB) emit_f7_rm(mc, w, X64_F7_SUB_NEG, X64_RDX); @@ -2698,16 +2773,38 @@ static void x64_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, MCEmitter* mc = t->mc; u32 sz = mem.size ? mem.size : x64_type_size(t, prior.type); int w = sz == 8u ? 1 : 0; - u32 base = x64_atomic_base(a, addr); /* r11 */ + u32 base = x64_atomic_base(a, addr); u32 rprior = loc_reg(prior); u32 rok = loc_reg(ok); u32 rexp = loc_reg(expected); u32 rdes = loc_reg(desired); (void)success; (void)failure; - /* rax = expected; rcx = desired. */ - if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp); - if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes); + /* cmpxchg uses fixed rax (expected) and rcx (desired). The optimizer may have + * materialized the address into either; keep it out of both (r11 is the int + * emit scratch, never an allocated operand). */ + if (base == X64_RAX || base == X64_RCX) { + emit_mov_rr(mc, 1, X64_TMP_INT2, base); + base = X64_TMP_INT2; + } + /* Place expected -> rax and desired -> rcx as a parallel copy: the allocator + * may have them in each other's target register (full swap) or desired in rax + * (expected's target), either of which a naive two-move order would clobber. */ + if (rexp == X64_RCX && rdes == X64_RAX) { + /* Swap rax <-> rcx (xchg needs no temp; base is not rax/rcx here). */ + emit_rex(mc, w, X64_RCX, 0, X64_RAX); + { + u8 xchg[2] = {0x87, modrm(3u, X64_RCX, X64_RAX)}; + mc->emit_bytes(mc, xchg, 2); + } + } else if (rdes == X64_RAX) { + /* desired sits in rax; move it to rcx before rax is overwritten. 
*/ + if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes); + if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp); + } else { + if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp); + if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes); + } emit_lock_prefix(mc); emit_rex(mc, w, X64_RCX, 0, base); { @@ -2731,12 +2828,25 @@ static void x64_fence(NativeTarget* t, MemOrder mo) { * into the matching GPR slot at the call site. `ap` addresses the va_list * object. */ +/* Resolve a va_list address into `scratch`, materializing it there if it is not + * already, so the va field-value scratch registers (rax / r10 / rdx) never alias + * it. At -O1 the optimizer may place the va_list pointer in any register — + * including those — and the va code would then clobber the pointer mid-sequence. */ +static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) { + u32 base = x64_addr_to_base_reg(a, ap, scratch); + if (base != scratch) { + emit_mov_rr(a->base.mc, 1, scratch, base); + base = scratch; + } + return base; +} + static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) { NativeTarget* t = &a->base; MCEmitter* mc = t->mc; u32 ap_base; if (!a->is_variadic) x64_panic(a, "va_start: function not variadic"); - ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2); + ap_base = x64_va_base(a, ap, X64_TMP_INT2); if (a->abi->shadow_space) { /* Win64: *ap = rbp + 16 + named_int*8 + named_stack. */ u32 first = 16u + a->next_param_int * 8u + a->next_param_stack; @@ -2768,7 +2878,7 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap, u32 sz = x64_type_size(t, type); int is_fp = loc_is_fp(dst); u32 dr = loc_reg(dst); - u32 ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2); + u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); if (a->abi->shadow_space) { /* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) 
*/ emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0); @@ -2826,9 +2936,11 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap, NativeAddr src_ap) { NativeTarget* t = &a->base; MCEmitter* mc = t->mc; - /* Resolve dst into r11 first, src into rax (disjoint). */ - u32 dst_base = x64_addr_to_base_reg(a, dst_ap, X64_TMP_INT2); - u32 src_base = x64_addr_to_base_reg(a, src_ap, X64_TMP_INT); + /* Resolve dst into r11, src into rax (disjoint from each other and from the + * rdx copy scratch); force both so the optimizer's register choice for a + * va_list pointer can't alias the copy scratch. */ + u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2); + u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT); u32 n = a->abi->shadow_space ? 8u : 24u, i; for (i = 0; i < n; i += 8u) { emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i); @@ -3521,6 +3633,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, t->addr_legal = x64_addr_legal; t->func_begin = x64_func_begin; t->func_begin_known_frame = x64_func_begin_known_frame; + t->bind_params_end = x64_bind_params_end; t->note_frame_state = NULL; /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved * set; x64_func_begin_known_frame derives the records from the masks. */ diff --git a/src/cg/native_frame.h b/src/cg/native_frame.h @@ -1,3 +1,4 @@ +/* see also: cg/native_argmove.h (shared parallel-copy register shuffle) */ #ifndef CFREE_CG_NATIVE_FRAME_H #define CFREE_CG_NATIVE_FRAME_H diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -596,6 +596,9 @@ static void bind_params(NativeEmitCtx* e) { map_slot(e, p->storage.v.frame_slot, p->loc)); if (e->target->bind_param) e->target->bind_param(e->target, &sd, dst); } + /* Let a backend that defers register-destination binds resolve them now (as a + * parallel copy), once every param's incoming location has been read. 
*/ + if (e->target->bind_params_end) e->target->bind_params_end(e->target); } /* The parameter value is placed into its allocated location by bind_param at