commit dc46a3135f40231846840a807f477f5d082525c3
parent 1bfb41a5fc5c5da9f233bcf2d6846192694639e3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 09:40:53 -0700
Share the parallel-copy arg-move scheduler; x64 NativeFrame + partial O1
New src/cg/native_argmove.{h,c}: the parallel-copy register-move scheduler
(topological emission + cycle-break through a scratch) was duplicated nearly
verbatim in all three backends (aa_/rv_/x64_emit_reg_arg_moves). The scheduling
is arch-independent; only the leaf ops differ (emit one move, the scratch reg).
Extracted to one native_arg_shuffle() taking a tiny ops struct; aa64/rv64/x64
each now build their move list and call it. Behavior-identical: aa64 toy 1034/0
(O0+O1), rv64 toy O0 156/0 + O1 156/0, x64 toy O0 156/0.
x64 also adopts the shared NativeFrame (mirroring aa64/rv64) and implements its
-O1 known-frame path (func_begin_known_frame derives the callee-saved set from
the optimizer masks; reserve_callee_saves non-NULL; tail-site restores
callee-saves before leave, indirect callee already staged in r11). x64
compute_frame_size already counted callee-save bytes, so no frame-size fix
needed. x64 -O1 is 137 pass / 19 fail: the 9 tail failures are root-caused to
naive per-parameter entry binds (bind_param) clobbering when the allocator
rotates a tail function's params across the incoming arg regs — the same cycle
the shuffle breaks, but the entry-bind path doesn't use it yet. Fix (route entry
binds through the shared scheduler) and the atomic/varargs failures are next.
Diffstat:
5 files changed, 290 insertions(+), 294 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -36,6 +36,7 @@
#include "arch/aa64/regs.h"
#include "asm/asm.h"
#include "asm/asm_lex.h"
+#include "cg/native_argmove.h"
#include "cg/native_direct_target.h"
#include "cg/native_frame.h"
#include "cg/type.h"
@@ -2446,99 +2447,37 @@ static u32 aa_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
/* One register-passed call argument: write `src` (or its address) into the
* argument register `dst`. Collected during planning and emitted as a batch so
* the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */
-typedef struct {
- NativeLoc dst;
- NativeLoc src;
- u32 src_offset;
- u32 size;
- int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */
-} AAArgMove;
+typedef NativeArgMove AAArgMove;
/* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */
#define AA_MAX_REG_ARG_MOVES 16u
-/* The register a move reads, or REG_NONE if it reads none (immediate, frame
- * slot, or an address derived from fp/sp/a global). Only a register source can
- * be invalidated by another move writing its argument register, so only these
- * constrain the emission order. */
-static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) {
- if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
- *cls_out = (NativeAllocClass)m->src.cls;
- return m->src.v.reg;
- }
- return REG_NONE;
-}
-
-static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) {
+static void aa_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
if (m->is_addr)
aa_addr_of_loc(t, m->dst, m->src);
else
aa_load_part(t, m->dst, m->src, m->src_offset, m->size);
}
-/* Emit register-argument moves with parallel-copy semantics: every register
- * must be read by all moves that source it before any move overwrites it. The
- * allocator usually arranges the sources so the moves are already conflict-free
- * (it copies a value out of an about-to-be-clobbered argument register), but it
- * does not always — notably for variadic arguments, where it can leave a prior
- * call's result sitting in x0 even though x0 is this call's first arg register.
- * So the backend must not assume a safe order. Emit any move whose destination
- * is no longer needed as a source (topological order); when only a true cycle
- * remains, save one member's register into a scratch and redirect its readers,
- * then drain the now-acyclic remainder. */
-static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) {
- u8 done[AA_MAX_REG_ARG_MOVES];
- u32 emitted = 0;
+/* Emit register-argument moves as a parallel copy via the shared scheduler:
+ * every register is read by all moves that source it before any move overwrites
+ * it; a true cycle is broken through a scratch. The allocator usually arranges
+ * a conflict-free order, but not always (notably variadic args, where it can
+ * leave a prior call's result in x0 even though x0 is this call's first arg
+ * register), so the backend must not assume a safe order. Cycle scratch is
+ * AA_TMP1 (x17) for int and v16 for fp — distinct from x16 (AA_TMP0), which may
+ * hold a stashed indirect callee. */
+static void aa_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) {
+ NativeArgShuffle s;
if (n > AA_MAX_REG_ARG_MOVES)
aa_panic(aa_of(t), "too many register arguments");
- memset(done, 0, sizeof done);
- while (emitted < n) {
- int progress = 0;
- for (u32 i = 0; i < n; ++i) {
- int blocked = 0;
- if (done[i]) continue;
- for (u32 j = 0; j < n && !blocked; ++j) {
- NativeAllocClass sc;
- Reg sr;
- if (done[j] || j == i) continue;
- sr = aa_arg_move_src_reg(&moves[j], &sc);
- if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
- sc == (NativeAllocClass)moves[i].dst.cls)
- blocked = 1;
- }
- if (!blocked) {
- aa_emit_one_arg_move(t, &moves[i]);
- done[i] = 1;
- emitted++;
- progress = 1;
- }
- }
- if (!progress) {
- /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
- * at least one remaining move is register-sourced. Break on such a move:
- * its destination register still holds a pending source value, so copy it
- * to a scratch and point that value's readers at the scratch. Scratch
- * avoids x16 (which may hold a stashed indirect callee) and the arg regs;
- * cycle moves are reg->reg, so the scratch needs no address arithmetic. */
- u32 k = 0;
- NativeAllocClass bc, sc;
- Reg scratch_reg;
- NativeLoc scratchloc;
- while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
- ++k;
- bc = (NativeAllocClass)moves[k].dst.cls;
- scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1;
- scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg);
- aa_move(t, scratchloc, moves[k].dst);
- for (u32 j = 0; j < n; ++j) {
- Reg sr = aa_arg_move_src_reg(&moves[j], &sc);
- if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
- moves[j].src = scratchloc;
- moves[j].src_offset = 0;
- }
- }
- }
- }
+ memset(&s, 0, sizeof s);
+ s.t = t;
+ s.emit_one = aa_emit_one_arg_move;
+ s.reg_move = aa_move;
+ s.scratch[NATIVE_REG_INT] = AA_TMP1;
+ s.scratch[NATIVE_REG_FP] = 16u;
+ native_arg_shuffle(&s, moves, n);
}
static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -23,6 +23,7 @@
#include "asm/asm_lex.h"
#include "arch/rv64/regs.h"
#include "arch/rv64/rv64.h"
+#include "cg/native_argmove.h"
#include "cg/native_direct_target.h"
#include "cg/native_frame.h"
#include "cg/type.h"
@@ -1794,82 +1795,27 @@ static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
/* ============================ calls / returns ============================ */
-typedef struct {
- NativeLoc dst;
- NativeLoc src;
- u32 src_offset;
- u32 size;
- int is_addr;
-} RvArgMove;
+typedef NativeArgMove RvArgMove;
-static Reg rv_arg_move_src_reg(const RvArgMove* m, NativeAllocClass* cls_out) {
- if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
- *cls_out = (NativeAllocClass)m->src.cls;
- return m->src.v.reg;
- }
- return REG_NONE;
-}
-
-static void rv_emit_one_arg_move(NativeTarget* t, const RvArgMove* m) {
+static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
if (m->is_addr)
rv_addr_of_loc(t, m->dst, m->src);
else
rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
}
-/* Parallel-copy register arg moves with cycle breaking. */
-static void rv_emit_reg_arg_moves(NativeTarget* t, RvArgMove* moves, u32 n) {
- u8 done[RV_MAX_REG_ARG_MOVES];
- u32 emitted = 0;
+/* Parallel-copy register arg moves via the shared scheduler; cycles break
+ * through the int/fp emit scratch (t1 / ft1). */
+static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) {
+ NativeArgShuffle s;
if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
- memset(done, 0, sizeof done);
- while (emitted < n) {
- int progress = 0;
- u32 i, j;
- for (i = 0; i < n; ++i) {
- int blocked = 0;
- if (done[i]) continue;
- for (j = 0; j < n && !blocked; ++j) {
- NativeAllocClass sc;
- Reg sr;
- if (done[j] || j == i) continue;
- sr = rv_arg_move_src_reg(&moves[j], &sc);
- if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
- sc == (NativeAllocClass)moves[i].dst.cls)
- blocked = 1;
- }
- if (!blocked) {
- rv_emit_one_arg_move(t, &moves[i]);
- done[i] = 1;
- emitted++;
- progress = 1;
- }
- }
- if (!progress) {
- u32 k = 0;
- NativeAllocClass bc, sc;
- Reg scratch_reg;
- NativeLoc scratchloc;
- while (k < n &&
- (done[k] || rv_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
- ++k;
- bc = (NativeAllocClass)moves[k].dst.cls;
- scratch_reg = bc == NATIVE_REG_FP ? RV_FTMP1 : RV_TMP1;
- scratchloc = rv_reg_loc(moves[k].dst.type, bc, scratch_reg);
- rv_move(t, scratchloc, moves[k].dst);
- {
- u32 jj;
- for (jj = 0; jj < n; ++jj) {
- Reg sr = rv_arg_move_src_reg(&moves[jj], &sc);
- if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg &&
- sc == bc) {
- moves[jj].src = scratchloc;
- moves[jj].src_offset = 0;
- }
- }
- }
- }
- }
+ memset(&s, 0, sizeof s);
+ s.t = t;
+ s.emit_one = rv_emit_one_arg_move;
+ s.reg_move = rv_move;
+ s.scratch[NATIVE_REG_INT] = RV_TMP1;
+ s.scratch[NATIVE_REG_FP] = RV_FTMP1;
+ native_arg_shuffle(&s, moves, n);
}
static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -34,7 +34,9 @@
#include "arch/x64/x64.h"
#include "asm/asm.h"
#include "asm/asm_lex.h"
+#include "cg/native_argmove.h"
#include "cg/native_direct_target.h"
+#include "cg/native_frame.h"
#include "cg/type.h"
#include "core/arena.h"
#include "core/bytes.h"
@@ -53,18 +55,12 @@ enum {
/* ============================ target state ============================ */
-typedef struct X64NativeSlot {
- u32 off; /* bytes below rbp (positive); address = rbp - off */
- u32 size;
- u32 align;
- u8 kind; /* NativeFrameSlotKind */
- u8 pad[3];
-} X64NativeSlot;
-
-typedef struct X64CalleeSave {
- Reg reg;
- u8 cls; /* NativeAllocClass */
-} X64CalleeSave;
+/* Frame slots and callee-save records live in the shared NativeFrame
+ * bookkeeping (cg/native_frame.h); these aliases keep the x64-local spellings.
+ * x64 reads only .reg/.cls of a callee-save (it computes save offsets below the
+ * locals rather than homing them in frame slots, so .slot/.type stay unused). */
+typedef NativeFrameSlotEntry X64NativeSlot;
+typedef NativeFrameCalleeSave X64CalleeSave;
typedef enum X64PatchKind { X64_PATCH_ALLOCA } X64PatchKind;
@@ -78,11 +74,9 @@ typedef struct X64NativeTarget {
SrcLoc loc;
const CGFuncDesc* func;
- X64NativeSlot* slots;
- u32 nslots;
- u32 slots_cap;
- u32 cum_off; /* sum of frame-slot reservations below rbp */
- u32 max_outgoing; /* max outgoing-arg bytes across all calls */
+ /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
+ * set, and the known_frame / has_alloca / frame_final flags. */
+ NativeFrame frame;
u32 frame_size_final;
u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
@@ -104,13 +98,6 @@ typedef struct X64NativeTarget {
u32 prologue_nbytes;
MCLabel epilogue_label;
- X64CalleeSave callee_saves[X64_MAX_CS_INT_REGS + X64_MAX_CS_FP_REGS];
- u32 ncallee_saves;
-
- u8 known_frame;
- u8 has_alloca;
- u8 frame_final;
-
const X64ABIRegs* abi;
} X64NativeTarget;
@@ -121,9 +108,7 @@ static _Noreturn void x64_panic(X64NativeTarget* a, const char* msg) {
}
static X64NativeSlot* x64_slot_get(X64NativeTarget* a, NativeFrameSlot fs) {
- if (fs == NATIVE_FRAME_SLOT_NONE || fs > a->nslots)
- x64_panic(a, "bad frame slot");
- return &a->slots[fs - 1u];
+ return native_frame_slot_at(&a->frame, fs);
}
static u32 align_up_u32(u32 v, u32 align) {
@@ -1343,37 +1328,19 @@ static void x64_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
static NativeFrameSlot x64_frame_slot(NativeTarget* t,
const NativeFrameSlotDesc* d) {
- X64NativeTarget* a = x64_of(t);
- X64NativeSlot* s;
- u32 size = d->size ? d->size : 8u;
- u32 align = d->align ? d->align : 1u;
- if (a->frame_final) x64_panic(a, "frame slot requested after prologue");
- if (a->nslots == a->slots_cap) {
- u32 cap = a->slots_cap ? a->slots_cap * 2u : 16u;
- X64NativeSlot* nb = arena_zarray(t->c->tu, X64NativeSlot, cap);
- if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots);
- a->slots = nb;
- a->slots_cap = cap;
- }
- a->cum_off = align_up_u32(a->cum_off + size, align);
- s = &a->slots[a->nslots++];
- s->off = a->cum_off;
- s->size = size;
- s->align = align;
- s->kind = d->kind;
- return (NativeFrameSlot)a->nslots;
+ return native_frame_slot_alloc(&x64_of(t)->frame, d);
}
/* xmm save area base (rbp-relative). XMM saves are 16-aligned. */
static u32 x64_xmm_base(const X64NativeTarget* a, u32 cs_fp) {
- if (cs_fp == 0) return a->cum_off;
- return align_up_u32(a->cum_off, 16u);
+ if (cs_fp == 0) return a->frame.cum_off;
+ return align_up_u32(a->frame.cum_off, 16u);
}
static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int,
u32 cs_fp) {
u32 xmm_base = x64_xmm_base(a, cs_fp);
- u32 raw = a->max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base;
+ u32 raw = a->frame.max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base;
u32 fs = align_up_u32(raw, 16u);
return fs ? fs : 16u;
}
@@ -1381,16 +1348,16 @@ static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int,
/* Collect the callee-saves the body actually used. */
static u32 x64_collect_int_saves(X64NativeTarget* a, Reg* regs) {
u32 n = 0, i;
- for (i = 0; i < a->ncallee_saves; ++i)
- if (a->callee_saves[i].cls == NATIVE_REG_INT)
- regs[n++] = a->callee_saves[i].reg;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
+ regs[n++] = a->frame.callee_saves[i].reg;
return n;
}
static u32 x64_collect_fp_saves(X64NativeTarget* a, Reg* regs) {
u32 n = 0, i;
- for (i = 0; i < a->ncallee_saves; ++i)
- if (a->callee_saves[i].cls == NATIVE_REG_FP)
- regs[n++] = a->callee_saves[i].reg;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
+ regs[n++] = a->frame.callee_saves[i].reg;
return n;
}
@@ -1486,9 +1453,9 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
a->func = fd;
a->loc = fd->loc;
a->abi = x64_abi_for_os(t->c->target.os);
- a->nslots = 0;
- a->cum_off = 0;
- a->max_outgoing = 0;
+ /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
+ * callee-save set, and known_frame/has_alloca/frame_final. */
+ native_frame_reset(&a->frame);
a->incoming_stack_size = 0;
a->next_param_int = 0;
a->next_param_fp = 0;
@@ -1499,10 +1466,6 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
a->reg_save_slot = NATIVE_FRAME_SLOT_NONE;
a->npatches = 0;
a->nalloca = 0;
- a->ncallee_saves = 0;
- a->known_frame = 0;
- a->has_alloca = 0;
- a->frame_final = 0;
a->prologue_nbytes =
a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES;
@@ -1573,13 +1536,58 @@ static void x64_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
x64_emit_variadic_reg_saves(a);
}
+/* x64 homes callee-saves below the locals (offsets computed in
+ * x64_compute_frame_size / x64_build_prologue), not in frame slots, so
+ * alloc_slots=0: native_frame just records the {reg,cls} set from the masks. */
+static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
+ u32 nclasses) {
+ native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0);
+}
+
+/* Optimizer entry point: the full frame is supplied up front, so the prologue
+ * is emitted final the moment it is built — no NOP region, no func_end patch
+ * (x64_func_end skips patching when known_frame). x64_build_prologue emits the
+ * push rbp / sub rsp / sret spill / callee-save spills; the variadic
+ * register-save stores are emitted separately, as on the single-pass path. */
static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
const NativeKnownFrameDesc* frame,
NativeFrameSlot* out_slots) {
- (void)fd;
- (void)frame;
- (void)out_slots;
- x64_panic(x64_of(t), "known-frame path not implemented yet");
+ X64NativeTarget* a = x64_of(t);
+ MCEmitter* mc = t->mc;
+ Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
+ u32 n_int, n_fp, frame_size, nbytes, chkstk_disp_pos, i;
+ u8 buf[X64_PROLOGUE_BYTES_WIN64];
+ x64_func_begin_common(t, fd);
+ a->frame.known_frame = 1;
+ if (frame) {
+ a->frame.has_alloca = frame->has_alloca;
+ if (frame->callee_saved_used && frame->ncallee_classes)
+ x64_reserve_callee_saves(t, frame->callee_saved_used,
+ frame->ncallee_classes);
+ for (i = 0; i < frame->nslots; ++i) {
+ NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]);
+ if (out_slots) out_slots[i] = slot;
+ }
+ x64_reserve_entry_saves(a);
+ native_frame_note_outgoing(&a->frame, frame->max_outgoing);
+ }
+ /* Frame is final: size and offsets are settled, so emit the exact prologue. */
+ n_int = x64_collect_int_saves(a, cs_int);
+ n_fp = x64_collect_fp_saves(a, cs_fp);
+ frame_size = x64_compute_frame_size(a, n_int, n_fp);
+ a->frame_size_final = frame_size;
+ a->prologue_pos = mc->pos(mc);
+ nbytes = x64_build_prologue(a, buf, sizeof buf, frame_size, cs_int, n_int,
+ cs_fp, n_fp, &chkstk_disp_pos);
+ mc->emit_bytes(mc, buf, nbytes);
+ if (chkstk_disp_pos != (u32)-1) {
+ ObjSymId chk = x64_chkstk_sym(t);
+ mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos,
+ R_X64_PLT32, chk, -4, 1, 0);
+ }
+ a->prologue_nbytes = nbytes; /* exact length: used for the CFI post offset */
+ x64_emit_variadic_reg_saves(a);
+ native_frame_set_final(&a->frame);
}
static void x64_func_end(NativeTarget* t) {
@@ -1610,7 +1618,7 @@ static void x64_func_end(NativeTarget* t) {
emit_ret(mc);
/* Patch the single-pass prologue placeholder. */
- if (!a->known_frame) {
+ if (!a->frame.known_frame) {
u8 buf[X64_PROLOGUE_BYTES_WIN64];
u32 chkstk_disp_pos;
u32 nbytes;
@@ -1629,7 +1637,7 @@ static void x64_func_end(NativeTarget* t) {
/* Patch alloca disp32s: lea dst, [rsp + max_outgoing]. */
{
- u32 mo = align_up_u32(a->max_outgoing, 16u);
+ u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
u32 k;
for (k = 0; k < a->npatches; ++k) {
u8 dbuf[4];
@@ -1640,8 +1648,10 @@ static void x64_func_end(NativeTarget* t) {
/* CFI: after the prologue, CFA = rbp + 16; rbp at cfa-16, ra at cfa-8. */
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
- u32 post =
- a->prologue_pos + (a->known_frame ? 0u : a->prologue_nbytes);
+ /* Body starts past the prologue. prologue_nbytes is the reserved NOP-region
+ * size on the single-pass path and the exact prologue length on the
+ * known-frame path (set in x64_func_begin_known_frame). */
+ u32 post = a->prologue_pos + a->prologue_nbytes;
u32 k;
mc->cfi_set_next_pc_offset(mc, post - a->func_start);
mc->cfi_def_cfa(mc, X64_RBP, 16);
@@ -1991,25 +2001,9 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
/* ============================ calls / returns ============================ */
-typedef struct {
- NativeLoc dst;
- NativeLoc src;
- u32 src_offset;
- u32 size;
- int is_addr;
- int dup_to_gpr; /* Win64: also place into the matching GPR */
- Reg dup_gpr;
-} X64ArgMove;
-
-static Reg x64_arg_move_src_reg(const X64ArgMove* m, NativeAllocClass* cls_out) {
- if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
- *cls_out = (NativeAllocClass)m->src.cls;
- return m->src.v.reg;
- }
- return REG_NONE;
-}
+typedef NativeArgMove X64ArgMove;
-static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) {
+static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
if (m->is_addr) {
x64_addr_of_loc(t, m->dst, m->src);
} else {
@@ -2021,57 +2015,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) {
}
}
-/* Parallel-copy register arg moves with cycle breaking through scratch. */
-static void x64_emit_reg_arg_moves(NativeTarget* t, X64ArgMove* moves, u32 n) {
- u8 done[X64_MAX_REG_ARG_MOVES];
- u32 emitted = 0;
+/* Parallel-copy register arg moves via the shared scheduler; cycles break
+ * through the int/fp emit scratch (r11 / xmm14). */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
+ u32 n) {
+ NativeArgShuffle s;
if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args");
- memset(done, 0, sizeof done);
- while (emitted < n) {
- int progress = 0;
- u32 i, j;
- for (i = 0; i < n; ++i) {
- int blocked = 0;
- if (done[i]) continue;
- for (j = 0; j < n && !blocked; ++j) {
- NativeAllocClass sc;
- Reg sr;
- if (done[j] || j == i) continue;
- sr = x64_arg_move_src_reg(&moves[j], &sc);
- if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
- sc == (NativeAllocClass)moves[i].dst.cls)
- blocked = 1;
- }
- if (!blocked) {
- x64_emit_one_arg_move(t, &moves[i]);
- done[i] = 1;
- emitted++;
- progress = 1;
- }
- }
- if (!progress) {
- u32 k = 0;
- NativeAllocClass bc, sc;
- Reg scratch_reg;
- NativeLoc scratchloc;
- u32 jj;
- while (k < n &&
- (done[k] || x64_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
- ++k;
- bc = (NativeAllocClass)moves[k].dst.cls;
- scratch_reg = bc == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT2;
- scratchloc = x64_reg_loc(moves[k].dst.type, bc, scratch_reg);
- x64_move(t, scratchloc, moves[k].dst);
- for (jj = 0; jj < n; ++jj) {
- Reg sr = x64_arg_move_src_reg(&moves[jj], &sc);
- if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg &&
- sc == bc) {
- moves[jj].src = scratchloc;
- moves[jj].src_offset = 0;
- }
- }
- }
- }
+ memset(&s, 0, sizeof s);
+ s.t = t;
+ s.emit_one = x64_emit_one_arg_move;
+ s.reg_move = x64_move;
+ s.scratch[NATIVE_REG_INT] = X64_TMP_INT2;
+ s.scratch[NATIVE_REG_FP] = X64_TMP_FP;
+ native_arg_shuffle(&s, moves, n);
}
/* Clobber masks: per-call all caller-saved regs are clobbered. */
@@ -2122,8 +2078,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
plan->has_sret = abi && abi->has_sret;
plan->is_variadic = abi && abi->variadic;
plan->stack_arg_size = x64_call_stack_size(t, desc);
- if (plan->stack_arg_size > a->max_outgoing)
- a->max_outgoing = plan->stack_arg_size;
+ if (plan->stack_arg_size > a->frame.max_outgoing)
+ a->frame.max_outgoing = plan->stack_arg_size;
for (c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) {
plan->clobber_mask[c] = x64_clobber_mask(aregs, (NativeAllocClass)c);
plan->return_mask[c] = x64_return_mask(abi, (NativeAllocClass)c);
@@ -2275,8 +2231,21 @@ static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
X64NativeTarget* a = x64_of(t);
MCEmitter* mc = t->mc;
ObjSecId sec = mc->section_id;
- if (a->ncallee_saves)
- x64_panic(a, "tail call with callee-saves (O1 path) not implemented");
+ /* Restore callee-saves before the frame teardown (O1 path; none at -O0).
+ * Their rbp-relative offsets are frame-size-independent, and the indirect
+ * callee was staged in r11 by plan_call — a caller-saved scratch — so these
+ * restores never clobber it. Mirrors the x64_func_end epilogue. */
+ Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
+ u32 n_int = x64_collect_int_saves(a, cs_int);
+ u32 n_fp = x64_collect_fp_saves(a, cs_fp);
+ u32 xmm_base = x64_xmm_base(a, n_fp);
+ i32 i;
+ for (i = (i32)n_fp - 1; i >= 0; --i)
+ emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP,
+ -(i32)xmm_base - (i32)(i + 1) * 16); /* movaps */
+ for (i = (i32)n_int - 1; i >= 0; --i)
+ emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP,
+ -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8);
emit_leave(mc);
if (callee.kind == NATIVE_LOC_GLOBAL) {
u8 op = X64_OPC_JMP_REL32;
@@ -2437,7 +2406,7 @@ static void x64_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
}
emit_alu_rr(mc, 1, X64_OPC_ALU_SUB, X64_RSP, X64_RAX);
}
- a->has_alloca = 1;
+ a->frame.has_alloca = 1;
/* lea dst, [rsp + max_outgoing] — disp32 patched in func_end. */
if (a->npatches == a->patches_cap) {
u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
@@ -3545,6 +3514,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
t->c = c;
t->obj = obj;
t->mc = mc;
+ native_frame_init(&a->frame, c);
t->regs = &x64_reg_info;
t->class_for_type = x64_class_for_type;
t->imm_legal = x64_imm_legal;
@@ -3552,7 +3522,9 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
t->func_begin = x64_func_begin;
t->func_begin_known_frame = x64_func_begin_known_frame;
t->note_frame_state = NULL;
- t->reserve_callee_saves = NULL;
+ /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
+ * set; x64_func_begin_known_frame derives the records from the masks. */
+ t->reserve_callee_saves = x64_reserve_callee_saves;
t->signature_stack_bytes = x64_signature_stack_bytes;
t->call_stack_bytes = x64_call_stack_bytes;
t->has_store_zero_reg = 0;
@@ -3627,7 +3599,7 @@ static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
NativeLoc* args = NULL;
NativeLoc* results = NULL;
u32 i, stack;
- if (a->ncallee_saves)
+ if (a->frame.ncallee_saves)
return "x64 tail call: callee-saved registers in use";
memset(&nd, 0, sizeof nd);
if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
diff --git a/src/cg/native_argmove.c b/src/cg/native_argmove.c
@@ -0,0 +1,79 @@
+#include "cg/native_argmove.h"
+
+/* Upper bound on moves in one shuffle. Callers (per-call register args, entry
+ * param binds) are bounded by the ABI register pools (<= 16 per class); 32
+ * covers both classes with headroom. */
+#define NATIVE_ARG_SHUFFLE_MAX 32u
+
+/* The register a move reads, or REG_NONE if it reads none (immediate, frame
+ * slot, or an address derived from fp/sp/a global). Only a register source can
+ * be invalidated by another move writing its destination register, so only
+ * these constrain the emission order. */
+static Reg nam_src_reg(const NativeArgMove* m, NativeAllocClass* cls_out) {
+ if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
+ *cls_out = (NativeAllocClass)m->src.cls;
+ return m->src.v.reg;
+ }
+ return REG_NONE;
+}
+
+void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n) {
+ u8 done[NATIVE_ARG_SHUFFLE_MAX];
+ u32 emitted = 0;
+ if (n > NATIVE_ARG_SHUFFLE_MAX)
+ compiler_panic(s->t->c, (SrcLoc){0, 0, 0},
+ "native arg shuffle: too many moves");
+ memset(done, 0, sizeof done);
+ /* Mark-and-sweep over the move set. Each pass emits every move whose
+ * destination is no longer needed as a source (topological order); when only
+ * a true cycle remains, break one member and continue. */
+ while (emitted < n) {
+ int progress = 0;
+ u32 i, j;
+ for (i = 0; i < n; ++i) {
+ int blocked = 0;
+ if (done[i]) continue;
+ for (j = 0; j < n && !blocked; ++j) {
+ NativeAllocClass sc;
+ Reg sr;
+ if (j == i || done[j]) continue;
+ sr = nam_src_reg(&moves[j], &sc);
+ if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
+ sc == (NativeAllocClass)moves[i].dst.cls)
+ blocked = 1;
+ }
+ if (!blocked) {
+ s->emit_one(s->t, &moves[i]);
+ done[i] = 1;
+ emitted++;
+ progress = 1;
+ }
+ }
+ if (!progress) {
+ /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
+ * at least one remaining move is register-sourced. Break on such a move:
+ * its destination register still holds a pending source value, so copy it
+ * to the class scratch and point that value's readers at the scratch. */
+ u32 k = 0;
+ NativeAllocClass bc, sc;
+ NativeLoc scratchloc;
+ while (k < n &&
+ (done[k] || nam_src_reg(&moves[k], &sc) == REG_NONE))
+ ++k;
+ bc = (NativeAllocClass)moves[k].dst.cls;
+ memset(&scratchloc, 0, sizeof scratchloc);
+ scratchloc.kind = NATIVE_LOC_REG;
+ scratchloc.cls = (u8)bc;
+ scratchloc.type = moves[k].dst.type;
+ scratchloc.v.reg = s->scratch[bc];
+ s->reg_move(s->t, scratchloc, moves[k].dst);
+ for (j = 0; j < n; ++j) {
+ Reg sr = nam_src_reg(&moves[j], &sc);
+ if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
+ moves[j].src = scratchloc;
+ moves[j].src_offset = 0;
+ }
+ }
+ }
+ }
+}
diff --git a/src/cg/native_argmove.h b/src/cg/native_argmove.h
@@ -0,0 +1,60 @@
+#ifndef CFREE_CG_NATIVE_ARGMOVE_H
+#define CFREE_CG_NATIVE_ARGMOVE_H
+
+/* Shared parallel-copy register-move scheduler for the native backends.
+ *
+ * Marshalling call arguments (and, on the optimizer path, binding incoming
+ * parameters) means realizing a set of register `dst <- src` moves as a
+ * *parallel* copy: every register must be read by all moves that source it
+ * before any move overwrites it. The allocator usually arranges a conflict-free
+ * order, but not always (notably variadic args, and tail-call / entry
+ * permutations the allocator is free to rotate). When a true cycle remains —
+ * a rotation like rdi<-rdx, rsi<-rdi, rdx<-rsi — it is broken by stashing one
+ * member's register into a scratch and redirecting that value's readers.
+ *
+ * The scheduling (topological emission + cycle detection + the scratch break)
+ * is identical across aa64/rv64/x64; only the leaf operations differ — how one
+ * move is emitted and which register is the scratch. Those come in via the ops
+ * struct, so all three backends (and the entry-bind path) share one scheduler. */
+
+#include <string.h>
+
+#include "arch/native_target.h"
+#include "core/core.h"
+
+/* A single register-destination move. `src` may be a register (the common,
+ * order-constraining case), a frame/stack slot, an immediate, or — when
+ * `is_addr` — an address to materialize into `dst`. Identical to the per-arch
+ * arg-move structs it replaces. */
+typedef struct NativeArgMove {
+ NativeLoc dst;
+ NativeLoc src;
+ u32 src_offset;
+ u32 size;
+ int is_addr;
+ /* Optional emit hints, opaque to the scheduler (it only reads dst/src/is_addr
+ * for ordering). x64 uses these for the Win64 "variadic FP arg also duplicated
+ * into the matching GPR" rule; other backends leave them zero. */
+ int dup_to_gpr;
+ Reg dup_gpr;
+} NativeArgMove;
+
+typedef struct NativeArgShuffle {
+ NativeTarget* t;
+ /* Emit one move as-is: an address-of for is_addr, else a load/copy of `size`
+ * bytes at `src_offset` from `src` into `dst`. */
+ void (*emit_one)(NativeTarget* t, const NativeArgMove* m);
+ /* A plain register-to-register copy, used to stash a cycle victim into the
+ * scratch register before redirecting its readers. */
+ void (*reg_move)(NativeTarget* t, NativeLoc dst, NativeLoc src);
+ /* Scratch register per NativeAllocClass (index by class id) for breaking
+ * cycles. A cycle's ring is register-to-register, so the scratch needs no
+ * address arithmetic — a bare register suffices. */
+ Reg scratch[NATIVE_REG_VEC + 1];
+} NativeArgShuffle;
+
+/* Emit `moves[0..n)` as a parallel register copy via `s`. Mutates `moves`
+ * (redirects a broken cycle's readers to the scratch register). */
+void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n);
+
+#endif