kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit dc46a3135f40231846840a807f477f5d082525c3
parent 1bfb41a5fc5c5da9f233bcf2d6846192694639e3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 09:40:53 -0700

Share the parallel-copy arg-move scheduler; x64 NativeFrame + partial O1

New src/cg/native_argmove.{h,c}: the parallel-copy register-move scheduler
(topological emission + cycle-break through a scratch) was duplicated nearly
verbatim in all three backends (aa_/rv_/x64_emit_reg_arg_moves). The scheduling
is arch-independent; only the leaf ops differ (emit one move, the scratch reg).
Extracted to one native_arg_shuffle() taking a tiny ops struct; aa64/rv64/x64
each now build their move list and call it. Behavior-identical: aa64 toy 1034/0
(O0+O1), rv64 toy O0 156/0 + O1 156/0, x64 toy O0 156/0.

x64 also adopts the shared NativeFrame (mirroring aa64/rv64) and implements its
-O1 known-frame path (func_begin_known_frame derives the callee-saved set from
the optimizer masks; reserve_callee_saves non-NULL; tail-site restores
callee-saves before leave, indirect callee already staged in r11). x64
compute_frame_size already counted callee-save bytes, so no frame-size fix
needed. x64 -O1 is 137 pass / 19 fail: the 9 tail failures are root-caused to
naive per-parameter entry binds (bind_param) clobbering when the allocator
rotates a tail function's params across the incoming arg regs — the same cycle
the shuffle breaks, but the entry-bind path doesn't use it yet. Fix (route entry
binds through the shared scheduler) and the atomic/varargs failures are next.

Diffstat:
Msrc/arch/aa64/native.c | 101++++++++++++++++---------------------------------------------------------------
Msrc/arch/rv64/native.c | 82++++++++++++++-----------------------------------------------------------------
Msrc/arch/x64/native.c | 262+++++++++++++++++++++++++++++++++++--------------------------------------------
Asrc/cg/native_argmove.c | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cg/native_argmove.h | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 290 insertions(+), 294 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -36,6 +36,7 @@ #include "arch/aa64/regs.h" #include "asm/asm.h" #include "asm/asm_lex.h" +#include "cg/native_argmove.h" #include "cg/native_direct_target.h" #include "cg/native_frame.h" #include "cg/type.h" @@ -2446,99 +2447,37 @@ static u32 aa_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { /* One register-passed call argument: write `src` (or its address) into the * argument register `dst`. Collected during planning and emitted as a batch so * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */ -typedef struct { - NativeLoc dst; - NativeLoc src; - u32 src_offset; - u32 size; - int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */ -} AAArgMove; +typedef NativeArgMove AAArgMove; /* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */ #define AA_MAX_REG_ARG_MOVES 16u -/* The register a move reads, or REG_NONE if it reads none (immediate, frame - * slot, or an address derived from fp/sp/a global). Only a register source can - * be invalidated by another move writing its argument register, so only these - * constrain the emission order. */ -static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) { - if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) { - *cls_out = (NativeAllocClass)m->src.cls; - return m->src.v.reg; - } - return REG_NONE; -} - -static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) { +static void aa_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { if (m->is_addr) aa_addr_of_loc(t, m->dst, m->src); else aa_load_part(t, m->dst, m->src, m->src_offset, m->size); } -/* Emit register-argument moves with parallel-copy semantics: every register - * must be read by all moves that source it before any move overwrites it. The - * allocator usually arranges the sources so the moves are already conflict-free - * (it copies a value out of an about-to-be-clobbered argument register), but it - * does not always — notably for variadic arguments, where it can leave a prior - * call's result sitting in x0 even though x0 is this call's first arg register. - * So the backend must not assume a safe order. Emit any move whose destination - * is no longer needed as a source (topological order); when only a true cycle - * remains, save one member's register into a scratch and redirect its readers, - * then drain the now-acyclic remainder. */ -static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) { - u8 done[AA_MAX_REG_ARG_MOVES]; - u32 emitted = 0; +/* Emit register-argument moves as a parallel copy via the shared scheduler: + * every register is read by all moves that source it before any move overwrites + * it; a true cycle is broken through a scratch. The allocator usually arranges + * a conflict-free order, but not always (notably variadic args, where it can + * leave a prior call's result in x0 even though x0 is this call's first arg + * register), so the backend must not assume a safe order. Cycle scratch is + * AA_TMP1 (x17) for int and v16 for fp — distinct from x16 (AA_TMP0), which may + * hold a stashed indirect callee. */ +static void aa_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) { + NativeArgShuffle s; if (n > AA_MAX_REG_ARG_MOVES) aa_panic(aa_of(t), "too many register arguments"); - memset(done, 0, sizeof done); - while (emitted < n) { - int progress = 0; - for (u32 i = 0; i < n; ++i) { - int blocked = 0; - if (done[i]) continue; - for (u32 j = 0; j < n && !blocked; ++j) { - NativeAllocClass sc; - Reg sr; - if (done[j] || j == i) continue; - sr = aa_arg_move_src_reg(&moves[j], &sc); - if (sr != REG_NONE && sr == moves[i].dst.v.reg && - sc == (NativeAllocClass)moves[i].dst.cls) - blocked = 1; - } - if (!blocked) { - aa_emit_one_arg_move(t, &moves[i]); - done[i] = 1; - emitted++; - progress = 1; - } - } - if (!progress) { - /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so - * at least one remaining move is register-sourced. Break on such a move: - * its destination register still holds a pending source value, so copy it - * to a scratch and point that value's readers at the scratch. Scratch - * avoids x16 (which may hold a stashed indirect callee) and the arg regs; - * cycle moves are reg->reg, so the scratch needs no address arithmetic. */ - u32 k = 0; - NativeAllocClass bc, sc; - Reg scratch_reg; - NativeLoc scratchloc; - while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE)) - ++k; - bc = (NativeAllocClass)moves[k].dst.cls; - scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1; - scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg); - aa_move(t, scratchloc, moves[k].dst); - for (u32 j = 0; j < n; ++j) { - Reg sr = aa_arg_move_src_reg(&moves[j], &sc); - if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) { - moves[j].src = scratchloc; - moves[j].src_offset = 0; - } - } - } - } + memset(&s, 0, sizeof s); + s.t = t; + s.emit_one = aa_emit_one_arg_move; + s.reg_move = aa_move; + s.scratch[NATIVE_REG_INT] = AA_TMP1; + s.scratch[NATIVE_REG_FP] = 16u; + native_arg_shuffle(&s, moves, n); } static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c @@ -23,6 +23,7 @@ #include "asm/asm_lex.h" #include "arch/rv64/regs.h" #include "arch/rv64/rv64.h" +#include "cg/native_argmove.h" #include "cg/native_direct_target.h" #include "cg/native_frame.h" #include "cg/type.h" @@ -1794,82 +1795,27 @@ static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p, /* ============================ calls / returns ============================ */ -typedef struct { - NativeLoc dst; - NativeLoc src; - u32 src_offset; - u32 size; - int is_addr; -} RvArgMove; +typedef NativeArgMove RvArgMove; -static Reg rv_arg_move_src_reg(const RvArgMove* m, NativeAllocClass* cls_out) { - if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) { - *cls_out = (NativeAllocClass)m->src.cls; - return m->src.v.reg; - } - return REG_NONE; -} - -static void rv_emit_one_arg_move(NativeTarget* t, const RvArgMove* m) { +static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { if (m->is_addr) rv_addr_of_loc(t, m->dst, m->src); else rv_load_part(t, m->dst, m->src, m->src_offset, m->size); } -/* Parallel-copy register arg moves with cycle breaking. */ -static void rv_emit_reg_arg_moves(NativeTarget* t, RvArgMove* moves, u32 n) { - u8 done[RV_MAX_REG_ARG_MOVES]; - u32 emitted = 0; +/* Parallel-copy register arg moves via the shared scheduler; cycles break + * through the int/fp emit scratch (t1 / ft1). */ +static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) { + NativeArgShuffle s; if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args"); - memset(done, 0, sizeof done); - while (emitted < n) { - int progress = 0; - u32 i, j; - for (i = 0; i < n; ++i) { - int blocked = 0; - if (done[i]) continue; - for (j = 0; j < n && !blocked; ++j) { - NativeAllocClass sc; - Reg sr; - if (done[j] || j == i) continue; - sr = rv_arg_move_src_reg(&moves[j], &sc); - if (sr != REG_NONE && sr == moves[i].dst.v.reg && - sc == (NativeAllocClass)moves[i].dst.cls) - blocked = 1; - } - if (!blocked) { - rv_emit_one_arg_move(t, &moves[i]); - done[i] = 1; - emitted++; - progress = 1; - } - } - if (!progress) { - u32 k = 0; - NativeAllocClass bc, sc; - Reg scratch_reg; - NativeLoc scratchloc; - while (k < n && - (done[k] || rv_arg_move_src_reg(&moves[k], &sc) == REG_NONE)) - ++k; - bc = (NativeAllocClass)moves[k].dst.cls; - scratch_reg = bc == NATIVE_REG_FP ? RV_FTMP1 : RV_TMP1; - scratchloc = rv_reg_loc(moves[k].dst.type, bc, scratch_reg); - rv_move(t, scratchloc, moves[k].dst); - { - u32 jj; - for (jj = 0; jj < n; ++jj) { - Reg sr = rv_arg_move_src_reg(&moves[jj], &sc); - if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg && - sc == bc) { - moves[jj].src = scratchloc; - moves[jj].src_offset = 0; - } - } - } - } - } + memset(&s, 0, sizeof s); + s.t = t; + s.emit_one = rv_emit_one_arg_move; + s.reg_move = rv_move; + s.scratch[NATIVE_REG_INT] = RV_TMP1; + s.scratch[NATIVE_REG_FP] = RV_FTMP1; + native_arg_shuffle(&s, moves, n); } static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -34,7 +34,9 @@ #include "arch/x64/x64.h" #include "asm/asm.h" #include "asm/asm_lex.h" +#include "cg/native_argmove.h" #include "cg/native_direct_target.h" +#include "cg/native_frame.h" #include "cg/type.h" #include "core/arena.h" #include "core/bytes.h" @@ -53,18 +55,12 @@ enum { /* ============================ target state ============================ */ -typedef struct X64NativeSlot { - u32 off; /* bytes below rbp (positive); address = rbp - off */ - u32 size; - u32 align; - u8 kind; /* NativeFrameSlotKind */ - u8 pad[3]; -} X64NativeSlot; - -typedef struct X64CalleeSave { - Reg reg; - u8 cls; /* NativeAllocClass */ -} X64CalleeSave; +/* Frame slots and callee-save records live in the shared NativeFrame + * bookkeeping (cg/native_frame.h); these aliases keep the x64-local spellings. + * x64 reads only .reg/.cls of a callee-save (it computes save offsets below the + * locals rather than homing them in frame slots, so .slot/.type stay unused). */ +typedef NativeFrameSlotEntry X64NativeSlot; +typedef NativeFrameCalleeSave X64CalleeSave; typedef enum X64PatchKind { X64_PATCH_ALLOCA } X64PatchKind; @@ -78,11 +74,9 @@ typedef struct X64NativeTarget { SrcLoc loc; const CGFuncDesc* func; - X64NativeSlot* slots; - u32 nslots; - u32 slots_cap; - u32 cum_off; /* sum of frame-slot reservations below rbp */ - u32 max_outgoing; /* max outgoing-arg bytes across all calls */ + /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save + * set, and the known_frame / has_alloca / frame_final flags. */ + NativeFrame frame; u32 frame_size_final; u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */ @@ -104,13 +98,6 @@ typedef struct X64NativeTarget { u32 prologue_nbytes; MCLabel epilogue_label; - X64CalleeSave callee_saves[X64_MAX_CS_INT_REGS + X64_MAX_CS_FP_REGS]; - u32 ncallee_saves; - - u8 known_frame; - u8 has_alloca; - u8 frame_final; - const X64ABIRegs* abi; } X64NativeTarget; @@ -121,9 +108,7 @@ static _Noreturn void x64_panic(X64NativeTarget* a, const char* msg) { } static X64NativeSlot* x64_slot_get(X64NativeTarget* a, NativeFrameSlot fs) { - if (fs == NATIVE_FRAME_SLOT_NONE || fs > a->nslots) - x64_panic(a, "bad frame slot"); - return &a->slots[fs - 1u]; + return native_frame_slot_at(&a->frame, fs); } static u32 align_up_u32(u32 v, u32 align) { @@ -1343,37 +1328,19 @@ static void x64_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) { static NativeFrameSlot x64_frame_slot(NativeTarget* t, const NativeFrameSlotDesc* d) { - X64NativeTarget* a = x64_of(t); - X64NativeSlot* s; - u32 size = d->size ? d->size : 8u; - u32 align = d->align ? d->align : 1u; - if (a->frame_final) x64_panic(a, "frame slot requested after prologue"); - if (a->nslots == a->slots_cap) { - u32 cap = a->slots_cap ? a->slots_cap * 2u : 16u; - X64NativeSlot* nb = arena_zarray(t->c->tu, X64NativeSlot, cap); - if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots); - a->slots = nb; - a->slots_cap = cap; - } - a->cum_off = align_up_u32(a->cum_off + size, align); - s = &a->slots[a->nslots++]; - s->off = a->cum_off; - s->size = size; - s->align = align; - s->kind = d->kind; - return (NativeFrameSlot)a->nslots; + return native_frame_slot_alloc(&x64_of(t)->frame, d); } /* xmm save area base (rbp-relative). XMM saves are 16-aligned. */ static u32 x64_xmm_base(const X64NativeTarget* a, u32 cs_fp) { - if (cs_fp == 0) return a->cum_off; - return align_up_u32(a->cum_off, 16u); + if (cs_fp == 0) return a->frame.cum_off; + return align_up_u32(a->frame.cum_off, 16u); } static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int, u32 cs_fp) { u32 xmm_base = x64_xmm_base(a, cs_fp); - u32 raw = a->max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base; + u32 raw = a->frame.max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base; u32 fs = align_up_u32(raw, 16u); return fs ? fs : 16u; } @@ -1381,16 +1348,16 @@ static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int, /* Collect the callee-saves the body actually used. */ static u32 x64_collect_int_saves(X64NativeTarget* a, Reg* regs) { u32 n = 0, i; - for (i = 0; i < a->ncallee_saves; ++i) - if (a->callee_saves[i].cls == NATIVE_REG_INT) - regs[n++] = a->callee_saves[i].reg; + for (i = 0; i < a->frame.ncallee_saves; ++i) + if (a->frame.callee_saves[i].cls == NATIVE_REG_INT) + regs[n++] = a->frame.callee_saves[i].reg; return n; } static u32 x64_collect_fp_saves(X64NativeTarget* a, Reg* regs) { u32 n = 0, i; - for (i = 0; i < a->ncallee_saves; ++i) - if (a->callee_saves[i].cls == NATIVE_REG_FP) - regs[n++] = a->callee_saves[i].reg; + for (i = 0; i < a->frame.ncallee_saves; ++i) + if (a->frame.callee_saves[i].cls == NATIVE_REG_FP) + regs[n++] = a->frame.callee_saves[i].reg; return n; } @@ -1486,9 +1453,9 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { a->func = fd; a->loc = fd->loc; a->abi = x64_abi_for_os(t->c->target.os); - a->nslots = 0; - a->cum_off = 0; - a->max_outgoing = 0; + /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing, + * callee-save set, and known_frame/has_alloca/frame_final. */ + native_frame_reset(&a->frame); a->incoming_stack_size = 0; a->next_param_int = 0; a->next_param_fp = 0; @@ -1499,10 +1466,6 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { a->reg_save_slot = NATIVE_FRAME_SLOT_NONE; a->npatches = 0; a->nalloca = 0; - a->ncallee_saves = 0; - a->known_frame = 0; - a->has_alloca = 0; - a->frame_final = 0; a->prologue_nbytes = a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES; @@ -1573,13 +1536,58 @@ static void x64_func_begin(NativeTarget* t, const CGFuncDesc* fd) { x64_emit_variadic_reg_saves(a); } +/* x64 homes callee-saves below the locals (offsets computed in + * x64_compute_frame_size / x64_build_prologue), not in frame slots, so + * alloc_slots=0: native_frame just records the {reg,cls} set from the masks. */ +static void x64_reserve_callee_saves(NativeTarget* t, const u32* used, + u32 nclasses) { + native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0); +} + +/* Optimizer entry point: the full frame is supplied up front, so the prologue + * is emitted final the moment it is built — no NOP region, no func_end patch + * (x64_func_end skips patching when known_frame). x64_build_prologue emits the + * push rbp / sub rsp / sret spill / callee-save spills; the variadic + * register-save stores are emitted separately, as on the single-pass path. */ static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, const NativeKnownFrameDesc* frame, NativeFrameSlot* out_slots) { - (void)fd; - (void)frame; - (void)out_slots; - x64_panic(x64_of(t), "known-frame path not implemented yet"); + X64NativeTarget* a = x64_of(t); + MCEmitter* mc = t->mc; + Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS]; + u32 n_int, n_fp, frame_size, nbytes, chkstk_disp_pos, i; + u8 buf[X64_PROLOGUE_BYTES_WIN64]; + x64_func_begin_common(t, fd); + a->frame.known_frame = 1; + if (frame) { + a->frame.has_alloca = frame->has_alloca; + if (frame->callee_saved_used && frame->ncallee_classes) + x64_reserve_callee_saves(t, frame->callee_saved_used, + frame->ncallee_classes); + for (i = 0; i < frame->nslots; ++i) { + NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]); + if (out_slots) out_slots[i] = slot; + } + x64_reserve_entry_saves(a); + native_frame_note_outgoing(&a->frame, frame->max_outgoing); + } + /* Frame is final: size and offsets are settled, so emit the exact prologue. */ + n_int = x64_collect_int_saves(a, cs_int); + n_fp = x64_collect_fp_saves(a, cs_fp); + frame_size = x64_compute_frame_size(a, n_int, n_fp); + a->frame_size_final = frame_size; + a->prologue_pos = mc->pos(mc); + nbytes = x64_build_prologue(a, buf, sizeof buf, frame_size, cs_int, n_int, + cs_fp, n_fp, &chkstk_disp_pos); + mc->emit_bytes(mc, buf, nbytes); + if (chkstk_disp_pos != (u32)-1) { + ObjSymId chk = x64_chkstk_sym(t); + mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos, + R_X64_PLT32, chk, -4, 1, 0); + } + a->prologue_nbytes = nbytes; /* exact length: used for the CFI post offset */ + x64_emit_variadic_reg_saves(a); + native_frame_set_final(&a->frame); } static void x64_func_end(NativeTarget* t) { @@ -1610,7 +1618,7 @@ static void x64_func_end(NativeTarget* t) { emit_ret(mc); /* Patch the single-pass prologue placeholder. */ - if (!a->known_frame) { + if (!a->frame.known_frame) { u8 buf[X64_PROLOGUE_BYTES_WIN64]; u32 chkstk_disp_pos; u32 nbytes; @@ -1629,7 +1637,7 @@ static void x64_func_end(NativeTarget* t) { /* Patch alloca disp32s: lea dst, [rsp + max_outgoing]. */ { - u32 mo = align_up_u32(a->max_outgoing, 16u); + u32 mo = align_up_u32(a->frame.max_outgoing, 16u); u32 k; for (k = 0; k < a->npatches; ++k) { u8 dbuf[4]; @@ -1640,8 +1648,10 @@ static void x64_func_end(NativeTarget* t) { /* CFI: after the prologue, CFA = rbp + 16; rbp at cfa-16, ra at cfa-8. */ if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { - u32 post = - a->prologue_pos + (a->known_frame ? 0u : a->prologue_nbytes); + /* Body starts past the prologue. prologue_nbytes is the reserved NOP-region + * size on the single-pass path and the exact prologue length on the + * known-frame path (set in x64_func_begin_known_frame). */ + u32 post = a->prologue_pos + a->prologue_nbytes; u32 k; mc->cfi_set_next_pc_offset(mc, post - a->func_start); mc->cfi_def_cfa(mc, X64_RBP, 16); @@ -1991,25 +2001,9 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, /* ============================ calls / returns ============================ */ -typedef struct { - NativeLoc dst; - NativeLoc src; - u32 src_offset; - u32 size; - int is_addr; - int dup_to_gpr; /* Win64: also place into the matching GPR */ - Reg dup_gpr; -} X64ArgMove; - -static Reg x64_arg_move_src_reg(const X64ArgMove* m, NativeAllocClass* cls_out) { - if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) { - *cls_out = (NativeAllocClass)m->src.cls; - return m->src.v.reg; - } - return REG_NONE; -} +typedef NativeArgMove X64ArgMove; -static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) { +static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { if (m->is_addr) { x64_addr_of_loc(t, m->dst, m->src); } else { @@ -2021,57 +2015,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) { } } -/* Parallel-copy register arg moves with cycle breaking through scratch. */ -static void x64_emit_reg_arg_moves(NativeTarget* t, X64ArgMove* moves, u32 n) { - u8 done[X64_MAX_REG_ARG_MOVES]; - u32 emitted = 0; +/* Parallel-copy register arg moves via the shared scheduler; cycles break + * through the int/fp emit scratch (r11 / xmm14). */ +static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, + u32 n) { + NativeArgShuffle s; if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args"); - memset(done, 0, sizeof done); - while (emitted < n) { - int progress = 0; - u32 i, j; - for (i = 0; i < n; ++i) { - int blocked = 0; - if (done[i]) continue; - for (j = 0; j < n && !blocked; ++j) { - NativeAllocClass sc; - Reg sr; - if (done[j] || j == i) continue; - sr = x64_arg_move_src_reg(&moves[j], &sc); - if (sr != REG_NONE && sr == moves[i].dst.v.reg && - sc == (NativeAllocClass)moves[i].dst.cls) - blocked = 1; - } - if (!blocked) { - x64_emit_one_arg_move(t, &moves[i]); - done[i] = 1; - emitted++; - progress = 1; - } - } - if (!progress) { - u32 k = 0; - NativeAllocClass bc, sc; - Reg scratch_reg; - NativeLoc scratchloc; - u32 jj; - while (k < n && - (done[k] || x64_arg_move_src_reg(&moves[k], &sc) == REG_NONE)) - ++k; - bc = (NativeAllocClass)moves[k].dst.cls; - scratch_reg = bc == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT2; - scratchloc = x64_reg_loc(moves[k].dst.type, bc, scratch_reg); - x64_move(t, scratchloc, moves[k].dst); - for (jj = 0; jj < n; ++jj) { - Reg sr = x64_arg_move_src_reg(&moves[jj], &sc); - if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg && - sc == bc) { - moves[jj].src = scratchloc; - moves[jj].src_offset = 0; - } - } - } - } + memset(&s, 0, sizeof s); + s.t = t; + s.emit_one = x64_emit_one_arg_move; + s.reg_move = x64_move; + s.scratch[NATIVE_REG_INT] = X64_TMP_INT2; + s.scratch[NATIVE_REG_FP] = X64_TMP_FP; + native_arg_shuffle(&s, moves, n); } /* Clobber masks: per-call all caller-saved regs are clobbered. */ @@ -2122,8 +2078,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, plan->has_sret = abi && abi->has_sret; plan->is_variadic = abi && abi->variadic; plan->stack_arg_size = x64_call_stack_size(t, desc); - if (plan->stack_arg_size > a->max_outgoing) - a->max_outgoing = plan->stack_arg_size; + if (plan->stack_arg_size > a->frame.max_outgoing) + a->frame.max_outgoing = plan->stack_arg_size; for (c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) { plan->clobber_mask[c] = x64_clobber_mask(aregs, (NativeAllocClass)c); plan->return_mask[c] = x64_return_mask(abi, (NativeAllocClass)c); @@ -2275,8 +2231,21 @@ static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) { X64NativeTarget* a = x64_of(t); MCEmitter* mc = t->mc; ObjSecId sec = mc->section_id; - if (a->ncallee_saves) - x64_panic(a, "tail call with callee-saves (O1 path) not implemented"); + /* Restore callee-saves before the frame teardown (O1 path; none at -O0). + * Their rbp-relative offsets are frame-size-independent, and the indirect + * callee was staged in r11 by plan_call — a caller-saved scratch — so these + * restores never clobber it. Mirrors the x64_func_end epilogue. */ + Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS]; + u32 n_int = x64_collect_int_saves(a, cs_int); + u32 n_fp = x64_collect_fp_saves(a, cs_fp); + u32 xmm_base = x64_xmm_base(a, n_fp); + i32 i; + for (i = (i32)n_fp - 1; i >= 0; --i) + emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP, + -(i32)xmm_base - (i32)(i + 1) * 16); /* movaps */ + for (i = (i32)n_int - 1; i >= 0; --i) + emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP, + -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8); emit_leave(mc); if (callee.kind == NATIVE_LOC_GLOBAL) { u8 op = X64_OPC_JMP_REL32; @@ -2437,7 +2406,7 @@ static void x64_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, } emit_alu_rr(mc, 1, X64_OPC_ALU_SUB, X64_RSP, X64_RAX); } - a->has_alloca = 1; + a->frame.has_alloca = 1; /* lea dst, [rsp + max_outgoing] — disp32 patched in func_end. */ if (a->npatches == a->patches_cap) { u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u; @@ -3545,6 +3514,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, t->c = c; t->obj = obj; t->mc = mc; + native_frame_init(&a->frame, c); t->regs = &x64_reg_info; t->class_for_type = x64_class_for_type; t->imm_legal = x64_imm_legal; @@ -3552,7 +3522,9 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, t->func_begin = x64_func_begin; t->func_begin_known_frame = x64_func_begin_known_frame; t->note_frame_state = NULL; - t->reserve_callee_saves = NULL; + /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved + * set; x64_func_begin_known_frame derives the records from the masks. */ + t->reserve_callee_saves = x64_reserve_callee_saves; t->signature_stack_bytes = x64_signature_stack_bytes; t->call_stack_bytes = x64_call_stack_bytes; t->has_store_zero_reg = 0; @@ -3627,7 +3599,7 @@ static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { NativeLoc* args = NULL; NativeLoc* results = NULL; u32 i, stack; - if (a->ncallee_saves) + if (a->frame.ncallee_saves) return "x64 tail call: callee-saved registers in use"; memset(&nd, 0, sizeof nd); if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); diff --git a/src/cg/native_argmove.c b/src/cg/native_argmove.c @@ -0,0 +1,79 @@ +#include "cg/native_argmove.h" + +/* Upper bound on moves in one shuffle. Callers (per-call register args, entry + * param binds) are bounded by the ABI register pools (<= 16 per class); 32 + * covers both classes with headroom. */ +#define NATIVE_ARG_SHUFFLE_MAX 32u + +/* The register a move reads, or REG_NONE if it reads none (immediate, frame + * slot, or an address derived from fp/sp/a global). Only a register source can + * be invalidated by another move writing its destination register, so only + * these constrain the emission order. */ +static Reg nam_src_reg(const NativeArgMove* m, NativeAllocClass* cls_out) { + if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) { + *cls_out = (NativeAllocClass)m->src.cls; + return m->src.v.reg; + } + return REG_NONE; +} + +void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n) { + u8 done[NATIVE_ARG_SHUFFLE_MAX]; + u32 emitted = 0; + if (n > NATIVE_ARG_SHUFFLE_MAX) + compiler_panic(s->t->c, (SrcLoc){0, 0, 0}, + "native arg shuffle: too many moves"); + memset(done, 0, sizeof done); + /* Mark-and-sweep over the move set. Each pass emits every move whose + * destination is no longer needed as a source (topological order); when only + * a true cycle remains, break one member and continue. */ + while (emitted < n) { + int progress = 0; + u32 i, j; + for (i = 0; i < n; ++i) { + int blocked = 0; + if (done[i]) continue; + for (j = 0; j < n && !blocked; ++j) { + NativeAllocClass sc; + Reg sr; + if (j == i || done[j]) continue; + sr = nam_src_reg(&moves[j], &sc); + if (sr != REG_NONE && sr == moves[i].dst.v.reg && + sc == (NativeAllocClass)moves[i].dst.cls) + blocked = 1; + } + if (!blocked) { + s->emit_one(s->t, &moves[i]); + done[i] = 1; + emitted++; + progress = 1; + } + } + if (!progress) { + /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so + * at least one remaining move is register-sourced. Break on such a move: + * its destination register still holds a pending source value, so copy it + * to the class scratch and point that value's readers at the scratch. */ + u32 k = 0; + NativeAllocClass bc, sc; + NativeLoc scratchloc; + while (k < n && + (done[k] || nam_src_reg(&moves[k], &sc) == REG_NONE)) + ++k; + bc = (NativeAllocClass)moves[k].dst.cls; + memset(&scratchloc, 0, sizeof scratchloc); + scratchloc.kind = NATIVE_LOC_REG; + scratchloc.cls = (u8)bc; + scratchloc.type = moves[k].dst.type; + scratchloc.v.reg = s->scratch[bc]; + s->reg_move(s->t, scratchloc, moves[k].dst); + for (j = 0; j < n; ++j) { + Reg sr = nam_src_reg(&moves[j], &sc); + if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) { + moves[j].src = scratchloc; + moves[j].src_offset = 0; + } + } + } + } +} diff --git a/src/cg/native_argmove.h b/src/cg/native_argmove.h @@ -0,0 +1,60 @@ +#ifndef CFREE_CG_NATIVE_ARGMOVE_H +#define CFREE_CG_NATIVE_ARGMOVE_H + +/* Shared parallel-copy register-move scheduler for the native backends. + * + * Marshalling call arguments (and, on the optimizer path, binding incoming + * parameters) means realizing a set of register `dst <- src` moves as a + * *parallel* copy: every register must be read by all moves that source it + * before any move overwrites it. The allocator usually arranges a conflict-free + * order, but not always (notably variadic args, and tail-call / entry + * permutations the allocator is free to rotate). When a true cycle remains — + * a rotation like rdi<-rdx, rsi<-rdi, rdx<-rsi — it is broken by stashing one + * member's register into a scratch and redirecting that value's readers. + * + * The scheduling (topological emission + cycle detection + the scratch break) + * is identical across aa64/rv64/x64; only the leaf operations differ — how one + * move is emitted and which register is the scratch. Those come in via the ops + * struct, so all three backends (and the entry-bind path) share one scheduler. */ + +#include <string.h> + +#include "arch/native_target.h" +#include "core/core.h" + +/* A single register-destination move. `src` may be a register (the common, + * order-constraining case), a frame/stack slot, an immediate, or — when + * `is_addr` — an address to materialize into `dst`. Identical to the per-arch + * arg-move structs it replaces. */ +typedef struct NativeArgMove { + NativeLoc dst; + NativeLoc src; + u32 src_offset; + u32 size; + int is_addr; + /* Optional emit hints, opaque to the scheduler (it only reads dst/src/is_addr + * for ordering). x64 uses these for the Win64 "variadic FP arg also duplicated + * into the matching GPR" rule; other backends leave them zero. */ + int dup_to_gpr; + Reg dup_gpr; +} NativeArgMove; + +typedef struct NativeArgShuffle { + NativeTarget* t; + /* Emit one move as-is: an address-of for is_addr, else a load/copy of `size` + * bytes at `src_offset` from `src` into `dst`. */ + void (*emit_one)(NativeTarget* t, const NativeArgMove* m); + /* A plain register-to-register copy, used to stash a cycle victim into the + * scratch register before redirecting its readers. */ + void (*reg_move)(NativeTarget* t, NativeLoc dst, NativeLoc src); + /* Scratch register per NativeAllocClass (index by class id) for breaking + * cycles. A cycle's ring is register-to-register, so the scratch needs no + * address arithmetic — a bare register suffices. */ + Reg scratch[NATIVE_REG_VEC + 1]; +} NativeArgShuffle; + +/* Emit `moves[0..n)` as a parallel register copy via `s`. Mutates `moves` + * (redirects a broken cycle's readers to the scratch register). */ +void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n); + +#endif