commit 622485a9c40384eb68d2a018fb5aa7279a169343
parent 910583a6980b0af766d502b20a688260efa409ec
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 08:21:36 -0700
rv64+x64: implement sibling (tail) calls
rv_no_tail now checks realizability (outgoing stack args must fit the caller's
incoming-arg window) instead of always blocking; rv_plan_call routes tail
outgoing stack args into that window ([s0+16+va_save+off]) and forwards the
caller's incoming sret pointer; rv_emit_call tears the frame down to the
caller's entry state (restoring ra, s0 and sp, with sp set to the CFA computed
from s0, which is frame_size-independent at -O0 where there are no
callee-saves) and sibling-jumps (auipc+jr / jr reg).
test-toy X-O0 rv64: 256 pass, 0 fail, 0 skip (all musttail/tail cases green).
x64: implement sibling (tail) calls
x64_no_tail checks realizability (outgoing stack args fit the caller's
incoming-arg window, accounting for shadow space); plan_call routes tail
outgoing stack args into that window ([rbp+16+off]) and forwards the caller's
incoming sret pointer; emit_call tears the frame down with leave (restoring
the caller's rbp, leaving rsp at the return address — frame_size-independent)
and jmps the callee (jmp rel32 / jmp r/m via the r11-staged indirect callee).
test-toy X-O0 x64: 256 pass, 0 fail, 0 skip.
Diffstat:
| M | src/arch/rv64/native.c | | | 105 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- |
| M | src/arch/x64/native.c | | | 114 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- |
2 files changed, 193 insertions(+), 26 deletions(-)
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -1668,14 +1668,23 @@ static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
rv_load_addr(t, dst, addr);
}
-static void rv_store_outgoing_part(NativeTarget* t, u32 stack_off, NativeLoc src,
- u32 size) {
+static void rv_store_outgoing_part(NativeTarget* t, int tail_call, u32 stack_off,
+ NativeLoc src, u32 size) {
NativeAddr addr;
memset(&addr, 0, sizeof addr);
addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.base.reg = RV_SP;
addr.base_type = src.type;
- addr.offset = (i32)stack_off;
+ if (tail_call) {
+ /* A sibling call reuses the caller's frame: its outgoing stack args land in
+ * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically
+ * the same address the tail-callee will read at [sp+off] once the teardown
+ * has restored sp to the caller's entry sp (the CFA).
+ * rv_s0_off_in_arg() supplies the s0-relative displacement for `stack_off`;
+ * the non-tail branch below addresses the slot as [sp + stack_off] instead.
+ * Either way `size` bytes of `src` are written through rv_emit_mem().
+ * NOTE(review): nothing here checks that the store stays inside the
+ * incoming window, or that it does not overwrite an incoming stack argument
+ * that has yet to be read; presumably rv_no_tail and the entry-time
+ * parameter binding guarantee both -- confirm. */
+ addr.base.reg = RV_S0;
+ addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
+ } else {
+ addr.base.reg = RV_SP;
+ addr.offset = (i32)stack_off;
+ }
rv_emit_mem(rv_of(t), 0, src, addr, rv_mem_for_type(t, src.type, size));
}
@@ -1871,6 +1880,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
* integer argument (a0), so the real args start at a1. */
u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
+ int tail = (desc->flags & CG_CALL_TAIL) != 0;
RvArgMove moves[RV_MAX_REG_ARG_MOVES];
for (i = 0; i < desc->nargs; ++i) {
ABIArgInfo tmp;
@@ -1883,7 +1893,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
u32 n = rv_class_stack_size(ai), off = 0;
while (off < n) {
rv_load_part(t, tmpreg, desc->args[i], off, 8);
- rv_store_outgoing_part(t, stack + off, tmpreg, 8);
+ rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8);
off += 8;
}
stack += n;
@@ -1900,7 +1910,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
} else {
NativeLoc ptr = rv_reg_loc(i64t, NATIVE_REG_INT, RV_TMP0);
rv_addr_of_loc(t, ptr, desc->args[i]);
- rv_store_outgoing_part(t, stack, ptr, 8);
+ rv_store_outgoing_part(t, tail, stack, ptr, 8);
stack += 8u;
}
continue;
@@ -1924,16 +1934,22 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
NativeLoc tmpreg = rv_reg_loc(desc->args[i].type, cls, tmp);
rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
stack = align_up_u32(stack, rv_part_stack_align(part));
- rv_store_outgoing_part(t, stack, tmpreg, part->size);
+ rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
stack += rv_part_stack_size(part);
}
}
}
rv_emit_reg_arg_moves(t, moves, nmoves);
if (abi && abi->has_sret && desc->nresults) {
- /* sret pointer goes in a0; arg loads have completed. */
+ /* sret pointer goes in a0; arg loads have completed. A tail call forwards
+ * the caller's own incoming sret pointer (spilled at entry) so the
+ * sibling writes the result into the caller's caller's destination;
+ * otherwise pass the address of this call's result slot. */
NativeLoc a0 = rv_reg_loc(i64t, NATIVE_REG_INT, RV_A0);
- rv_addr_of_loc(t, a0, desc->results[0]);
+ if (tail)
+ rv_load_part(t, a0, rv_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8);
+ else
+ rv_addr_of_loc(t, a0, desc->results[0]);
}
}
if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
@@ -1967,10 +1983,40 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
}
}
+/* Emit a sibling (tail) call: tear the frame down to the caller's entry state
+ * and jump (no link) to the callee. Outgoing args are already in the arg regs /
+ * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
+ * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
+ * the not-yet-final frame_size — so no func_end patching is needed.
+ *
+ * Teardown order matters: the saved s0 lives at [s0], which is below the CFA,
+ * and the psABI gives no guarantee for stack data below sp (a signal frame may
+ * be written there). So sp is first moved only as far as s0, the saved s0 is
+ * reloaded from [sp] while that slot is still covered by sp, and only then is
+ * sp raised to the CFA. The final ra/s0/sp values are the same as loading s0
+ * after the sp restore would give, without the window in which the slot is
+ * unprotected.
+ * NOTE(review): sp equals s0 for one instruction; that value is 16-byte
+ * aligned only if RV_FRAME_SAVE_SIZE + va_save is a multiple of 16 -- confirm.
+ *
+ * t: target whose emitter (t->mc) receives the instructions.
+ * callee: NATIVE_LOC_GLOBAL emits an auipc + jalr-x0 pair through RV_TMP0 with
+ * an R_RV_CALL relocation at the auipc; NATIVE_LOC_REG emits jalr x0 through
+ * that register; any other kind panics. Also panics when a->ncallee_saves is
+ * non-zero (that path is not implemented). */
+static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a));
+ if (a->ncallee_saves)
+ rv_panic(a, "tail call with callee-saves (O1 path) not implemented");
+ rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
+ /* sp = s0: keeps the {s0, ra} save pair at or above sp while it is read. */
+ rv64_emit32(mc, rv_addi(RV_SP, RV_S0, 0));
+ rv64_emit32(mc, rv_ld(RV_S0, RV_SP, 0));
+ /* sp = CFA: the caller's entry sp, where the tail-callee expects it. */
+ rv64_emit32(mc, rv_addi(RV_SP, RV_SP, cfa));
+ if (callee.kind == NATIVE_LOC_GLOBAL) {
+ u32 pos = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
+ callee.v.global.addend, 0, 0);
+ } else if (callee.kind == NATIVE_LOC_REG) {
+ rv64_emit32(mc, rv_jalr(RV_ZERO, loc_reg(callee), 0));
+ } else {
+ rv_panic(a, "unsupported tail call target");
+ }
+}
+
static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
MCEmitter* mc = t->mc;
ObjSecId sec = mc->section_id;
- if (plan->flags & CG_CALL_TAIL) rv_panic(rv_of(t), "tail call not implemented");
+ if (plan->flags & CG_CALL_TAIL) {
+ rv_emit_tail_site(t, plan->callee);
+ return;
+ }
if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
u32 pos = mc->pos(mc);
rv64_emit32(mc, rv_auipc(RV_RA, 0));
@@ -3257,10 +3303,43 @@ static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
rv_bind_native_param(d->native, p, dst);
}
+/* A sibling call is realizable when its outgoing stack-argument area fits the
+ * window the caller itself received (so the args land in the caller's incoming
+ * slots without overflowing into the caller's caller's frame). Register-only
+ * calls (the common case) always qualify. Mirrors aa64's aa_no_tail.
+ *
+ * Returns NULL when the tail call may be emitted, otherwise a static string
+ * naming the reason it may not. The check refuses outright when
+ * a->ncallee_saves is non-zero. Otherwise it builds a temporary NativeCallDesc
+ * that describes every argument and result as the frame home of its local
+ * (arrays taken from the d->base.c->tu arena) purely so rv_call_stack_size()
+ * can measure the outgoing area, and compares that with
+ * a->incoming_stack_size.
+ * NOTE(review): the size test does not reject arguments that rv_plan_call
+ * passes by address (its rv_addr_of_loc path); such a pointer would refer to
+ * this frame, which the tail site releases before the jump -- confirm these
+ * calls are filtered elsewhere. */
static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
- (void)d;
- (void)call;
- return "rv64 tail calls not implemented yet";
+ RvNativeTarget* a = rv_of(d->native);
+ NativeCallDesc nd;
+ NativeLoc* args = NULL;
+ NativeLoc* results = NULL;
+ u32 i, stack;
+ if (a->ncallee_saves)
+ return "rv64 tail call: callee-saved registers in use";
+ memset(&nd, 0, sizeof nd);
+ if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+ if (call->nresults)
+ results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+ for (i = 0; i < call->nargs; ++i) {
+ args[i].kind = NATIVE_LOC_FRAME;
+ args[i].type = d->locals[call->args[i] - 1u].type;
+ args[i].cls = d->locals[call->args[i] - 1u].cls;
+ args[i].v.frame = d->locals[call->args[i] - 1u].home;
+ }
+ for (i = 0; i < call->nresults; ++i) {
+ results[i].kind = NATIVE_LOC_FRAME;
+ results[i].type = d->locals[call->results[i] - 1u].type;
+ results[i].cls = d->locals[call->results[i] - 1u].cls;
+ results[i].v.frame = d->locals[call->results[i] - 1u].home;
+ }
+ nd.fn_type = call->fn_type;
+ nd.args = args;
+ nd.results = results;
+ nd.nargs = call->nargs;
+ nd.nresults = call->nresults;
+ stack = rv_call_stack_size(d->native, &nd);
+ if (stack > a->incoming_stack_size)
+ return "rv64 tail call: stack argument area too small";
+ return NULL;
}
/* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1850,15 +1850,26 @@ static void x64_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
x64_load_addr(t, dst, addr);
}
-static void x64_store_outgoing_part(NativeTarget* t, u32 stack_off,
- NativeLoc src, u32 size) {
+static void x64_store_outgoing_part(NativeTarget* t, int tail_call,
+ u32 stack_off, NativeLoc src, u32 size) {
+ X64NativeTarget* a = x64_of(t);
NativeAddr addr;
memset(&addr, 0, sizeof addr);
addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.base.reg = X64_RSP;
addr.base_type = src.type;
- addr.offset = (i32)stack_off;
- x64_emit_mem(x64_of(t), 0, src, addr, x64_mem_for_type(t, src.type, size));
+ if (tail_call) {
+ /* A sibling call reuses the caller's frame: its outgoing stack args land in
+ * the caller's incoming-arg window. `stack_off` already includes the
+ * shadow-space prefix (the outgoing cursor starts at shadow_space), so the
+ * window address is [rbp + 16 + stack_off] — the same bytes the tail-callee
+ * reads once `leave` has restored rsp to the return address.
+ * The non-tail branch below addresses the slot as [rsp + stack_off]
+ * instead. Either way `size` bytes of `src` are written through
+ * x64_emit_mem().
+ * NOTE(review): nothing here checks that the store stays inside the
+ * incoming window, or that it does not overwrite an incoming stack argument
+ * that has yet to be read; presumably x64_no_tail and the entry-time
+ * parameter binding guarantee both -- confirm. */
+ addr.base.reg = X64_RBP;
+ addr.offset = (i32)(16u + stack_off);
+ } else {
+ addr.base.reg = X64_RSP;
+ addr.offset = (i32)stack_off;
+ }
+ x64_emit_mem(a, 0, src, addr, x64_mem_for_type(t, src.type, size));
}
/* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
@@ -2128,6 +2139,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
{
u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
u32 next_fp = 0, stack = aregs->shadow_space, nmoves = 0, i;
+ int tail = (desc->flags & CG_CALL_TAIL) != 0;
u16 p;
X64ArgMove moves[X64_MAX_REG_ARG_MOVES];
x64_sync_slot(aregs, &next_int, &next_fp);
@@ -2148,7 +2160,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
} else {
NativeLoc ptr = x64_reg_loc(i64t, NATIVE_REG_INT, X64_RAX);
x64_addr_of_loc(t, ptr, desc->args[i]);
- x64_store_outgoing_part(t, stack, ptr, 8);
+ x64_store_outgoing_part(t, tail, stack, ptr, 8);
stack += 8u;
}
x64_sync_slot(aregs, &next_int, &next_fp);
@@ -2163,7 +2175,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp);
x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
- x64_store_outgoing_part(t, stack, tmpreg, part->size);
+ x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
stack += 8u;
}
continue;
@@ -2199,7 +2211,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp);
x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
- x64_store_outgoing_part(t, stack, tmpreg, part->size);
+ x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
stack += 8u;
x64_sync_slot(aregs, &next_int, &next_fp);
}
@@ -2207,8 +2219,14 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
}
x64_emit_reg_arg_moves(t, moves, nmoves);
if (abi && abi->has_sret && desc->nresults) {
+ /* sret pointer in the first int-arg reg. A tail call forwards the
+ * caller's own incoming sret pointer (spilled at entry); otherwise pass
+ * the address of this call's result slot. */
NativeLoc sret = x64_reg_loc(i64t, NATIVE_REG_INT, aregs->int_args[0]);
- x64_addr_of_loc(t, sret, desc->results[0]);
+ if (tail)
+ x64_load_part(t, sret, x64_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8);
+ else
+ x64_addr_of_loc(t, sret, desc->results[0]);
}
/* Variadic call: AL = number of vector regs used. */
if (abi && abi->variadic)
@@ -2249,10 +2267,47 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
}
}
+/* Emit a sibling (tail) call: tear the frame down and jump (no call) to the
+ * callee. Outgoing args are already in arg regs / the caller's incoming-arg
+ * window. `leave` (mov rsp,rbp; pop rbp) restores the caller's rbp and leaves
+ * rsp at the return address — frame_size-independent, so no func_end patch.
+ *
+ * t: target whose emitter (t->mc) receives the bytes.
+ * callee: NATIVE_LOC_GLOBAL emits the jmp rel32 opcode followed by a zero
+ * 32-bit displacement, with an R_X64_PLT32 relocation placed on that
+ * displacement (symbol addend reduced by 4); NATIVE_LOC_REG emits jmp r/m64
+ * (/4) on the register, preceded by a REX.B prefix when the register number
+ * has bit 3 set; any other kind panics. Also panics when a->ncallee_saves is
+ * non-zero (that path is not implemented). */
+static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+ X64NativeTarget* a = x64_of(t);
+ MCEmitter* mc = t->mc;
+ ObjSecId sec = mc->section_id;
+ if (a->ncallee_saves)
+ x64_panic(a, "tail call with callee-saves (O1 path) not implemented");
+ emit_leave(mc);
+ if (callee.kind == NATIVE_LOC_GLOBAL) {
+ u8 op = X64_OPC_JMP_REL32;
+ u32 disp_pos;
+ mc->emit_bytes(mc, &op, 1);
+ disp_pos = mc->pos(mc);
+ emit_u32le(mc, 0);
+ mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, callee.v.global.sym,
+ callee.v.global.addend - 4, 1, 0);
+ } else if (callee.kind == NATIVE_LOC_REG) {
+ u32 r = loc_reg(callee); /* indirect callee was staged in r11 by plan_call */
+ if (r & 8u) {
+ u8 rex = X64_REX_BASE | X64_REX_B;
+ mc->emit_bytes(mc, &rex, 1);
+ }
+ {
+ u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; /* jmp r/m, /4 */
+ mc->emit_bytes(mc, buf, 2);
+ }
+ } else {
+ x64_panic(a, "unsupported tail call target");
+ }
+}
+
static void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
MCEmitter* mc = t->mc;
ObjSecId sec = mc->section_id;
- if (plan->flags & CG_CALL_TAIL) x64_panic(x64_of(t), "tail call not implemented");
+ if (plan->flags & CG_CALL_TAIL) {
+ x64_emit_tail_site(t, plan->callee);
+ return;
+ }
if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
u8 op = X64_OPC_CALL_REL32;
u32 disp_pos;
@@ -3564,10 +3619,43 @@ static void x64_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
x64_bind_native_param(d->native, p, dst);
}
+/* A sibling call is realizable when its outgoing stack-argument area fits the
+ * window the caller itself received. Register-only calls always qualify.
+ *
+ * Returns NULL when the tail call may be emitted, otherwise a static string
+ * naming the reason it may not. The check refuses outright when
+ * a->ncallee_saves is non-zero. Otherwise it builds a temporary NativeCallDesc
+ * that describes every argument and result as the frame home of its local
+ * (arrays taken from the d->base.c->tu arena) purely so x64_call_stack_size()
+ * can measure the outgoing area.
+ * NOTE(review): the size test does not reject arguments that x64_plan_call
+ * passes by address (its x64_addr_of_loc path); such a pointer would refer to
+ * this frame, which the tail site releases before the jump -- confirm these
+ * calls are filtered elsewhere. */
static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
- (void)d;
- (void)call;
- return "x64 tail calls not implemented yet";
+ X64NativeTarget* a = x64_of(d->native);
+ NativeCallDesc nd;
+ NativeLoc* args = NULL;
+ NativeLoc* results = NULL;
+ u32 i, stack;
+ if (a->ncallee_saves)
+ return "x64 tail call: callee-saved registers in use";
+ memset(&nd, 0, sizeof nd);
+ if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+ if (call->nresults)
+ results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+ for (i = 0; i < call->nargs; ++i) {
+ args[i].kind = NATIVE_LOC_FRAME;
+ args[i].type = d->locals[call->args[i] - 1u].type;
+ args[i].cls = d->locals[call->args[i] - 1u].cls;
+ args[i].v.frame = d->locals[call->args[i] - 1u].home;
+ }
+ for (i = 0; i < call->nresults; ++i) {
+ results[i].kind = NATIVE_LOC_FRAME;
+ results[i].type = d->locals[call->results[i] - 1u].type;
+ results[i].cls = d->locals[call->results[i] - 1u].cls;
+ results[i].v.frame = d->locals[call->results[i] - 1u].home;
+ }
+ nd.fn_type = call->fn_type;
+ nd.args = args;
+ nd.results = results;
+ nd.nargs = call->nargs;
+ nd.nresults = call->nresults;
+ stack = x64_call_stack_size(d->native, &nd);
+ /* x64_call_stack_size includes the shadow-space prefix; the caller's incoming
+ * window has the same prefix, so compare against incoming_stack_size + it. */
+ if (stack > a->incoming_stack_size + a->abi->shadow_space)
+ return "x64 tail call: stack argument area too small";
+ return NULL;
}
/* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`,