kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 622485a9c40384eb68d2a018fb5aa7279a169343
parent 910583a6980b0af766d502b20a688260efa409ec
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 08:21:36 -0700

rv64+x64: implement sibling (tail) calls

rv_no_tail now checks realizability (outgoing stack args must fit the caller's
incoming-arg window) instead of always blocking; rv_plan_call routes tail
outgoing stack args into that window ([s0+16+va_save+off]) and forwards the
caller's incoming sret pointer; rv_emit_call tears the frame down to the
caller's entry state (restore ra; set sp to the CFA via s0, which is
frame_size-independent at -O0 where there are no callee-saves; then restore s0)
and sibling-jumps (auipc+jr / jr reg).
test-toy X-O0 rv64: 256 pass, 0 fail, 0 skip (all musttail/tail cases green).

x64: implement sibling (tail) calls

x64_no_tail checks realizability (outgoing stack args must fit the caller's
incoming-arg window, accounting for shadow space); x64_plan_call routes tail
outgoing stack args into that window ([rbp+16+off]) and forwards the caller's
incoming sret pointer; x64_emit_call tears the frame down with leave (restoring
the caller's rbp and leaving rsp at the return address — frame_size-independent)
and jumps to the callee (jmp rel32, or jmp r/m for an indirect callee staged in r11).
test-toy X-O0 x64: 256 pass, 0 fail, 0 skip.

Diffstat:
M src/arch/rv64/native.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/arch/x64/native.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 193 insertions(+), 26 deletions(-)

diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c @@ -1668,14 +1668,23 @@ static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { rv_load_addr(t, dst, addr); } -static void rv_store_outgoing_part(NativeTarget* t, u32 stack_off, NativeLoc src, - u32 size) { +static void rv_store_outgoing_part(NativeTarget* t, int tail_call, u32 stack_off, + NativeLoc src, u32 size) { NativeAddr addr; memset(&addr, 0, sizeof addr); addr.base_kind = NATIVE_ADDR_BASE_REG; - addr.base.reg = RV_SP; addr.base_type = src.type; - addr.offset = (i32)stack_off; + if (tail_call) { + /* A sibling call reuses the caller's frame: its outgoing stack args land in + * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically + * the same address the tail-callee will read at [sp+off] once the teardown + * has restored sp to the caller's entry sp (the CFA). */ + addr.base.reg = RV_S0; + addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off); + } else { + addr.base.reg = RV_SP; + addr.offset = (i32)stack_off; + } rv_emit_mem(rv_of(t), 0, src, addr, rv_mem_for_type(t, src.type, size)); } @@ -1871,6 +1880,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, * integer argument (a0), so the real args start at a1. */ u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; u32 next_fp = 0, stack = 0, nmoves = 0, i, p; + int tail = (desc->flags & CG_CALL_TAIL) != 0; RvArgMove moves[RV_MAX_REG_ARG_MOVES]; for (i = 0; i < desc->nargs; ++i) { ABIArgInfo tmp; @@ -1883,7 +1893,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, u32 n = rv_class_stack_size(ai), off = 0; while (off < n) { rv_load_part(t, tmpreg, desc->args[i], off, 8); - rv_store_outgoing_part(t, stack + off, tmpreg, 8); + rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8); off += 8; } stack += n; @@ -1900,7 +1910,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, } else { NativeLoc ptr = rv_reg_loc(i64t, NATIVE_REG_INT, RV_TMP0); rv_addr_of_loc(t, ptr, desc->args[i]); - rv_store_outgoing_part(t, stack, ptr, 8); + rv_store_outgoing_part(t, tail, stack, ptr, 8); stack += 8u; } continue; @@ -1924,16 +1934,22 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, NativeLoc tmpreg = rv_reg_loc(desc->args[i].type, cls, tmp); rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); stack = align_up_u32(stack, rv_part_stack_align(part)); - rv_store_outgoing_part(t, stack, tmpreg, part->size); + rv_store_outgoing_part(t, tail, stack, tmpreg, part->size); stack += rv_part_stack_size(part); } } } rv_emit_reg_arg_moves(t, moves, nmoves); if (abi && abi->has_sret && desc->nresults) { - /* sret pointer goes in a0; arg loads have completed. */ + /* sret pointer goes in a0; arg loads have completed. A tail call forwards + * the caller's own incoming sret pointer (spilled at entry) so the + * sibling writes the result into the caller's caller's destination; + * otherwise pass the address of this call's result slot. 
*/ NativeLoc a0 = rv_reg_loc(i64t, NATIVE_REG_INT, RV_A0); - rv_addr_of_loc(t, a0, desc->results[0]); + if (tail) + rv_load_part(t, a0, rv_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8); + else + rv_addr_of_loc(t, a0, desc->results[0]); } } if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { @@ -1967,10 +1983,40 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, } } +/* Emit a sibling (tail) call: tear the frame down to the caller's entry state + * and jump (no link) to the callee. Outgoing args are already in the arg regs / + * the caller's incoming-arg window. At -O0 there are no callee-saves, and the + * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of + * the not-yet-final frame_size — so no func_end patching is needed. */ +static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) { + RvNativeTarget* a = rv_of(t); + MCEmitter* mc = t->mc; + i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a)); + if (a->ncallee_saves) + rv_panic(a, "tail call with callee-saves (O1 path) not implemented"); + rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8)); + rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa)); + rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0)); + if (callee.kind == NATIVE_LOC_GLOBAL) { + u32 pos = mc->pos(mc); + rv64_emit32(mc, rv_auipc(RV_TMP0, 0)); + rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0)); + mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym, + callee.v.global.addend, 0, 0); + } else if (callee.kind == NATIVE_LOC_REG) { + rv64_emit32(mc, rv_jalr(RV_ZERO, loc_reg(callee), 0)); + } else { + rv_panic(a, "unsupported tail call target"); + } +} + static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) { MCEmitter* mc = t->mc; ObjSecId sec = mc->section_id; - if (plan->flags & CG_CALL_TAIL) rv_panic(rv_of(t), "tail call not implemented"); + if (plan->flags & CG_CALL_TAIL) { + rv_emit_tail_site(t, plan->callee); + return; + } if (plan->callee.kind == NATIVE_LOC_GLOBAL) { u32 pos = 
mc->pos(mc); rv64_emit32(mc, rv_auipc(RV_RA, 0)); @@ -3257,10 +3303,43 @@ static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p, rv_bind_native_param(d->native, p, dst); } +/* A sibling call is realizable when its outgoing stack-argument area fits the + * window the caller itself received (so the args land in the caller's incoming + * slots without overflowing into the caller's caller's frame). Register-only + * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */ static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { - (void)d; - (void)call; - return "rv64 tail calls not implemented yet"; + RvNativeTarget* a = rv_of(d->native); + NativeCallDesc nd; + NativeLoc* args = NULL; + NativeLoc* results = NULL; + u32 i, stack; + if (a->ncallee_saves) + return "rv64 tail call: callee-saved registers in use"; + memset(&nd, 0, sizeof nd); + if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); + if (call->nresults) + results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults); + for (i = 0; i < call->nargs; ++i) { + args[i].kind = NATIVE_LOC_FRAME; + args[i].type = d->locals[call->args[i] - 1u].type; + args[i].cls = d->locals[call->args[i] - 1u].cls; + args[i].v.frame = d->locals[call->args[i] - 1u].home; + } + for (i = 0; i < call->nresults; ++i) { + results[i].kind = NATIVE_LOC_FRAME; + results[i].type = d->locals[call->results[i] - 1u].type; + results[i].cls = d->locals[call->results[i] - 1u].cls; + results[i].v.frame = d->locals[call->results[i] - 1u].home; + } + nd.fn_type = call->fn_type; + nd.args = args; + nd.results = results; + nd.nargs = call->nargs; + nd.nresults = call->nresults; + stack = rv_call_stack_size(d->native, &nd); + if (stack > a->incoming_stack_size) + return "rv64 tail call: stack argument area too small"; + return NULL; } /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg` diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ 
-1850,15 +1850,26 @@ static void x64_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { x64_load_addr(t, dst, addr); } -static void x64_store_outgoing_part(NativeTarget* t, u32 stack_off, - NativeLoc src, u32 size) { +static void x64_store_outgoing_part(NativeTarget* t, int tail_call, + u32 stack_off, NativeLoc src, u32 size) { + X64NativeTarget* a = x64_of(t); NativeAddr addr; memset(&addr, 0, sizeof addr); addr.base_kind = NATIVE_ADDR_BASE_REG; - addr.base.reg = X64_RSP; addr.base_type = src.type; - addr.offset = (i32)stack_off; - x64_emit_mem(x64_of(t), 0, src, addr, x64_mem_for_type(t, src.type, size)); + if (tail_call) { + /* A sibling call reuses the caller's frame: its outgoing stack args land in + * the caller's incoming-arg window. `stack_off` already includes the + * shadow-space prefix (the outgoing cursor starts at shadow_space), so the + * window address is [rbp + 16 + stack_off] — the same bytes the tail-callee + * reads once `leave` has restored rsp to the return address. */ + addr.base.reg = X64_RBP; + addr.offset = (i32)(16u + stack_off); + } else { + addr.base.reg = X64_RSP; + addr.offset = (i32)stack_off; + } + x64_emit_mem(a, 0, src, addr, x64_mem_for_type(t, src.type, size)); } /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */ @@ -2128,6 +2139,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, { u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; u32 next_fp = 0, stack = aregs->shadow_space, nmoves = 0, i; + int tail = (desc->flags & CG_CALL_TAIL) != 0; u16 p; X64ArgMove moves[X64_MAX_REG_ARG_MOVES]; x64_sync_slot(aregs, &next_int, &next_fp); @@ -2148,7 +2160,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } else { NativeLoc ptr = x64_reg_loc(i64t, NATIVE_REG_INT, X64_RAX); x64_addr_of_loc(t, ptr, desc->args[i]); - x64_store_outgoing_part(t, stack, ptr, 8); + x64_store_outgoing_part(t, tail, stack, ptr, 8); stack += 8u; } x64_sync_slot(aregs, &next_int, &next_fp); @@ -2163,7 +2175,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp); x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); - x64_store_outgoing_part(t, stack, tmpreg, part->size); + x64_store_outgoing_part(t, tail, stack, tmpreg, part->size); stack += 8u; } continue; @@ -2199,7 +2211,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp); x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); - x64_store_outgoing_part(t, stack, tmpreg, part->size); + x64_store_outgoing_part(t, tail, stack, tmpreg, part->size); stack += 8u; x64_sync_slot(aregs, &next_int, &next_fp); } @@ -2207,8 +2219,14 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } x64_emit_reg_arg_moves(t, moves, nmoves); if (abi && abi->has_sret && desc->nresults) { + /* sret pointer in the first int-arg reg. A tail call forwards the + * caller's own incoming sret pointer (spilled at entry); otherwise pass + * the address of this call's result slot. 
*/ NativeLoc sret = x64_reg_loc(i64t, NATIVE_REG_INT, aregs->int_args[0]); - x64_addr_of_loc(t, sret, desc->results[0]); + if (tail) + x64_load_part(t, sret, x64_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8); + else + x64_addr_of_loc(t, sret, desc->results[0]); } /* Variadic call: AL = number of vector regs used. */ if (abi && abi->variadic) @@ -2249,10 +2267,47 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } } +/* Emit a sibling (tail) call: tear the frame down and jump (no call) to the + * callee. Outgoing args are already in arg regs / the caller's incoming-arg + * window. `leave` (mov rsp,rbp; pop rbp) restores the caller's rbp and leaves + * rsp at the return address — frame_size-independent, so no func_end patch. */ +static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) { + X64NativeTarget* a = x64_of(t); + MCEmitter* mc = t->mc; + ObjSecId sec = mc->section_id; + if (a->ncallee_saves) + x64_panic(a, "tail call with callee-saves (O1 path) not implemented"); + emit_leave(mc); + if (callee.kind == NATIVE_LOC_GLOBAL) { + u8 op = X64_OPC_JMP_REL32; + u32 disp_pos; + mc->emit_bytes(mc, &op, 1); + disp_pos = mc->pos(mc); + emit_u32le(mc, 0); + mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, callee.v.global.sym, + callee.v.global.addend - 4, 1, 0); + } else if (callee.kind == NATIVE_LOC_REG) { + u32 r = loc_reg(callee); /* indirect callee was staged in r11 by plan_call */ + if (r & 8u) { + u8 rex = X64_REX_BASE | X64_REX_B; + mc->emit_bytes(mc, &rex, 1); + } + { + u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; /* jmp r/m, /4 */ + mc->emit_bytes(mc, buf, 2); + } + } else { + x64_panic(a, "unsupported tail call target"); + } +} + static void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) { MCEmitter* mc = t->mc; ObjSecId sec = mc->section_id; - if (plan->flags & CG_CALL_TAIL) x64_panic(x64_of(t), "tail call not implemented"); + if (plan->flags & CG_CALL_TAIL) { + x64_emit_tail_site(t, plan->callee); + return; 
+ } if (plan->callee.kind == NATIVE_LOC_GLOBAL) { u8 op = X64_OPC_CALL_REL32; u32 disp_pos; @@ -3564,10 +3619,43 @@ static void x64_bind_param(NativeDirectTarget* d, const CGParamDesc* p, x64_bind_native_param(d->native, p, dst); } +/* A sibling call is realizable when its outgoing stack-argument area fits the + * window the caller itself received. Register-only calls always qualify. */ static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { - (void)d; - (void)call; - return "x64 tail calls not implemented yet"; + X64NativeTarget* a = x64_of(d->native); + NativeCallDesc nd; + NativeLoc* args = NULL; + NativeLoc* results = NULL; + u32 i, stack; + if (a->ncallee_saves) + return "x64 tail call: callee-saved registers in use"; + memset(&nd, 0, sizeof nd); + if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); + if (call->nresults) + results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults); + for (i = 0; i < call->nargs; ++i) { + args[i].kind = NATIVE_LOC_FRAME; + args[i].type = d->locals[call->args[i] - 1u].type; + args[i].cls = d->locals[call->args[i] - 1u].cls; + args[i].v.frame = d->locals[call->args[i] - 1u].home; + } + for (i = 0; i < call->nresults; ++i) { + results[i].kind = NATIVE_LOC_FRAME; + results[i].type = d->locals[call->results[i] - 1u].type; + results[i].cls = d->locals[call->results[i] - 1u].cls; + results[i].v.frame = d->locals[call->results[i] - 1u].home; + } + nd.fn_type = call->fn_type; + nd.args = args; + nd.results = results; + nd.nargs = call->nargs; + nd.nresults = call->nresults; + stack = x64_call_stack_size(d->native, &nd); + /* x64_call_stack_size includes the shadow-space prefix; the caller's incoming + * window has the same prefix, so compare against incoming_stack_size + it. 
*/ + if (stack > a->incoming_stack_size + a->abi->shadow_space) + return "x64 tail call: stack argument area too small"; + return NULL; } /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`,