rv64+x64: implement sibling (tail) calls - kit

commit 622485a9c40384eb68d2a018fb5aa7279a169343
parent 910583a6980b0af766d502b20a688260efa409ec
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 08:21:36 -0700

rv64+x64: implement sibling (tail) calls

rv_no_tail now checks realizability (outgoing stack args must fit the caller's
incoming-arg window) instead of always blocking; rv_plan_call routes tail
outgoing stack args into that window ([s0+16+va_save+off]) and forwards the
caller's incoming sret pointer; rv_emit_call tears the frame down to the
caller's entry state (restore ra, set sp to the CFA via s0, restore s0; this is
frame_size-independent at -O0, where there are no callee-saves) and
sibling-jumps (auipc+jr / jr reg).
test-toy X-O0 rv64: 256 pass, 0 fail, 0 skip (all musttail/tail cases green).

x64: implement sibling (tail) calls

x64_no_tail checks realizability (outgoing stack args fit the caller's
incoming-arg window, accounting for shadow space); plan_call routes tail
outgoing stack args into that window ([rbp+16+off]) and forwards the caller's
incoming sret pointer; emit_call tears the frame down with leave (restoring
the caller's rbp, leaving rsp at the return address — frame_size-independent)
and jmps the callee (jmp rel32 / jmp r/m via the r11-staged indirect callee).
test-toy X-O0 x64: 256 pass, 0 fail, 0 skip.

Diffstat:
M src/arch/rv64/native.c  | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/arch/x64/native.c  | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------

2 files changed, 193 insertions(+), 26 deletions(-)
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -1668,14 +1668,23 @@ static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   rv_load_addr(t, dst, addr);
 }
 
-static void rv_store_outgoing_part(NativeTarget* t, u32 stack_off, NativeLoc src,
-                                   u32 size) {
+static void rv_store_outgoing_part(NativeTarget* t, int tail_call, u32 stack_off,
+                                   NativeLoc src, u32 size) {
   NativeAddr addr;
   memset(&addr, 0, sizeof addr);
   addr.base_kind = NATIVE_ADDR_BASE_REG;
-  addr.base.reg = RV_SP;
   addr.base_type = src.type;
-  addr.offset = (i32)stack_off;
+  if (tail_call) {
+    /* Sibling call: the caller's frame is reused, so the part is written into
+     * the caller's own incoming-arg window ([s0 + 16 + va_save + off]) — the
+     * same address the tail-callee will read at [sp+off] once the teardown
+     * has restored sp to the caller's entry sp (the CFA). */
+    addr.base.reg = RV_S0;
+    addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
+  } else {
+    addr.base.reg = RV_SP; /* ordinary call: outgoing area at [sp + off] */
+    addr.offset = (i32)stack_off;
+  }
   rv_emit_mem(rv_of(t), 0, src, addr, rv_mem_for_type(t, src.type, size));
 }
 
@@ -1871,6 +1880,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
      * integer argument (a0), so the real args start at a1. */
     u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
     u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
+    int tail = (desc->flags & CG_CALL_TAIL) != 0;
     RvArgMove moves[RV_MAX_REG_ARG_MOVES];
     for (i = 0; i < desc->nargs; ++i) {
       ABIArgInfo tmp;
@@ -1883,7 +1893,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         u32 n = rv_class_stack_size(ai), off = 0;
         while (off < n) {
           rv_load_part(t, tmpreg, desc->args[i], off, 8);
-          rv_store_outgoing_part(t, stack + off, tmpreg, 8);
+          rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8);
           off += 8;
         }
         stack += n;
@@ -1900,7 +1910,7 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         } else {
           NativeLoc ptr = rv_reg_loc(i64t, NATIVE_REG_INT, RV_TMP0);
           rv_addr_of_loc(t, ptr, desc->args[i]);
-          rv_store_outgoing_part(t, stack, ptr, 8);
+          rv_store_outgoing_part(t, tail, stack, ptr, 8);
           stack += 8u;
         }
         continue;
@@ -1924,16 +1934,22 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
           NativeLoc tmpreg = rv_reg_loc(desc->args[i].type, cls, tmp);
           rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
           stack = align_up_u32(stack, rv_part_stack_align(part));
-          rv_store_outgoing_part(t, stack, tmpreg, part->size);
+          rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
           stack += rv_part_stack_size(part);
         }
       }
     }
     rv_emit_reg_arg_moves(t, moves, nmoves);
     if (abi && abi->has_sret && desc->nresults) {
-      /* sret pointer goes in a0; arg loads have completed. */
+      /* sret pointer goes in a0; arg loads have completed. A tail call forwards
+       * the caller's own incoming sret pointer (spilled at entry) so the
+       * sibling writes the result into the caller's caller's destination;
+       * otherwise pass the address of this call's result slot. */
       NativeLoc a0 = rv_reg_loc(i64t, NATIVE_REG_INT, RV_A0);
-      rv_addr_of_loc(t, a0, desc->results[0]);
+      if (tail)
+        rv_load_part(t, a0, rv_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8);
+      else
+        rv_addr_of_loc(t, a0, desc->results[0]);
     }
   }
   if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
@@ -1967,10 +1983,40 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   }
 }
 
+/* Emit a sibling (tail) call: tear the frame down to the caller's entry state
+ * and jump (no link) to the callee. Outgoing args are already in the arg regs /
+ * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
+ * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
+ * the not-yet-final frame_size — so no func_end patching is needed. */
+static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+  RvNativeTarget* a = rv_of(t);
+  MCEmitter* mc = t->mc;
+  i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a)); /* s0 -> CFA */
+  if (a->ncallee_saves) /* this teardown restores only ra, sp and s0 */
+    rv_panic(a, "tail call with callee-saves (O1 path) not implemented");
+  rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));     /* ra = [s0 + 8] */
+  rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa)); /* sp = s0 + cfa (the CFA) */
+  rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));     /* s0 = [s0]; last, s0 is base */
+  if (callee.kind == NATIVE_LOC_GLOBAL) {
+    u32 pos = mc->pos(mc); /* the relocation is anchored at the auipc */
+    rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
+    rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0)); /* rd = zero: no link */
+    mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
+                      callee.v.global.addend, 0, 0);
+  } else if (callee.kind == NATIVE_LOC_REG) {
+    rv64_emit32(mc, rv_jalr(RV_ZERO, loc_reg(callee), 0)); /* jr reg, no link */
+  } else {
+    rv_panic(a, "unsupported tail call target");
+  }
+}
+
 static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   MCEmitter* mc = t->mc;
   ObjSecId sec = mc->section_id;
-  if (plan->flags & CG_CALL_TAIL) rv_panic(rv_of(t), "tail call not implemented");
+  if (plan->flags & CG_CALL_TAIL) {
+    rv_emit_tail_site(t, plan->callee);
+    return;
+  }
   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
     u32 pos = mc->pos(mc);
     rv64_emit32(mc, rv_auipc(RV_RA, 0));
@@ -3257,10 +3303,43 @@ static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   rv_bind_native_param(d->native, p, dst);
 }
 
+/* A sibling call is realizable when its outgoing stack-argument area fits the
+ * window the caller itself received (so the args land in the caller's incoming
+ * slots without overflowing into the caller's caller's frame). Register-only
+ * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */
 static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
-  (void)d;
-  (void)call;
-  return "rv64 tail calls not implemented yet";
+  RvNativeTarget* a = rv_of(d->native);
+  NativeCallDesc nd; /* scratch descriptor, used only to size the stack args */
+  NativeLoc* args = NULL;
+  NativeLoc* results = NULL;
+  u32 i, stack;
+  if (a->ncallee_saves) /* the tail teardown does not handle callee-saves */
+    return "rv64 tail call: callee-saved registers in use";
+  memset(&nd, 0, sizeof nd);
+  if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+  if (call->nresults)
+    results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+  for (i = 0; i < call->nargs; ++i) { /* args[] entries index locals from 1 */
+    args[i].kind = NATIVE_LOC_FRAME;
+    args[i].type = d->locals[call->args[i] - 1u].type;
+    args[i].cls = d->locals[call->args[i] - 1u].cls;
+    args[i].v.frame = d->locals[call->args[i] - 1u].home;
+  }
+  for (i = 0; i < call->nresults; ++i) { /* results[] use the same 1-based ids */
+    results[i].kind = NATIVE_LOC_FRAME;
+    results[i].type = d->locals[call->results[i] - 1u].type;
+    results[i].cls = d->locals[call->results[i] - 1u].cls;
+    results[i].v.frame = d->locals[call->results[i] - 1u].home;
+  }
+  nd.fn_type = call->fn_type;
+  nd.args = args;
+  nd.results = results;
+  nd.nargs = call->nargs;
+  nd.nresults = call->nresults;
+  stack = rv_call_stack_size(d->native, &nd); /* outgoing stack-arg area */
+  if (stack > a->incoming_stack_size)
+    return "rv64 tail call: stack argument area too small";
+  return NULL; /* no objection: the call can be emitted as a sibling call */
 }
 
 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1850,15 +1850,26 @@ static void x64_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   x64_load_addr(t, dst, addr);
 }
 
-static void x64_store_outgoing_part(NativeTarget* t, u32 stack_off,
-                                    NativeLoc src, u32 size) {
+static void x64_store_outgoing_part(NativeTarget* t, int tail_call,
+                                    u32 stack_off, NativeLoc src, u32 size) {
+  X64NativeTarget* a = x64_of(t);
   NativeAddr addr;
   memset(&addr, 0, sizeof addr);
   addr.base_kind = NATIVE_ADDR_BASE_REG;
-  addr.base.reg = X64_RSP;
   addr.base_type = src.type;
-  addr.offset = (i32)stack_off;
-  x64_emit_mem(x64_of(t), 0, src, addr, x64_mem_for_type(t, src.type, size));
+  if (tail_call) {
+    /* A sibling call reuses the caller's frame: its outgoing stack args land in
+     * the caller's incoming-arg window. `stack_off` already includes the
+     * shadow-space prefix (the outgoing cursor starts at shadow_space), so the
+     * window address is [rbp + 16 + stack_off] — the same bytes the tail-callee
+     * reads once `leave` has restored rsp to the return address. */
+    addr.base.reg = X64_RBP;
+    addr.offset = (i32)(16u + stack_off); /* skip saved rbp + return address */
+  } else {
+    addr.base.reg = X64_RSP; /* ordinary call: outgoing area at [rsp + off] */
+    addr.offset = (i32)stack_off;
+  }
+  x64_emit_mem(a, 0, src, addr, x64_mem_for_type(t, src.type, size));
 }
 
 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
@@ -2128,6 +2139,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   {
     u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
     u32 next_fp = 0, stack = aregs->shadow_space, nmoves = 0, i;
+    int tail = (desc->flags & CG_CALL_TAIL) != 0;
     u16 p;
     X64ArgMove moves[X64_MAX_REG_ARG_MOVES];
     x64_sync_slot(aregs, &next_int, &next_fp);
@@ -2148,7 +2160,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         } else {
           NativeLoc ptr = x64_reg_loc(i64t, NATIVE_REG_INT, X64_RAX);
           x64_addr_of_loc(t, ptr, desc->args[i]);
-          x64_store_outgoing_part(t, stack, ptr, 8);
+          x64_store_outgoing_part(t, tail, stack, ptr, 8);
           stack += 8u;
         }
         x64_sync_slot(aregs, &next_int, &next_fp);
@@ -2163,7 +2175,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
           Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
           NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp);
           x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
-          x64_store_outgoing_part(t, stack, tmpreg, part->size);
+          x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
           stack += 8u;
         }
         continue;
@@ -2199,7 +2211,7 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
           Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
           NativeLoc tmpreg = x64_reg_loc(desc->args[i].type, cls, tmp);
           x64_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
-          x64_store_outgoing_part(t, stack, tmpreg, part->size);
+          x64_store_outgoing_part(t, tail, stack, tmpreg, part->size);
           stack += 8u;
           x64_sync_slot(aregs, &next_int, &next_fp);
         }
@@ -2207,8 +2219,14 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
     }
     x64_emit_reg_arg_moves(t, moves, nmoves);
     if (abi && abi->has_sret && desc->nresults) {
+      /* sret pointer in the first int-arg reg. A tail call forwards the
+       * caller's own incoming sret pointer (spilled at entry); otherwise pass
+       * the address of this call's result slot. */
       NativeLoc sret = x64_reg_loc(i64t, NATIVE_REG_INT, aregs->int_args[0]);
-      x64_addr_of_loc(t, sret, desc->results[0]);
+      if (tail)
+        x64_load_part(t, sret, x64_stack_loc(i64t, a->sret_ptr_slot, 0), 0, 8);
+      else
+        x64_addr_of_loc(t, sret, desc->results[0]);
     }
     /* Variadic call: AL = number of vector regs used. */
     if (abi && abi->variadic)
@@ -2249,10 +2267,47 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   }
 }
 
+/* Emit a sibling (tail) call: tear the frame down and jump (no call) to the
+ * callee. Outgoing args are already in arg regs / the caller's incoming-arg
+ * window. `leave` (mov rsp,rbp; pop rbp) restores the caller's rbp and leaves
+ * rsp at the return address — frame_size-independent, so no func_end patch. */
+static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+  X64NativeTarget* a = x64_of(t);
+  MCEmitter* mc = t->mc;
+  ObjSecId sec = mc->section_id;
+  if (a->ncallee_saves) /* `leave` alone would not restore callee-saves */
+    x64_panic(a, "tail call with callee-saves (O1 path) not implemented");
+  emit_leave(mc); /* frame teardown: rsp ends at the return address */
+  if (callee.kind == NATIVE_LOC_GLOBAL) {
+    u8 op = X64_OPC_JMP_REL32;
+    u32 disp_pos;
+    mc->emit_bytes(mc, &op, 1);
+    disp_pos = mc->pos(mc); /* reloc site: the rel32 field after the opcode */
+    emit_u32le(mc, 0);      /* rel32 placeholder */
+    mc->emit_reloc_at(mc, sec, disp_pos, R_X64_PLT32, callee.v.global.sym,
+                      callee.v.global.addend - 4, 1, 0);
+  } else if (callee.kind == NATIVE_LOC_REG) {
+    u32 r = loc_reg(callee); /* indirect callee was staged in r11 by plan_call */
+    if (r & 8u) { /* register number >= 8: REX.B prefix carries bit 3 */
+      u8 rex = X64_REX_BASE | X64_REX_B;
+      mc->emit_bytes(mc, &rex, 1);
+    }
+    {
+      u8 buf[2] = {X64_OP_JMP_RM64, modrm(3u, 4u, r & 7u)}; /* jmp r/m, /4 */
+      mc->emit_bytes(mc, buf, 2);
+    }
+  } else {
+    x64_panic(a, "unsupported tail call target");
+  }
+}
+
 static void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   MCEmitter* mc = t->mc;
   ObjSecId sec = mc->section_id;
-  if (plan->flags & CG_CALL_TAIL) x64_panic(x64_of(t), "tail call not implemented");
+  if (plan->flags & CG_CALL_TAIL) {
+    x64_emit_tail_site(t, plan->callee);
+    return;
+  }
   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
     u8 op = X64_OPC_CALL_REL32;
     u32 disp_pos;
@@ -3564,10 +3619,43 @@ static void x64_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   x64_bind_native_param(d->native, p, dst);
 }
 
+/* A sibling call is realizable when its outgoing stack-argument area fits the
+ * window the caller itself received. Register-only calls always qualify. */
 static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
-  (void)d;
-  (void)call;
-  return "x64 tail calls not implemented yet";
+  X64NativeTarget* a = x64_of(d->native);
+  NativeCallDesc nd; /* scratch descriptor, used only to size the stack args */
+  NativeLoc* args = NULL;
+  NativeLoc* results = NULL;
+  u32 i, stack;
+  if (a->ncallee_saves) /* the tail teardown does not handle callee-saves */
+    return "x64 tail call: callee-saved registers in use";
+  memset(&nd, 0, sizeof nd);
+  if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+  if (call->nresults)
+    results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+  for (i = 0; i < call->nargs; ++i) { /* args[] entries index locals from 1 */
+    args[i].kind = NATIVE_LOC_FRAME;
+    args[i].type = d->locals[call->args[i] - 1u].type;
+    args[i].cls = d->locals[call->args[i] - 1u].cls;
+    args[i].v.frame = d->locals[call->args[i] - 1u].home;
+  }
+  for (i = 0; i < call->nresults; ++i) { /* results[] use the same 1-based ids */
+    results[i].kind = NATIVE_LOC_FRAME;
+    results[i].type = d->locals[call->results[i] - 1u].type;
+    results[i].cls = d->locals[call->results[i] - 1u].cls;
+    results[i].v.frame = d->locals[call->results[i] - 1u].home;
+  }
+  nd.fn_type = call->fn_type;
+  nd.args = args;
+  nd.results = results;
+  nd.nargs = call->nargs;
+  nd.nresults = call->nresults;
+  stack = x64_call_stack_size(d->native, &nd);
+  /* x64_call_stack_size includes the shadow-space prefix; the caller's incoming
+   * window has the same prefix, so add shadow_space to the bound. */
+  if (stack > a->incoming_stack_size + a->abi->shadow_space)
+    return "x64 tail call: stack argument area too small";
+  return NULL; /* no objection: the call can be emitted as a sibling call */
 }
 
 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`,

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/rv64/native.c	\|	105	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M	src/arch/x64/native.c	\|	114	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------