commit 7007eef93b526756a09f98af5cbd01096fdf1b7e parent 912dbabac0f3bde2a9d89a4a0613dd38064e77ef Author: Ryan Sepassi <rsepassi@gmail.com> Date: Mon, 25 May 2026 13:44:38 -0700 cg: support tail-call realizability Diffstat:
37 files changed, 617 insertions(+), 170 deletions(-)
diff --git a/include/cfree/cg.h b/include/cfree/cg.h @@ -735,10 +735,43 @@ CFREE_API void cfree_cg_float_to_uint(CfreeCg*, CfreeCgTypeId dst, * Calls and Returns * ============================================================ */ +/* Tail-call policy for a call site. + * + * A tail call is a TERMINATOR: it ends the current function. It pushes no + * result onto the value stack, and the caller must not emit cfree_cg_ret + * after it — the call is the return. + * + * Two distinct conditions govern whether a tail call is legal: + * + * - Signature match is a PRECONDITION the frontend must guarantee: the + * callee's return type must be ABI-compatible with the enclosing + * function's declared return type, and the call must sit in return + * position. A violation is a frontend bug; CG aborts (compiler_panic) + * regardless of policy — it is never treated as a fallback case. + * + * - ABI realizability is DISCOVERED during emission and is target-specific: + * can control transfer to the callee while reusing (and tearing down) the + * current frame, with the callee's outgoing argument area and return + * mechanism fitting the space the caller itself received? sret and + * variadic are NOT inherent blockers — an sret return is forwarded via the + * function's own incoming sret pointer, and a variadic callee is fine when + * its argument area fits the caller's incoming area. The usual blocker is + * an outgoing argument area that exceeds that space; some are arch-specific + * (e.g. wasm packs varargs into a caller-frame buffer that a sibling call + * would dangle). This is what ALLOWED vs MUST governs. + */ typedef enum CfreeCgTailPolicy { + /* Ordinary call. Result is pushed; not a terminator. */ CFREE_CG_TAIL_DEFAULT, + /* Tail-call (terminator) if ABI-realizable; otherwise silently emit an + * ordinary call and synthesize the caller's return of the result. Never + * fails for realizability. */ CFREE_CG_TAIL_ALLOWED, + /* Tail-call (terminator); if not ABI-realizable, CG fails with a + * diagnostic naming the reason. Never silently degrades to an ordinary + * call. */ CFREE_CG_TAIL_MUST, + /* Ordinary call even in tail position. Treated as DEFAULT. */ CFREE_CG_TAIL_NEVER, } CfreeCgTailPolicy; @@ -756,8 +789,13 @@ typedef struct CfreeCgCallAttrs { /* cfree_cg_call pops a computed function pointer plus nargs arguments. * cfree_cg_call_symbol emits a direct call to the declared function symbol, * allowing the backend/linker to choose PLT/stub/IAT/direct/IFUNC handling. - * MUST tail calls should fail diagnostically if the ABI shapes are not - * compatible. */ + * + * For tail policies (see CfreeCgTailPolicy): the call is a terminator that + * pushes no result. A MUST tail call that is not ABI-realizable fails with a + * diagnostic; an ALLOWED tail call that is not realizable silently degrades + * to an ordinary call followed by a synthesized return of the result. A + * tail call whose callee return type is incompatible with the enclosing + * function's return type is a frontend bug and aborts under any policy. */ CFREE_API void cfree_cg_call(CfreeCg*, uint32_t nargs, CfreeCgTypeId fn_type, CfreeCgCallAttrs attrs); CFREE_API void cfree_cg_call_symbol(CfreeCg*, CfreeCgSym sym, uint32_t nargs, diff --git a/lang/toy/builtins.c b/lang/toy/builtins.c @@ -377,15 +377,22 @@ static CfreeCgTypeId toy_parse_call_builtin(ToyParser* p) { return CFREE_CG_TYPE_NONE; } - if (direct) - cfree_cg_call_symbol(p->cg, fn->sym, (uint32_t)nargs, attrs); - else - cfree_cg_call(p->cg, (uint32_t)nargs, fn_ty, attrs); if (!ret_toy_type) ret_toy_type = toy_type_from_cg(p, ret_ty); if (tail) { p->tail_call_expr = 1; p->tail_call_ret_toy = ret_toy_type; + if (p->cur_fn_ret_toy != TOY_TYPE_NONE && + ret_toy_type != TOY_TYPE_NONE && + !toy_type_accepts_type(p, p->cur_fn_ret_toy, ret_toy_type)) { + p->last_type = ret_toy_type; + return ret_ty; + } } + + if (direct) + cfree_cg_call_symbol(p->cg, fn->sym, (uint32_t)nargs, attrs); + else + cfree_cg_call(p->cg, (uint32_t)nargs, fn_ty, attrs); p->last_type = ret_toy_type; return ret_ty; } diff --git a/lang/toy/lexer.c b/lang/toy/lexer.c @@ -260,6 +260,10 @@ ToyToken toy_lexer_next(ToyLexer* lex) { else if (len == 4 && start[0] == 't' && start[1] == 'a' && start[2] == 'i' && start[3] == 'l') kind = TOK_TAIL; + else if (len == 8 && start[0] == 'm' && start[1] == 'u' && + start[2] == 's' && start[3] == 't' && start[4] == 't' && + start[5] == 'a' && start[6] == 'i' && start[7] == 'l') + kind = TOK_MUSTTAIL; else if (len == 4 && start[0] == 't' && start[1] == 'y' && start[2] == 'p' && start[3] == 'e') kind = TOK_TYPE; diff --git a/lang/toy/lexer.h b/lang/toy/lexer.h @@ -22,6 +22,7 @@ typedef enum ToyTokenKind { TOK_CONTINUE, TOK_RETURN, TOK_TAIL, + TOK_MUSTTAIL, TOK_TYPE, TOK_PUB, TOK_EXTERN, diff --git a/lang/toy/parser.c b/lang/toy/parser.c @@ -1320,14 +1320,19 @@ static int toy_parse_continue_stmt(ToyParser* p) { static int toy_parse_return_stmt(ToyParser* p) { CfreeCgTypeId ty; toy_parser_advance(p); /* return */ - if (toy_parser_match(p, TOK_TAIL)) { + int is_tail = p->cur.kind == TOK_TAIL; + int is_musttail = p->cur.kind == TOK_MUSTTAIL; + if (is_tail || is_musttail) { + int must_tail = is_musttail; CfreeSym name; ToyFn* fn; ToyToken call_tok; CfreeCgTypeId fn_ty; size_t nargs = 0; + toy_parser_advance(p); /* tail | musttail */ if (p->cur.kind != TOK_IDENT) { - toy_error(p, p->cur.loc, "expected function name after tail"); + toy_error(p, p->cur.loc, "expected function name after %s", + must_tail ? "musttail" : "tail"); return 0; } call_tok = p->cur; @@ -1349,10 +1354,10 @@ static int toy_parse_return_stmt(ToyParser* p) { return 0; } } - if (cfree_cg_type_func_is_variadic(p->c, fn_ty)) { - toy_error(p, call_tok.loc, "tail call to variadic function unsupported"); - return 0; - } + /* Variadic tail calls are not rejected here: realizability is a per-target + * decision owned by CG's precondition and the target's tail_call hook + * (e.g. native fits varargs in the caller's incoming area; wasm cannot, + * since its vararg buffer lives in the frame a sibling call tears down). */ if (!toy_parse_call_args(p, call_tok, fn_ty, fn ? fn->toy_params : NULL, fn ? fn->nparams : 0, &nargs)) return 0; if (fn && !toy_type_accepts_type(p, p->cur_fn_ret_toy, fn->toy_ret)) { @@ -1363,10 +1368,17 @@ static int toy_parse_return_stmt(ToyParser* p) { toy_error(p, p->cur.loc, "tail call signature mismatch"); return 0; } - if (fn) - cfree_cg_tail_call_symbol(p->cg, fn->sym, (uint32_t)nargs); - else - cfree_cg_tail_call(p->cg, (uint32_t)nargs, fn_ty); + if (fn) { + if (must_tail) + cfree_cg_musttail_call_symbol(p->cg, fn->sym, (uint32_t)nargs); + else + cfree_cg_tail_call_symbol(p->cg, fn->sym, (uint32_t)nargs); + } else { + if (must_tail) + cfree_cg_musttail_call(p->cg, (uint32_t)nargs, fn_ty); + else + cfree_cg_tail_call(p->cg, (uint32_t)nargs, fn_ty); + } if (!toy_parser_expect(p, TOK_SEMI)) return 0; return 1; } diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -1418,6 +1418,24 @@ static u32 aa_call_stack_size(CGTarget* t, const CGCallDesc* d) { return (stack_off + 15u) & ~15u; } +/* Realizability of a sibling call (see CGTarget.tail_call_unrealizable_reason). + * The callee's outgoing stack arguments must fit the area this function itself + * received (next_param_stack); the tail prologue restore reuses those slots. + * Variadic callees need no special handling — their arguments are placed by + * the ordinary register/stack rules and the same fit check covers them. sret + * callees are realizable too: aa_call forwards this function's own incoming + * sret pointer (the return-shape precondition guarantees it matches). */ +static const char* aa_tail_call_unrealizable_reason(CGTarget* t, + const CGCallDesc* d) { + AAImpl* a = impl_of(t); + u32 next_int = 0, next_fp = 0, stack_off = 0; + for (u32 i = 0; i < d->nargs; ++i) + count_arg_stack(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off); + if (stack_off > a->next_param_stack) + return "tail call stack arguments exceed the caller's parameter area"; + return NULL; +} + static u32 aa_collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) { u32 n = 0; for (u32 r = first; r <= last; ++r) { @@ -1495,7 +1513,10 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) { u32 next_int = 0, next_fp = 0, stack_off = 0; - if (d->abi && d->abi->has_sret) { + /* Ordinary sret call: point x8 at the caller-provided destination local. + * A tail call instead forwards this function's own incoming sret pointer + * (handled below), so skip this here. */ + if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { if (d->ret.storage.kind != OPK_LOCAL) { compiler_panic(t->c, a->loc, "aarch64 call: sret destination must be LOCAL"); @@ -1520,8 +1541,21 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) { } if (d->flags & CG_CALL_TAIL) { - if (d->abi && d->abi->has_sret) - compiler_panic(t->c, a->loc, "aarch64 tail call: sret unsupported"); + if (d->abi && d->abi->has_sret) { + /* Forward this function's own incoming sret pointer (spilled to + * sret_ptr_slot at entry) into x8 for the callee. The return-shape + * precondition guarantees the callee writes the same type, so the + * forwarded pointer is correct. Load while x29 still addresses this + * frame, before aa_tail_branch tears it down; x8 is untouched by the + * frame restore and any indirect-callee move (AA_TMP0 = x9). */ + AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE) + ? aa64_slot_get(a, a->sret_ptr_slot) + : NULL; + if (!s) + compiler_panic(t->c, a->loc, + "aarch64 tail call: missing incoming sret slot"); + aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off)); + } aa_check_tail_stack_args(t, stack_off); aa_tail_branch(t, d->callee); return; @@ -1610,8 +1644,18 @@ static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) { MCEmitter* mc = t->mc; if (p->flags & CG_CALL_TAIL) { - if (p->has_sret) - compiler_panic(t->c, a->loc, "aarch64 tail call: sret unsupported"); + if (p->has_sret) { + /* Forward the function's own incoming sret pointer into x8 (see the + * O0 path in aa_call). Load before aa_tail_branch tears the frame + * down; x8 survives the restore and any indirect-callee move. */ + AASlot* s = (a->sret_ptr_slot != FRAME_SLOT_NONE) + ? aa64_slot_get(a, a->sret_ptr_slot) + : NULL; + if (!s) + compiler_panic(t->c, a->loc, + "aarch64 tail call: missing incoming sret slot"); + aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)s->off)); + } aa_check_tail_stack_args(t, aa_call_plan_stack_raw_size(p)); aa_tail_branch(t, p->callee); return; @@ -2781,6 +2825,7 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->store_call_arg = aa_store_call_arg; t->store_call_ret = aa_store_call_ret; t->call_stack_size = aa_call_stack_size; + t->tail_call_unrealizable_reason = aa_tail_call_unrealizable_reason; t->ret = aa_ret; t->alloca_ = aa_alloca_; diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c @@ -222,7 +222,10 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? cap : 1u); out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4); u32 next_int = 0, next_fp = 0, stack = 0; - if (d->abi && d->abi->has_sret) { + /* Ordinary sret call: pass the destination address in x8. A tail call + * instead forwards the function's own incoming sret pointer (handled in + * aa_emit_call_plan), and ret.storage is the void sentinel, so skip it. */ + if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { CGCallPlanMove* m = &out->args[out->nargs++]; m->src = d->ret.storage; m->src_kind = CG_CALL_PLAN_SRC_ADDR; diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -394,8 +394,21 @@ typedef enum CGCallFlag { /* Sibling call. The target emits the caller's epilogue, transfers * control to the callee (B/JUMP26 for direct, BR Xn for indirect), and * does NOT emit a return-style continuation. CG will not invoke - * target->ret afterwards. The target panics if it cannot honor the - * request — legality is the caller's responsibility (see cg_tail_call). */ + * target->ret afterwards. + * + * Realizability is verified before this flag is set: CG only sets it after + * tail_call_unrealizable_reason() returns NULL for the same desc and frame + * state, so the target can emit the sibling call unconditionally. The + * target may assert/compiler_panic if the flag is set on an unrealizable + * desc, but that is an internal-consistency check — fallback and + * diagnostics for unrealizable tail calls are CG's responsibility, not the + * target's. + * + * When the desc has an sret return (abi->has_sret), a tail call forwards + * the function's own incoming sret pointer to the callee's sret slot rather + * than pointing at a fresh local; ret.storage is the void sentinel and must + * not be used. CG's return-shape precondition guarantees the forwarded + * pointer matches the callee's expectation. */ CG_CALL_TAIL = 1u << 0, } CGCallFlag; @@ -405,8 +418,12 @@ typedef struct CGCallDesc { Operand callee; const CGABIValue* args; u32 nargs; - u16 flags; /* CGCallFlag */ - u16 pad; + u16 flags; /* CGCallFlag */ + u8 tail_policy; /* CfreeCgTailPolicy; meaningful when CG_CALL_TAIL is set. + * The opt recorder accepts every tail and preserves this so + * the replay can pick: emit tail (realizable), fall back to + * call+ret (ALLOWED), or diagnose (MUST). */ + u8 pad; CfreeCgInlinePolicy inline_policy; CGABIValue ret; } CGCallDesc; @@ -882,6 +899,34 @@ struct CGTarget { * for direct, indirect/byval, sret, split, and multi-register values. * `callee.kind == OPK_GLOBAL` is direct; any other kind is indirect. */ void (*call)(CGTarget*, const CGCallDesc*); + /* Pure query: can `d` be emitted as a sibling (tail) call on this target, + * given the current frame state? Returns NULL if yes; otherwise a short, + * static, human-readable string naming the blocker, used verbatim in the + * musttail diagnostic. Must not emit code and must not abort. + * + * Realizable means the target can transfer control to the callee while + * reusing (and tearing down) the current frame, with the callee's outgoing + * argument area and return mechanism fitting the space the caller itself + * received. CG guarantees the return shapes already match (a frontend + * precondition), so neither sret nor variadic is an inherent blocker: + * - sret: realized by forwarding the function's own incoming sret pointer + * to the callee's sret slot — sound because both sides return the same + * type via sret. Not a blocker on its own. + * - variadic: realizable when the callee's (variadic) argument area fits + * the caller's incoming parameter area on this arch. It is a blocker + * only where the realization cannot survive the frame teardown — e.g. + * wasm packs varargs into a caller-frame buffer that a sibling call + * would dangle. + * Typical blockers are therefore stack/argument areas that exceed the + * caller's incoming space, or arch-specific constraints like the wasm + * vararg buffer. A target may also return a "not yet implemented" reason + * for shapes whose codegen it has not built yet; that is honest and safe. + * + * CG owns the tail policy: it calls this first and only sets CG_CALL_TAIL + * when it returns NULL, so a NULL result must guarantee a later call() with + * CG_CALL_TAIL can emit the sibling call. May itself be NULL, meaning the + * target supports no tail calls at all. */ + const char* (*tail_call_unrealizable_reason)(CGTarget*, const CGCallDesc*); void (*plan_call)(CGTarget*, const CGCallDesc*, CGCallPlan* out); void (*load_call_arg)(CGTarget*, Operand dst, const CGCallPlanMove*); void (*store_call_arg)(CGTarget*, const CGCallPlanMove*); diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c @@ -1229,6 +1229,23 @@ static u32 rv_call_stack_size(CGTarget* t, const CGCallDesc* d) { return (stack_off + 15u) & ~15u; } +/* Realizability of a sibling call (see CGTarget.tail_call_unrealizable_reason). + * The callee's outgoing stack arguments must fit the area this function itself + * received (next_param_stack). Variadic callees need no special handling and + * sret callees are realizable by forwarding this function's own incoming sret + * pointer (the return-shape precondition guarantees it matches). */ +static const char* rv_tail_call_unrealizable_reason(CGTarget* t, + const CGCallDesc* d) { + RImpl* a = impl_of(t); + u32 next_int = (d->abi && d->abi->has_sret) ? 1u : 0u; + u32 next_fp = 0, stack_off = 0; + for (u32 i = 0; i < d->nargs; ++i) + count_arg_stack(&d->args[i], &next_int, &next_fp, &stack_off); + if (stack_off > a->next_param_stack) + return "tail call stack arguments exceed the caller's parameter area"; + return NULL; +} + typedef struct RvTailFrameLayout { u32 max_out; u32 fp_saves_sz; @@ -1328,21 +1345,26 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) { u32 next_int = 0, next_fp = 0, stack_off = 0; - /* sret: caller passes destination pointer in a0. */ + /* sret: a0 holds the result pointer. An ordinary call points it at the + * destination local; a tail call forwards this function's own incoming sret + * pointer (loaded just before the branch below), and ret.storage is the + * void sentinel, so only reserve a0 here. */ if (d->abi && d->abi->has_sret) { - if (d->ret.storage.kind != OPK_LOCAL) { - compiler_panic(t->c, a->loc, "rv64 call: sret dst must be LOCAL"); - } - RvSlot* s = rv64_slot_get(a, d->ret.storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad sret slot"); - i32 off = -(i32)s->off; - if (off >= -2048 && off <= 2047) { - rv64_emit32(mc, rv_addi(RV_A0, RV_S0, off)); - } else { - rv64_emit_load_imm(mc, 1, RV_A0, (i64)off); - rv64_emit32(mc, rv_add(RV_A0, RV_S0, RV_A0)); - } next_int = 1; + if ((d->flags & CG_CALL_TAIL) == 0) { + if (d->ret.storage.kind != OPK_LOCAL) { + compiler_panic(t->c, a->loc, "rv64 call: sret dst must be LOCAL"); + } + RvSlot* s = rv64_slot_get(a, d->ret.storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad sret slot"); + i32 off = -(i32)s->off; + if (off >= -2048 && off <= 2047) { + rv64_emit32(mc, rv_addi(RV_A0, RV_S0, off)); + } else { + rv64_emit_load_imm(mc, 1, RV_A0, (i64)off); + rv64_emit32(mc, rv_add(RV_A0, RV_S0, RV_A0)); + } + } } for (u32 i = 0; i < d->nargs; ++i) { @@ -1358,8 +1380,17 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) { } if (d->flags & CG_CALL_TAIL) { - if (d->abi && d->abi->has_sret) - compiler_panic(t->c, a->loc, "rv64 tail call: sret unsupported"); + if (d->abi && d->abi->has_sret) { + /* Forward the incoming sret pointer into a0 (spilled to sret_ptr_slot + * at entry). Load while s0 is valid, before rv_tail_branch restores the + * frame; a0 survives the restore and is unused by the args above. */ + if (a->sret_ptr_slot == FRAME_SLOT_NONE) + compiler_panic(t->c, a->loc, + "rv64 tail call: missing incoming sret slot"); + RvSlot* s = rv64_slot_get(a, a->sret_ptr_slot); + if (!s) compiler_panic(t->c, a->loc, "rv64 tail call: bad sret slot"); + rv64_emit32(mc, rv_ld(RV_A0, RV_S0, -(i32)s->off)); + } rv_check_tail_stack_args(t, stack_off); rv_tail_branch(t, d->callee); return; @@ -1427,8 +1458,15 @@ static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) { MCEmitter* mc = t->mc; if (p->flags & CG_CALL_TAIL) { - if (p->has_sret) - compiler_panic(t->c, a->loc, "rv64 tail call: sret unsupported"); + if (p->has_sret) { + /* Forward the incoming sret pointer into a0 (see rv_call). Load before + * rv_tail_branch restores the frame; a0 survives the restore. */ + if (a->sret_ptr_slot == FRAME_SLOT_NONE) + compiler_panic(t->c, a->loc, "rv64 tail call: missing incoming sret slot"); + RvSlot* s = rv64_slot_get(a, a->sret_ptr_slot); + if (!s) compiler_panic(t->c, a->loc, "rv64 tail call: bad sret slot"); + rv64_emit32(mc, rv_ld(RV_A0, RV_S0, -(i32)s->off)); + } rv_check_tail_stack_args(t, rv_call_plan_stack_raw_size(p)); rv_tail_branch(t, p->callee); return; @@ -2431,6 +2469,7 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->store_call_arg = rv_store_call_arg; t->store_call_ret = rv_store_call_ret; t->call_stack_size = rv_call_stack_size; + t->tail_call_unrealizable_reason = rv_tail_call_unrealizable_reason; t->ret = rv_ret; t->alloca_ = rv_alloca_; diff --git a/src/arch/rv64/opt_coord.c b/src/arch/rv64/opt_coord.c @@ -201,7 +201,10 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? cap : 1u); out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4); u32 next_int = d->abi && d->abi->has_sret ? 1u : 0u, next_fp = 0, stack = 0; - if (d->abi && d->abi->has_sret) { + /* Ordinary sret call: pass the destination address in a0. A tail call + * instead forwards the function's own incoming sret pointer (handled in + * rv_emit_call_plan), and ret.storage is the void sentinel, so skip it. */ + if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { CGCallPlanMove* m = &out->args[out->nargs++]; m->src = d->ret.storage; m->src_kind = CG_CALL_PLAN_SRC_ADDR; diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -1488,6 +1488,20 @@ static const Reg g_tail_cs_fp_order_all[X64_TAIL_MAX_CS_FP_REGS] = { X64_XMM0 + 14, X64_XMM15, }; +/* Realizability of a sibling call (see CGTarget.tail_call_unrealizable_reason). + * The callee's outgoing stack arguments must fit the area this function itself + * received (next_param_stack); the tail prologue restore reuses those slots. + * Variadic callees need no special handling (AL = #XMM regs is set as usual) + * and sret callees are realizable by forwarding this function's own incoming + * sret pointer (the return-shape precondition guarantees it matches). */ +static const char* x_tail_call_unrealizable_reason(CGTarget* t, + const CGCallDesc* d) { + XImpl* a = impl_of(t); + if (x_call_stack_size(t, d) > a->next_param_stack) + return "tail call stack arguments exceed the caller's parameter area"; + return NULL; +} + static u32 x_tail_collect_cs_regs(const XImpl* a, Reg* cs_regs) { u32 cs_used = 0; u64 mask = (u64)a->used_cs_int_mask & a->abi->cs_int_mask; @@ -1566,36 +1580,32 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { XImpl* a = impl_of(t); MCEmitter* mc = t->mc; - u32 next_int = 0, next_fp = 0; - /* Win64 reserves a 32 B shadow space at [rsp+0..31] which is part of - * the caller's outgoing-arg area; the first stack-passed arg lands - * at [rsp+32]. SysV starts at [rsp+0]. */ - u32 stack_off = a->abi->shadow_space; - int requested_tail = (d->flags & CG_CALL_TAIL) != 0; - int tail_ok = 1; - if (requested_tail) { - u32 tail_stack = x_call_stack_size(t, d); - tail_ok = tail_stack <= a->next_param_stack; - } - - /* sret: caller puts the destination pointer in the first int arg reg - * (RDI on SysV, RCX on Win64). */ + u32 next_int = 0, next_fp = 0, stack_off = a->abi->shadow_space; + int is_tail = (d->flags & CG_CALL_TAIL) != 0; + + /* sret: the first integer argument register holds the result pointer. + * An ordinary call points it at the + * destination local; a tail call forwards this function's own incoming sret + * pointer (loaded just before the branch below), and ret.storage is the void + * sentinel, so only reserve the register here. */ if (d->abi && d->abi->has_sret) { - if (d->ret.storage.kind != OPK_LOCAL) { - compiler_panic(t->c, a->loc, "x64 call: sret destination must be LOCAL"); - } - XSlot* s = x64_slot_get(a, d->ret.storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "x64 call: bad sret slot"); - emit_lea(mc, a->abi->int_args[0], X64_RBP, -(i32)s->off); next_int = 1; x_call_sync_slot(a->abi, &next_int, &next_fp); + if (!is_tail) { + if (d->ret.storage.kind != OPK_LOCAL) { + compiler_panic(t->c, a->loc, + "x64 call: sret destination must be LOCAL"); + } + XSlot* s = x64_slot_get(a, d->ret.storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "x64 call: bad sret slot"); + emit_lea(mc, a->abi->int_args[0], X64_RBP, -(i32)s->off); + } } for (u32 i = 0; i < d->nargs; ++i) { - emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off, - requested_tail && tail_ok); + emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off, is_tail); } u32 needed = (stack_off + 15u) & ~15u; - if ((!requested_tail || !tail_ok) && needed > a->max_outgoing) { + if (!is_tail && needed > a->max_outgoing) { if (a->known_frame) compiler_panic(t->c, a->loc, "x64 call: known frame outgoing area too small"); @@ -1607,9 +1617,19 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { x64_emit_load_imm(mc, 0, X64_RAX, (i64)next_fp); } - if (requested_tail && tail_ok) { - if (d->abi && d->abi->has_sret) - compiler_panic(t->c, a->loc, "x64 tail call: sret unsupported"); + if (is_tail) { + if (d->abi && d->abi->has_sret) { + /* Forward the incoming sret pointer into the ABI sret register (spilled + * to sret_ptr_slot at entry). Load while rbp is valid, before + * x_tail_branch restores the frame; the register survives the restore + * and is not used by the args above. */ + if (a->sret_ptr_slot == FRAME_SLOT_NONE) + compiler_panic(t->c, a->loc, + "x64 tail call: missing incoming sret slot"); + XSlot* s = x64_slot_get(a, a->sret_ptr_slot); + if (!s) compiler_panic(t->c, a->loc, "x64 tail call: bad sret slot"); + emit_mov_load(mc, 8, 0, a->abi->int_args[0], X64_RBP, -(i32)s->off); + } x_check_tail_stack_args(t, stack_off); x_tail_branch(t, d->callee); return; @@ -1703,8 +1723,15 @@ static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) { x64_emit_load_imm(mc, 0, X64_RAX, (i64)p->variadic_fp_count); if (p->flags & CG_CALL_TAIL) { - if (p->has_sret) - compiler_panic(t->c, impl_of(t)->loc, "x64 tail call: sret unsupported"); + if (p->has_sret) { + /* Forward the incoming sret pointer into the ABI sret register (see + * x_call). Load before x_tail_branch restores the frame. */ + if (a->sret_ptr_slot == FRAME_SLOT_NONE) + compiler_panic(t->c, a->loc, "x64 tail call: missing incoming sret slot"); + XSlot* s = x64_slot_get(a, a->sret_ptr_slot); + if (!s) compiler_panic(t->c, a->loc, "x64 tail call: bad sret slot"); + emit_mov_load(mc, 8, 0, a->abi->int_args[0], X64_RBP, -(i32)s->off); + } x_check_tail_stack_args(t, x_call_plan_stack_raw_size(p)); x_tail_branch(t, p->callee); return; @@ -2884,6 +2911,7 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->store_call_arg = x_store_call_arg; t->store_call_ret = x_store_call_ret; t->call_stack_size = x_call_stack_size; + t->tail_call_unrealizable_reason = x_tail_call_unrealizable_reason; t->ret = x_ret; t->alloca_ = x_alloca_; diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c @@ -201,8 +201,6 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->callee = d->callee; out->flags = d->flags; out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0; - if ((out->flags & CG_CALL_TAIL) && out->stack_arg_size) - out->flags &= ~CG_CALL_TAIL; out->has_sret = d->abi && d->abi->has_sret; out->is_variadic = d->abi && d->abi->variadic; for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { @@ -218,7 +216,10 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { * the caller owns; the first stack-passed arg lands above it. SysV * starts at offset 0. */ u32 stack = abi->shadow_space; - if (d->abi && d->abi->has_sret) { + /* Ordinary sret call: pass the destination address. A tail call + * instead forwards the function's own incoming sret pointer (handled in + * x_emit_call_plan), and ret.storage is the void sentinel, so skip it. */ + if (d->abi && d->abi->has_sret && (d->flags & CG_CALL_TAIL) == 0) { CGCallPlanMove* m = &out->args[out->nargs++]; m->src = d->ret.storage; m->src_kind = CG_CALL_PLAN_SRC_ADDR; diff --git a/src/cg/call.c b/src/cg/call.c @@ -116,6 +116,62 @@ static void api_spill_call_clobbered_stack(CfreeCg* g, const CGCallDesc* d) { } } +/* Return-ABI compatibility between the enclosing function and a tail callee. + * A tail call only makes sense when the callee's return value flows directly + * out of the caller, so the two return shapes must match. This is a frontend + * precondition; a mismatch is a frontend bug, not a fallback case. */ +static int api_tail_ret_compatible(const ABIArgInfo* caller, + const ABIArgInfo* callee) { + if (!caller || !callee) return 0; + if (caller->kind != callee->kind) return 0; + if (caller->kind == ABI_ARG_IGNORE || caller->kind == ABI_ARG_INDIRECT) + return 1; + if (caller->nparts != callee->nparts) return 0; + for (u16 i = 0; i < caller->nparts; ++i) { + if (caller->parts[i].cls != callee->parts[i].cls) return 0; + if (caller->parts[i].size != callee->parts[i].size) return 0; + } + return 1; +} + +/* Decide whether a requested tail call (policy ALLOWED or MUST) is emitted as + * a sibling call. Returns 1 to emit a tail/terminator, 0 for the ALLOWED + * fallback to an ordinary call. Aborts on a return-shape precondition + * violation (any policy) or an unrealizable MUST tail call. The target + * authors the realizability verdict and, when it blocks, the reason string. */ +static int api_tail_decide(CfreeCg* g, const CGCallDesc* desc, + CfreeCgTailPolicy policy) { + CGTarget* T = g->target; + const ABIArgInfo* caller_ret = g->fn_abi ? &g->fn_abi->ret : NULL; + const char* reason; + if (!api_tail_ret_compatible(caller_ret, &desc->abi->ret)) { + compiler_panic(g->c, g->cur_loc, + "tail call: callee return type is incompatible with the " + "enclosing function's return type"); + return 0; + } + reason = T->tail_call_unrealizable_reason + ? T->tail_call_unrealizable_reason(T, desc) + : "target does not support tail calls"; + if (!reason) return 1; /* realizable */ + if (policy == CFREE_CG_TAIL_MUST) { + compiler_panic(g->c, g->cur_loc, "musttail call not realizable: %s", + reason); + return 0; + } + return 0; /* ALLOWED: fall back to an ordinary call. */ +} + +/* Emit the caller's return for an ALLOWED tail call that fell back to an + * ordinary call. A tail call is a terminator, so the frontend emits no ret of + * its own; the fallback must supply one that forwards the call result. */ +static void api_tail_fallback_ret(CfreeCg* g, CfreeCgTypeId ret_ty) { + if (cg_type_is_void(g->c, ret_ty)) + g->target->ret(g->target, NULL); + else + cfree_cg_ret(g); /* pops the result pushed by api_push_call_result */ +} + void cfree_cg_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type, CfreeCgCallAttrs attrs) { CGTarget* T; @@ -126,17 +182,17 @@ void cfree_cg_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type, CGABIValue* avs; CGCallDesc desc; ApiSValue callee; - int tail; + int want_tail; + int emit_tail; if (!g) return; api_local_const_memory_boundary(g); - tail = + want_tail = attrs.tail == CFREE_CG_TAIL_ALLOWED || attrs.tail == CFREE_CG_TAIL_MUST; T = g->target; fty = resolve_type(g->c, fn_type); if (!fty) return; abi = abi_cg_func_info(g->c->abi, fty); ret_ty = cg_type_func_ret_id(g->c, fty); - has_result = !tail && !cg_type_is_void(g->c, ret_ty); if (g->sp < (u32)nargs + 1u) { compiler_panic(g->c, g->cur_loc, "CfreeCg: call stack underflow"); @@ -169,19 +225,23 @@ void cfree_cg_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type, desc.callee = callee_op; desc.args = avs; desc.nargs = nargs; - desc.flags = tail ? CG_CALL_TAIL : CG_CALL_NONE; + desc.tail_policy = (u8)attrs.tail; desc.inline_policy = inline_policy; desc.ret.type = ret_ty; desc.ret.abi = &abi->ret; + emit_tail = want_tail ? api_tail_decide(g, &desc, attrs.tail) : 0; + has_result = !emit_tail && !cg_type_is_void(g->c, ret_ty); + desc.flags = emit_tail ? CG_CALL_TAIL : CG_CALL_NONE; + if (has_result) { api_alloc_call_ret_storage(g, T, ret_ty, &desc.ret.storage); } else { desc.ret.storage = api_op_imm(0, builtin_id(CFREE_CG_BUILTIN_VOID)); } - if (tail) api_regalloc_finish(g); - if (!tail) api_spill_call_clobbered_stack(g, &desc); + if (emit_tail) api_regalloc_finish(g); + if (!emit_tail) api_spill_call_clobbered_stack(g, &desc); T->call(T, &desc); api_release_call_args(g, avs, nargs); @@ -193,64 +253,10 @@ void cfree_cg_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type, if (has_result) { api_push_call_result(g, desc.ret.storage, ret_ty); } -} -void api_cg_tail_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type) - __attribute__((unused)); -void api_cg_tail_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type) { - CGTarget* T; - CfreeCgTypeId fty; - const ABIFuncInfo* abi; - CGABIValue* avs; - CGCallDesc desc; - ApiSValue callee; - if (!g) return; - api_local_const_memory_boundary(g); - T = g->target; - fty = resolve_type(g->c, fn_type); - if (!fty) return; - abi = abi_cg_func_info(g->c->abi, fty); - avs = NULL; - if (nargs) { - avs = arena_array(g->c->tu, CGABIValue, nargs); - memset(avs, 0, sizeof(CGABIValue) * nargs); - } - for (u32 i = 0; i < nargs; ++i) { - u32 idx = nargs - 1u - i; - ApiSValue arg = api_pop(g); - api_ensure_reg(g, &arg); - CfreeCgTypeId aty = cg_type_func_param_id(g->c, fty, idx); - if (!aty) aty = arg.type; - avs[idx].type = aty; - avs[idx].abi = idx < abi->nparams ? &abi->params[idx] : NULL; - avs[idx].storage = (api_is_lvalue_sv(&arg) || arg.op.kind == OPK_GLOBAL) - ? api_force_reg(g, &arg, aty) - : arg.op; - } - callee = api_pop(g); - api_ensure_reg(g, &callee); - Operand callee_op = (callee.op.kind == OPK_GLOBAL) - ? callee.op - : api_force_reg(g, &callee, fty); - memset(&desc, 0, sizeof desc); - desc.fn_type = fty; - desc.abi = abi; - desc.callee = callee_op; - desc.args = avs; - desc.nargs = nargs; - desc.flags = CG_CALL_TAIL; - desc.inline_policy = CFREE_CG_INLINE_DEFAULT; - desc.ret.type = cg_type_func_ret_id(g->c, fty); - desc.ret.abi = &abi->ret; - desc.ret.storage = api_op_imm(0, builtin_id(CFREE_CG_BUILTIN_VOID)); - api_regalloc_finish(g); - T->call(T, &desc); - for (u32 i = 0; i < nargs; ++i) { - api_release_arg_storage(g, &avs[i].storage); - } - if (callee.op.kind != OPK_GLOBAL) { - api_free_reg(g, callee_op.v.reg, RC_INT); - } + /* ALLOWED tail call that could not be realized: forward the result via a + * synthesized return so the function still terminates correctly. */ + if (want_tail && !emit_tail) api_tail_fallback_ret(g, ret_ty); } void api_call_symbol_common(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, @@ -264,16 +270,17 @@ void api_call_symbol_common(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, CGCallDesc desc; Operand callee_op; CfreeCgInlinePolicy inline_policy; + int want_tail; + int emit_tail; if (!g) return; api_local_const_memory_boundary(g); - int tail = + want_tail = attrs.tail == CFREE_CG_TAIL_ALLOWED || attrs.tail == CFREE_CG_TAIL_MUST; T = g->target; fty = api_sym_type(g, sym); if (!fty) return; abi = abi_cg_func_info(g->c->abi, fty); ret_ty = cg_type_func_ret_id(g->c, fty); - has_result = !tail && !cg_type_is_void(g->c, ret_ty); if (g->sp < nargs) { compiler_panic(g->c, g->cur_loc, "CfreeCg: call stack underflow"); return; @@ -296,22 +303,28 @@ void api_call_symbol_common(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, desc.callee = callee_op; desc.args = avs; desc.nargs = nargs; - desc.flags = tail ? CG_CALL_TAIL : CG_CALL_NONE; + desc.tail_policy = (u8)attrs.tail; desc.inline_policy = inline_policy; desc.ret.type = ret_ty; desc.ret.abi = &abi->ret; + + emit_tail = want_tail ? api_tail_decide(g, &desc, attrs.tail) : 0; + has_result = !emit_tail && !cg_type_is_void(g->c, ret_ty); + desc.flags = emit_tail ? CG_CALL_TAIL : CG_CALL_NONE; + if (has_result) { api_alloc_call_ret_storage(g, T, ret_ty, &desc.ret.storage); } else { desc.ret.storage = api_op_imm(0, builtin_id(CFREE_CG_BUILTIN_VOID)); } - if (tail) api_regalloc_finish(g); - if (!tail) api_spill_call_clobbered_stack(g, &desc); + if (emit_tail) api_regalloc_finish(g); + if (!emit_tail) api_spill_call_clobbered_stack(g, &desc); T->call(T, &desc); api_release_call_args(g, avs, nargs); if (has_result) { api_push_call_result(g, desc.ret.storage, ret_ty); } + if (want_tail && !emit_tail) api_tail_fallback_ret(g, ret_ty); } void cfree_cg_call_symbol(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, diff --git a/src/cg/internal.h b/src/cg/internal.h @@ -252,7 +252,6 @@ void api_push_call_result(CfreeCg* g, Operand ret_storage, CfreeCgTypeId ret_ty); void cfree_cg_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type, CfreeCgCallAttrs attrs); -void api_cg_tail_call(CfreeCg* g, uint32_t nargs, CfreeCgTypeId fn_type); void api_call_symbol_common(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, CfreeCgCallAttrs attrs); void cfree_cg_call_symbol(CfreeCg* g, CfreeCgSym sym, uint32_t nargs, diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1011,6 +1011,18 @@ static void w_call(CGTarget* t, const CGCallDesc* d) { } } +static const char* w_tail_call_unrealizable_reason(CGTarget* t, + const CGCallDesc* d) { + (void)t; + (void)d; + /* The recorder accepts every tail call. Realizability depends on the laid- + * out frame, known only when the call is emitted onto the real backend + * during replay (pass_emit). There the real target's hook is consulted and + * the call is emitted as a tail, falls back to an ordinary call+return + * (ALLOWED), or diagnosed (MUST). */ + return NULL; +} + static void w_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { CGTarget* wr = impl_of(t)->target; if (wr->plan_call) @@ -1644,6 +1656,7 @@ CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) { t->convert = w_convert; t->call = w_call; + t->tail_call_unrealizable_reason = w_tail_call_unrealizable_reason; t->plan_call = w_plan_call; t->load_call_arg = w_load_call_arg; t->store_call_arg = w_store_call_arg; diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -374,16 +374,14 @@ static int replay_plan_supported(CGTarget* w, const CGCallPlan* p, return 1; } -static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) { - const CGCallPlan* src_plan = &aux->plan; +/* Materialize args and emit the call instruction for a planned call. Does not + * emit return-value moves. */ +static void emit_call_and_args(ReplayCtx* r, const CGCallPlan* src_plan) { CGCallPlan plan = *src_plan; plan.callee = xlat_op(r, src_plan->callee); plan.args = src_plan->nargs ? arena_array(r->f->arena, CGCallPlanMove, src_plan->nargs) : NULL; - plan.rets = src_plan->nrets - ? arena_array(r->f->arena, CGCallPlanRet, src_plan->nrets) - : NULL; ReplayParallelMove* arg_moves = src_plan->nargs @@ -438,25 +436,66 @@ static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) { replay_parallel_moves(r, arg_moves, nargs, callee_scratch); r->tgt->emit_call_plan(r->tgt, &plan); +} + +/* An ALLOWED tail call the backend could not realize: emit it as an ordinary + * call and return its result. The callee's return registers are the function's + * (the return shapes match by CG's precondition) and the epilogue preserves + * them, so a plain call followed by a bare return forwards the value. The + * frame already reserves outgoing space for this call (the known-frame sizing + * counts tail calls' stack args), so the ordinary call's args fit. */ +static void replay_tail_fallback(ReplayCtx* r, const IRCallAux* aux) { + /* Reuse the recorded plan (its arg sources are the post-regalloc operands) + * but clear CG_CALL_TAIL so emit_call_and_args emits an ordinary BL and + * keeps stack args in the outgoing area rather than the tail slots. */ + CGCallPlan plan = aux->plan; + plan.flags &= (u16)~CG_CALL_TAIL; + emit_call_and_args(r, &plan); + r->tgt->ret(r->tgt, NULL); +} + +static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux, + SrcLoc loc) { + const CGCallPlan* src_plan = &aux->plan; - if (plan.flags & CG_CALL_TAIL) return; + /* The opt recorder accepted this tail call unconditionally; resolve it now + * against the real backend, which has a laid-out frame. NULL => realizable, + * emit the sibling call. Otherwise MUST diagnoses and ALLOWED falls back. */ + if (src_plan->flags & CG_CALL_TAIL) { + const char* reason = + r->tgt->tail_call_unrealizable_reason + ? r->tgt->tail_call_unrealizable_reason(r->tgt, &aux->desc) + : "target does not support tail calls"; + if (reason) { + if (aux->desc.tail_policy == CFREE_CG_TAIL_MUST) + compiler_panic(r->c, loc, "musttail call not realizable: %s", reason); + replay_tail_fallback(r, aux); + return; + } + } + + emit_call_and_args(r, src_plan); + if (src_plan->flags & CG_CALL_TAIL) return; + CGCallPlanRet* rets = + src_plan->nrets + ? arena_array(r->f->arena, CGCallPlanRet, src_plan->nrets) + : NULL; ReplayParallelMove* ret_moves = src_plan->nrets ? arena_zarray(r->f->arena, ReplayParallelMove, src_plan->nrets) : NULL; u32 nrets = 0; for (u32 i = 0; i < src_plan->nrets; ++i) { - plan.rets[i] = src_plan->rets[i]; - plan.rets[i].dst = xlat_op(r, src_plan->rets[i].dst); - Operand src = - phys_reg_operand(plan.rets[i].src_reg, (RegClass)plan.rets[i].cls, - plan.rets[i].mem.type); - ret_moves[nrets].dst = plan.rets[i].dst; + rets[i] = src_plan->rets[i]; + rets[i].dst = xlat_op(r, src_plan->rets[i].dst); + Operand src = phys_reg_operand(rets[i].src_reg, (RegClass)rets[i].cls, + rets[i].mem.type); + ret_moves[nrets].dst = rets[i].dst; ret_moves[nrets].src = src; - ret_moves[nrets].mem = plan.rets[i].mem; - ret_moves[nrets].ret = &plan.rets[i]; - ret_moves[nrets].dst_offset = plan.rets[i].dst_offset; + ret_moves[nrets].mem = rets[i].mem; + ret_moves[nrets].ret = &rets[i]; + ret_moves[nrets].dst_offset = rets[i].dst_offset; ret_moves[nrets].dst_kind = CG_CALL_PLAN_REG; ret_moves[nrets].src_kind = CG_CALL_PLAN_SRC_VALUE; ret_moves[nrets].is_ret = 1; @@ -639,7 +678,7 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { const char* plan_reason = NULL; if (aux && aux->use_plan_replay && w->emit_call_plan && replay_plan_supported(w, &aux->plan, &plan_reason)) { - replay_planned_call(r, aux); + replay_planned_call(r, aux, in->loc); break; } compiler_panic( @@ -1129,7 +1168,12 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { out->has_call = 1; continue; } - if ((aux->desc.flags & CG_CALL_TAIL) == 0) out->has_call = 1; + /* A non-tail call, or an ALLOWED tail that may fall back to an + * ordinary call (clobbering the link register), needs a frame. A MUST + * tail never falls back, so it alone does not force one. */ + if ((aux->desc.flags & CG_CALL_TAIL) == 0 || + aux->desc.tail_policy != CFREE_CG_TAIL_MUST) + out->has_call = 1; u32 need = 0; if (aux->use_plan_replay) { need = aux->plan.stack_arg_size; diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c @@ -2485,7 +2485,10 @@ static int gvn_raw_local_addr_root(Func* f, Val v, u32 depth, static void gvn_mark_escaped_operand(GvnCtx* ctx, const Operand* op) { FrameSlot fs = FRAME_SLOT_NONE; if (!op) return; - if (op->kind == OPK_REG) { + if (op->kind == OPK_LOCAL) { + fs = op->v.frame_slot; + if (fs > 0 && (u32)fs <= ctx->f->nframe_slots) ctx->local_escaped[fs] = 1; + } else if (op->kind == OPK_REG) { if (gvn_raw_local_addr_root(ctx->f, (Val)op->v.reg, 0, &fs) && fs > 0 && (u32)fs <= ctx->f->nframe_slots) ctx->local_escaped[fs] = 1; @@ -2535,10 +2538,13 @@ static void gvn_collect_local_escapes(GvnCtx* ctx) { gvn_mark_escaped_operand(ctx, &aux->plan.callee); for (u32 a = 0; a < aux->plan.nargs; ++a) gvn_mark_escaped_operand(ctx, &aux->plan.args[a].src); + for (u32 r = 0; r < aux->plan.nrets; ++r) + gvn_mark_escaped_operand(ctx, &aux->plan.rets[r].dst); } else { gvn_mark_escaped_operand(ctx, &aux->desc.callee); for (u32 a = 0; a < aux->desc.nargs; ++a) gvn_mark_escaped_abivalue(ctx, &aux->desc.args[a]); + gvn_mark_escaped_abivalue(ctx, &aux->desc.ret); } break; } diff --git a/test/toy/cases/31_musttail_direct.expected b/test/toy/cases/31_musttail_direct.expected @@ -0,0 +1 @@ +33 diff --git a/test/toy/cases/31_musttail_direct.toy b/test/toy/cases/31_musttail_direct.toy @@ -0,0 +1,13 @@ +fn target(a: i64, b: i64, c: i64): i64 { + return a + b * 2 + c * 4; +} + +fn caller(x: i64, y: i64, z: i64): i64 { + return musttail target(z, x, y); +} + +fn __user_main(): i64 { + return caller(3, 5, 7); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/32_musttail_indirect.expected b/test/toy/cases/32_musttail_indirect.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/32_musttail_indirect.toy b/test/toy/cases/32_musttail_indirect.toy @@ -0,0 +1,13 @@ +fn add1(x: i64): i64 { + return x + 1; +} + +fn apply(fp: *fn(i64): i64, x: i64): i64 { + return musttail fp(x); +} + +fn __user_main(): i64 { + return apply(add1, 41); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/33_musttail_void.expected b/test/toy/cases/33_musttail_void.expected @@ -0,0 +1 @@ +7 diff --git a/test/toy/cases/33_musttail_void.toy b/test/toy/cases/33_musttail_void.toy @@ -0,0 +1,14 @@ +fn leaf(): void { + return; +} + +fn forward(): void { + return musttail leaf(); +} + +fn __user_main(): i64 { + forward(); + return 7; +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/34_tail_variadic.expected b/test/toy/cases/34_tail_variadic.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/34_tail_variadic.toy b/test/toy/cases/34_tail_variadic.toy @@ -0,0 +1,13 @@ +fn sink(x: i64, ...): i64 { + return x; +} + +fn caller(x: i64): i64 { + return tail sink(x); +} + +fn __user_main(): i64 { + return caller(42); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/35_musttail_variadic.expected b/test/toy/cases/35_musttail_variadic.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/35_musttail_variadic.toy b/test/toy/cases/35_musttail_variadic.toy @@ -0,0 +1,13 @@ +fn sink(x: i64, ...): i64 { + return x; +} + +fn caller(x: i64): i64 { + return musttail sink(x); +} + +fn __user_main(): i64 { + return caller(42); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/36_musttail_sret.expected b/test/toy/cases/36_musttail_sret.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/36_musttail_sret.toy b/test/toy/cases/36_musttail_sret.toy @@ -0,0 +1,22 @@ +record Triple { + a: i64, + b: i64, + c: i64, +} + +fn make(x: i64, y: i64, z: i64): Triple { + let t: Triple = Triple { a: x, b: y, c: z }; + return t; +} + +fn forward(x: i64, y: i64, z: i64): Triple { + return musttail make(z, x, y); +} + +fn __user_main(): i64 { + var t: Triple = Triple { a: 0, b: 0, c: 0 }; + t = forward(10, 13, 19); + return t.a + t.b + t.c; +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/37_tail_sret.expected b/test/toy/cases/37_tail_sret.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/37_tail_sret.toy b/test/toy/cases/37_tail_sret.toy @@ -0,0 +1,22 @@ +record Triple { + a: i64, + b: i64, + c: i64, +} + +fn make(x: i64, y: i64, z: i64): Triple { + let t: Triple = Triple { a: x, b: y, c: z }; + return t; +} + +fn forward(x: i64, y: i64, z: i64): Triple { + return tail make(z, x, y); +} + +fn __user_main(): i64 { + var t: Triple = Triple { a: 0, b: 0, c: 0 }; + t = forward(10, 13, 19); + return t.a + t.b + t.c; +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/38_tail_stack_fallback.expected b/test/toy/cases/38_tail_stack_fallback.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/38_tail_stack_fallback.toy b/test/toy/cases/38_tail_stack_fallback.toy @@ -0,0 +1,18 @@ +fn many(a: i64, b: i64, c: i64, d: i64, e: i64, + f: i64, g: i64, h: i64, i: i64, j: i64): i64 { + return a; +} + +// `caller` takes one parameter (no incoming stack-arg area), but `many` needs +// stack arguments (10 ints exceed the 8 argument registers). The sibling call +// cannot reuse the caller's parameter area, so this ALLOWED tail is not +// realizable and CG falls back to an ordinary call + return. Result is `a`. +fn caller(x: i64): i64 { + return tail many(x, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +fn __user_main(): i64 { + return caller(42); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/err/musttail_stack_args.expected b/test/toy/err/musttail_stack_args.expected @@ -0,0 +1 @@ +musttail call not realizable diff --git a/test/toy/err/musttail_stack_args.toy b/test/toy/err/musttail_stack_args.toy @@ -0,0 +1,17 @@ +fn many(a: i64, b: i64, c: i64, d: i64, e: i64, + f: i64, g: i64, h: i64, i: i64, j: i64): i64 { + return a; +} + +// musttail to a callee needing stack arguments that don't fit the caller's +// (empty) incoming parameter area is not realizable: CG must diagnose rather +// than silently fall back. +fn caller(x: i64): i64 { + return musttail many(x, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +fn __user_main(): i64 { + return caller(42); +} + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/err/tail_variadic.expected b/test/toy/err/tail_variadic.expected @@ -1 +0,0 @@ -tail call to variadic function unsupported diff --git a/test/toy/err/tail_variadic.toy b/test/toy/err/tail_variadic.toy @@ -1,7 +0,0 @@ -fn sink(x: i64, ...): i64 { - return x; -} - -fn main(): i64 { - return tail sink(42); -}