commit 57f1dd7b4466ca7780e7231a501980836d83c462
parent 7aecaa468f135bcada791ccc5b7f76aa8a0a3a2e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 11 May 2026 22:28:41 -0700
TAILCALL.md plan
Diffstat:
| A | doc/TAILCALL.md | | | 234 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 234 insertions(+), 0 deletions(-)
diff --git a/doc/TAILCALL.md b/doc/TAILCALL.md
@@ -0,0 +1,234 @@
+# Tail Call Support
+
+First-class tail calls from the C frontend through codegen and the aarch64
+backend. x64 and rv64 follow the same pattern; this document focuses on
+aarch64.
+
+## Current state
+
+The groundwork is present but nothing is wired end-to-end:
+
+- `cg_tail_call` (cg.c) — stub that panics `"not in v1 slice"`
+- `CG_CALL_TAIL` in `CGCallDesc.flags` (arch.h) — defined and documented,
+ never set
+- `R_AARCH64_JUMP26` (obj.h) — handled identically to CALL26 by the linker;
+ only the emitted instruction opcode differs
+- `aa64_b_base()` / `aa64_br()` — defined in the ISA layer, never used for
+ tail calls
+- `test/elf/cases/16_tail_call.c` — verifies that cfree's linker handles
+ JUMP26 from a clang-compiled object; does not test cfree generating tail calls
+
+## Architecture constraint
+
+The aarch64 backend defers frame layout: frame size, callee-saved register
+counts, and stack offsets are computed in `aa_func_end` after all body code is
+emitted. The prologue uses NOP placeholders that are patched back-filled by
+`aa_func_end`. The epilogue is a single labeled block; all `aa_ret` paths emit
+a `B` to that label.
+
+For tail calls the frame teardown (callee-saved restores + SP restore) must
+appear **inline at the call site**, before the `B`/`BR` to the callee — not at
+the epilogue. Since frame dimensions are unknown at call-emit time, we use the
+same NOP-placeholder-then-patch approach as the prologue.
+
+## v1 constraints (panic, don't silently miscompile)
+
+| Scenario | Disposition |
+|---|---|
+| Tail call from alloca function | `compiler_panic` in `aa_call` |
+| Tail call with sret return type | `compiler_panic` in `aa_call` |
+| Tail call with stack-passed args | `compiler_panic` in `aa_call` |
+| Tail call from variadic function | `compiler_panic` in `aa_call` |
+| `musttail` not on a return stmt | `perr` in parser |
+| C23 `[[clang::musttail]]` syntax | out of scope |
+
+---
+
+## Step 1 — Frontend: recognize `musttail`
+
+**`src/parse/attr.h`**: Add `ATTR_MUSTTAIL` to `AttrKind`.
+
+**`src/parse/parse_type.c`**: Register in the attribute table:
+```c
+{"musttail", ATTR_MUSTTAIL, AS_NONE},
+```
+
+**`src/parse/parse_priv.h`**: Add `u8 in_musttail` to `Parser`.
+
+**`src/parse/parse_stmt.c`** — `parse_stmt`: before the keyword dispatch check
+`starts_attr(p)`. If the parsed list contains `ATTR_MUSTTAIL`, set
+`p->in_musttail = 1`. The next token must be `return`; any other statement is a
+fatal error.
+
+**`src/parse/parse_stmt.c`** — `parse_return_stmt`: if `p->in_musttail`:
+- call `parse_expr(p)` as usual — the inner call dispatch emits `cg_tail_call`
+- do **not** call `to_rvalue` or `cg_ret`; `cg_tail_call` implicitly terminates
+ the function
+- clear `p->in_musttail` before returning
+
+**`src/parse/parse_expr.c`** — postfix call dispatch: if `p->in_musttail` is
+set when a `'('` call is dispatched, emit `cg_tail_call(p->cg, nargs, fn_type)`
+instead of `cg_call`.
+
+## Step 2 — CG layer: implement `cg_tail_call`
+
+Factor the body of `cg_call` into:
+```c
+static void cg_call_impl(CG* g, u32 nargs, const Type* fn_type, u16 flags);
+```
+
+Both `cg_call` and `cg_tail_call` call it, passing `CG_CALL_NONE` or
+`CG_CALL_TAIL` respectively.
+
+When `flags & CG_CALL_TAIL`:
+- Set `desc.flags = CG_CALL_TAIL`
+- Skip result register allocation; pass a void-typed `OPK_IMM` placeholder in
+ `desc.ret` so the backend has a typed slot to inspect
+- Skip `push(g, …)` — no result, no continuation
+- Still call `T->free_reg` for the callee register after `T->call` returns
+ (the backend has already moved it to x16 by then)
+
+## Step 3 — AArch64 backend
+
+### `src/arch/aarch64/internal.h`
+
+Worst-case inline teardown: 5 int-pair LDPs (x19–x28) + 4 fp-pair LDPs
+(d8–d15) + 1 fp/lr LDP + 2 SP-add instructions = 12; use 14 for headroom.
+
+```c
+#define AA_TAIL_EP_WORDS 14u
+#define AA64_TAIL_SCRATCH 16u /* x16 / ip0: caller-saved, not a pool reg */
+```
+
+Add to `AAImpl`:
+```c
+struct { u32 pos; } *tail_sites;
+u32 ntail_sites;
+u32 tail_sites_cap;
+```
+
+Initialize in `aa_func_begin`:
+```c
+a->tail_sites = NULL;
+a->ntail_sites = 0;
+a->tail_sites_cap = 0;
+```
+
+### `src/arch/aarch64/ops.c` — `aa_call`
+
+After the `emit_arg_value` loop and `max_outgoing` update, before the existing
+BL/BLR emission:
+
+```c
+if (d->flags & CG_CALL_TAIL) {
+ if (a->has_alloca)
+ compiler_panic(…, "musttail not supported in alloca function");
+ if (d->abi && d->abi->has_sret)
+ compiler_panic(…, "musttail not supported with sret return type");
+ if (stack_off > 0)
+ compiler_panic(…, "musttail with stack-passed arguments not supported");
+ if (a->is_variadic)
+ compiler_panic(…, "musttail not supported in variadic function");
+
+ /* Indirect callees live in x19–x28 (the int pool). Move to ip0 now,
+ * before the teardown restores those registers from the stack. */
+ if (d->callee.kind == OPK_REG)
+ aa64_emit32(mc, aa64_mov_reg(1, AA64_TAIL_SCRATCH, reg_num(d->callee)));
+
+ /* NOP placeholder; patched with the frame teardown in aa_func_end. */
+ u32 site_pos = mc->pos(mc);
+ for (u32 i = 0; i < AA_TAIL_EP_WORDS; ++i) aa64_emit32(mc, AA64_NOP);
+
+ /* Tail branch. */
+ if (d->callee.kind == OPK_GLOBAL) {
+ u32 b_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_b_base());
+ mc->emit_reloc_at(mc, mc->section_id, b_pos, R_AARCH64_JUMP26,
+ d->callee.v.global.sym, d->callee.v.global.addend, 0, 0);
+ } else if (d->callee.kind == OPK_REG) {
+ aa64_emit32(mc, aa64_br(AA64_TAIL_SCRATCH));
+ } else {
+ compiler_panic(…, "aarch64 tail call: callee kind %d unsupported", …);
+ }
+
+ aa_tail_site_push(a, site_pos);
+ return; /* no return-value extraction; no continuation */
+}
+```
+
+`aa_tail_site_push` is a small grow-array helper consistent with the existing
+`add_patches` pattern.
+
+### `src/arch/aarch64/emit.c` — `aa_func_end`
+
+After computing `n_int_pairs`, `n_fp_pairs`, `frame_size`, `int_save_off`,
+`fp_save_off`, `fp_lr_off` — before placing the epilogue label — patch each
+tail call site:
+
+```c
+for (u32 ti = 0; ti < a->ntail_sites; ++ti) {
+ u32 words[AA_TAIL_EP_WORDS];
+ u32 wi = 0;
+ for (u32 i = 0; i < AA_TAIL_EP_WORDS; ++i) words[i] = AA64_NOP;
+
+ for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) {
+ u32 r0 = 8u + (u32)i * 2u;
+ words[wi++] = aa64_ldp_d(r0, r0+1, 31, (i32)(fp_save_off + (u32)i*16));
+ }
+ for (i32 i = (i32)n_int_pairs - 1; i >= 0; --i) {
+ u32 r0 = 19u + (u32)i * 2u;
+ words[wi++] = aa64_ldp_x(r0, r0+1, 31, (i32)(int_save_off + (u32)i*16));
+ }
+ words[wi++] = aa64_ldp_x(29, 30, 31, (i32)fp_lr_off);
+
+ /* SP restore — mirrors emit_sp_add but writes into words[]. */
+ if (frame_size <= 0xfff) {
+ words[wi++] = aa64_add_imm(1, 31, 31, frame_size, 0);
+ } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
+ words[wi++] = aa64_add_imm(1, 31, 31, frame_size >> 12, 1);
+ } else {
+ words[wi++] = aa64_add_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
+ words[wi++] = aa64_add_imm(1, 31, 31, frame_size & 0xfff, 0);
+ }
+
+ if (wi > AA_TAIL_EP_WORDS)
+ compiler_panic(…, "aarch64: tail epilogue overflow (%u words)", wi);
+
+ u32 p0 = a->tail_sites[ti].pos;
+ for (u32 i = 0; i < AA_TAIL_EP_WORDS; ++i)
+ aa64_patch32(obj, sec, p0 + i * 4u, words[i]);
+}
+```
+
+## Step 4 — Tests (red-green)
+
+Write tests first, then implement.
+
+**`test/parse/`** — attribute parse test: `__attribute__((musttail)) return f(x);`
+parses without error; missing-return error fires on a non-return statement.
+
+**`test/cg/`** — direct tail call: build `int f(int x)` that musttail-calls
+`int g(int x)` via `cg_tail_call`; verify the emitted aarch64 text contains a
+JUMP26 relocation (B) and no CALL26 (BL).
+
+**`test/cg/`** — indirect tail call via function pointer: verify `MOV x16, xN`
++ `BR x16` and no BLR.
+
+**`test/cg/`** — end-to-end: tail-recursive sum that overflows without TCO;
+compile and run via `cfree run` and verify correctness.
+
+## Files touched
+
+| File | Change |
+|---|---|
+| `src/parse/attr.h` | `+ATTR_MUSTTAIL` |
+| `src/parse/parse_type.c` | `musttail` attribute table entry |
+| `src/parse/parse_priv.h` | `+u8 in_musttail` to `Parser` |
+| `src/parse/parse_stmt.c` | attribute prefix detection; musttail return path |
+| `src/parse/parse_expr.c` | `cg_tail_call` dispatch when `in_musttail` |
+| `src/cg/cg.c` | factor `cg_call_impl`; implement `cg_tail_call` |
+| `src/arch/aarch64/internal.h` | constants, `AATailCallSite`, fields in `AAImpl` |
+| `src/arch/aarch64/ops.c` | tail-call branch in `aa_call`; `aa_tail_site_push` |
+| `src/arch/aarch64/emit.c` | init in `aa_func_begin`; patch loop in `aa_func_end` |
+| `test/parse/` | musttail attribute parse test |
+| `test/cg/` | direct/indirect/e2e tail call tests |