commit 5a7a304b5a1d6dd328cc662759c895feebb5340c parent 14e3792d88237772be6076f14a77103d5ea746f8 Author: Ryan Sepassi <rsepassi@gmail.com> Date: Fri, 5 Jun 2026 08:38:50 -0700 Implement stack-trace builtins & __kit_backtrace (BACKTRACE L1+L2) L1 — __builtin_frame_address / __builtin_return_address: two CG intrinsics (KIT_CG_INTRIN_FRAME_ADDRESS/_RETURN_ADDRESS) carrying the constant level as an IMM operand, lowered as an unrolled frame-pointer walk on aarch64/x86-64/riscv (O0 and O1, shared backend handler). C target forwards __builtin_* to the host; wasm reports unsupported; the C frontend validates the level via eval_const_int. IR_INTRINSIC is already conservatively side-effecting in opt, so no new effect modeling was needed; the one O1 hazard — riscv's frameless-leaf tier omitting the frame record — is handled by a new NativeKnownFrameDesc.reads_frame flag (aarch64/x64 always keep the record). L2 — __kit_backtrace(void**, int, int): freestanding FP-walk capture in rt/lib/stack/backtrace.c + rt/include/kit/backtrace.h, added to RT_BASE_SRCS. kit's frame layout is uniform across targets (fp[0]=caller fp, fp[1]=saved ra in void* units), so no per-arch offset table — the psABI rv64 layout in the plan sketch was wrong; verified against the actual prologue. Tests: test/rt/cases/backtrace_capture.c (aa64/x64/rv64 under exec), test/parse builtin_29..31 + cases_err/..._nonconst (D/R/E/J/C lanes, O0/O1), test/toy 154_frame_return_address. Verified end-to-end through kit addr2line on static-freestanding and dynamic-musl binaries across all three arches. doc/plan/BACKTRACE.md updated (L1+L2 done, L3 remaining). doc/plan/TODO.md records four pre-existing gaps found while building/verifying: &&label-as-value broken at O1, __kit_syscallN declared-but-unimplemented, -no-pie not producing ET_EXEC, and kit cc/ld dynamic-libc link ergonomics. Diffstat:
41 files changed, 708 insertions(+), 61 deletions(-)
diff --git a/doc/plan/BACKTRACE.md b/doc/plan/BACKTRACE.md @@ -1,6 +1,62 @@ # Plan: stack-trace builtins & runtime backtrace -## Status — 2026-06-05 — proposed; nothing built yet +## Status — 2026-06-05 — L1 + L2 shipped (WS1–WS3); L3 remaining + +Implemented and tested through L2: + +- **L1 builtins** `__builtin_frame_address` / `__builtin_return_address` — two + CG intrinsics (`KIT_CG_INTRIN_FRAME_ADDRESS` / `_RETURN_ADDRESS`), constant + level carried as a single IMM operand, lowered as an unrolled FP walk on + aarch64 / x86-64 / riscv (O0 and O1, same backend handler). The C target + forwards `__builtin_*` to the host compiler; wasm reports unsupported; the C + frontend validates the level via `eval_const_int`. +- **L1 O1 modeling** — `IR_INTRINSIC` is already conservatively side-effecting + in opt (never DCE'd / CSE'd / hoisted), so no new effect modeling was needed. + The one real O1 hazard — riscv's frameless-leaf tier (`slim_prologue`) emits + no prologue and never anchors `s0` — is handled by a new + `NativeKnownFrameDesc.reads_frame` flag set during frame analysis when these + intrinsics appear; aarch64/x64 keep the frame record in every prologue shape, + so they need no change. +- **L2 capture** `__kit_backtrace` — `rt/lib/stack/backtrace.c` + + `rt/include/kit/backtrace.h`, in `RT_BASE_SRCS` for every variant. + +Open questions resolved while building: + +- **rv64 frame-record layout** — the psABI `ra@s0-8` / `fp@s0-16` guess in the + L2 sketch is *wrong for kit*. kit's prologue stores the pair at and above s0: + `[s0+0] = caller fp`, `[s0+ptr] = saved ra` (verified against + `rv_build_prologue`). So the layout is uniform across all kit targets + (`fp[0]`/`fp[1]` in units of `void*`) — `__kit_backtrace` needs no per-arch + offset table at all, just index 0 and 1. +- **wasm** — diagnose unsupported (confirmed acceptable); the capability hook + returns false and the C frontend emits a clean error. 
+- **leaf-frame omission** — handled via `reads_frame` (above). + +Tests: `test/rt/cases/backtrace_capture.c` (aa64/x64/rv64 under exec), +`test/parse/cases/builtin_29..31_*` (+ `cases_err/..._nonconst`) across the +D/R/E/J/C lanes at O0/O1, `test/toy/cases/154_frame_return_address.toy`. + +### Remaining tasks (L3) + +Nothing in L1/L2 is outstanding. What's left is all of L3 — symbolize & print: + +- **WS4 — L3a (recommended next):** `__kit_print_backtrace()` in rt — walk via + `__kit_backtrace`, write raw `#N 0xADDR` lines to a weak + `__kit_backtrace_write` sink, symbolize out-of-process via `kit addr2line`. + Then wire the **assert-path hook** (`rt/lib/assert/assert.c::__kit_assert_fail` + → `__kit_print_backtrace()` before `__builtin_trap()`) — deferred from L2 + because it calls this L3 function. The end-to-end round-trip is already proven + manually (static + dynamic, aa64/x64/rv64 — see Status); WS4 packages it as a + shipped `__kit_print_backtrace` + test. +- **WS5 — L3c:** tool-side auto-backtrace in `kit run`/`kit emu`/`dbg` fault + handlers (reuses the existing DWARF reader + `dbg bt`; never crosses into rt). +- **L3b:** in-process self-symbolization (hosted-only `libkit_bt.a`); deferred + until a concrete consumer needs in-binary symbolized panics. + +One open question remains (the L3a output sink — see Open questions); the others +listed below were resolved while building L1/L2. + +## Overview kit has no way for compiled code to inspect its own call stack. 
This roadmap adds that capability in three layers: GCC-compatible primitive **builtins** @@ -79,7 +135,12 @@ the live LR/RA, which may be clobbered mid-function): |------|--------|-----------|----------------|--------------------------| | aarch64 | x29 | x29 | `fp = *(fp)` | `*(fp + 8)` (saved x30) | | x86-64 | rbp | rbp | `fp = *(fp)` | `*(fp + 8)` (pushed retaddr) | -| rv64 | s0/x8 | s0 | `fp = *(fp - 16)` | `*(fp - 8)` (saved ra) | +| rv64 | s0/x8 | s0 | `fp = *(fp)` | `*(fp + ptr)` (saved ra) | + +The table is **uniform** across kit's targets: the prologue stores +`[fp+0] = caller fp`, `[fp+ptr] = saved ra` everywhere (verified against +`rv_build_prologue` — note this differs from the RISC-V psABI's `ra@s0-8` / +`fp@s0-16`, which an early draft of this table wrongly assumed). For a constant level the walk unrolls to `level` dependent loads (typically 0–2), so no loop is emitted. wasm has no FP chain → **diagnose unsupported**, exactly @@ -104,25 +165,29 @@ as the IRQ/cache intrinsics already do per-arch. compiler). - Capability hooks: `src/arch/{aa64,x64,riscv,wasm}/arch.c` (alongside the existing `KIT_CG_INTRIN_TRAP` cases at e.g. `aa64/arch.c:197`). -- **Optimizer (O1/O2):** unlike `INTRIN_TRAP`/`INTRIN_LONGJMP` these are *not* - control-flow terminators, so the special-casing scattered through `src/opt` - (`pass_cfg.c:43`, `cg_ir_lower.c:286`, `pass_ssa.c:818`, …) does **not** apply. - But they **read memory and depend on the live frame**, so they must be modeled - as memory-reading / non-pure: not hoistable out of the function, not CSE'd - across a call, level-0 not sunk past a frame-pointer change. Audit the O1 emit - path (`src/opt/pass_native_emit.c`, `cg_ir_lower.c`) to lower them like an - ordinary load with a frame dependency. **This is the main risk area** and wants - dedicated O1 tests. - -### Tests (L1) - -- `test/toy/` — a CG-API toy case exercising both intrinsics at levels 0/1/2. 
-- `test/parse/cases/` — `builtin_NN_return_address.c` etc.; an error case for a - non-constant level. -- Smoke (`test/smoke`, exit-code-42 convention): a known 2-deep call chain where - `f` returns `__builtin_return_address(0)` and `main` compares it against a - label/`&&`-style anchor, asserting it lands inside the caller's range. Run at - **O0 and O1** on x64 + aa64 + rv64. +- **Optimizer (O1/O2) [done]:** in practice no new effect modeling was needed — + `IR_INTRINSIC` is already conservatively side-effecting in opt (never DCE'd, + CSE'd, or hoisted; see `pass_dce.c`), and the FP it reads is stable across the + whole function, so scheduling is harmless. The one real O1 hazard turned out to + be a backend frame issue, not an opt-modeling one: riscv's frameless-leaf tier + (`slim_prologue`) emits no prologue and never anchors `s0`, so a leaf that reads + its own frame would walk a stale `s0`. Fixed with a `NativeKnownFrameDesc.reads_frame` + flag set in `pass_native_emit.c` frame analysis and ANDed (negated) into riscv's + `slim_prologue` decision; aarch64/x64 keep the frame record in every prologue + shape, so they need nothing. O1 smoke tests run on all three arches. + +### Tests (L1) [done] + +- `test/toy/cases/154_frame_return_address.toy` — CG-API case exercising both + intrinsics at levels 0/1/2 (`@[.noinline]` chain pins the depth). +- `test/parse/cases/builtin_29_return_address.c`, `builtin_30_frame_address.c`, + and `builtin_31_return_address_anchor.c`; error case + `cases_err/builtin_return_address_nonconst.c` for a non-constant level. +- The plan's "anchor in caller's range" smoke check is `builtin_31` (run via the + parse harness's qemu/podman exec lane on x64 + aa64 + rv64 at **O0 and O1**), + not a `test/smoke` script. It anchors on the **caller's function address**, not + a `&&label`: GNU labels-as-values whose address is taken but never `goto`'d + break at O1 (`undefined reference to '.Lcfblk.N'`; see doc/plan/TODO.md).
--- @@ -143,23 +208,26 @@ int __kit_backtrace(void** buf, int max, int skip); Implementation is the L1 walk expressed in portable C: seed from `__builtin_frame_address(0)`, then loop `fp = *(void**)fp` reading the saved-RA -slot per the table above, stopping on NULL fp, an fp that doesn't increase -(stack grows down — detect cycles/garbage), or `max`. The per-arch slot offsets -are the *only* target knob; keep them in one small `static` per the -RUNTIME.md "no target-dispatch ifdef" rule (parameterize, don't `#ifdef`-cascade -— select by a build-time constant the way the int/fp helpers do). - -- `mk/rt.mk` — add `rt/lib/stack/backtrace.c` to every variant (it already - compiles `rt/lib/stack/` for the Windows chkstk helper). -- **Hook the trap paths:** make `rt/lib/assert/assert.c::__kit_assert_fail` call - `__kit_print_backtrace()` (L3) before `__builtin_trap()`. Because the symbol is - `weak`, a freestanding user with no output sink can still override it. - -### Tests (L2) - -- `test/rt/cases/backtrace_capture.c` — build a known N-deep recursion, capture, - assert depth and monotonic frame addresses; `return 42` on success. Runs under - the existing `test/rt/run.sh` harness across variants. +slot, stopping on a NULL saved-RA (the synthetic stack origin), a NULL or +non-increasing fp (stack grows down — detect cycles/garbage), a misaligned link, +or `max`. **No per-arch knob is needed:** kit's frame layout is uniform, so the +walk indexes `fp[0]` (caller fp) and `fp[1]` (saved ra) as `void**`, which scales +to the target pointer width automatically — no offset table, no `#ifdef` cascade. +`skip` discards the innermost N frames (a print wrapper passes `skip >= 1`). + +- `mk/rt.mk` — added `rt/lib/stack/backtrace.c` to `RT_BASE_SRCS` (built for + every variant; `rt/lib/stack/` already compiled the Windows chkstk helper). 
+- **Assert-path hook — deferred to L3 (WS4):** making + `rt/lib/assert/assert.c::__kit_assert_fail` print a backtrace before + `__builtin_trap()` needs the L3 `__kit_print_backtrace()`, so it lands with + WS4, not here. + +### Tests (L2) [done] + +- `test/rt/cases/backtrace_capture.c` — a known-depth `@[.noinline]` recursion; + asserts depth, all return addresses non-null, that the recursive frames share a + call site (proving the walk follows the chain), and the `skip`/`max` bounds; + `return 42` on success. Runs under `test/rt/run.sh` on aa64/x64/rv64. --- @@ -204,25 +272,31 @@ shipping **L3a now**, leaving L3b/L3c as documented extensions. ## Suggested sequencing -1. **WS1 — L1 primitives, O0**, all three native arches + parse/toy tests. Ship - the GCC-compatible surface first; it's the foundation and independently useful. -2. **WS2 — L1 at O1/O2**: the optimizer memory-effect modeling + O1 smoke tests. - (Highest-risk slice; isolate it.) -3. **WS3 — L2 `__kit_backtrace`** in rt + capture test + assert-hook. -4. **WS4 — L3a** raw print + `kit addr2line` round-trip; wire into assert/emu. -5. **WS5 — L3c** tool-side auto-backtrace (optional, parallelizable with WS3/4). +1. **WS1 — L1 primitives, O0** — all three native arches + parse/toy tests. ✅ done. +2. **WS2 — L1 at O1/O2** — opt effect-modeling audit (turned out to need only the + riscv frame-record fix) + O1 tests. ✅ done. +3. **WS3 — L2 `__kit_backtrace`** in rt + capture test. ✅ done (assert-hook moved + to WS4 — it needs the L3 print fn). +4. **WS4 — L3a** raw print (`__kit_print_backtrace` + weak `__kit_backtrace_write` + sink) + `kit addr2line` round-trip; wire the assert hook. ⏳ remaining (next). +5. **WS5 — L3c** tool-side auto-backtrace (optional, parallelizable). ⏳ remaining. 6. **L3b** deferred until a consumer needs in-binary symbolized panics. 
## Open questions -- **wasm:** confirm "diagnose unsupported" is acceptable for L1 (no FP chain), or - whether the C/wasm targets should forward `__builtin_*` to the host toolchain. -- **rv64 frame-record layout:** verify the saved-ra/prev-fp offsets against the - actual prologue emitted by `src/arch/riscv/native.c` (the table above assumes - `ra@fp-8`, `fp@fp-16`; confirm before coding the walk). -- **Output sink for L3a:** weak `__kit_backtrace_write` vs. requiring the host to - pass a sink explicitly. Weak-symbol default keeps freestanding builds linking. -- **Level-0 return address semantics under tail-call / leaf-frame omission:** kit - keeps a frame pointer everywhere, but confirm leaf functions still store the - frame record (if a leaf skips the `{fp,ra}` store, level-0 return address must - fall back to live LR/RA on aa64/rv64). +- **Output sink for L3a (open):** weak `__kit_backtrace_write` vs. requiring the + host to pass a sink explicitly. Weak-symbol default keeps freestanding builds + linking. Resolve in WS4. + +Resolved while building L1/L2: + +- ~~**wasm:**~~ diagnose unsupported — confirmed acceptable; the capability hook + returns false and the C frontend emits a clean error. (C target separately + forwards `__builtin_*` to the host compiler.) +- ~~**rv64 frame-record layout:**~~ verified against `rv_build_prologue` — kit + stores `[s0+0]=caller fp`, `[s0+ptr]=saved ra` (NOT the psABI `ra@s0-8` / + `fp@s0-16`), so the layout is uniform across targets. +- ~~**leaf-frame omission:**~~ handled by `NativeKnownFrameDesc.reads_frame`, which + forces riscv off its frameless-leaf tier when these intrinsics appear; aa64/x64 + always keep the frame record. (Level-0 reads the spilled slot via the FP, so no + live-LR/RA fallback is needed.) 
diff --git a/doc/plan/TODO.md b/doc/plan/TODO.md @@ -21,3 +21,134 @@ x86_64-windows -O1` crash at runtime with 0xC0000005 (STATUS_ACCESS_VIOLATION); they pass at O0, on aarch64-freebsd, and `130_record_sret_return` passes at O1 — so it is specifically tail-call + struct-return (sret) lowering on the Win64 ABI at O1. Reproduced reliably in the Windows VM. Surfaced by `test-toy-windows-vm`. + +## O1: `&&label` whose address is taken but never `goto`'d → undefined `.Lcfblk.N` + +At `-O1`, taking a label's address with the GNU labels-as-values extension +(`&&label`) when that label is **not** also a computed-goto target produces a +dangling reference to an internal control-flow-block symbol: the optimizer +elides/merges the block but the address-of relocation survives, so the output +has a relocation against an undefined `.Lcfblk.N`. The JIT path reports +`fatal: link: undefined reference to '.Lcfblk.6'`; an emitted object carries an +undefined `.Lcfblk.N` (visible via `kit nm`). O0 is fine (the block is kept). +The plain `goto *p; done:` form is also fine because `done` is a real goto +target, so its block is retained. + +Minimal repro: + +```c +static int ext(void) { return 7; } +int main(void) { + int x; +before:; + x = ext(); +after:; + void* lo = &&before; /* address-taken, never a goto target */ + void* hi = &&after; + return (lo != hi) ? 42 : x; +} +``` + +`kit run -O1` on the above fails to link; `kit run -O0` returns 42. Likely in +the O1 CFG/block-merge passes (`src/opt/pass_cfg.c` / block dedup): a block that +is only referenced by a label-address relocation must be pinned (not merged or +dropped), or the relocation retargeted to the surviving block. Found while +writing the backtrace anchor test (doc/plan/BACKTRACE.md); worked around there +by anchoring on a function address instead of `&&label`. 
+ +## `__kit_syscallN` (`rt/include/kit/syscall.h`) is declared/documented but unimplemented + +The header declares `__kit_syscall0..6` and documents them as lowering to the +trap instruction inline ("kit emits the appropriate trap instruction inline; +there is no library call"). In practice nothing implements them: + +- the C frontend does not recognize the `__kit_syscallN` names (no intern in + `lang/c/parse/parse.c`, no handler in `try_parse_builtin_call`), so a call + compiles as an ordinary extern function reference; +- `KIT_CG_INTRIN_SYSCALL` maps to `INTRIN_NONE` in `src/cg/arith.c` and every + native `*_supports_intrinsic` returns 0, so the generic intrinsic path has no + backend lowering either; +- no rt object defines the symbols. + +Result: `#include <kit/syscall.h>` + `__kit_syscall1(93, 42)` fails to link with +`undefined reference to '__kit_syscall1'`. The only working way to issue a +syscall from kit-compiled C today is hand-written extended inline asm (the x64 +backend explicitly supports the register-pinned syscall idiom). Either wire the +`__kit_syscallN` names to an inline lowering (per-arch trap emit, normalizing the +BSD/Darwin carry-flag convention as the header promises) or drop the header. +Found while building a freestanding Linux backtrace demo (it needed `write`/ +`exit`); worked around with inline asm. The toy frontend maps `@syscall` to the +same `INTRIN_NONE` and likewise can't lower it (`test/toy/cases/unsupported_syscall`). + +## `-no-pie` does not produce a non-PIE (ET_EXEC) executable + +`-no-pie` sets `o->target.pic = KIT_PIC_NONE` (`driver/cmd/cc.c:1185`) but does +not clear `o->pie`, which is what feeds the linker's e_type decision +(`driver/cmd/cc.c:2494`). So `kit cc -no-pie -static foo.c -o exe` still emits an +ET_DYN (static-PIE) image — `readelf`/`file` report "shared object", and the +program loads at an ASLR base, so captured code addresses don't match link-time +addresses without computing the load slide. 
`-no-pie` should clear `o->pie` (and +ideally `-static` without `-pie` should default to non-PIE). Found while making a +backtrace demo's addresses line up with `kit addr2line`. + +## `kit cc --sysroot=<musl>` one-shot: vestigial interp on a static image, and no way to force a dynamic libc link + +Two related problems with one-shot `kit cc` against a musl sysroot (a sysroot +that ships both `libc.a` and `libc.so`, the normal case): + +1. **Vestigial `PT_INTERP` + empty `PT_DYNAMIC` on an effectively-static + binary.** `kit cc -fPIE --sysroot=<musl> foo.c -o exe` links `libc.a` + statically — `printf`/`__libc_start_main` come out `T` (defined in the image), + the Dynamic Section is empty (no `DT_NEEDED`) — yet because the output is + ET_DYN (PIE) kit ld stamps its default musl `PT_INTERP` + (`/lib/ld-musl-<arch>.so.1`) and an empty `PT_DYNAMIC`. The result is a static + image wearing a dynamic costume: it references a loader it doesn't need and + carries a pointless dynamic segment. It happens to run where that loader path + exists, but it's malformed and would fail to start on a system without that + exact interp. kit ld should not emit a `PT_INTERP`/`PT_DYNAMIC` when the link + produced no dynamic dependencies. + +2. **No way to get a genuine dynamic libc link through `kit cc`.** The hosted + profile selector `hosted_resolve_linux` (`driver/lib/hosted.c:392-397`) routes + to `hosted_resolve_linux_musl_static` whenever `libc.a` is present — the rule + at line 394 (`has_libc_a && !(has_libc_so6 && has_glibc_nonshared)`) does not + consult `req->static_link`, and it precedes the `musl_dynamic` branch (line + 396), so `hosted_resolve_linux_musl_dynamic` is unreachable for any musl + sysroot that ships `libc.a`. No `kit cc` flag (`-Bdynamic`, absence of + `-static`, etc.) flips it. 
The only way to get a real dynamic exe today + (`printf` as a `U` import + `DT_NEEDED libc.musl-<arch>.so.1`) is to drop to an + explicit two-step `kit cc -c` + `kit ld -pie … Scrt1.o crti.o obj libc.so + libkit_rt.a crtn.o` (what test/libc/musl/run.sh's dynamic lane does). The + static rule should respect a default-dynamic link mode (or at least let + `-Bdynamic`/non-`-static` reach the dynamic branch). Found verifying + `__kit_backtrace` on dynamically-linked binaries (doc/plan/BACKTRACE.md). + +## `kit ld` can't link against a sysroot with just `-lc` (no `--sysroot`, no crt auto-add, `-l` is `.a`-only) + +The ergonomic expectation — point `kit ld` at a sysroot and pass `-lc`, without +hand-listing crt objects and the raw `libc.so` path — does not work. Three gaps, +all reproduced against `build/musl-sysroot`: + +1. **No `--sysroot`.** `kit ld --sysroot=<dir> …` → `ld: unknown flag: + --sysroot=…`. GNU ld supports `--sysroot` (and the `=`-prefix path rewrite); + kit ld only has `-L`. +2. **No crt auto-provision.** `kit ld -L <sysroot>/lib -lc -pie -o exe obj + libkit_rt.a` → `fatal: link: entry symbol '_start' not defined`. kit ld does + not pull the sysroot's start files, so the caller must pass + `crt1.o`/`Scrt1.o` + `crti.o` + `crtn.o` explicitly. (crt selection is + traditionally the cc driver's job, but kit's own one-step `kit cc` can't + produce a dynamic libc link either — see the item above — so there is no + convenient route at all.) +3. **`-l NAME` resolves only `lib<NAME>.a`.** Per `kit ld --help` and observed + behavior, `-lc` finds `libc.a` (static) only; it never considers + `libc.so`/`libc.so.N`. So even once crts are sorted, `-lc` yields a static + libc — a dynamic link still requires handing `libc.so` to kit ld as a + positional input. GNU ld searches `.so` then `.a` (honoring `-Bstatic`/ + `-Bdynamic`); kit ld should do the same so `-lc` can produce a dynamic + dependency. 
+ +Net: a dynamic libc link today requires the explicit +`kit ld -pie … Scrt1.o crti.o obj libc.so libkit_rt.a crtn.o` form. Wiring +`--sysroot`, `.so`-aware `-l` resolution, and (optionally) crt auto-add would let +`kit ld --sysroot=<dir> -pie -lc -o exe obj` work as expected. Found verifying +`__kit_backtrace` on dynamically-linked binaries (doc/plan/BACKTRACE.md). diff --git a/include/kit/cg.h b/include/kit/cg.h @@ -999,6 +999,13 @@ typedef enum KitCgIntrinsic { KIT_CG_INTRIN_WFE, /* arm/aarch64 only */ KIT_CG_INTRIN_SEV, /* arm/aarch64 only */ KIT_CG_INTRIN_CORO_SWITCH, /* pop from, to, value; push value */ + /* Frame-pointer-chain introspection (GCC __builtin_frame_address / + * __builtin_return_address). The level is a compile-time constant passed as a + * single immediate operand (nargs == 1); level 0 names the current frame. + * Both push a void*. Lowered as an unrolled FP walk; targets with no frame + * pointer (wasm) report unsupported. */ + KIT_CG_INTRIN_FRAME_ADDRESS, /* pop level(u32 const); push void* */ + KIT_CG_INTRIN_RETURN_ADDRESS, /* pop level(u32 const); push void* */ } KitCgIntrinsic; typedef enum KitCgBarrierScope { diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c @@ -1146,6 +1146,22 @@ void pcg_intrinsic_void(Parser* p, IntrinKind k) { } } +/* __builtin_return_address(level) / __builtin_frame_address(level): emit the + * frame-pointer-chain intrinsic. The constant level rides as a single immediate + * operand (kept as OPK_IMM by kit_cg_intrinsic); the result is void*. */ +void pcg_frame_or_return_address(Parser* p, int is_return, u32 level) { + const Type* void_ptr = type_ptr(p->pool, type_void(p->pool)); + KitCgIntrinsic intrin = + is_return ? 
KIT_CG_INTRIN_RETURN_ADDRESS : KIT_CG_INTRIN_FRAME_ADDRESS; + pcg_push_int(p, (i64)level, type_prim(p->pool, TY_INT)); + if (pcg_emit_enabled(p)) + kit_cg_intrinsic(p->cg, intrin, 1, pcg_tid(p, void_ptr)); + /* kit_cg_intrinsic popped the level operand and pushed the void* result; + * retag the slot pcg_push_int added (also clears the level's value flags, + * e.g. the null-pointer-constant tag set for level 0). */ + pcg_retag_top(p, void_ptr); +} + void pcg_inline_asm(Parser* p, const char* tmpl, const AsmConstraint* outs, u32 nout, const AsmConstraint* ins, u32 nin, const Sym* clobbers, u32 nclob) { diff --git a/lang/c/parse/cg_adapter.h b/lang/c/parse/cg_adapter.h @@ -347,6 +347,7 @@ void pcg_atomic_cas(Parser*, MemOrder, MemOrder); void pcg_fence(Parser*, MemOrder); void pcg_intrinsic_unary_to_int(Parser*, IntrinKind); void pcg_intrinsic_void(Parser*, IntrinKind); +void pcg_frame_or_return_address(Parser*, int is_return, u32 level); void pcg_inline_asm(Parser*, const char*, const AsmConstraint*, u32, const AsmConstraint*, u32, const Sym*, u32); diff --git a/lang/c/parse/parse.c b/lang/c/parse/parse.c @@ -1533,6 +1533,10 @@ void parse_c(Compiler* c, Pool* pool, Pp* pp, DeclTable* decls, CG* cg, p.sym_b_trap = kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_trap")); p.sym_b_unreachable = kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_unreachable")); + p.sym_b_return_address = + kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_return_address")); + p.sym_b_frame_address = + kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_frame_address")); p.sym_b_memcpy = kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_memcpy")); p.sym_b_memmove = kit_sym_intern(p.pool->c, KIT_SLICE_LIT("__builtin_memmove")); diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c @@ -1712,6 +1712,7 @@ static int try_parse_builtin_call(Parser* p) { name != p->sym_b_ctzl && name != p->sym_b_ctzll && name != p->sym_b_clz && name != p->sym_b_clzl && name != p->sym_b_clzll && 
name != p->sym_b_trap && name != p->sym_b_unreachable && + name != p->sym_b_return_address && name != p->sym_b_frame_address && name != p->sym_b_expect && name != p->sym_b_offsetof && name != p->sym_b_va_start && name != p->sym_b_va_arg && name != p->sym_b_va_end && name != p->sym_b_va_copy && @@ -1790,6 +1791,20 @@ static int try_parse_builtin_call(Parser* p) { return 1; } + if (name == p->sym_b_return_address || name == p->sym_b_frame_address) { + /* GCC requires the level to be an integer constant expression. */ + int is_return = (name == p->sym_b_return_address); + i64 level = eval_const_int(p, p->cur.loc); + expect_punct(p, ')', + "')' after __builtin_return_address/__builtin_frame_address"); + if (level < 0) + perr(p, "__builtin_%s: level must be non-negative", + is_return ? "return_address" : "frame_address"); + pcg_set_loc(p, loc); + pcg_frame_or_return_address(p, is_return, (u32)level); + return 1; + } + if (name == p->sym_b_va_start) { parse_assign_expr(p); pcg_addr(p); diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h @@ -278,6 +278,8 @@ typedef struct Parser { Sym sym_b_va_arg; Sym sym_b_va_end; Sym sym_b_va_copy; + Sym sym_b_return_address; /* __builtin_return_address */ + Sym sym_b_frame_address; /* __builtin_frame_address */ Sym sym_attribute; Sym sym_volatile_alias; Sym sym_alignof_alias; diff --git a/lang/toy/builtins.c b/lang/toy/builtins.c @@ -419,6 +419,33 @@ KitCgTypeId toy_parse_builtin_call(ToyParser* p, KitSym name, int* recognized) { return ty; } + if (toy_sym_is(p, name, "frame_address") || + toy_sym_is(p, name, "return_address")) { + /* __builtin_frame_address / __builtin_return_address. The constant level + * rides as a single immediate operand; the result is a void*. */ + KitCgIntrinsic intrin = toy_sym_is(p, name, "return_address") + ? 
KIT_CG_INTRIN_RETURN_ADDRESS + : KIT_CG_INTRIN_FRAME_ADDRESS; + KitCgTypeId void_ptr = + kit_cg_type_ptr(p->c, toy_builtin_type(p, KIT_CG_BUILTIN_VOID), 0); + KitCgTypeId level_ty; + if (!toy_parser_expect(p, TOK_LPAREN)) return KIT_CG_TYPE_NONE; + level_ty = toy_parse_expr(p); + if (level_ty == KIT_CG_TYPE_NONE) return KIT_CG_TYPE_NONE; + if (!toy_type_is_intlike(p, level_ty)) { + toy_error(p, p->cur.loc, "frame/return address level must be integer"); + return KIT_CG_TYPE_NONE; + } + if (!toy_parser_expect(p, TOK_RPAREN)) { + toy_error(p, p->cur.loc, "expected ')'"); + return KIT_CG_TYPE_NONE; + } + if (!kit_cg_target_supports_intrinsic(p->c, intrin)) + return toy_unsupported_intrinsic(p); + kit_cg_intrinsic(p->cg, intrin, 1, void_ptr); + return void_ptr; + } + if (toy_sym_is(p, name, "expect")) { KitCgTypeId a, b; toy_parser_advance(p); diff --git a/mk/rt.mk b/mk/rt.mk @@ -231,6 +231,7 @@ RT_BASE_SRCS = \ rt/lib/stdio/printf.c \ rt/lib/atomic/atomic_freestanding.c \ rt/lib/cache/clear_cache.c \ + rt/lib/stack/backtrace.c \ rt/lib/kit/ifunc_init.c RT_COMPILER_SRCS = \ diff --git a/rt/include/kit/backtrace.h b/rt/include/kit/backtrace.h @@ -0,0 +1,37 @@ +/* kit/backtrace.h -- kit extension -- freestanding call-stack capture + * + * kit/backtrace.h is non-standard: C11 has no notion of inspecting the + * call stack. kit exposes a freestanding capture primitive so panic + * handlers, allocators, profilers, and unwind-free diagnostics can record + * a backtrace without libc, without DWARF, and without .eh_frame. + * + * How it works. kit maintains a frame pointer on every backend and never + * omits it, so each prologue stores a {saved_fp, saved_ra} record and + * anchors the frame pointer at it. __kit_backtrace walks that chain. The + * walk is target-independent: kit lays every frame record out as + * fp[0] = caller frame pointer, fp[1] = return address (in units of + * void*, so it scales to the target pointer width automatically). 
+ * + * Symbolization (turning a captured address into func at file:line) is a + * separate, hosted-side concern — feed the addresses to `kit addr2line`. + * See doc/plan/BACKTRACE.md. + */ +#ifndef KIT_BACKTRACE_H +#define KIT_BACKTRACE_H + +/* Fill buf[0..max) with return addresses, innermost first, and return the + * number written. The walk starts at __kit_backtrace's own frame, so with + * skip == 0 buf[0] is the return address into the direct caller of + * __kit_backtrace (the helper itself never appears — a return address always + * points into a caller). `skip` discards that many innermost return addresses + * first: a print wrapper passes skip >= 1 to drop its own frame from the + * trace. The walk stops at `max`, at a null return address (the zeroed stack + * origin), or at a null, misaligned, or non-increasing caller frame pointer + * (stack grows down) — ending at the outermost frame and bounding bad chains. + * + * Freestanding: a pure frame-pointer walk, no libc, no DWARF. Valid on every + * kit target that keeps a frame pointer (all native backends). Negative + * `skip` is treated as 0; a null `buf` or non-positive `max` returns 0. */ +int __kit_backtrace(void** buf, int max, int skip); + +#endif /* KIT_BACKTRACE_H */ diff --git a/rt/lib/stack/backtrace.c b/rt/lib/stack/backtrace.c @@ -0,0 +1,53 @@ +/* + * __kit_backtrace -- freestanding call-stack capture via the frame-pointer + * chain. See rt/include/kit/backtrace.h for the contract and doc/plan/ + * BACKTRACE.md for the design. + * + * kit keeps a frame pointer on every backend and never omits it, so each + * prologue stores {saved_fp, saved_ra} and anchors the frame pointer at the + * record.
The layout is uniform across all kit targets: + * + * fp[0] = caller's frame pointer (saved fp) + * fp[1] = return address (saved ra) + * + * Indexing a void** scales by the target pointer width, so the same two + * indices are correct on 32- and 64-bit targets alike — no per-arch offset + * table, no #ifdef cascade (per the RUNTIME.md "no target-dispatch ifdef" + * rule). __builtin_frame_address(0) marks __kit_backtrace as reading its own + * frame, which forces it to keep a valid record (it never degrades to a + * frameless leaf), so the seed frame is always walkable. + */ +#include <kit/backtrace.h> + +int __kit_backtrace(void** buf, int max, int skip) { + void** fp; + int n = 0; + if (!buf || max <= 0) return 0; + if (skip < 0) skip = 0; + + fp = (void**)__builtin_frame_address(0); + while (fp) { + void** next = (void**)fp[0]; /* saved caller frame pointer */ + void* ra = fp[1]; /* saved return address */ + + /* A real frame always has a non-null return address; a null one is the + * synthetic stack origin (_start / runtime entry zeroes the record), so + * stop without recording it. */ + if (!ra) break; + + if (skip > 0) { + --skip; + } else { + if (n >= max) break; + buf[n++] = ra; + } + + /* The stack grows down, so a valid caller frame sits strictly above this + * one. A null, non-increasing, or misaligned link is the chain terminator + * (or garbage) — stop, which also bounds a runaway walk. 
*/ + if (next <= fp) break; + if (((unsigned long)(void*)next & (sizeof(void*) - 1u)) != 0u) break; + fp = next; + } + return n; +} diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c @@ -220,6 +220,8 @@ static int aa64_supports_intrinsic(const Compiler* c, KitCgIntrinsic intrin) { case KIT_CG_INTRIN_IRQ_RESTORE: case KIT_CG_INTRIN_IRQ_ENABLE: case KIT_CG_INTRIN_IRQ_DISABLE: + case KIT_CG_INTRIN_FRAME_ADDRESS: + case KIT_CG_INTRIN_RETURN_ADDRESS: return 1; case KIT_CG_INTRIN_SETJMP: case KIT_CG_INTRIN_LONGJMP: diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -3612,6 +3612,24 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind, case INTRIN_IRQ_ENABLE: aa_emit32(t->mc, aa64_msr_daifclr(AA64_DAIF_ALL)); return; + case INTRIN_FRAME_ADDRESS: + case INTRIN_RETURN_ADDRESS: + /* Walk the AAPCS64 frame-record chain. Every kit prologue stores + * {x29, x30} and anchors x29 at the record: [x29] = caller's x29, + * [x29 + 8] = saved x30 (this frame's return address). The level is a + * compile-time constant, so the walk unrolls to `level` dependent loads. */ + if (ndst == 1u) { + u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) + ? (u32)args[0].v.imm + : 0u; + u32 rd = loc_reg(dsts[0]); + aa_emit32(t->mc, aa64_mov_reg(1, rd, AA_FP)); + for (u32 i = 0; i < level; ++i) + aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 0)); /* rd = *(rd) */ + if (kind == INTRIN_RETURN_ADDRESS) + aa_emit32(t->mc, aa64_ldr64_uimm12(rd, rd, 1)); /* rd = *(rd + 8) */ + } + return; default: aa_panic(aa_of(t), "unsupported compiler intrinsic"); } diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c @@ -2788,6 +2788,31 @@ void c_emit_intrinsic(CTarget* t, IntrinKind k, Operand* dsts, u32 ndst, cbuf_puts(&t->body, ");\n"); return; } + case INTRIN_FRAME_ADDRESS: + case INTRIN_RETURN_ADDRESS: { + /* Forward straight to the host compiler's builtin. dsts[0] is the void* + * result; args[0] is the constant level. 
The builtin requires a bare + * integer constant, so emit the level as a plain decimal (not via + * c_emit_operand, which wraps IMMs in a cast). */ + char nbuf[24]; + unsigned level = + (narg >= 1 && args[0].kind == OPK_IMM) ? (unsigned)args[0].v.imm : 0u; + if (ndst != 1) { + compiler_panic(t->c, loc, + "C target: frame/return address: expected 1 dst, got %u", + (unsigned)ndst); + } + snprintf(nbuf, sizeof nbuf, "%u", level); + c_ensure_local(t, dsts[0].v.local, dsts[0].type); + c_emit_local_assign_open(t, dsts[0].v.local, (KitCgTypeId)0); + cbuf_puts(&t->body, k == INTRIN_FRAME_ADDRESS + ? "__builtin_frame_address(" + : "__builtin_return_address("); + cbuf_puts(&t->body, nbuf); + cbuf_puts(&t->body, ")"); + c_emit_local_assign_close(t); + return; + } case INTRIN_NONE: default: compiler_panic(t->c, loc, "C target: intrinsic kind %d not handled", diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -97,6 +97,13 @@ typedef struct NativeKnownFrameDesc { * otherwise-leaf function. The single-pass and fat known-frame shapes always * save the return address and reserve their stack, so they are unaffected. */ u8 has_asm; + /* Whether the body reads its own frame-pointer chain via + * __builtin_frame_address / __builtin_return_address (INTRIN_FRAME_ADDRESS / + * INTRIN_RETURN_ADDRESS). Such a function must keep a valid frame record and + * frame pointer, so the frameless-leaf tier (rv64 slim_prologue, which emits + * no prologue and never anchors s0) must NOT fire. aa64/x64 keep the frame + * record in every prologue shape, so they ignore this flag. 
*/ + u8 reads_frame; } NativeKnownFrameDesc; typedef enum NativeAllocClass { diff --git a/src/arch/riscv/arch.c b/src/arch/riscv/arch.c @@ -393,6 +393,8 @@ static int rv64_supports_intrinsic(const Compiler* c, KitCgIntrinsic intrin) { case KIT_CG_INTRIN_DMB: case KIT_CG_INTRIN_DSB: case KIT_CG_INTRIN_WFI: + case KIT_CG_INTRIN_FRAME_ADDRESS: + case KIT_CG_INTRIN_RETURN_ADDRESS: return 1; case KIT_CG_INTRIN_SETJMP: case KIT_CG_INTRIN_LONGJMP: diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c @@ -1826,9 +1826,10 @@ static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, * clobber ra opaquely, and without the saved record the bare `ret` would * return through the destroyed link register. */ a->slim_prologue = frame && frame->is_leaf && !frame->has_asm && - a->frame.ncallee_saves == 0 && !a->frame.has_alloca && - a->frame.cum_off == 0 && a->frame.max_outgoing == 0 && - !a->has_sret && !a->is_variadic && + !frame->reads_frame && a->frame.ncallee_saves == 0 && + !a->frame.has_alloca && a->frame.cum_off == 0 && + a->frame.max_outgoing == 0 && !a->has_sret && + !a->is_variadic && rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0; if (a->slim_prologue) { a->minimal_prologue_words = 0; @@ -3308,6 +3309,27 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, case INTRIN_WFI: rv64_emit32(mc, rv_wfi()); return; + case INTRIN_FRAME_ADDRESS: + case INTRIN_RETURN_ADDRESS: + /* Walk the s0 frame-record chain. kit's RISC-V prologue anchors s0 at the + * saved pair: [s0] = caller's s0, [s0 + ptr_bytes] = saved ra (this + * frame's return address). NOTE: this differs from the psABI's + * ra@s0-8 / fp@s0-16 layout — kit stores the pair at and above s0. A + * function that reads its frame is forced off the frameless-leaf tier + * (see NativeKnownFrameDesc.reads_frame), so s0 is always valid here. The + * level is constant, so the walk unrolls to `level` dependent loads. 
*/ + if (ndst == 1u) { + u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) + ? (u32)args[0].v.imm + : 0u; + u32 rd = loc_reg(dsts[0]); + rv64_emit32(mc, rv_addi(rd, RV_S0, 0)); /* rd = s0 */ + for (u32 i = 0; i < level; ++i) + rv64_emit32(mc, rv_ld_ptr(v, rd, rd, 0)); /* rd = *(rd) */ + if (kind == INTRIN_RETURN_ADDRESS) + rv64_emit32(mc, rv_ld_ptr(v, rd, rd, (i32)v->ptr_bytes)); + } + return; default: break; } diff --git a/src/arch/wasm/arch.c b/src/arch/wasm/arch.c @@ -131,6 +131,9 @@ static int wasm_supports_intrinsic(const Compiler* c, KitCgIntrinsic intrin) { case KIT_CG_INTRIN_WFE: case KIT_CG_INTRIN_SEV: case KIT_CG_INTRIN_CORO_SWITCH: + /* wasm has no frame-pointer chain to walk. */ + case KIT_CG_INTRIN_FRAME_ADDRESS: + case KIT_CG_INTRIN_RETURN_ADDRESS: return 0; } return 0; diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c @@ -1629,6 +1629,10 @@ static const char* intrin_name(IntrinKind k) { return "irq_enable"; case INTRIN_IRQ_DISABLE: return "irq_disable"; + case INTRIN_FRAME_ADDRESS: + return "frame_address"; + case INTRIN_RETURN_ADDRESS: + return "return_address"; } return "<unknown>"; } @@ -1802,6 +1806,9 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst, case INTRIN_IRQ_RESTORE: case INTRIN_IRQ_ENABLE: case INTRIN_IRQ_DISABLE: + /* No frame-pointer chain in wasm; reported unsupported up front. 
*/ + case INTRIN_FRAME_ADDRESS: + case INTRIN_RETURN_ADDRESS: case INTRIN_NONE: break; } diff --git a/src/arch/x64/arch.c b/src/arch/x64/arch.c @@ -184,6 +184,8 @@ static int x64_supports_intrinsic(const Compiler* c, KitCgIntrinsic intrin) { case KIT_CG_INTRIN_DSB: case KIT_CG_INTRIN_IRQ_ENABLE: case KIT_CG_INTRIN_IRQ_DISABLE: + case KIT_CG_INTRIN_FRAME_ADDRESS: + case KIT_CG_INTRIN_RETURN_ADDRESS: return 1; case KIT_CG_INTRIN_SETJMP: case KIT_CG_INTRIN_LONGJMP: diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -3594,6 +3594,24 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind, } return; } + case INTRIN_FRAME_ADDRESS: + case INTRIN_RETURN_ADDRESS: + /* Walk the rbp frame-record chain. Every kit prologue keeps the rbp + * record: [rbp] = caller's rbp, [rbp + 8] = return address pushed by the + * `call`. The level is a compile-time constant, so the walk unrolls to + * `level` dependent loads. */ + if (ndst == 1u) { + u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) + ? 
(u32)args[0].v.imm + : 0u; + u32 rd = loc_reg(dsts[0]); + emit_mov_rr(mc, 1, rd, X64_RBP); + for (u32 i = 0; i < level; ++i) + emit_mov_load(mc, 8, 0, rd, rd, 0); /* rd = *(rd) */ + if (kind == INTRIN_RETURN_ADDRESS) + emit_mov_load(mc, 8, 0, rd, rd, 8); /* rd = *(rd + 8) */ + } + return; default: break; } diff --git a/src/cg/arith.c b/src/cg/arith.c @@ -1770,10 +1770,14 @@ static const IntrinDesc kIntrinTable[] = { [KIT_CG_INTRIN_WFE] = {INTRIN_WFE, "wfe", true, false}, [KIT_CG_INTRIN_SEV] = {INTRIN_SEV, "sev", true, false}, [KIT_CG_INTRIN_CORO_SWITCH] = {INTRIN_NONE, "coro_switch", false, false}, + [KIT_CG_INTRIN_FRAME_ADDRESS] = + {INTRIN_FRAME_ADDRESS, "frame_address", false, false}, + [KIT_CG_INTRIN_RETURN_ADDRESS] = + {INTRIN_RETURN_ADDRESS, "return_address", false, false}, }; _Static_assert(sizeof(kIntrinTable) / sizeof(kIntrinTable[0]) == - KIT_CG_INTRIN_CORO_SWITCH + 1, + KIT_CG_INTRIN_RETURN_ADDRESS + 1, "kIntrinTable must have exactly one row per KitCgIntrinsic"); /* Bounds-guarded row lookup: an out-of-range intrinsic falls back to the NONE @@ -1889,7 +1893,9 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs, (intrin == KIT_CG_INTRIN_EXPECT || intrin == KIT_CG_INTRIN_ASSUME_ALIGNED || intrin == KIT_CG_INTRIN_PREFETCH || intrin == KIT_CG_INTRIN_DMB || - intrin == KIT_CG_INTRIN_DSB)) { + intrin == KIT_CG_INTRIN_DSB || + intrin == KIT_CG_INTRIN_FRAME_ADDRESS || + intrin == KIT_CG_INTRIN_RETURN_ADDRESS)) { args[idx] = svs[idx].op; } else { args[idx] = api_force_local(g, &svs[idx], aty); diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h @@ -176,6 +176,14 @@ typedef enum IntrinKind { INTRIN_IRQ_RESTORE, INTRIN_IRQ_ENABLE, INTRIN_IRQ_DISABLE, + + /* frame-pointer-chain introspection — value-producing, single immediate + * operand (the constant level). args[0] is the level (OPK_IMM); dsts[0] is + * the void* result. 
Lowered as an unrolled FP walk; modeled as an ordinary + * frame-dependent memory read (IR_INTRINSIC is already conservatively + * side-effecting in opt, so it is never hoisted, CSE'd, or eliminated). */ + INTRIN_FRAME_ADDRESS, + INTRIN_RETURN_ADDRESS, } IntrinKind; typedef enum OpKind { diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -1387,6 +1387,7 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { u8 needs_scratch_spill = 0; u8 has_call = 0; u8 has_asm = 0; + u8 reads_frame = 0; u32 nasm_clob = 0; u32 asm_clobber_abi_sets = 0; Sym* asm_clobbers = NULL; @@ -1422,6 +1423,14 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { nasm_clob += aux->nclob; asm_clobber_abi_sets |= aux->clobber_abi_sets; } + } else if ((IROp)in->op == IR_INTRINSIC) { + /* __builtin_frame_address / __builtin_return_address read the frame + * record, so the function must keep one (disables the rv64 frameless + * leaf tier; see NativeKnownFrameDesc.reads_frame). */ + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (aux && (aux->kind == INTRIN_FRAME_ADDRESS || + aux->kind == INTRIN_RETURN_ADDRESS)) + reads_frame = 1; } } } @@ -1498,6 +1507,7 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { frame.needs_scratch_spill = needs_scratch_spill; frame.is_leaf = !has_call; frame.has_asm = has_asm; + frame.reads_frame = reads_frame; frame.asm_clobbers = asm_clobbers; frame.nasm_clobbers = nasm_clob; frame.asm_clobber_abi_sets = asm_clobber_abi_sets; diff --git a/test/parse/cases/builtin_29_return_address.c b/test/parse/cases/builtin_29_return_address.c @@ -0,0 +1,10 @@ +/* __builtin_return_address(0): the current function's return address, read + * from the spilled frame record. A real frame's return address is never null. */ +__attribute__((noinline)) static void* ra0(void) { + return __builtin_return_address(0); +} + +int test_main(void) { + void* r = ra0(); + return r != 0 ? 
42 : 1; +} diff --git a/test/parse/cases/builtin_29_return_address.expected b/test/parse/cases/builtin_29_return_address.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/builtin_29_return_address.wasm.skip b/test/parse/cases/builtin_29_return_address.wasm.skip @@ -0,0 +1 @@ +wasm has no frame-pointer chain (__builtin_frame/return_address unsupported) diff --git a/test/parse/cases/builtin_30_frame_address.c b/test/parse/cases/builtin_30_frame_address.c @@ -0,0 +1,15 @@ +/* __builtin_frame_address(0): the current frame pointer, never null. Level 1 + * names the caller's frame; since the stack grows down, the caller frame sits + * at a higher address than ours. */ +__attribute__((noinline)) static void* fa0(void) { + return __builtin_frame_address(0); +} + +int test_main(void) { + void* here = __builtin_frame_address(0); + void* callee = fa0(); + if (here == 0 || callee == 0) return 1; + /* fa0's frame is below test_main's (deeper call, lower address). */ + if (!((unsigned char*)callee < (unsigned char*)here)) return 2; + return 42; +} diff --git a/test/parse/cases/builtin_30_frame_address.expected b/test/parse/cases/builtin_30_frame_address.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/builtin_30_frame_address.wasm.skip b/test/parse/cases/builtin_30_frame_address.wasm.skip @@ -0,0 +1 @@ +wasm has no frame-pointer chain (__builtin_frame/return_address unsupported) diff --git a/test/parse/cases/builtin_31_return_address_anchor.c b/test/parse/cases/builtin_31_return_address_anchor.c @@ -0,0 +1,19 @@ +/* __builtin_return_address(0) anchor check: the return address a callee reads + * must land inside its caller. callee() is defined before test_main, so its + * code lies below test_main's; the return address into test_main therefore sits + * at or above test_main's entry and within its (small) body. Exercised across + * the parse lanes (host JIT + ELF roundtrip + qemu/podman exec) at O0 and O1. 
*/ +__attribute__((noinline)) static void* callee(void) { + return __builtin_return_address(0); +} + +int test_main(void) { + void* ra = callee(); + char* lo = (char*)(void*)test_main; /* caller's entry address */ + if (ra == 0) return 1; + /* The return address is just past the `call callee` inside test_main, so it + * is above the entry and well within the function's modest code size. */ + if ((char*)ra < lo) return 2; + if ((char*)ra >= lo + 4096) return 3; + return 42; +} diff --git a/test/parse/cases/builtin_31_return_address_anchor.expected b/test/parse/cases/builtin_31_return_address_anchor.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/builtin_31_return_address_anchor.wasm.skip b/test/parse/cases/builtin_31_return_address_anchor.wasm.skip @@ -0,0 +1 @@ +wasm has no frame-pointer chain (__builtin_return_address unsupported) diff --git a/test/parse/cases_err/builtin_return_address_nonconst.c b/test/parse/cases_err/builtin_return_address_nonconst.c @@ -0,0 +1,5 @@ +/* The level argument to __builtin_return_address must be an integer constant + * expression; a runtime value must be diagnosed. */ +void* f(int n) { return __builtin_return_address(n); } + +int test_main(void) { return 0; } diff --git a/test/parse/cases_err/builtin_return_address_nonconst.errpat b/test/parse/cases_err/builtin_return_address_nonconst.errpat @@ -0,0 +1 @@ +constant expression diff --git a/test/rt/cases/backtrace_capture.c b/test/rt/cases/backtrace_capture.c @@ -0,0 +1,64 @@ +/* __kit_backtrace capture test: a known-depth non-tail recursion, captured via + * the freestanding frame-pointer walk. Verifies depth, that every captured + * return address is non-null, that the recursive frames share a call site (so + * the walk really follows the chain), the `skip` and `max` bounds, then exits + * 42 on success. Runs under test/rt/run.sh across the aa64/x64/rv64 tuples. 
*/ +#include <kit/backtrace.h> + +#define DEPTH 6 + +/* Non-tail recursion (work after the call) so each level keeps a live frame, + * and noinline so the chain is real even if the harness opt level rises. The + * captures happen at the deepest frame, where at least DEPTH+1 frames exist — + * so the `max` bound is exercised against a stack deeper than the cap (the top + * of test_main only has a couple of frames above it). */ +__attribute__((noinline)) static int recurse(int n, void** buf, int cap, + int skip, int* out, int* capped) { + if (n > 0) { + int r = recurse(n - 1, buf, cap, skip, out, capped); + return r + 1; + } + *out = __kit_backtrace(buf, cap, skip); + if (capped) { + void* small[3]; + *capped = __kit_backtrace(small, 3, 0); /* deep stack, so this fills 3 */ + } + return n; +} + +int test_main(void) { + void* buf0[64]; + void* buf2[64]; + int n0 = 0, n2 = 0, capped = 0; + int i; + + recurse(DEPTH, buf0, 64, 0, &n0, &capped); + + /* The chain holds at least the recurse() frames plus test_main. */ + if (n0 < DEPTH + 1) return 1; + + /* A real frame never has a null return address (the walk stops at the + * synthetic stack origin before recording it). */ + for (i = 0; i < n0; i++) + if (buf0[i] == 0) return 2; + + /* buf0[1..DEPTH] are all the address after the single recursive call site, + * so consecutive recursive frames share a return address. */ + if (buf0[1] != buf0[2]) return 3; + + /* skip drops the innermost frames: skip=2 yields exactly two fewer entries + * and shifts the trace, so buf2[0] is the old buf0[2]. */ + recurse(DEPTH, buf2, 64, 2, &n2, (int*)0); + if (n2 != n0 - 2) return 4; + if (buf2[0] != buf0[2]) return 5; + + /* max is honored: against a stack deeper than the cap, exactly `max` are + * written (never more). */ + if (capped != 3) return 6; + + /* Degenerate inputs return 0, not a crash. 
*/ + if (__kit_backtrace(buf0, 0, 0) != 0) return 7; + if (__kit_backtrace((void**)0, 8, 0) != 0) return 8; + + return 42; +} diff --git a/test/toy/cases/154_frame_return_address.cbackend.skip b/test/toy/cases/154_frame_return_address.cbackend.skip @@ -0,0 +1 @@ +host cc rejects __builtin_return/frame_address with nonzero level under -Wframe-address -Werror diff --git a/test/toy/cases/154_frame_return_address.expected b/test/toy/cases/154_frame_return_address.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/154_frame_return_address.toy b/test/toy/cases/154_frame_return_address.toy @@ -0,0 +1,26 @@ +// @frame_address / @return_address: walk the frame-pointer chain at levels +// 0/1/2. The functions are @[.noinline] so the call chain is real at every opt +// level: inside deepest(), level 0/1/2 name deepest/middle/outer respectively. +// The stack grows down, so each caller frame sits at a higher address than its +// callee; return addresses are never null. Exits 42 on success. + +fn @[.noinline] deepest(): i64 { + let f0: usize = @frame_address(0) as usize; + let f1: usize = @frame_address(1) as usize; + let f2: usize = @frame_address(2) as usize; + let r0: usize = @return_address(0) as usize; + let r1: usize = @return_address(1) as usize; + if f0 == 0 { return 1; } + if r0 == 0 { return 2; } + if r1 == 0 { return 3; } + if f1 <= f0 { return 4; } // caller frame is higher (stack grows down) + if f2 <= f1 { return 5; } + return 42; +} + +fn @[.noinline] middle(): i64 { return deepest(); } +fn @[.noinline] outer(): i64 { return middle(); } + +fn __user_main(): i64 { return outer(); } + +fn main(): i32 { return __user_main() as i32; } diff --git a/test/toy/cases/154_frame_return_address.wasm.skip b/test/toy/cases/154_frame_return_address.wasm.skip @@ -0,0 +1 @@ +wasm has no frame-pointer chain (frame/return address unsupported)