kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 9c1a093280492ba7866eb3cebbb445c716da8ecb
parent 8dd63074edda0ee085f6ebe758e9f0e7a9739594
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  7 May 2026 14:10:48 -0700

stdcoro asymmetric coro_t

Diffstat:
M doc/builtins.md | 45 +++++++++++++++++++++++++++++----------------
M include/stdcoro.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M lib/README.md | 45 +++++++++++++++++++++++++++++++--------------
M lib/build.sh | 33 ++++++++++++++++++++-------------
M lib/coro/aarch64.c | 16 ++++++++--------
M lib/coro/arm32.c | 20 ++++++++++----------
M lib/coro/arm32_thumb1.c | 20 ++++++++++----------
A lib/coro/coro.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M lib/coro/i386.c | 18 +++++++++---------
M lib/coro/riscv32.c | 20 ++++++++++----------
M lib/coro/riscv64.c | 26 +++++++++++++-------------
M lib/coro/x86_64.c | 18 +++++++++---------
M lib/coro/x86_64_win.c | 18 +++++++++---------
M test/smoke.c | 20 ++++++++++++++------
14 files changed, 403 insertions(+), 160 deletions(-)

diff --git a/doc/builtins.md b/doc/builtins.md @@ -150,22 +150,35 @@ Always: - Compare: `__eq`, `__ne`, `__lt`, `__le`, `__gt`, `__ge`, `__unord` × `sf2`/`df2`/`tf2` ### Nonlocal jumps + stackful coroutines (per-arch, always shipped) -The `<setjmp.h>` and `<stdcoro.h>` primitives share one per-target context -struct: callee-saved GPRs + callee-saved FPRs + sp + return address. The -`jmp_buf` and `coro_ctx` typedefs are 256-byte aligned-16 storage; the -runtime reinterprets them as the per-arch struct. -- `setjmp`, `longjmp` — `<setjmp.h>` (C11 7.13). cfree extension: this - header is *not* in the C11 freestanding subset. -- `coro_init`, `coro_switch`, `__cfree_coro_trampoline` — `<stdcoro.h>` - (cfree-specific). `coro_switch(from, to, value) → uintptr_t` is the - one universal primitive; `setjmp` = save-and-return-0, - `longjmp` = restore-and-deliver-val. -- Implementations live one master `.c` per arch under `lib/coro/` - (file-scope asm + tiny C `coro_init`). ARM has two: `arm32.c` - (Thumb-2, ARMv7+, may use VFP `d8-d15`) and `arm32_thumb1.c` - (ARMv6-M, no IT blocks / no VFP / data-processing limited to - r0-r7). Not provided for: WASM (would need an Asyncify-fiber - port). +`<setjmp.h>` and `<stdcoro.h>` share one per-target context payload +(256 bytes, 16-byte aligned): callee-saved GPRs + callee-saved FPRs ++ sp + return address. `jmp_buf` and `coro_ctx` are both opaque +typedefs over that payload; the runtime reinterprets them as the +per-arch struct. + +- `setjmp`, `longjmp` — `<setjmp.h>` (C11 7.13). cfree extension: + this header is *not* in the C11 freestanding subset. +- `coro_init`, `coro_resume`, `coro_yield`, `coro_self` — public + asymmetric API in `<stdcoro.h>`. Resume drives a coroutine + forward; yield suspends back to the most recent resumer; resumes + nest like function calls. Status (`CORO_INIT` / `RUNNING` / + `SUSPENDED` / `DEAD`) is tracked on the `coro_t` and propagates + through `coro_resume`'s result. 
+- `__cfree_coro_switch(from, to, value) -> uintptr_t` — the symmetric + primitive. `coro_resume` / `coro_yield` are built on it; setjmp = + save+return-0, longjmp = restore+deliver-val. Exposed (with the + `__cfree_` prefix to signal "compiler-builtin-style") for + schedulers that don't fit the asymmetric resume-chain model. +- `__cfree_coro_ctx_init`, `__cfree_coro_trampoline` — internal, + used only by `lib/coro/coro.c`'s asymmetric layer. + +Implementation: one master `.c` per arch under `lib/coro/` (file-scope +asm + tiny C `__cfree_coro_ctx_init`), plus one arch-agnostic +`coro/coro.c` for the public asymmetric layer. ARM has two arch +masters: `arm32.c` (Thumb-2, ARMv7+, may use VFP `d8-d15`) and +`arm32_thumb1.c` (ARMv6-M, no IT blocks / no VFP / data-processing +limited to r0-r7). Not provided for: WASM (would need an +Asyncify-fiber port). ### Atomic fallbacks (only when target lacks native atomics for that width) - Generic: `__atomic_load`, `__atomic_store`, `__atomic_exchange`, `__atomic_compare_exchange` diff --git a/include/stdcoro.h b/include/stdcoro.h @@ -1,25 +1,40 @@ -/* stdcoro.h -- cfree extension -- stackful symmetric coroutines +/* stdcoro.h -- cfree extension -- stackful asymmetric coroutines * * stdcoro.h is non-standard: C11 has no stackful-coroutine facility. * cfree ships it as a native counterpart to <setjmp.h>: the underlying - * per-target context struct, save sequence, and restore sequence are - * literally shared with setjmp/longjmp -- only the entry shapes differ - * (setjmp = save+return-0; longjmp = restore+return-val; coro_switch = - * save(from)+restore(to)+deliver-value). Implementations live in - * libcfree_rt.a -- see doc/builtins.md. - * - * Programming model - * 1. Allocate a coro_ctx and a stack region. - * 2. coro_init(&ctx, stack_base, stack_len, entry). - * 3. coro_switch(&caller, &ctx, value) -- delivers `value` to entry's - * uintptr_t argument on first switch in. - * 4. 
Inside the coroutine, coro_switch(&ctx, &caller, value) yields - * back, with `value` becoming the caller's coro_switch return. - * 5. entry must NOT return; the trampoline traps if it does. - * - * coro_ctx is sized conservatively -- large enough for every cfree - * target's callee-saved registers + sp + ip + (where applicable) - * callee-saved FP regs. Layout is internal to the runtime. + * per-target context payload is literally shared with setjmp/longjmp + * (256 bytes, see doc/builtins.md), and the runtime is target-specific + * assembly in libcfree_rt.a. + * + * Two layers in this header: + * + * coro_ctx Raw register-context buffer used by the symmetric + * primitive __cfree_coro_switch. Most code does not + * touch it -- it is exposed for advanced schedulers + * (M:N, custom dispatch) that want the bare switch. + * + * coro_t Asymmetric coroutine handle. Resume drives forward, + * yield suspends back to the most recent resumer. + * Resumes nest like function calls. status is + * publicly readable; the rest is private storage. + * + * Programming model (asymmetric): + * 1. Allocate a coro_t and a stack region. + * 2. coro_init(&c, fn, stack_base, stack_len). + * 3. coro_resume(&c, value) drives c forward. + * 4. From inside fn, coro_yield(value) suspends back to the resumer. + * 5. fn's return value becomes the final coro_resume payload, with + * status CORO_DEAD; the runtime cleans up automatically. + * + * Threading. The runtime's "current coroutine" pointer and "main" + * register save slot are _Thread_local, so each thread has its own + * resume chain. A coroutine itself is still tied to the thread that + * drives it: errno, _Thread_local user state, and thread-affine OS + * handles silently rebind if a coroutine is resumed on a different + * thread, so don't migrate a suspended coroutine across threads. + * cfree's contract defines __STDC_NO_THREADS__ (no <threads.h>) -- + * _Thread_local is a separate C11 language feature and works + * independently. 
*/ #ifndef CFREE_STDCORO_H #define CFREE_STDCORO_H @@ -32,25 +47,84 @@ but 16 covers it). Caller stacks must be aligned to this. */ #define CORO_STACK_ALIGN 16 -/* 256 bytes is the largest per-target context across cfree's targets - (x86_64 Windows: 12 GPR slots + xmm6-15). Same byte payload as - <setjmp.h>'s jmp_buf -- the per-arch runtime reinterprets either - as the same internal struct. */ +/* Raw register-context buffer. 256 bytes, alignof 16. The runtime + reinterprets this as a per-target struct of callee-saved GPRs + + callee-saved FPRs + sp + return address. Exposed only because the + internal __cfree_coro_switch primitive at the bottom of this header + needs it as an argument type. coro_t below embeds one of these as + the first word of its private storage. */ typedef struct coro_ctx { _Alignas(16) unsigned char __cfree_storage[256]; } coro_ctx; -typedef void (*coro_entry_fn)(uintptr_t value); +/* ==================================================================== + * Asymmetric coroutine API. + * ==================================================================== */ + +typedef enum { + CORO_INIT, /* never resumed */ + CORO_RUNNING, /* on the live resume chain */ + CORO_SUSPENDED, /* yielded; resumable */ + CORO_DEAD, /* entry returned */ +} coro_status_t; + +typedef struct { + uintptr_t value; + coro_status_t status; +} coro_result_t; + +/* Coroutine entry point. The first coro_resume's value is passed as + `arg`. The return value is delivered as the final coro_resume's + payload, with status CORO_DEAD. */ +typedef uintptr_t (*coro_fn)(uintptr_t arg); -/* Initialize *ctx to begin executing entry(value) on first switch in, - using the stack region [stack_base, stack_base + stack_len). The - stack base must be CORO_STACK_ALIGN-aligned. entry must not return. */ -void coro_init(coro_ctx *ctx, - void *stack_base, size_t stack_len, - coro_entry_fn entry); +/* Coroutine handle. 
status is publicly readable; the private blob + carries the register context (256 B), a resumer pointer, and the + user-supplied entry fn. 288 B is comfortable headroom on both LP64 + and ILP32 (lib/coro/coro.c verifies the fit with a _Static_assert). */ +typedef struct coro { + coro_status_t status; + _Alignas(16) unsigned char __cfree_priv[288]; +} coro_t; -/* Save callee-saved state into *from, restore it from *to, deliver - `value` to *to. Returns the value passed by the next switch back. */ -uintptr_t coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value); +/* Initialize *c to run fn on [stack_base, stack_base + stack_len). + stack_base must be CORO_STACK_ALIGN-aligned. status becomes + CORO_INIT. The first coro_resume delivers its value as fn's arg. */ +void coro_init(coro_t *c, coro_fn fn, void *stack_base, size_t stack_len); + +/* Drive c forward. If c is INIT, calls fn(value) on c's stack. If + SUSPENDED, c's matching coro_yield call returns value. coro_resume + itself returns when c yields or its fn returns; the result carries + c's new status (SUSPENDED or DEAD) and the value c delivered. + UB if c is RUNNING or DEAD. */ +coro_result_t coro_resume(coro_t *c, uintptr_t value); + +/* Suspend the current coroutine, returning value to its resumer (the + matching coro_resume call returns this value). coro_yield itself + returns the value the next resumer passes. UB outside a coroutine. */ +uintptr_t coro_yield(uintptr_t value); + +/* The currently running coroutine, or NULL if not in one. */ +coro_t *coro_self(void); + +static inline coro_status_t coro_status(const coro_t *c) { return c->status; } + +/* ==================================================================== + * Symmetric primitive (compiler-builtin-style; for advanced schedulers). + * + * Saves callee-saved state into *from, restores it from *to, and + * delivers `value` to *to as the return of its prior switch (or as + * the first-arg register of *to's trampoline on a fresh context). 
+ * Returns the value passed by the next switch back to *from. + * + * coro_resume / coro_yield are built on this. Most code should not + * call it directly; it is exposed for schedulers that don't fit the + * asymmetric resume-chain model (M:N runtimes, work-stealing, etc.). + * + * Bypassing the asymmetric layer means losing coro_self / status + * tracking / DEAD propagation -- the symmetric primitive is purely + * a register-shuffle and knows nothing about coro_t. + * ==================================================================== */ +uintptr_t __cfree_coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value); #endif diff --git a/lib/README.md b/lib/README.md @@ -33,7 +33,8 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired. | `riscv/rv64.S` | `__riscv_save_*` + `__riscv_restore_*` (rv64) | RISC-V rv64 with `-msave-restore` | | `mem/mem.c` | `memcpy` / `memmove` / `memset` / `memcmp` (weak) | All; user libc overrides | | `atomic/atomic_freestanding.c` | `__atomic_*` fallback shim | All | -| `coro/<arch>.c` | `setjmp` / `longjmp` (`<setjmp.h>`) + `coro_init` / `coro_switch` / `__cfree_coro_trampoline` (`<stdcoro.h>`) | One of `aarch64`, `arm32`, `arm32_thumb1`, `i386`, `riscv32`, `riscv64`, `x86_64`, `x86_64_win`. Not built for `wasm32`. | +| `coro/<arch>.c` | Per-arch primitives: `setjmp` / `longjmp` (`<setjmp.h>`) + `__cfree_coro_ctx_init` / `__cfree_coro_switch` / `__cfree_coro_trampoline` (internal; the public `<stdcoro.h>` API sits on top via `coro/coro.c`) | One of `aarch64`, `arm32`, `arm32_thumb1`, `i386`, `riscv32`, `riscv64`, `x86_64`, `x86_64_win`. Not built for `wasm32`. | +| `coro/coro.c` | Arch-agnostic asymmetric layer: `coro_init` / `coro_resume` / `coro_yield` / `coro_self` (`<stdcoro.h>`) | All variants that ship a `coro/<arch>.c`. | ### Build-time include dirs (consumed by the masters; nothing here lands in `libcfree_rt.a`) @@ -142,27 +143,43 @@ Hand-written portable C (not from compiler-rt). 
All four functions are weak so a user libc, or a tuned arch-specific replacement, wins at link time. `arm/aeabi_thumb{1,2}.S`'s `aeabi_mem*` symbols forward to these. -### `coro/<arch>.c` -One master `.c` per arch that supplies both `<setjmp.h>` (`setjmp`, -`longjmp`) and `<stdcoro.h>` (`coro_init`, `coro_switch`, -`__cfree_coro_trampoline`). The setjmp/longjmp/coro_switch primitives -share a per-arch struct (callee-saved GPRs + callee-saved FPRs + sp + -return address) and one pair of C string-concat macros -`SAVE_INTO(reg)` / `RESTORE_FROM(reg)` so the same instruction bytes -are emitted in all three places. Written as file-scope `__asm__` -inside a `.c` file (not a separate `.S`) so the asm and the tiny -`coro_init` C function stay co-located. Symbol naming uses -`__USER_LABEL_PREFIX__` so the same source compiles for ELF / Mach-O / -COFF. +### `coro/<arch>.c` + `coro/coro.c` +The coro module ships in two layers: + +**`coro/<arch>.c`** (one per arch) — per-target primitives, file-scope +`__asm__` inside a `.c` file (not a separate `.S`) so the tiny C +`__cfree_coro_ctx_init` and the asm save/restore stay co-located. +Provides: + +- `setjmp` / `longjmp` (public, `<setjmp.h>`). +- `__cfree_coro_switch(from, to, value)` — symmetric register switch, + exposed in `<stdcoro.h>` as a compiler-builtin-style primitive for + advanced schedulers; the asymmetric layer below also uses it. +- `__cfree_coro_ctx_init` / `__cfree_coro_trampoline` — internal. + +The three primitives that need register save/restore (setjmp, +longjmp, `__cfree_coro_switch`) share one pair of C string-concat +macros `SAVE_INTO(reg)` / `RESTORE_FROM(reg)` so the same instruction +bytes are emitted in all three places. Symbol naming uses +`__USER_LABEL_PREFIX__` so the same source compiles for ELF / Mach-O +/ COFF. 
ARM ships two variants: `arm32.c` (Thumb-2, ARMv7+, optional VFP `d8-d15` gated on `__ARM_FP`) and `arm32_thumb1.c` (ARMv6-M / Cortex-M0/M0+; no IT blocks, no VFP, data-processing restricted to -r0-r7, no `str sp` / `str rN, [sp,...]` -- the asm sequences don't +r0-r7, no `str sp` / `str rN, [sp,...]` — the asm sequences don't share with arm32.c so it's a separate file). Not provided for `wasm32` (would need an Asyncify-fiber port). +**`coro/coro.c`** (arch-agnostic) — the public asymmetric API: +`coro_init` / `coro_resume` / `coro_yield` / `coro_self`. Tracks the +current coroutine in a static, threads each `coro_t`'s resumer slot +through the resume chain, and dispatches the per-arch trampoline via +a thunk that runs the user's `coro_fn`, marks the coroutine +`CORO_DEAD`, and switches back to the resumer. Built once per coro +variant and linked alongside the per-arch master. + ### `atomic/atomic_freestanding.c` Defines a pointer-sized `_Atomic(uintptr_t)` spinlock as the lock primitive (no OS dependency) then `#include`s `atomic_common.inc`, which contains the diff --git a/lib/build.sh b/lib/build.sh @@ -93,7 +93,14 @@ ARM_AEABI_THUMB1="arm/aeabi_thumb1.S arm/aeabi.c" RV32_SR="riscv/rv32.S" RV64_SR="riscv/rv64.S" -# Coro + setjmp/longjmp: one master .c per arch, file-scope asm inside. +# Coro + setjmp/longjmp: +# coro/coro.c -- arch-agnostic asymmetric layer (coro_init, +# coro_resume, coro_yield, coro_self). +# coro/<arch>.c -- per-arch primitives (setjmp / longjmp / +# __cfree_coro_ctx_init / __cfree_coro_switch / +# __cfree_coro_trampoline). +# Every variant that ships coro picks one <arch>.c plus the common file. 
+CORO_COMMON="coro/coro.c" CORO_X86_64="coro/x86_64.c" CORO_X86_64_WIN="coro/x86_64_win.c" CORO_I386="coro/i386.c" @@ -118,11 +125,11 @@ CORO_INC="-I../include" build_variant x86_64-linux \ "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ - "$LP64_BASE $CORO_X86_64" + "$LP64_BASE $CORO_X86_64 $CORO_COMMON" build_variant x86_64-apple-darwin \ "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ - "$LP64_BASE $CORO_X86_64" + "$LP64_BASE $CORO_X86_64 $CORO_COMMON" # aarch64-linux: long double is binary128; needs fp_tf + fp_ti and the # tf_supplement.h pre-include. @@ -130,33 +137,33 @@ build_variant aarch64-linux \ "--target=aarch64-linux-gnu \ -Iinclude/lp64_le_ldbl128 -Iinclude/lp64_le -DHAS_INT128=1 \ -include include/lp64_le_ldbl128/tf_supplement.h $CORO_INC" \ - "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C $CORO_AARCH64" + "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C $CORO_AARCH64 $CORO_COMMON" build_variant aarch64-apple-darwin \ "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ - "$LP64_BASE $CORO_AARCH64" + "$LP64_BASE $CORO_AARCH64 $CORO_COMMON" build_variant riscv64-elf \ "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd \ -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ - "$LP64_BASE $CORO_RV64" + "$LP64_BASE $CORO_RV64 $CORO_COMMON" build_variant riscv64-elf-save-restore \ "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd -msave-restore \ -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ - "$LP64_BASE $RV64_SR $CORO_RV64" + "$LP64_BASE $RV64_SR $CORO_RV64 $CORO_COMMON" # ---- LLP64 little-endian (Win64) -------------------------------------------- build_variant x86_64-pc-windows \ "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1 $CORO_INC" \ - "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C $CORO_X86_64_WIN" + "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C $CORO_X86_64_WIN $CORO_COMMON" # ---- ILP32 little-endian 
----------------------------------------------------- ILP32_BASE="$INT_C $INT32_C $FP_C $MEM_C $ATOMIC_C" build_variant i386-linux \ "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ - "$ILP32_BASE $CORO_I386" + "$ILP32_BASE $CORO_I386 $CORO_COMMON" # wasm32: no setjmp/coro impl yet -- Emscripten fibers / sjlj are a # separate runtime model that hasn't been ported to cfree. @@ -167,17 +174,17 @@ build_variant wasm32 \ build_variant riscv32-elf \ "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd \ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ - "$ILP32_BASE $CORO_RV32" + "$ILP32_BASE $CORO_RV32 $CORO_COMMON" build_variant riscv32-elf-save-restore \ "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd -msave-restore \ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ - "$ILP32_BASE $RV32_SR $CORO_RV32" + "$ILP32_BASE $RV32_SR $CORO_RV32 $CORO_COMMON" build_variant arm-eabi-thumb2 \ "--target=arm-none-eabi -march=armv7-a -mthumb -mfloat-abi=soft \ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ - "$ILP32_BASE $ARM_AEABI_THUMB2 $CORO_ARM32" + "$ILP32_BASE $ARM_AEABI_THUMB2 $CORO_ARM32 $CORO_COMMON" # arm-eabi-thumb1 (Cortex-M0/M0+, ARMv6-M): Thumb-1 ISA, no IT blocks, # data-processing ops restricted to r0-r7, no VFP. 
Coro impl is a @@ -185,7 +192,7 @@ build_variant arm-eabi-thumb2 \ build_variant arm-eabi-thumb1 \ "--target=arm-none-eabi -march=armv6-m -mthumb -mfloat-abi=soft \ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ - "$ILP32_BASE $ARM_AEABI_THUMB1 $CORO_ARM32_THUMB1" + "$ILP32_BASE $ARM_AEABI_THUMB1 $CORO_ARM32_THUMB1 $CORO_COMMON" #------------------------------------------------------------------------------- echo diff --git a/lib/coro/aarch64.c b/lib/coro/aarch64.c @@ -1,7 +1,7 @@ /* * lib/coro/aarch64.c -- AArch64 (AAPCS) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * All three primitives sit on one per-target context layout: * @@ -15,7 +15,7 @@ * 256-byte storage carved out by jmp_buf and coro_ctx. * * SAVE_/RESTORE_ are C string-concat macros so the same byte - * sequence is emitted in setjmp, longjmp, and coro_switch without + * sequence is emitted in setjmp, longjmp, and __cfree_coro_switch without * any duplication or gas-specific .macro tricks. * * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C @@ -42,9 +42,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm64_ctx), "align extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_arm64_ctx *c = (struct __cfree_arm64_ctx *)ctx; /* AArch64 stacks grow down; align top to 16. */ @@ -116,19 +116,19 @@ __asm__ ( " csinc x0, x1, xzr, ne\n" " ret\n" - /* coro_switch(from, to, value) -- x0, x1, x2. Save into [x0], + /* __cfree_coro_switch(from, to, value) -- x0, x1, x2. Save into [x0], restore from [x1], deliver x2 in x0 (which is both the return register here and the first-arg register the trampoline reads on a fresh context's first run). 
*/ - ".globl " SYM(coro_switch) "\n" - SYM(coro_switch) ":\n" + ".globl " SYM(__cfree_coro_switch) "\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("x0") RESTORE_FROM("x1") " mov x0, x2\n" " ret\n" /* __cfree_coro_trampoline -- on first entry x0 = value (delivered), - x19 = entry fn (set by coro_init), sp aligned to 16. brk if entry + x19 = entry fn (set by __cfree_coro_ctx_init), sp aligned to 16. brk if entry returns. */ ".globl " SYM(__cfree_coro_trampoline) "\n" SYM(__cfree_coro_trampoline) ":\n" diff --git a/lib/coro/arm32.c b/lib/coro/arm32.c @@ -1,7 +1,7 @@ /* * lib/coro/arm32.c -- ARM32 Thumb-2 (AAPCS) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * All three primitives sit on one per-target context layout: * @@ -20,7 +20,7 @@ * storage carved out by jmp_buf and coro_ctx. * * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence - * is emitted in setjmp, longjmp, and coro_switch. The VFP half is + * is emitted in setjmp, longjmp, and __cfree_coro_switch. The VFP half is * gated by a C-level #ifdef on __ARM_FP -- the cpp pass picks one * macro body before the assembler sees anything, so we can't hide * `#ifdef` inside the asm string. @@ -49,9 +49,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_ctx), "align extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_arm32_ctx *c = (struct __cfree_arm32_ctx *)ctx; /* ARM32 stacks grow down; align top to 16 (AAPCS public-boundary @@ -173,23 +173,23 @@ __asm__ ( " mov r0, r1\n" " bx lr\n" - /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value. + /* __cfree_coro_switch(from, to, value) -- r0=from, r1=to, r2=value. Save into [r0], restore from [r1], deliver r2 in r0. 
The lr loaded by RESTORE_FROM is either a real return address (a previously-suspended coro) or __cfree_coro_trampoline (a fresh - coro initialized by coro_init). Either way `bx lr` lands there + coro initialized by __cfree_coro_ctx_init). Either way `bx lr` lands there with r0 holding `value`. */ - ".globl " SYM(coro_switch) "\n" + ".globl " SYM(__cfree_coro_switch) "\n" ".thumb_func\n" - ".type " SYM(coro_switch) ", %function\n" - SYM(coro_switch) ":\n" + ".type " SYM(__cfree_coro_switch) ", %function\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("r0") RESTORE_FROM("r1") " mov r0, r2\n" " bx lr\n" /* __cfree_coro_trampoline -- on first entry r0 = value (delivered - by coro_switch's `mov r0, r2`), r4 = entry fn (set by coro_init), + by __cfree_coro_switch's `mov r0, r2`), r4 = entry fn (set by __cfree_coro_ctx_init), sp aligned to 16. udf if entry returns. */ ".globl " SYM(__cfree_coro_trampoline) "\n" ".thumb_func\n" diff --git a/lib/coro/arm32_thumb1.c b/lib/coro/arm32_thumb1.c @@ -1,7 +1,7 @@ /* * lib/coro/arm32_thumb1.c -- ARMv6-M (Cortex-M0 / M0+, Thumb-1) impls of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * Thumb-1 / ARMv6-M is a strict subset of the Thumb-2 ISA used by the * sibling arm32.c, and several conveniences disappear: @@ -21,7 +21,7 @@ * jmp_buf and coro_ctx. * * SAVE_INTO uses r4-r7 as scratches *after* they have themselves been - * stored, so r0-r3 are never clobbered. That matters for coro_switch: + * stored, so r0-r3 are never clobbered. That matters for __cfree_coro_switch: * `to` (r1) and `value` (r2) survive across the save half and are still * live for the restore half / value delivery. 
*/ @@ -43,9 +43,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_thumb1_ctx)," extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_arm32_thumb1_ctx *c = (struct __cfree_arm32_thumb1_ctx *)ctx; /* ARM stacks grow down; align top to 16 (AAPCS public-boundary @@ -144,24 +144,24 @@ __asm__ ( " movs r0, r1\n" " bx lr\n" - /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value. + /* __cfree_coro_switch(from, to, value) -- r0=from, r1=to, r2=value. SAVE_INTO leaves r0-r3 untouched, so r1 (to) and r2 (value) are still live. RESTORE_FROM clobbers r4-r7 freely (they belong to the resumed coro). The lr loaded by RESTORE_FROM is either a real return address (a previously-suspended coro) or - __cfree_coro_trampoline (a fresh coro initialized by coro_init); + __cfree_coro_trampoline (a fresh coro initialized by __cfree_coro_ctx_init); either way `bx lr` lands there with r0 holding `value`. */ - ".globl " SYM(coro_switch) "\n" + ".globl " SYM(__cfree_coro_switch) "\n" ".thumb_func\n" - ".type " SYM(coro_switch) ", %function\n" - SYM(coro_switch) ":\n" + ".type " SYM(__cfree_coro_switch) ", %function\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("r0") RESTORE_FROM("r1") " movs r0, r2\n" " bx lr\n" /* __cfree_coro_trampoline -- on first entry r0 = value (delivered - by coro_switch's `movs r0, r2`), r4 = entry fn (set by coro_init), + by __cfree_coro_switch's `movs r0, r2`), r4 = entry fn (set by __cfree_coro_ctx_init), sp aligned to 16. UDF #0 (T1, ARMv6-M) traps if entry returns. */ ".globl " SYM(__cfree_coro_trampoline) "\n" ".thumb_func\n" diff --git a/lib/coro/coro.c b/lib/coro/coro.c @@ -0,0 +1,124 @@ +/* + * lib/coro/coro.c -- asymmetric coroutine layer for <stdcoro.h>. 
+ * + * Sits on top of the per-arch __cfree_coro_switch / __cfree_coro_ctx_init + * primitives (one of lib/coro/<arch>.c) and supplies the public + * coro_init / coro_resume / coro_yield / coro_self surface. + * + * Layout of coro_t.__cfree_priv: + * + * offset 0: coro_ctx ctx + * offset 256: coro_t *resumer + * offset 256 + sizeof(void*): coro_fn user_fn + * + * Total = 256 + 2 * sizeof(void*) bytes (272 LP64 / 264 ILP32). The + * header reserves 288 -- comfortable headroom either way. The + * _Static_assert below pins the fit. + * + * Resume chain. coro_resume saves the previous "current coroutine" + * pointer (NULL means "no coroutine; main thread") into the resumed + * coroutine's resumer slot, switches in, and on return flips the + * pointer back. coro_yield reads its own resumer slot and switches + * there. The result is a stack of resumers; resumes nest like calls. + * + * Per-thread scheduler state. __cfree_current and __cfree_main_ctx + * are _Thread_local: each thread that drives coroutines gets its own + * resume chain and its own "main" save slot. cfree's contract still + * defines __STDC_NO_THREADS__ (no <threads.h>), but _Thread_local is + * a C11 language feature independent of that library, so this is + * fine -- on hosted targets it Just Works, and bare-metal toolchains + * that don't link a TLS runtime fall through to per-image storage, + * which collapses to single-thread semantics. + * + * "Main" thread context. coro_resume needs a coro_ctx to save the + * caller's regs into; if the caller is itself a coroutine we use its + * ctx, otherwise the per-thread __cfree_main_ctx. The "main" slot is + * only ever touched on the resume/yield boundary -- it lives outside + * any coroutine's lifecycle. 
+ */ + +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +typedef struct { + coro_ctx ctx; + coro_t *resumer; + coro_fn user_fn; +} __cfree_coro_priv_t; + +_Static_assert(sizeof(__cfree_coro_priv_t) <= sizeof(((coro_t *)0)->__cfree_priv), + "priv blob fits in coro_t reservation"); +_Static_assert(_Alignof(__cfree_coro_priv_t) <= _Alignof(coro_t), + "priv blob alignment fits coro_t"); + +/* Per-arch primitives (declared here, defined in lib/coro/<arch>.c). */ +extern uintptr_t __cfree_coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value); +extern void __cfree_coro_ctx_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + void (*entry)(uintptr_t)); + +/* Per-thread scheduler state. */ +static _Thread_local coro_t *__cfree_current = NULL; +static _Thread_local coro_ctx __cfree_main_ctx; + +static inline __cfree_coro_priv_t *__priv(coro_t *c) { + return (__cfree_coro_priv_t *)c->__cfree_priv; +} +static inline coro_ctx *__ctx_of(coro_t *c) { + return &__priv(c)->ctx; +} +static inline coro_ctx *__resumer_ctx(coro_t *c) { + coro_t *r = __priv(c)->resumer; + return r ? __ctx_of(r) : &__cfree_main_ctx; +} + +/* Trampoline-side thunk. Each per-arch trampoline calls this with the + uintptr_t delivered by the first __cfree_coro_switch into the fresh + context. The thunk dispatches to user_fn, then performs the + "DEAD + switch back to resumer" handoff so the symmetric primitive + doesn't need to know about coro_t lifecycle. 
*/ +static void __cfree_coro_thunk(uintptr_t value) { + coro_t *self = __cfree_current; + uintptr_t retval = __priv(self)->user_fn(value); + + self->status = CORO_DEAD; + __cfree_coro_switch(__ctx_of(self), __resumer_ctx(self), retval); + __builtin_unreachable(); +} + +void coro_init(coro_t *c, coro_fn fn, void *stack_base, size_t stack_len) { + __cfree_coro_priv_t *p = __priv(c); + c->status = CORO_INIT; + p->resumer = NULL; + p->user_fn = fn; + __cfree_coro_ctx_init(&p->ctx, stack_base, stack_len, __cfree_coro_thunk); +} + +coro_result_t coro_resume(coro_t *c, uintptr_t value) { + coro_t *prev = __cfree_current; + coro_ctx *prev_ctx = prev ? __ctx_of(prev) : &__cfree_main_ctx; + + __priv(c)->resumer = prev; + __cfree_current = c; + c->status = CORO_RUNNING; + + uintptr_t v = __cfree_coro_switch(prev_ctx, __ctx_of(c), value); + + /* c either yielded (status set to CORO_SUSPENDED by coro_yield) + or finished (status set to CORO_DEAD by the thunk). */ + __cfree_current = prev; + return (coro_result_t){ .value = v, .status = c->status }; +} + +uintptr_t coro_yield(uintptr_t value) { + coro_t *self = __cfree_current; + self->status = CORO_SUSPENDED; + /* When the resumer next coro_resumes us, it sets status back to + CORO_RUNNING before the switch -- so our caller sees RUNNING. */ + return __cfree_coro_switch(__ctx_of(self), __resumer_ctx(self), value); +} + +coro_t *coro_self(void) { + return __cfree_current; +} diff --git a/lib/coro/i386.c b/lib/coro/i386.c @@ -1,7 +1,7 @@ /* * lib/coro/i386.c -- i386 System V (cdecl, ILP32) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * cdecl callee-saved set: ebx, esi, edi, ebp, esp. 
Args are pushed * right-to-left on the stack: at function entry, 4(%esp)=arg0, @@ -20,11 +20,11 @@ * * setjmp(env) 4(%esp)=env * longjmp(env, val) 4(%esp)=env, 8(%esp)=val - * coro_switch(f, t, val) 4(%esp)=from, 8(%esp)=to, 12(%esp)=value + * __cfree_coro_switch(f, t, val) 4(%esp)=from, 8(%esp)=to, 12(%esp)=value * * The "save esp/eip" trick: at function entry, (%esp) holds the caller's * return address (just pushed by `call`); 4(%esp) is the caller's - * pre-call esp. Saving those two lets longjmp/coro_switch "land" at the + * pre-call esp. Saving those two lets longjmp/__cfree_coro_switch "land" at the * call site exactly as if the function had returned. * * Modern SysV i386 (ABI rev 1.1+) requires 16-byte stack alignment @@ -49,9 +49,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_i386_ctx), "align c extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_i386_ctx *c = (struct __cfree_i386_ctx *)ctx; /* i386 stacks grow down; align top to 16. */ @@ -117,10 +117,10 @@ __asm__ ( RESTORE_FROM("%edx") " jmp *%ecx\n" - /* coro_switch(from, to, value) -- 4(%esp)=from, 8(%esp)=to, 12(%esp)=value. + /* __cfree_coro_switch(from, to, value) -- 4(%esp)=from, 8(%esp)=to, 12(%esp)=value. Read all three args before SAVE_INTO clobbers the stack frame. */ - ".globl " SYM(coro_switch) "\n" - SYM(coro_switch) ":\n" + ".globl " SYM(__cfree_coro_switch) "\n" + SYM(__cfree_coro_switch) ":\n" " movl 4(%esp), %edx\n" /* from */ SAVE_INTO("%edx") " movl 8(%esp), %edx\n" /* to (re-read; SAVE clobbered %eax not stack) */ @@ -129,7 +129,7 @@ __asm__ ( " jmp *%ecx\n" /* __cfree_coro_trampoline -- on first entry: %eax=value, %ebx=entry, - %esp=stack_top (no return addr pushed -- coro_switch reaches here + %esp=stack_top (no return addr pushed -- __cfree_coro_switch reaches here via jmp). 
cdecl needs the arg pushed; align defensively, then reserve 12 bytes + push value so that after the upcoming `call` pushes the 4-byte return addr, the callee sees %esp+4 16-aligned. */ diff --git a/lib/coro/riscv32.c b/lib/coro/riscv32.c @@ -1,7 +1,7 @@ /* * lib/coro/riscv32.c -- RISC-V 32-bit (ILP32/ILP32F/ILP32D) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * Per-target context layout (matches xOS rv32 tick_coro_ctx): * @@ -22,7 +22,7 @@ * and coro_ctx. * * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence - * is emitted in setjmp, longjmp, and coro_switch without duplication. + * is emitted in setjmp, longjmp, and __cfree_coro_switch without duplication. */ #include <setjmp.h> @@ -44,9 +44,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv32_ctx), "alig extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_riscv32_ctx *c = (struct __cfree_riscv32_ctx *)ctx; /* RISC-V stacks grow down; align top to 16. */ @@ -192,21 +192,21 @@ __asm__ ( " ret\n" ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n" - /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value. + /* __cfree_coro_switch(from, to, value) -- a0=from, a1=to, a2=value. Save into [a0], restore from [a1], deliver a2 in a0 (which is both the return register and the trampoline's first-arg reg on a fresh context's first run). 
*/ - ".globl " SYM(coro_switch) "\n" - ".type " SYM(coro_switch) ", @function\n" - SYM(coro_switch) ":\n" + ".globl " SYM(__cfree_coro_switch) "\n" + ".type " SYM(__cfree_coro_switch) ", @function\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("a0") RESTORE_FROM("a1") " mv a0, a2\n" " ret\n" - ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n" + ".size " SYM(__cfree_coro_switch) ", .-" SYM(__cfree_coro_switch) "\n" /* __cfree_coro_trampoline -- on first entry: a0=value (delivered - by coro_switch's `mv a0, a2`), s0=entry (set by coro_init via + by __cfree_coro_switch's `mv a0, a2`), s0=entry (set by __cfree_coro_ctx_init via regs[2]), sp=stack_top. ebreak if entry returns. */ ".globl " SYM(__cfree_coro_trampoline) "\n" ".type " SYM(__cfree_coro_trampoline) ", @function\n" diff --git a/lib/coro/riscv64.c b/lib/coro/riscv64.c @@ -1,10 +1,10 @@ /* * lib/coro/riscv64.c -- RISC-V 64-bit (LP64D) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * RISC-V LP64D callee-saved set: - * ra (x1) -- saved manually so longjmp/coro_switch can + * ra (x1) -- saved manually so longjmp/__cfree_coro_switch can * "return" to the original call site * sp (x2) * s0-s11 (x8-x9, x18-x27) @@ -22,17 +22,17 @@ * * setjmp(env) a0=env * longjmp(env, val) a0=env, a1=val - * coro_switch(f, t, val) a0=from, a1=to, a2=val + * __cfree_coro_switch(f, t, val) a0=from, a1=to, a2=val * * Value-passing trick: the destination context "returns" via * ld ra, 0(a1); ... ret * where `ret` is `jalr x0, 0(ra)`. By moving the value into a0 just * before `ret`, both a fresh trampoline (entry(value)) and a previously - * suspended coro_switch (= the value its switch call returned) see it + * suspended __cfree_coro_switch (= the value its switch call returned) see it * as the a0 return register. 
* * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence - * is emitted in setjmp, longjmp, and coro_switch without duplication. + * is emitted in setjmp, longjmp, and __cfree_coro_switch without duplication. * * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C * compiler's call-site mangling (empty on RISC-V ELF). @@ -57,9 +57,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv64_ctx), "alig extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_riscv64_ctx *c = (struct __cfree_riscv64_ctx *)ctx; /* RISC-V stacks grow down; align top to 16. */ @@ -166,21 +166,21 @@ __asm__ ( " ret\n" ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n" - /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value. + /* __cfree_coro_switch(from, to, value) -- a0=from, a1=to, a2=value. Save into [a0], restore from [a1] (which clobbers a0 and a1's roles -- ra/sp/s* are loaded from the to-context), then deliver value in a0 just before ret. */ - ".globl " SYM(coro_switch) "\n" - ".type " SYM(coro_switch) ", @function\n" - SYM(coro_switch) ":\n" + ".globl " SYM(__cfree_coro_switch) "\n" + ".type " SYM(__cfree_coro_switch) ", @function\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("a0") RESTORE_FROM("a1") " mv a0, a2\n" " ret\n" - ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n" + ".size " SYM(__cfree_coro_switch) ", .-" SYM(__cfree_coro_switch) "\n" /* __cfree_coro_trampoline -- on first entry: a0=value (delivered), - s0=entry fn (set by coro_init), sp aligned to 16. ebreak if entry + s0=entry fn (set by __cfree_coro_ctx_init), sp aligned to 16. ebreak if entry returns. 
*/ ".globl " SYM(__cfree_coro_trampoline) "\n" ".type " SYM(__cfree_coro_trampoline) ", @function\n" diff --git a/lib/coro/x86_64.c b/lib/coro/x86_64.c @@ -1,7 +1,7 @@ /* * lib/coro/x86_64.c -- x86_64 System V ABI implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * Callee-saved set on SysV: rbx, rbp, r12-r15. (No callee-saved xmm * regs -- those are MS-ABI specific; see x86_64_win.c.) @@ -15,11 +15,11 @@ * * setjmp(env) %rdi=env * longjmp(env, val) %rdi=env, %esi=val - * coro_switch(f, t, val) %rdi=from, %rsi=to, %rdx=val + * __cfree_coro_switch(f, t, val) %rdi=from, %rsi=to, %rdx=val * * The "save rsp/rip" trick: at function entry, (%rsp) holds the * caller's return address (just pushed by `call`); 8(%rsp) is the - * caller's pre-call rsp. Saving those two lets longjmp/coro_switch + * caller's pre-call rsp. Saving those two lets longjmp/__cfree_coro_switch * "land" at the call site exactly as if the function had returned. */ @@ -40,9 +40,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_ctx), "align extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_x86_64_ctx *c = (struct __cfree_x86_64_ctx *)ctx; /* x86_64 stacks grow down; align top to 16. */ @@ -110,16 +110,16 @@ __asm__ ( RESTORE_FROM("%rdi") " jmpq *%rcx\n" - /* coro_switch(from, to, value) -- from=%rdi, to=%rsi, value=%rdx. */ - ".globl " SYM(coro_switch) "\n" - SYM(coro_switch) ":\n" + /* __cfree_coro_switch(from, to, value) -- from=%rdi, to=%rsi, value=%rdx. 
*/ + ".globl " SYM(__cfree_coro_switch) "\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("%rdi") " movq %rdx, %rax\n" /* deliver value as return reg */ RESTORE_FROM("%rsi") " jmpq *%rcx\n" /* __cfree_coro_trampoline -- on first entry: %rax=value, - %r13=entry, %rsp=stack_top (no return addr pushed -- coro_switch + %r13=entry, %rsp=stack_top (no return addr pushed -- __cfree_coro_switch reaches here via jmp). System V wants %rsp+8 ≡ 16 (mod 16) at function entry; the andq below makes that hold defensively. */ ".globl " SYM(__cfree_coro_trampoline) "\n" diff --git a/lib/coro/x86_64_win.c b/lib/coro/x86_64_win.c @@ -1,7 +1,7 @@ /* * lib/coro/x86_64_win.c -- x86_64 Windows (MS x64 ABI) implementations of * setjmp / longjmp (<setjmp.h>) - * coro_init / coro_switch / trampoline (<stdcoro.h>) + * __cfree_coro_ctx_init / __cfree_coro_switch / trampoline (<stdcoro.h>) * * MS x64 callee-saved set: rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm15. * (Compare with x86_64.c -- SysV doesn't preserve rdi/rsi or any xmm.) @@ -21,7 +21,7 @@ * * setjmp(env) %rcx=env * longjmp(env, val) %rcx=env, %edx=val - * coro_switch(f, t, val) %rcx=from, %rdx=to, %r8=value + * __cfree_coro_switch(f, t, val) %rcx=from, %rdx=to, %r8=value * * The "save rsp/rip" trick mirrors x86_64.c: at function entry, * (%rsp) holds the caller's return address, 8(%rsp) is the caller's @@ -47,9 +47,9 @@ _Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_win_ctx), "a extern void __cfree_coro_trampoline(void); -void coro_init(coro_ctx *ctx, +void __cfree_coro_ctx_init(coro_ctx *ctx, void *stack_base, size_t stack_len, - coro_entry_fn entry) { + void (*entry)(uintptr_t)) { struct __cfree_x86_64_win_ctx *c = (struct __cfree_x86_64_win_ctx *)ctx; /* x86_64 stacks grow down; align top to 16. */ @@ -153,17 +153,17 @@ __asm__ ( RESTORE_FROM("%rcx") " jmpq *%r10\n" - /* coro_switch(from, to, value) -- from=%rcx, to=%rdx, value=%r8. 
*/ - ".globl " SYM(coro_switch) "\n" - SYM(coro_switch) ":\n" + /* __cfree_coro_switch(from, to, value) -- from=%rcx, to=%rdx, value=%r8. */ + ".globl " SYM(__cfree_coro_switch) "\n" + SYM(__cfree_coro_switch) ":\n" SAVE_INTO("%rcx") " movq %r8, %rax\n" /* deliver value as return reg */ RESTORE_FROM("%rdx") " jmpq *%r10\n" /* __cfree_coro_trampoline -- on first entry: %rax=value (delivered - by coro_switch), %r12=entry (set by coro_init), %rsp=stack_top - (no return addr pushed -- coro_switch reaches here via jmp). MS + by __cfree_coro_switch), %r12=entry (set by __cfree_coro_ctx_init), %rsp=stack_top + (no return addr pushed -- __cfree_coro_switch reaches here via jmp). MS x64 wants %rsp 16-byte aligned at call sites with 32 bytes of shadow space reserved by the caller. */ ".globl " SYM(__cfree_coro_trampoline) "\n" diff --git a/test/smoke.c b/test/smoke.c @@ -145,17 +145,25 @@ static int cfree_setjmp_compiles(int x) { return 0; } -/* stdcoro: coro_ctx storage exists, the API surface compiles and - resolves; same compile-only caveat as setjmp. */ +/* stdcoro: coro_ctx and coro_t storage exists; the asymmetric API + surface compiles and resolves. Compile-only -- smoke.c never links + against a libcfree_rt. 
*/ _Static_assert(sizeof(coro_ctx) >= 64, "coro_ctx room for regs"); _Static_assert(_Alignof(coro_ctx) >= 16, "coro_ctx 16-byte aligned"); +_Static_assert(sizeof(coro_t) >= sizeof(coro_ctx) + 2 * sizeof(void *), + "coro_t room for ctx + resumer + user_fn"); +_Static_assert(_Alignof(coro_t) >= 16, "coro_t 16-byte aligned"); _Static_assert(CORO_STACK_ALIGN >= 8, "stack align reasonable"); -static coro_ctx cfree_co_a, cfree_co_b; +_Static_assert(CORO_INIT != CORO_DEAD, "status enum distinct"); + +static coro_t cfree_co; static _Alignas(16) unsigned char cfree_co_stack[4096]; -static void cfree_co_entry(uintptr_t v) { (void)v; for (;;) {} } +static uintptr_t cfree_co_body(uintptr_t v) { return coro_yield(v + 1); } static uintptr_t cfree_coro_compiles(void) { - coro_init(&cfree_co_b, cfree_co_stack, sizeof(cfree_co_stack), cfree_co_entry); - return coro_switch(&cfree_co_a, &cfree_co_b, 0xC0FFEEu); + coro_init(&cfree_co, cfree_co_body, cfree_co_stack, sizeof(cfree_co_stack)); + coro_result_t r = coro_resume(&cfree_co, 0xC0FFEEu); + coro_t *me = coro_self(); + return r.value + (uintptr_t)me + (uintptr_t)coro_status(&cfree_co); } /* stdatomic: types, memory_order, lock-free macros, plus a runtime