commit 8dd63074edda0ee085f6ebe758e9f0e7a9739594
parent 0dcc91b39522fbc9557618c46bf4d193aa40bba9
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 7 May 2026 13:47:23 -0700
setjmp.h stdcoro.h
Diffstat:
| M | doc/builtins.md | | | 20 | +++++++++++++++++--- |
| M | include/setjmp.h | | | 31 | +++++++++++++++++++------------ |
| A | include/stdcoro.h | | | 56 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | lib/README.md | | | 22 | ++++++++++++++++++++++ |
| M | lib/build.sh | | | 66 | ++++++++++++++++++++++++++++++++++++++++++------------------------ |
| A | lib/coro/aarch64.c | | | 137 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/arm32.c | | | 202 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/arm32_thumb1.c | | | 174 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/i386.c | | | 143 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/riscv32.c | | | 219 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/riscv64.c | | | 193 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/x86_64.c | | | 131 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib/coro/x86_64_win.c | | | 176 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | test/smoke.c | | | 18 | +++++++++++++++++- |
14 files changed, 1548 insertions(+), 40 deletions(-)
diff --git a/doc/builtins.md b/doc/builtins.md
@@ -149,9 +149,23 @@ Always:
- Float → float: `__extendsfdf2`, `__extendsftf2`, `__extenddftf2`, `__truncdfsf2`, `__trunctfsf2`, `__trunctfdf2`
- Compare: `__eq`, `__ne`, `__lt`, `__le`, `__gt`, `__ge`, `__unord` × `sf2`/`df2`/`tf2`
-### Nonlocal jumps (always shipped)
-- `setjmp`, `longjmp` — target-specific assembly. The `jmp_buf` layout is
- internal to these two functions; `<setjmp.h>` only fixes the array size.
+### Nonlocal jumps + stackful coroutines (per-arch, always shipped)
+The `<setjmp.h>` and `<stdcoro.h>` primitives share one per-target context
+struct: callee-saved GPRs + callee-saved FPRs + sp + return address. The
+`jmp_buf` and `coro_ctx` typedefs are 256-byte aligned-16 storage; the
+runtime reinterprets them as the per-arch struct.
+- `setjmp`, `longjmp` — `<setjmp.h>` (C11 7.13). cfree extension: this
+ header is *not* in the C11 freestanding subset.
+- `coro_init`, `coro_switch`, `__cfree_coro_trampoline` — `<stdcoro.h>`
+ (cfree-specific). `coro_switch(from, to, value) → uintptr_t` is the
+ one universal primitive; `setjmp` = save-and-return-0,
+ `longjmp` = restore-and-deliver-val.
+- Implementations live in one master `.c` per arch under `lib/coro/`
+ (file-scope asm + tiny C `coro_init`). ARM has two: `arm32.c`
+ (Thumb-2, ARMv7+, may use VFP `d8-d15`) and `arm32_thumb1.c`
+ (ARMv6-M, no IT blocks / no VFP / data-processing limited to
+ r0-r7). Not provided for: WASM (would need an Asyncify-fiber
+ port).
### Atomic fallbacks (only when target lacks native atomics for that width)
- Generic: `__atomic_load`, `__atomic_store`, `__atomic_exchange`, `__atomic_compare_exchange`
diff --git a/include/setjmp.h b/include/setjmp.h
@@ -1,21 +1,28 @@
/* setjmp.h -- C11 7.13 -- Nonlocal jumps
*
- * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6); cfree
- * provides it as an extension for code that wants nonlocal control flow
- * without a hosted libc. The setjmp/longjmp pair is target-specific
- * assembly and lives in libcfree_rt.a -- see doc/builtins.md.
+ * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6);
+ * cfree provides it as an extension. The setjmp/longjmp pair is
+ * target-specific assembly in libcfree_rt.a -- see doc/builtins.md.
*
- * jmp_buf is an array type (C11 7.13p2). Its layout is internal to the
- * runtime; the size below is conservative -- large enough to hold every
- * cfree target's callee-saved GPRs + callee-saved FPRs + sp + return
- * address. C11 7.13 explicitly excludes the floating-point status flags,
- * the state of open files, and any other component of the abstract
- * machine, so no signal-mask slot is reserved.
- */
+ * jmp_buf is an array type (C11 7.13p2). The runtime reinterprets the
+ * buffer as a per-target struct of callee-saved GPRs + callee-saved
+ * FPRs + sp + return address. The buffer is sized to the largest
+ * such struct across cfree targets -- 256 bytes (x86_64 Windows: 12
+ * GPR slots + xmm6-15). C11 7.13p2 excludes the FP status flags,
+ * open files, and all other abstract-machine state from the saved
+ * environment, so no extra slot (e.g. a signal mask) is reserved.
+ * The same 256-byte payload backs <stdcoro.h>'s coro_ctx, so one
+ * save/restore sequence serves all three primitives. */
#ifndef CFREE_SETJMP_H
#define CFREE_SETJMP_H
-typedef long jmp_buf[32];
+/* Wrap in a struct so 16-byte alignment is guaranteed even when the
+ user puts a jmp_buf on the stack -- xmm save instructions require
+ it on x86_64. The [1] makes jmp_buf an array type as the standard
+ demands, so passing one to setjmp/longjmp decays to a pointer. */
+typedef struct {
+ _Alignas(16) unsigned char __cfree_storage[256];
+} jmp_buf[1];
int setjmp(jmp_buf env);
_Noreturn void longjmp(jmp_buf env, int val);
diff --git a/include/stdcoro.h b/include/stdcoro.h
@@ -0,0 +1,56 @@
+/* stdcoro.h -- cfree extension -- stackful symmetric coroutines
+ *
+ * stdcoro.h is non-standard: C11 has no stackful-coroutine facility.
+ * cfree ships it as a native counterpart to <setjmp.h>: the underlying
+ * per-target context struct, save sequence, and restore sequence are
+ * literally shared with setjmp/longjmp -- only the entry shapes differ
+ * (setjmp = save+return-0; longjmp = restore+return-val; coro_switch =
+ * save(from)+restore(to)+deliver-value). Implementations live in
+ * libcfree_rt.a -- see doc/builtins.md.
+ *
+ * Programming model
+ * 1. Allocate a coro_ctx and a stack region.
+ * 2. coro_init(&ctx, stack_base, stack_len, entry).
+ * 3. coro_switch(&caller, &ctx, value) -- delivers `value` to entry's
+ * uintptr_t argument on first switch in.
+ * 4. Inside the coroutine, coro_switch(&ctx, &caller, value) yields
+ * back, with `value` becoming the caller's coro_switch return.
+ * 5. entry must NOT return; the trampoline traps if it does.
+ *
+ * coro_ctx is sized conservatively -- large enough for every cfree
+ * target's callee-saved registers + sp + ip + (where applicable)
+ * callee-saved FP regs. Layout is internal to the runtime.
+ */
+#ifndef CFREE_STDCORO_H
+#define CFREE_STDCORO_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Stack alignment at function-call boundaries that coroutine stacks
+ must meet: the strictest among cfree targets (16 on x86_64/aarch64/
+ riscv; AAPCS arm32 needs only 8, and 16 also covers i386). */
+#define CORO_STACK_ALIGN 16
+
+/* 256 bytes is the largest per-target context across cfree's targets
+ (x86_64 Windows: 12 GPR slots + xmm6-15). Same byte payload as
+ <setjmp.h>'s jmp_buf -- the per-arch runtime reinterprets either
+ as the same internal struct. */
+typedef struct coro_ctx {
+ _Alignas(16) unsigned char __cfree_storage[256];
+} coro_ctx;
+
+typedef void (*coro_entry_fn)(uintptr_t value);
+
+/* Initialize *ctx to begin executing entry(value) on first switch in,
+ using the stack region [stack_base, stack_base + stack_len). The
+ stack base must be CORO_STACK_ALIGN-aligned. entry must not return. */
+void coro_init(coro_ctx *ctx,
+ void *stack_base, size_t stack_len,
+ coro_entry_fn entry);
+
+/* Save callee-saved state into *from, restore it from *to, deliver
+ `value` to *to. Returns the value passed by the next switch back. */
+uintptr_t coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value);
+
+#endif
diff --git a/lib/README.md b/lib/README.md
@@ -33,6 +33,7 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired.
| `riscv/rv64.S` | `__riscv_save_*` + `__riscv_restore_*` (rv64) | RISC-V rv64 with `-msave-restore` |
| `mem/mem.c` | `memcpy` / `memmove` / `memset` / `memcmp` (weak) | All; user libc overrides |
| `atomic/atomic_freestanding.c` | `__atomic_*` fallback shim | All |
+| `coro/<arch>.c` | `setjmp` / `longjmp` (`<setjmp.h>`) + `coro_init` / `coro_switch` / `__cfree_coro_trampoline` (`<stdcoro.h>`) | One of `aarch64`, `arm32`, `arm32_thumb1`, `i386`, `riscv32`, `riscv64`, `x86_64`, `x86_64_win`. Not built for `wasm32`. |
### Build-time include dirs (consumed by the masters; nothing here lands in `libcfree_rt.a`)
@@ -141,6 +142,27 @@ Hand-written portable C (not from compiler-rt). All four functions are weak
so a user libc, or a tuned arch-specific replacement, wins at link time.
`arm/aeabi_thumb{1,2}.S`'s `aeabi_mem*` symbols forward to these.
+### `coro/<arch>.c`
+One master `.c` per arch that supplies both `<setjmp.h>` (`setjmp`,
+`longjmp`) and `<stdcoro.h>` (`coro_init`, `coro_switch`,
+`__cfree_coro_trampoline`). The setjmp/longjmp/coro_switch primitives
+share a per-arch struct (callee-saved GPRs + callee-saved FPRs + sp +
+return address) and one pair of C string-concat macros
+`SAVE_INTO(reg)` / `RESTORE_FROM(reg)` so the same instruction bytes
+are emitted in all three places. Written as file-scope `__asm__`
+inside a `.c` file (not a separate `.S`) so the asm and the tiny
+`coro_init` C function stay co-located. Symbol naming uses
+`__USER_LABEL_PREFIX__` so the same source compiles for ELF / Mach-O /
+COFF.
+
+ARM ships two variants: `arm32.c` (Thumb-2, ARMv7+, optional VFP
+`d8-d15` gated on `__ARM_FP`) and `arm32_thumb1.c` (ARMv6-M /
+Cortex-M0/M0+; no IT blocks, no VFP, data-processing restricted to
+r0-r7, no `str`/`ldr` of sp or other high registers -- the asm sequences don't
+share with arm32.c so it's a separate file).
+
+Not provided for `wasm32` (would need an Asyncify-fiber port).
+
### `atomic/atomic_freestanding.c`
Defines a pointer-sized `_Atomic(uintptr_t)` spinlock as the lock primitive
(no OS dependency) then `#include`s `atomic_common.inc`, which contains the
diff --git a/lib/build.sh b/lib/build.sh
@@ -93,6 +93,16 @@ ARM_AEABI_THUMB1="arm/aeabi_thumb1.S arm/aeabi.c"
RV32_SR="riscv/rv32.S"
RV64_SR="riscv/rv64.S"
+# Coro + setjmp/longjmp: one master .c per arch, file-scope asm inside.
+CORO_X86_64="coro/x86_64.c"
+CORO_X86_64_WIN="coro/x86_64_win.c"
+CORO_I386="coro/i386.c"
+CORO_AARCH64="coro/aarch64.c"
+CORO_ARM32="coro/arm32.c"
+CORO_ARM32_THUMB1="coro/arm32_thumb1.c"
+CORO_RV32="coro/riscv32.c"
+CORO_RV64="coro/riscv64.c"
+
#-------------------------------------------------------------------------------
# Variants
#-------------------------------------------------------------------------------
@@ -103,71 +113,79 @@ echo
# ---- LP64 little-endian ------------------------------------------------------
LP64_BASE="$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C"
+# Coro impl needs cfree's own headers (setjmp.h, stdcoro.h).
+CORO_INC="-I../include"
+
build_variant x86_64-linux \
- "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1" \
- "$LP64_BASE"
+ "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$LP64_BASE $CORO_X86_64"
build_variant x86_64-apple-darwin \
- "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \
- "$LP64_BASE"
+ "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$LP64_BASE $CORO_X86_64"
# aarch64-linux: long double is binary128; needs fp_tf + fp_ti and the
# tf_supplement.h pre-include.
build_variant aarch64-linux \
"--target=aarch64-linux-gnu \
-Iinclude/lp64_le_ldbl128 -Iinclude/lp64_le -DHAS_INT128=1 \
- -include include/lp64_le_ldbl128/tf_supplement.h" \
- "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C"
+ -include include/lp64_le_ldbl128/tf_supplement.h $CORO_INC" \
+ "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C $CORO_AARCH64"
build_variant aarch64-apple-darwin \
- "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \
- "$LP64_BASE"
+ "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$LP64_BASE $CORO_AARCH64"
build_variant riscv64-elf \
"--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd \
- -Iinclude/lp64_le -DHAS_INT128=1" \
- "$LP64_BASE"
+ -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$LP64_BASE $CORO_RV64"
build_variant riscv64-elf-save-restore \
"--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd -msave-restore \
- -Iinclude/lp64_le -DHAS_INT128=1" \
- "$LP64_BASE $RV64_SR"
+ -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$LP64_BASE $RV64_SR $CORO_RV64"
# ---- LLP64 little-endian (Win64) --------------------------------------------
build_variant x86_64-pc-windows \
- "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1" \
- "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C"
+ "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1 $CORO_INC" \
+ "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C $CORO_X86_64_WIN"
# ---- ILP32 little-endian -----------------------------------------------------
ILP32_BASE="$INT_C $INT32_C $FP_C $MEM_C $ATOMIC_C"
build_variant i386-linux \
- "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0" \
- "$ILP32_BASE"
+ "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+ "$ILP32_BASE $CORO_I386"
+# wasm32: no setjmp/coro impl yet -- Emscripten fibers / sjlj are a
+# separate runtime model that hasn't been ported to cfree.
build_variant wasm32 \
"--target=wasm32-unknown-unknown -Iinclude/ilp32_le -DHAS_INT128=0" \
"$ILP32_BASE"
build_variant riscv32-elf \
"--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd \
- -Iinclude/ilp32_le -DHAS_INT128=0" \
- "$ILP32_BASE"
+ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+ "$ILP32_BASE $CORO_RV32"
build_variant riscv32-elf-save-restore \
"--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd -msave-restore \
- -Iinclude/ilp32_le -DHAS_INT128=0" \
- "$ILP32_BASE $RV32_SR"
+ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+ "$ILP32_BASE $RV32_SR $CORO_RV32"
build_variant arm-eabi-thumb2 \
"--target=arm-none-eabi -march=armv7-a -mthumb -mfloat-abi=soft \
- -Iinclude/ilp32_le -DHAS_INT128=0" \
- "$ILP32_BASE $ARM_AEABI_THUMB2"
+ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+ "$ILP32_BASE $ARM_AEABI_THUMB2 $CORO_ARM32"
+# arm-eabi-thumb1 (Cortex-M0/M0+, ARMv6-M): Thumb-1 ISA, no IT blocks,
+# data-processing ops restricted to r0-r7, no VFP. Coro impl is a
+# separate file from arm32.c since the asm sequences don't share.
build_variant arm-eabi-thumb1 \
"--target=arm-none-eabi -march=armv6-m -mthumb -mfloat-abi=soft \
- -Iinclude/ilp32_le -DHAS_INT128=0" \
- "$ILP32_BASE $ARM_AEABI_THUMB1"
+ -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+ "$ILP32_BASE $ARM_AEABI_THUMB1 $CORO_ARM32_THUMB1"
#-------------------------------------------------------------------------------
echo
diff --git a/lib/coro/aarch64.c b/lib/coro/aarch64.c
@@ -0,0 +1,137 @@
+/*
+ * lib/coro/aarch64.c -- AArch64 (AAPCS) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * All three primitives sit on one per-target context layout:
+ *
+ * regs[0..9] x19-x28
+ * regs[10..11] fp (x29), lr (x30)
+ * regs[12] sp
+ * fp_regs[0..7] d8-d15 (low 64 bits of v8-v15; AAPCS only mandates
+ * the lower 64 bits be preserved)
+ *
+ * sizeof = 176 (alignof-16 padded), 16-byte aligned. Fits in the
+ * 256-byte storage carved out by jmp_buf and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte
+ * sequence is emitted in setjmp, longjmp, and coro_switch without
+ * any duplication or gas-specific .macro tricks.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling on both ELF (no prefix) and Mach-O
+ * (leading "_").
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_arm64_ctx {
+ uintptr_t regs[13];
+ uint64_t fp_regs[8];
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_arm64_ctx) == 176, "layout");
+_Static_assert(_Alignof(struct __cfree_arm64_ctx) == 16, "align");
+_Static_assert(offsetof(struct __cfree_arm64_ctx, fp_regs) == 104, "fp off");
+_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(coro_ctx), "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(jmp_buf), "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm64_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    /* View the opaque coro_ctx storage as this target's saved-register struct. */
+    struct __cfree_arm64_ctx *saved = (struct __cfree_arm64_ctx *)ctx;
+    uintptr_t stack_top = (uintptr_t)stack_base + stack_len;
+    size_t words = sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Zero every word first so all slots not set below start as 0. */
+    while (words-- > 0)
+        ((uintptr_t *)saved)[words] = 0;
+    saved->regs[0] = (uintptr_t)entry; /* x19 -- entry fn, called by the trampoline */
+    saved->regs[10] = 0; /* fp */
+    saved->regs[11] = (uintptr_t)__cfree_coro_trampoline; /* lr */
+    /* sp: stacks grow down, so use the region's end rounded down to 16. */
+    saved->regs[12] = stack_top & ~(uintptr_t)(CORO_STACK_ALIGN - 1);
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved state into [reg]; clobbers x9 (caller-saved). */
+#define SAVE_INTO(reg) \
+ " stp x19, x20, [" reg ", #0]\n" \
+ " stp x21, x22, [" reg ", #16]\n" \
+ " stp x23, x24, [" reg ", #32]\n" \
+ " stp x25, x26, [" reg ", #48]\n" \
+ " stp x27, x28, [" reg ", #64]\n" \
+ " stp x29, x30, [" reg ", #80]\n" \
+ " mov x9, sp\n" \
+ " str x9, [" reg ", #96]\n" \
+ " stp d8, d9, [" reg ", #104]\n" \
+ " stp d10, d11, [" reg ", #120]\n" \
+ " stp d12, d13, [" reg ", #136]\n" \
+ " stp d14, d15, [" reg ", #152]\n"
+
+/* Restore callee-saved state from [reg]; clobbers x9. */
+#define RESTORE_FROM(reg) \
+ " ldp d8, d9, [" reg ", #104]\n" \
+ " ldp d10, d11, [" reg ", #120]\n" \
+ " ldp d12, d13, [" reg ", #136]\n" \
+ " ldp d14, d15, [" reg ", #152]\n" \
+ " ldp x19, x20, [" reg ", #0]\n" \
+ " ldp x21, x22, [" reg ", #16]\n" \
+ " ldp x23, x24, [" reg ", #32]\n" \
+ " ldp x25, x26, [" reg ", #48]\n" \
+ " ldp x27, x28, [" reg ", #64]\n" \
+ " ldp x29, x30, [" reg ", #80]\n" \
+ " ldr x9, [" reg ", #96]\n" \
+ " mov sp, x9\n"
+
+__asm__ (
+ ".text\n"
+ ".align 4\n"
+
+ /* setjmp(env) -- env in x0. lr at call time is the return address
+ into the caller, exactly what longjmp must restore. */
+ ".globl " SYM(setjmp) "\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("x0")
+ " mov x0, #0\n"
+ " ret\n"
+
+ /* longjmp(env, val) -- env in x0, val (int) in w1; AAPCS64 leaves
+ x1[63:32] unspecified, so only w1 is tested. longjmp(_, 0) must deliver
+ 1 (C11 7.13.2.1p4): csinc gives w0 = (w1 != 0) ? w1 : 1 branch-free. */
+ ".globl " SYM(longjmp) "\n"
+ SYM(longjmp) ":\n"
+ RESTORE_FROM("x0")
+ " cmp w1, #0\n"
+ " csinc w0, w1, wzr, ne\n"
+ " ret\n"
+
+ /* coro_switch(from, to, value) -- x0, x1, x2. Save into [x0],
+ restore from [x1], deliver x2 in x0 (which is both the return
+ register here and the first-arg register the trampoline reads
+ on a fresh context's first run). */
+ ".globl " SYM(coro_switch) "\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("x0")
+ RESTORE_FROM("x1")
+ " mov x0, x2\n"
+ " ret\n"
+
+ /* __cfree_coro_trampoline -- on first entry x0 = value (delivered),
+ x19 = entry fn (set by coro_init), sp aligned to 16. brk if entry
+ returns. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " blr x19\n"
+ " brk #0\n"
+);
diff --git a/lib/coro/arm32.c b/lib/coro/arm32.c
@@ -0,0 +1,202 @@
+/*
+ * lib/coro/arm32.c -- ARM32 Thumb-2 (AAPCS) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * All three primitives sit on one per-target context layout:
+ *
+ * regs[0..7] r4-r11
+ * regs[8] sp
+ * regs[9] lr
+ * fp_regs[0..7] d8-d15 (AAPCS only mandates the lower 64 bits of
+ * v8-v15 be preserved across calls; saved
+ * only when __ARM_FP is defined, but the
+ * slots are always allocated so the byte
+ * layout is stable across soft/hard-float
+ * builds).
+ *
+ * 10*4 GPR slots + 8*8 fp_regs slots = 104 bytes of payload, padded
+ * to 112 by alignof(16). fp_regs at offset 40. Fits in the 256-byte
+ * storage carved out by jmp_buf and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch. The VFP half is
+ * gated by a C-level #ifdef on __ARM_FP -- the cpp pass picks one
+ * macro body before the assembler sees anything, so we can't hide
+ * `#ifdef` inside the asm string.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling on both ELF (no prefix) and Mach-O
+ * (leading "_").
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_arm32_ctx {
+ uintptr_t regs[10];
+ uint64_t fp_regs[8];
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_arm32_ctx) == 112, "layout");
+_Static_assert(_Alignof(struct __cfree_arm32_ctx) == 16, "align");
+_Static_assert(offsetof(struct __cfree_arm32_ctx, fp_regs) == 40, "fp off");
+_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(coro_ctx), "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(jmp_buf), "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    /* View the opaque coro_ctx storage as this target's saved-register struct. */
+    struct __cfree_arm32_ctx *saved = (struct __cfree_arm32_ctx *)ctx;
+    uintptr_t stack_top = (uintptr_t)stack_base + stack_len;
+    size_t words = sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Zero every word first so all slots not set below start as 0. */
+    while (words-- > 0)
+        ((uintptr_t *)saved)[words] = 0;
+    saved->regs[0] = (uintptr_t)entry; /* r4 -- entry fn, called by the trampoline */
+    saved->regs[3] = 0; /* r7 -- frame ptr */
+    /* sp: stacks grow down, so use the region's end rounded down to
+       CORO_STACK_ALIGN (16; the AAPCS public boundary needs only 8). */
+    saved->regs[8] = stack_top & ~(uintptr_t)(CORO_STACK_ALIGN - 1);
+    saved->regs[9] = (uintptr_t)__cfree_coro_trampoline; /* lr */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n
+
+/* Save/restore macros. The VFP half is conditional on __ARM_FP at the
+ C-cpp level -- by the time the inline assembler sees the string,
+ only one variant remains. The byte offsets match the struct layout
+ regardless (slots are always allocated). */
+#ifdef __ARM_FP
+#define SAVE_INTO(reg) \
+ " str r4, [" reg ", #0]\n" \
+ " str r5, [" reg ", #4]\n" \
+ " str r6, [" reg ", #8]\n" \
+ " str r7, [" reg ", #12]\n" \
+ " str r8, [" reg ", #16]\n" \
+ " str r9, [" reg ", #20]\n" \
+ " str r10, [" reg ", #24]\n" \
+ " str r11, [" reg ", #28]\n" \
+ " str sp, [" reg ", #32]\n" \
+ " str lr, [" reg ", #36]\n" \
+ " vstr d8, [" reg ", #40]\n" \
+ " vstr d9, [" reg ", #48]\n" \
+ " vstr d10, [" reg ", #56]\n" \
+ " vstr d11, [" reg ", #64]\n" \
+ " vstr d12, [" reg ", #72]\n" \
+ " vstr d13, [" reg ", #80]\n" \
+ " vstr d14, [" reg ", #88]\n" \
+ " vstr d15, [" reg ", #96]\n"
+
+#define RESTORE_FROM(reg) \
+ " vldr d8, [" reg ", #40]\n" \
+ " vldr d9, [" reg ", #48]\n" \
+ " vldr d10, [" reg ", #56]\n" \
+ " vldr d11, [" reg ", #64]\n" \
+ " vldr d12, [" reg ", #72]\n" \
+ " vldr d13, [" reg ", #80]\n" \
+ " vldr d14, [" reg ", #88]\n" \
+ " vldr d15, [" reg ", #96]\n" \
+ " ldr r4, [" reg ", #0]\n" \
+ " ldr r5, [" reg ", #4]\n" \
+ " ldr r6, [" reg ", #8]\n" \
+ " ldr r7, [" reg ", #12]\n" \
+ " ldr r8, [" reg ", #16]\n" \
+ " ldr r9, [" reg ", #20]\n" \
+ " ldr r10, [" reg ", #24]\n" \
+ " ldr r11, [" reg ", #28]\n" \
+ " ldr sp, [" reg ", #32]\n" \
+ " ldr lr, [" reg ", #36]\n"
+#else
+#define SAVE_INTO(reg) \
+ " str r4, [" reg ", #0]\n" \
+ " str r5, [" reg ", #4]\n" \
+ " str r6, [" reg ", #8]\n" \
+ " str r7, [" reg ", #12]\n" \
+ " str r8, [" reg ", #16]\n" \
+ " str r9, [" reg ", #20]\n" \
+ " str r10, [" reg ", #24]\n" \
+ " str r11, [" reg ", #28]\n" \
+ " str sp, [" reg ", #32]\n" \
+ " str lr, [" reg ", #36]\n"
+
+#define RESTORE_FROM(reg) \
+ " ldr r4, [" reg ", #0]\n" \
+ " ldr r5, [" reg ", #4]\n" \
+ " ldr r6, [" reg ", #8]\n" \
+ " ldr r7, [" reg ", #12]\n" \
+ " ldr r8, [" reg ", #16]\n" \
+ " ldr r9, [" reg ", #20]\n" \
+ " ldr r10, [" reg ", #24]\n" \
+ " ldr r11, [" reg ", #28]\n" \
+ " ldr sp, [" reg ", #32]\n" \
+ " ldr lr, [" reg ", #36]\n"
+#endif
+
+__asm__ (
+ ".syntax unified\n"
+ ".thumb\n"
+ ".text\n"
+ ".align 2\n"
+
+ /* setjmp(env) -- env in r0. lr at call time is the return address
+ into the caller, exactly what longjmp must restore. */
+ ".globl " SYM(setjmp) "\n"
+ ".thumb_func\n"
+ ".type " SYM(setjmp) ", %function\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("r0")
+ " movs r0, #0\n"
+ " bx lr\n"
+
+ /* longjmp(env, val) -- env in r0, val in r1.
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); the IT block
+ gives r1 = (r1 == 0) ? 1 : r1, then we move it into r0 and
+ branch to the saved lr. */
+ ".globl " SYM(longjmp) "\n"
+ ".thumb_func\n"
+ ".type " SYM(longjmp) ", %function\n"
+ SYM(longjmp) ":\n"
+ RESTORE_FROM("r0")
+ " cmp r1, #0\n"
+ " it eq\n"
+ " moveq r1, #1\n"
+ " mov r0, r1\n"
+ " bx lr\n"
+
+ /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value.
+ Save into [r0], restore from [r1], deliver r2 in r0. The lr
+ loaded by RESTORE_FROM is either a real return address (a
+ previously-suspended coro) or __cfree_coro_trampoline (a fresh
+ coro initialized by coro_init). Either way `bx lr` lands there
+ with r0 holding `value`. */
+ ".globl " SYM(coro_switch) "\n"
+ ".thumb_func\n"
+ ".type " SYM(coro_switch) ", %function\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("r0")
+ RESTORE_FROM("r1")
+ " mov r0, r2\n"
+ " bx lr\n"
+
+ /* __cfree_coro_trampoline -- on first entry r0 = value (delivered
+ by coro_switch's `mov r0, r2`), r4 = entry fn (set by coro_init),
+ sp aligned to 16. udf if entry returns. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ ".thumb_func\n"
+ ".type " SYM(__cfree_coro_trampoline) ", %function\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " blx r4\n"
+ " udf #0\n"
+
+ ".section .note.GNU-stack,\"\",%progbits\n"
+);
diff --git a/lib/coro/arm32_thumb1.c b/lib/coro/arm32_thumb1.c
@@ -0,0 +1,174 @@
+/*
+ * lib/coro/arm32_thumb1.c -- ARMv6-M (Cortex-M0 / M0+, Thumb-1) impls of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * Thumb-1 / ARMv6-M is a strict subset of the Thumb-2 ISA used by the
+ * sibling arm32.c, and several conveniences disappear:
+ *
+ * - no IT blocks: conditional execution must use a forward branch.
+ * - data-processing ops are restricted to r0-r7. r8-r15 are reachable
+ * only via the `mov` high-register form and a few specials; in
+ * particular sp/lr/r8-r11 cannot be the data register of `str`/`ldr`.
+ * - low->low register copies use the T2 flags-setting form
+ * `movs rd, rm` (plain `mov rd, rm` with both operands low was
+ * UNPREDICTABLE before ARMv6). The plain `mov` form is reserved for
+ * cases where at least one operand is a high register (sp/lr/r8-r11).
+ * - no VFP coprocessor on M0/M0+, so no fp_regs slots.
+ *
+ * Layout: 10 GPR slots (r4-r11, sp, lr) = 40 bytes, padded to 16-byte
+ * alignment by alignof(16). Fits in the 256-byte storage carved out by
+ * jmp_buf and coro_ctx.
+ *
+ * SAVE_INTO uses r4-r7 as scratches *after* they have themselves been
+ * stored, so r0-r3 are never clobbered. That matters for coro_switch:
+ * `to` (r1) and `value` (r2) survive across the save half and are still
+ * live for the restore half / value delivery.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_arm32_thumb1_ctx {
+ uintptr_t regs[10];
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) == 48, "layout");
+_Static_assert(_Alignof(struct __cfree_arm32_thumb1_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(coro_ctx), "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(jmp_buf), "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_thumb1_ctx),"align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    /* View the opaque coro_ctx storage as this target's saved-register struct. */
+    struct __cfree_arm32_thumb1_ctx *saved = (struct __cfree_arm32_thumb1_ctx *)ctx;
+    uintptr_t stack_top = (uintptr_t)stack_base + stack_len;
+    size_t words = sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Zero every word first so all slots not set below start as 0. */
+    while (words-- > 0)
+        ((uintptr_t *)saved)[words] = 0;
+    saved->regs[0] = (uintptr_t)entry; /* r4 -- entry fn, called by the trampoline */
+    saved->regs[3] = 0; /* r7 -- frame ptr */
+    /* sp: stacks grow down, so use the region's end rounded down to
+       CORO_STACK_ALIGN (16; the AAPCS public boundary needs only 8). */
+    saved->regs[8] = stack_top & ~(uintptr_t)(CORO_STACK_ALIGN - 1);
+    saved->regs[9] = (uintptr_t)__cfree_coro_trampoline; /* lr */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved state into [reg]; r0-r3 are never touched.
+ Stage 1: store r4-r7 directly (low->low str is fine).
+ Stage 2: with r4-r7 already saved, reuse them as scratches to copy
+ the high regs r8-r11 down and store them.
+ Stage 3: same trick for sp and lr.
+ On exit r4-r7 hold sp, lr, r10, r11 -- not the values that were saved. */
+#define SAVE_INTO(reg) \
+ " str r4, [" reg ", #0]\n" \
+ " str r5, [" reg ", #4]\n" \
+ " str r6, [" reg ", #8]\n" \
+ " str r7, [" reg ", #12]\n" \
+ " mov r4, r8\n" \
+ " mov r5, r9\n" \
+ " mov r6, r10\n" \
+ " mov r7, r11\n" \
+ " str r4, [" reg ", #16]\n" \
+ " str r5, [" reg ", #20]\n" \
+ " str r6, [" reg ", #24]\n" \
+ " str r7, [" reg ", #28]\n" \
+ " mov r4, sp\n" \
+ " mov r5, lr\n" \
+ " str r4, [" reg ", #32]\n" \
+ " str r5, [" reg ", #36]\n"
+
+/* Restore callee-saved state from [reg]. Mirror image: load r8-r11/sp/lr
+ first via r4-r7 as scratches, then restore the real r4-r7 last. */
+#define RESTORE_FROM(reg) \
+ " ldr r4, [" reg ", #16]\n" \
+ " ldr r5, [" reg ", #20]\n" \
+ " ldr r6, [" reg ", #24]\n" \
+ " ldr r7, [" reg ", #28]\n" \
+ " mov r8, r4\n" \
+ " mov r9, r5\n" \
+ " mov r10, r6\n" \
+ " mov r11, r7\n" \
+ " ldr r4, [" reg ", #32]\n" \
+ " ldr r5, [" reg ", #36]\n" \
+ " mov sp, r4\n" \
+ " mov lr, r5\n" \
+ " ldr r4, [" reg ", #0]\n" \
+ " ldr r5, [" reg ", #4]\n" \
+ " ldr r6, [" reg ", #8]\n" \
+ " ldr r7, [" reg ", #12]\n"
+
+__asm__ (
+ ".syntax unified\n"
+ ".thumb\n"
+ ".text\n"
+ ".align 2\n"
+
+ /* setjmp(env) -- env in r0; the lr saved here is where longjmp resumes. */
+ ".globl " SYM(setjmp) "\n"
+ ".thumb_func\n"
+ ".type " SYM(setjmp) ", %function\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("r0")
+ " ldmia r0!, {r4-r7}\n" /* SAVE_INTO left scratch in r4-r7; reload from env[0..3] */
+ " movs r0, #0\n"
+ " bx lr\n"
+
+ /* longjmp(env, val) -- env in r0, val in r1.
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). No IT blocks in
+ Thumb-1, so use a forward branch for the substitution.
+ The low->low copy `r0 <- r1` and the immediate load both use the
+ flags-setting `movs` form; clobbering the flags is harmless here
+ because the `cmp` result has already been consumed by `bne`. */
+ ".globl " SYM(longjmp) "\n"
+ ".thumb_func\n"
+ ".type " SYM(longjmp) ", %function\n"
+ SYM(longjmp) ":\n"
+ RESTORE_FROM("r0")
+ " cmp r1, #0\n"
+ " bne 1f\n"
+ " movs r1, #1\n"
+ "1:\n"
+ " movs r0, r1\n"
+ " bx lr\n"
+
+ /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value.
+ SAVE_INTO leaves r0-r3 untouched, so r1 (to) and r2 (value) are
+ still live. RESTORE_FROM clobbers r4-r7 freely (they belong to
+ the resumed coro). The lr loaded by RESTORE_FROM is either a
+ real return address (a previously-suspended coro) or
+ __cfree_coro_trampoline (a fresh coro initialized by coro_init);
+ either way `bx lr` lands there with r0 holding `value`. */
+ ".globl " SYM(coro_switch) "\n"
+ ".thumb_func\n"
+ ".type " SYM(coro_switch) ", %function\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("r0")
+ RESTORE_FROM("r1")
+ " movs r0, r2\n"
+ " bx lr\n"
+
+ /* __cfree_coro_trampoline -- on first entry r0 = value (delivered
+ by coro_switch's `movs r0, r2`), r4 = entry fn (set by coro_init),
+ sp aligned to 16. UDF #0 (T1, ARMv6-M) traps if entry returns. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ ".thumb_func\n"
+ ".type " SYM(__cfree_coro_trampoline) ", %function\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " blx r4\n"
+ " udf #0\n"
+
+ ".section .note.GNU-stack,\"\",%progbits\n"
+);
diff --git a/lib/coro/i386.c b/lib/coro/i386.c
@@ -0,0 +1,143 @@
+/*
+ * lib/coro/i386.c -- i386 System V (cdecl, ILP32) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * cdecl callee-saved set: ebx, esi, edi, ebp, esp. Args are pushed
+ * right-to-left on the stack: at function entry, 4(%esp)=arg0,
+ * 8(%esp)=arg1, 12(%esp)=arg2, (%esp)=return-address.
+ *
+ * regs[0]: ebx (also stashes entry fn for the trampoline)
+ * regs[1]: esi
+ * regs[2]: edi
+ * regs[3]: ebp
+ * regs[4]: esp (caller's pre-call esp)
+ * regs[5]: eip (return address)
+ *
+ * 6 × 4 = 24 bytes of state, padded to sizeof = 32 by the 16-byte
+ * over-alignment (vs. natural 4) so coro_ctx's 16-byte alignment is
+ * matched.
+ *
+ * setjmp(env) 4(%esp)=env
+ * longjmp(env, val) 4(%esp)=env, 8(%esp)=val
+ * coro_switch(f, t, val) 4(%esp)=from, 8(%esp)=to, 12(%esp)=value
+ *
+ * The "save esp/eip" trick: at function entry, (%esp) holds the caller's
+ * return address (just pushed by `call`), and the address %esp+4 is the
+ * caller's pre-call esp. Saving those two lets longjmp/coro_switch "land"
+ * at the call site exactly as if the function had returned.
+ *
+ * Modern SysV i386 (ABI rev 1.1+) requires 16-byte stack alignment
+ * before each `call`; the trampoline `andl $-16, %esp` enforces this
+ * defensively for fresh contexts.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_i386_ctx {
+    uintptr_t regs[6];    /* ebx, esi, edi, ebp, esp, eip -- in that order */
+} __attribute__((aligned(16)));
+/* Pin the layout the asm hard-codes and check both public buffers can hold it. */
+_Static_assert(_Alignof(struct __cfree_i386_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_i386_ctx) == 32, "layout");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_i386_ctx), "align coro_ctx");
+_Static_assert(sizeof(jmp_buf) >= sizeof(struct __cfree_i386_ctx), "fits jmp_buf");
+_Static_assert(sizeof(coro_ctx) >= sizeof(struct __cfree_i386_ctx), "fits coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so the first switch into it runs entry() on the given stack. */
+void coro_init(coro_ctx *ctx, void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_i386_ctx *c = (struct __cfree_i386_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)c;
+    uintptr_t *const end = word + sizeof(*c) / sizeof(*word);
+
+    /* Clear every slot, tail padding included; ebp (regs[3]) stays 0. */
+    while (word != end)
+        *word++ = 0;
+
+    /* Stack grows down: start at the buffer's end, masked down to CORO_STACK_ALIGN. */
+    c->regs[4] = ((uintptr_t)stack_base + stack_len)
+                 & ~(uintptr_t)(CORO_STACK_ALIGN - 1); /* esp */
+    c->regs[5] = (uintptr_t)__cfree_coro_trampoline;   /* eip: first switch jumps here */
+    c->regs[0] = (uintptr_t)entry;                     /* ebx: called by the trampoline */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n /* "<label prefix>n" as one string literal */
+
+/* Save callee-saved + (caller's) esp + eip into [reg]; clobbers %eax.
+ Valid only at function-entry stack state: (%esp)=ret-addr, %esp+4=pre-call esp. */
+#define SAVE_INTO(reg) \
+ " movl %ebx, 0(" reg ")\n" \
+ " movl %esi, 4(" reg ")\n" \
+ " movl %edi, 8(" reg ")\n" \
+ " movl %ebp, 12(" reg ")\n" \
+ " leal 4(%esp), %eax\n" \
+ " movl %eax, 16(" reg ")\n" \
+ " movl (%esp), %eax\n" \
+ " movl %eax, 20(" reg ")\n"
+
+/* Restore callee-saved + esp from [reg], leave eip in %ecx ready to
+ jmp. %eax is untouched, so the caller loads the delivered value first. */
+#define RESTORE_FROM(reg) \
+ " movl 0(" reg "), %ebx\n" \
+ " movl 4(" reg "), %esi\n" \
+ " movl 8(" reg "), %edi\n" \
+ " movl 12(" reg "), %ebp\n" \
+ " movl 16(" reg "), %esp\n" \
+ " movl 20(" reg "), %ecx\n"
+
+__asm__ (
+ ".text\n"
+ ".p2align 4\n"
+
+ /* setjmp(env) -- env at 4(%esp). */
+ ".globl " SYM(setjmp) "\n"
+ SYM(setjmp) ":\n"
+ " movl 4(%esp), %edx\n"
+ SAVE_INTO("%edx")
+ " xorl %eax, %eax\n"
+ " ret\n"
+
+ /* longjmp(env, val) -- env at 4(%esp), val at 8(%esp).
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). Uses cmp/adc instead
+ of cmov, which CPUs older than the P6 family do not implement. */
+ ".globl " SYM(longjmp) "\n"
+ SYM(longjmp) ":\n"
+ " movl 4(%esp), %edx\n" /* env */
+ " movl 8(%esp), %eax\n" /* val */
+ " cmpl $1, %eax\n" /* CF = (val == 0), i.e. unsigned val < 1 */
+ " adcl $0, %eax\n" /* val += CF: 0 becomes 1, others unchanged */
+ RESTORE_FROM("%edx")
+ " jmp *%ecx\n"
+
+ /* coro_switch(from, to, value) -- 4(%esp)=from, 8(%esp)=to, 12(%esp)=value.
+ to/value are read after SAVE_INTO but before RESTORE_FROM replaces %esp. */
+ ".globl " SYM(coro_switch) "\n"
+ SYM(coro_switch) ":\n"
+ " movl 4(%esp), %edx\n" /* from */
+ SAVE_INTO("%edx")
+ " movl 8(%esp), %edx\n" /* to (re-read; SAVE clobbered %eax not stack) */
+ " movl 12(%esp), %eax\n" /* value -- delivered as return reg */
+ RESTORE_FROM("%edx")
+ " jmp *%ecx\n"
+
+ /* __cfree_coro_trampoline -- on first entry: %eax=value, %ebx=entry,
+ %esp=stack_top (no return addr pushed -- coro_switch reaches here
+ via jmp). cdecl needs the arg pushed; align defensively, then
+ reserve 12 bytes + push value so that after the upcoming `call`
+ pushes the 4-byte return addr, the callee sees %esp+4 16-aligned. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " andl $-16, %esp\n"
+ " subl $12, %esp\n"
+ " pushl %eax\n" /* arg0 = value */
+ " calll *%ebx\n" /* entry(value) */
+ " ud2\n"
+);
diff --git a/lib/coro/riscv32.c b/lib/coro/riscv32.c
@@ -0,0 +1,219 @@
+/*
+ * lib/coro/riscv32.c -- RISC-V 32-bit (ILP32/ILP32F/ILP32D) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * Per-target context layout (matches xOS rv32 tick_coro_ctx):
+ *
+ * regs[0]: ra
+ * regs[1]: sp
+ * regs[2..13]: s0-s11
+ * fp_regs[0..11]: fs0-fs11
+ *
+ * The fp_regs slots are always allocated (12 * 8 = 96 bytes at offset
+ * 56) so the struct layout is constant regardless of the F/D extension.
+ * The save/restore code is conditional on __riscv_flen:
+ * __riscv_flen == 64 -> fsd/fld (64-bit, fills slots fully)
+ * __riscv_flen == 32 -> fsw/flw (32-bit, packs into the low halves)
+ * else -> no FP save/restore
+ *
+ * Field bytes = 14*4 + 12*8 = 152; sizeof = 160 after 16-byte align
+ * tail padding. Fits in the 256-byte storage carved out by jmp_buf
+ * and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch without duplication.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_riscv32_ctx {
+    uintptr_t regs[14];     /* ra, sp, s0-s11 */
+    uint64_t  fp_regs[12];  /* fs0-fs11; always present, whatever FLEN is */
+} __attribute__((aligned(16)));
+/* Pin the offsets the asm hard-codes and check both public buffers can hold it. */
+_Static_assert(offsetof(struct __cfree_riscv32_ctx, fp_regs) == 56, "fp off");
+_Static_assert(_Alignof(struct __cfree_riscv32_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_riscv32_ctx) == 160, "layout");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv32_ctx), "align coro_ctx");
+_Static_assert(sizeof(jmp_buf) >= sizeof(struct __cfree_riscv32_ctx), "fits jmp_buf");
+_Static_assert(sizeof(coro_ctx) >= sizeof(struct __cfree_riscv32_ctx), "fits coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so the first switch into it runs entry() on the given stack. */
+void coro_init(coro_ctx *ctx, void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_riscv32_ctx *c = (struct __cfree_riscv32_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)c;
+    uintptr_t *const end = word + sizeof(*c) / sizeof(*word);
+
+    /* Clear the whole context, FP slots and tail padding included. */
+    while (word != end)
+        *word++ = 0;
+
+    /* Stack grows down: start at the buffer's end, masked down to CORO_STACK_ALIGN. */
+    c->regs[1] = ((uintptr_t)stack_base + stack_len) & ~(uintptr_t)(CORO_STACK_ALIGN - 1); /* sp */
+    c->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra: first switch "returns" here */
+    c->regs[2] = (uintptr_t)entry;                   /* s0: called by the trampoline */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n /* "<label prefix>n" as one string literal */
+
+/* Integer save: ra, sp, s0-s11 into regs[0..13] at offsets 0..52. */
+#define SAVE_GPR(reg) \
+ " sw ra, 0(" reg ")\n" \
+ " sw sp, 4(" reg ")\n" \
+ " sw s0, 8(" reg ")\n" \
+ " sw s1, 12(" reg ")\n" \
+ " sw s2, 16(" reg ")\n" \
+ " sw s3, 20(" reg ")\n" \
+ " sw s4, 24(" reg ")\n" \
+ " sw s5, 28(" reg ")\n" \
+ " sw s6, 32(" reg ")\n" \
+ " sw s7, 36(" reg ")\n" \
+ " sw s8, 40(" reg ")\n" \
+ " sw s9, 44(" reg ")\n" \
+ " sw s10, 48(" reg ")\n" \
+ " sw s11, 52(" reg ")\n"
+/* Integer restore: mirror of SAVE_GPR; `reg` must not be ra, sp or s0-s11. */
+#define RESTORE_GPR(reg) \
+ " lw ra, 0(" reg ")\n" \
+ " lw sp, 4(" reg ")\n" \
+ " lw s0, 8(" reg ")\n" \
+ " lw s1, 12(" reg ")\n" \
+ " lw s2, 16(" reg ")\n" \
+ " lw s3, 20(" reg ")\n" \
+ " lw s4, 24(" reg ")\n" \
+ " lw s5, 28(" reg ")\n" \
+ " lw s6, 32(" reg ")\n" \
+ " lw s7, 36(" reg ")\n" \
+ " lw s8, 40(" reg ")\n" \
+ " lw s9, 44(" reg ")\n" \
+ " lw s10, 48(" reg ")\n" \
+ " lw s11, 52(" reg ")\n"
+/* FP save/restore of fs0-fs11 from offset 56: 8-byte slots for FLEN 64, packed 4-byte words for FLEN 32, nothing otherwise. */
+#if __riscv_flen == 64
+#define SAVE_FPR(reg) \
+ " fsd fs0, 56(" reg ")\n" \
+ " fsd fs1, 64(" reg ")\n" \
+ " fsd fs2, 72(" reg ")\n" \
+ " fsd fs3, 80(" reg ")\n" \
+ " fsd fs4, 88(" reg ")\n" \
+ " fsd fs5, 96(" reg ")\n" \
+ " fsd fs6, 104(" reg ")\n" \
+ " fsd fs7, 112(" reg ")\n" \
+ " fsd fs8, 120(" reg ")\n" \
+ " fsd fs9, 128(" reg ")\n" \
+ " fsd fs10, 136(" reg ")\n" \
+ " fsd fs11, 144(" reg ")\n"
+#define RESTORE_FPR(reg) \
+ " fld fs0, 56(" reg ")\n" \
+ " fld fs1, 64(" reg ")\n" \
+ " fld fs2, 72(" reg ")\n" \
+ " fld fs3, 80(" reg ")\n" \
+ " fld fs4, 88(" reg ")\n" \
+ " fld fs5, 96(" reg ")\n" \
+ " fld fs6, 104(" reg ")\n" \
+ " fld fs7, 112(" reg ")\n" \
+ " fld fs8, 120(" reg ")\n" \
+ " fld fs9, 128(" reg ")\n" \
+ " fld fs10, 136(" reg ")\n" \
+ " fld fs11, 144(" reg ")\n"
+#elif __riscv_flen == 32
+#define SAVE_FPR(reg) \
+ " fsw fs0, 56(" reg ")\n" \
+ " fsw fs1, 60(" reg ")\n" \
+ " fsw fs2, 64(" reg ")\n" \
+ " fsw fs3, 68(" reg ")\n" \
+ " fsw fs4, 72(" reg ")\n" \
+ " fsw fs5, 76(" reg ")\n" \
+ " fsw fs6, 80(" reg ")\n" \
+ " fsw fs7, 84(" reg ")\n" \
+ " fsw fs8, 88(" reg ")\n" \
+ " fsw fs9, 92(" reg ")\n" \
+ " fsw fs10, 96(" reg ")\n" \
+ " fsw fs11, 100(" reg ")\n"
+#define RESTORE_FPR(reg) \
+ " flw fs0, 56(" reg ")\n" \
+ " flw fs1, 60(" reg ")\n" \
+ " flw fs2, 64(" reg ")\n" \
+ " flw fs3, 68(" reg ")\n" \
+ " flw fs4, 72(" reg ")\n" \
+ " flw fs5, 76(" reg ")\n" \
+ " flw fs6, 80(" reg ")\n" \
+ " flw fs7, 84(" reg ")\n" \
+ " flw fs8, 88(" reg ")\n" \
+ " flw fs9, 92(" reg ")\n" \
+ " flw fs10, 96(" reg ")\n" \
+ " flw fs11, 100(" reg ")\n"
+#else
+#define SAVE_FPR(reg) ""
+#define RESTORE_FPR(reg) ""
+#endif
+
+/* Save: int first, FP second (matches xOS rv32 pattern, and rv64 here).
+ Restore: FP first, int second -- mirror order, minimizes register
+ reuse window. Note none of these loads write to the address-base
+ register, so the integer/FP order is purely cosmetic. */
+#define SAVE_INTO(reg) SAVE_GPR(reg) SAVE_FPR(reg)
+#define RESTORE_FROM(reg) RESTORE_FPR(reg) RESTORE_GPR(reg)
+
+__asm__ (
+ ".text\n"
+ ".align 2\n"
+
+ /* setjmp(env) -- env=a0. ra at function entry is the caller's
+ return address, exactly what longjmp must restore. */
+ ".globl " SYM(setjmp) "\n"
+ ".type " SYM(setjmp) ", @function\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("a0")
+ " li a0, 0\n"
+ " ret\n"
+ ".size " SYM(setjmp) ", .-" SYM(setjmp) "\n"
+
+ /* longjmp(env, val) -- env=a0, val=a1.
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); branch-free:
+ seqz t0, a1 ; t0 = (a1 == 0)
+ add a0, a1, t0 ; a0 = val, or 1 when val == 0
+ Runs after RESTORE_FROM, which leaves a0, a1 and t0 alone. */
+ ".globl " SYM(longjmp) "\n"
+ ".type " SYM(longjmp) ", @function\n"
+ SYM(longjmp) ":\n"
+ RESTORE_FROM("a0")
+ " seqz t0, a1\n"
+ " add a0, a1, t0\n"
+ " ret\n"
+ ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n"
+
+ /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value.
+ Save into [a0], restore from [a1], deliver a2 in a0 (which is
+ both the return register and the trampoline's first-arg reg
+ on a fresh context's first run). */
+ ".globl " SYM(coro_switch) "\n"
+ ".type " SYM(coro_switch) ", @function\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("a0")
+ RESTORE_FROM("a1")
+ " mv a0, a2\n"
+ " ret\n"
+ ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n"
+
+ /* __cfree_coro_trampoline -- on first entry: a0=value (delivered
+ by coro_switch's `mv a0, a2`), s0=entry (set by coro_init via
+ regs[2]), sp=stack_top. `jalr s0` links to the ebreak, so a returning entry traps. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ ".type " SYM(__cfree_coro_trampoline) ", @function\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " jalr s0\n"
+ " ebreak\n"
+ ".size " SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n"
+
+ ".section .note.GNU-stack,\"\",@progbits\n"
+);
diff --git a/lib/coro/riscv64.c b/lib/coro/riscv64.c
@@ -0,0 +1,193 @@
+/*
+ * lib/coro/riscv64.c -- RISC-V 64-bit (LP64D) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * RISC-V LP64D callee-saved set:
+ * ra (x1) -- saved manually so longjmp/coro_switch can
+ * "return" to the original call site
+ * sp (x2)
+ * s0-s11 (x8-x9, x18-x27)
+ * fs0-fs11 (f8-f9, f18-f27)
+ *
+ * Layout (matches xOS rv64 tick_coro_ctx):
+ *
+ * regs[0]: ra
+ * regs[1]: sp
+ * regs[2..13]: s0-s11
+ * fp_regs[0..11]: fs0-fs11 (offset 112)
+ *
+ * sizeof = 14*8 + 12*8 = 208, 16-byte aligned. Fits in the 256-byte
+ * storage carved out by jmp_buf and coro_ctx.
+ *
+ * setjmp(env) a0=env
+ * longjmp(env, val) a0=env, a1=val
+ * coro_switch(f, t, val) a0=from, a1=to, a2=val
+ *
+ * Value-passing trick: the destination context "returns" via
+ * ld ra, 0(a1); ... ret
+ * where `ret` is `jalr x0, 0(ra)`. By moving the value into a0 just
+ * before `ret`, both a fresh trampoline (entry(value)) and a previously
+ * suspended coro_switch (= the value its switch call returned) see it
+ * as the a0 return register.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch without duplication.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling (empty on RISC-V ELF).
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_riscv64_ctx {
+    uintptr_t regs[14];     /* ra, sp, s0-s11 */
+    uint64_t  fp_regs[12];  /* fs0-fs11 */
+} __attribute__((aligned(16)));
+/* Pin the offsets the asm hard-codes and check both public buffers can hold it. */
+_Static_assert(offsetof(struct __cfree_riscv64_ctx, fp_regs) == 112, "fp off");
+_Static_assert(_Alignof(struct __cfree_riscv64_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_riscv64_ctx) == 208, "layout");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv64_ctx), "align coro_ctx");
+_Static_assert(sizeof(jmp_buf) >= sizeof(struct __cfree_riscv64_ctx), "fits jmp_buf");
+_Static_assert(sizeof(coro_ctx) >= sizeof(struct __cfree_riscv64_ctx), "fits coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so the first switch into it runs entry() on the given stack. */
+void coro_init(coro_ctx *ctx, void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_riscv64_ctx *c = (struct __cfree_riscv64_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)c;
+    uintptr_t *const end = word + sizeof(*c) / sizeof(*word);
+
+    /* Clear the whole context, FP slots included. */
+    while (word != end)
+        *word++ = 0;
+
+    /* Stack grows down: start at the buffer's end, masked down to CORO_STACK_ALIGN. */
+    c->regs[1] = ((uintptr_t)stack_base + stack_len) & ~(uintptr_t)(CORO_STACK_ALIGN - 1); /* sp */
+    c->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra: first switch "returns" here */
+    c->regs[2] = (uintptr_t)entry;                   /* s0: called by the trampoline */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n /* "<label prefix>n" as one string literal */
+
+/* Save callee-saved state into [reg] (a register-name string such as "a0").
+ Straight-line sd/fsd, no scratch; the fsd half is unconditional (needs D). */
+#define SAVE_INTO(reg) \
+ " sd ra, 0(" reg ")\n" \
+ " sd sp, 8(" reg ")\n" \
+ " sd s0, 16(" reg ")\n" \
+ " sd s1, 24(" reg ")\n" \
+ " sd s2, 32(" reg ")\n" \
+ " sd s3, 40(" reg ")\n" \
+ " sd s4, 48(" reg ")\n" \
+ " sd s5, 56(" reg ")\n" \
+ " sd s6, 64(" reg ")\n" \
+ " sd s7, 72(" reg ")\n" \
+ " sd s8, 80(" reg ")\n" \
+ " sd s9, 88(" reg ")\n" \
+ " sd s10, 96(" reg ")\n" \
+ " sd s11, 104(" reg ")\n" \
+ " fsd fs0, 112(" reg ")\n" \
+ " fsd fs1, 120(" reg ")\n" \
+ " fsd fs2, 128(" reg ")\n" \
+ " fsd fs3, 136(" reg ")\n" \
+ " fsd fs4, 144(" reg ")\n" \
+ " fsd fs5, 152(" reg ")\n" \
+ " fsd fs6, 160(" reg ")\n" \
+ " fsd fs7, 168(" reg ")\n" \
+ " fsd fs8, 176(" reg ")\n" \
+ " fsd fs9, 184(" reg ")\n" \
+ " fsd fs10, 192(" reg ")\n" \
+ " fsd fs11, 200(" reg ")\n"
+
+/* Restore callee-saved state from [reg]: FP first, then ra/sp/s0-s11; `reg` must not be one of those. */
+#define RESTORE_FROM(reg) \
+ " fld fs0, 112(" reg ")\n" \
+ " fld fs1, 120(" reg ")\n" \
+ " fld fs2, 128(" reg ")\n" \
+ " fld fs3, 136(" reg ")\n" \
+ " fld fs4, 144(" reg ")\n" \
+ " fld fs5, 152(" reg ")\n" \
+ " fld fs6, 160(" reg ")\n" \
+ " fld fs7, 168(" reg ")\n" \
+ " fld fs8, 176(" reg ")\n" \
+ " fld fs9, 184(" reg ")\n" \
+ " fld fs10, 192(" reg ")\n" \
+ " fld fs11, 200(" reg ")\n" \
+ " ld ra, 0(" reg ")\n" \
+ " ld sp, 8(" reg ")\n" \
+ " ld s0, 16(" reg ")\n" \
+ " ld s1, 24(" reg ")\n" \
+ " ld s2, 32(" reg ")\n" \
+ " ld s3, 40(" reg ")\n" \
+ " ld s4, 48(" reg ")\n" \
+ " ld s5, 56(" reg ")\n" \
+ " ld s6, 64(" reg ")\n" \
+ " ld s7, 72(" reg ")\n" \
+ " ld s8, 80(" reg ")\n" \
+ " ld s9, 88(" reg ")\n" \
+ " ld s10, 96(" reg ")\n" \
+ " ld s11, 104(" reg ")\n"
+
+__asm__ (
+ ".text\n"
+ ".align 2\n"
+
+ /* setjmp(env) -- env in a0. ra at call time is the caller's return
+ address, which is exactly what longjmp must restore. */
+ ".globl " SYM(setjmp) "\n"
+ ".type " SYM(setjmp) ", @function\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("a0")
+ " li a0, 0\n"
+ " ret\n"
+ ".size " SYM(setjmp) ", .-" SYM(setjmp) "\n"
+
+ /* longjmp(env, val) -- env in a0, val in a1.
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). Branch-free:
+ seqz t0, a1 -> t0 = (a1==0); a0 = a1 + t0. RESTORE_FROM
+ doesn't touch t0/a0/a1, so the seqz/add can run after it and
+ write a0 directly -- one fewer instruction than munging a1
+ first and mv'ing later. */
+ ".globl " SYM(longjmp) "\n"
+ ".type " SYM(longjmp) ", @function\n"
+ SYM(longjmp) ":\n"
+ RESTORE_FROM("a0")
+ " seqz t0, a1\n"
+ " add a0, a1, t0\n"
+ " ret\n"
+ ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n"
+
+ /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value.
+ Save into [a0], restore from [a1]; neither macro writes a0-a2, so
+ a1 stays a valid base throughout and a2 still holds value when it
+ is moved into a0 just before ret. */
+ ".globl " SYM(coro_switch) "\n"
+ ".type " SYM(coro_switch) ", @function\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("a0")
+ RESTORE_FROM("a1")
+ " mv a0, a2\n"
+ " ret\n"
+ ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n"
+
+ /* __cfree_coro_trampoline -- on first entry: a0=value (delivered),
+ s0=entry fn (set by coro_init), sp aligned to 16. `jalr s0` links
+ to the ebreak, so an entry function that returns traps. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ ".type " SYM(__cfree_coro_trampoline) ", @function\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " jalr s0\n"
+ " ebreak\n"
+ ".size " SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n"
+
+ ".section .note.GNU-stack,\"\",%progbits\n"
+);
diff --git a/lib/coro/x86_64.c b/lib/coro/x86_64.c
@@ -0,0 +1,131 @@
+/*
+ * lib/coro/x86_64.c -- x86_64 System V ABI implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * Callee-saved set on SysV: rbx, rbp, r12-r15. (No callee-saved xmm
+ * regs -- those are MS-ABI specific; see x86_64_win.c.)
+ *
+ * regs[0]: rbx regs[4]: r14
+ * regs[1]: rbp regs[5]: r15
+ * regs[2]: r12 regs[6]: rsp
+ * regs[3]: r13 regs[7]: rip
+ *
+ * sizeof = 64, 16-byte aligned.
+ *
+ * setjmp(env) %rdi=env
+ * longjmp(env, val) %rdi=env, %esi=val
+ * coro_switch(f, t, val) %rdi=from, %rsi=to, %rdx=val
+ *
+ * The "save rsp/rip" trick: at function entry, (%rsp) holds the
+ * caller's return address (just pushed by `call`), and the address
+ * %rsp+8 is the caller's pre-call rsp. Saving those two lets longjmp/
+ * coro_switch "land" at the call site as if the function had returned.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_x86_64_ctx {
+    uintptr_t regs[8];    /* rbx, rbp, r12, r13, r14, r15, rsp, rip */
+} __attribute__((aligned(16)));
+/* Pin the layout the asm hard-codes and check both public buffers can hold it. */
+_Static_assert(_Alignof(struct __cfree_x86_64_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_x86_64_ctx) == 64, "layout");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_ctx), "align coro_ctx");
+_Static_assert(sizeof(jmp_buf) >= sizeof(struct __cfree_x86_64_ctx), "fits jmp_buf");
+_Static_assert(sizeof(coro_ctx) >= sizeof(struct __cfree_x86_64_ctx), "fits coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so the first switch into it runs entry() on the given stack. */
+void coro_init(coro_ctx *ctx, void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_x86_64_ctx *c = (struct __cfree_x86_64_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)c;
+    uintptr_t *const end = word + sizeof(*c) / sizeof(*word);
+
+    /* Clear every slot; rbp (regs[1]) stays 0. */
+    while (word != end)
+        *word++ = 0;
+
+    /* Stack grows down: start at the buffer's end, masked down to CORO_STACK_ALIGN. */
+    c->regs[6] = ((uintptr_t)stack_base + stack_len)
+                 & ~(uintptr_t)(CORO_STACK_ALIGN - 1); /* rsp */
+    c->regs[7] = (uintptr_t)__cfree_coro_trampoline;   /* rip: first switch jumps here */
+    c->regs[3] = (uintptr_t)entry;                     /* r13: called by the trampoline */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n /* "<label prefix>n" as one string literal */
+
+/* Save callee-saved + (caller's) rsp + rip into [reg]; clobbers %rax.
+ Valid only at function-entry stack state: (%rsp)=ret-addr, %rsp+8=pre-call rsp. */
+#define SAVE_INTO(reg) \
+ " movq %rbx, 0(" reg ")\n" \
+ " movq %rbp, 8(" reg ")\n" \
+ " movq %r12, 16(" reg ")\n" \
+ " movq %r13, 24(" reg ")\n" \
+ " movq %r14, 32(" reg ")\n" \
+ " movq %r15, 40(" reg ")\n" \
+ " leaq 8(%rsp), %rax\n" \
+ " movq %rax, 48(" reg ")\n" \
+ " movq (%rsp), %rax\n" \
+ " movq %rax, 56(" reg ")\n"
+
+/* Restore callee-saved + rsp from [reg], leave rip in %rcx ready to
+ jmp. %rax is untouched, so the caller loads the delivered value first. */
+#define RESTORE_FROM(reg) \
+ " movq 0(" reg "), %rbx\n" \
+ " movq 8(" reg "), %rbp\n" \
+ " movq 16(" reg "), %r12\n" \
+ " movq 24(" reg "), %r13\n" \
+ " movq 32(" reg "), %r14\n" \
+ " movq 40(" reg "), %r15\n" \
+ " movq 48(" reg "), %rsp\n" \
+ " movq 56(" reg "), %rcx\n"
+
+__asm__ (
+ ".text\n"
+ ".p2align 4\n"
+
+ /* setjmp(env) -- env=%rdi */
+ ".globl " SYM(setjmp) "\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("%rdi")
+ " xorl %eax, %eax\n"
+ " ret\n"
+
+ /* longjmp(env, val) -- env=%rdi, val=%esi.
+ longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). */
+ ".globl " SYM(longjmp) "\n"
+ SYM(longjmp) ":\n"
+ " movslq %esi, %rax\n" /* sign-extend int → long */
+ " testq %rax, %rax\n"
+ " movl $1, %edx\n"
+ " cmoveq %rdx, %rax\n"
+ RESTORE_FROM("%rdi")
+ " jmpq *%rcx\n"
+
+ /* coro_switch(from, to, value) -- from=%rdi, to=%rsi, value=%rdx. */
+ ".globl " SYM(coro_switch) "\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("%rdi")
+ " movq %rdx, %rax\n" /* deliver value as return reg */
+ RESTORE_FROM("%rsi")
+ " jmpq *%rcx\n"
+
+ /* __cfree_coro_trampoline -- on first entry: %rax=value,
+ %r13=entry, %rsp=stack_top (no return addr pushed -- coro_switch
+ reaches here via jmp). System V wants %rsp 16-byte aligned at the
+ `call`, i.e. %rsp+8 ≡ 0 (mod 16) inside entry(); andq ensures it. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " andq $-16, %rsp\n"
+ " movq %rax, %rdi\n" /* value → first arg */
+ " callq *%r13\n" /* entry(value) */
+ " ud2\n"
+);
diff --git a/lib/coro/x86_64_win.c b/lib/coro/x86_64_win.c
@@ -0,0 +1,176 @@
+/*
+ * lib/coro/x86_64_win.c -- x86_64 Windows (MS x64 ABI) implementations of
+ * setjmp / longjmp (<setjmp.h>)
+ * coro_init / coro_switch / trampoline (<stdcoro.h>)
+ *
+ * MS x64 callee-saved set: rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm15.
+ * (Compare with x86_64.c -- SysV doesn't preserve rdi/rsi or any xmm.)
+ * Windows additionally requires the TEB stack-bound slots gs:0x08
+ * (StackBase) and gs:0x10 (StackLimit) to track the live stack so
+ * exception unwinding etc. behave; these are saved/restored on every
+ * switch.
+ *
+ * regs[0]: rbx regs[8]: rsp
+ * regs[1]: rbp regs[9]: rip
+ * regs[2]: rdi regs[10]: stack_base (TEB gs:0x08)
+ * regs[3]: rsi regs[11]: stack_limit (TEB gs:0x10)
+ * regs[4..7]: r12-r15
+ * fp_regs[0..19]: xmm6-xmm15 (10 regs * 128b = 20 * 64b slots, off 96)
+ *
+ * sizeof = 256, 16-byte aligned. Exactly fills jmp_buf / coro_ctx.
+ *
+ * setjmp(env) %rcx=env
+ * longjmp(env, val) %rcx=env, %edx=val
+ * coro_switch(f, t, val) %rcx=from, %rdx=to, %r8=value
+ *
+ * The "save rsp/rip" trick mirrors x86_64.c: at function entry,
+ * (%rsp) holds the caller's return address, and the address %rsp+8
+ * is the caller's pre-call rsp.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_x86_64_win_ctx {
+    uintptr_t regs[12];     /* rbx, rbp, rdi, rsi, r12-r15, rsp, rip, TEB base, TEB limit */
+    uint64_t  fp_regs[20];  /* xmm6-xmm15, two 64-bit slots per register */
+} __attribute__((aligned(16)));
+/* Pin the offsets the asm hard-codes and check both public buffers can hold it. */
+_Static_assert(offsetof(struct __cfree_x86_64_win_ctx, fp_regs) == 96, "fp off");
+_Static_assert(_Alignof(struct __cfree_x86_64_win_ctx) == 16, "align");
+_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) == 256, "layout");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_win_ctx), "align coro_ctx");
+_Static_assert(sizeof(jmp_buf) >= sizeof(struct __cfree_x86_64_win_ctx), "fits jmp_buf");
+_Static_assert(sizeof(coro_ctx) >= sizeof(struct __cfree_x86_64_win_ctx), "fits coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so the first switch into it runs entry() on the given stack. */
+void coro_init(coro_ctx *ctx, void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_x86_64_win_ctx *c = (struct __cfree_x86_64_win_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)c;
+    uintptr_t *const end = word + sizeof(*c) / sizeof(*word);
+    /* Stack grows down: start at the buffer's end, masked down to CORO_STACK_ALIGN. */
+    const uintptr_t top = ((uintptr_t)stack_base + stack_len) & ~(uintptr_t)(CORO_STACK_ALIGN - 1);
+
+    /* Clear every slot, xmm area included; rbp (regs[1]) stays 0. */
+    while (word != end)
+        *word++ = 0;
+
+    c->regs[9]  = (uintptr_t)__cfree_coro_trampoline; /* rip: first switch jumps here */
+    c->regs[8]  = top;                                /* rsp */
+    c->regs[4]  = (uintptr_t)entry;                   /* r12: called by the trampoline */
+    c->regs[10] = top;                                /* TEB stack_base: high end */
+    c->regs[11] = (uintptr_t)stack_base;              /* TEB stack_limit: low end */
+}
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+#define SYM(n) STR(__USER_LABEL_PREFIX__) #n /* "<label prefix>n" as one string literal */
+
+/* Save callee-saved + (caller's) rsp + rip + TEB stack bounds + xmm6-15
+ into [reg]; clobbers %rax. Valid only at function-entry stack state:
+ (%rsp)=ret-addr, %rsp+8=pre-call rsp. movaps needs [reg] 16-byte aligned. */
+#define SAVE_INTO(reg) \
+ " movq %rbx, 0(" reg ")\n" \
+ " movq %rbp, 8(" reg ")\n" \
+ " movq %rdi, 16(" reg ")\n" \
+ " movq %rsi, 24(" reg ")\n" \
+ " movq %r12, 32(" reg ")\n" \
+ " movq %r13, 40(" reg ")\n" \
+ " movq %r14, 48(" reg ")\n" \
+ " movq %r15, 56(" reg ")\n" \
+ " leaq 8(%rsp), %rax\n" \
+ " movq %rax, 64(" reg ")\n" \
+ " movq (%rsp), %rax\n" \
+ " movq %rax, 72(" reg ")\n" \
+ " movq %gs:0x08, %rax\n" \
+ " movq %rax, 80(" reg ")\n" \
+ " movq %gs:0x10, %rax\n" \
+ " movq %rax, 88(" reg ")\n" \
+ " movaps %xmm6, 96(" reg ")\n" \
+ " movaps %xmm7, 112(" reg ")\n" \
+ " movaps %xmm8, 128(" reg ")\n" \
+ " movaps %xmm9, 144(" reg ")\n" \
+ " movaps %xmm10, 160(" reg ")\n" \
+ " movaps %xmm11, 176(" reg ")\n" \
+ " movaps %xmm12, 192(" reg ")\n" \
+ " movaps %xmm13, 208(" reg ")\n" \
+ " movaps %xmm14, 224(" reg ")\n" \
+ " movaps %xmm15, 240(" reg ")\n"
+
+/* Restore callee-saved + xmm + TEB bounds + rsp from [reg]; leaves rip
+ in %r10 ready to jmp (%r10 is also the scratch for the TEB stores).
+ The caller loads the delivered value into %rax first; it is not touched. */
+#define RESTORE_FROM(reg) \
+ " movaps 96(" reg "), %xmm6\n" \
+ " movaps 112(" reg "), %xmm7\n" \
+ " movaps 128(" reg "), %xmm8\n" \
+ " movaps 144(" reg "), %xmm9\n" \
+ " movaps 160(" reg "), %xmm10\n" \
+ " movaps 176(" reg "), %xmm11\n" \
+ " movaps 192(" reg "), %xmm12\n" \
+ " movaps 208(" reg "), %xmm13\n" \
+ " movaps 224(" reg "), %xmm14\n" \
+ " movaps 240(" reg "), %xmm15\n" \
+ " movq 0(" reg "), %rbx\n" \
+ " movq 8(" reg "), %rbp\n" \
+ " movq 16(" reg "), %rdi\n" \
+ " movq 24(" reg "), %rsi\n" \
+ " movq 32(" reg "), %r12\n" \
+ " movq 40(" reg "), %r13\n" \
+ " movq 48(" reg "), %r14\n" \
+ " movq 56(" reg "), %r15\n" \
+ " movq 80(" reg "), %r10\n" \
+ " movq %r10, %gs:0x08\n" \
+ " movq 88(" reg "), %r10\n" \
+ " movq %r10, %gs:0x10\n" \
+ " movq 64(" reg "), %rsp\n" \
+ " movq 72(" reg "), %r10\n"
+
+__asm__ (
+ ".text\n"
+ ".p2align 4\n"
+
+ /* setjmp(env) -- env=%rcx */
+ ".globl " SYM(setjmp) "\n"
+ SYM(setjmp) ":\n"
+ SAVE_INTO("%rcx")
+ " xorl %eax, %eax\n"
+ " ret\n"
+
+ /* longjmp(env, val) -- env=%rcx, val=%edx. longjmp(_, 0) must deliver 1
+ (C11 7.13.2.1p4); %r11 holds the 1 because RESTORE_FROM uses %r10. */
+ ".globl " SYM(longjmp) "\n"
+ SYM(longjmp) ":\n"
+ " movslq %edx, %rax\n" /* sign-extend int -> long */
+ " testq %rax, %rax\n"
+ " movl $1, %r11d\n"
+ " cmoveq %r11, %rax\n"
+ RESTORE_FROM("%rcx")
+ " jmpq *%r10\n"
+
+ /* coro_switch(from, to, value) -- from=%rcx, to=%rdx, value=%r8. */
+ ".globl " SYM(coro_switch) "\n"
+ SYM(coro_switch) ":\n"
+ SAVE_INTO("%rcx")
+ " movq %r8, %rax\n" /* deliver value as return reg */
+ RESTORE_FROM("%rdx")
+ " jmpq *%r10\n"
+
+ /* __cfree_coro_trampoline -- on first entry: %rax=value (delivered
+ by coro_switch), %r12=entry (set by coro_init), %rsp=stack_top
+ (no return addr pushed -- coro_switch reaches here via jmp). MS
+ x64 wants %rsp 16-byte aligned at call sites with 32 bytes of
+ shadow space reserved by the caller; ud2 traps if entry returns. */
+ ".globl " SYM(__cfree_coro_trampoline) "\n"
+ SYM(__cfree_coro_trampoline) ":\n"
+ " andq $-16, %rsp\n" /* defensive align */
+ " subq $32, %rsp\n" /* MS x64 shadow space */
+ " movq %rax, %rcx\n" /* value -> first arg */
+ " callq *%r12\n" /* entry(value) */
+ " ud2\n"
+);
diff --git a/test/smoke.c b/test/smoke.c
@@ -34,6 +34,7 @@
#include <stdarg.h>
#include <stdatomic.h>
#include <stdbool.h>
+#include <stdcoro.h>
#include <stddef.h>
#include <stdint.h>
#include <stdnoreturn.h>
@@ -135,7 +136,8 @@ static noreturn void cfree_trap(void) { for (;;) {} }
/* setjmp: jmp_buf is an array type, setjmp is callable in the contexts
permitted by C11 7.13.1.1p4, longjmp is _Noreturn. Compile-only --
smoke.c never links against a setjmp implementation. */
-_Static_assert(sizeof(jmp_buf) >= sizeof(void *) * 8, "jmp_buf room for regs");
+_Static_assert(sizeof(jmp_buf) >= 64, "jmp_buf room for regs");
+_Static_assert(_Alignof(jmp_buf) >= 16, "jmp_buf 16-byte aligned");
static jmp_buf cfree_jb;
static int cfree_setjmp_compiles(int x) {
if (setjmp(cfree_jb) != 0) return 1; /* allowed context */
@@ -143,6 +145,19 @@ static int cfree_setjmp_compiles(int x) {
return 0;
}
+/* stdcoro: the context type has storage, and init/switch compile and
+ resolve against the header. Compile-only, like the setjmp check above. */
+_Static_assert(_Alignof(coro_ctx) >= 16, "coro_ctx 16-byte aligned");
+_Static_assert(sizeof(coro_ctx) >= 64, "coro_ctx room for regs");
+_Static_assert(CORO_STACK_ALIGN >= 8, "stack align reasonable");
+static _Alignas(16) unsigned char cfree_co_stack[4096];
+static coro_ctx cfree_co_a, cfree_co_b;
+static void cfree_co_entry(uintptr_t v) { for (;;) { (void)v; } }
+static uintptr_t cfree_coro_compiles(void) {
+    coro_init(&cfree_co_b, &cfree_co_stack[0], sizeof cfree_co_stack, cfree_co_entry);
+    return coro_switch(&cfree_co_a, &cfree_co_b, (uintptr_t)0xC0FFEEu);
+}
+
/* stdatomic: types, memory_order, lock-free macros, plus a runtime
exercise of load, store, exchange, CAS, fetch ops, and atomic_flag. */
_Static_assert(sizeof(atomic_int) == sizeof(int), "atomic_int matches int");
@@ -185,5 +200,6 @@ int cfree_smoke_ok(void) {
(void)aligned_buf;
if (0) cfree_trap();
if (0) (void)cfree_setjmp_compiles(0);
+ if (0) (void)cfree_coro_compiles();
return sum_n(3, 1, 2, 3) == 6 && cfree_atomic_ok();
}