setjmp.h stdcoro.h - kit

commit 8dd63074edda0ee085f6ebe758e9f0e7a9739594
parent 0dcc91b39522fbc9557618c46bf4d193aa40bba9
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  7 May 2026 13:47:23 -0700

setjmp.h stdcoro.h

Diffstat:
M doc/builtins.md  | 20 +++++++++++++++++---
M include/setjmp.h  | 31 +++++++++++++++++++------------
A include/stdcoro.h  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M lib/README.md  | 22 ++++++++++++++++++++++
M lib/build.sh  | 66 ++++++++++++++++++++++++++++++++++++++++++------------------------
A lib/coro/aarch64.c  | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/arm32.c  | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/arm32_thumb1.c  | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/i386.c  | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/riscv32.c  | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/riscv64.c  | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/x86_64.c  | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/coro/x86_64_win.c  | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/smoke.c  | 18 +++++++++++++++++-

14 files changed, 1548 insertions(+), 40 deletions(-)
diff --git a/doc/builtins.md b/doc/builtins.md
@@ -149,9 +149,23 @@ Always:
 - Float → float: `__extendsfdf2`, `__extendsftf2`, `__extenddftf2`, `__truncdfsf2`, `__trunctfsf2`, `__trunctfdf2`
 - Compare: `__eq`, `__ne`, `__lt`, `__le`, `__gt`, `__ge`, `__unord` × `sf2`/`df2`/`tf2`
 
-### Nonlocal jumps (always shipped)
-- `setjmp`, `longjmp` — target-specific assembly. The `jmp_buf` layout is
-  internal to these two functions; `<setjmp.h>` only fixes the array size.
+### Nonlocal jumps + stackful coroutines (per-arch, always shipped)
+The `<setjmp.h>` and `<stdcoro.h>` primitives share one per-target context
+struct: callee-saved GPRs + callee-saved FPRs + sp + return address. The
+`jmp_buf` and `coro_ctx` typedefs are 256-byte aligned-16 storage; the
+runtime reinterprets them as the per-arch struct.
+- `setjmp`, `longjmp` — `<setjmp.h>` (C11 7.13). cfree extension: this
+  header is *not* in the C11 freestanding subset.
+- `coro_init`, `coro_switch`, `__cfree_coro_trampoline` — `<stdcoro.h>`
+  (cfree-specific). `coro_switch(from, to, value) → uintptr_t` is the
+  one universal primitive; `setjmp` = save-and-return-0,
+  `longjmp` = restore-and-deliver-val.
+- Implementations live in one master `.c` per arch under `lib/coro/`
+  (file-scope asm + tiny C `coro_init`). ARM has two: `arm32.c`
+  (Thumb-2, ARMv7+, may use VFP `d8-d15`) and `arm32_thumb1.c`
+  (ARMv6-M, no IT blocks / no VFP / data-processing limited to
+  r0-r7). Not provided for: WASM (would need an Asyncify-fiber
+  port).
 
 ### Atomic fallbacks (only when target lacks native atomics for that width)
 - Generic: `__atomic_load`, `__atomic_store`, `__atomic_exchange`, `__atomic_compare_exchange`
diff --git a/include/setjmp.h b/include/setjmp.h
@@ -1,21 +1,28 @@
 /* setjmp.h -- C11 7.13 -- Nonlocal jumps
  *
- * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6); cfree
- * provides it as an extension for code that wants nonlocal control flow
- * without a hosted libc. The setjmp/longjmp pair is target-specific
- * assembly and lives in libcfree_rt.a -- see doc/builtins.md.
+ * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6);
+ * cfree provides it as an extension. The setjmp/longjmp pair is
+ * target-specific assembly in libcfree_rt.a -- see doc/builtins.md.
  *
- * jmp_buf is an array type (C11 7.13p2). Its layout is internal to the
- * runtime; the size below is conservative -- large enough to hold every
- * cfree target's callee-saved GPRs + callee-saved FPRs + sp + return
- * address. C11 7.13 explicitly excludes the floating-point status flags,
- * the state of open files, and any other component of the abstract
- * machine, so no signal-mask slot is reserved.
- */
+ * jmp_buf is an array type (C11 7.13p2). The runtime reinterprets the
+ * buffer as a per-target struct of callee-saved GPRs + callee-saved
+ * FPRs + sp + return address. The buffer below is sized to the largest
+ * such struct across cfree targets -- 256 bytes (x86_64 Windows: 12
+ * GPR slots + xmm6-15). C11 explicitly excludes the FP status flags
+ * and open-file state, so no signal-mask slot is reserved. The same
+ * 256-byte payload is shared with <stdcoro.h>'s coro_ctx so the
+ * underlying save/restore halves are reused across all three
+ * primitives. */
 #ifndef CFREE_SETJMP_H
 #define CFREE_SETJMP_H
 
-typedef long jmp_buf[32];
+/* Wrap in a struct so 16-byte alignment is guaranteed even when the
+   user puts a jmp_buf on the stack -- xmm save instructions require
+   it on x86_64. The [1] makes jmp_buf an array type as the standard
+   demands, so passing one to setjmp/longjmp decays to a pointer. */
+typedef struct {
+    _Alignas(16) unsigned char __cfree_storage[256];
+} jmp_buf[1];
 
 int            setjmp(jmp_buf env);
 _Noreturn void longjmp(jmp_buf env, int val);
diff --git a/include/stdcoro.h b/include/stdcoro.h
@@ -0,0 +1,56 @@
+/* stdcoro.h -- cfree extension -- stackful symmetric coroutines
+ *
+ * stdcoro.h is non-standard: C11 has no stackful-coroutine facility.
+ * cfree ships it as a native counterpart to <setjmp.h>: the underlying
+ * per-target context struct, save sequence, and restore sequence are
+ * literally shared with setjmp/longjmp -- only the entry shapes differ
+ * (setjmp = save+return-0; longjmp = restore+return-val; coro_switch =
+ * save(from)+restore(to)+deliver-value). Implementations live in
+ * libcfree_rt.a -- see doc/builtins.md.
+ *
+ * Programming model
+ *   1. Allocate a coro_ctx and a stack region.
+ *   2. coro_init(&ctx, stack_base, stack_len, entry).
+ *   3. coro_switch(&caller, &ctx, value) -- delivers `value` to entry's
+ *      uintptr_t argument on first switch in.
+ *   4. Inside the coroutine, coro_switch(&ctx, &caller, value) yields
+ *      back, with `value` becoming the caller's coro_switch return.
+ *   5. entry must NOT return; the trampoline traps if it does.
+ *
+ * coro_ctx is sized conservatively -- large enough for every cfree
+ * target's callee-saved registers + sp + ip + (where applicable)
+ * callee-saved FP regs. Layout is internal to the runtime.
+ */
+#ifndef CFREE_STDCORO_H
+#define CFREE_STDCORO_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Stack alignment that satisfies function-call boundaries on every
+   cfree target (16 on x86_64/aarch64/riscv; arm32 AAPCS and i386 need
+   less, but 16 covers them). Caller stacks must be aligned to this. */
+#define CORO_STACK_ALIGN 16
+
+/* 256 bytes is the largest per-target context across cfree's targets
+   (x86_64 Windows: 12 GPR slots + xmm6-15). Same byte payload as
+   <setjmp.h>'s jmp_buf -- the per-arch runtime reinterprets either
+   as the same internal struct. */
+typedef struct coro_ctx {
+    _Alignas(16) unsigned char __cfree_storage[256];
+} coro_ctx;
+
+typedef void (*coro_entry_fn)(uintptr_t value);
+
+/* Initialize *ctx to begin executing entry(value) on first switch in,
+   using the stack region [stack_base, stack_base + stack_len). The
+   stack base must be CORO_STACK_ALIGN-aligned. entry must not return. */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry);
+
+/* Save callee-saved state into *from, restore it from *to, deliver
+   `value` to *to. Returns the value passed by the next switch back. */
+uintptr_t coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value);
+
+#endif
diff --git a/lib/README.md b/lib/README.md
@@ -33,6 +33,7 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired.
 | `riscv/rv64.S`             | `__riscv_save_*` + `__riscv_restore_*` (rv64)               | RISC-V rv64 with `-msave-restore`                   |
 | `mem/mem.c`                | `memcpy` / `memmove` / `memset` / `memcmp` (weak)           | All; user libc overrides                            |
 | `atomic/atomic_freestanding.c` | `__atomic_*` fallback shim                              | All                                                 |
+| `coro/<arch>.c`            | `setjmp` / `longjmp` (`<setjmp.h>`) + `coro_init` / `coro_switch` / `__cfree_coro_trampoline` (`<stdcoro.h>`) | One of `aarch64`, `arm32`, `arm32_thumb1`, `i386`, `riscv32`, `riscv64`, `x86_64`, `x86_64_win`. Not built for `wasm32`. |
 
 ### Build-time include dirs (consumed by the masters; nothing here lands in `libcfree_rt.a`)
 
@@ -141,6 +142,27 @@ Hand-written portable C (not from compiler-rt). All four functions are weak
 so a user libc, or a tuned arch-specific replacement, wins at link time.
 `arm/aeabi_thumb{1,2}.S`'s `aeabi_mem*` symbols forward to these.
 
+### `coro/<arch>.c`
+One master `.c` per arch that supplies both `<setjmp.h>` (`setjmp`,
+`longjmp`) and `<stdcoro.h>` (`coro_init`, `coro_switch`,
+`__cfree_coro_trampoline`). The setjmp/longjmp/coro_switch primitives
+share a per-arch struct (callee-saved GPRs + callee-saved FPRs + sp +
+return address) and one pair of C string-concat macros
+`SAVE_INTO(reg)` / `RESTORE_FROM(reg)` so the same instruction bytes
+are emitted in all three places. Written as file-scope `__asm__`
+inside a `.c` file (not a separate `.S`) so the asm and the tiny
+`coro_init` C function stay co-located. Symbol naming uses
+`__USER_LABEL_PREFIX__` so the same source compiles for ELF / Mach-O /
+COFF.
+
+ARM ships two variants: `arm32.c` (Thumb-2, ARMv7+, optional VFP
+`d8-d15` gated on `__ARM_FP`) and `arm32_thumb1.c` (ARMv6-M /
+Cortex-M0/M0+; no IT blocks, no VFP, data-processing restricted to
+r0-r7, no `str sp` / `str rN, [sp,...]` -- the asm sequences don't
+share with arm32.c so it's a separate file).
+
+Not provided for `wasm32` (would need an Asyncify-fiber port).
+
 ### `atomic/atomic_freestanding.c`
 Defines a pointer-sized `_Atomic(uintptr_t)` spinlock as the lock primitive
 (no OS dependency) then `#include`s `atomic_common.inc`, which contains the
diff --git a/lib/build.sh b/lib/build.sh
@@ -93,6 +93,16 @@ ARM_AEABI_THUMB1="arm/aeabi_thumb1.S arm/aeabi.c"
 RV32_SR="riscv/rv32.S"
 RV64_SR="riscv/rv64.S"
 
+# Coro + setjmp/longjmp: one master .c per arch, file-scope asm inside.
+CORO_X86_64="coro/x86_64.c"
+CORO_X86_64_WIN="coro/x86_64_win.c"
+CORO_I386="coro/i386.c"
+CORO_AARCH64="coro/aarch64.c"
+CORO_ARM32="coro/arm32.c"
+CORO_ARM32_THUMB1="coro/arm32_thumb1.c"
+CORO_RV32="coro/riscv32.c"
+CORO_RV64="coro/riscv64.c"
+
 #-------------------------------------------------------------------------------
 # Variants
 #-------------------------------------------------------------------------------
@@ -103,71 +113,79 @@ echo
 # ---- LP64 little-endian ------------------------------------------------------
 LP64_BASE="$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C"
 
+# Coro impl needs cfree's own headers (setjmp.h, stdcoro.h).
+CORO_INC="-I../include"
+
 build_variant x86_64-linux \
-    "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1" \
-    "$LP64_BASE"
+    "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$LP64_BASE $CORO_X86_64"
 
 build_variant x86_64-apple-darwin \
-    "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \
-    "$LP64_BASE"
+    "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$LP64_BASE $CORO_X86_64"
 
 # aarch64-linux: long double is binary128; needs fp_tf + fp_ti and the
 # tf_supplement.h pre-include.
 build_variant aarch64-linux \
     "--target=aarch64-linux-gnu \
      -Iinclude/lp64_le_ldbl128 -Iinclude/lp64_le -DHAS_INT128=1 \
-     -include include/lp64_le_ldbl128/tf_supplement.h" \
-    "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C"
+     -include include/lp64_le_ldbl128/tf_supplement.h $CORO_INC" \
+    "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C $CORO_AARCH64"
 
 build_variant aarch64-apple-darwin \
-    "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \
-    "$LP64_BASE"
+    "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$LP64_BASE $CORO_AARCH64"
 
 build_variant riscv64-elf \
     "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd \
-     -Iinclude/lp64_le -DHAS_INT128=1" \
-    "$LP64_BASE"
+     -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$LP64_BASE $CORO_RV64"
 
 build_variant riscv64-elf-save-restore \
     "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd -msave-restore \
-     -Iinclude/lp64_le -DHAS_INT128=1" \
-    "$LP64_BASE $RV64_SR"
+     -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$LP64_BASE $RV64_SR $CORO_RV64"
 
 # ---- LLP64 little-endian (Win64) --------------------------------------------
 build_variant x86_64-pc-windows \
-    "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1" \
-    "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C"
+    "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1 $CORO_INC" \
+    "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C $CORO_X86_64_WIN"
 
 # ---- ILP32 little-endian -----------------------------------------------------
 ILP32_BASE="$INT_C $INT32_C $FP_C $MEM_C $ATOMIC_C"
 
 build_variant i386-linux \
-    "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0" \
-    "$ILP32_BASE"
+    "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+    "$ILP32_BASE $CORO_I386"
 
+# wasm32: no setjmp/coro impl yet -- Emscripten fibers / sjlj are a
+# separate runtime model that hasn't been ported to cfree.
 build_variant wasm32 \
     "--target=wasm32-unknown-unknown -Iinclude/ilp32_le -DHAS_INT128=0" \
     "$ILP32_BASE"
 
 build_variant riscv32-elf \
     "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd \
-     -Iinclude/ilp32_le -DHAS_INT128=0" \
-    "$ILP32_BASE"
+     -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+    "$ILP32_BASE $CORO_RV32"
 
 build_variant riscv32-elf-save-restore \
     "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd -msave-restore \
-     -Iinclude/ilp32_le -DHAS_INT128=0" \
-    "$ILP32_BASE $RV32_SR"
+     -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+    "$ILP32_BASE $RV32_SR $CORO_RV32"
 
 build_variant arm-eabi-thumb2 \
     "--target=arm-none-eabi -march=armv7-a -mthumb -mfloat-abi=soft \
-     -Iinclude/ilp32_le -DHAS_INT128=0" \
-    "$ILP32_BASE $ARM_AEABI_THUMB2"
+     -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+    "$ILP32_BASE $ARM_AEABI_THUMB2 $CORO_ARM32"
 
+# arm-eabi-thumb1 (Cortex-M0/M0+, ARMv6-M): Thumb-1 ISA, no IT blocks,
+# data-processing ops restricted to r0-r7, no VFP. Coro impl is a
+# separate file from arm32.c since the asm sequences don't share.
 build_variant arm-eabi-thumb1 \
     "--target=arm-none-eabi -march=armv6-m -mthumb -mfloat-abi=soft \
-     -Iinclude/ilp32_le -DHAS_INT128=0" \
-    "$ILP32_BASE $ARM_AEABI_THUMB1"
+     -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \
+    "$ILP32_BASE $ARM_AEABI_THUMB1 $CORO_ARM32_THUMB1"
 
 #-------------------------------------------------------------------------------
 echo
diff --git a/lib/coro/aarch64.c b/lib/coro/aarch64.c
@@ -0,0 +1,137 @@
+/*
+ * lib/coro/aarch64.c -- AArch64 (AAPCS) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * All three primitives sit on one per-target context layout:
+ *
+ *   regs[0..9]    x19-x28
+ *   regs[10..11]  fp (x29), lr (x30)
+ *   regs[12]      sp
+ *   fp_regs[0..7] d8-d15  (low 64 bits of v8-v15; AAPCS only mandates
+ *                          the lower 64 bits be preserved)
+ *
+ * sizeof = 176 (alignof-16 padded), 16-byte aligned. Fits in the
+ * 256-byte storage carved out by jmp_buf and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte
+ * sequence is emitted in setjmp, longjmp, and coro_switch without
+ * any duplication or gas-specific .macro tricks.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling on both ELF (no prefix) and Mach-O
+ * (leading "_").
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_arm64_ctx {
+    uintptr_t regs[13];    /* [0..9] x19-x28, [10] x29 (fp), [11] x30 (lr), [12] sp */
+    uint64_t  fp_regs[8];  /* d8-d15, stored by the asm at byte offset 104 onward */
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_arm64_ctx) == 176,                  "layout");
+_Static_assert(_Alignof(struct __cfree_arm64_ctx) == 16,                 "align");
+_Static_assert(offsetof(struct __cfree_arm64_ctx, fp_regs) == 104,       "fp off");
+_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm64_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_arm64_ctx *saved = (struct __cfree_arm64_ctx *)ctx;
+    uintptr_t stack_top = (uintptr_t)stack_base + stack_len;
+
+    /* Clear every word of the context, tail padding included. */
+    for (size_t i = sizeof(*saved) / sizeof(uintptr_t); i-- > 0; )
+        ((uintptr_t *)saved)[i] = 0;
+
+    /* Stacks grow down: start at the region's end, rounded down to 16. */
+    stack_top -= stack_top % CORO_STACK_ALIGN;
+    saved->regs[12] = stack_top;                            /* sp */
+    saved->regs[11] = (uintptr_t)__cfree_coro_trampoline;   /* lr */
+    saved->regs[10] = 0;                                    /* fp */
+    saved->regs[0]  = (uintptr_t)entry;                     /* x19 -- entry fn */
+}
+
+#define STR_(x) #x                              /* stringify x exactly as written */
+#define STR(x)  STR_(x)                         /* macro-expand x, then stringify */
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n   /* n with the target's label prefix */
+
+/* Save x19-x30, sp, d8-d15 to [reg]; clobbers x9 (used to copy sp). */
+#define SAVE_INTO(reg) \
+    "    stp  x19, x20, [" reg ", #0]\n"   \
+    "    stp  x21, x22, [" reg ", #16]\n"  \
+    "    stp  x23, x24, [" reg ", #32]\n"  \
+    "    stp  x25, x26, [" reg ", #48]\n"  \
+    "    stp  x27, x28, [" reg ", #64]\n"  \
+    "    stp  x29, x30, [" reg ", #80]\n"  \
+    "    mov  x9, sp\n"                    \
+    "    str  x9,       [" reg ", #96]\n"  \
+    "    stp  d8,  d9,  [" reg ", #104]\n" \
+    "    stp  d10, d11, [" reg ", #120]\n" \
+    "    stp  d12, d13, [" reg ", #136]\n" \
+    "    stp  d14, d15, [" reg ", #152]\n"
+
+/* Reload d8-d15, x19-x30, then sp from [reg]; clobbers x9 (sp staging). */
+#define RESTORE_FROM(reg) \
+    "    ldp  d8,  d9,  [" reg ", #104]\n" \
+    "    ldp  d10, d11, [" reg ", #120]\n" \
+    "    ldp  d12, d13, [" reg ", #136]\n" \
+    "    ldp  d14, d15, [" reg ", #152]\n" \
+    "    ldp  x19, x20, [" reg ", #0]\n"   \
+    "    ldp  x21, x22, [" reg ", #16]\n"  \
+    "    ldp  x23, x24, [" reg ", #32]\n"  \
+    "    ldp  x25, x26, [" reg ", #48]\n"  \
+    "    ldp  x27, x28, [" reg ", #64]\n"  \
+    "    ldp  x29, x30, [" reg ", #80]\n"  \
+    "    ldr  x9,       [" reg ", #96]\n"  \
+    "    mov  sp, x9\n"
+
+__asm__ (
+    ".text\n"
+    ".align 4\n"
+
+    /* setjmp(env) -- env in x0. Snapshot the caller's state (lr is the
+       return address longjmp must resume at), then return 0. */
+    ".globl " SYM(setjmp) "\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("x0")
+    "    mov  x0, #0\n"
+    "    ret\n"
+
+    /* longjmp(env, val) -- env in x0, val in w1. val is a 32-bit int
+       and AAPCS64 leaves x1[63:32] unspecified, so test only w1.
+       0 must become 1 (C11 7.13.2.1p4): w0 = (w1 != 0) ? w1 : 1. */
+    ".globl " SYM(longjmp) "\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("x0")
+    "    cmp  w1, #0\n"
+    "    csinc w0, w1, wzr, ne\n"
+    "    ret\n"
+
+    /* coro_switch(from, to, value) -- x0, x1, x2. Save into [x0],
+       restore from [x1], deliver x2 in x0 (which is both the return
+       register here and the first-arg register the trampoline reads
+       on a fresh context's first run). */
+    ".globl " SYM(coro_switch) "\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("x0")
+    RESTORE_FROM("x1")
+    "    mov  x0, x2\n"
+    "    ret\n"
+
+    /* __cfree_coro_trampoline -- on first entry x0 = value (delivered),
+       x19 = entry fn (set by coro_init), sp aligned to 16. brk if entry
+       returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    blr  x19\n"
+    "    brk  #0\n"
+);
diff --git a/lib/coro/arm32.c b/lib/coro/arm32.c
@@ -0,0 +1,202 @@
+/*
+ * lib/coro/arm32.c -- ARM32 Thumb-2 (AAPCS) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * All three primitives sit on one per-target context layout:
+ *
+ *   regs[0..7]    r4-r11
+ *   regs[8]       sp
+ *   regs[9]       lr
+ *   fp_regs[0..7] d8-d15  (the VFP registers AAPCS requires a callee
+ *                          to preserve, i.e. s16-s31; saved
+ *                          only when __ARM_FP is defined, but the
+ *                          slots are always allocated so the byte
+ *                          layout is stable across soft/hard-float
+ *                          builds).
+ *
+ * 10*4 GPR slots + 8*8 fp_regs slots = 104 bytes of payload, padded
+ * to 112 by alignof(16). fp_regs at offset 40. Fits in the 256-byte
+ * storage carved out by jmp_buf and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch. The VFP half is
+ * gated by a C-level #ifdef on __ARM_FP -- the cpp pass picks one
+ * macro body before the assembler sees anything, so we can't hide
+ * `#ifdef` inside the asm string.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling on both ELF (no prefix) and Mach-O
+ * (leading "_").
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct __cfree_arm32_ctx {
+    uintptr_t regs[10];    /* [0..7] r4-r11, [8] sp, [9] lr */
+    uint64_t  fp_regs[8];  /* d8-d15; written only when __ARM_FP is defined */
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_arm32_ctx) == 112,                  "layout");
+_Static_assert(_Alignof(struct __cfree_arm32_ctx) == 16,                 "align");
+_Static_assert(offsetof(struct __cfree_arm32_ctx, fp_regs) == 40,        "fp off");
+_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_arm32_ctx *saved = (struct __cfree_arm32_ctx *)ctx;
+    uintptr_t stack_top = (uintptr_t)stack_base + stack_len;
+
+    /* Clear every word of the context, tail padding included. */
+    for (size_t i = sizeof(*saved) / sizeof(uintptr_t); i-- > 0; )
+        ((uintptr_t *)saved)[i] = 0;
+
+    /* Stacks grow down: start at the end of the region, rounded down to
+       CORO_STACK_ALIGN (stricter than the 8 bytes AAPCS asks for). */
+    stack_top -= stack_top % CORO_STACK_ALIGN;
+    saved->regs[8] = stack_top;                            /* sp */
+    saved->regs[9] = (uintptr_t)__cfree_coro_trampoline;   /* lr */
+    saved->regs[3] = 0;                                    /* r7 -- frame ptr */
+    saved->regs[0] = (uintptr_t)entry;                     /* r4 -- entry fn */
+}
+
+#define STR_(x) #x                              /* stringify x exactly as written */
+#define STR(x)  STR_(x)                         /* macro-expand x, then stringify */
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n   /* n with the target's label prefix */
+
+/* SAVE_INTO(reg) stores r4-r11, sp, lr at [reg + 0..36]; RESTORE_FROM
+   reloads them. With __ARM_FP the preprocessor selects the variants
+   that also spill/reload d8-d15 at [reg + 40..96]; without it those
+   slots stay allocated but untouched, so the offsets never change. */
+#ifdef __ARM_FP
+#define SAVE_INTO(reg) \
+    "    str  r4,  [" reg ", #0]\n"   \
+    "    str  r5,  [" reg ", #4]\n"   \
+    "    str  r6,  [" reg ", #8]\n"   \
+    "    str  r7,  [" reg ", #12]\n"  \
+    "    str  r8,  [" reg ", #16]\n"  \
+    "    str  r9,  [" reg ", #20]\n"  \
+    "    str  r10, [" reg ", #24]\n"  \
+    "    str  r11, [" reg ", #28]\n"  \
+    "    str  sp,  [" reg ", #32]\n"  \
+    "    str  lr,  [" reg ", #36]\n"  \
+    "    vstr d8,  [" reg ", #40]\n"  \
+    "    vstr d9,  [" reg ", #48]\n"  \
+    "    vstr d10, [" reg ", #56]\n"  \
+    "    vstr d11, [" reg ", #64]\n"  \
+    "    vstr d12, [" reg ", #72]\n"  \
+    "    vstr d13, [" reg ", #80]\n"  \
+    "    vstr d14, [" reg ", #88]\n"  \
+    "    vstr d15, [" reg ", #96]\n"
+
+#define RESTORE_FROM(reg) \
+    "    vldr d8,  [" reg ", #40]\n"  \
+    "    vldr d9,  [" reg ", #48]\n"  \
+    "    vldr d10, [" reg ", #56]\n"  \
+    "    vldr d11, [" reg ", #64]\n"  \
+    "    vldr d12, [" reg ", #72]\n"  \
+    "    vldr d13, [" reg ", #80]\n"  \
+    "    vldr d14, [" reg ", #88]\n"  \
+    "    vldr d15, [" reg ", #96]\n"  \
+    "    ldr  r4,  [" reg ", #0]\n"   \
+    "    ldr  r5,  [" reg ", #4]\n"   \
+    "    ldr  r6,  [" reg ", #8]\n"   \
+    "    ldr  r7,  [" reg ", #12]\n"  \
+    "    ldr  r8,  [" reg ", #16]\n"  \
+    "    ldr  r9,  [" reg ", #20]\n"  \
+    "    ldr  r10, [" reg ", #24]\n"  \
+    "    ldr  r11, [" reg ", #28]\n"  \
+    "    ldr  sp,  [" reg ", #32]\n"  \
+    "    ldr  lr,  [" reg ", #36]\n"
+#else
+#define SAVE_INTO(reg) \
+    "    str  r4,  [" reg ", #0]\n"   \
+    "    str  r5,  [" reg ", #4]\n"   \
+    "    str  r6,  [" reg ", #8]\n"   \
+    "    str  r7,  [" reg ", #12]\n"  \
+    "    str  r8,  [" reg ", #16]\n"  \
+    "    str  r9,  [" reg ", #20]\n"  \
+    "    str  r10, [" reg ", #24]\n"  \
+    "    str  r11, [" reg ", #28]\n"  \
+    "    str  sp,  [" reg ", #32]\n"  \
+    "    str  lr,  [" reg ", #36]\n"
+
+#define RESTORE_FROM(reg) \
+    "    ldr  r4,  [" reg ", #0]\n"   \
+    "    ldr  r5,  [" reg ", #4]\n"   \
+    "    ldr  r6,  [" reg ", #8]\n"   \
+    "    ldr  r7,  [" reg ", #12]\n"  \
+    "    ldr  r8,  [" reg ", #16]\n"  \
+    "    ldr  r9,  [" reg ", #20]\n"  \
+    "    ldr  r10, [" reg ", #24]\n"  \
+    "    ldr  r11, [" reg ", #28]\n"  \
+    "    ldr  sp,  [" reg ", #32]\n"  \
+    "    ldr  lr,  [" reg ", #36]\n"
+#endif
+
+__asm__ (
+    ".syntax unified\n"
+    ".thumb\n"
+    ".text\n"
+    ".align 2\n"
+
+    /* setjmp(env) -- env in r0. Snapshot the caller's state (lr is the
+       return address longjmp must resume at), then return 0. */
+    ".globl " SYM(setjmp) "\n"
+    ".thumb_func\n"
+    ".type " SYM(setjmp) ", %function\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("r0")
+    "    movs r0, #0\n"
+    "    bx   lr\n"
+
+    /* longjmp(env, val) -- env in r0, val in r1.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); the IT block
+       gives r1 = (r1 == 0) ? 1 : r1, then we move it into r0 and
+       branch to the lr reloaded from env. */
+    ".globl " SYM(longjmp) "\n"
+    ".thumb_func\n"
+    ".type " SYM(longjmp) ", %function\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("r0")
+    "    cmp  r1, #0\n"
+    "    it   eq\n"
+    "    moveq r1, #1\n"
+    "    mov  r0, r1\n"
+    "    bx   lr\n"
+
+    /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value.
+       Save into [r0], restore from [r1], deliver r2 in r0. The lr
+       loaded by RESTORE_FROM is either a real return address (a
+       previously-suspended coro) or __cfree_coro_trampoline (a fresh
+       coro initialized by coro_init). Either way `bx lr` lands there
+       with r0 holding `value`. */
+    ".globl " SYM(coro_switch) "\n"
+    ".thumb_func\n"
+    ".type " SYM(coro_switch) ", %function\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("r0")
+    RESTORE_FROM("r1")
+    "    mov  r0, r2\n"
+    "    bx   lr\n"
+
+    /* __cfree_coro_trampoline -- on first entry r0 = value (delivered
+       by coro_switch's `mov r0, r2`), r4 = entry fn (set by coro_init),
+       sp aligned to 16. Traps with udf if entry ever returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    ".thumb_func\n"
+    ".type " SYM(__cfree_coro_trampoline) ", %function\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    blx  r4\n"
+    "    udf  #0\n"
+
+    ".section .note.GNU-stack,\"\",%progbits\n"  /* ELF note: no executable stack needed */
+);
diff --git a/lib/coro/arm32_thumb1.c b/lib/coro/arm32_thumb1.c
@@ -0,0 +1,174 @@
+/*
+ * lib/coro/arm32_thumb1.c -- ARMv6-M (Cortex-M0 / M0+, Thumb-1) impls of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * Thumb-1 / ARMv6-M is a strict subset of the Thumb-2 ISA used by the
+ * sibling arm32.c, and several conveniences disappear:
+ *
+ *   - no IT blocks: conditional execution must use a forward branch.
+ *   - data-processing ops are restricted to r0-r7. r8-r15 are reachable
+ *     only via the `mov` high-register form and a few specials; in
+ *     particular there is no `str rN, [sp,...]` / `str sp, [rN,...]`.
+ *   - `mov rd, rm` with *both* operands low is UNPREDICTABLE in
+ *     ARMv6-M; use the T2 flags-setting form `movs rd, rm` for low->low
+ *     register copies. The plain `mov` form is reserved for cases where
+ *     at least one operand is a high register (sp/lr/r8-r11).
+ *   - no VFP coprocessor on M0/M0+, so no fp_regs slots.
+ *
+ * Layout: 10 GPR slots (r4-r11, sp, lr) = 40 bytes, padded to 48 bytes
+ * by __attribute__((aligned(16))). Fits in the 256-byte storage carved
+ * out by jmp_buf and coro_ctx.
+ *
+ * SAVE_INTO uses r4-r7 as scratches *after* they have themselves been
+ * stored, so r0-r3 are never clobbered. That matters for coro_switch:
+ * `to` (r1) and `value` (r2) survive across the save half and are still
+ * live for the restore half / value delivery.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved register context for ARMv6-M, overlaid on jmp_buf / coro_ctx
+   storage. Slot order is fixed by the byte offsets hard-coded in
+   SAVE_INTO / RESTORE_FROM: regs[0..3] = r4-r7, regs[4..7] = r8-r11,
+   regs[8] = sp, regs[9] = lr. */
+struct __cfree_arm32_thumb1_ctx {
+    uintptr_t regs[10];
+} __attribute__((aligned(16)));
+
+/* Compile-time layout checks: size/alignment are what the assembly
+   assumes, and the context fits inside (and is no more strictly aligned
+   than) the public jmp_buf / coro_ctx storage it is cast from. */
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) == 48,                  "layout");
+_Static_assert(_Alignof(struct __cfree_arm32_thumb1_ctx) == 16,                "align");
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(coro_ctx),    "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(jmp_buf),     "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_thumb1_ctx),"align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so that the first coro_switch() into it lands in
+   __cfree_coro_trampoline, which calls `entry` on the caller-supplied
+   stack [stack_base, stack_base + stack_len). */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_arm32_thumb1_ctx *saved = (struct __cfree_arm32_thumb1_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)saved;
+    uintptr_t *const end = word + sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Clear every word of the context, tail padding included. */
+    while (word != end) {
+        *word++ = 0;
+    }
+
+    /* ARM stacks grow down, so the initial sp is the end of the buffer
+       rounded down to CORO_STACK_ALIGN (16; stricter than the 8 that
+       AAPCS requires at public boundaries). */
+    const uintptr_t align_mask = (uintptr_t)(CORO_STACK_ALIGN - 1);
+    const uintptr_t initial_sp = ((uintptr_t)stack_base + stack_len) & ~align_mask;
+
+    saved->regs[0] = (uintptr_t)entry;                    /* r4: trampoline calls this */
+    saved->regs[3] = 0;                                   /* r7: frame pointer */
+    saved->regs[8] = initial_sp;                          /* sp */
+    saved->regs[9] = (uintptr_t)__cfree_coro_trampoline;  /* lr: first `bx lr` target */
+}
+
+/* SYM(n) yields a string literal spelling symbol `n` the way the
+   assembler must see it: __USER_LABEL_PREFIX__ followed by the name.
+   STR_/STR are the usual two-level stringification so that the prefix
+   macro is expanded before it is turned into a string. */
+#define STR_(x) #x
+#define STR(x)  STR_(x)
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved state into [reg] (reg is a register-name string such
+   as "r0").
+   Stage 1: store r4-r7 directly (low->low str is fine).
+   Stage 2: with r4-r7 already saved, reuse them as scratches to copy
+            the high regs r8-r11 down and store them.
+   Stage 3: same trick for sp and lr.
+   r0-r3 are never touched.
+   Caveat: on exit r4-r7 no longer hold the caller's values (r4 = sp,
+   r5 = lr, r6 = r10, r7 = r11). Code that does not follow this macro with
+   RESTORE_FROM must reload r4-r7 from offsets 0..12 itself before
+   returning to a caller that expects them preserved. */
+#define SAVE_INTO(reg)                       \
+    "    str  r4,  [" reg ", #0]\n"          \
+    "    str  r5,  [" reg ", #4]\n"          \
+    "    str  r6,  [" reg ", #8]\n"          \
+    "    str  r7,  [" reg ", #12]\n"         \
+    "    mov  r4,  r8\n"                     \
+    "    mov  r5,  r9\n"                     \
+    "    mov  r6,  r10\n"                    \
+    "    mov  r7,  r11\n"                    \
+    "    str  r4,  [" reg ", #16]\n"         \
+    "    str  r5,  [" reg ", #20]\n"         \
+    "    str  r6,  [" reg ", #24]\n"         \
+    "    str  r7,  [" reg ", #28]\n"         \
+    "    mov  r4,  sp\n"                     \
+    "    mov  r5,  lr\n"                     \
+    "    str  r4,  [" reg ", #32]\n"         \
+    "    str  r5,  [" reg ", #36]\n"
+
+/* Restore callee-saved state from [reg]. Mirror image of SAVE_INTO: load
+   r8-r11/sp/lr first via r4-r7 as scratches, then restore the real
+   r4-r7 last. Only r4-r11, sp and lr are written, so a base register in
+   r0-r3 keeps its value; reg must not be one of r4-r7, which are
+   overwritten while the base is still needed. */
+#define RESTORE_FROM(reg)                    \
+    "    ldr  r4,  [" reg ", #16]\n"         \
+    "    ldr  r5,  [" reg ", #20]\n"         \
+    "    ldr  r6,  [" reg ", #24]\n"         \
+    "    ldr  r7,  [" reg ", #28]\n"         \
+    "    mov  r8,  r4\n"                     \
+    "    mov  r9,  r5\n"                     \
+    "    mov  r10, r6\n"                     \
+    "    mov  r11, r7\n"                     \
+    "    ldr  r4,  [" reg ", #32]\n"         \
+    "    ldr  r5,  [" reg ", #36]\n"         \
+    "    mov  sp,  r4\n"                     \
+    "    mov  lr,  r5\n"                     \
+    "    ldr  r4,  [" reg ", #0]\n"          \
+    "    ldr  r5,  [" reg ", #4]\n"          \
+    "    ldr  r6,  [" reg ", #8]\n"          \
+    "    ldr  r7,  [" reg ", #12]\n"
+
+/* Top-level assembly defining setjmp, longjmp, coro_switch and
+   __cfree_coro_trampoline for Thumb-1 (ARMv6-M). */
+__asm__ (
+    ".syntax unified\n"
+    ".thumb\n"
+    ".text\n"
+    ".align 2\n"
+
+    /* setjmp(env) -- env in r0. lr at call time is the return address
+       into the caller, exactly what longjmp must restore.
+       SAVE_INTO finishes with r4-r7 holding its scratch copies (r4 = sp,
+       r5 = lr, r6 = r10, r7 = r11). Unlike coro_switch, setjmp returns
+       straight to its caller, which expects the callee-saved r4-r7 to be
+       intact (AAPCS), so reload them from the slots just written before
+       returning 0. */
+    ".globl " SYM(setjmp) "\n"
+    ".thumb_func\n"
+    ".type " SYM(setjmp) ", %function\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("r0")
+    "    ldr  r4,  [r0, #0]\n"
+    "    ldr  r5,  [r0, #4]\n"
+    "    ldr  r6,  [r0, #8]\n"
+    "    ldr  r7,  [r0, #12]\n"
+    "    movs r0, #0\n"
+    "    bx   lr\n"
+
+    /* longjmp(env, val) -- env in r0, val in r1.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). No IT blocks in
+       Thumb-1, so use a forward branch for the substitution.
+       RESTORE_FROM does not touch r0-r3, so val is still in r1 after it.
+       The low->low copy `r0 <- r1` and the immediate load use the
+       flag-setting `movs` form, following the file header's rule for
+       low-register moves. */
+    ".globl " SYM(longjmp) "\n"
+    ".thumb_func\n"
+    ".type " SYM(longjmp) ", %function\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("r0")
+    "    cmp  r1, #0\n"
+    "    bne  1f\n"
+    "    movs r1, #1\n"
+    "1:\n"
+    "    movs r0, r1\n"
+    "    bx   lr\n"
+
+    /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value.
+       SAVE_INTO leaves r0-r3 untouched, so r1 (to) and r2 (value) are
+       still live. RESTORE_FROM clobbers r4-r7 freely (they belong to
+       the resumed coro). The lr loaded by RESTORE_FROM is either a
+       real return address (a previously-suspended coro) or
+       __cfree_coro_trampoline (a fresh coro initialized by coro_init);
+       either way `bx lr` lands there with r0 holding `value`. */
+    ".globl " SYM(coro_switch) "\n"
+    ".thumb_func\n"
+    ".type " SYM(coro_switch) ", %function\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("r0")
+    RESTORE_FROM("r1")
+    "    movs r0, r2\n"
+    "    bx   lr\n"
+
+    /* __cfree_coro_trampoline -- on first entry r0 = value (delivered
+       by coro_switch's `movs r0, r2`), r4 = entry fn (set by coro_init),
+       sp aligned to 16. UDF #0 (T1, ARMv6-M) traps if entry returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    ".thumb_func\n"
+    ".type " SYM(__cfree_coro_trampoline) ", %function\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    blx  r4\n"
+    "    udf  #0\n"
+
+    ".section .note.GNU-stack,\"\",%progbits\n"
+);
diff --git a/lib/coro/i386.c b/lib/coro/i386.c
@@ -0,0 +1,143 @@
+/*
+ * lib/coro/i386.c -- i386 System V (cdecl, ILP32) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * cdecl callee-saved set: ebx, esi, edi, ebp, esp. Args are pushed
+ * right-to-left on the stack: at function entry, 4(%esp)=arg0,
+ * 8(%esp)=arg1, 12(%esp)=arg2, (%esp)=return-address.
+ *
+ *   regs[0]:  ebx     (also stashes entry fn for the trampoline)
+ *   regs[1]:  esi
+ *   regs[2]:  edi
+ *   regs[3]:  ebp
+ *   regs[4]:  esp     (caller's pre-call esp)
+ *   regs[5]:  eip     (return address)
+ *
+ * 6 × 4 = 24 bytes of state, padded to sizeof = 32 by the 16-byte
+ * over-alignment (vs. natural 4) so coro_ctx's 16-byte alignment is
+ * matched.
+ *
+ *   setjmp(env)             4(%esp)=env
+ *   longjmp(env, val)       4(%esp)=env, 8(%esp)=val
+ *   coro_switch(f, t, val)  4(%esp)=from, 8(%esp)=to, 12(%esp)=value
+ *
+ * The "save esp/eip" trick: at function entry, (%esp) holds the caller's
+ * return address (just pushed by `call`), and the address %esp + 4 is
+ * the caller's pre-call esp. Saving those two lets longjmp/coro_switch
+ * "land" at the call site exactly as if the function had returned.
+ *
+ * Modern SysV i386 (ABI rev 1.1+) requires 16-byte stack alignment
+ * before each `call`; the trampoline `andl $-16, %esp` enforces this
+ * defensively for fresh contexts.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved register context for i386, overlaid on jmp_buf / coro_ctx
+   storage. Slot order (fixed by the offsets in SAVE_INTO / RESTORE_FROM):
+   regs[0] = ebx, [1] = esi, [2] = edi, [3] = ebp, [4] = esp, [5] = eip. */
+struct __cfree_i386_ctx {
+    uintptr_t regs[6];
+} __attribute__((aligned(16)));
+
+/* Compile-time layout checks: 24 bytes of state padded to 32 by the
+   16-byte alignment, and the context fits inside the public types. */
+_Static_assert(sizeof(struct __cfree_i386_ctx) == 32,                   "layout");
+_Static_assert(_Alignof(struct __cfree_i386_ctx) == 16,                 "align");
+_Static_assert(sizeof(struct __cfree_i386_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_i386_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_i386_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so that the first coro_switch() into it jumps to
+   __cfree_coro_trampoline, which calls `entry` on the caller-supplied
+   stack [stack_base, stack_base + stack_len). */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_i386_ctx *saved = (struct __cfree_i386_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)saved;
+    uintptr_t *const end = word + sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Clear every word of the context, tail padding included. */
+    while (word != end) {
+        *word++ = 0;
+    }
+
+    /* i386 stacks grow down, so the initial esp is the end of the buffer
+       rounded down to CORO_STACK_ALIGN. */
+    const uintptr_t align_mask = (uintptr_t)(CORO_STACK_ALIGN - 1);
+    const uintptr_t initial_sp = ((uintptr_t)stack_base + stack_len) & ~align_mask;
+
+    saved->regs[0] = (uintptr_t)entry;                   /* ebx: trampoline calls this */
+    saved->regs[3] = 0;                                  /* ebp */
+    saved->regs[4] = initial_sp;                         /* esp */
+    saved->regs[5] = (uintptr_t)__cfree_coro_trampoline; /* eip: first jump target */
+}
+
+/* SYM(n) yields a string literal spelling symbol `n` the way the
+   assembler must see it: __USER_LABEL_PREFIX__ followed by the name.
+   STR_/STR are the usual two-level stringification so that the prefix
+   macro is expanded before it is turned into a string. */
+#define STR_(x) #x
+#define STR(x)  STR_(x)
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved + (caller's) esp + eip into [reg]; clobbers %eax, so
+   reg must not be %eax.
+   Must be used while the stack is still as at function entry:
+   (%esp) holds the return address (stored as eip) and the address
+   %esp+4 -- computed with leal, not loaded -- is the caller's pre-call
+   esp (stored as esp). */
+#define SAVE_INTO(reg) \
+    "    movl %ebx,  0(" reg ")\n" \
+    "    movl %esi,  4(" reg ")\n" \
+    "    movl %edi,  8(" reg ")\n" \
+    "    movl %ebp, 12(" reg ")\n" \
+    "    leal 4(%esp), %eax\n"     \
+    "    movl %eax, 16(" reg ")\n" \
+    "    movl (%esp), %eax\n"      \
+    "    movl %eax, 20(" reg ")\n"
+
+/* Restore callee-saved + esp from [reg], leave eip in %ecx ready to
+   jmp. Caller delivers the destination value in %eax beforehand; %eax is
+   not touched here. Because %esp is replaced, any stack arguments must be
+   read before this macro, and reg must not be a register it loads. */
+#define RESTORE_FROM(reg) \
+    "    movl  0(" reg "), %ebx\n" \
+    "    movl  4(" reg "), %esi\n" \
+    "    movl  8(" reg "), %edi\n" \
+    "    movl 12(" reg "), %ebp\n" \
+    "    movl 16(" reg "), %esp\n" \
+    "    movl 20(" reg "), %ecx\n"
+
+/* Top-level assembly defining setjmp, longjmp, coro_switch and
+   __cfree_coro_trampoline for i386 cdecl. */
+__asm__ (
+    ".text\n"
+    ".p2align 4\n"
+
+    /* setjmp(env) -- env at 4(%esp). Records the callee-saved registers
+       plus the esp/eip the caller will have once this call returns,
+       then returns 0. */
+    ".globl " SYM(setjmp) "\n"
+    SYM(setjmp) ":\n"
+    "    movl 4(%esp), %edx\n"
+    SAVE_INTO("%edx")
+    "    xorl %eax, %eax\n"
+    "    ret\n"
+
+    /* longjmp(env, val) -- env at 4(%esp), val at 8(%esp).
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). The substitution
+       uses cmp/adc rather than cmov so it runs on every IA-32 CPU
+       (cmov only exists from the P6 family on):
+         cmpl $1, %eax   ; CF = 1 iff val == 0 (unsigned val < 1)
+         adcl $0, %eax   ; val += CF
+       Both args are read before RESTORE_FROM replaces %esp. */
+    ".globl " SYM(longjmp) "\n"
+    SYM(longjmp) ":\n"
+    "    movl 4(%esp), %edx\n"        /* env */
+    "    movl 8(%esp), %eax\n"        /* val */
+    "    cmpl $1, %eax\n"             /* CF = (val == 0) */
+    "    adcl $0, %eax\n"             /* 0 -> 1, anything else unchanged */
+    RESTORE_FROM("%edx")
+    "    jmp *%ecx\n"
+
+    /* coro_switch(from, to, value) -- 4(%esp)=from, 8(%esp)=to, 12(%esp)=value.
+       SAVE_INTO clobbers only %eax (not %esp or the stack), so `to` and
+       `value` can still be read from the entry frame after it; they
+       must be read before RESTORE_FROM switches %esp. */
+    ".globl " SYM(coro_switch) "\n"
+    SYM(coro_switch) ":\n"
+    "    movl 4(%esp),  %edx\n"       /* from */
+    SAVE_INTO("%edx")
+    "    movl 8(%esp),  %edx\n"       /* to */
+    "    movl 12(%esp), %eax\n"       /* value -- delivered as return reg */
+    RESTORE_FROM("%edx")
+    "    jmp *%ecx\n"
+
+    /* __cfree_coro_trampoline -- on first entry: %eax=value, %ebx=entry,
+       %esp=stack_top (no return addr pushed -- coro_switch reaches here
+       via jmp). cdecl needs the arg pushed; align defensively, then
+       reserve 12 bytes + push value so that after the upcoming `call`
+       pushes the 4-byte return addr, the callee sees %esp+4 16-aligned.
+       ud2 traps if entry returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    andl $-16, %esp\n"
+    "    subl $12, %esp\n"
+    "    pushl %eax\n"                /* arg0 = value */
+    "    calll *%ebx\n"               /* entry(value) */
+    "    ud2\n"
+);
diff --git a/lib/coro/riscv32.c b/lib/coro/riscv32.c
@@ -0,0 +1,219 @@
+/*
+ * lib/coro/riscv32.c -- RISC-V 32-bit (ILP32/ILP32F/ILP32D) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * Per-target context layout (matches xOS rv32 tick_coro_ctx):
+ *
+ *   regs[0]:     ra
+ *   regs[1]:     sp
+ *   regs[2..13]: s0-s11
+ *   fp_regs[0..11]: fs0-fs11
+ *
+ * The fp_regs slots are always allocated (12 * 8 = 96 bytes at offset
+ * 56) so the struct layout is constant regardless of the F/D extension.
+ * The save/restore code is conditional on __riscv_flen:
+ *   __riscv_flen == 64 -> fsd/fld (64-bit, fills slots fully)
+ *   __riscv_flen == 32 -> fsw/flw (32-bit, stored back-to-back at offsets
+ *                         56..103, i.e. the first 48 bytes of fp_regs)
+ *   else               -> no FP save/restore
+ *
+ * Field bytes = 14*4 + 12*8 = 152; sizeof = 160 after 16-byte align
+ * tail padding. Fits in the 256-byte storage carved out by jmp_buf
+ * and coro_ctx.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch without duplication.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved register context for RV32, overlaid on jmp_buf / coro_ctx
+   storage. regs[0] = ra, regs[1] = sp, regs[2..13] = s0-s11 (byte
+   offsets 0..52); fp_regs starts at offset 56 and is always present so
+   the layout does not depend on the F/D extensions. */
+struct __cfree_riscv32_ctx {
+    uintptr_t regs[14];
+    uint64_t  fp_regs[12];
+} __attribute__((aligned(16)));
+
+/* Compile-time layout checks: size, alignment and the fp_regs offset are
+   what the assembly assumes, and the context fits the public types. */
+_Static_assert(sizeof(struct __cfree_riscv32_ctx) == 160,                  "layout");
+_Static_assert(_Alignof(struct __cfree_riscv32_ctx) == 16,                 "align");
+_Static_assert(offsetof(struct __cfree_riscv32_ctx, fp_regs) == 56,        "fp off");
+_Static_assert(sizeof(struct __cfree_riscv32_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_riscv32_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv32_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so that the first coro_switch() into it returns into
+   __cfree_coro_trampoline, which calls `entry` on the caller-supplied
+   stack [stack_base, stack_base + stack_len). */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_riscv32_ctx *saved = (struct __cfree_riscv32_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)saved;
+    uintptr_t *const end = word + sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Clear every word of the context: GPR slots, FP slots and padding. */
+    while (word != end) {
+        *word++ = 0;
+    }
+
+    /* RISC-V stacks grow down, so the initial sp is the end of the buffer
+       rounded down to CORO_STACK_ALIGN. */
+    const uintptr_t align_mask = (uintptr_t)(CORO_STACK_ALIGN - 1);
+    const uintptr_t initial_sp = ((uintptr_t)stack_base + stack_len) & ~align_mask;
+
+    saved->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra: first `ret` target */
+    saved->regs[1] = initial_sp;                         /* sp */
+    saved->regs[2] = (uintptr_t)entry;                   /* s0: trampoline calls this */
+}
+
+/* SYM(n) yields a string literal spelling symbol `n` the way the
+   assembler must see it: __USER_LABEL_PREFIX__ followed by the name.
+   STR_/STR are the usual two-level stringification so that the prefix
+   macro is expanded before it is turned into a string. */
+#define STR_(x) #x
+#define STR(x)  STR_(x)
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n
+
+/* Integer save: ra, sp, s0-s11 into regs[0..13] at offsets 0..52.
+   `reg` is the base-register name as a string (e.g. "a0"); it is only
+   read, and no scratch register is used. */
+#define SAVE_GPR(reg) \
+    "    sw ra,   0(" reg ")\n" \
+    "    sw sp,   4(" reg ")\n" \
+    "    sw s0,   8(" reg ")\n" \
+    "    sw s1,  12(" reg ")\n" \
+    "    sw s2,  16(" reg ")\n" \
+    "    sw s3,  20(" reg ")\n" \
+    "    sw s4,  24(" reg ")\n" \
+    "    sw s5,  28(" reg ")\n" \
+    "    sw s6,  32(" reg ")\n" \
+    "    sw s7,  36(" reg ")\n" \
+    "    sw s8,  40(" reg ")\n" \
+    "    sw s9,  44(" reg ")\n" \
+    "    sw s10, 48(" reg ")\n" \
+    "    sw s11, 52(" reg ")\n"
+
+/* Integer restore: the exact inverse of SAVE_GPR -- reloads ra, sp and
+   s0-s11 from offsets 0..52 of [reg]. `reg` must not be one of the
+   registers being loaded (callers in this file pass a0 or a1). */
+#define RESTORE_GPR(reg) \
+    "    lw ra,   0(" reg ")\n" \
+    "    lw sp,   4(" reg ")\n" \
+    "    lw s0,   8(" reg ")\n" \
+    "    lw s1,  12(" reg ")\n" \
+    "    lw s2,  16(" reg ")\n" \
+    "    lw s3,  20(" reg ")\n" \
+    "    lw s4,  24(" reg ")\n" \
+    "    lw s5,  28(" reg ")\n" \
+    "    lw s6,  32(" reg ")\n" \
+    "    lw s7,  36(" reg ")\n" \
+    "    lw s8,  40(" reg ")\n" \
+    "    lw s9,  44(" reg ")\n" \
+    "    lw s10, 48(" reg ")\n" \
+    "    lw s11, 52(" reg ")\n"
+
+/* Floating-point save/restore of fs0-fs11, selected by __riscv_flen:
+     64   -> fsd/fld, one 8-byte slot per register (offsets 56..144);
+     32   -> fsw/flw, 4-byte stride starting at the same base (56..100);
+     else -> both macros expand to the empty string (no FP state; an
+             undefined __riscv_flen evaluates as 0 in #if). */
+#if __riscv_flen == 64
+#define SAVE_FPR(reg) \
+    "    fsd fs0,   56(" reg ")\n" \
+    "    fsd fs1,   64(" reg ")\n" \
+    "    fsd fs2,   72(" reg ")\n" \
+    "    fsd fs3,   80(" reg ")\n" \
+    "    fsd fs4,   88(" reg ")\n" \
+    "    fsd fs5,   96(" reg ")\n" \
+    "    fsd fs6,  104(" reg ")\n" \
+    "    fsd fs7,  112(" reg ")\n" \
+    "    fsd fs8,  120(" reg ")\n" \
+    "    fsd fs9,  128(" reg ")\n" \
+    "    fsd fs10, 136(" reg ")\n" \
+    "    fsd fs11, 144(" reg ")\n"
+#define RESTORE_FPR(reg) \
+    "    fld fs0,   56(" reg ")\n" \
+    "    fld fs1,   64(" reg ")\n" \
+    "    fld fs2,   72(" reg ")\n" \
+    "    fld fs3,   80(" reg ")\n" \
+    "    fld fs4,   88(" reg ")\n" \
+    "    fld fs5,   96(" reg ")\n" \
+    "    fld fs6,  104(" reg ")\n" \
+    "    fld fs7,  112(" reg ")\n" \
+    "    fld fs8,  120(" reg ")\n" \
+    "    fld fs9,  128(" reg ")\n" \
+    "    fld fs10, 136(" reg ")\n" \
+    "    fld fs11, 144(" reg ")\n"
+#elif __riscv_flen == 32
+#define SAVE_FPR(reg) \
+    "    fsw fs0,   56(" reg ")\n" \
+    "    fsw fs1,   60(" reg ")\n" \
+    "    fsw fs2,   64(" reg ")\n" \
+    "    fsw fs3,   68(" reg ")\n" \
+    "    fsw fs4,   72(" reg ")\n" \
+    "    fsw fs5,   76(" reg ")\n" \
+    "    fsw fs6,   80(" reg ")\n" \
+    "    fsw fs7,   84(" reg ")\n" \
+    "    fsw fs8,   88(" reg ")\n" \
+    "    fsw fs9,   92(" reg ")\n" \
+    "    fsw fs10,  96(" reg ")\n" \
+    "    fsw fs11, 100(" reg ")\n"
+#define RESTORE_FPR(reg) \
+    "    flw fs0,   56(" reg ")\n" \
+    "    flw fs1,   60(" reg ")\n" \
+    "    flw fs2,   64(" reg ")\n" \
+    "    flw fs3,   68(" reg ")\n" \
+    "    flw fs4,   72(" reg ")\n" \
+    "    flw fs5,   76(" reg ")\n" \
+    "    flw fs6,   80(" reg ")\n" \
+    "    flw fs7,   84(" reg ")\n" \
+    "    flw fs8,   88(" reg ")\n" \
+    "    flw fs9,   92(" reg ")\n" \
+    "    flw fs10,  96(" reg ")\n" \
+    "    flw fs11, 100(" reg ")\n"
+#else
+#define SAVE_FPR(reg)    ""
+#define RESTORE_FPR(reg) ""
+#endif
+
+/* Full save/restore sequences, composed from the GPR and FPR halves.
+   Save: int first, FP second (matches xOS rv32 pattern, and rv64 here).
+   Restore: FP first, int second -- the mirror order. None of the
+   loads/stores writes the address-base register (callers in this file
+   pass a0 or a1), so the integer/FP order has no functional effect. */
+#define SAVE_INTO(reg)    SAVE_GPR(reg)    SAVE_FPR(reg)
+#define RESTORE_FROM(reg) RESTORE_FPR(reg) RESTORE_GPR(reg)
+
+/* Top-level assembly defining setjmp, longjmp, coro_switch and
+   __cfree_coro_trampoline for RV32. */
+__asm__ (
+    ".text\n"
+    ".align 2\n"
+
+    /* setjmp(env) -- env=a0. ra at function entry is the caller's
+       return address, exactly what longjmp must restore. SAVE_INTO uses
+       no scratch register, so the caller's registers are intact when
+       setjmp returns 0. */
+    ".globl " SYM(setjmp) "\n"
+    ".type "  SYM(setjmp) ", @function\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("a0")
+    "    li a0, 0\n"
+    "    ret\n"
+    ".size " SYM(setjmp) ", .-" SYM(setjmp) "\n"
+
+    /* longjmp(env, val) -- env=a0, val=a1.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); branch-free:
+         seqz t0, a1   ; t0 = (a1 == 0)
+         add  a0, a1, t0
+       so a0 = a1 if a1 != 0, else 1. RESTORE_FROM does not write a0,
+       a1 or t0, so val is still in a1 when the fix-up runs; `ret` then
+       jumps to the ra just reloaded from env. */
+    ".globl " SYM(longjmp) "\n"
+    ".type "  SYM(longjmp) ", @function\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("a0")
+    "    seqz t0, a1\n"
+    "    add  a0, a1, t0\n"
+    "    ret\n"
+    ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n"
+
+    /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value.
+       Save into [a0], restore from [a1], deliver a2 in a0 (which is
+       both the return register and the trampoline's first-arg reg
+       on a fresh context's first run). */
+    ".globl " SYM(coro_switch) "\n"
+    ".type "  SYM(coro_switch) ", @function\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("a0")
+    RESTORE_FROM("a1")
+    "    mv a0, a2\n"
+    "    ret\n"
+    ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n"
+
+    /* __cfree_coro_trampoline -- on first entry: a0=value (delivered
+       by coro_switch's `mv a0, a2`), s0=entry (set by coro_init via
+       regs[2]), sp=stack_top. ebreak if entry returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    ".type "  SYM(__cfree_coro_trampoline) ", @function\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    jalr s0\n"
+    "    ebreak\n"
+    ".size " SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n"
+
+    ".section .note.GNU-stack,\"\",@progbits\n"
+);
diff --git a/lib/coro/riscv64.c b/lib/coro/riscv64.c
@@ -0,0 +1,193 @@
+/*
+ * lib/coro/riscv64.c -- RISC-V 64-bit (LP64D) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * RISC-V LP64D callee-saved set:
+ *   ra  (x1)              -- saved manually so longjmp/coro_switch can
+ *                            "return" to the original call site
+ *   sp  (x2)
+ *   s0-s11 (x8-x9, x18-x27)
+ *   fs0-fs11 (f8-f9, f18-f27)
+ *
+ * Layout (matches xOS rv64 tick_coro_ctx):
+ *
+ *   regs[0]:        ra
+ *   regs[1]:        sp
+ *   regs[2..13]:    s0-s11
+ *   fp_regs[0..11]: fs0-fs11   (offset 112)
+ *
+ * sizeof = 14*8 + 12*8 = 208, 16-byte aligned. Fits in the 256-byte
+ * storage carved out by jmp_buf and coro_ctx.
+ *
+ *   setjmp(env)             a0=env
+ *   longjmp(env, val)       a0=env, a1=val
+ *   coro_switch(f, t, val)  a0=from, a1=to, a2=val
+ *
+ * Value-passing trick: the destination context "returns" via
+ *     ld ra, 0(a1); ... ret
+ * where `ret` is `jalr x0, 0(ra)`. By moving the value into a0 just
+ * before `ret`, both a fresh trampoline (entry(value)) and a previously
+ * suspended coro_switch (= the value its switch call returned) see it
+ * as the a0 return register.
+ *
+ * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence
+ * is emitted in setjmp, longjmp, and coro_switch without duplication.
+ *
+ * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C
+ * compiler's call-site mangling (empty on RISC-V ELF).
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved register context for RV64, overlaid on jmp_buf / coro_ctx
+   storage. regs[0] = ra, regs[1] = sp, regs[2..13] = s0-s11 (byte
+   offsets 0..104); fp_regs[0..11] = fs0-fs11 starting at offset 112. */
+struct __cfree_riscv64_ctx {
+    uintptr_t regs[14];
+    uint64_t  fp_regs[12];
+} __attribute__((aligned(16)));
+
+/* Compile-time layout checks: size, alignment and the fp_regs offset are
+   what the assembly assumes, and the context fits the public types. */
+_Static_assert(sizeof(struct __cfree_riscv64_ctx) == 208,                  "layout");
+_Static_assert(_Alignof(struct __cfree_riscv64_ctx) == 16,                 "align");
+_Static_assert(offsetof(struct __cfree_riscv64_ctx, fp_regs) == 112,       "fp off");
+_Static_assert(sizeof(struct __cfree_riscv64_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_riscv64_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv64_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so that the first coro_switch() into it returns into
+   __cfree_coro_trampoline, which calls `entry` on the caller-supplied
+   stack [stack_base, stack_base + stack_len). */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_riscv64_ctx *saved = (struct __cfree_riscv64_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)saved;
+    uintptr_t *const end = word + sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Clear every word of the context: GPR slots and FP slots alike. */
+    while (word != end) {
+        *word++ = 0;
+    }
+
+    /* RISC-V stacks grow down, so the initial sp is the end of the buffer
+       rounded down to CORO_STACK_ALIGN. */
+    const uintptr_t align_mask = (uintptr_t)(CORO_STACK_ALIGN - 1);
+    const uintptr_t initial_sp = ((uintptr_t)stack_base + stack_len) & ~align_mask;
+
+    saved->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra: first `ret` target */
+    saved->regs[1] = initial_sp;                         /* sp */
+    saved->regs[2] = (uintptr_t)entry;                   /* s0: trampoline calls this */
+}
+
+/* SYM(n) yields a string literal spelling symbol `n` the way the
+   assembler must see it: __USER_LABEL_PREFIX__ followed by the name.
+   STR_/STR are the usual two-level stringification so that the prefix
+   macro is expanded before it is turned into a string. */
+#define STR_(x) #x
+#define STR(x)  STR_(x)
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved state into [reg]. reg is a register name string,
+   e.g. "a0". Emits straight-line sd/fsd; no scratch register needed,
+   and reg itself is only read.
+   Offsets: ra 0, sp 8, s0-s11 16..104, fs0-fs11 112..200, matching
+   struct __cfree_riscv64_ctx. The fsd stores are unconditional, so this
+   file requires the D extension. */
+#define SAVE_INTO(reg) \
+    "    sd  ra,    0(" reg ")\n"  \
+    "    sd  sp,    8(" reg ")\n"  \
+    "    sd  s0,   16(" reg ")\n"  \
+    "    sd  s1,   24(" reg ")\n"  \
+    "    sd  s2,   32(" reg ")\n"  \
+    "    sd  s3,   40(" reg ")\n"  \
+    "    sd  s4,   48(" reg ")\n"  \
+    "    sd  s5,   56(" reg ")\n"  \
+    "    sd  s6,   64(" reg ")\n"  \
+    "    sd  s7,   72(" reg ")\n"  \
+    "    sd  s8,   80(" reg ")\n"  \
+    "    sd  s9,   88(" reg ")\n"  \
+    "    sd  s10,  96(" reg ")\n"  \
+    "    sd  s11, 104(" reg ")\n"  \
+    "    fsd fs0,  112(" reg ")\n" \
+    "    fsd fs1,  120(" reg ")\n" \
+    "    fsd fs2,  128(" reg ")\n" \
+    "    fsd fs3,  136(" reg ")\n" \
+    "    fsd fs4,  144(" reg ")\n" \
+    "    fsd fs5,  152(" reg ")\n" \
+    "    fsd fs6,  160(" reg ")\n" \
+    "    fsd fs7,  168(" reg ")\n" \
+    "    fsd fs8,  176(" reg ")\n" \
+    "    fsd fs9,  184(" reg ")\n" \
+    "    fsd fs10, 192(" reg ")\n" \
+    "    fsd fs11, 200(" reg ")\n"
+
+/* Restore callee-saved state from [reg]: fs0-fs11 first, then ra, sp and
+   s0-s11, using the same offsets SAVE_INTO wrote. reg must not be one of
+   the registers being loaded (callers in this file pass a0 or a1, which
+   are left untouched). */
+#define RESTORE_FROM(reg) \
+    "    fld fs0,  112(" reg ")\n" \
+    "    fld fs1,  120(" reg ")\n" \
+    "    fld fs2,  128(" reg ")\n" \
+    "    fld fs3,  136(" reg ")\n" \
+    "    fld fs4,  144(" reg ")\n" \
+    "    fld fs5,  152(" reg ")\n" \
+    "    fld fs6,  160(" reg ")\n" \
+    "    fld fs7,  168(" reg ")\n" \
+    "    fld fs8,  176(" reg ")\n" \
+    "    fld fs9,  184(" reg ")\n" \
+    "    fld fs10, 192(" reg ")\n" \
+    "    fld fs11, 200(" reg ")\n" \
+    "    ld  ra,    0(" reg ")\n"  \
+    "    ld  sp,    8(" reg ")\n"  \
+    "    ld  s0,   16(" reg ")\n"  \
+    "    ld  s1,   24(" reg ")\n"  \
+    "    ld  s2,   32(" reg ")\n"  \
+    "    ld  s3,   40(" reg ")\n"  \
+    "    ld  s4,   48(" reg ")\n"  \
+    "    ld  s5,   56(" reg ")\n"  \
+    "    ld  s6,   64(" reg ")\n"  \
+    "    ld  s7,   72(" reg ")\n"  \
+    "    ld  s8,   80(" reg ")\n"  \
+    "    ld  s9,   88(" reg ")\n"  \
+    "    ld  s10,  96(" reg ")\n"  \
+    "    ld  s11, 104(" reg ")\n"
+
+/* Top-level assembly defining setjmp, longjmp, coro_switch and
+   __cfree_coro_trampoline for RV64. */
+__asm__ (
+    ".text\n"
+    ".align 2\n"
+
+    /* setjmp(env) -- env in a0. ra at call time is the caller's return
+       address, which is exactly what longjmp must restore. Returns 0. */
+    ".globl " SYM(setjmp) "\n"
+    ".type "  SYM(setjmp) ", @function\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("a0")
+    "    li   a0, 0\n"
+    "    ret\n"
+    ".size "  SYM(setjmp) ", .-" SYM(setjmp) "\n"
+
+    /* longjmp(env, val) -- env in a0, val in a1.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). Branch-free:
+       seqz t0, a1 -> t0 = (a1==0); a0 = a1 + t0. RESTORE_FROM
+       doesn't touch t0/a0/a1, so the seqz/add can run after it and
+       write a0 directly -- one fewer instruction than munging a1
+       first and mv'ing later. */
+    ".globl " SYM(longjmp) "\n"
+    ".type "  SYM(longjmp) ", @function\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("a0")
+    "    seqz t0, a1\n"
+    "    add  a0, a1, t0\n"
+    "    ret\n"
+    ".size "  SYM(longjmp) ", .-" SYM(longjmp) "\n"
+
+    /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value.
+       Save into [a0], then restore from [a1]: ra/sp/s0-s11/fs0-fs11 now
+       belong to the to-context, while a0-a2 are not written by the
+       restore. Finally deliver value in a0 just before ret, which jumps
+       to the restored ra. */
+    ".globl " SYM(coro_switch) "\n"
+    ".type "  SYM(coro_switch) ", @function\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("a0")
+    RESTORE_FROM("a1")
+    "    mv   a0, a2\n"
+    "    ret\n"
+    ".size "  SYM(coro_switch) ", .-" SYM(coro_switch) "\n"
+
+    /* __cfree_coro_trampoline -- on first entry: a0=value (delivered),
+       s0=entry fn (set by coro_init), sp aligned to 16. ebreak if entry
+       returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    ".type "  SYM(__cfree_coro_trampoline) ", @function\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    jalr s0\n"
+    "    ebreak\n"
+    ".size "  SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n"
+
+    ".section .note.GNU-stack,\"\",%progbits\n"
+);
diff --git a/lib/coro/x86_64.c b/lib/coro/x86_64.c
@@ -0,0 +1,131 @@
+/*
+ * lib/coro/x86_64.c -- x86_64 System V ABI implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * Callee-saved set on SysV: rbx, rbp, r12-r15. (No callee-saved xmm
+ * regs -- those are MS-ABI specific; see x86_64_win.c.)
+ *
+ *   regs[0]:  rbx     regs[4]:  r14
+ *   regs[1]:  rbp     regs[5]:  r15
+ *   regs[2]:  r12     regs[6]:  rsp
+ *   regs[3]:  r13     regs[7]:  rip
+ *
+ * sizeof = 64, 16-byte aligned.
+ *
+ *   setjmp(env)             %rdi=env
+ *   longjmp(env, val)       %rdi=env, %esi=val
+ *   coro_switch(f, t, val)  %rdi=from, %rsi=to, %rdx=val
+ *
+ * The "save rsp/rip" trick: at function entry, (%rsp) holds the
+ * caller's return address (just pushed by `call`), and the address
+ * %rsp + 8 is the caller's pre-call rsp. Saving those two lets
+ * longjmp/coro_switch "land" at the call site exactly as if the
+ * function had returned.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved register context for x86_64 SysV, overlaid on jmp_buf /
+   coro_ctx storage. Slot order (fixed by the offsets in SAVE_INTO /
+   RESTORE_FROM): regs[0] = rbx, [1] = rbp, [2..5] = r12-r15,
+   [6] = rsp, [7] = rip. */
+struct __cfree_x86_64_ctx {
+    uintptr_t regs[8];
+} __attribute__((aligned(16)));
+
+/* Compile-time layout checks: size and alignment are what the assembly
+   assumes, and the context fits the public types. */
+_Static_assert(sizeof(struct __cfree_x86_64_ctx) == 64,                   "layout");
+_Static_assert(_Alignof(struct __cfree_x86_64_ctx) == 16,                 "align");
+_Static_assert(sizeof(struct __cfree_x86_64_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_x86_64_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_ctx), "align coro_ctx");
+
+extern void __cfree_coro_trampoline(void);
+
+/* Prepare `ctx` so that the first coro_switch() into it jumps to
+   __cfree_coro_trampoline, which calls `entry` on the caller-supplied
+   stack [stack_base, stack_base + stack_len). */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_x86_64_ctx *saved = (struct __cfree_x86_64_ctx *)ctx;
+    uintptr_t *word = (uintptr_t *)saved;
+    uintptr_t *const end = word + sizeof(*saved) / sizeof(uintptr_t);
+
+    /* Clear every word of the context. */
+    while (word != end) {
+        *word++ = 0;
+    }
+
+    /* x86_64 stacks grow down, so the initial rsp is the end of the
+       buffer rounded down to CORO_STACK_ALIGN. */
+    const uintptr_t align_mask = (uintptr_t)(CORO_STACK_ALIGN - 1);
+    const uintptr_t initial_sp = ((uintptr_t)stack_base + stack_len) & ~align_mask;
+
+    saved->regs[1] = 0;                                  /* rbp */
+    saved->regs[3] = (uintptr_t)entry;                   /* r13: trampoline calls this */
+    saved->regs[6] = initial_sp;                         /* rsp */
+    saved->regs[7] = (uintptr_t)__cfree_coro_trampoline; /* rip: first jump target */
+}
+
+/* SYM(n) yields a string literal spelling symbol `n` the way the
+   assembler must see it: __USER_LABEL_PREFIX__ followed by the name.
+   STR_/STR are the usual two-level stringification so that the prefix
+   macro is expanded before it is turned into a string. */
+#define STR_(x) #x
+#define STR(x)  STR_(x)
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n
+
+/* Save callee-saved + (caller's) rsp + rip into [reg]; clobbers %rax, so
+   reg must not be %rax.
+   Must be used while the stack is still as at function entry:
+   (%rsp) holds the return address (stored as rip) and the address
+   %rsp+8 -- computed with leaq, not loaded -- is the caller's pre-call
+   rsp (stored as rsp). */
+#define SAVE_INTO(reg) \
+    "    movq %rbx,  0("  reg ")\n" \
+    "    movq %rbp,  8("  reg ")\n" \
+    "    movq %r12, 16("  reg ")\n" \
+    "    movq %r13, 24("  reg ")\n" \
+    "    movq %r14, 32("  reg ")\n" \
+    "    movq %r15, 40("  reg ")\n" \
+    "    leaq 8(%rsp), %rax\n"      \
+    "    movq %rax, 48("  reg ")\n" \
+    "    movq (%rsp), %rax\n"       \
+    "    movq %rax, 56("  reg ")\n"
+
+/* Restore callee-saved + rsp from [reg], leave rip in %rcx ready to
+   jmp. Caller delivers the destination value in %rax beforehand; %rax is
+   not touched here. reg must not be a register this macro loads (callers
+   in this file pass %rdi or %rsi). */
+#define RESTORE_FROM(reg) \
+    "    movq  0("  reg "), %rbx\n" \
+    "    movq  8("  reg "), %rbp\n" \
+    "    movq 16("  reg "), %r12\n" \
+    "    movq 24("  reg "), %r13\n" \
+    "    movq 32("  reg "), %r14\n" \
+    "    movq 40("  reg "), %r15\n" \
+    "    movq 48("  reg "), %rsp\n" \
+    "    movq 56("  reg "), %rcx\n"
+
+/* Top-level assembly defining setjmp, longjmp, coro_switch and
+   __cfree_coro_trampoline for x86_64 SysV. */
+__asm__ (
+    ".text\n"
+    ".p2align 4\n"
+
+    /* setjmp(env) -- env=%rdi. Records the callee-saved registers plus
+       the rsp/rip the caller will have once this call returns, then
+       returns 0. */
+    ".globl " SYM(setjmp) "\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("%rdi")
+    "    xorl %eax, %eax\n"
+    "    ret\n"
+
+    /* longjmp(env, val) -- env=%rdi, val=%esi.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4): %rax = val, and
+       cmove replaces it with the 1 held in %rdx when val == 0 (movl
+       does not alter the flags set by testq). */
+    ".globl " SYM(longjmp) "\n"
+    SYM(longjmp) ":\n"
+    "    movslq %esi, %rax\n"        /* sign-extend int -> long */
+    "    testq %rax, %rax\n"
+    "    movl  $1, %edx\n"
+    "    cmoveq %rdx, %rax\n"
+    RESTORE_FROM("%rdi")
+    "    jmpq *%rcx\n"
+
+    /* coro_switch(from, to, value) -- from=%rdi, to=%rsi, value=%rdx.
+       SAVE_INTO clobbers %rax, so value is moved into %rax only after
+       the save; RESTORE_FROM then leaves %rax alone. */
+    ".globl " SYM(coro_switch) "\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("%rdi")
+    "    movq %rdx, %rax\n"          /* deliver value as return reg */
+    RESTORE_FROM("%rsi")
+    "    jmpq *%rcx\n"
+
+    /* __cfree_coro_trampoline -- on first entry: %rax=value,
+       %r13=entry, %rsp=stack_top (no return addr pushed -- coro_switch
+       reaches here via jmp). System V wants %rsp+8 to be a multiple of
+       16 at function entry, i.e. %rsp 16-aligned just before the call;
+       the andq below makes that hold defensively. ud2 traps if entry
+       returns. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    andq $-16, %rsp\n"
+    "    movq %rax, %rdi\n"          /* value -> first arg */
+    "    callq *%r13\n"               /* entry(value) */
+    "    ud2\n"
+);
diff --git a/lib/coro/x86_64_win.c b/lib/coro/x86_64_win.c
@@ -0,0 +1,176 @@
+/*
+ * lib/coro/x86_64_win.c -- x86_64 Windows (MS x64 ABI) implementations of
+ *   setjmp / longjmp                       (<setjmp.h>)
+ *   coro_init / coro_switch / trampoline   (<stdcoro.h>)
+ *
+ * MS x64 callee-saved set: rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm15.
+ * (Compare with x86_64.c -- SysV doesn't preserve rdi/rsi or any xmm.)
+ * Windows additionally requires the TEB stack-bound slots gs:0x08
+ * (StackBase) and gs:0x10 (StackLimit) to track the live stack so
+ * exception unwinding etc. behave; these are saved/restored on every
+ * switch.
+ *
+ *   regs[0]:  rbx     regs[8]:  rsp
+ *   regs[1]:  rbp     regs[9]:  rip
+ *   regs[2]:  rdi     regs[10]: stack_base  (TEB gs:0x08)
+ *   regs[3]:  rsi     regs[11]: stack_limit (TEB gs:0x10)
+ *   regs[4..7]: r12-r15
+ *   fp_regs[0..19]: xmm6-xmm15  (10 regs * 128b = 20 * 64b slots, off 96)
+ *
+ * sizeof = 256, 16-byte aligned. Exactly fills jmp_buf / coro_ctx.
+ *
+ *   setjmp(env)             %rcx=env
+ *   longjmp(env, val)       %rcx=env, %edx=val
+ *   coro_switch(f, t, val)  %rcx=from, %rdx=to, %r8=value
+ *
+ * The "save rsp/rip" trick mirrors x86_64.c: at function entry,
+ * (%rsp) holds the caller's return address, and the address %rsp+8
+ * (computed with leaq, not loaded) is the caller's pre-call rsp.
+ */
+
+#include <setjmp.h>
+#include <stdcoro.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Saved-context layout for Windows x64. The assembly in this file
+   addresses it by raw byte offset, and the same bytes are overlaid on
+   both jmp_buf (setjmp/longjmp) and coro_ctx (coro_init/coro_switch).
+   The asserts below pin every offset and alignment the assembly
+   depends on. */
+struct __cfree_x86_64_win_ctx {
+    uintptr_t regs[12];     /* off 0:  rbx rbp rdi rsi r12-r15 rsp rip,
+                               then TEB StackBase and StackLimit */
+    uint64_t  fp_regs[20];  /* off 96: xmm6-xmm15, two 64-bit slots each */
+} __attribute__((aligned(16)));
+
+_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) == 256,                  "layout");
+_Static_assert(_Alignof(struct __cfree_x86_64_win_ctx) == 16,                 "align");
+_Static_assert(offsetof(struct __cfree_x86_64_win_ctx, fp_regs) == 96,        "fp off");
+_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) <= sizeof(coro_ctx),     "fits coro_ctx");
+_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) <= sizeof(jmp_buf),      "fits jmp_buf");
+_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_win_ctx), "align coro_ctx");
+/* setjmp stores xmm6-xmm15 into jmp_buf with movaps, which needs a
+   16-byte-aligned operand, so jmp_buf needs the same alignment
+   guarantee as coro_ctx. */
+_Static_assert(_Alignof(jmp_buf) >= _Alignof(struct __cfree_x86_64_win_ctx),  "align jmp_buf");
+
+extern void __cfree_coro_trampoline(void);
+
+/* coro_init (Windows x64): prepare *ctx so that the first switch into
+   it begins at __cfree_coro_trampoline on the supplied stack; the
+   trampoline then calls entry with the switched-in value.
+
+   ctx         context to initialise; every slot is overwritten.
+   stack_base  lowest address of the coroutine's stack memory.
+   stack_len   size of that memory in bytes.
+   entry       coroutine body, handed to the trampoline in r12. */
+void coro_init(coro_ctx *ctx,
+               void *stack_base, size_t stack_len,
+               coro_entry_fn entry) {
+    struct __cfree_x86_64_win_ctx *win = (struct __cfree_x86_64_win_ctx *)ctx;
+
+    /* x86_64 stacks grow down: begin at the high end of the region,
+       rounded down to CORO_STACK_ALIGN. */
+    const uintptr_t stack_lo = (uintptr_t)stack_base;
+    const uintptr_t stack_hi =
+        (stack_lo + stack_len) & ~(uintptr_t)(CORO_STACK_ALIGN - 1);
+
+    /* Clear the context one word at a time. This leaves rbp (regs[1]),
+       the other callee-saved GPRs and xmm6-xmm15 at zero. */
+    uintptr_t *slot = (uintptr_t *)win;
+    uintptr_t *const stop = slot + sizeof(*win) / sizeof(uintptr_t);
+    while (slot != stop)
+        *slot++ = 0;
+
+    win->regs[4]  = (uintptr_t)entry;                   /* r12 -- entry fn */
+    win->regs[8]  = stack_hi;                           /* rsp */
+    win->regs[9]  = (uintptr_t)__cfree_coro_trampoline; /* rip */
+    win->regs[10] = stack_hi;                           /* stack_base (TEB) */
+    win->regs[11] = stack_lo;                           /* stack_limit (TEB) */
+}
+
+#define STR_(x) #x                             /* stringify x as written */
+#define STR(x)  STR_(x)                        /* expand x first, then stringify */
+#define SYM(n)  STR(__USER_LABEL_PREFIX__) #n  /* "<label prefix>" "n" as adjacent string literals */
+
+/* SAVE_INTO(reg): store the current context into the buffer at [reg];
+   reg is a string literal naming the register that holds the buffer
+   address (e.g. "%rcx"). Clobbers %rax, which serves as scratch.
+
+   Byte offsets written: 0 rbx, 8 rbp, 16 rdi, 24 rsi, 32-56 r12-r15,
+   64 resume rsp, 72 resume rip, 80 TEB gs:0x08 (StackBase), 88 TEB
+   gs:0x10 (StackLimit), 96-255 xmm6-xmm15 at 16 bytes apiece.
+
+   Used at function-entry stack discipline, before anything has been
+   pushed: (%rsp) is the return address (saved as rip) and the address
+   %rsp+8 -- computed with leaq, not loaded from memory -- is the
+   caller's pre-call rsp (saved as rsp).
+
+   The xmm stores use movaps, which requires a 16-byte-aligned memory
+   operand; the xmm area starts at offset 96, a multiple of 16, so
+   [reg] itself must be 16-byte aligned. */
+#define SAVE_INTO(reg) \
+    "    movq %rbx,    0("  reg ")\n" \
+    "    movq %rbp,    8("  reg ")\n" \
+    "    movq %rdi,   16("  reg ")\n" \
+    "    movq %rsi,   24("  reg ")\n" \
+    "    movq %r12,   32("  reg ")\n" \
+    "    movq %r13,   40("  reg ")\n" \
+    "    movq %r14,   48("  reg ")\n" \
+    "    movq %r15,   56("  reg ")\n" \
+    "    leaq 8(%rsp), %rax\n"        \
+    "    movq %rax,   64("  reg ")\n" \
+    "    movq (%rsp), %rax\n"         \
+    "    movq %rax,   72("  reg ")\n" \
+    "    movq %gs:0x08, %rax\n"       \
+    "    movq %rax,   80("  reg ")\n" \
+    "    movq %gs:0x10, %rax\n"       \
+    "    movq %rax,   88("  reg ")\n" \
+    "    movaps %xmm6,   96("  reg ")\n" \
+    "    movaps %xmm7,  112("  reg ")\n" \
+    "    movaps %xmm8,  128("  reg ")\n" \
+    "    movaps %xmm9,  144("  reg ")\n" \
+    "    movaps %xmm10, 160("  reg ")\n" \
+    "    movaps %xmm11, 176("  reg ")\n" \
+    "    movaps %xmm12, 192("  reg ")\n" \
+    "    movaps %xmm13, 208("  reg ")\n" \
+    "    movaps %xmm14, 224("  reg ")\n" \
+    "    movaps %xmm15, 240("  reg ")\n"
+
+/* RESTORE_FROM(reg): reload a context from the buffer at [reg], using
+   the same byte offsets SAVE_INTO writes; reg is a string literal
+   naming the register that holds the buffer address.
+
+   Order: xmm6-xmm15, then rbx/rbp/rdi/rsi/r12-r15, then the TEB stack
+   bounds (gs:0x08 from offset 80, gs:0x10 from offset 88), then %rsp,
+   and last the saved rip into %r10 so the expansion can be followed
+   directly by `jmpq *%r10`. %r10 doubles as the scratch register for
+   the TEB writes, and the TEB bounds are updated before %rsp is
+   switched.
+
+   Caller delivers the destination value in %rax beforehand, so %rax
+   must not be touched here. reg has to stay intact for the whole
+   sequence, so it cannot be %r10 or any of the registers being
+   reloaded. */
+#define RESTORE_FROM(reg) \
+    "    movaps  96("  reg "), %xmm6\n"  \
+    "    movaps 112("  reg "), %xmm7\n"  \
+    "    movaps 128("  reg "), %xmm8\n"  \
+    "    movaps 144("  reg "), %xmm9\n"  \
+    "    movaps 160("  reg "), %xmm10\n" \
+    "    movaps 176("  reg "), %xmm11\n" \
+    "    movaps 192("  reg "), %xmm12\n" \
+    "    movaps 208("  reg "), %xmm13\n" \
+    "    movaps 224("  reg "), %xmm14\n" \
+    "    movaps 240("  reg "), %xmm15\n" \
+    "    movq  0("  reg "), %rbx\n"      \
+    "    movq  8("  reg "), %rbp\n"      \
+    "    movq 16("  reg "), %rdi\n"      \
+    "    movq 24("  reg "), %rsi\n"      \
+    "    movq 32("  reg "), %r12\n"      \
+    "    movq 40("  reg "), %r13\n"      \
+    "    movq 48("  reg "), %r14\n"      \
+    "    movq 56("  reg "), %r15\n"      \
+    "    movq 80("  reg "), %r10\n"      \
+    "    movq %r10, %gs:0x08\n"          \
+    "    movq 88("  reg "), %r10\n"      \
+    "    movq %r10, %gs:0x10\n"          \
+    "    movq 64("  reg "), %rsp\n"      \
+    "    movq 72("  reg "), %r10\n"
+
+__asm__ (
+    ".text\n"
+    ".p2align 4\n"
+
+    /* setjmp(env) -- env=%rcx. Records the caller's context in env via
+       SAVE_INTO and returns 0 for this direct call. */
+    ".globl " SYM(setjmp) "\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("%rcx")
+    "    xorl %eax, %eax\n"
+    "    ret\n"
+
+    /* longjmp(env, val) -- env=%rcx, val=%edx.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4): val is widened
+       into %rax and replaced by 1 when it is zero. RESTORE_FROM leaves
+       %rax alone and puts the saved rip in %r10, so the final jmp
+       resumes at the saved return address with the value in %rax. */
+    ".globl " SYM(longjmp) "\n"
+    SYM(longjmp) ":\n"
+    "    movslq %edx, %rax\n"        /* sign-extend int -> long */
+    "    testq %rax, %rax\n"         /* ZF set iff val == 0 */
+    "    movl  $1, %r11d\n"          /* replacement value */
+    "    cmoveq %r11, %rax\n"        /* val == 0 -> 1 */
+    RESTORE_FROM("%rcx")
+    "    jmpq *%r10\n"
+
+    /* coro_switch(from, to, value) -- from=%rcx, to=%rdx, value=%r8.
+       Saves the running context into *from, then resumes *to. value is
+       placed in %rax before the restore, so it is what the resumed
+       side sees in the return register. */
+    ".globl " SYM(coro_switch) "\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("%rcx")
+    "    movq %r8, %rax\n"           /* deliver value as return reg */
+    RESTORE_FROM("%rdx")
+    "    jmpq *%r10\n"
+
+    /* __cfree_coro_trampoline -- on first entry: %rax=value (delivered
+       by coro_switch), %r12=entry (set by coro_init), %rsp=stack_top
+       (no return addr pushed -- coro_switch reaches here via jmp). MS
+       x64 wants %rsp 16-byte aligned at call sites with 32 bytes of
+       shadow space reserved by the caller. There is nothing to return
+       to, so if entry ever returns, execution falls into ud2 and
+       traps. */
+    ".globl " SYM(__cfree_coro_trampoline) "\n"
+    SYM(__cfree_coro_trampoline) ":\n"
+    "    andq $-16, %rsp\n"          /* defensive align */
+    "    subq $32, %rsp\n"           /* MS x64 shadow space */
+    "    movq %rax, %rcx\n"          /* value -> first arg */
+    "    callq *%r12\n"              /* entry(value) */
+    "    ud2\n"
+);
diff --git a/test/smoke.c b/test/smoke.c
@@ -34,6 +34,7 @@
 #include <stdarg.h>
 #include <stdatomic.h>
 #include <stdbool.h>
+#include <stdcoro.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdnoreturn.h>
@@ -135,7 +136,8 @@ static noreturn void cfree_trap(void) { for (;;) {} }
 /* setjmp: jmp_buf is an array type, setjmp is callable in the contexts
    permitted by C11 7.13.1.1p4, longjmp is _Noreturn. Compile-only --
    smoke.c never links against a setjmp implementation. */
-_Static_assert(sizeof(jmp_buf) >= sizeof(void *) * 8, "jmp_buf room for regs");
+_Static_assert(sizeof(jmp_buf) >= 64,         "jmp_buf room for regs");
+_Static_assert(_Alignof(jmp_buf) >= 16,       "jmp_buf 16-byte aligned");
 static jmp_buf cfree_jb;
 static int cfree_setjmp_compiles(int x) {
     if (setjmp(cfree_jb) != 0) return 1;     /* allowed context */
@@ -143,6 +145,19 @@ static int cfree_setjmp_compiles(int x) {
     return 0;
 }
 
+/* stdcoro: compile-and-resolve check only. coro_ctx objects can be
+   declared, CORO_STACK_ALIGN works in a constant expression, and
+   coro_init / coro_switch accept these argument types. Same
+   compile-only caveat as the setjmp check. */
+_Static_assert(sizeof(coro_ctx) >= 64,        "coro_ctx room for regs");
+_Static_assert(_Alignof(coro_ctx) >= 16,      "coro_ctx 16-byte aligned");
+_Static_assert(CORO_STACK_ALIGN >= 8,         "stack align reasonable");
+static coro_ctx cfree_co_a;
+static coro_ctx cfree_co_b;
+static _Alignas(16) unsigned char cfree_co_stack[4096];
+
+/* Coroutine body: ignores its argument and never returns. */
+static void cfree_co_entry(uintptr_t v) {
+    (void)v;
+    for (;;) {
+    }
+}
+
+/* Arm cfree_co_b on cfree_co_stack, then switch into it from
+   cfree_co_a, passing 0xC0FFEE as the value. */
+static uintptr_t cfree_coro_compiles(void) {
+    const size_t stack_bytes = sizeof cfree_co_stack;
+
+    coro_init(&cfree_co_b, cfree_co_stack, stack_bytes, cfree_co_entry);
+    return coro_switch(&cfree_co_a, &cfree_co_b, 0xC0FFEEu);
+}
+
 /* stdatomic: types, memory_order, lock-free macros, plus a runtime
    exercise of load, store, exchange, CAS, fetch ops, and atomic_flag. */
 _Static_assert(sizeof(atomic_int)  == sizeof(int),  "atomic_int matches int");
@@ -185,5 +200,6 @@ int cfree_smoke_ok(void) {
     (void)aligned_buf;
     if (0) cfree_trap();
     if (0) (void)cfree_setjmp_compiles(0);
+    if (0) (void)cfree_coro_compiles();
     return sum_n(3, 1, 2, 3) == 6 && cfree_atomic_ok();
 }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/builtins.md	\|	20	+++++++++++++++++---
M	include/setjmp.h	\|	31	+++++++++++++++++++------------
A	include/stdcoro.h	\|	56	++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	lib/README.md	\|	22	++++++++++++++++++++++
M	lib/build.sh	\|	66	++++++++++++++++++++++++++++++++++++++++++------------------------
A	lib/coro/aarch64.c	\|	137	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/arm32.c	\|	202	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/arm32_thumb1.c	\|	174	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/i386.c	\|	143	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/riscv32.c	\|	219	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/riscv64.c	\|	193	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/x86_64.c	\|	131	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/coro/x86_64_win.c	\|	176	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/smoke.c	\|	18	+++++++++++++++++-