kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 8dd63074edda0ee085f6ebe758e9f0e7a9739594
parent 0dcc91b39522fbc9557618c46bf4d193aa40bba9
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  7 May 2026 13:47:23 -0700

setjmp.h stdcoro.h

Diffstat:
Mdoc/builtins.md | 20+++++++++++++++++---
Minclude/setjmp.h | 31+++++++++++++++++++------------
Ainclude/stdcoro.h | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlib/README.md | 22++++++++++++++++++++++
Mlib/build.sh | 66++++++++++++++++++++++++++++++++++++++++++------------------------
Alib/coro/aarch64.c | 137+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/arm32.c | 202+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/arm32_thumb1.c | 174+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/i386.c | 143+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/riscv32.c | 219+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/riscv64.c | 193+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/x86_64.c | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/coro/x86_64_win.c | 176+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/smoke.c | 18+++++++++++++++++-
14 files changed, 1548 insertions(+), 40 deletions(-)

diff --git a/doc/builtins.md b/doc/builtins.md @@ -149,9 +149,23 @@ Always: - Float → float: `__extendsfdf2`, `__extendsftf2`, `__extenddftf2`, `__truncdfsf2`, `__trunctfsf2`, `__trunctfdf2` - Compare: `__eq`, `__ne`, `__lt`, `__le`, `__gt`, `__ge`, `__unord` × `sf2`/`df2`/`tf2` -### Nonlocal jumps (always shipped) -- `setjmp`, `longjmp` — target-specific assembly. The `jmp_buf` layout is - internal to these two functions; `<setjmp.h>` only fixes the array size. +### Nonlocal jumps + stackful coroutines (per-arch, always shipped) +The `<setjmp.h>` and `<stdcoro.h>` primitives share one per-target context +struct: callee-saved GPRs + callee-saved FPRs + sp + return address. The +`jmp_buf` and `coro_ctx` typedefs are 256-byte aligned-16 storage; the +runtime reinterprets them as the per-arch struct. +- `setjmp`, `longjmp` — `<setjmp.h>` (C11 7.13). cfree extension: this + header is *not* in the C11 freestanding subset. +- `coro_init`, `coro_switch`, `__cfree_coro_trampoline` — `<stdcoro.h>` + (cfree-specific). `coro_switch(from, to, value) → uintptr_t` is the + one universal primitive; `setjmp` = save-and-return-0, + `longjmp` = restore-and-deliver-val. +- Implementations live one master `.c` per arch under `lib/coro/` + (file-scope asm + tiny C `coro_init`). ARM has two: `arm32.c` + (Thumb-2, ARMv7+, may use VFP `d8-d15`) and `arm32_thumb1.c` + (ARMv6-M, no IT blocks / no VFP / data-processing limited to + r0-r7). Not provided for: WASM (would need an Asyncify-fiber + port). ### Atomic fallbacks (only when target lacks native atomics for that width) - Generic: `__atomic_load`, `__atomic_store`, `__atomic_exchange`, `__atomic_compare_exchange` diff --git a/include/setjmp.h b/include/setjmp.h @@ -1,21 +1,28 @@ /* setjmp.h -- C11 7.13 -- Nonlocal jumps * - * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6); cfree - * provides it as an extension for code that wants nonlocal control flow - * without a hosted libc. 
The setjmp/longjmp pair is target-specific - * assembly and lives in libcfree_rt.a -- see doc/builtins.md. + * setjmp.h is *not* part of the C11 freestanding subset (C11 4p6); + * cfree provides it as an extension. The setjmp/longjmp pair is + * target-specific assembly in libcfree_rt.a -- see doc/builtins.md. * - * jmp_buf is an array type (C11 7.13p2). Its layout is internal to the - * runtime; the size below is conservative -- large enough to hold every - * cfree target's callee-saved GPRs + callee-saved FPRs + sp + return - * address. C11 7.13 explicitly excludes the floating-point status flags, - * the state of open files, and any other component of the abstract - * machine, so no signal-mask slot is reserved. - */ + * jmp_buf is an array type (C11 7.13p2). The runtime reinterprets the + * buffer as a per-target struct of callee-saved GPRs + callee-saved + * FPRs + sp + return address. The size below is sized to the largest + * such struct across cfree targets -- 256 bytes (x86_64 Windows: 12 + * GPR slots + xmm6-15). C11 explicitly excludes the FP status flags + * and open-file state, so no signal-mask slot is reserved. The same + * 256-byte payload is shared with <stdcoro.h>'s coro_ctx so the + * underlying save/restore halves are reused across all three + * primitives. */ #ifndef CFREE_SETJMP_H #define CFREE_SETJMP_H -typedef long jmp_buf[32]; +/* Wrap in a struct so 16-byte alignment is guaranteed even when the + user puts a jmp_buf on the stack -- xmm save instructions require + it on x86_64. The [1] makes jmp_buf an array type as the standard + demands, so passing one to setjmp/longjmp decays to a pointer. 
*/ +typedef struct { + _Alignas(16) unsigned char __cfree_storage[256]; +} jmp_buf[1]; int setjmp(jmp_buf env); _Noreturn void longjmp(jmp_buf env, int val); diff --git a/include/stdcoro.h b/include/stdcoro.h @@ -0,0 +1,56 @@ +/* stdcoro.h -- cfree extension -- stackful symmetric coroutines + * + * stdcoro.h is non-standard: C11 has no stackful-coroutine facility. + * cfree ships it as a native counterpart to <setjmp.h>: the underlying + * per-target context struct, save sequence, and restore sequence are + * literally shared with setjmp/longjmp -- only the entry shapes differ + * (setjmp = save+return-0; longjmp = restore+return-val; coro_switch = + * save(from)+restore(to)+deliver-value). Implementations live in + * libcfree_rt.a -- see doc/builtins.md. + * + * Programming model + * 1. Allocate a coro_ctx and a stack region. + * 2. coro_init(&ctx, stack_base, stack_len, entry). + * 3. coro_switch(&caller, &ctx, value) -- delivers `value` to entry's + * uintptr_t argument on first switch in. + * 4. Inside the coroutine, coro_switch(&ctx, &caller, value) yields + * back, with `value` becoming the caller's coro_switch return. + * 5. entry must NOT return; the trampoline traps if it does. + * + * coro_ctx is sized conservatively -- large enough for every cfree + * target's callee-saved registers + sp + ip + (where applicable) + * callee-saved FP regs. Layout is internal to the runtime. + */ +#ifndef CFREE_STDCORO_H +#define CFREE_STDCORO_H + +#include <stddef.h> +#include <stdint.h> + +/* Stack alignment required at function-call boundaries on every cfree + target (16 on x86_64/aarch64/arm32-AAPCS-VFP/riscv; weaker on i386 + but 16 covers it). Caller stacks must be aligned to this. */ +#define CORO_STACK_ALIGN 16 + +/* 256 bytes is the largest per-target context across cfree's targets + (x86_64 Windows: 12 GPR slots + xmm6-15). Same byte payload as + <setjmp.h>'s jmp_buf -- the per-arch runtime reinterprets either + as the same internal struct. 
*/ +typedef struct coro_ctx { + _Alignas(16) unsigned char __cfree_storage[256]; +} coro_ctx; + +typedef void (*coro_entry_fn)(uintptr_t value); + +/* Initialize *ctx to begin executing entry(value) on first switch in, + using the stack region [stack_base, stack_base + stack_len). The + stack base must be CORO_STACK_ALIGN-aligned. entry must not return. */ +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry); + +/* Save callee-saved state into *from, restore it from *to, deliver + `value` to *to. Returns the value passed by the next switch back. */ +uintptr_t coro_switch(coro_ctx *from, coro_ctx *to, uintptr_t value); + +#endif diff --git a/lib/README.md b/lib/README.md @@ -33,6 +33,7 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired. | `riscv/rv64.S` | `__riscv_save_*` + `__riscv_restore_*` (rv64) | RISC-V rv64 with `-msave-restore` | | `mem/mem.c` | `memcpy` / `memmove` / `memset` / `memcmp` (weak) | All; user libc overrides | | `atomic/atomic_freestanding.c` | `__atomic_*` fallback shim | All | +| `coro/<arch>.c` | `setjmp` / `longjmp` (`<setjmp.h>`) + `coro_init` / `coro_switch` / `__cfree_coro_trampoline` (`<stdcoro.h>`) | One of `aarch64`, `arm32`, `arm32_thumb1`, `i386`, `riscv32`, `riscv64`, `x86_64`, `x86_64_win`. Not built for `wasm32`. | ### Build-time include dirs (consumed by the masters; nothing here lands in `libcfree_rt.a`) @@ -141,6 +142,27 @@ Hand-written portable C (not from compiler-rt). All four functions are weak so a user libc, or a tuned arch-specific replacement, wins at link time. `arm/aeabi_thumb{1,2}.S`'s `aeabi_mem*` symbols forward to these. +### `coro/<arch>.c` +One master `.c` per arch that supplies both `<setjmp.h>` (`setjmp`, +`longjmp`) and `<stdcoro.h>` (`coro_init`, `coro_switch`, +`__cfree_coro_trampoline`). 
The setjmp/longjmp/coro_switch primitives +share a per-arch struct (callee-saved GPRs + callee-saved FPRs + sp + +return address) and one pair of C string-concat macros +`SAVE_INTO(reg)` / `RESTORE_FROM(reg)` so the same instruction bytes +are emitted in all three places. Written as file-scope `__asm__` +inside a `.c` file (not a separate `.S`) so the asm and the tiny +`coro_init` C function stay co-located. Symbol naming uses +`__USER_LABEL_PREFIX__` so the same source compiles for ELF / Mach-O / +COFF. + +ARM ships two variants: `arm32.c` (Thumb-2, ARMv7+, optional VFP +`d8-d15` gated on `__ARM_FP`) and `arm32_thumb1.c` (ARMv6-M / +Cortex-M0/M0+; no IT blocks, no VFP, data-processing restricted to +r0-r7, no `str sp` / `str rN, [sp,...]` -- the asm sequences don't +share with arm32.c so it's a separate file). + +Not provided for `wasm32` (would need an Asyncify-fiber port). + ### `atomic/atomic_freestanding.c` Defines a pointer-sized `_Atomic(uintptr_t)` spinlock as the lock primitive (no OS dependency) then `#include`s `atomic_common.inc`, which contains the diff --git a/lib/build.sh b/lib/build.sh @@ -93,6 +93,16 @@ ARM_AEABI_THUMB1="arm/aeabi_thumb1.S arm/aeabi.c" RV32_SR="riscv/rv32.S" RV64_SR="riscv/rv64.S" +# Coro + setjmp/longjmp: one master .c per arch, file-scope asm inside. +CORO_X86_64="coro/x86_64.c" +CORO_X86_64_WIN="coro/x86_64_win.c" +CORO_I386="coro/i386.c" +CORO_AARCH64="coro/aarch64.c" +CORO_ARM32="coro/arm32.c" +CORO_ARM32_THUMB1="coro/arm32_thumb1.c" +CORO_RV32="coro/riscv32.c" +CORO_RV64="coro/riscv64.c" + #------------------------------------------------------------------------------- # Variants #------------------------------------------------------------------------------- @@ -103,71 +113,79 @@ echo # ---- LP64 little-endian ------------------------------------------------------ LP64_BASE="$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C" +# Coro impl needs cfree's own headers (setjmp.h, stdcoro.h). 
+CORO_INC="-I../include" + build_variant x86_64-linux \ - "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1" \ - "$LP64_BASE" + "--target=x86_64-linux-gnu -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ + "$LP64_BASE $CORO_X86_64" build_variant x86_64-apple-darwin \ - "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \ - "$LP64_BASE" + "--target=x86_64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ + "$LP64_BASE $CORO_X86_64" # aarch64-linux: long double is binary128; needs fp_tf + fp_ti and the # tf_supplement.h pre-include. build_variant aarch64-linux \ "--target=aarch64-linux-gnu \ -Iinclude/lp64_le_ldbl128 -Iinclude/lp64_le -DHAS_INT128=1 \ - -include include/lp64_le_ldbl128/tf_supplement.h" \ - "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C" + -include include/lp64_le_ldbl128/tf_supplement.h $CORO_INC" \ + "$INT_C $INT64_C $FP_C $FP_TF_C $FP_TI_C $MEM_C $ATOMIC_C $CORO_AARCH64" build_variant aarch64-apple-darwin \ - "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1" \ - "$LP64_BASE" + "--target=aarch64-apple-darwin -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ + "$LP64_BASE $CORO_AARCH64" build_variant riscv64-elf \ "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd \ - -Iinclude/lp64_le -DHAS_INT128=1" \ - "$LP64_BASE" + -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ + "$LP64_BASE $CORO_RV64" build_variant riscv64-elf-save-restore \ "--target=riscv64-unknown-elf -mabi=lp64 -march=rv64imafd -msave-restore \ - -Iinclude/lp64_le -DHAS_INT128=1" \ - "$LP64_BASE $RV64_SR" + -Iinclude/lp64_le -DHAS_INT128=1 $CORO_INC" \ + "$LP64_BASE $RV64_SR $CORO_RV64" # ---- LLP64 little-endian (Win64) -------------------------------------------- build_variant x86_64-pc-windows \ - "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1" \ - "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C" + "--target=x86_64-pc-windows-msvc -Iinclude/llp64_le -DHAS_INT128=1 $CORO_INC" \ + "$INT_C $INT64_C $FP_C $MEM_C $ATOMIC_C 
$CORO_X86_64_WIN" # ---- ILP32 little-endian ----------------------------------------------------- ILP32_BASE="$INT_C $INT32_C $FP_C $MEM_C $ATOMIC_C" build_variant i386-linux \ - "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0" \ - "$ILP32_BASE" + "--target=i386-linux-gnu -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ + "$ILP32_BASE $CORO_I386" +# wasm32: no setjmp/coro impl yet -- Emscripten fibers / sjlj are a +# separate runtime model that hasn't been ported to cfree. build_variant wasm32 \ "--target=wasm32-unknown-unknown -Iinclude/ilp32_le -DHAS_INT128=0" \ "$ILP32_BASE" build_variant riscv32-elf \ "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd \ - -Iinclude/ilp32_le -DHAS_INT128=0" \ - "$ILP32_BASE" + -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ + "$ILP32_BASE $CORO_RV32" build_variant riscv32-elf-save-restore \ "--target=riscv32-unknown-elf -mabi=ilp32 -march=rv32imafd -msave-restore \ - -Iinclude/ilp32_le -DHAS_INT128=0" \ - "$ILP32_BASE $RV32_SR" + -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ + "$ILP32_BASE $RV32_SR $CORO_RV32" build_variant arm-eabi-thumb2 \ "--target=arm-none-eabi -march=armv7-a -mthumb -mfloat-abi=soft \ - -Iinclude/ilp32_le -DHAS_INT128=0" \ - "$ILP32_BASE $ARM_AEABI_THUMB2" + -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ + "$ILP32_BASE $ARM_AEABI_THUMB2 $CORO_ARM32" +# arm-eabi-thumb1 (Cortex-M0/M0+, ARMv6-M): Thumb-1 ISA, no IT blocks, +# data-processing ops restricted to r0-r7, no VFP. Coro impl is a +# separate file from arm32.c since the asm sequences don't share. 
build_variant arm-eabi-thumb1 \ "--target=arm-none-eabi -march=armv6-m -mthumb -mfloat-abi=soft \ - -Iinclude/ilp32_le -DHAS_INT128=0" \ - "$ILP32_BASE $ARM_AEABI_THUMB1" + -Iinclude/ilp32_le -DHAS_INT128=0 $CORO_INC" \ + "$ILP32_BASE $ARM_AEABI_THUMB1 $CORO_ARM32_THUMB1" #------------------------------------------------------------------------------- echo diff --git a/lib/coro/aarch64.c b/lib/coro/aarch64.c @@ -0,0 +1,137 @@ +/* + * lib/coro/aarch64.c -- AArch64 (AAPCS) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * All three primitives sit on one per-target context layout: + * + * regs[0..9] x19-x28 + * regs[10..11] fp (x29), lr (x30) + * regs[12] sp + * fp_regs[0..7] d8-d15 (low 64 bits of v8-v15; AAPCS only mandates + * the lower 64 bits be preserved) + * + * sizeof = 176 (alignof-16 padded), 16-byte aligned. Fits in the + * 256-byte storage carved out by jmp_buf and coro_ctx. + * + * SAVE_/RESTORE_ are C string-concat macros so the same byte + * sequence is emitted in setjmp, longjmp, and coro_switch without + * any duplication or gas-specific .macro tricks. + * + * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C + * compiler's call-site mangling on both ELF (no prefix) and Mach-O + * (leading "_"). 
+ */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_arm64_ctx { + uintptr_t regs[13]; + uint64_t fp_regs[8]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_arm64_ctx) == 176, "layout"); +_Static_assert(_Alignof(struct __cfree_arm64_ctx) == 16, "align"); +_Static_assert(offsetof(struct __cfree_arm64_ctx, fp_regs) == 104, "fp off"); +_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_arm64_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm64_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_arm64_ctx *c = (struct __cfree_arm64_ctx *)ctx; + + /* AArch64 stacks grow down; align top to 16. */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)entry; /* x19 -- entry fn */ + c->regs[10] = 0; /* fp */ + c->regs[11] = (uintptr_t)__cfree_coro_trampoline;/* lr */ + c->regs[12] = top; /* sp */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved state into [reg]; clobbers x9 (caller-saved). */ +#define SAVE_INTO(reg) \ + " stp x19, x20, [" reg ", #0]\n" \ + " stp x21, x22, [" reg ", #16]\n" \ + " stp x23, x24, [" reg ", #32]\n" \ + " stp x25, x26, [" reg ", #48]\n" \ + " stp x27, x28, [" reg ", #64]\n" \ + " stp x29, x30, [" reg ", #80]\n" \ + " mov x9, sp\n" \ + " str x9, [" reg ", #96]\n" \ + " stp d8, d9, [" reg ", #104]\n" \ + " stp d10, d11, [" reg ", #120]\n" \ + " stp d12, d13, [" reg ", #136]\n" \ + " stp d14, d15, [" reg ", #152]\n" + +/* Restore callee-saved state from [reg]; clobbers x9. 
*/
+#define RESTORE_FROM(reg) \
+    " ldp d8, d9, [" reg ", #104]\n" \
+    " ldp d10, d11, [" reg ", #120]\n" \
+    " ldp d12, d13, [" reg ", #136]\n" \
+    " ldp d14, d15, [" reg ", #152]\n" \
+    " ldp x19, x20, [" reg ", #0]\n" \
+    " ldp x21, x22, [" reg ", #16]\n" \
+    " ldp x23, x24, [" reg ", #32]\n" \
+    " ldp x25, x26, [" reg ", #48]\n" \
+    " ldp x27, x28, [" reg ", #64]\n" \
+    " ldp x29, x30, [" reg ", #80]\n" \
+    " ldr x9, [" reg ", #96]\n" \
+    " mov sp, x9\n"
+
+__asm__ (
+    ".text\n"
+    ".align 4\n"
+
+    /* setjmp(env) -- env in x0. lr at call time is the return address
+       into the caller, exactly what longjmp must restore. Returns 0 on
+       the direct path. */
+    ".globl " SYM(setjmp) "\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("x0")
+    " mov x0, #0\n"
+    " ret\n"
+
+    /* longjmp(env, val) -- env in x0, val in w1.
+       `val` is an int, so only the low 32 bits of x1 carry it; AAPCS64
+       leaves bits 63:32 of an int argument register unspecified. Test
+       and deliver through the w views so stale upper bits can neither
+       defeat the zero check nor leak into the result.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); csinc gives
+       w0 = (w1 != 0) ? w1 : 1 branch-free. RESTORE_FROM only clobbers
+       x9, so w1 is still live after it. */
+    ".globl " SYM(longjmp) "\n"
+    SYM(longjmp) ":\n"
+    RESTORE_FROM("x0")
+    " cmp w1, #0\n"
+    " csinc w0, w1, wzr, ne\n"
+    " ret\n"
+
+    /* coro_switch(from, to, value) -- x0, x1, x2. Save into [x0],
+       restore from [x1], deliver x2 in x0 (which is both the return
+       register here and the first-arg register the trampoline reads
+       on a fresh context's first run). */
+    ".globl " SYM(coro_switch) "\n"
+    SYM(coro_switch) ":\n"
+    SAVE_INTO("x0")
+    RESTORE_FROM("x1")
+    " mov x0, x2\n"
+    " ret\n"
+
+    /* __cfree_coro_trampoline -- on first entry x0 = value (delivered),
+       x19 = entry fn (set by coro_init), sp aligned to 16. brk if entry
+       returns.
*/ + ".globl " SYM(__cfree_coro_trampoline) "\n" + SYM(__cfree_coro_trampoline) ":\n" + " blr x19\n" + " brk #0\n" +); diff --git a/lib/coro/arm32.c b/lib/coro/arm32.c @@ -0,0 +1,202 @@ +/* + * lib/coro/arm32.c -- ARM32 Thumb-2 (AAPCS) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * All three primitives sit on one per-target context layout: + * + * regs[0..7] r4-r11 + * regs[8] sp + * regs[9] lr + * fp_regs[0..7] d8-d15 (AAPCS only mandates the lower 64 bits of + * v8-v15 be preserved across calls; saved + * only when __ARM_FP is defined, but the + * slots are always allocated so the byte + * layout is stable across soft/hard-float + * builds). + * + * 10*4 GPR slots + 8*8 fp_regs slots = 104 bytes of payload, padded + * to 112 by alignof(16). fp_regs at offset 40. Fits in the 256-byte + * storage carved out by jmp_buf and coro_ctx. + * + * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence + * is emitted in setjmp, longjmp, and coro_switch. The VFP half is + * gated by a C-level #ifdef on __ARM_FP -- the cpp pass picks one + * macro body before the assembler sees anything, so we can't hide + * `#ifdef` inside the asm string. + * + * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C + * compiler's call-site mangling on both ELF (no prefix) and Mach-O + * (leading "_"). 
+ */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_arm32_ctx { + uintptr_t regs[10]; + uint64_t fp_regs[8]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_arm32_ctx) == 112, "layout"); +_Static_assert(_Alignof(struct __cfree_arm32_ctx) == 16, "align"); +_Static_assert(offsetof(struct __cfree_arm32_ctx, fp_regs) == 40, "fp off"); +_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_arm32_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_arm32_ctx *c = (struct __cfree_arm32_ctx *)ctx; + + /* ARM32 stacks grow down; align top to 16 (AAPCS public-boundary + requirement is 8, but coro stacks promise CORO_STACK_ALIGN=16). */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)entry; /* r4 -- entry fn */ + c->regs[3] = 0; /* r7 -- frame ptr */ + c->regs[8] = top; /* sp */ + c->regs[9] = (uintptr_t)__cfree_coro_trampoline; /* lr */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save/restore macros. The VFP half is conditional on __ARM_FP at the + C-cpp level -- by the time the inline assembler sees the string, + only one variant remains. The byte offsets match the struct layout + regardless (slots are always allocated). 
*/ +#ifdef __ARM_FP +#define SAVE_INTO(reg) \ + " str r4, [" reg ", #0]\n" \ + " str r5, [" reg ", #4]\n" \ + " str r6, [" reg ", #8]\n" \ + " str r7, [" reg ", #12]\n" \ + " str r8, [" reg ", #16]\n" \ + " str r9, [" reg ", #20]\n" \ + " str r10, [" reg ", #24]\n" \ + " str r11, [" reg ", #28]\n" \ + " str sp, [" reg ", #32]\n" \ + " str lr, [" reg ", #36]\n" \ + " vstr d8, [" reg ", #40]\n" \ + " vstr d9, [" reg ", #48]\n" \ + " vstr d10, [" reg ", #56]\n" \ + " vstr d11, [" reg ", #64]\n" \ + " vstr d12, [" reg ", #72]\n" \ + " vstr d13, [" reg ", #80]\n" \ + " vstr d14, [" reg ", #88]\n" \ + " vstr d15, [" reg ", #96]\n" + +#define RESTORE_FROM(reg) \ + " vldr d8, [" reg ", #40]\n" \ + " vldr d9, [" reg ", #48]\n" \ + " vldr d10, [" reg ", #56]\n" \ + " vldr d11, [" reg ", #64]\n" \ + " vldr d12, [" reg ", #72]\n" \ + " vldr d13, [" reg ", #80]\n" \ + " vldr d14, [" reg ", #88]\n" \ + " vldr d15, [" reg ", #96]\n" \ + " ldr r4, [" reg ", #0]\n" \ + " ldr r5, [" reg ", #4]\n" \ + " ldr r6, [" reg ", #8]\n" \ + " ldr r7, [" reg ", #12]\n" \ + " ldr r8, [" reg ", #16]\n" \ + " ldr r9, [" reg ", #20]\n" \ + " ldr r10, [" reg ", #24]\n" \ + " ldr r11, [" reg ", #28]\n" \ + " ldr sp, [" reg ", #32]\n" \ + " ldr lr, [" reg ", #36]\n" +#else +#define SAVE_INTO(reg) \ + " str r4, [" reg ", #0]\n" \ + " str r5, [" reg ", #4]\n" \ + " str r6, [" reg ", #8]\n" \ + " str r7, [" reg ", #12]\n" \ + " str r8, [" reg ", #16]\n" \ + " str r9, [" reg ", #20]\n" \ + " str r10, [" reg ", #24]\n" \ + " str r11, [" reg ", #28]\n" \ + " str sp, [" reg ", #32]\n" \ + " str lr, [" reg ", #36]\n" + +#define RESTORE_FROM(reg) \ + " ldr r4, [" reg ", #0]\n" \ + " ldr r5, [" reg ", #4]\n" \ + " ldr r6, [" reg ", #8]\n" \ + " ldr r7, [" reg ", #12]\n" \ + " ldr r8, [" reg ", #16]\n" \ + " ldr r9, [" reg ", #20]\n" \ + " ldr r10, [" reg ", #24]\n" \ + " ldr r11, [" reg ", #28]\n" \ + " ldr sp, [" reg ", #32]\n" \ + " ldr lr, [" reg ", #36]\n" +#endif + +__asm__ ( + ".syntax unified\n" + 
".thumb\n" + ".text\n" + ".align 2\n" + + /* setjmp(env) -- env in r0. lr at call time is the return address + into the caller, exactly what longjmp must restore. */ + ".globl " SYM(setjmp) "\n" + ".thumb_func\n" + ".type " SYM(setjmp) ", %function\n" + SYM(setjmp) ":\n" + SAVE_INTO("r0") + " movs r0, #0\n" + " bx lr\n" + + /* longjmp(env, val) -- env in r0, val in r1. + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); the IT block + gives r1 = (r1 == 0) ? 1 : r1, then we move it into r0 and + branch to the saved lr. */ + ".globl " SYM(longjmp) "\n" + ".thumb_func\n" + ".type " SYM(longjmp) ", %function\n" + SYM(longjmp) ":\n" + RESTORE_FROM("r0") + " cmp r1, #0\n" + " it eq\n" + " moveq r1, #1\n" + " mov r0, r1\n" + " bx lr\n" + + /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value. + Save into [r0], restore from [r1], deliver r2 in r0. The lr + loaded by RESTORE_FROM is either a real return address (a + previously-suspended coro) or __cfree_coro_trampoline (a fresh + coro initialized by coro_init). Either way `bx lr` lands there + with r0 holding `value`. */ + ".globl " SYM(coro_switch) "\n" + ".thumb_func\n" + ".type " SYM(coro_switch) ", %function\n" + SYM(coro_switch) ":\n" + SAVE_INTO("r0") + RESTORE_FROM("r1") + " mov r0, r2\n" + " bx lr\n" + + /* __cfree_coro_trampoline -- on first entry r0 = value (delivered + by coro_switch's `mov r0, r2`), r4 = entry fn (set by coro_init), + sp aligned to 16. udf if entry returns. 
*/ + ".globl " SYM(__cfree_coro_trampoline) "\n" + ".thumb_func\n" + ".type " SYM(__cfree_coro_trampoline) ", %function\n" + SYM(__cfree_coro_trampoline) ":\n" + " blx r4\n" + " udf #0\n" + + ".section .note.GNU-stack,\"\",%progbits\n" +); diff --git a/lib/coro/arm32_thumb1.c b/lib/coro/arm32_thumb1.c @@ -0,0 +1,174 @@ +/* + * lib/coro/arm32_thumb1.c -- ARMv6-M (Cortex-M0 / M0+, Thumb-1) impls of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * Thumb-1 / ARMv6-M is a strict subset of the Thumb-2 ISA used by the + * sibling arm32.c, and several conveniences disappear: + * + * - no IT blocks: conditional execution must use a forward branch. + * - data-processing ops are restricted to r0-r7. r8-r15 are reachable + * only via the `mov` high-register form and a few specials; in + * particular there is no `str rN, [sp,...]` / `str sp, [rN,...]`. + * - `mov rd, rm` with *both* operands low is UNPREDICTABLE in + * ARMv6-M; use the T2 flags-setting form `movs rd, rm` for low->low + * register copies. The plain `mov` form is reserved for cases where + * at least one operand is a high register (sp/lr/r8-r11). + * - no VFP coprocessor on M0/M0+, so no fp_regs slots. + * + * Layout: 10 GPR slots (r4-r11, sp, lr) = 40 bytes, padded to 16-byte + * alignment by alignof(16). Fits in the 256-byte storage carved out by + * jmp_buf and coro_ctx. + * + * SAVE_INTO uses r4-r7 as scratches *after* they have themselves been + * stored, so r0-r3 are never clobbered. That matters for coro_switch: + * `to` (r1) and `value` (r2) survive across the save half and are still + * live for the restore half / value delivery. 
+ */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_arm32_thumb1_ctx { + uintptr_t regs[10]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) == 48, "layout"); +_Static_assert(_Alignof(struct __cfree_arm32_thumb1_ctx) == 16, "align"); +_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_arm32_thumb1_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_arm32_thumb1_ctx),"align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_arm32_thumb1_ctx *c = (struct __cfree_arm32_thumb1_ctx *)ctx; + + /* ARM stacks grow down; align top to 16 (AAPCS public-boundary + requirement is 8, but coro stacks promise CORO_STACK_ALIGN=16). */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)entry; /* r4 -- entry fn */ + c->regs[3] = 0; /* r7 -- frame ptr */ + c->regs[8] = top; /* sp */ + c->regs[9] = (uintptr_t)__cfree_coro_trampoline; /* lr */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved state into [reg]. + Stage 1: store r4-r7 directly (low->low str is fine). + Stage 2: with r4-r7 already saved, reuse them as scratches to copy + the high regs r8-r11 down and store them. + Stage 3: same trick for sp and lr. + r0-r3 are never touched. 
*/
+#define SAVE_INTO(reg) \
+    " str r4, [" reg ", #0]\n" \
+    " str r5, [" reg ", #4]\n" \
+    " str r6, [" reg ", #8]\n" \
+    " str r7, [" reg ", #12]\n" \
+    " mov r4, r8\n" \
+    " mov r5, r9\n" \
+    " mov r6, r10\n" \
+    " mov r7, r11\n" \
+    " str r4, [" reg ", #16]\n" \
+    " str r5, [" reg ", #20]\n" \
+    " str r6, [" reg ", #24]\n" \
+    " str r7, [" reg ", #28]\n" \
+    " mov r4, sp\n" \
+    " mov r5, lr\n" \
+    " str r4, [" reg ", #32]\n" \
+    " str r5, [" reg ", #36]\n"
+
+/* Restore callee-saved state from [reg]. Mirror image: load r8-r11/sp/lr
+   first via r4-r7 as scratches, then restore the real r4-r7 last. */
+#define RESTORE_FROM(reg) \
+    " ldr r4, [" reg ", #16]\n" \
+    " ldr r5, [" reg ", #20]\n" \
+    " ldr r6, [" reg ", #24]\n" \
+    " ldr r7, [" reg ", #28]\n" \
+    " mov r8, r4\n" \
+    " mov r9, r5\n" \
+    " mov r10, r6\n" \
+    " mov r11, r7\n" \
+    " ldr r4, [" reg ", #32]\n" \
+    " ldr r5, [" reg ", #36]\n" \
+    " mov sp, r4\n" \
+    " mov lr, r5\n" \
+    " ldr r4, [" reg ", #0]\n" \
+    " ldr r5, [" reg ", #4]\n" \
+    " ldr r6, [" reg ", #8]\n" \
+    " ldr r7, [" reg ", #12]\n"
+
+__asm__ (
+    ".syntax unified\n"
+    ".thumb\n"
+    ".text\n"
+    ".align 2\n"
+
+    /* setjmp(env) -- env in r0. lr at call time is the return address
+       into the caller, exactly what longjmp must restore.
+       SAVE_INTO leaves r4-r7 holding r10, r11, sp and lr (it recycles
+       them as scratch once their own values are stored). They are
+       callee-saved, and on this direct return-0 path nothing else
+       puts them back, so reload the caller's values from the slots
+       just written before returning. r0 still points at env because
+       SAVE_INTO never touches r0-r3. */
+    ".globl " SYM(setjmp) "\n"
+    ".thumb_func\n"
+    ".type " SYM(setjmp) ", %function\n"
+    SYM(setjmp) ":\n"
+    SAVE_INTO("r0")
+    " ldr r4, [r0, #0]\n"
+    " ldr r5, [r0, #4]\n"
+    " ldr r6, [r0, #8]\n"
+    " ldr r7, [r0, #12]\n"
+    " movs r0, #0\n"
+    " bx lr\n"
+
+    /* longjmp(env, val) -- env in r0, val in r1.
+       longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). No IT blocks in
+       Thumb-1, so use a forward branch for the substitution.
+       Both `r0 <- r1` and the immediate ops use the T2 (`movs`) form
+       since plain `mov rd, rm` with both low operands is UNPREDICTABLE
+       on ARMv6-M.
*/ + ".globl " SYM(longjmp) "\n" + ".thumb_func\n" + ".type " SYM(longjmp) ", %function\n" + SYM(longjmp) ":\n" + RESTORE_FROM("r0") + " cmp r1, #0\n" + " bne 1f\n" + " movs r1, #1\n" + "1:\n" + " movs r0, r1\n" + " bx lr\n" + + /* coro_switch(from, to, value) -- r0=from, r1=to, r2=value. + SAVE_INTO leaves r0-r3 untouched, so r1 (to) and r2 (value) are + still live. RESTORE_FROM clobbers r4-r7 freely (they belong to + the resumed coro). The lr loaded by RESTORE_FROM is either a + real return address (a previously-suspended coro) or + __cfree_coro_trampoline (a fresh coro initialized by coro_init); + either way `bx lr` lands there with r0 holding `value`. */ + ".globl " SYM(coro_switch) "\n" + ".thumb_func\n" + ".type " SYM(coro_switch) ", %function\n" + SYM(coro_switch) ":\n" + SAVE_INTO("r0") + RESTORE_FROM("r1") + " movs r0, r2\n" + " bx lr\n" + + /* __cfree_coro_trampoline -- on first entry r0 = value (delivered + by coro_switch's `movs r0, r2`), r4 = entry fn (set by coro_init), + sp aligned to 16. UDF #0 (T1, ARMv6-M) traps if entry returns. */ + ".globl " SYM(__cfree_coro_trampoline) "\n" + ".thumb_func\n" + ".type " SYM(__cfree_coro_trampoline) ", %function\n" + SYM(__cfree_coro_trampoline) ":\n" + " blx r4\n" + " udf #0\n" + + ".section .note.GNU-stack,\"\",%progbits\n" +); diff --git a/lib/coro/i386.c b/lib/coro/i386.c @@ -0,0 +1,143 @@ +/* + * lib/coro/i386.c -- i386 System V (cdecl, ILP32) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * cdecl callee-saved set: ebx, esi, edi, ebp, esp. Args are pushed + * right-to-left on the stack: at function entry, 4(%esp)=arg0, + * 8(%esp)=arg1, 12(%esp)=arg2, (%esp)=return-address. 
+ * + * regs[0]: ebx (also stashes entry fn for the trampoline) + * regs[1]: esi + * regs[2]: edi + * regs[3]: ebp + * regs[4]: esp (caller's pre-call esp) + * regs[5]: eip (return address) + * + * 6 × 4 = 24 bytes of state, padded to sizeof = 32 by the 16-byte + * over-alignment (vs. natural 4) so coro_ctx's 16-byte alignment is + * matched. + * + * setjmp(env) 4(%esp)=env + * longjmp(env, val) 4(%esp)=env, 8(%esp)=val + * coro_switch(f, t, val) 4(%esp)=from, 8(%esp)=to, 12(%esp)=value + * + * The "save esp/eip" trick: at function entry, (%esp) holds the caller's + * return address (just pushed by `call`); 4(%esp) is the caller's + * pre-call esp. Saving those two lets longjmp/coro_switch "land" at the + * call site exactly as if the function had returned. + * + * Modern SysV i386 (ABI rev 1.1+) requires 16-byte stack alignment + * before each `call`; the trampoline `andl $-16, %esp` enforces this + * defensively for fresh contexts. + */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_i386_ctx { + uintptr_t regs[6]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_i386_ctx) == 32, "layout"); +_Static_assert(_Alignof(struct __cfree_i386_ctx) == 16, "align"); +_Static_assert(sizeof(struct __cfree_i386_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_i386_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_i386_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_i386_ctx *c = (struct __cfree_i386_ctx *)ctx; + + /* i386 stacks grow down; align top to 16. 
*/ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)entry; /* ebx -- entry fn */ + c->regs[3] = 0; /* ebp */ + c->regs[4] = top; /* esp */ + c->regs[5] = (uintptr_t)__cfree_coro_trampoline; /* eip */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved + (caller's) esp + eip into [reg]; clobbers %eax. + Assumes the function-entry stack layout: (%esp) holds the return + address and the address %esp+4 (not the word stored there) is the + caller's pre-call esp. */ +#define SAVE_INTO(reg) \ + " movl %ebx, 0(" reg ")\n" \ + " movl %esi, 4(" reg ")\n" \ + " movl %edi, 8(" reg ")\n" \ + " movl %ebp, 12(" reg ")\n" \ + " leal 4(%esp), %eax\n" \ + " movl %eax, 16(" reg ")\n" \ + " movl (%esp), %eax\n" \ + " movl %eax, 20(" reg ")\n" + +/* Restore callee-saved + esp from [reg], leave eip in %ecx ready to + jmp. Caller delivers the destination value in %eax beforehand. */ +#define RESTORE_FROM(reg) \ + " movl 0(" reg "), %ebx\n" \ + " movl 4(" reg "), %esi\n" \ + " movl 8(" reg "), %edi\n" \ + " movl 12(" reg "), %ebp\n" \ + " movl 16(" reg "), %esp\n" \ + " movl 20(" reg "), %ecx\n" + +__asm__ ( + ".text\n" + ".p2align 4\n" + + /* setjmp(env) -- env at 4(%esp). */ + ".globl " SYM(setjmp) "\n" + SYM(setjmp) ":\n" + " movl 4(%esp), %edx\n" + SAVE_INTO("%edx") + " xorl %eax, %eax\n" + " ret\n" + + /* longjmp(env, val) -- env at 4(%esp), val at 8(%esp). + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). */ + ".globl " SYM(longjmp) "\n" + SYM(longjmp) ":\n" + " movl 4(%esp), %edx\n" /* env */ + " movl 8(%esp), %eax\n" /* val */ + " testl %eax, %eax\n" + " movl $1, %ecx\n" + " cmovel %ecx, %eax\n" + RESTORE_FROM("%edx") + " jmp *%ecx\n" + + /* coro_switch(from, to, value) -- 4(%esp)=from, 8(%esp)=to, 12(%esp)=value. + SAVE_INTO only scratches %eax and never moves %esp, so `to` and + `value` are fetched from the incoming frame after the save -- but + before RESTORE_FROM replaces %esp.
*/ + ".globl " SYM(coro_switch) "\n" + SYM(coro_switch) ":\n" + " movl 4(%esp), %edx\n" /* from */ + SAVE_INTO("%edx") + " movl 8(%esp), %edx\n" /* to (re-read; SAVE clobbered %eax not stack) */ + " movl 12(%esp), %eax\n" /* value -- delivered as return reg */ + RESTORE_FROM("%edx") + " jmp *%ecx\n" + + /* __cfree_coro_trampoline -- on first entry: %eax=value, %ebx=entry, + %esp=stack_top (no return addr pushed -- coro_switch reaches here + via jmp). cdecl needs the arg pushed; align defensively, then + reserve 12 bytes + push value so that after the upcoming `call` + pushes the 4-byte return addr, the callee sees %esp+4 16-aligned. */ + ".globl " SYM(__cfree_coro_trampoline) "\n" + SYM(__cfree_coro_trampoline) ":\n" + " andl $-16, %esp\n" + " subl $12, %esp\n" + " pushl %eax\n" /* arg0 = value */ + " calll *%ebx\n" /* entry(value) */ + " ud2\n" +); diff --git a/lib/coro/riscv32.c b/lib/coro/riscv32.c @@ -0,0 +1,219 @@ +/* + * lib/coro/riscv32.c -- RISC-V 32-bit (ILP32/ILP32F/ILP32D) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * Per-target context layout (matches xOS rv32 tick_coro_ctx): + * + * regs[0]: ra + * regs[1]: sp + * regs[2..13]: s0-s11 + * fp_regs[0..11]: fs0-fs11 + * + * The fp_regs slots are always allocated (12 * 8 = 96 bytes at offset + * 56) so the struct layout is constant regardless of the F/D extension. + * The save/restore code is conditional on __riscv_flen: + * __riscv_flen == 64 -> fsd/fld (64-bit, fills slots fully) + * __riscv_flen == 32 -> fsw/flw (32-bit, packs into the low halves) + * else -> no FP save/restore + * + * Field bytes = 14*4 + 12*8 = 152; sizeof = 160 after 16-byte align + * tail padding. Fits in the 256-byte storage carved out by jmp_buf + * and coro_ctx. + * + * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence + * is emitted in setjmp, longjmp, and coro_switch without duplication. 
+ */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_riscv32_ctx { + uintptr_t regs[14]; + uint64_t fp_regs[12]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_riscv32_ctx) == 160, "layout"); +_Static_assert(_Alignof(struct __cfree_riscv32_ctx) == 16, "align"); +_Static_assert(offsetof(struct __cfree_riscv32_ctx, fp_regs) == 56, "fp off"); +_Static_assert(sizeof(struct __cfree_riscv32_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_riscv32_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv32_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_riscv32_ctx *c = (struct __cfree_riscv32_ctx *)ctx; + + /* RISC-V stacks grow down; align top to 16. */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra */ + c->regs[1] = top; /* sp */ + c->regs[2] = (uintptr_t)entry; /* s0 -- entry fn */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Integer save: ra, sp, s0-s11 into regs[0..13] at offsets 0..52. 
*/ +#define SAVE_GPR(reg) \ + " sw ra, 0(" reg ")\n" \ + " sw sp, 4(" reg ")\n" \ + " sw s0, 8(" reg ")\n" \ + " sw s1, 12(" reg ")\n" \ + " sw s2, 16(" reg ")\n" \ + " sw s3, 20(" reg ")\n" \ + " sw s4, 24(" reg ")\n" \ + " sw s5, 28(" reg ")\n" \ + " sw s6, 32(" reg ")\n" \ + " sw s7, 36(" reg ")\n" \ + " sw s8, 40(" reg ")\n" \ + " sw s9, 44(" reg ")\n" \ + " sw s10, 48(" reg ")\n" \ + " sw s11, 52(" reg ")\n" + +#define RESTORE_GPR(reg) \ + " lw ra, 0(" reg ")\n" \ + " lw sp, 4(" reg ")\n" \ + " lw s0, 8(" reg ")\n" \ + " lw s1, 12(" reg ")\n" \ + " lw s2, 16(" reg ")\n" \ + " lw s3, 20(" reg ")\n" \ + " lw s4, 24(" reg ")\n" \ + " lw s5, 28(" reg ")\n" \ + " lw s6, 32(" reg ")\n" \ + " lw s7, 36(" reg ")\n" \ + " lw s8, 40(" reg ")\n" \ + " lw s9, 44(" reg ")\n" \ + " lw s10, 48(" reg ")\n" \ + " lw s11, 52(" reg ")\n" + +#if __riscv_flen == 64 +#define SAVE_FPR(reg) \ + " fsd fs0, 56(" reg ")\n" \ + " fsd fs1, 64(" reg ")\n" \ + " fsd fs2, 72(" reg ")\n" \ + " fsd fs3, 80(" reg ")\n" \ + " fsd fs4, 88(" reg ")\n" \ + " fsd fs5, 96(" reg ")\n" \ + " fsd fs6, 104(" reg ")\n" \ + " fsd fs7, 112(" reg ")\n" \ + " fsd fs8, 120(" reg ")\n" \ + " fsd fs9, 128(" reg ")\n" \ + " fsd fs10, 136(" reg ")\n" \ + " fsd fs11, 144(" reg ")\n" +#define RESTORE_FPR(reg) \ + " fld fs0, 56(" reg ")\n" \ + " fld fs1, 64(" reg ")\n" \ + " fld fs2, 72(" reg ")\n" \ + " fld fs3, 80(" reg ")\n" \ + " fld fs4, 88(" reg ")\n" \ + " fld fs5, 96(" reg ")\n" \ + " fld fs6, 104(" reg ")\n" \ + " fld fs7, 112(" reg ")\n" \ + " fld fs8, 120(" reg ")\n" \ + " fld fs9, 128(" reg ")\n" \ + " fld fs10, 136(" reg ")\n" \ + " fld fs11, 144(" reg ")\n" +#elif __riscv_flen == 32 +#define SAVE_FPR(reg) \ + " fsw fs0, 56(" reg ")\n" \ + " fsw fs1, 60(" reg ")\n" \ + " fsw fs2, 64(" reg ")\n" \ + " fsw fs3, 68(" reg ")\n" \ + " fsw fs4, 72(" reg ")\n" \ + " fsw fs5, 76(" reg ")\n" \ + " fsw fs6, 80(" reg ")\n" \ + " fsw fs7, 84(" reg ")\n" \ + " fsw fs8, 88(" reg ")\n" \ + " fsw fs9, 92(" reg ")\n" \ + 
" fsw fs10, 96(" reg ")\n" \ + " fsw fs11, 100(" reg ")\n" +#define RESTORE_FPR(reg) \ + " flw fs0, 56(" reg ")\n" \ + " flw fs1, 60(" reg ")\n" \ + " flw fs2, 64(" reg ")\n" \ + " flw fs3, 68(" reg ")\n" \ + " flw fs4, 72(" reg ")\n" \ + " flw fs5, 76(" reg ")\n" \ + " flw fs6, 80(" reg ")\n" \ + " flw fs7, 84(" reg ")\n" \ + " flw fs8, 88(" reg ")\n" \ + " flw fs9, 92(" reg ")\n" \ + " flw fs10, 96(" reg ")\n" \ + " flw fs11, 100(" reg ")\n" +#else +#define SAVE_FPR(reg) "" +#define RESTORE_FPR(reg) "" +#endif + +/* Save: int first, FP second (matches xOS rv32 pattern, and rv64 here). + Restore: FP first, int second -- mirror order, minimizes register + reuse window. Note none of these loads write to the address-base + register, so the integer/FP order is purely cosmetic. */ +#define SAVE_INTO(reg) SAVE_GPR(reg) SAVE_FPR(reg) +#define RESTORE_FROM(reg) RESTORE_FPR(reg) RESTORE_GPR(reg) + +__asm__ ( + ".text\n" + ".align 2\n" + + /* setjmp(env) -- env=a0. ra at function entry is the caller's + return address, exactly what longjmp must restore. */ + ".globl " SYM(setjmp) "\n" + ".type " SYM(setjmp) ", @function\n" + SYM(setjmp) ":\n" + SAVE_INTO("a0") + " li a0, 0\n" + " ret\n" + ".size " SYM(setjmp) ", .-" SYM(setjmp) "\n" + + /* longjmp(env, val) -- env=a0, val=a1. + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4); branch-free: + seqz t0, a1 ; t0 = (a1 == 0) + add a0, a1, t0 + so a0 = a1 if a1 != 0, else 1. */ + ".globl " SYM(longjmp) "\n" + ".type " SYM(longjmp) ", @function\n" + SYM(longjmp) ":\n" + RESTORE_FROM("a0") + " seqz t0, a1\n" + " add a0, a1, t0\n" + " ret\n" + ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n" + + /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value. + Save into [a0], restore from [a1], deliver a2 in a0 (which is + both the return register and the trampoline's first-arg reg + on a fresh context's first run). 
*/ + ".globl " SYM(coro_switch) "\n" + ".type " SYM(coro_switch) ", @function\n" + SYM(coro_switch) ":\n" + SAVE_INTO("a0") + RESTORE_FROM("a1") + " mv a0, a2\n" + " ret\n" + ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n" + + /* __cfree_coro_trampoline -- on first entry: a0=value (delivered + by coro_switch's `mv a0, a2`), s0=entry (set by coro_init via + regs[2]), sp=stack_top. ebreak if entry returns. */ + ".globl " SYM(__cfree_coro_trampoline) "\n" + ".type " SYM(__cfree_coro_trampoline) ", @function\n" + SYM(__cfree_coro_trampoline) ":\n" + " jalr s0\n" + " ebreak\n" + ".size " SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n" + + ".section .note.GNU-stack,\"\",@progbits\n" +); diff --git a/lib/coro/riscv64.c b/lib/coro/riscv64.c @@ -0,0 +1,193 @@ +/* + * lib/coro/riscv64.c -- RISC-V 64-bit (LP64D) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * RISC-V LP64D callee-saved set: + * ra (x1) -- saved manually so longjmp/coro_switch can + * "return" to the original call site + * sp (x2) + * s0-s11 (x8-x9, x18-x27) + * fs0-fs11 (f8-f9, f18-f27) + * + * Layout (matches xOS rv64 tick_coro_ctx): + * + * regs[0]: ra + * regs[1]: sp + * regs[2..13]: s0-s11 + * fp_regs[0..11]: fs0-fs11 (offset 112) + * + * sizeof = 14*8 + 12*8 = 208, 16-byte aligned. Fits in the 256-byte + * storage carved out by jmp_buf and coro_ctx. + * + * setjmp(env) a0=env + * longjmp(env, val) a0=env, a1=val + * coro_switch(f, t, val) a0=from, a1=to, a2=val + * + * Value-passing trick: the destination context "returns" via + * ld ra, 0(a1); ... ret + * where `ret` is `jalr x0, 0(ra)`. By moving the value into a0 just + * before `ret`, both a fresh trampoline (entry(value)) and a previously + * suspended coro_switch (= the value its switch call returned) see it + * as the a0 return register. 
+ * + * SAVE_/RESTORE_ are C string-concat macros so the same byte sequence + * is emitted in setjmp, longjmp, and coro_switch without duplication. + * + * Symbol naming uses __USER_LABEL_PREFIX__ so labels match the C + * compiler's call-site mangling (empty on RISC-V ELF). + */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_riscv64_ctx { + uintptr_t regs[14]; + uint64_t fp_regs[12]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_riscv64_ctx) == 208, "layout"); +_Static_assert(_Alignof(struct __cfree_riscv64_ctx) == 16, "align"); +_Static_assert(offsetof(struct __cfree_riscv64_ctx, fp_regs) == 112, "fp off"); +_Static_assert(sizeof(struct __cfree_riscv64_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_riscv64_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_riscv64_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_riscv64_ctx *c = (struct __cfree_riscv64_ctx *)ctx; + + /* RISC-V stacks grow down; align top to 16. */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[0] = (uintptr_t)__cfree_coro_trampoline; /* ra */ + c->regs[1] = top; /* sp */ + c->regs[2] = (uintptr_t)entry; /* s0 -- entry fn */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved state into [reg]. reg is a register name string, + e.g. "a0". Emits straight-line sd/fsd; no scratch register needed. 
*/ +#define SAVE_INTO(reg) \ + " sd ra, 0(" reg ")\n" \ + " sd sp, 8(" reg ")\n" \ + " sd s0, 16(" reg ")\n" \ + " sd s1, 24(" reg ")\n" \ + " sd s2, 32(" reg ")\n" \ + " sd s3, 40(" reg ")\n" \ + " sd s4, 48(" reg ")\n" \ + " sd s5, 56(" reg ")\n" \ + " sd s6, 64(" reg ")\n" \ + " sd s7, 72(" reg ")\n" \ + " sd s8, 80(" reg ")\n" \ + " sd s9, 88(" reg ")\n" \ + " sd s10, 96(" reg ")\n" \ + " sd s11, 104(" reg ")\n" \ + " fsd fs0, 112(" reg ")\n" \ + " fsd fs1, 120(" reg ")\n" \ + " fsd fs2, 128(" reg ")\n" \ + " fsd fs3, 136(" reg ")\n" \ + " fsd fs4, 144(" reg ")\n" \ + " fsd fs5, 152(" reg ")\n" \ + " fsd fs6, 160(" reg ")\n" \ + " fsd fs7, 168(" reg ")\n" \ + " fsd fs8, 176(" reg ")\n" \ + " fsd fs9, 184(" reg ")\n" \ + " fsd fs10, 192(" reg ")\n" \ + " fsd fs11, 200(" reg ")\n" + +/* Restore callee-saved state from [reg]. */ +#define RESTORE_FROM(reg) \ + " fld fs0, 112(" reg ")\n" \ + " fld fs1, 120(" reg ")\n" \ + " fld fs2, 128(" reg ")\n" \ + " fld fs3, 136(" reg ")\n" \ + " fld fs4, 144(" reg ")\n" \ + " fld fs5, 152(" reg ")\n" \ + " fld fs6, 160(" reg ")\n" \ + " fld fs7, 168(" reg ")\n" \ + " fld fs8, 176(" reg ")\n" \ + " fld fs9, 184(" reg ")\n" \ + " fld fs10, 192(" reg ")\n" \ + " fld fs11, 200(" reg ")\n" \ + " ld ra, 0(" reg ")\n" \ + " ld sp, 8(" reg ")\n" \ + " ld s0, 16(" reg ")\n" \ + " ld s1, 24(" reg ")\n" \ + " ld s2, 32(" reg ")\n" \ + " ld s3, 40(" reg ")\n" \ + " ld s4, 48(" reg ")\n" \ + " ld s5, 56(" reg ")\n" \ + " ld s6, 64(" reg ")\n" \ + " ld s7, 72(" reg ")\n" \ + " ld s8, 80(" reg ")\n" \ + " ld s9, 88(" reg ")\n" \ + " ld s10, 96(" reg ")\n" \ + " ld s11, 104(" reg ")\n" + +__asm__ ( + ".text\n" + ".align 2\n" + + /* setjmp(env) -- env in a0. ra at call time is the caller's return + address, which is exactly what longjmp must restore. 
*/ + ".globl " SYM(setjmp) "\n" + ".type " SYM(setjmp) ", @function\n" + SYM(setjmp) ":\n" + SAVE_INTO("a0") + " li a0, 0\n" + " ret\n" + ".size " SYM(setjmp) ", .-" SYM(setjmp) "\n" + + /* longjmp(env, val) -- env in a0, val in a1. + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). Branch-free: + seqz t0, a1 -> t0 = (a1==0); a0 = a1 + t0. RESTORE_FROM + doesn't touch t0/a0/a1, so the seqz/add can run after it and + write a0 directly -- one fewer instruction than munging a1 + first and mv'ing later. */ + ".globl " SYM(longjmp) "\n" + ".type " SYM(longjmp) ", @function\n" + SYM(longjmp) ":\n" + RESTORE_FROM("a0") + " seqz t0, a1\n" + " add a0, a1, t0\n" + " ret\n" + ".size " SYM(longjmp) ", .-" SYM(longjmp) "\n" + + /* coro_switch(from, to, value) -- a0=from, a1=to, a2=value. + Save into [a0], restore from [a1] (which clobbers a0 and a1's + roles -- ra/sp/s* are loaded from the to-context), then deliver + value in a0 just before ret. */ + ".globl " SYM(coro_switch) "\n" + ".type " SYM(coro_switch) ", @function\n" + SYM(coro_switch) ":\n" + SAVE_INTO("a0") + RESTORE_FROM("a1") + " mv a0, a2\n" + " ret\n" + ".size " SYM(coro_switch) ", .-" SYM(coro_switch) "\n" + + /* __cfree_coro_trampoline -- on first entry: a0=value (delivered), + s0=entry fn (set by coro_init), sp aligned to 16. ebreak if entry + returns. */ + ".globl " SYM(__cfree_coro_trampoline) "\n" + ".type " SYM(__cfree_coro_trampoline) ", @function\n" + SYM(__cfree_coro_trampoline) ":\n" + " jalr s0\n" + " ebreak\n" + ".size " SYM(__cfree_coro_trampoline) ", .-" SYM(__cfree_coro_trampoline) "\n" + + ".section .note.GNU-stack,\"\",%progbits\n" +); diff --git a/lib/coro/x86_64.c b/lib/coro/x86_64.c @@ -0,0 +1,131 @@ +/* + * lib/coro/x86_64.c -- x86_64 System V ABI implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * Callee-saved set on SysV: rbx, rbp, r12-r15. (No callee-saved xmm + * regs -- those are MS-ABI specific; see x86_64_win.c.) 
+ * + * regs[0]: rbx regs[4]: r14 + * regs[1]: rbp regs[5]: r15 + * regs[2]: r12 regs[6]: rsp + * regs[3]: r13 regs[7]: rip + * + * sizeof = 64, 16-byte aligned. + * + * setjmp(env) %rdi=env + * longjmp(env, val) %rdi=env, %esi=val + * coro_switch(f, t, val) %rdi=from, %rsi=to, %rdx=val + * + * The "save rsp/rip" trick: at function entry, (%rsp) holds the + * caller's return address (just pushed by `call`); 8(%rsp) is the + * caller's pre-call rsp. Saving those two lets longjmp/coro_switch + * "land" at the call site exactly as if the function had returned. + */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_x86_64_ctx { + uintptr_t regs[8]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_x86_64_ctx) == 64, "layout"); +_Static_assert(_Alignof(struct __cfree_x86_64_ctx) == 16, "align"); +_Static_assert(sizeof(struct __cfree_x86_64_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_x86_64_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_x86_64_ctx *c = (struct __cfree_x86_64_ctx *)ctx; + + /* x86_64 stacks grow down; align top to 16. */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[1] = 0; /* rbp */ + c->regs[3] = (uintptr_t)entry; /* r13 -- entry fn */ + c->regs[6] = top; /* rsp */ + c->regs[7] = (uintptr_t)__cfree_coro_trampoline; /* rip */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved + (caller's) rsp + rip into [reg]; clobbers %rax. 
+ Used at function-entry stack discipline: (%rsp)=ret-addr, 8(%rsp)=pre-call rsp. */ +#define SAVE_INTO(reg) \ + " movq %rbx, 0(" reg ")\n" \ + " movq %rbp, 8(" reg ")\n" \ + " movq %r12, 16(" reg ")\n" \ + " movq %r13, 24(" reg ")\n" \ + " movq %r14, 32(" reg ")\n" \ + " movq %r15, 40(" reg ")\n" \ + " leaq 8(%rsp), %rax\n" \ + " movq %rax, 48(" reg ")\n" \ + " movq (%rsp), %rax\n" \ + " movq %rax, 56(" reg ")\n" + +/* Restore callee-saved + rsp from [reg], leave rip in %rcx ready to + jmp. Caller delivers the destination value in %rax beforehand. */ +#define RESTORE_FROM(reg) \ + " movq 0(" reg "), %rbx\n" \ + " movq 8(" reg "), %rbp\n" \ + " movq 16(" reg "), %r12\n" \ + " movq 24(" reg "), %r13\n" \ + " movq 32(" reg "), %r14\n" \ + " movq 40(" reg "), %r15\n" \ + " movq 48(" reg "), %rsp\n" \ + " movq 56(" reg "), %rcx\n" + +__asm__ ( + ".text\n" + ".p2align 4\n" + + /* setjmp(env) -- env=%rdi */ + ".globl " SYM(setjmp) "\n" + SYM(setjmp) ":\n" + SAVE_INTO("%rdi") + " xorl %eax, %eax\n" + " ret\n" + + /* longjmp(env, val) -- env=%rdi, val=%esi. + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). */ + ".globl " SYM(longjmp) "\n" + SYM(longjmp) ":\n" + " movslq %esi, %rax\n" /* sign-extend int → long */ + " testq %rax, %rax\n" + " movl $1, %edx\n" + " cmoveq %rdx, %rax\n" + RESTORE_FROM("%rdi") + " jmpq *%rcx\n" + + /* coro_switch(from, to, value) -- from=%rdi, to=%rsi, value=%rdx. */ + ".globl " SYM(coro_switch) "\n" + SYM(coro_switch) ":\n" + SAVE_INTO("%rdi") + " movq %rdx, %rax\n" /* deliver value as return reg */ + RESTORE_FROM("%rsi") + " jmpq *%rcx\n" + + /* __cfree_coro_trampoline -- on first entry: %rax=value, + %r13=entry, %rsp=stack_top (no return addr pushed -- coro_switch + reaches here via jmp). System V wants (%rsp + 8) ≡ 0 (mod 16) at + function entry, i.e. %rsp 16-byte aligned just before the call; + the andq below aligns %rsp defensively so that the callq's 8-byte + return-address push produces exactly that.
*/ + ".globl " SYM(__cfree_coro_trampoline) "\n" + SYM(__cfree_coro_trampoline) ":\n" + " andq $-16, %rsp\n" + " movq %rax, %rdi\n" /* value → first arg */ + " callq *%r13\n" /* entry(value) */ + " ud2\n" +); diff --git a/lib/coro/x86_64_win.c b/lib/coro/x86_64_win.c @@ -0,0 +1,176 @@ +/* + * lib/coro/x86_64_win.c -- x86_64 Windows (MS x64 ABI) implementations of + * setjmp / longjmp (<setjmp.h>) + * coro_init / coro_switch / trampoline (<stdcoro.h>) + * + * MS x64 callee-saved set: rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm15. + * (Compare with x86_64.c -- SysV doesn't preserve rdi/rsi or any xmm.) + * Windows additionally requires the TEB stack-bound slots gs:0x08 + * (StackBase) and gs:0x10 (StackLimit) to track the live stack so + * exception unwinding etc. behave; these are saved/restored on every + * switch. + * + * regs[0]: rbx regs[8]: rsp + * regs[1]: rbp regs[9]: rip + * regs[2]: rdi regs[10]: stack_base (TEB gs:0x08) + * regs[3]: rsi regs[11]: stack_limit (TEB gs:0x10) + * regs[4..7]: r12-r15 + * fp_regs[0..19]: xmm6-xmm15 (10 regs * 128b = 20 * 64b slots, off 96) + * + * sizeof = 256, 16-byte aligned. Exactly fills jmp_buf / coro_ctx. + * + * setjmp(env) %rcx=env + * longjmp(env, val) %rcx=env, %edx=val + * coro_switch(f, t, val) %rcx=from, %rdx=to, %r8=value + * + * The "save rsp/rip" trick mirrors x86_64.c: at function entry, + * (%rsp) holds the caller's return address, 8(%rsp) is the caller's + * pre-call rsp. 
+ */ + +#include <setjmp.h> +#include <stdcoro.h> +#include <stddef.h> +#include <stdint.h> + +struct __cfree_x86_64_win_ctx { + uintptr_t regs[12]; + uint64_t fp_regs[20]; +} __attribute__((aligned(16))); + +_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) == 256, "layout"); +_Static_assert(_Alignof(struct __cfree_x86_64_win_ctx) == 16, "align"); +_Static_assert(offsetof(struct __cfree_x86_64_win_ctx, fp_regs) == 96, "fp off"); +_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) <= sizeof(coro_ctx), "fits coro_ctx"); +_Static_assert(sizeof(struct __cfree_x86_64_win_ctx) <= sizeof(jmp_buf), "fits jmp_buf"); +_Static_assert(_Alignof(coro_ctx) >= _Alignof(struct __cfree_x86_64_win_ctx), "align coro_ctx"); + +extern void __cfree_coro_trampoline(void); + +void coro_init(coro_ctx *ctx, + void *stack_base, size_t stack_len, + coro_entry_fn entry) { + struct __cfree_x86_64_win_ctx *c = (struct __cfree_x86_64_win_ctx *)ctx; + + /* x86_64 stacks grow down; align top to 16. */ + uintptr_t top = (uintptr_t)stack_base + stack_len; + top &= ~(uintptr_t)(CORO_STACK_ALIGN - 1); + + for (size_t i = 0; i < sizeof(*c) / sizeof(uintptr_t); ++i) + ((uintptr_t *)c)[i] = 0; + + c->regs[1] = 0; /* rbp */ + c->regs[4] = (uintptr_t)entry; /* r12 -- entry fn */ + c->regs[8] = top; /* rsp */ + c->regs[9] = (uintptr_t)__cfree_coro_trampoline; /* rip */ + c->regs[10] = top; /* stack_base (TEB) */ + c->regs[11] = (uintptr_t)stack_base; /* stack_limit (TEB) */ +} + +#define STR_(x) #x +#define STR(x) STR_(x) +#define SYM(n) STR(__USER_LABEL_PREFIX__) #n + +/* Save callee-saved + (caller's) rsp + rip + TEB stack bounds + xmm6-15 + into [reg]; clobbers %rax. Used at function-entry stack discipline: + (%rsp)=ret-addr, 8(%rsp)=pre-call rsp. 
*/ +#define SAVE_INTO(reg) \ + " movq %rbx, 0(" reg ")\n" \ + " movq %rbp, 8(" reg ")\n" \ + " movq %rdi, 16(" reg ")\n" \ + " movq %rsi, 24(" reg ")\n" \ + " movq %r12, 32(" reg ")\n" \ + " movq %r13, 40(" reg ")\n" \ + " movq %r14, 48(" reg ")\n" \ + " movq %r15, 56(" reg ")\n" \ + " leaq 8(%rsp), %rax\n" \ + " movq %rax, 64(" reg ")\n" \ + " movq (%rsp), %rax\n" \ + " movq %rax, 72(" reg ")\n" \ + " movq %gs:0x08, %rax\n" \ + " movq %rax, 80(" reg ")\n" \ + " movq %gs:0x10, %rax\n" \ + " movq %rax, 88(" reg ")\n" \ + " movaps %xmm6, 96(" reg ")\n" \ + " movaps %xmm7, 112(" reg ")\n" \ + " movaps %xmm8, 128(" reg ")\n" \ + " movaps %xmm9, 144(" reg ")\n" \ + " movaps %xmm10, 160(" reg ")\n" \ + " movaps %xmm11, 176(" reg ")\n" \ + " movaps %xmm12, 192(" reg ")\n" \ + " movaps %xmm13, 208(" reg ")\n" \ + " movaps %xmm14, 224(" reg ")\n" \ + " movaps %xmm15, 240(" reg ")\n" + +/* Restore callee-saved + xmm + TEB bounds + rsp from [reg]; leaves rip + in %r10 ready to jmp. Caller delivers the destination value in %rax + beforehand, so %rax must not be touched here. 
*/ +#define RESTORE_FROM(reg) \ + " movaps 96(" reg "), %xmm6\n" \ + " movaps 112(" reg "), %xmm7\n" \ + " movaps 128(" reg "), %xmm8\n" \ + " movaps 144(" reg "), %xmm9\n" \ + " movaps 160(" reg "), %xmm10\n" \ + " movaps 176(" reg "), %xmm11\n" \ + " movaps 192(" reg "), %xmm12\n" \ + " movaps 208(" reg "), %xmm13\n" \ + " movaps 224(" reg "), %xmm14\n" \ + " movaps 240(" reg "), %xmm15\n" \ + " movq 0(" reg "), %rbx\n" \ + " movq 8(" reg "), %rbp\n" \ + " movq 16(" reg "), %rdi\n" \ + " movq 24(" reg "), %rsi\n" \ + " movq 32(" reg "), %r12\n" \ + " movq 40(" reg "), %r13\n" \ + " movq 48(" reg "), %r14\n" \ + " movq 56(" reg "), %r15\n" \ + " movq 80(" reg "), %r10\n" \ + " movq %r10, %gs:0x08\n" \ + " movq 88(" reg "), %r10\n" \ + " movq %r10, %gs:0x10\n" \ + " movq 64(" reg "), %rsp\n" \ + " movq 72(" reg "), %r10\n" + +__asm__ ( + ".text\n" + ".p2align 4\n" + + /* setjmp(env) -- env=%rcx */ + ".globl " SYM(setjmp) "\n" + SYM(setjmp) ":\n" + SAVE_INTO("%rcx") + " xorl %eax, %eax\n" + " ret\n" + + /* longjmp(env, val) -- env=%rcx, val=%edx. + longjmp(_, 0) must deliver 1 (C11 7.13.2.1p4). */ + ".globl " SYM(longjmp) "\n" + SYM(longjmp) ":\n" + " movslq %edx, %rax\n" /* sign-extend int -> long */ + " testq %rax, %rax\n" + " movl $1, %r11d\n" + " cmoveq %r11, %rax\n" + RESTORE_FROM("%rcx") + " jmpq *%r10\n" + + /* coro_switch(from, to, value) -- from=%rcx, to=%rdx, value=%r8. */ + ".globl " SYM(coro_switch) "\n" + SYM(coro_switch) ":\n" + SAVE_INTO("%rcx") + " movq %r8, %rax\n" /* deliver value as return reg */ + RESTORE_FROM("%rdx") + " jmpq *%r10\n" + + /* __cfree_coro_trampoline -- on first entry: %rax=value (delivered + by coro_switch), %r12=entry (set by coro_init), %rsp=stack_top + (no return addr pushed -- coro_switch reaches here via jmp). MS + x64 wants %rsp 16-byte aligned at call sites with 32 bytes of + shadow space reserved by the caller. 
*/ + ".globl " SYM(__cfree_coro_trampoline) "\n" + SYM(__cfree_coro_trampoline) ":\n" + " andq $-16, %rsp\n" /* defensive align */ + " subq $32, %rsp\n" /* MS x64 shadow space */ + " movq %rax, %rcx\n" /* value -> first arg */ + " callq *%r12\n" /* entry(value) */ + " ud2\n" +); diff --git a/test/smoke.c b/test/smoke.c @@ -34,6 +34,7 @@ #include <stdarg.h> #include <stdatomic.h> #include <stdbool.h> +#include <stdcoro.h> #include <stddef.h> #include <stdint.h> #include <stdnoreturn.h> @@ -135,7 +136,8 @@ static noreturn void cfree_trap(void) { for (;;) {} } /* setjmp: jmp_buf is an array type, setjmp is callable in the contexts permitted by C11 7.13.1.1p4, longjmp is _Noreturn. Compile-only -- smoke.c never links against a setjmp implementation. */ -_Static_assert(sizeof(jmp_buf) >= sizeof(void *) * 8, "jmp_buf room for regs"); +_Static_assert(sizeof(jmp_buf) >= 64, "jmp_buf room for regs"); +_Static_assert(_Alignof(jmp_buf) >= 16, "jmp_buf 16-byte aligned"); static jmp_buf cfree_jb; static int cfree_setjmp_compiles(int x) { if (setjmp(cfree_jb) != 0) return 1; /* allowed context */ @@ -143,6 +145,19 @@ static int cfree_setjmp_compiles(int x) { return 0; } +/* stdcoro: coro_ctx storage exists, the API surface compiles and + resolves; same compile-only caveat as setjmp. */ +_Static_assert(sizeof(coro_ctx) >= 64, "coro_ctx room for regs"); +_Static_assert(_Alignof(coro_ctx) >= 16, "coro_ctx 16-byte aligned"); +_Static_assert(CORO_STACK_ALIGN >= 8, "stack align reasonable"); +static coro_ctx cfree_co_a, cfree_co_b; +static _Alignas(16) unsigned char cfree_co_stack[4096]; +static void cfree_co_entry(uintptr_t v) { (void)v; for (;;) {} } +static uintptr_t cfree_coro_compiles(void) { + coro_init(&cfree_co_b, cfree_co_stack, sizeof(cfree_co_stack), cfree_co_entry); + return coro_switch(&cfree_co_a, &cfree_co_b, 0xC0FFEEu); +} + /* stdatomic: types, memory_order, lock-free macros, plus a runtime exercise of load, store, exchange, CAS, fetch ops, and atomic_flag. 
*/ _Static_assert(sizeof(atomic_int) == sizeof(int), "atomic_int matches int"); @@ -185,5 +200,6 @@ int cfree_smoke_ok(void) { (void)aligned_buf; if (0) cfree_trap(); if (0) (void)cfree_setjmp_compiles(0); + if (0) (void)cfree_coro_compiles(); return sum_n(3, 1, 2, 3) == 6 && cfree_atomic_ok(); }