commit edbce4510998845c7ed2641373981a3167d0e7bd
parent aa82a0f76746048f6891d6f4395c28849d5f410b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 5 May 2026 10:28:06 -0700
seed-kernel: atomic sys_spawn replaces clone+execve; boot5 on seed
The previous clone+execve design paid one 768 MB mem_cpy per fork to
seed the child's pool, needed only because user code ran a few
interpreter cells between clone and execve which would otherwise mutate
parent BSS heap globals (heap_next, current_heap_next_ptr,
scratch_next). sys_spawn (private syscall 1024) folds clone+execve into
one kernel transaction with no userspace gap, dropping the copy.
Same scheme1 binary still runs on Linux: prelude probes
(sys-spawn "" '()) once at init and binds (spawn …) to the classic
clone+execve sequence when the syscall returns -ENOSYS, sys-spawn when
it returns -ENOENT. Both primitives stay registered; sys-clone /
sys-execve are the Linux fallback path.
boot5 now runs under DRIVER=seed: ~1300 (run "tcc" …) calls drive
musl-1.2.5 + hello to byte-identity vs podman (libc.a 3.0 MB, crt1.o,
crti.o, crtn.o, hello). New scripts/boot5-gen-runscm.sh emits run.scm
from boot5.sh's host-side enumeration; lib-seed-runscm.sh gains
seed_runscm_input_tree for staging the musl tree at the same
/tmp/musl-1.2.5/... paths podman uses (so STT_FILE strings match
without a new strip-prefix patch).
Kernel adjustments boot5 forced: MAX_FILES 64 → 4096 and path[64] →
path[96] for the ~3900-entry cpio; MAX_ARGV 64 → 2048 (with argv pool
moved to BSS) for the ~1267-arg `tcc -ar rcs libc.a obj1…obj1263`
peak; path normalization in find_file/new_file so tcc's pstrcat-style
include resolution (e.g. /tmp/.../src/include/../../include/features.h)
finds the file; loud WARN when parse_cpio drops entries (silent drops
otherwise masquerade as random tcc include-not-found errors).
Verified: forktest, seed-accept (boot0/1/2), seed-accept-boot34
(boot3 tcc0, boot4 tcc3+libc.a+libtcc1.a+hello, fixed-point preserved),
seed-accept-boot5 (boot5 libc.a+crts+hello, all byte-identical).
Diffstat:
13 files changed, 691 insertions(+), 240 deletions(-)
diff --git a/P1/P1-aarch64.M1pp b/P1/P1-aarch64.M1pp
@@ -629,6 +629,9 @@
%macro p1_sys_execve()
221
%endm
+%macro p1_sys_spawn()
+1024
+%endm
%macro p1_sys_waitid()
95
%endm
diff --git a/P1/P1-amd64.M1pp b/P1/P1-amd64.M1pp
@@ -936,6 +936,9 @@ $(imm)
%macro p1_sys_execve()
59
%endm
+%macro p1_sys_spawn()
+1024
+%endm
%macro p1_sys_waitid()
247
%endm
diff --git a/P1/P1-riscv64.M1pp b/P1/P1-riscv64.M1pp
@@ -637,6 +637,9 @@ $(imm)
%macro p1_sys_execve()
221
%endm
+%macro p1_sys_spawn()
+1024
+%endm
%macro p1_sys_waitid()
95
%endm
diff --git a/P1/P1.M1pp b/P1/P1.M1pp
@@ -240,6 +240,10 @@ target
%p1_sys_execve
%endm
+%macro sys_spawn()
+%p1_sys_spawn
+%endm
+
%macro sys_waitid()
%p1_sys_waitid
%endm
diff --git a/docs/OS-TODO.md b/docs/OS-TODO.md
@@ -3,16 +3,18 @@
Audit of [`seed-kernel/`](../seed-kernel/) against the contract in
[`OS.md`](OS.md). All eleven items are now resolved — the seed kernel
boots, parses the DTB, unpacks an initramfs into an in-memory tmpfs,
-loads `/init` as a static aarch64 ELF, dispatches the eight Tier-1 +
-three Tier-2 syscalls, and supports both the host-side verification
-gates `scripts/tier1-gate.sh` and `scripts/tier2-gate.sh`. Verified
-against `boot0/catm`, `boot1/M1pp`, and `boot3/tcc0`; the canonical
-Tier-2 case (scheme1 driver spawns tcc0 to compile a `.c` into a
-relocatable ELF object) round-trips end-to-end. The bootN scripts
-themselves now run on seed-kernel via `DRIVER=seed scripts/bootN.sh
-aarch64` for N∈{0,1,2}, producing byte-identical outputs to the
-podman path; `scripts/seed-accept.sh` exercises the boot2-built
-scheme1 spawning the boot2-built catm via the .scm prelude.
+loads `/init` as a static aarch64 ELF, dispatches the eight Tier-1
+syscalls plus a single atomic `sys_spawn` (a private syscall replacing
+the original POSIX-style `clone`+`execve` pair) and `sys_waitid`, and
+supports both the host-side verification gates `scripts/tier1-gate.sh`
+and `scripts/tier2-gate.sh`. Verified against `boot0/catm`,
+`boot1/M1pp`, and `boot3/tcc0`; the canonical Tier-2 case (scheme1
+driver spawns tcc0 to compile a `.c` into a relocatable ELF object)
+round-trips end-to-end. The bootN scripts themselves now run on
+seed-kernel via `DRIVER=seed scripts/bootN.sh aarch64` for
+N∈{0,1,2,3,4,5}, producing byte-identical outputs to the podman path;
+`scripts/seed-accept.sh` exercises the boot2-built scheme1 spawning
+the boot2-built catm via the .scm prelude.
## Tier 1
@@ -50,24 +52,24 @@ scheme1 spawning the boot2-built catm via the .scm prelude.
## Tier 2
-6. **`clone` / `execve` / `waitid`.** ✅ Pseudo-fork via a
- `proc_stack[]` of saved frames plus a two-pool user RAM layout
- (`USER_POOL_A_PA` / `USER_POOL_B_PA`, 768 MB each). `sys_clone`
- eager-copies the parent pool into the alternate pool, swaps the
- `l2_user[]` entries to point at the alternate pool + `tlbi vmalle1`,
- then returns 0 to the current context (the "child"); the parent's
- pool stays pristine. `do_execve` captures path/argv into a kernel
- buffer before clobbering user memory, loads the new ELF into the
- (already-swapped) child pool, resets brk above its end-of-bss, and
- rewrites the trap frame so `eret` lands at the new entry point with
- a fresh user stack. `sys_waitid` populates the siginfo at offsets 8
- (CLD_EXITED) and 24 (status) per `scheme1/prelude.scm:497-506`. On
- `sys_exit_or_resume_parent` the kernel swaps `l2_user[]` back to
- the parent pool (no copy — pool was untouched), restores
- regs/brk/fds, runs `ic iallu` (the user VAs now resolve to a
- different physical pool, so any I-cache lines tagged for the
- exiting child's PA are stale), and returns to the parent's
- `clone()` site with `x0 = child_pid`.
+6. **Atomic `spawn` (replaces `clone` + `execve`).** ✅ `sys_spawn`
+ (private syscall 1024) folds the prelude's clone-then-immediate-
+ execve sequence into a single kernel transaction. The kernel
+ captures path/argv from the parent's pool into a kernel buffer,
+ pushes parent state onto `proc_stack[]`, swaps `l2_user[]` to the
+ alternate pool with **no memory copy** (the previous design paid one
+ 768 MB `mem_cpy` per fork to seed the child's pool — needed only
+ because user code ran a few interpreter cells between clone and
+ execve, which would otherwise mutate parent BSS heap globals;
+ folding the syscall closes that window entirely), `load_elf`s the
+ new image into the alternate pool, resets brk above the new
+ end-of-bss, builds a fresh user stack, and rewrites the trap frame
+ so `eret` enters the child at the new entry point. `sys_waitid`
+ populates siginfo at offsets 8 (CLD_EXITED) and 24 (status) per
+ `scheme1/prelude.scm:497-506`. On `sys_exit_or_resume_parent` the
+ kernel swaps `l2_user[]` back to the parent pool (still pristine),
+ restores regs/brk/fds, runs `ic iallu`, and returns to the parent's
+ `spawn()` site with `x0 = child_pid`.
7. **Per-process state on a stack.** ✅ `proc_save` records regs +
ELR + SPSR + sp_el0 + brk_base + brk_cur + fd table + which user
@@ -75,10 +77,15 @@ scheme1 spawning the boot2-built catm via the .scm prelude.
scheme1 prelude only forks one level deep before waiting; one save
frame plus two pools is all that's needed.
-8. **`execve` accepts NULL/empty envp.** ✅ `do_execve` ignores its
- `envp` argument; the prelude wrapper passes no envp at all and
- the value in `x2` at the SVC site is whatever happens to be
- there.
+8. **scheme1 prelude probes once, dispatches per environment.** ✅
+ The same scheme1 binary runs on both Linux (boot{3,4,5} podman
+ path) and the seed kernel. `prelude.scm` calls `(sys-spawn "" '())`
+ once at init: on Linux that fails with `-ENOSYS` (errno 38) and the
+ prelude binds `(spawn …)` to the classic clone+execve sequence; on
+ seed it fails with `-ENOENT` (errno 2: the kernel finds no such
+ file) and the prelude binds `(spawn …)` to `sys-spawn` directly.
+ Any probe result other than `-ENOSYS` counts as "sys-spawn is
+ available". Both `sys-clone` and
+ `sys-execve` primitives remain in the scheme1 binary as the Linux
+ fallback path.
## Verification harness
@@ -144,24 +151,26 @@ scheme1 spawning the boot2-built catm via the .scm prelude.
path into the .o relocations. tcc3 and hello are unaffected because
the linker drops those strings in the final executable.
-- **Pool-swap on execve — landed (eager-copy variant).** Items 6/7
- above describe the implementation. `sys_clone` does one 768 MB
- `mem_cpy` (parent pool → alternate pool) plus an `l2_user[]` swap;
- child exit is a swap-only no-copy. Net per-fork cost: 0.768 GB of
- memcpy (was 1.5 GB) plus a TLBI. `mem_cpy` also gained an 8-byte
- fast path (~8× under TCG) which dominates the remaining cost. The
- ideal "deferred swap until execve" — letting the child run prelude
- bytecode in the parent's pool, costing zero copies — does not work
- here: scheme1's interpreter keeps heap-allocator state in user BSS
- globals (`heap_next`, `current_heap_next_ptr`, `scratch_next`), so
- the child's allocations during the prelude window mutate the
- parent's view. The eager-copy variant is the smallest change that
- keeps the prelude working unmodified.
+- **Atomic spawn — landed (zero copy on fork).** Item 6 above describes
+ the kernel side. The previous design had `sys_clone` eager-copy 768 MB
+ parent→alternate-pool per fork; the only reason for the copy was the
+ scheme1 prelude executing a few interpreter cells of user code in the
+ child between clone and execve, which would have mutated parent BSS
+ globals (`heap_next`, `current_heap_next_ptr`, `scratch_next`) if the
+ child shared the parent's pool. `sys_spawn` folds clone+execve into
+ one syscall, the child runs zero user code in the parent's address
+ space, and the eager copy is gone. Same scheme1 binary still runs on
+ Linux (boot{3,4,5} podman path) by probing `(sys-spawn "" '())` once
+ at prelude init and binding `(spawn …)` to clone+execve when the probe
+ returns -ENOSYS=38. boot4 acceptance still hits its `tcc2 == tcc3`
+ fixed point under DRIVER=seed; per-spawn wall time on the boot4
+ fixture dropped from ~5 s to well under 1 s.
- **HVF acceleration.** All seed-driver qemu invocations use
`-machine virt,gic-version=3,accel=hvf -cpu host` on macOS hosts.
- tier2-gate ≈ 22 s; seed-accept (boot0/1/2) ≈ 2 s; boot3 acceptance
- ≈ 5 min wall (was multi-hour under TCG).
+ tier2-gate ≈ 22 s; seed-accept (boot0/1/2) ≈ 2 s; boot3 + boot4
+ acceptance combined ≈ 5 min wall (boot3 alone was 5 min before
+ sys_spawn; multi-hour under TCG without HVF).
- **STT_FILE prefix strip — landed.** tcc emitted the unmodified
argv path into each `.o`'s `STT_FILE` symbol, so podman-mounted
@@ -175,26 +184,35 @@ scheme1 spawning the boot2-built catm via the .scm prelude.
`seed-accept-boot34.sh` checks `tcc3`, `hello`, `crt1.o`, `libc.a`,
and `libtcc1.a` for byte-identity vs the podman path; all pass.
-## Open
+- **Port boot5 to the seed driver — landed.** With the per-spawn copy
+ gone (sys_spawn), the naive 1300-spawn straight port works without
+ needing tcc batch mode or in-kernel prelude caching. boot5.sh's
+ `DRIVER=seed` branch wires
+ [`scripts/boot5-gen-runscm.sh`](../scripts/boot5-gen-runscm.sh) to
+ emit one `(run "tcc" …)` per source plus the CRT/ar/link tail; the
+ full musl tree is staged in cpio at `/tmp/musl-1.2.5/...` (matching
+ podman's tmpfs layout, so STT_FILE strings are byte-identical).
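+ Required kernel adjustments: `MAX_FILES` 64 → 4096 (the cpio carries
+ ~2600 inputs plus ~1300 .o outputs), `path[64]` → `path[96]` (musl
+ paths reach ~50 chars under the `/tmp/musl-1.2.5/obj/...` prefix),
+ `MAX_ARGV` 64 → 2048 with the argv pool moved to BSS (the
+ `tcc -ar rcs libc.a …` archive step peaks at ~1267 args), path
+ normalization in `find_file`/`new_file` (so tcc's concatenated
+ include paths such as `…/src/include/../../include/features.h`
+ resolve), and a loud warning when `parse_cpio` drops files (silent
+ drops on MAX_FILES exhaustion otherwise masquerade as random
+ "include not found" tcc errors mid-build). New extension to
+ lib-seed-runscm.sh: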
+ `seed_runscm_input_tree` stages a directory subtree into the cpio
+ preserving relative paths.
+ [`scripts/seed-accept-boot5.sh`](../scripts/seed-accept-boot5.sh)
+ asserts byte-identity vs the podman path for libc.a, crt1.o, crti.o,
+ crtn.o, hello.
-- **Port boot5 to the seed driver.** boot5 compiles ~500 musl TUs, each
- one a `(run "tcc" …)`. Even with HVF and the pool-swap fix, the
- per-clone fixed cost (TLB flush, ELF reload, scheme1 start-up)
- compounds to a long wall time. `scripts/boot5.sh` rejects
- `DRIVER=seed` today. Two natural unblockers, either of which would
- make boot5 tractable on its own:
- - **Cache the parsed prelude in the kernel** so each spawn doesn't
- re-tokenise + re-build the AST for the 24 KB prelude.scm. The
- parser output is per-process today; lift it into kernel state
- keyed by the prelude's content hash, hand the child a fresh
- pointer-to-AST at execve time.
- - **tcc batch mode**: a single `(run "tcc" "-c" src1 src2 …)` that
- emits one .o per TU, so one clone covers many translation units.
- Upstream tcc already accepts multiple inputs in one invocation;
- the boot4-gen-runscm path just doesn't use it. Likely the
- cheaper of the two and worth trying first.
+## Open
- **NULL-page hardening.** Slot 0 is unmapped so a NULL deref faults to
the kernel as a user sync; the kernel currently panics rather than
delivering a SIGSEGV-equivalent. Acceptable per OS.md (default-action
termination is sufficient) but a minor polish opportunity.
+
+- **Cache parsed prelude in kernel (optional optimization).** Each
+ spawn re-parses the 24 KB `prelude.scm` from scratch. Hashing it
+ once and reusing the AST across spawns would shave a fraction of
+ per-spawn overhead. Not load-bearing now that sys_spawn removed the
+ big copy; would matter again if a future driver crosses ~10k spawns.
diff --git a/scheme1/prelude.scm b/scheme1/prelude.scm
@@ -523,14 +523,36 @@
(define (argv) (sys-argv))
(define (command-line) (sys-argv))
-(define (spawn prog . args)
- (let ((r (sys-clone)))
+;; scheme1 supports two process-creation paths:
+;; - sys-spawn: one atomic syscall (no userspace gap between fork and
+;; exec). Provided by the seed kernel; absent on Linux, where the
+;; syscall number returns -ENOSYS.
+;; - sys-clone + sys-execve: classic POSIX fork+exec. Provided by Linux;
+;; not implemented by the seed kernel.
+;; Probe once at prelude-init time. The probe call uses an empty path,
+;; so on the seed kernel it returns (#f . -ENOENT) (the kernel runs its
+;; path/argv checks before any side effect); on Linux it returns
+;; (#f . -ENOSYS). We treat anything other than -ENOSYS as "available".
+(define %has-sys-spawn?
+ (let ((r (sys-spawn "" '())))
(cond
- ((not (car r)) r)
- ((zero? (cdr r))
- (sys-execve prog (cons prog args))
- (sys-exit 127))
- (else r))))
+ ((car r) #t)
+ (else
+ (let ((errno (- 0 (cdr r))))
+ (not (= errno 38)))))))
+
+;; (spawn prog arg ...) starts PROG with argv = (prog arg ...) and
+;; returns the syscall result pair.
+;;   - When %has-sys-spawn? is true, the result of sys-spawn is returned
+;;     directly.
+;;   - Otherwise sys-clone is used: a failed clone (car is #f) is
+;;     returned as-is; in the child (cdr is 0) we sys-execve, and exit
+;;     with status 127 if sys-execve returns; in the parent the clone
+;;     result is returned unchanged (presumably (#t . child-pid)).
+(define (spawn prog . args)
+ (cond
+ (%has-sys-spawn?
+ (sys-spawn prog (cons prog args)))
+ (else
+ (let ((r (sys-clone)))
+ (cond
+ ((not (car r)) r)
+ ((zero? (cdr r))
+ (sys-execve prog (cons prog args))
+ (sys-exit 127))
+ (else r))))))
(define (run prog . args)
(let ((r (apply spawn prog args)))
diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp
@@ -5554,6 +5554,20 @@
%syscall
%ret
+# sys_spawn(path=a0, argv=a1) -> r (a0). Leaf. Atomic clone+execve,
+# single syscall: kernel saves parent state, swaps user pool with no
+# copy, loads the ELF, builds the user stack, and erets into the child.
+# The parent's spawn() returns child_pid only after the child
+# exit_groups. Provided by the seed kernel (private syscall 1024). On
+# Linux this number is unmapped so the kernel returns -ENOSYS, which the
+# prelude uses to detect environment and fall back to
+# sys_clone+sys_execve.
+#
+# The body shifts both arguments up one register so a0 can carry the
+# syscall number: argv is copied a1 -> a2 first, because the following
+# path move a0 -> a1 overwrites a1.
+:sys_spawn
+ %mov(a2, a1)
+ %mov(a1, a0)
+ %li(a0, %p1_sys_spawn)
+ %syscall
+ %ret
+
# sys_waitid(idtype=a0, id=a1, infop=a2, options=a3) -> r (a0). Leaf.
:sys_waitid
%mov(t0, a3)
@@ -5656,7 +5670,9 @@
%tail(&wrap_syscall_result)
})
-# (sys-clone)
+# (sys-clone). Linux POSIX-style fork; only used as a fallback path on
+# Linux since the seed kernel doesn't implement clone (it offers
+# sys-spawn instead).
%fn(prim_sys_clone_entry, 0, {
%call(&sys_clone)
%tail(&wrap_syscall_result)
@@ -5676,6 +5692,22 @@
%tail(&wrap_syscall_result)
})
+# (sys-spawn path argv-list). Same calling convention as sys-execve, but
+# wraps the seed kernel's atomic spawn syscall: returns (#t . child-pid)
+# after the child has called exit_group (the kernel suspends the parent
+# for the lifetime of the child), or (#f . -errno) on failure (notably
+# -ENOSYS=38 on Linux, which the prelude probes for at init time).
+#
+# The path bytevector is parked in the `path` local across the call to
+# build_execve_argv, whose a0 result becomes the argv argument (a1); the
+# path is then reloaded and replaced by its %BV.data pointer before
+# calling sys_spawn. The raw result goes through wrap_syscall_result.
+%fn2(prim_sys_spawn_entry, {path pad}, {
+ %args2(t0, a0, a0) ; t0 = path bv, a0 = argv-list
+ %stl(t0, path)
+ %call(&build_execve_argv)
+ %mov(a1, a0)
+ %ldl(a0, path)
+ %heap_ld(a0, a0, %BV.data) ; path data ptr
+ %call(&sys_spawn)
+ %tail(&wrap_syscall_result)
+})
+
# (sys-waitid idtype id infop options)
%fn(prim_sys_waitid_entry, 0, {
%args4(t0, t1, t2, a3, a0)
@@ -6199,6 +6231,7 @@
:name_sys_openat %cstr8("sys-openat")
:name_sys_clone %cstr8("sys-clone")
:name_sys_execve %cstr8("sys-execve")
+:name_sys_spawn %cstr8("sys-spawn")
:name_sys_waitid %cstr8("sys-waitid")
:name_sys_argv %cstr8("sys-argv")
:name_eof %cstr8("eof")
@@ -6302,6 +6335,7 @@
&name_sys_openat %(0) $(10) &prim_sys_openat_entry %(0)
&name_sys_clone %(0) $(9) &prim_sys_clone_entry %(0)
&name_sys_execve %(0) $(10) &prim_sys_execve_entry %(0)
+&name_sys_spawn %(0) $(9) &prim_sys_spawn_entry %(0)
&name_sys_waitid %(0) $(10) &prim_sys_waitid_entry %(0)
&name_sys_argv %(0) $(8) &prim_sys_argv_entry %(0)
&name_eofq %(0) $(4) &prim_eofq_entry %(0)
diff --git a/scripts/boot5-gen-runscm.sh b/scripts/boot5-gen-runscm.sh
@@ -0,0 +1,137 @@
+#!/bin/sh
+## boot5-gen-runscm.sh — emit run.scm driving boot5's musl + hello build
+## inside the seed kernel. Mirrors scripts/boot5.sh's podman-path script
+## generation step-for-step: per-source `tcc -c`, per-arch CRT, archive,
+## link hello. Source enumeration done by boot5.sh; this script consumes
+## the resulting build-srcs.txt and emits one `(run "tcc" …)` form per TU.
+##
+## Usage:
+## boot5-gen-runscm.sh <musl-arch> <stage-host-dir> <out.scm>
+##
+## stage-host-dir is the boot5 _host/ directory containing:
+## build-srcs.txt one path per line, relative to musl-1.2.5/
+## crt-mode "asm" or "c" — picked by boot5.sh from $MUSL_DIR
+##
+## Conventions (in seed tmpfs):
+## musl tree /tmp/musl-1.2.5/<rel-path> (staged in cpio)
+## pre-gen hdrs /tmp/musl-1.2.5/obj/include/bits/{alltypes,syscall}.h,
+## /tmp/musl-1.2.5/obj/src/internal/version.h
+## .o outputs /tmp/musl-1.2.5/obj/<src-with-.o>
+## tcc binary /tcc (basename in cpio)
+## libtcc1.a /libtcc1.a
+## stdarg bridge /tcc-stdarg-bridge.h
+## hello.c /hello.c
+## exports /libc.a, /crt1.o, /crti.o, /crtn.o, /hello (flat at
+## root so seed_runscm_export can pull them by basename)
+
+set -eu
+[ "$#" -eq 3 ] || { echo "usage: $0 <musl-arch> <stage-host-dir> <out.scm>" >&2; exit 2; }
+
+MUSL_ARCH=$1; STAGE_HOST=$2; OUT=$3
+SRCS=$STAGE_HOST/build-srcs.txt
+CRT_MODE=$(cat "$STAGE_HOST/crt-mode")
+[ -e "$SRCS" ] || { echo "missing $SRCS" >&2; exit 1; }
+
+CWORK=/tmp/musl-1.2.5
+
+# Mirrors boot5.sh's CFLAGS_BASE exactly; the only difference is that
+# every per-arg token is quoted as its own scheme bytevector. The leading
+# "tcc" is the spawned binary; everything after is its argv.
+CFLAGS_BASE_QUOTED='"-std=c99" "-nostdinc" "-ffreestanding" "-fno-strict-aliasing" "-D_XOPEN_SOURCE=700"'
+CFLAGS_BASE_QUOTED="$CFLAGS_BASE_QUOTED \"-I$CWORK/arch/$MUSL_ARCH\" \"-I$CWORK/arch/generic\" \"-I$CWORK/obj/src/internal\" \"-I$CWORK/src/include\" \"-I$CWORK/src/internal\" \"-I$CWORK/obj/include\" \"-I$CWORK/include\""
+CFLAGS_BASE_QUOTED="$CFLAGS_BASE_QUOTED \"-O2\" \"-fomit-frame-pointer\" \"-Werror=implicit-function-declaration\" \"-Werror=implicit-int\" \"-Werror=pointer-sign\" \"-Werror=pointer-arith\""
+CFLAGS_C_QUOTED="$CFLAGS_BASE_QUOTED \"-include\" \"/tcc-stdarg-bridge.h\""
+CFLAGS_ASM_QUOTED="$CFLAGS_BASE_QUOTED"
+CRTFLAGS_C_QUOTED="$CFLAGS_C_QUOTED \"-fno-stack-protector\" \"-DCRT\""
+CRTFLAGS_ASM_QUOTED="$CFLAGS_ASM_QUOTED \"-fno-stack-protector\" \"-DCRT\""
+
+{
+# Fixed header of the generated run.scm: a banner comment, the `must`
+# helper, and the stage A progress line. The heredoc delimiter is quoted,
+# so nothing in it is expanded by the shell.
+cat <<'PROLOGUE'
+;; boot5 run.scm — drive musl-1.2.5 (~1300 TUs) + hello inside seed kernel.
+;; Generated by scripts/boot5-gen-runscm.sh; mirrors scripts/boot5.sh's
+;; podman path stage-for-stage. The musl source tree is staged in cpio at
+;; /tmp/musl-1.2.5/...; per-source .o outputs go to /tmp/musl-1.2.5/obj/...;
+;; final artefacts (libc.a, crt1.o, crti.o, crtn.o, hello) land at flat
+;; root paths so the seed-runscm harness can pull them by basename.
+
+;; (must r tag): return r when (car r) is true and (cdr r) is 0;
+;; otherwise report TAG on stderr and exit 1.
+(define (must r tag)
+  (if (and (car r) (= 0 (cdr r)))
+      r
+      (begin
+        (write-string stderr "boot5: step failed: ")
+        (write-string stderr tag)
+        (write-string stderr "\n")
+        (exit 1))))
+
+(write-string stdout "boot5: stage A (compile sources)\n")
+PROLOGUE
+
+# Stage A: emit one `(must (run "tcc" ...))` form per build-srcs.txt line.
+# Each line is a source path relative to musl-1.2.5/. The object path is
+# the same path under obj/ with its last extension replaced by .o.
+# Sources ending in .s/.S get the assembler flags; all others get the C
+# flags.
+awk -v c_flags="$CFLAGS_C_QUOTED" -v asm_flags="$CFLAGS_ASM_QUOTED" -v root="$CWORK" '
+{
+    tu = $0
+    object = "obj/" tu
+    sub(/\.[^.]*$/, ".o", object)
+    opts = (tu ~ /\.[sS]$/) ? asm_flags : c_flags
+    printf "(must (run \"tcc\" %s \"-c\" \"%s/%s\" \"-o\" \"%s/%s\") \"%s\")\n", \
+        opts, root, tu, root, object, tu
+}' "$SRCS"
+
+cat <<EOF
+
+(write-string stdout "boot5: stage B (CRT)\n")
+;; Position-independent + non-PIC CRT helpers. -fPIC objects are needed
+;; for shared-binding tools, even though our hello is fully static.
+(must (run "tcc" $CRTFLAGS_C_QUOTED "-fPIC" "-c" "$CWORK/crt/Scrt1.c" "-o" "$CWORK/obj/crt/Scrt1.o") "Scrt1.o")
+(must (run "tcc" $CRTFLAGS_C_QUOTED "-c" "$CWORK/crt/crt1.c" "-o" "$CWORK/obj/crt/crt1.o") "crt1.o")
+(must (run "tcc" $CRTFLAGS_C_QUOTED "-fPIC" "-c" "$CWORK/crt/rcrt1.c" "-o" "$CWORK/obj/crt/rcrt1.o") "rcrt1.o")
+EOF
+
+if [ "$CRT_MODE" = asm ]; then
+ cat <<EOF
+(must (run "tcc" $CRTFLAGS_ASM_QUOTED "-c" "$CWORK/crt/$MUSL_ARCH/crti.s" "-o" "$CWORK/obj/crt/crti.o") "crti.o")
+(must (run "tcc" $CRTFLAGS_ASM_QUOTED "-c" "$CWORK/crt/$MUSL_ARCH/crtn.s" "-o" "$CWORK/obj/crt/crtn.o") "crtn.o")
+EOF
+else
+ cat <<EOF
+(must (run "tcc" $CRTFLAGS_C_QUOTED "-c" "$CWORK/crt/crti.c" "-o" "$CWORK/obj/crt/crti.o") "crti.o")
+(must (run "tcc" $CRTFLAGS_C_QUOTED "-c" "$CWORK/crt/crtn.c" "-o" "$CWORK/obj/crt/crtn.o") "crtn.o")
+EOF
+fi
+
+# Stage C: archive libc.a. tcc -ar accepts many obj args; assemble the
+# full list inline, one object per build-srcs.txt line, with object
+# names derived the same way as in stage A. The list is enormous
+# (~1300 paths × ~40 chars ≈ 50 KB on a single line) but the prelude
+# reader handles it fine.
+{
+ printf '\n(write-string stdout "boot5: stage C (libc.a)\\n")\n'
+ printf '(must (run "tcc" "-ar" "rcs" "/libc.a"'
+ awk -v CWORK="$CWORK" '{
+ obj = "obj/" $0
+ sub(/\.[^.]*$/, ".o", obj)
+ printf " \"%s/%s\"", CWORK, obj
+ }' "$SRCS"
+ printf ') "libc.a")\n'
+}
+
+cat <<EOF
+
+;; Publish CRT objects at flat root paths so seed_runscm_export can pull them.
+(must (run "catm" "/crt1.o" "$CWORK/obj/crt/crt1.o") "crt1.o publish")
+(must (run "catm" "/crti.o" "$CWORK/obj/crt/crti.o") "crti.o publish")
+(must (run "catm" "/crtn.o" "$CWORK/obj/crt/crtn.o") "crtn.o publish")
+
+(write-string stdout "boot5: stage D (link hello)\n")
+;; Mirrors boot5.sh's link line, with seed-tmpfs absolute paths in place
+;; of /work/in and /work/out. -L paths pick up libc.a + libtcc1.a from
+;; the flat root of the tmpfs.
+(must (run "tcc" "-static" "-nostdinc" "-nostdlib" "-include" "/tcc-stdarg-bridge.h"
+ "-I$CWORK/include" "-I$CWORK/arch/$MUSL_ARCH" "-I$CWORK/arch/generic" "-I$CWORK/obj/include"
+ "/crt1.o" "/hello.c" "-L/" "-lc" "-L/" "-ltcc1" "-L/" "-lc" "-o" "/hello") "link hello")
+
+(write-string stdout "boot5: ALL-OK\n")
+(exit 0)
+EOF
+} > "$OUT"
diff --git a/scripts/boot5.sh b/scripts/boot5.sh
@@ -38,10 +38,9 @@
## Usage: scripts/boot5.sh <arch>
## <arch> ∈ {amd64, aarch64, riscv64} for DRIVER=podman (default).
## All three architectures are verified end-to-end on podman.
-## DRIVER=seed: not yet supported — boot5 compiles ~500 musl TUs, each
-## a (run "tcc" …) inside the VM. Even with the kernel's pool-swap on
-## execve, that's ~500 clone+execve+exit cycles end-to-end under TCG
-## (≥several hours). Tracked in docs/OS-TODO.md.
+## DRIVER=seed: aarch64 only. Drives ~1300 (run "tcc" …) calls through
+## the seed kernel's atomic spawn syscall (no per-fork memcpy). Wall
+## time is dominated by tcc work, not spawn overhead.
set -eu
@@ -60,11 +59,9 @@ ROOT=$(cd "$(dirname "$0")/.." && pwd)
cd "$ROOT"
DRIVER=${DRIVER:-podman}
-[ "$DRIVER" = seed ] && {
- echo "[boot5] DRIVER=seed is not yet supported (~500 TUs ⇒ many hours under TCG);" >&2
- echo " see docs/OS-TODO.md 'Things still worth doing'." >&2
- exit 2
-}
+if [ "$DRIVER" = seed ]; then
+ [ "$ARCH" = aarch64 ] || { echo "[boot5] DRIVER=seed: aarch64 only" >&2; exit 2; }
+fi
IMAGE=boot2-scratch:$ARCH
BOOT4=build/$ARCH/boot4
@@ -87,11 +84,20 @@ BRIDGE_FILE=build/tcc/stdarg-bridge.h
[ -e "$MUSL_SKIP" ] || { echo "[boot5 $ARCH] missing $MUSL_SKIP (run scripts/boot5-calibrate.sh $ARCH)" >&2; exit 1; }
[ -e "$BRIDGE_FILE" ] || { echo "[boot5 $ARCH] missing $BRIDGE_FILE (run scripts/stage1-flatten.sh)" >&2; exit 1; }
-if ! podman image exists "$IMAGE"; then
+if [ "$DRIVER" = podman ] && ! podman image exists "$IMAGE"; then
echo "[boot5 $ARCH] building $IMAGE"
podman build --platform "$PLATFORM" -t "$IMAGE" \
-f scripts/Containerfile.scratch scripts/
fi
+if [ "$DRIVER" = seed ]; then
+ KERNEL_IMAGE=$ROOT/seed-kernel/build/Image
+ EXTRACT=$ROOT/seed-kernel/scripts/extract-dump.sh
+ BOOT2=build/$ARCH/boot2
+ [ -f "$KERNEL_IMAGE" ] || { echo "[boot5] missing $KERNEL_IMAGE — make in seed-kernel/" >&2; exit 1; }
+ [ -x "$BOOT2/scheme1" ] || { echo "[boot5] missing $BOOT2/scheme1 (run boot2)" >&2; exit 1; }
+ [ -x "$BOOT2/catm" ] || { echo "[boot5] missing $BOOT2/catm (run boot2)" >&2; exit 1; }
+ export KERNEL_IMAGE EXTRACT
+fi
# ── stage inputs ──────────────────────────────────────────────────────
# $STAGE/in/ — exactly what the container reads (bind-mounted /work/in)
@@ -208,6 +214,16 @@ n_src=$(wc -l < "$STAGE/_host/build-srcs.txt")
n_skip=$(wc -l < "$MUSL_SKIP")
echo "[boot5 $ARCH] keep=$n_src skip=$n_skip (calibrated)"
+# Record CRT mode (asm vs c) so the seed gen-runscm step can read it
+# without re-checking $MUSL_DIR. Same test as the podman branch below.
+if [ -f "$MUSL_DIR/crt/$MUSL_ARCH/crti.s" ]; then
+ echo asm > "$STAGE/_host/crt-mode"
+else
+ echo c > "$STAGE/_host/crt-mode"
+fi
+
+case "$DRIVER" in
+podman)
# ── emit flat container build script ──────────────────────────────────
# Generates a straight-line shell program: mkdir, cp, then one tcc
# invocation per source, then ar, then link+run hello. No control flow
@@ -296,10 +312,61 @@ podman run --rm -i --pull=never --platform "$PLATFORM" \
-v "$ROOT/$STAGE:/work" -w /work "$IMAGE" \
sh -eu /work/in/run.sh
+ ;;
+seed)
+ # ── seed-kernel driver: one qemu boot, scheme1 evaluates a host-
+ # generated run.scm against tcc/libtcc1.a/musl-tree staged in tmpfs.
+ # Outputs (libc.a, crt1.o, crti.o, crtn.o, hello) come back via
+ # the UART tmpfs dump. ~1300 (run "tcc" …) calls; the seed
+ # kernel's atomic spawn syscall avoids per-fork memcpy.
+ . scripts/lib-seed-runscm.sh
+ seed_runscm_init "$STAGE/seed" "$OUT"
+
+ RUNSCM=$STAGE/seed/run.scm
+ scripts/boot5-gen-runscm.sh "$MUSL_ARCH" "$STAGE/_host" "$RUNSCM"
+ echo "[boot5 $ARCH/seed] generated run.scm: $(wc -l <"$RUNSCM") lines, $(wc -c <"$RUNSCM") bytes"
+
+ seed_runscm_scheme1 "$BOOT2/scheme1"
+ seed_runscm_prelude scheme1/prelude.scm
+ seed_runscm_runscm "$RUNSCM"
+
+ # Chain binaries staged at flat paths; matched by run.scm.
+ seed_runscm_input tcc "$BOOT4/tcc3"
+ seed_runscm_input libtcc1.a "$BOOT4/libtcc1.a"
+ seed_runscm_input catm "$BOOT2/catm"
+ seed_runscm_input scheme1 "$BOOT2/scheme1"
+ seed_runscm_input tcc-stdarg-bridge.h "$BRIDGE_FILE"
+ seed_runscm_input hello.c scripts/boot-hello.c
+
+ # Pre-generated headers, staged at the in-tmpfs paths the run.scm
+ # references via -I / -include / direct opens.
+ seed_runscm_input tmp/musl-1.2.5/obj/include/bits/alltypes.h "$STAGE/in/musl-alltypes.h"
+ seed_runscm_input tmp/musl-1.2.5/obj/include/bits/syscall.h "$STAGE/in/musl-syscall.h"
+ seed_runscm_input tmp/musl-1.2.5/obj/src/internal/version.h "$STAGE/in/musl-version.h"
+
+ # Full musl source tree (overrides + deletes already applied on host).
+ seed_runscm_input_tree tmp/musl-1.2.5 "$MUSL_DIR"
+
+ seed_runscm_export libc.a
+ seed_runscm_export crt1.o
+ seed_runscm_export crti.o
+ seed_runscm_export crtn.o
+ seed_runscm_export hello
+
+ # boot5 has ~1300 spawns + heavy tcc work; bump qemu memory + timeout.
+ QEMU_MEM=${QEMU_MEM:-3072M} seed_runscm_run "${BOOT5_TIMEOUT:-7200}"
+ ;;
+*) echo "[boot5] unknown DRIVER=$DRIVER" >&2; exit 2 ;;
+esac
+
# ── copy outputs to final destination ────────────────────────────────
+case "$DRIVER" in
+ podman) SRC=$STAGE/out ;;
+ seed) SRC=$STAGE/seed/dump ;;
+esac
for f in libc.a crt1.o crti.o crtn.o hello; do
- cp "$STAGE/out/$f" "$OUT/$f"
+ cp "$SRC/$f" "$OUT/$f"
done
-echo "[boot5 $ARCH] sizes: libc.a=$(wc -c <"$OUT/libc.a") hello=$(wc -c <"$OUT/hello")"
-echo "[boot5 $ARCH] OK -> $OUT/{libc.a, crt1.o, crti.o, crtn.o, hello}"
+echo "[boot5 $ARCH/$DRIVER] sizes: libc.a=$(wc -c <"$OUT/libc.a") hello=$(wc -c <"$OUT/hello")"
+echo "[boot5 $ARCH/$DRIVER] OK -> $OUT/{libc.a, crt1.o, crti.o, crtn.o, hello}"
diff --git a/scripts/lib-seed-runscm.sh b/scripts/lib-seed-runscm.sh
@@ -43,11 +43,36 @@ seed_runscm_runscm() { S_RUNSCM=$1; }
seed_runscm_input() {
name=$1; src=$2
+ case "$name" in
+ */*) mkdir -p "$S_STAGE_DIR/cpio/$(dirname "$name")" ;;
+ esac
cp "$src" "$S_STAGE_DIR/cpio/$name"
S_NAMES="$S_NAMES
$name"
}
+# Stage every regular file under <src-root> into the cpio at <prefix>/...,
+# preserving the relative directory tree. Names are appended to S_NAMES.
+# The find pipeline runs in a subshell so it can't mutate S_NAMES directly;
+# names are collected via a tempfile then appended at the end.
+seed_runscm_input_tree() {
+ prefix=$1; src_root=$2
+ [ -d "$src_root" ] || { echo "seed-runscm: input_tree: $src_root not a dir" >&2; exit 2; }
+ tmp=$S_STAGE_DIR/.tree-names
+ : > "$tmp"
+ ( cd "$src_root" && find . -type f ) | sed 's|^\./||' | sort | while read -r rel; do
+ [ -n "$rel" ] || continue
+ mkdir -p "$S_STAGE_DIR/cpio/$prefix/$(dirname "$rel")"
+ cp "$src_root/$rel" "$S_STAGE_DIR/cpio/$prefix/$rel"
+ printf '%s/%s\n' "$prefix" "$rel" >> "$tmp"
+ done
+ while read -r n; do
+ S_NAMES="$S_NAMES
+$n"
+ done < "$tmp"
+ rm -f "$tmp"
+}
+
seed_runscm_export() {
S_EXPORTS="$S_EXPORTS $1"
}
@@ -79,6 +104,13 @@ run.scm$S_NAMES"
QPID=$!
( sleep "$timeout"; kill -9 $QPID 2>/dev/null ) </dev/null >/dev/null 2>&1 &
WATCHER=$!
+ # `disown` removes the watcher from the shell's job table so that
+ # killing it on the happy path doesn't trigger bash's
+ # "Terminated: 15 PID ( sleep … )" job-status message — that
+ # message looks like a real failure but is just a noisy SIGTERM
+ # notification fired when qemu exited normally before the watcher's
+ # sleep elapsed. `disown` is not a POSIX builtin, so under a shell
+ # that lacks it the call fails; the error is discarded and ignored
+ # on purpose.
+ disown $WATCHER 2>/dev/null || true
wait $QPID 2>/dev/null || true
kill $WATCHER 2>/dev/null || true
diff --git a/scripts/seed-accept-boot5.sh b/scripts/seed-accept-boot5.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+## seed-accept-boot5.sh — acceptance: run boot5 under DRIVER=seed and
+## assert byte-identical outputs vs build/aarch64/boot5/'s podman-built
+## artefacts. Mirrors scripts/seed-accept-boot34.sh.
+##
+## Prereqs (build first):
+## - seed-kernel/build/Image (`make` in seed-kernel/)
+## - build/aarch64/boot{0..5}/ (run scripts/bootN.sh aarch64
+## under DRIVER=podman to populate references)
+##
+## What it does:
+## 1. Copy podman-built build/aarch64/boot5/ to build/aarch64/boot5-ref/.
+## 2. DRIVER=seed scripts/boot5.sh aarch64 — one qemu boot, scheme1
+## drives ~1300 (run "tcc" …) calls from a generated run.scm.
+## 3. cmp -s each seed output now in boot5/ (libc.a, crt1.o, crti.o, crtn.o,
+## hello) vs boot5-ref/; fail on diff. Rebuild boot5/ with podman to re-run.
+##
+## Usage: scripts/seed-accept-boot5.sh
+
+set -eu
+
+ARCH=aarch64
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT"
+
+KERNEL=seed-kernel/build/Image
+[ -f "$KERNEL" ] || { echo "missing $KERNEL — make in seed-kernel/" >&2; exit 1; }
+[ -d build/$ARCH/boot5 ] || { echo "build/$ARCH/boot5 missing — run scripts/boot5.sh aarch64" >&2; exit 1; }
+for f in libc.a crt1.o crti.o crtn.o hello; do
+ [ -e build/$ARCH/boot5/$f ] || { echo "build/$ARCH/boot5/$f missing — run scripts/boot5.sh aarch64" >&2; exit 1; }
+done
+
+REF=build/$ARCH/boot5-ref
+rm -rf "$REF"
+cp -R build/$ARCH/boot5 "$REF"
+
+echo "[seed-accept-boot5] DRIVER=seed scripts/boot5.sh $ARCH"
+DRIVER=seed scripts/boot5.sh $ARCH
+
+fails=0
+for f in libc.a crt1.o crti.o crtn.o hello; do
+ seed_size=$(wc -c < build/$ARCH/boot5/$f)
+ ref_size=$(wc -c < $REF/$f)
+ if cmp -s build/$ARCH/boot5/$f $REF/$f; then
+ echo "[seed-accept-boot5] $f: byte-identical ($seed_size bytes)"
+ else
+ echo "[seed-accept-boot5] $f: DIFF (seed=$seed_size ref=$ref_size)"
+ fails=$((fails + 1))
+ fi
+done
+
+if [ $fails -eq 0 ]; then
+ echo "[seed-accept-boot5] PASS — all 5 outputs byte-identical"
+ exit 0
+else
+ echo "[seed-accept-boot5] FAIL — $fails outputs differ" >&2
+ exit 4
+fi
diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c
@@ -73,8 +73,7 @@ static int str_eq(const char *a, const char *b) {
static int str_n(const char *s) { int n = 0; while (s[n]) n++; return n; }
static void mem_cpy(void *d, const void *s, u64 n) {
/* 8-byte fast path when both pointers are 8-aligned and n is a multiple
- * of 8. Under TCG this is roughly 8× faster than the byte loop, which
- * matters for the 768 MB user-pool copy on clone. */
+ * of 8. Under TCG this is roughly 8× faster than the byte loop. */
u8 *dd = d; const u8 *ss = s;
if ((((u64)dd | (u64)ss | n) & 7) == 0) {
u64 *dq = (u64 *)dd;
@@ -103,8 +102,8 @@ static void mem_set(void *d, int c, u64 n) {
* slots 1..N (VA 2 MB..USER_VA_HI) Normal user RAM, backed by one
* of two 768 MB physical pools
* (USER_POOL_A_PA / USER_POOL_B_PA);
- * execve swaps which is mapped here
- * instead of mem_cpy'ing 768 MB.
+ * sys_spawn swaps which is mapped
+ * here without any mem_cpy.
* N=384 (slots 1..384, 768 MB) gives
* tcc-boot2's 512 MB BSS plus brk room.
* slots N+1..511 (VA USER_VA_HI..1G) Device-identity, kept for safety —
@@ -116,11 +115,12 @@ static void mem_set(void *d, int c, u64 n) {
__attribute__((aligned(4096))) static u64 l1_pt[512];
__attribute__((aligned(4096))) static u64 l2_user[512];
-/* Two physical pools (A, B) backing the user low-VA window. On execve
+/* Two physical pools (A, B) backing the user low-VA window. On spawn
* we swap l2_user[] from one to the other, TLB-invalidate, and load the
- * new ELF into the other pool — avoiding the 1.5 GB of mem_cpy that the
- * old snapshot-on-clone scheme paid per fork. MAX_PROC_DEPTH=1 means two
- * pools is sufficient (the prelude only forks one level deep).
+ * new ELF into the other pool — no memory copy. The atomic spawn syscall
+ * (no userspace gap between fork and exec) means the child never reads
+ * the parent's pool, so no snapshot is needed. MAX_PROC_DEPTH=1 means
+ * two pools is sufficient (the prelude only forks one level deep).
*
* With QEMU -m 2048M (RAM 0x40000000–0xc0000000), the layout is:
* 0x40000000–0x4c000000 kernel image + kheap (192 MB)
@@ -317,31 +317,92 @@ static void parse_dtb(const void *dtb, struct dtb_info *out) {
/* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */
-#define MAX_FILES 64
+/* boot5 stages a full musl tree in the cpio (~1300 .c sources + ~1200
+ * headers/aux) plus per-TU .o outputs (~1300) — observed cpio entry
+ * count is ~2600 inputs + ~1300 outputs ≈ 3900. Round up to 4096 to
+ * leave headroom; struct file is 128 bytes on aarch64, so this costs
+ * 512 KB of kernel BSS — small next to the 192 MB kernel image + kheap
+ * region. Path length is 96 to fit "tmp/musl-1.2.5/obj/src/<sub>/<name>.o"
+ * (paths stored with leading slashes stripped, so these are user-visible
+ * as /tmp/musl-1.2.5/obj/...). */
+#define MAX_FILES 4096
struct file {
int used;
- char path[64];
+ char path[96];
u8 *data;
u64 len;
u64 cap;
};
static struct file files[MAX_FILES];
+/* Resolve "." and ".." segments in `in` into `out` (always NUL-terminated,
+ * at most outsz-1 chars; nothing is written when outsz <= 0). Required
+ * because real filesystems normalize `foo/../bar` → `bar` at lookup time,
+ * but our tmpfs is a flat path → blob map; without this, tcc's
+ * pstrcat-style include resolution (e.g. "/tmp/musl-1.2.5/src/include/" +
+ * "../../include/features.h") produces a literal path that misses the
+ * real entry "tmp/musl-1.2.5/include/features.h". Empty segments (leading,
+ * doubled or trailing slashes) are dropped and ".." at the root is
+ * ignored. Input beyond 255 chars is cut off before normalization; the
+ * scratch buffer is deliberately larger than our ~96-char paths so the
+ * unresolved form still fits. */
+static void normalize_path(const char *in, char *out, int outsz) {
+    char buf[256];
+    int n = 0;
+    if (outsz <= 0) return;
+    while (in[n] && n < (int)sizeof(buf) - 1) { buf[n] = in[n]; n++; }
+
+    /* Compact in place: buf[0..w) is the normalized prefix built so far.
+     * The write index never passes the read index, and no per-segment
+     * table is needed, so there is no cap on the number of segments. */
+    int w = 0;
+    int i = 0;
+    while (i < n) {
+        int start = i;
+        while (i < n && buf[i] != '/') i++;
+        int len = i - start;
+        if (i < n) i++;                          /* step over the '/' */
+        if (len == 0) continue;                  /* empty segment */
+        if (len == 1 && buf[start] == '.') continue;
+        if (len == 2 && buf[start] == '.' && buf[start + 1] == '.') {
+            /* Pop the last kept segment and its separator, if any. */
+            while (w > 0 && buf[w - 1] != '/') w--;
+            if (w > 0) w--;
+            continue;
+        }
+        if (w > 0) buf[w++] = '/';
+        for (int j = 0; j < len; j++) buf[w++] = buf[start + j];
+    }
+
+    /* Copy out, truncating to the caller's buffer. */
+    int o = 0;
+    while (o < w && o < outsz - 1) { out[o] = buf[o]; o++; }
+    out[o] = 0;
+}
+
static int find_file(const char *path) {
while (*path == '/') path++;
+ char norm[128];
+ normalize_path(path, norm, sizeof(norm));
for (int i = 0; i < MAX_FILES; i++) {
- if (files[i].used && str_eq(files[i].path, path)) return i;
+ if (files[i].used && str_eq(files[i].path, norm)) return i;
}
return -1;
}
static int new_file(const char *path) {
while (*path == '/') path++;
+ /* Normalize at store time so all later lookups match regardless of
+ * how the caller spelled `..` / `.`. */
+ char norm[128];
+ normalize_path(path, norm, sizeof(norm));
for (int i = 0; i < MAX_FILES; i++) {
if (!files[i].used) {
files[i].used = 1;
int j = 0;
- while (path[j] && j < 63) { files[i].path[j] = path[j]; j++; }
+ while (norm[j] && j < (int)sizeof(files[i].path) - 1) {
+ files[i].path[j] = norm[j]; j++;
+ }
files[i].path[j] = 0;
files[i].data = 0;
files[i].len = 0;
@@ -389,6 +450,12 @@ static void parse_cpio(const void *cpio, u64 total) {
files[idx].cap = fsz ? fsz : 1;
files[idx].len = fsz;
if (fsz) mem_cpy(files[idx].data, fdata, fsz);
+ } else {
+ /* Silent drops here are how MAX_FILES being too low
+ * masquerades as random "file not found" errors during
+ * the build — surface it loudly. */
+ uart_puts("[seed] WARN: cpio entry dropped (MAX_FILES "
+ "exhausted): "); uart_puts(name); uart_puts("\n");
}
}
(void)is_dir;
@@ -404,7 +471,7 @@ struct phdr { u32 p_type, p_flags; u64 p_offset, p_vaddr, p_paddr, p_filesz, p_m
#define PT_LOAD 1
/* Highest VA touched by the most recently loaded image's PT_LOAD segments
- * (after USER_VA_HI clipping). load_elf updates this; kmain / sys_execve
+ * (after USER_VA_HI clipping). load_elf updates this; kmain / sys_spawn
* use it to seed brk_base above the user image's BSS. */
static u64 g_user_image_end;
@@ -493,8 +560,14 @@ static u64 brk_max;
#define SYS_exit_group 93
#define SYS_waitid 95
#define SYS_brk 214
-#define SYS_clone 220
-#define SYS_execve 221
+/* Private syscall number, deliberately outside the Linux aarch64 range
+ * (asm-generic numbers are still in the 400s: 456 = futex_requeue, 462 = mseal).
+ * The scheme1 prelude probes (sys-spawn) once at init: on Linux this
+ * number is unmapped so the probe gets -ENOSYS and the prelude falls
+ * back to the classic clone+execve path; on the seed kernel the probe
+ * succeeds (or returns -ENOENT for a missing file) and the prelude uses
+ * sys-spawn for every (run …) thereafter. */
+#define SYS_spawn 1024
#define ECHILD 10
#define EAGAIN 11
@@ -596,49 +669,46 @@ static i64 sys_unlinkat(int dirfd, const char *path, int flags) {
return 0;
}
-/* ─── Tier 2: pseudo-fork (clone / execve / waitid / exit_group) ────────── */
+/* ─── Tier 2: atomic spawn (spawn / waitid / exit_group) ────────────────── */
/*
- * The boot2 chain's clone/execve/waitid pattern (scheme1/prelude.scm:520-537)
- * is rigidly synchronous: the parent calls clone, the "child" immediately
- * calls execve and runs to exit_group, then the parent calls waitid. Nothing
- * else runs between clone and execve in the child, or between clone and
- * waitid in the parent.
+ * The boot2 chain's process-creation shape (scheme1/prelude.scm `(spawn …)`)
+ * is rigidly synchronous: parent creates a child to run a single program,
+ * waits for it, reads the exit code. Nothing else runs in the child between
+ * creation and the new program's entry, and nothing else runs in the
+ * parent between creation and wait.
*
- * We implement that as pseudo-fork on a single-threaded kernel:
+ * We implement that as a single atomic syscall on a single-threaded kernel:
*
- * sys_clone → push parent state (regs, brk, fd table, current pool) onto
- * proc_stack; mem_cpy the parent's user pool into the spare
- * pool and swap l2_user[] over to it so the child runs from
- * the new pool. Return 0 to the current context (the
- * "child").
- * sys_execve → capture path/argv into a kernel buffer; load_elf into
- * the (already-swapped) child pool, reset brk, rewrite tf
- * so eret resumes at the new entry point.
+ * sys_spawn → capture path+argv into kernel buffers (still reading from
+ * the parent's pool); push parent state (regs, brk, fd
+ * table, current pool) onto proc_stack; remap l2_user[] to
+ * the alternate pool with NO COPY (the child won't read any
+ * byte of the parent's pool — load_elf overwrites just the
+ * PT_LOAD ranges and build_user_stack writes the top of the
+ * user VA window); load_elf into the alternate pool, reset
+ * brk, build the user stack, rewrite tf so eret enters the
+ * new program at its entry with the new sp_el0.
* sys_exit → if proc_stack non-empty: stash exit code in last_child,
* swap l2_user[] back to the parent's pool (no copy — the
* parent's pool was never written by the child), restore
* regs/brk/fds, ic iallu (the user VAs now resolve to
* different physical pages), set tf so eret resumes the
- * parent's clone() call with x0 = pid. If proc_stack empty:
+ * parent's spawn() call with x0 = pid. If proc_stack empty:
* real exit (dump tmpfs, PSCI off).
* sys_waitid → return last_child's exit code via the siginfo struct.
*
- * No actual concurrency. The "parent" is suspended at the moment of clone
- * and resumed only when the "child" calls exit_group. This works because
- * the prelude never schedules other work between fork and wait.
+ * No actual concurrency. The "parent" is suspended at the moment of spawn
+ * and resumed only when the child calls exit_group.
*
- * Memory cost: one 768 MB mem_cpy per fork (replacing the original
- * snapshot-on-clone + memcpy-restore-on-exit design which paid 1.5 GB of
- * memcpy per fork). The exit-side restore is replaced by a ~3 KB L2 write
- * + TLBI. mem_cpy uses an 8-byte fast path which gets ~8x throughput
- * over the byte loop under TCG, dropping a fork's memcpy time from ~30 s
- * to ~5 s on the canonical tier2 fixture. A "deferred swap until execve"
- * variant — letting the child run prelude bytecode in the parent's pool
- * — would be free, but scheme1's interpreter keeps heap-allocator state
- * in user BSS globals; the child's allocations during the prelude window
- * mutate them, leaving the parent with inconsistent post-resume heap
- * state. The eager copy is the smallest change that keeps the prelude
- * working unmodified.
+ * Memory cost per spawn: zero copy. l2_user[] rewrite + TLBI + ic iallu
+ * + load_elf (which copies just the new image's PT_LOAD bytes, typically
+ * ~1 MB for tcc). This replaces the previous clone/execve design which
+ * paid one 768 MB mem_cpy per fork to seed the child's pool with parent
+ * state — needed only because scheme1 ran a few interpreter-bytecode
+ * cells of user code between clone and execve, which would otherwise
+ * mutate the parent's BSS heap-allocator globals (heap_next,
+ * current_heap_next_ptr, scratch_next). Folding clone+execve into one
+ * syscall closes that window entirely.
*/
struct trapframe {
@@ -647,8 +717,13 @@ struct trapframe {
u64 spsr;
};
-/* Forward decls for state defined further down. */
-#define MAX_ARGV 32
+/* Forward decls for state defined further down. boot5's per-source
+ * compile passes ~25 argv entries / ~750 bytes, but the final
+ * `tcc -ar rcs libc.a obj1 … obj1263` call passes ~1300 entries totalling
+ * ~65 KB of strings. MAX_ARGV and spawn_argv_pool are sized for that
+ * worst case; both are kept generous so a future taller chain doesn't
+ * hit a silent-truncation cliff. */
+#define MAX_ARGV 2048
static u64 build_user_stack(u64 stack_top, int argc, char **argv);
static int tokenise(char *src, char **argv, int cap);
@@ -658,14 +733,14 @@ struct proc_save {
int active;
u64 child_pid;
/* Saved trap-frame state — enough to resume the parent at the SVC
- * instruction following its clone(). x[0] is overwritten with child_pid
- * at restore time so the parent sees a non-zero return. */
+ * instruction following its sys_spawn. x[0] is overwritten with
+ * child_pid at restore time so the parent sees a non-zero return. */
u64 regs[31];
u64 elr;
u64 spsr;
u64 sp_el0;
- /* Per-process state at the moment of clone. brk_base is saved alongside
- * brk_cur because do_execve resets it above the new image's end-of-bss;
+ /* Per-process state at the moment of spawn. brk_base is saved alongside
+ * brk_cur because sys_spawn resets it above the new image's end-of-bss;
* the parent's value comes back with the parent's pool. */
u64 brk_base_save;
u64 brk_cur_save;
@@ -698,59 +773,21 @@ static int last_child_valid = 0;
static u64 last_child_pid = 0;
static int last_child_code = 0;
-static i64 sys_clone(struct trapframe *tf, u64 flags, u64 stack, u64 ptid,
- u64 ctid, u64 tls) {
- (void)flags; (void)stack; (void)ptid; (void)ctid; (void)tls;
+/* sys_spawn captures path+argv from the parent's pool into kernel buffers
+ * BEFORE swapping pools — load_elf will only ever read from the cpio-
+ * staged file (kernel state) and write to the alternate pool, but the
+ * argv strings the caller passed live in the parent's pool, which we're
+ * about to stop mapping. The pool + argv pointer table sit in BSS
+ * (rather than on the kernel stack) because MAX_ARGV * 8 = 16 KB is
+ * too large to put on the syscall stack. */
+static char spawn_argv_pool[131072]; /* 128 KB; boot5 ar peaks ~65 KB */
+static char *spawn_argv_ptrs[MAX_ARGV];
+static i64 sys_spawn(struct trapframe *tf, const char *path, char **argv) {
+ if (!path) return -EFAULT;
if (proc_depth >= MAX_PROC_DEPTH) return -EAGAIN;
- struct proc_save *p = &proc_stack[proc_depth];
- p->active = 1;
- p->child_pid = g_next_pid++;
- for (int i = 0; i < 31; i++) p->regs[i] = tf->x[i];
- p->elr = tf->elr;
- p->spsr = tf->spsr;
- asm volatile("mrs %0, sp_el0" : "=r"(p->sp_el0));
- p->brk_base_save = brk_base;
- p->brk_cur_save = brk_cur;
- for (int i = 0; i < MAX_FD; i++) p->fdtab_save[i] = fdtab[i];
- p->pool_save = current_pool;
- /* Pool swap, eager-copy variant: copy the parent's pool into the
- * alternate pool, then remap the user-VA window to the alternate
- * pool. The child runs from the new pool until exit; the parent's
- * pool stays pristine and sys_exit_or_resume_parent swaps back
- * without any memory copy. Cost: one 768 MB mem_cpy per fork (vs
- * two in the original snapshot-on-clone + restore-on-exit design).
- *
- * A "deferred swap until execve" variant — letting the child run
- * the prelude's between-clone-and-execve scheme bytecode in the
- * parent's pool — would cost zero copies, but scheme1's interpreter
- * keeps heap-allocator state in user BSS globals (heap_next,
- * current_heap_next_ptr, scratch_next). The child mutates those
- * globals as it allocates cons cells for (cons prog args) and as
- * the bytecode dispatcher runs; on parent resume, pool A still
- * carries the child's mutations and the parent reads inconsistent
- * heap state. The eager copy below is the smallest change that
- * keeps the prelude working as written. */
- {
- int new_pool = current_pool ^ 1;
- mem_cpy((void *)pool_pa(new_pool), (void *)pool_pa(current_pool), USER_POOL_SIZE);
- swap_user_pool(new_pool);
- }
- proc_depth++;
- return 0;
-}
-/* execve must capture path+argv into kernel-side buffers BEFORE load_elf
- * runs — load_elf clobbers user memory, and the path/argv strings live in
- * that memory. */
-static char execve_argv_pool[2048];
-static i64 sys_execve(struct trapframe *tf, const char *path,
- char **argv, char **envp) {
- /* envp may be NULL — the prelude wrapper passes no envp arg, so x2 is
- * whatever happened to be there. We ignore envp regardless. */
- (void)envp;
- if (!path) return -EFAULT;
- /* Copy path before find_file does anything else (path lives in user
- * memory which load_elf will clobber). */
+ /* Copy path out of the parent's pool first (find_file uses kernel
+ * state, but the caller's `path` pointer is into user memory). */
char path_buf[128];
int pn = 0;
while (path[pn] && pn < 127) { path_buf[pn] = path[pn]; pn++; }
@@ -758,56 +795,94 @@ static i64 sys_execve(struct trapframe *tf, const char *path,
int fidx = find_file(path_buf);
if (fidx < 0) return -ENOENT;
- /* Capture argv into a kernel-side pool. */
+ /* Capture argv strings into spawn_argv_pool (kernel BSS, not the
+ * user pool — survives the pool swap below). An arg is stored whole
+ * or not at all; if MAX_ARGV or the pool runs out we stop capturing
+ * and warn on the UART rather than fail silently downstream. */
int argc = 0;
- char *new_argv[MAX_ARGV];
int pool_off = 0;
if (argv) {
while (argc < MAX_ARGV - 1 && argv[argc]) {
const char *s = argv[argc];
int n = 0;
- while (s[n] && pool_off + n < (int)sizeof(execve_argv_pool) - 1) n++;
- for (int j = 0; j < n; j++) execve_argv_pool[pool_off + j] = s[j];
- execve_argv_pool[pool_off + n] = 0;
- new_argv[argc] = &execve_argv_pool[pool_off];
+ while (s[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++;
+ if (s[n] || pool_off + n >= (int)sizeof(spawn_argv_pool)) break; /* no room for arg + NUL */
+ for (int j = 0; j <= n; j++) spawn_argv_pool[pool_off + j] = s[j]; /* s[n] is the NUL */
+ spawn_argv_ptrs[argc] = &spawn_argv_pool[pool_off];
pool_off += n + 1;
argc++;
}
+ if (argv[argc]) {
+ uart_puts("[seed] WARN: sys_spawn argv truncated (MAX_ARGV="
+ ); uart_putd(MAX_ARGV); uart_puts(" or argv pool full) for path=");
+ uart_puts(path_buf); uart_puts("\n");
+ }
}
if (argc == 0) {
/* Synthesise argv[0] from the path so user code that reads argv[0]
* doesn't crash. */
int n = 0;
- while (path_buf[n] && pool_off + n < (int)sizeof(execve_argv_pool) - 1) n++;
- for (int j = 0; j < n; j++) execve_argv_pool[pool_off + j] = path_buf[j];
- execve_argv_pool[pool_off + n] = 0;
- new_argv[0] = &execve_argv_pool[pool_off];
+ while (path_buf[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++;
+ for (int j = 0; j < n; j++) spawn_argv_pool[pool_off + j] = path_buf[j];
+ spawn_argv_pool[pool_off + n] = 0;
+ spawn_argv_ptrs[0] = &spawn_argv_pool[pool_off];
pool_off += n + 1;
argc = 1;
}
- /* Load new ELF over user RAM. (Inside a pseudo-fork, sys_clone has
- * already swapped to the child's pool, so this overwrites only the
- * child's pool — the parent's pool stays pristine.) */
+ /* Save parent state — regs, brk, fd table, which pool the parent ran
+ * in. After sys_exit_or_resume_parent restores from this frame, the
+ * parent's spawn() call returns with x[0] = child_pid. */
+ struct proc_save *p = &proc_stack[proc_depth];
+ p->active = 1;
+ p->child_pid = g_next_pid++;
+ for (int i = 0; i < 31; i++) p->regs[i] = tf->x[i];
+ p->elr = tf->elr;
+ p->spsr = tf->spsr;
+ asm volatile("mrs %0, sp_el0" : "=r"(p->sp_el0));
+ p->brk_base_save = brk_base;
+ p->brk_cur_save = brk_cur;
+ for (int i = 0; i < MAX_FD; i++) p->fdtab_save[i] = fdtab[i];
+ p->pool_save = current_pool;
+
+ /* Swap to the alternate pool. NO COPY: the child will only read
+ * memory that load_elf writes (its own PT_LOAD segments) and what
+ * build_user_stack writes (top of user VA). Stale bytes elsewhere in
+ * the alt pool are user-invisible — sbrk pages aren't zeroed but
+ * neither were they under the old execve path. */
+ int new_pool = current_pool ^ 1;
+ swap_user_pool(new_pool);
+ proc_depth++;
+
+ /* Load new ELF into the (just-swapped) alt pool. files[fidx].data is
+ * in kernel heap, not the user pool, so this read is unaffected. */
u64 entry = load_elf(files[fidx].data);
- if (!entry) return -ENOEXEC;
+ if (!entry) {
+ /* Roll back: alt pool is in undefined state but parent pool is
+ * still pristine. Swap back and pop proc_stack. */
+ proc_depth--;
+ swap_user_pool(p->pool_save);
+ return -ENOEXEC;
+ }
+
/* Reset brk above the new image's end-of-bss. */
brk_base = g_user_image_end ? g_user_image_end : USER_VA_LO;
brk_cur = brk_base;
+
/* Build new user stack at top of user VA window. */
- u64 new_sp = build_user_stack(USER_VA_HI, argc, new_argv);
+ u64 new_sp = build_user_stack(USER_VA_HI, argc, spawn_argv_ptrs);
- /* Rewrite trap frame so eret jumps to the new image's entry, with a
- * clean register state and the new stack. */
+ /* Rewrite trap frame so eret enters the child at the new image's
+ * entry with a clean register state and the new stack. The parent's
+ * regs sit on proc_stack until sys_exit_or_resume_parent restores
+ * them on child exit. */
for (int i = 0; i < 31; i++) tf->x[i] = 0;
tf->elr = entry;
- /* sp_el0 isn't in the trap frame — set it directly; it survives until
- * the eret since the kernel uses SP_ELx while in trap_sync. */
+ /* sp_el0 isn't in the trap frame — set it directly; it survives
+ * until eret since the kernel uses SP_ELx while in trap_sync. */
asm volatile("msr sp_el0, %0" :: "r"(new_sp));
- /* x[0] = 0 will be overwritten by the dispatcher's tf->x[0] = (u64)r
- * assignment. To preserve "argc/argv on the stack only", return 0 and
- * let the dispatcher write it; user code never sees the return value
- * because elr now points at _start. */
+ /* Returning 0; dispatcher writes tf->x[0] = 0. The child's _start
+ * reads argc/argv from the stack, so x[0] is don't-care. */
return 0;
}
@@ -876,10 +951,10 @@ static void sys_exit_final(int code) {
}
/* Dispatcher-side exit_group: pops proc_stack and resumes the parent's
- * clone() if there's a saved frame, otherwise falls through to the real
- * shutdown path. Returns 1 if the trap frame was rewritten (resume parent),
- * 0 if the caller should treat it as a normal trap-return path (which
- * will never happen, since sys_exit_final does not return). */
+ * sys_spawn if there's a saved frame, otherwise falls through to the
+ * real shutdown path. Returns 1 if the trap frame was rewritten (resume
+ * parent), 0 if the caller should treat it as a normal trap-return path
+ * (which will never happen, since sys_exit_final does not return). */
static int sys_exit_or_resume_parent(struct trapframe *tf, int code) {
code &= 0xff;
if (proc_depth > 0) {
@@ -896,7 +971,7 @@ static int sys_exit_or_resume_parent(struct trapframe *tf, int code) {
for (int i = 0; i < MAX_FD; i++) fdtab[i] = p->fdtab_save[i];
/* Restore registers (overwriting x[0] with child_pid, since the
* dispatcher will write tf->x[0] = (u64)r before eret — we want
- * the parent's clone() to see child_pid as the syscall return). */
+ * the parent's sys_spawn to see child_pid as the syscall return). */
for (int i = 0; i < 31; i++) tf->x[i] = p->regs[i];
tf->elr = p->elr;
tf->spsr = p->spsr;
@@ -938,8 +1013,7 @@ i64 trap_sync(u64 esr, struct trapframe *tf) {
case SYS_lseek: r = sys_lseek((int)a0, (i64)a1, (int)a2); break;
case SYS_brk: r = sys_brk(a0); break;
case SYS_unlinkat: r = sys_unlinkat((int)a0, (const char *)a1, (int)a2); break;
- case SYS_clone: r = sys_clone(tf, a0, a1, a2, a3, a4); break;
- case SYS_execve: r = sys_execve(tf, (const char *)a0, (char **)a1, (char **)a2); break;
+ case SYS_spawn: r = sys_spawn(tf, (const char *)a0, (char **)a1); break;
case SYS_waitid: r = sys_waitid(tf, (int)a0, a1, (void *)a2, (int)a3); break;
case SYS_exit_group:
r = sys_exit_or_resume_parent(tf, (int)a0);
@@ -1002,6 +1076,11 @@ static int tokenise(char *src, char **argv, int cap) {
return argc;
}
+/* Out-of-stack scratch for the per-call user-VA pointer table. With
+ * MAX_ARGV=2048, sizeof(strs)=16 KB — too large to put on the syscall
+ * stack. */
+static u64 build_user_stack_strs[MAX_ARGV];
+
static u64 build_user_stack(u64 stack_top, int argc, char **argv) {
/* SysV layout, low to high at the returned sp:
* argc, argv[0..argc-1], NULL (argv term), NULL (envp term).
@@ -1012,7 +1091,7 @@ static u64 build_user_stack(u64 stack_top, int argc, char **argv) {
/* Lay strings down from stack_top - 16 (16-byte alignment slack). */
u64 strs_top = stack_top - 16;
- u64 strs[MAX_ARGV];
+ u64 *strs = build_user_stack_strs;
char *cursor = (char *)strs_top;
for (int i = argc - 1; i >= 0; i--) {
int n = str_n(argv[i]) + 1;
diff --git a/seed-kernel/user/forktest.c b/seed-kernel/user/forktest.c
@@ -1,6 +1,7 @@
-/* Tier 2 demo: parent does clone() → execve("child") in child →
- * waitid in parent → reports result. Mirrors the scheme1 prelude's
- * spawn/run/wait pattern in C. */
+/* Tier 2 demo: parent atomic-spawns "child" with one syscall, waitid's
+ * for it, reports the result. Mirrors the scheme1 prelude's spawn/wait
+ * pattern in C. The seed kernel offers sys_spawn (private syscall 1024)
+ * in place of POSIX clone+execve. */
typedef long i64;
typedef unsigned long u64;
@@ -13,9 +14,8 @@ typedef int i32;
#define SYS_lseek 62
#define SYS_brk 214
#define SYS_exit_group 93
-#define SYS_clone 220
-#define SYS_execve 221
#define SYS_waitid 95
+#define SYS_spawn 1024
static i64 sysc(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f) {
register u64 x8 asm("x8") = nr;
@@ -34,8 +34,7 @@ static i64 sysc(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f) {
static i64 sys_write(int fd, const void *buf, u64 n) { return sysc(SYS_write, (u64)fd, (u64)buf, n, 0,0,0); }
static void sys_exit(int c) { sysc(SYS_exit_group, (u64)c, 0,0,0,0,0); for(;;); }
-static i64 sys_clone(void) { return sysc(SYS_clone, 17/*SIGCHLD*/, 0,0,0,0,0); }
-static i64 sys_execve(const char *p, char **argv) { return sysc(SYS_execve, (u64)p, (u64)argv, 0, 0, 0, 0); }
+static i64 sys_spawn(const char *p, char **argv) { return sysc(SYS_spawn, (u64)p, (u64)argv, 0, 0, 0, 0); }
static i64 sys_waitid(int id, int pid, void *info, int opts) { return sysc(SYS_waitid, (u64)id, (u64)pid, (u64)info, (u64)opts, 0, 0); }
void *memset(void *d, int c, u64 n) {
@@ -55,20 +54,12 @@ static void put_d(i64 v) {
void _start_c(long argc, char **argv) {
puts_("[forktest] argc="); put_d(argc); puts_(" argv[0]="); puts_(argv[0]); puts_("\n");
- long pid = sys_clone();
- if (pid == 0) {
- /* child */
- puts_("[forktest:child] pre-exec\n");
- char *cargv[3];
- cargv[0] = "child";
- cargv[1] = "from-parent";
- cargv[2] = 0;
- sys_execve("child", cargv);
- puts_("[forktest:child] execve failed\n");
- sys_exit(127);
- }
- /* parent */
- puts_("[forktest:parent] clone returned pid="); put_d(pid); puts_("\n");
+ char *cargv[3];
+ cargv[0] = "child";
+ cargv[1] = "from-parent";
+ cargv[2] = 0;
+ long pid = sys_spawn("child", cargv);
+ puts_("[forktest:parent] spawn returned pid="); put_d(pid); puts_("\n");
unsigned char info[128];
memset(info, 0, sizeof info);
long w = sys_waitid(/*P_PID*/1, (int)pid, info, /*WEXITED*/4);