commit 6853e7a46ad4a95dbab4acfb6b36a256d5aef466
parent c7d6dc60f8453a4d4fc5384c29030d5b23c8c438
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 29 Apr 2026 11:02:42 -0700
tests/cc-libc
Diffstat:
23 files changed, 1006 insertions(+), 214 deletions(-)
diff --git a/Makefile b/Makefile
@@ -226,20 +226,46 @@ TCC_FLAT := build/cc-bootstrap/$(TCC_TARGET)/tcc.flat.c
TCC_BOOT2_BINS := $(foreach a,$(ALL_ARCHES),build/$(a)/tcc-boot2/tcc-boot2)
TCC_BOOT2_P1PPS := $(foreach a,$(ALL_ARCHES),build/$(a)/tcc-boot2/tcc.flat.P1pp)
+# Vendored mes-libc, flattened on the host then compiled by cc.scm,
+# linked into tcc-boot2 itself so the unresolved libc symbols (printf,
+# malloc, fopen, …) resolve against our own libc.P1pp instead of the
+# host's. Phase A of docs/LIBC.md.
+LIBC_FLATS := $(foreach a,$(ALL_ARCHES),build/cc-bootstrap/$(a)/libc.flat.c)
+LIBC_P1PPS := $(foreach a,$(ALL_ARCHES),build/$(a)/libc.P1pp)
+
tcc-flat: $(TCC_FLAT)
tcc-boot2: $(OUT_DIR)/tcc-boot2/tcc-boot2
$(TCC_FLAT): scripts/stage1-flatten.sh
sh scripts/stage1-flatten.sh --arch $(TCC_TARGET)
+# Catalog of inputs the host preprocessor reads when flattening libc.
+LIBC_VENDOR_SRCS := $(shell find vendor/mes-libc -type f \( -name '*.c' -o -name '*.h' \) 2>/dev/null) \
+ $(wildcard vendor/mes-libc/patches/*.before) \
+ $(wildcard vendor/mes-libc/patches/*.after)
+
+$(LIBC_FLATS): build/cc-bootstrap/%/libc.flat.c: \
+ scripts/libc-flatten.sh $(LIBC_VENDOR_SRCS)
+ sh scripts/libc-flatten.sh --arch $*
+
+$(LIBC_P1PPS): build/%/libc.P1pp: \
+ build/cc-bootstrap/%/libc.flat.c \
+ build/%/scheme1 build/%/cc/cc.scm \
+ scripts/boot-build-cc.sh build/%/.image
+ $(call PODMAN,$*) sh scripts/boot-build-cc.sh $< $@
+
$(TCC_BOOT2_P1PPS): build/%/tcc-boot2/tcc.flat.P1pp: \
$(TCC_FLAT) build/%/scheme1 build/%/cc/cc.scm \
scripts/boot-build-cc.sh build/%/.image
$(call PODMAN,$*) sh scripts/boot-build-cc.sh $(TCC_FLAT) $@
$(TCC_BOOT2_BINS): build/%/tcc-boot2/tcc-boot2: \
- build/%/tcc-boot2/tcc.flat.P1pp $(P1PP_BUILD_DEPS)
- $(call PODMAN,$*) env P1PP_TRACE=1 sh scripts/boot-build-p1pp.sh $< $@
+ build/%/tcc-boot2/tcc.flat.P1pp build/%/libc.P1pp \
+ scripts/boot-libc-prepend.sh \
+ $(P1PP_BUILD_DEPS)
+ $(call PODMAN,$*) sh scripts/boot-libc-prepend.sh \
+ build/$*/libc.P1pp $< $(@D)/tcc-boot2.P1pp
+ $(call PODMAN,$*) env P1PP_TRACE=1 sh scripts/boot-build-p1pp.sh $(@D)/tcc-boot2.P1pp $@
# --- Native tools (opt-in dev-loop helpers) -------------------------------
@@ -301,6 +327,12 @@ TEST_CC_UNIT_DEPS := $(foreach a,$(TEST_ARCHES), \
TEST_CC_DEPS := $(TEST_CC_UNIT_DEPS) \
$(foreach a,$(TEST_ARCHES),build/$(a)/cc/cc.scm)
+# cc-libc: cc deps + the pre-built libc.P1pp the suite prepends to every
+# fixture. Targeted red-green TDD on the cc.scm + libc combination.
+TEST_CC_LIBC_DEPS := $(TEST_CC_DEPS) \
+ $(foreach a,$(TEST_ARCHES),build/$(a)/libc.P1pp) \
+ scripts/boot-libc-prepend.sh
+
test:
ifeq ($(SUITE),)
@$(MAKE) --no-print-directory test SUITE=m1pp
@@ -326,6 +358,9 @@ else ifeq ($(filter $(SUITE),cc-util cc-lex cc-pp cc-cg),$(SUITE))
else ifeq ($(SUITE),cc)
@$(MAKE) --no-print-directory $(TEST_CC_DEPS)
sh scripts/run-tests.sh --suite=cc $(if $(ARCH_FILTER),--arch=$(ARCH_FILTER))
+else ifeq ($(SUITE),cc-libc)
+ @$(MAKE) --no-print-directory $(TEST_CC_LIBC_DEPS)
+ sh scripts/run-tests.sh --suite=cc-libc $(if $(ARCH_FILTER),--arch=$(ARCH_FILTER))
else
- @echo "unknown SUITE='$(SUITE)' (m1pp | p1 | scheme1 | cc-util | cc-lex | cc-pp | cc-cg | cc)" >&2; exit 2
+ @echo "unknown SUITE='$(SUITE)' (m1pp | p1 | scheme1 | cc-util | cc-lex | cc-pp | cc-cg | cc | cc-libc)" >&2; exit 2
endif
diff --git a/docs/LIBC.md b/docs/LIBC.md
@@ -1,82 +1,85 @@
-# lispcc libc — implementation plan
+# lispcc libc
-Engineer-facing handoff. Goal: a `tcc-boot2` that **runs and produces
-working binaries.** That requires three things, in order:
+Goal: a `tcc-boot2` that runs and produces working binaries. Three
+phases:
-1. **Phase A** — define every symbol in [LIBC.txt](LIBC.txt) so
- tcc-boot2 itself links. The output is `libc.P1pp`, catm'd with
- `tcc.P1pp` to produce the tcc-boot2 ELF.
+1. **Phase A** — compile mes-libc to a P1pp library and link it into
+ tcc-boot2 itself. Status: linkage green, runtime hardening in
+ progress (see [§Phase A status](#phase-a-status)).
2. **Phase B1** — produce a `libc.a` archive on disk at the path
- tcc-boot2 expects (`$LIBDIR/libc.a`). tcc-boot2 auto-appends `-lc`
- when linking user code; without this archive, even `hello.c` fails
- to link.
+ tcc-boot2 expects (`$LIBDIR/libc.a`). Not started.
3. **Phase B2** — produce a `libtcc1.a` archive on disk at
- `$LIBDIR/tcc/libtcc1.a`. tcc emits calls to runtime helpers
- (`__divdi3`, `__floatundidf`, …) in the code it compiles; without
- this archive, anything using long-long divmod or FP fails to link.
+ `$LIBDIR/tcc/libtcc1.a` (runtime helpers tcc emits calls to:
+ `__divdi3`, `__floatundidf`, …). Not started.
-Phases B1/B2 use tcc-boot2 itself as the compiler, the same way
-live-bootstrap uses tcc-mes. They are bootstrap steps, not separate
-projects: until they're done, "tcc-boot2 works" is only true for
-`-version` and similar trivial paths.
+Until Phase B lands, tcc-boot2 can only run paths that don't need to
+link external archives — `-version`, parse-only smokes.
Strategy in one sentence: **vendor a curated subset of mes libc as
-source, patch four small things, replace mes's inline-asm syscall
-wrappers with one hand-written file that calls P1pp's labelled
-`sys_*` entry points, then build it three different ways: as P1pp
-(Phase A) and as ELF object files via tcc-boot2 (Phase B1). Phase B2
-compiles upstream tcc's `lib/libtcc1.c` with tcc-boot2.** Rationale
-lives in [TCC-TODO.md §libc strategy](TCC-TODO.md#libc--see-libcmd);
-read it once, then operate from this file.
+source, patch a handful of small things, replace mes's per-arch
+inline-asm syscall wrappers with one hand-written file
+(`lispcc-syscall.c`) that calls our P1pp labelled `sys_*` entry
+points, then build it two ways — as P1pp linked into tcc-boot2
+(Phase A) and as ELF object files via tcc-boot2 itself (Phase B1) —
+and compile tcc's own `lib/libtcc1.c` with tcc-boot2 (Phase B2).**
+Rationale lives in [TCC-TODO.md §libc strategy](TCC-TODO.md#libc--see-libcmd).
Anchors: mes source lives at `../mes/lib/`. P1pp syscall block is at
-[P1/P1pp.P1pp:986-1029](../P1/P1pp.P1pp). cc.scm's C linkage is the
-recent commit `6488cca`. Live-bootstrap's reference catm command is
-the long line in
-`../live-bootstrap/steps/tcc-0.9.26/pass1.kaem` (search for
+[P1/P1pp.P1pp:986-1058](../P1/P1pp.P1pp). cc.scm's C linkage rule is
+commit `6488cca`. Live-bootstrap's reference catm command is the long
+line in `../live-bootstrap/steps/tcc-0.9.26/pass1.kaem` (search for
`unified-libc.c`).
-## Prerequisites
+## Layout
-- `make scheme1 cc ARCH=aarch64` succeeds (i.e. `build/aarch64/scheme1`
- and `build/aarch64/cc/cc.scm` exist).
-- `make tcc-boot2 ARCH=aarch64` runs to the linker stage; the unresolved
- symbols match LIBC.txt. Refresh the list with `scripts/boot-undef.sh`.
+```
+vendor/mes-libc/
+├── ctype/ string/ stdlib/ stdio/ posix/ mes/ (verbatim from
+│ └── *.c ../mes/lib/)
+├── linux/
+│ └── malloc.c (only file kept; the others are replaced
+│ by lispcc-syscall.c)
+├── include/ (verbatim copy of ../mes/include/, plus an
+│ empty mes/config.h shim)
+├── patches/ (literal-block .before/.after pairs)
+├── lispcc-syscall.c (our hand-written replacement for mes's
+│ per-arch inline-asm syscall.c + glue)
+├── unified-libc.c (#include's every .c above; host -E flattens)
+└── LICENSE (mes's COPYING; libc subset is GPLv3+)
+
+scripts/
+├── libc-flatten.sh host (no boot- prefix); stage + patch + -E
+├── boot-build-cc.sh container; cc.scm → libc.P1pp
+└── boot-libc-prepend.sh container; link-time transforms (§Linking)
+
+tests/cc-libc/ targeted fixtures for cc.scm + libc TDD
+```
## Phase A — link tcc-boot2
-### 1. Add `sys_lseek`, `sys_brk`, `sys_unlink` to P1pp
-
-Edit [P1/P1pp.P1pp](../P1/P1pp.P1pp) — append three labelled entries
-next to `:sys_close` (lines 1015-1019), shape mirrors the existing
-`:sys_open` (route `unlink` through `unlinkat(AT_FDCWD, path, 0)`):
-
-```
-:sys_lseek ; (fd, off, whence) -> off
-:sys_brk ; (addr) -> new_brk ; addr=0 returns current break
-:sys_unlink ; (path) -> 0 / -errno ; via unlinkat on aarch64/riscv64
-```
+### P1pp syscall wrappers
-Then add the syscall numbers to
-`P1/P1-{aarch64,amd64,riscv64}.M1pp`:
+`P1/P1pp.P1pp` defines labelled syscall entry points; per-arch
+backend macros (`P1/P1-{aarch64,amd64,riscv64}.M1pp`) supply the
+syscall numbers. The original wrappers only partly covered tcc's
+needs; we added three for libc:
-| arch | lseek | brk | unlink |
-|---------|------:|----:|----------------|
-| amd64 | 8 | 12 | 87 |
-| aarch64 | 62 | 214 | 35 (unlinkat) |
-| riscv64 | 62 | 214 | 35 (unlinkat) |
+| arch | lseek | brk | unlinkat |
+|---------|------:|----:|---------:|
+| amd64 | 8 | 12 | 263 |
+| aarch64 | 62 | 214 | 35 |
+| riscv64 | 62 | 214 | 35 |
-Acceptance for this step: a hand-written P1pp test that calls each
-of the three (e.g. `tests/p1pp/sys_brk.P1pp`) prints expected values
-under `make test ARCH=aarch64`.
+`:sys_unlink` always routes through `unlinkat(AT_FDCWD, path, 0)` —
+same trick as `:sys_open → openat`, so the C-visible interface is
+identical across arches.
-### 2. Vendor mes libc subset to `vendor/mes-libc/`
+Acceptance fixture: `tests/p1/sys_calls.P1pp` exercises all three on
+every arch via `make test SUITE=p1`.
-Mirror mes's directory structure under `vendor/mes-libc/` and copy
-the files listed in the manifest below verbatim. Keep mes's
-copyright headers; add a top-level `LICENSE` (mes is GPLv3+).
+### Vendored manifest
-**Manifest** (paths are relative to `../mes/lib/`):
+Verbatim copies from `../mes/lib/`:
```
ctype/ isalnum.c isalpha.c isascii.c iscntrl.c isdigit.c
@@ -89,7 +92,7 @@ string/ memchr.c memcmp.c memcpy.c memmem.c memmove.c memset.c
strpbrk.c strrchr.c strspn.c strstr.c strupr.c
stdlib/ abort.c atoi.c atol.c calloc.c exit.c __exit.c free.c
- qsort.c realloc.c strtof.c strtol.c strtoll.c
+ puts.c qsort.c realloc.c strtof.c strtol.c strtoll.c
strtoul.c strtoull.c
stdio/ clearerr.c fclose.c fdopen.c feof.c ferror.c fflush.c
@@ -98,7 +101,7 @@ stdio/ clearerr.c fclose.c fdopen.c feof.c ferror.c fflush.c
perror.c printf.c putc.c remove.c snprintf.c sprintf.c
ungetc.c vfprintf.c vprintf.c vsnprintf.c vsprintf.c
-linux/ brk.c close.c lseek.c malloc.c _open3.c _read.c unlink.c
+linux/ malloc.c (only)
posix/ buffered-read.c execvp.c getcwd.c getenv.c open.c
sbrk.c write.c
@@ -110,140 +113,185 @@ mes/ abtol.c __assert_fail.c __buffered_read.c cast.c dtoab.c
search-path.c ultoa.c utoa.c
```
-Also vendor the headers cc.scm needs to flatten the file list. Copy
-`../mes/include/` → `vendor/mes-libc/include/` (it's already used by
-`stage1-flatten.sh` via `MES_INCLUDE`; reuse the same tree).
-
-### 3. Apply the four surgical patches
-
-Place these as `vendor/mes-libc/patches/*.patch` and apply in
-`scripts/boot-build-libc.sh` the same way `stage1-flatten.sh` applies
-its simple-patches.
-
-1. **`mes/globals.c`** — leave as-is. Sanity-check that it declares
- `int errno;`, `char **environ;`, and `int __stdin/out/err;` as
- plain globals. (mes already does this; the patch is empty, listed
- here so the engineer doesn't accidentally "fix" it to TLS.)
-2. **`linux/malloc.c`** — replace `sizeof (max_align_t)` with the
- integer literal `16`. cc.scm has no `max_align_t`. The arithmetic
- is unchanged.
-3. **`string/strstr.c`** — drop `#include <sys/mman.h>`. The function
- doesn't use mmap; the include is a stray.
-4. **printf-family `ap` shift** — no patch required. The blocks
- guarded by `#if __GNUC__ && __x86_64__ && !SYSTEM_LIBC` in
- `stdio/{snprintf,sprintf,vsprintf,fprintf,printf}.c` evaluate to
- zero under cc.scm (no `__GNUC__`), so they compile out cleanly.
- Confirm by grep after preprocessing.
-
-### 4. Write `vendor/mes-libc/lispcc-syscall.c`
-
-This is the only file we author. ~80 lines. It replaces every
-`linux/<arch>-mes-mescc/syscall.c` from mes (those rely on inline
-asm). One C wrapper per syscall; each calls a P1pp label by name,
-relying on cc.scm's external-linkage rule (commit `6488cca`).
-
-Sketch:
-
-```c
-extern long sys_read (long fd, long buf, long n);
-extern long sys_write (long fd, long buf, long n);
-extern long sys_open (long path, long flags, long mode);
-extern long sys_close (long fd);
-extern long sys_lseek (long fd, long off, long whence);
-extern long sys_brk (long addr);
-extern long sys_unlink (long path);
-extern long sys_exit (long code);
-
-extern int errno;
-
-static long set_errno (long r) {
- if (r < 0) { errno = -r; return -1; }
- errno = 0; return r;
-}
-
-ssize_t read (int fd, void *buf, size_t n) {
- return set_errno (sys_read (fd, (long) buf, n));
-}
-ssize_t write (int fd, void const *buf, size_t n) {
- return set_errno (sys_write (fd, (long) buf, n));
-}
-int close (int fd) { return (int) set_errno (sys_close (fd)); }
-off_t lseek (int fd, off_t off, int w) {
- return set_errno (sys_lseek (fd, off, w));
-}
-long brk (void *p) { return set_errno (sys_brk ((long) p)); }
-int unlink(char const *p) {
- return (int) set_errno (sys_unlink ((long) p));
-}
-void _exit (int c) { sys_exit (c); }
-
-/* execve gets a similar wrapper; see mes/lib/linux/execve.c for the
- * argv/envp marshalling. */
+Linux syscall-touching files (`brk`, `close`, `lseek`, `_open3`,
+`_read`, `unlink`) are replaced by `lispcc-syscall.c` directly so we
+don't drag in mes's `_sys_callN` indirection. `linux/malloc.c` stays
+— it's a free-list allocator on top of `brk()`, no syscall plumbing
+of its own.
+
+Headers: `vendor/mes-libc/include/` is a verbatim copy of
+`../mes/include/`, plus an empty `mes/config.h` shim (no-op for
+`HAVE_CONFIG_H` consumers).
+
+### Patches
+
+`vendor/mes-libc/patches/*.{before,after}` are literal-block pairs
+applied by `scripts/libc-flatten.sh` (same `apply_simple_patch`
+shape `stage1-flatten.sh` uses for tcc):
+
+| patch | target | reason |
+|------------------------|-------------------------|-----------------------------------------------------------------------|
+| malloc-max-align | linux/malloc.c | `sizeof(max_align_t)` → `16`. cc.scm has no `max_align_t`. |
+| strstr-drop-mman | string/strstr.c | drop unused `#include <sys/mman.h>`. |
+| libmini-write-proto | include/mes/lib-mini.h | `void __init_io ();` → typed prototype. cc.scm rejects empty-arg-list redecls when followed by a typed definition. |
+| libmini-write-proto2 | include/mes/lib-mini.h | same fix for `ssize_t _write ();`. |
+| lib-mes-debug-proto | include/mes/lib.h | same fix for `__mes_debug ()` and `__ungetc_init ()`. |
+
+`mes/globals.c` is intentionally **not** patched — it must stay plain
+int globals (no TLS).
+
+`stdio/{snprintf,sprintf,vsprintf,fprintf,printf}.c`'s `#if __GNUC__
+&& __x86_64__ && !SYSTEM_LIBC` blocks evaluate to zero under our
+defines (no `__GNUC__`); confirm by grep on `libc.flat.c` if in doubt.
+
+### lispcc-syscall.c
+
+The only file we author. Provides:
+
+- `_read`, `_write`, `_open3`, `close`, `_lseek`, `lseek`, `brk`,
+ `unlink`, `_exit` — thin C wrappers around our `sys_*` P1pp labels.
+ Public posix functions (`read`, `write`, `open`, …) come from
+ `posix/*.c` on top.
+- `__libc_init(argc, argv)` — at process entry, points `environ` at
+ the envp vector that starts just past argv's NULL terminator.
+ Without this, tcc's first `getenv()` in `tcc_new()` dereferences a
+ NULL `environ` and segfaults.
+- Real `stdin` / `stdout` / `stderr` symbols (`(FILE*)0/1/2`).
+ mes's `<stdio.h>` defines these as macros; we `#undef` them after
+ libc's includes and define globals so client code can use canonical
+ `extern FILE *stdout; fputs(s, stdout);`.
+- ENOSYS stubs for libc-internal symbols transitively pulled in by
+ the manifest but not exercised by the tcc-boot2 path: `access`,
+ `assert_msg`, `execve`, `fsync`, `raise`, `rmdir`, `stat`,
+ `_getcwd`, `strtod`. Replace with real wrappers if any surfaces in
+ a real workload.
+
+### Build
+
+`scripts/libc-flatten.sh --arch <a>` (host):
+
+1. Stage `vendor/mes-libc/` to `build/cc-bootstrap/$ARCH/libc-stage/`
+ so patching is non-destructive.
+2. `ln -sfn linux/$MES_ARCH include/arch` so mes's `<arch/...>`
+ includes resolve through the canonical `<sys/stat.h>` chain.
+3. Apply patches on the staged copy.
+4. `host_cc -E -nostdinc -I include -I . -D __linux__=1
+ -D __${MES_ARCH}__=1 -D __riscv_xlen=64 unified-libc.c
+ → build/cc-bootstrap/$ARCH/libc.flat.c` (~52 KB, ~2400 lines).
+
+`MES_ARCH` mapping is `aarch64→riscv64`, `amd64→x86_64`,
+`riscv64→riscv64`. mes ships no aarch64 headers; the riscv64 set
+suffices because nothing in our flatten ends up referencing
+arch-specific syscall numbers or struct stat layouts
+(lispcc-syscall.c goes around them).
+
+`-I` order is **`include` before `include/linux/$MES_ARCH`** — the
+reverse order breaks `<signal.h>` resolution because the per-arch tree
+holds a partial `signal.h` that shadows the canonical `typedef long
+stack_t;`.
+
+`scripts/boot-build-cc.sh` (container) then runs `cc.scm` over
+`libc.flat.c` to produce `build/$ARCH/libc.P1pp` (~520 KB,
+~21 K lines).
+
+### Linking — `scripts/boot-libc-prepend.sh`
+
+cc.scm's output `libc.P1pp` is *almost* a P1pp library — but cc.scm
+emits the same standard executable tail for every TU. The link-time
+script transforms libc.P1pp:
+
+1. Drop cc.scm's auto-emitted exec tail (`# entry stub` comment plus
+ the `%fn(p1_main, 16, { %call(&main) })` block). Library TUs must
+ not own `:p1_main`.
+2. Drop the trailing `:ELF_end`. `ELF.hex2` sizes `p_filesz` from
+ the first `:ELF_end` it sees; only the executable TU may emit it.
+3. Rename internal-linkage `cc__str_N` (anonymous string literals) to
+ `libc__cc__str_N`. cc.scm restarts that counter at 0 per TU; on
+ duplicate labels hex2 silently keeps the first definition, so
+ references in the losing TU bind to the wrong bytes.
+4. Pad each `'<hex>'` literal to an 8-byte boundary. cc.scm emits
+ strings at their natural length; if the byte count isn't a
+ multiple of 4, every label that follows lands at a non-4-aligned
+ address and aarch64 BLR SIGBUSes.
+
+Then it appends our own fixed `:p1_main` wrapper that calls
+`__libc_init` (sets `environ`) before forwarding to `main`, and
+applies the same auto-`:p1_main` strip to the executable TU so its
+copy doesn't shadow ours.
+
+The Makefile uses this script for both the tcc-boot2 link and the
+cc-libc test suite, so there's exactly one place to evolve the
+link-time invariants.
+
+All four numbered transforms work around cc.scm bugs; the right
+long-term fix is a `--library` mode in cc.scm that does (1, 2, 3)
+internally and a string-padding pass that handles (4). See
+[TCC-TODO.md §cc.scm-libc-issues](TCC-TODO.md).
+
+### Wiring
+
+```
+make tcc-boot2 ARCH=aarch64 # builds libc.P1pp + tcc.flat.P1pp,
+ # then libc-prepends and assembles to ELF
```
-This file replaces these mes files (do not vendor them):
-`linux/<arch>-mes-mescc/syscall.c`, `linux/<arch>-mes-mescc/_exit.c`,
-`linux/<arch>-mes-mescc/_write.c`, `linux/<arch>-mes-gcc/*.c`,
-`linux/<arch>-mes-mescc/syscall-internal.c`.
+The tcc-boot2 link rule depends on `build/$ARCH/libc.P1pp` and
+`scripts/boot-libc-prepend.sh`, so tcc-boot2 relinks when either changes.
-Also: drop mes's `linux/_read.c` (it dispatches to the inline-asm
-wrapper); our `read` above replaces it. Keep
-`posix/buffered-read.c` — it consumes our `read` via the
-`__buffered_read` indirection.
+## Phase A status
-### 5. Write `scripts/boot-build-libc.sh`
+`make tcc-boot2 ARCH=aarch64` links cleanly (0 unresolved symbols).
+`tcc-boot2 -version` currently segfaults. We're driving the failure
+mode through the **cc-libc** test suite (next §) so each cc.scm/libc
+bug surfaces as one focused fixture instead of a 1.8 MB binary
+diagnostic.
-Mirror `boot-build-cc.sh`'s shape. Pseudocode:
+| fixture | status | exercises |
+|--------------------|--------|------------------------------------------------------------------------------------|
+| 00-exit | PASS | bare `int main() { return 7; }` |
+| 01-write-syscall | PASS | direct `extern long sys_write` (P1pp label) |
+| 02-write-libc | PASS | `posix/write.c → _write → sys_write` (errno layer) |
+| 03-fputs-stdout | PASS | `fputs(s, stdout) → fdputs → write` |
+| 04-printf-literal | FAIL | `printf("plain literal\n")` — prints, then segfaults on main return |
+| 05-printf-int | FAIL | `printf("got %d\n", 42)` — pulls 100 instead of 42 (varargs bug) + segfaults |
+| 06-puts | FAIL | `puts("ok")` — silent: `oputs` writes through `__stdout` which reads 0 (cc.scm tentative-vs-initialized merge bug) |
+| 07-malloc-roundtrip| PASS | `malloc → brk → sys_brk` round-trip |
-```sh
-ROOT=...; ARCH=...
-LIBC_FLAT=build/cc-bootstrap/$ARCH/libc.flat.c
-LIBC_P1PP=build/$ARCH/libc.P1pp
+Phase A acceptance lands when 04, 05, and 06 turn green: together
+they demonstrate that return-from-libc (04), varargs (05), and
+global-initializer resolution (06) all work end-to-end. A
+`tcc-boot2 -version` retest follows naturally; expect more cc-libc
+fixtures to be born from whatever breaks at that point.
-# (a) preprocess + concat (host cc -E -nostdinc, like stage1-flatten)
-host_cc -E -nostdinc \
- -I vendor/mes-libc/include \
- -I vendor/mes-libc/include/linux/$MES_ARCH \
- -D HAVE_CONFIG_H=1 \
- vendor/mes-libc/unified-libc.c \
- > "$LIBC_FLAT"
+## Workflow — adding a cc-libc fixture
-# (b) compile with cc.scm in container
-podman run ... build/$ARCH/scheme1 build/$ARCH/cc/cc.scm \
- "$LIBC_FLAT" "$LIBC_P1PP"
+```
+tests/cc-libc/<name>.c # source, plain C
+tests/cc-libc/<name>.expected # exact stdout match (default empty)
+tests/cc-libc/<name>.expected-exit # exit status (default 0)
```
-Where `vendor/mes-libc/unified-libc.c` is a hand-written file that
-just `#include`s every .c in the manifest order (live-bootstrap
-catms; we use `#include` so the host preprocessor handles dedup of
-mes's per-file `#include <mes/lib.h>` etc.). The `#include "*.c"`
-pattern is the same one used by `tcc.flat.c`'s `#include "libtcc.c"`
-upstream.
-
-Wire into the Makefile so `make tcc-boot2 ARCH=$A` runs
-`boot-build-libc.sh` before the link step. The link step gains one
-line:
+cc.scm doesn't run a preprocessor, so fixtures use explicit `extern`
+decls today. (TODO: thread mes headers through `host_cc -E` like
+libc-flatten.sh does for the libc itself, then fixtures can use
+`#include <stdio.h>` etc.)
```
-cat tcc.P1pp libc.P1pp > tcc-boot2.P1pp
+make test SUITE=cc-libc ARCH=aarch64
+make test SUITE=cc-libc ARCH=aarch64 -- 05-printf-int # one fixture
```
-(libc *after* tcc — its .bss must follow tcc's data without crossing
-it).
-
-### 6. Phase A smoke tests
+Per-fixture artefacts at `build/$ARCH/cc-libc/<name>/`:
-- `tests/cc/200-libc-hello.c` — a hand-written `main()` that calls
- `printf("hi\n")` then `exit(0)`. Compile with cc.scm, link against
- libc.P1pp, run in the container, check stdout = `hi\n` and exit
- status 0.
-- `tests/cc/201-libc-malloc.c` — round-trips malloc/free/realloc.
-- `tests/cc/202-libc-stdio.c` — fopen/fwrite/fclose, then re-read
- and compare bytes.
+- `<name>.client.P1pp` — cc.scm output for the fixture
+- `<name>.P1pp` — merged (libc + client) input to boot-build-p1pp.sh
+- `<name>` — final ELF
+- `cc.log` / `prepend.log` / `p1pp.log` — captured stdout+stderr from
+ each pipeline stage; the suite handler dumps the relevant log under
+ the FAIL row when a stage exits non-zero.
-Phase A acceptance: `make tcc-boot2 ARCH=aarch64` links to a runnable
-ELF, and `tcc-boot2 -version` prints the version string under the
-per-arch container.
+When triaging a failure, the merged `.P1pp` is the artefact to grep
+for the symbol or sequence in question; cc.scm's output marks
+function regions clearly.
## Phase B — build the on-disk archives tcc-boot2 needs
@@ -261,7 +309,8 @@ Build them with tcc-boot2 itself, mirroring live-bootstrap's
### B1. libc.a from the same vendored sources
Reuse `vendor/mes-libc/unified-libc.c` from Phase A. Compile with
-tcc-boot2 (per arch). Add `scripts/boot-build-libc-archive.sh`:
+tcc-boot2 (per arch). Add `scripts/boot-build-libc-archive.sh` (boot-*
+because tcc-boot2 itself runs in the container):
```sh
TCC_BOOT2=build/$ARCH/tcc-boot2
@@ -357,16 +406,49 @@ That's tracked in [TCC.md](TCC.md), not here.
## Notes for the engineer
- Refresh LIBC.txt with `scripts/boot-undef.sh > docs/LIBC.txt` after
- the link starts working — new externals may surface that the
- current static analysis missed.
+ fixing a cc-libc bug that opens a previously-dead path. The
+ unresolved-symbol set may shift as cc.scm/libc bugs get fixed and
+ code paths that were statically dead-code-eliminated come live.
- If a mes file pulls in a header path we don't have, the right move
is almost always to copy the matching `mes/include/` header
verbatim — don't write a substitute.
- cc.scm's debug flag (`--cc-debug`, see TCC-TODO.md "Repro") prints
- per-phase heap usage. libc.flat.c is small (~30 KB after flatten)
+ per-phase heap usage. libc.flat.c is small (~52 KB after flatten)
so heap should be flat; if it isn't, that's a cc.scm bug, not a
libc bug.
- The existing `vendor/seed/` layout is `<tool>/<arch>/...`. mes-libc
is per-arch only via headers; the .c manifest is arch-agnostic.
Layout `vendor/mes-libc/{ctype,string,...}/` flat, with
`vendor/mes-libc/include/linux/<arch>/` per-arch.
+
+### cc.scm bugs surfaced by Phase A
+
+All worked around at link time (boot-libc-prepend.sh) or via patches.
+Migrate into cc.scm as that becomes the cleanest place to fix them.
+
+- **String literal padding.** cc.scm emits `'<hex>'` literals at their
+ natural byte length. If a literal's byte count isn't a multiple of 4,
+ every label that follows lands at a non-4-aligned address; aarch64
+ BLR SIGBUSes. Workaround: link-time pad to 8 bytes.
+- **Per-TU label namespacing.** cc.scm restarts `cc__str_N` at 0 per
+ TU. Linking two cc.scm outputs gives duplicate `:cc__str_0..N`;
+ hex2 silently keeps the first definition. Workaround: rename the
+ library TU's `cc__str_` to `libc__cc__str_` at link time.
+- **Library mode missing.** cc.scm always emits `# entry stub` +
+ `%fn(p1_main, 16, { %call(&main) })` + trailing `:ELF_end`. Library
+ TUs must not own those. Workaround: strip them at link time.
+- **Empty-arg-list redecl rejection.** mes headers' K&R-style `f();`
+ followed by a typed definition fails with `redecl: type mismatch`.
+ Workaround: vendored-header patches that prototype the offenders
+ (see [§Patches](#patches)).
+- **Tentative-vs-initialized merge (suspected).** `mes/globals.c`'s
+ `int __stdout;` (tentative) plus `mes/mes_open.c`'s `int __stdout
+ = STDOUT;` (initialized) seems to resolve to the tentative zero
+ rather than the initializer in a single TU. Symptom is
+ cc-libc/06-puts; under investigation.
+- **Varargs lowering (suspected).** `printf("got %d\n", 42)` pulls
+ 100 instead of 42 from `va_arg`. Symptom is cc-libc/05-printf-int;
+ not yet diagnosed.
+- **Floating-point literals.** cc.scm rejects `0.0`. Use `(double) 0`
+ or similar non-literal forms (one site in lispcc-syscall.c's
+ `strtod` stub).
diff --git a/docs/OS.md b/docs/OS.md
@@ -0,0 +1,246 @@
+# Minimal OS contract
+
+The lispcc bootstrap depends on a small, well-bounded set of OS
+capabilities. This document specifies that contract so a minimal OS
+kernel can be implemented (and verified) against it. The rest of the
+chain — `M0`, `hex2`, `cc.scm`, `tcc-boot2`, libc — assumes nothing
+beyond what's listed here.
+
+The "shell" here is scheme1 evaluating a driver `.scm` over the
+process-management and file-I/O layer in
+[`scheme1/prelude.scm`](../scheme1/prelude.scm) (see lines 493–696).
+It's not a POSIX shell: it spawns and waits, opens files, reads, and
+writes. It does **not** pipe, redirect, or `cd`. Bootstrap steps
+compose through files (`catm`-style), not pipelines.
+
+Two tiers:
+
+- **Tier 1 — toolchain.** Enough to run `cc.scm` and `tcc-boot2` and
+ to compile/link a static ELF. Eight syscalls.
+- **Tier 2 — driver.** Adds spawn-and-wait so a scheme1 driver can
+ invoke tcc-boot2 (and other compiled binaries) as subprocesses.
+ Three more syscalls.
+
+Anything past Tier 2 (threads, signals beyond default-action, mmap,
+dynamic linking, sockets, timers, locale, IEEE-754 math, pipes,
+redirection, working-directory state) is out of scope. See
+[§Out of scope](#out-of-scope) for the explicit non-list.
+
+## Targets
+
+Three architectures, identical contract. P1-64 only (LP64).
+
+| arch | platform | syscall instr | arg regs | nr reg | ret reg |
+|---------|-----------------|------------------|-----------------------------|--------|---------|
+| amd64 | Linux x86-64 | `syscall` | rdi rsi rdx r10 r8 r9 | rax | rax |
+| aarch64 | Linux ARM64 | `svc #0` | x0 x1 x2 x3 x4 x5 | x8 | x0 |
+| riscv64 | Linux RISC-V 64 | `ecall` | a0 a1 a2 a3 a4 a5 | a7 | a0 |
+
+These are the native Linux ABIs; the per-arch shims in
+`P1/P1-{aarch64,amd64,riscv64}.M1pp` (`%macro p1_syscall`, lines
+~520–930) marshal P1 registers into them. Any kernel that implements
+these three ABIs verbatim can host the chain.
+
+Syscall numbers are the standard Linux numbers for each `uname -m`
+architecture, as used by those macros (e.g. `read=63` on aarch64,
+`read=0` on amd64). A fresh-write OS is free to renumber, but only at
+the cost of also rewriting the per-arch `p1_sys_*` macros.
+
+## Process image
+
+### ELF
+
+- **ET_EXEC, static.** No `PT_INTERP`, no dynamic linker. tcc-boot2's
+ output and every host artefact are statically linked.
+- **`PT_LOAD` segments only.** Permissions from `p_flags` (R/W/X bits).
+ No `PT_GNU_STACK`, no `PT_NOTE` parsing, no `PT_TLS`.
+- **Entry at `e_entry`.** No `_start` indirection required from the
+ kernel; the loader's job is to transfer control to `e_entry` with
+ the stack laid out below and to return execution to userspace.
+- **Single arch per image.** No multi-arch fat ELFs.
+
+The `ELF.hex2` file in this repo emits exactly this shape (one
+`PT_LOAD`, `e_entry` set, no PHDR self-reference).
+
+### Stack at entry
+
+Standard Linux SysV layout. The kernel must place at the initial
+stack pointer, low to high:
+
+```
+sp + 0 argc (word)
+sp + 8 argv[0] (pointer)
+ ...
+ argv[argc-1]
+ NULL (argv terminator)
+ envp[0]
+ ...
+ NULL (envp terminator)
+ [argv/envp string bytes follow, anywhere in image]
+```
+
+`__libc_init` (`vendor/mes-libc/lispcc-syscall.c`) walks past argv's
+NULL to find `environ`. **auxv is not required** — nothing in the
+chain reads it.
+
+### Address space
+
+- **One contiguous heap, grown via `brk`.** The kernel exposes a
+ per-process program break; `sys_brk(0)` returns it, `sys_brk(addr)`
+ sets it (POSIX/Linux semantics). `linux/malloc.c` is a free-list
+ allocator on top — no `mmap` required.
+- **No shared memory, no per-thread state.** Single-threaded
+ processes only.
+- **Pages must be readable/writable/executable as their `p_flags`
+ request.** No W^X enforcement complications: tcc-boot2 doesn't JIT;
+ every page is either RX (text) or RW (data/bss/stack/heap).
+
+## Process lifecycle
+
+- **Image swap via `execve`** (Tier 2). Replaces the calling process's
+ memory map; on success, control returns at the new image's
+ `e_entry`.
+- **Spawn via `clone`** with `fork()` semantics (Tier 2): new
+ address space (no `CLONE_VM`), new fd table, parent/child return
+ distinguished by return value (0 in child, child-pid in parent).
+ The scheme1 prelude calls `(sys-clone)` with no arguments — the
+ P1pp wrapper supplies `SIGCHLD` as the only flag. The `fork()`
+ syscall itself is not required.
+- **Reap via `waitid`** (Tier 2). Only `WEXITED` (=4) is used. Job
+ control flags are not needed.
+- **Termination via `exit_group`.** Exit status is the low byte of
+ the argument. No `atexit`, no destructors.
+
+No signal-handler installation is required. Default actions
+(SIGSEGV → terminate, SIGPIPE → terminate, etc.) are sufficient. The
+chain installs zero handlers; `lispcc-syscall.c` stubs `raise` to
+ENOSYS.
+
+## Filesystem
+
+A flat, byte-addressable file abstraction with POSIX read/write
+semantics. Concretely:
+
+- Regular files have a length and an in-file byte offset per fd.
+- `O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND` flags
+ honored; no `O_NONBLOCK`, no `O_DIRECT`.
+- Mode bits on `openat(O_CREAT)`: only the user-rwx bits need
+ honoring; group/other and setuid bits can be ignored.
+- `lseek` whences: `SEEK_SET=0`, `SEEK_CUR=1`, `SEEK_END=2`.
+- `unlinkat(AT_FDCWD, path, 0)` removes a regular file.
+
+**Not required:**
+
+- `stat`, `fstat`, directory iteration, symlinks, hard links, file
+ modes beyond a usable subset, mtime, ownership.
+- A hierarchical filesystem in any rich sense; flat directory plus
+ `/` separators is enough. tcc-boot2 reads files by literal path
+ strings the build emits.
+
+The chain opens 3 fd kinds: source files (read), output files
+(write+create+trunc), and the inherited stdin/stdout/stderr (0/1/2).
+Pipes are not used at either tier.
+
+## Tier 1 — toolchain syscalls
+
+Eight calls. Wired in `P1/P1pp.P1pp:986-1055`.
+
+| name | linux nr (aa64 / amd64 / riscv64) | semantics |
+|-----------|-----------------------------------|------------------------------------------------------|
+| read | 63 / 0 / 63 | `ssize_t read(fd, buf, len)` |
+| write | 64 / 1 / 64 | `ssize_t write(fd, buf, len)` |
+| openat | 56 / 257 / 56 | called as `openat(AT_FDCWD=-100, path, flags, mode)` |
+| close | 57 / 3 / 57 | `int close(fd)` |
+| lseek | 62 / 8 / 62 | `off_t lseek(fd, off, whence)` |
+| brk | 214 / 12 / 214 | `void *brk(addr)`; `addr=0` returns current break |
+| unlinkat | 35 / 263 / 35 | called as `unlinkat(AT_FDCWD=-100, path, 0)` |
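+| exit_group| 93 / 60 / 93 | `void exit(status)`; never returns. NOTE: 93 / 60 / 93 are Linux's `exit` numbers; `exit_group` proper is 94 / 231 / 94 — confirm which call the P1pp wrapper issues |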
+
+Errors are returned as negative errno (`-EBADF`, `-ENOENT`, …) in the
+result register, per the standard Linux convention. The libc errno
+layer (`vendor/mes-libc/lispcc-syscall.c`) negates and stores into a
+single global `errno` int.
+
+Everything in `docs/LIBC.md`'s "syscall-using" column reduces to
+exactly these eight (`fopen → openat`, `fseek → lseek`, `malloc/
+realloc/free → brk`, `__assert_fail / abort / exit → exit_group`,
+etc.).
+
+## Tier 2 — driver syscalls
+
+Adds three. Per-arch macros already exist in `P1/P1-*.M1pp`. The
+scheme1 prelude's `spawn` / `run` / `wait` / `exit` are built
+directly on these (`scheme1/prelude.scm:520-537`).
+
+| name | linux nr (aa64 / amd64 / riscv64) | driver role |
+|---------|-----------------------------------|-------------------------------------------|
+| clone | 220 / 56 / 220 | spawn child; called bare (no flags arg in the prelude — kernel must accept clone-as-fork with SIGCHLD) |
+| execve | 221 / 59 / 221 | image swap; takes `(prog, argv)` — no envp arg in the prelude wrapper, so the kernel-side execve must accept a NULL/empty envp without erroring |
+| waitid | 95 / 247 / 95 | reap child; called as `waitid(P_PID=1, pid, info, WEXITED=4)` — info[8]=si_code, info[24]=si_status (`scheme1/prelude.scm:497-506`) |
+
+**Notably not required:**
+
+- `dup3` / `dup2`, `pipe` / `pipe2` — no fd plumbing between
+ processes. Children inherit stdin/stdout/stderr (0/1/2) from the
+ parent and that's the entire fd contract.
+- `chdir`, `getcwd` — no working-directory manipulation. All paths
+ the driver passes to children are absolute or relative to the
+ starting cwd.
+- `getpid`, `getppid`, `setpgid`, `tcsetpgrp` — no job control.
+
+If a future driver needs redirection (say, capturing tcc-boot2's
+stderr into a file), the right move is to grow the prelude to use
+`dup3` and add the syscall here; until then it's not in the
+contract.
+
+## Errors
+
+- **Convention:** every syscall returns either a non-negative result
+ or a negative errno value in the result register. No errno TLS
+ variable in the kernel/userspace contract — the value lives in the
+ return register.
+- **Errno numbers:** standard Linux constants (`EBADF=9`,
+ `ENOENT=2`, `EFAULT=14`, …). The libc layer maps them through
+ `strerror` lookup tables vendored from mes.
+
+## Out of scope
+
+Explicitly **not** required by the chain. Trying to implement these
+adds complexity without enabling any chain step:
+
+- **Threading.** `clone` with `CLONE_VM`/`CLONE_THREAD`, futexes,
+ TLS. The chain is single-threaded; `errno` is one int global.
+- **mmap / munmap / mprotect.** `linux/malloc.c` is brk-only.
+ Anonymous and file mmap are unused.
+- **Signals beyond default-action.** No `rt_sigaction`,
+ `rt_sigprocmask`, `rt_sigreturn`. Default termination on SIGSEGV/
+ SIGPIPE/etc. is sufficient.
+- **Dynamic linking.** No `PT_INTERP`, no `ld.so`. All binaries
+ static.
+- **IEEE-754 math.** `HAVE_FLOAT` is off through the entire libc;
+ `0.0` literals are even rejected by cc.scm. The kernel needs no
+ FPU save/restore beyond what the platform demands at context switch
+ (and we're single-process anyway, so that's moot).
+- **Sockets, pipes and other IPC primitives, timers, RNG, /proc, /sys,
+ ptrace, namespaces, cgroups.**
+- **Filesystem features:** stat-family, directory listing, symlinks,
+ hard links, mode/owner semantics beyond user-rwx, mtime,
+ cross-device rename.
+- **auxv at process entry.** Not consumed.
+- **Locale, wide chars, IDN, Unicode normalization.** Bytes are
+ bytes.
+
+## Verification
+
+A minimal-OS implementation is compliant when:
+
+1. **Tier 1 acceptance:** `make tcc-boot2 ARCH=<a>` runs to
+ completion on it (parses + assembles + links via the chain),
+ and `make test SUITE=cc-libc ARCH=<a>` passes.
+2. **Tier 2 acceptance:** a scheme1 driver (scheme1 binary + a
+ `.scm` over `prelude.scm`'s `spawn`/`run`/`wait` and file-port
+ layer) can invoke `tcc-boot2` on a `.c` source, wait for it to
+ exit, and read the resulting ELF back from disk.
+
+Both acceptance suites run end-to-end in the lispcc tree; an OS
+reaching Tier 2 needs no lispcc-side changes.
diff --git a/scripts/boot-libc-prepend.sh b/scripts/boot-libc-prepend.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+## boot-libc-prepend.sh — produce a single .P1pp suitable for
+## scripts/boot-build-p1pp.sh by stripping cc.scm's standard
+## executable tail from libc.P1pp and prepending the result to a
+## client (executable) TU.
+##
+## Per the scripts/ convention (boot-*.sh always runs in the minimal
+## container) callers are: the cc-libc suite handler in
+## boot-run-tests.sh (already in container), and the tcc-boot2 link
+## rule in the Makefile (which wraps with `$(call PODMAN,…)`). The
+## transforms are pure awk + sh — they happen to work on the host too,
+## but the convention is what's load-bearing.
+##
+## Used by both call sites so the link-time invariants (which symbols
+## collide between two cc.scm outputs, what alignment cc.scm doesn't
+## enforce, what startup wiring our :_start expects) live in exactly
+## one place.
+##
+## Transforms applied to libc.P1pp:
+## 1. Drop cc.scm's auto-emitted exec tail —
+## `# entry stub: forwards argc=a0, argv=a1 to main`
+## `%fn(p1_main, 16, { %call(&main) })`. Library TUs must not
+## define :p1_main; the executable TU is the only owner.
+## 2. Drop the trailing `:ELF_end`. ELF.hex2 sizes p_filesz from
+## the first :ELF_end it sees; only the executable TU may emit it.
+## 3. Rename internal-linkage `cc__str_N` (anonymous string literals)
+## to `libc__cc__str_N`. cc.scm restarts that counter at 0 per TU,
+## so libc and the client otherwise duplicate :cc__str_0..N. hex2
+## resolves duplicates first-def-wins and silently mis-binds refs.
+## 4. Pad each `'<hex>'` literal up to an 8-byte boundary. cc.scm
+## emits strings at their natural length; if a string's byte
+## count is not a multiple of 4, every label that follows lands
+## at a non-4-aligned address and aarch64 BLR SIGBUSes.
+##
+## Then we append a fixed :p1_main wrapper that calls __libc_init
+## (lispcc-syscall.c — populates `environ` from argv's NULL terminator)
+## before forwarding to the client's :main.
+##
+## Same `# entry stub` strip is applied to the client TU so its own
+## auto-:p1_main doesn't collide with our wrapper.
+##
+## Usage: boot-libc-prepend.sh <libc.P1pp> <client.P1pp> <out.P1pp>
+
+# Abort on the first failing command and on any unset variable.
+set -eu
+[ "$#" -eq 3 ] || {
+ echo "usage: $0 <libc.P1pp> <client.P1pp> <out.P1pp>" >&2; exit 2
+}
+
+LIBC=$1
+CLIENT=$2
+OUT=$3
+
+[ -r "$LIBC" ] || { echo "missing $LIBC" >&2; exit 1; }
+[ -r "$CLIENT" ] || { echo "missing $CLIENT" >&2; exit 1; }
+
+# Intermediates live next to the output under fixed names, so two runs
+# that target the same directory share (and overwrite) these files.
+WORK=$(dirname "$OUT")
+mkdir -p "$WORK"
+
+LIBC_LIB=$WORK/libc.lib.P1pp
+CLIENT_NOM=$WORK/client.no-p1main.P1pp
+
+# Library pass over libc.P1pp. Rules run in this order on every line:
+#   - the `# entry stub` comment line sets `skip` and is dropped;
+#   - while `skip` is set, lines starting with `%fn(p1_main,` or
+#     `%call(&main)` are dropped, and a line that is exactly `})` is
+#     dropped and clears `skip`;
+#   - a line that is only `:ELF_end` (plus trailing blanks) is dropped;
+#   - every remaining line has `cc__str_` rewritten to `libc__cc__str_`;
+#   - a line that is only a quoted hex literal is printed, then followed
+#     by a separate quoted literal of zero bytes that brings its byte
+#     count (hex digits / 2) up to a multiple of 8;
+#   - anything else is printed unchanged.
+# NOTE(review): the strip patterns are anchored at column 0 and `skip`
+# is cleared only by a line that is exactly `})`. If cc.scm ever emits
+# the stub on a single line, `skip` stays set until end of file —
+# confirm against real cc.scm output.
+awk '
+ /^# entry stub: forwards argc=a0, argv=a1 to main$/ { skip=1; next }
+ skip && /^%fn\(p1_main,/ { next }
+ skip && /^%call\(&main\)/ { next }
+ skip && /^}\)$/ { skip=0; next }
+ /^:ELF_end[ \t]*$/ { next }
+ { gsub(/cc__str_/, "libc__cc__str_") }
+ /^'\''[0-9A-Fa-f]+'\''$/ {
+ print
+ s = $0; gsub(/'\''/, "", s)
+ bytes = length(s) / 2
+ pad = (8 - bytes % 8) % 8
+ if (pad > 0) {
+ pstr = ""
+ for (i = 0; i < pad; i++) pstr = pstr "00"
+ printf "'\''%s'\''\n", pstr
+ }
+ next
+ }
+ { print }
+' "$LIBC" > "$LIBC_LIB"
+
+# Append the replacement :p1_main to the library half: it calls
+# __libc_init first and then the client's main.
+printf '%s\n' \
+ '%fn(p1_main, 16, {' \
+ ' %call(&__libc_init)' \
+ ' %call(&main)' \
+ '})' >> "$LIBC_LIB"
+
+# Client pass: only the entry-stub strip (same four rules as above); the
+# client keeps its own cc__str_ labels, its :ELF_end and its unpadded
+# literals.
+awk '
+ /^# entry stub: forwards argc=a0, argv=a1 to main$/ { skip=1; next }
+ skip && /^%fn\(p1_main,/ { next }
+ skip && /^%call\(&main\)/ { next }
+ skip && /^}\)$/ { skip=0; next }
+ { print }
+' "$CLIENT" > "$CLIENT_NOM"
+
+# Library text first, client second, so the client's :ELF_end is the
+# only one left and comes last.
+cat "$LIBC_LIB" "$CLIENT_NOM" > "$OUT"
diff --git a/scripts/boot-run-tests.sh b/scripts/boot-run-tests.sh
@@ -14,7 +14,7 @@
## host preflights lint and passes the explicit kept list down.
##
## Env: ARCH=aarch64|amd64|riscv64
-## Usage: boot-run-tests.sh --suite=<m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc> [name ...]
+## Usage: boot-run-tests.sh --suite=<m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc|cc-libc> [name ...]
set -eu
@@ -35,7 +35,7 @@ while [ "$#" -gt 0 ]; do
done
case "$SUITE" in
- m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc) ;;
+ m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc|cc-libc) ;;
"") echo "$0: --suite required" >&2; exit 2 ;;
*) echo "$0: unknown suite '$SUITE'" >&2; exit 2 ;;
esac
@@ -60,6 +60,19 @@ show_diff() {
printf '%s\n' "$actual" | sed 's/^/ /'
}
+# fail <label> [<heading>] [<log-file>]
+# Emit a FAIL row plus an optional indented heading and indented log
+# contents. Lets every suite handle a failed sub-step without re-running
+# the failing command to capture its stderr.
+fail() {
+ label=$1
+ report "$label" FAIL
+ [ -n "${2:-}" ] && echo " $2" || :
+ if [ -n "${3:-}" ] && [ -e "${3:-}" ]; then
+ sed 's/^/ /' "$3" >&2 || true
+ fi
+}
+
## --- m1pp suite ---------------------------------------------------------
run_m1pp_suite() {
@@ -92,10 +105,11 @@ run_m1pp_suite() {
fi
elif [ -e "$m1_src" ]; then
bin=build/$ARCH/m1pp-tests/$name
- if ! sh scripts/boot-build-p1.sh "$m1_src" "$bin" >/dev/null 2>&1; then
- report "$label" FAIL
- sh scripts/boot-build-p1.sh "$m1_src" "$bin" 2>&1 \
- | sed 's/^/ /' >&2 || true
+ log=$bin.build.log
+ mkdir -p "$(dirname "$bin")"
+ if ! sh scripts/boot-build-p1.sh "$m1_src" "$bin" \
+ >"$log" 2>&1; then
+ fail "$label" "" "$log"
continue
fi
actual=$("./$bin" 2>&1 || true)
@@ -128,10 +142,11 @@ run_p1_suite() {
label="[$ARCH] $name"
bin=build/$ARCH/p1-tests/$name
- if ! sh scripts/boot-build-p1pp.sh "$fixture" "$bin" >/dev/null 2>&1; then
- report "$label" FAIL
- sh scripts/boot-build-p1pp.sh "$fixture" "$bin" 2>&1 \
- | sed 's/^/ /' >&2 || true
+ log=$bin.build.log
+ mkdir -p "$(dirname "$bin")"
+ if ! sh scripts/boot-build-p1pp.sh "$fixture" "$bin" \
+ >"$log" 2>&1; then
+ fail "$label" "" "$log"
continue
fi
actual=$("./$bin" 2>&1 || true)
@@ -340,18 +355,17 @@ _cc_runtime_suite() {
exec build/$ARCH/scheme1 /tmp/cc-test.scm
"
fi
- if ! sh -c "$cmd" >"$p1pp" 2>/dev/null; then
- report "[$ARCH] $suite/$name" FAIL
- echo " cg emission failed:"
- sh -c "$cmd" 2>&1 >/dev/null | sed 's/^/ /' >&2 || true
+ label="[$ARCH] $suite/$name"
+ cg_log=$outdir/cg.log
+ if ! sh -c "$cmd" >"$p1pp" 2>"$cg_log"; then
+ fail "$label" "cg emission failed:" "$cg_log"
continue
fi
- if ! sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" >/dev/null 2>&1; then
- report "[$ARCH] $suite/$name" FAIL
- echo " P1pp assemble failed:"
- sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" 2>&1 \
- | sed 's/^/ /' >&2 || true
+ p1pp_log=$outdir/p1pp.log
+ if ! sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" \
+ >"$p1pp_log" 2>&1; then
+ fail "$label" "P1pp assemble failed:" "$p1pp_log"
continue
fi
@@ -389,19 +403,84 @@ run_cc_suite() {
outdir=build/$ARCH/cc/$name
p1pp=$outdir/$name.P1pp
elf=$outdir/$name
+ label="[$ARCH] cc/$name"
mkdir -p "$outdir"
+
+ cc_log=$outdir/cc.log
if ! build/$ARCH/scheme1 build/$ARCH/cc/cc.scm "$src" "$p1pp" \
- >/dev/null 2>&1; then
- report "[$ARCH] cc/$name" FAIL
- echo " cc compile failed"
+ >"$cc_log" 2>&1; then
+ fail "$label" "cc compile failed:" "$cc_log"
+ continue
+ fi
+
+ p1pp_log=$outdir/p1pp.log
+ if ! sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" \
+ >"$p1pp_log" 2>&1; then
+ fail "$label" "P1pp assemble failed:" "$p1pp_log"
+ continue
+ fi
+
+ tmp=$(mktemp)
+ if "./$elf" >"$tmp" 2>&1; then
+ act_exit=0
+ else
+ act_exit=$?
+ fi
+ act_out=$(cat "$tmp"); rm -f "$tmp"
+ _cc_check "$label" "$expout" "$expexit" "$act_out" "$act_exit"
+ done
+}
+
+## --- cc-libc suite ------------------------------------------------------
+##
+## Mirrors run_cc_suite but links the prepended libc.P1pp into every
+## fixture. Targeted red-green TDD on the cc.scm + libc combination —
+## each .c is small (one feature: printf with %d, malloc round-trip,
+## getenv lookup, …) so a failure isolates the bug to one symbol path.
+run_cc_libc_suite() {
+ [ -n "$NAMES" ] || NAMES=$(discover tests/cc-libc c)
+ for name in $NAMES; do
+ src=tests/cc-libc/$name.c
+ [ -e "$src" ] || { echo " SKIP $name (no .c)"; continue; }
+ if [ -e tests/cc-libc/$name.expected ]; then
+ expout=$(cat tests/cc-libc/$name.expected)
+ else
+ expout=
+ fi
+ if [ -e tests/cc-libc/$name.expected-exit ]; then
+ expexit=$(cat tests/cc-libc/$name.expected-exit)
+ else
+ expexit=0
+ fi
+ outdir=build/$ARCH/cc-libc/$name
+ client_p1pp=$outdir/$name.client.P1pp
+ merged_p1pp=$outdir/$name.P1pp
+ elf=$outdir/$name
+ label="[$ARCH] cc-libc/$name"
+ mkdir -p "$outdir"
+
+ cc_log=$outdir/cc.log
+ if ! build/$ARCH/scheme1 build/$ARCH/cc/cc.scm "$src" "$client_p1pp" \
+ >"$cc_log" 2>&1; then
+ fail "$label" "cc compile failed:" "$cc_log"
continue
fi
- if ! sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" >/dev/null 2>&1; then
- report "[$ARCH] cc/$name" FAIL
- sh scripts/boot-build-p1pp.sh "$p1pp" "$elf" 2>&1 \
- | sed 's/^/ /' >&2 || true
+
+ prepend_log=$outdir/prepend.log
+ if ! sh scripts/boot-libc-prepend.sh \
+ build/$ARCH/libc.P1pp "$client_p1pp" "$merged_p1pp" \
+ >"$prepend_log" 2>&1; then
+ fail "$label" "libc-prepend failed:" "$prepend_log"
continue
fi
+
+ p1pp_log=$outdir/p1pp.log
+ if ! sh scripts/boot-build-p1pp.sh "$merged_p1pp" "$elf" \
+ >"$p1pp_log" 2>&1; then
+ fail "$label" "P1pp assemble failed:" "$p1pp_log"
+ continue
+ fi
+
tmp=$(mktemp)
if "./$elf" >"$tmp" 2>&1; then
act_exit=0
@@ -409,7 +488,7 @@ run_cc_suite() {
act_exit=$?
fi
act_out=$(cat "$tmp"); rm -f "$tmp"
- _cc_check "[$ARCH] cc/$name" "$expout" "$expexit" "$act_out" "$act_exit"
+ _cc_check "$label" "$expout" "$expexit" "$act_out" "$act_exit"
done
}
@@ -422,4 +501,5 @@ case "$SUITE" in
cc-pp) run_cc_pp_suite ;;
cc-cg) run_cc_cg_suite ;;
cc) run_cc_suite ;;
+ cc-libc) run_cc_libc_suite ;;
esac
diff --git a/scripts/libc-flatten.sh b/scripts/libc-flatten.sh
@@ -0,0 +1,141 @@
+#!/bin/sh
+## libc-flatten.sh — flatten the vendored mes-libc + lispcc-syscall.c
+## into a single libc.flat.c using the host preprocessor. Mirrors
+## stage1-flatten.sh; runs on the host, no container — hence the
+## non-`boot-` name (the convention in scripts/ is that boot-*.sh
+## runs inside the minimal container).
+##
+## Steps:
+## 1. stage vendor/mes-libc → build/cc-bootstrap/<arch>/libc-stage/
+## 2. apply simple-patches (literal-block replacement, idempotent)
+## 3. HOST_CC -E -nostdinc -I staging/include … staging/unified-libc.c
+## → build/cc-bootstrap/<arch>/libc.flat.c
+##
+## Stage 4 (cc.scm libc.flat.c → libc.P1pp) is a separate Makefile rule
+## that reuses scripts/boot-build-cc.sh inside the per-arch container.
+##
+## ARCH selects the lispcc target (aarch64/amd64/riscv64). MES_ARCH is
+## the mes header tree we hand the host preprocessor; mes ships
+## x86_64/riscv64 only, so aarch64 builds borrow riscv64's headers (the
+## resulting libc.flat.c references no SYS_* / kernel-stat fields, so
+## the choice only affects type widths, all 64-bit Linux-identical).
+##
+## Usage: scripts/libc-flatten.sh [--arch <aarch64|amd64|riscv64>]
+
+# Abort on the first failing command and on any unset variable.
+set -eu
+
+# Target used when --arch is not given.
+ARCH=aarch64
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --arch)
+            # Check explicitly: under `set -u` a trailing --arch with no
+            # value would otherwise die on the unset $2 with a
+            # shell-internal message instead of a usage error.
+            [ $# -ge 2 ] || { echo "--arch requires a value" >&2; exit 2; }
+            ARCH=$2; shift 2 ;;
+        # Help text is this file's own `##` header comment.
+        -h|--help) sed -n 's/^## \{0,1\}//p' "$0"; exit 0 ;;
+        *) echo "unknown arg: $1" >&2; exit 2 ;;
+    esac
+done
+
+# Map the lispcc target onto the mes header tree handed to the host
+# preprocessor; aarch64 reuses the riscv64 tree.
+case "$ARCH" in
+    aarch64) MES_ARCH=riscv64 ;;
+    amd64) MES_ARCH=x86_64 ;;
+    riscv64) MES_ARCH=riscv64 ;;
+    *) echo "unknown ARCH: $ARCH" >&2; exit 2 ;;
+esac
+
+# All paths are absolute, derived from this script's location, so the
+# script works from any cwd.
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+VENDOR=$ROOT/vendor/mes-libc
+WORK=$ROOT/build/cc-bootstrap/$ARCH
+STAGE=$WORK/libc-stage
+FLAT=$WORK/libc.flat.c
+
+# Fail early, naming the missing directory, before touching build/.
+[ -d "$VENDOR" ] || { echo "missing $VENDOR" >&2; exit 1; }
+[ -d "$VENDOR/include" ] || { echo "missing $VENDOR/include" >&2; exit 1; }
+[ -d "$VENDOR/include/linux/$MES_ARCH" ] \
+ || { echo "missing $VENDOR/include/linux/$MES_ARCH" >&2; exit 1; }
+
+# --- (1) stage --------------------------------------------------------
+mkdir -p "$WORK"
+rm -rf "$STAGE"
+mkdir -p "$STAGE"
+# Staging is our writable scratch: it is wiped and re-copied on every
+# run, so the patches below always start from the vendored sources.
+# NOTE(review): POSIX leaves symlink handling of a plain `cp -R`
+# unspecified, and GNU/BSD cp copy links as links rather than as files;
+# add -L if staging must hold regular files only — confirm.
+cp -R "$VENDOR/." "$STAGE/"
+
+# mes's sys/stat.h, signal.h, dirent.h reach for <arch/kernel-stat.h>
+# and similar; the per-arch tree under include/linux/<MES_ARCH>/ is what
+# they want. Drop a sibling include/arch -> include/linux/<MES_ARCH>
+# symlink so the unprefixed `arch/...` includes resolve.
+ln -sfn "linux/$MES_ARCH" "$STAGE/include/arch"
+
+# --- (2) patches ------------------------------------------------------
+# Same literal-block replacer as stage1-flatten.sh apply_simple_patch.
+apply_simple_patch() {
+ target=$1; before=$2; after=$3
+ [ -r "$target" ] || { echo "patch target missing: $target" >&2; exit 1; }
+ [ -r "$before" ] || { echo "patch before missing: $before" >&2; exit 1; }
+ [ -r "$after" ] || { echo "patch after missing: $after" >&2; exit 1; }
+ awk -v BFILE="$before" -v AFILE="$after" '
+ BEGIN {
+ while ((getline line < BFILE) > 0) bef = bef line "\n";
+ close(BFILE);
+ while ((getline line < AFILE) > 0) aft = aft line "\n";
+ close(AFILE);
+ }
+ { src = src $0 "\n" }
+ END {
+ if (index(src, aft) > 0) {
+ printf "%s", src;
+ exit 0;
+ }
+ i = index(src, bef);
+ if (i == 0) { print "patch did not match" > "/dev/stderr"; exit 1 }
+ printf "%s%s%s",
+ substr(src, 1, i - 1),
+ aft,
+ substr(src, i + length(bef));
+ }
+ ' "$target" > "$target.new"
+ mv "$target.new" "$target"
+}
+
+PATCHES=$STAGE/patches
+apply_simple_patch \
+ "$STAGE/linux/malloc.c" \
+ "$PATCHES/malloc-max-align.before" \
+ "$PATCHES/malloc-max-align.after"
+apply_simple_patch \
+ "$STAGE/string/strstr.c" \
+ "$PATCHES/strstr-drop-mman.before" \
+ "$PATCHES/strstr-drop-mman.after"
+apply_simple_patch \
+ "$STAGE/include/mes/lib-mini.h" \
+ "$PATCHES/libmini-write-proto.before" \
+ "$PATCHES/libmini-write-proto.after"
+apply_simple_patch \
+ "$STAGE/include/mes/lib-mini.h" \
+ "$PATCHES/libmini-write-proto2.before" \
+ "$PATCHES/libmini-write-proto2.after"
+apply_simple_patch \
+ "$STAGE/include/mes/lib.h" \
+ "$PATCHES/lib-mes-debug-proto.before" \
+ "$PATCHES/lib-mes-debug-proto.after"
+
+# --- (3) flatten via host preprocessor --------------------------------
+HOST_CC=${HOST_CC:-cc}
+
+# -I order matters: include first so <signal.h>, <stdio.h> etc. hit
+# the canonical mes/include versions; arch/<…> resolves through the
+# include/arch symlink to include/linux/$MES_ARCH. Putting the per-arch
+# directory ahead of include/ makes <signal.h> resolve to the partial
+# arch-specific snippet (no stack_t typedef etc) and the build breaks.
+#
+# Preprocess into a scratch file and rename only on success. Redirecting
+# straight into $FLAT would leave a truncated libc.flat.c with a fresh
+# mtime when the preprocessor fails, and make would then treat that
+# broken file as up to date on the next run.
+rm -f "$FLAT.tmp"
+"$HOST_CC" -E -P \
+    -nostdinc \
+    -I "$STAGE/include" \
+    -I "$STAGE" \
+    -D HAVE_CONFIG_H=0 \
+    -D __linux__=1 \
+    -D __${MES_ARCH}__=1 \
+    -D __riscv_xlen=64 \
+    -D inline= \
+    "$STAGE/unified-libc.c" > "$FLAT.tmp"
+mv -f "$FLAT.tmp" "$FLAT"
+
+# One-line summary on stdout.
+LINES=$(wc -l < "$FLAT")
+BYTES=$(wc -c < "$FLAT")
+echo "produced $FLAT ($LINES lines, $BYTES bytes)"
diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh
@@ -50,8 +50,8 @@ while [ "$#" -gt 0 ]; do
done
case "$SUITE" in
- m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc) ;;
- "") echo "$0: --suite required (m1pp | p1 | scheme1 | cc-util | cc-lex | cc-pp | cc-cg | cc)" >&2; exit 2 ;;
+ m1pp|p1|scheme1|cc-util|cc-lex|cc-pp|cc-cg|cc|cc-libc) ;;
+ "") echo "$0: --suite required (m1pp | p1 | scheme1 | cc-util | cc-lex | cc-pp | cc-cg | cc | cc-libc)" >&2; exit 2 ;;
*) echo "$0: unknown suite '$SUITE'" >&2; exit 2 ;;
esac
diff --git a/tests/cc-libc/00-exit.c b/tests/cc-libc/00-exit.c
@@ -0,0 +1,8 @@
+/* Sanity: bare main returning a value. Doesn't touch libc — confirms
+ * the cc-libc harness (cc.scm + boot-libc-prepend + boot-build-p1pp)
+ * produces a runnable ELF when nothing in libc is exercised. */
+int
+main (void)
+{
+ /* 7 is the value stored in 00-exit.expected-exit. */
+ return 7;
+}
diff --git a/tests/cc-libc/00-exit.expected-exit b/tests/cc-libc/00-exit.expected-exit
@@ -0,0 +1 @@
+7
diff --git a/tests/cc-libc/01-write-syscall.c b/tests/cc-libc/01-write-syscall.c
@@ -0,0 +1,10 @@
+/* Confirms our P1pp sys_write -> _write path works end-to-end through
+ * lispcc-syscall.c. No buffering, no FILE struct, no varargs. */
+extern long sys_write (long fd, long buf, long len);
+
+int
+main (void)
+{
+ /* sys_write is declared above with all-long parameters, hence the
+  * cast of the literal's address; 6 bytes = "hello" plus newline. */
+ sys_write (1, (long) "hello\n", 6);
+ return 0;
+}
diff --git a/tests/cc-libc/01-write-syscall.expected b/tests/cc-libc/01-write-syscall.expected
@@ -0,0 +1 @@
+hello
diff --git a/tests/cc-libc/02-write-libc.c b/tests/cc-libc/02-write-libc.c
@@ -0,0 +1,16 @@
+/* posix/write.c -> lispcc-syscall.c::_write -> P1pp sys_write.
+ * Adds the public `write(int, const void *, size_t)` layer with errno
+ * handling on top of 01-write-syscall. cc.scm rejects #include
+ * (file inclusion is upstream of cc.scm via host -E pre-flatten); use
+ * extern decls directly. */
+typedef unsigned long size_t;
+typedef long ssize_t;
+
+extern ssize_t write (int fd, void const *buffer, size_t size);
+
+int
+main (void)
+{
+ /* 6 bytes = "hello" plus newline; the terminating NUL is not sent. */
+ write (1, "hello\n", 6);
+ return 0;
+}
diff --git a/tests/cc-libc/02-write-libc.expected b/tests/cc-libc/02-write-libc.expected
@@ -0,0 +1 @@
+hello
diff --git a/tests/cc-libc/03-fputs-stdout.c b/tests/cc-libc/03-fputs-stdout.c
@@ -0,0 +1,14 @@
+/* fputs(s, stdout) — canonical libc usage. fputs casts stream to long
+ * (the fd) and forwards to write. stdout is a global (FILE*)1 that
+ * lispcc-syscall.c provides as a real symbol (mes's #define is undef'd
+ * post-include). */
+typedef long FILE;
+extern FILE *stdout;
+extern int fputs (char const *s, FILE *stream);
+
+int
+main (void)
+{
+ /* Uses the extern `stdout` object declared above, not a macro. */
+ fputs ("hi\n", stdout);
+ return 0;
+}
diff --git a/tests/cc-libc/03-fputs-stdout.expected b/tests/cc-libc/03-fputs-stdout.expected
@@ -0,0 +1 @@
+hi
diff --git a/tests/cc-libc/04-printf-literal.c b/tests/cc-libc/04-printf-literal.c
@@ -0,0 +1,11 @@
+/* printf with no format specifiers — exercises stdio/printf -> vfprintf
+ * but does not touch va_arg. If this passes and 05-printf-int fails,
+ * the bug is in cc.scm's varargs lowering, not the printf core. */
+extern int printf (char const *fmt, ...);
+
+int
+main (void)
+{
+ /* Format only: no variadic argument is passed. */
+ printf ("plain literal\n");
+ return 0;
+}
diff --git a/tests/cc-libc/04-printf-literal.expected b/tests/cc-libc/04-printf-literal.expected
@@ -0,0 +1 @@
+plain literal
diff --git a/tests/cc-libc/05-printf-int.c b/tests/cc-libc/05-printf-int.c
@@ -0,0 +1,11 @@
+/* printf with %d — exercises va_arg int-pull. With our hand-written
+ * tcc-boot2 smoke, this prints "got " then segfaults; this fixture
+ * pins the failure for red-green TDD on cc.scm+libc varargs. */
+extern int printf (char const *fmt, ...);
+
+int
+main (void)
+{
+ /* One int variadic argument; 05-printf-int.expected holds "got 42". */
+ printf ("got %d\n", 42);
+ return 0;
+}
diff --git a/tests/cc-libc/05-printf-int.expected b/tests/cc-libc/05-printf-int.expected
@@ -0,0 +1 @@
+got 42
diff --git a/tests/cc-libc/06-puts.c b/tests/cc-libc/06-puts.c
@@ -0,0 +1,11 @@
+/* puts() — newline-appending fputs. Mes implements via fputs+'\n' on
+ * stdout; verifies the same __stdout-init invariant as 03 and adds
+ * stdio's auto-newline. */
+extern int puts (char const *s);
+
+int
+main (void)
+{
+ /* The literal carries no newline; puts is expected to append one. */
+ puts ("ok");
+ return 0;
+}
diff --git a/tests/cc-libc/06-puts.expected b/tests/cc-libc/06-puts.expected
@@ -0,0 +1 @@
+ok
diff --git a/tests/cc-libc/07-malloc-roundtrip.c b/tests/cc-libc/07-malloc-roundtrip.c
@@ -0,0 +1,21 @@
+/* Allocator smoke: malloc -> brk syscall (via lispcc-syscall.c::brk).
+ * Writes a sentinel through the returned pointer and reads it back so
+ * a successful malloc that returned a bad address is still caught. */
+typedef unsigned long size_t;
+
+extern void *malloc (size_t size);
+extern long sys_write (long fd, long buf, long len);
+
+int
+main (void)
+{
+ int *p = (int *) malloc (4);
+ if (!p)
+ {
+ /* 12 bytes = "malloc-null" plus newline. */
+ sys_write (1, (long) "malloc-null\n", 12);
+ return 1;
+ }
+ *p = 0x000a4b4f; /* "OK\n\0" little-endian */
+ /* Only the first 3 of the 4 stored bytes are written: 'O' 'K' '\n'. */
+ sys_write (1, (long) p, 3);
+ return 0;
+}
diff --git a/tests/cc-libc/07-malloc-roundtrip.expected b/tests/cc-libc/07-malloc-roundtrip.expected
@@ -0,0 +1 @@
+OK