boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit dd522a0494571c5ac9875945e924d23e8d2fb593
parent d1195e5542db523f480175d9e965855cd9dd62e6
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 28 Apr 2026 15:35:42 -0700

cc: stream static array initializers past asm_instrs scratch wall

%parse-init-array-list now parses each element as a unit inside a
heap-mark/rewind window: the element's pieces and the parser/pp/lex
lookahead get deep-copied into main, then scratch is rewound to the
pre-element mark. The outer pieces accumulator and the rbrace-pad
piece are built directly on main via small mode-switch helpers
(%init-main-cons, %init-main-reverse, %init-main-prepend-reversed,
%init-main-pad-piece). The trailing inter-element comma is consumed
inside the same mark/rewind window so its lookahead doesn't leak.

Selected by initializer shape (file-scope/static positional
aggregate), not by symbol name. Designated struct fields still go
through %parse-init-struct-list + %merge-init-entries unchanged;
local autos go through parse-init-local-aggregate and are untouched.

tcc.flat.c now parses past the 333-row asm_instrs[] table at lines
14527-14861 and stops at the next documented blocker, the
options_W[] offsetof const-expr at line 18026.

Diffstat:
Mcc/cc.scm | 106++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Mdocs/TCC-TODO.md | 171++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
2 files changed, 203 insertions(+), 74 deletions(-)

diff --git a/cc/cc.scm b/cc/cc.scm @@ -5135,6 +5135,52 @@ (define (%pad-piece nbytes) (make-bytevector nbytes 0)) +;; Static aggregate init streaming support. parse-translation-unit runs +;; declarations in scratch, but a large file-scope array initializer can +;; contain hundreds of independent elements. Parse one array element +;; past a heap mark, promote the element pieces and parser lookahead into +;; main, rewind that element's transient lexer/pp/const-expr scratch, and +;; keep the outer pieces list in main as well. +(define (%init-promote-unit ps thunk) + (let ((mark (heap-mark))) + (let ((scratch-result (thunk))) + (use-main-heap!) + (let ((ctx (make-deep-copy-context))) + (promote-roots! (ps-world ps) ctx) + (promote-iter-buffers! (ps-iter ps) ctx) + (let ((main-result (deep-copy ctx scratch-result))) + (use-scratch-heap!) + (heap-rewind! mark) + main-result))))) + +(define (%init-main-cons x xs) + (use-main-heap!) + (let ((r (cons x xs))) + (use-scratch-heap!) + r)) + +(define (%init-main-reverse xs) + (use-main-heap!) + (let ((r (reverse xs))) + (use-scratch-heap!) + r)) + +(define (%init-main-prepend-reversed xs acc) + (use-main-heap!) + (let loop ((ys xs) (out acc)) + (cond + ((null? ys) + (use-scratch-heap!) + out) + (else + (loop (cdr ys) (cons (car ys) out)))))) + +(define (%init-main-pad-piece nbytes) + (use-main-heap!) + (let ((p (%pad-piece nbytes))) + (use-scratch-heap!) + p)) + ;; ----- Global initializers --------------------------------------------- ;; Returns (values pieces final-ty). 
For inferred-length array `ty`, ;; final-ty is a freshly-built array ctype with the resolved length; @@ -5199,34 +5245,45 @@ (let* ((final (cond ((< decl 0) count) (else decl))) (pad (- final count))) (values - (cond + (cond ((> pad 0) - (reverse (cons (%pad-piece (* pad esize)) acc))) - (else (reverse acc))) + (%init-main-reverse + (%init-main-cons (%init-main-pad-piece (* pad esize)) acc))) + (else (%init-main-reverse acc))) count))) (else (let ((piece - (cond - ((at-punct? ps 'lbrace) - ;; Nested aggregate: brace-flatten via recursion. - (advance ps) - ;; element is itself struct/array - (cond - ((eq? (ctype-kind elem) 'arr) - (let-values (((p _c) (%parse-init-array-list ps elem))) - p)) - ((or (eq? (ctype-kind elem) 'struct) - (eq? (ctype-kind elem) 'union)) - (%parse-init-struct-list ps elem)) - (else - (let ((p (%const-init-piece ps elem))) - (cond ((at-punct? ps 'comma) (advance ps))) - (expect-punct ps 'rbrace) - (list p))))) - (else - (list (%const-init-piece ps elem)))))) - (cond ((at-punct? ps 'comma) (advance ps))) - (lp (append (reverse piece) acc) (+ count 1)))))))) + (%init-promote-unit + ps + (lambda () + ;; The trailing inter-element comma must be consumed + ;; *inside* the mark/rewind window: advance loads pp/lex + ;; lookahead into iter buffers, which promote-iter-buffers! + ;; then deep-copies into main. Consuming it after the + ;; rewind would leave that lookahead leaking on scratch. + (let ((p + (cond + ((at-punct? ps 'lbrace) + ;; Nested aggregate: brace-flatten via recursion. + (advance ps) + ;; element is itself struct/array + (cond + ((eq? (ctype-kind elem) 'arr) + (let-values (((p _c) (%parse-init-array-list ps elem))) + p)) + ((or (eq? (ctype-kind elem) 'struct) + (eq? (ctype-kind elem) 'union)) + (%parse-init-struct-list ps elem)) + (else + (let ((p (%const-init-piece ps elem))) + (cond ((at-punct? ps 'comma) (advance ps))) + (expect-punct ps 'rbrace) + (list p))))) + (else + (list (%const-init-piece ps elem)))))) + (cond ((at-punct? 
ps 'comma) (advance ps))) + p))))) + (lp (%init-main-prepend-reversed piece acc) (+ count 1)))))))) (define (%piece-bytesize p) ;; Output width of one piece (cf. %cg-init-piece->bv): a bv emits @@ -6427,4 +6484,3 @@ "out-bytes" (bytevector-length out)) (%cc-write out-path out)) 0)))) - diff --git a/docs/TCC-TODO.md b/docs/TCC-TODO.md @@ -37,21 +37,70 @@ head -c 50000 build/cc-bootstrap/X86_64/tcc.flat.c \ # then re-run the podman invocation against tcc.head.c ``` -## Blocker — scratch exhaustion on `asm_instrs[]` (line 14527) +## Blocker — offsetof-style const expr in `options_W[]` (line 18026) -After fixing the four blockers below, parse advances to the -`static const ASMInstr asm_instrs[] = { … };` table at line 14527 -(~333 entries spanning lines 14528–14860) and aborts with -`scheme1: scratch exhausted` partway through. Per-element scratch -growth measured (heap-mark deltas): +Current run gets past `asm_instrs[]` and stops in the option flag +tables: + +```c +static const FlagDef options_W[] = { + { 0, 0, "all" }, + { ((size_t) &((TCCState *)0)->warn_unsupported), 0, "unsupported" }, + { ((size_t) &((TCCState *)0)->warn_write_strings), 0, "write-strings" }, + ... +}; +``` + +Diagnostic: ``` -elem 16 → scratch 287 503 672 -elem 176 → scratch 400 656 488 -delta 113 152 816 over 160 elements ≈ 707 KB / elem +build/cc-bootstrap/X86_64/tcc.flat.c:18026:17: error: const-expr: bad operand: amp ``` -Each row writes ~12 bytes of static data but consumes ~700 KB of +This is the classic `offsetof` idiom: take the address of a member +through a null pointer, cast it to `size_t`, and use the resulting +field offset as a static integer initializer. + +The present `parse-const-expr` handles integer arithmetic, casts to +integer types, enum constants, and `sizeof`, but not address-of, +pointer casts, member access, or `->` in unevaluated/address contexts. 
+ +Likely narrow fix shape: + +- allow const-expr casts to pointer types when the operand is an + integer constant, at least for `(T *)0` +- add a const-expression path for unary `&` over an lvalue expression + made of null pointer cast + `.` / `->` field selection +- compute the member offset from the ctype layout and return it as an + integer constant of the enclosing cast type +- keep this const-only; do not make general pointer values acceptable + as arbitrary integer constants + +This should cover the `options_W[]` / `options_f[]` tables without +expanding static initializer semantics beyond the tcc bootstrap need. + +## Resolved — scratch pressure on `asm_instrs[]` (line 14527) + +Done. `parse-init-global` still returns a pieces list to +`cg-emit-global`, but `%parse-init-array-list` now parses each array +element as a unit, promotes that unit's pieces plus parser/pp/lex +lookahead into main storage, rewinds scratch to a pre-unit mark, and +continues with the outer accumulator in main. This is selected by +initializer shape (file-scope/static positional aggregate), not by the +symbol name. + +Before the fix, parse reached the `static const ASMInstr asm_instrs[] = +{ ... };` table at line 14527 (~333 entries spanning lines +14528-14860) and aborted with `scheme1: scratch exhausted` partway +through. Per-element scratch growth measured then: + +``` +elem 16 -> scratch 287 503 672 +elem 176 -> scratch 400 656 488 +delta 113 152 816 over 160 elements ~= 707 KB / elem +``` + +Each row writes ~12 bytes of static data but consumed ~700 KB of scratch to do it. tcc.c entries are dense: ```c @@ -64,30 +113,17 @@ scratch to do it. tcc.c entries are dense: 0, { 0 } }, ``` -The whole array sits inside one parse-decl-or-fn boundary, so scratch -never resets between rows; per-element parse-const-expr / lex / pp / -init-piece allocations all pile up. 128 MiB scratch cap covers ~176 -rows out of 333. 
- -**Likely fix shape — streaming init emission.** Currently -`parse-init-global` builds a flat `pieces` list across the whole -top-level array (with each row's struct-init merged into a small -sub-list) and `cg-emit-global` walks it once at the end. For an -array-of-struct init with no designators (which asm_instrs is), per-row -pieces could stream directly to `cg-data` (a fixed-storage main-heap -buf) and the per-row scratch state could be reset between elements. -The non-pieces scratch state is small (token buffer, ps, cg vstack -empty between top-level decls) but is currently mingled with init -state, so a clean streaming variant of `%parse-init-array-list` plus a -mid-decl scratch-reset hook is the substantive change. - -Diagnostic note: the source of the per-row 700 KB has not been pinned -down by static reading. tok+loc per token (~80–120 B) × ~50 tokens/row -predicts ~5 KB/row; const-expr cons cells predict another ~1 KB/row; -the merge_init_entries / append-pair traffic is a few hundred bytes. -The remaining ~690 KB is unaccounted-for and worth a deeper instrument -pass before committing to a refactor — there may be a simpler O(N)-vs- -O(N²) bug hiding in lex/pp/parse. +Current aarch64 run: + +``` +[cc] decl: line 14527 heap 50929348 +[cc] decl: line 14861 heap 61008644 +delta 10079296 over 333 rows ~= 30 KB / row +``` + +The table now completes and parse advances to line 18026. The remaining +row-scale heap growth is persistent `.data` output plus promoted +pieces/metadata; it is no longer a scratch cap blocker. ## Resolved — `static` tentative-def merge @@ -145,12 +181,43 @@ not honoured semantically, just parsed away. 
The two earlier memory blockers (whole-TU heap explosion, single-decl scratch peak inside the `enum tcc_token` block) are both gone after -the per-decl scratch arena (Phase 3, [CC-SCRATCH.md](CC-SCRATCH.md)) +the per-decl scratch arena (Phase 3) plus the static aggregate +initializer unit rewind path ([CC-INIT-SCRATCH.md](CC-INIT-SCRATCH.md)) plus the recent scope-bind alist / scratch reclamation work. -Probing prefixes that end at clean top-level `};` boundaries so the -parse completes (HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES = -128 MiB): +Current full-file aarch64 run against +`build/cc-bootstrap/X86_64/tcc.flat.c`: + +``` +[cc] phase=start: heap 1225052 +[cc] phase=slurp: heap 3101100 src-bytes 608547 +[cc] decl: line 14527 heap 50929348 +[cc] decl: line 14861 heap 61008644 +[cc] decl: line 18024 heap 66002084 +build/cc-bootstrap/X86_64/tcc.flat.c:18026:17: error: const-expr: bad operand: amp +``` + +Milestones from that run: + +| point | line | heap | Δ from start | note | +|------:|-----:|-----:|-------------:|------| +| start | - | 1 225 052 | - | runtime before slurp | +| slurp | - | 3 101 100 | 1 876 048 | 608 547-byte source loaded | +| before `asm_instrs[]` | 14 527 | 50 929 348 | 49 704 296 | enters large static table | +| after `asm_instrs[]` | 14 861 | 61 008 644 | 59 783 592 | table completed | +| before current blocker | 18 024 | 66 002 084 | 64 777 032 | next decl, then line 18026 error | + +Observed rates: + +- Full progress to line 18024 consumes ~64.8 MB above start for + 608 547 source bytes loaded and most of the TU parsed. +- The `asm_instrs[]` table adds ~10.1 MB over 333 rows, about + 30 KB / row after the streaming initializer fix. +- The compiler now reaches line 18026 under the existing caps; the + current failure is semantic parser coverage, not memory exhaustion. 
+ +Older prefix probes that end at clean top-level `};` boundaries +(HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES = 128 MiB): | line | bytes | heap after parse | Δ from start | KB / source byte | |-----:|-------:|-----------------:|-------------:|-----------------:| @@ -165,11 +232,11 @@ parse completes (HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES = Marginal residency converges to roughly **0.9 KB / input byte** at the small-prefix scale and drops further across the enum block. The -per-byte average is now low enough that the full 608 KB TU is -projected to fit comfortably under a 256 MiB heap cap; in practice -parse aborts on `__attribute__` long before approaching the cap. +current full-file run confirms the 608 KB TU fits comfortably under +the existing heap and scratch caps until the line 18026 const-expr +coverage failure. -The full 608 KB TU itself slurps to a heap of 3 086 092 bytes (~3 MB) +The full 608 KB TU itself slurps to a heap of 3 101 100 bytes (~3 MB) — bytevector storage for the source plus runtime baseline. Heap delta minus the start-of-process baseline (~1.21 MB scheme1 @@ -198,16 +265,22 @@ enum constants) overflowed even 128 MiB of scratch because O(N²) in member count. The recent scratch / alist work makes that decl complete with parse heap at ~31 MB on the 1612-line cut. -## Suspected next-tier blockers (past asm_instrs) +## Expected next-tier blockers -Once the asm_instrs scratch issue is past, the remaining wave we expect: +After the line 18026 const-expression issue, the remaining wave is +still likely to include: +- **More static initializer const-expr forms** — tcc tables use C + implementation idioms (`offsetof`, pointer-ish integer constants, + possibly address arithmetic) that are not covered by the current + integer-only const evaluator. - **`_Bool`, bitfield-typed struct fields, `setjmp.h` typedefs** — - same "parse, don't codegen" softening as floats. 
tcc.c carries - these under `HAVE_BITFIELD` / `HAVE_SETJMP` gates that are off but - leave the declarations in the flattened text. -- **Throughput / wall-clock** — lex+pp+parse on 18 896 lines under - scheme1 is going to be slow even after heap residency drops. + same "parse, don't codegen" softening as floats. tcc.c carries these + under `HAVE_BITFIELD` / `HAVE_SETJMP` gates that are off but leave + the declarations in the flattened text. +- **Throughput / wall-clock** — the current failing aarch64 run takes + about 29 seconds to parse to line 18026 under scheme1. A successful + full compile will add cg-finish and P1pp assembly time. The end goal is milestone 4 in [CC.md §Validation milestones](CC.md) — "Compile tcc.c (under the tcc-mes defines) → tcc-lispcc; verify