commit dd522a0494571c5ac9875945e924d23e8d2fb593
parent d1195e5542db523f480175d9e965855cd9dd62e6
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 28 Apr 2026 15:35:42 -0700
cc: stream static array initializers past asm_instrs scratch wall
%parse-init-array-list now parses each element as a unit inside a
heap-mark/rewind window: the element's pieces and the parser/pp/lex
lookahead get deep-copied into main, then scratch is rewound to the
pre-element mark. The outer pieces accumulator and the rbrace-pad
piece are built directly on main via small mode-switch helpers
(%init-main-cons, %init-main-reverse, %init-main-prepend-reversed,
%init-main-pad-piece). The trailing inter-element comma is consumed
inside the same mark/rewind window so its lookahead doesn't leak.
Selected by initializer shape (file-scope/static positional
aggregate), not by symbol name. Designated struct fields still go
through %parse-init-struct-list + %merge-init-entries unchanged;
local autos go through parse-init-local-aggregate and are untouched.
tcc.flat.c now parses past the 333-row asm_instrs[] table at lines
14527-14861 and stops at the next documented blocker, the
options_W[] offsetof const-expr at line 18026.
Diffstat:
| M | cc/cc.scm | | | 106 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------- |
| M | docs/TCC-TODO.md | | | 171 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- |
2 files changed, 203 insertions(+), 74 deletions(-)
diff --git a/cc/cc.scm b/cc/cc.scm
@@ -5135,6 +5135,52 @@
(define (%pad-piece nbytes)
(make-bytevector nbytes 0))
+;; Static aggregate init streaming support. parse-translation-unit runs
+;; declarations in scratch, but a large file-scope array initializer can
+;; contain hundreds of independent elements. Parse one array element
+;; past a heap mark, promote the element pieces and parser lookahead into
+;; main, rewind that element's transient lexer/pp/const-expr scratch, and
+;; keep the outer pieces list in main as well.
+(define (%init-promote-unit ps thunk) ; run THUNK, deep-copy its result while main is active, rewind to the entry mark, return the copy
+ (let ((mark (heap-mark))) ; mark is taken before THUNK allocates anything
+ (let ((scratch-result (thunk))) ; THUNK takes no arguments; its value is what gets promoted
+ (use-main-heap!) ; everything allocated from here until the switch back is made with main active
+ (let ((ctx (make-deep-copy-context))) ; one context is shared by all three promotions below
+ (promote-roots! (ps-world ps) ctx) ; promote whatever (ps-world ps) roots -- helper not visible here
+ (promote-iter-buffers! (ps-iter ps) ctx) ; parser lookahead, per the header comment on this section
+ (let ((main-result (deep-copy ctx scratch-result)))
+ (use-scratch-heap!) ; NOTE(review): unconditional; assumes scratch was the active heap on entry -- confirm
+ (heap-rewind! mark) ; rewind happens only after the copies exist and scratch is active again
+ main-result))))) ; => the deep copy made while main was active, not THUNK's original value
+
+(define (%init-main-cons x xs) ; (cons x xs) with the new pair allocated while the main heap is active
+ (use-main-heap!)
+ (let ((r (cons x xs))) ; only the pair is new; X and XS themselves are not copied
+ (use-scratch-heap!) ; NOTE(review): leaves scratch active regardless of the mode on entry -- confirm callers expect that
+ r))
+
+(define (%init-main-reverse xs) ; (reverse xs) with the result list built while the main heap is active
+ (use-main-heap!)
+ (let ((r (reverse xs))) ; elements are shared with XS, not copied
+ (use-scratch-heap!) ; NOTE(review): leaves scratch active regardless of the mode on entry -- confirm callers expect that
+ r))
+
+(define (%init-main-prepend-reversed xs acc) ; push each element of XS onto ACC in order, so XS ends up reversed in front of ACC
+ (use-main-heap!) ; every cons in the loop below happens with main active
+ (let loop ((ys xs) (out acc)) ; YS = elements still to push, OUT = result so far
+ (cond
+ ((null? ys)
+ (use-scratch-heap!) ; the switch back to scratch happens only here, once XS is exhausted
+ out)
+ (else
+ (loop (cdr ys) (cons (car ys) out)))))) ; same list as (append (reverse xs) acc)
+
+(define (%init-main-pad-piece nbytes) ; %pad-piece (an NBYTES-long zero-filled bytevector) created while the main heap is active
+ (use-main-heap!)
+ (let ((p (%pad-piece nbytes)))
+ (use-scratch-heap!) ; NOTE(review): leaves scratch active regardless of the mode on entry -- confirm callers expect that
+ p))
+
;; ----- Global initializers ---------------------------------------------
;; Returns (values pieces final-ty). For inferred-length array `ty`,
;; final-ty is a freshly-built array ctype with the resolved length;
@@ -5199,34 +5245,45 @@
(let* ((final (cond ((< decl 0) count) (else decl)))
(pad (- final count)))
(values
- (cond
+ (cond
((> pad 0)
- (reverse (cons (%pad-piece (* pad esize)) acc)))
- (else (reverse acc)))
+ (%init-main-reverse
+ (%init-main-cons (%init-main-pad-piece (* pad esize)) acc)))
+ (else (%init-main-reverse acc)))
count)))
(else
(let ((piece
- (cond
- ((at-punct? ps 'lbrace)
- ;; Nested aggregate: brace-flatten via recursion.
- (advance ps)
- ;; element is itself struct/array
- (cond
- ((eq? (ctype-kind elem) 'arr)
- (let-values (((p _c) (%parse-init-array-list ps elem)))
- p))
- ((or (eq? (ctype-kind elem) 'struct)
- (eq? (ctype-kind elem) 'union))
- (%parse-init-struct-list ps elem))
- (else
- (let ((p (%const-init-piece ps elem)))
- (cond ((at-punct? ps 'comma) (advance ps)))
- (expect-punct ps 'rbrace)
- (list p)))))
- (else
- (list (%const-init-piece ps elem))))))
- (cond ((at-punct? ps 'comma) (advance ps)))
- (lp (append (reverse piece) acc) (+ count 1))))))))
+ (%init-promote-unit
+ ps
+ (lambda ()
+ ;; The trailing inter-element comma must be consumed
+ ;; *inside* the mark/rewind window: advance loads pp/lex
+ ;; lookahead into iter buffers, which promote-iter-buffers!
+ ;; then deep-copies into main. Consuming it after the
+ ;; rewind would leave that lookahead leaking on scratch.
+ (let ((p
+ (cond
+ ((at-punct? ps 'lbrace)
+ ;; Nested aggregate: brace-flatten via recursion.
+ (advance ps)
+ ;; element is itself struct/array
+ (cond
+ ((eq? (ctype-kind elem) 'arr)
+ (let-values (((p _c) (%parse-init-array-list ps elem)))
+ p))
+ ((or (eq? (ctype-kind elem) 'struct)
+ (eq? (ctype-kind elem) 'union))
+ (%parse-init-struct-list ps elem))
+ (else
+ (let ((p (%const-init-piece ps elem)))
+ (cond ((at-punct? ps 'comma) (advance ps)))
+ (expect-punct ps 'rbrace)
+ (list p)))))
+ (else
+ (list (%const-init-piece ps elem))))))
+ (cond ((at-punct? ps 'comma) (advance ps)))
+ p)))))
+ (lp (%init-main-prepend-reversed piece acc) (+ count 1))))))))
(define (%piece-bytesize p)
;; Output width of one piece (cf. %cg-init-piece->bv): a bv emits
@@ -6427,4 +6484,3 @@
"out-bytes" (bytevector-length out))
(%cc-write out-path out))
0))))
-
diff --git a/docs/TCC-TODO.md b/docs/TCC-TODO.md
@@ -37,21 +37,70 @@ head -c 50000 build/cc-bootstrap/X86_64/tcc.flat.c \
# then re-run the podman invocation against tcc.head.c
```
-## Blocker — scratch exhaustion on `asm_instrs[]` (line 14527)
+## Blocker — offsetof-style const expr in `options_W[]` (line 18026)
-After fixing the four blockers below, parse advances to the
-`static const ASMInstr asm_instrs[] = { … };` table at line 14527
-(~333 entries spanning lines 14528–14860) and aborts with
-`scheme1: scratch exhausted` partway through. Per-element scratch
-growth measured (heap-mark deltas):
+The current run gets past `asm_instrs[]` and stops in the option-flag
+tables:
+
+```c
+static const FlagDef options_W[] = {
+ { 0, 0, "all" },
+ { ((size_t) &((TCCState *)0)->warn_unsupported), 0, "unsupported" },
+ { ((size_t) &((TCCState *)0)->warn_write_strings), 0, "write-strings" },
+ ...
+};
+```
+
+Diagnostic:
```
-elem 16 → scratch 287 503 672
-elem 176 → scratch 400 656 488
-delta 113 152 816 over 160 elements ≈ 707 KB / elem
+build/cc-bootstrap/X86_64/tcc.flat.c:18026:17: error: const-expr: bad operand: amp
```
-Each row writes ~12 bytes of static data but consumes ~700 KB of
+This is the classic `offsetof` idiom: take the address of a member
+through a null pointer, cast it to `size_t`, and use the resulting
+field offset as a static integer initializer.
+
+The present `parse-const-expr` handles integer arithmetic, casts to
+integer types, enum constants, and `sizeof`, but not address-of,
+pointer casts, member access, or `->` in unevaluated/address contexts.
+
+Likely narrow fix shape:
+
+- allow const-expr casts to pointer types when the operand is an
+ integer constant, at least for `(T *)0`
+- add a const-expression path for unary `&` over an lvalue expression
+ made of null pointer cast + `.` / `->` field selection
+- compute the member offset from the ctype layout and return it as an
+ integer constant of the enclosing cast type
+- keep this const-only; do not make general pointer values acceptable
+ as arbitrary integer constants
+
+This should cover the `options_W[]` / `options_f[]` tables without
+expanding static initializer semantics beyond the tcc bootstrap need.
+
+## Resolved — scratch pressure on `asm_instrs[]` (line 14527)
+
+Done. `parse-init-global` still returns a pieces list to
+`cg-emit-global`, but `%parse-init-array-list` now parses each array
+element as a unit, promotes that unit's pieces plus parser/pp/lex
+lookahead into main storage, rewinds scratch to a pre-unit mark, and
+continues with the outer accumulator in main. This is selected by
+initializer shape (file-scope/static positional aggregate), not by the
+symbol name.
+
+Before the fix, parse reached the `static const ASMInstr asm_instrs[] =
+{ ... };` table at line 14527 (~333 entries spanning lines
+14528-14860) and aborted with `scheme1: scratch exhausted` partway
+through. Per-element scratch growth measured then:
+
+```
+elem 16 -> scratch 287 503 672
+elem 176 -> scratch 400 656 488
+delta 113 152 816 over 160 elements ~= 707 KB / elem
+```
+
+Each row writes ~12 bytes of static data but used to consume ~700 KB of
scratch to do it. tcc.c entries are dense:
```c
@@ -64,30 +113,17 @@ scratch to do it. tcc.c entries are dense:
0, { 0 } },
```
-The whole array sits inside one parse-decl-or-fn boundary, so scratch
-never resets between rows; per-element parse-const-expr / lex / pp /
-init-piece allocations all pile up. 128 MiB scratch cap covers ~176
-rows out of 333.
-
-**Likely fix shape — streaming init emission.** Currently
-`parse-init-global` builds a flat `pieces` list across the whole
-top-level array (with each row's struct-init merged into a small
-sub-list) and `cg-emit-global` walks it once at the end. For an
-array-of-struct init with no designators (which asm_instrs is), per-row
-pieces could stream directly to `cg-data` (a fixed-storage main-heap
-buf) and the per-row scratch state could be reset between elements.
-The non-pieces scratch state is small (token buffer, ps, cg vstack
-empty between top-level decls) but is currently mingled with init
-state, so a clean streaming variant of `%parse-init-array-list` plus a
-mid-decl scratch-reset hook is the substantive change.
-
-Diagnostic note: the source of the per-row 700 KB has not been pinned
-down by static reading. tok+loc per token (~80–120 B) × ~50 tokens/row
-predicts ~5 KB/row; const-expr cons cells predict another ~1 KB/row;
-the merge_init_entries / append-pair traffic is a few hundred bytes.
-The remaining ~690 KB is unaccounted-for and worth a deeper instrument
-pass before committing to a refactor — there may be a simpler O(N)-vs-
-O(N²) bug hiding in lex/pp/parse.
+Current aarch64 run:
+
+```
+[cc] decl: line 14527 heap 50929348
+[cc] decl: line 14861 heap 61008644
+delta 10079296 over 333 rows ~= 30 KB / row
+```
+
+The table now completes and parse advances to line 18026. The remaining
+row-scale heap growth is persistent `.data` output plus promoted
+pieces/metadata; it is no longer a scratch cap blocker.
## Resolved — `static` tentative-def merge
@@ -145,12 +181,43 @@ not honoured semantically, just parsed away.
The two earlier memory blockers (whole-TU heap explosion, single-decl
scratch peak inside the `enum tcc_token` block) are both gone after
-the per-decl scratch arena (Phase 3, [CC-SCRATCH.md](CC-SCRATCH.md))
+the per-decl scratch arena (Phase 3) plus the static aggregate
+initializer unit rewind path ([CC-INIT-SCRATCH.md](CC-INIT-SCRATCH.md))
plus the recent scope-bind alist / scratch reclamation work.
-Probing prefixes that end at clean top-level `};` boundaries so the
-parse completes (HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES =
-128 MiB):
+Current full-file aarch64 run against
+`build/cc-bootstrap/X86_64/tcc.flat.c`:
+
+```
+[cc] phase=start: heap 1225052
+[cc] phase=slurp: heap 3101100 src-bytes 608547
+[cc] decl: line 14527 heap 50929348
+[cc] decl: line 14861 heap 61008644
+[cc] decl: line 18024 heap 66002084
+build/cc-bootstrap/X86_64/tcc.flat.c:18026:17: error: const-expr: bad operand: amp
+```
+
+Milestones from that run:
+
+| point | line | heap | Δ from start | note |
+|------:|-----:|-----:|-------------:|------|
+| start | - | 1 225 052 | - | runtime before slurp |
+| slurp | - | 3 101 100 | 1 876 048 | 608 547-byte source loaded |
+| before `asm_instrs[]` | 14 527 | 50 929 348 | 49 704 296 | enters large static table |
+| after `asm_instrs[]` | 14 861 | 61 008 644 | 59 783 592 | table completed |
+| before current blocker | 18 024 | 66 002 084 | 64 777 032 | next decl, then line 18026 error |
+
+Observed rates:
+
+- Full progress to line 18024 consumes ~64.8 MB above start for
+ 608 547 source bytes loaded and most of the TU parsed.
+- The `asm_instrs[]` table adds ~10.1 MB over 333 rows, about
+ 30 KB / row after the streaming initializer fix.
+- The compiler now reaches line 18026 under the existing caps; the
+ current failure is semantic parser coverage, not memory exhaustion.
+
+Older prefix probes that end at clean top-level `};` boundaries
+(HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES = 128 MiB):
| line | bytes | heap after parse | Δ from start | KB / source byte |
|-----:|-------:|-----------------:|-------------:|-----------------:|
@@ -165,11 +232,11 @@ parse completes (HEAP_CAP_BYTES = 256 MiB, SCRATCH_CAP_BYTES =
Marginal residency converges to roughly **0.9 KB / input byte** at
the small-prefix scale and drops further across the enum block. The
-per-byte average is now low enough that the full 608 KB TU is
-projected to fit comfortably under a 256 MiB heap cap; in practice
-parse aborts on `__attribute__` long before approaching the cap.
+current full-file run confirms the 608 KB TU fits comfortably under
+the existing heap and scratch caps until the line 18026 const-expr
+coverage failure.
-The full 608 KB TU itself slurps to a heap of 3 086 092 bytes (~3 MB)
+The full 608 KB TU itself slurps to a heap of 3 101 100 bytes (~3 MB)
— bytevector storage for the source plus runtime baseline.
Heap delta minus the start-of-process baseline (~1.21 MB scheme1
@@ -198,16 +265,22 @@ enum constants) overflowed even 128 MiB of scratch because
O(N²) in member count. The recent scratch / alist work makes that
decl complete with parse heap at ~31 MB on the 1612-line cut.
-## Suspected next-tier blockers (past asm_instrs)
+## Expected next-tier blockers
-Once the asm_instrs scratch issue is past, the remaining wave we expect:
+After the line 18026 const-expression issue, the remaining wave is
+still likely to include:
+- **More static initializer const-expr forms** — tcc tables use C
+ implementation idioms (`offsetof`, pointer-ish integer constants,
+ possibly address arithmetic) that are not covered by the current
+ integer-only const evaluator.
- **`_Bool`, bitfield-typed struct fields, `setjmp.h` typedefs** —
- same "parse, don't codegen" softening as floats. tcc.c carries
- these under `HAVE_BITFIELD` / `HAVE_SETJMP` gates that are off but
- leave the declarations in the flattened text.
-- **Throughput / wall-clock** — lex+pp+parse on 18 896 lines under
- scheme1 is going to be slow even after heap residency drops.
+ same "parse, don't codegen" softening as floats. tcc.c carries these
+ under `HAVE_BITFIELD` / `HAVE_SETJMP` gates that are off but leave
+ the declarations in the flattened text.
+- **Throughput / wall-clock** — the current failing aarch64 run takes
+ about 29 seconds to parse to line 18026 under scheme1. A successful
+ full compile will add cg-finish and P1pp assembly time.
The end goal is milestone 4 in [CC.md §Validation milestones](CC.md)
— "Compile tcc.c (under the tcc-mes defines) → tcc-lispcc; verify