boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 257fdd5b913fd29cea9637e958304a511bc7d84b
parent 67e43e7cea97347519640114712c419b7a4f9261
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 27 Apr 2026 21:01:53 -0700

cc streaming architecture

Diffstat:
Mcc/cc.scm | 638++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 434 insertions(+), 204 deletions(-)

diff --git a/cc/cc.scm b/cc/cc.scm @@ -414,10 +414,14 @@ ;; -------------------------------------------------------------------- ;; pstate — parser state. Owned by parse.scm; read-only to cg. ;; -------------------------------------------------------------------- +;; iter holds a tok-iter (typically a pp-iter chained over a lex-iter). +;; peek / peek2 / advance go through iter-peek / iter-peek2 / iter-next +;; so the parser pulls one token at a time, with no full materialized +;; token list. See docs/CC-STREAM.md. (define-record-type pstate - (%pstate toks scope tags loops fn-ctx typedefs cg) + (%pstate iter scope tags loops fn-ctx typedefs cg) pstate? - (toks ps-toks ps-toks-set!) + (iter ps-iter ps-iter-set!) (scope ps-scope ps-scope-set!) (tags ps-tags ps-tags-set!) (loops ps-loops ps-loops-set!) @@ -1379,7 +1383,90 @@ (else #f)))))))))))) ;; -------------------------------------------------------------------- -;; lex-tokenize src file -> list of tok ending in EOF. +;; tok-iter — streaming token source. +;; -------------------------------------------------------------------- +;; Each pipeline layer (lex, pp, parser) wraps the layer below as a +;; tok-iter. iter-next pulls one token at a time. iter-peek/iter-peek2 +;; cache lookahead in `buf`. iter-unget! pushes back. Live-data bound is +;; lookahead (≤2) + per-layer state, not source length. See +;; docs/CC-STREAM.md. +;; +;; Pull-fns must keep yielding EOF after the first EOF (idempotent). +(define-record-type tok-iter + (%tok-iter pull-fn state buf) + tok-iter? + (pull-fn tok-iter-pull-fn) + (state tok-iter-state) + (buf tok-iter-buf tok-iter-buf-set!)) + +(define (iter-next it) + (let ((b (tok-iter-buf it))) + (cond + ((null? b) ((tok-iter-pull-fn it) (tok-iter-state it))) + (else + (tok-iter-buf-set! it (cdr b)) + (car b))))) + +(define (iter-peek it) + (let ((b (tok-iter-buf it))) + (cond + ((null? b) + (let ((t ((tok-iter-pull-fn it) (tok-iter-state it)))) + (tok-iter-buf-set! it (list t)) + t)) + (else (car b))))) + +(define (iter-peek2 it) + (let ((b (tok-iter-buf it))) + (cond + ((null? b) + (let* ((t1 ((tok-iter-pull-fn it) (tok-iter-state it))) + (t2 ((tok-iter-pull-fn it) (tok-iter-state it)))) + (tok-iter-buf-set! it (list t1 t2)) + t2)) + ((null? (cdr b)) + (let ((t2 ((tok-iter-pull-fn it) (tok-iter-state it)))) + (tok-iter-buf-set! it (cons (car b) (list t2))) + t2)) + (else (car (cdr b)))))) + +(define (iter-unget! it t) + (tok-iter-buf-set! it (cons t (tok-iter-buf it)))) + +;; Drain to a list ending in EOF. Used by back-compat wrappers +;; (lex-tokenize, pp-expand) so the cc-lex / cc-pp test runners can +;; inspect the materialized stream. +(define (iter->list it) + (let loop ((acc '())) + (let ((t (iter-next it))) + (cond + ((eq? (tok-kind t) 'EOF) (reverse (cons t acc))) + (else (loop (cons t acc))))))) + +;; -------------------------------------------------------------------- +;; list-iter — wrap an existing token list as a tok-iter. Yields each +;; tok in turn; once exhausted, keeps yielding EOF (idempotent). The +;; wrapped list typically already ends in EOF. +;; -------------------------------------------------------------------- +(define-record-type list-iter-state + (%list-iter-state toks) + list-iter-state? + (toks lis-toks lis-toks-set!)) + +(define (make-list-iter toks) + (%tok-iter %list-iter-pull (%list-iter-state toks) '())) + +(define (%list-iter-pull st) + (let ((toks (lis-toks st))) + (cond + ((null? toks) (make-tok 'EOF #f #f)) + (else + (lis-toks-set! st (cdr toks)) + (car toks))))) + +;; -------------------------------------------------------------------- +;; lex-iter — streaming lexer. Steady state: pos/line/col + bol? in +;; lex-state; per-token allocation reclaimed via heap-mark/rewind. ;; -------------------------------------------------------------------- ;; bol? — `#t` when no token has been emitted on the current physical ;; line yet (start of file, or only NL + whitespace seen since the last @@ -1387,155 +1474,179 @@ ;; at line-start; we forward that decision into the token stream by ;; emitting `HASH` instead of `(PUNCT hash …)` for a line-leading `#`. ;; -;; Heap discipline: each iteration of `loop` is wrapped in a +;; Heap discipline: each call to %lex-iter-pull is wrapped in a ;; heap-mark / heap-rewind!. All scratch the helper allocates (the ;; helper's own tok/loc, the `(cons tok 4-list)` it returns, every ;; bind_params / let* / eval_args env-cons consumed getting in and out) -;; lives above the mark and is reclaimed before recursing. The driver -;; ferries scalars across the rewind via the outer let bindings (set! -;; mutates the binding cdr in place — no fresh heap), and bv contents -;; via the sticky %lex-scratch buffer + %lex-intern pool. The only -;; survivors per token are the freshly-built tok (48 B), loc (40 B), -;; cons-onto-acc (16 B) and, on first sight, an interned bv. -(define (lex-tokenize src file) +;; lives above the mark and is reclaimed before returning. Per-token +;; scratch (kind/val/vlen/loc-line/loc-col/npos/nline/ncol/nbol?) is +;; allocated in the outer `let` BEFORE the mark — set! mutates those +;; cells in place across the rewind. Bv contents survive via the +;; sticky %lex-scratch buffer + %lex-scratch->bv (allocated post-rewind). +;; The survivors per token are tok (48 B) + loc (40 B) + bv if any. +(define-record-type lex-state + (%lex-state src file pos line col bol? done?) + lex-state? + (src lex-state-src) + (file lex-state-file) + (pos lex-state-pos lex-state-pos-set!) + (line lex-state-line lex-state-line-set!) + (col lex-state-col lex-state-col-set!) + (bol? lex-state-bol? lex-state-bol?-set!) + (done? lex-state-done? lex-state-done?-set!)) + +(define (make-lex-iter src file) (%lex-init!) - (let ((pos 0) (line 1) (col 1) (acc '()) (bol? #t) (done? #f) - (mark 0) - ;; per-iteration scratch: kind + value (+ vlen for bv-typed), - ;; loc-line/col (the start of the token), and npos/nline/ncol - ;; (where the lexer stopped). All scalar — set! is safe across - ;; the rewind because the binding cons lives in the outer let, - ;; below the mark. + (%tok-iter %lex-iter-pull + (%lex-state src file 0 1 1 #t #f) + '())) + +(define (%lex-iter-pull st) + (cond + ((lex-state-done? st) + ;; Idempotent EOF: keep yielding EOF after the first one. + (make-tok 'EOF #f (%loc (lex-state-file st) + (lex-state-line st) + (lex-state-col st)))) + (else (%lex-iter-step st)))) + +(define (%lex-iter-step st) + (let ((src (lex-state-src st)) + (file (lex-state-file st)) + (pos (lex-state-pos st)) + (line (lex-state-line st)) + (col (lex-state-col st)) + (bol? (lex-state-bol? st)) + ;; Per-iteration scratch — must be allocated BEFORE the mark + ;; so that set!s after heap-rewind! still find live cells. (kind #f) (val #f) (vlen 0) (loc-line 1) (loc-col 1) - (npos 0) (nline 1) (ncol 1) (nbol? #t)) - (let loop () + (npos 0) (nline 1) (ncol 1) (nbol? #f) + (mark 0)) + (set! mark (heap-mark)) + (let* ((sw (%skip-ws-and-comments src pos line col file)) + (pos1 (car sw)) + (line1 (car (cdr sw))) + (col1 (car (cdr (cdr sw)))) + (p (%lex-peek src pos1 line1 col1)) + (b (%pk-byte p))) + (set! loc-line line1) + (set! loc-col col1) + (set! val #f) (set! vlen 0) (set! nbol? #f) (cond - (done? (reverse acc)) - (else - (set! mark (heap-mark)) - (let* ((sw (%skip-ws-and-comments src pos line col file)) - (pos1 (car sw)) - (line1 (car (cdr sw))) - (col1 (car (cdr (cdr sw)))) - (p (%lex-peek src pos1 line1 col1)) - (b (%pk-byte p))) - (set! loc-line line1) - (set! loc-col col1) - (set! val #f) (set! vlen 0) (set! nbol? #f) + ;; EOF + ((not b) + (set! kind 'EOF) + (set! npos pos1) (set! nline line1) (set! ncol col1)) + ;; Newline → NL token; next call starts at bol. + ((%newline? b) + (set! kind 'NL) + (set! npos (%pk-pos p)) + (set! nline (%pk-line p)) + (set! ncol (%pk-col p)) + (set! nbol? #t)) + ;; Line-leading `#`: bare `#` becomes HASH; `##` falls + ;; through to punctuator (lexes as `paste`). + ((and bol? (= b 35)) + (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) (cond - ;; EOF - ((not b) - (set! kind 'EOF) - (set! npos pos1) (set! nline line1) (set! ncol col1)) - ;; Newline → NL token; next iter starts at bol. - ((%newline? b) - (set! kind 'NL) - (set! npos (%pk-pos p)) - (set! nline (%pk-line p)) - (set! ncol (%pk-col p)) - (set! nbol? #t)) - ;; Line-leading `#`: bare `#` becomes HASH; `##` falls - ;; through to punctuator (lexes as `paste`). - ((and bol? (= b 35)) - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) - (cond - ((and b2 (= b2 35)) - (let* ((r (%lex-read-punct src pos1 line1 col1 file)) - (tok (car r)) (rest (cdr r))) - (set! kind 'PUNCT) (set! val (tok-value tok)) - (set! npos (car rest)) - (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest)))))) - (else - (set! kind 'HASH) - (set! npos (%pk-pos p)) - (set! nline (%pk-line p)) - (set! ncol (%pk-col p)))))) - ;; Identifier / keyword - ((%ident-start? b) - (let* ((r (%lex-read-ident src pos1 line1 col1 file)) - (tok (car r)) (rest (cdr r))) - (set! kind (tok-kind tok)) - (cond ((eq? (tok-kind tok) 'KW) - (set! val (tok-value tok))) - (else - (let ((bv (tok-value tok))) - (set! vlen (bytevector-length bv)) - (%lex-scratch<- bv vlen)))) - (set! npos (car rest)) - (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest)))))) - ;; Number (digit start) - ((%digit? b) - (let* ((r (%lex-read-number src pos1 line1 col1 file)) - (tok (car r)) (rest (cdr r))) - (set! kind 'INT) (set! val (tok-value tok)) - (set! npos (car rest)) - (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest)))))) - ;; '.' is a punctuator unless followed by a digit (float). - ((= b 46) - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) - (cond - ((and b2 (%digit? b2)) - (die (%loc file line1 col1) - "floating-point literal not supported")) - (else - (let* ((r (%lex-read-punct src pos1 line1 col1 file)) - (tok (car r)) (rest (cdr r))) - (set! kind 'PUNCT) (set! val (tok-value tok)) - (set! npos (car rest)) - (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest))))))))) - ;; String - ((= b 34) - (let* ((r (%lex-read-string src pos1 line1 col1 file)) - (tok (car r)) (rest (cdr r)) - (bv (tok-value tok))) - (set! kind 'STR) - (set! vlen (bytevector-length bv)) - (%lex-scratch<- bv vlen) - (set! npos (car rest)) - (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest)))))) - ;; Char - ((= b 39) - (let* ((r (%lex-read-char src pos1 line1 col1 file)) + ((and b2 (= b2 35)) + (let* ((r (%lex-read-punct src pos1 line1 col1 file)) (tok (car r)) (rest (cdr r))) - (set! kind 'CHAR) (set! val (tok-value tok)) + (set! kind 'PUNCT) (set! val (tok-value tok)) (set! npos (car rest)) (set! nline (car (cdr rest))) (set! ncol (car (cdr (cdr rest)))))) - ;; Punctuator (default) + (else + (set! kind 'HASH) + (set! npos (%pk-pos p)) + (set! nline (%pk-line p)) + (set! ncol (%pk-col p)))))) + ;; Identifier / keyword + ((%ident-start? b) + (let* ((r (%lex-read-ident src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind (tok-kind tok)) + (cond ((eq? (tok-kind tok) 'KW) + (set! val (tok-value tok))) + (else + (let ((bv (tok-value tok))) + (set! vlen (bytevector-length bv)) + (%lex-scratch<- bv vlen)))) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Number (digit start) + ((%digit? b) + (let* ((r (%lex-read-number src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'INT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; '.' is a punctuator unless followed by a digit (float). + ((= b 46) + (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) + (cond + ((and b2 (%digit? b2)) + (die (%loc file line1 col1) + "floating-point literal not supported")) (else (let* ((r (%lex-read-punct src pos1 line1 col1 file)) (tok (car r)) (rest (cdr r))) (set! kind 'PUNCT) (set! val (tok-value tok)) (set! npos (car rest)) (set! nline (car (cdr rest))) - (set! ncol (car (cdr (cdr rest)))))))) - (heap-rewind! mark) - ;; Reconstruct survivors post-rewind. Bv-typed values go - ;; through the intern pool so repeats share storage. - (cond - ((eq? kind 'EOF) - (set! acc (cons (make-tok 'EOF #f (%loc file loc-line loc-col)) - acc)) - (set! done? #t)) - (else - (let ((tok-val (cond ((eq? kind 'IDENT) (%lex-scratch->bv vlen)) - ((eq? kind 'STR) (%lex-scratch->bv vlen)) - (else val)))) - (set! acc (cons (make-tok kind tok-val - (%loc file loc-line loc-col)) - acc)) - (set! pos npos) (set! line nline) (set! col ncol) - (set! bol? nbol?))) - ) - (loop)))))) + (set! ncol (car (cdr (cdr rest))))))))) + ;; String + ((= b 34) + (let* ((r (%lex-read-string src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r)) + (bv (tok-value tok))) + (set! kind 'STR) + (set! vlen (bytevector-length bv)) + (%lex-scratch<- bv vlen) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Char + ((= b 39) + (let* ((r (%lex-read-char src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'CHAR) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Punctuator (default) + (else + (let* ((r (%lex-read-punct src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'PUNCT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))))) + (heap-rewind! mark) + ;; Reconstruct the survivor below the mark and advance state. + (cond + ((eq? kind 'EOF) + (lex-state-done?-set! st #t) + (make-tok 'EOF #f (%loc file loc-line loc-col))) + (else + (let ((tok-val (cond ((eq? kind 'IDENT) (%lex-scratch->bv vlen)) + ((eq? kind 'STR) (%lex-scratch->bv vlen)) + (else val)))) + (lex-state-pos-set! st npos) + (lex-state-line-set! st nline) + (lex-state-col-set! st ncol) + (lex-state-bol?-set! st nbol?) + (make-tok kind tok-val (%loc file loc-line loc-col))))))) + +;; Back-compat wrapper — drains lex-iter into a list ending in EOF. +;; Used by the cc-lex test runner. +(define (lex-tokenize src file) + (iter->list (make-lex-iter src file))) ;; cc/pp.scm — token list -> expanded token list. ;; Realizes docs/CC-INTERNALS.md §pp.scm. Hide-set per C11 6.10.3.4. ;; #include rejected (CC.md §Toolchain envelope). @@ -1559,15 +1670,27 @@ ;; --- pp-state (private record) --- ;; cond-stack: list of (active? . has-taken?). Outer-active gating is ;; computed by walking the stack rather than encoding it in frames. +;; +;; Streaming fields (used only by make-pp-iter; #f for the legacy +;; bounded-buffer path used by pp-eval-cexpr): +;; lex-iter — upstream tok-iter, or #f +;; up-pending — toks unshifted upstream (macro-expansion bodies that +;; must be re-scanned for further expansion) +;; out-buf — toks already dispatched but stashed for the next pull +;; (peek-and-fuse for adjacent STRs lookahead overshoots +;; by one tok, which lands here) (define-record-type pp-state - (%pp-state macros cond-stack cur-file line-delta) + (%pp-state macros cond-stack cur-file line-delta lex-iter up-pending out-buf) pp-state? (macros pps-macros pps-macros-set!) (cond-stack pps-cond-stack pps-cond-stack-set!) (cur-file pps-cur-file pps-cur-file-set!) - (line-delta pps-line-delta pps-line-delta-set!)) + (line-delta pps-line-delta pps-line-delta-set!) + (lex-iter pps-lex-iter) + (up-pending pps-up-pending pps-up-pending-set!) + (out-buf pps-out-buf pps-out-buf-set!)) -(define (%pp-make-state defs) (%pp-state defs '() #f 0)) +(define (%pp-make-state defs) (%pp-state defs '() #f 0 #f '() '())) (define (%pp-active? state) (let loop ((xs (pps-cond-stack state))) @@ -1648,59 +1771,162 @@ (else (buf-list-push! b (car ts)) (loop (cdr ts)))))) (define (buf-list-flush b) (reverse (buf-list-xs b))) -;; Translation phase 6: concatenate adjacent string literals. The merged -;; token keeps the loc and hide-set of the first; values are byte-appended. -(define (%pp-merge-adjacent-strs toks) - (let loop ((toks toks) (acc '())) +;; --- make-pp-iter: streaming preprocessor --- +;; Wraps a lex-iter (or any tok-iter). Returns a tok-iter. Live data +;; bounded by parser state + lookahead, not source length. Adjacent-STR +;; fusion happens inline via peek-and-stash. +(define (make-pp-iter src-iter initial-defines) + (let ((st (%pp-state initial-defines '() #f 0 src-iter '() '()))) + (%tok-iter %pp-iter-pull st '()))) + +(define (%pp-iter-pull st) + (let ((ob (pps-out-buf st))) (cond - ((null? toks) (reverse acc)) - ((and (not (null? acc)) - (eq? (tok-kind (car toks)) 'STR) - (eq? (tok-kind (car acc)) 'STR)) - (let* ((prev (car acc)) - (cur (car toks)) - (merged (%tok 'STR - (bytevector-append (tok-value prev) (tok-value cur)) - (tok-loc prev) - (tok-hide prev)))) - (loop (cdr toks) (cons merged (cdr acc))))) - (else (loop (cdr toks) (cons (car toks) acc)))))) + ((not (null? ob)) + (pps-out-buf-set! st (cdr ob)) + (car ob)) + (else (%pp-maybe-fuse-str st (%pp-dispatch-step st)))))) + +;; --- upstream helpers --- +;; Upstream tokens come either from up-pending (macro-expansion bodies +;; that need re-scanning) or from the wrapped lex-iter. +(define (%pp-pull-upstream st) + (let ((up (pps-up-pending st))) + (cond + ((not (null? up)) + (pps-up-pending-set! st (cdr up)) + (car up)) + (else (iter-next (pps-lex-iter st)))))) -;; --- pp-expand: top-level driver --- -(define (pp-expand toks initial-defines) - (let ((state (%pp-make-state initial-defines)) - (out (make-buf-list))) - (let loop ((toks toks)) +(define (%pp-peek-upstream st) + (let ((up (pps-up-pending st))) + (cond + ((not (null? up)) (car up)) + (else (iter-peek (pps-lex-iter st)))))) + +;; Push toks to the front of upstream so (car toks) is yielded next. +(define (%pp-unshift-upstream! st toks) + (pps-up-pending-set! st (append toks (pps-up-pending st)))) + +;; Collect tokens up to (not including) NL or EOF. NL is consumed; EOF +;; is unshifted back so the main loop sees it. +(define (%pp-collect-line-stream st) + (let loop ((acc '())) + (let ((t (%pp-pull-upstream st))) (cond - ((null? toks) (die #f "pp-expand: missing EOF token")) - ((%pp-eof? (car toks)) - (cond ((not (null? (pps-cond-stack state))) - (die (tok-loc (car toks)) "unterminated #if/#ifdef/#ifndef")) - (else - (buf-list-push! out (car toks)) - (%pp-merge-adjacent-strs (buf-list-flush out))))) - ((%pp-nl? (car toks)) (loop (cdr toks))) - ((%pp-hash? (car toks)) - (let-values (((line rest) (%pp-take-line (cdr toks)))) - (%pp-dispatch-directive (car toks) line state out) - (loop rest))) - (else - (let-values (((line rest) (%pp-take-line toks))) - (cond ((%pp-active? state) - (%pp-emit-expanded line state out)) - (else #t)) - (loop rest))))))) - -;; Take tokens up to (not including) the next NL or EOF. NL is consumed; -;; EOF is left in the stream so the driver sees it next. -;; Returns (values line rest). -(define (%pp-take-line toks) - (let loop ((toks toks) (acc '())) + ((%pp-eof? t) + (%pp-unshift-upstream! st (list t)) + (reverse acc)) + ((%pp-nl? t) (reverse acc)) + (else (loop (cons t acc))))))) + +;; Streaming arg collection for fn-like macro calls. Position is just +;; AFTER the opening `(`. Returns the list of arg-tokenlists. +(define (%pp-collect-args-stream st call-loc) + (let loop ((depth 0) (cur '()) (args '())) + (let ((t (%pp-pull-upstream st))) + (cond + ((%pp-eof? t) + (die call-loc "macro call: unterminated argument list")) + ((and (= depth 0) (%pp-punct? t 'rparen)) + (cond + ;; Empty parens count as one empty argument; bind-args + ;; degenerates this back to "no args" for 0-param macros. + ((and (null? args) (null? cur)) (list '())) + (else (reverse (cons (reverse cur) args))))) + ((and (= depth 0) (%pp-punct? t 'comma)) + (loop 0 '() (cons (reverse cur) args))) + ((%pp-punct? t 'lparen) + (loop (+ depth 1) (cons t cur) args)) + ((%pp-punct? t 'rparen) + (loop (- depth 1) (cons t cur) args)) + (else (loop depth (cons t cur) args)))))) + +;; Single dispatch step. Returns one post-pp tok (skipping NLs, +;; processing directives, expanding macros). Does NOT apply STR-fusion +;; — that happens one layer up in %pp-iter-pull, otherwise the +;; recursive lookahead during fusion would itself fuse further STRs +;; and drag tokens past the run into out-buf. +(define (%pp-dispatch-step st) + (let ((t (%pp-pull-upstream st))) (cond - ((null? toks) (values (reverse acc) toks)) - ((%pp-eof? (car toks)) (values (reverse acc) toks)) - ((%pp-nl? (car toks)) (values (reverse acc) (cdr toks))) - (else (loop (cdr toks) (cons (car toks) acc)))))) + ((%pp-eof? t) + (cond ((not (null? (pps-cond-stack st))) + (die (tok-loc t) "unterminated #if/#ifdef/#ifndef")) + (else t))) + ((%pp-nl? t) (%pp-dispatch-step st)) + ((%pp-hash? t) + (let ((line (%pp-collect-line-stream st))) + (%pp-dispatch-directive t line st #f) + (%pp-dispatch-step st))) + ((not (%pp-active? st)) + (%pp-dispatch-step st)) + ((%pp-ident? t) + (let ((name (tok-value t))) + (cond + ((%pp-bv-mem? name (tok-hide t)) + (%pp-relocate t st)) + ((%pp-builtin? name) + (let ((toks (%pp-expand-builtin name (tok-loc t) st))) + (%pp-unshift-upstream! st toks) + (%pp-dispatch-step st))) + (else + (let ((m (alist-ref name (pps-macros st)))) + (cond + ((not m) (%pp-relocate t st)) + ((eq? (macro-kind m) 'obj) + (let ((body (%pp-prepare-body (macro-body m) + (cons name (tok-hide t))))) + (%pp-unshift-upstream! st body) + (%pp-dispatch-step st))) + (else + ;; fn-like or fn-vararg: peek upstream for `(`. If + ;; not present, pass IDENT through unchanged (no + ;; consumption); the next iter call will process the + ;; following tok normally. + (let ((next (%pp-peek-upstream st))) + (cond + ((not (%pp-punct? next 'lparen)) + (%pp-relocate t st)) + (else + (%pp-pull-upstream st) ; consume `(` + (let* ((args (%pp-collect-args-stream st (tok-loc t))) + (params (macro-params m)) + (variadic? (eq? (macro-kind m) 'fn-vararg)) + (env (%pp-bind-args params args variadic? (tok-loc t))) + (sub (%pp-substitute (macro-body m) env (tok-loc t))) + (body (%pp-prepare-body sub + (cons name (tok-hide t))))) + (%pp-unshift-upstream! st body) + (%pp-dispatch-step st)))))))))))) + (else (%pp-relocate t st))))) + +;; Translation phase 6 (peek-and-fuse). If `cur` is STR, look at the +;; next post-pp tok; if it's STR, fuse and repeat. Anything else gets +;; stashed in out-buf for the next iter-next call. Lookahead goes +;; through %pp-dispatch-step (no fusion), so a non-STR neighbor +;; correctly terminates the run. +(define (%pp-maybe-fuse-str st cur) + (cond + ((not (eq? (tok-kind cur) 'STR)) cur) + (else + (let loop ((cur cur)) + (let ((next (%pp-dispatch-step st))) + (cond + ((eq? (tok-kind next) 'STR) + (loop (%tok 'STR + (bytevector-append (tok-value cur) (tok-value next)) + (tok-loc cur) + (tok-hide cur)))) + (else + (pps-out-buf-set! st (cons next (pps-out-buf st))) + cur))))))) + +;; Back-compat wrapper — drains pp-iter into a list ending in EOF. +;; Used by the cc-pp test runner. Wraps the input list in a list-iter +;; so the streaming engine sees a normal upstream. +(define (pp-expand toks initial-defines) + (iter->list (make-pp-iter (make-list-iter toks) initial-defines))) ;; --- directive dispatch --- ;; pmatch-based on the directive name bv. bv literals match by equal?. @@ -2190,7 +2416,7 @@ ;; `die` (which sys-exits), so no rewind there. (define (pp-eval-cexpr toks macros) (let ((mark (heap-mark))) - (let* ((state (%pp-state macros '() #f 0)) + (let* ((state (%pp-state macros '() #f 0 #f '() '())) (s1 (%pp-resolve-defined toks state)) (s2 (%pp-expand-line s1 state)) (s3 (%pp-idents-as-zero s2))) @@ -3755,15 +3981,12 @@ aligned)) ;; cc/parse.scm — recursive-descent + Pratt parser. Minimal scheme1. -(define (make-pstate toks cg) - (%pstate toks (list '()) (list '()) '() #f '() cg)) +(define (make-pstate iter cg) + (%pstate iter (list '()) (list '()) '() #f '() cg)) -(define (peek ps) (car (ps-toks ps))) -(define (peek2 ps) - (let ((r (cdr (ps-toks ps)))) - (if (null? r) (car (ps-toks ps)) (car r)))) -(define (advance ps) - (let ((t (peek ps))) (ps-toks-set! ps (cdr (ps-toks ps))) t)) +(define (peek ps) (iter-peek (ps-iter ps))) +(define (peek2 ps) (iter-peek2 (ps-iter ps))) +(define (advance ps) (iter-next (ps-iter ps))) (define (at-kw? ps s) (pmatch (peek ps) (($ tok? (kind KW) (value ,v)) (eq? v s)) @@ -5255,11 +5478,16 @@ (cond ((null? stk) #t) (else - (let ((sv (ps-toks ps))) - (ps-toks-set! ps - (append stk (list (make-tok 'EOF #f #f)))) + ;; Step expr is parsed AFTER the body. Swap the + ;; parser's iter for a list-iter wrapping the saved + ;; tokens (followed by EOF so parse-expr stops); + ;; restore on exit. + (let ((sv (ps-iter ps))) + (ps-iter-set! ps + (make-list-iter + (append stk (list (make-tok 'EOF #f #f))))) (parse-expr ps) (cg-pop (ps-cg ps)) - (ps-toks-set! ps sv))))))) + (ps-iter-set! ps sv))))))) (scope-leave! ps) #t) (define (collect-til-rparen ps) @@ -5921,15 +6149,17 @@ (let* ((in-path (car args)) (out-path (car (cdr args)))) (debug-log "phase=start" "heap" (heap-usage)) + ;; Streaming pipeline: lex → pp → parser → cg, all concurrent. + ;; Each stage pulls one tok at a time from upstream. Steady-state + ;; live data is bounded by parser/pp state, not source length. + ;; See docs/CC-STREAM.md. (let* ((src (%cc-slurp in-path)) (_1 (debug-log "phase=slurp" "heap" (heap-usage) "src-bytes" (bytevector-length src))) - (toks (lex-tokenize src in-path)) - (_2 (debug-log "phase=lex" "heap" (heap-usage))) - (expanded (pp-expand toks '())) - (_3 (debug-log "phase=pp" "heap" (heap-usage))) + (lex-iter (make-lex-iter src in-path)) + (pp-iter (make-pp-iter lex-iter '())) (cg (cg-init)) - (ps (make-pstate expanded cg))) + (ps (make-pstate pp-iter cg))) (parse-translation-unit ps) (debug-log "phase=parse" "heap" (heap-usage)) (let ((out (cg-finish cg)))