commit 62dabed96093fc078dc4098443bb6b59887cc5ad
parent 14855baae25c60c5747ae85815079ad84d16e992
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 27 Apr 2026 19:56:24 -0700
cc: lex per-token heap-mark/rewind + scratch buffer
Wrap each lex-tokenize iteration in heap-mark/heap-rewind!. The
helpers' tok+loc + 4-list result and all the interpreter-overhead
cons cells (bind_params, eval_args, let* env extension, named-let
frames in the inner scanners) sit above the mark and are reclaimed
before recursing. Scalar fields ferry across the rewind via outer-
let bindings (set! mutates the binding cdr in place, no fresh heap);
IDENT/STR bv contents ferry through %lex-scratch, a single sticky
bytevector allocated below all marks. Post-rewind the driver
rebuilds tok+loc+acc-cons and a fresh bv sized exactly to the
content.
Investigated interning (single bv per unique IDENT/STRING). With
scheme1's interpreter, walking the pool cons-list per lookup costs
~50-150 B per step in bind_params/eval_args/named-let env-extension
overhead. 16-way bucketing did worse than no intern on a 50 KB
prefix (60+ MB vs 17 MB). Skipped until scheme1 grows a vector
primitive.
Per-token heap delta on tests/cc/001-kitchen-sink.c (9.7 KB):
before: 12283 B/tok (~3 KB/source-byte)
after: 840 B/tok (~250 B/source-byte)
The 50 KB tcc.flat.c prefix that previously exhausted the heap
during lex now lex+pp's cleanly through to blocker 2 (dup decl).
Diffstat:
| M | cc/cc.scm | | | 292 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------- |
1 file changed, 200 insertions(+), 92 deletions(-)
diff --git a/cc/cc.scm b/cc/cc.scm
@@ -562,6 +562,58 @@
;; - Numeric digit runs accumulate their value inline via
;; %accum-int-while; they no longer materialize a per-byte cons list
;; and then a separate %digits-value walk.
+;; - lex-tokenize wraps each token-emitting iteration in an outer
+;; heap-mark / heap-rewind!. The helper allocates its own tok+loc+bv
+;; above this outer mark; the driver reads the scalar fields and
+;; copies any bv contents into %lex-scratch (sticky, pre-mark) before
+;; rewinding. Post-rewind it rebuilds a fresh tok+loc and a fresh bv
+;; (for IDENT/STR) sized to the actual content. Surface effect on
+;; the kitchen-sink test: per-token heap delta drops from ~12 KB
+;; (interpreter overhead, env-extension cons cells, eval_args lists,
+;; tail-rec named-let frames in the inner scanners, all sticky for
+;; the duration of lex) to ~840 B (tok + loc + acc-cons + bv +
+;; the irreducible eval-cost of the post-rewind reconstruction).
+
+;; --------------------------------------------------------------------
+;; Cross-rewind transport for IDENT / STR bv values.
+;;
+;; %lex-scratch is a single sticky bytevector allocated below any
+;; lex-tokenize heap-mark. The driver copies bv data here *before* the
+;; rewind, then post-rewind allocates a fresh bv (sized exactly to the
+;; ident/string content) by copying back out of scratch. The whole
+;; lex run shares this one buffer.
+;;
+;; We considered a deduplicating *intern pool* (one bv per unique
+;; ident, repeats share storage). With scheme1's interpreter, walking
+;; the pool cons-list per lookup costs ~50–150 B per step in
+;; bind_params/eval_args/named-let env-extension overhead. Even with
+;; 16-way bucketing, a 50 KB tcc.flat.c prefix saw walk costs blow
+;; past the bv-allocation savings (60+ MB heap vs 17 MB un-interned).
+;; The cleanest path until scheme1 gets a vector primitive is no
+;; intern; if a downstream eq?-fast-path is needed later, layer it
+;; atop %lex-scratch with a hash backed by a u8 bytevector.
+;; --------------------------------------------------------------------
+(define %lex-scratch-cap 65536)
+(define %lex-scratch (make-bytevector %lex-scratch-cap 0))
+
+(define (%lex-init!) #t)
+
+(define (%lex-scratch<- bv len)
+ (cond ((> len %lex-scratch-cap)
+ (die #f "lex: token exceeds scratch cap" len)))
+ (let loop ((i 0))
+ (cond ((< i len)
+ (bytevector-u8-set! %lex-scratch i (bytevector-u8-ref bv i))
+ (loop (+ i 1))))))
+
+(define (%lex-scratch->bv len)
+ ;; Allocate a fresh bv (exact size) and copy scratch[0..len) into it.
+ (let ((bv (make-bytevector len 0)))
+ (let copy ((i 0))
+ (cond ((< i len)
+ (bytevector-u8-set! bv i (bytevector-u8-ref %lex-scratch i))
+ (copy (+ i 1)))))
+ bv))
;; --------------------------------------------------------------------
;; Byte-class predicates (raw u8 values, not chars).
@@ -1329,105 +1381,161 @@
;; --------------------------------------------------------------------
;; lex-tokenize src file -> list of tok ending in EOF.
;; --------------------------------------------------------------------
-(define (lex-tokenize src file)
- (%lex-loop src 0 1 1 file '() #t))
-
;; bol? — `#t` when no token has been emitted on the current physical
;; line yet (start of file, or only NL + whitespace seen since the last
;; line break). pp recognizes a directive only when its leading `#` is
;; at line-start; we forward that decision into the token stream by
;; emitting `HASH` instead of `(PUNCT hash …)` for a line-leading `#`.
-(define (%lex-loop src pos line col file acc bol?)
- (let* ((sw (%skip-ws-and-comments src pos line col file))
- (pos1 (car sw))
- (line1 (car (cdr sw)))
- (col1 (car (cdr (cdr sw))))
- (p (%lex-peek src pos1 line1 col1))
- (b (%pk-byte p)))
- (cond
- ;; EOF
- ((not b)
- (let* ((eof-tok (make-tok 'EOF #f (%loc file line1 col1))))
- (reverse (cons eof-tok acc))))
- ;; Newline → emit NL, reset bol?.
- ((%newline? b)
- (let ((nl (make-tok 'NL #f (%loc file line1 col1))))
- (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p)
- file (cons nl acc) #t)))
- ;; Line-leading `#` → emit HASH, but only the bare `#`. `##` is
- ;; never line-leading in valid C; if it appears, fall through to
- ;; normal punctuator handling so it lexes as `paste`.
- ((and bol? (= b 35))
- (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
- (b2 (%pk-byte q)))
- (cond
- ((and b2 (= b2 35))
- (let* ((r (%lex-read-punct src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f)))
- (else
- (let ((tok (make-tok 'HASH #f (%loc file line1 col1))))
- (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p)
- file (cons tok acc) #f))))))
- ;; Identifier / keyword
- ((%ident-start? b)
- (let* ((r (%lex-read-ident src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f)))
- ;; Number (digit start)
- ((%digit? b)
- (let* ((r (%lex-read-number src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f)))
- ;; '.' might start a number (1.0 actually starts with digit; .5
- ;; would route here). We keep this as a punctuator unless followed
- ;; by a digit, in which case the lexer rejects per spec.
- ((= b 46)
- (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
- (b2 (%pk-byte q)))
+;;
+;; Heap discipline: each iteration of `loop` is wrapped in a
+;; heap-mark / heap-rewind!. All scratch the helper allocates (the
+;; helper's own tok/loc, the `(cons tok 4-list)` it returns, every
+;; bind_params / let* / eval_args env-cons consumed getting in and out)
+;; lives above the mark and is reclaimed before recursing. The driver
+;; ferries scalars across the rewind via the outer let bindings (set!
+;; mutates the binding cdr in place — no fresh heap), and bv contents
+;; via the sticky %lex-scratch buffer + %lex-intern pool. The only
+;; survivors per token are the freshly-built tok (48 B), loc (40 B),
+;; cons-onto-acc (16 B) and, on first sight, an interned bv.
+(define (lex-tokenize src file)
+ (%lex-init!)
+ (let ((pos 0) (line 1) (col 1) (acc '()) (bol? #t) (done? #f)
+ (mark 0)
+ ;; per-iteration scratch: kind + value (+ vlen for bv-typed),
+ ;; loc-line/col (the start of the token), and npos/nline/ncol
+ ;; (where the lexer stopped). All scalar — set! is safe across
+ ;; the rewind because the binding cons lives in the outer let,
+ ;; below the mark.
+ (kind #f) (val #f) (vlen 0)
+ (loc-line 1) (loc-col 1)
+ (npos 0) (nline 1) (ncol 1) (nbol? #t))
+ (let loop ()
+ (cond
+ (done? (reverse acc))
+ (else
+ (set! mark (heap-mark))
+ (let* ((sw (%skip-ws-and-comments src pos line col file))
+ (pos1 (car sw))
+ (line1 (car (cdr sw)))
+ (col1 (car (cdr (cdr sw))))
+ (p (%lex-peek src pos1 line1 col1))
+ (b (%pk-byte p)))
+ (set! loc-line line1)
+ (set! loc-col col1)
+ (set! val #f) (set! vlen 0) (set! nbol? #f)
+ (cond
+ ;; EOF
+ ((not b)
+ (set! kind 'EOF)
+ (set! npos pos1) (set! nline line1) (set! ncol col1))
+ ;; Newline → NL token; next iter starts at bol.
+ ((%newline? b)
+ (set! kind 'NL)
+ (set! npos (%pk-pos p))
+ (set! nline (%pk-line p))
+ (set! ncol (%pk-col p))
+ (set! nbol? #t))
+ ;; Line-leading `#`: bare `#` becomes HASH; `##` falls
+ ;; through to punctuator (lexes as `paste`).
+ ((and bol? (= b 35))
+ (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
+ (b2 (%pk-byte q)))
+ (cond
+ ((and b2 (= b2 35))
+ (let* ((r (%lex-read-punct src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind 'PUNCT) (set! val (tok-value tok))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))
+ (else
+ (set! kind 'HASH)
+ (set! npos (%pk-pos p))
+ (set! nline (%pk-line p))
+ (set! ncol (%pk-col p))))))
+ ;; Identifier / keyword
+ ((%ident-start? b)
+ (let* ((r (%lex-read-ident src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind (tok-kind tok))
+ (cond ((eq? (tok-kind tok) 'KW)
+ (set! val (tok-value tok)))
+ (else
+ (let ((bv (tok-value tok)))
+ (set! vlen (bytevector-length bv))
+ (%lex-scratch<- bv vlen))))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))
+ ;; Number (digit start)
+ ((%digit? b)
+ (let* ((r (%lex-read-number src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind 'INT) (set! val (tok-value tok))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))
+ ;; '.' is a punctuator unless followed by a digit (float).
+ ((= b 46)
+ (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
+ (b2 (%pk-byte q)))
+ (cond
+ ((and b2 (%digit? b2))
+ (die (%loc file line1 col1)
+ "floating-point literal not supported"))
+ (else
+ (let* ((r (%lex-read-punct src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind 'PUNCT) (set! val (tok-value tok))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest)))))))))
+ ;; String
+ ((= b 34)
+ (let* ((r (%lex-read-string src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r))
+ (bv (tok-value tok)))
+ (set! kind 'STR)
+ (set! vlen (bytevector-length bv))
+ (%lex-scratch<- bv vlen)
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))
+ ;; Char
+ ((= b 39)
+ (let* ((r (%lex-read-char src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind 'CHAR) (set! val (tok-value tok))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))
+ ;; Punctuator (default)
+ (else
+ (let* ((r (%lex-read-punct src pos1 line1 col1 file))
+ (tok (car r)) (rest (cdr r)))
+ (set! kind 'PUNCT) (set! val (tok-value tok))
+ (set! npos (car rest))
+ (set! nline (car (cdr rest)))
+ (set! ncol (car (cdr (cdr rest))))))))
+ (heap-rewind! mark)
+ ;; Reconstruct survivors post-rewind. Bv-typed values go
+ ;; through the intern pool so repeats share storage.
(cond
- ((and b2 (%digit? b2))
- (die (%loc file line1 col1) "floating-point literal not supported"))
+ ((eq? kind 'EOF)
+ (set! acc (cons (make-tok 'EOF #f (%loc file loc-line loc-col))
+ acc))
+ (set! done? #t))
(else
- (let* ((r (%lex-read-punct src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f))))))
- ;; String
- ((= b 34)
- (let* ((r (%lex-read-string src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f)))
- ;; Char
- ((= b 39)
- (let* ((r (%lex-read-char src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f)))
- ;; Punctuator (default)
- (else
- (let* ((r (%lex-read-punct src pos1 line1 col1 file))
- (tok (car r))
- (npos (car (cdr r)))
- (nline (car (cdr (cdr r))))
- (ncol (car (cdr (cdr (cdr r))))))
- (%lex-loop src npos nline ncol file (cons tok acc) #f))))))
+ (let ((tok-val (cond ((eq? kind 'IDENT) (%lex-scratch->bv vlen))
+ ((eq? kind 'STR) (%lex-scratch->bv vlen))
+ (else val))))
+ (set! acc (cons (make-tok kind tok-val
+ (%loc file loc-line loc-col))
+ acc))
+ (set! pos npos) (set! line nline) (set! col ncol)
+ (set! bol? nbol?)))
+ )
+ (loop))))))
;; cc/pp.scm — token list -> expanded token list.
;; Realizes docs/CC-INTERNALS.md §pp.scm. Hide-set per C11 6.10.3.4.
;; #include rejected (CC.md §Toolchain envelope).