boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 62dabed96093fc078dc4098443bb6b59887cc5ad
parent 14855baae25c60c5747ae85815079ad84d16e992
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 27 Apr 2026 19:56:24 -0700

cc: lex per-token heap-mark/rewind + scratch buffer

Wrap each lex-tokenize iteration in heap-mark/heap-rewind!. The
helpers' tok+loc + 4-list result and all the interpreter-overhead
cons cells (bind_params, eval_args, let* env extension, named-let
frames in the inner scanners) sit above the mark and are reclaimed
before recursing. Scalar fields ferry across the rewind via outer-
let bindings (set! mutates the binding cdr in place, no fresh heap);
IDENT/STR bv contents ferry through %lex-scratch, a single sticky
bytevector allocated below all marks. Post-rewind the driver
rebuilds tok+loc+acc-cons and a fresh bv sized exactly to the
content.

Investigated interning (single bv per unique IDENT/STRING). With
scheme1's interpreter, walking the pool cons-list per lookup costs
~50-150 B per step in bind_params/eval_args/named-let env-extension
overhead. 16-way bucketing did worse than no intern on a 50 KB
prefix (60+ MB vs 17 MB). Skipped until scheme1 grows a vector
primitive.

Per-token heap delta on tests/cc/001-kitchen-sink.c (9.7 KB):
  before: 12283 B/tok (~3 KB/source-byte)
  after:    840 B/tok (~250 B/source-byte)

The 50 KB tcc.flat.c prefix that previously exhausted the heap
during lex now lex+pp's cleanly through to blocker 2 (dup decl).

Diffstat:
Mcc/cc.scm | 292++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 200 insertions(+), 92 deletions(-)

diff --git a/cc/cc.scm b/cc/cc.scm @@ -562,6 +562,58 @@ ;; - Numeric digit runs accumulate their value inline via ;; %accum-int-while; they no longer materialize a per-byte cons list ;; and then a separate %digits-value walk. +;; - lex-tokenize wraps each token-emitting iteration in an outer +;; heap-mark / heap-rewind!. The helper allocates its own tok+loc+bv +;; above this outer mark; the driver reads the scalar fields and +;; copies any bv contents into %lex-scratch (sticky, pre-mark) before +;; rewinding. Post-rewind it rebuilds a fresh tok+loc and a fresh bv +;; (for IDENT/STR) sized to the actual content. Surface effect on +;; the kitchen-sink test: per-token heap delta drops from ~12 KB +;; (interpreter overhead, env-extension cons cells, eval_args lists, +;; tail-rec named-let frames in the inner scanners, all sticky for +;; the duration of lex) to ~840 B (tok + loc + acc-cons + bv + +;; the irreducible eval-cost of the post-rewind reconstruction). + +;; -------------------------------------------------------------------- +;; Cross-rewind transport for IDENT / STR bv values. +;; +;; %lex-scratch is a single sticky bytevector allocated below any +;; lex-tokenize heap-mark. The driver copies bv data here *before* the +;; rewind, then post-rewind allocates a fresh bv (sized exactly to the +;; ident/string content) by copying back out of scratch. The whole +;; lex run shares this one buffer. +;; +;; We considered a deduplicating *intern pool* (one bv per unique +;; ident, repeats share storage). With scheme1's interpreter, walking +;; the pool cons-list per lookup costs ~50–150 B per step in +;; bind_params/eval_args/named-let env-extension overhead. Even with +;; 16-way bucketing, a 50 KB tcc.flat.c prefix saw walk costs blow +;; past the bv-allocation savings (60+ MB heap vs 17 MB un-interned). +;; The cleanest path until scheme1 gets a vector primitive is no +;; intern; if a downstream eq?-fast-path is needed later, layer it +;; atop %lex-scratch with a hash backed by a u8 bytevector. +;; -------------------------------------------------------------------- +(define %lex-scratch-cap 65536) +(define %lex-scratch (make-bytevector %lex-scratch-cap 0)) + +(define (%lex-init!) #t) + +(define (%lex-scratch<- bv len) + (cond ((> len %lex-scratch-cap) + (die #f "lex: token exceeds scratch cap" len))) + (let loop ((i 0)) + (cond ((< i len) + (bytevector-u8-set! %lex-scratch i (bytevector-u8-ref bv i)) + (loop (+ i 1)))))) + +(define (%lex-scratch->bv len) + ;; Allocate a fresh bv (exact size) and copy scratch[0..len) into it. + (let ((bv (make-bytevector len 0))) + (let copy ((i 0)) + (cond ((< i len) + (bytevector-u8-set! bv i (bytevector-u8-ref %lex-scratch i)) + (copy (+ i 1))))) + bv)) ;; -------------------------------------------------------------------- ;; Byte-class predicates (raw u8 values, not chars). @@ -1329,105 +1381,161 @@ ;; -------------------------------------------------------------------- ;; lex-tokenize src file -> list of tok ending in EOF. ;; -------------------------------------------------------------------- -(define (lex-tokenize src file) - (%lex-loop src 0 1 1 file '() #t)) - ;; bol? — `#t` when no token has been emitted on the current physical ;; line yet (start of file, or only NL + whitespace seen since the last ;; line break). pp recognizes a directive only when its leading `#` is ;; at line-start; we forward that decision into the token stream by ;; emitting `HASH` instead of `(PUNCT hash …)` for a line-leading `#`. -(define (%lex-loop src pos line col file acc bol?) - (let* ((sw (%skip-ws-and-comments src pos line col file)) - (pos1 (car sw)) - (line1 (car (cdr sw))) - (col1 (car (cdr (cdr sw)))) - (p (%lex-peek src pos1 line1 col1)) - (b (%pk-byte p))) - (cond - ;; EOF - ((not b) - (let* ((eof-tok (make-tok 'EOF #f (%loc file line1 col1)))) - (reverse (cons eof-tok acc)))) - ;; Newline → emit NL, reset bol?. - ((%newline? b) - (let ((nl (make-tok 'NL #f (%loc file line1 col1)))) - (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p) - file (cons nl acc) #t))) - ;; Line-leading `#` → emit HASH, but only the bare `#`. `##` is - ;; never line-leading in valid C; if it appears, fall through to - ;; normal punctuator handling so it lexes as `paste`. - ((and bol? (= b 35)) - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) - (cond - ((and b2 (= b2 35)) - (let* ((r (%lex-read-punct src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f))) - (else - (let ((tok (make-tok 'HASH #f (%loc file line1 col1)))) - (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p) - file (cons tok acc) #f)))))) - ;; Identifier / keyword - ((%ident-start? b) - (let* ((r (%lex-read-ident src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f))) - ;; Number (digit start) - ((%digit? b) - (let* ((r (%lex-read-number src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f))) - ;; '.' might start a number (1.0 actually starts with digit; .5 - ;; would route here). We keep this as a punctuator unless followed - ;; by a digit, in which case the lexer rejects per spec. - ((= b 46) - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) +;; +;; Heap discipline: each iteration of `loop` is wrapped in a +;; heap-mark / heap-rewind!. All scratch the helper allocates (the +;; helper's own tok/loc, the `(cons tok 4-list)` it returns, every +;; bind_params / let* / eval_args env-cons consumed getting in and out) +;; lives above the mark and is reclaimed before recursing. The driver +;; ferries scalars across the rewind via the outer let bindings (set! +;; mutates the binding cdr in place — no fresh heap), and bv contents +;; via the sticky %lex-scratch buffer + %lex-intern pool. The only +;; survivors per token are the freshly-built tok (48 B), loc (40 B), +;; cons-onto-acc (16 B) and, on first sight, an interned bv. +(define (lex-tokenize src file) + (%lex-init!) + (let ((pos 0) (line 1) (col 1) (acc '()) (bol? #t) (done? #f) + (mark 0) + ;; per-iteration scratch: kind + value (+ vlen for bv-typed), + ;; loc-line/col (the start of the token), and npos/nline/ncol + ;; (where the lexer stopped). All scalar — set! is safe across + ;; the rewind because the binding cons lives in the outer let, + ;; below the mark. + (kind #f) (val #f) (vlen 0) + (loc-line 1) (loc-col 1) + (npos 0) (nline 1) (ncol 1) (nbol? #t)) + (let loop () + (cond + (done? (reverse acc)) + (else + (set! mark (heap-mark)) + (let* ((sw (%skip-ws-and-comments src pos line col file)) + (pos1 (car sw)) + (line1 (car (cdr sw))) + (col1 (car (cdr (cdr sw)))) + (p (%lex-peek src pos1 line1 col1)) + (b (%pk-byte p))) + (set! loc-line line1) + (set! loc-col col1) + (set! val #f) (set! vlen 0) (set! nbol? #f) + (cond + ;; EOF + ((not b) + (set! kind 'EOF) + (set! npos pos1) (set! nline line1) (set! ncol col1)) + ;; Newline → NL token; next iter starts at bol. + ((%newline? b) + (set! kind 'NL) + (set! npos (%pk-pos p)) + (set! nline (%pk-line p)) + (set! ncol (%pk-col p)) + (set! nbol? #t)) + ;; Line-leading `#`: bare `#` becomes HASH; `##` falls + ;; through to punctuator (lexes as `paste`). + ((and bol? (= b 35)) + (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) + (cond + ((and b2 (= b2 35)) + (let* ((r (%lex-read-punct src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'PUNCT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + (else + (set! kind 'HASH) + (set! npos (%pk-pos p)) + (set! nline (%pk-line p)) + (set! ncol (%pk-col p)))))) + ;; Identifier / keyword + ((%ident-start? b) + (let* ((r (%lex-read-ident src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind (tok-kind tok)) + (cond ((eq? (tok-kind tok) 'KW) + (set! val (tok-value tok))) + (else + (let ((bv (tok-value tok))) + (set! vlen (bytevector-length bv)) + (%lex-scratch<- bv vlen)))) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Number (digit start) + ((%digit? b) + (let* ((r (%lex-read-number src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'INT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; '.' is a punctuator unless followed by a digit (float). + ((= b 46) + (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) + (cond + ((and b2 (%digit? b2)) + (die (%loc file line1 col1) + "floating-point literal not supported")) + (else + (let* ((r (%lex-read-punct src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'PUNCT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest))))))))) + ;; String + ((= b 34) + (let* ((r (%lex-read-string src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r)) + (bv (tok-value tok))) + (set! kind 'STR) + (set! vlen (bytevector-length bv)) + (%lex-scratch<- bv vlen) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Char + ((= b 39) + (let* ((r (%lex-read-char src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'CHAR) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))) + ;; Punctuator (default) + (else + (let* ((r (%lex-read-punct src pos1 line1 col1 file)) + (tok (car r)) (rest (cdr r))) + (set! kind 'PUNCT) (set! val (tok-value tok)) + (set! npos (car rest)) + (set! nline (car (cdr rest))) + (set! ncol (car (cdr (cdr rest)))))))) + (heap-rewind! mark) + ;; Reconstruct survivors post-rewind. Bv-typed values go + ;; through the intern pool so repeats share storage. (cond - ((and b2 (%digit? b2)) - (die (%loc file line1 col1) "floating-point literal not supported")) + ((eq? kind 'EOF) + (set! acc (cons (make-tok 'EOF #f (%loc file loc-line loc-col)) + acc)) + (set! done? #t)) (else - (let* ((r (%lex-read-punct src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f)))))) - ;; String - ((= b 34) - (let* ((r (%lex-read-string src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f))) - ;; Char - ((= b 39) - (let* ((r (%lex-read-char src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f))) - ;; Punctuator (default) - (else - (let* ((r (%lex-read-punct src pos1 line1 col1 file)) - (tok (car r)) - (npos (car (cdr r))) - (nline (car (cdr (cdr r)))) - (ncol (car (cdr (cdr (cdr r)))))) - (%lex-loop src npos nline ncol file (cons tok acc) #f)))))) + (let ((tok-val (cond ((eq? kind 'IDENT) (%lex-scratch->bv vlen)) + ((eq? kind 'STR) (%lex-scratch->bv vlen)) + (else val)))) + (set! acc (cons (make-tok kind tok-val + (%loc file loc-line loc-col)) + acc)) + (set! pos npos) (set! line nline) (set! col ncol) + (set! bol? nbol?))) + ) + (loop)))))) ;; cc/pp.scm — token list -> expanded token list. ;; Realizes docs/CC-INTERNALS.md §pp.scm. Hide-set per C11 6.10.3.4. ;; #include rejected (CC.md §Toolchain envelope).