boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 313276b56269e5599f1d94a89090fb6dd9ec6461
parent 84f28e2b008ec0d51d66fd7bae09749a90a2fd53
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 27 Apr 2026 08:01:00 -0700

cc/lex: cut heap allocations via fast-path peek + per-byte bv elision + punct buckets

Three independent reductions in lex.scm tight loops:

A. Add %fast-byte? and inline a non-allocating fast path in
   %collect-while, %collect-ident, %collect-octals, %collect-string,
   %match-bytes, %skip-ws-and-comments, %skip-line-comment, and
   %skip-block-comment. The common ASCII case (byte not '?', '\\',
   '\n') reads via bytevector-u8-ref and increments pos/col directly,
   skipping %lex-peek's 4-list allocation. The slow path still routes
   through %lex-peek so trigraphs and \<nl> line splices remain correct.

B. %collect-ident now collects raw byte ints (forward-cons) and
   %lex-read-ident materializes the name via a single %bv-of-bytes call.
   Replaces N per-byte make-bytevector calls + a trailing bv-cat with
   one allocation per identifier.

C. Add %punct-buckets, an alist of (first-byte . sub-alist) computed
   once at module init from %punct-alist. %lex-read-punct now peeks
   once, dispatches by leading byte, and only retries the small set of
   patterns sharing that byte instead of all ~50 every time.

Heap-usage delta on tests/cc-e2e/00-return-argc.c (234 source bytes):
  before  1,068,080 B  in lex
  after     427,856 B  in lex      (-60%)

kitchen-sink (9,782 bytes) previously exhausted the 64 MiB heap inside
lex; the lex phase now completes (parse becomes the next bottleneck).

cc-lex 16/16, cc-pp 31/31, cc-parse 68/68, cc-cg 48/48 pass on aarch64.

Diffstat:
Mcc/lex.scm | 352++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 260 insertions(+), 92 deletions(-)

diff --git a/cc/lex.scm b/cc/lex.scm @@ -102,6 +102,17 @@ (define (%pk-line p) (car (cdr (cdr p)))) (define (%pk-col p) (car (cdr (cdr (cdr p))))) +;; Fast-byte test. When (%fast-byte? b) is #t, reading b directly with +;; bytevector-u8-ref is exactly equivalent to %lex-peek's result: the +;; logical byte is b, npos = pos+1, nline unchanged, ncol = col+1, and +;; no list allocation is needed. Excludes the three bytes that %lex-peek +;; can transform: '?' (trigraph), '\\' (line splice), '\n' (line bump). +(define (%fast-byte? b) + (cond ((= b 63) #f) + ((= b 92) #f) + ((= b 10) #f) + (else #t))) + ;; -------------------------------------------------------------------- ;; Build a fresh logical-byte snapshot of `src`. Used by lex-read-* ;; helpers (which consume from a bv with translations already applied, @@ -125,63 +136,109 @@ ;; *not* consume `\n` — newlines are tokens. ;; -------------------------------------------------------------------- (define (%skip-ws-and-comments src pos line col file) - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) + (let ((n (bytevector-length src))) (cond - ((not b) (list pos line col)) - ((%hspace? b) - (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p) file)) - ;; '/' starts a possible comment - ((= b 47) - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) + ((>= pos n) (list pos line col)) + (else + (let ((b (bytevector-u8-ref src pos))) (cond - ;; "//" line comment — eat to but not through '\n' - ((and b2 (= b2 47)) - (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file)) - ;; "/*" block comment — eat until closing "*/" - ((and b2 (= b2 42)) - (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q) - file line col)) - (else (list pos line col))))) + ((and (%fast-byte? b) (%hspace? b)) + (%skip-ws-and-comments src (+ pos 1) line (+ col 1) file)) + ((%fast-byte? b) + ;; Fast-byte that isn't hspace. Only '/' is interesting; + ;; everything else terminates the skip. + (cond + ((= b 47) (%maybe-comment src pos line col file)) + (else (list pos line col)))) + (else + ;; Slow path: trigraph / splice / newline. + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (cond + ((not b2) (list pos line col)) + ((%hspace? b2) + (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p) + file)) + ((= b2 47) (%maybe-comment src pos line col file)) + (else (list pos line col))))))))))) + +(define (%maybe-comment src pos line col file) + ;; Source byte at pos resolves to '/'. Decide between // line comment, + ;; /* block comment, or "leave the slash alone" (it's a punctuator). + (let* ((p (%lex-peek src pos line col)) + (q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) + (cond + ((and b2 (= b2 47)) + (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file)) + ((and b2 (= b2 42)) + (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q) + file line col)) (else (list pos line col))))) (define (%skip-line-comment src pos line col file) ;; Consume bytes until end-of-stream or until we *see* '\n' (do not ;; consume the newline itself; outer loop emits the NL). - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) + (let ((n (bytevector-length src))) (cond - ((not b) (%skip-ws-and-comments src pos line col file)) - ((%newline? b) (%skip-ws-and-comments src pos line col file)) + ((>= pos n) (%skip-ws-and-comments src pos line col file)) (else - (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p) file))))) + (let ((b (bytevector-u8-ref src pos))) + (cond + ;; '\n' terminates without consuming. + ((= b 10) (%skip-ws-and-comments src pos line col file)) + ((%fast-byte? b) + (%skip-line-comment src (+ pos 1) line (+ col 1) file)) + (else + ;; Slow path: ?/\ — let %lex-peek handle trigraph/splice. + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (cond + ((not b2) (%skip-ws-and-comments src pos line col file)) + ((%newline? b2) (%skip-ws-and-comments src pos line col file)) + (else + (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p) + file))))))))))) (define (%skip-block-comment src pos line col file start-line start-col) - (let* ((p (%lex-peek src pos line col)) - (b1 (%pk-byte p))) + (let ((n (bytevector-length src))) (cond - ((not b1) + ((>= pos n) (die (%loc file start-line start-col) "unterminated /* block comment")) - ((= b1 42) ; '*' - (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) - (b2 (%pk-byte q))) + (else + (let ((b (bytevector-u8-ref src pos))) (cond - ((not b2) - (die (%loc file start-line start-col) - "unterminated /* block comment")) - ((= b2 47) ; '*' '/' - (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q) - file)) + ;; Fast path for plain content bytes that aren't '*'. + ((and (%fast-byte? b) (not (= b 42))) + (%skip-block-comment src (+ pos 1) line (+ col 1) + file start-line start-col)) (else - ;; Re-scan starting at the byte after '*'; the '*' was not - ;; the closer, but the next byte might itself be '*'. - (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) - file start-line start-col))))) - (else - (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) - file start-line start-col))))) + ;; Slow path: '*', '\n', '?' (trigraph), '\\' (splice). + (let* ((p (%lex-peek src pos line col)) + (b1 (%pk-byte p))) + (cond + ((not b1) + (die (%loc file start-line start-col) + "unterminated /* block comment")) + ((= b1 42) + (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) + (b2 (%pk-byte q))) + (cond + ((not b2) + (die (%loc file start-line start-col) + "unterminated /* block comment")) + ((= b2 47) + (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q) + file)) + (else + ;; Re-scan starting at the byte after '*'; the '*' was + ;; not the closer, but the next byte might itself be '*'. + (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) + file start-line start-col))))) + (else + (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) + file start-line start-col))))))))))) ;; -------------------------------------------------------------------- ;; Identifier / keyword reader. @@ -190,16 +247,26 @@ ;; first byte at `pos` satisfies %ident-start?. ;; -------------------------------------------------------------------- (define (%collect-ident src pos line col) - ;; Walk %ident-cont? bytes; return (chunks-rev npos nline ncol) where - ;; chunks-rev is a reversed list of one-byte bvs. We reuse make-tok - ;; loc from the caller. - (let loop ((pos pos) (line line) (col col) (acc '())) - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) - (if (and b (%ident-cont? b)) - (loop (%pk-pos p) (%pk-line p) (%pk-col p) - (cons (bv-of-byte b) acc)) - (list (reverse acc) pos line col))))) + ;; Walk %ident-cont? bytes; return (bytes-fwd npos nline ncol) where + ;; bytes-fwd is a forward list of byte ints. Caller calls %bv-of-bytes + ;; once to materialize the name, avoiding a per-byte bv allocation. + (let ((n (bytevector-length src))) + (let loop ((pos pos) (line line) (col col) (acc '())) + (cond + ((>= pos n) (list (reverse acc) pos line col)) + (else + (let ((b (bytevector-u8-ref src pos))) + (cond + ((%fast-byte? b) + (if (%ident-cont? b) + (loop (+ pos 1) line (+ col 1) (cons b acc)) + (list (reverse acc) pos line col))) + (else + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (if (and b2 (%ident-cont? b2)) + (loop (%pk-pos p) (%pk-line p) (%pk-col p) (cons b2 acc)) + (list (reverse acc) pos line col))))))))))) (define (lex-read-ident src pos file) ;; Public for tests. Threads line/col from a fresh start. @@ -207,12 +274,12 @@ (define (%lex-read-ident src pos line col file) (let* ((start-loc (%loc file line col)) - (r (%collect-ident src pos line col)) - (chs (car r)) + (r (%collect-ident src pos line col)) + (bs (car r)) (npos (car (cdr r))) (nline (car (cdr (cdr r)))) (ncol (car (cdr (cdr (cdr r))))) - (name (bv-cat chs)) + (name (%bv-of-bytes bs)) (kw (alist-ref name %keyword-alist))) (cons (if kw (make-tok 'KW kw start-loc) @@ -233,14 +300,24 @@ (%lex-read-number src pos 1 (+ pos 1) file)) (define (%collect-while pred src pos line col) - ;; Generic byte collector. Returns (chunks-rev-list npos nline ncol). - (let loop ((pos pos) (line line) (col col) (acc '())) - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) - (if (and b (pred b)) - (loop (%pk-pos p) (%pk-line p) (%pk-col p) - (cons b acc)) - (list (reverse acc) pos line col))))) + ;; Generic byte collector. Returns (forward-byte-list npos nline ncol). + (let ((n (bytevector-length src))) + (let loop ((pos pos) (line line) (col col) (acc '())) + (cond + ((>= pos n) (list (reverse acc) pos line col)) + (else + (let ((b (bytevector-u8-ref src pos))) + (cond + ((%fast-byte? b) + (if (pred b) + (loop (+ pos 1) line (+ col 1) (cons b acc)) + (list (reverse acc) pos line col))) + (else + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (if (and b2 (pred b2)) + (loop (%pk-pos p) (%pk-line p) (%pk-col p) (cons b2 acc)) + (list (reverse acc) pos line col))))))))))) (define (%bv-of-bytes bs) ;; Build a fresh bv from a list of byte ints. Allocates one bv. @@ -388,23 +465,39 @@ file start-loc '())))) (define (%collect-string src pos line col file start-loc acc) - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) + (let ((n (bytevector-length src))) (cond - ((not b) - (die start-loc "unterminated string literal")) - ((= b 34) ; closing '"' - (let ((bv (%bv-of-bytes (reverse acc)))) - (cons (make-tok 'STR bv start-loc) - (list (%pk-pos p) (%pk-line p) (%pk-col p))))) - ((%newline? b) - (die start-loc "newline in string literal")) - ((= b 92) ; '\\' — escape - (%read-escape src (%pk-pos p) (%pk-line p) (%pk-col p) - file start-loc acc 'string)) + ((>= pos n) (die start-loc "unterminated string literal")) (else - (%collect-string src (%pk-pos p) (%pk-line p) (%pk-col p) - file start-loc (cons b acc)))))) + (let ((b (bytevector-u8-ref src pos))) + (cond + ;; Closing quote — fast byte but special. + ((= b 34) + (let ((bv (%bv-of-bytes (reverse acc)))) + (cons (make-tok 'STR bv start-loc) + (list (+ pos 1) line (+ col 1))))) + ((%fast-byte? b) + (%collect-string src (+ pos 1) line (+ col 1) + file start-loc (cons b acc))) + (else + ;; Slow path: ?/\ (trigraph/splice/escape) or '\n'. + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (cond + ((not b2) + (die start-loc "unterminated string literal")) + ((= b2 34) + (let ((bv (%bv-of-bytes (reverse acc)))) + (cons (make-tok 'STR bv start-loc) + (list (%pk-pos p) (%pk-line p) (%pk-col p))))) + ((%newline? b2) + (die start-loc "newline in string literal")) + ((= b2 92) + (%read-escape src (%pk-pos p) (%pk-line p) (%pk-col p) + file start-loc acc 'string)) + (else + (%collect-string src (%pk-pos p) (%pk-line p) (%pk-col p) + file start-loc (cons b2 acc)))))))))))) ;; -------------------------------------------------------------------- ;; Char reader. @@ -512,12 +605,24 @@ (cond ((zero? k) (list (reverse acc) pos line col)) (else - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) - (if (and b (%octal? b)) - (%collect-octals src (%pk-pos p) (%pk-line p) (%pk-col p) - (- k 1) (cons b acc)) - (list (reverse acc) pos line col)))))) + (let ((n (bytevector-length src))) + (cond + ((>= pos n) (list (reverse acc) pos line col)) + (else + (let ((b (bytevector-u8-ref src pos))) + (cond + ((%fast-byte? b) + (if (%octal? b) + (%collect-octals src (+ pos 1) line (+ col 1) + (- k 1) (cons b acc)) + (list (reverse acc) pos line col))) + (else + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (if (and b2 (%octal? b2)) + (%collect-octals src (%pk-pos p) (%pk-line p) (%pk-col p) + (- k 1) (cons b2 acc)) + (list (reverse acc) pos line col)))))))))))) (define (%escape-result mode val pos line col src file start-loc acc) (cond @@ -531,14 +636,64 @@ ;; Punctuator reader. ;; ;; Greedy longest-match against %punct-alist (cc/data.scm). The alist -;; is already ordered longest-first. +;; is already ordered longest-first. We additionally bucket entries by +;; their first byte so %lex-read-punct only loops over the small set of +;; patterns that can start at the current source byte. ;; -------------------------------------------------------------------- + +(define (%alist-ref-int k al) + ;; Lookup in an int-keyed alist (linear scan, '= compare). + (cond ((null? al) #f) + ((= (car (car al)) k) (cdr (car al))) + (else (%alist-ref-int k (cdr al))))) + +(define (%mem-int? k xs) + (cond ((null? xs) #f) + ((= (car xs) k) #t) + (else (%mem-int? k (cdr xs))))) + +(define (%filter-by-first-byte b al) + ;; Subset of `al` whose pattern starts with byte b, preserving order. + (cond + ((null? al) '()) + ((= (bytevector-u8-ref (car (car al)) 0) b) + (cons (car al) (%filter-by-first-byte b (cdr al)))) + (else (%filter-by-first-byte b (cdr al))))) + +(define (%group-by-first-byte al) + ;; Build ((first-byte . sub-alist) ...) over `al`, one bucket per + ;; distinct first byte; sub-alist preserves longest-match-first + ;; order from the source list. + (let loop ((xs al) (seen '()) (out '())) + (cond + ((null? xs) (reverse out)) + (else + (let* ((entry (car xs)) + (pat (car entry)) + (b (bytevector-u8-ref pat 0))) + (cond + ((%mem-int? b seen) (loop (cdr xs) seen out)) + (else + (loop (cdr xs) + (cons b seen) + (cons (cons b (%filter-by-first-byte b al)) out))))))))) + +(define %punct-buckets (%group-by-first-byte %punct-alist)) + (define (lex-read-punct src pos file) (%lex-read-punct src pos 1 (+ pos 1) file)) (define (%lex-read-punct src pos line col file) - (let ((start-loc (%loc file line col))) - (%punct-loop src pos line col file start-loc %punct-alist))) + (let* ((start-loc (%loc file line col)) + (p (%lex-peek src pos line col)) + (b (%pk-byte p))) + (cond + ((not b) (die start-loc "unrecognized byte" "EOF")) + (else + (let ((bucket (%alist-ref-int b %punct-buckets))) + (cond + ((not bucket) (die start-loc "unrecognized byte" (bv-of-byte b))) + (else (%punct-loop src pos line col file start-loc bucket)))))))) (define (%punct-loop src pos line col file start-loc al) (cond @@ -562,13 +717,26 @@ (cond ((= i (bytevector-length pat)) (list pos line col)) (else - (let* ((p (%lex-peek src pos line col)) - (b (%pk-byte p))) + (let ((n (bytevector-length src))) (cond - ((not b) #f) - ((= b (bytevector-u8-ref pat i)) - (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p) pat (+ i 1))) - (else #f)))))) + ((>= pos n) #f) + (else + (let ((b (bytevector-u8-ref src pos)) + (pb (bytevector-u8-ref pat i))) + (cond + ((%fast-byte? b) + (if (= b pb) + (%match-bytes src (+ pos 1) line (+ col 1) pat (+ i 1)) + #f)) + (else + (let* ((p (%lex-peek src pos line col)) + (b2 (%pk-byte p))) + (cond + ((not b2) #f) + ((= b2 pb) + (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p) + pat (+ i 1))) + (else #f)))))))))))) ;; -------------------------------------------------------------------- ;; lex-tokenize src file -> list of tok ending in EOF.