commit 313276b56269e5599f1d94a89090fb6dd9ec6461
parent 84f28e2b008ec0d51d66fd7bae09749a90a2fd53
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 27 Apr 2026 08:01:00 -0700
cc/lex: cut heap allocations via fast-path peek + per-byte bv elision + punct buckets
Three independent reductions in lex.scm tight loops:
A. Add %fast-byte? and inline a non-allocating fast path in
%collect-while, %collect-ident, %collect-octals, %collect-string,
%match-bytes, %skip-ws-and-comments, %skip-line-comment, and
%skip-block-comment. The common ASCII case (byte not '?', '\\',
'\n') reads via bytevector-u8-ref and increments pos/col directly,
skipping %lex-peek's 4-list allocation. The slow path still routes
through %lex-peek so trigraphs and \<nl> line splices remain correct.
B. %collect-ident now collects raw byte ints (forward-cons) and
%lex-read-ident materializes the name via a single %bv-of-bytes call.
Replaces N per-byte make-bytevector calls + a trailing bv-cat with
one allocation per identifier.
C. Add %punct-buckets, an alist of (first-byte . sub-alist) computed
once at module init from %punct-alist. %lex-read-punct now peeks
once, dispatches by leading byte, and only retries the small set of
patterns sharing that byte instead of all ~50 every time.
Heap-usage delta on tests/cc-e2e/00-return-argc.c (234 source bytes):
before 1,068,080 B in lex
after 427,856 B in lex (-60%)
kitchen-sink (9,782 bytes) previously exhausted the 64 MiB heap inside
lex; the lex phase now completes (parse becomes the next bottleneck).
cc-lex 16/16, cc-pp 31/31, cc-parse 68/68, cc-cg 48/48 pass on aarch64.
Diffstat:
| M | cc/lex.scm | | | 352 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
1 file changed, 260 insertions(+), 92 deletions(-)
diff --git a/cc/lex.scm b/cc/lex.scm
@@ -102,6 +102,17 @@
(define (%pk-line p) (car (cdr (cdr p))))
(define (%pk-col p) (car (cdr (cdr (cdr p)))))
+;; Fast-byte test. When (%fast-byte? b) is #t, reading b directly with
+;; bytevector-u8-ref is exactly equivalent to %lex-peek's result: the
+;; logical byte is b, npos = pos+1, nline unchanged, ncol = col+1, and
+;; no list allocation is needed. Excludes the three bytes that %lex-peek
+;; can transform: '?' (trigraph), '\\' (line splice), '\n' (line bump).
+(define (%fast-byte? b)
+ (cond ((= b 63) #f)
+ ((= b 92) #f)
+ ((= b 10) #f)
+ (else #t)))
+
;; --------------------------------------------------------------------
;; Build a fresh logical-byte snapshot of `src`. Used by lex-read-*
;; helpers (which consume from a bv with translations already applied,
@@ -125,63 +136,109 @@
;; *not* consume `\n` — newlines are tokens.
;; --------------------------------------------------------------------
(define (%skip-ws-and-comments src pos line col file)
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
+ (let ((n (bytevector-length src)))
(cond
- ((not b) (list pos line col))
- ((%hspace? b)
- (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p) file))
- ;; '/' starts a possible comment
- ((= b 47)
- (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
- (b2 (%pk-byte q)))
+ ((>= pos n) (list pos line col))
+ (else
+ (let ((b (bytevector-u8-ref src pos)))
(cond
- ;; "//" line comment — eat to but not through '\n'
- ((and b2 (= b2 47))
- (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file))
- ;; "/*" block comment — eat until closing "*/"
- ((and b2 (= b2 42))
- (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q)
- file line col))
- (else (list pos line col)))))
+ ((and (%fast-byte? b) (%hspace? b))
+ (%skip-ws-and-comments src (+ pos 1) line (+ col 1) file))
+ ((%fast-byte? b)
+ ;; Fast-byte that isn't hspace. Only '/' is interesting;
+ ;; everything else terminates the skip.
+ (cond
+ ((= b 47) (%maybe-comment src pos line col file))
+ (else (list pos line col))))
+ (else
+ ;; Slow path: trigraph / splice / newline.
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (cond
+ ((not b2) (list pos line col))
+ ((%hspace? b2)
+ (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file))
+ ((= b2 47) (%maybe-comment src pos line col file))
+ (else (list pos line col)))))))))))
+
+(define (%maybe-comment src pos line col file)
+ ;; Source byte at pos resolves to '/'. Decide between // line comment,
+ ;; /* block comment, or "leave the slash alone" (it's a punctuator).
+ (let* ((p (%lex-peek src pos line col))
+ (q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
+ (b2 (%pk-byte q)))
+ (cond
+ ((and b2 (= b2 47))
+ (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file))
+ ((and b2 (= b2 42))
+ (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q)
+ file line col))
(else (list pos line col)))))
(define (%skip-line-comment src pos line col file)
;; Consume bytes until end-of-stream or until we *see* '\n' (do not
;; consume the newline itself; outer loop emits the NL).
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
+ (let ((n (bytevector-length src)))
(cond
- ((not b) (%skip-ws-and-comments src pos line col file))
- ((%newline? b) (%skip-ws-and-comments src pos line col file))
+ ((>= pos n) (%skip-ws-and-comments src pos line col file))
(else
- (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p) file)))))
+ (let ((b (bytevector-u8-ref src pos)))
+ (cond
+ ;; '\n' terminates without consuming.
+ ((= b 10) (%skip-ws-and-comments src pos line col file))
+ ((%fast-byte? b)
+ (%skip-line-comment src (+ pos 1) line (+ col 1) file))
+ (else
+ ;; Slow path: ?/\ — let %lex-peek handle trigraph/splice.
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (cond
+ ((not b2) (%skip-ws-and-comments src pos line col file))
+ ((%newline? b2) (%skip-ws-and-comments src pos line col file))
+ (else
+ (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file)))))))))))
(define (%skip-block-comment src pos line col file start-line start-col)
- (let* ((p (%lex-peek src pos line col))
- (b1 (%pk-byte p)))
+ (let ((n (bytevector-length src)))
(cond
- ((not b1)
+ ((>= pos n)
(die (%loc file start-line start-col)
"unterminated /* block comment"))
- ((= b1 42) ; '*'
- (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
- (b2 (%pk-byte q)))
+ (else
+ (let ((b (bytevector-u8-ref src pos)))
(cond
- ((not b2)
- (die (%loc file start-line start-col)
- "unterminated /* block comment"))
- ((= b2 47) ; '*' '/'
- (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q)
- file))
+ ;; Fast path for plain content bytes that aren't '*'.
+ ((and (%fast-byte? b) (not (= b 42)))
+ (%skip-block-comment src (+ pos 1) line (+ col 1)
+ file start-line start-col))
(else
- ;; Re-scan starting at the byte after '*'; the '*' was not
- ;; the closer, but the next byte might itself be '*'.
- (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
- file start-line start-col)))))
- (else
- (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
- file start-line start-col)))))
+ ;; Slow path: '*', '\n', '?' (trigraph), '\\' (splice).
+ (let* ((p (%lex-peek src pos line col))
+ (b1 (%pk-byte p)))
+ (cond
+ ((not b1)
+ (die (%loc file start-line start-col)
+ "unterminated /* block comment"))
+ ((= b1 42)
+ (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
+ (b2 (%pk-byte q)))
+ (cond
+ ((not b2)
+ (die (%loc file start-line start-col)
+ "unterminated /* block comment"))
+ ((= b2 47)
+ (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q)
+ file))
+ (else
+ ;; Re-scan starting at the byte after '*'; the '*' was
+ ;; not the closer, but the next byte might itself be '*'.
+ (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file start-line start-col)))))
+ (else
+ (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file start-line start-col)))))))))))
;; --------------------------------------------------------------------
;; Identifier / keyword reader.
@@ -190,16 +247,26 @@
;; first byte at `pos` satisfies %ident-start?.
;; --------------------------------------------------------------------
(define (%collect-ident src pos line col)
- ;; Walk %ident-cont? bytes; return (chunks-rev npos nline ncol) where
- ;; chunks-rev is a reversed list of one-byte bvs. We reuse make-tok
- ;; loc from the caller.
- (let loop ((pos pos) (line line) (col col) (acc '()))
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
- (if (and b (%ident-cont? b))
- (loop (%pk-pos p) (%pk-line p) (%pk-col p)
- (cons (bv-of-byte b) acc))
- (list (reverse acc) pos line col)))))
+ ;; Walk %ident-cont? bytes; return (bytes-fwd npos nline ncol) where
+ ;; bytes-fwd is a forward list of byte ints. Caller calls %bv-of-bytes
+ ;; once to materialize the name, avoiding a per-byte bv allocation.
+ (let ((n (bytevector-length src)))
+ (let loop ((pos pos) (line line) (col col) (acc '()))
+ (cond
+ ((>= pos n) (list (reverse acc) pos line col))
+ (else
+ (let ((b (bytevector-u8-ref src pos)))
+ (cond
+ ((%fast-byte? b)
+ (if (%ident-cont? b)
+ (loop (+ pos 1) line (+ col 1) (cons b acc))
+ (list (reverse acc) pos line col)))
+ (else
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (if (and b2 (%ident-cont? b2))
+ (loop (%pk-pos p) (%pk-line p) (%pk-col p) (cons b2 acc))
+ (list (reverse acc) pos line col)))))))))))
(define (lex-read-ident src pos file)
;; Public for tests. Threads line/col from a fresh start.
@@ -207,12 +274,12 @@
(define (%lex-read-ident src pos line col file)
(let* ((start-loc (%loc file line col))
- (r (%collect-ident src pos line col))
- (chs (car r))
+ (r (%collect-ident src pos line col))
+ (bs (car r))
(npos (car (cdr r)))
(nline (car (cdr (cdr r))))
(ncol (car (cdr (cdr (cdr r)))))
- (name (bv-cat chs))
+ (name (%bv-of-bytes bs))
(kw (alist-ref name %keyword-alist)))
(cons (if kw
(make-tok 'KW kw start-loc)
@@ -233,14 +300,24 @@
(%lex-read-number src pos 1 (+ pos 1) file))
(define (%collect-while pred src pos line col)
- ;; Generic byte collector. Returns (chunks-rev-list npos nline ncol).
- (let loop ((pos pos) (line line) (col col) (acc '()))
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
- (if (and b (pred b))
- (loop (%pk-pos p) (%pk-line p) (%pk-col p)
- (cons b acc))
- (list (reverse acc) pos line col)))))
+ ;; Generic byte collector. Returns (forward-byte-list npos nline ncol).
+ (let ((n (bytevector-length src)))
+ (let loop ((pos pos) (line line) (col col) (acc '()))
+ (cond
+ ((>= pos n) (list (reverse acc) pos line col))
+ (else
+ (let ((b (bytevector-u8-ref src pos)))
+ (cond
+ ((%fast-byte? b)
+ (if (pred b)
+ (loop (+ pos 1) line (+ col 1) (cons b acc))
+ (list (reverse acc) pos line col)))
+ (else
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (if (and b2 (pred b2))
+ (loop (%pk-pos p) (%pk-line p) (%pk-col p) (cons b2 acc))
+ (list (reverse acc) pos line col)))))))))))
(define (%bv-of-bytes bs)
;; Build a fresh bv from a list of byte ints. Allocates one bv.
@@ -388,23 +465,39 @@
file start-loc '()))))
(define (%collect-string src pos line col file start-loc acc)
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
+ (let ((n (bytevector-length src)))
(cond
- ((not b)
- (die start-loc "unterminated string literal"))
- ((= b 34) ; closing '"'
- (let ((bv (%bv-of-bytes (reverse acc))))
- (cons (make-tok 'STR bv start-loc)
- (list (%pk-pos p) (%pk-line p) (%pk-col p)))))
- ((%newline? b)
- (die start-loc "newline in string literal"))
- ((= b 92) ; '\\' — escape
- (%read-escape src (%pk-pos p) (%pk-line p) (%pk-col p)
- file start-loc acc 'string))
+ ((>= pos n) (die start-loc "unterminated string literal"))
(else
- (%collect-string src (%pk-pos p) (%pk-line p) (%pk-col p)
- file start-loc (cons b acc))))))
+ (let ((b (bytevector-u8-ref src pos)))
+ (cond
+ ;; Closing quote — fast byte but special.
+ ((= b 34)
+ (let ((bv (%bv-of-bytes (reverse acc))))
+ (cons (make-tok 'STR bv start-loc)
+ (list (+ pos 1) line (+ col 1)))))
+ ((%fast-byte? b)
+ (%collect-string src (+ pos 1) line (+ col 1)
+ file start-loc (cons b acc)))
+ (else
+ ;; Slow path: ?/\ (trigraph/splice/escape) or '\n'.
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (cond
+ ((not b2)
+ (die start-loc "unterminated string literal"))
+ ((= b2 34)
+ (let ((bv (%bv-of-bytes (reverse acc))))
+ (cons (make-tok 'STR bv start-loc)
+ (list (%pk-pos p) (%pk-line p) (%pk-col p)))))
+ ((%newline? b2)
+ (die start-loc "newline in string literal"))
+ ((= b2 92)
+ (%read-escape src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file start-loc acc 'string))
+ (else
+ (%collect-string src (%pk-pos p) (%pk-line p) (%pk-col p)
+ file start-loc (cons b2 acc))))))))))))
;; --------------------------------------------------------------------
;; Char reader.
@@ -512,12 +605,24 @@
(cond
((zero? k) (list (reverse acc) pos line col))
(else
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
- (if (and b (%octal? b))
- (%collect-octals src (%pk-pos p) (%pk-line p) (%pk-col p)
- (- k 1) (cons b acc))
- (list (reverse acc) pos line col))))))
+ (let ((n (bytevector-length src)))
+ (cond
+ ((>= pos n) (list (reverse acc) pos line col))
+ (else
+ (let ((b (bytevector-u8-ref src pos)))
+ (cond
+ ((%fast-byte? b)
+ (if (%octal? b)
+ (%collect-octals src (+ pos 1) line (+ col 1)
+ (- k 1) (cons b acc))
+ (list (reverse acc) pos line col)))
+ (else
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (if (and b2 (%octal? b2))
+ (%collect-octals src (%pk-pos p) (%pk-line p) (%pk-col p)
+ (- k 1) (cons b2 acc))
+ (list (reverse acc) pos line col))))))))))))
(define (%escape-result mode val pos line col src file start-loc acc)
(cond
@@ -531,14 +636,64 @@
;; Punctuator reader.
;;
;; Greedy longest-match against %punct-alist (cc/data.scm). The alist
-;; is already ordered longest-first.
+;; is already ordered longest-first. We additionally bucket entries by
+;; their first byte so %lex-read-punct only loops over the small set of
+;; patterns that can start at the current source byte.
;; --------------------------------------------------------------------
+
+(define (%alist-ref-int k al)
+ ;; Lookup in an int-keyed alist (linear scan, '= compare).
+ (cond ((null? al) #f)
+ ((= (car (car al)) k) (cdr (car al)))
+ (else (%alist-ref-int k (cdr al)))))
+
+(define (%mem-int? k xs)
+ (cond ((null? xs) #f)
+ ((= (car xs) k) #t)
+ (else (%mem-int? k (cdr xs)))))
+
+(define (%filter-by-first-byte b al)
+ ;; Subset of `al` whose pattern starts with byte b, preserving order.
+ (cond
+ ((null? al) '())
+ ((= (bytevector-u8-ref (car (car al)) 0) b)
+ (cons (car al) (%filter-by-first-byte b (cdr al))))
+ (else (%filter-by-first-byte b (cdr al)))))
+
+(define (%group-by-first-byte al)
+ ;; Build ((first-byte . sub-alist) ...) over `al`, one bucket per
+ ;; distinct first byte; sub-alist preserves longest-match-first
+ ;; order from the source list.
+ (let loop ((xs al) (seen '()) (out '()))
+ (cond
+ ((null? xs) (reverse out))
+ (else
+ (let* ((entry (car xs))
+ (pat (car entry))
+ (b (bytevector-u8-ref pat 0)))
+ (cond
+ ((%mem-int? b seen) (loop (cdr xs) seen out))
+ (else
+ (loop (cdr xs)
+ (cons b seen)
+ (cons (cons b (%filter-by-first-byte b al)) out)))))))))
+
+(define %punct-buckets (%group-by-first-byte %punct-alist))
+
(define (lex-read-punct src pos file)
(%lex-read-punct src pos 1 (+ pos 1) file))
(define (%lex-read-punct src pos line col file)
- (let ((start-loc (%loc file line col)))
- (%punct-loop src pos line col file start-loc %punct-alist)))
+ (let* ((start-loc (%loc file line col))
+ (p (%lex-peek src pos line col))
+ (b (%pk-byte p)))
+ (cond
+ ((not b) (die start-loc "unrecognized byte" "EOF"))
+ (else
+ (let ((bucket (%alist-ref-int b %punct-buckets)))
+ (cond
+ ((not bucket) (die start-loc "unrecognized byte" (bv-of-byte b)))
+ (else (%punct-loop src pos line col file start-loc bucket))))))))
(define (%punct-loop src pos line col file start-loc al)
(cond
@@ -562,13 +717,26 @@
(cond
((= i (bytevector-length pat)) (list pos line col))
(else
- (let* ((p (%lex-peek src pos line col))
- (b (%pk-byte p)))
+ (let ((n (bytevector-length src)))
(cond
- ((not b) #f)
- ((= b (bytevector-u8-ref pat i))
- (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p) pat (+ i 1)))
- (else #f))))))
+ ((>= pos n) #f)
+ (else
+ (let ((b (bytevector-u8-ref src pos))
+ (pb (bytevector-u8-ref pat i)))
+ (cond
+ ((%fast-byte? b)
+ (if (= b pb)
+ (%match-bytes src (+ pos 1) line (+ col 1) pat (+ i 1))
+ #f))
+ (else
+ (let* ((p (%lex-peek src pos line col))
+ (b2 (%pk-byte p)))
+ (cond
+ ((not b2) #f)
+ ((= b2 pb)
+ (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p)
+ pat (+ i 1)))
+ (else #f))))))))))))
;; --------------------------------------------------------------------
;; lex-tokenize src file -> list of tok ending in EOF.