boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

lex.scm (39039B)


      1 ;; cc/lex.scm — bytestream → token list. Pure function; no I/O,
      2 ;; no macro awareness.
      3 ;;
      4 ;; Realization of docs/CC-INTERNALS.md §lex.scm. Symbol alphabets
      5 ;; (KW, PUNCT, tok-kind) live in cc/data.scm; do not duplicate.
      6 ;;
      7 ;; Owner: <unassigned>
      8 ;;
      9 ;; Implementation notes:
     10 ;;
     11 ;; - The lexer walks `src` byte-by-byte, threading (pos, line, col)
     12 ;;   explicitly through every helper (no mutable state). Each token
     13 ;;   captures its starting loc; helpers return (tok npos nline ncol).
     14 ;; - Trigraphs and `\<newline>` line splicing are handled via a single
     15 ;;   logical-byte primitive `%lex-peek`: it advances over splices and
     16 ;;   translates trigraphs in-place, so downstream code only ever sees
     17 ;;   the "translation phase 2" stream.
     18 ;; - Comments are stripped at the same level as whitespace.
     19 ;; - NL tokens are emitted at every physical newline so pp can use
     20 ;;   them to terminate directives.
     21 ;;
     22 ;; Heap discipline (per tests/scheme1/93-heap-mark-rewind.scm):
     23 ;;
     24 ;; - Token-producing helpers wrap their inner work in a heap-mark /
     25 ;;   heap-rewind! arena. The slots that must survive the rewind
     26 ;;   (start-loc and the integer holders for npos/nline/ncol) are bound
     27 ;;   *before* the (set! mark (heap-mark)) so the let's env extensions
     28 ;;   live below the mark. The byte-run scanners' tail-call env frames
     29 ;;   and any %lex-peek 4-lists are above the mark and get reclaimed.
     30 ;;   For helpers that produce a fresh bytevector (ident, string), the
     31 ;;   bv is allocated post-rewind so it persists into the parent arena.
     32 ;; - Numeric digit runs accumulate their value inline via
     33 ;;   %accum-int-while; they no longer materialize a per-byte cons list
     34 ;;   and then a separate %digits-value walk.
     35 
     36 ;; --------------------------------------------------------------------
     37 ;; Byte-class predicates (raw u8 values, not chars).
     38 ;; --------------------------------------------------------------------
     39 (define (%digit? b)        (if (< b 48) #f (if (< 57 b) #f #t)))     ; '0'..'9'
     40 (define (%hex? b)
     41   (cond ((%digit? b) #t)
     42         ((if (< b 65) #f (if (< 70 b) #f #t)) #t)                    ; 'A'..'F'
     43         ((if (< b 97) #f (if (< 102 b) #f #t)) #t)                   ; 'a'..'f'
     44         (else #f)))
     45 (define (%octal? b)        (if (< b 48) #f (if (< 55 b) #f #t)))     ; '0'..'7'
     46 (define (%alpha? b)
     47   (cond ((if (< b 65) #f (if (< 90 b) #f #t)) #t)                    ; 'A'..'Z'
     48         ((if (< b 97) #f (if (< 122 b) #f #t)) #t)                   ; 'a'..'z'
     49         (else #f)))
     50 (define (%ident-start? b)  (or (%alpha? b) (= b 95)))                ; '_'
     51 (define (%ident-cont?  b)  (or (%ident-start? b) (%digit? b)))
     52 (define (%hspace? b)       (or (= b 32) (= b 9) (= b 11) (= b 12)))  ; SP TAB VT FF
     53 (define (%newline? b)      (= b 10))                                 ; '\n'
     54 
     55 ;; --------------------------------------------------------------------
     56 ;; Logical byte access. %lex-peek returns
     57 ;;   (byte npos nline ncol)
     58 ;; where (npos, nline, ncol) points *just past* the consumed physical
     59 ;; bytes. On EOF it returns (#f pos line col).
     60 ;;
     61 ;; Two transformations folded in here:
     62 ;;
     63 ;;   - Trigraphs:  ??=  ??(  ??/  ??)  ??'  ??<  ??!  ??>  ??-
     64 ;;                  #    [    \    ]    ^    {    |    }    ~
     65 ;;     The pair `??` followed by one of the nine trigraph completers
     66 ;;     produces the translated byte and advances 3 source bytes.
     67 ;;   - Line splice: a backslash immediately followed by `\n` is removed
     68 ;;     as a unit (incrementing line, resetting col to 1) and we recurse
     69 ;;     to fetch the next logical byte.
     70 ;;
     71 ;; Other escapes (e.g. `\<not-newline>`) are returned as-is — string and
     72 ;; char literals do their own escape-handling.
     73 ;; --------------------------------------------------------------------
     74 (define (%trigraph-byte b)
     75   ;; Map the third trigraph byte to its replacement, or #f.
     76   (cond ((= b 61) 35)   ; '=' -> '#'
     77         ((= b 40) 91)   ; '(' -> '['
     78         ((= b 47) 92)   ; '/' -> '\\'
     79         ((= b 41) 93)   ; ')' -> ']'
     80         ((= b 39) 94)   ; '\'' -> '^'
     81         ((= b 60) 123)  ; '<' -> '{'
     82         ((= b 33) 124)  ; '!' -> '|'
     83         ((= b 62) 125)  ; '>' -> '}'
     84         ((= b 45) 126)  ; '-' -> '~'
     85         (else #f)))
     86 
     87 (define (%lex-peek src pos line col)
     88   (let ((n (bytevector-length src)))
     89     (cond
     90       ((>= pos n) (list #f pos line col))
     91       (else
     92        (let ((b (bytevector-u8-ref src pos)))
     93          (cond
     94            ;; Trigraph: ?? + completer
     95            ((and (= b 63)
     96                  (< (+ pos 2) n)
     97                  (= (bytevector-u8-ref src (+ pos 1)) 63))
     98             (let ((tr (%trigraph-byte (bytevector-u8-ref src (+ pos 2)))))
     99               (if tr
    100                   (list tr (+ pos 3) line (+ col 3))
    101                   (list b (+ pos 1) line (+ col 1)))))
    102            ;; Line splice: backslash + newline (consume both, no token)
    103            ((and (= b 92)
    104                  (< (+ pos 1) n)
    105                  (= (bytevector-u8-ref src (+ pos 1)) 10))
    106             (%lex-peek src (+ pos 2) (+ line 1) 1))
    107            ;; Newline: pass through but caller decides line/col bump
    108            ((%newline? b)
    109             (list b (+ pos 1) (+ line 1) 1))
    110            (else
    111             (list b (+ pos 1) line (+ col 1)))))))))
    112 
    113 ;; Convenience accessors over the 4-list.
    114 (define (%pk-byte p)  (car p))
    115 (define (%pk-pos  p)  (car (cdr p)))
    116 (define (%pk-line p)  (car (cdr (cdr p))))
    117 (define (%pk-col  p)  (car (cdr (cdr (cdr p)))))
    118 
    119 ;; Fast-byte test. When (%fast-byte? b) is #t, reading b directly with
    120 ;; bytevector-u8-ref is exactly equivalent to %lex-peek's result: the
    121 ;; logical byte is b, npos = pos+1, nline unchanged, ncol = col+1, and
    122 ;; no list allocation is needed. Excludes the three bytes that %lex-peek
    123 ;; can transform: '?' (trigraph), '\\' (line splice), '\n' (line bump).
    124 (define (%fast-byte? b)
    125   (cond ((= b 63) #f)
    126         ((= b 92) #f)
    127         ((= b 10) #f)
    128         (else #t)))
    129 
    130 ;; --------------------------------------------------------------------
    131 ;; Whitespace + comment skipper.  Returns (pos line col).
    132 ;; Handles spaces/tabs, // line comments, /* block */ comments. Does
    133 ;; *not* consume `\n` — newlines are tokens.
    134 ;; --------------------------------------------------------------------
    135 (define (%skip-ws-and-comments src pos line col file)
    136   (let ((n (bytevector-length src)))
    137     (cond
    138       ((>= pos n) (list pos line col))
    139       (else
    140        (let ((b (bytevector-u8-ref src pos)))
    141          (cond
    142            ((and (%fast-byte? b) (%hspace? b))
    143             (%skip-ws-and-comments src (+ pos 1) line (+ col 1) file))
    144            ((%fast-byte? b)
    145             ;; Fast-byte that isn't hspace. Only '/' is interesting;
    146             ;; everything else terminates the skip.
    147             (cond
    148               ((= b 47) (%maybe-comment src pos line col file))
    149               (else (list pos line col))))
    150            (else
    151             ;; Slow path: trigraph / splice / newline.
    152             (let* ((p (%lex-peek src pos line col))
    153                    (b2 (%pk-byte p)))
    154               (cond
    155                 ((not b2) (list pos line col))
    156                 ((%hspace? b2)
    157                  (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p)
    158                                         file))
    159                 ((= b2 47) (%maybe-comment src pos line col file))
    160                 (else (list pos line col)))))))))))
    161 
    162 (define (%maybe-comment src pos line col file)
    163   ;; Source byte at pos resolves to '/'. Decide between // line comment,
    164   ;; /* block comment, or "leave the slash alone" (it's a punctuator).
    165   (let* ((p (%lex-peek src pos line col))
    166          (q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    167          (b2 (%pk-byte q)))
    168     (cond
    169       ((and b2 (= b2 47))
    170        (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file))
    171       ((and b2 (= b2 42))
    172        (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q)
    173                             file line col))
    174       (else (list pos line col)))))
    175 
    176 (define (%skip-line-comment src pos line col file)
    177   ;; Consume bytes until end-of-stream or until we *see* '\n' (do not
    178   ;; consume the newline itself; outer loop emits the NL).
    179   (let ((n (bytevector-length src)))
    180     (cond
    181       ((>= pos n) (%skip-ws-and-comments src pos line col file))
    182       (else
    183        (let ((b (bytevector-u8-ref src pos)))
    184          (cond
    185            ;; '\n' terminates without consuming.
    186            ((= b 10) (%skip-ws-and-comments src pos line col file))
    187            ((%fast-byte? b)
    188             (%skip-line-comment src (+ pos 1) line (+ col 1) file))
    189            (else
    190             ;; Slow path: ?/\ — let %lex-peek handle trigraph/splice.
    191             (let* ((p (%lex-peek src pos line col))
    192                    (b2 (%pk-byte p)))
    193               (cond
    194                 ((not b2) (%skip-ws-and-comments src pos line col file))
    195                 ((%newline? b2) (%skip-ws-and-comments src pos line col file))
    196                 (else
    197                  (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
    198                                      file)))))))))))
    199 
    200 (define (%skip-block-comment src pos line col file start-line start-col)
    201   (let ((n (bytevector-length src)))
    202     (cond
    203       ((>= pos n)
    204        (die (%loc file start-line start-col)
    205             "unterminated /* block comment"))
    206       (else
    207        (let ((b (bytevector-u8-ref src pos)))
    208          (cond
    209            ;; Fast path for plain content bytes that aren't '*'.
    210            ((and (%fast-byte? b) (not (= b 42)))
    211             (%skip-block-comment src (+ pos 1) line (+ col 1)
    212                                  file start-line start-col))
    213            (else
    214             ;; Slow path: '*', '\n', '?' (trigraph), '\\' (splice).
    215             (let* ((p (%lex-peek src pos line col))
    216                    (b1 (%pk-byte p)))
    217               (cond
    218                 ((not b1)
    219                  (die (%loc file start-line start-col)
    220                       "unterminated /* block comment"))
    221                 ((= b1 42)
    222                  (let* ((q  (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    223                         (b2 (%pk-byte q)))
    224                    (cond
    225                      ((not b2)
    226                       (die (%loc file start-line start-col)
    227                            "unterminated /* block comment"))
    228                      ((= b2 47)
    229                       (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q)
    230                                              file))
    231                      (else
    232                       ;; Re-scan starting at the byte after '*'; the '*' was
    233                       ;; not the closer, but the next byte might itself be '*'.
    234                       (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
    235                                            file start-line start-col)))))
    236                 (else
    237                  (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p)
    238                                       file start-line start-col)))))))))))
    239 
    240 ;; --------------------------------------------------------------------
    241 ;; Byte-run scanners.
    242 ;;
    243 ;; Tail-recursive walkers used by ident/number/string readers. None
    244 ;; allocate per scanned byte on the fast path (only %lex-peek 4-lists
    245 ;; on trigraph/splice/newline); the per-iteration env frames allocated
    246 ;; by tail recursion are reclaimed by the caller's heap-rewind!.
    247 ;;
    248 ;; - %scan-while:    count bytes that satisfy pred. (count npos nline ncol)
    249 ;; - %fill-while-bv: write matching bytes into a pre-sized bv.
    250 ;; - %accum-int-while: accumulate a base-N integer over digit bytes.
    251 ;;     (val count npos nline ncol)
    252 ;; - %accum-octal-bounded: same, but stops after k digits.
    253 ;; --------------------------------------------------------------------
    254 (define (%scan-while pred src pos line col)
    255   (let ((n (bytevector-length src)))
    256     (let loop ((pos pos) (line line) (col col) (cnt 0))
    257       (cond
    258         ((>= pos n) (list cnt pos line col))
    259         (else
    260          (let ((b (bytevector-u8-ref src pos)))
    261            (cond
    262              ((%fast-byte? b)
    263               (if (pred b)
    264                   (loop (+ pos 1) line (+ col 1) (+ cnt 1))
    265                   (list cnt pos line col)))
    266              (else
    267               (let* ((p (%lex-peek src pos line col))
    268                      (b2 (%pk-byte p)))
    269                 (if (and b2 (pred b2))
    270                     (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ cnt 1))
    271                     (list cnt pos line col)))))))))))
    272 
    273 (define (%fill-while-bv pred src pos line col bv idx)
    274   (let ((n (bytevector-length src)))
    275     (let loop ((pos pos) (line line) (col col) (idx idx))
    276       (cond
    277         ((>= pos n) idx)
    278         (else
    279          (let ((b (bytevector-u8-ref src pos)))
    280            (cond
    281              ((%fast-byte? b)
    282               (cond
    283                 ((pred b)
    284                  (bytevector-u8-set! bv idx b)
    285                  (loop (+ pos 1) line (+ col 1) (+ idx 1)))
    286                 (else idx)))
    287              (else
    288               (let* ((p (%lex-peek src pos line col))
    289                      (b2 (%pk-byte p)))
    290                 (cond
    291                   ((and b2 (pred b2))
    292                    (bytevector-u8-set! bv idx b2)
    293                    (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ idx 1)))
    294                   (else idx)))))))))))
    295 
    296 (define (%digit-val-byte b)
    297   ;; ASCII digit byte → integer value. Caller guarantees b is a valid
    298   ;; digit in the relevant base (0-9 / 0-7 / 0-9a-fA-F).
    299   (cond ((%digit? b) (- b 48))
    300         ((if (< b 65) #f (if (< 70 b) #f #t)) (+ (- b 65) 10))
    301         ((if (< b 97) #f (if (< 102 b) #f #t)) (+ (- b 97) 10))
    302         (else 0)))
    303 
    304 (define (%accum-int-while pred src pos line col base)
    305   (let ((n (bytevector-length src)))
    306     (let loop ((pos pos) (line line) (col col) (val 0) (cnt 0))
    307       (cond
    308         ((>= pos n) (list val cnt pos line col))
    309         (else
    310          (let ((b (bytevector-u8-ref src pos)))
    311            (cond
    312              ((%fast-byte? b)
    313               (if (pred b)
    314                   (loop (+ pos 1) line (+ col 1)
    315                         (+ (* val base) (%digit-val-byte b)) (+ cnt 1))
    316                   (list val cnt pos line col)))
    317              (else
    318               (let* ((p (%lex-peek src pos line col))
    319                      (b2 (%pk-byte p)))
    320                 (if (and b2 (pred b2))
    321                     (loop (%pk-pos p) (%pk-line p) (%pk-col p)
    322                           (+ (* val base) (%digit-val-byte b2)) (+ cnt 1))
    323                     (list val cnt pos line col)))))))))))
    324 
    325 (define (%accum-octal-bounded src pos line col k)
    326   ;; Up to k octal digits. Returns (val count npos nline ncol).
    327   (let ((n (bytevector-length src)))
    328     (let loop ((pos pos) (line line) (col col) (k k) (val 0) (cnt 0))
    329       (cond
    330         ((zero? k) (list val cnt pos line col))
    331         ((>= pos n) (list val cnt pos line col))
    332         (else
    333          (let ((b (bytevector-u8-ref src pos)))
    334            (cond
    335              ((%fast-byte? b)
    336               (if (%octal? b)
    337                   (loop (+ pos 1) line (+ col 1) (- k 1)
    338                         (+ (* val 8) (- b 48)) (+ cnt 1))
    339                   (list val cnt pos line col)))
    340              (else
    341               (let* ((p (%lex-peek src pos line col))
    342                      (b2 (%pk-byte p)))
    343                 (if (and b2 (%octal? b2))
    344                     (loop (%pk-pos p) (%pk-line p) (%pk-col p) (- k 1)
    345                           (+ (* val 8) (- b2 48)) (+ cnt 1))
    346                     (list val cnt pos line col)))))))))))
    347 
    348 ;; --------------------------------------------------------------------
    349 ;; Identifier / keyword reader.
    350 ;;
    351 ;; Returns (tok npos nline ncol). Caller has already verified that the
    352 ;; first byte at `pos` satisfies %ident-start?.
    353 ;;
    354 ;; Two-pass with heap-mark/rewind: pass 1 (%scan-while) sizes the run,
    355 ;; then we rewind, allocate `name` bv post-rewind so it survives, then
    356 ;; pass 2 (%fill-while-bv) writes into it under a fresh mark. The
    357 ;; integer slots count/npos/nline/ncol are bound *before* the mark so
    358 ;; they survive both rewinds.
    359 ;; --------------------------------------------------------------------
    360 (define (lex-read-ident src pos file)
    361   ;; Public for tests. Threads line/col from a fresh start.
    362   (%lex-read-ident src pos 1 (+ pos 1) file))
    363 
    364 (define (%lex-read-ident src pos line col file)
    365   (let ((start-loc (%loc file line col))
    366         (count 0) (npos 0) (nline 0) (ncol 0)
    367         (mark 0))
    368     (set! mark (heap-mark))
    369     (let ((sres (%scan-while %ident-cont? src pos line col)))
    370       (set! count (car sres))
    371       (set! npos  (car (cdr sres)))
    372       (set! nline (car (cdr (cdr sres))))
    373       (set! ncol  (car (cdr (cdr (cdr sres))))))
    374     (heap-rewind! mark)
    375     (let ((name  (make-bytevector count 0))
    376           (mark2 0))
    377       (set! mark2 (heap-mark))
    378       (%fill-while-bv %ident-cont? src pos line col name 0)
    379       (heap-rewind! mark2)
    380       (let ((kw (alist-ref name %keyword-alist)))
    381         (cons (if kw
    382                   (make-tok 'KW kw start-loc)
    383                   (make-tok 'IDENT name start-loc))
    384               (list npos nline ncol))))))
    385 
    386 ;; --------------------------------------------------------------------
    387 ;; Number reader.
    388 ;;
    389 ;; Decimal: [1-9][0-9]*  (suffix: u U l L ll LL combinations)
    390 ;; Hex:     0x[0-9a-fA-F]+ | 0X...
    391 ;; Octal:   0[0-7]*
    392 ;; Float:   anything looking like 1.0, 1e3, .5 → die crisply.
    393 ;;
    394 ;; Returns (tok npos nline ncol) on success. Aborts via `die` on float.
    395 ;;
    396 ;; %accum-int-while folds digit collection and value computation into
    397 ;; one walk — no per-byte cons cells, no separate digits-list pass.
    398 ;; --------------------------------------------------------------------
    399 (define (lex-read-number src pos file)
    400   (%lex-read-number src pos 1 (+ pos 1) file))
    401 
    402 (define (%lex-read-number src pos line col file)
    403   (let* ((start-loc (%loc file line col))
    404          (p (%lex-peek src pos line col))
    405          (b (%pk-byte p)))
    406     (cond
    407       ;; '0x' / '0X' hex prefix
    408       ((and (= b 48)
    409             (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    410                    (b2 (%pk-byte q)))
    411               (and b2 (or (= b2 120) (= b2 88)))))   ; 'x' or 'X'
    412        (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    413               (r (%accum-int-while %hex? src
    414                                     (%pk-pos q) (%pk-line q) (%pk-col q) 16))
    415               (val   (car r))
    416               (cnt   (car (cdr r)))
    417               (pos2  (car (cdr (cdr r))))
    418               (line2 (car (cdr (cdr (cdr r)))))
    419               (col2  (car (cdr (cdr (cdr (cdr r)))))))
    420          (if (zero? cnt)
    421              (die start-loc "expected hex digits after 0x")
    422              (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file)))
    423                (cons (make-tok 'INT val start-loc) after)))))
    424       ;; '0' alone → octal sequence (could be just zero)
    425       ((= b 48)
    426        (let* ((r (%accum-int-while %octal? src
    427                                     (%pk-pos p) (%pk-line p) (%pk-col p) 8))
    428               (val   (car r))
    429               (pos2  (car (cdr (cdr r))))
    430               (line2 (car (cdr (cdr (cdr r)))))
    431               (col2  (car (cdr (cdr (cdr (cdr r)))))))
    432          ;; Reject '.' / 'e' / 'E' immediately after the octal run — float.
    433          (%check-no-float src pos2 line2 col2 file start-loc)
    434          ;; Reject stray digits 8/9 in an octal context (e.g. 089).
    435          (let* ((p3 (%lex-peek src pos2 line2 col2))
    436                 (b3 (%pk-byte p3)))
    437            (if (and b3 (%digit? b3))
    438                (die start-loc "invalid octal digit" (bv-of-byte b3))
    439                (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file)))
    440                  (cons (make-tok 'INT val start-loc) after))))))
    441       ;; '1'-'9' → decimal
    442       ((%digit? b)
    443        (let* ((r (%accum-int-while %digit? src pos line col 10))
    444               (val   (car r))
    445               (pos2  (car (cdr (cdr r))))
    446               (line2 (car (cdr (cdr (cdr r)))))
    447               (col2  (car (cdr (cdr (cdr (cdr r)))))))
    448          (%check-no-float src pos2 line2 col2 file start-loc)
    449          (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file)))
    450            (cons (make-tok 'INT val start-loc) after))))
    451       ;; '.' followed by a digit = float-style literal — reject.
    452       ((= b 46)
    453        (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    454               (b2 (%pk-byte q)))
    455          (if (and b2 (%digit? b2))
    456              (die start-loc "floating-point literal not supported")
    457              ;; Otherwise '.' was a punctuator — caller wouldn't have
    458              ;; routed here unless it was a digit-led prefix.
    459              (die start-loc "internal: number reader on non-number"))))
    460       (else
    461        (die start-loc "internal: number reader on non-number")))))
    462 
    463 (define (%check-no-float src pos line col file start-loc)
    464   ;; If the byte at pos starts a fractional/exponent part, abort.
    465   (let* ((p (%lex-peek src pos line col))
    466          (b (%pk-byte p)))
    467     (cond
    468       ((not b) #t)
    469       ((= b 46)  ; '.'
    470        (die start-loc "floating-point literal not supported"))
    471       ((or (= b 101) (= b 69))  ; 'e' / 'E'
    472        ;; Only a float exponent if followed by [+-]?digit.
    473        (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    474               (b2 (%pk-byte q)))
    475          (cond
    476            ((and b2 (%digit? b2))
    477             (die start-loc "floating-point literal not supported"))
    478            ((and b2 (or (= b2 43) (= b2 45)))
    479             (let* ((r (%lex-peek src (%pk-pos q) (%pk-line q) (%pk-col q)))
    480                    (b3 (%pk-byte r)))
    481               (if (and b3 (%digit? b3))
    482                   (die start-loc "floating-point literal not supported")
    483                   #t)))
    484            (else #t))))
    485       (else #t))))
    486 
    487 (define (%lex-strip-int-suffix src pos line col file)
    488   ;; Consume any combination of u U l L (the long can be doubled). We
    489   ;; don't validate orderings strictly; tcc.c uses the canonical forms.
    490   ;; Returns (npos nline ncol).
    491   (let loop ((pos pos) (line line) (col col))
    492     (let* ((p (%lex-peek src pos line col))
    493            (b (%pk-byte p)))
    494       (cond
    495         ((not b) (list pos line col))
    496         ((or (= b 117) (= b 85)    ; u U
    497              (= b 108) (= b 76))   ; l L
    498          (loop (%pk-pos p) (%pk-line p) (%pk-col p)))
    499         (else (list pos line col))))))
    500 
    501 ;; --------------------------------------------------------------------
    502 ;; Escape sequence reader.
    503 ;;
    504 ;; %scan-or-fill-escape decodes one escape sequence starting at `pos`
    505 ;; (which points one past the leading `\\`). When `bv` is a bytevector,
    506 ;; the resulting byte is written to (bv idx); when it is #f, no write
    507 ;; occurs (used during the string-pass scan phase). Returns the 4-list
    508 ;; (val npos nline ncol).
    509 ;; --------------------------------------------------------------------
    510 (define (%scan-or-fill-escape src pos line col file start-loc bv idx)
    511   (let* ((p (%lex-peek src pos line col))
    512          (b (%pk-byte p)))
    513     (cond
    514       ((not b) (die start-loc "unterminated escape sequence"))
    515       ;; \xNN — 1+ hex digits (tcc.c uses 1- and 2-digit forms).
    516       ((or (= b 120) (= b 88))   ; 'x' / 'X'
    517        (let* ((r (%accum-int-while %hex? src
    518                                     (%pk-pos p) (%pk-line p) (%pk-col p) 16))
    519               (val0  (car r))
    520               (cnt   (car (cdr r)))
    521               (pos2  (car (cdr (cdr r))))
    522               (line2 (car (cdr (cdr (cdr r)))))
    523               (col2  (car (cdr (cdr (cdr (cdr r)))))))
    524          (cond
    525            ((zero? cnt) (die start-loc "expected hex digits after \\x"))
    526            (else
    527             (let ((val (bit-and val0 255)))
    528               (cond (bv (bytevector-u8-set! bv idx val))
    529                     (else #f))
    530               (list val pos2 line2 col2))))))
    531       ;; \NNN — 1..3 octal digits.
    532       ((%octal? b)
    533        (let* ((r (%accum-octal-bounded src pos line col 3))
    534               (val0  (car r))
    535               (pos2  (car (cdr (cdr r))))
    536               (line2 (car (cdr (cdr (cdr r)))))
    537               (col2  (car (cdr (cdr (cdr (cdr r))))))
    538               (val   (bit-and val0 255)))
    539          (cond (bv (bytevector-u8-set! bv idx val))
    540                (else #f))
    541          (list val pos2 line2 col2)))
    542       (else
    543        (let ((val (cond ((= b 110) 10)        ; n
    544                         ((= b 116) 9)         ; t
    545                         ((= b 114) 13)        ; r
    546                         ((= b 92)  92)        ; \\
    547                         ((= b 39)  39)        ; '
    548                         ((= b 34)  34)        ; "
    549                         ((= b 48)  0)         ; 0 (already handled by octal but be safe)
    550                         ((= b 97)  7)         ; \a -> BEL
    551                         ((= b 98)  8)         ; \b
    552                         ((= b 102) 12)        ; \f
    553                         ((= b 118) 11)        ; \v
    554                         ((= b 63)  63)        ; \?
    555                         (else
    556                          (die start-loc "unknown escape" (bv-of-byte b))))))
    557          (cond (bv (bytevector-u8-set! bv idx val))
    558                (else #f))
    559          (list val (%pk-pos p) (%pk-line p) (%pk-col p)))))))
    560 
    561 ;; --------------------------------------------------------------------
    562 ;; String reader.
    563 ;;
    564 ;; Caller has verified src[pos] == '"' (raw byte 34). Returns
    565 ;; (tok npos nline ncol) with the raw decoded bytes (no NUL appended).
    566 ;;
    567 ;; Two-pass: %string-pass with bv=#f counts effective bytes (escapes
    568 ;; collapse to 1 byte each); after rewind we allocate the final bv and
    569 ;; rerun with bv set so the bytes are written directly into it.
    570 ;; --------------------------------------------------------------------
    571 (define (lex-read-string src pos file)
    572   (%lex-read-string src pos 1 (+ pos 1) file))
    573 
    574 (define (%lex-read-string src pos line col file)
    575   (let ((start-loc (%loc file line col))
    576         (cnt 0) (npos 0) (nline 0) (ncol 0)
    577         (mark 0))
    578     ;; '"' (34) is a fast-byte and never a trigraph result, so the
    579     ;; physical byte at `pos` is exactly the opening quote.
    580     (cond
    581       ((or (>= pos (bytevector-length src))
    582            (not (= (bytevector-u8-ref src pos) 34)))
    583        (die start-loc "internal: string reader on non-quote"))
    584       (else
    585        (set! mark (heap-mark))
    586        (let ((sres (%string-pass src (+ pos 1) line (+ col 1)
    587                                   file start-loc #f)))
    588          (set! cnt   (car sres))
    589          (set! npos  (car (cdr sres)))
    590          (set! nline (car (cdr (cdr sres))))
    591          (set! ncol  (car (cdr (cdr (cdr sres))))))
    592        (heap-rewind! mark)
    593        (let ((bv    (make-bytevector cnt 0))
    594              (mark2 0))
    595          (set! mark2 (heap-mark))
    596          (%string-pass src (+ pos 1) line (+ col 1) file start-loc bv)
    597          (heap-rewind! mark2)
    598          (cons (make-tok 'STR bv start-loc)
    599                (list npos nline ncol)))))))
    600 
    601 (define (%string-pass src pos line col file start-loc bv)
    602   ;; Walk the string body (after opening "). When `bv` is #f, count
    603   ;; effective bytes; when it is a bytevector, write bytes into it at
    604   ;; index 0..count-1. Returns (count npos nline ncol).
    605   (let ((n (bytevector-length src)))
    606     (let loop ((pos pos) (line line) (col col) (idx 0))
    607       (cond
    608         ((>= pos n) (die start-loc "unterminated string literal"))
    609         (else
    610          (let ((b (bytevector-u8-ref src pos)))
    611            (cond
    612              ;; Closing quote — fast byte but special.
    613              ((= b 34)
    614               (list idx (+ pos 1) line (+ col 1)))
    615              ((%fast-byte? b)
    616               (cond (bv (bytevector-u8-set! bv idx b))
    617                     (else #f))
    618               (loop (+ pos 1) line (+ col 1) (+ idx 1)))
    619              (else
    620               ;; Slow path: ?/\ (trigraph/splice/escape) or '\n'.
    621               (let* ((p (%lex-peek src pos line col))
    622                      (b2 (%pk-byte p)))
    623                 (cond
    624                   ((not b2)
    625                    (die start-loc "unterminated string literal"))
    626                   ((= b2 34)
    627                    (list idx (%pk-pos p) (%pk-line p) (%pk-col p)))
    628                   ((%newline? b2)
    629                    (die start-loc "newline in string literal"))
    630                   ((= b2 92)
    631                    (let* ((er    (%scan-or-fill-escape
    632                                    src (%pk-pos p) (%pk-line p) (%pk-col p)
    633                                    file start-loc bv idx))
    634                           (epos  (car (cdr er)))
    635                           (eline (car (cdr (cdr er))))
    636                           (ecol  (car (cdr (cdr (cdr er))))))
    637                      (loop epos eline ecol (+ idx 1))))
    638                   (else
    639                    (cond (bv (bytevector-u8-set! bv idx b2))
    640                          (else #f))
    641                    (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ idx 1)))))))))))))
    642 
    643 ;; --------------------------------------------------------------------
    644 ;; Char reader.
    645 ;;
    646 ;; Caller has verified src[pos] == '\''. Multi-character constants
    647 ;; ('AB') are rejected via die.
    648 ;; --------------------------------------------------------------------
    649 (define (lex-read-char src pos file)
    650   (%lex-read-char src pos 1 (+ pos 1) file))
    651 
    652 (define (%lex-read-char src pos line col file)
    653   (let* ((start-loc (%loc file line col))
    654          (p0 (%lex-peek src pos line col))
    655          (b0 (%pk-byte p0)))
    656     (if (not (and b0 (= b0 39)))
    657         (die start-loc "internal: char reader on non-quote")
    658         (%collect-char src (%pk-pos p0) (%pk-line p0) (%pk-col p0)
    659                        file start-loc))))
    660 
    661 (define (%collect-char src pos line col file start-loc)
    662   ;; Read exactly one byte (handling escapes), then expect closing '\''.
    663   (let* ((p (%lex-peek src pos line col))
    664          (b (%pk-byte p)))
    665     (cond
    666       ((not b) (die start-loc "unterminated char literal"))
    667       ((= b 39) (die start-loc "empty char literal"))
    668       ((%newline? b) (die start-loc "newline in char literal"))
    669       ((= b 92)   ; escape
    670        (let* ((r     (%scan-or-fill-escape src
    671                                             (%pk-pos p) (%pk-line p) (%pk-col p)
    672                                             file start-loc #f 0))
    673               (val   (car r))
    674               (pos2  (car (cdr r)))
    675               (line2 (car (cdr (cdr r))))
    676               (col2  (car (cdr (cdr (cdr r))))))
    677          (%expect-char-close src pos2 line2 col2 file start-loc val)))
    678       (else
    679        (%expect-char-close src (%pk-pos p) (%pk-line p) (%pk-col p)
    680                            file start-loc b)))))
    681 
    682 (define (%expect-char-close src pos line col file start-loc val)
    683   (let* ((p (%lex-peek src pos line col))
    684          (b (%pk-byte p)))
    685     (cond
    686       ((not b) (die start-loc "unterminated char literal"))
    687       ((= b 39)
    688        (cons (make-tok 'CHAR val start-loc)
    689              (list (%pk-pos p) (%pk-line p) (%pk-col p))))
    690       (else
    691        (die start-loc "multi-character char constant not supported")))))
    692 
    693 ;; --------------------------------------------------------------------
    694 ;; Punctuator reader.
    695 ;;
    696 ;; Greedy longest-match against %punct-alist (cc/data.scm). The alist
    697 ;; is already ordered longest-first. We additionally bucket entries by
    698 ;; their first byte so %lex-read-punct only loops over the small set of
    699 ;; patterns that can start at the current source byte.
    700 ;; --------------------------------------------------------------------
    701 
    702 (define (%alist-ref-int k al)
    703   ;; Lookup in an int-keyed alist (linear scan, '= compare).
    704   (cond ((null? al) #f)
    705         ((= (car (car al)) k) (cdr (car al)))
    706         (else (%alist-ref-int k (cdr al)))))
    707 
    708 (define (%mem-int? k xs)
    709   (cond ((null? xs) #f)
    710         ((= (car xs) k) #t)
    711         (else (%mem-int? k (cdr xs)))))
    712 
    713 (define (%filter-by-first-byte b al)
    714   ;; Subset of `al` whose pattern starts with byte b, preserving order.
    715   (cond
    716     ((null? al) '())
    717     ((= (bytevector-u8-ref (car (car al)) 0) b)
    718      (cons (car al) (%filter-by-first-byte b (cdr al))))
    719     (else (%filter-by-first-byte b (cdr al)))))
    720 
    721 (define (%group-by-first-byte al)
    722   ;; Build ((first-byte . sub-alist) ...) over `al`, one bucket per
    723   ;; distinct first byte; sub-alist preserves longest-match-first
    724   ;; order from the source list.
    725   (let loop ((xs al) (seen '()) (out '()))
    726     (cond
    727       ((null? xs) (reverse out))
    728       (else
    729        (let* ((entry (car xs))
    730               (pat   (car entry))
    731               (b     (bytevector-u8-ref pat 0)))
    732          (cond
    733            ((%mem-int? b seen) (loop (cdr xs) seen out))
    734            (else
    735             (loop (cdr xs)
    736                   (cons b seen)
    737                   (cons (cons b (%filter-by-first-byte b al)) out)))))))))
    738 
    739 (define %punct-buckets (%group-by-first-byte %punct-alist))
    740 
    741 (define (lex-read-punct src pos file)
    742   (%lex-read-punct src pos 1 (+ pos 1) file))
    743 
    744 (define (%lex-read-punct src pos line col file)
    745   (let* ((start-loc (%loc file line col))
    746          (p (%lex-peek src pos line col))
    747          (b (%pk-byte p)))
    748     (cond
    749       ((not b) (die start-loc "unrecognized byte" "EOF"))
    750       (else
    751        (let ((bucket (%alist-ref-int b %punct-buckets)))
    752          (cond
    753            ((not bucket) (die start-loc "unrecognized byte" (bv-of-byte b)))
    754            (else (%punct-loop src pos line col file start-loc bucket))))))))
    755 
    756 (define (%punct-loop src pos line col file start-loc al)
    757   (cond
    758     ((null? al)
    759      (let* ((p (%lex-peek src pos line col)))
    760        (die start-loc "unrecognized byte"
    761             (if (%pk-byte p) (bv-of-byte (%pk-byte p)) "EOF"))))
    762     (else
    763      (let* ((entry (car al))
    764             (pat   (car entry))
    765             (sym   (cdr entry))
    766             (m     (%match-bytes src pos line col pat 0)))
    767        (if m
    768            (cons (make-tok 'PUNCT sym start-loc) m)
    769            (%punct-loop src pos line col file start-loc (cdr al)))))))
    770 
    771 (define (%match-bytes src pos line col pat i)
    772   ;; If the next bytes from (pos line col), in logical-byte stream
    773   ;; order, equal `pat[i..]`, return (npos nline ncol) after the
    774   ;; match. Otherwise #f.
    775   (cond
    776     ((= i (bytevector-length pat)) (list pos line col))
    777     (else
    778      (let ((n (bytevector-length src)))
    779        (cond
    780          ((>= pos n) #f)
    781          (else
    782           (let ((b  (bytevector-u8-ref src pos))
    783                 (pb (bytevector-u8-ref pat i)))
    784             (cond
    785               ((%fast-byte? b)
    786                (if (= b pb)
    787                    (%match-bytes src (+ pos 1) line (+ col 1) pat (+ i 1))
    788                    #f))
    789               (else
    790                (let* ((p (%lex-peek src pos line col))
    791                       (b2 (%pk-byte p)))
    792                  (cond
    793                    ((not b2) #f)
    794                    ((= b2 pb)
    795                     (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p)
    796                                   pat (+ i 1)))
    797                    (else #f))))))))))))
    798 
    799 ;; --------------------------------------------------------------------
    800 ;; lex-tokenize  src file -> list of tok ending in EOF.
    801 ;; --------------------------------------------------------------------
    802 (define (lex-tokenize src file)
    803   (%lex-loop src 0 1 1 file '() #t))
    804 
    805 ;; bol? — `#t` when no token has been emitted on the current physical
    806 ;; line yet (start of file, or only NL + whitespace seen since the last
    807 ;; line break). pp recognizes a directive only when its leading `#` is
    808 ;; at line-start; we forward that decision into the token stream by
    809 ;; emitting `HASH` instead of `(PUNCT hash …)` for a line-leading `#`.
    810 (define (%lex-loop src pos line col file acc bol?)
    811   (let* ((sw (%skip-ws-and-comments src pos line col file))
    812          (pos1  (car sw))
    813          (line1 (car (cdr sw)))
    814          (col1  (car (cdr (cdr sw))))
    815          (p     (%lex-peek src pos1 line1 col1))
    816          (b     (%pk-byte p)))
    817     (cond
    818       ;; EOF
    819       ((not b)
    820        (let* ((eof-tok (make-tok 'EOF #f (%loc file line1 col1))))
    821          (reverse (cons eof-tok acc))))
    822       ;; Newline → emit NL, reset bol?.
    823       ((%newline? b)
    824        (let ((nl (make-tok 'NL #f (%loc file line1 col1))))
    825          (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p)
    826                     file (cons nl acc) #t)))
    827       ;; Line-leading `#` → emit HASH, but only the bare `#`. `##` is
    828       ;; never line-leading in valid C; if it appears, fall through to
    829       ;; normal punctuator handling so it lexes as `paste`.
    830       ((and bol? (= b 35))
    831        (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    832               (b2 (%pk-byte q)))
    833          (cond
    834            ((and b2 (= b2 35))
    835             (let* ((r (%lex-read-punct src pos1 line1 col1 file))
    836                    (tok (car r))
    837                    (npos (car (cdr r)))
    838                    (nline (car (cdr (cdr r))))
    839                    (ncol  (car (cdr (cdr (cdr r))))))
    840               (%lex-loop src npos nline ncol file (cons tok acc) #f)))
    841            (else
    842             (let ((tok (make-tok 'HASH #f (%loc file line1 col1))))
    843               (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p)
    844                          file (cons tok acc) #f))))))
    845       ;; Identifier / keyword
    846       ((%ident-start? b)
    847        (let* ((r (%lex-read-ident src pos1 line1 col1 file))
    848               (tok (car r))
    849               (npos (car (cdr r)))
    850               (nline (car (cdr (cdr r))))
    851               (ncol  (car (cdr (cdr (cdr r))))))
    852          (%lex-loop src npos nline ncol file (cons tok acc) #f)))
    853       ;; Number (digit start)
    854       ((%digit? b)
    855        (let* ((r (%lex-read-number src pos1 line1 col1 file))
    856               (tok (car r))
    857               (npos (car (cdr r)))
    858               (nline (car (cdr (cdr r))))
    859               (ncol  (car (cdr (cdr (cdr r))))))
    860          (%lex-loop src npos nline ncol file (cons tok acc) #f)))
    861       ;; '.' might start a number (1.0 actually starts with digit; .5
    862       ;; would route here). We keep this as a punctuator unless followed
    863       ;; by a digit, in which case the lexer rejects per spec.
    864       ((= b 46)
    865        (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p)))
    866               (b2 (%pk-byte q)))
    867          (cond
    868            ((and b2 (%digit? b2))
    869             (die (%loc file line1 col1) "floating-point literal not supported"))
    870            (else
    871             (let* ((r (%lex-read-punct src pos1 line1 col1 file))
    872                    (tok (car r))
    873                    (npos (car (cdr r)))
    874                    (nline (car (cdr (cdr r))))
    875                    (ncol  (car (cdr (cdr (cdr r))))))
    876               (%lex-loop src npos nline ncol file (cons tok acc) #f))))))
    877       ;; String
    878       ((= b 34)
    879        (let* ((r (%lex-read-string src pos1 line1 col1 file))
    880               (tok (car r))
    881               (npos (car (cdr r)))
    882               (nline (car (cdr (cdr r))))
    883               (ncol  (car (cdr (cdr (cdr r))))))
    884          (%lex-loop src npos nline ncol file (cons tok acc) #f)))
    885       ;; Char
    886       ((= b 39)
    887        (let* ((r (%lex-read-char src pos1 line1 col1 file))
    888               (tok (car r))
    889               (npos (car (cdr r)))
    890               (nline (car (cdr (cdr r))))
    891               (ncol  (car (cdr (cdr (cdr r))))))
    892          (%lex-loop src npos nline ncol file (cons tok acc) #f)))
    893       ;; Punctuator (default)
    894       (else
    895        (let* ((r (%lex-read-punct src pos1 line1 col1 file))
    896               (tok (car r))
    897               (npos (car (cdr r)))
    898               (nline (car (cdr (cdr r))))
    899               (ncol  (car (cdr (cdr (cdr r))))))
    900          (%lex-loop src npos nline ncol file (cons tok acc) #f))))))