lex.scm (39039B)
1 ;; cc/lex.scm — bytestream → token list. Pure function; no I/O, 2 ;; no macro awareness. 3 ;; 4 ;; Realization of docs/CC-INTERNALS.md §lex.scm. Symbol alphabets 5 ;; (KW, PUNCT, tok-kind) live in cc/data.scm; do not duplicate. 6 ;; 7 ;; Owner: <unassigned> 8 ;; 9 ;; Implementation notes: 10 ;; 11 ;; - The lexer walks `src` byte-by-byte, threading (pos, line, col) 12 ;; explicitly through every helper (no mutable state). Each token 13 ;; captures its starting loc; helpers return (tok npos nline ncol). 14 ;; - Trigraphs and `\<newline>` line splicing are handled via a single 15 ;; logical-byte primitive `%lex-peek`: it advances over splices and 16 ;; translates trigraphs in-place, so downstream code only ever sees 17 ;; the "translation phase 2" stream. 18 ;; - Comments are stripped at the same level as whitespace. 19 ;; - NL tokens are emitted at every physical newline so pp can use 20 ;; them to terminate directives. 21 ;; 22 ;; Heap discipline (per tests/scheme1/93-heap-mark-rewind.scm): 23 ;; 24 ;; - Token-producing helpers wrap their inner work in a heap-mark / 25 ;; heap-rewind! arena. The slots that must survive the rewind 26 ;; (start-loc and the integer holders for npos/nline/ncol) are bound 27 ;; *before* the (set! mark (heap-mark)) so the let's env extensions 28 ;; live below the mark. The byte-run scanners' tail-call env frames 29 ;; and any %lex-peek 4-lists are above the mark and get reclaimed. 30 ;; For helpers that produce a fresh bytevector (ident, string), the 31 ;; bv is allocated post-rewind so it persists into the parent arena. 32 ;; - Numeric digit runs accumulate their value inline via 33 ;; %accum-int-while; they no longer materialize a per-byte cons list 34 ;; and then a separate %digits-value walk. 35 36 ;; -------------------------------------------------------------------- 37 ;; Byte-class predicates (raw u8 values, not chars). 38 ;; -------------------------------------------------------------------- 39 (define (%digit? b) (if (< b 48) #f (if (< 57 b) #f #t))) ; '0'..'9' 40 (define (%hex? b) 41 (cond ((%digit? b) #t) 42 ((if (< b 65) #f (if (< 70 b) #f #t)) #t) ; 'A'..'F' 43 ((if (< b 97) #f (if (< 102 b) #f #t)) #t) ; 'a'..'f' 44 (else #f))) 45 (define (%octal? b) (if (< b 48) #f (if (< 55 b) #f #t))) ; '0'..'7' 46 (define (%alpha? b) 47 (cond ((if (< b 65) #f (if (< 90 b) #f #t)) #t) ; 'A'..'Z' 48 ((if (< b 97) #f (if (< 122 b) #f #t)) #t) ; 'a'..'z' 49 (else #f))) 50 (define (%ident-start? b) (or (%alpha? b) (= b 95))) ; '_' 51 (define (%ident-cont? b) (or (%ident-start? b) (%digit? b))) 52 (define (%hspace? b) (or (= b 32) (= b 9) (= b 11) (= b 12))) ; SP TAB VT FF 53 (define (%newline? b) (= b 10)) ; '\n' 54 55 ;; -------------------------------------------------------------------- 56 ;; Logical byte access. %lex-peek returns 57 ;; (byte npos nline ncol) 58 ;; where (npos, nline, ncol) points *just past* the consumed physical 59 ;; bytes. On EOF it returns (#f pos line col). 60 ;; 61 ;; Two transformations folded in here: 62 ;; 63 ;; - Trigraphs: ??= ??( ??/ ??) ??' ??< ??! ??> ??- 64 ;; # [ \ ] ^ { | } ~ 65 ;; The pair `??` followed by one of the nine trigraph completers 66 ;; produces the translated byte and advances 3 source bytes. 67 ;; - Line splice: a backslash immediately followed by `\n` is removed 68 ;; as a unit (incrementing line, resetting col to 1) and we recurse 69 ;; to fetch the next logical byte. 70 ;; 71 ;; Other escapes (e.g. `\<not-newline>`) are returned as-is — string and 72 ;; char literals do their own escape-handling. 73 ;; -------------------------------------------------------------------- 74 (define (%trigraph-byte b) 75 ;; Map the third trigraph byte to its replacement, or #f. 76 (cond ((= b 61) 35) ; '=' -> '#' 77 ((= b 40) 91) ; '(' -> '[' 78 ((= b 47) 92) ; '/' -> '\\' 79 ((= b 41) 93) ; ')' -> ']' 80 ((= b 39) 94) ; '\'' -> '^' 81 ((= b 60) 123) ; '<' -> '{' 82 ((= b 33) 124) ; '!' -> '|' 83 ((= b 62) 125) ; '>' -> '}' 84 ((= b 45) 126) ; '-' -> '~' 85 (else #f))) 86 87 (define (%lex-peek src pos line col) 88 (let ((n (bytevector-length src))) 89 (cond 90 ((>= pos n) (list #f pos line col)) 91 (else 92 (let ((b (bytevector-u8-ref src pos))) 93 (cond 94 ;; Trigraph: ?? + completer 95 ((and (= b 63) 96 (< (+ pos 2) n) 97 (= (bytevector-u8-ref src (+ pos 1)) 63)) 98 (let ((tr (%trigraph-byte (bytevector-u8-ref src (+ pos 2))))) 99 (if tr 100 (list tr (+ pos 3) line (+ col 3)) 101 (list b (+ pos 1) line (+ col 1))))) 102 ;; Line splice: backslash + newline (consume both, no token) 103 ((and (= b 92) 104 (< (+ pos 1) n) 105 (= (bytevector-u8-ref src (+ pos 1)) 10)) 106 (%lex-peek src (+ pos 2) (+ line 1) 1)) 107 ;; Newline: pass through but caller decides line/col bump 108 ((%newline? b) 109 (list b (+ pos 1) (+ line 1) 1)) 110 (else 111 (list b (+ pos 1) line (+ col 1))))))))) 112 113 ;; Convenience accessors over the 4-list. 114 (define (%pk-byte p) (car p)) 115 (define (%pk-pos p) (car (cdr p))) 116 (define (%pk-line p) (car (cdr (cdr p)))) 117 (define (%pk-col p) (car (cdr (cdr (cdr p))))) 118 119 ;; Fast-byte test. When (%fast-byte? b) is #t, reading b directly with 120 ;; bytevector-u8-ref is exactly equivalent to %lex-peek's result: the 121 ;; logical byte is b, npos = pos+1, nline unchanged, ncol = col+1, and 122 ;; no list allocation is needed. Excludes the three bytes that %lex-peek 123 ;; can transform: '?' (trigraph), '\\' (line splice), '\n' (line bump). 124 (define (%fast-byte? b) 125 (cond ((= b 63) #f) 126 ((= b 92) #f) 127 ((= b 10) #f) 128 (else #t))) 129 130 ;; -------------------------------------------------------------------- 131 ;; Whitespace + comment skipper. Returns (pos line col). 132 ;; Handles spaces/tabs, // line comments, /* block */ comments. Does 133 ;; *not* consume `\n` — newlines are tokens. 134 ;; -------------------------------------------------------------------- 135 (define (%skip-ws-and-comments src pos line col file) 136 (let ((n (bytevector-length src))) 137 (cond 138 ((>= pos n) (list pos line col)) 139 (else 140 (let ((b (bytevector-u8-ref src pos))) 141 (cond 142 ((and (%fast-byte? b) (%hspace? b)) 143 (%skip-ws-and-comments src (+ pos 1) line (+ col 1) file)) 144 ((%fast-byte? b) 145 ;; Fast-byte that isn't hspace. Only '/' is interesting; 146 ;; everything else terminates the skip. 147 (cond 148 ((= b 47) (%maybe-comment src pos line col file)) 149 (else (list pos line col)))) 150 (else 151 ;; Slow path: trigraph / splice / newline. 152 (let* ((p (%lex-peek src pos line col)) 153 (b2 (%pk-byte p))) 154 (cond 155 ((not b2) (list pos line col)) 156 ((%hspace? b2) 157 (%skip-ws-and-comments src (%pk-pos p) (%pk-line p) (%pk-col p) 158 file)) 159 ((= b2 47) (%maybe-comment src pos line col file)) 160 (else (list pos line col))))))))))) 161 162 (define (%maybe-comment src pos line col file) 163 ;; Source byte at pos resolves to '/'. Decide between // line comment, 164 ;; /* block comment, or "leave the slash alone" (it's a punctuator). 165 (let* ((p (%lex-peek src pos line col)) 166 (q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 167 (b2 (%pk-byte q))) 168 (cond 169 ((and b2 (= b2 47)) 170 (%skip-line-comment src (%pk-pos q) (%pk-line q) (%pk-col q) file)) 171 ((and b2 (= b2 42)) 172 (%skip-block-comment src (%pk-pos q) (%pk-line q) (%pk-col q) 173 file line col)) 174 (else (list pos line col))))) 175 176 (define (%skip-line-comment src pos line col file) 177 ;; Consume bytes until end-of-stream or until we *see* '\n' (do not 178 ;; consume the newline itself; outer loop emits the NL). 179 (let ((n (bytevector-length src))) 180 (cond 181 ((>= pos n) (%skip-ws-and-comments src pos line col file)) 182 (else 183 (let ((b (bytevector-u8-ref src pos))) 184 (cond 185 ;; '\n' terminates without consuming. 186 ((= b 10) (%skip-ws-and-comments src pos line col file)) 187 ((%fast-byte? b) 188 (%skip-line-comment src (+ pos 1) line (+ col 1) file)) 189 (else 190 ;; Slow path: ?/\ — let %lex-peek handle trigraph/splice. 191 (let* ((p (%lex-peek src pos line col)) 192 (b2 (%pk-byte p))) 193 (cond 194 ((not b2) (%skip-ws-and-comments src pos line col file)) 195 ((%newline? b2) (%skip-ws-and-comments src pos line col file)) 196 (else 197 (%skip-line-comment src (%pk-pos p) (%pk-line p) (%pk-col p) 198 file))))))))))) 199 200 (define (%skip-block-comment src pos line col file start-line start-col) 201 (let ((n (bytevector-length src))) 202 (cond 203 ((>= pos n) 204 (die (%loc file start-line start-col) 205 "unterminated /* block comment")) 206 (else 207 (let ((b (bytevector-u8-ref src pos))) 208 (cond 209 ;; Fast path for plain content bytes that aren't '*'. 210 ((and (%fast-byte? b) (not (= b 42))) 211 (%skip-block-comment src (+ pos 1) line (+ col 1) 212 file start-line start-col)) 213 (else 214 ;; Slow path: '*', '\n', '?' (trigraph), '\\' (splice). 215 (let* ((p (%lex-peek src pos line col)) 216 (b1 (%pk-byte p))) 217 (cond 218 ((not b1) 219 (die (%loc file start-line start-col) 220 "unterminated /* block comment")) 221 ((= b1 42) 222 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 223 (b2 (%pk-byte q))) 224 (cond 225 ((not b2) 226 (die (%loc file start-line start-col) 227 "unterminated /* block comment")) 228 ((= b2 47) 229 (%skip-ws-and-comments src (%pk-pos q) (%pk-line q) (%pk-col q) 230 file)) 231 (else 232 ;; Re-scan starting at the byte after '*'; the '*' was 233 ;; not the closer, but the next byte might itself be '*'. 234 (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) 235 file start-line start-col))))) 236 (else 237 (%skip-block-comment src (%pk-pos p) (%pk-line p) (%pk-col p) 238 file start-line start-col))))))))))) 239 240 ;; -------------------------------------------------------------------- 241 ;; Byte-run scanners. 242 ;; 243 ;; Tail-recursive walkers used by ident/number/string readers. None 244 ;; allocate per scanned byte on the fast path (only %lex-peek 4-lists 245 ;; on trigraph/splice/newline); the per-iteration env frames allocated 246 ;; by tail recursion are reclaimed by the caller's heap-rewind!. 247 ;; 248 ;; - %scan-while: count bytes that satisfy pred. (count npos nline ncol) 249 ;; - %fill-while-bv: write matching bytes into a pre-sized bv. 250 ;; - %accum-int-while: accumulate a base-N integer over digit bytes. 251 ;; (val count npos nline ncol) 252 ;; - %accum-octal-bounded: same, but stops after k digits. 253 ;; -------------------------------------------------------------------- 254 (define (%scan-while pred src pos line col) 255 (let ((n (bytevector-length src))) 256 (let loop ((pos pos) (line line) (col col) (cnt 0)) 257 (cond 258 ((>= pos n) (list cnt pos line col)) 259 (else 260 (let ((b (bytevector-u8-ref src pos))) 261 (cond 262 ((%fast-byte? b) 263 (if (pred b) 264 (loop (+ pos 1) line (+ col 1) (+ cnt 1)) 265 (list cnt pos line col))) 266 (else 267 (let* ((p (%lex-peek src pos line col)) 268 (b2 (%pk-byte p))) 269 (if (and b2 (pred b2)) 270 (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ cnt 1)) 271 (list cnt pos line col))))))))))) 272 273 (define (%fill-while-bv pred src pos line col bv idx) 274 (let ((n (bytevector-length src))) 275 (let loop ((pos pos) (line line) (col col) (idx idx)) 276 (cond 277 ((>= pos n) idx) 278 (else 279 (let ((b (bytevector-u8-ref src pos))) 280 (cond 281 ((%fast-byte? b) 282 (cond 283 ((pred b) 284 (bytevector-u8-set! bv idx b) 285 (loop (+ pos 1) line (+ col 1) (+ idx 1))) 286 (else idx))) 287 (else 288 (let* ((p (%lex-peek src pos line col)) 289 (b2 (%pk-byte p))) 290 (cond 291 ((and b2 (pred b2)) 292 (bytevector-u8-set! bv idx b2) 293 (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ idx 1))) 294 (else idx))))))))))) 295 296 (define (%digit-val-byte b) 297 ;; ASCII digit byte → integer value. Caller guarantees b is a valid 298 ;; digit in the relevant base (0-9 / 0-7 / 0-9a-fA-F). 299 (cond ((%digit? b) (- b 48)) 300 ((if (< b 65) #f (if (< 70 b) #f #t)) (+ (- b 65) 10)) 301 ((if (< b 97) #f (if (< 102 b) #f #t)) (+ (- b 97) 10)) 302 (else 0))) 303 304 (define (%accum-int-while pred src pos line col base) 305 (let ((n (bytevector-length src))) 306 (let loop ((pos pos) (line line) (col col) (val 0) (cnt 0)) 307 (cond 308 ((>= pos n) (list val cnt pos line col)) 309 (else 310 (let ((b (bytevector-u8-ref src pos))) 311 (cond 312 ((%fast-byte? b) 313 (if (pred b) 314 (loop (+ pos 1) line (+ col 1) 315 (+ (* val base) (%digit-val-byte b)) (+ cnt 1)) 316 (list val cnt pos line col))) 317 (else 318 (let* ((p (%lex-peek src pos line col)) 319 (b2 (%pk-byte p))) 320 (if (and b2 (pred b2)) 321 (loop (%pk-pos p) (%pk-line p) (%pk-col p) 322 (+ (* val base) (%digit-val-byte b2)) (+ cnt 1)) 323 (list val cnt pos line col))))))))))) 324 325 (define (%accum-octal-bounded src pos line col k) 326 ;; Up to k octal digits. Returns (val count npos nline ncol). 327 (let ((n (bytevector-length src))) 328 (let loop ((pos pos) (line line) (col col) (k k) (val 0) (cnt 0)) 329 (cond 330 ((zero? k) (list val cnt pos line col)) 331 ((>= pos n) (list val cnt pos line col)) 332 (else 333 (let ((b (bytevector-u8-ref src pos))) 334 (cond 335 ((%fast-byte? b) 336 (if (%octal? b) 337 (loop (+ pos 1) line (+ col 1) (- k 1) 338 (+ (* val 8) (- b 48)) (+ cnt 1)) 339 (list val cnt pos line col))) 340 (else 341 (let* ((p (%lex-peek src pos line col)) 342 (b2 (%pk-byte p))) 343 (if (and b2 (%octal? b2)) 344 (loop (%pk-pos p) (%pk-line p) (%pk-col p) (- k 1) 345 (+ (* val 8) (- b2 48)) (+ cnt 1)) 346 (list val cnt pos line col))))))))))) 347 348 ;; -------------------------------------------------------------------- 349 ;; Identifier / keyword reader. 350 ;; 351 ;; Returns (tok npos nline ncol). Caller has already verified that the 352 ;; first byte at `pos` satisfies %ident-start?. 353 ;; 354 ;; Two-pass with heap-mark/rewind: pass 1 (%scan-while) sizes the run, 355 ;; then we rewind, allocate `name` bv post-rewind so it survives, then 356 ;; pass 2 (%fill-while-bv) writes into it under a fresh mark. The 357 ;; integer slots count/npos/nline/ncol are bound *before* the mark so 358 ;; they survive both rewinds. 359 ;; -------------------------------------------------------------------- 360 (define (lex-read-ident src pos file) 361 ;; Public for tests. Threads line/col from a fresh start. 362 (%lex-read-ident src pos 1 (+ pos 1) file)) 363 364 (define (%lex-read-ident src pos line col file) 365 (let ((start-loc (%loc file line col)) 366 (count 0) (npos 0) (nline 0) (ncol 0) 367 (mark 0)) 368 (set! mark (heap-mark)) 369 (let ((sres (%scan-while %ident-cont? src pos line col))) 370 (set! count (car sres)) 371 (set! npos (car (cdr sres))) 372 (set! nline (car (cdr (cdr sres)))) 373 (set! ncol (car (cdr (cdr (cdr sres)))))) 374 (heap-rewind! mark) 375 (let ((name (make-bytevector count 0)) 376 (mark2 0)) 377 (set! mark2 (heap-mark)) 378 (%fill-while-bv %ident-cont? src pos line col name 0) 379 (heap-rewind! mark2) 380 (let ((kw (alist-ref name %keyword-alist))) 381 (cons (if kw 382 (make-tok 'KW kw start-loc) 383 (make-tok 'IDENT name start-loc)) 384 (list npos nline ncol)))))) 385 386 ;; -------------------------------------------------------------------- 387 ;; Number reader. 388 ;; 389 ;; Decimal: [1-9][0-9]* (suffix: u U l L ll LL combinations) 390 ;; Hex: 0x[0-9a-fA-F]+ | 0X... 391 ;; Octal: 0[0-7]* 392 ;; Float: anything looking like 1.0, 1e3, .5 → die crisply. 393 ;; 394 ;; Returns (tok npos nline ncol) on success. Aborts via `die` on float. 395 ;; 396 ;; %accum-int-while folds digit collection and value computation into 397 ;; one walk — no per-byte cons cells, no separate digits-list pass. 398 ;; -------------------------------------------------------------------- 399 (define (lex-read-number src pos file) 400 (%lex-read-number src pos 1 (+ pos 1) file)) 401 402 (define (%lex-read-number src pos line col file) 403 (let* ((start-loc (%loc file line col)) 404 (p (%lex-peek src pos line col)) 405 (b (%pk-byte p))) 406 (cond 407 ;; '0x' / '0X' hex prefix 408 ((and (= b 48) 409 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 410 (b2 (%pk-byte q))) 411 (and b2 (or (= b2 120) (= b2 88))))) ; 'x' or 'X' 412 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 413 (r (%accum-int-while %hex? src 414 (%pk-pos q) (%pk-line q) (%pk-col q) 16)) 415 (val (car r)) 416 (cnt (car (cdr r))) 417 (pos2 (car (cdr (cdr r)))) 418 (line2 (car (cdr (cdr (cdr r))))) 419 (col2 (car (cdr (cdr (cdr (cdr r))))))) 420 (if (zero? cnt) 421 (die start-loc "expected hex digits after 0x") 422 (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file))) 423 (cons (make-tok 'INT val start-loc) after))))) 424 ;; '0' alone → octal sequence (could be just zero) 425 ((= b 48) 426 (let* ((r (%accum-int-while %octal? src 427 (%pk-pos p) (%pk-line p) (%pk-col p) 8)) 428 (val (car r)) 429 (pos2 (car (cdr (cdr r)))) 430 (line2 (car (cdr (cdr (cdr r))))) 431 (col2 (car (cdr (cdr (cdr (cdr r))))))) 432 ;; Reject '.' / 'e' / 'E' immediately after the octal run — float. 433 (%check-no-float src pos2 line2 col2 file start-loc) 434 ;; Reject stray digits 8/9 in an octal context (e.g. 089). 435 (let* ((p3 (%lex-peek src pos2 line2 col2)) 436 (b3 (%pk-byte p3))) 437 (if (and b3 (%digit? b3)) 438 (die start-loc "invalid octal digit" (bv-of-byte b3)) 439 (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file))) 440 (cons (make-tok 'INT val start-loc) after)))))) 441 ;; '1'-'9' → decimal 442 ((%digit? b) 443 (let* ((r (%accum-int-while %digit? src pos line col 10)) 444 (val (car r)) 445 (pos2 (car (cdr (cdr r)))) 446 (line2 (car (cdr (cdr (cdr r))))) 447 (col2 (car (cdr (cdr (cdr (cdr r))))))) 448 (%check-no-float src pos2 line2 col2 file start-loc) 449 (let ((after (%lex-strip-int-suffix src pos2 line2 col2 file))) 450 (cons (make-tok 'INT val start-loc) after)))) 451 ;; '.' followed by a digit = float-style literal — reject. 452 ((= b 46) 453 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 454 (b2 (%pk-byte q))) 455 (if (and b2 (%digit? b2)) 456 (die start-loc "floating-point literal not supported") 457 ;; Otherwise '.' was a punctuator — caller wouldn't have 458 ;; routed here unless it was a digit-led prefix. 459 (die start-loc "internal: number reader on non-number")))) 460 (else 461 (die start-loc "internal: number reader on non-number"))))) 462 463 (define (%check-no-float src pos line col file start-loc) 464 ;; If the byte at pos starts a fractional/exponent part, abort. 465 (let* ((p (%lex-peek src pos line col)) 466 (b (%pk-byte p))) 467 (cond 468 ((not b) #t) 469 ((= b 46) ; '.' 470 (die start-loc "floating-point literal not supported")) 471 ((or (= b 101) (= b 69)) ; 'e' / 'E' 472 ;; Only a float exponent if followed by [+-]?digit. 473 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 474 (b2 (%pk-byte q))) 475 (cond 476 ((and b2 (%digit? b2)) 477 (die start-loc "floating-point literal not supported")) 478 ((and b2 (or (= b2 43) (= b2 45))) 479 (let* ((r (%lex-peek src (%pk-pos q) (%pk-line q) (%pk-col q))) 480 (b3 (%pk-byte r))) 481 (if (and b3 (%digit? b3)) 482 (die start-loc "floating-point literal not supported") 483 #t))) 484 (else #t)))) 485 (else #t)))) 486 487 (define (%lex-strip-int-suffix src pos line col file) 488 ;; Consume any combination of u U l L (the long can be doubled). We 489 ;; don't validate orderings strictly; tcc.c uses the canonical forms. 490 ;; Returns (npos nline ncol). 491 (let loop ((pos pos) (line line) (col col)) 492 (let* ((p (%lex-peek src pos line col)) 493 (b (%pk-byte p))) 494 (cond 495 ((not b) (list pos line col)) 496 ((or (= b 117) (= b 85) ; u U 497 (= b 108) (= b 76)) ; l L 498 (loop (%pk-pos p) (%pk-line p) (%pk-col p))) 499 (else (list pos line col)))))) 500 501 ;; -------------------------------------------------------------------- 502 ;; Escape sequence reader. 503 ;; 504 ;; %scan-or-fill-escape decodes one escape sequence starting at `pos` 505 ;; (which points one past the leading `\\`). When `bv` is a bytevector, 506 ;; the resulting byte is written to (bv idx); when it is #f, no write 507 ;; occurs (used during the string-pass scan phase). Returns the 4-list 508 ;; (val npos nline ncol). 509 ;; -------------------------------------------------------------------- 510 (define (%scan-or-fill-escape src pos line col file start-loc bv idx) 511 (let* ((p (%lex-peek src pos line col)) 512 (b (%pk-byte p))) 513 (cond 514 ((not b) (die start-loc "unterminated escape sequence")) 515 ;; \xNN — 1+ hex digits (tcc.c uses 1- and 2-digit forms). 516 ((or (= b 120) (= b 88)) ; 'x' / 'X' 517 (let* ((r (%accum-int-while %hex? src 518 (%pk-pos p) (%pk-line p) (%pk-col p) 16)) 519 (val0 (car r)) 520 (cnt (car (cdr r))) 521 (pos2 (car (cdr (cdr r)))) 522 (line2 (car (cdr (cdr (cdr r))))) 523 (col2 (car (cdr (cdr (cdr (cdr r))))))) 524 (cond 525 ((zero? cnt) (die start-loc "expected hex digits after \\x")) 526 (else 527 (let ((val (bit-and val0 255))) 528 (cond (bv (bytevector-u8-set! bv idx val)) 529 (else #f)) 530 (list val pos2 line2 col2)))))) 531 ;; \NNN — 1..3 octal digits. 532 ((%octal? b) 533 (let* ((r (%accum-octal-bounded src pos line col 3)) 534 (val0 (car r)) 535 (pos2 (car (cdr (cdr r)))) 536 (line2 (car (cdr (cdr (cdr r))))) 537 (col2 (car (cdr (cdr (cdr (cdr r)))))) 538 (val (bit-and val0 255))) 539 (cond (bv (bytevector-u8-set! bv idx val)) 540 (else #f)) 541 (list val pos2 line2 col2))) 542 (else 543 (let ((val (cond ((= b 110) 10) ; n 544 ((= b 116) 9) ; t 545 ((= b 114) 13) ; r 546 ((= b 92) 92) ; \\ 547 ((= b 39) 39) ; ' 548 ((= b 34) 34) ; " 549 ((= b 48) 0) ; 0 (already handled by octal but be safe) 550 ((= b 97) 7) ; \a -> BEL 551 ((= b 98) 8) ; \b 552 ((= b 102) 12) ; \f 553 ((= b 118) 11) ; \v 554 ((= b 63) 63) ; \? 555 (else 556 (die start-loc "unknown escape" (bv-of-byte b)))))) 557 (cond (bv (bytevector-u8-set! bv idx val)) 558 (else #f)) 559 (list val (%pk-pos p) (%pk-line p) (%pk-col p))))))) 560 561 ;; -------------------------------------------------------------------- 562 ;; String reader. 563 ;; 564 ;; Caller has verified src[pos] == '"' (raw byte 34). Returns 565 ;; (tok npos nline ncol) with the raw decoded bytes (no NUL appended). 566 ;; 567 ;; Two-pass: %string-pass with bv=#f counts effective bytes (escapes 568 ;; collapse to 1 byte each); after rewind we allocate the final bv and 569 ;; rerun with bv set so the bytes are written directly into it. 570 ;; -------------------------------------------------------------------- 571 (define (lex-read-string src pos file) 572 (%lex-read-string src pos 1 (+ pos 1) file)) 573 574 (define (%lex-read-string src pos line col file) 575 (let ((start-loc (%loc file line col)) 576 (cnt 0) (npos 0) (nline 0) (ncol 0) 577 (mark 0)) 578 ;; '"' (34) is a fast-byte and never a trigraph result, so the 579 ;; physical byte at `pos` is exactly the opening quote. 580 (cond 581 ((or (>= pos (bytevector-length src)) 582 (not (= (bytevector-u8-ref src pos) 34))) 583 (die start-loc "internal: string reader on non-quote")) 584 (else 585 (set! mark (heap-mark)) 586 (let ((sres (%string-pass src (+ pos 1) line (+ col 1) 587 file start-loc #f))) 588 (set! cnt (car sres)) 589 (set! npos (car (cdr sres))) 590 (set! nline (car (cdr (cdr sres)))) 591 (set! ncol (car (cdr (cdr (cdr sres)))))) 592 (heap-rewind! mark) 593 (let ((bv (make-bytevector cnt 0)) 594 (mark2 0)) 595 (set! mark2 (heap-mark)) 596 (%string-pass src (+ pos 1) line (+ col 1) file start-loc bv) 597 (heap-rewind! mark2) 598 (cons (make-tok 'STR bv start-loc) 599 (list npos nline ncol))))))) 600 601 (define (%string-pass src pos line col file start-loc bv) 602 ;; Walk the string body (after opening "). When `bv` is #f, count 603 ;; effective bytes; when it is a bytevector, write bytes into it at 604 ;; index 0..count-1. Returns (count npos nline ncol). 605 (let ((n (bytevector-length src))) 606 (let loop ((pos pos) (line line) (col col) (idx 0)) 607 (cond 608 ((>= pos n) (die start-loc "unterminated string literal")) 609 (else 610 (let ((b (bytevector-u8-ref src pos))) 611 (cond 612 ;; Closing quote — fast byte but special. 613 ((= b 34) 614 (list idx (+ pos 1) line (+ col 1))) 615 ((%fast-byte? b) 616 (cond (bv (bytevector-u8-set! bv idx b)) 617 (else #f)) 618 (loop (+ pos 1) line (+ col 1) (+ idx 1))) 619 (else 620 ;; Slow path: ?/\ (trigraph/splice/escape) or '\n'. 621 (let* ((p (%lex-peek src pos line col)) 622 (b2 (%pk-byte p))) 623 (cond 624 ((not b2) 625 (die start-loc "unterminated string literal")) 626 ((= b2 34) 627 (list idx (%pk-pos p) (%pk-line p) (%pk-col p))) 628 ((%newline? b2) 629 (die start-loc "newline in string literal")) 630 ((= b2 92) 631 (let* ((er (%scan-or-fill-escape 632 src (%pk-pos p) (%pk-line p) (%pk-col p) 633 file start-loc bv idx)) 634 (epos (car (cdr er))) 635 (eline (car (cdr (cdr er)))) 636 (ecol (car (cdr (cdr (cdr er)))))) 637 (loop epos eline ecol (+ idx 1)))) 638 (else 639 (cond (bv (bytevector-u8-set! bv idx b2)) 640 (else #f)) 641 (loop (%pk-pos p) (%pk-line p) (%pk-col p) (+ idx 1))))))))))))) 642 643 ;; -------------------------------------------------------------------- 644 ;; Char reader. 645 ;; 646 ;; Caller has verified src[pos] == '\''. Multi-character constants 647 ;; ('AB') are rejected via die. 648 ;; -------------------------------------------------------------------- 649 (define (lex-read-char src pos file) 650 (%lex-read-char src pos 1 (+ pos 1) file)) 651 652 (define (%lex-read-char src pos line col file) 653 (let* ((start-loc (%loc file line col)) 654 (p0 (%lex-peek src pos line col)) 655 (b0 (%pk-byte p0))) 656 (if (not (and b0 (= b0 39))) 657 (die start-loc "internal: char reader on non-quote") 658 (%collect-char src (%pk-pos p0) (%pk-line p0) (%pk-col p0) 659 file start-loc)))) 660 661 (define (%collect-char src pos line col file start-loc) 662 ;; Read exactly one byte (handling escapes), then expect closing '\''. 663 (let* ((p (%lex-peek src pos line col)) 664 (b (%pk-byte p))) 665 (cond 666 ((not b) (die start-loc "unterminated char literal")) 667 ((= b 39) (die start-loc "empty char literal")) 668 ((%newline? b) (die start-loc "newline in char literal")) 669 ((= b 92) ; escape 670 (let* ((r (%scan-or-fill-escape src 671 (%pk-pos p) (%pk-line p) (%pk-col p) 672 file start-loc #f 0)) 673 (val (car r)) 674 (pos2 (car (cdr r))) 675 (line2 (car (cdr (cdr r)))) 676 (col2 (car (cdr (cdr (cdr r)))))) 677 (%expect-char-close src pos2 line2 col2 file start-loc val))) 678 (else 679 (%expect-char-close src (%pk-pos p) (%pk-line p) (%pk-col p) 680 file start-loc b))))) 681 682 (define (%expect-char-close src pos line col file start-loc val) 683 (let* ((p (%lex-peek src pos line col)) 684 (b (%pk-byte p))) 685 (cond 686 ((not b) (die start-loc "unterminated char literal")) 687 ((= b 39) 688 (cons (make-tok 'CHAR val start-loc) 689 (list (%pk-pos p) (%pk-line p) (%pk-col p)))) 690 (else 691 (die start-loc "multi-character char constant not supported"))))) 692 693 ;; -------------------------------------------------------------------- 694 ;; Punctuator reader. 695 ;; 696 ;; Greedy longest-match against %punct-alist (cc/data.scm). The alist 697 ;; is already ordered longest-first. We additionally bucket entries by 698 ;; their first byte so %lex-read-punct only loops over the small set of 699 ;; patterns that can start at the current source byte. 700 ;; -------------------------------------------------------------------- 701 702 (define (%alist-ref-int k al) 703 ;; Lookup in an int-keyed alist (linear scan, '= compare). 704 (cond ((null? al) #f) 705 ((= (car (car al)) k) (cdr (car al))) 706 (else (%alist-ref-int k (cdr al))))) 707 708 (define (%mem-int? k xs) 709 (cond ((null? xs) #f) 710 ((= (car xs) k) #t) 711 (else (%mem-int? k (cdr xs))))) 712 713 (define (%filter-by-first-byte b al) 714 ;; Subset of `al` whose pattern starts with byte b, preserving order. 715 (cond 716 ((null? al) '()) 717 ((= (bytevector-u8-ref (car (car al)) 0) b) 718 (cons (car al) (%filter-by-first-byte b (cdr al)))) 719 (else (%filter-by-first-byte b (cdr al))))) 720 721 (define (%group-by-first-byte al) 722 ;; Build ((first-byte . sub-alist) ...) over `al`, one bucket per 723 ;; distinct first byte; sub-alist preserves longest-match-first 724 ;; order from the source list. 725 (let loop ((xs al) (seen '()) (out '())) 726 (cond 727 ((null? xs) (reverse out)) 728 (else 729 (let* ((entry (car xs)) 730 (pat (car entry)) 731 (b (bytevector-u8-ref pat 0))) 732 (cond 733 ((%mem-int? b seen) (loop (cdr xs) seen out)) 734 (else 735 (loop (cdr xs) 736 (cons b seen) 737 (cons (cons b (%filter-by-first-byte b al)) out))))))))) 738 739 (define %punct-buckets (%group-by-first-byte %punct-alist)) 740 741 (define (lex-read-punct src pos file) 742 (%lex-read-punct src pos 1 (+ pos 1) file)) 743 744 (define (%lex-read-punct src pos line col file) 745 (let* ((start-loc (%loc file line col)) 746 (p (%lex-peek src pos line col)) 747 (b (%pk-byte p))) 748 (cond 749 ((not b) (die start-loc "unrecognized byte" "EOF")) 750 (else 751 (let ((bucket (%alist-ref-int b %punct-buckets))) 752 (cond 753 ((not bucket) (die start-loc "unrecognized byte" (bv-of-byte b))) 754 (else (%punct-loop src pos line col file start-loc bucket)))))))) 755 756 (define (%punct-loop src pos line col file start-loc al) 757 (cond 758 ((null? al) 759 (let* ((p (%lex-peek src pos line col))) 760 (die start-loc "unrecognized byte" 761 (if (%pk-byte p) (bv-of-byte (%pk-byte p)) "EOF")))) 762 (else 763 (let* ((entry (car al)) 764 (pat (car entry)) 765 (sym (cdr entry)) 766 (m (%match-bytes src pos line col pat 0))) 767 (if m 768 (cons (make-tok 'PUNCT sym start-loc) m) 769 (%punct-loop src pos line col file start-loc (cdr al))))))) 770 771 (define (%match-bytes src pos line col pat i) 772 ;; If the next bytes from (pos line col), in logical-byte stream 773 ;; order, equal `pat[i..]`, return (npos nline ncol) after the 774 ;; match. Otherwise #f. 775 (cond 776 ((= i (bytevector-length pat)) (list pos line col)) 777 (else 778 (let ((n (bytevector-length src))) 779 (cond 780 ((>= pos n) #f) 781 (else 782 (let ((b (bytevector-u8-ref src pos)) 783 (pb (bytevector-u8-ref pat i))) 784 (cond 785 ((%fast-byte? b) 786 (if (= b pb) 787 (%match-bytes src (+ pos 1) line (+ col 1) pat (+ i 1)) 788 #f)) 789 (else 790 (let* ((p (%lex-peek src pos line col)) 791 (b2 (%pk-byte p))) 792 (cond 793 ((not b2) #f) 794 ((= b2 pb) 795 (%match-bytes src (%pk-pos p) (%pk-line p) (%pk-col p) 796 pat (+ i 1))) 797 (else #f)))))))))))) 798 799 ;; -------------------------------------------------------------------- 800 ;; lex-tokenize src file -> list of tok ending in EOF. 801 ;; -------------------------------------------------------------------- 802 (define (lex-tokenize src file) 803 (%lex-loop src 0 1 1 file '() #t)) 804 805 ;; bol? — `#t` when no token has been emitted on the current physical 806 ;; line yet (start of file, or only NL + whitespace seen since the last 807 ;; line break). pp recognizes a directive only when its leading `#` is 808 ;; at line-start; we forward that decision into the token stream by 809 ;; emitting `HASH` instead of `(PUNCT hash …)` for a line-leading `#`. 810 (define (%lex-loop src pos line col file acc bol?) 811 (let* ((sw (%skip-ws-and-comments src pos line col file)) 812 (pos1 (car sw)) 813 (line1 (car (cdr sw))) 814 (col1 (car (cdr (cdr sw)))) 815 (p (%lex-peek src pos1 line1 col1)) 816 (b (%pk-byte p))) 817 (cond 818 ;; EOF 819 ((not b) 820 (let* ((eof-tok (make-tok 'EOF #f (%loc file line1 col1)))) 821 (reverse (cons eof-tok acc)))) 822 ;; Newline → emit NL, reset bol?. 823 ((%newline? b) 824 (let ((nl (make-tok 'NL #f (%loc file line1 col1)))) 825 (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p) 826 file (cons nl acc) #t))) 827 ;; Line-leading `#` → emit HASH, but only the bare `#`. `##` is 828 ;; never line-leading in valid C; if it appears, fall through to 829 ;; normal punctuator handling so it lexes as `paste`. 830 ((and bol? (= b 35)) 831 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 832 (b2 (%pk-byte q))) 833 (cond 834 ((and b2 (= b2 35)) 835 (let* ((r (%lex-read-punct src pos1 line1 col1 file)) 836 (tok (car r)) 837 (npos (car (cdr r))) 838 (nline (car (cdr (cdr r)))) 839 (ncol (car (cdr (cdr (cdr r)))))) 840 (%lex-loop src npos nline ncol file (cons tok acc) #f))) 841 (else 842 (let ((tok (make-tok 'HASH #f (%loc file line1 col1)))) 843 (%lex-loop src (%pk-pos p) (%pk-line p) (%pk-col p) 844 file (cons tok acc) #f)))))) 845 ;; Identifier / keyword 846 ((%ident-start? b) 847 (let* ((r (%lex-read-ident src pos1 line1 col1 file)) 848 (tok (car r)) 849 (npos (car (cdr r))) 850 (nline (car (cdr (cdr r)))) 851 (ncol (car (cdr (cdr (cdr r)))))) 852 (%lex-loop src npos nline ncol file (cons tok acc) #f))) 853 ;; Number (digit start) 854 ((%digit? b) 855 (let* ((r (%lex-read-number src pos1 line1 col1 file)) 856 (tok (car r)) 857 (npos (car (cdr r))) 858 (nline (car (cdr (cdr r)))) 859 (ncol (car (cdr (cdr (cdr r)))))) 860 (%lex-loop src npos nline ncol file (cons tok acc) #f))) 861 ;; '.' might start a number (1.0 actually starts with digit; .5 862 ;; would route here). We keep this as a punctuator unless followed 863 ;; by a digit, in which case the lexer rejects per spec. 864 ((= b 46) 865 (let* ((q (%lex-peek src (%pk-pos p) (%pk-line p) (%pk-col p))) 866 (b2 (%pk-byte q))) 867 (cond 868 ((and b2 (%digit? b2)) 869 (die (%loc file line1 col1) "floating-point literal not supported")) 870 (else 871 (let* ((r (%lex-read-punct src pos1 line1 col1 file)) 872 (tok (car r)) 873 (npos (car (cdr r))) 874 (nline (car (cdr (cdr r)))) 875 (ncol (car (cdr (cdr (cdr r)))))) 876 (%lex-loop src npos nline ncol file (cons tok acc) #f)))))) 877 ;; String 878 ((= b 34) 879 (let* ((r (%lex-read-string src pos1 line1 col1 file)) 880 (tok (car r)) 881 (npos (car (cdr r))) 882 (nline (car (cdr (cdr r)))) 883 (ncol (car (cdr (cdr (cdr r)))))) 884 (%lex-loop src npos nline ncol file (cons tok acc) #f))) 885 ;; Char 886 ((= b 39) 887 (let* ((r (%lex-read-char src pos1 line1 col1 file)) 888 (tok (car r)) 889 (npos (car (cdr r))) 890 (nline (car (cdr (cdr r)))) 891 (ncol (car (cdr (cdr (cdr r)))))) 892 (%lex-loop src npos nline ncol file (cons tok acc) #f))) 893 ;; Punctuator (default) 894 (else 895 (let* ((r (%lex-read-punct src pos1 line1 col1 file)) 896 (tok (car r)) 897 (npos (car (cdr r))) 898 (nline (car (cdr (cdr r)))) 899 (ncol (car (cdr (cdr (cdr r)))))) 900 (%lex-loop src npos nline ncol file (cons tok acc) #f))))))