lex.c (20578B)
1 /* C11 lexer (§6.4). Streams tokens out of a borrowed source buffer. 2 * 3 * Tokens are recognized per the standard's lexical grammar: 4 * - identifiers (§6.4.2) — keyword bucketing happens later in parse_c 5 * - pp-numbers (§6.4.8), classified into TOK_NUM / TOK_FLT 6 * - string literals (§6.4.5) and character constants (§6.4.4.4) 7 * including the L/u/u8/U encoding prefixes 8 * - punctuators (§6.4.6), longest-match, including digraphs 9 * - `#` and `##` surface as TOK_PP_HASH / TOK_PP_PASTE so the 10 * preprocessor can recognize directives and the paste operator 11 * 12 * Comments (§6.4.9) are consumed as whitespace; physical newlines surface 13 * as TOK_NEWLINE so PP can implement directive-line semantics. */ 14 15 #include "lex/lex.h" 16 17 #include <string.h> 18 19 struct Lexer { 20 Compiler* c; 21 Pool* pool; 22 Heap* heap; 23 const char* src; 24 size_t len; 25 size_t pos; 26 u32 file_id; 27 u32 line; 28 u32 col; 29 u8 at_bol; 30 u8 had_space; 31 /* §5.1.1.2 phase 4 directive context for header-name lexing. 32 * 0 = none, 1 = saw pp-hash, 2 = saw `#include`/etc and the next 33 * token may be a header-name. */ 34 u8 dstate; 35 }; 36 37 /* §5.1.1.2 translation phase 2: splice physical lines joined by 38 * backslash-newline. Advance past any splice sequence at l->pos so the 39 * cursor never rests on the leading backslash of a splice. */ 40 static void skip_splices(Lexer* l) { 41 while (l->pos + 1 < l->len && l->src[l->pos] == '\\' && 42 l->src[l->pos + 1] == '\n') { 43 l->pos += 2; 44 l->line++; 45 l->col = 1; 46 } 47 } 48 49 /* Logical peek: returns the off-th post-splice byte starting at l->pos, 50 * or -1 at end of input. Does not mutate l->pos. */ 51 static int peek(const Lexer* l, size_t off) { 52 size_t pos = l->pos; 53 size_t k = 0; 54 while (pos < l->len) { 55 if (pos + 1 < l->len && l->src[pos] == '\\' && l->src[pos + 1] == '\n') { 56 pos += 2; 57 continue; 58 } 59 if (k == off) return (unsigned char)l->src[pos]; 60 ++pos; 61 ++k; 62 } 63 return -1; 64 } 65 66 static int bump(Lexer* l) { 67 int ch; 68 skip_splices(l); 69 if (l->pos >= l->len) return -1; 70 ch = (unsigned char)l->src[l->pos++]; 71 if (ch == '\n') { 72 l->line++; 73 l->col = 1; 74 } else { 75 l->col++; 76 } 77 return ch; 78 } 79 80 static int is_digit(int c) { return c >= '0' && c <= '9'; } 81 static int is_hex_digit(int c) { 82 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || 83 (c >= 'A' && c <= 'F'); 84 } 85 /* Identifier-start byte (§6.4.2.1). Letters and underscore are ASCII; bytes 86 * ≥ 0x80 are accepted as the implementation-defined "other characters" 87 * permitted in identifiers — in practice UTF-8 lead/continuation bytes for 88 * extended source characters. UCNs are matched separately via ucn_len since 89 * they span multiple source bytes. */ 90 static int is_alpha(int c) { 91 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || 92 c >= 0x80; 93 } 94 static int is_alnum(int c) { return is_alpha(c) || is_digit(c); } 95 96 /* Match a UCN at offset `off` from the current position. Returns the total 97 * length (6 for \uXXXX, 10 for \UXXXXXXXX), or 0 if no UCN matches. The 98 * range constraints from §6.4.3 (no UCN < 00A0 except $/@/`, and none in 99 * D800–DFFF) are not enforced here — the lexical form is matched and any 100 * downstream phase that cares can diagnose. */ 101 static int ucn_len(const Lexer* l, size_t off) { 102 int n, i; 103 if (peek(l, off) != '\\') return 0; 104 if (peek(l, off + 1) == 'u') 105 n = 4; 106 else if (peek(l, off + 1) == 'U') 107 n = 8; 108 else 109 return 0; 110 for (i = 0; i < n; ++i) { 111 if (!is_hex_digit(peek(l, off + 2 + i))) return 0; 112 } 113 return 2 + n; 114 } 115 116 static SrcLoc lex_here(const Lexer* l) { 117 SrcLoc loc; 118 loc.file_id = l->file_id; 119 loc.line = l->line; 120 loc.col = l->col; 121 return loc; 122 } 123 124 Lexer* lex_open_mem(Compiler* c, const char* name, const char* src, 125 size_t len) { 126 Heap* h = (Heap*)kit_compiler_context(c)->heap; 127 Lexer* l = (Lexer*)h->alloc(h, sizeof(*l), _Alignof(Lexer)); 128 if (!l) return NULL; 129 memset(l, 0, sizeof(*l)); 130 l->c = c; 131 l->pool = c_pool_new(c); 132 if (!l->pool) { 133 h->free(h, l, sizeof(*l)); 134 return NULL; 135 } 136 l->heap = h; 137 l->src = src ? src : ""; 138 l->len = src ? len : 0; 139 l->pos = 0; 140 l->file_id = 0; 141 (void)kit_source_add_memory(c, kit_slice_cstr(name), &l->file_id); 142 l->line = 1; 143 l->col = 1; 144 l->at_bol = 1; 145 l->had_space = 0; 146 return l; 147 } 148 149 void lex_close(Lexer* l) { 150 if (!l) return; 151 c_pool_free(l->pool); 152 l->heap->free(l->heap, l, sizeof(*l)); 153 } 154 155 /* Skip a script "shebang" line: a `#!` at the very start of the source. 156 * The kernel-level `#!/path interpreter` mechanism (used to make a C file 157 * executable via `kit run`) leaves the interpreter line as the first line of 158 * the file, which is not valid C — `#!` would otherwise be lexed as a `#` 159 * directive introducer. We only recognize it at byte 0, so a `#!` anywhere 160 * else is left untouched. The line's trailing newline is left in place so the 161 * lexer emits its TOK_NEWLINE and line numbering stays accurate (the shebang 162 * remains line 1). No-op unless the buffer begins with the two bytes `#!`. 163 * Apply only to a primary source file, never to includes/paste buffers. */ 164 void lex_skip_shebang(Lexer* l) { 165 if (!l || l->pos != 0) return; 166 if (l->len < 2 || l->src[0] != '#' || l->src[1] != '!') return; 167 while (l->pos < l->len && l->src[l->pos] != '\n') l->pos++; 168 } 169 170 SrcLoc lex_loc(const Lexer* l) { return lex_here(l); } 171 u32 lex_file_id(const Lexer* l) { return l->file_id; } 172 173 /* Intern bytes [start, end) with line splices (\<newline>) removed, so token 174 * spellings reflect post-phase-2 logical text. */ 175 static Sym intern_spliced(Lexer* l, size_t start, size_t end) { 176 size_t i; 177 int has_splice = 0; 178 char* buf; 179 size_t k; 180 Sym sym; 181 182 for (i = start; i + 1 < end; ++i) { 183 if (l->src[i] == '\\' && l->src[i + 1] == '\n') { 184 has_splice = 1; 185 break; 186 } 187 } 188 if (!has_splice) 189 return kit_sym_intern(l->pool->c, 190 (KitSlice){.s = l->src + start, .len = end - start}); 191 192 buf = (char*)l->heap->alloc(l->heap, end - start, 1); 193 k = 0; 194 for (i = start; i < end;) { 195 if (i + 1 < end && l->src[i] == '\\' && l->src[i + 1] == '\n') { 196 i += 2; 197 continue; 198 } 199 buf[k++] = l->src[i++]; 200 } 201 sym = kit_sym_intern(l->pool->c, (KitSlice){.s = buf, .len = k}); 202 l->heap->free(l->heap, buf, end - start); 203 return sym; 204 } 205 206 /* §6.4.7 header-name lookahead: in include-directive context, a `<` or `"` 207 * starts a header-name that runs to the matching `>` or `"`. The lexer 208 * recognizes only header-name forms (whose contents are implementation 209 * defined), not q-char-sequence escape rules. */ 210 static int matches_include_kw(const char* s, size_t n) { 211 if (n == 7 && memcmp(s, "include", 7) == 0) return 1; 212 if (n == 12 && memcmp(s, "include_next", 12) == 0) return 1; 213 if (n == 6 && memcmp(s, "import", 6) == 0) return 1; 214 if (n == 5 && memcmp(s, "embed", 5) == 0) return 1; 215 return 0; 216 } 217 218 /* Skip whitespace and comments. Returns 1 if a newline boundary was crossed 219 * via comment consumption (caller still emits the explicit newline token on 220 * an in-source '\n'). */ 221 static void skip_ws_and_comments(Lexer* l) { 222 for (;;) { 223 int ch = peek(l, 0); 224 if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f') { 225 bump(l); 226 l->had_space = 1; 227 continue; 228 } 229 if (ch == '/' && peek(l, 1) == '/') { 230 bump(l); 231 bump(l); 232 while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l); 233 l->had_space = 1; 234 continue; 235 } 236 if (ch == '/' && peek(l, 1) == '*') { 237 bump(l); 238 bump(l); 239 while (peek(l, 0) >= 0) { 240 if (peek(l, 0) == '*' && peek(l, 1) == '/') { 241 bump(l); 242 bump(l); 243 break; 244 } 245 bump(l); 246 } 247 l->had_space = 1; 248 continue; 249 } 250 break; 251 } 252 } 253 254 /* Consume a pp-number per §6.4.8. The cursor is positioned at the leading 255 * digit (or `.` followed by a digit) on entry. */ 256 static void scan_pp_number(Lexer* l) { 257 if (peek(l, 0) == '.') bump(l); 258 bump(l); /* first digit */ 259 while (l->pos < l->len) { 260 int c = peek(l, 0); 261 int n = peek(l, 1); 262 if ((c == 'e' || c == 'E' || c == 'p' || c == 'P') && 263 (n == '+' || n == '-')) { 264 bump(l); 265 bump(l); 266 } else if (is_alnum(c) || c == '.') { 267 bump(l); 268 } else { 269 break; 270 } 271 } 272 } 273 274 /* 1 if the pp-number text is a floating constant (§6.4.4.2): contains a 275 * radix `.`, a hex `p`/`P` exponent, or a decimal `e`/`E` exponent. */ 276 static int pp_number_is_float(const char* s, size_t n) { 277 int is_hex = 0; 278 size_t i = 0; 279 if (n >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { 280 is_hex = 1; 281 i = 2; 282 } 283 for (; i < n; ++i) { 284 char c = s[i]; 285 if (c == '.') return 1; 286 if (is_hex && (c == 'p' || c == 'P')) return 1; 287 if (!is_hex && (c == 'e' || c == 'E')) { 288 if (i + 1 < n) { 289 char nx = s[i + 1]; 290 if (nx == '+' || nx == '-' || (nx >= '0' && nx <= '9')) return 1; 291 } 292 } 293 } 294 return 0; 295 } 296 297 /* Consume a quoted body — string ('"') or character ('\''). The cursor is 298 * positioned at the opening quote on entry. Returns 1 on an unterminated or 299 * newline-broken literal, 0 on a clean close. */ 300 static int scan_quoted(Lexer* l, int quote) { 301 bump(l); /* opening quote */ 302 for (;;) { 303 int ch = peek(l, 0); 304 if (ch < 0) return 1; 305 if (ch == quote) { 306 bump(l); 307 return 0; 308 } 309 if (ch == '\n') return 1; 310 if (ch == '\\') { 311 bump(l); /* backslash */ 312 if (peek(l, 0) < 0) return 1; 313 bump(l); /* the escaped char */ 314 continue; 315 } 316 bump(l); 317 } 318 } 319 320 Tok lex_next(Lexer* l) { 321 Tok t; 322 SrcLoc tloc; 323 size_t start; 324 int ch; 325 326 memset(&t, 0, sizeof(t)); 327 328 /* Skip whitespace and comments. A newline token is emitted before any 329 * subsequent content tokens for the line that follows. */ 330 for (;;) { 331 skip_ws_and_comments(l); 332 skip_splices(l); 333 if (l->pos >= l->len) { 334 t.kind = TOK_EOF; 335 t.loc = lex_here(l); 336 return t; 337 } 338 if (peek(l, 0) == '\n') { 339 tloc = lex_here(l); 340 bump(l); 341 t.kind = TOK_NEWLINE; 342 t.loc = tloc; 343 l->at_bol = 1; 344 l->had_space = 0; 345 l->dstate = 0; 346 return t; 347 } 348 break; 349 } 350 351 tloc = lex_here(l); 352 start = l->pos; 353 ch = peek(l, 0); 354 355 if (l->at_bol) t.flags |= TF_AT_BOL; 356 if (l->had_space) t.flags |= TF_HAS_SPACE; 357 l->at_bol = 0; 358 l->had_space = 0; 359 t.loc = tloc; 360 361 /* §6.4.7 header-name: only valid in #include / #embed argument context. */ 362 if (l->dstate == 2 && (ch == '<' || ch == '"')) { 363 int closer = (ch == '<') ? '>' : '"'; 364 bump(l); 365 for (;;) { 366 int c = peek(l, 0); 367 if (c < 0 || c == '\n') { 368 t.flags |= TF_LITERAL_BAD; 369 break; 370 } 371 if (c == closer) { 372 bump(l); 373 break; 374 } 375 bump(l); 376 } 377 t.kind = TOK_HEADER; 378 t.spelling = intern_spliced(l, start, l->pos); 379 t.v.str = t.spelling; 380 l->dstate = 0; 381 return t; 382 } 383 384 /* String / character literal, with optional encoding prefix. The prefix 385 * length and encoding flag are decoded together so the spelling we 386 * intern includes the prefix bytes. */ 387 { 388 int sp_len = -1; 389 int is_char = 0; 390 u32 encf = 0; 391 392 if (ch == '"') { 393 sp_len = 0; 394 is_char = 0; 395 } else if (ch == '\'') { 396 sp_len = 0; 397 is_char = 1; 398 } else if (ch == 'L' && peek(l, 1) == '"') { 399 sp_len = 1; 400 is_char = 0; 401 encf = TF_STR_WIDE; 402 } else if (ch == 'L' && peek(l, 1) == '\'') { 403 sp_len = 1; 404 is_char = 1; 405 encf = TF_STR_WIDE; 406 } else if (ch == 'u' && peek(l, 1) == '8' && peek(l, 2) == '"') { 407 sp_len = 2; 408 is_char = 0; 409 encf = TF_STR_U8; 410 } else if (ch == 'u' && peek(l, 1) == '"') { 411 sp_len = 1; 412 is_char = 0; 413 encf = TF_STR_U16; 414 } else if (ch == 'u' && peek(l, 1) == '\'') { 415 sp_len = 1; 416 is_char = 1; 417 encf = TF_STR_U16; 418 } else if (ch == 'U' && peek(l, 1) == '"') { 419 sp_len = 1; 420 is_char = 0; 421 encf = TF_STR_U32; 422 } else if (ch == 'U' && peek(l, 1) == '\'') { 423 sp_len = 1; 424 is_char = 1; 425 encf = TF_STR_U32; 426 } 427 428 if (sp_len >= 0) { 429 int i; 430 for (i = 0; i < sp_len; ++i) bump(l); 431 if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD; 432 t.kind = (u16)(is_char ? TOK_CHR : TOK_STR); 433 t.flags |= encf; 434 t.spelling = intern_spliced(l, start, l->pos); 435 t.v.str = t.spelling; 436 l->dstate = 0; 437 return t; 438 } 439 } 440 441 /* Identifier (§6.4.2). Encoding-prefix candidates above are matched 442 * before this since L/u/U followed by a quote is a literal, not an 443 * identifier. The grammar's identifier-nondigit covers letters, _, 444 * extended source chars (impl-defined; bytes ≥ 0x80 here), and UCNs 445 * (§6.4.3) — the latter span multiple source bytes so they're matched 446 * via ucn_len rather than the per-byte is_alpha predicate. */ 447 { 448 int u = ucn_len(l, 0); 449 if (is_alpha(ch) || u) { 450 if (u) { 451 int i; 452 for (i = 0; i < u; ++i) bump(l); 453 } else 454 bump(l); 455 for (;;) { 456 int c = peek(l, 0); 457 if (is_alnum(c)) { 458 bump(l); 459 } else if ((u = ucn_len(l, 0))) { 460 int i; 461 for (i = 0; i < u; ++i) bump(l); 462 } else { 463 break; 464 } 465 } 466 t.kind = TOK_IDENT; 467 t.spelling = intern_spliced(l, start, l->pos); 468 t.v.ident = t.spelling; 469 if (l->dstate == 1) { 470 KitSlice s = kit_sym_str(l->pool->c, t.spelling); 471 l->dstate = (s.s && matches_include_kw(s.s, s.len)) ? 2 : 0; 472 } else { 473 l->dstate = 0; 474 } 475 return t; 476 } 477 } 478 479 /* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */ 480 if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) { 481 size_t plen; 482 char* pbuf; 483 size_t i, k; 484 scan_pp_number(l); 485 /* Classify on the post-splice text (the spelling we'll intern). */ 486 plen = l->pos - start; 487 pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1); 488 k = 0; 489 for (i = start; i < l->pos;) { 490 if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') { 491 i += 2; 492 continue; 493 } 494 pbuf[k++] = l->src[i++]; 495 } 496 t.kind = (u16)(pp_number_is_float(pbuf, k) ? TOK_FLT : TOK_NUM); 497 /* Suffix flags for §6.4.4.1 / §6.4.4.2. The parser dispatches on 498 * TF_INT_U/L/LL and TF_FLT_F/L to pick a TY_* tag for the literal, 499 * so missing flags would silently coerce `42U`/`42.0f` to plain 500 * int/double. */ 501 if (t.kind == TOK_FLT) { 502 size_t j = k; 503 while (j > 0) { 504 char c = pbuf[j - 1]; 505 if (c == 'f' || c == 'F') { 506 t.flags |= TF_FLT_F; 507 --j; 508 continue; 509 } 510 if (c == 'l' || c == 'L') { 511 t.flags |= TF_FLT_L; 512 --j; 513 continue; 514 } 515 break; 516 } 517 } else { 518 size_t j = k; 519 while (j > 0) { 520 char c = pbuf[j - 1]; 521 if (c == 'u' || c == 'U') { 522 t.flags |= TF_INT_U; 523 --j; 524 continue; 525 } 526 if (c == 'l' || c == 'L') { 527 if (j >= 2 && (pbuf[j - 2] == 'l' || pbuf[j - 2] == 'L')) { 528 t.flags |= TF_INT_LL; 529 j -= 2; 530 } else { 531 t.flags |= TF_INT_L; 532 --j; 533 } 534 continue; 535 } 536 break; 537 } 538 } 539 t.spelling = kit_sym_intern(l->pool->c, (KitSlice){.s = pbuf, .len = k}); 540 l->heap->free(l->heap, pbuf, plen ? plen : 1); 541 l->dstate = 0; 542 return t; 543 } 544 545 /* Punctuator (§6.4.6) — longest match. `#` and `##` (and their digraph 546 * forms `%:` and `%:%:`) become TOK_PP_HASH / TOK_PP_PASTE so PP can 547 * recognize directives and the paste operator. */ 548 { 549 int n0 = peek(l, 0); 550 int n1 = peek(l, 1); 551 int n2 = peek(l, 2); 552 int n3 = peek(l, 3); 553 int adv = 1; 554 u32 punct = P_NONE; 555 u16 kind = TOK_PUNCT; 556 int i; 557 558 switch (n0) { 559 case '#': 560 if (n1 == '#') { 561 adv = 2; 562 kind = TOK_PP_PASTE; 563 punct = P_HASH_HASH; 564 } else { 565 adv = 1; 566 kind = TOK_PP_HASH; 567 punct = '#'; 568 } 569 break; 570 case '.': 571 if (n1 == '.' && n2 == '.') { 572 adv = 3; 573 punct = P_ELLIPSIS; 574 } else { 575 adv = 1; 576 punct = '.'; 577 } 578 break; 579 case '-': 580 if (n1 == '>') { 581 adv = 2; 582 punct = P_ARROW; 583 } else if (n1 == '-') { 584 adv = 2; 585 punct = P_DEC; 586 } else if (n1 == '=') { 587 adv = 2; 588 punct = P_SUB_ASSIGN; 589 } else { 590 adv = 1; 591 punct = '-'; 592 } 593 break; 594 case '+': 595 if (n1 == '+') { 596 adv = 2; 597 punct = P_INC; 598 } else if (n1 == '=') { 599 adv = 2; 600 punct = P_ADD_ASSIGN; 601 } else { 602 adv = 1; 603 punct = '+'; 604 } 605 break; 606 case '<': 607 if (n1 == '<' && n2 == '=') { 608 adv = 3; 609 punct = P_SHL_ASSIGN; 610 } else if (n1 == '<') { 611 adv = 2; 612 punct = P_SHL; 613 } else if (n1 == '=') { 614 adv = 2; 615 punct = P_LE; 616 } else if (n1 == ':') { 617 adv = 2; 618 punct = '['; 619 } /* digraph */ 620 else if (n1 == '%') { 621 adv = 2; 622 punct = '{'; 623 } /* digraph */ 624 else { 625 adv = 1; 626 punct = '<'; 627 } 628 break; 629 case '>': 630 if (n1 == '>' && n2 == '=') { 631 adv = 3; 632 punct = P_SHR_ASSIGN; 633 } else if (n1 == '>') { 634 adv = 2; 635 punct = P_SHR; 636 } else if (n1 == '=') { 637 adv = 2; 638 punct = P_GE; 639 } else { 640 adv = 1; 641 punct = '>'; 642 } 643 break; 644 case '=': 645 if (n1 == '=') { 646 adv = 2; 647 punct = P_EQ; 648 } else { 649 adv = 1; 650 punct = '='; 651 } 652 break; 653 case '!': 654 if (n1 == '=') { 655 adv = 2; 656 punct = P_NE; 657 } else { 658 adv = 1; 659 punct = '!'; 660 } 661 break; 662 case '&': 663 if (n1 == '&') { 664 adv = 2; 665 punct = P_AND; 666 } else if (n1 == '=') { 667 adv = 2; 668 punct = P_AND_ASSIGN; 669 } else { 670 adv = 1; 671 punct = '&'; 672 } 673 break; 674 case '|': 675 if (n1 == '|') { 676 adv = 2; 677 punct = P_OR; 678 } else if (n1 == '=') { 679 adv = 2; 680 punct = P_OR_ASSIGN; 681 } else { 682 adv = 1; 683 punct = '|'; 684 } 685 break; 686 case '^': 687 if (n1 == '=') { 688 adv = 2; 689 punct = P_XOR_ASSIGN; 690 } else { 691 adv = 1; 692 punct = '^'; 693 } 694 break; 695 case '*': 696 if (n1 == '=') { 697 adv = 2; 698 punct = P_MUL_ASSIGN; 699 } else { 700 adv = 1; 701 punct = '*'; 702 } 703 break; 704 case '/': 705 if (n1 == '=') { 706 adv = 2; 707 punct = P_DIV_ASSIGN; 708 } else { 709 adv = 1; 710 punct = '/'; 711 } 712 break; 713 case '%': 714 if (n1 == ':' && n2 == '%' && n3 == ':') { 715 adv = 4; 716 kind = TOK_PP_PASTE; 717 punct = P_HASH_HASH; 718 } else if (n1 == ':') { 719 adv = 2; 720 kind = TOK_PP_HASH; 721 punct = '#'; 722 } else if (n1 == '=') { 723 adv = 2; 724 punct = P_MOD_ASSIGN; 725 } else if (n1 == '>') { 726 adv = 2; 727 punct = '}'; 728 } /* digraph */ 729 else { 730 adv = 1; 731 punct = '%'; 732 } 733 break; 734 case ':': 735 if (n1 == '>') { 736 adv = 2; 737 punct = ']'; 738 } /* digraph */ 739 else { 740 adv = 1; 741 punct = ':'; 742 } 743 break; 744 case '(': 745 case ')': 746 case '{': 747 case '}': 748 case '[': 749 case ']': 750 case ',': 751 case ';': 752 case '?': 753 case '~': 754 adv = 1; 755 punct = (u32)n0; 756 break; 757 default: 758 /* Unknown byte. Surface as a single-char punct so the token 759 * stream still progresses; PP/parse may diagnose. */ 760 adv = 1; 761 punct = (u32)n0; 762 break; 763 } 764 765 for (i = 0; i < adv; ++i) bump(l); 766 t.kind = kind; 767 t.v.punct = punct; 768 t.spelling = intern_spliced(l, start, l->pos); 769 if (kind == TOK_PP_HASH) 770 l->dstate = 1; 771 else 772 l->dstate = 0; 773 return t; 774 } 775 }