kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

lex.c (20578B)


      1 /* C11 lexer (§6.4). Streams tokens out of a borrowed source buffer.
      2  *
      3  * Tokens are recognized per the standard's lexical grammar:
      4  *   - identifiers (§6.4.2) — keyword bucketing happens later in parse_c
      5  *   - pp-numbers (§6.4.8), classified into TOK_NUM / TOK_FLT
      6  *   - string literals (§6.4.5) and character constants (§6.4.4.4)
      7  *     including the L/u/u8/U encoding prefixes
      8  *   - punctuators (§6.4.6), longest-match, including digraphs
      9  *   - `#` and `##` surface as TOK_PP_HASH / TOK_PP_PASTE so the
     10  *     preprocessor can recognize directives and the paste operator
     11  *
     12  * Comments (§6.4.9) are consumed as whitespace; physical newlines surface
     13  * as TOK_NEWLINE so PP can implement directive-line semantics. */
     14 
     15 #include "lex/lex.h"
     16 
     17 #include <string.h>
     18 
     19 struct Lexer {
     20   Compiler* c;
     21   Pool* pool;
     22   Heap* heap;
     23   const char* src;
     24   size_t len;
     25   size_t pos;
     26   u32 file_id;
     27   u32 line;
     28   u32 col;
     29   u8 at_bol;
     30   u8 had_space;
     31   /* §5.1.1.2 phase 4 directive context for header-name lexing.
     32    * 0 = none, 1 = saw pp-hash, 2 = saw `#include`/etc and the next
     33    * token may be a header-name. */
     34   u8 dstate;
     35 };
     36 
     37 /* §5.1.1.2 translation phase 2: splice physical lines joined by
     38  * backslash-newline. Advance past any splice sequence at l->pos so the
     39  * cursor never rests on the leading backslash of a splice. */
     40 static void skip_splices(Lexer* l) {
     41   while (l->pos + 1 < l->len && l->src[l->pos] == '\\' &&
     42          l->src[l->pos + 1] == '\n') {
     43     l->pos += 2;
     44     l->line++;
     45     l->col = 1;
     46   }
     47 }
     48 
     49 /* Logical peek: returns the off-th post-splice byte starting at l->pos,
     50  * or -1 at end of input. Does not mutate l->pos. */
     51 static int peek(const Lexer* l, size_t off) {
     52   size_t pos = l->pos;
     53   size_t k = 0;
     54   while (pos < l->len) {
     55     if (pos + 1 < l->len && l->src[pos] == '\\' && l->src[pos + 1] == '\n') {
     56       pos += 2;
     57       continue;
     58     }
     59     if (k == off) return (unsigned char)l->src[pos];
     60     ++pos;
     61     ++k;
     62   }
     63   return -1;
     64 }
     65 
     66 static int bump(Lexer* l) {
     67   int ch;
     68   skip_splices(l);
     69   if (l->pos >= l->len) return -1;
     70   ch = (unsigned char)l->src[l->pos++];
     71   if (ch == '\n') {
     72     l->line++;
     73     l->col = 1;
     74   } else {
     75     l->col++;
     76   }
     77   return ch;
     78 }
     79 
     80 static int is_digit(int c) { return c >= '0' && c <= '9'; }
     81 static int is_hex_digit(int c) {
     82   return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
     83          (c >= 'A' && c <= 'F');
     84 }
     85 /* Identifier-start byte (§6.4.2.1). Letters and underscore are ASCII; bytes
     86  * ≥ 0x80 are accepted as the implementation-defined "other characters"
     87  * permitted in identifiers — in practice UTF-8 lead/continuation bytes for
     88  * extended source characters. UCNs are matched separately via ucn_len since
     89  * they span multiple source bytes. */
     90 static int is_alpha(int c) {
     91   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' ||
     92          c >= 0x80;
     93 }
     94 static int is_alnum(int c) { return is_alpha(c) || is_digit(c); }
     95 
     96 /* Match a UCN at offset `off` from the current position. Returns the total
     97  * length (6 for \uXXXX, 10 for \UXXXXXXXX), or 0 if no UCN matches. The
     98  * range constraints from §6.4.3 (no UCN < 00A0 except $/@/`, and none in
     99  * D800–DFFF) are not enforced here — the lexical form is matched and any
    100  * downstream phase that cares can diagnose. */
    101 static int ucn_len(const Lexer* l, size_t off) {
    102   int n, i;
    103   if (peek(l, off) != '\\') return 0;
    104   if (peek(l, off + 1) == 'u')
    105     n = 4;
    106   else if (peek(l, off + 1) == 'U')
    107     n = 8;
    108   else
    109     return 0;
    110   for (i = 0; i < n; ++i) {
    111     if (!is_hex_digit(peek(l, off + 2 + i))) return 0;
    112   }
    113   return 2 + n;
    114 }
    115 
    116 static SrcLoc lex_here(const Lexer* l) {
    117   SrcLoc loc;
    118   loc.file_id = l->file_id;
    119   loc.line = l->line;
    120   loc.col = l->col;
    121   return loc;
    122 }
    123 
    124 Lexer* lex_open_mem(Compiler* c, const char* name, const char* src,
    125                     size_t len) {
    126   Heap* h = (Heap*)kit_compiler_context(c)->heap;
    127   Lexer* l = (Lexer*)h->alloc(h, sizeof(*l), _Alignof(Lexer));
    128   if (!l) return NULL;
    129   memset(l, 0, sizeof(*l));
    130   l->c = c;
    131   l->pool = c_pool_new(c);
    132   if (!l->pool) {
    133     h->free(h, l, sizeof(*l));
    134     return NULL;
    135   }
    136   l->heap = h;
    137   l->src = src ? src : "";
    138   l->len = src ? len : 0;
    139   l->pos = 0;
    140   l->file_id = 0;
    141   (void)kit_source_add_memory(c, kit_slice_cstr(name), &l->file_id);
    142   l->line = 1;
    143   l->col = 1;
    144   l->at_bol = 1;
    145   l->had_space = 0;
    146   return l;
    147 }
    148 
    149 void lex_close(Lexer* l) {
    150   if (!l) return;
    151   c_pool_free(l->pool);
    152   l->heap->free(l->heap, l, sizeof(*l));
    153 }
    154 
    155 /* Skip a script "shebang" line: a `#!` at the very start of the source.
    156  * The kernel-level `#!/path interpreter` mechanism (used to make a C file
    157  * executable via `kit run`) leaves the interpreter line as the first line of
    158  * the file, which is not valid C — `#!` would otherwise be lexed as a `#`
    159  * directive introducer. We only recognize it at byte 0, so a `#!` anywhere
    160  * else is left untouched. The line's trailing newline is left in place so the
    161  * lexer emits its TOK_NEWLINE and line numbering stays accurate (the shebang
    162  * remains line 1). No-op unless the buffer begins with the two bytes `#!`.
    163  * Apply only to a primary source file, never to includes/paste buffers. */
    164 void lex_skip_shebang(Lexer* l) {
    165   if (!l || l->pos != 0) return;
    166   if (l->len < 2 || l->src[0] != '#' || l->src[1] != '!') return;
    167   while (l->pos < l->len && l->src[l->pos] != '\n') l->pos++;
    168 }
    169 
    170 SrcLoc lex_loc(const Lexer* l) { return lex_here(l); }
    171 u32 lex_file_id(const Lexer* l) { return l->file_id; }
    172 
    173 /* Intern bytes [start, end) with line splices (\<newline>) removed, so token
    174  * spellings reflect post-phase-2 logical text. */
    175 static Sym intern_spliced(Lexer* l, size_t start, size_t end) {
    176   size_t i;
    177   int has_splice = 0;
    178   char* buf;
    179   size_t k;
    180   Sym sym;
    181 
    182   for (i = start; i + 1 < end; ++i) {
    183     if (l->src[i] == '\\' && l->src[i + 1] == '\n') {
    184       has_splice = 1;
    185       break;
    186     }
    187   }
    188   if (!has_splice)
    189     return kit_sym_intern(l->pool->c,
    190                           (KitSlice){.s = l->src + start, .len = end - start});
    191 
    192   buf = (char*)l->heap->alloc(l->heap, end - start, 1);
    193   k = 0;
    194   for (i = start; i < end;) {
    195     if (i + 1 < end && l->src[i] == '\\' && l->src[i + 1] == '\n') {
    196       i += 2;
    197       continue;
    198     }
    199     buf[k++] = l->src[i++];
    200   }
    201   sym = kit_sym_intern(l->pool->c, (KitSlice){.s = buf, .len = k});
    202   l->heap->free(l->heap, buf, end - start);
    203   return sym;
    204 }
    205 
    206 /* §6.4.7 header-name lookahead: in include-directive context, a `<` or `"`
    207  * starts a header-name that runs to the matching `>` or `"`. The lexer
    208  * recognizes only header-name forms (whose contents are implementation
    209  * defined), not q-char-sequence escape rules. */
    210 static int matches_include_kw(const char* s, size_t n) {
    211   if (n == 7 && memcmp(s, "include", 7) == 0) return 1;
    212   if (n == 12 && memcmp(s, "include_next", 12) == 0) return 1;
    213   if (n == 6 && memcmp(s, "import", 6) == 0) return 1;
    214   if (n == 5 && memcmp(s, "embed", 5) == 0) return 1;
    215   return 0;
    216 }
    217 
    218 /* Skip whitespace and comments. Returns 1 if a newline boundary was crossed
    219  * via comment consumption (caller still emits the explicit newline token on
    220  * an in-source '\n'). */
    221 static void skip_ws_and_comments(Lexer* l) {
    222   for (;;) {
    223     int ch = peek(l, 0);
    224     if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f') {
    225       bump(l);
    226       l->had_space = 1;
    227       continue;
    228     }
    229     if (ch == '/' && peek(l, 1) == '/') {
    230       bump(l);
    231       bump(l);
    232       while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l);
    233       l->had_space = 1;
    234       continue;
    235     }
    236     if (ch == '/' && peek(l, 1) == '*') {
    237       bump(l);
    238       bump(l);
    239       while (peek(l, 0) >= 0) {
    240         if (peek(l, 0) == '*' && peek(l, 1) == '/') {
    241           bump(l);
    242           bump(l);
    243           break;
    244         }
    245         bump(l);
    246       }
    247       l->had_space = 1;
    248       continue;
    249     }
    250     break;
    251   }
    252 }
    253 
    254 /* Consume a pp-number per §6.4.8. The cursor is positioned at the leading
    255  * digit (or `.` followed by a digit) on entry. */
    256 static void scan_pp_number(Lexer* l) {
    257   if (peek(l, 0) == '.') bump(l);
    258   bump(l); /* first digit */
    259   while (l->pos < l->len) {
    260     int c = peek(l, 0);
    261     int n = peek(l, 1);
    262     if ((c == 'e' || c == 'E' || c == 'p' || c == 'P') &&
    263         (n == '+' || n == '-')) {
    264       bump(l);
    265       bump(l);
    266     } else if (is_alnum(c) || c == '.') {
    267       bump(l);
    268     } else {
    269       break;
    270     }
    271   }
    272 }
    273 
    274 /* 1 if the pp-number text is a floating constant (§6.4.4.2): contains a
    275  * radix `.`, a hex `p`/`P` exponent, or a decimal `e`/`E` exponent. */
    276 static int pp_number_is_float(const char* s, size_t n) {
    277   int is_hex = 0;
    278   size_t i = 0;
    279   if (n >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
    280     is_hex = 1;
    281     i = 2;
    282   }
    283   for (; i < n; ++i) {
    284     char c = s[i];
    285     if (c == '.') return 1;
    286     if (is_hex && (c == 'p' || c == 'P')) return 1;
    287     if (!is_hex && (c == 'e' || c == 'E')) {
    288       if (i + 1 < n) {
    289         char nx = s[i + 1];
    290         if (nx == '+' || nx == '-' || (nx >= '0' && nx <= '9')) return 1;
    291       }
    292     }
    293   }
    294   return 0;
    295 }
    296 
    297 /* Consume a quoted body — string ('"') or character ('\''). The cursor is
    298  * positioned at the opening quote on entry. Returns 1 on an unterminated or
    299  * newline-broken literal, 0 on a clean close. */
    300 static int scan_quoted(Lexer* l, int quote) {
    301   bump(l); /* opening quote */
    302   for (;;) {
    303     int ch = peek(l, 0);
    304     if (ch < 0) return 1;
    305     if (ch == quote) {
    306       bump(l);
    307       return 0;
    308     }
    309     if (ch == '\n') return 1;
    310     if (ch == '\\') {
    311       bump(l); /* backslash */
    312       if (peek(l, 0) < 0) return 1;
    313       bump(l); /* the escaped char */
    314       continue;
    315     }
    316     bump(l);
    317   }
    318 }
    319 
    320 Tok lex_next(Lexer* l) {
    321   Tok t;
    322   SrcLoc tloc;
    323   size_t start;
    324   int ch;
    325 
    326   memset(&t, 0, sizeof(t));
    327 
    328   /* Skip whitespace and comments. A newline token is emitted before any
    329    * subsequent content tokens for the line that follows. */
    330   for (;;) {
    331     skip_ws_and_comments(l);
    332     skip_splices(l);
    333     if (l->pos >= l->len) {
    334       t.kind = TOK_EOF;
    335       t.loc = lex_here(l);
    336       return t;
    337     }
    338     if (peek(l, 0) == '\n') {
    339       tloc = lex_here(l);
    340       bump(l);
    341       t.kind = TOK_NEWLINE;
    342       t.loc = tloc;
    343       l->at_bol = 1;
    344       l->had_space = 0;
    345       l->dstate = 0;
    346       return t;
    347     }
    348     break;
    349   }
    350 
    351   tloc = lex_here(l);
    352   start = l->pos;
    353   ch = peek(l, 0);
    354 
    355   if (l->at_bol) t.flags |= TF_AT_BOL;
    356   if (l->had_space) t.flags |= TF_HAS_SPACE;
    357   l->at_bol = 0;
    358   l->had_space = 0;
    359   t.loc = tloc;
    360 
    361   /* §6.4.7 header-name: only valid in #include / #embed argument context. */
    362   if (l->dstate == 2 && (ch == '<' || ch == '"')) {
    363     int closer = (ch == '<') ? '>' : '"';
    364     bump(l);
    365     for (;;) {
    366       int c = peek(l, 0);
    367       if (c < 0 || c == '\n') {
    368         t.flags |= TF_LITERAL_BAD;
    369         break;
    370       }
    371       if (c == closer) {
    372         bump(l);
    373         break;
    374       }
    375       bump(l);
    376     }
    377     t.kind = TOK_HEADER;
    378     t.spelling = intern_spliced(l, start, l->pos);
    379     t.v.str = t.spelling;
    380     l->dstate = 0;
    381     return t;
    382   }
    383 
    384   /* String / character literal, with optional encoding prefix. The prefix
    385    * length and encoding flag are decoded together so the spelling we
    386    * intern includes the prefix bytes. */
    387   {
    388     int sp_len = -1;
    389     int is_char = 0;
    390     u32 encf = 0;
    391 
    392     if (ch == '"') {
    393       sp_len = 0;
    394       is_char = 0;
    395     } else if (ch == '\'') {
    396       sp_len = 0;
    397       is_char = 1;
    398     } else if (ch == 'L' && peek(l, 1) == '"') {
    399       sp_len = 1;
    400       is_char = 0;
    401       encf = TF_STR_WIDE;
    402     } else if (ch == 'L' && peek(l, 1) == '\'') {
    403       sp_len = 1;
    404       is_char = 1;
    405       encf = TF_STR_WIDE;
    406     } else if (ch == 'u' && peek(l, 1) == '8' && peek(l, 2) == '"') {
    407       sp_len = 2;
    408       is_char = 0;
    409       encf = TF_STR_U8;
    410     } else if (ch == 'u' && peek(l, 1) == '"') {
    411       sp_len = 1;
    412       is_char = 0;
    413       encf = TF_STR_U16;
    414     } else if (ch == 'u' && peek(l, 1) == '\'') {
    415       sp_len = 1;
    416       is_char = 1;
    417       encf = TF_STR_U16;
    418     } else if (ch == 'U' && peek(l, 1) == '"') {
    419       sp_len = 1;
    420       is_char = 0;
    421       encf = TF_STR_U32;
    422     } else if (ch == 'U' && peek(l, 1) == '\'') {
    423       sp_len = 1;
    424       is_char = 1;
    425       encf = TF_STR_U32;
    426     }
    427 
    428     if (sp_len >= 0) {
    429       int i;
    430       for (i = 0; i < sp_len; ++i) bump(l);
    431       if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD;
    432       t.kind = (u16)(is_char ? TOK_CHR : TOK_STR);
    433       t.flags |= encf;
    434       t.spelling = intern_spliced(l, start, l->pos);
    435       t.v.str = t.spelling;
    436       l->dstate = 0;
    437       return t;
    438     }
    439   }
    440 
    441   /* Identifier (§6.4.2). Encoding-prefix candidates above are matched
    442    * before this since L/u/U followed by a quote is a literal, not an
    443    * identifier. The grammar's identifier-nondigit covers letters, _,
    444    * extended source chars (impl-defined; bytes ≥ 0x80 here), and UCNs
    445    * (§6.4.3) — the latter span multiple source bytes so they're matched
    446    * via ucn_len rather than the per-byte is_alpha predicate. */
    447   {
    448     int u = ucn_len(l, 0);
    449     if (is_alpha(ch) || u) {
    450       if (u) {
    451         int i;
    452         for (i = 0; i < u; ++i) bump(l);
    453       } else
    454         bump(l);
    455       for (;;) {
    456         int c = peek(l, 0);
    457         if (is_alnum(c)) {
    458           bump(l);
    459         } else if ((u = ucn_len(l, 0))) {
    460           int i;
    461           for (i = 0; i < u; ++i) bump(l);
    462         } else {
    463           break;
    464         }
    465       }
    466       t.kind = TOK_IDENT;
    467       t.spelling = intern_spliced(l, start, l->pos);
    468       t.v.ident = t.spelling;
    469       if (l->dstate == 1) {
    470         KitSlice s = kit_sym_str(l->pool->c, t.spelling);
    471         l->dstate = (s.s && matches_include_kw(s.s, s.len)) ? 2 : 0;
    472       } else {
    473         l->dstate = 0;
    474       }
    475       return t;
    476     }
    477   }
    478 
    479   /* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */
    480   if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) {
    481     size_t plen;
    482     char* pbuf;
    483     size_t i, k;
    484     scan_pp_number(l);
    485     /* Classify on the post-splice text (the spelling we'll intern). */
    486     plen = l->pos - start;
    487     pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1);
    488     k = 0;
    489     for (i = start; i < l->pos;) {
    490       if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') {
    491         i += 2;
    492         continue;
    493       }
    494       pbuf[k++] = l->src[i++];
    495     }
    496     t.kind = (u16)(pp_number_is_float(pbuf, k) ? TOK_FLT : TOK_NUM);
    497     /* Suffix flags for §6.4.4.1 / §6.4.4.2. The parser dispatches on
    498      * TF_INT_U/L/LL and TF_FLT_F/L to pick a TY_* tag for the literal,
    499      * so missing flags would silently coerce `42U`/`42.0f` to plain
    500      * int/double. */
    501     if (t.kind == TOK_FLT) {
    502       size_t j = k;
    503       while (j > 0) {
    504         char c = pbuf[j - 1];
    505         if (c == 'f' || c == 'F') {
    506           t.flags |= TF_FLT_F;
    507           --j;
    508           continue;
    509         }
    510         if (c == 'l' || c == 'L') {
    511           t.flags |= TF_FLT_L;
    512           --j;
    513           continue;
    514         }
    515         break;
    516       }
    517     } else {
    518       size_t j = k;
    519       while (j > 0) {
    520         char c = pbuf[j - 1];
    521         if (c == 'u' || c == 'U') {
    522           t.flags |= TF_INT_U;
    523           --j;
    524           continue;
    525         }
    526         if (c == 'l' || c == 'L') {
    527           if (j >= 2 && (pbuf[j - 2] == 'l' || pbuf[j - 2] == 'L')) {
    528             t.flags |= TF_INT_LL;
    529             j -= 2;
    530           } else {
    531             t.flags |= TF_INT_L;
    532             --j;
    533           }
    534           continue;
    535         }
    536         break;
    537       }
    538     }
    539     t.spelling = kit_sym_intern(l->pool->c, (KitSlice){.s = pbuf, .len = k});
    540     l->heap->free(l->heap, pbuf, plen ? plen : 1);
    541     l->dstate = 0;
    542     return t;
    543   }
    544 
    545   /* Punctuator (§6.4.6) — longest match. `#` and `##` (and their digraph
    546    * forms `%:` and `%:%:`) become TOK_PP_HASH / TOK_PP_PASTE so PP can
    547    * recognize directives and the paste operator. */
    548   {
    549     int n0 = peek(l, 0);
    550     int n1 = peek(l, 1);
    551     int n2 = peek(l, 2);
    552     int n3 = peek(l, 3);
    553     int adv = 1;
    554     u32 punct = P_NONE;
    555     u16 kind = TOK_PUNCT;
    556     int i;
    557 
    558     switch (n0) {
    559       case '#':
    560         if (n1 == '#') {
    561           adv = 2;
    562           kind = TOK_PP_PASTE;
    563           punct = P_HASH_HASH;
    564         } else {
    565           adv = 1;
    566           kind = TOK_PP_HASH;
    567           punct = '#';
    568         }
    569         break;
    570       case '.':
    571         if (n1 == '.' && n2 == '.') {
    572           adv = 3;
    573           punct = P_ELLIPSIS;
    574         } else {
    575           adv = 1;
    576           punct = '.';
    577         }
    578         break;
    579       case '-':
    580         if (n1 == '>') {
    581           adv = 2;
    582           punct = P_ARROW;
    583         } else if (n1 == '-') {
    584           adv = 2;
    585           punct = P_DEC;
    586         } else if (n1 == '=') {
    587           adv = 2;
    588           punct = P_SUB_ASSIGN;
    589         } else {
    590           adv = 1;
    591           punct = '-';
    592         }
    593         break;
    594       case '+':
    595         if (n1 == '+') {
    596           adv = 2;
    597           punct = P_INC;
    598         } else if (n1 == '=') {
    599           adv = 2;
    600           punct = P_ADD_ASSIGN;
    601         } else {
    602           adv = 1;
    603           punct = '+';
    604         }
    605         break;
    606       case '<':
    607         if (n1 == '<' && n2 == '=') {
    608           adv = 3;
    609           punct = P_SHL_ASSIGN;
    610         } else if (n1 == '<') {
    611           adv = 2;
    612           punct = P_SHL;
    613         } else if (n1 == '=') {
    614           adv = 2;
    615           punct = P_LE;
    616         } else if (n1 == ':') {
    617           adv = 2;
    618           punct = '[';
    619         } /* digraph */
    620         else if (n1 == '%') {
    621           adv = 2;
    622           punct = '{';
    623         } /* digraph */
    624         else {
    625           adv = 1;
    626           punct = '<';
    627         }
    628         break;
    629       case '>':
    630         if (n1 == '>' && n2 == '=') {
    631           adv = 3;
    632           punct = P_SHR_ASSIGN;
    633         } else if (n1 == '>') {
    634           adv = 2;
    635           punct = P_SHR;
    636         } else if (n1 == '=') {
    637           adv = 2;
    638           punct = P_GE;
    639         } else {
    640           adv = 1;
    641           punct = '>';
    642         }
    643         break;
    644       case '=':
    645         if (n1 == '=') {
    646           adv = 2;
    647           punct = P_EQ;
    648         } else {
    649           adv = 1;
    650           punct = '=';
    651         }
    652         break;
    653       case '!':
    654         if (n1 == '=') {
    655           adv = 2;
    656           punct = P_NE;
    657         } else {
    658           adv = 1;
    659           punct = '!';
    660         }
    661         break;
    662       case '&':
    663         if (n1 == '&') {
    664           adv = 2;
    665           punct = P_AND;
    666         } else if (n1 == '=') {
    667           adv = 2;
    668           punct = P_AND_ASSIGN;
    669         } else {
    670           adv = 1;
    671           punct = '&';
    672         }
    673         break;
    674       case '|':
    675         if (n1 == '|') {
    676           adv = 2;
    677           punct = P_OR;
    678         } else if (n1 == '=') {
    679           adv = 2;
    680           punct = P_OR_ASSIGN;
    681         } else {
    682           adv = 1;
    683           punct = '|';
    684         }
    685         break;
    686       case '^':
    687         if (n1 == '=') {
    688           adv = 2;
    689           punct = P_XOR_ASSIGN;
    690         } else {
    691           adv = 1;
    692           punct = '^';
    693         }
    694         break;
    695       case '*':
    696         if (n1 == '=') {
    697           adv = 2;
    698           punct = P_MUL_ASSIGN;
    699         } else {
    700           adv = 1;
    701           punct = '*';
    702         }
    703         break;
    704       case '/':
    705         if (n1 == '=') {
    706           adv = 2;
    707           punct = P_DIV_ASSIGN;
    708         } else {
    709           adv = 1;
    710           punct = '/';
    711         }
    712         break;
    713       case '%':
    714         if (n1 == ':' && n2 == '%' && n3 == ':') {
    715           adv = 4;
    716           kind = TOK_PP_PASTE;
    717           punct = P_HASH_HASH;
    718         } else if (n1 == ':') {
    719           adv = 2;
    720           kind = TOK_PP_HASH;
    721           punct = '#';
    722         } else if (n1 == '=') {
    723           adv = 2;
    724           punct = P_MOD_ASSIGN;
    725         } else if (n1 == '>') {
    726           adv = 2;
    727           punct = '}';
    728         } /* digraph */
    729         else {
    730           adv = 1;
    731           punct = '%';
    732         }
    733         break;
    734       case ':':
    735         if (n1 == '>') {
    736           adv = 2;
    737           punct = ']';
    738         } /* digraph */
    739         else {
    740           adv = 1;
    741           punct = ':';
    742         }
    743         break;
    744       case '(':
    745       case ')':
    746       case '{':
    747       case '}':
    748       case '[':
    749       case ']':
    750       case ',':
    751       case ';':
    752       case '?':
    753       case '~':
    754         adv = 1;
    755         punct = (u32)n0;
    756         break;
    757       default:
    758         /* Unknown byte. Surface as a single-char punct so the token
    759          * stream still progresses; PP/parse may diagnose. */
    760         adv = 1;
    761         punct = (u32)n0;
    762         break;
    763     }
    764 
    765     for (i = 0; i < adv; ++i) bump(l);
    766     t.kind = kind;
    767     t.v.punct = punct;
    768     t.spelling = intern_spliced(l, start, l->pos);
    769     if (kind == TOK_PP_HASH)
    770       l->dstate = 1;
    771     else
    772       l->dstate = 0;
    773     return t;
    774   }
    775 }