kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

asm_lex.c (19900B)


      1 /* Assembler lexer. Streams tokens out of a borrowed source buffer.
      2  *
      3  * It intentionally keeps C-like number/string spelling rules because .S
      4  * sources arrive after C preprocessing and GNU as accepts those spellings
      5  * in directives and expressions. It does not own macro expansion or C
      6  * keyword classification.
      7  *
      8  * Comments are consumed as whitespace; physical newlines surface as
      9  * ASM_TOK_NEWLINE so the asm driver can keep line-oriented directive and
     10  * instruction parsing. */
     11 
     12 #include "asm/asm_lex.h"
     13 
     14 #include <string.h>
     15 
     16 #include "core/heap.h"
     17 #include "core/pool.h"
     18 #include "core/slice.h"
     19 
     20 struct AsmLexer {
     21   Compiler* c;
     22   Pool* pool;
     23   Heap* heap;
     24   const char* src;
     25   size_t len;
     26   size_t pos;
     27   u32 file_id;
     28   u32 line;
     29   u32 col;
     30   u8 at_bol;
     31   u8 had_space;
     32 };
     33 
     34 /* §5.1.1.2 translation phase 2: splice physical lines joined by
     35  * backslash-newline. Advance past any splice sequence at l->pos so the
     36  * cursor never rests on the leading backslash of a splice. */
     37 static void skip_splices(AsmLexer* l) {
     38   while (l->pos + 1 < l->len && l->src[l->pos] == '\\' &&
     39          l->src[l->pos + 1] == '\n') {
     40     l->pos += 2;
     41     l->line++;
     42     l->col = 1;
     43   }
     44 }
     45 
     46 /* Logical peek: returns the off-th post-splice byte starting at l->pos,
     47  * or -1 at end of input. Does not mutate l->pos. */
     48 static int peek(const AsmLexer* l, size_t off) {
     49   size_t pos = l->pos;
     50   size_t k = 0;
     51   while (pos < l->len) {
     52     if (pos + 1 < l->len && l->src[pos] == '\\' && l->src[pos + 1] == '\n') {
     53       pos += 2;
     54       continue;
     55     }
     56     if (k == off) return (unsigned char)l->src[pos];
     57     ++pos;
     58     ++k;
     59   }
     60   return -1;
     61 }
     62 
     63 static int bump(AsmLexer* l) {
     64   int ch;
     65   skip_splices(l);
     66   if (l->pos >= l->len) return -1;
     67   ch = (unsigned char)l->src[l->pos++];
     68   if (ch == '\n') {
     69     l->line++;
     70     l->col = 1;
     71   } else {
     72     l->col++;
     73   }
     74   return ch;
     75 }
     76 
     77 static int is_digit(int c) { return c >= '0' && c <= '9'; }
     78 static int is_hex_digit(int c) {
     79   return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
     80          (c >= 'A' && c <= 'F');
     81 }
     82 /* Identifier-start byte (§6.4.2.1). Letters and underscore are ASCII; bytes
     83  * ≥ 0x80 are accepted as the implementation-defined "other characters"
     84  * permitted in identifiers — in practice UTF-8 lead/continuation bytes for
     85  * extended source characters. UCNs are matched separately via ucn_len since
     86  * they span multiple source bytes. */
     87 static int is_alpha(int c) {
     88   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' ||
     89          c >= 0x80;
     90 }
     91 static int is_alnum(int c) { return is_alpha(c) || is_digit(c); }
     92 
     93 /* Match a UCN at offset `off` from the current position. Returns the total
     94  * length (6 for \uXXXX, 10 for \UXXXXXXXX), or 0 if no UCN matches. The
     95  * range constraints from §6.4.3 (no UCN < 00A0 except $/@/`, and none in
     96  * D800–DFFF) are not enforced here — the lexical form is matched and any
     97  * downstream phase that cares can diagnose. */
     98 static int ucn_len(const AsmLexer* l, size_t off) {
     99   int n, i;
    100   if (peek(l, off) != '\\') return 0;
    101   if (peek(l, off + 1) == 'u')
    102     n = 4;
    103   else if (peek(l, off + 1) == 'U')
    104     n = 8;
    105   else
    106     return 0;
    107   for (i = 0; i < n; ++i) {
    108     if (!is_hex_digit(peek(l, off + 2 + i))) return 0;
    109   }
    110   return 2 + n;
    111 }
    112 
    113 static SrcLoc asm_lex_here(const AsmLexer* l) {
    114   SrcLoc loc;
    115   loc.file_id = l->file_id;
    116   loc.line = l->line;
    117   loc.col = l->col;
    118   return loc;
    119 }
    120 
    121 AsmLexer* asm_lex_open_mem(Compiler* c, const char* name, const char* src,
    122                            size_t len) {
    123   Heap* h = (Heap*)c->ctx->heap;
    124   AsmLexer* l = (AsmLexer*)h->alloc(h, sizeof(*l), _Alignof(AsmLexer));
    125   if (!l) return NULL;
    126   memset(l, 0, sizeof(*l));
    127   l->c = c;
    128   l->pool = c->global;
    129   l->heap = h;
    130   l->src = src ? src : "";
    131   l->len = src ? len : 0;
    132   l->pos = 0;
    133   if (source_add_memory(c->sources, slice_from_cstr(name), &l->file_id) !=
    134       KIT_OK) {
    135     h->free(h, l, sizeof(*l));
    136     return NULL;
    137   }
    138   l->line = 1;
    139   l->col = 1;
    140   l->at_bol = 1;
    141   l->had_space = 0;
    142   return l;
    143 }
    144 
    145 void asm_lex_close(AsmLexer* l) {
    146   if (!l) return;
    147   l->heap->free(l->heap, l, sizeof(*l));
    148 }
    149 
    150 SrcLoc asm_lex_loc(const AsmLexer* l) { return asm_lex_here(l); }
    151 u32 asm_lex_file_id(const AsmLexer* l) { return l->file_id; }
    152 const AsmLitInfo* asm_lex_lit(const AsmLexer* l, AsmLitId id) {
    153   (void)l;
    154   (void)id;
    155   return NULL;
    156 }
    157 
    158 /* Intern bytes [start, end) with line splices (\<newline>) removed, so token
    159  * spellings reflect post-phase-2 logical text. */
    160 static Sym intern_spliced(AsmLexer* l, size_t start, size_t end) {
    161   size_t i;
    162   int has_splice = 0;
    163   char* buf;
    164   size_t k;
    165   Sym sym;
    166 
    167   for (i = start; i + 1 < end; ++i) {
    168     if (l->src[i] == '\\' && l->src[i + 1] == '\n') {
    169       has_splice = 1;
    170       break;
    171     }
    172   }
    173   if (!has_splice)
    174     return pool_intern_slice(l->pool,
    175                              (Slice){.s = l->src + start, .len = end - start});
    176 
    177   buf = (char*)l->heap->alloc(l->heap, end - start, 1);
    178   k = 0;
    179   for (i = start; i < end;) {
    180     if (i + 1 < end && l->src[i] == '\\' && l->src[i + 1] == '\n') {
    181       i += 2;
    182       continue;
    183     }
    184     buf[k++] = l->src[i++];
    185   }
    186   sym = pool_intern_slice(l->pool, (Slice){.s = buf, .len = k});
    187   l->heap->free(l->heap, buf, end - start);
    188   return sym;
    189 }
    190 
    191 /* Skip whitespace and comments. Returns 1 if a newline boundary was crossed
    192  * via comment consumption (caller still emits the explicit newline token on
    193  * an in-source '\n'). */
    194 static void skip_ws_and_comments(AsmLexer* l) {
    195   for (;;) {
    196     int ch = peek(l, 0);
    197     if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f') {
    198       bump(l);
    199       l->had_space = 1;
    200       continue;
    201     }
    202     if (ch == '/' && peek(l, 1) == '/') {
    203       bump(l);
    204       bump(l);
    205       while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l);
    206       l->had_space = 1;
    207       continue;
    208     }
    209     if (ch == '/' && peek(l, 1) == '*') {
    210       bump(l);
    211       bump(l);
    212       while (peek(l, 0) >= 0) {
    213         if (peek(l, 0) == '*' && peek(l, 1) == '/') {
    214           bump(l);
    215           bump(l);
    216           break;
    217         }
    218         bump(l);
    219       }
    220       l->had_space = 1;
    221       continue;
    222     }
    223     break;
    224   }
    225 }
    226 
    227 /* Consume a pp-number per §6.4.8. The cursor is positioned at the leading
    228  * digit (or `.` followed by a digit) on entry. */
    229 static void scan_pp_number(AsmLexer* l) {
    230   if (peek(l, 0) == '.') bump(l);
    231   bump(l); /* first digit */
    232   while (l->pos < l->len) {
    233     int c = peek(l, 0);
    234     int n = peek(l, 1);
    235     if ((c == 'e' || c == 'E' || c == 'p' || c == 'P') &&
    236         (n == '+' || n == '-')) {
    237       bump(l);
    238       bump(l);
    239     } else if (is_alnum(c) || c == '.') {
    240       bump(l);
    241     } else {
    242       break;
    243     }
    244   }
    245 }
    246 
    247 /* 1 if the pp-number text is a floating constant (§6.4.4.2): contains a
    248  * radix `.`, a hex `p`/`P` exponent, or a decimal `e`/`E` exponent. */
    249 static int pp_number_is_float(const char* s, size_t n) {
    250   int is_hex = 0;
    251   size_t i = 0;
    252   if (n >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
    253     is_hex = 1;
    254     i = 2;
    255   }
    256   for (; i < n; ++i) {
    257     char c = s[i];
    258     if (c == '.') return 1;
    259     if (is_hex && (c == 'p' || c == 'P')) return 1;
    260     if (!is_hex && (c == 'e' || c == 'E')) {
    261       if (i + 1 < n) {
    262         char nx = s[i + 1];
    263         if (nx == '+' || nx == '-' || (nx >= '0' && nx <= '9')) return 1;
    264       }
    265     }
    266   }
    267   return 0;
    268 }
    269 
    270 /* Consume a quoted body — string ('"') or character ('\''). The cursor is
    271  * positioned at the opening quote on entry. Returns 1 on an unterminated or
    272  * newline-broken literal, 0 on a clean close. */
    273 static int scan_quoted(AsmLexer* l, int quote) {
    274   bump(l); /* opening quote */
    275   for (;;) {
    276     int ch = peek(l, 0);
    277     if (ch < 0) return 1;
    278     if (ch == quote) {
    279       bump(l);
    280       return 0;
    281     }
    282     if (ch == '\n') return 1;
    283     if (ch == '\\') {
    284       bump(l); /* backslash */
    285       if (peek(l, 0) < 0) return 1;
    286       bump(l); /* the escaped char */
    287       continue;
    288     }
    289     bump(l);
    290   }
    291 }
    292 
    293 AsmTok asm_lex_next(AsmLexer* l) {
    294   AsmTok t;
    295   SrcLoc tloc;
    296   size_t start;
    297   int ch;
    298 
    299   memset(&t, 0, sizeof(t));
    300 
    301   /* Skip whitespace and comments. A newline token is emitted before any
    302    * subsequent content tokens for the line that follows. */
    303   for (;;) {
    304     skip_ws_and_comments(l);
    305     skip_splices(l);
    306     if (l->pos >= l->len) {
    307       t.kind = ASM_TOK_EOF;
    308       t.loc = asm_lex_here(l);
    309       return t;
    310     }
    311     if (peek(l, 0) == '\n') {
    312       tloc = asm_lex_here(l);
    313       bump(l);
    314       t.kind = ASM_TOK_NEWLINE;
    315       t.loc = tloc;
    316       l->at_bol = 1;
    317       l->had_space = 0;
    318       return t;
    319     }
    320     break;
    321   }
    322 
    323   tloc = asm_lex_here(l);
    324   start = l->pos;
    325   ch = peek(l, 0);
    326 
    327   if (l->at_bol) t.flags |= ASM_TF_AT_BOL;
    328   if (l->had_space) t.flags |= ASM_TF_HAS_SPACE;
    329   l->at_bol = 0;
    330   l->had_space = 0;
    331   t.loc = tloc;
    332 
    333   /* String / character literal, with optional encoding prefix. The prefix
    334    * length and encoding flag are decoded together so the spelling we
    335    * intern includes the prefix bytes. */
    336   {
    337     int sp_len = -1;
    338     int is_char = 0;
    339     u32 encf = 0;
    340 
    341     if (ch == '"') {
    342       sp_len = 0;
    343       is_char = 0;
    344     } else if (ch == '\'') {
    345       sp_len = 0;
    346       is_char = 1;
    347     } else if (ch == 'L' && peek(l, 1) == '"') {
    348       sp_len = 1;
    349       is_char = 0;
    350       encf = ASM_TF_STR_WIDE;
    351     } else if (ch == 'L' && peek(l, 1) == '\'') {
    352       sp_len = 1;
    353       is_char = 1;
    354       encf = ASM_TF_STR_WIDE;
    355     } else if (ch == 'u' && peek(l, 1) == '8' && peek(l, 2) == '"') {
    356       sp_len = 2;
    357       is_char = 0;
    358       encf = ASM_TF_STR_U8;
    359     } else if (ch == 'u' && peek(l, 1) == '"') {
    360       sp_len = 1;
    361       is_char = 0;
    362       encf = ASM_TF_STR_U16;
    363     } else if (ch == 'u' && peek(l, 1) == '\'') {
    364       sp_len = 1;
    365       is_char = 1;
    366       encf = ASM_TF_STR_U16;
    367     } else if (ch == 'U' && peek(l, 1) == '"') {
    368       sp_len = 1;
    369       is_char = 0;
    370       encf = ASM_TF_STR_U32;
    371     } else if (ch == 'U' && peek(l, 1) == '\'') {
    372       sp_len = 1;
    373       is_char = 1;
    374       encf = ASM_TF_STR_U32;
    375     }
    376 
    377     if (sp_len >= 0) {
    378       int i;
    379       for (i = 0; i < sp_len; ++i) bump(l);
    380       if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= ASM_TF_LITERAL_BAD;
    381       t.kind = (u16)(is_char ? ASM_TOK_CHR : ASM_TOK_STR);
    382       t.flags |= encf;
    383       t.spelling = intern_spliced(l, start, l->pos);
    384       t.v.str = t.spelling;
    385       return t;
    386     }
    387   }
    388 
    389   /* Local-label identifier: a `.L`-prefixed symbol name (the universal GNU
    390    * convention for assembler-local labels, e.g. `.Lkit_ro.0`, `.L.str`,
    391    * `.LBB0_1`). Lexed as a single ASM_TOK_IDENT — including the leading dot
    392    * and any embedded dots — so it flows through the same operand / label /
    393    * `.type` paths as an ordinary identifier. This is unambiguous against
    394    * directives: no assembler directive begins with `.L`, so `.text`,
    395    * `.section`, `.quad` etc. still tokenize as PUNCT('.') + IDENT and reach
    396    * the directive dispatcher. Embedded `.` is consumed only when followed by
    397    * another symbol char, so `.Lfoo, x` and `.Lfoo+4` stop at the delimiter. */
    398   if (ch == '.' && peek(l, 1) == 'L') {
    399     bump(l); /* '.' */
    400     bump(l); /* 'L' */
    401     for (;;) {
    402       int c = peek(l, 0);
    403       if (is_alnum(c) || c == '$') {
    404         bump(l);
    405       } else if (c == '.' && (is_alnum(peek(l, 1)) || peek(l, 1) == '$' ||
    406                               peek(l, 1) == '_')) {
    407         bump(l);
    408       } else {
    409         break;
    410       }
    411     }
    412     t.kind = ASM_TOK_IDENT;
    413     t.spelling = intern_spliced(l, start, l->pos);
    414     t.v.ident = t.spelling;
    415     return t;
    416   }
    417 
    418   /* Identifier (§6.4.2). Encoding-prefix candidates above are matched
    419    * before this since L/u/U followed by a quote is a literal, not an
    420    * identifier. The grammar's identifier-nondigit covers letters, _,
    421    * extended source chars (impl-defined; bytes ≥ 0x80 here), and UCNs
    422    * (§6.4.3) — the latter span multiple source bytes so they're matched
    423    * via ucn_len rather than the per-byte is_alpha predicate. */
    424   {
    425     int u = ucn_len(l, 0);
    426     if (is_alpha(ch) || u) {
    427       if (u) {
    428         int i;
    429         for (i = 0; i < u; ++i) bump(l);
    430       } else
    431         bump(l);
    432       for (;;) {
    433         int c = peek(l, 0);
    434         if (is_alnum(c)) {
    435           bump(l);
    436         } else if (c == '.' && is_digit(peek(l, 1))) {
    437           /* Discriminator-mangled symbol: `name.N` (static locals, lambda /
    438            * block-scope renaming, e.g. `acc.1`). A `.` followed by a digit
    439            * continues the identifier. Restricted to `.`+digit so it never
    440            * swallows a `.`-led mnemonic suffix (`b.eq`, `fcvt.w.s`) or the
    441            * `.size foo, .-foo` location-counter dot. */
    442           bump(l);
    443         } else if ((u = ucn_len(l, 0))) {
    444           int i;
    445           for (i = 0; i < u; ++i) bump(l);
    446         } else {
    447           break;
    448         }
    449       }
    450       t.kind = ASM_TOK_IDENT;
    451       t.spelling = intern_spliced(l, start, l->pos);
    452       t.v.ident = t.spelling;
    453       return t;
    454     }
    455   }
    456 
    457   /* Preprocessor-number shaped token, classified to ASM_TOK_NUM /
    458    * ASM_TOK_FLT for expression diagnostics and future directive support. */
    459   if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) {
    460     size_t plen;
    461     char* pbuf;
    462     size_t i, k;
    463     scan_pp_number(l);
    464     /* Classify on the post-splice text (the spelling we'll intern). */
    465     plen = l->pos - start;
    466     pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1);
    467     k = 0;
    468     for (i = start; i < l->pos;) {
    469       if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') {
    470         i += 2;
    471         continue;
    472       }
    473       pbuf[k++] = l->src[i++];
    474     }
    475     t.kind = (u16)(pp_number_is_float(pbuf, k) ? ASM_TOK_FLT : ASM_TOK_NUM);
    476     /* Preserve common C-style integer/float suffixes in token flags. The
    477      * current assembler expression evaluator ignores them, but keeping the
    478      * spelling metadata makes the lexer useful for future directive work. */
    479     if (t.kind == ASM_TOK_FLT) {
    480       size_t j = k;
    481       while (j > 0) {
    482         char c = pbuf[j - 1];
    483         if (c == 'f' || c == 'F') {
    484           t.flags |= ASM_TF_FLT_F;
    485           --j;
    486           continue;
    487         }
    488         if (c == 'l' || c == 'L') {
    489           t.flags |= ASM_TF_FLT_L;
    490           --j;
    491           continue;
    492         }
    493         break;
    494       }
    495     } else {
    496       size_t j = k;
    497       while (j > 0) {
    498         char c = pbuf[j - 1];
    499         if (c == 'u' || c == 'U') {
    500           t.flags |= ASM_TF_INT_U;
    501           --j;
    502           continue;
    503         }
    504         if (c == 'l' || c == 'L') {
    505           if (j >= 2 && (pbuf[j - 2] == 'l' || pbuf[j - 2] == 'L')) {
    506             t.flags |= ASM_TF_INT_LL;
    507             j -= 2;
    508           } else {
    509             t.flags |= ASM_TF_INT_L;
    510             --j;
    511           }
    512           continue;
    513         }
    514         break;
    515       }
    516     }
    517     t.spelling = pool_intern_slice(l->pool, (Slice){.s = pbuf, .len = k});
    518     l->heap->free(l->heap, pbuf, plen ? plen : 1);
    519     return t;
    520   }
    521 
    522   /* Punctuator, longest match. `#` is a distinct token because it is both
    523    * an asm immediate marker and, at BOL in preprocessed assembler, a line
    524    * marker introducer. */
    525   {
    526     int n0 = peek(l, 0);
    527     int n1 = peek(l, 1);
    528     int n2 = peek(l, 2);
    529     int n3 = peek(l, 3);
    530     int adv = 1;
    531     u32 punct = ASM_P_NONE;
    532     u16 kind = ASM_TOK_PUNCT;
    533     int i;
    534 
    535     switch (n0) {
    536       case '#':
    537         if (n1 == '#') {
    538           adv = 2;
    539           kind = ASM_TOK_HASH_HASH;
    540           punct = ASM_P_HASH_HASH;
    541         } else {
    542           adv = 1;
    543           kind = ASM_TOK_HASH;
    544           punct = '#';
    545         }
    546         break;
    547       case '.':
    548         if (n1 == '.' && n2 == '.') {
    549           adv = 3;
    550           punct = ASM_P_ELLIPSIS;
    551         } else {
    552           adv = 1;
    553           punct = '.';
    554         }
    555         break;
    556       case '-':
    557         if (n1 == '>') {
    558           adv = 2;
    559           punct = ASM_P_ARROW;
    560         } else if (n1 == '-') {
    561           adv = 2;
    562           punct = ASM_P_DEC;
    563         } else if (n1 == '=') {
    564           adv = 2;
    565           punct = ASM_P_SUB_ASSIGN;
    566         } else {
    567           adv = 1;
    568           punct = '-';
    569         }
    570         break;
    571       case '+':
    572         if (n1 == '+') {
    573           adv = 2;
    574           punct = ASM_P_INC;
    575         } else if (n1 == '=') {
    576           adv = 2;
    577           punct = ASM_P_ADD_ASSIGN;
    578         } else {
    579           adv = 1;
    580           punct = '+';
    581         }
    582         break;
    583       case '<':
    584         if (n1 == '<' && n2 == '=') {
    585           adv = 3;
    586           punct = ASM_P_SHL_ASSIGN;
    587         } else if (n1 == '<') {
    588           adv = 2;
    589           punct = ASM_P_SHL;
    590         } else if (n1 == '=') {
    591           adv = 2;
    592           punct = ASM_P_LE;
    593         } else if (n1 == ':') {
    594           adv = 2;
    595           punct = '[';
    596         } /* digraph */
    597         else if (n1 == '%') {
    598           adv = 2;
    599           punct = '{';
    600         } /* digraph */
    601         else {
    602           adv = 1;
    603           punct = '<';
    604         }
    605         break;
    606       case '>':
    607         if (n1 == '>' && n2 == '=') {
    608           adv = 3;
    609           punct = ASM_P_SHR_ASSIGN;
    610         } else if (n1 == '>') {
    611           adv = 2;
    612           punct = ASM_P_SHR;
    613         } else if (n1 == '=') {
    614           adv = 2;
    615           punct = ASM_P_GE;
    616         } else {
    617           adv = 1;
    618           punct = '>';
    619         }
    620         break;
    621       case '=':
    622         if (n1 == '=') {
    623           adv = 2;
    624           punct = ASM_P_EQ;
    625         } else {
    626           adv = 1;
    627           punct = '=';
    628         }
    629         break;
    630       case '!':
    631         if (n1 == '=') {
    632           adv = 2;
    633           punct = ASM_P_NE;
    634         } else {
    635           adv = 1;
    636           punct = '!';
    637         }
    638         break;
    639       case '&':
    640         if (n1 == '&') {
    641           adv = 2;
    642           punct = ASM_P_AND;
    643         } else if (n1 == '=') {
    644           adv = 2;
    645           punct = ASM_P_AND_ASSIGN;
    646         } else {
    647           adv = 1;
    648           punct = '&';
    649         }
    650         break;
    651       case '|':
    652         if (n1 == '|') {
    653           adv = 2;
    654           punct = ASM_P_OR;
    655         } else if (n1 == '=') {
    656           adv = 2;
    657           punct = ASM_P_OR_ASSIGN;
    658         } else {
    659           adv = 1;
    660           punct = '|';
    661         }
    662         break;
    663       case '^':
    664         if (n1 == '=') {
    665           adv = 2;
    666           punct = ASM_P_XOR_ASSIGN;
    667         } else {
    668           adv = 1;
    669           punct = '^';
    670         }
    671         break;
    672       case '*':
    673         if (n1 == '=') {
    674           adv = 2;
    675           punct = ASM_P_MUL_ASSIGN;
    676         } else {
    677           adv = 1;
    678           punct = '*';
    679         }
    680         break;
    681       case '/':
    682         if (n1 == '=') {
    683           adv = 2;
    684           punct = ASM_P_DIV_ASSIGN;
    685         } else {
    686           adv = 1;
    687           punct = '/';
    688         }
    689         break;
    690       case '%':
    691         if (n1 == ':' && n2 == '%' && n3 == ':') {
    692           adv = 4;
    693           kind = ASM_TOK_HASH_HASH;
    694           punct = ASM_P_HASH_HASH;
    695         } else if (n1 == ':') {
    696           adv = 2;
    697           kind = ASM_TOK_HASH;
    698           punct = '#';
    699         } else if (n1 == '=') {
    700           adv = 2;
    701           punct = ASM_P_MOD_ASSIGN;
    702         } else if (n1 == '>') {
    703           adv = 2;
    704           punct = '}';
    705         } /* digraph */
    706         else {
    707           adv = 1;
    708           punct = '%';
    709         }
    710         break;
    711       case ':':
    712         if (n1 == '>') {
    713           adv = 2;
    714           punct = ']';
    715         } /* digraph */
    716         else {
    717           adv = 1;
    718           punct = ':';
    719         }
    720         break;
    721       case '(':
    722       case ')':
    723       case '{':
    724       case '}':
    725       case '[':
    726       case ']':
    727       case ',':
    728       case ';':
    729       case '?':
    730       case '~':
    731         adv = 1;
    732         punct = (u32)n0;
    733         break;
    734       default:
    735         /* Unknown byte. Surface as a single-char punct so the token
    736          * stream still progresses; PP/parse may diagnose. */
    737         adv = 1;
    738         punct = (u32)n0;
    739         break;
    740     }
    741 
    742     for (i = 0; i < adv; ++i) bump(l);
    743     t.kind = kind;
    744     t.v.punct = punct;
    745     t.spelling = intern_spliced(l, start, l->pos);
    746     return t;
    747   }
    748 }