kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

lexer.c (10354B)


      1 #include "lexer.h"
      2 
      3 #include <stdlib.h>
      4 #include <string.h>
      5 
      6 void toy_lexer_init(ToyLexer* lex, const uint8_t* data, size_t len,
      7                     uint32_t file_id) {
      8   lex->cur = data;
      9   lex->end = data + len;
     10   lex->bol = data;
     11   lex->file_id = file_id;
     12   lex->line = 1;
     13 }
     14 
     15 static int toy_is_space(uint8_t c) {
     16   return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ||
     17          c == '\v';
     18 }
     19 
     20 static int toy_is_digit(uint8_t c) { return c >= '0' && c <= '9'; }
     21 
     22 static int toy_is_alpha(uint8_t c) {
     23   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
     24 }
     25 
     26 static int toy_is_alnum(uint8_t c) {
     27   return toy_is_alpha(c) || toy_is_digit(c);
     28 }
     29 
     30 static void toy_lexer_advance_line(ToyLexer* lex) {
     31   lex->bol = lex->cur + 1;
     32   lex->line++;
     33 }
     34 
     35 static void toy_skip_ws(ToyLexer* lex) {
     36   for (;;) {
     37     while (lex->cur < lex->end && toy_is_space(*lex->cur)) {
     38       if (*lex->cur == '\n') toy_lexer_advance_line(lex);
     39       lex->cur++;
     40     }
     41     if (lex->cur + 1 < lex->end && lex->cur[0] == '/' && lex->cur[1] == '/') {
     42       lex->cur += 2;
     43       while (lex->cur < lex->end && *lex->cur != '\n') lex->cur++;
     44       continue;
     45     }
     46     break;
     47   }
     48 }
     49 
     50 static ToyToken toy_lexer_emit(ToyLexer* lex, ToyTokenKind kind,
     51                                const uint8_t* start) {
     52   ToyToken tok;
     53   tok.kind = kind;
     54   tok.loc.file_id = lex->file_id;
     55   tok.loc.line = lex->line;
     56   tok.loc.col = (uint32_t)(start - lex->bol) + 1;
     57   tok.text = start;
     58   tok.text_len = (size_t)(lex->cur - start);
     59   tok.int_value = 0;
     60   tok.float_value = 0.0;
     61   tok.is_float = 0;
     62   return tok;
     63 }
     64 
     65 ToyToken toy_lexer_next(ToyLexer* lex) {
     66   const uint8_t* start;
     67   ToyToken tok;
     68 
     69   toy_skip_ws(lex);
     70   start = lex->cur;
     71   if (lex->cur >= lex->end) {
     72     tok.kind = TOK_EOF;
     73     tok.loc.file_id = 0;
     74     tok.loc.line = lex->line;
     75     tok.loc.col = (uint32_t)(start - lex->bol) + 1;
     76     tok.text = start;
     77     tok.text_len = 0;
     78     tok.int_value = 0;
     79     tok.float_value = 0.0;
     80     tok.is_float = 0;
     81     return tok;
     82   }
     83 
     84   uint8_t c = *lex->cur++;
     85 
     86   switch (c) {
     87     case '(':
     88       return toy_lexer_emit(lex, TOK_LPAREN, start);
     89     case ')':
     90       return toy_lexer_emit(lex, TOK_RPAREN, start);
     91     case '{':
     92       return toy_lexer_emit(lex, TOK_LBRACE, start);
     93     case '}':
     94       return toy_lexer_emit(lex, TOK_RBRACE, start);
     95     case '[':
     96       return toy_lexer_emit(lex, TOK_LBRACKET, start);
     97     case ']':
     98       return toy_lexer_emit(lex, TOK_RBRACKET, start);
     99     case ',':
    100       return toy_lexer_emit(lex, TOK_COMMA, start);
    101     case ';':
    102       return toy_lexer_emit(lex, TOK_SEMI, start);
    103     case ':':
    104       return toy_lexer_emit(lex, TOK_COLON, start);
    105     case '+':
    106       return toy_lexer_emit(lex, TOK_PLUS, start);
    107     case '*':
    108       return toy_lexer_emit(lex, TOK_STAR, start);
    109     case '/':
    110       return toy_lexer_emit(lex, TOK_SLASH, start);
    111     case '%':
    112       return toy_lexer_emit(lex, TOK_PERCENT, start);
    113     case '&':
    114       if (lex->cur < lex->end && *lex->cur == '&') {
    115         lex->cur++;
    116         return toy_lexer_emit(lex, TOK_ANDAND, start);
    117       }
    118       return toy_lexer_emit(lex, TOK_AMPERSAND, start);
    119     case '|':
    120       if (lex->cur < lex->end && *lex->cur == '|') {
    121         lex->cur++;
    122         return toy_lexer_emit(lex, TOK_PIPEPIPE, start);
    123       }
    124       return toy_lexer_emit(lex, TOK_PIPE, start);
    125     case '^':
    126       return toy_lexer_emit(lex, TOK_CARET, start);
    127     case '~':
    128       return toy_lexer_emit(lex, TOK_TILDE, start);
    129     case '=':
    130       if (lex->cur < lex->end && *lex->cur == '=') {
    131         lex->cur++;
    132         return toy_lexer_emit(lex, TOK_EQEQ, start);
    133       }
    134       return toy_lexer_emit(lex, TOK_EQ, start);
    135     case '!':
    136       if (lex->cur < lex->end && *lex->cur == '=') {
    137         lex->cur++;
    138         return toy_lexer_emit(lex, TOK_NE, start);
    139       }
    140       return toy_lexer_emit(lex, TOK_BANG, start);
    141     case '<':
    142       if (lex->cur < lex->end && *lex->cur == '<') {
    143         lex->cur++;
    144         return toy_lexer_emit(lex, TOK_SHL, start);
    145       }
    146       if (lex->cur < lex->end && *lex->cur == '=') {
    147         lex->cur++;
    148         return toy_lexer_emit(lex, TOK_LE, start);
    149       }
    150       return toy_lexer_emit(lex, TOK_LT, start);
    151     case '>':
    152       if (lex->cur < lex->end && *lex->cur == '>') {
    153         lex->cur++;
    154         return toy_lexer_emit(lex, TOK_SHR, start);
    155       }
    156       if (lex->cur < lex->end && *lex->cur == '=') {
    157         lex->cur++;
    158         return toy_lexer_emit(lex, TOK_GE, start);
    159       }
    160       return toy_lexer_emit(lex, TOK_GT, start);
    161     case '-':
    162       return toy_lexer_emit(lex, TOK_MINUS, start);
    163     case '.':
    164       if (lex->cur + 1 < lex->end && lex->cur[0] == '.' && lex->cur[1] == '.') {
    165         lex->cur += 2;
    166         return toy_lexer_emit(lex, TOK_DOTDOTDOT, start);
    167       }
    168       if (lex->cur < lex->end && *lex->cur == '*') {
    169         lex->cur++;
    170         return toy_lexer_emit(lex, TOK_DOTSTAR, start);
    171       }
    172       return toy_lexer_emit(lex, TOK_DOT, start);
    173     case '@':
    174       return toy_lexer_emit(lex, TOK_AT, start);
    175   }
    176 
    177   if (toy_is_digit(c)) {
    178     int64_t v = (int64_t)(c - '0');
    179     int is_float = 0;
    180     while (lex->cur < lex->end && toy_is_digit(*lex->cur)) {
    181       v = v * 10 + (int64_t)(*lex->cur - '0');
    182       lex->cur++;
    183     }
    184     if (lex->cur < lex->end && *lex->cur == '.' && lex->cur + 1 < lex->end &&
    185         toy_is_digit(lex->cur[1])) {
    186       is_float = 1;
    187       lex->cur++;
    188       while (lex->cur < lex->end && toy_is_digit(*lex->cur)) lex->cur++;
    189     }
    190     tok = toy_lexer_emit(lex, TOK_NUMBER, start);
    191     tok.int_value = v;
    192     tok.is_float = is_float;
    193     if (is_float) {
    194       char buf[64];
    195       size_t len = tok.text_len;
    196       if (len >= sizeof buf) len = sizeof buf - 1;
    197       memcpy(buf, tok.text, len);
    198       buf[len] = '\0';
    199       tok.float_value = strtod(buf, NULL);
    200     }
    201     return tok;
    202   }
    203 
    204   if (toy_is_alpha(c)) {
    205     while (lex->cur < lex->end && toy_is_alnum(*lex->cur)) lex->cur++;
    206     size_t len = (size_t)(lex->cur - start);
    207     ToyTokenKind kind = TOK_IDENT;
    208     if (len == 2 && start[0] == 'f' && start[1] == 'n')
    209       kind = TOK_FN;
    210     else if (len == 2 && start[0] == 'a' && start[1] == 's')
    211       kind = TOK_AS;
    212     else if (len == 3 && start[0] == 'a' && start[1] == 'n' && start[2] == 'd')
    213       kind = TOK_AND;
    214     else if (len == 2 && start[0] == 'o' && start[1] == 'r')
    215       kind = TOK_OR;
    216     else if (len == 3 && start[0] == 'v' && start[1] == 'a' && start[2] == 'r')
    217       kind = TOK_VAR;
    218     else if (len == 3 && start[0] == 'i' && start[1] == 'n' && start[2] == 't')
    219       kind = TOK_INT;
    220     else if (len == 3 && start[0] == 'l' && start[1] == 'e' && start[2] == 't')
    221       kind = TOK_LET;
    222     else if (len == 2 && start[0] == 'i' && start[1] == 'f')
    223       kind = TOK_IF;
    224     else if (len == 4 && start[0] == 'e' && start[1] == 'l' &&
    225              start[2] == 's' && start[3] == 'e')
    226       kind = TOK_ELSE;
    227     else if (len == 5 && start[0] == 'w' && start[1] == 'h' &&
    228              start[2] == 'i' && start[3] == 'l' && start[4] == 'e')
    229       kind = TOK_WHILE;
    230     else if (len == 6 && start[0] == 's' && start[1] == 'w' &&
    231              start[2] == 'i' && start[3] == 't' && start[4] == 'c' &&
    232              start[5] == 'h')
    233       kind = TOK_SWITCH;
    234     else if (len == 7 && start[0] == 'd' && start[1] == 'e' &&
    235              start[2] == 'f' && start[3] == 'a' && start[4] == 'u' &&
    236              start[5] == 'l' && start[6] == 't')
    237       kind = TOK_DEFAULT;
    238     else if (len == 5 && start[0] == 'l' && start[1] == 'a' &&
    239              start[2] == 'b' && start[3] == 'e' && start[4] == 'l')
    240       kind = TOK_LABEL;
    241     else if (len == 4 && start[0] == 'g' && start[1] == 'o' &&
    242              start[2] == 't' && start[3] == 'o')
    243       kind = TOK_GOTO;
    244     else if (len == 6 && start[0] == 'w' && start[1] == 'i' &&
    245              start[2] == 't' && start[3] == 'h' && start[4] == 'i' &&
    246              start[5] == 'n')
    247       kind = TOK_WITHIN;
    248     else if (len == 5 && start[0] == 'b' && start[1] == 'r' &&
    249              start[2] == 'e' && start[3] == 'a' && start[4] == 'k')
    250       kind = TOK_BREAK;
    251     else if (len == 8 && start[0] == 'c' && start[1] == 'o' &&
    252              start[2] == 'n' && start[3] == 't' && start[4] == 'i' &&
    253              start[5] == 'n' && start[6] == 'u' && start[7] == 'e')
    254       kind = TOK_CONTINUE;
    255     else if (len == 6 && start[0] == 'r' && start[1] == 'e' &&
    256              start[2] == 't' && start[3] == 'u' && start[4] == 'r' &&
    257              start[5] == 'n')
    258       kind = TOK_RETURN;
    259     else if (len == 4 && start[0] == 't' && start[1] == 'a' &&
    260              start[2] == 'i' && start[3] == 'l')
    261       kind = TOK_TAIL;
    262     else if (len == 8 && start[0] == 'm' && start[1] == 'u' &&
    263              start[2] == 's' && start[3] == 't' && start[4] == 't' &&
    264              start[5] == 'a' && start[6] == 'i' && start[7] == 'l')
    265       kind = TOK_MUSTTAIL;
    266     else if (len == 4 && start[0] == 't' && start[1] == 'y' &&
    267              start[2] == 'p' && start[3] == 'e')
    268       kind = TOK_TYPE;
    269     else if (len == 3 && start[0] == 'p' && start[1] == 'u' && start[2] == 'b')
    270       kind = TOK_PUB;
    271     else if (len == 6 && start[0] == 'e' && start[1] == 'x' &&
    272              start[2] == 't' && start[3] == 'e' && start[4] == 'r' &&
    273              start[5] == 'n')
    274       kind = TOK_EXTERN;
    275     else if (len == 5 && start[0] == 'a' && start[1] == 'l' &&
    276              start[2] == 'i' && start[3] == 'a' && start[4] == 's')
    277       kind = TOK_ALIAS;
    278     else if (len == 6 && start[0] == 'r' && start[1] == 'e' &&
    279              start[2] == 'c' && start[3] == 'o' && start[4] == 'r' &&
    280              start[5] == 'd')
    281       kind = TOK_RECORD;
    282     else if (len == 5 && start[0] == 't' && start[1] == 'u' &&
    283              start[2] == 'p' && start[3] == 'l' && start[4] == 'e')
    284       kind = TOK_TUPLE;
    285     else if (len == 4 && start[0] == 'e' && start[1] == 'n' &&
    286              start[2] == 'u' && start[3] == 'm')
    287       kind = TOK_ENUM;
    288     return toy_lexer_emit(lex, kind, start);
    289   }
    290 
    291   if (c == '"') {
    292     while (lex->cur < lex->end && *lex->cur != '"') {
    293       if (*lex->cur == '\n') toy_lexer_advance_line(lex);
    294       lex->cur++;
    295     }
    296     if (lex->cur < lex->end && *lex->cur == '"') lex->cur++;
    297     return toy_lexer_emit(lex, TOK_STRING, start);
    298   }
    299 
    300   return toy_lexer_emit(lex, TOK_EOF, start);
    301 }
    302 
    303 ToyToken toy_lexer_peek(const ToyLexer* lex) {
    304   ToyLexer tmp = *lex;
    305   return toy_lexer_next(&tmp);
    306 }