lexer.c (10354B)
1 #include "lexer.h" 2 3 #include <stdlib.h> 4 #include <string.h> 5 6 void toy_lexer_init(ToyLexer* lex, const uint8_t* data, size_t len, 7 uint32_t file_id) { 8 lex->cur = data; 9 lex->end = data + len; 10 lex->bol = data; 11 lex->file_id = file_id; 12 lex->line = 1; 13 } 14 15 static int toy_is_space(uint8_t c) { 16 return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || 17 c == '\v'; 18 } 19 20 static int toy_is_digit(uint8_t c) { return c >= '0' && c <= '9'; } 21 22 static int toy_is_alpha(uint8_t c) { 23 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; 24 } 25 26 static int toy_is_alnum(uint8_t c) { 27 return toy_is_alpha(c) || toy_is_digit(c); 28 } 29 30 static void toy_lexer_advance_line(ToyLexer* lex) { 31 lex->bol = lex->cur + 1; 32 lex->line++; 33 } 34 35 static void toy_skip_ws(ToyLexer* lex) { 36 for (;;) { 37 while (lex->cur < lex->end && toy_is_space(*lex->cur)) { 38 if (*lex->cur == '\n') toy_lexer_advance_line(lex); 39 lex->cur++; 40 } 41 if (lex->cur + 1 < lex->end && lex->cur[0] == '/' && lex->cur[1] == '/') { 42 lex->cur += 2; 43 while (lex->cur < lex->end && *lex->cur != '\n') lex->cur++; 44 continue; 45 } 46 break; 47 } 48 } 49 50 static ToyToken toy_lexer_emit(ToyLexer* lex, ToyTokenKind kind, 51 const uint8_t* start) { 52 ToyToken tok; 53 tok.kind = kind; 54 tok.loc.file_id = lex->file_id; 55 tok.loc.line = lex->line; 56 tok.loc.col = (uint32_t)(start - lex->bol) + 1; 57 tok.text = start; 58 tok.text_len = (size_t)(lex->cur - start); 59 tok.int_value = 0; 60 tok.float_value = 0.0; 61 tok.is_float = 0; 62 return tok; 63 } 64 65 ToyToken toy_lexer_next(ToyLexer* lex) { 66 const uint8_t* start; 67 ToyToken tok; 68 69 toy_skip_ws(lex); 70 start = lex->cur; 71 if (lex->cur >= lex->end) { 72 tok.kind = TOK_EOF; 73 tok.loc.file_id = 0; 74 tok.loc.line = lex->line; 75 tok.loc.col = (uint32_t)(start - lex->bol) + 1; 76 tok.text = start; 77 tok.text_len = 0; 78 tok.int_value = 0; 79 tok.float_value = 0.0; 80 tok.is_float = 0; 81 return tok; 82 } 83 84 uint8_t c = *lex->cur++; 85 86 switch (c) { 87 case '(': 88 return toy_lexer_emit(lex, TOK_LPAREN, start); 89 case ')': 90 return toy_lexer_emit(lex, TOK_RPAREN, start); 91 case '{': 92 return toy_lexer_emit(lex, TOK_LBRACE, start); 93 case '}': 94 return toy_lexer_emit(lex, TOK_RBRACE, start); 95 case '[': 96 return toy_lexer_emit(lex, TOK_LBRACKET, start); 97 case ']': 98 return toy_lexer_emit(lex, TOK_RBRACKET, start); 99 case ',': 100 return toy_lexer_emit(lex, TOK_COMMA, start); 101 case ';': 102 return toy_lexer_emit(lex, TOK_SEMI, start); 103 case ':': 104 return toy_lexer_emit(lex, TOK_COLON, start); 105 case '+': 106 return toy_lexer_emit(lex, TOK_PLUS, start); 107 case '*': 108 return toy_lexer_emit(lex, TOK_STAR, start); 109 case '/': 110 return toy_lexer_emit(lex, TOK_SLASH, start); 111 case '%': 112 return toy_lexer_emit(lex, TOK_PERCENT, start); 113 case '&': 114 if (lex->cur < lex->end && *lex->cur == '&') { 115 lex->cur++; 116 return toy_lexer_emit(lex, TOK_ANDAND, start); 117 } 118 return toy_lexer_emit(lex, TOK_AMPERSAND, start); 119 case '|': 120 if (lex->cur < lex->end && *lex->cur == '|') { 121 lex->cur++; 122 return toy_lexer_emit(lex, TOK_PIPEPIPE, start); 123 } 124 return toy_lexer_emit(lex, TOK_PIPE, start); 125 case '^': 126 return toy_lexer_emit(lex, TOK_CARET, start); 127 case '~': 128 return toy_lexer_emit(lex, TOK_TILDE, start); 129 case '=': 130 if (lex->cur < lex->end && *lex->cur == '=') { 131 lex->cur++; 132 return toy_lexer_emit(lex, TOK_EQEQ, start); 133 } 134 return toy_lexer_emit(lex, TOK_EQ, start); 135 case '!': 136 if (lex->cur < lex->end && *lex->cur == '=') { 137 lex->cur++; 138 return toy_lexer_emit(lex, TOK_NE, start); 139 } 140 return toy_lexer_emit(lex, TOK_BANG, start); 141 case '<': 142 if (lex->cur < lex->end && *lex->cur == '<') { 143 lex->cur++; 144 return toy_lexer_emit(lex, TOK_SHL, start); 145 } 146 if (lex->cur < lex->end && *lex->cur == '=') { 147 lex->cur++; 148 return toy_lexer_emit(lex, TOK_LE, start); 149 } 150 return toy_lexer_emit(lex, TOK_LT, start); 151 case '>': 152 if (lex->cur < lex->end && *lex->cur == '>') { 153 lex->cur++; 154 return toy_lexer_emit(lex, TOK_SHR, start); 155 } 156 if (lex->cur < lex->end && *lex->cur == '=') { 157 lex->cur++; 158 return toy_lexer_emit(lex, TOK_GE, start); 159 } 160 return toy_lexer_emit(lex, TOK_GT, start); 161 case '-': 162 return toy_lexer_emit(lex, TOK_MINUS, start); 163 case '.': 164 if (lex->cur + 1 < lex->end && lex->cur[0] == '.' && lex->cur[1] == '.') { 165 lex->cur += 2; 166 return toy_lexer_emit(lex, TOK_DOTDOTDOT, start); 167 } 168 if (lex->cur < lex->end && *lex->cur == '*') { 169 lex->cur++; 170 return toy_lexer_emit(lex, TOK_DOTSTAR, start); 171 } 172 return toy_lexer_emit(lex, TOK_DOT, start); 173 case '@': 174 return toy_lexer_emit(lex, TOK_AT, start); 175 } 176 177 if (toy_is_digit(c)) { 178 int64_t v = (int64_t)(c - '0'); 179 int is_float = 0; 180 while (lex->cur < lex->end && toy_is_digit(*lex->cur)) { 181 v = v * 10 + (int64_t)(*lex->cur - '0'); 182 lex->cur++; 183 } 184 if (lex->cur < lex->end && *lex->cur == '.' && lex->cur + 1 < lex->end && 185 toy_is_digit(lex->cur[1])) { 186 is_float = 1; 187 lex->cur++; 188 while (lex->cur < lex->end && toy_is_digit(*lex->cur)) lex->cur++; 189 } 190 tok = toy_lexer_emit(lex, TOK_NUMBER, start); 191 tok.int_value = v; 192 tok.is_float = is_float; 193 if (is_float) { 194 char buf[64]; 195 size_t len = tok.text_len; 196 if (len >= sizeof buf) len = sizeof buf - 1; 197 memcpy(buf, tok.text, len); 198 buf[len] = '\0'; 199 tok.float_value = strtod(buf, NULL); 200 } 201 return tok; 202 } 203 204 if (toy_is_alpha(c)) { 205 while (lex->cur < lex->end && toy_is_alnum(*lex->cur)) lex->cur++; 206 size_t len = (size_t)(lex->cur - start); 207 ToyTokenKind kind = TOK_IDENT; 208 if (len == 2 && start[0] == 'f' && start[1] == 'n') 209 kind = TOK_FN; 210 else if (len == 2 && start[0] == 'a' && start[1] == 's') 211 kind = TOK_AS; 212 else if (len == 3 && start[0] == 'a' && start[1] == 'n' && start[2] == 'd') 213 kind = TOK_AND; 214 else if (len == 2 && start[0] == 'o' && start[1] == 'r') 215 kind = TOK_OR; 216 else if (len == 3 && start[0] == 'v' && start[1] == 'a' && start[2] == 'r') 217 kind = TOK_VAR; 218 else if (len == 3 && start[0] == 'i' && start[1] == 'n' && start[2] == 't') 219 kind = TOK_INT; 220 else if (len == 3 && start[0] == 'l' && start[1] == 'e' && start[2] == 't') 221 kind = TOK_LET; 222 else if (len == 2 && start[0] == 'i' && start[1] == 'f') 223 kind = TOK_IF; 224 else if (len == 4 && start[0] == 'e' && start[1] == 'l' && 225 start[2] == 's' && start[3] == 'e') 226 kind = TOK_ELSE; 227 else if (len == 5 && start[0] == 'w' && start[1] == 'h' && 228 start[2] == 'i' && start[3] == 'l' && start[4] == 'e') 229 kind = TOK_WHILE; 230 else if (len == 6 && start[0] == 's' && start[1] == 'w' && 231 start[2] == 'i' && start[3] == 't' && start[4] == 'c' && 232 start[5] == 'h') 233 kind = TOK_SWITCH; 234 else if (len == 7 && start[0] == 'd' && start[1] == 'e' && 235 start[2] == 'f' && start[3] == 'a' && start[4] == 'u' && 236 start[5] == 'l' && start[6] == 't') 237 kind = TOK_DEFAULT; 238 else if (len == 5 && start[0] == 'l' && start[1] == 'a' && 239 start[2] == 'b' && start[3] == 'e' && start[4] == 'l') 240 kind = TOK_LABEL; 241 else if (len == 4 && start[0] == 'g' && start[1] == 'o' && 242 start[2] == 't' && start[3] == 'o') 243 kind = TOK_GOTO; 244 else if (len == 6 && start[0] == 'w' && start[1] == 'i' && 245 start[2] == 't' && start[3] == 'h' && start[4] == 'i' && 246 start[5] == 'n') 247 kind = TOK_WITHIN; 248 else if (len == 5 && start[0] == 'b' && start[1] == 'r' && 249 start[2] == 'e' && start[3] == 'a' && start[4] == 'k') 250 kind = TOK_BREAK; 251 else if (len == 8 && start[0] == 'c' && start[1] == 'o' && 252 start[2] == 'n' && start[3] == 't' && start[4] == 'i' && 253 start[5] == 'n' && start[6] == 'u' && start[7] == 'e') 254 kind = TOK_CONTINUE; 255 else if (len == 6 && start[0] == 'r' && start[1] == 'e' && 256 start[2] == 't' && start[3] == 'u' && start[4] == 'r' && 257 start[5] == 'n') 258 kind = TOK_RETURN; 259 else if (len == 4 && start[0] == 't' && start[1] == 'a' && 260 start[2] == 'i' && start[3] == 'l') 261 kind = TOK_TAIL; 262 else if (len == 8 && start[0] == 'm' && start[1] == 'u' && 263 start[2] == 's' && start[3] == 't' && start[4] == 't' && 264 start[5] == 'a' && start[6] == 'i' && start[7] == 'l') 265 kind = TOK_MUSTTAIL; 266 else if (len == 4 && start[0] == 't' && start[1] == 'y' && 267 start[2] == 'p' && start[3] == 'e') 268 kind = TOK_TYPE; 269 else if (len == 3 && start[0] == 'p' && start[1] == 'u' && start[2] == 'b') 270 kind = TOK_PUB; 271 else if (len == 6 && start[0] == 'e' && start[1] == 'x' && 272 start[2] == 't' && start[3] == 'e' && start[4] == 'r' && 273 start[5] == 'n') 274 kind = TOK_EXTERN; 275 else if (len == 5 && start[0] == 'a' && start[1] == 'l' && 276 start[2] == 'i' && start[3] == 'a' && start[4] == 's') 277 kind = TOK_ALIAS; 278 else if (len == 6 && start[0] == 'r' && start[1] == 'e' && 279 start[2] == 'c' && start[3] == 'o' && start[4] == 'r' && 280 start[5] == 'd') 281 kind = TOK_RECORD; 282 else if (len == 5 && start[0] == 't' && start[1] == 'u' && 283 start[2] == 'p' && start[3] == 'l' && start[4] == 'e') 284 kind = TOK_TUPLE; 285 else if (len == 4 && start[0] == 'e' && start[1] == 'n' && 286 start[2] == 'u' && start[3] == 'm') 287 kind = TOK_ENUM; 288 return toy_lexer_emit(lex, kind, start); 289 } 290 291 if (c == '"') { 292 while (lex->cur < lex->end && *lex->cur != '"') { 293 if (*lex->cur == '\n') toy_lexer_advance_line(lex); 294 lex->cur++; 295 } 296 if (lex->cur < lex->end && *lex->cur == '"') lex->cur++; 297 return toy_lexer_emit(lex, TOK_STRING, start); 298 } 299 300 return toy_lexer_emit(lex, TOK_EOF, start); 301 } 302 303 ToyToken toy_lexer_peek(const ToyLexer* lex) { 304 ToyLexer tmp = *lex; 305 return toy_lexer_next(&tmp); 306 }