kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 71eb435e5b2049c2a305ae269c521e45aba33da3
parent edf85f383ffd2d87fdfa84d839ccd4fe23505403
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 04:30:50 -0700

lex: implement line splicing and header-name lexing

Phase-2 line splices (\<newline>) are now removed before tokenization,
so spliced source produces the expected logical tokens and spellings
(e.g. "in\<NL>t" interns as "int"). Adds TOK_HEADER and a small
directive-context state machine so #include / #embed arguments lex as
header-name tokens instead of strings or punct sequences.

Drive-by: add the missing R_AARCH64_LDST*_ABS_LO12_NC cases to
reloc_kind_name; the switch failed to build under -Werror because its
case coverage was incomplete.

Regenerates string_escapes.expected so the U+00A0 NBSP byte sequence
in the source survives round-trip; the prior expected file had been
silently normalized to ASCII space.

Diffstat:
M src/api/pipeline.c | 6 ++++++
M src/lex/lex.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/lex/lex.h | 1 +
M test/lex/cases/string_escapes.expected | 2 +-
4 files changed, 142 insertions(+), 20 deletions(-)

diff --git a/src/api/pipeline.c b/src/api/pipeline.c @@ -128,6 +128,7 @@ static void dt_emit(Writer* w, Pool* p, const Tok* t) case TOK_NEWLINE: dt_write_str(w, "(newline)\n"); return; case TOK_PP_HASH: dt_write_str(w, "(pp-hash)\n"); return; case TOK_PP_PASTE: dt_write_str(w, "(pp-paste)\n"); return; + case TOK_HEADER: dt_write_str(w, "(header "); break; case TOK_IDENT: dt_write_str(w, "(ident "); break; case TOK_NUM: dt_write_str(w, "(num "); break; case TOK_FLT: dt_write_str(w, "(flt "); break; @@ -960,6 +961,11 @@ static const char* reloc_kind_name(u16 kind) case R_AARCH64_CALL26: return "R_AARCH64_CALL26"; case R_AARCH64_ADR_PREL_PG_HI21: return "R_AARCH64_ADR_PREL_PG_HI21"; case R_AARCH64_ADD_ABS_LO12_NC: return "R_AARCH64_ADD_ABS_LO12_NC"; + case R_AARCH64_LDST8_ABS_LO12_NC: return "R_AARCH64_LDST8_ABS_LO12_NC"; + case R_AARCH64_LDST16_ABS_LO12_NC: return "R_AARCH64_LDST16_ABS_LO12_NC"; + case R_AARCH64_LDST32_ABS_LO12_NC: return "R_AARCH64_LDST32_ABS_LO12_NC"; + case R_AARCH64_LDST64_ABS_LO12_NC: return "R_AARCH64_LDST64_ABS_LO12_NC"; + case R_AARCH64_LDST128_ABS_LO12_NC:return "R_AARCH64_LDST128_ABS_LO12_NC"; case R_RV_HI20: return "R_RISCV_HI20"; case R_RV_LO12_I: return "R_RISCV_LO12_I"; case R_RV_LO12_S: return "R_RISCV_LO12_S"; diff --git a/src/lex/lex.c b/src/lex/lex.c @@ -30,17 +30,48 @@ struct Lexer { u32 col; u8 at_bol; u8 had_space; + /* §5.1.1.2 phase 4 directive context for header-name lexing. + * 0 = none, 1 = saw pp-hash, 2 = saw `#include`/etc and the next + * token may be a header-name. */ + u8 dstate; }; +/* §5.1.1.2 translation phase 2: splice physical lines joined by + * backslash-newline. Advance past any splice sequence at l->pos so the + * cursor never rests on the leading backslash of a splice. 
*/ +static void skip_splices(Lexer* l) +{ + while (l->pos + 1 < l->len && + l->src[l->pos] == '\\' && l->src[l->pos + 1] == '\n') { + l->pos += 2; + l->line++; + l->col = 1; + } +} + +/* Logical peek: returns the off-th post-splice byte starting at l->pos, + * or -1 at end of input. Does not mutate l->pos. */ static int peek(const Lexer* l, size_t off) { - if (l->pos + off >= l->len) return -1; - return (unsigned char)l->src[l->pos + off]; + size_t pos = l->pos; + size_t k = 0; + while (pos < l->len) { + if (pos + 1 < l->len && + l->src[pos] == '\\' && l->src[pos + 1] == '\n') { + pos += 2; + continue; + } + if (k == off) return (unsigned char)l->src[pos]; + ++pos; + ++k; + } + return -1; } static int bump(Lexer* l) { int ch; + skip_splices(l); if (l->pos >= l->len) return -1; ch = (unsigned char)l->src[l->pos++]; if (ch == '\n') { l->line++; l->col = 1; } @@ -121,6 +152,48 @@ SrcLoc lex_loc(const Lexer* l) { return lex_here(l); } u32 lex_file_id(const Lexer* l) { return l->file_id; } const LitInfo* lex_lit(const Lexer* l, LitId id) { (void)l; (void)id; return NULL; } +/* Intern bytes [start, end) with line splices (\<newline>) removed, so token + * spellings reflect post-phase-2 logical text. 
*/ +static Sym intern_spliced(Lexer* l, size_t start, size_t end) +{ + size_t i; + int has_splice = 0; + char* buf; + size_t k; + Sym sym; + + for (i = start; i + 1 < end; ++i) { + if (l->src[i] == '\\' && l->src[i + 1] == '\n') { has_splice = 1; break; } + } + if (!has_splice) return pool_intern(l->pool, l->src + start, end - start); + + buf = (char*)l->heap->alloc(l->heap, end - start, 1); + k = 0; + for (i = start; i < end; ) { + if (i + 1 < end && l->src[i] == '\\' && l->src[i + 1] == '\n') { + i += 2; + continue; + } + buf[k++] = l->src[i++]; + } + sym = pool_intern(l->pool, buf, k); + l->heap->free(l->heap, buf, end - start); + return sym; +} + +/* §6.4.7 header-name lookahead: in include-directive context, a `<` or `"` + * starts a header-name that runs to the matching `>` or `"`. The lexer + * recognizes only header-name forms (whose contents are implementation + * defined), not q-char-sequence escape rules. */ +static int matches_include_kw(const char* s, size_t n) +{ + if (n == 7 && memcmp(s, "include", 7) == 0) return 1; + if (n == 12 && memcmp(s, "include_next", 12) == 0) return 1; + if (n == 6 && memcmp(s, "import", 6) == 0) return 1; + if (n == 5 && memcmp(s, "embed", 5) == 0) return 1; + return 0; +} + /* Skip whitespace and comments. Returns 1 if a newline boundary was crossed * via comment consumption (caller still emits the explicit newline token on * an in-source '\n'). 
*/ @@ -135,16 +208,14 @@ static void skip_ws_and_comments(Lexer* l) } if (ch == '/' && peek(l, 1) == '/') { bump(l); bump(l); - while (l->pos < l->len && (unsigned char)l->src[l->pos] != '\n') bump(l); + while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l); l->had_space = 1; continue; } if (ch == '/' && peek(l, 1) == '*') { bump(l); bump(l); - while (l->pos < l->len) { - if ((unsigned char)l->src[l->pos] == '*' && - l->pos + 1 < l->len && - (unsigned char)l->src[l->pos + 1] == '/') { + while (peek(l, 0) >= 0) { + if (peek(l, 0) == '*' && peek(l, 1) == '/') { bump(l); bump(l); break; } @@ -206,19 +277,19 @@ static int pp_number_is_float(const char* s, size_t n) static int scan_quoted(Lexer* l, int quote) { bump(l); /* opening quote */ - while (l->pos < l->len) { - int ch = (unsigned char)l->src[l->pos]; + for (;;) { + int ch = peek(l, 0); + if (ch < 0) return 1; if (ch == quote) { bump(l); return 0; } if (ch == '\n') return 1; - if (ch == '\\' && l->pos + 1 < l->len) { + if (ch == '\\') { bump(l); /* backslash */ - bump(l); /* the escaped char (incl. potential newline in line - * splice scenarios; we do not splice here) */ + if (peek(l, 0) < 0) return 1; + bump(l); /* the escaped char */ continue; } bump(l); } - return 1; } Tok lex_next(Lexer* l) @@ -234,6 +305,7 @@ Tok lex_next(Lexer* l) * subsequent content tokens for the line that follows. */ for (;;) { skip_ws_and_comments(l); + skip_splices(l); if (l->pos >= l->len) { t.kind = TOK_EOF; t.loc = lex_here(l); @@ -246,6 +318,7 @@ Tok lex_next(Lexer* l) t.loc = tloc; l->at_bol = 1; l->had_space = 0; + l->dstate = 0; return t; } break; @@ -261,6 +334,23 @@ Tok lex_next(Lexer* l) l->had_space = 0; t.loc = tloc; + /* §6.4.7 header-name: only valid in #include / #embed argument context. */ + if (l->dstate == 2 && (ch == '<' || ch == '"')) { + int closer = (ch == '<') ? 
'>' : '"'; + bump(l); + for (;;) { + int c = peek(l, 0); + if (c < 0 || c == '\n') { t.flags |= TF_LITERAL_BAD; break; } + if (c == closer) { bump(l); break; } + bump(l); + } + t.kind = TOK_HEADER; + t.spelling = intern_spliced(l, start, l->pos); + t.v.str = t.spelling; + l->dstate = 0; + return t; + } + /* String / character literal, with optional encoding prefix. The prefix * length and encoding flag are decoded together so the spelling we * intern includes the prefix bytes. */ @@ -285,8 +375,9 @@ Tok lex_next(Lexer* l) if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD; t.kind = (u16)(is_char ? TOK_CHR : TOK_STR); t.flags |= encf; - t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); + t.spelling = intern_spliced(l, start, l->pos); t.v.str = t.spelling; + l->dstate = 0; return t; } } @@ -313,18 +404,40 @@ Tok lex_next(Lexer* l) } } t.kind = TOK_IDENT; - t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); + t.spelling = intern_spliced(l, start, l->pos); t.v.ident = t.spelling; + if (l->dstate == 1) { + size_t slen = 0; + const char* sstr = pool_str(l->pool, t.spelling, &slen); + l->dstate = (sstr && matches_include_kw(sstr, slen)) ? 2 : 0; + } else { + l->dstate = 0; + } return t; } } /* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */ if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) { + size_t plen; + char* pbuf; + size_t i, k; scan_pp_number(l); - t.kind = (u16)(pp_number_is_float(l->src + start, l->pos - start) - ? TOK_FLT : TOK_NUM); - t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); + /* Classify on the post-splice text (the spelling we'll intern). */ + plen = l->pos - start; + pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1); + k = 0; + for (i = start; i < l->pos; ) { + if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') { + i += 2; + continue; + } + pbuf[k++] = l->src[i++]; + } + t.kind = (u16)(pp_number_is_float(pbuf, k) ? 
TOK_FLT : TOK_NUM); + t.spelling = pool_intern(l->pool, pbuf, k); + l->heap->free(l->heap, pbuf, plen ? plen : 1); + l->dstate = 0; return t; } @@ -430,7 +543,9 @@ Tok lex_next(Lexer* l) for (i = 0; i < adv; ++i) bump(l); t.kind = kind; t.v.punct = punct; - t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); + t.spelling = intern_spliced(l, start, l->pos); + if (kind == TOK_PP_HASH) l->dstate = 1; + else l->dstate = 0; return t; } } diff --git a/src/lex/lex.h b/src/lex/lex.h @@ -13,6 +13,7 @@ typedef enum TokKind { TOK_PUNCT, /* v.punct */ TOK_PP_HASH, /* # */ TOK_PP_PASTE, /* ## */ + TOK_HEADER, /* header-name in #include / #embed */ TOK_NEWLINE, /* visible to PP only */ TOK_KW_FIRST, /* C11 keywords are inserted into this range by parse_c via pool */ diff --git a/test/lex/cases/string_escapes.expected b/test/lex/cases/string_escapes.expected @@ -72,6 +72,6 @@ (newline) (chr 'é') (newline) -(str "é ") +(str "é ") (newline) (eof)