lex: implement line splicing and header-name lexing - kit

commit 71eb435e5b2049c2a305ae269c521e45aba33da3
parent edf85f383ffd2d87fdfa84d839ccd4fe23505403
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 04:30:50 -0700

lex: implement line splicing and header-name lexing

Phase-2 line splices (\<newline>) are now skipped by the lexer's
peek/bump primitives and stripped from interned spellings, so spliced
source produces the expected logical tokens and spellings (e.g.
"in\<NL>t" interns as "int"). Adds TOK_HEADER and a small
directive-context state machine so the argument of #include,
#include_next, #import, or #embed lexes as a single header-name token
instead of a string or a sequence of punctuators.

Drive-by: add the missing R_AARCH64_LDST*_ABS_LO12_NC cases to
reloc_kind_name; the switch's incomplete coverage was failing the
-Werror build.

Regenerates string_escapes.expected so the U+00A0 NBSP byte sequence
in the source survives round-trip; the prior expected file had been
silently normalized to ASCII space.

Diffstat:
M src/api/pipeline.c  | 6 ++++++
M src/lex/lex.c  | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/lex/lex.h  | 1 +
M test/lex/cases/string_escapes.expected  | 2 +-

4 files changed, 142 insertions(+), 20 deletions(-)
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -128,6 +128,7 @@ static void dt_emit(Writer* w, Pool* p, const Tok* t)
     case TOK_NEWLINE:   dt_write_str(w, "(newline)\n");                          return;
     case TOK_PP_HASH:   dt_write_str(w, "(pp-hash)\n");                          return;
     case TOK_PP_PASTE:  dt_write_str(w, "(pp-paste)\n");                         return;
+    case TOK_HEADER:    dt_write_str(w, "(header ");                             break;
     case TOK_IDENT:     dt_write_str(w, "(ident ");                              break;
     case TOK_NUM:       dt_write_str(w, "(num ");                                break;
     case TOK_FLT:       dt_write_str(w, "(flt ");                                break;
@@ -960,6 +961,11 @@ static const char* reloc_kind_name(u16 kind)
     case R_AARCH64_CALL26:             return "R_AARCH64_CALL26";
     case R_AARCH64_ADR_PREL_PG_HI21:   return "R_AARCH64_ADR_PREL_PG_HI21";
     case R_AARCH64_ADD_ABS_LO12_NC:    return "R_AARCH64_ADD_ABS_LO12_NC";
+    case R_AARCH64_LDST8_ABS_LO12_NC:  return "R_AARCH64_LDST8_ABS_LO12_NC";
+    case R_AARCH64_LDST16_ABS_LO12_NC: return "R_AARCH64_LDST16_ABS_LO12_NC";
+    case R_AARCH64_LDST32_ABS_LO12_NC: return "R_AARCH64_LDST32_ABS_LO12_NC";
+    case R_AARCH64_LDST64_ABS_LO12_NC: return "R_AARCH64_LDST64_ABS_LO12_NC";
+    case R_AARCH64_LDST128_ABS_LO12_NC:return "R_AARCH64_LDST128_ABS_LO12_NC";
     case R_RV_HI20:                    return "R_RISCV_HI20";
     case R_RV_LO12_I:                  return "R_RISCV_LO12_I";
     case R_RV_LO12_S:                  return "R_RISCV_LO12_S";
diff --git a/src/lex/lex.c b/src/lex/lex.c
@@ -30,17 +30,48 @@ struct Lexer {
     u32         col;
     u8          at_bol;
     u8          had_space;
+    /* §5.1.1.2 phase 4 directive context for header-name lexing.
+     * 0 = none, 1 = saw pp-hash, 2 = saw `#include`/etc and the next
+     * token may be a header-name. */
+    u8          dstate;
 };
 
+/* §5.1.1.2 translation phase 2: splice physical lines joined by
+ * backslash-newline. Advance past any splice sequence at l->pos so the
+ * cursor never rests on the leading backslash of a splice. */
+static void skip_splices(Lexer* l)
+{
+    while (l->pos + 1 < l->len &&
+           l->src[l->pos] == '\\' && l->src[l->pos + 1] == '\n') {
+        l->pos += 2;
+        l->line++;
+        l->col = 1;
+    }
+}
+
+/* Logical peek: returns the off-th post-splice byte starting at l->pos,
+ * or -1 at end of input. Does not mutate l->pos. */
 static int peek(const Lexer* l, size_t off)
 {
-    if (l->pos + off >= l->len) return -1;
-    return (unsigned char)l->src[l->pos + off];
+    size_t pos = l->pos;
+    size_t k   = 0;
+    while (pos < l->len) {
+        if (pos + 1 < l->len &&
+            l->src[pos] == '\\' && l->src[pos + 1] == '\n') {
+            pos += 2;
+            continue;
+        }
+        if (k == off) return (unsigned char)l->src[pos];
+        ++pos;
+        ++k;
+    }
+    return -1;
 }
 
 static int bump(Lexer* l)
 {
     int ch;
+    skip_splices(l);
     if (l->pos >= l->len) return -1;
     ch = (unsigned char)l->src[l->pos++];
     if (ch == '\n') { l->line++; l->col = 1; }
@@ -121,6 +152,48 @@ SrcLoc lex_loc(const Lexer* l)         { return lex_here(l); }
 u32    lex_file_id(const Lexer* l)     { return l->file_id; }
 const LitInfo* lex_lit(const Lexer* l, LitId id) { (void)l; (void)id; return NULL; }
 
+/* Intern bytes [start, end) with line splices (\<newline>) removed, so token
+ * spellings reflect post-phase-2 logical text. */
+static Sym intern_spliced(Lexer* l, size_t start, size_t end)
+{
+    size_t i;
+    int    has_splice = 0;
+    char*  buf;
+    size_t k;
+    Sym    sym;
+
+    for (i = start; i + 1 < end; ++i) {
+        if (l->src[i] == '\\' && l->src[i + 1] == '\n') { has_splice = 1; break; }
+    }
+    if (!has_splice) return pool_intern(l->pool, l->src + start, end - start);
+
+    buf = (char*)l->heap->alloc(l->heap, end - start, 1);
+    k   = 0;
+    for (i = start; i < end; ) {
+        if (i + 1 < end && l->src[i] == '\\' && l->src[i + 1] == '\n') {
+            i += 2;
+            continue;
+        }
+        buf[k++] = l->src[i++];
+    }
+    sym = pool_intern(l->pool, buf, k);
+    l->heap->free(l->heap, buf, end - start);
+    return sym;
+}
+
+/* §6.4.7 header-name context: after one of these directive names a `<` or
+ * `"` starts a header-name running to the matching `>` or `"`, lexed without
+ * q-char-sequence escape rules (contents are implementation defined).
+ * Returns 1 if the n bytes at s spell such a directive name, else 0. */
+static int matches_include_kw(const char* s, size_t n)
+{
+    if (n == 7  && memcmp(s, "include", 7) == 0)              return 1;
+    if (n == 12 && memcmp(s, "include_next", 12) == 0)        return 1;
+    if (n == 6  && memcmp(s, "import", 6) == 0)               return 1;
+    if (n == 5  && memcmp(s, "embed", 5) == 0)                return 1;
+    return 0;
+}
+
 /* Skip whitespace and comments. Returns 1 if a newline boundary was crossed
  * via comment consumption (caller still emits the explicit newline token on
  * an in-source '\n'). */
@@ -135,16 +208,14 @@ static void skip_ws_and_comments(Lexer* l)
         }
         if (ch == '/' && peek(l, 1) == '/') {
             bump(l); bump(l);
-            while (l->pos < l->len && (unsigned char)l->src[l->pos] != '\n') bump(l);
+            while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l);
             l->had_space = 1;
             continue;
         }
         if (ch == '/' && peek(l, 1) == '*') {
             bump(l); bump(l);
-            while (l->pos < l->len) {
-                if ((unsigned char)l->src[l->pos] == '*' &&
-                    l->pos + 1 < l->len &&
-                    (unsigned char)l->src[l->pos + 1] == '/') {
+            while (peek(l, 0) >= 0) {
+                if (peek(l, 0) == '*' && peek(l, 1) == '/') {
                     bump(l); bump(l);
                     break;
                 }
@@ -206,19 +277,19 @@ static int pp_number_is_float(const char* s, size_t n)
 static int scan_quoted(Lexer* l, int quote)
 {
     bump(l); /* opening quote */
-    while (l->pos < l->len) {
-        int ch = (unsigned char)l->src[l->pos];
+    for (;;) {
+        int ch = peek(l, 0);
+        if (ch < 0)      return 1;
         if (ch == quote) { bump(l); return 0; }
         if (ch == '\n')  return 1;
-        if (ch == '\\' && l->pos + 1 < l->len) {
+        if (ch == '\\') {
             bump(l); /* backslash */
-            bump(l); /* the escaped char (incl. potential newline in line
-                      * splice scenarios; we do not splice here) */
+            if (peek(l, 0) < 0) return 1;
+            bump(l); /* the escaped char */
             continue;
         }
         bump(l);
     }
-    return 1;
 }
 
 Tok lex_next(Lexer* l)
@@ -234,6 +305,7 @@ Tok lex_next(Lexer* l)
      * subsequent content tokens for the line that follows. */
     for (;;) {
         skip_ws_and_comments(l);
+        skip_splices(l);
         if (l->pos >= l->len) {
             t.kind = TOK_EOF;
             t.loc  = lex_here(l);
@@ -246,6 +318,7 @@ Tok lex_next(Lexer* l)
             t.loc   = tloc;
             l->at_bol    = 1;
             l->had_space = 0;
+            l->dstate    = 0;
             return t;
         }
         break;
@@ -261,6 +334,23 @@ Tok lex_next(Lexer* l)
     l->had_space = 0;
     t.loc = tloc;
 
+    /* §6.4.7 header-name: only valid in #include / #embed argument context. */
+    if (l->dstate == 2 && (ch == '<' || ch == '"')) {
+        int closer = (ch == '<') ? '>' : '"';
+        bump(l);
+        for (;;) {
+            int c = peek(l, 0);
+            if (c < 0 || c == '\n') { t.flags |= TF_LITERAL_BAD; break; }
+            if (c == closer)        { bump(l); break; }
+            bump(l);
+        }
+        t.kind     = TOK_HEADER;
+        t.spelling = intern_spliced(l, start, l->pos);
+        t.v.str    = t.spelling;
+        l->dstate  = 0;
+        return t;
+    }
+
     /* String / character literal, with optional encoding prefix. The prefix
      * length and encoding flag are decoded together so the spelling we
      * intern includes the prefix bytes. */
@@ -285,8 +375,9 @@ Tok lex_next(Lexer* l)
             if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD;
             t.kind     = (u16)(is_char ? TOK_CHR : TOK_STR);
             t.flags   |= encf;
-            t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+            t.spelling = intern_spliced(l, start, l->pos);
             t.v.str    = t.spelling;
+            l->dstate  = 0;
             return t;
         }
     }
@@ -313,18 +404,40 @@ Tok lex_next(Lexer* l)
                 }
             }
             t.kind     = TOK_IDENT;
-            t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+            t.spelling = intern_spliced(l, start, l->pos);
             t.v.ident  = t.spelling;
+            if (l->dstate == 1) {
+                size_t      slen = 0;
+                const char* sstr = pool_str(l->pool, t.spelling, &slen);
+                l->dstate = (sstr && matches_include_kw(sstr, slen)) ? 2 : 0;
+            } else {
+                l->dstate = 0;
+            }
             return t;
         }
     }
 
     /* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */
     if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) {
+        size_t plen;
+        char*  pbuf;
+        size_t i, k;
         scan_pp_number(l);
-        t.kind     = (u16)(pp_number_is_float(l->src + start, l->pos - start)
-                           ? TOK_FLT : TOK_NUM);
-        t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+        /* Classify on the post-splice text (the spelling we'll intern). */
+        plen = l->pos - start;
+        pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1);
+        k    = 0;
+        for (i = start; i < l->pos; ) {
+            if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') {
+                i += 2;
+                continue;
+            }
+            pbuf[k++] = l->src[i++];
+        }
+        t.kind     = (u16)(pp_number_is_float(pbuf, k) ? TOK_FLT : TOK_NUM);
+        t.spelling = pool_intern(l->pool, pbuf, k);
+        l->heap->free(l->heap, pbuf, plen ? plen : 1);
+        l->dstate  = 0;
         return t;
     }
 
@@ -430,7 +543,9 @@ Tok lex_next(Lexer* l)
         for (i = 0; i < adv; ++i) bump(l);
         t.kind     = kind;
         t.v.punct  = punct;
-        t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+        t.spelling = intern_spliced(l, start, l->pos);
+        if (kind == TOK_PP_HASH) l->dstate = 1;
+        else                     l->dstate = 0;
         return t;
     }
 }
diff --git a/src/lex/lex.h b/src/lex/lex.h
@@ -13,6 +13,7 @@ typedef enum TokKind {
     TOK_PUNCT,        /* v.punct */
     TOK_PP_HASH,      /* # */
     TOK_PP_PASTE,     /* ## */
+    TOK_HEADER,       /* header-name in #include / #embed */
     TOK_NEWLINE,      /* visible to PP only */
     TOK_KW_FIRST,
     /* C11 keywords are inserted into this range by parse_c via pool */
diff --git a/test/lex/cases/string_escapes.expected b/test/lex/cases/string_escapes.expected
@@ -72,6 +72,6 @@
 (newline)
 (chr 'é')
 (newline)
-(str "é ")
+(str "é ")
 (newline)
 (eof)

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/api/pipeline.c	\|	6	++++++
M	src/lex/lex.c	\|	153	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M	src/lex/lex.h	\|	1	+
M	test/lex/cases/string_escapes.expected	\|	2	+-