commit 71eb435e5b2049c2a305ae269c521e45aba33da3
parent edf85f383ffd2d87fdfa84d839ccd4fe23505403
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 04:30:50 -0700
lex: implement line splicing and header-name lexing
Phase-2 line splices (\<newline>) are now skipped transparently by the
lexer's peek/bump primitives and stripped from interned spellings,
so spliced source produces the expected logical tokens and spellings
(e.g. "in\<NL>t" interns as "int"). Adds TOK_HEADER and a small
directive-context state machine so the arguments of #include,
#include_next, #import and #embed lex as header-name tokens instead of
strings or punct sequences.
Drive-by: add the missing R_AARCH64_LDST*_ABS_LO12_NC cases to
reloc_kind_name; without them the switch's incomplete case coverage
failed the build under -Werror.
Regenerates string_escapes.expected so the U+00A0 NBSP byte sequence
in the source survives round-trip; the prior expected file had been
silently normalized to ASCII space.
Diffstat:
4 files changed, 142 insertions(+), 20 deletions(-)
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -128,6 +128,7 @@ static void dt_emit(Writer* w, Pool* p, const Tok* t)
case TOK_NEWLINE: dt_write_str(w, "(newline)\n"); return;
case TOK_PP_HASH: dt_write_str(w, "(pp-hash)\n"); return;
case TOK_PP_PASTE: dt_write_str(w, "(pp-paste)\n"); return;
+ case TOK_HEADER: dt_write_str(w, "(header "); break;
case TOK_IDENT: dt_write_str(w, "(ident "); break;
case TOK_NUM: dt_write_str(w, "(num "); break;
case TOK_FLT: dt_write_str(w, "(flt "); break;
@@ -960,6 +961,11 @@ static const char* reloc_kind_name(u16 kind)
case R_AARCH64_CALL26: return "R_AARCH64_CALL26";
case R_AARCH64_ADR_PREL_PG_HI21: return "R_AARCH64_ADR_PREL_PG_HI21";
case R_AARCH64_ADD_ABS_LO12_NC: return "R_AARCH64_ADD_ABS_LO12_NC";
+ case R_AARCH64_LDST8_ABS_LO12_NC: return "R_AARCH64_LDST8_ABS_LO12_NC";
+ case R_AARCH64_LDST16_ABS_LO12_NC: return "R_AARCH64_LDST16_ABS_LO12_NC";
+ case R_AARCH64_LDST32_ABS_LO12_NC: return "R_AARCH64_LDST32_ABS_LO12_NC";
+ case R_AARCH64_LDST64_ABS_LO12_NC: return "R_AARCH64_LDST64_ABS_LO12_NC";
+ case R_AARCH64_LDST128_ABS_LO12_NC:return "R_AARCH64_LDST128_ABS_LO12_NC";
case R_RV_HI20: return "R_RISCV_HI20";
case R_RV_LO12_I: return "R_RISCV_LO12_I";
case R_RV_LO12_S: return "R_RISCV_LO12_S";
diff --git a/src/lex/lex.c b/src/lex/lex.c
@@ -30,17 +30,48 @@ struct Lexer {
u32 col;
u8 at_bol;
u8 had_space;
+ /* §5.1.1.2 phase 4 directive context for header-name lexing.
+ * 0 = none, 1 = saw pp-hash, 2 = saw `#include`/etc and the next
+ * token may be a header-name. */
+ u8 dstate;
};
+/* Translation phase 2 (§5.1.1.2): consume every backslash-newline pair
+ * that begins exactly at l->pos. Each pair consumed advances l->pos by
+ * two bytes, increments l->line, and resets l->col to 1. Stops at the
+ * first position that does not start a splice; a lone backslash that is
+ * the last byte of input is left in place. */
+static void skip_splices(Lexer* l)
+{
+    for (;;) {
+        /* A splice needs two bytes: '\\' immediately followed by '\n'. */
+        if (l->pos + 1 >= l->len) return;
+        if (l->src[l->pos] != '\\') return;
+        if (l->src[l->pos + 1] != '\n') return;
+        l->pos += 2;
+        l->line += 1;
+        l->col = 1;
+    }
+}
+
+/* Logical peek: returns the off-th post-splice byte starting at l->pos,
+ * or -1 at end of input. Does not mutate l->pos.
+ *
+ * Backslash-newline pairs are stepped over without counting toward `off`,
+ * so peek(l, 0) is the next byte of logical (post-phase-2) text. The byte
+ * is returned as an unsigned char value, so -1 never collides with data.
+ * A backslash that is the very last byte of input is returned as an
+ * ordinary byte rather than treated as a splice. */
static int peek(const Lexer* l, size_t off)
{
- if (l->pos + off >= l->len) return -1;
- return (unsigned char)l->src[l->pos + off];
+ size_t pos = l->pos; /* scratch cursor; l->pos itself is never written */
+ size_t k = 0; /* logical bytes stepped past so far */
+ while (pos < l->len) {
+ if (pos + 1 < l->len &&
+ l->src[pos] == '\\' && l->src[pos + 1] == '\n') {
+ pos += 2; /* splice: skip both bytes, k is deliberately unchanged */
+ continue;
+ }
+ if (k == off) return (unsigned char)l->src[pos];
+ ++pos;
+ ++k;
+ }
+ return -1; /* ran out of input before reaching the off-th logical byte */
}
static int bump(Lexer* l)
{
int ch;
+ skip_splices(l);
if (l->pos >= l->len) return -1;
ch = (unsigned char)l->src[l->pos++];
if (ch == '\n') { l->line++; l->col = 1; }
@@ -121,6 +152,48 @@ SrcLoc lex_loc(const Lexer* l) { return lex_here(l); }
u32 lex_file_id(const Lexer* l) { return l->file_id; }
const LitInfo* lex_lit(const Lexer* l, LitId id) { (void)l; (void)id; return NULL; }
+/* Return the interned symbol for source bytes [start, end) as they read
+ * after translation phase 2, i.e. with every backslash-newline pair
+ * dropped. When the span holds no splice the source bytes are interned
+ * directly; otherwise a scratch copy of end - start bytes is taken from
+ * l->heap, filled with the de-spliced text, interned, and released before
+ * returning.
+ *
+ * NOTE(review): the result of l->heap->alloc is used without a NULL check;
+ * confirm that this heap never returns NULL (e.g. aborts on exhaustion). */
+static Sym intern_spliced(Lexer* l, size_t start, size_t end)
+{
+    size_t span = end - start;
+    size_t rd = start;
+    size_t wr = 0;
+    char* scratch;
+    Sym result;
+
+    /* Advance rd to the first splice; falling off the span means none. */
+    while (rd + 1 < end && !(l->src[rd] == '\\' && l->src[rd + 1] == '\n')) ++rd;
+    if (rd + 1 >= end) return pool_intern(l->pool, l->src + start, span);
+
+    scratch = (char*)l->heap->alloc(l->heap, span, 1);
+    for (rd = start; rd < end; ++rd) {
+        if (rd + 1 < end && l->src[rd] == '\\' && l->src[rd + 1] == '\n') {
+            ++rd; /* together with the loop increment this skips both bytes */
+            continue;
+        }
+        scratch[wr++] = l->src[rd];
+    }
+    result = pool_intern(l->pool, scratch, wr);
+    l->heap->free(l->heap, scratch, span);
+    return result;
+}
+
+/* Directive-name test for header-name context. Returns 1 when the n bytes
+ * at s spell exactly one of the directive names whose argument may be a
+ * header-name -- `include`, `include_next`, `import`, `embed` -- and 0
+ * otherwise. The comparison is length-exact and case-sensitive; s need not
+ * be NUL-terminated.
+ *
+ * The lexer consults this for the identifier that follows a pp-hash. On a
+ * match, a following `<` or `"` starts a §6.4.7 header-name that runs to
+ * the matching `>` or `"`; header-name contents are implementation
+ * defined, so no q-char-sequence escape rules are applied there. */
+static int matches_include_kw(const char* s, size_t n)
+{
+    /* Lengths come from the literals themselves, not hand-counted. */
+    static const struct { const char* text; size_t len; } names[] = {
+        { "include",      sizeof "include" - 1 },
+        { "include_next", sizeof "include_next" - 1 },
+        { "import",       sizeof "import" - 1 },
+        { "embed",        sizeof "embed" - 1 },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof names / sizeof names[0]; ++i) {
+        if (names[i].len == n && memcmp(s, names[i].text, n) == 0) return 1;
+    }
+    return 0;
+}
+
/* Skip whitespace and comments. Returns 1 if a newline boundary was crossed
* via comment consumption (caller still emits the explicit newline token on
* an in-source '\n'). */
@@ -135,16 +208,14 @@ static void skip_ws_and_comments(Lexer* l)
}
if (ch == '/' && peek(l, 1) == '/') {
bump(l); bump(l);
- while (l->pos < l->len && (unsigned char)l->src[l->pos] != '\n') bump(l);
+ while (peek(l, 0) >= 0 && peek(l, 0) != '\n') bump(l);
l->had_space = 1;
continue;
}
if (ch == '/' && peek(l, 1) == '*') {
bump(l); bump(l);
- while (l->pos < l->len) {
- if ((unsigned char)l->src[l->pos] == '*' &&
- l->pos + 1 < l->len &&
- (unsigned char)l->src[l->pos + 1] == '/') {
+ while (peek(l, 0) >= 0) {
+ if (peek(l, 0) == '*' && peek(l, 1) == '/') {
bump(l); bump(l);
break;
}
@@ -206,19 +277,19 @@ static int pp_number_is_float(const char* s, size_t n)
+/* Scan a string or character literal whose opening `quote` is at the
+ * cursor, reading logical (post-splice) text through peek/bump.
+ *
+ * Returns 0 after consuming the matching closing quote. Returns 1 when
+ * the literal is unterminated: end of input or a newline is reached
+ * first. In the newline case the newline itself is NOT consumed, so the
+ * caller still sees it as a line boundary.
+ *
+ * A backslash escapes the next logical byte, so an escaped quote does not
+ * close the literal. A logical newline can still follow a backslash when
+ * a splice sits between them (`\` `\<NL>` `<NL>`); that is treated as
+ * unterminated, same as an unescaped newline, instead of letting the
+ * literal run on into the next line. */
static int scan_quoted(Lexer* l, int quote)
{
bump(l); /* opening quote */
- while (l->pos < l->len) {
- int ch = (unsigned char)l->src[l->pos];
+ for (;;) {
+ int ch = peek(l, 0);
+ if (ch < 0) return 1;
if (ch == quote) { bump(l); return 0; }
if (ch == '\n') return 1;
- if (ch == '\\' && l->pos + 1 < l->len) {
+ if (ch == '\\') {
+ int esc;
bump(l); /* backslash */
- bump(l); /* the escaped char (incl. potential newline in line
- * splice scenarios; we do not splice here) */
+ esc = peek(l, 0);
+ /* EOF or a logical newline after the backslash: unterminated.
+ * Leave the newline in place for the caller. */
+ if (esc < 0 || esc == '\n') return 1;
+ bump(l); /* the escaped char */
continue;
}
bump(l);
}
- return 1;
}
Tok lex_next(Lexer* l)
@@ -234,6 +305,7 @@ Tok lex_next(Lexer* l)
* subsequent content tokens for the line that follows. */
for (;;) {
skip_ws_and_comments(l);
+ skip_splices(l);
if (l->pos >= l->len) {
t.kind = TOK_EOF;
t.loc = lex_here(l);
@@ -246,6 +318,7 @@ Tok lex_next(Lexer* l)
t.loc = tloc;
l->at_bol = 1;
l->had_space = 0;
+ l->dstate = 0;
return t;
}
break;
@@ -261,6 +334,23 @@ Tok lex_next(Lexer* l)
l->had_space = 0;
t.loc = tloc;
+ /* §6.4.7 header-name: only valid in #include / #embed argument context. */
+ if (l->dstate == 2 && (ch == '<' || ch == '"')) {
+ int closer = (ch == '<') ? '>' : '"';
+ bump(l);
+ for (;;) {
+ int c = peek(l, 0);
+ if (c < 0 || c == '\n') { t.flags |= TF_LITERAL_BAD; break; }
+ if (c == closer) { bump(l); break; }
+ bump(l);
+ }
+ t.kind = TOK_HEADER;
+ t.spelling = intern_spliced(l, start, l->pos);
+ t.v.str = t.spelling;
+ l->dstate = 0;
+ return t;
+ }
+
/* String / character literal, with optional encoding prefix. The prefix
* length and encoding flag are decoded together so the spelling we
* intern includes the prefix bytes. */
@@ -285,8 +375,9 @@ Tok lex_next(Lexer* l)
if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD;
t.kind = (u16)(is_char ? TOK_CHR : TOK_STR);
t.flags |= encf;
- t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+ t.spelling = intern_spliced(l, start, l->pos);
t.v.str = t.spelling;
+ l->dstate = 0;
return t;
}
}
@@ -313,18 +404,40 @@ Tok lex_next(Lexer* l)
}
}
t.kind = TOK_IDENT;
- t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+ t.spelling = intern_spliced(l, start, l->pos);
t.v.ident = t.spelling;
+ if (l->dstate == 1) {
+ size_t slen = 0;
+ const char* sstr = pool_str(l->pool, t.spelling, &slen);
+ l->dstate = (sstr && matches_include_kw(sstr, slen)) ? 2 : 0;
+ } else {
+ l->dstate = 0;
+ }
return t;
}
}
/* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */
if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) {
+ size_t plen;
+ char* pbuf;
+ size_t i, k;
scan_pp_number(l);
- t.kind = (u16)(pp_number_is_float(l->src + start, l->pos - start)
- ? TOK_FLT : TOK_NUM);
- t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+ /* Classify on the post-splice text (the spelling we'll intern). */
+ plen = l->pos - start;
+ pbuf = (char*)l->heap->alloc(l->heap, plen ? plen : 1, 1);
+ k = 0;
+ for (i = start; i < l->pos; ) {
+ if (i + 1 < l->pos && l->src[i] == '\\' && l->src[i + 1] == '\n') {
+ i += 2;
+ continue;
+ }
+ pbuf[k++] = l->src[i++];
+ }
+ t.kind = (u16)(pp_number_is_float(pbuf, k) ? TOK_FLT : TOK_NUM);
+ t.spelling = pool_intern(l->pool, pbuf, k);
+ l->heap->free(l->heap, pbuf, plen ? plen : 1);
+ l->dstate = 0;
return t;
}
@@ -430,7 +543,9 @@ Tok lex_next(Lexer* l)
for (i = 0; i < adv; ++i) bump(l);
t.kind = kind;
t.v.punct = punct;
- t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+ t.spelling = intern_spliced(l, start, l->pos);
+ if (kind == TOK_PP_HASH) l->dstate = 1;
+ else l->dstate = 0;
return t;
}
}
diff --git a/src/lex/lex.h b/src/lex/lex.h
@@ -13,6 +13,7 @@ typedef enum TokKind {
TOK_PUNCT, /* v.punct */
TOK_PP_HASH, /* # */
TOK_PP_PASTE, /* ## */
+ TOK_HEADER, /* header-name in #include / #embed */
TOK_NEWLINE, /* visible to PP only */
TOK_KW_FIRST,
/* C11 keywords are inserted into this range by parse_c via pool */
diff --git a/test/lex/cases/string_escapes.expected b/test/lex/cases/string_escapes.expected
@@ -72,6 +72,6 @@
(newline)
(chr 'é')
(newline)
-(str "é ")
+(str "é ")
(newline)
(eof)