parse: fuse adjacent string literals at the pp-pull boundary - kit

commit 938b10869a2db00a0b374d7cd2bfc2449d7a87d0
parent dad429ccbf9c02a28c8922d683832bff9ab12d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 11 May 2026 08:10:29 -0700

parse: fuse adjacent string literals at the pp-pull boundary

Implements C11 §6.4.5 ¶5: a run of adjacent TOK_STR tokens collapses
into one fused token before reaching parse productions, so existing
string consumers (primary expr, char-array init, _Static_assert,
recorded brace replays) need no per-callsite changes.

Fusion happens in fetch_tok(), called from advance(), peek1(), and the
TU bootstrap. fuse_string_lits() interns the combined spelling into the
pool, unions encoding flags per ¶5 (erroring on mismatched non-ordinary
prefixes), and keeps the first piece's loc for diagnostics.

Diffstat:
M src/parse/parse.c  | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A test/parse/cases/6_4_5_01_string_concat_expr.c  | 10 ++++++++++
A test/parse/cases/6_4_5_01_string_concat_expr.expected  | 1 +
A test/parse/cases/6_4_5_02_string_concat_init.c  | 4 ++++
A test/parse/cases/6_4_5_02_string_concat_init.expected  | 1 +

5 files changed, 133 insertions(+), 4 deletions(-)
diff --git a/src/parse/parse.c b/src/parse/parse.c
@@ -229,6 +229,13 @@ typedef struct Parser {
   Tok next; /* second slot, populated lazily by peek1() */
   int has_next;
 
+  /* String-literal fusion (C11 §6.4.5 ¶5) is performed at the pp-pull
+   * boundary: a run of adjacent TOK_STR tokens collapses into one before
+   * landing in `cur`/`next`. To peek past the run we have to read the
+   * first non-TOK_STR from pp; `pending` parks it for the next pull. */
+  Tok pending;
+  int has_pending;
+
   Sym kw_sym[KW_COUNT];
 
   /* Interned spellings for the __builtin_* / __atomic_* family routed through
@@ -348,6 +355,111 @@ static _Noreturn void perr(Parser* p, const char* fmt, ...) {
  * Token helpers
  * ============================================================ */
 
+/* Number of characters the encoding prefix occupies at the front of a
+ * string-literal spelling, derived purely from the TF_STR_* bits in
+ * `flags` (the spelling itself is never inspected):
+ *   - TF_STR_U8 set                          -> 2  (u8)
+ *   - any of TF_STR_WIDE / _U16 / _U32 set   -> 1  (L, u, U)
+ *   - none of the above                      -> 0  (ordinary literal)
+ * TF_STR_U8 is tested first, so it wins if several bits are set. */
+static size_t str_prefix_len(u16 flags) {
+  if ((flags & TF_STR_U8) != 0) {
+    return 2;
+  }
+  return (flags & (TF_STR_WIDE | TF_STR_U16 | TF_STR_U32)) != 0 ? 1 : 0;
+}
+
+#define STR_ENC_MASK \
+  (TF_STR_WIDE | TF_STR_U8 | TF_STR_U16 | TF_STR_U32) /* union of the four encoding-prefix flag bits; masks a token's flags down to its encoding */
+
+/* Report whether `c` is a hexadecimal digit. Hand-rolled so the fusion
+ * code needs no header beyond what the file already has in scope. */
+static int fuse_is_hex_digit(char c) {
+  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
+}
+
+/* Classify the escape sequence, if any, that is still "open" at the very
+ * end of string-literal content s[0..n) -- i.e. one whose value would
+ * change if another digit were appended directly after it:
+ *   16  content ends in `\x` followed by hex digits (a hex escape has no
+ *       length limit, so any further hex digit would be absorbed);
+ *    8  content ends in an octal escape of one or two digits (a third
+ *       octal digit would be absorbed);
+ *    0  anything else, including a complete three-digit octal escape.
+ * The content is scanned from the start so that an escaped backslash
+ * (`\\x1`) is not mistaken for the start of a hex escape. */
+static int fuse_open_escape_radix(const char* s, size_t n) {
+  size_t i = 0;
+  while (i < n) {
+    size_t digits;
+    if (s[i] != '\\') {
+      i++;
+      continue;
+    }
+    i++; /* step over the backslash */
+    if (i >= n) return 0; /* lone trailing backslash: nothing to protect */
+    if (s[i] == 'x') {
+      i++;
+      while (i < n && fuse_is_hex_digit(s[i])) i++;
+      if (i == n) return 16;
+      continue;
+    }
+    if (s[i] >= '0' && s[i] <= '7') {
+      digits = 0;
+      while (i < n && digits < 3 && s[i] >= '0' && s[i] <= '7') {
+        i++;
+        digits++;
+      }
+      if (i == n && digits < 3) return 8;
+      continue;
+    }
+    i++; /* simple escape (or u/U introducer): only the one char matters */
+  }
+  return 0;
+}
+
+/* Fuse two adjacent TOK_STR tokens into one per C11 §6.4.5 ¶5:
+ *   - same encoding prefix (or both ordinary): keep that encoding;
+ *   - one ordinary + one prefixed: the prefixed encoding wins;
+ *   - two different non-ordinary prefixes: ill-formed.
+ * The combined spelling is `<prefix>"<content-of-a><content-of-b>"`,
+ * interned into the global pool; `loc` stays at the first token's loc
+ * so diagnostics still point at the start of the run.
+ *
+ * Escape sequences are converted before concatenation (translation phase
+ * 5 precedes phase 6), so `"\x1" "2"` must stay two characters. Pasting
+ * the spellings verbatim would yield `"\x12"`, a single character. When
+ * `a` ends in an open hex/octal escape and `b` starts with a digit that
+ * escape would absorb, the first character of `b` is re-spelled as a
+ * three-digit octal escape (self-delimiting), e.g. `"\x1\062"`.
+ * NOTE(review): the re-spelling uses the host character code of the
+ * digit; this assumes whoever later decodes the spelling maps plain
+ * characters to the same codes -- confirm for non-ASCII targets.
+ *
+ * Returns a copy of `a` with the fused spelling, the merged encoding
+ * flags, and `lit` cleared. All failures are reported through perr(),
+ * which does not return. */
+static Tok fuse_string_lits(Parser* p, Tok a, Tok b) {
+  u16 ae = (u16)(a.flags & STR_ENC_MASK);
+  u16 be = (u16)(b.flags & STR_ENC_MASK);
+  u16 fused_enc;
+  size_t alen = 0, blen = 0;
+  const char* as = pool_str(p->pool, a.spelling, &alen);
+  const char* bs = pool_str(p->pool, b.spelling, &blen);
+  const char* a_content;
+  const char* b_content;
+  size_t apfx, bpfx;
+  size_t a_content_len, b_content_len;
+  size_t out_pfx_len;
+  size_t out_len;
+  int open_radix;
+  int respell_seam = 0;
+  Heap* h = p->c->env->heap;
+  char* buf;
+  size_t k = 0;
+  Tok out;
+  if (!as || !bs) perr(p, "bad string literal in concatenation");
+  if (ae != 0 && be != 0 && ae != be) {
+    perr(p, "concatenating string literals with incompatible "
+            "encoding prefixes");
+  }
+  fused_enc = ae ? ae : be;
+  apfx = str_prefix_len(a.flags);
+  bpfx = str_prefix_len(b.flags);
+  /* Each spelling is `<prefix>"...content..."`; strip prefix and the two
+   * delimiting quotes. lexer guarantees at least the prefix + 2 quotes. */
+  if (alen < apfx + 2 || as[apfx] != '"' || as[alen - 1] != '"' ||
+      blen < bpfx + 2 || bs[bpfx] != '"' || bs[blen - 1] != '"') {
+    perr(p, "malformed string literal in concatenation");
+  }
+  a_content = as + apfx + 1;
+  b_content = bs + bpfx + 1;
+  a_content_len = alen - apfx - 2;
+  b_content_len = blen - bpfx - 2;
+  /* Decide whether the seam needs protecting: only when `a` ends in an
+   * open escape AND the first character of `b` is a digit of that radix. */
+  open_radix = fuse_open_escape_radix(a_content, a_content_len);
+  if (open_radix != 0 && b_content_len != 0) {
+    char first = b_content[0];
+    respell_seam = (open_radix == 16) ? fuse_is_hex_digit(first)
+                                      : (first >= '0' && first <= '7');
+  }
+  /* Output prefix: pick from whichever token contributed the surviving
+   * encoding (a if a was prefixed, else b — also covers both-ordinary). */
+  out_pfx_len = ae ? apfx : bpfx;
+  out_len = out_pfx_len + 1 + a_content_len + b_content_len + 1;
+  if (respell_seam) out_len += 3; /* one char becomes `\ooo` (four chars) */
+  buf = (char*)h->alloc(h, out_len, 1);
+  if (!buf) perr(p, "out of memory fusing string literals");
+  if (out_pfx_len) {
+    const char* src = ae ? as : bs;
+    memcpy(buf + k, src, out_pfx_len);
+    k += out_pfx_len;
+  }
+  buf[k++] = '"';
+  if (a_content_len) {
+    memcpy(buf + k, a_content, a_content_len);
+    k += a_content_len;
+  }
+  if (respell_seam) {
+    /* Emit the first character of `b` as exactly three octal digits so it
+     * can neither extend the preceding escape nor absorb what follows. */
+    unsigned v = (unsigned char)b_content[0];
+    buf[k++] = '\\';
+    buf[k++] = (char)('0' + ((v >> 6) & 7u));
+    buf[k++] = (char)('0' + ((v >> 3) & 7u));
+    buf[k++] = (char)('0' + (v & 7u));
+    b_content++;
+    b_content_len--;
+  }
+  if (b_content_len) {
+    memcpy(buf + k, b_content, b_content_len);
+    k += b_content_len;
+  }
+  buf[k++] = '"';
+  out = a;
+  out.spelling = pool_intern(p->pool, buf, k);
+  out.flags = (u16)((a.flags & ~STR_ENC_MASK) | fused_enc);
+  /* The fused token is freshly minted from the pool; LitId from the lexer
+   * pertained only to the first piece. Clear it so any future LitInfo
+   * lookups don't return stale per-token data. */
+  out.lit = LIT_NONE;
+  h->free(h, buf, 0);
+  return out;
+}
+
+/* Produce the next logical token for the parser.
+ *
+ * The starting token is the parked `pending` token when one exists
+ * (consuming it), otherwise a fresh pull from pp. A non-TOK_STR token is
+ * returned unchanged. A TOK_STR token starts a run: further tokens are
+ * pulled from pp and folded into it with fuse_string_lits() for as long
+ * as they are also TOK_STR. The token that ends the run has already been
+ * read from pp, so it is parked in `pending` for the following call. */
+static Tok fetch_tok(Parser* p) {
+  Tok acc = p->has_pending ? p->pending : pp_next(p->pp);
+  Tok peeked;
+  p->has_pending = 0;
+  if (acc.kind == TOK_STR) {
+    peeked = pp_next(p->pp);
+    while (peeked.kind == TOK_STR) {
+      acc = fuse_string_lits(p, acc, peeked);
+      peeked = pp_next(p->pp);
+    }
+    /* `peeked` is the first non-string token after the run; keep it. */
+    p->pending = peeked;
+    p->has_pending = 1;
+  }
+  return acc;
+}
+
 static void advance(Parser* p) {
   if (p->replay_active) {
     if (p->replay_pos < p->replay_len) {
@@ -363,7 +475,7 @@ static void advance(Parser* p) {
     p->cur = p->next;
     p->has_next = 0;
   } else {
-    p->cur = pp_next(p->pp);
+    p->cur = fetch_tok(p);
   }
 }
 
@@ -373,7 +485,7 @@ static Tok peek1(Parser* p) {
     return p->replay[p->replay_pos];
   }
   if (!p->has_next) {
-    p->next = pp_next(p->pp);
+    p->next = fetch_tok(p);
     p->has_next = 1;
   }
   return p->next;
@@ -5678,8 +5790,9 @@ void parse_c(Compiler* c, Pp* pp, DeclTable* decls, CG* cg, Debug* debug) {
                            SF_ALLOC | SF_EXEC, 4u);
 
   /* Pull the first token. PP yields preprocessed C tokens; directives
-   * have already been consumed. */
-  p.cur = pp_next(p.pp);
+   * have already been consumed. fetch_tok performs adjacent-string-literal
+   * fusion (C11 §6.4.5 ¶5) before tokens reach the parse productions. */
+  p.cur = fetch_tok(&p);
 
   parse_translation_unit(&p);
 }
diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.c b/test/parse/cases/6_4_5_01_string_concat_expr.c
@@ -0,0 +1,10 @@
+/* Three-way adjacent string literal fusion per C11 §6.4.5 ¶5. Exercises
+ * the loop in fetch_tok so a run longer than two pieces collapses into
+ * one TOK_STR with the full concatenation as its spelling.
+ *
+ * Subscripting goes through a char[] (the direct-string-literal subscript
+ * codegen path is separately limited at nonzero offsets).
+ *
+ * The three pieces spell "abcdef": six characters plus the terminating
+ * NUL exactly fill s[7], and s[5] is the final 'f'. */
+int test_main(void) {
+  char s[7] = "a" "bc" "def";
+  return s[5]; /* 'f' = 102 */
+}
diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.expected b/test/parse/cases/6_4_5_01_string_concat_expr.expected
@@ -0,0 +1 @@
+102
diff --git a/test/parse/cases/6_4_5_02_string_concat_init.c b/test/parse/cases/6_4_5_02_string_concat_init.c
@@ -0,0 +1,4 @@
+int test_main(void) {
+  char s[] = "hi" "lo"; /* two adjacent literals fuse to "hilo"; unsized array becomes 5 bytes incl. NUL */
+  return s[0] + s[1] + s[2] + s[3] + s[4]; /* 104+105+108+111+0 = 428; NOTE(review): .expected holds 172 (= 428 mod 256), so the harness presumably sees only the low 8 bits -- confirm */
+}
diff --git a/test/parse/cases/6_4_5_02_string_concat_init.expected b/test/parse/cases/6_4_5_02_string_concat_init.expected
@@ -0,0 +1 @@
+172

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/parse/parse.c	\|	121	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A	test/parse/cases/6_4_5_01_string_concat_expr.c	\|	10	++++++++++
A	test/parse/cases/6_4_5_01_string_concat_expr.expected	\|	1	+
A	test/parse/cases/6_4_5_02_string_concat_init.c	\|	4	++++
A	test/parse/cases/6_4_5_02_string_concat_init.expected	\|	1	+