commit 938b10869a2db00a0b374d7cd2bfc2449d7a87d0
parent dad429ccbf9c02a28c8922d683832bff9ab12d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 11 May 2026 08:10:29 -0700
parse: fuse adjacent string literals at the pp-pull boundary
Implements C11 §6.4.5 ¶5: a run of adjacent TOK_STR tokens collapses
into one fused token before reaching parse productions, so existing
string consumers (primary expr, char-array init, _Static_assert,
recorded brace replays) need no per-callsite changes.
Fusion happens in fetch_tok(), called from advance(), peek1(), and the
TU bootstrap. fuse_string_lits() interns the combined spelling into the
pool, unions encoding flags per ¶5 (errors on mismatched non-ordinary
prefixes), keeps the first piece's loc for diagnostics.
Diffstat:
5 files changed, 133 insertions(+), 4 deletions(-)
diff --git a/src/parse/parse.c b/src/parse/parse.c
@@ -229,6 +229,13 @@ typedef struct Parser {
Tok next; /* second slot, populated lazily by peek1() */
int has_next;
+ /* String-literal fusion (C11 §6.4.5 ¶5) is performed at the pp-pull
+ * boundary: a run of adjacent TOK_STR tokens collapses into one before
+ * landing in `cur`/`next`. To peek past the run we have to read the
+ * first non-TOK_STR from pp; `pending` parks it for the next pull. */
+ Tok pending;
+ int has_pending;
+
Sym kw_sym[KW_COUNT];
/* Interned spellings for the __builtin_* / __atomic_* family routed through
@@ -348,6 +355,111 @@ static _Noreturn void perr(Parser* p, const char* fmt, ...) {
* Token helpers
* ============================================================ */
+/* Number of spelling bytes taken by a string literal's encoding prefix,
+ * derived from the lexer-set TF_STR_* flag bits instead of re-reading the
+ * text: 2 when the u8 bit is set, 1 for L/u/U, 0 for an ordinary literal. */
+static size_t str_prefix_len(u16 flags) {
+  const u16 single = (u16)(TF_STR_WIDE | TF_STR_U16 | TF_STR_U32);
+  if ((flags & TF_STR_U8) != 0) return 2;
+  return (flags & single) != 0 ? 1 : 0;
+}
+
+#define STR_ENC_MASK \
+  (TF_STR_WIDE | TF_STR_U8 | TF_STR_U16 | TF_STR_U32) /* all encoding-prefix flag bits */
+
+/* How a string literal's content ends, used to decide whether digits placed
+ * directly after it would be absorbed into its final escape sequence. */
+enum {
+  STRFUSE_TAIL_NONE = 0, /* ordinary char or an escape that cannot grow */
+  STRFUSE_TAIL_OCT,      /* octal escape with fewer than three digits */
+  STRFUSE_TAIL_HEX       /* \x escape: takes any number of hex digits */
+};
+
+/* Nonzero when `c` is an octal digit. */
+static int strfuse_is_oct(char c) { return c >= '0' && c <= '7'; }
+
+/* Nonzero when `c` is a hexadecimal digit. */
+static int strfuse_is_hex(char c) {
+  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
+}
+
+/* Scan the `n` content bytes at `s` (no prefix, no quotes) from the start
+ * and report which STRFUSE_TAIL_* kind the content ends with. The scan has
+ * to run forward so that an escaped backslash (`\\`) followed by digits is
+ * not mistaken for a numeric escape. */
+static int strfuse_tail_escape(const char* s, size_t n) {
+  size_t i = 0;
+  int tail = STRFUSE_TAIL_NONE;
+  while (i < n) {
+    tail = STRFUSE_TAIL_NONE;
+    if (s[i] != '\\') {
+      i++;
+      continue;
+    }
+    i++; /* step over the backslash */
+    if (i >= n) break;
+    if (s[i] == 'x') {
+      i++;
+      while (i < n && strfuse_is_hex(s[i])) i++;
+      tail = STRFUSE_TAIL_HEX;
+    } else if (strfuse_is_oct(s[i])) {
+      size_t digits = 0;
+      while (i < n && digits < 3 && strfuse_is_oct(s[i])) {
+        i++;
+        digits++;
+      }
+      if (digits < 3) tail = STRFUSE_TAIL_OCT;
+    } else {
+      i++; /* simple escape, or the u/U that introduces a UCN */
+    }
+  }
+  return tail;
+}
+
+/* Fuse two adjacent TOK_STR tokens into one per C11 §6.4.5 ¶5:
+ *   - same encoding prefix (or both ordinary): keep that encoding;
+ *   - one ordinary + one prefixed: the prefixed encoding wins;
+ *   - two different non-ordinary prefixes: ill-formed.
+ * The combined spelling is `<prefix>"<content-of-a><content-of-b>"`,
+ * interned into the pool; every other field, including `loc`, is copied
+ * from `a` so diagnostics still point at the start of the run.
+ *
+ * Escape sequences are converted in translation phase 5, before adjacent
+ * literals are concatenated in phase 6, so an escape at the end of `a` must
+ * not swallow leading digits of `b` ("\x12" "3" is two characters, not
+ * "\x123"). When `a` ends in an open hex/octal escape and `b` starts with
+ * a digit that would extend it, that first character of `b` is re-spelled
+ * as a three-digit octal escape, which closes the preceding escape and
+ * cannot itself grow.
+ *
+ * Reports through perr() (which does not return) on a missing or malformed
+ * spelling, on incompatible prefixes, and on allocation failure. */
+static Tok fuse_string_lits(Parser* p, Tok a, Tok b) {
+  u16 ae = (u16)(a.flags & STR_ENC_MASK);
+  u16 be = (u16)(b.flags & STR_ENC_MASK);
+  u16 fused_enc;
+  size_t alen = 0, blen = 0;
+  const char* as = pool_str(p->pool, a.spelling, &alen);
+  const char* bs = pool_str(p->pool, b.spelling, &blen);
+  size_t apfx, bpfx;
+  size_t a_content_len, b_content_len;
+  size_t out_pfx_len;
+  size_t out_len;
+  int respell_b_first = 0;
+  Heap* h = p->c->env->heap;
+  char* buf;
+  size_t k = 0;
+  Tok out;
+  if (!as || !bs) perr(p, "bad string literal in concatenation");
+  if (ae != 0 && be != 0 && ae != be) {
+    perr(p, "concatenating string literals with incompatible "
+            "encoding prefixes");
+  }
+  fused_enc = ae ? ae : be;
+  apfx = str_prefix_len(a.flags);
+  bpfx = str_prefix_len(b.flags);
+  /* Each spelling is `<prefix>"...content..."`; strip prefix and the two
+   * delimiting quotes. lexer guarantees at least the prefix + 2 quotes. */
+  if (alen < apfx + 2 || as[apfx] != '"' || as[alen - 1] != '"' ||
+      blen < bpfx + 2 || bs[bpfx] != '"' || bs[blen - 1] != '"') {
+    perr(p, "malformed string literal in concatenation");
+  }
+  a_content_len = alen - apfx - 2;
+  b_content_len = blen - bpfx - 2;
+  /* Would the last escape of `a` absorb the first character of `b`? */
+  if (b_content_len) {
+    char first = bs[bpfx + 1];
+    int tail = strfuse_tail_escape(as + apfx + 1, a_content_len);
+    if ((tail == STRFUSE_TAIL_HEX && strfuse_is_hex(first)) ||
+        (tail == STRFUSE_TAIL_OCT && strfuse_is_oct(first))) {
+      respell_b_first = 1;
+    }
+  }
+  /* Output prefix: pick from whichever token contributed the surviving
+   * encoding (a if a was prefixed, else b — also covers both-ordinary). */
+  out_pfx_len = ae ? apfx : bpfx;
+  /* A re-spelled character grows from 1 byte to the 4 bytes of `\ooo`. */
+  out_len = out_pfx_len + 1 + a_content_len + b_content_len +
+            (respell_b_first ? 3 : 0) + 1;
+  buf = (char*)h->alloc(h, out_len, 1);
+  if (!buf) perr(p, "out of memory fusing string literals");
+  if (out_pfx_len) {
+    const char* src = ae ? as : bs;
+    memcpy(buf + k, src, out_pfx_len);
+    k += out_pfx_len;
+  }
+  buf[k++] = '"';
+  if (a_content_len) {
+    memcpy(buf + k, as + apfx + 1, a_content_len);
+    k += a_content_len;
+  }
+  if (respell_b_first) {
+    /* NOTE(review): uses the digit's value in the compiler's own character
+     * set as its execution-set value (true for ASCII) — confirm this
+     * matches how the literal decoder maps ordinary characters. */
+    unsigned v = (unsigned char)bs[bpfx + 1];
+    buf[k++] = '\\';
+    buf[k++] = (char)('0' + ((v >> 6) & 7u));
+    buf[k++] = (char)('0' + ((v >> 3) & 7u));
+    buf[k++] = (char)('0' + (v & 7u));
+    if (b_content_len > 1) {
+      memcpy(buf + k, bs + bpfx + 2, b_content_len - 1);
+      k += b_content_len - 1;
+    }
+  } else if (b_content_len) {
+    memcpy(buf + k, bs + bpfx + 1, b_content_len);
+    k += b_content_len;
+  }
+  buf[k++] = '"';
+  out = a;
+  out.spelling = pool_intern(p->pool, buf, k);
+  out.flags = (u16)((a.flags & ~STR_ENC_MASK) | fused_enc);
+  /* The fused token is freshly minted from the pool; LitId from the lexer
+   * pertained only to the first piece. Clear it so any future LitInfo
+   * lookups don't return stale per-token data. */
+  out.lit = LIT_NONE;
+  h->free(h, buf, 0);
+  return out;
+}
+
+/* Next logical token: take the parked `pending` token if any, else pull pp.
+ * A TOK_STR absorbs every TOK_STR directly after it via fuse_string_lits();
+ * the non-string token that ends the run is parked in `pending`. */
+static Tok fetch_tok(Parser* p) {
+  Tok acc;
+  Tok follow;
+  if (!p->has_pending) {
+    acc = pp_next(p->pp);
+  } else {
+    p->has_pending = 0;
+    acc = p->pending;
+  }
+  if (acc.kind != TOK_STR) return acc;
+  follow = pp_next(p->pp);
+  while (follow.kind == TOK_STR) {
+    acc = fuse_string_lits(p, acc, follow);
+    follow = pp_next(p->pp);
+  }
+  p->pending = follow;
+  p->has_pending = 1;
+  return acc;
+}
+
static void advance(Parser* p) {
if (p->replay_active) {
if (p->replay_pos < p->replay_len) {
@@ -363,7 +475,7 @@ static void advance(Parser* p) {
p->cur = p->next;
p->has_next = 0;
} else {
- p->cur = pp_next(p->pp);
+ p->cur = fetch_tok(p);
}
}
@@ -373,7 +485,7 @@ static Tok peek1(Parser* p) {
return p->replay[p->replay_pos];
}
if (!p->has_next) {
- p->next = pp_next(p->pp);
+ p->next = fetch_tok(p);
p->has_next = 1;
}
return p->next;
@@ -5678,8 +5790,9 @@ void parse_c(Compiler* c, Pp* pp, DeclTable* decls, CG* cg, Debug* debug) {
SF_ALLOC | SF_EXEC, 4u);
/* Pull the first token. PP yields preprocessed C tokens; directives
- * have already been consumed. */
- p.cur = pp_next(p.pp);
+ * have already been consumed. fetch_tok performs adjacent-string-literal
+ * fusion (C11 §6.4.5 ¶5) before tokens reach the parse productions. */
+ p.cur = fetch_tok(&p);
parse_translation_unit(&p);
}
diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.c b/test/parse/cases/6_4_5_01_string_concat_expr.c
@@ -0,0 +1,10 @@
+/* Three-way adjacent string literal fusion per C11 §6.4.5 ¶5. Exercises
+ * the loop in fetch_tok so a run longer than two pieces collapses into
+ * one TOK_STR with the full concatenation as its spelling.
+ *
+ * Subscripting goes through a char[] (the direct-string-literal subscript
+ * codegen path is separately limited at nonzero offsets). */
+int test_main(void) {
+  char s[7] = "a" "bc" "def"; /* fused "abcdef": 6 chars + terminating NUL */
+  return s[5]; /* 'f' = 102 */
+}
diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.expected b/test/parse/cases/6_4_5_01_string_concat_expr.expected
@@ -0,0 +1 @@
+102
diff --git a/test/parse/cases/6_4_5_02_string_concat_init.c b/test/parse/cases/6_4_5_02_string_concat_init.c
@@ -0,0 +1,4 @@
+int test_main(void) {
+  char s[] = "hi" "lo"; /* two-piece fusion sizing an unsized array: "hilo" + NUL */
+  return s[0] + s[1] + s[2] + s[3] + s[4]; /* 104+105+108+111+0 = 428; .expected is 172 = 428 % 256, presumably the harness keeps only the low 8 bits — confirm */
+}
diff --git a/test/parse/cases/6_4_5_02_string_concat_init.expected b/test/parse/cases/6_4_5_02_string_concat_init.expected
@@ -0,0 +1 @@
+172