kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 938b10869a2db00a0b374d7cd2bfc2449d7a87d0
parent dad429ccbf9c02a28c8922d683832bff9ab12d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 11 May 2026 08:10:29 -0700

parse: fuse adjacent string literals at the pp-pull boundary

Implements C11 §6.4.5 ¶5: a run of adjacent TOK_STR tokens collapses
into one fused token before reaching parse productions, so existing
string consumers (primary expr, char-array init, _Static_assert,
recorded brace replays) need no per-callsite changes.

Fusion happens in fetch_tok(), called from advance(), peek1(), and the
TU bootstrap. fuse_string_lits() interns the combined spelling into the
pool, unions encoding flags per ¶5 (errors on mismatched non-ordinary
prefixes), and keeps the first piece's loc for diagnostics.

Diffstat:
M src/parse/parse.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A test/parse/cases/6_4_5_01_string_concat_expr.c | 10 ++++++++++
A test/parse/cases/6_4_5_01_string_concat_expr.expected | 1 +
A test/parse/cases/6_4_5_02_string_concat_init.c | 4 ++++
A test/parse/cases/6_4_5_02_string_concat_init.expected | 1 +
5 files changed, 133 insertions(+), 4 deletions(-)

diff --git a/src/parse/parse.c b/src/parse/parse.c @@ -229,6 +229,13 @@ typedef struct Parser { Tok next; /* second slot, populated lazily by peek1() */ int has_next; + /* String-literal fusion (C11 §6.4.5 ¶5) is performed at the pp-pull + * boundary: a run of adjacent TOK_STR tokens collapses into one before + * landing in `cur`/`next`. To peek past the run we have to read the + * first non-TOK_STR from pp; `pending` parks it for the next pull. */ + Tok pending; + int has_pending; + Sym kw_sym[KW_COUNT]; /* Interned spellings for the __builtin_* / __atomic_* family routed through @@ -348,6 +355,111 @@ static _Noreturn void perr(Parser* p, const char* fmt, ...) { * Token helpers * ============================================================ */ +/* Width of an encoding prefix on a string-literal spelling: 0 for ordinary, + * 1 for L/u/U, 2 for u8. Driven by the TF_STR_* flag bits set by the lexer + * so we don't re-scan the spelling. */ +static size_t str_prefix_len(u16 flags) { + if (flags & TF_STR_U8) return 2; + if (flags & (TF_STR_WIDE | TF_STR_U16 | TF_STR_U32)) return 1; + return 0; +} + +#define STR_ENC_MASK \ + (TF_STR_WIDE | TF_STR_U8 | TF_STR_U16 | TF_STR_U32) + +/* Fuse two adjacent TOK_STR tokens into one per C11 §6.4.5 ¶5: + * - same encoding prefix (or both ordinary): keep that encoding; + * - one ordinary + one prefixed: the prefixed encoding wins; + * - two different non-ordinary prefixes: ill-formed. + * The combined spelling is `<prefix>"<content-of-a><content-of-b>"`, + * interned into the global pool; `loc` stays at the first token's loc + * so diagnostics still point at the start of the run. 
*/ +static Tok fuse_string_lits(Parser* p, Tok a, Tok b) { + u16 ae = (u16)(a.flags & STR_ENC_MASK); + u16 be = (u16)(b.flags & STR_ENC_MASK); + u16 fused_enc; + size_t alen = 0, blen = 0; + const char* as = pool_str(p->pool, a.spelling, &alen); + const char* bs = pool_str(p->pool, b.spelling, &blen); + size_t apfx, bpfx; + size_t a_content_len, b_content_len; + size_t out_pfx_len; + size_t out_len; + Heap* h = p->c->env->heap; + char* buf; + size_t k = 0; + Tok out; + if (!as || !bs) perr(p, "bad string literal in concatenation"); + if (ae != 0 && be != 0 && ae != be) { + perr(p, "concatenating string literals with incompatible " + "encoding prefixes"); + } + fused_enc = ae ? ae : be; + apfx = str_prefix_len(a.flags); + bpfx = str_prefix_len(b.flags); + /* Each spelling is `<prefix>"...content..."`; strip prefix and the two + * delimiting quotes. lexer guarantees at least the prefix + 2 quotes. */ + if (alen < apfx + 2 || as[apfx] != '"' || as[alen - 1] != '"' || + blen < bpfx + 2 || bs[bpfx] != '"' || bs[blen - 1] != '"') { + perr(p, "malformed string literal in concatenation"); + } + a_content_len = alen - apfx - 2; + b_content_len = blen - bpfx - 2; + /* Output prefix: pick from whichever token contributed the surviving + * encoding (a if a was prefixed, else b — also covers both-ordinary). */ + out_pfx_len = ae ? apfx : bpfx; + out_len = out_pfx_len + 1 + a_content_len + b_content_len + 1; + buf = (char*)h->alloc(h, out_len, 1); + if (!buf) perr(p, "out of memory fusing string literals"); + if (out_pfx_len) { + const char* src = ae ? 
as : bs; + memcpy(buf + k, src, out_pfx_len); + k += out_pfx_len; + } + buf[k++] = '"'; + if (a_content_len) { + memcpy(buf + k, as + apfx + 1, a_content_len); + k += a_content_len; + } + if (b_content_len) { + memcpy(buf + k, bs + bpfx + 1, b_content_len); + k += b_content_len; + } + buf[k++] = '"'; + out = a; + out.spelling = pool_intern(p->pool, buf, k); + out.flags = (u16)((a.flags & ~STR_ENC_MASK) | fused_enc); + /* The fused token is freshly minted from the pool; LitId from the lexer + * pertained only to the first piece. Clear it so any future LitInfo + * lookups don't return stale per-token data. */ + out.lit = LIT_NONE; + h->free(h, buf, 0); + return out; +} + +/* Pull one logical token from pp, collapsing adjacent TOK_STR runs into a + * single fused TOK_STR. The first non-TOK_STR token that terminates a run + * is parked in `pending` for the next call. */ +static Tok fetch_tok(Parser* p) { + Tok t; + if (p->has_pending) { + t = p->pending; + p->has_pending = 0; + } else { + t = pp_next(p->pp); + } + if (t.kind != TOK_STR) return t; + for (;;) { + Tok n = pp_next(p->pp); + if (n.kind != TOK_STR) { + p->pending = n; + p->has_pending = 1; + return t; + } + t = fuse_string_lits(p, t, n); + } +} + static void advance(Parser* p) { if (p->replay_active) { if (p->replay_pos < p->replay_len) { @@ -363,7 +475,7 @@ static void advance(Parser* p) { p->cur = p->next; p->has_next = 0; } else { - p->cur = pp_next(p->pp); + p->cur = fetch_tok(p); } } @@ -373,7 +485,7 @@ static Tok peek1(Parser* p) { return p->replay[p->replay_pos]; } if (!p->has_next) { - p->next = pp_next(p->pp); + p->next = fetch_tok(p); p->has_next = 1; } return p->next; @@ -5678,8 +5790,9 @@ void parse_c(Compiler* c, Pp* pp, DeclTable* decls, CG* cg, Debug* debug) { SF_ALLOC | SF_EXEC, 4u); /* Pull the first token. PP yields preprocessed C tokens; directives - * have already been consumed. */ - p.cur = pp_next(p.pp); + * have already been consumed. 
fetch_tok performs adjacent-string-literal + * fusion (C11 §6.4.5 ¶5) before tokens reach the parse productions. */ + p.cur = fetch_tok(&p); parse_translation_unit(&p); } diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.c b/test/parse/cases/6_4_5_01_string_concat_expr.c @@ -0,0 +1,10 @@ +/* Three-way adjacent string literal fusion per C11 §6.4.5 ¶5. Exercises + * the loop in fetch_tok so a run longer than two pieces collapses into + * one TOK_STR with the full concatenation as its spelling. + * + * Subscripting goes through a char[] (the direct-string-literal subscript + * codegen path is separately limited at nonzero offsets). */ +int test_main(void) { + char s[7] = "a" "bc" "def"; + return s[5]; /* 'f' = 102 */ +} diff --git a/test/parse/cases/6_4_5_01_string_concat_expr.expected b/test/parse/cases/6_4_5_01_string_concat_expr.expected @@ -0,0 +1 @@ +102 diff --git a/test/parse/cases/6_4_5_02_string_concat_init.c b/test/parse/cases/6_4_5_02_string_concat_init.c @@ -0,0 +1,4 @@ +int test_main(void) { + char s[] = "hi" "lo"; + return s[0] + s[1] + s[2] + s[3] + s[4]; +} diff --git a/test/parse/cases/6_4_5_02_string_concat_init.expected b/test/parse/cases/6_4_5_02_string_concat_init.expected @@ -0,0 +1 @@ +172