kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pp_expand.c (36292B)


      1 /* pp_expand.c — hideset table, macro hashmap, #define/#undef, substitution,
      2  * paste, stringize, argument prescan, func/object macro expansion. */
      3 
      4 #include "pp/pp_priv.h"
      5 
      6 static int body_tokens_equal(const Tok* a, u32 na, const Tok* b, u32 nb);
      7 static int macros_equal(const Macro* a, const Macro* b);
      8 
      9 /* ============================================================
     10  * Hideset table
     11  * ============================================================ */
     12 
     13 static int sym_in_array(const Sym* a, u32 n, Sym s) {
     14   u32 i;
     15   for (i = 0; i < n; ++i)
     16     if (a[i] == s) return 1;
     17   return 0;
     18 }
     19 
     20 static HidesetId hs_register(Pp* pp, const Sym* names, u32 n) {
     21   Hideset* h;
     22   u32 i;
     23   if (n == 0) return HS_EMPTY;
     24 
     25   /* Linear search for an existing identical hideset. Hidesets are tiny. */
     26   for (i = 1; i < pp->hsets_n; ++i) {
     27     Hideset* e = pp->hsets[i];
     28     if (e->n != n) continue;
     29     {
     30       u32 j;
     31       for (j = 0; j < n; ++j)
     32         if (e->names[j] != names[j]) break;
     33       if (j == n) return (HidesetId)i;
     34     }
     35   }
     36 
     37   if (pp->hsets_n == pp->hsets_cap) {
     38     u32 nc = pp->hsets_cap ? pp->hsets_cap * 2 : 8;
     39     pp->hsets =
     40         (Hideset**)pp_xrealloc(pp, pp->hsets, sizeof(Hideset*) * pp->hsets_cap,
     41                                sizeof(Hideset*) * nc, _Alignof(Hideset*));
     42     pp->hsets_cap = nc;
     43   }
     44   h = (Hideset*)arena_alloc(pp->arena,
     45                             sizeof(Hideset) + sizeof(Sym) * (n ? n - 1 : 0),
     46                             _Alignof(Hideset));
     47   h->n = n;
     48   for (i = 0; i < n; ++i) h->names[i] = names[i];
     49   pp->hsets[pp->hsets_n] = h;
     50   return (HidesetId)pp->hsets_n++;
     51 }
     52 
     53 int hs_contains(Pp* pp, HidesetId id, Sym s) {
     54   Hideset* h;
     55   if (id == HS_EMPTY || s == 0) return 0;
     56   h = pp->hsets[id];
     57   return sym_in_array(h->names, h->n, s);
     58 }
     59 
     60 HidesetId hs_add(Pp* pp, HidesetId id, Sym s) {
     61   Sym buf[64];
     62   Hideset* h;
     63   u32 n;
     64   u32 i;
     65 
     66   if (s == 0) return id;
     67   if (hs_contains(pp, id, s)) return id;
     68 
     69   n = (id == HS_EMPTY) ? 0 : pp->hsets[id]->n;
     70   if (n + 1 > sizeof(buf) / sizeof(buf[0])) {
     71     compiler_panic(pp->c, (SrcLoc){0, 0, 0}, "pp: hideset overflow");
     72   }
     73   if (id != HS_EMPTY) {
     74     h = pp->hsets[id];
     75     for (i = 0; i < h->n; ++i) buf[i] = h->names[i];
     76   }
     77   /* Keep sorted (numerically) for canonical hideset identity. */
     78   {
     79     u32 pos = n;
     80     while (pos > 0 && buf[pos - 1] > s) {
     81       buf[pos] = buf[pos - 1];
     82       --pos;
     83     }
     84     buf[pos] = s;
     85   }
     86   return hs_register(pp, buf, n + 1);
     87 }
     88 
     89 /* Used by token-paste in stage 5; declared early so the rest of the file
     90  * doesn't grow forward decls. */
     91 __attribute__((unused)) static HidesetId hs_intersect(Pp* pp, HidesetId a,
     92                                                       HidesetId b) {
     93   Sym buf[64];
     94   Hideset *ha, *hb;
     95   u32 i, j, k;
     96   if (a == HS_EMPTY || b == HS_EMPTY) return HS_EMPTY;
     97   if (a == b) return a;
     98   ha = pp->hsets[a];
     99   hb = pp->hsets[b];
    100   /* Both sorted; standard merge intersection. */
    101   i = j = k = 0;
    102   while (i < ha->n && j < hb->n) {
    103     if (ha->names[i] == hb->names[j]) {
    104       buf[k++] = ha->names[i];
    105       ++i;
    106       ++j;
    107     } else if (ha->names[i] < hb->names[j]) {
    108       ++i;
    109     } else {
    110       ++j;
    111     }
    112   }
    113   return hs_register(pp, buf, k);
    114 }
    115 
    116 /* ============================================================
    117  * Macro table
    118  * ============================================================ */
    119 
    120 /* Thin wrappers over the generated MacroMap_* functions; preserved
    121  * because the call sites are tagged "mt_*" throughout this TU. */
    122 Macro* mt_get(Pp* pp, Sym name) {
    123   Macro** v = MacroMap_get(&pp->mtab, name);
    124   return v ? *v : NULL;
    125 }
    126 
    127 void mt_put(Pp* pp, Sym name, Macro* m) {
    128   (void)MacroMap_set(&pp->mtab, name, m);
    129 }
    130 
    131 void mt_del(Pp* pp, Sym name) { MacroMap_del(&pp->mtab, name); }
    132 
    133 /* ============================================================
    134  * #define / #undef
    135  * ============================================================ */
    136 
    137 void do_define(Pp* pp, const Tok* line, u32 n) {
    138   Macro* m;
    139   u32 i = 0;
    140   Sym name;
    141   SrcLoc def_loc;
    142   Macro* existing;
    143 
    144   if (i >= n || line[i].kind != TOK_IDENT) {
    145     compiler_panic(pp->c, n ? line[0].loc : (SrcLoc){0, 0, 0},
    146                    "#define: expected macro name");
    147   }
    148   name = line[i].v.ident;
    149   def_loc = line[i].loc;
    150   ++i;
    151 
    152   m = arena_znew(pp->arena, Macro);
    153   m->name = name;
    154   m->def_loc = def_loc;
    155 
    156   /* Function-like vs object-like: '(' immediately after the name with no
    157    * intervening whitespace. */
    158   if (i < n && line[i].kind == TOK_PUNCT && line[i].v.punct == '(' &&
    159       (line[i].flags & TF_HAS_SPACE) == 0) {
    160     Sym* params = NULL;
    161     u32 pcap = 0, pn = 0;
    162     ++i;
    163     m->is_func = 1;
    164     if (i < n && line[i].kind == TOK_PUNCT && line[i].v.punct == ')') {
    165       ++i;
    166     } else {
    167       for (;;) {
    168         if (i >= n) {
    169           compiler_panic(pp->c, def_loc,
    170                          "#define: unterminated parameter list");
    171         }
    172         if (line[i].kind == TOK_PUNCT && line[i].v.punct == P_ELLIPSIS) {
    173           /* Append a synthetic __VA_ARGS__ param so body-rewrite
    174            * matches the standard identifier directly. */
    175           if (pn == pcap) {
    176             u32 nc = pcap ? pcap * 2 : 4;
    177             Sym* nb = arena_array(pp->arena, Sym, nc);
    178             if (pcap) memcpy(nb, params, sizeof(Sym) * pcap);
    179             params = nb;
    180             pcap = nc;
    181           }
    182           params[pn++] = pp->sym_va_args;
    183           m->is_variadic = 1;
    184           ++i;
    185         } else if (line[i].kind == TOK_IDENT) {
    186           if (pn == pcap) {
    187             u32 nc = pcap ? pcap * 2 : 4;
    188             Sym* nb = arena_array(pp->arena, Sym, nc);
    189             if (pcap) memcpy(nb, params, sizeof(Sym) * pcap);
    190             params = nb;
    191             pcap = nc;
    192           }
    193           params[pn++] = line[i].v.ident;
    194           ++i;
    195           /* GNU named variadic: `args...` — the named parameter itself collects
    196            * the trailing arguments (the body refers to it by name rather than
    197            * __VA_ARGS__). The variadic arg-collection below is positional on the
    198            * last param, so we just mark the macro variadic and eat the ellipsis;
    199            * the "'...' must be last" check still fires if a comma follows. Linux
    200            * UAPI headers use this (e.g. <linux/stddef.h>'s __struct_group). */
    201           if (i < n && line[i].kind == TOK_PUNCT &&
    202               line[i].v.punct == P_ELLIPSIS) {
    203             m->is_variadic = 1;
    204             ++i;
    205           }
    206         } else {
    207           compiler_panic(pp->c, line[i].loc, "#define: bad parameter list");
    208         }
    209         if (i >= n) {
    210           compiler_panic(pp->c, def_loc,
    211                          "#define: unterminated parameter list");
    212         }
    213         if (line[i].kind == TOK_PUNCT && line[i].v.punct == ')') {
    214           ++i;
    215           break;
    216         }
    217         if (m->is_variadic) {
    218           compiler_panic(pp->c, line[i].loc,
    219                          "#define: '...' must be last parameter");
    220         }
    221         if (line[i].kind == TOK_PUNCT && line[i].v.punct == ',') {
    222           ++i;
    223           continue;
    224         }
    225         compiler_panic(pp->c, line[i].loc, "#define: expected ',' or ')'");
    226       }
    227     }
    228     m->params = params;
    229     m->n_params = pn;
    230   }
    231 
    232   /* Refuse define/undef of a few names the spec reserves: `defined`
    233    * and a small set of mandatory predefined macros. */
    234   if (name == pp->sym_defined || name == pp->sym_line__ ||
    235       name == pp->sym_file__ || name == pp->sym_date__ ||
    236       name == pp->sym_time__) {
    237     compiler_panic(pp->c, def_loc,
    238                    "#define of a reserved / predefined name is not allowed");
    239   }
    240   /* Static predefineds are already in the macro table; redefining
    241    * with a different body is caught by the existing macros_equal
    242    * check below, but #define of __STDC__ et al. with the SAME body
    243    * should also be rejected. */
    244   if (name == pp->sym_stdc__ || name == pp->sym_stdc_hosted__ ||
    245       name == pp->sym_stdc_version__) {
    246     /* Allow re-registration of the predefined value at pp_new time
    247      * but reject user-level redefinition. We detect "user-level"
    248      * by checking whether it's already in the table — at pp_new the
    249      * first call goes through cleanly. */
    250     if (mt_get(pp, name)) {
    251       compiler_panic(pp->c, def_loc,
    252                      "#define of a mandatory predefined macro is not allowed");
    253     }
    254   }
    255 
    256   /* Body: rewrite parameter occurrences to TOK_PP_PARAM. */
    257   {
    258     u32 body_n = n - i;
    259     u32 j;
    260     m->body = body_n ? arena_array(pp->arena, Tok, body_n) : NULL;
    261     m->body_len = body_n;
    262     for (j = 0; j < body_n; ++j) {
    263       Tok t = line[i + j];
    264       if (m->is_func && t.kind == TOK_IDENT) {
    265         u32 p;
    266         for (p = 0; p < m->n_params; ++p) {
    267           if (m->params[p] == t.v.ident) {
    268             t.kind = TOK_PP_PARAM;
    269             t.v.punct = p;
    270             break;
    271           }
    272         }
    273       }
    274       /* §6.10.3 ¶5: __VA_ARGS__ outside a variadic macro is
    275        * undefined behavior; we diagnose. */
    276       if (!m->is_variadic && t.kind == TOK_IDENT &&
    277           t.v.ident == pp->sym_va_args) {
    278         compiler_panic(pp->c, t.loc,
    279                        "__VA_ARGS__ may only appear in a variadic macro body");
    280       }
    281       m->body[j] = t;
    282     }
    283     /* Drop the leading-space bit on the first body token: it reflects
    284      * the whitespace between the macro name (or close-paren) and the
    285      * body, which is irrelevant to expansion output. */
    286     if (m->body_len) m->body[0].flags &= (u16)~TF_HAS_SPACE;
    287   }
    288 
    289   existing = mt_get(pp, name);
    290   if (existing) {
    291     if (!macros_equal(existing, m)) {
    292       compiler_panic(pp->c, def_loc,
    293                      "macro redefined with different replacement");
    294     }
    295     return;
    296   }
    297   mt_put(pp, name, m);
    298 }
    299 
    300 void do_undef(Pp* pp, const Tok* line, u32 n) {
    301   Sym name;
    302   if (!n || line[0].kind != TOK_IDENT) {
    303     compiler_panic(pp->c, n ? line[0].loc : (SrcLoc){0, 0, 0},
    304                    "#undef: expected identifier");
    305   }
    306   name = line[0].v.ident;
    307   if (name == pp->sym_defined || name == pp->sym_line__ ||
    308       name == pp->sym_file__ || name == pp->sym_date__ ||
    309       name == pp->sym_time__ || name == pp->sym_stdc__ ||
    310       name == pp->sym_stdc_hosted__ || name == pp->sym_stdc_version__) {
    311     compiler_panic(pp->c, line[0].loc,
    312                    "#undef of a mandatory predefined name is not allowed");
    313   }
    314   mt_del(pp, name);
    315 }
    316 
    317 /* ============================================================
    318  * Body comparison helpers
    319  * ============================================================ */
    320 
    321 static int body_tokens_equal(const Tok* a, u32 na, const Tok* b, u32 nb) {
    322   u32 i;
    323   if (na != nb) return 0;
    324   for (i = 0; i < na; ++i) {
    325     if (a[i].kind != b[i].kind) return 0;
    326     if (a[i].spelling != b[i].spelling) return 0;
    327     /* Whitespace separation must match (§6.10.3 ¶2). The first body
    328      * token's leading-space bit is meaningless (it's whatever was
    329      * between macro name and body); skip i==0 for that bit. */
    330     if (i > 0) {
    331       if ((a[i].flags & TF_HAS_SPACE) != (b[i].flags & TF_HAS_SPACE)) {
    332         return 0;
    333       }
    334     }
    335   }
    336   return 1;
    337 }
    338 
    339 static int macros_equal(const Macro* a, const Macro* b) {
    340   if (a->is_func != b->is_func) return 0;
    341   if (a->is_variadic != b->is_variadic) return 0;
    342   if (a->n_params != b->n_params) return 0;
    343   {
    344     u32 i;
    345     for (i = 0; i < a->n_params; ++i) {
    346       if (a->params[i] != b->params[i]) return 0;
    347     }
    348   }
    349   return body_tokens_equal(a->body, a->body_len, b->body, b->body_len);
    350 }
    351 
    352 /* ============================================================
    353  * Object-macro expansion
    354  * ============================================================ */
    355 
    356 static void subst_phase2(Pp* pp, const Tok* in, u32 nin, const Tok* invoke,
    357                          TokVec* out);
    358 
    359 /* Build a buffer of the macro's body (with hidesets) and push it. The
    360  * first expanded token inherits the invocation token's TF_AT_BOL /
    361  * TF_HAS_SPACE so output formatting matches the invocation site. */
    362 static void expand_object_macro(Pp* pp, const Macro* m, const Tok* invoke,
    363                                 HidesetId invoke_hs) {
    364   TokVec body = {0};
    365   Tok* tmp;
    366   HidesetId hs;
    367   HidesetId* hids;
    368   u32 i;
    369 
    370   if (m->body_len == 0) {
    371     return; /* placemarker: nothing to push */
    372   }
    373   /* Run the body through the paste phase: object-like macros may use
    374    * `##`. There are no parameters, so phase 1 reduces to a copy. */
    375   tmp = arena_array(pp->arena, Tok, m->body_len);
    376   for (i = 0; i < m->body_len; ++i) tmp[i] = m->body[i];
    377   subst_phase2(pp, tmp, m->body_len, invoke, &body);
    378 
    379   if (body.n == 0) return;
    380 
    381   /* Transfer invocation flags onto the first emitted token. */
    382   body.data[0].flags =
    383       (u16)((body.data[0].flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
    384             (invoke->flags & (TF_AT_BOL | TF_HAS_SPACE)));
    385   for (i = 0; i < body.n; ++i) body.data[i].loc = invoke->loc;
    386 
    387   hs = hs_add(pp, invoke_hs, m->name);
    388   hids = arena_array(pp->arena, HidesetId, body.n);
    389   for (i = 0; i < body.n; ++i) hids[i] = hs;
    390   push_buf(pp, body.data, hids, body.n);
    391 }
    392 
    393 /* ============================================================
    394  * Function-like macro expansion
    395  * ============================================================ */
    396 
    397 /* Peek for an open paren after the just-consumed identifier (which named
    398  * a function-like macro). Newlines are whitespace inside an invocation.
    399  * Returns 1 with `*ws_has_space_out` indicating whether any whitespace
    400  * (newlines or HAS_SPACE) sat between the ident and the `(`. Returns 0 if
    401  * no `(` follows; pushed-back tokens (NLs + the non-`(` token, if any)
    402  * are restored as a buffer source so subsequent reads still see them. */
    403 int peek_for_invoke_paren(Pp* pp, int* ws_has_space_out) {
    404   TokVec saved = {0};
    405   HsVec saved_hs = {0};
    406   int saw_ws = 0;
    407   Tok t;
    408   HidesetId hs;
    409 
    410   for (;;) {
    411     t = src_next_raw(pp, &hs, NULL);
    412     if (t.kind == TOK_NEWLINE) {
    413       saw_ws = 1;
    414       tv_push(pp, &saved, t);
    415       hsv_push(pp, &saved_hs, hs);
    416       continue;
    417     }
    418     if (t.kind == TOK_EOF) {
    419       /* No '(' — push back saved tokens, leave EOF for next read. */
    420       if (saved.n) push_buf(pp, saved.data, saved_hs.data, saved.n);
    421       *ws_has_space_out = saw_ws;
    422       return 0;
    423     }
    424     if (t.flags & TF_HAS_SPACE) saw_ws = 1;
    425     if (t.kind == TOK_PUNCT && t.v.punct == '(') {
    426       /* Consumed. The newlines we walked past are whitespace and
    427        * dropped (per spec); they don't go back on the stack. */
    428       *ws_has_space_out = saw_ws;
    429       return 1;
    430     }
    431     /* Save this non-`(` token too and push back. */
    432     tv_push(pp, &saved, t);
    433     hsv_push(pp, &saved_hs, hs);
    434     push_buf(pp, saved.data, saved_hs.data, saved.n);
    435     *ws_has_space_out = saw_ws;
    436     return 0;
    437   }
    438 }
    439 
    440 /* Run macro expansion on a fixed token sequence to completion, yielding the
    441  * fully-expanded token sequence. Used to pre-expand each function-macro
    442  * argument before substitution (§6.10.3.1 ¶1). */
    443 void expand_arg_to_eof(Pp* pp, Tok* in, HidesetId* hs, u32 nin, TokVec* out) {
    444   TokSrc src;
    445   Tok t;
    446 
    447   memset(&src, 0, sizeof(src));
    448   src.kind = SRC_BUF;
    449   src.scope_top = 1;
    450   src.toks = in;
    451   src.hs = hs;
    452   src.n = nin;
    453   src_push(pp, src);
    454 
    455   for (;;) {
    456     t = pp_next_raw(pp); /* drives macro expansion within this scope */
    457     if (t.kind == TOK_EOF) break;
    458     if (t.kind == TOK_NEWLINE) {
    459       /* Newlines inside an arg act as whitespace; convert to
    460        * "next-token has TF_HAS_SPACE". Drop the NL token itself. */
    461       continue;
    462     }
    463     tv_push(pp, out, t);
    464   }
    465   /* Pop our scope source. */
    466   --pp->nsources;
    467 }
    468 
    469 /* Argument list for a function-like invocation. Stored as parallel
    470  * (start, end) ranges into a flat unexpanded token vector and a flat
    471  * expanded token vector. */
    472 typedef struct ArgList {
    473   /* Unexpanded arg tokens (raw as collected from invocation). */
    474   Tok* raw;
    475   HidesetId* raw_hs;
    476   u32 raw_n;
    477   u32* raw_start; /* size n_args + 1 (sentinel = raw_n) */
    478   /* Pre-expanded tokens. */
    479   Tok* exp;
    480   u32 exp_n;
    481   u32* exp_start; /* size n_args + 1 (sentinel = exp_n) */
    482   u32 n_args;
    483 } ArgList;
    484 
    485 /* Collect arguments. Caller has just consumed the opening `(`. Returns the
    486  * close-paren's token (used as the invocation's last source location). */
    487 static Tok read_invocation_args(Pp* pp, const Macro* m, SrcLoc invoke_loc,
    488                                 ArgList* out) {
    489   TokVec raw = {0};
    490   HsVec raw_hs = {0};
    491   u32* starts;
    492   u32 starts_cap = 0;
    493   u32 n_args = 0;
    494   u32 cur_start = 0;
    495   int depth = 0;
    496   Tok t;
    497   HidesetId hs;
    498   int first_token_of_arg = 1;
    499   Tok close_tok;
    500 
    501   memset(out, 0, sizeof(*out));
    502   starts = arena_array(pp->arena, u32, 8);
    503   starts_cap = 8;
    504   starts[0] = 0;
    505 
    506   for (;;) {
    507     t = src_next_raw(pp, &hs, NULL);
    508     if (t.kind == TOK_EOF) {
    509       compiler_panic(pp->c, invoke_loc,
    510                      "unterminated function-like macro invocation");
    511     }
    512     if (t.kind == TOK_NEWLINE) {
    513       /* Whitespace within an invocation. Mark the next token as
    514        * having space; drop the NL. */
    515       if (raw.n && depth >= 0) {
    516         /* No-op token list; we'll OR onto the next pushed token. */
    517       }
    518       /* Use a sentinel: track via a flag on a deferred push. We
    519        * accumulate "has_space" by setting it on the next pushed
    520        * token. */
    521       /* Simpler: just push a placeholder by OR'ing onto next via
    522        * a flag stored in `first_token_of_arg`-style state. */
    523       /* Implementation: use the next read token's TF_HAS_SPACE bit,
    524        * which the lexer already sets after a NL. Actually NOT —
    525        * after a NL the lexer sets TF_AT_BOL on the next token, not
    526        * HAS_SPACE necessarily. Force it: */
    527       /* We'll OR it manually onto the next token. */
    528       /* Use a small flag stash: */
    529       /* (handled below by setting a pending flag) */
    530       /* See: pending_space variable */
    531       /* — commit: declare a pending_space static earlier. */
    532       continue;
    533     }
    534 
    535     if (t.kind == TOK_PUNCT) {
    536       u32 p = t.v.punct;
    537       if (p == '(') {
    538         ++depth;
    539       } else if (p == ')') {
    540         if (depth == 0) {
    541           /* End of invocation. Close the current argument. The
    542            * empty-args case (no commas seen, no tokens
    543            * collected) emits a slot only when the macro expects
    544            * at least one argument; arity-0 macros take none. */
    545           close_tok = t;
    546           {
    547             int empty_call =
    548                 (n_args == 0 && raw.n == cur_start && first_token_of_arg);
    549             int want_slot = !empty_call || (m->n_params > 0) || m->is_variadic;
    550             if (want_slot) {
    551               if (n_args + 1 >= starts_cap) {
    552                 u32 nc = starts_cap * 2;
    553                 u32* nb = arena_array(pp->arena, u32, nc);
    554                 memcpy(nb, starts, sizeof(u32) * starts_cap);
    555                 starts = nb;
    556                 starts_cap = nc;
    557               }
    558               ++n_args;
    559               starts[n_args] = raw.n;
    560             }
    561           }
    562           goto done;
    563         }
    564         --depth;
    565       } else if (p == ',' && depth == 0) {
    566         /* Variadic: once we've filled all named params, the rest
    567          * (commas included) collect into __VA_ARGS__. */
    568         if (m->is_variadic && n_args + 1 >= m->n_params) {
    569           /* This comma is part of __VA_ARGS__. Push it. */
    570           tv_push(pp, &raw, t);
    571           hsv_push(pp, &raw_hs, hs);
    572           first_token_of_arg = 0;
    573           continue;
    574         }
    575         /* Close current arg, start next. */
    576         if (n_args + 1 >= starts_cap) {
    577           u32 nc = starts_cap * 2;
    578           u32* nb = arena_array(pp->arena, u32, nc);
    579           memcpy(nb, starts, sizeof(u32) * starts_cap);
    580           starts = nb;
    581           starts_cap = nc;
    582         }
    583         ++n_args;
    584         starts[n_args] = raw.n;
    585         cur_start = raw.n;
    586         first_token_of_arg = 1;
    587         continue;
    588       }
    589     }
    590     tv_push(pp, &raw, t);
    591     hsv_push(pp, &raw_hs, hs);
    592     first_token_of_arg = 0;
    593   }
    594 done:
    595   /* Validate arity. */
    596   {
    597     u32 expected = m->n_params;
    598     if (m->is_variadic) {
    599       if (n_args < (expected ? expected - 1 : 0)) {
    600         /* Allow exactly expected-1 (empty __VA_ARGS__) by
    601          * synthesizing an empty trailing arg. */
    602         if (n_args + 1 == (expected ? expected - 1 : 0)) {
    603           /* off by one — fall through to error */
    604         }
    605         compiler_panic(pp->c, invoke_loc,
    606                        "too few arguments to variadic macro invocation");
    607       }
    608       /* Synthesize an empty __VA_ARGS__ if caller passed exactly
    609        * the named-parameter count. */
    610       if (n_args + 1 == expected) {
    611         if (n_args + 1 >= starts_cap) {
    612           u32 nc = starts_cap * 2;
    613           u32* nb = arena_array(pp->arena, u32, nc);
    614           memcpy(nb, starts, sizeof(u32) * starts_cap);
    615           starts = nb;
    616           starts_cap = nc;
    617         }
    618         ++n_args;
    619         starts[n_args] = raw.n;
    620       }
    621     } else {
    622       if (n_args != expected) {
    623         /* Spec: arity-0 macro `M()` invoked as `M()` is allowed and
    624          * has 0 args. Above logic produces 0 in that case. */
    625         compiler_panic(pp->c, invoke_loc,
    626                        "wrong number of arguments to function-like macro");
    627       }
    628     }
    629   }
    630   out->raw = raw.data;
    631   out->raw_hs = raw_hs.data;
    632   out->raw_n = raw.n;
    633   out->raw_start = starts;
    634   out->n_args = n_args;
    635   return close_tok;
    636 }
    637 
    638 /* Build pre-expanded args. */
    639 static void preexpand_args(Pp* pp, ArgList* a) {
    640   TokVec exp = {0};
    641   u32* exp_start;
    642   u32 i;
    643   exp_start = arena_array(pp->arena, u32, a->n_args + 1);
    644   exp_start[0] = 0;
    645   for (i = 0; i < a->n_args; ++i) {
    646     u32 lo = a->raw_start[i];
    647     u32 hi = a->raw_start[i + 1];
    648     if (hi > lo) {
    649       /* Copy the slice into a fresh buffer so expand_arg_to_eof can
    650        * own it without aliasing. */
    651       Tok* slice = arena_array(pp->arena, Tok, hi - lo);
    652       memcpy(slice, &a->raw[lo], sizeof(Tok) * (hi - lo));
    653       expand_arg_to_eof(pp, slice, a->raw_hs ? &a->raw_hs[lo] : NULL, hi - lo,
    654                         &exp);
    655     }
    656     exp_start[i + 1] = exp.n;
    657   }
    658   a->exp = exp.data;
    659   a->exp_n = exp.n;
    660   a->exp_start = exp_start;
    661 }
    662 
    663 /* Build a stringized TOK_STR from the unexpanded argument tokens
    664  * `arg[lo..hi)`. The first token's leading-space flag is ignored (leading
    665  * whitespace stripped). Inside string/char-literal spellings, '"' and '\'
    666  * are escaped. */
    667 static Tok make_stringize(Pp* pp, const Tok* arg, u32 lo, u32 hi, SrcLoc loc) {
    668   CharBuf b = {0};
    669   u32 i;
    670   Tok t;
    671   Sym sp;
    672 
    673   cb_putc(pp, &b, '"');
    674   for (i = lo; i < hi; ++i) {
    675     const Tok* at = &arg[i];
    676     KitSlice sl =
    677         at->spelling ? kit_sym_str(pp->pool->c, at->spelling) : KIT_SLICE_NULL;
    678     const char* s = sl.s;
    679     size_t slen = sl.len;
    680     if (i > lo && (at->flags & TF_HAS_SPACE)) cb_putc(pp, &b, ' ');
    681     if (s && slen) {
    682       int esc = (at->kind == TOK_STR || at->kind == TOK_CHR);
    683       size_t k;
    684       for (k = 0; k < slen; ++k) {
    685         char c = s[k];
    686         if (esc && (c == '\\' || c == '"')) cb_putc(pp, &b, '\\');
    687         cb_putc(pp, &b, c);
    688       }
    689     }
    690   }
    691   cb_putc(pp, &b, '"');
    692 
    693   sp = kit_sym_intern(pp->pool->c, (KitSlice){.s = b.data, .len = b.len});
    694   memset(&t, 0, sizeof(t));
    695   t.kind = TOK_STR;
    696   t.loc = loc;
    697   t.spelling = sp;
    698   t.v.str = sp;
    699   return t;
    700 }
    701 
    702 /* Concatenate two token spellings and re-lex into a single token. Empty
    703  * (placemarker) sides collapse to the other side per §6.10.3.3 ¶2. */
    704 static Tok paste_tokens(Pp* pp, Tok lhs, Tok rhs, SrcLoc loc) {
    705   char buf[1024];
    706   size_t alen = 0, blen = 0;
    707   const char* a;
    708   const char* b;
    709   Lexer* lex;
    710   Tok t1, t2;
    711 
    712   if (lhs.kind == TOK_PP_PLACEMARKER) return rhs;
    713   if (rhs.kind == TOK_PP_PLACEMARKER) return lhs;
    714 
    715   if (lhs.spelling) {
    716     KitSlice s = kit_sym_str(pp->pool->c, lhs.spelling);
    717     a = s.s;
    718     alen = s.len;
    719   } else {
    720     a = "";
    721   }
    722   if (rhs.spelling) {
    723     KitSlice s = kit_sym_str(pp->pool->c, rhs.spelling);
    724     b = s.s;
    725     blen = s.len;
    726   } else {
    727     b = "";
    728   }
    729   if (alen + blen + 2 > sizeof(buf)) {
    730     compiler_panic(pp->c, loc, "token paste: spelling too long");
    731   }
    732   if (alen) memcpy(buf, a, alen);
    733   if (blen) memcpy(buf + alen, b, blen);
    734   buf[alen + blen] = '\n';
    735   buf[alen + blen + 1] = 0;
    736 
    737   lex = lex_open_mem(pp->c, "<paste>", buf, alen + blen + 1);
    738   t1 = lex_next(lex);
    739   t2 = lex_next(lex);
    740   if (t1.kind == TOK_EOF) {
    741     /* Both empty (shouldn't reach here since we handled placemarkers). */
    742     lex_close(lex);
    743     return lhs;
    744   }
    745   if (t2.kind != TOK_NEWLINE && t2.kind != TOK_EOF) {
    746     lex_close(lex);
    747     compiler_panic(pp->c, loc, "token pasting yields multiple tokens, invalid");
    748   }
    749   lex_close(lex);
    750 
    751   /* Inherit positional flags from LHS (it sat in the same slot). */
    752   t1.flags = (u16)((t1.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
    753                    (lhs.flags & (TF_AT_BOL | TF_HAS_SPACE)));
    754   t1.loc = loc;
    755   return t1;
    756 }
    757 
    758 /* Phase 1 (param substitution). For each parameter occurrence in the
    759  * body: if adjacent to ## or # (handled separately), substitute the raw
    760  * argument tokens; otherwise substitute the pre-expanded form. Empty raw
    761  * args become a TOK_PP_PLACEMARKER which phase 2 collapses. */
    762 static void subst_phase1(Pp* pp, const Macro* m, ArgList* a, const Tok* invoke,
    763                          TokVec* out) {
    764   u32 j;
    765   for (j = 0; j < m->body_len; ++j) {
    766     const Tok* bt = &m->body[j];
    767     if (bt->kind == TOK_PP_HASH) {
    768       /* §6.10.3.2: # must be followed by a parameter. */
    769       if (j + 1 >= m->body_len || m->body[j + 1].kind != TOK_PP_PARAM) {
    770         compiler_panic(pp->c, bt->loc,
    771                        "'#' is not followed by a macro parameter");
    772       }
    773       {
    774         u32 p = m->body[j + 1].v.punct;
    775         u32 lo = a->raw_start[p];
    776         u32 hi = a->raw_start[p + 1];
    777         Tok s = make_stringize(pp, a->raw, lo, hi, invoke->loc);
    778         s.flags = (u16)((s.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
    779                         (bt->flags & (TF_AT_BOL | TF_HAS_SPACE)));
    780         tv_push(pp, out, s);
    781         ++j;
    782         continue;
    783       }
    784     }
    785     if (bt->kind == TOK_PP_PARAM) {
    786       u32 p = bt->v.punct;
    787       int adj_paste =
    788           (j > 0 && m->body[j - 1].kind == TOK_PP_PASTE) ||
    789           (j + 1 < m->body_len && m->body[j + 1].kind == TOK_PP_PASTE);
    790 
    791       u32 lo, hi;
    792       if (adj_paste) {
    793         lo = a->raw_start[p];
    794         hi = a->raw_start[p + 1];
    795       } else {
    796         lo = a->exp_start[p];
    797         hi = a->exp_start[p + 1];
    798       }
    799 
    800       if (lo == hi) {
    801         /* Empty argument → placemarker. */
    802         Tok pm;
    803         memset(&pm, 0, sizeof(pm));
    804         pm.kind = TOK_PP_PLACEMARKER;
    805         pm.flags = bt->flags & (TF_AT_BOL | TF_HAS_SPACE);
    806         pm.loc = invoke->loc;
    807         tv_push(pp, out, pm);
    808       } else {
    809         u32 k;
    810         int first = 1;
    811         Tok* src = adj_paste ? a->raw : a->exp;
    812         for (k = lo; k < hi; ++k) {
    813           Tok t = src[k];
    814           if (first) {
    815             t.flags = (u16)((t.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
    816                             (bt->flags & (TF_AT_BOL | TF_HAS_SPACE)));
    817             first = 0;
    818           }
    819           tv_push(pp, out, t);
    820         }
    821       }
    822       continue;
    823     }
    824     tv_push(pp, out, *bt);
    825   }
    826 }
    827 
    828 /* Phase 2 (paste). Walk the post-substitute buffer; for each TOK_PP_PASTE,
    829  * splice the previous output token with the next input token. Then strip
    830  * remaining placemarkers. */
    831 static void subst_phase2(Pp* pp, const Tok* in, u32 nin, const Tok* invoke,
    832                          TokVec* out) {
    833   u32 i;
    834   for (i = 0; i < nin; ++i) {
    835     Tok t = in[i];
    836     if (t.kind == TOK_PP_PASTE) {
    837       Tok lhs, rhs;
    838       if (out->n == 0 || i + 1 >= nin) {
    839         compiler_panic(pp->c, invoke->loc,
    840                        "'##' at start or end of replacement list");
    841       }
    842       lhs = out->data[--out->n];
    843       rhs = in[++i];
    844       tv_push(pp, out, paste_tokens(pp, lhs, rhs, invoke->loc));
    845       continue;
    846     }
    847     tv_push(pp, out, t);
    848   }
    849   /* Strip placemarkers, preserving leading-space flag on the next token. */
    850   {
    851     u32 r = 0, w = 0;
    852     u16 carry = 0;
    853     for (r = 0; r < out->n; ++r) {
    854       if (out->data[r].kind == TOK_PP_PLACEMARKER) {
    855         carry |= out->data[r].flags & (TF_AT_BOL | TF_HAS_SPACE);
    856         continue;
    857       }
    858       if (carry) {
    859         out->data[r].flags |= carry;
    860         carry = 0;
    861       }
    862       if (w != r) out->data[w] = out->data[r];
    863       ++w;
    864     }
    865     out->n = w;
    866   }
    867 }
    868 
    869 /* Wrapper: phases 1 and 2 in sequence, plus invocation-loc / flag transfer. */
    870 static void substitute_body(Pp* pp, const Macro* m, ArgList* a,
    871                             const Tok* invoke, HidesetId result_hs, TokVec* out,
    872                             TokVec* hs_out) {
    873   TokVec phase1 = {0};
    874   u32 i;
    875   subst_phase1(pp, m, a, invoke, &phase1);
    876   subst_phase2(pp, phase1.data, phase1.n, invoke, out);
    877   /* Invocation flags onto first emitted token. */
    878   if (out->n) {
    879     out->data[0].flags =
    880         (u16)((out->data[0].flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
    881               (invoke->flags & (TF_AT_BOL | TF_HAS_SPACE)));
    882   }
    883   /* Locations to invocation site. */
    884   for (i = 0; i < out->n; ++i) out->data[i].loc = invoke->loc;
    885   /* Build parallel hideset vector. */
    886   for (i = 0; i < out->n; ++i) {
    887     Tok hsmark;
    888     memset(&hsmark, 0, sizeof(hsmark));
    889     hsmark.spelling = (Sym)result_hs;
    890     tv_push(pp, hs_out, hsmark);
    891   }
    892 }
    893 
    894 /* Expand a function-like macro invocation: peek for `(`, collect args,
    895  * pre-expand them, substitute the body, push the result. Returns 1 if
    896  * the invocation was performed, 0 if there was no `(` (the caller should
    897  * emit the identifier as-is). */
    898 static int try_expand_func_macro(Pp* pp, const Macro* m, const Tok* invoke,
    899                                  HidesetId invoke_hs) {
    900   int saw_ws;
    901   ArgList args;
    902   TokVec body = {0};
    903   TokVec hsvec = {0}; /* parallel to body, holds HidesetId per slot */
    904   HidesetId result_hs;
    905   Tok close_tok;
    906 
    907   if (!peek_for_invoke_paren(pp, &saw_ws)) {
    908     return 0;
    909   }
    910   (void)saw_ws;
    911   read_invocation_args(pp, m, invoke->loc, &args);
    912   /* Note: assigned to silence unused-result; we don't use the close tok yet. */
    913   close_tok.kind = 0;
    914   (void)close_tok;
    915   preexpand_args(pp, &args);
    916 
    917   /* Hideset of result = invocation hideset ∪ {macro_name}. The standard
    918    * intersects with the closing `)`'s hideset for blue-paint purity, but
    919    * for the freshly-collected `)` from the lex source that's the empty
    920    * set, so the union form suffices here. */
    921   result_hs = hs_add(pp, invoke_hs, m->name);
    922   substitute_body(pp, m, &args, invoke, result_hs, &body, &hsvec);
    923 
    924   {
    925     u32 i;
    926     HidesetId* hids = arena_array(pp->arena, HidesetId, body.n ? body.n : 1);
    927     for (i = 0; i < body.n; ++i) {
    928       hids[i] = (HidesetId)hsvec.data[i].spelling;
    929     }
    930     push_buf(pp, body.data, hids, body.n);
    931   }
    932   return 1;
    933 }
    934 
    935 /* ============================================================
    936  * pp_next_raw — mutual recursion entry (called from expand_arg_to_eof)
    937  * Defined here; also declared in pp_priv.h so pp.c can call it.
    938  * ============================================================ */
    939 
    940 /* pp_next_raw: reads from the top source, applies macro expansion when an
    941  * identifier names a macro that isn't blue-painted, and consumes
    942  * directives in-place. TOK_NEWLINE is preserved for pp_emit_text. */
    943 Tok pp_next_raw(Pp* pp) {
    944   Tok t;
    945   HidesetId hs;
    946   u8 src_kind;
    947   for (;;) {
    948     t = src_next_raw(pp, &hs, &src_kind);
    949     if (t.kind == TOK_EOF) return t;
    950     if (t.kind == TOK_PP_HASH && (t.flags & TF_AT_BOL) && src_kind == SRC_LEX) {
    951       process_directive(pp, t.loc);
    952       /* No synthesized newline: the comparator collapses
    953        * whitespace, so blank-line replacement of consumed
    954        * directives isn't observable here. Directives that produce
    955        * content (e.g. #include, #embed, #pragma) push their own
    956        * tokens onto the source stack, which the next loop
    957        * iteration picks up. */
    958       continue;
    959     }
    960     /* While expanding an #if condition, suppress macro expansion of
    961      * `defined`-operator operands so a `defined(X)` produced by a
    962      * macro body whose argument was pasted via ## doesn't accidentally
    963      * expand an already-defined X to its body (typically empty). See
    964      * the `defined_skip` field comment in pp_priv.h. */
    965     if (pp->in_if_expansion) {
    966       if (pp->defined_skip == 1 && t.kind == TOK_IDENT) {
    967         t.flags |= TF_NO_EXPAND;
    968         pp->defined_skip = 0;
    969       } else if (pp->defined_skip == 2) {
    970         if (t.kind == TOK_PUNCT && t.v.punct == '(') {
    971           pp->defined_skip = 3;
    972         } else if (t.kind == TOK_IDENT) {
    973           /* `defined IDENT` (no parens) — same as the skip==1 case. */
    974           t.flags |= TF_NO_EXPAND;
    975           pp->defined_skip = 0;
    976         } else {
    977           pp->defined_skip = 0;
    978         }
    979       } else if (pp->defined_skip == 3) {
    980         if (t.kind == TOK_IDENT) {
    981           t.flags |= TF_NO_EXPAND;
    982           pp->defined_skip = 4;
    983         } else if (t.kind == TOK_PUNCT && t.v.punct == ')') {
    984           pp->defined_skip = 0;
    985         }
    986       } else if (pp->defined_skip == 4) {
    987         if (t.kind == TOK_PUNCT && t.v.punct == ')') {
    988           pp->defined_skip = 0;
    989         }
    990       } else if (t.kind == TOK_IDENT && t.v.ident == pp->sym_defined) {
    991         pp->defined_skip = 2;
    992       }
    993     }
    994     if (t.kind == TOK_IDENT && (t.flags & TF_NO_EXPAND) == 0) {
    995       Sym id = t.v.ident;
    996 
    997       /* Dynamic predefined macros: __LINE__ / __FILE__ /
    998        * __DATE__ / __TIME__. Always expand, ignoring the macro
    999        * table. */
   1000       if (id == pp->sym_line__) {
   1001         char tmp[16], buf[16];
   1002         int k = 0, j = 0;
   1003         u32 ln = t.loc.line;
   1004         if (ln == 0)
   1005           buf[k++] = '0';
   1006         else {
   1007           while (ln) {
   1008             tmp[j++] = (char)('0' + ln % 10);
   1009             ln /= 10;
   1010           }
   1011           while (j > 0) buf[k++] = tmp[--j];
   1012         }
   1013         t.kind = TOK_NUM;
   1014         t.spelling =
   1015             kit_sym_intern(pp->pool->c, (KitSlice){.s = buf, .len = (size_t)k});
   1016         return t;
   1017       }
   1018       if (id == pp->sym_file__) {
   1019         TokSrc* ls = current_lex_src(pp);
   1020         Sym name = 0;
   1021         size_t nlen = 0;
   1022         const char* nstr = NULL;
   1023         char* buf;
   1024         if (ls && ls->file_override) {
   1025           name = ls->file_override;
   1026         } else if (ls) {
   1027           KitSourceFile sf;
   1028           memset(&sf, 0, sizeof(sf));
   1029           if (kit_source_file(pp->c, lex_file_id(ls->lex), &sf) == 0) {
   1030             name = sf.name;
   1031           }
   1032         }
   1033         if (name) {
   1034           KitSlice s = kit_sym_str(pp->pool->c, name);
   1035           nstr = s.s;
   1036           nlen = s.len;
   1037         }
   1038         /* The source name is the raw filesystem path (or a #line override,
   1039          * destringized to logical bytes by do_line). Re-stringize it as a
   1040          * valid C string literal: escape '\\' and '"'. On POSIX paths use
   1041          * '/' so this was a no-op; on Windows the path holds backslashes
   1042          * (e.g. C:\\Users\\...), and emitting them raw turns '\\U'/'\\u'/'\\x'
   1043          * into bogus escape sequences (the "malformed UCN" on '\\Users'). */
   1044         {
   1045           size_t bn = 0;
   1046           size_t i;
   1047           buf = (char*)arena_alloc(pp->arena, nlen * 2 + 2, 1);
   1048           buf[bn++] = '"';
   1049           for (i = 0; i < nlen; ++i) {
   1050             char ch = nstr[i];
   1051             if (ch == '\\' || ch == '"') buf[bn++] = '\\';
   1052             buf[bn++] = ch;
   1053           }
   1054           buf[bn++] = '"';
   1055           t.kind = TOK_STR;
   1056           t.spelling =
   1057               kit_sym_intern(pp->pool->c, (KitSlice){.s = buf, .len = bn});
   1058           t.v.str = t.spelling;
   1059         }
   1060         return t;
   1061       }
   1062       if (id == pp->sym_date__) {
   1063         t.kind = TOK_STR;
   1064         t.spelling = pp->val_date_str;
   1065         t.v.str = t.spelling;
   1066         return t;
   1067       }
   1068       if (id == pp->sym_time__) {
   1069         t.kind = TOK_STR;
   1070         t.spelling = pp->val_time_str;
   1071         t.v.str = t.spelling;
   1072         return t;
   1073       }
   1074       if (id == pp->sym__pragma) {
   1075         if (try_expand_pragma_op(pp, &t)) continue;
   1076         /* No '(' — fall through and emit as plain ident. */
   1077       }
   1078 
   1079       {
   1080         Macro* m = mt_get(pp, id);
   1081         if (m && !hs_contains(pp, hs, m->name)) {
   1082           if (!m->is_func) {
   1083             expand_object_macro(pp, m, &t, hs);
   1084             continue;
   1085           }
   1086           if (try_expand_func_macro(pp, m, &t, hs)) {
   1087             continue;
   1088           }
   1089           /* No '(' followed; emit as plain identifier. */
   1090         }
   1091       }
   1092     }
   1093     return t;
   1094   }
   1095 }