pp_expand.c (36292B)
1 /* pp_expand.c — hideset table, macro hashmap, #define/#undef, substitution, 2 * paste, stringize, argument prescan, func/object macro expansion. */ 3 4 #include "pp/pp_priv.h" 5 6 static int body_tokens_equal(const Tok* a, u32 na, const Tok* b, u32 nb); 7 static int macros_equal(const Macro* a, const Macro* b); 8 9 /* ============================================================ 10 * Hideset table 11 * ============================================================ */ 12 13 static int sym_in_array(const Sym* a, u32 n, Sym s) { 14 u32 i; 15 for (i = 0; i < n; ++i) 16 if (a[i] == s) return 1; 17 return 0; 18 } 19 20 static HidesetId hs_register(Pp* pp, const Sym* names, u32 n) { 21 Hideset* h; 22 u32 i; 23 if (n == 0) return HS_EMPTY; 24 25 /* Linear search for an existing identical hideset. Hidesets are tiny. */ 26 for (i = 1; i < pp->hsets_n; ++i) { 27 Hideset* e = pp->hsets[i]; 28 if (e->n != n) continue; 29 { 30 u32 j; 31 for (j = 0; j < n; ++j) 32 if (e->names[j] != names[j]) break; 33 if (j == n) return (HidesetId)i; 34 } 35 } 36 37 if (pp->hsets_n == pp->hsets_cap) { 38 u32 nc = pp->hsets_cap ? pp->hsets_cap * 2 : 8; 39 pp->hsets = 40 (Hideset**)pp_xrealloc(pp, pp->hsets, sizeof(Hideset*) * pp->hsets_cap, 41 sizeof(Hideset*) * nc, _Alignof(Hideset*)); 42 pp->hsets_cap = nc; 43 } 44 h = (Hideset*)arena_alloc(pp->arena, 45 sizeof(Hideset) + sizeof(Sym) * (n ? n - 1 : 0), 46 _Alignof(Hideset)); 47 h->n = n; 48 for (i = 0; i < n; ++i) h->names[i] = names[i]; 49 pp->hsets[pp->hsets_n] = h; 50 return (HidesetId)pp->hsets_n++; 51 } 52 53 int hs_contains(Pp* pp, HidesetId id, Sym s) { 54 Hideset* h; 55 if (id == HS_EMPTY || s == 0) return 0; 56 h = pp->hsets[id]; 57 return sym_in_array(h->names, h->n, s); 58 } 59 60 HidesetId hs_add(Pp* pp, HidesetId id, Sym s) { 61 Sym buf[64]; 62 Hideset* h; 63 u32 n; 64 u32 i; 65 66 if (s == 0) return id; 67 if (hs_contains(pp, id, s)) return id; 68 69 n = (id == HS_EMPTY) ? 
0 : pp->hsets[id]->n; 70 if (n + 1 > sizeof(buf) / sizeof(buf[0])) { 71 compiler_panic(pp->c, (SrcLoc){0, 0, 0}, "pp: hideset overflow"); 72 } 73 if (id != HS_EMPTY) { 74 h = pp->hsets[id]; 75 for (i = 0; i < h->n; ++i) buf[i] = h->names[i]; 76 } 77 /* Keep sorted (numerically) for canonical hideset identity. */ 78 { 79 u32 pos = n; 80 while (pos > 0 && buf[pos - 1] > s) { 81 buf[pos] = buf[pos - 1]; 82 --pos; 83 } 84 buf[pos] = s; 85 } 86 return hs_register(pp, buf, n + 1); 87 } 88 89 /* Used by token-paste in stage 5; declared early so the rest of the file 90 * doesn't grow forward decls. */ 91 __attribute__((unused)) static HidesetId hs_intersect(Pp* pp, HidesetId a, 92 HidesetId b) { 93 Sym buf[64]; 94 Hideset *ha, *hb; 95 u32 i, j, k; 96 if (a == HS_EMPTY || b == HS_EMPTY) return HS_EMPTY; 97 if (a == b) return a; 98 ha = pp->hsets[a]; 99 hb = pp->hsets[b]; 100 /* Both sorted; standard merge intersection. */ 101 i = j = k = 0; 102 while (i < ha->n && j < hb->n) { 103 if (ha->names[i] == hb->names[j]) { 104 buf[k++] = ha->names[i]; 105 ++i; 106 ++j; 107 } else if (ha->names[i] < hb->names[j]) { 108 ++i; 109 } else { 110 ++j; 111 } 112 } 113 return hs_register(pp, buf, k); 114 } 115 116 /* ============================================================ 117 * Macro table 118 * ============================================================ */ 119 120 /* Thin wrappers over the generated MacroMap_* functions; preserved 121 * because the call sites are tagged "mt_*" throughout this TU. */ 122 Macro* mt_get(Pp* pp, Sym name) { 123 Macro** v = MacroMap_get(&pp->mtab, name); 124 return v ? 
*v : NULL; 125 } 126 127 void mt_put(Pp* pp, Sym name, Macro* m) { 128 (void)MacroMap_set(&pp->mtab, name, m); 129 } 130 131 void mt_del(Pp* pp, Sym name) { MacroMap_del(&pp->mtab, name); } 132 133 /* ============================================================ 134 * #define / #undef 135 * ============================================================ */ 136 137 void do_define(Pp* pp, const Tok* line, u32 n) { 138 Macro* m; 139 u32 i = 0; 140 Sym name; 141 SrcLoc def_loc; 142 Macro* existing; 143 144 if (i >= n || line[i].kind != TOK_IDENT) { 145 compiler_panic(pp->c, n ? line[0].loc : (SrcLoc){0, 0, 0}, 146 "#define: expected macro name"); 147 } 148 name = line[i].v.ident; 149 def_loc = line[i].loc; 150 ++i; 151 152 m = arena_znew(pp->arena, Macro); 153 m->name = name; 154 m->def_loc = def_loc; 155 156 /* Function-like vs object-like: '(' immediately after the name with no 157 * intervening whitespace. */ 158 if (i < n && line[i].kind == TOK_PUNCT && line[i].v.punct == '(' && 159 (line[i].flags & TF_HAS_SPACE) == 0) { 160 Sym* params = NULL; 161 u32 pcap = 0, pn = 0; 162 ++i; 163 m->is_func = 1; 164 if (i < n && line[i].kind == TOK_PUNCT && line[i].v.punct == ')') { 165 ++i; 166 } else { 167 for (;;) { 168 if (i >= n) { 169 compiler_panic(pp->c, def_loc, 170 "#define: unterminated parameter list"); 171 } 172 if (line[i].kind == TOK_PUNCT && line[i].v.punct == P_ELLIPSIS) { 173 /* Append a synthetic __VA_ARGS__ param so body-rewrite 174 * matches the standard identifier directly. */ 175 if (pn == pcap) { 176 u32 nc = pcap ? pcap * 2 : 4; 177 Sym* nb = arena_array(pp->arena, Sym, nc); 178 if (pcap) memcpy(nb, params, sizeof(Sym) * pcap); 179 params = nb; 180 pcap = nc; 181 } 182 params[pn++] = pp->sym_va_args; 183 m->is_variadic = 1; 184 ++i; 185 } else if (line[i].kind == TOK_IDENT) { 186 if (pn == pcap) { 187 u32 nc = pcap ? 
pcap * 2 : 4; 188 Sym* nb = arena_array(pp->arena, Sym, nc); 189 if (pcap) memcpy(nb, params, sizeof(Sym) * pcap); 190 params = nb; 191 pcap = nc; 192 } 193 params[pn++] = line[i].v.ident; 194 ++i; 195 /* GNU named variadic: `args...` — the named parameter itself collects 196 * the trailing arguments (the body refers to it by name rather than 197 * __VA_ARGS__). The variadic arg-collection below is positional on the 198 * last param, so we just mark the macro variadic and eat the ellipsis; 199 * the "'...' must be last" check still fires if a comma follows. Linux 200 * UAPI headers use this (e.g. <linux/stddef.h>'s __struct_group). */ 201 if (i < n && line[i].kind == TOK_PUNCT && 202 line[i].v.punct == P_ELLIPSIS) { 203 m->is_variadic = 1; 204 ++i; 205 } 206 } else { 207 compiler_panic(pp->c, line[i].loc, "#define: bad parameter list"); 208 } 209 if (i >= n) { 210 compiler_panic(pp->c, def_loc, 211 "#define: unterminated parameter list"); 212 } 213 if (line[i].kind == TOK_PUNCT && line[i].v.punct == ')') { 214 ++i; 215 break; 216 } 217 if (m->is_variadic) { 218 compiler_panic(pp->c, line[i].loc, 219 "#define: '...' must be last parameter"); 220 } 221 if (line[i].kind == TOK_PUNCT && line[i].v.punct == ',') { 222 ++i; 223 continue; 224 } 225 compiler_panic(pp->c, line[i].loc, "#define: expected ',' or ')'"); 226 } 227 } 228 m->params = params; 229 m->n_params = pn; 230 } 231 232 /* Refuse define/undef of a few names the spec reserves: `defined` 233 * and a small set of mandatory predefined macros. */ 234 if (name == pp->sym_defined || name == pp->sym_line__ || 235 name == pp->sym_file__ || name == pp->sym_date__ || 236 name == pp->sym_time__) { 237 compiler_panic(pp->c, def_loc, 238 "#define of a reserved / predefined name is not allowed"); 239 } 240 /* Static predefineds are already in the macro table; redefining 241 * with a different body is caught by the existing macros_equal 242 * check below, but #define of __STDC__ et al. 
with the SAME body 243 * should also be rejected. */ 244 if (name == pp->sym_stdc__ || name == pp->sym_stdc_hosted__ || 245 name == pp->sym_stdc_version__) { 246 /* Allow re-registration of the predefined value at pp_new time 247 * but reject user-level redefinition. We detect "user-level" 248 * by checking whether it's already in the table — at pp_new the 249 * first call goes through cleanly. */ 250 if (mt_get(pp, name)) { 251 compiler_panic(pp->c, def_loc, 252 "#define of a mandatory predefined macro is not allowed"); 253 } 254 } 255 256 /* Body: rewrite parameter occurrences to TOK_PP_PARAM. */ 257 { 258 u32 body_n = n - i; 259 u32 j; 260 m->body = body_n ? arena_array(pp->arena, Tok, body_n) : NULL; 261 m->body_len = body_n; 262 for (j = 0; j < body_n; ++j) { 263 Tok t = line[i + j]; 264 if (m->is_func && t.kind == TOK_IDENT) { 265 u32 p; 266 for (p = 0; p < m->n_params; ++p) { 267 if (m->params[p] == t.v.ident) { 268 t.kind = TOK_PP_PARAM; 269 t.v.punct = p; 270 break; 271 } 272 } 273 } 274 /* §6.10.3 ¶5: __VA_ARGS__ outside a variadic macro is 275 * undefined behavior; we diagnose. */ 276 if (!m->is_variadic && t.kind == TOK_IDENT && 277 t.v.ident == pp->sym_va_args) { 278 compiler_panic(pp->c, t.loc, 279 "__VA_ARGS__ may only appear in a variadic macro body"); 280 } 281 m->body[j] = t; 282 } 283 /* Drop the leading-space bit on the first body token: it reflects 284 * the whitespace between the macro name (or close-paren) and the 285 * body, which is irrelevant to expansion output. */ 286 if (m->body_len) m->body[0].flags &= (u16)~TF_HAS_SPACE; 287 } 288 289 existing = mt_get(pp, name); 290 if (existing) { 291 if (!macros_equal(existing, m)) { 292 compiler_panic(pp->c, def_loc, 293 "macro redefined with different replacement"); 294 } 295 return; 296 } 297 mt_put(pp, name, m); 298 } 299 300 void do_undef(Pp* pp, const Tok* line, u32 n) { 301 Sym name; 302 if (!n || line[0].kind != TOK_IDENT) { 303 compiler_panic(pp->c, n ? 
line[0].loc : (SrcLoc){0, 0, 0}, 304 "#undef: expected identifier"); 305 } 306 name = line[0].v.ident; 307 if (name == pp->sym_defined || name == pp->sym_line__ || 308 name == pp->sym_file__ || name == pp->sym_date__ || 309 name == pp->sym_time__ || name == pp->sym_stdc__ || 310 name == pp->sym_stdc_hosted__ || name == pp->sym_stdc_version__) { 311 compiler_panic(pp->c, line[0].loc, 312 "#undef of a mandatory predefined name is not allowed"); 313 } 314 mt_del(pp, name); 315 } 316 317 /* ============================================================ 318 * Body comparison helpers 319 * ============================================================ */ 320 321 static int body_tokens_equal(const Tok* a, u32 na, const Tok* b, u32 nb) { 322 u32 i; 323 if (na != nb) return 0; 324 for (i = 0; i < na; ++i) { 325 if (a[i].kind != b[i].kind) return 0; 326 if (a[i].spelling != b[i].spelling) return 0; 327 /* Whitespace separation must match (§6.10.3 ¶2). The first body 328 * token's leading-space bit is meaningless (it's whatever was 329 * between macro name and body); skip i==0 for that bit. */ 330 if (i > 0) { 331 if ((a[i].flags & TF_HAS_SPACE) != (b[i].flags & TF_HAS_SPACE)) { 332 return 0; 333 } 334 } 335 } 336 return 1; 337 } 338 339 static int macros_equal(const Macro* a, const Macro* b) { 340 if (a->is_func != b->is_func) return 0; 341 if (a->is_variadic != b->is_variadic) return 0; 342 if (a->n_params != b->n_params) return 0; 343 { 344 u32 i; 345 for (i = 0; i < a->n_params; ++i) { 346 if (a->params[i] != b->params[i]) return 0; 347 } 348 } 349 return body_tokens_equal(a->body, a->body_len, b->body, b->body_len); 350 } 351 352 /* ============================================================ 353 * Object-macro expansion 354 * ============================================================ */ 355 356 static void subst_phase2(Pp* pp, const Tok* in, u32 nin, const Tok* invoke, 357 TokVec* out); 358 359 /* Build a buffer of the macro's body (with hidesets) and push it. 
The 360 * first expanded token inherits the invocation token's TF_AT_BOL / 361 * TF_HAS_SPACE so output formatting matches the invocation site. */ 362 static void expand_object_macro(Pp* pp, const Macro* m, const Tok* invoke, 363 HidesetId invoke_hs) { 364 TokVec body = {0}; 365 Tok* tmp; 366 HidesetId hs; 367 HidesetId* hids; 368 u32 i; 369 370 if (m->body_len == 0) { 371 return; /* placemarker: nothing to push */ 372 } 373 /* Run the body through the paste phase: object-like macros may use 374 * `##`. There are no parameters, so phase 1 reduces to a copy. */ 375 tmp = arena_array(pp->arena, Tok, m->body_len); 376 for (i = 0; i < m->body_len; ++i) tmp[i] = m->body[i]; 377 subst_phase2(pp, tmp, m->body_len, invoke, &body); 378 379 if (body.n == 0) return; 380 381 /* Transfer invocation flags onto the first emitted token. */ 382 body.data[0].flags = 383 (u16)((body.data[0].flags & ~(TF_AT_BOL | TF_HAS_SPACE)) | 384 (invoke->flags & (TF_AT_BOL | TF_HAS_SPACE))); 385 for (i = 0; i < body.n; ++i) body.data[i].loc = invoke->loc; 386 387 hs = hs_add(pp, invoke_hs, m->name); 388 hids = arena_array(pp->arena, HidesetId, body.n); 389 for (i = 0; i < body.n; ++i) hids[i] = hs; 390 push_buf(pp, body.data, hids, body.n); 391 } 392 393 /* ============================================================ 394 * Function-like macro expansion 395 * ============================================================ */ 396 397 /* Peek for an open paren after the just-consumed identifier (which named 398 * a function-like macro). Newlines are whitespace inside an invocation. 399 * Returns 1 with `*ws_has_space_out` indicating whether any whitespace 400 * (newlines or HAS_SPACE) sat between the ident and the `(`. Returns 0 if 401 * no `(` follows; pushed-back tokens (NLs + the non-`(` token, if any) 402 * are restored as a buffer source so subsequent reads still see them. 
*/ 403 int peek_for_invoke_paren(Pp* pp, int* ws_has_space_out) { 404 TokVec saved = {0}; 405 HsVec saved_hs = {0}; 406 int saw_ws = 0; 407 Tok t; 408 HidesetId hs; 409 410 for (;;) { 411 t = src_next_raw(pp, &hs, NULL); 412 if (t.kind == TOK_NEWLINE) { 413 saw_ws = 1; 414 tv_push(pp, &saved, t); 415 hsv_push(pp, &saved_hs, hs); 416 continue; 417 } 418 if (t.kind == TOK_EOF) { 419 /* No '(' — push back saved tokens, leave EOF for next read. */ 420 if (saved.n) push_buf(pp, saved.data, saved_hs.data, saved.n); 421 *ws_has_space_out = saw_ws; 422 return 0; 423 } 424 if (t.flags & TF_HAS_SPACE) saw_ws = 1; 425 if (t.kind == TOK_PUNCT && t.v.punct == '(') { 426 /* Consumed. The newlines we walked past are whitespace and 427 * dropped (per spec); they don't go back on the stack. */ 428 *ws_has_space_out = saw_ws; 429 return 1; 430 } 431 /* Save this non-`(` token too and push back. */ 432 tv_push(pp, &saved, t); 433 hsv_push(pp, &saved_hs, hs); 434 push_buf(pp, saved.data, saved_hs.data, saved.n); 435 *ws_has_space_out = saw_ws; 436 return 0; 437 } 438 } 439 440 /* Run macro expansion on a fixed token sequence to completion, yielding the 441 * fully-expanded token sequence. Used to pre-expand each function-macro 442 * argument before substitution (§6.10.3.1 ¶1). */ 443 void expand_arg_to_eof(Pp* pp, Tok* in, HidesetId* hs, u32 nin, TokVec* out) { 444 TokSrc src; 445 Tok t; 446 447 memset(&src, 0, sizeof(src)); 448 src.kind = SRC_BUF; 449 src.scope_top = 1; 450 src.toks = in; 451 src.hs = hs; 452 src.n = nin; 453 src_push(pp, src); 454 455 for (;;) { 456 t = pp_next_raw(pp); /* drives macro expansion within this scope */ 457 if (t.kind == TOK_EOF) break; 458 if (t.kind == TOK_NEWLINE) { 459 /* Newlines inside an arg act as whitespace; convert to 460 * "next-token has TF_HAS_SPACE". Drop the NL token itself. */ 461 continue; 462 } 463 tv_push(pp, out, t); 464 } 465 /* Pop our scope source. 
*/ 466 --pp->nsources; 467 } 468 469 /* Argument list for a function-like invocation. Stored as parallel 470 * (start, end) ranges into a flat unexpanded token vector and a flat 471 * expanded token vector. */ 472 typedef struct ArgList { 473 /* Unexpanded arg tokens (raw as collected from invocation). */ 474 Tok* raw; 475 HidesetId* raw_hs; 476 u32 raw_n; 477 u32* raw_start; /* size n_args + 1 (sentinel = raw_n) */ 478 /* Pre-expanded tokens. */ 479 Tok* exp; 480 u32 exp_n; 481 u32* exp_start; /* size n_args + 1 (sentinel = exp_n) */ 482 u32 n_args; 483 } ArgList; 484 485 /* Collect arguments. Caller has just consumed the opening `(`. Returns the 486 * close-paren's token (used as the invocation's last source location). */ 487 static Tok read_invocation_args(Pp* pp, const Macro* m, SrcLoc invoke_loc, 488 ArgList* out) { 489 TokVec raw = {0}; 490 HsVec raw_hs = {0}; 491 u32* starts; 492 u32 starts_cap = 0; 493 u32 n_args = 0; 494 u32 cur_start = 0; 495 int depth = 0; 496 Tok t; 497 HidesetId hs; 498 int first_token_of_arg = 1; 499 Tok close_tok; 500 501 memset(out, 0, sizeof(*out)); 502 starts = arena_array(pp->arena, u32, 8); 503 starts_cap = 8; 504 starts[0] = 0; 505 506 for (;;) { 507 t = src_next_raw(pp, &hs, NULL); 508 if (t.kind == TOK_EOF) { 509 compiler_panic(pp->c, invoke_loc, 510 "unterminated function-like macro invocation"); 511 } 512 if (t.kind == TOK_NEWLINE) { 513 /* Whitespace within an invocation. Mark the next token as 514 * having space; drop the NL. */ 515 if (raw.n && depth >= 0) { 516 /* No-op token list; we'll OR onto the next pushed token. */ 517 } 518 /* Use a sentinel: track via a flag on a deferred push. We 519 * accumulate "has_space" by setting it on the next pushed 520 * token. */ 521 /* Simpler: just push a placeholder by OR'ing onto next via 522 * a flag stored in `first_token_of_arg`-style state. */ 523 /* Implementation: use the next read token's TF_HAS_SPACE bit, 524 * which the lexer already sets after a NL. 
Actually NOT — 525 * after a NL the lexer sets TF_AT_BOL on the next token, not 526 * HAS_SPACE necessarily. Force it: */ 527 /* We'll OR it manually onto the next token. */ 528 /* Use a small flag stash: */ 529 /* (handled below by setting a pending flag) */ 530 /* See: pending_space variable */ 531 /* — commit: declare a pending_space static earlier. */ 532 continue; 533 } 534 535 if (t.kind == TOK_PUNCT) { 536 u32 p = t.v.punct; 537 if (p == '(') { 538 ++depth; 539 } else if (p == ')') { 540 if (depth == 0) { 541 /* End of invocation. Close the current argument. The 542 * empty-args case (no commas seen, no tokens 543 * collected) emits a slot only when the macro expects 544 * at least one argument; arity-0 macros take none. */ 545 close_tok = t; 546 { 547 int empty_call = 548 (n_args == 0 && raw.n == cur_start && first_token_of_arg); 549 int want_slot = !empty_call || (m->n_params > 0) || m->is_variadic; 550 if (want_slot) { 551 if (n_args + 1 >= starts_cap) { 552 u32 nc = starts_cap * 2; 553 u32* nb = arena_array(pp->arena, u32, nc); 554 memcpy(nb, starts, sizeof(u32) * starts_cap); 555 starts = nb; 556 starts_cap = nc; 557 } 558 ++n_args; 559 starts[n_args] = raw.n; 560 } 561 } 562 goto done; 563 } 564 --depth; 565 } else if (p == ',' && depth == 0) { 566 /* Variadic: once we've filled all named params, the rest 567 * (commas included) collect into __VA_ARGS__. */ 568 if (m->is_variadic && n_args + 1 >= m->n_params) { 569 /* This comma is part of __VA_ARGS__. Push it. */ 570 tv_push(pp, &raw, t); 571 hsv_push(pp, &raw_hs, hs); 572 first_token_of_arg = 0; 573 continue; 574 } 575 /* Close current arg, start next. 
*/ 576 if (n_args + 1 >= starts_cap) { 577 u32 nc = starts_cap * 2; 578 u32* nb = arena_array(pp->arena, u32, nc); 579 memcpy(nb, starts, sizeof(u32) * starts_cap); 580 starts = nb; 581 starts_cap = nc; 582 } 583 ++n_args; 584 starts[n_args] = raw.n; 585 cur_start = raw.n; 586 first_token_of_arg = 1; 587 continue; 588 } 589 } 590 tv_push(pp, &raw, t); 591 hsv_push(pp, &raw_hs, hs); 592 first_token_of_arg = 0; 593 } 594 done: 595 /* Validate arity. */ 596 { 597 u32 expected = m->n_params; 598 if (m->is_variadic) { 599 if (n_args < (expected ? expected - 1 : 0)) { 600 /* Allow exactly expected-1 (empty __VA_ARGS__) by 601 * synthesizing an empty trailing arg. */ 602 if (n_args + 1 == (expected ? expected - 1 : 0)) { 603 /* off by one — fall through to error */ 604 } 605 compiler_panic(pp->c, invoke_loc, 606 "too few arguments to variadic macro invocation"); 607 } 608 /* Synthesize an empty __VA_ARGS__ if caller passed exactly 609 * the named-parameter count. */ 610 if (n_args + 1 == expected) { 611 if (n_args + 1 >= starts_cap) { 612 u32 nc = starts_cap * 2; 613 u32* nb = arena_array(pp->arena, u32, nc); 614 memcpy(nb, starts, sizeof(u32) * starts_cap); 615 starts = nb; 616 starts_cap = nc; 617 } 618 ++n_args; 619 starts[n_args] = raw.n; 620 } 621 } else { 622 if (n_args != expected) { 623 /* Spec: arity-0 macro `M()` invoked as `M()` is allowed and 624 * has 0 args. Above logic produces 0 in that case. */ 625 compiler_panic(pp->c, invoke_loc, 626 "wrong number of arguments to function-like macro"); 627 } 628 } 629 } 630 out->raw = raw.data; 631 out->raw_hs = raw_hs.data; 632 out->raw_n = raw.n; 633 out->raw_start = starts; 634 out->n_args = n_args; 635 return close_tok; 636 } 637 638 /* Build pre-expanded args. 
*/ 639 static void preexpand_args(Pp* pp, ArgList* a) { 640 TokVec exp = {0}; 641 u32* exp_start; 642 u32 i; 643 exp_start = arena_array(pp->arena, u32, a->n_args + 1); 644 exp_start[0] = 0; 645 for (i = 0; i < a->n_args; ++i) { 646 u32 lo = a->raw_start[i]; 647 u32 hi = a->raw_start[i + 1]; 648 if (hi > lo) { 649 /* Copy the slice into a fresh buffer so expand_arg_to_eof can 650 * own it without aliasing. */ 651 Tok* slice = arena_array(pp->arena, Tok, hi - lo); 652 memcpy(slice, &a->raw[lo], sizeof(Tok) * (hi - lo)); 653 expand_arg_to_eof(pp, slice, a->raw_hs ? &a->raw_hs[lo] : NULL, hi - lo, 654 &exp); 655 } 656 exp_start[i + 1] = exp.n; 657 } 658 a->exp = exp.data; 659 a->exp_n = exp.n; 660 a->exp_start = exp_start; 661 } 662 663 /* Build a stringized TOK_STR from the unexpanded argument tokens 664 * `arg[lo..hi)`. The first token's leading-space flag is ignored (leading 665 * whitespace stripped). Inside string/char-literal spellings, '"' and '\' 666 * are escaped. */ 667 static Tok make_stringize(Pp* pp, const Tok* arg, u32 lo, u32 hi, SrcLoc loc) { 668 CharBuf b = {0}; 669 u32 i; 670 Tok t; 671 Sym sp; 672 673 cb_putc(pp, &b, '"'); 674 for (i = lo; i < hi; ++i) { 675 const Tok* at = &arg[i]; 676 KitSlice sl = 677 at->spelling ? kit_sym_str(pp->pool->c, at->spelling) : KIT_SLICE_NULL; 678 const char* s = sl.s; 679 size_t slen = sl.len; 680 if (i > lo && (at->flags & TF_HAS_SPACE)) cb_putc(pp, &b, ' '); 681 if (s && slen) { 682 int esc = (at->kind == TOK_STR || at->kind == TOK_CHR); 683 size_t k; 684 for (k = 0; k < slen; ++k) { 685 char c = s[k]; 686 if (esc && (c == '\\' || c == '"')) cb_putc(pp, &b, '\\'); 687 cb_putc(pp, &b, c); 688 } 689 } 690 } 691 cb_putc(pp, &b, '"'); 692 693 sp = kit_sym_intern(pp->pool->c, (KitSlice){.s = b.data, .len = b.len}); 694 memset(&t, 0, sizeof(t)); 695 t.kind = TOK_STR; 696 t.loc = loc; 697 t.spelling = sp; 698 t.v.str = sp; 699 return t; 700 } 701 702 /* Concatenate two token spellings and re-lex into a single token. 
Empty 703 * (placemarker) sides collapse to the other side per §6.10.3.3 ¶2. */ 704 static Tok paste_tokens(Pp* pp, Tok lhs, Tok rhs, SrcLoc loc) { 705 char buf[1024]; 706 size_t alen = 0, blen = 0; 707 const char* a; 708 const char* b; 709 Lexer* lex; 710 Tok t1, t2; 711 712 if (lhs.kind == TOK_PP_PLACEMARKER) return rhs; 713 if (rhs.kind == TOK_PP_PLACEMARKER) return lhs; 714 715 if (lhs.spelling) { 716 KitSlice s = kit_sym_str(pp->pool->c, lhs.spelling); 717 a = s.s; 718 alen = s.len; 719 } else { 720 a = ""; 721 } 722 if (rhs.spelling) { 723 KitSlice s = kit_sym_str(pp->pool->c, rhs.spelling); 724 b = s.s; 725 blen = s.len; 726 } else { 727 b = ""; 728 } 729 if (alen + blen + 2 > sizeof(buf)) { 730 compiler_panic(pp->c, loc, "token paste: spelling too long"); 731 } 732 if (alen) memcpy(buf, a, alen); 733 if (blen) memcpy(buf + alen, b, blen); 734 buf[alen + blen] = '\n'; 735 buf[alen + blen + 1] = 0; 736 737 lex = lex_open_mem(pp->c, "<paste>", buf, alen + blen + 1); 738 t1 = lex_next(lex); 739 t2 = lex_next(lex); 740 if (t1.kind == TOK_EOF) { 741 /* Both empty (shouldn't reach here since we handled placemarkers). */ 742 lex_close(lex); 743 return lhs; 744 } 745 if (t2.kind != TOK_NEWLINE && t2.kind != TOK_EOF) { 746 lex_close(lex); 747 compiler_panic(pp->c, loc, "token pasting yields multiple tokens, invalid"); 748 } 749 lex_close(lex); 750 751 /* Inherit positional flags from LHS (it sat in the same slot). */ 752 t1.flags = (u16)((t1.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) | 753 (lhs.flags & (TF_AT_BOL | TF_HAS_SPACE))); 754 t1.loc = loc; 755 return t1; 756 } 757 758 /* Phase 1 (param substitution). For each parameter occurrence in the 759 * body: if adjacent to ## or # (handled separately), substitute the raw 760 * argument tokens; otherwise substitute the pre-expanded form. Empty raw 761 * args become a TOK_PP_PLACEMARKER which phase 2 collapses. 
*/
/* m       function-like macro whose body is walked
 * a       collected arguments (raw and pre-expanded ranges)
 * invoke  invocation token; supplies the location for synthesized tokens
 * out     receives the substituted sequence (appended)
 *
 * Every substituted run takes the TF_AT_BOL / TF_HAS_SPACE bits of the body
 * token it replaces, so spacing follows the macro definition. */
static void subst_phase1(Pp* pp, const Macro* m, ArgList* a, const Tok* invoke,
                         TokVec* out) {
  const u32 len = m->body_len;
  u32 at = 0;

  while (at < len) {
    const Tok* cur = &m->body[at];

    if (cur->kind == TOK_PP_HASH) {
      u32 param;
      Tok str;
      /* §6.10.3.2: # must be followed by a parameter. */
      if (at + 1 >= len || m->body[at + 1].kind != TOK_PP_PARAM) {
        compiler_panic(pp->c, cur->loc,
                       "'#' is not followed by a macro parameter");
      }
      param = m->body[at + 1].v.punct;
      /* Stringizing always works on the unexpanded argument. */
      str = make_stringize(pp, a->raw, a->raw_start[param],
                           a->raw_start[param + 1], invoke->loc);
      str.flags = (u16)((str.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
                        (cur->flags & (TF_AT_BOL | TF_HAS_SPACE)));
      tv_push(pp, out, str);
      at += 2; /* consumed both '#' and the parameter */
      continue;
    }

    if (cur->kind != TOK_PP_PARAM) {
      /* Ordinary body token: copied through untouched. */
      tv_push(pp, out, *cur);
      ++at;
      continue;
    }

    {
      const u32 param = cur->v.punct;
      const int paste_before =
          at > 0 && m->body[at - 1].kind == TOK_PP_PASTE;
      const int paste_after =
          at + 1 < len && m->body[at + 1].kind == TOK_PP_PASTE;
      /* Operands of ## use the raw argument; everything else uses the
       * pre-expanded one. */
      const int use_raw = paste_before || paste_after;
      const u32* bounds = use_raw ? a->raw_start : a->exp_start;
      const Tok* from = use_raw ? a->raw : a->exp;
      const u32 begin = bounds[param];
      const u32 end = bounds[param + 1];

      if (begin == end) {
        /* Empty argument → placemarker. */
        Tok marker;
        memset(&marker, 0, sizeof(marker));
        marker.kind = TOK_PP_PLACEMARKER;
        marker.flags = cur->flags & (TF_AT_BOL | TF_HAS_SPACE);
        marker.loc = invoke->loc;
        tv_push(pp, out, marker);
      } else {
        u32 k;
        for (k = begin; k < end; ++k) {
          Tok piece = from[k];
          if (k == begin) {
            /* Only the leading token takes the parameter's spacing. */
            piece.flags = (u16)((piece.flags & ~(TF_AT_BOL | TF_HAS_SPACE)) |
                                (cur->flags & (TF_AT_BOL | TF_HAS_SPACE)));
          }
          tv_push(pp, out, piece);
        }
      }
    }
    ++at;
  }
}

/* Phase 2 (paste). Walk the post-substitute buffer; for each TOK_PP_PASTE,
 * splice the previous output token with the next input token. Then strip
 * remaining placemarkers.
*/ 831 static void subst_phase2(Pp* pp, const Tok* in, u32 nin, const Tok* invoke, 832 TokVec* out) { 833 u32 i; 834 for (i = 0; i < nin; ++i) { 835 Tok t = in[i]; 836 if (t.kind == TOK_PP_PASTE) { 837 Tok lhs, rhs; 838 if (out->n == 0 || i + 1 >= nin) { 839 compiler_panic(pp->c, invoke->loc, 840 "'##' at start or end of replacement list"); 841 } 842 lhs = out->data[--out->n]; 843 rhs = in[++i]; 844 tv_push(pp, out, paste_tokens(pp, lhs, rhs, invoke->loc)); 845 continue; 846 } 847 tv_push(pp, out, t); 848 } 849 /* Strip placemarkers, preserving leading-space flag on the next token. */ 850 { 851 u32 r = 0, w = 0; 852 u16 carry = 0; 853 for (r = 0; r < out->n; ++r) { 854 if (out->data[r].kind == TOK_PP_PLACEMARKER) { 855 carry |= out->data[r].flags & (TF_AT_BOL | TF_HAS_SPACE); 856 continue; 857 } 858 if (carry) { 859 out->data[r].flags |= carry; 860 carry = 0; 861 } 862 if (w != r) out->data[w] = out->data[r]; 863 ++w; 864 } 865 out->n = w; 866 } 867 } 868 869 /* Wrapper: phases 1 and 2 in sequence, plus invocation-loc / flag transfer. */ 870 static void substitute_body(Pp* pp, const Macro* m, ArgList* a, 871 const Tok* invoke, HidesetId result_hs, TokVec* out, 872 TokVec* hs_out) { 873 TokVec phase1 = {0}; 874 u32 i; 875 subst_phase1(pp, m, a, invoke, &phase1); 876 subst_phase2(pp, phase1.data, phase1.n, invoke, out); 877 /* Invocation flags onto first emitted token. */ 878 if (out->n) { 879 out->data[0].flags = 880 (u16)((out->data[0].flags & ~(TF_AT_BOL | TF_HAS_SPACE)) | 881 (invoke->flags & (TF_AT_BOL | TF_HAS_SPACE))); 882 } 883 /* Locations to invocation site. */ 884 for (i = 0; i < out->n; ++i) out->data[i].loc = invoke->loc; 885 /* Build parallel hideset vector. 
*/ 886 for (i = 0; i < out->n; ++i) { 887 Tok hsmark; 888 memset(&hsmark, 0, sizeof(hsmark)); 889 hsmark.spelling = (Sym)result_hs; 890 tv_push(pp, hs_out, hsmark); 891 } 892 } 893 894 /* Expand a function-like macro invocation: peek for `(`, collect args, 895 * pre-expand them, substitute the body, push the result. Returns 1 if 896 * the invocation was performed, 0 if there was no `(` (the caller should 897 * emit the identifier as-is). */ 898 static int try_expand_func_macro(Pp* pp, const Macro* m, const Tok* invoke, 899 HidesetId invoke_hs) { 900 int saw_ws; 901 ArgList args; 902 TokVec body = {0}; 903 TokVec hsvec = {0}; /* parallel to body, holds HidesetId per slot */ 904 HidesetId result_hs; 905 Tok close_tok; 906 907 if (!peek_for_invoke_paren(pp, &saw_ws)) { 908 return 0; 909 } 910 (void)saw_ws; 911 read_invocation_args(pp, m, invoke->loc, &args); 912 /* Note: assigned to silence unused-result; we don't use the close tok yet. */ 913 close_tok.kind = 0; 914 (void)close_tok; 915 preexpand_args(pp, &args); 916 917 /* Hideset of result = invocation hideset ∪ {macro_name}. The standard 918 * intersects with the closing `)`'s hideset for blue-paint purity, but 919 * for the freshly-collected `)` from the lex source that's the empty 920 * set, so the union form suffices here. */ 921 result_hs = hs_add(pp, invoke_hs, m->name); 922 substitute_body(pp, m, &args, invoke, result_hs, &body, &hsvec); 923 924 { 925 u32 i; 926 HidesetId* hids = arena_array(pp->arena, HidesetId, body.n ? body.n : 1); 927 for (i = 0; i < body.n; ++i) { 928 hids[i] = (HidesetId)hsvec.data[i].spelling; 929 } 930 push_buf(pp, body.data, hids, body.n); 931 } 932 return 1; 933 } 934 935 /* ============================================================ 936 * pp_next_raw — mutual recursion entry (called from expand_arg_to_eof) 937 * Defined here; also declared in pp_priv.h so pp.c can call it. 
938 * ============================================================ */ 939 940 /* pp_next_raw: reads from the top source, applies macro expansion when an 941 * identifier names a macro that isn't blue-painted, and consumes 942 * directives in-place. TOK_NEWLINE is preserved for pp_emit_text. */ 943 Tok pp_next_raw(Pp* pp) { 944 Tok t; 945 HidesetId hs; 946 u8 src_kind; 947 for (;;) { 948 t = src_next_raw(pp, &hs, &src_kind); 949 if (t.kind == TOK_EOF) return t; 950 if (t.kind == TOK_PP_HASH && (t.flags & TF_AT_BOL) && src_kind == SRC_LEX) { 951 process_directive(pp, t.loc); 952 /* No synthesized newline: the comparator collapses 953 * whitespace, so blank-line replacement of consumed 954 * directives isn't observable here. Directives that produce 955 * content (e.g. #include, #embed, #pragma) push their own 956 * tokens onto the source stack, which the next loop 957 * iteration picks up. */ 958 continue; 959 } 960 /* While expanding an #if condition, suppress macro expansion of 961 * `defined`-operator operands so a `defined(X)` produced by a 962 * macro body whose argument was pasted via ## doesn't accidentally 963 * expand an already-defined X to its body (typically empty). See 964 * the `defined_skip` field comment in pp_priv.h. */ 965 if (pp->in_if_expansion) { 966 if (pp->defined_skip == 1 && t.kind == TOK_IDENT) { 967 t.flags |= TF_NO_EXPAND; 968 pp->defined_skip = 0; 969 } else if (pp->defined_skip == 2) { 970 if (t.kind == TOK_PUNCT && t.v.punct == '(') { 971 pp->defined_skip = 3; 972 } else if (t.kind == TOK_IDENT) { 973 /* `defined IDENT` (no parens) — same as the skip==1 case. 
*/ 974 t.flags |= TF_NO_EXPAND; 975 pp->defined_skip = 0; 976 } else { 977 pp->defined_skip = 0; 978 } 979 } else if (pp->defined_skip == 3) { 980 if (t.kind == TOK_IDENT) { 981 t.flags |= TF_NO_EXPAND; 982 pp->defined_skip = 4; 983 } else if (t.kind == TOK_PUNCT && t.v.punct == ')') { 984 pp->defined_skip = 0; 985 } 986 } else if (pp->defined_skip == 4) { 987 if (t.kind == TOK_PUNCT && t.v.punct == ')') { 988 pp->defined_skip = 0; 989 } 990 } else if (t.kind == TOK_IDENT && t.v.ident == pp->sym_defined) { 991 pp->defined_skip = 2; 992 } 993 } 994 if (t.kind == TOK_IDENT && (t.flags & TF_NO_EXPAND) == 0) { 995 Sym id = t.v.ident; 996 997 /* Dynamic predefined macros: __LINE__ / __FILE__ / 998 * __DATE__ / __TIME__. Always expand, ignoring the macro 999 * table. */ 1000 if (id == pp->sym_line__) { 1001 char tmp[16], buf[16]; 1002 int k = 0, j = 0; 1003 u32 ln = t.loc.line; 1004 if (ln == 0) 1005 buf[k++] = '0'; 1006 else { 1007 while (ln) { 1008 tmp[j++] = (char)('0' + ln % 10); 1009 ln /= 10; 1010 } 1011 while (j > 0) buf[k++] = tmp[--j]; 1012 } 1013 t.kind = TOK_NUM; 1014 t.spelling = 1015 kit_sym_intern(pp->pool->c, (KitSlice){.s = buf, .len = (size_t)k}); 1016 return t; 1017 } 1018 if (id == pp->sym_file__) { 1019 TokSrc* ls = current_lex_src(pp); 1020 Sym name = 0; 1021 size_t nlen = 0; 1022 const char* nstr = NULL; 1023 char* buf; 1024 if (ls && ls->file_override) { 1025 name = ls->file_override; 1026 } else if (ls) { 1027 KitSourceFile sf; 1028 memset(&sf, 0, sizeof(sf)); 1029 if (kit_source_file(pp->c, lex_file_id(ls->lex), &sf) == 0) { 1030 name = sf.name; 1031 } 1032 } 1033 if (name) { 1034 KitSlice s = kit_sym_str(pp->pool->c, name); 1035 nstr = s.s; 1036 nlen = s.len; 1037 } 1038 /* The source name is the raw filesystem path (or a #line override, 1039 * destringized to logical bytes by do_line). Re-stringize it as a 1040 * valid C string literal: escape '\\' and '"'. 
On POSIX paths use 1041 * '/' so this was a no-op; on Windows the path holds backslashes 1042 * (e.g. C:\\Users\\...), and emitting them raw turns '\\U'/'\\u'/'\\x' 1043 * into bogus escape sequences (the "malformed UCN" on '\\Users'). */ 1044 { 1045 size_t bn = 0; 1046 size_t i; 1047 buf = (char*)arena_alloc(pp->arena, nlen * 2 + 2, 1); 1048 buf[bn++] = '"'; 1049 for (i = 0; i < nlen; ++i) { 1050 char ch = nstr[i]; 1051 if (ch == '\\' || ch == '"') buf[bn++] = '\\'; 1052 buf[bn++] = ch; 1053 } 1054 buf[bn++] = '"'; 1055 t.kind = TOK_STR; 1056 t.spelling = 1057 kit_sym_intern(pp->pool->c, (KitSlice){.s = buf, .len = bn}); 1058 t.v.str = t.spelling; 1059 } 1060 return t; 1061 } 1062 if (id == pp->sym_date__) { 1063 t.kind = TOK_STR; 1064 t.spelling = pp->val_date_str; 1065 t.v.str = t.spelling; 1066 return t; 1067 } 1068 if (id == pp->sym_time__) { 1069 t.kind = TOK_STR; 1070 t.spelling = pp->val_time_str; 1071 t.v.str = t.spelling; 1072 return t; 1073 } 1074 if (id == pp->sym__pragma) { 1075 if (try_expand_pragma_op(pp, &t)) continue; 1076 /* No '(' — fall through and emit as plain ident. */ 1077 } 1078 1079 { 1080 Macro* m = mt_get(pp, id); 1081 if (m && !hs_contains(pp, hs, m->name)) { 1082 if (!m->is_func) { 1083 expand_object_macro(pp, m, &t, hs); 1084 continue; 1085 } 1086 if (try_expand_func_macro(pp, m, &t, hs)) { 1087 continue; 1088 } 1089 /* No '(' followed; emit as plain identifier. */ 1090 } 1091 } 1092 } 1093 return t; 1094 } 1095 }