boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 3917becca3ca07243e9f5fe602981914c6880d30
parent 0b61df9e1743dabbe4d1912d22c97edbebd95d9b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 26 Apr 2026 19:07:28 -0700

cc/pp: adjacent-string concat, more built-ins, stringize escapes, empty args

- Translation phase 6: post-expand pass merges adjacent STR tokens.
- New built-ins: __DATE__, __TIME__ (fixed), __STDC_VERSION__, __STDC_HOSTED__.
- Stringize on STR/CHAR now escapes embedded `\` and the delimiter, so
  `#"a\b"` reconstructs the source spelling instead of dropping the slash.
- Empty function-like-macro arguments: `F()` for one-param F binds the
  parameter to the empty token list; the 0-param special case still
  resolves via the existing bind-args clause.
- cc-pp tests 31..37 cover the above (34/37 were already green; kept
  as regression coverage).

Diffstat:
Mcc/pp.scm | 69++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mdocs/CC.md | 7+++++++
Atests/cc-pp/31-string-concat.c | 4++++
Atests/cc-pp/31-string-concat.expected-toks | 6++++++
Atests/cc-pp/32-builtin-date-time.c | 2++
Atests/cc-pp/32-builtin-date-time.expected-toks | 5+++++
Atests/cc-pp/33-builtin-stdc-version-hosted.c | 2++
Atests/cc-pp/33-builtin-stdc-version-hosted.expected-toks | 3+++
Atests/cc-pp/34-builtin-file-line.c | 2++
Atests/cc-pp/34-builtin-file-line.expected-toks | 3+++
Atests/cc-pp/35-stringize-str-char.c | 4++++
Atests/cc-pp/35-stringize-str-char.expected-toks | 7+++++++
Atests/cc-pp/36-empty-arg.c | 3+++
Atests/cc-pp/36-empty-arg.expected-toks | 6++++++
Atests/cc-pp/37-vararg-empty.c | 3+++
Atests/cc-pp/37-vararg-empty.expected-toks | 10++++++++++
16 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/cc/pp.scm b/cc/pp.scm @@ -64,12 +64,22 @@ (define %pp-bv-LINE "__LINE__") (define %pp-bv-STDC "__STDC__") (define %pp-bv-LISPCC "__LISPCC__") +(define %pp-bv-DATE "__DATE__") +(define %pp-bv-TIME "__TIME__") +(define %pp-bv-STDC-VERSION "__STDC_VERSION__") +(define %pp-bv-STDC-HOSTED "__STDC_HOSTED__") (define %pp-bv-VA-ARGS "__VA_ARGS__") (define %pp-bv-defined "defined") +;; Fixed values for reproducibility — we don't read the wall clock. +(define %pp-bv-DATE-VALUE "Jan 1 1970") +(define %pp-bv-TIME-VALUE "00:00:00") + (define (%pp-builtin? name) (or (bv= name %pp-bv-FILE) (bv= name %pp-bv-LINE) - (bv= name %pp-bv-STDC) (bv= name %pp-bv-LISPCC))) + (bv= name %pp-bv-STDC) (bv= name %pp-bv-LISPCC) + (bv= name %pp-bv-DATE) (bv= name %pp-bv-TIME) + (bv= name %pp-bv-STDC-VERSION) (bv= name %pp-bv-STDC-HOSTED))) (define (%pp-expand-builtin name loc state) (let* ((file (or (pps-cur-file state) (loc-file loc))) @@ -77,10 +87,14 @@ (col (loc-col loc)) (here (%loc file line col))) (cond - ((bv= name %pp-bv-FILE) (list (%tok 'STR file here '()))) - ((bv= name %pp-bv-LINE) (list (%tok 'INT line here '()))) - ((bv= name %pp-bv-STDC) (list (%tok 'INT 1 here '()))) - ((bv= name %pp-bv-LISPCC) (list (%tok 'INT 1 here '()))) + ((bv= name %pp-bv-FILE) (list (%tok 'STR file here '()))) + ((bv= name %pp-bv-LINE) (list (%tok 'INT line here '()))) + ((bv= name %pp-bv-STDC) (list (%tok 'INT 1 here '()))) + ((bv= name %pp-bv-LISPCC) (list (%tok 'INT 1 here '()))) + ((bv= name %pp-bv-DATE) (list (%tok 'STR %pp-bv-DATE-VALUE here '()))) + ((bv= name %pp-bv-TIME) (list (%tok 'STR %pp-bv-TIME-VALUE here '()))) + ((bv= name %pp-bv-STDC-VERSION) (list (%tok 'INT 199901 here '()))) + ((bv= name %pp-bv-STDC-HOSTED) (list (%tok 'INT 1 here '()))) (else (die loc "internal: not a builtin" name))))) ;; --- buf-list: simple reversed-list buffer of toks --- @@ -96,6 +110,24 @@ (else (buf-list-push! b (car ts)) (loop (cdr ts)))))) (define (buf-list-flush b) (reverse (buf-list-xs b))) +;; Translation phase 6: concatenate adjacent string literals. The merged +;; token keeps the loc and hide-set of the first; values are byte-appended. +(define (%pp-merge-adjacent-strs toks) + (let loop ((toks toks) (acc '())) + (cond + ((null? toks) (reverse acc)) + ((and (not (null? acc)) + (eq? (tok-kind (car toks)) 'STR) + (eq? (tok-kind (car acc)) 'STR)) + (let* ((prev (car acc)) + (cur (car toks)) + (merged (%tok 'STR + (bytevector-append (tok-value prev) (tok-value cur)) + (tok-loc prev) + (tok-hide prev)))) + (loop (cdr toks) (cons merged (cdr acc))))) + (else (loop (cdr toks) (cons (car toks) acc)))))) + ;; --- pp-expand: top-level driver --- (define (pp-expand toks initial-defines) (let ((state (%pp-make-state initial-defines)) @@ -108,7 +140,7 @@ (die (tok-loc (car toks)) "unterminated #if/#ifdef/#ifndef")) (else (buf-list-push! out (car toks)) - (buf-list-flush out)))) + (%pp-merge-adjacent-strs (buf-list-flush out))))) ((%pp-nl? (car toks)) (loop (cdr toks))) ((%pp-hash? (car toks)) (let* ((lr (%pp-take-line (cdr toks))) @@ -349,12 +381,29 @@ (cond ((eq? k 'IDENT) v) ((eq? k 'INT) (fixnum->bv v 10)) - ((eq? k 'STR) (bytevector-append "\"" (bytevector-append v "\""))) - ((eq? k 'CHAR) (bytevector-append "'" (bytevector-append (bv-of-byte v) "'"))) + ((eq? k 'STR) (%pp-quote-bytes v 34)) + ((eq? k 'CHAR) (%pp-quote-bytes (bv-of-byte v) 39)) ((eq? k 'KW) (symbol->string v)) ((eq? k 'PUNCT) (symbol->string v)) (else "?")))) +;; Reconstruct a string/char literal source spelling from cooked content. +;; Per C11 6.10.3.2: insert `\` before each `"` and `\` (or `'` for char). +;; `delim` is 34 for STR, 39 for CHAR. +(define (%pp-quote-bytes bv delim) + (let* ((n (bytevector-length bv)) + (delim-bv (bv-of-byte delim))) + (let loop ((i 0) (acc (list delim-bv))) + (cond + ((= i n) (bv-cat (reverse (cons delim-bv acc)))) + (else + (let ((b (bytevector-u8-ref bv i))) + (cond + ((or (= b delim) (= b 92)) + (loop (+ i 1) (cons (bv-of-byte b) (cons "\\" acc)))) + (else + (loop (+ i 1) (cons (bv-of-byte b) acc)))))))))) + ;; --- #line / #pragma / #include --- ;; Approximate #line: subsequent toks have line = (orig-line + delta), ;; where delta = (N - here-line - 1). Good enough for most cases. @@ -456,7 +505,9 @@ ((and (= depth 0) (%pp-punct? (car toks) 'rparen)) (let ((args* (cond - ((and (null? args) (null? cur)) '()) + ;; Empty parens count as one empty argument; bind-args + ;; degenerates this back to "no args" for 0-param macros. + ((and (null? args) (null? cur)) (list '())) (else (reverse (cons (reverse cur) args)))))) (cons args* (cdr toks)))) ((and (= depth 0) (%pp-punct? (car toks) 'comma)) diff --git a/docs/CC.md b/docs/CC.md @@ -138,9 +138,16 @@ Built-in macros: - `__FILE__` — current source file (a string literal) - `__LINE__` — current line number (a decimal integer) +- `__DATE__` — `"Jan 1 1970"` (fixed; we don't read the wall clock) +- `__TIME__` — `"00:00:00"` (fixed) - `__STDC__` — `1` +- `__STDC_VERSION__` — `199901` +- `__STDC_HOSTED__` — `1` - `__LISPCC__` — `1` (our analogue of MesCC's `__MESC__`) +Adjacent string-literal tokens in the post-expansion stream are +concatenated (translation phase 6). + Expression evaluator (used by `#if`/`#elif`): - All integer operators including `defined NAME` / `defined(NAME)`. diff --git a/tests/cc-pp/31-string-concat.c b/tests/cc-pp/31-string-concat.c @@ -0,0 +1,4 @@ +"a" "b" "c" +1 +"p" "q" +2 "r" diff --git a/tests/cc-pp/31-string-concat.expected-toks b/tests/cc-pp/31-string-concat.expected-toks @@ -0,0 +1,6 @@ +(STR "abc" "31-string-concat.c" 1 1) +(INT 1 "31-string-concat.c" 2 1) +(STR "pq" "31-string-concat.c" 3 1) +(INT 2 "31-string-concat.c" 4 1) +(STR "r" "31-string-concat.c" 4 3) +(EOF #f "31-string-concat.c" 5 1) diff --git a/tests/cc-pp/32-builtin-date-time.c b/tests/cc-pp/32-builtin-date-time.c @@ -0,0 +1,2 @@ +__DATE__ ; +__TIME__ ; diff --git a/tests/cc-pp/32-builtin-date-time.expected-toks b/tests/cc-pp/32-builtin-date-time.expected-toks @@ -0,0 +1,5 @@ +(STR "Jan 1 1970" "32-builtin-date-time.c" 1 1) +(PUNCT semi "32-builtin-date-time.c" 1 10) +(STR "00:00:00" "32-builtin-date-time.c" 2 1) +(PUNCT semi "32-builtin-date-time.c" 2 10) +(EOF #f "32-builtin-date-time.c" 3 1) diff --git a/tests/cc-pp/33-builtin-stdc-version-hosted.c b/tests/cc-pp/33-builtin-stdc-version-hosted.c @@ -0,0 +1,2 @@ +__STDC_VERSION__ +__STDC_HOSTED__ diff --git a/tests/cc-pp/33-builtin-stdc-version-hosted.expected-toks b/tests/cc-pp/33-builtin-stdc-version-hosted.expected-toks @@ -0,0 +1,3 @@ +(INT 199901 "33-builtin-stdc-version-hosted.c" 1 1) +(INT 1 "33-builtin-stdc-version-hosted.c" 2 1) +(EOF #f "33-builtin-stdc-version-hosted.c" 3 1) diff --git a/tests/cc-pp/34-builtin-file-line.c b/tests/cc-pp/34-builtin-file-line.c @@ -0,0 +1,2 @@ +__FILE__ +__LINE__ diff --git a/tests/cc-pp/34-builtin-file-line.expected-toks b/tests/cc-pp/34-builtin-file-line.expected-toks @@ -0,0 +1,3 @@ +(STR "34-builtin-file-line.c" "34-builtin-file-line.c" 1 1) +(INT 2 "34-builtin-file-line.c" 2 1) +(EOF #f "34-builtin-file-line.c" 3 1) diff --git a/tests/cc-pp/35-stringize-str-char.c b/tests/cc-pp/35-stringize-str-char.c @@ -0,0 +1,4 @@ +#define S(x) #x +S("hi") ; +S('a') ; +S("a\\b") ; diff --git a/tests/cc-pp/35-stringize-str-char.expected-toks b/tests/cc-pp/35-stringize-str-char.expected-toks @@ -0,0 +1,7 @@ +(STR "\"hi\"" "35-stringize-str-char.c" 1 14) +(PUNCT semi "35-stringize-str-char.c" 2 9) +(STR "'a'" "35-stringize-str-char.c" 1 14) +(PUNCT semi "35-stringize-str-char.c" 3 8) +(STR "\"a\\\\b\"" "35-stringize-str-char.c" 1 14) +(PUNCT semi "35-stringize-str-char.c" 4 11) +(EOF #f "35-stringize-str-char.c" 5 1) diff --git a/tests/cc-pp/36-empty-arg.c b/tests/cc-pp/36-empty-arg.c @@ -0,0 +1,3 @@ +#define F(a) [a] +F(x) +F() diff --git a/tests/cc-pp/36-empty-arg.expected-toks b/tests/cc-pp/36-empty-arg.expected-toks @@ -0,0 +1,6 @@ +(PUNCT lbrack "36-empty-arg.c" 1 14) +(IDENT "x" "36-empty-arg.c" 2 3) +(PUNCT rbrack "36-empty-arg.c" 1 16) +(PUNCT lbrack "36-empty-arg.c" 1 14) +(PUNCT rbrack "36-empty-arg.c" 1 16) +(EOF #f "36-empty-arg.c" 4 1) diff --git a/tests/cc-pp/37-vararg-empty.c b/tests/cc-pp/37-vararg-empty.c @@ -0,0 +1,3 @@ +#define LOG(fmt, ...) [fmt; __VA_ARGS__] +LOG("x") +LOG("y", 1) diff --git a/tests/cc-pp/37-vararg-empty.expected-toks b/tests/cc-pp/37-vararg-empty.expected-toks @@ -0,0 +1,10 @@ +(PUNCT lbrack "37-vararg-empty.c" 1 23) +(STR "x" "37-vararg-empty.c" 2 5) +(PUNCT semi "37-vararg-empty.c" 1 27) +(PUNCT rbrack "37-vararg-empty.c" 1 40) +(PUNCT lbrack "37-vararg-empty.c" 1 23) +(STR "y" "37-vararg-empty.c" 3 5) +(PUNCT semi "37-vararg-empty.c" 1 27) +(INT 1 "37-vararg-empty.c" 3 10) +(PUNCT rbrack "37-vararg-empty.c" 1 40) +(EOF #f "37-vararg-empty.c" 4 1)