lex: add lexer implementation and lex/pp tests - kit

commit 13c7eb6490e2aaef2c457641287d02eadde21ed2
parent 36197435a80ec4f7368e486af514a63be5ee03ad
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 04:15:46 -0700

lex: add lexer implementation and lex/pp tests

Diffstat:
A src/lex/lex.c  | 436 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/basic_punct.c  | 3 +++
A test/lex/cases/basic_punct.expected  | 16 ++++++++++++++++
A test/lex/cases/comment_edges.c  | 12 ++++++++++++
A test/lex/cases/comment_edges.expected  | 21 +++++++++++++++++++++
A test/lex/cases/comments.c  | 16 ++++++++++++++++
A test/lex/cases/comments.expected  | 34 ++++++++++++++++++++++++++++++++++
A test/lex/cases/empty.c  | 0 
A test/lex/cases/empty.expected  | 1 +
A test/lex/cases/float_constants.c  | 34 ++++++++++++++++++++++++++++++++++
A test/lex/cases/float_constants.expected  | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/identifiers.c  | 31 +++++++++++++++++++++++++++++++
A test/lex/cases/identifiers.expected  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/int_constants.c  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/int_constants.expected  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/keywords.c  | 8 ++++++++
A test/lex/cases/keywords.expected  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/line_splice.c  | 25 +++++++++++++++++++++++++
A test/lex/cases/line_splice.expected  | 25 +++++++++++++++++++++++++
A test/lex/cases/maximal_munch.c  | 21 +++++++++++++++++++++
A test/lex/cases/maximal_munch.expected  | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/pp_directives.c  | 21 +++++++++++++++++++++
A test/lex/cases/pp_directives.expected  | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/pp_numbers.c  | 29 +++++++++++++++++++++++++++++
A test/lex/cases/pp_numbers.expected  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/pp_passthrough.c  | 2 ++
A test/lex/cases/pp_passthrough.expected  | 10 ++++++++++
A test/lex/cases/punctuators.c  | 10 ++++++++++
A test/lex/cases/punctuators.expected  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/string_escapes.c  | 38 ++++++++++++++++++++++++++++++++++++++
A test/lex/cases/string_escapes.expected  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/cases/strings_chars.c  | 23 +++++++++++++++++++++++
A test/lex/cases/strings_chars.expected  | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A test/lex/run.sh  | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/pp/cases/define_function.actual  | 0 
A test/pp/cases/define_function.c  | 2 ++
A test/pp/cases/define_function.expected  | 1 +
A test/pp/cases/define_object.actual  | 0 
A test/pp/cases/define_object.c  | 2 ++
A test/pp/cases/define_object.expected  | 1 +
A test/pp/cases/ifdef_basic.actual  | 0 
A test/pp/cases/ifdef_basic.c  | 11 +++++++++++
A test/pp/cases/ifdef_basic.expected  | 2 ++
A test/pp/cases/include_local.actual  | 0 
A test/pp/cases/include_local.c  | 2 ++
A test/pp/cases/include_local.expected  | 2 ++
A test/pp/cases/include_local.h  | 2 ++
A test/pp/cases/undef.actual  | 0 
A test/pp/cases/undef.c  | 6 ++++++
A test/pp/cases/undef.expected  | 3 +++
A test/pp/run.sh  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/test.mk  | 41 +++++++++++++++++++++++++++++++++++++++++

52 files changed, 1812 insertions(+), 0 deletions(-)
diff --git a/src/lex/lex.c b/src/lex/lex.c
@@ -0,0 +1,436 @@
+/* C11 lexer (§6.4). Streams tokens out of a borrowed source buffer.
+ *
+ * Tokens are recognized per the standard's lexical grammar:
+ *   - identifiers (§6.4.2) — keyword bucketing happens later in parse_c
+ *   - pp-numbers (§6.4.8), classified into TOK_NUM / TOK_FLT
+ *   - string literals (§6.4.5) and character constants (§6.4.4.4)
+ *     including the L/u/u8/U encoding prefixes
+ *   - punctuators (§6.4.6), longest-match, including digraphs
+ *   - `#` and `##` surface as TOK_PP_HASH / TOK_PP_PASTE so the
+ *     preprocessor can recognize directives and the paste operator
+ *
+ * Comments (§6.4.9) are consumed as whitespace; physical newlines surface
+ * as TOK_NEWLINE so PP can implement directive-line semantics. */
+
+#include "lex/lex.h"
+#include "core/heap.h"
+#include "core/pool.h"
+
+#include <string.h>
+
+struct Lexer { /* cursor state for lexing one in-memory source buffer */
+    Compiler*   c;         /* compiler handed to lex_open_mem */
+    Pool*       pool;      /* c->global; token spellings are interned here */
+    Heap*       heap;      /* c->env->heap; allocated this Lexer, used again by lex_close */
+    const char* src;       /* borrowed source bytes ("" when opened with NULL) */
+    size_t      len;       /* number of bytes in src */
+    size_t      pos;       /* offset of the next unread byte */
+    u32         file_id;   /* id returned by source_add_memory */
+    u32         line;      /* current line, starts at 1, incremented by bump on '\n' */
+    u32         col;       /* current column counted in bytes, starts at 1 */
+    u8          at_bol;    /* 1 until a non-newline token is produced on the current line */
+    u8          had_space; /* 1 if whitespace or a comment was skipped since the last token */
+};
+
+static int peek(const Lexer* l, size_t off) /* look at the byte `off` ahead without consuming */
+{
+    if (l->pos + off >= l->len) return -1; /* past the end of the buffer: -1 sentinel */
+    return (unsigned char)l->src[l->pos + off]; /* 0..255, so it never collides with -1 */
+}
+
+static int bump(Lexer* l) /* consume one byte, keeping line/col in sync; -1 at end */
+{
+    int ch;
+    if (l->pos >= l->len) return -1; /* at end of input: nothing is consumed */
+    ch = (unsigned char)l->src[l->pos++];
+    if (ch == '\n') { l->line++; l->col = 1; } /* the next byte starts a new line */
+    else            { l->col++; }              /* col advances per byte, not per character */
+    return ch;
+}
+
+static int is_digit(int c) { return c >= '0' && c <= '9'; } /* ASCII 0-9 only; peek's -1 is rejected */
+static int is_hex_digit(int c) /* ASCII 0-9, a-f, A-F; peek's -1 is rejected */
+{
+    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+}
+/* Identifier-start byte (§6.4.2.1). Letters and underscore are ASCII; bytes
+ * ≥ 0x80 are accepted as the implementation-defined "other characters"
+ * permitted in identifiers — in practice UTF-8 lead/continuation bytes.
+ * peek's -1 end marker is rejected. UCNs are matched separately via ucn_len
+ * since they span multiple source bytes. */
+static int is_alpha(int c)
+{
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80;
+}
+static int is_alnum(int c) { return is_alpha(c) || is_digit(c); } /* identifier-continue byte */
+
+/* Match a UCN at offset `off` from the current position. Returns the total
+ * length (6 for \uXXXX, 10 for \UXXXXXXXX), or 0 if no UCN matches. The
+ * range constraints from §6.4.3 (no UCN < 00A0 except $/@/`, and none in
+ * D800–DFFF) are not enforced here — the lexical form is matched and any
+ * downstream phase that cares can diagnose. */
+static int ucn_len(const Lexer* l, size_t off)
+{
+    int n, i;
+    if (peek(l, off) != '\\') return 0;
+    if      (peek(l, off + 1) == 'u') n = 4; /* short form: 4 hex digits follow */
+    else if (peek(l, off + 1) == 'U') n = 8; /* long form: 8 hex digits follow */
+    else return 0;
+    for (i = 0; i < n; ++i) {
+        if (!is_hex_digit(peek(l, off + 2 + i))) return 0; /* also rejects truncated input: peek gives -1 */
+    }
+    return 2 + n; /* backslash + u/U + the hex digits; nothing is consumed */
+}
+
+static SrcLoc lex_here(const Lexer* l) /* snapshot of the cursor position as a SrcLoc */
+{
+    SrcLoc loc;
+    loc.file_id = l->file_id;
+    loc.line    = l->line;
+    loc.col     = l->col;
+    return loc; /* only file_id, line and col are assigned here */
+}
+
+Lexer* lex_open_mem(Compiler* c, const char* name, const char* src, size_t len) /* NULL if allocation fails */
+{
+    Heap*  h = (Heap*)c->env->heap;
+    Lexer* l = (Lexer*)h->alloc(h, sizeof(*l), _Alignof(Lexer));
+    if (!l) return NULL; /* checked before source_add_memory, so no source is registered on failure */
+    memset(l, 0, sizeof(*l));
+    l->c        = c;
+    l->pool     = c->global;
+    l->heap     = h;                /* kept so lex_close frees through the same heap */
+    l->src      = src ? src : "";   /* pointer is stored, not copied; NULL means empty input */
+    l->len      = src ? len : 0;    /* len is ignored when src is NULL */
+    l->pos      = 0;
+    l->file_id  = source_add_memory(c->sources, name);
+    l->line     = 1;                /* line and col are 1-based */
+    l->col      = 1;
+    l->at_bol   = 1;                /* the first token gets TF_AT_BOL */
+    l->had_space = 0;
+    return l;
+}
+
+void lex_close(Lexer* l) /* release a lexer from lex_open_mem; NULL is a no-op */
+{
+    if (!l) return;
+    l->heap->free(l->heap, l, sizeof(*l)); /* only the Lexer itself; src is borrowed and not freed */
+}
+
+SrcLoc lex_loc(const Lexer* l)         { return lex_here(l); } /* position of the next unread byte */
+u32    lex_file_id(const Lexer* l)     { return l->file_id; } /* id assigned in lex_open_mem */
+const LitInfo* lex_lit(const Lexer* l, LitId id) { (void)l; (void)id; return NULL; } /* stub: ignores both arguments, always NULL */
+
+/* Skip horizontal whitespace and comments, setting had_space when anything
+ * is skipped. Stops in front of '\n' outside comments (lex_next turns it
+ * into TOK_NEWLINE); newlines inside a block comment are consumed here. */
+static void skip_ws_and_comments(Lexer* l)
+{
+    for (;;) {
+        int ch = peek(l, 0);
+        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f') {
+            bump(l);
+            l->had_space = 1;
+            continue;
+        }
+        if (ch == '/' && peek(l, 1) == '/') { /* line comment */
+            bump(l); bump(l);
+            while (l->pos < l->len && (unsigned char)l->src[l->pos] != '\n') bump(l); /* the '\n' stays unread */
+            l->had_space = 1;
+            continue;
+        }
+        if (ch == '/' && peek(l, 1) == '*') { /* block comment */
+            bump(l); bump(l);
+            while (l->pos < l->len) { /* an unterminated comment silently runs to end of input */
+                if ((unsigned char)l->src[l->pos] == '*' &&
+                    l->pos + 1 < l->len &&
+                    (unsigned char)l->src[l->pos + 1] == '/') {
+                    bump(l); bump(l);
+                    break;
+                }
+                bump(l);
+            }
+            l->had_space = 1;
+            continue;
+        }
+        break; /* at '\n', end of input, or the first byte of a token */
+    }
+}
+
+/* Consume a pp-number per §6.4.8. The cursor is positioned at the leading
+ * digit (or `.` followed by a digit) on entry. */
+static void scan_pp_number(Lexer* l)
+{
+    if (peek(l, 0) == '.') bump(l);
+    bump(l); /* first digit */
+    while (l->pos < l->len) {
+        int c = peek(l, 0);
+        int n = peek(l, 1);
+        if ((c == 'e' || c == 'E' || c == 'p' || c == 'P') && (n == '+' || n == '-')) {
+            bump(l); bump(l); /* exponent letter and its sign both stay in the token */
+        } else if (is_alnum(c) || c == '.') { /* NOTE(review): UCNs are not matched here, unlike in identifiers */
+            bump(l);
+        } else {
+            break; /* first byte that cannot extend the pp-number */
+        }
+    }
+}
+
+/* 1 if the pp-number text looks like a floating constant (§6.4.4.2): any `.`,
+ * a `p`/`P` after a 0x/0X prefix, or else `e`/`E` followed by sign or digit. */
+static int pp_number_is_float(const char* s, size_t n)
+{
+    int    is_hex = 0;
+    size_t i      = 0;
+    if (n >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
+        is_hex = 1;
+        i      = 2; /* start scanning after the prefix */
+    }
+    for (; i < n; ++i) {
+        char c = s[i];
+        if (c == '.') return 1;
+        if (is_hex && (c == 'p' || c == 'P')) return 1;
+        if (!is_hex && (c == 'e' || c == 'E')) { /* in hex text e/E is a digit, so it is skipped */
+            if (i + 1 < n) { /* a trailing e/E with nothing after it does not count */
+                char nx = s[i + 1];
+                if (nx == '+' || nx == '-' || (nx >= '0' && nx <= '9')) return 1;
+            }
+        }
+    }
+    return 0; /* classified as an integer-style pp-number */
+}
+
+/* Consume a quoted body — string ('"') or character ('\''). The cursor is
+ * positioned at the opening quote on entry. Returns 1 on an unterminated or
+ * newline-broken literal, 0 on a clean close. */
+static int scan_quoted(Lexer* l, int quote)
+{
+    bump(l); /* opening quote */
+    while (l->pos < l->len) {
+        int ch = (unsigned char)l->src[l->pos];
+        if (ch == quote) { bump(l); return 0; } /* closing quote is consumed */
+        if (ch == '\n')  return 1;              /* the '\n' itself is left unread */
+        if (ch == '\\' && l->pos + 1 < l->len) {
+            bump(l); /* backslash */
+            bump(l); /* the escaped char (incl. potential newline in line
+                      * splice scenarios; we do not splice here) */
+            continue;
+        }
+        bump(l); /* ordinary byte, or a backslash that is the last byte of input */
+    }
+    return 1; /* ran off the end of the buffer without a closing quote */
+}
+
+Tok lex_next(Lexer* l) /* produce the next token; TOK_EOF once the input is exhausted */
+{
+    Tok    t;
+    SrcLoc tloc;
+    size_t start;
+    int    ch;
+
+    memset(&t, 0, sizeof(t)); /* fields not set below stay zero */
+
+    /* Skip whitespace and comments. A newline token is emitted before any
+     * subsequent content tokens for the line that follows. */
+    for (;;) { /* every path below returns or breaks, so the body runs once */
+        skip_ws_and_comments(l);
+        if (l->pos >= l->len) {
+            t.kind = TOK_EOF;
+            t.loc  = lex_here(l);
+            return t;
+        }
+        if (peek(l, 0) == '\n') {
+            tloc = lex_here(l); /* taken before bump so loc points at the '\n' */
+            bump(l);
+            t.kind  = TOK_NEWLINE;
+            t.loc   = tloc;
+            l->at_bol    = 1;
+            l->had_space = 0;
+            return t;
+        }
+        break;
+    }
+
+    tloc  = lex_here(l);
+    start = l->pos; /* spelling is src[start .. pos) once the token is consumed */
+    ch    = peek(l, 0);
+
+    if (l->at_bol)    t.flags |= TF_AT_BOL;
+    if (l->had_space) t.flags |= TF_HAS_SPACE;
+    l->at_bol    = 0; /* both flags apply to this token only */
+    l->had_space = 0;
+    t.loc = tloc;
+
+    /* String / character literal, with optional encoding prefix. The prefix
+     * length and encoding flag are decoded together so the spelling we
+     * intern includes the prefix bytes. */
+    {
+        int    sp_len  = -1; /* -1: not a literal; otherwise the prefix length in bytes */
+        int    is_char = 0;
+        u32    encf    = 0;
+
+        if (ch == '"')                                                                 { sp_len = 0; is_char = 0; }
+        else if (ch == '\'')                                                            { sp_len = 0; is_char = 1; }
+        else if (ch == 'L' && peek(l, 1) == '"')                                        { sp_len = 1; is_char = 0; encf = TF_STR_WIDE; }
+        else if (ch == 'L' && peek(l, 1) == '\'')                                       { sp_len = 1; is_char = 1; encf = TF_STR_WIDE; }
+        else if (ch == 'u' && peek(l, 1) == '8' && peek(l, 2) == '"')                   { sp_len = 2; is_char = 0; encf = TF_STR_U8;   }
+        else if (ch == 'u' && peek(l, 1) == '"')                                        { sp_len = 1; is_char = 0; encf = TF_STR_U16;  }
+        else if (ch == 'u' && peek(l, 1) == '\'')                                       { sp_len = 1; is_char = 1; encf = TF_STR_U16;  }
+        else if (ch == 'U' && peek(l, 1) == '"')                                        { sp_len = 1; is_char = 0; encf = TF_STR_U32;  }
+        else if (ch == 'U' && peek(l, 1) == '\'')                                       { sp_len = 1; is_char = 1; encf = TF_STR_U32;  }
+
+        if (sp_len >= 0) {
+            int i;
+            for (i = 0; i < sp_len; ++i) bump(l); /* step over the encoding prefix */
+            if (scan_quoted(l, is_char ? '\'' : '"')) t.flags |= TF_LITERAL_BAD; /* unterminated: token is still emitted */
+            t.kind     = (u16)(is_char ? TOK_CHR : TOK_STR);
+            t.flags   |= encf;
+            t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+            t.v.str    = t.spelling; /* raw spelling, quotes and escapes included */
+            return t;
+        }
+    }
+
+    /* Identifier (§6.4.2). Encoding-prefix candidates above are matched
+     * before this since L/u/U followed by a quote is a literal, not an
+     * identifier. The grammar's identifier-nondigit covers letters, _,
+     * extended source chars (impl-defined; bytes ≥ 0x80 here), and UCNs
+     * (§6.4.3) — the latter span multiple source bytes so they're matched
+     * via ucn_len rather than the per-byte is_alpha predicate. */
+    {
+        int u = ucn_len(l, 0);
+        if (is_alpha(ch) || u) {
+            if (u) { int i; for (i = 0; i < u; ++i) bump(l); }
+            else   bump(l);
+            for (;;) { /* continuation: alnum bytes and further UCNs */
+                int c = peek(l, 0);
+                if (is_alnum(c)) {
+                    bump(l);
+                } else if ((u = ucn_len(l, 0))) {
+                    int i; for (i = 0; i < u; ++i) bump(l);
+                } else {
+                    break;
+                }
+            }
+            t.kind     = TOK_IDENT;
+            t.spelling = pool_intern(l->pool, l->src + start, l->pos - start);
+            t.v.ident  = t.spelling;
+            return t;
+        }
+    }
+
+    /* pp-number (§6.4.8), then classified to TOK_NUM / TOK_FLT. */
+    if (is_digit(ch) || (ch == '.' && is_digit(peek(l, 1)))) {
+        scan_pp_number(l);
+        t.kind     = (u16)(pp_number_is_float(l->src + start, l->pos - start)
+                           ? TOK_FLT : TOK_NUM);
+        t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); /* t.v is not assigned for numbers */
+        return t;
+    }
+
+    /* Punctuator (§6.4.6) — longest match. `#` and `##` (and their digraph
+     * forms `%:` and `%:%:`) become TOK_PP_HASH / TOK_PP_PASTE so PP can
+     * recognize directives and the paste operator. */
+    {
+        int n0   = peek(l, 0);
+        int n1   = peek(l, 1);
+        int n2   = peek(l, 2);
+        int n3   = peek(l, 3);
+        int adv  = 1; /* number of bytes the chosen punctuator occupies */
+        u32 punct = P_NONE;
+        u16 kind = TOK_PUNCT;
+        int i;
+
+        switch (n0) {
+        case '#':
+            if (n1 == '#') { adv = 2; kind = TOK_PP_PASTE; punct = P_HASH_HASH; }
+            else           { adv = 1; kind = TOK_PP_HASH;  punct = '#';        }
+            break;
+        case '.': /* `.` followed by a digit was already taken as a pp-number */
+            if (n1 == '.' && n2 == '.') { adv = 3; punct = P_ELLIPSIS; }
+            else                        { adv = 1; punct = '.';        }
+            break;
+        case '-':
+            if      (n1 == '>') { adv = 2; punct = P_ARROW;      }
+            else if (n1 == '-') { adv = 2; punct = P_DEC;        }
+            else if (n1 == '=') { adv = 2; punct = P_SUB_ASSIGN; }
+            else                { adv = 1; punct = '-';          }
+            break;
+        case '+':
+            if      (n1 == '+') { adv = 2; punct = P_INC;        }
+            else if (n1 == '=') { adv = 2; punct = P_ADD_ASSIGN; }
+            else                { adv = 1; punct = '+';          }
+            break;
+        case '<':
+            if      (n1 == '<' && n2 == '=') { adv = 3; punct = P_SHL_ASSIGN; }
+            else if (n1 == '<')              { adv = 2; punct = P_SHL;        }
+            else if (n1 == '=')              { adv = 2; punct = P_LE;         }
+            else if (n1 == ':')              { adv = 2; punct = '[';          } /* digraph */
+            else if (n1 == '%')              { adv = 2; punct = '{';          } /* digraph */
+            else                              { adv = 1; punct = '<';          }
+            break;
+        case '>':
+            if      (n1 == '>' && n2 == '=') { adv = 3; punct = P_SHR_ASSIGN; }
+            else if (n1 == '>')              { adv = 2; punct = P_SHR;        }
+            else if (n1 == '=')              { adv = 2; punct = P_GE;         }
+            else                              { adv = 1; punct = '>';          }
+            break;
+        case '=':
+            if (n1 == '=') { adv = 2; punct = P_EQ; }
+            else           { adv = 1; punct = '=';  }
+            break;
+        case '!':
+            if (n1 == '=') { adv = 2; punct = P_NE; }
+            else           { adv = 1; punct = '!';  }
+            break;
+        case '&':
+            if      (n1 == '&') { adv = 2; punct = P_AND;        }
+            else if (n1 == '=') { adv = 2; punct = P_AND_ASSIGN; }
+            else                { adv = 1; punct = '&';          }
+            break;
+        case '|':
+            if      (n1 == '|') { adv = 2; punct = P_OR;        }
+            else if (n1 == '=') { adv = 2; punct = P_OR_ASSIGN; }
+            else                { adv = 1; punct = '|';         }
+            break;
+        case '^':
+            if (n1 == '=') { adv = 2; punct = P_XOR_ASSIGN; }
+            else           { adv = 1; punct = '^';          }
+            break;
+        case '*':
+            if (n1 == '=') { adv = 2; punct = P_MUL_ASSIGN; }
+            else           { adv = 1; punct = '*';          }
+            break;
+        case '/': /* comment openers were already consumed by skip_ws_and_comments */
+            if (n1 == '=') { adv = 2; punct = P_DIV_ASSIGN; }
+            else           { adv = 1; punct = '/';          }
+            break;
+        case '%':
+            if      (n1 == ':' && n2 == '%' && n3 == ':') { adv = 4; kind = TOK_PP_PASTE; punct = P_HASH_HASH; }
+            else if (n1 == ':')                            { adv = 2; kind = TOK_PP_HASH;  punct = '#';         }
+            else if (n1 == '=')                            { adv = 2; punct = P_MOD_ASSIGN; }
+            else if (n1 == '>')                            { adv = 2; punct = '}';          } /* digraph */
+            else                                           { adv = 1; punct = '%';          }
+            break;
+        case ':':
+            if (n1 == '>') { adv = 2; punct = ']'; } /* digraph */
+            else           { adv = 1; punct = ':';  }
+            break;
+        case '(': case ')': case '{': case '}': case '[': case ']':
+        case ',': case ';': case '?': case '~':
+            adv = 1; punct = (u32)n0;
+            break;
+        default:
+            /* Unknown byte. Surface as a single-char punct so the token
+             * stream still progresses; PP/parse may diagnose. */
+            adv = 1; punct = (u32)n0;
+            break;
+        }
+
+        for (i = 0; i < adv; ++i) bump(l);
+        t.kind     = kind;
+        t.v.punct  = punct; /* digraphs carry the value of the punctuator they stand for */
+        t.spelling = pool_intern(l->pool, l->src + start, l->pos - start); /* spelling keeps the bytes as written */
+        return t;
+    }
+}
diff --git a/test/lex/cases/basic_punct.c b/test/lex/cases/basic_punct.c
@@ -0,0 +1,3 @@
+int main(void) {
+    return x->y;
+}
diff --git a/test/lex/cases/basic_punct.expected b/test/lex/cases/basic_punct.expected
@@ -0,0 +1,16 @@
+(ident int)
+(ident main)
+(punct ()
+(ident void)
+(punct ))
+(punct {)
+(newline)
+(ident return)
+(ident x)
+(punct ->)
+(ident y)
+(punct ;)
+(newline)
+(punct })
+(newline)
+(eof)
diff --git a/test/lex/cases/comment_edges.c b/test/lex/cases/comment_edges.c
@@ -0,0 +1,12 @@
+/* /* */ x */
+/* / */ y
+/* * */ z
+/* foo **/ w
+/* a *//* b */ c
+/* /// not line comment */ k
+/* "looks like string" */ m
+/*****/
+/**//**/
+/* x
+   y
+   z */ n
diff --git a/test/lex/cases/comment_edges.expected b/test/lex/cases/comment_edges.expected
@@ -0,0 +1,21 @@
+(ident x)
+(punct *)
+(punct /)
+(newline)
+(ident y)
+(newline)
+(ident z)
+(newline)
+(ident w)
+(newline)
+(ident c)
+(newline)
+(ident k)
+(newline)
+(ident m)
+(newline)
+(newline)
+(newline)
+(ident n)
+(newline)
+(eof)
diff --git a/test/lex/cases/comments.c b/test/lex/cases/comments.c
@@ -0,0 +1,15 @@
+// just a line comment
+int x; /* block */ int y;
+/* multi
+   line
+   line */
+int z;
+a /**/ b
+c // tail comment
+d /* with * and / inside */ e
+// "string-like" text and /* nested-looking */ stays comment
+f /* contains // line-comment chars */ g
+h/**/i
+j/* one *//* two */k
+/*/ slash-star-slash content */ m
+// no trailing newline at EOF
+\ No newline at end of file
diff --git a/test/lex/cases/comments.expected b/test/lex/cases/comments.expected
@@ -0,0 +1,34 @@
+(newline)
+(ident int)
+(ident x)
+(punct ;)
+(ident int)
+(ident y)
+(punct ;)
+(newline)
+(newline)
+(ident int)
+(ident z)
+(punct ;)
+(newline)
+(ident a)
+(ident b)
+(newline)
+(ident c)
+(newline)
+(ident d)
+(ident e)
+(newline)
+(newline)
+(ident f)
+(ident g)
+(newline)
+(ident h)
+(ident i)
+(newline)
+(ident j)
+(ident k)
+(newline)
+(ident m)
+(newline)
+(eof)
diff --git a/test/lex/cases/empty.c b/test/lex/cases/empty.c
diff --git a/test/lex/cases/empty.expected b/test/lex/cases/empty.expected
@@ -0,0 +1 @@
+(eof)
diff --git a/test/lex/cases/float_constants.c b/test/lex/cases/float_constants.c
@@ -0,0 +1,34 @@
+1.0
+1.
+.1
+0.0
+3.14
+1e0
+1E10
+1e+1
+1e-1
+1.5e+2
+1.5e-2
+.5e10
+1.e10
+1.E-5
+.0
+.0L
+1.0f
+1.0F
+1.0l
+1.0L
+2.5f
+6.022e23L
+1.0Lf
+0x1p0
+0x1P0
+0X1P0
+0x1p+0
+0x1P-2
+0xFp-1
+0x1.8p+1
+0x.8p1
+0xA.Bp+3
+0x1.8
+0x1.
diff --git a/test/lex/cases/float_constants.expected b/test/lex/cases/float_constants.expected
@@ -0,0 +1,69 @@
+(flt 1.0)
+(newline)
+(flt 1.)
+(newline)
+(flt .1)
+(newline)
+(flt 0.0)
+(newline)
+(flt 3.14)
+(newline)
+(flt 1e0)
+(newline)
+(flt 1E10)
+(newline)
+(flt 1e+1)
+(newline)
+(flt 1e-1)
+(newline)
+(flt 1.5e+2)
+(newline)
+(flt 1.5e-2)
+(newline)
+(flt .5e10)
+(newline)
+(flt 1.e10)
+(newline)
+(flt 1.E-5)
+(newline)
+(flt .0)
+(newline)
+(flt .0L)
+(newline)
+(flt 1.0f)
+(newline)
+(flt 1.0F)
+(newline)
+(flt 1.0l)
+(newline)
+(flt 1.0L)
+(newline)
+(flt 2.5f)
+(newline)
+(flt 6.022e23L)
+(newline)
+(flt 1.0Lf)
+(newline)
+(flt 0x1p0)
+(newline)
+(flt 0x1P0)
+(newline)
+(flt 0X1P0)
+(newline)
+(flt 0x1p+0)
+(newline)
+(flt 0x1P-2)
+(newline)
+(flt 0xFp-1)
+(newline)
+(flt 0x1.8p+1)
+(newline)
+(flt 0x.8p1)
+(newline)
+(flt 0xA.Bp+3)
+(newline)
+(flt 0x1.8)
+(newline)
+(flt 0x1.)
+(newline)
+(eof)
diff --git a/test/lex/cases/identifiers.c b/test/lex/cases/identifiers.c
@@ -0,0 +1,31 @@
+foo
+_bar
+__baz
+foo123
+A1B2C3
+_
+__
+___
+a
+Z
+_0
+_9
+__func__
+__LINE__
+__FILE__
+camelCase
+PascalCase
+SHOUTY_SNAKE
+_private99
+mixed_Case_42
+_1_2_3
+abc_def_ghi
+x0
+x1y2z3_
+naïve
+λ_func
+café
+éstart
+caf\u00e9
+\u00e9start
+with\U0001F600paste
diff --git a/test/lex/cases/identifiers.expected b/test/lex/cases/identifiers.expected
@@ -0,0 +1,63 @@
+(ident foo)
+(newline)
+(ident _bar)
+(newline)
+(ident __baz)
+(newline)
+(ident foo123)
+(newline)
+(ident A1B2C3)
+(newline)
+(ident _)
+(newline)
+(ident __)
+(newline)
+(ident ___)
+(newline)
+(ident a)
+(newline)
+(ident Z)
+(newline)
+(ident _0)
+(newline)
+(ident _9)
+(newline)
+(ident __func__)
+(newline)
+(ident __LINE__)
+(newline)
+(ident __FILE__)
+(newline)
+(ident camelCase)
+(newline)
+(ident PascalCase)
+(newline)
+(ident SHOUTY_SNAKE)
+(newline)
+(ident _private99)
+(newline)
+(ident mixed_Case_42)
+(newline)
+(ident _1_2_3)
+(newline)
+(ident abc_def_ghi)
+(newline)
+(ident x0)
+(newline)
+(ident x1y2z3_)
+(newline)
+(ident naïve)
+(newline)
+(ident λ_func)
+(newline)
+(ident café)
+(newline)
+(ident éstart)
+(newline)
+(ident caf\u00e9)
+(newline)
+(ident \u00e9start)
+(newline)
+(ident with\U0001F600paste)
+(newline)
+(eof)
diff --git a/test/lex/cases/int_constants.c b/test/lex/cases/int_constants.c
@@ -0,0 +1,49 @@
+0
+1
+123
+2147483648
+00
+0755
+01234567
+0x0
+0X0
+0xFF
+0X1AbC
+0xDEADBEEF
+1u
+1U
+1l
+1L
+1ll
+1LL
+1ul
+1uL
+1Ul
+1UL
+1lu
+1Lu
+1lU
+1LU
+1ull
+1uLL
+1Ull
+1ULL
+1llu
+1LLu
+1llU
+1LLU
+0xFFu
+0755L
+0xFFFFFFFFULL
+1lL
+1Ll
+1lll
+1LLL
+1ufL
+0b1010ULL
+123abc
+0xGHI
+0x
+0X
+077u8
+0Lu8
diff --git a/test/lex/cases/int_constants.expected b/test/lex/cases/int_constants.expected
@@ -0,0 +1,99 @@
+(num 0)
+(newline)
+(num 1)
+(newline)
+(num 123)
+(newline)
+(num 2147483648)
+(newline)
+(num 00)
+(newline)
+(num 0755)
+(newline)
+(num 01234567)
+(newline)
+(num 0x0)
+(newline)
+(num 0X0)
+(newline)
+(num 0xFF)
+(newline)
+(num 0X1AbC)
+(newline)
+(num 0xDEADBEEF)
+(newline)
+(num 1u)
+(newline)
+(num 1U)
+(newline)
+(num 1l)
+(newline)
+(num 1L)
+(newline)
+(num 1ll)
+(newline)
+(num 1LL)
+(newline)
+(num 1ul)
+(newline)
+(num 1uL)
+(newline)
+(num 1Ul)
+(newline)
+(num 1UL)
+(newline)
+(num 1lu)
+(newline)
+(num 1Lu)
+(newline)
+(num 1lU)
+(newline)
+(num 1LU)
+(newline)
+(num 1ull)
+(newline)
+(num 1uLL)
+(newline)
+(num 1Ull)
+(newline)
+(num 1ULL)
+(newline)
+(num 1llu)
+(newline)
+(num 1LLu)
+(newline)
+(num 1llU)
+(newline)
+(num 1LLU)
+(newline)
+(num 0xFFu)
+(newline)
+(num 0755L)
+(newline)
+(num 0xFFFFFFFFULL)
+(newline)
+(num 1lL)
+(newline)
+(num 1Ll)
+(newline)
+(num 1lll)
+(newline)
+(num 1LLL)
+(newline)
+(num 1ufL)
+(newline)
+(num 0b1010ULL)
+(newline)
+(num 123abc)
+(newline)
+(num 0xGHI)
+(newline)
+(num 0x)
+(newline)
+(num 0X)
+(newline)
+(num 077u8)
+(newline)
+(num 0Lu8)
+(newline)
+(eof)
diff --git a/test/lex/cases/keywords.c b/test/lex/cases/keywords.c
@@ -0,0 +1,8 @@
+auto break case char const continue default do double
+else enum extern float for goto if inline int long
+register restrict return short signed sizeof static struct
+switch typedef union unsigned void volatile while
+_Alignas _Alignof _Atomic _Bool _Complex _Generic
+_Imaginary _Noreturn _Static_assert _Thread_local
+INT Int iNT _alignas _alignof _atomic _bool
+ints intt return0 returns ifx whilez forX gotoo
diff --git a/test/lex/cases/keywords.expected b/test/lex/cases/keywords.expected
@@ -0,0 +1,68 @@
+(ident auto)
+(ident break)
+(ident case)
+(ident char)
+(ident const)
+(ident continue)
+(ident default)
+(ident do)
+(ident double)
+(newline)
+(ident else)
+(ident enum)
+(ident extern)
+(ident float)
+(ident for)
+(ident goto)
+(ident if)
+(ident inline)
+(ident int)
+(ident long)
+(newline)
+(ident register)
+(ident restrict)
+(ident return)
+(ident short)
+(ident signed)
+(ident sizeof)
+(ident static)
+(ident struct)
+(newline)
+(ident switch)
+(ident typedef)
+(ident union)
+(ident unsigned)
+(ident void)
+(ident volatile)
+(ident while)
+(newline)
+(ident _Alignas)
+(ident _Alignof)
+(ident _Atomic)
+(ident _Bool)
+(ident _Complex)
+(ident _Generic)
+(newline)
+(ident _Imaginary)
+(ident _Noreturn)
+(ident _Static_assert)
+(ident _Thread_local)
+(newline)
+(ident INT)
+(ident Int)
+(ident iNT)
+(ident _alignas)
+(ident _alignof)
+(ident _atomic)
+(ident _bool)
+(newline)
+(ident ints)
+(ident intt)
+(ident return0)
+(ident returns)
+(ident ifx)
+(ident whilez)
+(ident forX)
+(ident gotoo)
+(newline)
+(eof)
diff --git a/test/lex/cases/line_splice.c b/test/lex/cases/line_splice.c
@@ -0,0 +1,25 @@
+he\
+llo
+"foo\
+bar"
+12\
+345
+0x1\
+2.3\
+p+\
+4
+in\
+t ma\
+in
+// foo\
+still
+end
+/* block\
+comment */ y
+a<\
+<b
+str\
+1
+\
+\
+trailing
diff --git a/test/lex/cases/line_splice.expected b/test/lex/cases/line_splice.expected
@@ -0,0 +1,25 @@
+(ident hello)
+(newline)
+(str "foobar")
+(newline)
+(num 12345)
+(newline)
+(flt 0x12.3p+4)
+(newline)
+(ident int)
+(ident main)
+(newline)
+(newline)
+(ident end)
+(newline)
+(ident y)
+(newline)
+(ident a)
+(punct <<)
+(ident b)
+(newline)
+(ident str1)
+(newline)
+(ident trailing)
+(newline)
+(eof)
diff --git a/test/lex/cases/maximal_munch.c b/test/lex/cases/maximal_munch.c
@@ -0,0 +1,21 @@
++++a
+a---b
+x<<=y
+x>>=y
+a<<b
+a>>b
+p->q
+i==j
+i!=j
+i<=j
+i>=j
+&&||
+...x
+..x
+.x
+x+++++y
+<::>
+<:::
+<%%>
+%:%:
+%:%
diff --git a/test/lex/cases/maximal_munch.expected b/test/lex/cases/maximal_munch.expected
@@ -0,0 +1,80 @@
+(punct ++)
+(punct +)
+(ident a)
+(newline)
+(ident a)
+(punct --)
+(punct -)
+(ident b)
+(newline)
+(ident x)
+(punct <<=)
+(ident y)
+(newline)
+(ident x)
+(punct >>=)
+(ident y)
+(newline)
+(ident a)
+(punct <<)
+(ident b)
+(newline)
+(ident a)
+(punct >>)
+(ident b)
+(newline)
+(ident p)
+(punct ->)
+(ident q)
+(newline)
+(ident i)
+(punct ==)
+(ident j)
+(newline)
+(ident i)
+(punct !=)
+(ident j)
+(newline)
+(ident i)
+(punct <=)
+(ident j)
+(newline)
+(ident i)
+(punct >=)
+(ident j)
+(newline)
+(punct &&)
+(punct ||)
+(newline)
+(punct ...)
+(ident x)
+(newline)
+(punct .)
+(punct .)
+(ident x)
+(newline)
+(punct .)
+(ident x)
+(newline)
+(ident x)
+(punct ++)
+(punct ++)
+(punct +)
+(ident y)
+(newline)
+(punct <:)
+(punct :>)
+(newline)
+(punct <:)
+(punct :)
+(punct :)
+(newline)
+(punct <%)
+(punct %>)
+(newline)
+(pp-paste)
+(newline)
+(pp-hash)
+(punct %)
+(newline)
+(eof)
diff --git a/test/lex/cases/pp_directives.c b/test/lex/cases/pp_directives.c
@@ -0,0 +1,21 @@
+#include "bar.h"
+#include <foo.h>
+#define MAX 100
+#define ID(x) x
+#define STR(x) #x
+#define CAT(a, b) a ## b
+#undef MAX
+#ifdef X
+#endif
+#ifndef Y
+#else
+#endif
+#if 1 + 2
+#elif 3
+#endif
+#error msg
+#pragma once
+#line 42 "f.c"
+#
+%:include "x.h"
+a %:%: b
diff --git a/test/lex/cases/pp_directives.expected b/test/lex/cases/pp_directives.expected
@@ -0,0 +1,100 @@
+(pp-hash)
+(ident include)
+(header "bar.h")
+(newline)
+(pp-hash)
+(ident include)
+(header <foo.h>)
+(newline)
+(pp-hash)
+(ident define)
+(ident MAX)
+(num 100)
+(newline)
+(pp-hash)
+(ident define)
+(ident ID)
+(punct ()
+(ident x)
+(punct ))
+(ident x)
+(newline)
+(pp-hash)
+(ident define)
+(ident STR)
+(punct ()
+(ident x)
+(punct ))
+(pp-hash)
+(ident x)
+(newline)
+(pp-hash)
+(ident define)
+(ident CAT)
+(punct ()
+(ident a)
+(punct ,)
+(ident b)
+(punct ))
+(ident a)
+(pp-paste)
+(ident b)
+(newline)
+(pp-hash)
+(ident undef)
+(ident MAX)
+(newline)
+(pp-hash)
+(ident ifdef)
+(ident X)
+(newline)
+(pp-hash)
+(ident endif)
+(newline)
+(pp-hash)
+(ident ifndef)
+(ident Y)
+(newline)
+(pp-hash)
+(ident else)
+(newline)
+(pp-hash)
+(ident endif)
+(newline)
+(pp-hash)
+(ident if)
+(num 1)
+(punct +)
+(num 2)
+(newline)
+(pp-hash)
+(ident elif)
+(num 3)
+(newline)
+(pp-hash)
+(ident endif)
+(newline)
+(pp-hash)
+(ident error)
+(ident msg)
+(newline)
+(pp-hash)
+(ident pragma)
+(ident once)
+(newline)
+(pp-hash)
+(ident line)
+(num 42)
+(str "f.c")
+(newline)
+(pp-hash)
+(newline)
+(pp-hash)
+(ident include)
+(header "x.h")
+(newline)
+(ident a)
+(pp-paste)
+(ident b)
+(newline)
+(eof)
diff --git a/test/lex/cases/pp_numbers.c b/test/lex/cases/pp_numbers.c
@@ -0,0 +1,29 @@
+0..1
+1...3
+.5.
+.5..6
+1e+1e+1
+123abc
+0xGHI
+1ea
+1e+x
+1e+
+1.e
+.5e
+0xAp
+0xAp+
+0xAp+x
+.
+.x
+. 5
+.5
+1...
+1.2.3.4
+0xA.Bp+3
+0xFFp+2.5
+99e
+99e+
+99e9
+99E-9
+1_underscore
+3.14_pi
diff --git a/test/lex/cases/pp_numbers.expected b/test/lex/cases/pp_numbers.expected
@@ -0,0 +1,61 @@
+(flt 0..1)
+(newline)
+(flt 1...3)
+(newline)
+(flt .5.)
+(newline)
+(flt .5..6)
+(newline)
+(flt 1e+1e+1)
+(newline)
+(num 123abc)
+(newline)
+(num 0xGHI)
+(newline)
+(num 1ea)
+(newline)
+(flt 1e+x)
+(newline)
+(flt 1e+)
+(newline)
+(flt 1.e)
+(newline)
+(flt .5e)
+(newline)
+(flt 0xAp)
+(newline)
+(flt 0xAp+)
+(newline)
+(flt 0xAp+x)
+(newline)
+(punct .)
+(newline)
+(punct .)
+(ident x)
+(newline)
+(punct .)
+(num 5)
+(newline)
+(flt .5)
+(newline)
+(flt 1...)
+(newline)
+(flt 1.2.3.4)
+(newline)
+(flt 0xA.Bp+3)
+(newline)
+(flt 0xFFp+2.5)
+(newline)
+(num 99e)
+(newline)
+(flt 99e+)
+(newline)
+(flt 99e9)
+(newline)
+(flt 99E-9)
+(newline)
+(num 1_underscore)
+(newline)
+(flt 3.14_pi)
+(newline)
+(eof)
diff --git a/test/lex/cases/pp_passthrough.c b/test/lex/cases/pp_passthrough.c
@@ -0,0 +1,2 @@
+#define X 1
+#include "foo.h"
diff --git a/test/lex/cases/pp_passthrough.expected b/test/lex/cases/pp_passthrough.expected
@@ -0,0 +1,10 @@
+(pp-hash)
+(ident define)
+(ident X)
+(num 1)
+(newline)
+(pp-hash)
+(ident include)
+(header "foo.h")
+(newline)
+(eof)
diff --git a/test/lex/cases/punctuators.c b/test/lex/cases/punctuators.c
@@ -0,0 +1,10 @@
+[ ] ( ) { } . ->
+++ -- & * + - ~ !
+/ % << >> < > <= >= == != ^ | && ||
+? : ; ...
+= *= /= %= += -= <<= >>= &= ^= |=
+,
+# ##
+<: :> <% %> %: %:%:
+<:a:>
+<%b%>
diff --git a/test/lex/cases/punctuators.expected b/test/lex/cases/punctuators.expected
@@ -0,0 +1,71 @@
+(punct [)
+(punct ])
+(punct ()
+(punct ))
+(punct {)
+(punct })
+(punct .)
+(punct ->)
+(newline)
+(punct ++)
+(punct --)
+(punct &)
+(punct *)
+(punct +)
+(punct -)
+(punct ~)
+(punct !)
+(newline)
+(punct /)
+(punct %)
+(punct <<)
+(punct >>)
+(punct <)
+(punct >)
+(punct <=)
+(punct >=)
+(punct ==)
+(punct !=)
+(punct ^)
+(punct |)
+(punct &&)
+(punct ||)
+(newline)
+(punct ?)
+(punct :)
+(punct ;)
+(punct ...)
+(newline)
+(punct =)
+(punct *=)
+(punct /=)
+(punct %=)
+(punct +=)
+(punct -=)
+(punct <<=)
+(punct >>=)
+(punct &=)
+(punct ^=)
+(punct |=)
+(newline)
+(punct ,)
+(newline)
+(pp-hash)
+(pp-paste)
+(newline)
+(punct <:)
+(punct :>)
+(punct <%)
+(punct %>)
+(pp-hash)
+(pp-paste)
+(newline)
+(punct <:)
+(ident a)
+(punct :>)
+(newline)
+(punct <%)
+(ident b)
+(punct %>)
+(newline)
+(eof)
diff --git a/test/lex/cases/string_escapes.c b/test/lex/cases/string_escapes.c
@@ -0,0 +1,38 @@
+'\''
+'\"'
+'\?'
+'\\'
+'\a'
+'\b'
+'\f'
+'\n'
+'\r'
+'\t'
+'\v'
+'\0'
+'\7'
+'\077'
+'\377'
+'\x0'
+'\x41'
+'\xff'
+'\xfff'
+'\U0001F600'
+"\a\b\f\n\r\t\v\"\'\\\?"
+"\0\7\077\377"
+"\x0\x41\xff\xfff"
+"\\"
+"\""
+"a\nb"
+"tab\there"
+L"\n"
+u8"\xff"
+u"é"
+U"\U0001F600"
+'\18'
+'\779'
+"\1234"
+"\xffG"
+"\xabc\x12"
+'é'
+"é "
diff --git a/test/lex/cases/string_escapes.expected b/test/lex/cases/string_escapes.expected
@@ -0,0 +1,77 @@
+(chr '\'')
+(newline)
+(chr '\"')
+(newline)
+(chr '\?')
+(newline)
+(chr '\\')
+(newline)
+(chr '\a')
+(newline)
+(chr '\b')
+(newline)
+(chr '\f')
+(newline)
+(chr '\n')
+(newline)
+(chr '\r')
+(newline)
+(chr '\t')
+(newline)
+(chr '\v')
+(newline)
+(chr '\0')
+(newline)
+(chr '\7')
+(newline)
+(chr '\077')
+(newline)
+(chr '\377')
+(newline)
+(chr '\x0')
+(newline)
+(chr '\x41')
+(newline)
+(chr '\xff')
+(newline)
+(chr '\xfff')
+(newline)
+(chr '\U0001F600')
+(newline)
+(str "\a\b\f\n\r\t\v\"\'\\\?")
+(newline)
+(str "\0\7\077\377")
+(newline)
+(str "\x0\x41\xff\xfff")
+(newline)
+(str "\\")
+(newline)
+(str "\"")
+(newline)
+(str "a\nb")
+(newline)
+(str "tab\there")
+(newline)
+(str L"\n")
+(newline)
+(str u8"\xff")
+(newline)
+(str u"é")
+(newline)
+(str U"\U0001F600")
+(newline)
+(chr '\18')
+(newline)
+(chr '\779')
+(newline)
+(str "\1234")
+(newline)
+(str "\xffG")
+(newline)
+(str "\xabc\x12")
+(newline)
+(chr 'é')
+(newline)
+(str "é ")
+(newline)
+(eof)
diff --git a/test/lex/cases/strings_chars.c b/test/lex/cases/strings_chars.c
@@ -0,0 +1,23 @@
+""
+"hello"
+'a'
+'0'
+' '
+'ab'
+"a" "b" "c"
+"L"
+"u8"
+"u"
+"U"
+L"wide"
+u8"utf8"
+u"u16"
+U"u32"
+L'w'
+u'A'
+U'B'
+u8'a'
+L "x"
+L"x"
+'À'
+"À\U0001F600"
diff --git a/test/lex/cases/strings_chars.expected b/test/lex/cases/strings_chars.expected
@@ -0,0 +1,51 @@
+(str "")
+(newline)
+(str "hello")
+(newline)
+(chr 'a')
+(newline)
+(chr '0')
+(newline)
+(chr ' ')
+(newline)
+(chr 'ab')
+(newline)
+(str "a")
+(str "b")
+(str "c")
+(newline)
+(str "L")
+(newline)
+(str "u8")
+(newline)
+(str "u")
+(newline)
+(str "U")
+(newline)
+(str L"wide")
+(newline)
+(str u8"utf8")
+(newline)
+(str u"u16")
+(newline)
+(str U"u32")
+(newline)
+(chr L'w')
+(newline)
+(chr u'A')
+(newline)
+(chr U'B')
+(newline)
+(ident u8)
+(chr 'a')
+(newline)
+(ident L)
+(str "x")
+(newline)
+(str L"x")
+(newline)
+(chr 'À')
+(newline)
+(str "À\U0001F600")
+(newline)
+(eof)
diff --git a/test/lex/run.sh b/test/lex/run.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# Data-driven lexer test runner.
+#
+# For each test/lex/cases/*.c, runs `cfree cc --dump-tokens` and diffs the
+# output against the matching .expected file. Leaves .actual files behind on
+# failure so they can be reviewed and copied over the expected baseline once
+# intentional output changes are validated.
+#
+# Honors $CFREE for the binary path; defaults to build/cfree relative to the
+# repo root inferred from this script's location.
+
+set -u  # treat expansion of an unset variable as an error
+
+script_dir=$(cd "$(dirname "$0")" && pwd)  # absolute directory of this script
+repo_root=$(cd "$script_dir/../.." && pwd)  # two directories above script_dir
+cases_dir="$script_dir/cases"
+
+CFREE="${CFREE:-$repo_root/build/cfree}"
+
+if [ ! -x "$CFREE" ]; then
+    echo "lex: cfree binary not found at $CFREE" >&2
+    exit 2  # distinct from the exit 1 used for test failures
+fi
+
+pass=0
+fail=0
+failures=  # accumulates " name" for each failing case
+
+for src in "$cases_dir"/*.c; do
+    [ -e "$src" ] || continue  # skip the literal pattern when no case matches
+    expected="${src%.c}.expected"
+    actual="${src%.c}.actual"
+    name=$(basename "${src%.c}")
+
+    if [ ! -e "$expected" ]; then  # a case without a baseline counts as a failure
+        printf 'FAIL %s (missing %s)\n' "$name" "$(basename "$expected")"
+        fail=$((fail + 1))
+        failures="$failures $name"
+        continue
+    fi
+
+    if ! "$CFREE" cc --dump-tokens "$src" -o "$actual" >/dev/null 2>&1; then
+        printf 'FAIL %s (cfree exit nonzero; see %s)\n' "$name" "$(basename "$actual")"
+        fail=$((fail + 1))
+        failures="$failures $name"
+        continue
+    fi
+
+    if diff -u "$expected" "$actual" >/dev/null 2>&1; then  # silent compare first
+        printf 'PASS %s\n' "$name"
+        rm -f "$actual"  # keep .actual only for failing cases
+        pass=$((pass + 1))
+    else
+        printf 'FAIL %s\n' "$name"
+        diff -u "$expected" "$actual" || true  # rerun to print the diff
+        fail=$((fail + 1))
+        failures="$failures $name"
+    fi
+done
+
+total=$((pass + fail))
+printf '\nlex: %d/%d passed\n' "$pass" "$total"
+if [ "$fail" -gt 0 ]; then
+    printf 'lex: failures:%s\n' "$failures"
+    exit 1
+fi
diff --git a/test/pp/cases/define_function.actual b/test/pp/cases/define_function.actual
diff --git a/test/pp/cases/define_function.c b/test/pp/cases/define_function.c
@@ -0,0 +1,2 @@
+#define ADD(a, b) ((a) + (b))
+ADD(1, 2 * 3)
diff --git a/test/pp/cases/define_function.expected b/test/pp/cases/define_function.expected
@@ -0,0 +1 @@
+((1) + (2 * 3))
diff --git a/test/pp/cases/define_object.actual b/test/pp/cases/define_object.actual
diff --git a/test/pp/cases/define_object.c b/test/pp/cases/define_object.c
@@ -0,0 +1,2 @@
+#define X 42
+X X X
diff --git a/test/pp/cases/define_object.expected b/test/pp/cases/define_object.expected
@@ -0,0 +1 @@
+42 42 42
diff --git a/test/pp/cases/ifdef_basic.actual b/test/pp/cases/ifdef_basic.actual
diff --git a/test/pp/cases/ifdef_basic.c b/test/pp/cases/ifdef_basic.c
@@ -0,0 +1,11 @@
+#define KEEP 1
+#ifdef KEEP
+chosen
+#else
+skipped
+#endif
+#ifndef MISSING
+also_chosen
+#else
+also_skipped
+#endif
diff --git a/test/pp/cases/ifdef_basic.expected b/test/pp/cases/ifdef_basic.expected
@@ -0,0 +1,2 @@
+chosen
+also_chosen
diff --git a/test/pp/cases/include_local.actual b/test/pp/cases/include_local.actual
diff --git a/test/pp/cases/include_local.c b/test/pp/cases/include_local.c
@@ -0,0 +1,2 @@
+#include "include_local.h"
+FROM_HEADER
diff --git a/test/pp/cases/include_local.expected b/test/pp/cases/include_local.expected
@@ -0,0 +1,2 @@
+header_token
+7
diff --git a/test/pp/cases/include_local.h b/test/pp/cases/include_local.h
@@ -0,0 +1,2 @@
+#define FROM_HEADER 7
+header_token
diff --git a/test/pp/cases/undef.actual b/test/pp/cases/undef.actual
diff --git a/test/pp/cases/undef.c b/test/pp/cases/undef.c
@@ -0,0 +1,6 @@
+#define X 1
+X
+#undef X
+X
+#define X 2
+X
diff --git a/test/pp/cases/undef.expected b/test/pp/cases/undef.expected
@@ -0,0 +1,3 @@
+1
+X
+2
diff --git a/test/pp/run.sh b/test/pp/run.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# Data-driven preprocessor test runner.
+#
+# For each test/pp/cases/*.c, runs `cfree cc -E` (with -I pointing at the
+# cases dir so sibling headers resolve) and diffs the output against the
+# matching .expected file. Leaves .actual files behind on failure so they
+# can be reviewed and copied over the expected baseline once intentional
+# output changes are validated.
+#
+# Honors $CFREE for the binary path; defaults to build/cfree relative to the
+# repo root inferred from this script's location.
+
+set -u  # treat expansion of an unset variable as an error
+
+script_dir=$(cd "$(dirname "$0")" && pwd)  # absolute directory of this script
+repo_root=$(cd "$script_dir/../.." && pwd)  # two directories above script_dir
+cases_dir="$script_dir/cases"
+
+CFREE="${CFREE:-$repo_root/build/cfree}"
+
+if [ ! -x "$CFREE" ]; then
+    echo "pp: cfree binary not found at $CFREE" >&2
+    exit 2  # distinct from the exit 1 used for test failures
+fi
+
+pass=0
+fail=0
+failures=  # accumulates " name" for each failing case
+
+for src in "$cases_dir"/*.c; do
+    [ -e "$src" ] || continue  # skip the literal pattern when no case matches
+    expected="${src%.c}.expected"
+    actual="${src%.c}.actual"
+    name=$(basename "${src%.c}")
+
+    if [ ! -e "$expected" ]; then  # a case without a baseline counts as a failure
+        printf 'FAIL %s (missing %s)\n' "$name" "$(basename "$expected")"
+        fail=$((fail + 1))
+        failures="$failures $name"
+        continue
+    fi
+
+    if ! "$CFREE" cc -E -I "$cases_dir" "$src" -o "$actual" >/dev/null 2>&1; then
+        printf 'FAIL %s (cfree exit nonzero; see %s)\n' "$name" "$(basename "$actual")"
+        fail=$((fail + 1))
+        failures="$failures $name"
+        continue
+    fi
+
+    if diff -u "$expected" "$actual" >/dev/null 2>&1; then  # silent compare first
+        printf 'PASS %s\n' "$name"
+        rm -f "$actual"  # keep .actual only for failing cases
+        pass=$((pass + 1))
+    else
+        printf 'FAIL %s\n' "$name"
+        diff -u "$expected" "$actual" || true  # rerun to print the diff
+        fail=$((fail + 1))
+        failures="$failures $name"
+    fi
+done
+
+total=$((pass + fail))
+printf '\npp: %d/%d passed\n' "$pass" "$total"
+if [ "$fail" -gt 0 ]; then
+    printf 'pp: failures:%s\n' "$failures"
+    exit 1
+fi
diff --git a/test/test.mk b/test/test.mk
@@ -0,0 +1,41 @@
+# Data-driven tests. Included from the top-level Makefile.
+#
+# - test-lex / test-pp: C frontend runners; depend on the cfree driver
+#   binary, which today fails to link (libcfree is mostly declarations so far).
+# - test-elf: ELF roundtrip harness in test/elf/; depends only on
+#   libcfree.a and compiles its own test binaries against it. Skipped
+#   layers are reported (set CFREE_TEST_ALLOW_SKIP=1 to allow skips).
+# - test-ar:  in-process ar reader/writer tests; depends only on
+#   libcfree.a. Set CFREE_AR_TEST_HOST=1 to also dump produced bytes
+#   to /tmp and run the host's `ar t` / `nm --print-armap` as a
+#   cross-check.
+
+.PHONY: test test-lex test-pp test-elf test-ar
+
+test: test-lex test-pp test-elf test-ar
+
+test-lex: bin
+	@CFREE=$(BIN) test/lex/run.sh
+
+test-pp: bin
+	@CFREE=$(BIN) test/pp/run.sh
+
+test-elf: lib bin-soft
+	bash test/elf/run.sh
+
+# Best-effort cfree binary build: Layer D needs build/cfree, but the
+# binary may not link until enough libcfree symbols exist. The harness
+# detects a missing binary and skips that layer; don't break test-elf
+# when bin fails.
+.PHONY: bin-soft
+bin-soft:
+	-@$(MAKE) bin 2>/dev/null || true
+
+AR_TEST_BIN = build/test/ar_test
+
+test-ar: $(AR_TEST_BIN)
+	$(AR_TEST_BIN)
+
+$(AR_TEST_BIN): test/ar_test.c $(LIB_AR)
+	@mkdir -p $(dir $@)
+	$(CC) $(DRIVER_CFLAGS) test/ar_test.c $(LIB_AR) -o $@

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

A	src/lex/lex.c	\|	436	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/basic_punct.c	\|	3	+++
A	test/lex/cases/basic_punct.expected	\|	16	++++++++++++++++
A	test/lex/cases/comment_edges.c	\|	12	++++++++++++
A	test/lex/cases/comment_edges.expected	\|	21	+++++++++++++++++++++
A	test/lex/cases/comments.c	\|	16	++++++++++++++++
A	test/lex/cases/comments.expected	\|	34	++++++++++++++++++++++++++++++++++
A	test/lex/cases/empty.c	\|	0
A	test/lex/cases/empty.expected	\|	1	+
A	test/lex/cases/float_constants.c	\|	34	++++++++++++++++++++++++++++++++++
A	test/lex/cases/float_constants.expected	\|	69	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/identifiers.c	\|	31	+++++++++++++++++++++++++++++++
A	test/lex/cases/identifiers.expected	\|	63	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/int_constants.c	\|	49	+++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/int_constants.expected	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/keywords.c	\|	8	++++++++
A	test/lex/cases/keywords.expected	\|	68	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/line_splice.c	\|	25	+++++++++++++++++++++++++
A	test/lex/cases/line_splice.expected	\|	25	+++++++++++++++++++++++++
A	test/lex/cases/maximal_munch.c	\|	21	+++++++++++++++++++++
A	test/lex/cases/maximal_munch.expected	\|	80	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/pp_directives.c	\|	21	+++++++++++++++++++++
A	test/lex/cases/pp_directives.expected	\|	100	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/pp_numbers.c	\|	29	+++++++++++++++++++++++++++++
A	test/lex/cases/pp_numbers.expected	\|	61	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/pp_passthrough.c	\|	2	++
A	test/lex/cases/pp_passthrough.expected	\|	10	++++++++++
A	test/lex/cases/punctuators.c	\|	10	++++++++++
A	test/lex/cases/punctuators.expected	\|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/string_escapes.c	\|	38	++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/string_escapes.expected	\|	77	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/cases/strings_chars.c	\|	23	+++++++++++++++++++++++
A	test/lex/cases/strings_chars.expected	\|	51	+++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/lex/run.sh	\|	66	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/pp/cases/define_function.actual	\|	0
A	test/pp/cases/define_function.c	\|	2	++
A	test/pp/cases/define_function.expected	\|	1	+
A	test/pp/cases/define_object.actual	\|	0
A	test/pp/cases/define_object.c	\|	2	++
A	test/pp/cases/define_object.expected	\|	1	+
A	test/pp/cases/ifdef_basic.actual	\|	0
A	test/pp/cases/ifdef_basic.c	\|	11	+++++++++++
A	test/pp/cases/ifdef_basic.expected	\|	2	++
A	test/pp/cases/include_local.actual	\|	0
A	test/pp/cases/include_local.c	\|	2	++
A	test/pp/cases/include_local.expected	\|	2	++
A	test/pp/cases/include_local.h	\|	2	++
A	test/pp/cases/undef.actual	\|	0
A	test/pp/cases/undef.c	\|	6	++++++
A	test/pp/cases/undef.expected	\|	3	+++
A	test/pp/run.sh	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/test.mk	\|	41	+++++++++++++++++++++++++++++++++++++++++