Make M1pp directives whitespace-insensitive in M1pp.c - boot2

commit 65d201320ea94388e45a60bb03831a769cba3382
parent e01f261694fd6df753bb3e6cf5b27490d44a6752
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat, 25 Apr 2026 13:02:11 -0700

Make M1pp directives whitespace-insensitive in M1pp.c

Paren-call (`%FOO(x)`, `!(expr)`, etc.) only fuses when `(` is tight
against the name — any whitespace, including a newline, prevents fusion
and falls through to literal pass-through. Implemented via a `tight` bit
on each Token, set by the lexer when no whitespace separator was seen.

Directive headers (%macro, %struct, %enum, %scope) accept newlines
between their tokens, so one-line definitions and multi-line param
lists both parse. Directives are recognized regardless of line position;
%macro/%endm/%endscope/struct-or-enum-} no longer require line_start.

Block terminators (%endm, %endscope, struct/enum }) must be immediately
followed by TOK_NEWLINE. The newline is consumed only when the directive
itself started at line_start — that way mid-line directives leave the
user's trailing newline in the stream for the main loop to emit.

`##` paste skips newlines on either side, so a body like `foo ##\n bar`
joins to `foobar`.

Diffstat:
M M1pp/M1pp.c  | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------

1 file changed, 139 insertions(+), 75 deletions(-)
diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c
@@ -33,18 +33,24 @@
  *      The source token array is pushed as the initial stream. Each iteration
  *      pops a token from the top stream:
  *
- *        %macro NAME(p,...) / %endm at line-start
+ *        %macro NAME(p,...) / %endm
  *          -> define_macro(): consume header + body tokens into macros[] and
- *             macro_body_tokens[]; register name and param list.
+ *             macro_body_tokens[]; register name and param list. Header is
+ *             whitespace-insensitive (newlines inside (...) are skipped);
+ *             %endm is recognized anywhere and must be followed by NEWLINE.
+ *             A directive that started at line_start consumes its trailing
+ *             newline; mid-line directives leave it for the main loop.
  *
  *        !(e) / @(e) / %(e) / $(e) / %select(c,t,e)
  *          -> expand_builtin_call(): parse arg spans, eval S-expression(s) via
  *             eval_expr_range(), emit LE hex or push the chosen token span.
+ *             Only fuses when ( is tight against the name (no whitespace).
  *
  *        %NAME(...) matching a defined macro
  *          -> expand_call() -> expand_macro_tokens(): substitute arguments,
  *             apply ## paste via paste_pool_range(), write result into
  *             expand_pool[], then push that slice as a new stream (rescan).
+ *             Tight ( required for paren-form; otherwise treated as 0-arg.
  *
  *        Anything else
  *          -> emit_token() / emit_newline() directly into output_buf.
@@ -120,6 +126,7 @@ struct TextSpan {
 
 struct Token {
     int kind;
+    int tight;
     struct TextSpan text;
 };
 
@@ -206,12 +213,13 @@ static char *append_text_len(const char *s, int len)
 }
 
 static int push_token(struct Token *buf, int *count, int max_count,
-                      int kind, struct TextSpan text)
+                      int kind, int tight, struct TextSpan text)
 {
     if (*count >= max_count) {
         return fail("token overflow");
     }
     buf[*count].kind = kind;
+    buf[*count].tight = tight;
     buf[*count].text = text;
     *count += 1;
     return 1;
@@ -242,27 +250,36 @@ static int span_eq_token(struct TextSpan span, const struct Token *tok)
 
 static int lex_source(const char *src)
 {
+    /* Track whether whitespace (space, tab, comment, OR newline) precedes
+     * the next token. tight=1 means "no whitespace before me"; only
+     * LPAREN's tight bit is consulted, to decide whether %FOO(...) /
+     * !(...) etc. are paren-call forms. */
     int i = 0;
+    int saw_separator = 1;
 
     while (src[i] != '\0') {
         int start;
         int len;
+        int tight;
 
         if (is_space_no_nl((unsigned char)src[i])) {
+            saw_separator = 1;
             i++;
             continue;
         }
         if (src[i] == '\n') {
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_NEWLINE, (struct TextSpan){src + i, 1})) {
+                            TOK_NEWLINE, 0, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
+            saw_separator = 1;
             i++;
             continue;
         }
         if (src[i] == '"' || src[i] == '\'') {
             int quote = src[i];
 
+            tight = !saw_separator;
             start = i;
             i++;
             while (src[i] != '\0' && src[i] != quote) {
@@ -273,66 +290,81 @@ static int lex_source(const char *src)
             }
             len = i - start;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_STRING, (struct TextSpan){src + start, len})) {
+                            TOK_STRING, tight, (struct TextSpan){src + start, len})) {
                 return 0;
             }
+            saw_separator = 0;
             continue;
         }
         if (src[i] == '#' && src[i + 1] == '#') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_PASTE, (struct TextSpan){src + i, 2})) {
+                            TOK_PASTE, tight, (struct TextSpan){src + i, 2})) {
                 return 0;
             }
             i += 2;
+            saw_separator = 0;
             continue;
         }
         if (src[i] == '#' || src[i] == ';') {
+            saw_separator = 1;
             while (src[i] != '\0' && src[i] != '\n') {
                 i++;
             }
             continue;
         }
         if (src[i] == '(') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_LPAREN, (struct TextSpan){src + i, 1})) {
+                            TOK_LPAREN, tight, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
             i++;
+            saw_separator = 0;
             continue;
         }
         if (src[i] == ')') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_RPAREN, (struct TextSpan){src + i, 1})) {
+                            TOK_RPAREN, tight, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
             i++;
+            saw_separator = 0;
             continue;
         }
         if (src[i] == ',') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_COMMA, (struct TextSpan){src + i, 1})) {
+                            TOK_COMMA, tight, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
             i++;
+            saw_separator = 0;
             continue;
         }
         if (src[i] == '{') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_LBRACE, (struct TextSpan){src + i, 1})) {
+                            TOK_LBRACE, tight, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
             i++;
+            saw_separator = 0;
             continue;
         }
         if (src[i] == '}') {
+            tight = !saw_separator;
             if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                            TOK_RBRACE, (struct TextSpan){src + i, 1})) {
+                            TOK_RBRACE, tight, (struct TextSpan){src + i, 1})) {
                 return 0;
             }
             i++;
+            saw_separator = 0;
             continue;
         }
 
+        tight = !saw_separator;
         start = i;
         while (src[i] != '\0' &&
                !is_space_no_nl((unsigned char)src[i]) &&
@@ -349,9 +381,10 @@ static int lex_source(const char *src)
         }
         len = i - start;
         if (!push_token(source_tokens, &source_count, MAX_TOKENS,
-                        TOK_WORD, (struct TextSpan){src + start, len})) {
+                        TOK_WORD, tight, (struct TextSpan){src + start, len})) {
             return 0;
         }
+        saw_separator = 0;
     }
 
     return 1;
@@ -538,7 +571,7 @@ static int push_pool_stream_from_mark(int mark)
                             mark);
 }
 
-static void skip_expr_newlines(struct Token **pos, struct Token *end)
+static void skip_newlines(struct Token **pos, struct Token *end)
 {
     while (*pos < end && (*pos)->kind == TOK_NEWLINE) {
         *pos += 1;
@@ -620,6 +653,7 @@ static int define_fielded_macro(struct TextSpan base, const char *suffix,
     }
     m->param_count = 0;
     body_tok.kind = TOK_WORD;
+    body_tok.tight = 0;
     if (!emit_decimal_text(value, &body_tok.text)) {
         return 0;
     }
@@ -636,20 +670,22 @@ static int define_fielded(struct Stream *s, long long stride,
     /* Parses `%struct NAME { f1 f2 ... }` or `%enum NAME { ... }` and
      * synthesizes N+1 zero-parameter macros:
      *   NAME.field_k  -> k * stride
-     *   NAME.<total>  -> N * stride    (SIZE for struct, COUNT for enum) */
+     *   NAME.<total>  -> N * stride    (SIZE for struct, COUNT for enum)
+     * The closing } must be immediately followed by TOK_NEWLINE. The
+     * newline is consumed iff the directive started at line_start. */
     struct TextSpan base;
     long long index = 0;
+    int started_at_line_start = s->line_start;
 
     s->pos++;
+    skip_newlines(&s->pos, s->end);
     if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
         return fail("bad directive");
     }
     base = s->pos->text;
     s->pos++;
 
-    while (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
-        s->pos++;
-    }
+    skip_newlines(&s->pos, s->end);
     if (s->pos >= s->end || s->pos->kind != TOK_LBRACE) {
         return fail("bad directive");
     }
@@ -682,20 +718,28 @@ static int define_fielded(struct Stream *s, long long stride,
         return 0;
     }
 
-    while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
-        s->pos++;
+    if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+        return fail("expected newline after struct/enum");
     }
-    if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
+    if (started_at_line_start) {
         s->pos++;
+        s->line_start = 1;
     }
-    s->line_start = 1;
     return 1;
 }
 
 static int define_macro(struct Stream *s)
 {
+    /* Header is whitespace-insensitive: newlines inside (...) and around
+     * the keywords are skipped. Body collection skips newlines that fall
+     * between `)` and the first body token (so `%macro N()\nbody\n%endm`
+     * has body=[WORD body, NEWLINE], same as the old required-newline form).
+     * %endm is recognized anywhere in the body; the next token must be
+     * TOK_NEWLINE. The newline is consumed only when the directive started
+     * at s->line_start — that way mid-line directives leave the user's
+     * trailing newline in the stream for the main loop to emit. */
     struct Macro *m;
-    int line_start;
+    int started_at_line_start = s->line_start;
 
     if (macro_count >= MAX_MACROS) {
         return fail("too many macros");
@@ -708,17 +752,20 @@ static int define_macro(struct Stream *s)
     memset(m, 0, sizeof(*m));
     s->pos++;
 
+    skip_newlines(&s->pos, s->end);
     if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
         return fail("bad macro header");
     }
     m->name = s->pos->text;
     s->pos++;
 
+    skip_newlines(&s->pos, s->end);
     if (s->pos >= s->end || s->pos->kind != TOK_LPAREN) {
         return fail("bad macro header");
     }
     s->pos++;
 
+    skip_newlines(&s->pos, s->end);
     if (s->pos < s->end && s->pos->kind != TOK_RPAREN) {
         while (1) {
             if (m->param_count >= MAX_PARAMS) {
@@ -730,8 +777,10 @@ static int define_macro(struct Stream *s)
             m->params[m->param_count] = s->pos->text;
             m->param_count++;
             s->pos++;
+            skip_newlines(&s->pos, s->end);
             if (s->pos < s->end && s->pos->kind == TOK_COMMA) {
                 s->pos++;
+                skip_newlines(&s->pos, s->end);
                 continue;
             }
             break;
@@ -742,26 +791,20 @@ static int define_macro(struct Stream *s)
         return fail("bad macro header");
     }
     s->pos++;
-
-    if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
-        return fail("bad macro header");
-    }
-    s->pos++;
+    skip_newlines(&s->pos, s->end);
 
     m->body_start = macro_body_tokens + macro_body_used;
-    line_start = 1;
     while (s->pos < s->end) {
-        if (line_start &&
-            s->pos->kind == TOK_WORD &&
-            token_text_eq(s->pos, "%endm")) {
-            while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
-                s->pos++;
+        if (s->pos->kind == TOK_WORD && token_text_eq(s->pos, "%endm")) {
+            s->pos++;
+            if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+                return fail("expected newline after %endm");
             }
-            if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
+            if (started_at_line_start) {
                 s->pos++;
+                s->line_start = 1;
             }
             m->body_end = macro_body_tokens + macro_body_used;
-            s->line_start = 1;
             macro_count++;
             return 1;
         }
@@ -769,7 +812,6 @@ static int define_macro(struct Stream *s)
             return fail("macro body overflow");
         }
         macro_body_tokens[macro_body_used++] = *s->pos;
-        line_start = (s->pos->kind == TOK_NEWLINE);
         s->pos++;
     }
 
@@ -915,6 +957,7 @@ static int append_pasted_token(struct Token *dst,
         return 0;
     }
     dst->kind = TOK_WORD;
+    dst->tight = 0;
     dst->text.ptr = text_ptr;
     dst->text.len = n;
     return 1;
@@ -922,6 +965,10 @@ static int append_pasted_token(struct Token *dst,
 
 static int paste_pool_range(int mark)
 {
+    /* Skip newlines on both sides of TOK_PASTE: a body like `foo ##\n bar`
+     * pastes to `foobar`, discarding the intervening newline. The left
+     * operand is the rightmost non-newline already copied to `out`; the
+     * right operand is the next non-newline past PASTE in `in`. */
     struct Token *start = expand_pool + mark;
     struct Token *in = start;
     struct Token *out = start;
@@ -929,22 +976,34 @@ static int paste_pool_range(int mark)
 
     while (in < end) {
         if (in->kind == TOK_PASTE) {
-            if (out == start || in + 1 >= end) {
+            struct Token *left = out;
+            struct Token *right = in + 1;
+
+            while (left > start && (left - 1)->kind == TOK_NEWLINE) {
+                left--;
+            }
+            if (left == start) {
                 pool_used = mark;
                 return fail("bad paste");
             }
-            if ((out - 1)->kind == TOK_NEWLINE ||
-                (out - 1)->kind == TOK_PASTE ||
-                (in + 1)->kind == TOK_NEWLINE ||
-                (in + 1)->kind == TOK_PASTE) {
+            left--;
+            if (left->kind == TOK_PASTE) {
                 pool_used = mark;
                 return fail("bad paste");
             }
-            if (!append_pasted_token(out - 1, out - 1, in + 1)) {
+            while (right < end && right->kind == TOK_NEWLINE) {
+                right++;
+            }
+            if (right >= end || right->kind == TOK_PASTE) {
+                pool_used = mark;
+                return fail("bad paste");
+            }
+            if (!append_pasted_token(left, left, right)) {
                 pool_used = mark;
                 return 0;
             }
-            in += 2;
+            out = left + 1;
+            in = right + 1;
             continue;
         }
         if (out != in) {
@@ -1011,6 +1070,7 @@ static int push_local_label_token(const struct Token *tok, int expansion_id)
     text_buf[text_used++] = '\0';
 
     out.kind = TOK_WORD;
+    out.tight = 0;
     out.text.ptr = text_buf + start;
     out.text.len = total;
     return push_pool_token(out);
@@ -1025,7 +1085,8 @@ static int expand_macro_tokens(struct Token *call_tok, struct Token *limit,
     int mark;
     int expansion_id;
 
-    if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN) {
+    if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN &&
+        (call_tok + 1)->tight) {
         if (!parse_args(call_tok + 1, limit)) {
             return 0;
         }
@@ -1316,7 +1377,8 @@ static int eval_expr_atom(struct Token *tok, struct Token *limit,
 
     macro = find_macro(tok);
     if (macro != NULL &&
-        ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN) ||
+        ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN &&
+          (tok + 1)->tight) ||
          macro->param_count == 0)) {
         if (!expand_macro_tokens(tok, limit, macro, &after, &mark)) {
             return 0;
@@ -1372,7 +1434,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
             continue;
         }
 
-        skip_expr_newlines(&pos, span.end);
+        skip_newlines(&pos, span.end);
         if (pos >= span.end) {
             break;
         }
@@ -1381,7 +1443,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
             enum ExprOp op;
 
             pos++;
-            skip_expr_newlines(&pos, span.end);
+            skip_newlines(&pos, span.end);
             if (pos >= span.end) {
                 return fail("bad expression");
             }
@@ -1394,7 +1456,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
                 /* strlen is degenerate: argument is a TOK_STRING atom,
                  * not a recursive expression. Handle inline and yield
                  * the string's raw byte count (span.len - 2). */
-                skip_expr_newlines(&pos, span.end);
+                skip_newlines(&pos, span.end);
                 if (pos >= span.end || pos->kind != TOK_STRING) {
                     return fail("bad expression");
                 }
@@ -1403,7 +1465,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
                 }
                 value = (long long)(pos->text.len - 2);
                 pos++;
-                skip_expr_newlines(&pos, span.end);
+                skip_newlines(&pos, span.end);
                 if (pos >= span.end || pos->kind != TOK_RPAREN) {
                     return fail("bad expression");
                 }
@@ -1478,6 +1540,7 @@ static int emit_hex_value(unsigned long long value, int bytes)
         return 0;
     }
     tok.kind = TOK_STRING;
+    tok.tight = 0;
     tok.text.ptr = text_ptr;
     tok.text.len = total_len;
     return emit_token(&tok);
@@ -1579,6 +1642,7 @@ static int expand_builtin_call(struct Stream *s, const struct Token *tok)
         text_buf[text_used++] = '\0';
 
         out_tok.kind = TOK_STRING;
+        out_tok.tight = 0;
         out_tok.text.ptr = text_ptr;
         out_tok.text.len = out_len;
         s->pos = end_pos;
@@ -1604,7 +1668,14 @@ static int expand_call(struct Stream *s, const struct Macro *macro)
 
 static int push_scope(struct Stream *s)
 {
+    /* Header self-terminates at the scope name. Newlines after the name
+     * are insignificant — they're skipped here so a multi-line scope
+     * (`%scope NAME\nbody\n%endscope`) doesn't introduce an extra blank
+     * line in output. */
+    int started_at_line_start = s->line_start;
+
     s->pos++;
+    skip_newlines(&s->pos, s->end);
     if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
         return fail("bad scope header");
     }
@@ -1613,30 +1684,31 @@ static int push_scope(struct Stream *s)
     }
     scope_stack[scope_depth++] = s->pos->text;
     s->pos++;
-    if (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
-        return fail("bad scope header");
+    if (started_at_line_start) {
+        skip_newlines(&s->pos, s->end);
+        s->line_start = 1;
     }
-    if (s->pos < s->end) {
-        s->pos++;
-    }
-    s->line_start = 1;
     return 1;
 }
 
 static int pop_scope(struct Stream *s)
 {
+    /* %endscope must be immediately followed by TOK_NEWLINE. The newline
+     * is consumed iff %endscope itself appeared at line_start. */
+    int started_at_line_start = s->line_start;
+
     s->pos++;
     if (scope_depth <= 0) {
         return fail("scope underflow");
     }
     scope_depth--;
-    while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
-        s->pos++;
+    if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+        return fail("expected newline after %endscope");
     }
-    if (s->pos < s->end) {
+    if (started_at_line_start) {
         s->pos++;
+        s->line_start = 1;
     }
-    s->line_start = 1;
     return 1;
 }
 
@@ -1662,45 +1734,35 @@ static int process_tokens(void)
 
         tok = s->pos;
 
-        if (s->line_start &&
-            tok->kind == TOK_WORD &&
-            token_text_eq(tok, "%macro")) {
+        if (tok->kind == TOK_WORD && token_text_eq(tok, "%macro")) {
             if (!define_macro(s)) {
                 return 0;
             }
             continue;
         }
 
-        if (s->line_start &&
-            tok->kind == TOK_WORD &&
-            token_text_eq(tok, "%struct")) {
+        if (tok->kind == TOK_WORD && token_text_eq(tok, "%struct")) {
             if (!define_fielded(s, 8, "SIZE", 4)) {
                 return 0;
             }
             continue;
         }
 
-        if (s->line_start &&
-            tok->kind == TOK_WORD &&
-            token_text_eq(tok, "%enum")) {
+        if (tok->kind == TOK_WORD && token_text_eq(tok, "%enum")) {
             if (!define_fielded(s, 1, "COUNT", 5)) {
                 return 0;
             }
             continue;
         }
 
-        if (s->line_start &&
-            tok->kind == TOK_WORD &&
-            token_text_eq(tok, "%scope")) {
+        if (tok->kind == TOK_WORD && token_text_eq(tok, "%scope")) {
             if (!push_scope(s)) {
                 return 0;
             }
             continue;
         }
 
-        if (s->line_start &&
-            tok->kind == TOK_WORD &&
-            token_text_eq(tok, "%endscope")) {
+        if (tok->kind == TOK_WORD && token_text_eq(tok, "%endscope")) {
             if (!pop_scope(s)) {
                 return 0;
             }
@@ -1719,6 +1781,7 @@ static int process_tokens(void)
         if (tok->kind == TOK_WORD &&
             tok + 1 < s->end &&
             (tok + 1)->kind == TOK_LPAREN &&
+            (tok + 1)->tight &&
             (token_text_eq(tok, "!") ||
              token_text_eq(tok, "@") ||
              token_text_eq(tok, "%") ||
@@ -1733,7 +1796,8 @@ static int process_tokens(void)
 
         macro = find_macro(tok);
         if (macro != NULL &&
-            ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN) ||
+            ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN &&
+              (tok + 1)->tight) ||
              macro->param_count == 0)) {
             if (!expand_call(s, macro)) {
                 return 0;

	boot2 Playing with the bootstrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs \| README