boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 65d201320ea94388e45a60bb03831a769cba3382
parent e01f261694fd6df753bb3e6cf5b27490d44a6752
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat, 25 Apr 2026 13:02:11 -0700

Make M1pp directives whitespace-insensitive in M1pp.c

Paren-call (`%FOO(x)`, `!(expr)`, etc.) only fuses when `(` is tight
against the name — any whitespace, including a newline, prevents fusion
and falls through to literal pass-through. Implemented via a `tight` bit
on each Token, set by the lexer when no whitespace separator was seen.

Directive headers (%macro, %struct, %enum, %scope) accept newlines
between their tokens, so one-line definitions and multi-line param
lists both parse. Directives are recognized regardless of line position;
%macro/%endm/%endscope/struct-or-enum-} no longer require line_start.

Block terminators (%endm, %endscope, struct/enum }) must be immediately
followed by TOK_NEWLINE. The newline is consumed only when the directive
itself started at line_start — that way mid-line directives leave the
user's trailing newline in the stream for the main loop to emit.

`##` paste skips newlines on either side, so a body like `foo ##\n bar`
joins to `foobar`.

Diffstat:
M M1pp/M1pp.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 139 insertions(+), 75 deletions(-)

diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c @@ -33,18 +33,24 @@ * The source token array is pushed as the initial stream. Each iteration * pops a token from the top stream: * - * %macro NAME(p,...) / %endm at line-start + * %macro NAME(p,...) / %endm * -> define_macro(): consume header + body tokens into macros[] and - * macro_body_tokens[]; register name and param list. + * macro_body_tokens[]; register name and param list. Header is + * whitespace-insensitive (newlines inside (...) are skipped); + * %endm is recognized anywhere and must be followed by NEWLINE. + * A directive that started at line_start consumes its trailing + * newline; mid-line directives leave it for the main loop. * * !(e) / @(e) / %(e) / $(e) / %select(c,t,e) * -> expand_builtin_call(): parse arg spans, eval S-expression(s) via * eval_expr_range(), emit LE hex or push the chosen token span. + * Only fuses when ( is tight against the name (no whitespace). * * %NAME(...) matching a defined macro * -> expand_call() -> expand_macro_tokens(): substitute arguments, * apply ## paste via paste_pool_range(), write result into * expand_pool[], then push that slice as a new stream (rescan). + * Tight ( required for paren-form; otherwise treated as 0-arg. * * Anything else * -> emit_token() / emit_newline() directly into output_buf. 
@@ -120,6 +126,7 @@ struct TextSpan { struct Token { int kind; + int tight; struct TextSpan text; }; @@ -206,12 +213,13 @@ static char *append_text_len(const char *s, int len) } static int push_token(struct Token *buf, int *count, int max_count, - int kind, struct TextSpan text) + int kind, int tight, struct TextSpan text) { if (*count >= max_count) { return fail("token overflow"); } buf[*count].kind = kind; + buf[*count].tight = tight; buf[*count].text = text; *count += 1; return 1; @@ -242,27 +250,36 @@ static int span_eq_token(struct TextSpan span, const struct Token *tok) static int lex_source(const char *src) { + /* Track whether whitespace (space, tab, comment, OR newline) precedes + * the next token. tight=1 means "no whitespace before me"; only + * LPAREN's tight bit is consulted, to decide whether %FOO(...) / + * !(...) etc. are paren-call forms. */ int i = 0; + int saw_separator = 1; while (src[i] != '\0') { int start; int len; + int tight; if (is_space_no_nl((unsigned char)src[i])) { + saw_separator = 1; i++; continue; } if (src[i] == '\n') { if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_NEWLINE, (struct TextSpan){src + i, 1})) { + TOK_NEWLINE, 0, (struct TextSpan){src + i, 1})) { return 0; } + saw_separator = 1; i++; continue; } if (src[i] == '"' || src[i] == '\'') { int quote = src[i]; + tight = !saw_separator; start = i; i++; while (src[i] != '\0' && src[i] != quote) { @@ -273,66 +290,81 @@ static int lex_source(const char *src) } len = i - start; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_STRING, (struct TextSpan){src + start, len})) { + TOK_STRING, tight, (struct TextSpan){src + start, len})) { return 0; } + saw_separator = 0; continue; } if (src[i] == '#' && src[i + 1] == '#') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_PASTE, (struct TextSpan){src + i, 2})) { + TOK_PASTE, tight, (struct TextSpan){src + i, 2})) { return 0; } i += 2; + saw_separator = 0; continue; 
} if (src[i] == '#' || src[i] == ';') { + saw_separator = 1; while (src[i] != '\0' && src[i] != '\n') { i++; } continue; } if (src[i] == '(') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_LPAREN, (struct TextSpan){src + i, 1})) { + TOK_LPAREN, tight, (struct TextSpan){src + i, 1})) { return 0; } i++; + saw_separator = 0; continue; } if (src[i] == ')') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_RPAREN, (struct TextSpan){src + i, 1})) { + TOK_RPAREN, tight, (struct TextSpan){src + i, 1})) { return 0; } i++; + saw_separator = 0; continue; } if (src[i] == ',') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_COMMA, (struct TextSpan){src + i, 1})) { + TOK_COMMA, tight, (struct TextSpan){src + i, 1})) { return 0; } i++; + saw_separator = 0; continue; } if (src[i] == '{') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_LBRACE, (struct TextSpan){src + i, 1})) { + TOK_LBRACE, tight, (struct TextSpan){src + i, 1})) { return 0; } i++; + saw_separator = 0; continue; } if (src[i] == '}') { + tight = !saw_separator; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_RBRACE, (struct TextSpan){src + i, 1})) { + TOK_RBRACE, tight, (struct TextSpan){src + i, 1})) { return 0; } i++; + saw_separator = 0; continue; } + tight = !saw_separator; start = i; while (src[i] != '\0' && !is_space_no_nl((unsigned char)src[i]) && @@ -349,9 +381,10 @@ static int lex_source(const char *src) } len = i - start; if (!push_token(source_tokens, &source_count, MAX_TOKENS, - TOK_WORD, (struct TextSpan){src + start, len})) { + TOK_WORD, tight, (struct TextSpan){src + start, len})) { return 0; } + saw_separator = 0; } return 1; @@ -538,7 +571,7 @@ static int push_pool_stream_from_mark(int mark) mark); } -static void skip_expr_newlines(struct Token **pos, struct Token *end) +static void skip_newlines(struct Token 
**pos, struct Token *end) { while (*pos < end && (*pos)->kind == TOK_NEWLINE) { *pos += 1; @@ -620,6 +653,7 @@ static int define_fielded_macro(struct TextSpan base, const char *suffix, } m->param_count = 0; body_tok.kind = TOK_WORD; + body_tok.tight = 0; if (!emit_decimal_text(value, &body_tok.text)) { return 0; } @@ -636,20 +670,22 @@ static int define_fielded(struct Stream *s, long long stride, /* Parses `%struct NAME { f1 f2 ... }` or `%enum NAME { ... }` and * synthesizes N+1 zero-parameter macros: * NAME.field_k -> k * stride - * NAME.<total> -> N * stride (SIZE for struct, COUNT for enum) */ + * NAME.<total> -> N * stride (SIZE for struct, COUNT for enum) + * The closing } must be immediately followed by TOK_NEWLINE. The + * newline is consumed iff the directive started at line_start. */ struct TextSpan base; long long index = 0; + int started_at_line_start = s->line_start; s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos >= s->end || s->pos->kind != TOK_WORD) { return fail("bad directive"); } base = s->pos->text; s->pos++; - while (s->pos < s->end && s->pos->kind == TOK_NEWLINE) { - s->pos++; - } + skip_newlines(&s->pos, s->end); if (s->pos >= s->end || s->pos->kind != TOK_LBRACE) { return fail("bad directive"); } @@ -682,20 +718,28 @@ static int define_fielded(struct Stream *s, long long stride, return 0; } - while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) { - s->pos++; + if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) { + return fail("expected newline after struct/enum"); } - if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) { + if (started_at_line_start) { s->pos++; + s->line_start = 1; } - s->line_start = 1; return 1; } static int define_macro(struct Stream *s) { + /* Header is whitespace-insensitive: newlines inside (...) and around + * the keywords are skipped. 
Body collection skips newlines that fall + * between `)` and the first body token (so `%macro N()\nbody\n%endm` + * has body=[WORD body, NEWLINE], same as the old required-newline form). + * %endm is recognized anywhere in the body; the next token must be + * TOK_NEWLINE. The newline is consumed only when the directive started + * at s->line_start — that way mid-line directives leave the user's + * trailing newline in the stream for the main loop to emit. */ struct Macro *m; - int line_start; + int started_at_line_start = s->line_start; if (macro_count >= MAX_MACROS) { return fail("too many macros"); @@ -708,17 +752,20 @@ static int define_macro(struct Stream *s) memset(m, 0, sizeof(*m)); s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos >= s->end || s->pos->kind != TOK_WORD) { return fail("bad macro header"); } m->name = s->pos->text; s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos >= s->end || s->pos->kind != TOK_LPAREN) { return fail("bad macro header"); } s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos < s->end && s->pos->kind != TOK_RPAREN) { while (1) { if (m->param_count >= MAX_PARAMS) { @@ -730,8 +777,10 @@ static int define_macro(struct Stream *s) m->params[m->param_count] = s->pos->text; m->param_count++; s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos < s->end && s->pos->kind == TOK_COMMA) { s->pos++; + skip_newlines(&s->pos, s->end); continue; } break; @@ -742,26 +791,20 @@ static int define_macro(struct Stream *s) return fail("bad macro header"); } s->pos++; - - if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) { - return fail("bad macro header"); - } - s->pos++; + skip_newlines(&s->pos, s->end); m->body_start = macro_body_tokens + macro_body_used; - line_start = 1; while (s->pos < s->end) { - if (line_start && - s->pos->kind == TOK_WORD && - token_text_eq(s->pos, "%endm")) { - while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) { - s->pos++; + if (s->pos->kind == TOK_WORD && token_text_eq(s->pos, "%endm")) { + s->pos++; 
+ if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) { + return fail("expected newline after %endm"); } - if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) { + if (started_at_line_start) { s->pos++; + s->line_start = 1; } m->body_end = macro_body_tokens + macro_body_used; - s->line_start = 1; macro_count++; return 1; } @@ -769,7 +812,6 @@ static int define_macro(struct Stream *s) return fail("macro body overflow"); } macro_body_tokens[macro_body_used++] = *s->pos; - line_start = (s->pos->kind == TOK_NEWLINE); s->pos++; } @@ -915,6 +957,7 @@ static int append_pasted_token(struct Token *dst, return 0; } dst->kind = TOK_WORD; + dst->tight = 0; dst->text.ptr = text_ptr; dst->text.len = n; return 1; @@ -922,6 +965,10 @@ static int append_pasted_token(struct Token *dst, static int paste_pool_range(int mark) { + /* Skip newlines on both sides of TOK_PASTE: a body like `foo ##\n bar` + * pastes to `foobar`, discarding the intervening newline. The left + * operand is the rightmost non-newline already copied to `out`; the + * right operand is the next non-newline past PASTE in `in`. 
*/ struct Token *start = expand_pool + mark; struct Token *in = start; struct Token *out = start; @@ -929,22 +976,34 @@ static int paste_pool_range(int mark) while (in < end) { if (in->kind == TOK_PASTE) { - if (out == start || in + 1 >= end) { + struct Token *left = out; + struct Token *right = in + 1; + + while (left > start && (left - 1)->kind == TOK_NEWLINE) { + left--; + } + if (left == start) { pool_used = mark; return fail("bad paste"); } - if ((out - 1)->kind == TOK_NEWLINE || - (out - 1)->kind == TOK_PASTE || - (in + 1)->kind == TOK_NEWLINE || - (in + 1)->kind == TOK_PASTE) { + left--; + if (left->kind == TOK_PASTE) { pool_used = mark; return fail("bad paste"); } - if (!append_pasted_token(out - 1, out - 1, in + 1)) { + while (right < end && right->kind == TOK_NEWLINE) { + right++; + } + if (right >= end || right->kind == TOK_PASTE) { + pool_used = mark; + return fail("bad paste"); + } + if (!append_pasted_token(left, left, right)) { pool_used = mark; return 0; } - in += 2; + out = left + 1; + in = right + 1; continue; } if (out != in) { @@ -1011,6 +1070,7 @@ static int push_local_label_token(const struct Token *tok, int expansion_id) text_buf[text_used++] = '\0'; out.kind = TOK_WORD; + out.tight = 0; out.text.ptr = text_buf + start; out.text.len = total; return push_pool_token(out); @@ -1025,7 +1085,8 @@ static int expand_macro_tokens(struct Token *call_tok, struct Token *limit, int mark; int expansion_id; - if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN) { + if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN && + (call_tok + 1)->tight) { if (!parse_args(call_tok + 1, limit)) { return 0; } @@ -1316,7 +1377,8 @@ static int eval_expr_atom(struct Token *tok, struct Token *limit, macro = find_macro(tok); if (macro != NULL && - ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN) || + ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN && + (tok + 1)->tight) || macro->param_count == 0)) { if (!expand_macro_tokens(tok, limit, macro, 
&after, &mark)) { return 0; @@ -1372,7 +1434,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out) continue; } - skip_expr_newlines(&pos, span.end); + skip_newlines(&pos, span.end); if (pos >= span.end) { break; } @@ -1381,7 +1443,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out) enum ExprOp op; pos++; - skip_expr_newlines(&pos, span.end); + skip_newlines(&pos, span.end); if (pos >= span.end) { return fail("bad expression"); } @@ -1394,7 +1456,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out) /* strlen is degenerate: argument is a TOK_STRING atom, * not a recursive expression. Handle inline and yield * the string's raw byte count (span.len - 2). */ - skip_expr_newlines(&pos, span.end); + skip_newlines(&pos, span.end); if (pos >= span.end || pos->kind != TOK_STRING) { return fail("bad expression"); } @@ -1403,7 +1465,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out) } value = (long long)(pos->text.len - 2); pos++; - skip_expr_newlines(&pos, span.end); + skip_newlines(&pos, span.end); if (pos >= span.end || pos->kind != TOK_RPAREN) { return fail("bad expression"); } @@ -1478,6 +1540,7 @@ static int emit_hex_value(unsigned long long value, int bytes) return 0; } tok.kind = TOK_STRING; + tok.tight = 0; tok.text.ptr = text_ptr; tok.text.len = total_len; return emit_token(&tok); @@ -1579,6 +1642,7 @@ static int expand_builtin_call(struct Stream *s, const struct Token *tok) text_buf[text_used++] = '\0'; out_tok.kind = TOK_STRING; + out_tok.tight = 0; out_tok.text.ptr = text_ptr; out_tok.text.len = out_len; s->pos = end_pos; @@ -1604,7 +1668,14 @@ static int expand_call(struct Stream *s, const struct Macro *macro) static int push_scope(struct Stream *s) { + /* Header self-terminates at the scope name. Newlines after the name + * are insignificant — they're skipped here so a multi-line scope + * (`%scope NAME\nbody\n%endscope`) doesn't introduce an extra blank + * line in output. 
*/ + int started_at_line_start = s->line_start; + s->pos++; + skip_newlines(&s->pos, s->end); if (s->pos >= s->end || s->pos->kind != TOK_WORD) { return fail("bad scope header"); } @@ -1613,30 +1684,31 @@ static int push_scope(struct Stream *s) } scope_stack[scope_depth++] = s->pos->text; s->pos++; - if (s->pos < s->end && s->pos->kind != TOK_NEWLINE) { - return fail("bad scope header"); + if (started_at_line_start) { + skip_newlines(&s->pos, s->end); + s->line_start = 1; } - if (s->pos < s->end) { - s->pos++; - } - s->line_start = 1; return 1; } static int pop_scope(struct Stream *s) { + /* %endscope must be immediately followed by TOK_NEWLINE. The newline + * is consumed iff %endscope itself appeared at line_start. */ + int started_at_line_start = s->line_start; + s->pos++; if (scope_depth <= 0) { return fail("scope underflow"); } scope_depth--; - while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) { - s->pos++; + if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) { + return fail("expected newline after %endscope"); } - if (s->pos < s->end) { + if (started_at_line_start) { s->pos++; + s->line_start = 1; } - s->line_start = 1; return 1; } @@ -1662,45 +1734,35 @@ static int process_tokens(void) tok = s->pos; - if (s->line_start && - tok->kind == TOK_WORD && - token_text_eq(tok, "%macro")) { + if (tok->kind == TOK_WORD && token_text_eq(tok, "%macro")) { if (!define_macro(s)) { return 0; } continue; } - if (s->line_start && - tok->kind == TOK_WORD && - token_text_eq(tok, "%struct")) { + if (tok->kind == TOK_WORD && token_text_eq(tok, "%struct")) { if (!define_fielded(s, 8, "SIZE", 4)) { return 0; } continue; } - if (s->line_start && - tok->kind == TOK_WORD && - token_text_eq(tok, "%enum")) { + if (tok->kind == TOK_WORD && token_text_eq(tok, "%enum")) { if (!define_fielded(s, 1, "COUNT", 5)) { return 0; } continue; } - if (s->line_start && - tok->kind == TOK_WORD && - token_text_eq(tok, "%scope")) { + if (tok->kind == TOK_WORD && token_text_eq(tok, "%scope")) { 
if (!push_scope(s)) { return 0; } continue; } - if (s->line_start && - tok->kind == TOK_WORD && - token_text_eq(tok, "%endscope")) { + if (tok->kind == TOK_WORD && token_text_eq(tok, "%endscope")) { if (!pop_scope(s)) { return 0; } @@ -1719,6 +1781,7 @@ static int process_tokens(void) if (tok->kind == TOK_WORD && tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN && + (tok + 1)->tight && (token_text_eq(tok, "!") || token_text_eq(tok, "@") || token_text_eq(tok, "%") || @@ -1733,7 +1796,8 @@ static int process_tokens(void) macro = find_macro(tok); if (macro != NULL && - ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN) || + ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN && + (tok + 1)->tight) || macro->param_count == 0)) { if (!expand_call(s, macro)) { return 0;