commit 65d201320ea94388e45a60bb03831a769cba3382
parent e01f261694fd6df753bb3e6cf5b27490d44a6752
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 25 Apr 2026 13:02:11 -0700
Make M1pp directives whitespace-insensitive in M1pp.c
Paren-call (`%FOO(x)`, `!(expr)`, etc.) only fuses when `(` is tight
against the name — any whitespace, including a newline, prevents fusion
and falls through to literal pass-through (a zero-parameter macro still
expands by name alone). Implemented via a `tight` bit on each Token, set
by the lexer when no whitespace separator was seen.
Directive headers (%macro, %struct, %enum, %scope) accept newlines
between their tokens, so one-line definitions and multi-line param
lists both parse. Directives are recognized regardless of line position;
%macro/%endm/%endscope/struct-or-enum-} no longer require line_start.
Block terminators (%endm, %endscope, struct/enum }) must be immediately
followed by TOK_NEWLINE. The newline is consumed only when the directive
itself started at line_start — that way mid-line directives leave the
user's trailing newline in the stream for the main loop to emit.
`##` paste skips newlines on either side, so a body like `foo ##\n bar`
joins to `foobar`.
Diffstat:
| M | M1pp/M1pp.c | | | 214 | +++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------- |
1 file changed, 139 insertions(+), 75 deletions(-)
diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c
@@ -33,18 +33,24 @@
* The source token array is pushed as the initial stream. Each iteration
* pops a token from the top stream:
*
- * %macro NAME(p,...) / %endm at line-start
+ * %macro NAME(p,...) / %endm
* -> define_macro(): consume header + body tokens into macros[] and
- * macro_body_tokens[]; register name and param list.
+ * macro_body_tokens[]; register name and param list. Header is
+ * whitespace-insensitive (newlines inside (...) are skipped);
+ * %endm is recognized anywhere and must be followed by NEWLINE.
+ * A directive that started at line_start consumes its trailing
+ * newline; mid-line directives leave it for the main loop.
*
* !(e) / @(e) / %(e) / $(e) / %select(c,t,e)
* -> expand_builtin_call(): parse arg spans, eval S-expression(s) via
* eval_expr_range(), emit LE hex or push the chosen token span.
+ * Only fuses when ( is tight against the name (no whitespace).
*
* %NAME(...) matching a defined macro
* -> expand_call() -> expand_macro_tokens(): substitute arguments,
* apply ## paste via paste_pool_range(), write result into
* expand_pool[], then push that slice as a new stream (rescan).
+ * Tight ( required for paren-form; without it only 0-param macros expand.
*
* Anything else
* -> emit_token() / emit_newline() directly into output_buf.
@@ -120,6 +126,7 @@ struct TextSpan {
struct Token {
int kind;
+ int tight;
struct TextSpan text;
};
@@ -206,12 +213,13 @@ static char *append_text_len(const char *s, int len)
}
static int push_token(struct Token *buf, int *count, int max_count,
- int kind, struct TextSpan text)
+ int kind, int tight, struct TextSpan text)
{
if (*count >= max_count) {
return fail("token overflow");
}
buf[*count].kind = kind;
+ buf[*count].tight = tight;
buf[*count].text = text;
*count += 1;
return 1;
@@ -242,27 +250,36 @@ static int span_eq_token(struct TextSpan span, const struct Token *tok)
static int lex_source(const char *src)
{
+ /* Track whether whitespace (space, tab, comment, OR newline) precedes
+ * the next token. tight=1 means "no whitespace before me"; only
+ * LPAREN's tight bit is consulted, to decide whether %FOO(...) /
+ * !(...) etc. are paren-call forms. */
int i = 0;
+ int saw_separator = 1;
while (src[i] != '\0') {
int start;
int len;
+ int tight;
if (is_space_no_nl((unsigned char)src[i])) {
+ saw_separator = 1;
i++;
continue;
}
if (src[i] == '\n') {
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_NEWLINE, (struct TextSpan){src + i, 1})) {
+ TOK_NEWLINE, 0, (struct TextSpan){src + i, 1})) {
return 0;
}
+ saw_separator = 1;
i++;
continue;
}
if (src[i] == '"' || src[i] == '\'') {
int quote = src[i];
+ tight = !saw_separator;
start = i;
i++;
while (src[i] != '\0' && src[i] != quote) {
@@ -273,66 +290,81 @@ static int lex_source(const char *src)
}
len = i - start;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_STRING, (struct TextSpan){src + start, len})) {
+ TOK_STRING, tight, (struct TextSpan){src + start, len})) {
return 0;
}
+ saw_separator = 0;
continue;
}
if (src[i] == '#' && src[i + 1] == '#') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_PASTE, (struct TextSpan){src + i, 2})) {
+ TOK_PASTE, tight, (struct TextSpan){src + i, 2})) {
return 0;
}
i += 2;
+ saw_separator = 0;
continue;
}
if (src[i] == '#' || src[i] == ';') {
+ saw_separator = 1;
while (src[i] != '\0' && src[i] != '\n') {
i++;
}
continue;
}
if (src[i] == '(') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_LPAREN, (struct TextSpan){src + i, 1})) {
+ TOK_LPAREN, tight, (struct TextSpan){src + i, 1})) {
return 0;
}
i++;
+ saw_separator = 0;
continue;
}
if (src[i] == ')') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_RPAREN, (struct TextSpan){src + i, 1})) {
+ TOK_RPAREN, tight, (struct TextSpan){src + i, 1})) {
return 0;
}
i++;
+ saw_separator = 0;
continue;
}
if (src[i] == ',') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_COMMA, (struct TextSpan){src + i, 1})) {
+ TOK_COMMA, tight, (struct TextSpan){src + i, 1})) {
return 0;
}
i++;
+ saw_separator = 0;
continue;
}
if (src[i] == '{') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_LBRACE, (struct TextSpan){src + i, 1})) {
+ TOK_LBRACE, tight, (struct TextSpan){src + i, 1})) {
return 0;
}
i++;
+ saw_separator = 0;
continue;
}
if (src[i] == '}') {
+ tight = !saw_separator;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_RBRACE, (struct TextSpan){src + i, 1})) {
+ TOK_RBRACE, tight, (struct TextSpan){src + i, 1})) {
return 0;
}
i++;
+ saw_separator = 0;
continue;
}
+ tight = !saw_separator;
start = i;
while (src[i] != '\0' &&
!is_space_no_nl((unsigned char)src[i]) &&
@@ -349,9 +381,10 @@ static int lex_source(const char *src)
}
len = i - start;
if (!push_token(source_tokens, &source_count, MAX_TOKENS,
- TOK_WORD, (struct TextSpan){src + start, len})) {
+ TOK_WORD, tight, (struct TextSpan){src + start, len})) {
return 0;
}
+ saw_separator = 0;
}
return 1;
@@ -538,7 +571,7 @@ static int push_pool_stream_from_mark(int mark)
mark);
}
-static void skip_expr_newlines(struct Token **pos, struct Token *end)
+static void skip_newlines(struct Token **pos, struct Token *end)
{
while (*pos < end && (*pos)->kind == TOK_NEWLINE) {
*pos += 1;
@@ -620,6 +653,7 @@ static int define_fielded_macro(struct TextSpan base, const char *suffix,
}
m->param_count = 0;
body_tok.kind = TOK_WORD;
+ body_tok.tight = 0;
if (!emit_decimal_text(value, &body_tok.text)) {
return 0;
}
@@ -636,20 +670,22 @@ static int define_fielded(struct Stream *s, long long stride,
/* Parses `%struct NAME { f1 f2 ... }` or `%enum NAME { ... }` and
* synthesizes N+1 zero-parameter macros:
* NAME.field_k -> k * stride
- * NAME.<total> -> N * stride (SIZE for struct, COUNT for enum) */
+ * NAME.<total> -> N * stride (SIZE for struct, COUNT for enum)
+ * The closing } must be immediately followed by TOK_NEWLINE. The
+ * newline is consumed iff the directive started at line_start. */
struct TextSpan base;
long long index = 0;
+ int started_at_line_start = s->line_start;
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
return fail("bad directive");
}
base = s->pos->text;
s->pos++;
- while (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
- s->pos++;
- }
+ skip_newlines(&s->pos, s->end);
if (s->pos >= s->end || s->pos->kind != TOK_LBRACE) {
return fail("bad directive");
}
@@ -682,20 +718,28 @@ static int define_fielded(struct Stream *s, long long stride,
return 0;
}
- while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
- s->pos++;
+ if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+ return fail("expected newline after struct/enum");
}
- if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
+ if (started_at_line_start) {
s->pos++;
+ s->line_start = 1;
}
- s->line_start = 1;
return 1;
}
static int define_macro(struct Stream *s)
{
+ /* Header is whitespace-insensitive: newlines inside (...) and around
+ * the keywords are skipped. Body collection skips newlines that fall
+ * between `)` and the first body token (so `%macro N()\nbody\n%endm`
+ * has body=[WORD body, NEWLINE], same as the old required-newline form).
+ * %endm is recognized anywhere in the body; the next token must be
+ * TOK_NEWLINE. The newline is consumed only when the directive started
+ * at s->line_start — that way mid-line directives leave the user's
+ * trailing newline in the stream for the main loop to emit. */
struct Macro *m;
- int line_start;
+ int started_at_line_start = s->line_start;
if (macro_count >= MAX_MACROS) {
return fail("too many macros");
@@ -708,17 +752,20 @@ static int define_macro(struct Stream *s)
memset(m, 0, sizeof(*m));
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
return fail("bad macro header");
}
m->name = s->pos->text;
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos >= s->end || s->pos->kind != TOK_LPAREN) {
return fail("bad macro header");
}
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos < s->end && s->pos->kind != TOK_RPAREN) {
while (1) {
if (m->param_count >= MAX_PARAMS) {
@@ -730,8 +777,10 @@ static int define_macro(struct Stream *s)
m->params[m->param_count] = s->pos->text;
m->param_count++;
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos < s->end && s->pos->kind == TOK_COMMA) {
s->pos++;
+ skip_newlines(&s->pos, s->end);
continue;
}
break;
@@ -742,26 +791,20 @@ static int define_macro(struct Stream *s)
return fail("bad macro header");
}
s->pos++;
-
- if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
- return fail("bad macro header");
- }
- s->pos++;
+ skip_newlines(&s->pos, s->end);
m->body_start = macro_body_tokens + macro_body_used;
- line_start = 1;
while (s->pos < s->end) {
- if (line_start &&
- s->pos->kind == TOK_WORD &&
- token_text_eq(s->pos, "%endm")) {
- while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
- s->pos++;
+ if (s->pos->kind == TOK_WORD && token_text_eq(s->pos, "%endm")) {
+ s->pos++;
+ if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+ return fail("expected newline after %endm");
}
- if (s->pos < s->end && s->pos->kind == TOK_NEWLINE) {
+ if (started_at_line_start) {
s->pos++;
+ s->line_start = 1;
}
m->body_end = macro_body_tokens + macro_body_used;
- s->line_start = 1;
macro_count++;
return 1;
}
@@ -769,7 +812,6 @@ static int define_macro(struct Stream *s)
return fail("macro body overflow");
}
macro_body_tokens[macro_body_used++] = *s->pos;
- line_start = (s->pos->kind == TOK_NEWLINE);
s->pos++;
}
@@ -915,6 +957,7 @@ static int append_pasted_token(struct Token *dst,
return 0;
}
dst->kind = TOK_WORD;
+ dst->tight = 0;
dst->text.ptr = text_ptr;
dst->text.len = n;
return 1;
@@ -922,6 +965,10 @@ static int append_pasted_token(struct Token *dst,
static int paste_pool_range(int mark)
{
+ /* Skip newlines on both sides of TOK_PASTE: a body like `foo ##\n bar`
+ * pastes to `foobar`, discarding the intervening newline. The left
+ * operand is the rightmost non-newline already copied to `out`; the
+ * right operand is the next non-newline past PASTE in `in`. */
struct Token *start = expand_pool + mark;
struct Token *in = start;
struct Token *out = start;
@@ -929,22 +976,34 @@ static int paste_pool_range(int mark)
while (in < end) {
if (in->kind == TOK_PASTE) {
- if (out == start || in + 1 >= end) {
+ struct Token *left = out;
+ struct Token *right = in + 1;
+
+ while (left > start && (left - 1)->kind == TOK_NEWLINE) {
+ left--;
+ }
+ if (left == start) {
pool_used = mark;
return fail("bad paste");
}
- if ((out - 1)->kind == TOK_NEWLINE ||
- (out - 1)->kind == TOK_PASTE ||
- (in + 1)->kind == TOK_NEWLINE ||
- (in + 1)->kind == TOK_PASTE) {
+ left--;
+ if (left->kind == TOK_PASTE) {
pool_used = mark;
return fail("bad paste");
}
- if (!append_pasted_token(out - 1, out - 1, in + 1)) {
+ while (right < end && right->kind == TOK_NEWLINE) {
+ right++;
+ }
+ if (right >= end || right->kind == TOK_PASTE) {
+ pool_used = mark;
+ return fail("bad paste");
+ }
+ if (!append_pasted_token(left, left, right)) {
pool_used = mark;
return 0;
}
- in += 2;
+ out = left + 1;
+ in = right + 1;
continue;
}
if (out != in) {
@@ -1011,6 +1070,7 @@ static int push_local_label_token(const struct Token *tok, int expansion_id)
text_buf[text_used++] = '\0';
out.kind = TOK_WORD;
+ out.tight = 0;
out.text.ptr = text_buf + start;
out.text.len = total;
return push_pool_token(out);
@@ -1025,7 +1085,8 @@ static int expand_macro_tokens(struct Token *call_tok, struct Token *limit,
int mark;
int expansion_id;
- if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN) {
+ if (call_tok + 1 < limit && (call_tok + 1)->kind == TOK_LPAREN &&
+ (call_tok + 1)->tight) {
if (!parse_args(call_tok + 1, limit)) {
return 0;
}
@@ -1316,7 +1377,8 @@ static int eval_expr_atom(struct Token *tok, struct Token *limit,
macro = find_macro(tok);
if (macro != NULL &&
- ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN) ||
+ ((tok + 1 < limit && (tok + 1)->kind == TOK_LPAREN &&
+ (tok + 1)->tight) ||
macro->param_count == 0)) {
if (!expand_macro_tokens(tok, limit, macro, &after, &mark)) {
return 0;
@@ -1372,7 +1434,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
continue;
}
- skip_expr_newlines(&pos, span.end);
+ skip_newlines(&pos, span.end);
if (pos >= span.end) {
break;
}
@@ -1381,7 +1443,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
enum ExprOp op;
pos++;
- skip_expr_newlines(&pos, span.end);
+ skip_newlines(&pos, span.end);
if (pos >= span.end) {
return fail("bad expression");
}
@@ -1394,7 +1456,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
/* strlen is degenerate: argument is a TOK_STRING atom,
* not a recursive expression. Handle inline and yield
* the string's raw byte count (span.len - 2). */
- skip_expr_newlines(&pos, span.end);
+ skip_newlines(&pos, span.end);
if (pos >= span.end || pos->kind != TOK_STRING) {
return fail("bad expression");
}
@@ -1403,7 +1465,7 @@ static int eval_expr_range(struct TokenSpan span, long long *out)
}
value = (long long)(pos->text.len - 2);
pos++;
- skip_expr_newlines(&pos, span.end);
+ skip_newlines(&pos, span.end);
if (pos >= span.end || pos->kind != TOK_RPAREN) {
return fail("bad expression");
}
@@ -1478,6 +1540,7 @@ static int emit_hex_value(unsigned long long value, int bytes)
return 0;
}
tok.kind = TOK_STRING;
+ tok.tight = 0;
tok.text.ptr = text_ptr;
tok.text.len = total_len;
return emit_token(&tok);
@@ -1579,6 +1642,7 @@ static int expand_builtin_call(struct Stream *s, const struct Token *tok)
text_buf[text_used++] = '\0';
out_tok.kind = TOK_STRING;
+ out_tok.tight = 0;
out_tok.text.ptr = text_ptr;
out_tok.text.len = out_len;
s->pos = end_pos;
@@ -1604,7 +1668,14 @@ static int expand_call(struct Stream *s, const struct Macro *macro)
static int push_scope(struct Stream *s)
{
+ /* Header self-terminates at the scope name. When %scope started at
+ * line_start, the newlines after the name are skipped here so a
+ * multi-line scope (`%scope NAME\nbody\n%endscope`) doesn't add an extra
+ * blank line to the output; a mid-line %scope leaves them in the stream. */
+ int started_at_line_start = s->line_start;
+
s->pos++;
+ skip_newlines(&s->pos, s->end);
if (s->pos >= s->end || s->pos->kind != TOK_WORD) {
return fail("bad scope header");
}
@@ -1613,30 +1684,31 @@ static int push_scope(struct Stream *s)
}
scope_stack[scope_depth++] = s->pos->text;
s->pos++;
- if (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
- return fail("bad scope header");
+ if (started_at_line_start) {
+ skip_newlines(&s->pos, s->end);
+ s->line_start = 1;
}
- if (s->pos < s->end) {
- s->pos++;
- }
- s->line_start = 1;
return 1;
}
static int pop_scope(struct Stream *s)
{
+ /* %endscope must be immediately followed by TOK_NEWLINE. The newline
+ * is consumed iff %endscope itself appeared at line_start. */
+ int started_at_line_start = s->line_start;
+
s->pos++;
if (scope_depth <= 0) {
return fail("scope underflow");
}
scope_depth--;
- while (s->pos < s->end && s->pos->kind != TOK_NEWLINE) {
- s->pos++;
+ if (s->pos >= s->end || s->pos->kind != TOK_NEWLINE) {
+ return fail("expected newline after %endscope");
}
- if (s->pos < s->end) {
+ if (started_at_line_start) {
s->pos++;
+ s->line_start = 1;
}
- s->line_start = 1;
return 1;
}
@@ -1662,45 +1734,35 @@ static int process_tokens(void)
tok = s->pos;
- if (s->line_start &&
- tok->kind == TOK_WORD &&
- token_text_eq(tok, "%macro")) {
+ if (tok->kind == TOK_WORD && token_text_eq(tok, "%macro")) {
if (!define_macro(s)) {
return 0;
}
continue;
}
- if (s->line_start &&
- tok->kind == TOK_WORD &&
- token_text_eq(tok, "%struct")) {
+ if (tok->kind == TOK_WORD && token_text_eq(tok, "%struct")) {
if (!define_fielded(s, 8, "SIZE", 4)) {
return 0;
}
continue;
}
- if (s->line_start &&
- tok->kind == TOK_WORD &&
- token_text_eq(tok, "%enum")) {
+ if (tok->kind == TOK_WORD && token_text_eq(tok, "%enum")) {
if (!define_fielded(s, 1, "COUNT", 5)) {
return 0;
}
continue;
}
- if (s->line_start &&
- tok->kind == TOK_WORD &&
- token_text_eq(tok, "%scope")) {
+ if (tok->kind == TOK_WORD && token_text_eq(tok, "%scope")) {
if (!push_scope(s)) {
return 0;
}
continue;
}
- if (s->line_start &&
- tok->kind == TOK_WORD &&
- token_text_eq(tok, "%endscope")) {
+ if (tok->kind == TOK_WORD && token_text_eq(tok, "%endscope")) {
if (!pop_scope(s)) {
return 0;
}
@@ -1719,6 +1781,7 @@ static int process_tokens(void)
if (tok->kind == TOK_WORD &&
tok + 1 < s->end &&
(tok + 1)->kind == TOK_LPAREN &&
+ (tok + 1)->tight &&
(token_text_eq(tok, "!") ||
token_text_eq(tok, "@") ||
token_text_eq(tok, "%") ||
@@ -1733,7 +1796,8 @@ static int process_tokens(void)
macro = find_macro(tok);
if (macro != NULL &&
- ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN) ||
+ ((tok + 1 < s->end && (tok + 1)->kind == TOK_LPAREN &&
+ (tok + 1)->tight) ||
macro->param_count == 0)) {
if (!expand_call(s, macro)) {
return 0;