commit ea3e075a6aab78970bf2b1726b72fb6688bf70b2 parent 27dba8a270502627d69babede9b2112837600836 Author: Ryan Sepassi <rsepassi@gmail.com> Date: Sun, 3 May 2026 16:27:46 -0700 M1pp: drop %bytes builtin; bare strings emit decoded bytes A TOK_STRING reaching the output stream now decodes its quoted body into raw hex bytes, one two-digit hex token per byte in source order, via the same escape table that used to live behind %bytes (\n \t \r \0 \\ \" \xNN). %bytes itself is removed: bare "..." (or '...') at statement position emits bytes directly, %str(IDENT) produces a STRING that flows through the same emission path, and (strlen "...") still reads the string atom inside expressions without decoding. Test suite: regenerate the five expected files whose strings used to pass through verbatim (01-passthrough, 09-args, 14-str-builtin, 14-str-paste, 29-string-escapes); rename 27-bytes.M1pp -> 27-string-emit.M1pp and 31-bytes-via-macro.M1pp -> 31-string-via-macro.M1pp with the %bytes(...) wrapper stripped; delete the underscore-prefixed negative fixtures: the _27-bytes-* cases, whose %bytes-validator errors no longer apply, along with _04-strlen-badarg, _12-braced-malformed, and _14-str-malformed. docs/M1PP.md: rewrite the STRING token, %str, and string-emission sections; drop %bytes from the feature bullet. Diffstat:
25 files changed, 208 insertions(+), 258 deletions(-)
diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c @@ -494,11 +494,17 @@ static int emit_newline(void) return 1; } +static int emit_string_as_bytes(const struct Token *tok); +static int emit_hex_value(unsigned long long value, int bytes); + static int emit_token(const struct Token *tok) { if (tok->kind == TOK_LBRACE || tok->kind == TOK_RBRACE) { return 1; } + if (tok->kind == TOK_STRING) { + return emit_string_as_bytes(tok); + } if (output_need_space) { if (output_used + 1 >= MAX_OUTPUT) { return fail("output overflow"); @@ -515,6 +521,67 @@ static int emit_token(const struct Token *tok) return 1; } +/* Decode a "..." or '...' literal and emit one TOK_WORD per byte + * (each token's text is the two hex digits for that byte). Recognised + * escapes inside the literal: \n \t \r \0 \\ \" \xNN. No NUL is + * appended; user code writes one explicitly if needed. */ +static int emit_string_as_bytes(const struct Token *tok) +{ + const char *src; + int src_len; + int src_i; + + if (tok->text.len < 2) { + return fail("bad string"); + } + src = tok->text.ptr + 1; + src_len = tok->text.len - 2; + src_i = 0; + while (src_i < src_len) { + unsigned int b; + char c = src[src_i++]; + if (c == '\\') { + char e; + if (src_i >= src_len) { + return fail("bad escape"); + } + e = src[src_i++]; + if (e == 'n') b = 0x0A; + else if (e == 't') b = 0x09; + else if (e == 'r') b = 0x0D; + else if (e == '0') b = 0x00; + else if (e == '\\') b = 0x5C; + else if (e == '"') b = 0x22; + else if (e == 'x') { + int hi, lo, hv, lv; + if (src_i + 2 > src_len) { + return fail("bad escape"); + } + hi = (unsigned char)src[src_i++]; + lo = (unsigned char)src[src_i++]; + hv = (hi >= '0' && hi <= '9') ? hi - '0' : + (hi >= 'a' && hi <= 'f') ? hi - 'a' + 10 : + (hi >= 'A' && hi <= 'F') ? hi - 'A' + 10 : -1; + lv = (lo >= '0' && lo <= '9') ? lo - '0' : + (lo >= 'a' && lo <= 'f') ? lo - 'a' + 10 : + (lo >= 'A' && lo <= 'F') ? 
lo - 'A' + 10 : -1; + if (hv < 0 || lv < 0) { + return fail("bad escape"); + } + b = (unsigned int)((hv << 4) | lv); + } else { + return fail("bad escape"); + } + } else { + b = (unsigned char)c; + } + if (!emit_hex_value((unsigned long long)b, 1)) { + return 0; + } + } + return 1; +} + static int push_stream_span(struct TokenSpan span, int pool_mark) { struct Stream *s; @@ -1713,83 +1780,6 @@ static int expand_builtin_call(struct Stream *s, const struct Token *tok) return push_pool_stream_from_mark(mark); } - if (token_text_eq(tok, "%bytes")) { - /* Emit the raw bytes of a "..." string as a sequence of one - * `TOK_WORD` per byte (each token's text is the two hex digits - * for that byte). Recognised escapes: \n \t \r \0 \\ \" and - * \xNN. No NUL is appended; the caller writes one explicitly - * if needed. hex2pp's parse_byte_stream coalesces hex digits - * across whitespace, so the emitted tokens reassemble into a - * contiguous byte sequence at link time. */ - struct Token *arg_tok; - struct Token *end_pos; - const char *src; - int src_len; - int src_i; - - if (arg_count != 1) { - return fail("bad builtin"); - } - if (arg_ends[0] - arg_starts[0] != 1) { - return fail("bad builtin"); - } - arg_tok = arg_starts[0]; - if (arg_tok->kind != TOK_STRING || - arg_tok->text.len < 2 || arg_tok->text.ptr[0] != '"') { - return fail("bad builtin"); - } - end_pos = call_end_pos; - s->pos = end_pos; - s->line_start = 0; - - src = arg_tok->text.ptr + 1; - src_len = arg_tok->text.len - 2; - src_i = 0; - while (src_i < src_len) { - unsigned int b; - char c = src[src_i++]; - if (c == '\\') { - char e; - if (src_i >= src_len) { - return fail("bad escape"); - } - e = src[src_i++]; - if (e == 'n') b = 0x0A; - else if (e == 't') b = 0x09; - else if (e == 'r') b = 0x0D; - else if (e == '0') b = 0x00; - else if (e == '\\') b = 0x5C; - else if (e == '"') b = 0x22; - else if (e == 'x') { - int hi, lo, hv, lv; - if (src_i + 2 > src_len) { - return fail("bad escape"); - } - hi = (unsigned 
char)src[src_i++]; - lo = (unsigned char)src[src_i++]; - hv = (hi >= '0' && hi <= '9') ? hi - '0' : - (hi >= 'a' && hi <= 'f') ? hi - 'a' + 10 : - (hi >= 'A' && hi <= 'F') ? hi - 'A' + 10 : -1; - lv = (lo >= '0' && lo <= '9') ? lo - '0' : - (lo >= 'a' && lo <= 'f') ? lo - 'a' + 10 : - (lo >= 'A' && lo <= 'F') ? lo - 'A' + 10 : -1; - if (hv < 0 || lv < 0) { - return fail("bad escape"); - } - b = (unsigned int)((hv << 4) | lv); - } else { - return fail("bad escape"); - } - } else { - b = (unsigned char)c; - } - if (!emit_hex_value((unsigned long long)b, 1)) { - return 0; - } - } - return 1; - } - if (token_text_eq(tok, "%str")) { struct Token *arg_tok; struct Token *end_pos; diff --git a/docs/M1PP.md b/docs/M1PP.md @@ -28,9 +28,11 @@ The implementation lives in `M1pp/M1pp.c`. It is one pass, allocation-free arithmetic, bitwise, shift, comparison, `strlen`) - Little-endian hex emission: `!` (1B), `@` (2B), `%` (4B), `$` (8B) — emits bare hex digits (e.g. `AABBCCDD`) consumable by `hex2++` -- Raw byte emission from string literals: `%bytes("...")` +- Raw byte emission from string literals: a bare `"..."` token at + statement position emits its decoded bytes - Conditional token selection: `%select(cond, then, else)` -- Stringification: `%str(IDENT)` → `"IDENT"` +- Stringification: `%str(IDENT)` produces a `STRING` token holding the + identifier text, which then emits as bytes - Line comments (`#`, `;`); whitespace-insensitive output normalization - Single-pass, allocation-free implementation with fixed static buffers; fail-fast on first error @@ -54,9 +56,8 @@ The lexer produces a flat token array. Token kinds: - `STRING` — `"..."` or `'...'` (quotes included in the token text). Inside a string, a backslash plus the next character is consumed as one unit, so `\"` and `\\` do not end the literal. The escape's - *meaning* is decoded later by whoever interprets the bytes (e.g. 
- `%bytes` decodes `\n`, `\xNN`, etc.); the lexer only uses the - backslash to find the right closing quote. + *meaning* is decoded at emission (see [String emission](#string-emission)); + the lexer only uses the backslash to find the right closing quote. - `NEWLINE` — a single `\n` - `LPAREN`, `RPAREN`, `COMMA`, `LBRACE`, `RBRACE` - `PASTE` — the `##` marker @@ -181,23 +182,31 @@ branches are raw token spans, not expressions. ### `%str(IDENT)` -Stringifies a single `WORD` token into a double-quoted string literal: -`%str(foo)` → `"foo"`. The argument must be exactly one word token. +Stringifies a single `WORD` token into a `STRING` token wrapping the +identifier text in double quotes. The argument must be exactly one word +token. The resulting `STRING` flows through emission like any bare +string literal: `%str(foo)` produces the same output bytes as `"foo"` +(`66 6F 6F`). Use it when the identifier is built up from macro +arguments or `##` paste and you want its text emitted as bytes. -### `%bytes(STRING)` +### String emission -Emits the raw bytes of a `"..."`-quoted string as a sequence of -two-hex-digit `WORD` tokens — one per byte. `hex2++` coalesces hex -digits across whitespace, so the result reassembles into a contiguous -byte sequence at link time. No NUL terminator is appended; write `00` -explicitly if you need one. Recognised escapes inside the string are: +A `"..."` token reaching the output stream is decoded into raw bytes, +one two-hex-digit `WORD` token per byte. `hex2++` coalesces hex digits +across whitespace, so the result reassembles into a contiguous byte +sequence at link time. No NUL terminator is appended; write `00` +explicitly (or use `\0`) if you need one. Recognised escapes inside the +string are: \n 0x0A \t 0x09 \r 0x0D \0 0x00 \\ 0x5C \" 0x22 \xNN byte NN (two hex digits) -Any other backslash escape is an error. The argument must be exactly one -`STRING` token quoted with `"`. Example: `%bytes("hi\n")` emits -`68 69 0A`. 
+Any other backslash escape is an error. Example: `:msg "hi\n"` emits +`68 69 0A` immediately after defining `:msg`. + +Strings inside expression arguments (e.g. `(strlen "literal")`) and +inside `%str(IDENT)` are not decoded — the string atom is read by the +expression evaluator instead. ### `%local(NAME)` diff --git a/tests/M1pp/01-passthrough.M1pp b/tests/M1pp/01-passthrough.M1pp @@ -1,6 +1,9 @@ ## Pass-through fixture: tokenizer + structural macro-keyword skip. -## No macro calls, no ## paste, no !@%$ or %select. The m1pp expander must -## match the C oracle byte-for-byte on this input. +## No macro calls, no ## paste, no !@%$ or %select. +## +## STRING tokens (`"..."` and `'...'`) decode to raw bytes on emission +## — the only "passthrough" exception, kept here so the byte form is +## visible in the expected output alongside the rest. hello world leading whitespace and runs of spaces diff --git a/tests/M1pp/01-passthrough.expected b/tests/M1pp/01-passthrough.expected @@ -1,6 +1,9 @@ ## Pass-through fixture: tokenizer + structural macro-keyword skip. -## No macro calls , no ## paste , no !@%$ or %select. The m1pp expander must -## match the C oracle byte-for-byte on this input. +## No macro calls , no ## paste , no !@%$ or %select. +## +## STRING tokens ( `"..."` and `'...'` ) decode to raw bytes on emission +## — the only 70 61 73 73 74 68 72 6F 75 67 68 exception , kept here so the byte form is +## visible in the expected output alongside the rest. 
hello world leading whitespace and runs of spaces @@ -8,8 +11,8 @@ mixed , punctuation ( goes ) through ## this comment goes away line with -"double-quoted string stays" -'single quoted too' +64 6F 75 62 6C 65 2D 71 75 6F 74 65 64 20 73 74 72 69 6E 67 20 73 74 61 79 73 +73 69 6E 67 6C 65 20 71 75 6F 74 65 64 20 74 6F 6F first top-level line after the macro diff --git a/tests/M1pp/09-args.expected b/tests/M1pp/09-args.expected @@ -14,7 +14,7 @@ ok [ ( 1 2 ( 3 4 ) 5 ) | other ] -[ "string with, comma" | x ] +[ 73 74 72 69 6E 67 20 77 69 74 68 2C 20 63 6F 6D 6D 61 | x ] < t1 t2 t3 t4 t5 t6 t7 t8 t9 tA tB tC tD tE tF tG > diff --git a/tests/M1pp/14-str-builtin.M1pp b/tests/M1pp/14-str-builtin.M1pp @@ -1,6 +1,9 @@ # %str stringification builtin. -# - %str(IDENT) wraps the identifier text in double quotes -# - result is a TOK_STRING, byte-identical to a hand-written literal +# - %str(IDENT) wraps the identifier text in double quotes, producing +# a TOK_STRING. +# - At emission, every TOK_STRING (whether hand-written or built by +# %str) decodes into raw bytes, so `%str(hello)` and `"hello"` +# yield identical output (`68 65 6C 6C 6F`). %macro quoteit(name) %str(name) diff --git a/tests/M1pp/14-str-builtin.expected b/tests/M1pp/14-str-builtin.expected @@ -3,15 +3,18 @@ -"hello" -"foo_bar" -"a" +68 65 6C 6C 6F +66 6F 6F 5F 62 61 72 -"hello" -"foo_bar" -"a" +61 + + + +68 65 6C 6C 6F +66 6F 6F 5F 62 61 72 +61 END diff --git a/tests/M1pp/14-str-paste.M1pp b/tests/M1pp/14-str-paste.M1pp @@ -1,7 +1,10 @@ # `##` paste + `%str` stringify composed on the same identifier. # - `##` joins word fragments: str_##n -> str_quote (TOK_WORD). -# - `%str(n)` wraps the same identifier in quotes (TOK_STRING). -# - Complementary operators: paste builds the label, %str builds the literal. +# - `%str(n)` wraps the same identifier in quotes (TOK_STRING), which +# is then emitted as raw bytes — so `:str_quote %str(quote)` +# produces `:str_quote 71 75 6F 74 65`. 
+# - Complementary operators: paste builds the label, %str builds the +# decoded byte sequence. %macro defsym(n) :str_ ## n %str(n) diff --git a/tests/M1pp/14-str-paste.expected b/tests/M1pp/14-str-paste.expected @@ -4,10 +4,13 @@ -:str_quote "quote" -:str_if "if" -:str_begin "begin" + +:str_quote 71 75 6F 74 65 + +:str_if 69 66 + +:str_begin 62 65 67 69 6E END diff --git a/tests/M1pp/27-bytes.M1pp b/tests/M1pp/27-bytes.M1pp @@ -1,17 +0,0 @@ -# %bytes(STRING): emit raw bytes of a "..." literal as contiguous hex. -# Recognised escapes: \n \t \r \0 \\ \" and \xNN. - -# Plain ASCII. -%bytes("hi") - -# Empty string emits nothing. -%bytes("") - -# Each supported single-char escape, plus \xNN. -%bytes("a\nb\tc\rd\0e\\f\"g\x7Fh") - -# Followed by trailing literal hex, to confirm hex2pp's byte-stream -# coalescing handles the boundary. -%bytes("X") 90 - -END diff --git a/tests/M1pp/27-bytes.expected b/tests/M1pp/27-bytes.expected @@ -1,17 +0,0 @@ - - - - -68 69 - - - - - -61 0A 62 09 63 0D 64 00 65 5C 66 22 67 7F 68 - - - -58 90 - -END diff --git a/tests/M1pp/27-string-emit.M1pp b/tests/M1pp/27-string-emit.M1pp @@ -0,0 +1,18 @@ +# Bare string-token emission: every TOK_STRING reaching the output +# decodes its quoted body to raw bytes, one two-hex-digit token per +# byte. Recognised escapes inside the literal: \n \t \r \0 \\ \" \xNN. + +# Plain ASCII. +"hi" + +# Empty string emits nothing. +"" + +# Each supported single-char escape, plus \xNN. +"a\nb\tc\rd\0e\\f\"g\x7Fh" + +# Followed by trailing literal hex, to confirm hex2pp's byte-stream +# coalescing handles the boundary. +"X" 90 + +END diff --git a/tests/M1pp/27-string-emit.expected b/tests/M1pp/27-string-emit.expected @@ -0,0 +1,18 @@ + + + + + +68 69 + + + + + +61 0A 62 09 63 0D 64 00 65 5C 66 22 67 7F 68 + + + +58 90 + +END diff --git a/tests/M1pp/29-string-escapes.M1pp b/tests/M1pp/29-string-escapes.M1pp @@ -1,11 +1,11 @@ -# Lexer-level string escape preservation outside %bytes. 
-# - In a plain STRING token the lexer treats `\X` as one unit when -# finding the closing quote, so `\"` and `\\` do NOT terminate the -# literal. The escape's *meaning* is left to the consumer; the -# emitter writes the bytes back verbatim. +# String escape decoding at emission. +# - In the lexer, `\X` inside a STRING is one unit (so `\"` and `\\` +# do NOT terminate the literal), but the byte value is left for the +# emitter to decode. +# - At output, `\\` becomes `5C`, `\"` becomes `22`, etc. — the same +# table used to live behind `%bytes(...)` and now applies to every +# bare string token. # - Both quote styles ("..." and '...') participate. -# - %str composition: the ## paste compactor sees the STRING as one -# token and passes it through alongside a paste-built WORD. "plain" "with \"quoted\" inside" diff --git a/tests/M1pp/29-string-escapes.expected b/tests/M1pp/29-string-escapes.expected @@ -7,16 +7,16 @@ -"plain" -"with \"quoted\" inside" -"trailing backslash pair: \\" -"mixed \\ and \" together" -'single \"quoted\" inside' -'single \\ pair' +70 6C 61 69 6E +77 69 74 68 20 22 71 75 6F 74 65 64 22 20 69 6E 73 69 64 65 +74 72 61 69 6C 69 6E 67 20 62 61 63 6B 73 6C 61 73 68 20 70 61 69 72 3A 20 5C +6D 69 78 65 64 20 5C 20 61 6E 64 20 22 20 74 6F 67 65 74 68 65 72 +73 69 6E 67 6C 65 20 22 71 75 6F 74 65 64 22 20 69 6E 73 69 64 65 +73 69 6E 67 6C 65 20 5C 20 70 61 69 72 -"arg with \"escaped\" quotes" +61 72 67 20 77 69 74 68 20 22 65 73 63 61 70 65 64 22 20 71 75 6F 74 65 73 -"arg with \\ backslash pair" +61 72 67 20 77 69 74 68 20 5C 20 62 61 63 6B 73 6C 61 73 68 20 70 61 69 72 END diff --git a/tests/M1pp/31-bytes-via-macro.M1pp b/tests/M1pp/31-bytes-via-macro.M1pp @@ -1,27 +0,0 @@ -# %bytes appearing inside a macro body, so the builtin runs against -# rescanned/expanded input rather than top-level source. The string -# argument may be a literal or a parameter-substituted STRING token. 
- -%macro EMIT_BYTES(s) -%bytes(s) -%endm - -%macro PREFIXED(s) -AA -%bytes(s) -BB -%endm - -# Literal string passed through a macro arg. -%EMIT_BYTES("hi") - -# Surrounded by literal hex inside the body, to confirm hex2pp's -# byte-stream coalescing works across the rescan boundary. -%PREFIXED("ok") - -# Each escape exercised again, but via macro substitution. -%EMIT_BYTES("a\nb") - -# Empty string substituted in. -%EMIT_BYTES("") -END diff --git a/tests/M1pp/31-string-via-macro.M1pp b/tests/M1pp/31-string-via-macro.M1pp @@ -0,0 +1,27 @@ +# String emission through macro substitution: STRING tokens passed as +# a macro argument decode to bytes when the body's `s` is rescanned at +# the call site. + +%macro EMIT(s) +s +%endm + +%macro PREFIXED(s) +AA +s +BB +%endm + +# Literal string passed through a macro arg. +%EMIT("hi") + +# Surrounded by literal hex inside the body, to confirm hex2pp's +# byte-stream coalescing works across the rescan boundary. +%PREFIXED("ok") + +# Each escape exercised again, but via macro substitution. +%EMIT("a\nb") + +# Empty string substituted in. +%EMIT("") +END diff --git a/tests/M1pp/31-bytes-via-macro.expected b/tests/M1pp/31-string-via-macro.expected diff --git a/tests/M1pp/_04-strlen-badarg.M1pp b/tests/M1pp/_04-strlen-badarg.M1pp @@ -1,7 +0,0 @@ -# Malformed: strlen requires a double-quoted TOK_STRING argument. -# Single-quoted '...' hex literals are meaningless for strlen and must -# be rejected. Expected behavior: non-zero exit from the expander. -# (Underscore-prefixed filename so test.sh skips this fixture.) - -%((strlen 'deadbeef')) -END diff --git a/tests/M1pp/_12-braced-malformed.M1pp b/tests/M1pp/_12-braced-malformed.M1pp @@ -1,17 +0,0 @@ -# Malformed: unmatched `{` inside a macro call. -# -# Expected behavior: the m1pp expander MUST exit non-zero. parse_args detects -# that the outer RPAREN closes the call while brace_depth is still > 0 and -# reports "unbalanced braces". 
-# -# No `.expected` file is needed — the leading underscore in the filename -# causes m1pp/test.sh to skip this fixture. Run by hand to observe the -# non-zero exit with "unbalanced braces". - -%macro F(a, b) -a b -%endm - -%F(first, { never_closed ) - -END diff --git a/tests/M1pp/_14-str-malformed.M1pp b/tests/M1pp/_14-str-malformed.M1pp @@ -1,8 +0,0 @@ -# %str malformed input. -# - Underscore-prefix => skipped by test.sh. -# - Expected outcome: m1pp exits non-zero. -# - %str takes exactly one single-token WORD argument. A multi-token -# argument (`a b`) must be rejected; so must an already-string arg -# (`"already_string"`). This fixture exercises the multi-token path. - -%str(a b) diff --git a/tests/M1pp/_27-bytes-bad-escape.M1pp b/tests/M1pp/_27-bytes-bad-escape.M1pp @@ -1,10 +0,0 @@ -# %bytes: unknown backslash escape must be rejected. -# -# Recognised escapes: \n \t \r \0 \\ \" and \xNN. Anything else -# (here `\q`) is an error. Expected outcome: m1pp exits non-zero -# with "bad escape". -# -# (Underscore-prefixed filename so test.sh skips this fixture.) - -%bytes("oops\q") -END diff --git a/tests/M1pp/_27-bytes-bad-hex.M1pp b/tests/M1pp/_27-bytes-bad-hex.M1pp @@ -1,9 +0,0 @@ -# %bytes: \xNN requires exactly two hex digits. A non-hex char in -# either position is an error. -# -# Expected outcome: m1pp exits non-zero with "bad escape". -# -# (Underscore-prefixed filename so test.sh skips this fixture.) - -%bytes("oops\xZZ") -END diff --git a/tests/M1pp/_27-bytes-not-string.M1pp b/tests/M1pp/_27-bytes-not-string.M1pp @@ -1,9 +0,0 @@ -# %bytes argument must be a single TOK_STRING token quoted with `"`. -# A WORD argument must be rejected. -# -# Expected outcome: m1pp exits non-zero with "bad builtin". -# -# (Underscore-prefixed filename so test.sh skips this fixture.) - -%bytes(notastring) -END diff --git a/tests/M1pp/_27-bytes-truncated-hex.M1pp b/tests/M1pp/_27-bytes-truncated-hex.M1pp @@ -1,9 +0,0 @@ -# %bytes: \xNN requires exactly two hex digits. 
A single trailing hex -# digit at the close-quote position must be rejected. -# -# Expected outcome: m1pp exits non-zero with "bad escape". -# -# (Underscore-prefixed filename so test.sh skips this fixture.) - -%bytes("oops\x7") -END