commit 19c3c3bd465f5e192537c247239a4d27497536e7
parent c0792835d2bf76ee366e38128102228745deceb0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 23 Apr 2026 14:00:31 -0700
m1pp: add structural section comments in m1pp.M1
Ten section dividers (constants, runtime shell, helpers, lexer,
output, main processor, %macro skip, error paths, rodata, BSS) each
with a short note on what the block does and how it composes with
its neighbors. The file header docstring now sketches the pipeline
from _start through lex_source, process_tokens, skip_macro_def.
No code changes; tests still pass.
Diffstat:
| M | m1pp/m1pp.M1 | | | 73 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- |
1 file changed, 69 insertions(+), 4 deletions(-)
diff --git a/m1pp/m1pp.M1 b/m1pp/m1pp.M1
@@ -5,10 +5,23 @@
## Phase 1: lexer + pass-through with structural %macro/%endm skip.
## Behavior mirrors m1pp/m1pp.c (the oracle) for definition-only inputs.
##
+## Pipeline:
+## _start argv/argc from kernel SP; openat+read into input_buf
+## -> call lex_source
+## -> call process_tokens
+## -> openat+write output_buf to argv[2]; exit
+## lex_source input_buf -> source_tokens[] (via append_text + push_source_token)
+## process_tokens source_tokens[] -> output_buf (via emit_token / emit_newline,
+## branching to skip_macro_def at
+## line-start %macro)
+## skip_macro_def structural consume through %endm; no tokens emitted
+##
## P1v2 ABI: a0..a3 arg/return, t0..t2 caller-saved temps, s0..s3 callee-saved
## (unused here). Non-leaf functions use enter_0 / leave. _start has no frame;
## the kernel-supplied SP carries argv/argc directly.
+## --- Constants & sizing ------------------------------------------------------
+
DEFINE M1M_INPUT_CAP 0020000000000000
DEFINE M1M_OUTPUT_CAP 0020000000000000
DEFINE M1M_TEXT_CAP 0010000000000000
@@ -26,6 +39,8 @@ DEFINE TOK_RPAREN 0400000000000000
DEFINE TOK_COMMA 0500000000000000
DEFINE TOK_PASTE 0600000000000000
+## --- Runtime shell: argv, read input, call pipeline, write output, exit ------
+
:_start
ld_a0,sp,0
li_a1 %3 %0
@@ -147,6 +162,11 @@ DEFINE TOK_PASTE 0600000000000000
li_a1 %0 %0
syscall
+## --- Helpers: text arena + token array + equality ----------------------------
+## append_text appends bytes to text_buf (used for synthesized token text,
+## e.g. single-char parens/commas and the paste `##`). Source-word and string
+## tokens point directly into input_buf and skip this arena.
+
## append_text(a0=src, a1=len) -> a0=text ptr. Leaf; clobbers a0..a3, t0..t2.
:append_text
la_a2 &text_used
@@ -227,6 +247,19 @@ DEFINE TOK_PASTE 0600000000000000
li_a0 %0 %0
ret
+## --- Lexer -------------------------------------------------------------------
+## Dispatches on the first byte at lex_ptr:
+## whitespace (sp/tab/cr/ff/vt) -> lex_skip_one
+## newline (\n) -> lex_newline -> TOK_NEWLINE
+## quote (" or ') -> lex_string -> TOK_STRING
+## `#` -> lex_hash -> TOK_PASTE on ##, else comment
+## `;` -> lex_comment (drop to end of line)
+## `(` `)` `,` -> lex_lparen / lex_rparen / lex_comma
+## otherwise -> lex_word -> TOK_WORD
+##
+## All branches loop back to lex_loop. lex_done exits once lex_ptr hits
+## the terminating NUL that _start writes right after the input in input_buf.
+
## lex_source(): fills source_tokens[] from input_buf.
:lex_source
enter_0
@@ -498,6 +531,13 @@ DEFINE TOK_PASTE 0600000000000000
leave
ret
+## --- Output: normalized token stream to output_buf ---------------------------
+## emit_newline writes '\n' and clears output_need_space.
+## emit_token prefixes a space when output_need_space is set, copies the
+## token text, then sets output_need_space. This is how source whitespace
+## gets normalized: one '\n' per TOK_NEWLINE, one ' ' between consecutive
+## non-newline tokens.
+
## emit_newline(). Leaf.
:emit_newline
la_a0 &output_used
@@ -563,6 +603,14 @@ DEFINE TOK_PASTE 0600000000000000
st_a1,a0,0
ret
+## --- Main processor ----------------------------------------------------------
+## Walks source_tokens[] in order. For each token:
+## - line-start %macro -> call skip_macro_def (consumes through %endm)
+## - TOK_NEWLINE -> emit_newline, set line_start
+## - anything else -> emit_token, clear line_start
+## Phase 4 and later will insert macro-call detection, builtin calls, and
+## %select between the %macro check and the newline check.
+
## process_tokens(): pass-through with structural %macro skipping.
:process_tokens
enter_0
@@ -638,6 +686,11 @@ DEFINE TOK_PASTE 0600000000000000
leave
ret
+## --- Structural %macro skip (placeholder for Phase 2's define_macro) ---------
+## Consumes the macro header, body, and %endm line without emitting anything
+## or recording the definition. Phase 2 replaces this with real storage of
+## the macro's name, params, and body tokens.
+
## skip_macro_def(): proc_pos at %macro. Leaves proc_pos after %endm line.
:skip_macro_def
enter_0
@@ -721,6 +774,11 @@ DEFINE TOK_PASTE 0600000000000000
la_br &skip_macro_loop
b
+## --- Error paths -------------------------------------------------------------
+## Each err_* loads a (msg, len) pair for fatal; fatal writes "m1m: <msg>\n"
+## to stderr and exits 1. Error labels are branched to from range/overflow
+## checks throughout the code.
+
:err_usage
la_a0 &msg_usage
li_a1 %29 %0
@@ -802,6 +860,8 @@ DEFINE TOK_PASTE 0600000000000000
li_a1 %1 %0
syscall
+## --- Rodata: const tokens (for tok_eq_const) and fatal messages --------------
+
:const_macro "%macro"
:const_endm "%endm"
:const_paste "##"
@@ -823,11 +883,16 @@ DEFINE TOK_PASTE 0600000000000000
:msg_output_overflow "output buffer overflow"
:msg_unterminated_macro "unterminated %macro definition"
-## BSS. Placed before :ELF_end so filesz/memsz (which this ELF header sets
-## equal) covers the whole zero-initialized region. Bloats the file by the
-## BSS size, but avoids a custom ELF header.
+## --- BSS ---------------------------------------------------------------------
+## Placed before :ELF_end so filesz/memsz (which this ELF header sets equal)
+## covers the whole zero-initialized region. Bloats the file by the BSS size,
+## but avoids a custom ELF header.
+##
+## Layout: scalars (pointers, counters, lexer/processor state), then the
+## four arenas — input_buf, output_buf, text_buf, source_tokens — whose
+## sizes match the CAP constants above.
-## Scalars first (each 8 bytes).
+## Scalars (each 8 bytes).
:input_fd
ZERO32
:input_len