M1pp: bump caps, cleanup duplication - boot2

commit e04409821656586c20a1688e9a86ab8384cf2e2c
parent 200e8f4f18f3894e47e56db1a1582f5b4ea24850
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 28 Apr 2026 17:36:35 -0700

M1pp: bump caps, cleanup duplication

Diffstat:
M M1pp/M1pp.P1  | 315 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M M1pp/M1pp.c  | 33 ++++++++++++++++++++++++++++-----

2 files changed, 171 insertions(+), 177 deletions(-)
diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1
@@ -27,11 +27,25 @@
 
 ## --- Constants & sizing ------------------------------------------------------
 
-DEFINE M1PP_INPUT_CAP 0000040000000000
-DEFINE M1PP_OUTPUT_CAP 0000040000000000
-DEFINE M1PP_TEXT_CAP 0000100000000000
-## source_tokens cap: 2 MB / 32-byte tokens = 65536 tokens.
-DEFINE M1PP_TOKENS_END 0000200000000000
+## Caps bumped 2026-04 for tcc-boot2: cc.scm against the flattened
+## tcc.flat.c emits ~6.5 MB of macro-rich .P1pp; m1pp's old caps were
+## sized for the scheme1 build at ~256 KiB. New caps cover the larger
+## working set the cc-bootstrap path exercises:
+##
+##   16 MiB input        — combined prelude + tcc.flat.P1pp
+##  128 MiB output       — final .M1 (~8x source after expansion)
+##   64 MiB text-arena   — pasted names, hex literals from %(...) eval,
+##                         per-call @local label rewrites (cc.scm
+##                         triggers ~hundreds of K of each)
+##  256 MiB source-toks  — 8 M Token slots × 32 B
+DEFINE M1PP_INPUT_CAP 0000000100000000
+DEFINE M1PP_OUTPUT_CAP 0000000800000000
+DEFINE M1PP_TEXT_CAP 0000000400000000
+## source_tokens cap: 256 MiB / 32-byte tokens = 8 Mi (~8.4 M) tokens.
+## cc.scm emits one source token per ~3 input bytes (heavy macro use), so
+## a 6.5 MB tcc.flat.P1pp tokenises to ~2.5 M; the cap leaves comfortable
+## headroom for the larger TUs the harness will exercise next.
+DEFINE M1PP_TOKENS_END 0000001000000000
 ## Macro record is 296 bytes: name (16) + param_count (8) + params[16]*16 (256)
 ## + body_start (8) + body_end (8). MACROS_CAP fits 1024 records (303104 B).
 ## Body-token arena fits 65536 tokens (2 MB = 0x200000).
@@ -76,11 +90,23 @@ DEFINE M1PP_STREAM_POS_OFF 1000000000000000
 DEFINE M1PP_STREAM_LS_OFF 1800000000000000
 DEFINE M1PP_STREAM_MARK_OFF 2000000000000000
 
-## Stream stack cap: 16 streams × 40 = 640 bytes.
-DEFINE M1PP_STREAM_STACK_CAP 8002000000000000
-
-## Expansion pool fits 65536 Token slots × 32 bytes = 2 MB (0x200000).
-DEFINE M1PP_EXPAND_CAP 0000200000000000
+## Stream stack cap: 128 streams × 40 = 5120 bytes (was 16 × 40 = 640).
+## Each nested macro / expression eval pushes a stream; cc.scm's macro
+## chains (%ld → %p1_mem → %select → %aa64_mem → %aa64_mem_after_nonneg
+## → %aa64_ldst_uimm12 → %(EXPR), plus %fn / %ifelse_* / @local label
+## scopes around the call site) routinely exceed 16 deep on the
+## tcc-boot2 path. Note: the runtime overflow message ("token buffer
+## overflow") at line 2504 is misleadingly shared with EXPAND/TOKENS;
+## hitting the cap there means *stream* stack exhaustion.
+DEFINE M1PP_STREAM_STACK_CAP 0014000000000000
+
+## Expansion pool fits 524288 Token slots × 32 bytes = 16 MB (0x1000000).
+## cc.scm emits %fn(name, sz, { body }), and m1pp eagerly copies the
+## body argument into the pool when expanding %fn — so the entire body
+## of a long function lives in the pool until %fn pops. tcc.c's
+## next_nomacro1 (~5900 lines × ~13 tokens/line ≈ 77 K tokens, ~2.5 MiB)
+## plus nested expansions sit comfortably under 16 MiB.
+DEFINE M1PP_EXPAND_CAP 0000000100000000
 
 ## ExprFrame record: 144 bytes. Fields:
 ##   +0   op_code  u64
@@ -122,9 +148,20 @@ DEFINE EXPR_GE 1000000000000000
 DEFINE EXPR_STRLEN 1100000000000000
 DEFINE EXPR_INVALID 1200000000000000
 ## --- BSS layout (offsets from ELF_end) -------------------------------------
-## With 32-byte tokens we need ~2 MB per token region (source/body/pool) to
-## fit large expansions like the scheme1 build. Total BSS ~7.8 MB, under
-## the 8 MB segment memsz set by vendor/seed/<arch>/ELF.hex2.
+## Total BSS ~499 MB, under the 512 MB segment memsz set by
+## vendor/seed/<arch>/ELF.hex2 (if a future bump pushes past 512 MB, raise
+## p_memsz there to 1 GiB and bump here). NOTE(review): confirm that
+## OFF_local_lookup_scratch was also moved past expr_frames (0x1F296380).
+## Per-region BSS slot sizes (offset diffs):
+##   input_buf          32 MB    (2x INPUT_CAP, headroom for trailing NUL etc.)
+##   output_buf        128 MB    (1x OUTPUT_CAP)
+##   text_buf           64 MB    (1x TEXT_CAP)
+##   source_tokens     256 MB    (1x TOKENS_END)
+##   macros            592 KB    (2x MACROS_CAP, unchanged)
+##   macro_body_tokens   2 MB
+##   streams          5120 B    (128 streams × 40 B; was 16 × 40 = 640)
+##   expand_pool        16 MB    (1x EXPAND_CAP)
+##   expr_frames     2304 B
 DEFINE OFF_paste_scratch 0000000000000000
 DEFINE OFF_local_label_scratch 0001000000000000
 DEFINE OFF_scope_stack 8001000000000000
@@ -133,14 +170,14 @@ DEFINE OFF_ebc_str_scratch 8004000000000000
 DEFINE OFF_arg_starts 8005000000000000
 DEFINE OFF_arg_ends 0006000000000000
 DEFINE OFF_input_buf 8006000000000000
-DEFINE OFF_output_buf 8006080000000000
-DEFINE OFF_text_buf 80060c0000000000
-DEFINE OFF_source_tokens 80061c0000000000
-DEFINE OFF_macros 80063c0000000000
-DEFINE OFF_macro_body_tokens 8046450000000000
-DEFINE OFF_streams 8046650000000000
-DEFINE OFF_expand_pool 0049650000000000
-DEFINE OFF_expr_frames 0049850000000000
+DEFINE OFF_output_buf 8006000200000000
+DEFINE OFF_text_buf 8006000A00000000
+DEFINE OFF_source_tokens 8006000E00000000
+DEFINE OFF_macros 8006001E00000000
+DEFINE OFF_macro_body_tokens 8046091E00000000
+DEFINE OFF_streams 8046291E00000000
+DEFINE OFF_expand_pool 805A291E00000000
+DEFINE OFF_expr_frames 805A291F00000000
 ## local_lookup_scratch — 256-byte working buffer used by
 ## expand_local_into_pool to assemble "<frame>_FRAME.<field>" before
 ## the macro-table linear search. Placed past expr_frames (BSS end) so
@@ -1159,226 +1196,146 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     la_br &proc_check_struct
     beqz_a0
 
-    # %macro: shim into define_macro through the proc_pos globals.
-    # define_macro reads/writes proc_pos and walks against source_end,
-    # so it only behaves correctly when s is the source stream — which
-    # holds in practice (line_start in expansion streams is cleared
-    # before any %macro could matter). proc_line_start receives the stream's
-    # line_start at directive entry — define_macro consults it to decide
-    # whether to consume the trailing newline after %endm. After the call
-    # we copy proc_pos back into s->pos and set s->line_start = 1.
+    # %macro: shim into define_macro through proc_pos / proc_line_start.
+    # define_macro reads/writes proc_pos against source_end, so it only
+    # behaves correctly when s is the source stream — which holds in
+    # practice (line_start in expansion streams is cleared before any
+    # %macro could matter). proc_line_start lets define_macro decide
+    # whether to consume the trailing newline after %endm.
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     la_br &define_macro
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%struct" ----
 ## The %macro guard above already proved kind == TOK_WORD; if we reach here
-## via a %macro non-match, that gate still holds.
+## via a %macro non-match, that gate still holds. Each handler:
+##   1. tok_eq_const(tok, NAME, len)
+##   2. miss -> branch to next check
+##   3. hit -> proc_save_pos_and_ls(s, tok); shim; proc_restore_and_loop
 :proc_check_struct
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_struct
     li_a2 %7 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_enum
     beqz_a0
-
-    # %struct matched: shim into define_fielded(stride=8, total="SIZE", len=4)
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     li_a0 %8 %0
     la_a1 &const_size
     li_a2 %4 %0
     la_br &define_fielded
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%enum" ----
 :proc_check_enum
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_enum
     li_a2 %5 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_scope
     beqz_a0
-
-    # %enum matched: shim into define_fielded(stride=1, total="COUNT", len=5)
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     li_a0 %1 %0
     la_a1 &const_count
     li_a2 %5 %0
     la_br &define_fielded
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%scope" ----
 :proc_check_scope
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_scope
     li_a2 %6 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_endscope
     beqz_a0
-
-    # %scope matched: shim into push_scope(stream_end).
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     ld_a0,sp,0
     ld_a0,a0,8
     la_br &push_scope
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%endscope" ----
 :proc_check_endscope
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_endscope
     li_a2 %9 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_frame
     beqz_a0
-
-    # %endscope matched: shim into pop_scope(stream_end).
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     ld_a0,sp,0
     ld_a0,a0,8
     la_br &pop_scope
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%frame" ----
 :proc_check_frame
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_frame
     li_a2 %6 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_endframe
     beqz_a0
-
-    # %frame matched: shim into push_frame(stream_end).
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     ld_a0,sp,0
     ld_a0,a0,8
     la_br &push_frame
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 ## ---- tok eq "%endframe" ----
 :proc_check_endframe
-    ld_t0,sp,8
-    mov_a0,t0
+    ld_a0,sp,8
     la_a1 &const_endframe
     li_a2 %9 %0
     la_br &tok_eq_const
     call
     la_br &proc_check_newline
     beqz_a0
-
-    # %endframe matched: shim into pop_frame(stream_end).
     ld_a0,sp,0
-    ld_t0,sp,8
-    la_a1 &proc_pos
-    st_t0,a1,0
-    ld_a2,a0,24
-    la_a1 &proc_line_start
-    st_a2,a1,0
+    ld_a1,sp,8
+    la_br &proc_save_pos_and_ls
+    call
     ld_a0,sp,0
     ld_a0,a0,8
     la_br &pop_frame
     call
-    ld_a0,sp,0
-    la_a1 &proc_pos
-    ld_t0,a1,0
-    st_t0,a0,16
-    li_t1 %1 %0
-    st_t1,a0,24
-    la_br &proc_loop
+    la_br &proc_restore_and_loop
     b
 
 :proc_check_newline
@@ -1567,6 +1524,30 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     bnez_t0
     eret
 
+## proc_save_pos_and_ls(a0=s, a1=tok): store tok into proc_pos and
+## s->line_start (s+24) into proc_line_start so the directive handler can
+## drive against the source stream. Leaf: writes t0/t1 only; a0/a1 kept.
+:proc_save_pos_and_ls
+    la_t0 &proc_pos
+    st_a1,t0,0
+    ld_t1,a0,24
+    la_t0 &proc_line_start
+    st_t1,t0,0
+    ret
+
+## proc_restore_and_loop: reached only via `b` (sp must be intact). Reads
+## sp,0 = s; copies proc_pos into s->pos, sets s->line_start=1, jumps to
+## proc_loop. Tail of every directive shim above.
+:proc_restore_and_loop
+    ld_a0,sp,0
+    la_a1 &proc_pos
+    ld_t0,a1,0
+    st_t0,a0,16
+    li_t1 %1 %0
+    st_t1,a0,24
+    la_br &proc_loop
+    b
+
 ## --- %scope / %endscope handlers --------------------------------------------
 ## Called at proc_pos == the `%scope` / `%endscope` word on a line-start.
 ## Input: a0 = stream end (pointer one past last token in the current stream).
@@ -4794,13 +4775,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %0 %0
-    la_br &aeo_cmp_store_zero
+    la_br &aeo_cmp_finish
     bne_a2,a3
     li_t0 %1 %0
-:aeo_cmp_store_zero
-    la_a0 &aeo_acc
-    st_t0,a0,0
-    la_br &aeo_finish
+    la_br &aeo_cmp_finish
     b
 
 :aeo_do_ne
@@ -4811,13 +4789,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %0 %0
-    la_br &aeo_cmp_store_zero1
+    la_br &aeo_cmp_finish
     beq_a2,a3
     li_t0 %1 %0
-:aeo_cmp_store_zero1
-    la_a0 &aeo_acc
-    st_t0,a0,0
-    la_br &aeo_finish
+    la_br &aeo_cmp_finish
     b
 
 :aeo_do_lt
@@ -4828,13 +4803,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %1 %0
-    la_br &aeo_cmp_store_one
+    la_br &aeo_cmp_finish
     blt_a2,a3
     li_t0 %0 %0
-:aeo_cmp_store_one
-    la_a0 &aeo_acc
-    st_t0,a0,0
-    la_br &aeo_finish
+    la_br &aeo_cmp_finish
     b
 
 :aeo_do_le
@@ -4846,13 +4818,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %0 %0
-    la_br &aeo_cmp_store_two
+    la_br &aeo_cmp_finish
     blt_a3,a2
     li_t0 %1 %0
-:aeo_cmp_store_two
-    la_a0 &aeo_acc
-    st_t0,a0,0
-    la_br &aeo_finish
+    la_br &aeo_cmp_finish
     b
 
 :aeo_do_gt
@@ -4864,13 +4833,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %1 %0
-    la_br &aeo_cmp_store_three
+    la_br &aeo_cmp_finish
     blt_a3,a2
     li_t0 %0 %0
-:aeo_cmp_store_three
-    la_a0 &aeo_acc
-    st_t0,a0,0
-    la_br &aeo_finish
+    la_br &aeo_cmp_finish
     b
 
 :aeo_do_ge
@@ -4882,10 +4848,15 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     ld_a2,a1,0
     ld_a3,a1,8
     li_t0 %0 %0
-    la_br &aeo_cmp_store_four
+    la_br &aeo_cmp_finish
     blt_a2,a3
     li_t0 %1 %0
-:aeo_cmp_store_four
+    la_br &aeo_cmp_finish
+    b
+
+## Shared tail for the comparison ops: store t0 (the 0/1 result) into
+## aeo_acc, then jump to aeo_finish. Reached only via `b`.
+:aeo_cmp_finish
     la_a0 &aeo_acc
     st_t0,a0,0
     la_br &aeo_finish
diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c
@@ -81,14 +81,37 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define MAX_INPUT             262144
-#define MAX_OUTPUT            524288
-#define MAX_TEXT              1048576
-#define MAX_TOKENS            65536
+/* Caps chosen to mirror the M1pp.P1 BSS layout, sized so the cc.scm
+ * emission of tcc.flat.c (~6.5 MB of macro-rich .P1pp) lexes cleanly.
+ * The native binary is host-side, so static globals at these sizes
+ * just live in .bss / anonymous mmap without any of the ELF-segment
+ * sizing dance the bootstrap m1pp has to do. */
+#define MAX_INPUT             16777216    /* 16 MiB */
+#define MAX_OUTPUT            134217728   /* 128 MiB */
+#define MAX_TEXT              67108864    /* 64 MiB:
+                                           * paste tokens, hex literals from
+                                           * %(EXPR) evaluation, and per-call
+                                           * @local label rewrites all live
+                                           * here for the run's lifetime. cc.scm
+                                           * triggers hundreds of thousands of
+                                           * each across the tcc.c expansion. */
+#define MAX_TOKENS            8388608     /*  8 M slots × 32 B = 256 MiB */
 #define MAX_MACROS            1024
 #define MAX_PARAMS            16
 #define MAX_MACRO_BODY_TOKENS MAX_TOKENS
-#define MAX_EXPAND            65536
+#define MAX_EXPAND            524288      /* 512 K × 32 B = 16 MiB:
+                                            * cc.scm wraps each C function in
+                                            * %fn(... { body }), and m1pp's
+                                            * expand_macro_tokens copies the
+                                            * argument tokens into the pool —
+                                            * so the entire body of a long
+                                            * function is resident in the pool
+                                            * while its outer %fn is active.
+                                            * tcc.c's next_nomacro1 (~5900
+                                            * lines × ~13 m1pp tokens/line ≈
+                                            * 77 K tokens, ~2.5 MiB) plus
+                                            * inner expansions sit comfortably
+                                            * under 16 MiB. */
 #define MAX_STACK             64
 #define MAX_EXPR_FRAMES       256
 #define MAX_SCOPE_DEPTH       32

	boot2 Playing with the bootstrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs \| README

M	M1pp/M1pp.P1	\|	315	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	M1pp/M1pp.c	\|	33	++++++++++++++++++++++++++++-----