commit e04409821656586c20a1688e9a86ab8384cf2e2c
parent 200e8f4f18f3894e47e56db1a1582f5b4ea24850
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 28 Apr 2026 17:36:35 -0700
M1pp: bump caps, cleanup duplication
Diffstat:
| M | M1pp/M1pp.P1 | | | 315 | ++++++++++++++++++++++++++++++++++++------------------------------------------- |
| M | M1pp/M1pp.c | | | 33 | ++++++++++++++++++++++++++++----- |
2 files changed, 171 insertions(+), 177 deletions(-)
diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1
@@ -27,11 +27,25 @@
## --- Constants & sizing ------------------------------------------------------
-DEFINE M1PP_INPUT_CAP 0000040000000000
-DEFINE M1PP_OUTPUT_CAP 0000040000000000
-DEFINE M1PP_TEXT_CAP 0000100000000000
-## source_tokens cap: 2 MB / 32-byte tokens = 65536 tokens.
-DEFINE M1PP_TOKENS_END 0000200000000000
+## Caps bumped 2026-04 for tcc-boot2: cc.scm against the flattened
+## tcc.flat.c emits ~6.5 MB of macro-rich .P1pp; m1pp's old caps were
+## sized for the scheme1 build at ~256 KiB. New caps cover the larger
+## working set the cc-bootstrap path exercises:
+##
+## 16 MiB input — combined prelude + tcc.flat.P1pp
+## 128 MiB output — final .M1 (~8x source after expansion)
+## 64 MiB text-arena — pasted names, hex literals from %(...) eval,
+## per-call @local label rewrites (cc.scm
+## triggers ~hundreds of K of each)
+## 256 MiB source-toks — 8 M Token slots × 32 B
+DEFINE M1PP_INPUT_CAP 0000000100000000
+DEFINE M1PP_OUTPUT_CAP 0000000800000000
+DEFINE M1PP_TEXT_CAP 0000000400000000
+## source_tokens cap: 256 MB / 32-byte tokens = ~8.4 M tokens. cc.scm
+## emits one source token per ~3 input bytes (heavy macro use), so a
+## 6.5 MB tcc.flat.P1pp tokenises to ~2.5 M; 8 M leaves comfortable
+## headroom for the larger TUs the harness will exercise next.
+DEFINE M1PP_TOKENS_END 0000001000000000
## Macro record is 296 bytes: name (16) + param_count (8) + params[16]*16 (256)
## + body_start (8) + body_end (8). MACROS_CAP fits 1024 records (303104 B).
## Body-token arena fits 65536 tokens (2 MB = 0x200000).
@@ -76,11 +90,23 @@ DEFINE M1PP_STREAM_POS_OFF 1000000000000000
DEFINE M1PP_STREAM_LS_OFF 1800000000000000
DEFINE M1PP_STREAM_MARK_OFF 2000000000000000
-## Stream stack cap: 16 streams × 40 = 640 bytes.
-DEFINE M1PP_STREAM_STACK_CAP 8002000000000000
-
-## Expansion pool fits 65536 Token slots × 32 bytes = 2 MB (0x200000).
-DEFINE M1PP_EXPAND_CAP 0000200000000000
+## Stream stack cap: 128 streams × 40 = 5120 bytes (was 16 × 40 = 640).
+## Each nested macro / expression eval pushes a stream; cc.scm's macro
+## chains (%ld → %p1_mem → %select → %aa64_mem → %aa64_mem_after_nonneg
+## → %aa64_ldst_uimm12 → %(EXPR), plus %fn / %ifelse_* / @local label
+## scopes around the call site) routinely exceed 16 deep on the
+## tcc-boot2 path. Note: the runtime overflow message ("token buffer
+## overflow") at line 2504 is misleadingly shared with EXPAND/TOKENS;
+## hitting the cap there means *stream* stack exhaustion.
+DEFINE M1PP_STREAM_STACK_CAP 0014000000000000
+
+## Expansion pool fits 524288 Token slots × 32 bytes = 16 MB (0x1000000).
+## cc.scm emits %fn(name, sz, { body }), and m1pp eagerly copies the
+## body argument into the pool when expanding %fn — so the entire body
+## of a long function lives in the pool until %fn pops. tcc.c's
+## next_nomacro1 (~5900 lines × ~13 tokens/line ≈ 77 K tokens, ~2.5 MiB)
+## plus nested expansions sit comfortably under 16 MiB.
+DEFINE M1PP_EXPAND_CAP 0000000100000000
## ExprFrame record: 144 bytes. Fields:
## +0 op_code u64
@@ -122,9 +148,20 @@ DEFINE EXPR_GE 1000000000000000
DEFINE EXPR_STRLEN 1100000000000000
DEFINE EXPR_INVALID 1200000000000000
## --- BSS layout (offsets from ELF_end) -------------------------------------
-## With 32-byte tokens we need ~2 MB per token region (source/body/pool) to
-## fit large expansions like the scheme1 build. Total BSS ~7.8 MB, under
-## the 8 MB segment memsz set by vendor/seed/<arch>/ELF.hex2.
+## Total BSS ~499 MB, under the 512 MB segment memsz set by
+## vendor/seed/<arch>/ELF.hex2. (If a future bump pushes past 512 MB,
+## raise p_memsz in vendor/seed/<arch>/ELF.hex2 to 1 GiB and bump here.)
+##
+## Per-region BSS slot sizes (offset diffs):
+## input_buf 32 MB (2x INPUT_CAP, headroom for trailing NUL etc.)
+## output_buf 128 MB (1x OUTPUT_CAP)
+## text_buf 64 MB (1x TEXT_CAP)
+## source_tokens 256 MB (1x TOKENS_END)
+## macros 592 KB (2x MACROS_CAP, unchanged)
+## macro_body_tokens 2 MB
+## streams 5120 B (128 streams × 40 B; was 16 × 40 = 640)
+## expand_pool 16 MB (1x EXPAND_CAP)
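+## expr_frames 2304 B (ends at 0x1F296380). NOTE(review): OFF_local_lookup_scratch must sit at or past this; its old value 0x855200 now lands inside input_buf — confirm it was re-based.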
DEFINE OFF_paste_scratch 0000000000000000
DEFINE OFF_local_label_scratch 0001000000000000
DEFINE OFF_scope_stack 8001000000000000
@@ -133,14 +170,14 @@ DEFINE OFF_ebc_str_scratch 8004000000000000
DEFINE OFF_arg_starts 8005000000000000
DEFINE OFF_arg_ends 0006000000000000
DEFINE OFF_input_buf 8006000000000000
-DEFINE OFF_output_buf 8006080000000000
-DEFINE OFF_text_buf 80060c0000000000
-DEFINE OFF_source_tokens 80061c0000000000
-DEFINE OFF_macros 80063c0000000000
-DEFINE OFF_macro_body_tokens 8046450000000000
-DEFINE OFF_streams 8046650000000000
-DEFINE OFF_expand_pool 0049650000000000
-DEFINE OFF_expr_frames 0049850000000000
+DEFINE OFF_output_buf 8006000200000000
+DEFINE OFF_text_buf 8006000A00000000
+DEFINE OFF_source_tokens 8006000E00000000
+DEFINE OFF_macros 8006001E00000000
+DEFINE OFF_macro_body_tokens 8046091E00000000
+DEFINE OFF_streams 8046291E00000000
+DEFINE OFF_expand_pool 805A291E00000000
+DEFINE OFF_expr_frames 805A291F00000000
## local_lookup_scratch — 256-byte working buffer used by
## expand_local_into_pool to assemble "<frame>_FRAME.<field>" before
## the macro-table linear search. Placed past expr_frames (BSS end) so
@@ -1159,226 +1196,146 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
la_br &proc_check_struct
beqz_a0
- # %macro: shim into define_macro through the proc_pos globals.
- # define_macro reads/writes proc_pos and walks against source_end,
- # so it only behaves correctly when s is the source stream — which
- # holds in practice (line_start in expansion streams is cleared
- # before any %macro could matter). proc_line_start receives the stream's
- # line_start at directive entry — define_macro consults it to decide
- # whether to consume the trailing newline after %endm. After the call
- # we copy proc_pos back into s->pos and set s->line_start = 1.
+ # %macro: shim into define_macro through proc_pos / proc_line_start.
+ # define_macro reads/writes proc_pos against source_end, so it only
+ # behaves correctly when s is the source stream — which holds in
+ # practice (line_start in expansion streams is cleared before any
+ # %macro could matter). proc_line_start lets define_macro decide
+ # whether to consume the trailing newline after %endm.
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
la_br &define_macro
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%struct" ----
## The %macro guard above already proved kind == TOK_WORD; if we reach here
-## via a %macro non-match, that gate still holds.
+## via a %macro non-match, that gate still holds. Each handler:
+## 1. tok_eq_const(tok, NAME, len)
+## 2. miss -> branch to next check
+## 3. hit -> proc_save_pos_and_ls(s, tok); shim; proc_restore_and_loop
:proc_check_struct
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_struct
li_a2 %7 %0
la_br &tok_eq_const
call
la_br &proc_check_enum
beqz_a0
-
- # %struct matched: shim into define_fielded(stride=8, total="SIZE", len=4)
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
li_a0 %8 %0
la_a1 &const_size
li_a2 %4 %0
la_br &define_fielded
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%enum" ----
:proc_check_enum
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_enum
li_a2 %5 %0
la_br &tok_eq_const
call
la_br &proc_check_scope
beqz_a0
-
- # %enum matched: shim into define_fielded(stride=1, total="COUNT", len=5)
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
li_a0 %1 %0
la_a1 &const_count
li_a2 %5 %0
la_br &define_fielded
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%scope" ----
:proc_check_scope
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_scope
li_a2 %6 %0
la_br &tok_eq_const
call
la_br &proc_check_endscope
beqz_a0
-
- # %scope matched: shim into push_scope(stream_end).
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
ld_a0,sp,0
ld_a0,a0,8
la_br &push_scope
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%endscope" ----
:proc_check_endscope
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_endscope
li_a2 %9 %0
la_br &tok_eq_const
call
la_br &proc_check_frame
beqz_a0
-
- # %endscope matched: shim into pop_scope(stream_end).
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
ld_a0,sp,0
ld_a0,a0,8
la_br &pop_scope
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%frame" ----
:proc_check_frame
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_frame
li_a2 %6 %0
la_br &tok_eq_const
call
la_br &proc_check_endframe
beqz_a0
-
- # %frame matched: shim into push_frame(stream_end).
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
ld_a0,sp,0
ld_a0,a0,8
la_br &push_frame
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
## ---- tok eq "%endframe" ----
:proc_check_endframe
- ld_t0,sp,8
- mov_a0,t0
+ ld_a0,sp,8
la_a1 &const_endframe
li_a2 %9 %0
la_br &tok_eq_const
call
la_br &proc_check_newline
beqz_a0
-
- # %endframe matched: shim into pop_frame(stream_end).
ld_a0,sp,0
- ld_t0,sp,8
- la_a1 &proc_pos
- st_t0,a1,0
- ld_a2,a0,24
- la_a1 &proc_line_start
- st_a2,a1,0
+ ld_a1,sp,8
+ la_br &proc_save_pos_and_ls
+ call
ld_a0,sp,0
ld_a0,a0,8
la_br &pop_frame
call
- ld_a0,sp,0
- la_a1 &proc_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %1 %0
- st_t1,a0,24
- la_br &proc_loop
+ la_br &proc_restore_and_loop
b
:proc_check_newline
@@ -1567,6 +1524,30 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
bnez_t0
eret
+## proc_save_pos_and_ls(a0=s, a1=tok): publish the directive's position into
+## proc_pos / proc_line_start so the directive handler can drive against the
+## source stream. Leaf: writes only t0 and t1; a0 (s) and a1 (tok) are left unchanged.
+:proc_save_pos_and_ls
+ la_t0 &proc_pos
+ st_a1,t0,0
+ ld_t1,a0,24
+ la_t0 &proc_line_start
+ st_t1,t0,0
+ ret
+
+## proc_restore_and_loop: reached only via `b` (sp must be intact). Reads
+## sp,0 = s; copies proc_pos into s->pos, sets s->line_start=1, jumps to
+## proc_loop. Tail of every directive shim above.
+:proc_restore_and_loop
+ ld_a0,sp,0
+ la_a1 &proc_pos
+ ld_t0,a1,0
+ st_t0,a0,16
+ li_t1 %1 %0
+ st_t1,a0,24
+ la_br &proc_loop
+ b
+
## --- %scope / %endscope handlers --------------------------------------------
## Called at proc_pos == the `%scope` / `%endscope` word on a line-start.
## Input: a0 = stream end (pointer one past last token in the current stream).
@@ -4794,13 +4775,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %0 %0
- la_br &aeo_cmp_store_zero
+ la_br &aeo_cmp_finish
bne_a2,a3
li_t0 %1 %0
-:aeo_cmp_store_zero
- la_a0 &aeo_acc
- st_t0,a0,0
- la_br &aeo_finish
+ la_br &aeo_cmp_finish
b
:aeo_do_ne
@@ -4811,13 +4789,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %0 %0
- la_br &aeo_cmp_store_zero1
+ la_br &aeo_cmp_finish
beq_a2,a3
li_t0 %1 %0
-:aeo_cmp_store_zero1
- la_a0 &aeo_acc
- st_t0,a0,0
- la_br &aeo_finish
+ la_br &aeo_cmp_finish
b
:aeo_do_lt
@@ -4828,13 +4803,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %1 %0
- la_br &aeo_cmp_store_one
+ la_br &aeo_cmp_finish
blt_a2,a3
li_t0 %0 %0
-:aeo_cmp_store_one
- la_a0 &aeo_acc
- st_t0,a0,0
- la_br &aeo_finish
+ la_br &aeo_cmp_finish
b
:aeo_do_le
@@ -4846,13 +4818,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %0 %0
- la_br &aeo_cmp_store_two
+ la_br &aeo_cmp_finish
blt_a3,a2
li_t0 %1 %0
-:aeo_cmp_store_two
- la_a0 &aeo_acc
- st_t0,a0,0
- la_br &aeo_finish
+ la_br &aeo_cmp_finish
b
:aeo_do_gt
@@ -4864,13 +4833,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %1 %0
- la_br &aeo_cmp_store_three
+ la_br &aeo_cmp_finish
blt_a3,a2
li_t0 %0 %0
-:aeo_cmp_store_three
- la_a0 &aeo_acc
- st_t0,a0,0
- la_br &aeo_finish
+ la_br &aeo_cmp_finish
b
:aeo_do_ge
@@ -4882,10 +4848,15 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
ld_a2,a1,0
ld_a3,a1,8
li_t0 %0 %0
- la_br &aeo_cmp_store_four
+ la_br &aeo_cmp_finish
blt_a2,a3
li_t0 %1 %0
-:aeo_cmp_store_four
+ la_br &aeo_cmp_finish
+ b
+
+## Shared tail for the comparison ops: store t0 (the 0/1 result) into
+## aeo_acc, then jump to aeo_finish. Reached only via `b`.
+:aeo_cmp_finish
la_a0 &aeo_acc
st_t0,a0,0
la_br &aeo_finish
diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c
@@ -81,14 +81,37 @@
#include <stdlib.h>
#include <string.h>
-#define MAX_INPUT 262144
-#define MAX_OUTPUT 524288
-#define MAX_TEXT 1048576
-#define MAX_TOKENS 65536
+/* Caps chosen to mirror the M1pp.P1 BSS layout, sized so the cc.scm
+ * emission of tcc.flat.c (~6.5 MB of macro-rich .P1pp) lexes cleanly.
+ * The native binary is host-side, so static globals at these sizes
+ * just live in .bss / anonymous mmap without any of the ELF-segment
+ * sizing dance the bootstrap m1pp has to do. */
+#define MAX_INPUT 16777216 /* 16 MiB */
+#define MAX_OUTPUT 134217728 /* 128 MiB */
+#define MAX_TEXT 67108864 /* 64 MiB:
+ * paste tokens, hex literals from
+ * %(EXPR) evaluation, and per-call
+ * @local label rewrites all live
+ * here for the run's lifetime. cc.scm
+ * triggers hundreds of thousands of
+ * each across the tcc.c expansion. */
+#define MAX_TOKENS 8388608 /* 8 M slots × 32 B = 256 MiB */
#define MAX_MACROS 1024
#define MAX_PARAMS 16
#define MAX_MACRO_BODY_TOKENS MAX_TOKENS
-#define MAX_EXPAND 65536
+#define MAX_EXPAND 524288 /* 512 K × 32 B = 16 MiB:
+ * cc.scm wraps each C function in
+ * %fn(... { body }), and m1pp's
+ * expand_macro_tokens copies the
+ * argument tokens into the pool —
+ * so the entire body of a long
+ * function is resident in the pool
+ * while its outer %fn is active.
+ * tcc.c's next_nomacro1 (~5900
+ * lines × ~13 m1pp tokens/line ≈
+ * 77 K tokens, ~2.5 MiB) plus
+ * inner expansions sit comfortably
+ * under 16 MiB. */
#define MAX_STACK 64
#define MAX_EXPR_FRAMES 256
#define MAX_SCOPE_DEPTH 32