boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit e04409821656586c20a1688e9a86ab8384cf2e2c
parent 200e8f4f18f3894e47e56db1a1582f5b4ea24850
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 28 Apr 2026 17:36:35 -0700

M1pp: bump caps, cleanup duplication

Diffstat:
M M1pp/M1pp.P1 | 315 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M M1pp/M1pp.c | 33 ++++++++++++++++++++++++++++-----
2 files changed, 171 insertions(+), 177 deletions(-)

diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1 @@ -27,11 +27,25 @@ ## --- Constants & sizing ------------------------------------------------------ -DEFINE M1PP_INPUT_CAP 0000040000000000 -DEFINE M1PP_OUTPUT_CAP 0000040000000000 -DEFINE M1PP_TEXT_CAP 0000100000000000 -## source_tokens cap: 2 MB / 32-byte tokens = 65536 tokens. -DEFINE M1PP_TOKENS_END 0000200000000000 +## Caps bumped 2025-04 for tcc-boot2: cc.scm against the flattened +## tcc.flat.c emits ~6.5 MB of macro-rich .P1pp; m1pp's old caps were +## sized for the scheme1 build at ~256 KiB. New caps cover the larger +## working set the cc-bootstrap path exercises: +## +## 16 MiB input — combined prelude + tcc.flat.P1pp +## 128 MiB output — final .M1 (~8x source after expansion) +## 64 MiB text-arena — pasted names, hex literals from %(...) eval, +## per-call @local label rewrites (cc.scm +## triggers ~hundreds of K of each) +## 256 MiB source-toks — 8 M Token slots × 32 B +DEFINE M1PP_INPUT_CAP 0000000100000000 +DEFINE M1PP_OUTPUT_CAP 0000000800000000 +DEFINE M1PP_TEXT_CAP 0000000400000000 +## source_tokens cap: 256 MB / 32-byte tokens = ~8.4 M tokens. cc.scm +## emits one source token per ~3 input bytes (heavy macro use), so a +## 6.5 MB tcc.flat.P1pp tokenises to ~2.5 M; 8 M leaves comfortable +## headroom for the larger TUs the harness will exercise next. +DEFINE M1PP_TOKENS_END 0000001000000000 ## Macro record is 296 bytes: name (16) + param_count (8) + params[16]*16 (256) ## + body_start (8) + body_end (8). MACROS_CAP fits 1024 records (303104 B). ## Body-token arena fits 65536 tokens (2 MB = 0x200000). @@ -76,11 +90,23 @@ DEFINE M1PP_STREAM_POS_OFF 1000000000000000 DEFINE M1PP_STREAM_LS_OFF 1800000000000000 DEFINE M1PP_STREAM_MARK_OFF 2000000000000000 -## Stream stack cap: 16 streams × 40 = 640 bytes. -DEFINE M1PP_STREAM_STACK_CAP 8002000000000000 - -## Expansion pool fits 65536 Token slots × 32 bytes = 2 MB (0x200000). 
-DEFINE M1PP_EXPAND_CAP 0000200000000000 +## Stream stack cap: 128 streams × 40 = 5120 bytes (was 16 × 40 = 640). +## Each nested macro / expression eval pushes a stream; cc.scm's macro +## chains (%ld → %p1_mem → %select → %aa64_mem → %aa64_mem_after_nonneg +## → %aa64_ldst_uimm12 → %(EXPR), plus %fn / %ifelse_* / @local label +## scopes around the call site) routinely exceed 16 deep on the +## tcc-boot2 path. Note: the runtime overflow message ("token buffer +## overflow") at line 2504 is misleadingly shared with EXPAND/TOKENS; +## hitting the cap there means *stream* stack exhaustion. +DEFINE M1PP_STREAM_STACK_CAP 0014000000000000 + +## Expansion pool fits 524288 Token slots × 32 bytes = 16 MB (0x1000000). +## cc.scm emits %fn(name, sz, { body }), and m1pp eagerly copies the +## body argument into the pool when expanding %fn — so the entire body +## of a long function lives in the pool until %fn pops. tcc.c's +## next_nomacro1 (~5900 lines × ~13 tokens/line ≈ 77 K tokens, ~2.5 MiB) +## plus nested expansions sit comfortably under 16 MiB. +DEFINE M1PP_EXPAND_CAP 0000000100000000 ## ExprFrame record: 144 bytes. Fields: ## +0 op_code u64 @@ -122,9 +148,20 @@ DEFINE EXPR_GE 1000000000000000 DEFINE EXPR_STRLEN 1100000000000000 DEFINE EXPR_INVALID 1200000000000000 ## --- BSS layout (offsets from ELF_end) ------------------------------------- -## With 32-byte tokens we need ~2 MB per token region (source/body/pool) to -## fit large expansions like the scheme1 build. Total BSS ~7.8 MB, under -## the 8 MB segment memsz set by vendor/seed/<arch>/ELF.hex2. +## Total BSS ~499 MB, under the 512 MB segment memsz set by +## vendor/seed/<arch>/ELF.hex2. (If a future bump pushes past 512 MB, +## raise p_memsz in vendor/seed/<arch>/ELF.hex2 to 1 GiB and bump here.) +## +## Per-region BSS slot sizes (offset diffs): +## input_buf 32 MB (2x INPUT_CAP, headroom for trailing NUL etc.) 
+## output_buf 128 MB (1x OUTPUT_CAP) +## text_buf 64 MB (1x TEXT_CAP) +## source_tokens 256 MB (1x TOKENS_END) +## macros 592 KB (2x MACROS_CAP, unchanged) +## macro_body_tokens 2 MB +## streams 5120 B (128 streams × 40 B; was 16 × 40 = 640) +## expand_pool 16 MB (1x EXPAND_CAP) +## expr_frames 2304 B DEFINE OFF_paste_scratch 0000000000000000 DEFINE OFF_local_label_scratch 0001000000000000 DEFINE OFF_scope_stack 8001000000000000 @@ -133,14 +170,14 @@ DEFINE OFF_ebc_str_scratch 8004000000000000 DEFINE OFF_arg_starts 8005000000000000 DEFINE OFF_arg_ends 0006000000000000 DEFINE OFF_input_buf 8006000000000000 -DEFINE OFF_output_buf 8006080000000000 -DEFINE OFF_text_buf 80060c0000000000 -DEFINE OFF_source_tokens 80061c0000000000 -DEFINE OFF_macros 80063c0000000000 -DEFINE OFF_macro_body_tokens 8046450000000000 -DEFINE OFF_streams 8046650000000000 -DEFINE OFF_expand_pool 0049650000000000 -DEFINE OFF_expr_frames 0049850000000000 +DEFINE OFF_output_buf 8006000200000000 +DEFINE OFF_text_buf 8006000A00000000 +DEFINE OFF_source_tokens 8006000E00000000 +DEFINE OFF_macros 8006001E00000000 +DEFINE OFF_macro_body_tokens 8046091E00000000 +DEFINE OFF_streams 8046291E00000000 +DEFINE OFF_expand_pool 805A291E00000000 +DEFINE OFF_expr_frames 805A291F00000000 ## local_lookup_scratch — 256-byte working buffer used by ## expand_local_into_pool to assemble "<frame>_FRAME.<field>" before ## the macro-table linear search. Placed past expr_frames (BSS end) so @@ -1159,226 +1196,146 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 la_br &proc_check_struct beqz_a0 - # %macro: shim into define_macro through the proc_pos globals. - # define_macro reads/writes proc_pos and walks against source_end, - # so it only behaves correctly when s is the source stream — which - # holds in practice (line_start in expansion streams is cleared - # before any %macro could matter). 
proc_line_start receives the stream's - # line_start at directive entry — define_macro consults it to decide - # whether to consume the trailing newline after %endm. After the call - # we copy proc_pos back into s->pos and set s->line_start = 1. + # %macro: shim into define_macro through proc_pos / proc_line_start. + # define_macro reads/writes proc_pos against source_end, so it only + # behaves correctly when s is the source stream — which holds in + # practice (line_start in expansion streams is cleared before any + # %macro could matter). proc_line_start lets define_macro decide + # whether to consume the trailing newline after %endm. ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call la_br &define_macro call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%struct" ---- ## The %macro guard above already proved kind == TOK_WORD; if we reach here -## via a %macro non-match, that gate still holds. +## via a %macro non-match, that gate still holds. Each handler: +## 1. tok_eq_const(tok, NAME, len) +## 2. miss -> branch to next check +## 3. 
hit -> proc_save_pos_and_ls(s, tok); shim; proc_restore_and_loop :proc_check_struct - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_struct li_a2 %7 %0 la_br &tok_eq_const call la_br &proc_check_enum beqz_a0 - - # %struct matched: shim into define_fielded(stride=8, total="SIZE", len=4) ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call li_a0 %8 %0 la_a1 &const_size li_a2 %4 %0 la_br &define_fielded call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%enum" ---- :proc_check_enum - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_enum li_a2 %5 %0 la_br &tok_eq_const call la_br &proc_check_scope beqz_a0 - - # %enum matched: shim into define_fielded(stride=1, total="COUNT", len=5) ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call li_a0 %1 %0 la_a1 &const_count li_a2 %5 %0 la_br &define_fielded call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%scope" ---- :proc_check_scope - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_scope li_a2 %6 %0 la_br &tok_eq_const call la_br &proc_check_endscope beqz_a0 - - # %scope matched: shim into push_scope(stream_end). 
ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call ld_a0,sp,0 ld_a0,a0,8 la_br &push_scope call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%endscope" ---- :proc_check_endscope - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_endscope li_a2 %9 %0 la_br &tok_eq_const call la_br &proc_check_frame beqz_a0 - - # %endscope matched: shim into pop_scope(stream_end). ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call ld_a0,sp,0 ld_a0,a0,8 la_br &pop_scope call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%frame" ---- :proc_check_frame - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_frame li_a2 %6 %0 la_br &tok_eq_const call la_br &proc_check_endframe beqz_a0 - - # %frame matched: shim into push_frame(stream_end). ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call ld_a0,sp,0 ld_a0,a0,8 la_br &push_frame call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b ## ---- tok eq "%endframe" ---- :proc_check_endframe - ld_t0,sp,8 - mov_a0,t0 + ld_a0,sp,8 la_a1 &const_endframe li_a2 %9 %0 la_br &tok_eq_const call la_br &proc_check_newline beqz_a0 - - # %endframe matched: shim into pop_frame(stream_end). 
ld_a0,sp,0 - ld_t0,sp,8 - la_a1 &proc_pos - st_t0,a1,0 - ld_a2,a0,24 - la_a1 &proc_line_start - st_a2,a1,0 + ld_a1,sp,8 + la_br &proc_save_pos_and_ls + call ld_a0,sp,0 ld_a0,a0,8 la_br &pop_frame call - ld_a0,sp,0 - la_a1 &proc_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %1 %0 - st_t1,a0,24 - la_br &proc_loop + la_br &proc_restore_and_loop b :proc_check_newline @@ -1567,6 +1524,30 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 bnez_t0 eret +## proc_save_pos_and_ls(a0=s, a1=tok): publish the directive's position into +## proc_pos / proc_line_start so the directive handler can drive against the +## source stream. Leaf — preserves the caller's other state. +:proc_save_pos_and_ls + la_t0 &proc_pos + st_a1,t0,0 + ld_t1,a0,24 + la_t0 &proc_line_start + st_t1,t0,0 + ret + +## proc_restore_and_loop: reached only via `b` (sp must be intact). Reads +## sp,0 = s; copies proc_pos into s->pos, sets s->line_start=1, jumps to +## proc_loop. Tail of every directive shim above. +:proc_restore_and_loop + ld_a0,sp,0 + la_a1 &proc_pos + ld_t0,a1,0 + st_t0,a0,16 + li_t1 %1 %0 + st_t1,a0,24 + la_br &proc_loop + b + ## --- %scope / %endscope handlers -------------------------------------------- ## Called at proc_pos == the `%scope` / `%endscope` word on a line-start. ## Input: a0 = stream end (pointer one past last token in the current stream). 
@@ -4794,13 +4775,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %0 %0 - la_br &aeo_cmp_store_zero + la_br &aeo_cmp_finish bne_a2,a3 li_t0 %1 %0 -:aeo_cmp_store_zero - la_a0 &aeo_acc - st_t0,a0,0 - la_br &aeo_finish + la_br &aeo_cmp_finish b :aeo_do_ne @@ -4811,13 +4789,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %0 %0 - la_br &aeo_cmp_store_zero1 + la_br &aeo_cmp_finish beq_a2,a3 li_t0 %1 %0 -:aeo_cmp_store_zero1 - la_a0 &aeo_acc - st_t0,a0,0 - la_br &aeo_finish + la_br &aeo_cmp_finish b :aeo_do_lt @@ -4828,13 +4803,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %1 %0 - la_br &aeo_cmp_store_one + la_br &aeo_cmp_finish blt_a2,a3 li_t0 %0 %0 -:aeo_cmp_store_one - la_a0 &aeo_acc - st_t0,a0,0 - la_br &aeo_finish + la_br &aeo_cmp_finish b :aeo_do_le @@ -4846,13 +4818,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %0 %0 - la_br &aeo_cmp_store_two + la_br &aeo_cmp_finish blt_a3,a2 li_t0 %1 %0 -:aeo_cmp_store_two - la_a0 &aeo_acc - st_t0,a0,0 - la_br &aeo_finish + la_br &aeo_cmp_finish b :aeo_do_gt @@ -4864,13 +4833,10 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %1 %0 - la_br &aeo_cmp_store_three + la_br &aeo_cmp_finish blt_a3,a2 li_t0 %0 %0 -:aeo_cmp_store_three - la_a0 &aeo_acc - st_t0,a0,0 - la_br &aeo_finish + la_br &aeo_cmp_finish b :aeo_do_ge @@ -4882,10 +4848,15 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 ld_a2,a1,0 ld_a3,a1,8 li_t0 %0 %0 - la_br &aeo_cmp_store_four + la_br &aeo_cmp_finish blt_a2,a3 li_t0 %1 %0 -:aeo_cmp_store_four + la_br &aeo_cmp_finish + b + +## Shared tail for the comparison ops: store t0 (the 0/1 result) into +## aeo_acc, then jump to aeo_finish. Reached only via `b`. 
+:aeo_cmp_finish la_a0 &aeo_acc st_t0,a0,0 la_br &aeo_finish diff --git a/M1pp/M1pp.c b/M1pp/M1pp.c @@ -81,14 +81,37 @@ #include <stdlib.h> #include <string.h> -#define MAX_INPUT 262144 -#define MAX_OUTPUT 524288 -#define MAX_TEXT 1048576 -#define MAX_TOKENS 65536 +/* Caps chosen to mirror the M1pp.P1 BSS layout, sized so the cc.scm + * emission of tcc.flat.c (~6.5 MB of macro-rich .P1pp) lexes cleanly. + * The native binary is host-side, so static globals at these sizes + * just live in .bss / anonymous mmap without any of the ELF-segment + * sizing dance the bootstrap m1pp has to do. */ +#define MAX_INPUT 16777216 /* 16 MiB */ +#define MAX_OUTPUT 134217728 /* 128 MiB */ +#define MAX_TEXT 67108864 /* 64 MiB: + * paste tokens, hex literals from + * %(EXPR) evaluation, and per-call + * @local label rewrites all live + * here for the run's lifetime. cc.scm + * triggers hundreds of thousands of + * each across the tcc.c expansion. */ +#define MAX_TOKENS 8388608 /* 8 M slots × 32 B = 256 MiB */ #define MAX_MACROS 1024 #define MAX_PARAMS 16 #define MAX_MACRO_BODY_TOKENS MAX_TOKENS -#define MAX_EXPAND 65536 +#define MAX_EXPAND 524288 /* 512 K × 32 B = 16 MiB: + * cc.scm wraps each C function in + * %fn(... { body }), and m1pp's + * expand_macro_tokens copies the + * argument tokens into the pool — + * so the entire body of a long + * function is resident in the pool + * while its outer %fn is active. + * tcc.c's next_nomacro1 (~5900 + * lines × ~13 m1pp tokens/line ≈ + * 77 K tokens, ~2.5 MiB) plus + * inner expansions sit comfortably + * under 16 MiB. */ #define MAX_STACK 64 #define MAX_EXPR_FRAMES 256 #define MAX_SCOPE_DEPTH 32