boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

M1pp.P1 (171757B)


      1 ## m1pp.M1 — bootstrap M1 macro-expander, P1.
      2 ##
      3 ## Runtime shape: m1pp input.M1 output.M1
      4 ##
      5 ## Pipeline:
      6 ##   p1_main         argc/argv from the backend :_start stub; stash argv[1..2]
      7 ##                   into input_path / output_path; openat+read into
      8 ##                   input_buf; call lex_source, then process_tokens;
      9 ##                   openat+write output_buf to output_path; return 0.
     10 ##   lex_source      input_buf -> source_tokens[] (via append_text +
     11 ##                   push_source_token).
     12 ##   process_tokens  Stream-driven loop. Pushes source_tokens as the initial
     13 ##                   stream and walks it token-by-token, dispatching to
     14 ##                   define_macro at line-start %macro, emit_newline /
     15 ##                   emit_token for pass-through, expand_builtin_call for
     16 ##                   !@%$, %select, %str, %local, and expand_call
     17 ##                   for user macros. Macro expansions and %select push
     18 ##                   fresh streams onto streams[]; popping rewinds the
     19 ##                   expansion pool.
     20 ##
     21 ## Output is consumed directly by hex2pp -- there is no intermediate M0/hex2
     22 ## stage. Lexical scoping for control-flow labels is delegated to hex2pp's
     23 ## nestable .scope / .endscope; M1pp itself only handles per-expansion
     24 ## macro hygiene labels (:@name / &@name).
     25 ##   define_macro    Parse %macro header+body; record in macros[] +
     26 ##                   macro_body_tokens[]; consume through the %endm line
     27 ##                   without emitting output.
     28 ##
     29 ## P1 ABI: a0..a3 arg/return, t0..t2 caller-saved temps, s0..s3 callee-saved
     30 ## (unused here). Non-leaf functions use enter_0 / eret. Entry is the portable
     31 ## p1_main (a0=argc, a1=argv); the backend-owned :_start stub captures
     32 ## argc/argv from the native entry state and sys_exits p1_main's return value.
     33 
     34 ## --- Constants & sizing ------------------------------------------------------
     35 
        ## Encoding: every unquoted DEFINE value below is a 64-bit immediate spelled
        ## as 16 hex digits in little-endian byte order, the form the li_* loads
        ## consume (e.g. 0000000100000000 is 0x01000000 = 16 MiB).
        ##
     36 ## Caps bumped 2025-04 for tcc-boot2: cc.scm against the flattened
     37 ## tcc.flat.c emits ~6.5 MB of macro-rich .P1pp; m1pp's old caps were
     38 ## sized for the scheme1 build at ~256 KiB. New caps cover the larger
     39 ## working set the cc-bootstrap path exercises:
     40 ##
     41 ##   16 MiB input        — combined prelude + tcc.flat.P1pp
     42 ##  128 MiB output       — final .M1 (~8x source after expansion)
     43 ##   64 MiB text-arena   — pasted names, hex literals from %(...) eval,
     44 ##                         per-call @local label rewrites (cc.scm
     45 ##                         triggers ~hundreds of K of each)
     46 ##  256 MiB source-toks  — 8 M Token slots × 32 B
     47 DEFINE M1PP_INPUT_CAP 0000000100000000
     48 DEFINE M1PP_OUTPUT_CAP 0000000800000000
     49 DEFINE M1PP_TEXT_CAP 0000000400000000
     50 ## source_tokens cap: 256 MB / 32-byte tokens = ~8.4 M tokens. cc.scm
     51 ## emits one source token per ~3 input bytes (heavy macro use), so a
     52 ## 6.5 MB tcc.flat.P1pp tokenises to ~2.5 M; 8 M leaves comfortable
     53 ## headroom for the larger TUs the harness will exercise next.
        ## Note: TOKENS_END is a byte size (0x10000000), added to the array base
        ## by push_source_token to form the one-past-the-end pointer.
     54 DEFINE M1PP_TOKENS_END 0000001000000000
     55 ## Macro record is 304 bytes: name (16) + param_count (8) + params[16]*16 (256)
     56 ## + body_start (8) + body_end (8) + has_paste (8). MACROS_CAP fits 1024
     57 ## records (311296 B). The BSS slot for macros (0x94000 B, ~1.9x) still has
     58 ## ample headroom past the new cap. has_paste is a 0/1 flag set during
     59 ## define_macro when the body contains TOK_PASTE; expand_macro_tokens
     60 ## reads it (along with args_have_paste from parse_args) to skip the
     61 ## paste_pool_range scan when neither side contributes a `##`.
     62 ## Body-token arena fits 65536 tokens (2 MB = 0x200000).
        ## Field offsets below: body_start = 0x118 (280), body_end = 0x120 (288),
        ## has_paste = 0x128 (296); record size = 0x130 (304).
     63 DEFINE M1PP_MACRO_RECORD_SIZE 3001000000000000
     64 DEFINE M1PP_MACRO_BODY_START_OFF 1801000000000000
     65 DEFINE M1PP_MACRO_BODY_END_OFF 2001000000000000
     66 DEFINE M1PP_MACRO_HAS_PASTE_OFF 2801000000000000
     67 DEFINE M1PP_MACROS_CAP 00C0040000000000
     68 DEFINE M1PP_MACRO_BODY_CAP 0000200000000000
        ## openat arguments used by p1_main when creating the output file:
        ##   O_WRONLY_CREAT_TRUNC = 0x241 (presumably the Linux generic
        ##                          O_WRONLY|O_CREAT|O_TRUNC bits — confirm per arch)
        ##   MODE_0644            = 0x1A4 (octal 0644)
        ##   AT_FDCWD             = -100 as a 64-bit two's-complement value
     69 DEFINE O_WRONLY_CREAT_TRUNC 4102000000000000
     70 DEFINE MODE_0644 A401000000000000
     71 DEFINE AT_FDCWD 9CFFFFFFFFFFFFFF
        ## Quoted hex strings of zero bytes (32 / 8 / 4 bytes by name); not used in
        ## the code visible here — presumably padding for data tables elsewhere.
     72 DEFINE ZERO32 '0000000000000000000000000000000000000000000000000000000000000000'
     73 DEFINE ZERO8 '0000000000000000'
     74 DEFINE ZERO4 '00000000'
     75 
     76 DEFINE TOK_WORD 0000000000000000
        ## Token kind codes, stored in the 8-byte `kind` field at Token+0.
        ## The lexer produces kinds 1..8 from quotes, '\n', '(' ')' ',' '##' '{' '}';
        ## everything else becomes TOK_WORD (0).
     77 DEFINE TOK_STRING 0100000000000000
     78 DEFINE TOK_NEWLINE 0200000000000000
     79 DEFINE TOK_LPAREN 0300000000000000
     80 DEFINE TOK_RPAREN 0400000000000000
     81 DEFINE TOK_COMMA 0500000000000000
     82 DEFINE TOK_PASTE 0600000000000000
     83 DEFINE TOK_LBRACE 0700000000000000
     84 DEFINE TOK_RBRACE 0800000000000000
     85 
     86 ## Token record stride (kind + text_ptr + text_len + tight). Advance by this.
     87 ## Layout: +0 kind (8), +8 text_ptr (8), +16 text_len (8), +24 tight (8) = 32.
     88 ## Only byte 0 of the tight word is meaningful (0/1); upper bytes are zero.
        ## M1PP_TOK_SIZE = 0x20 (32); M1PP_TOK_TIGHT_OFF = 0x18 (24).
     89 DEFINE M1PP_TOK_SIZE 2000000000000000
     90 DEFINE M1PP_TOK_TIGHT_OFF 1800000000000000
     91 
     92 ## --- Stream / expansion-pool / expression-frame sizes ------------------------
     93 ## Stream record: 40 bytes. Fields (each 8 bytes):
     94 ##   +0   start       Token*
     95 ##   +8   end         Token*   (exclusive)
     96 ##   +16  pos         Token*
     97 ##   +24  line_start  u64      (1 at entry, 0 after first non-newline emit)
     98 ##   +32  pool_mark   i64      (byte offset into expand_pool; -1 for source)
        ## M1PP_STREAM_SIZE = 0x28 (40); the *_OFF values are the byte offsets
        ## 8 / 16 / 24 / 32 of the fields listed above (start is at +0).
     99 DEFINE M1PP_STREAM_SIZE 2800000000000000
    100 DEFINE M1PP_STREAM_END_OFF 0800000000000000
    101 DEFINE M1PP_STREAM_POS_OFF 1000000000000000
    102 DEFINE M1PP_STREAM_LS_OFF 1800000000000000
    103 DEFINE M1PP_STREAM_MARK_OFF 2000000000000000
    104 
    105 ## Stream stack cap: 128 streams × 40 = 5120 bytes (was 16 × 40 = 640).
    106 ## Each nested macro / expression eval pushes a stream; cc.scm's macro
    107 ## chains (%ld → %p1_mem → %select → %aa64_mem → %aa64_mem_after_nonneg
    108 ## → %aa64_ldst_uimm12 → %(EXPR), plus %fn / %ifelse_* / @local label
    109 ## scopes around the call site) routinely exceed 16 deep on the
    110 ## tcc-boot2 path. Note: the runtime overflow message ("token buffer
    111 ## overflow") at line 2504 is misleadingly shared with EXPAND/TOKENS;
    112 ## hitting the cap there means *stream* stack exhaustion.
        ## The cap is a byte count (0x1400 = 5120), not a number of streams.
    113 DEFINE M1PP_STREAM_STACK_CAP 0014000000000000
    114 
    115 ## Expansion pool fits 524288 Token slots × 32 bytes = 16 MB (0x1000000).
    116 ## cc.scm emits %fn(name, sz, { body }), and m1pp eagerly copies the
    117 ## body argument into the pool when expanding %fn — so the entire body
    118 ## of a long function lives in the pool until %fn pops. tcc.c's
    119 ## next_nomacro1 (~5900 lines × ~13 tokens/line ≈ 77 K tokens, ~2.5 MiB)
    120 ## plus nested expansions sit comfortably under 16 MiB.
    121 DEFINE M1PP_EXPAND_CAP 0000000100000000
    122 
    123 ## ExprFrame record: 144 bytes. Fields:
    124 ##   +0   op_code  u64
    125 ##   +8   argc     u64
    126 ##   +16  args     i64[16]  (16 × 8 = 128 bytes)
        ## M1PP_EXPR_FRAME_SIZE = 0x90 (144); argc at +8, args at +16.
    127 DEFINE M1PP_EXPR_FRAME_SIZE 9000000000000000
    128 DEFINE M1PP_EXPR_ARGC_OFF 0800000000000000
    129 DEFINE M1PP_EXPR_ARGS_OFF 1000000000000000
    130 
    131 ## Expr frame stack cap: 16 frames × 144 = 2304 bytes.
        ## Like the stream cap, this is a byte count (0x900 = 2304).
    132 DEFINE M1PP_EXPR_FRAMES_CAP 0009000000000000
    133 
    134 ## Common cap used by macro params, call args, and expression args.
        ## Value is 0x10 = 16, matching params[16] and args[16] in the records above.
    135 DEFINE M1PP_MAX_PARAMS 1000000000000000
    136 
    137 ## ExprOp codes (indexed by apply_expr_op).
        ## Codes are dense, 0x00..0x11 for the real operators; they are stored in
        ## the op_code field of an ExprFrame. EXPR_INVALID (0x12) is one past the
        ## last operator — presumably the "no such operator" result of the
        ## operator-name lookup (the lookup itself is outside this section).
    138 DEFINE EXPR_ADD 0000000000000000
    139 DEFINE EXPR_SUB 0100000000000000
    140 DEFINE EXPR_MUL 0200000000000000
    141 DEFINE EXPR_DIV 0300000000000000
    142 DEFINE EXPR_MOD 0400000000000000
    143 DEFINE EXPR_SHL 0500000000000000
    144 DEFINE EXPR_SHR 0600000000000000
    145 DEFINE EXPR_AND 0700000000000000
    146 DEFINE EXPR_OR 0800000000000000
    147 DEFINE EXPR_XOR 0900000000000000
    148 DEFINE EXPR_NOT 0A00000000000000
    149 DEFINE EXPR_EQ 0B00000000000000
    150 DEFINE EXPR_NE 0C00000000000000
    151 DEFINE EXPR_LT 0D00000000000000
    152 DEFINE EXPR_LE 0E00000000000000
    153 DEFINE EXPR_GT 0F00000000000000
    154 DEFINE EXPR_GE 1000000000000000
    155 DEFINE EXPR_STRLEN 1100000000000000
    156 DEFINE EXPR_INVALID 1200000000000000
    157 ## --- BSS layout (offsets from ELF_end) -------------------------------------
    158 ## Total BSS ~499 MB, under the 512 MB segment memsz set by
    159 ## vendor/seed/<arch>/ELF.hex2. (If a future bump pushes past 512 MB,
    160 ## raise p_memsz in vendor/seed/<arch>/ELF.hex2 to 1 GiB and bump here.)
    161 ##
    162 ## Per-region BSS slot sizes (offset diffs):
    163 ##   input_buf          32 MB    (2x INPUT_CAP, headroom for trailing NUL etc.)
    164 ##   output_buf        128 MB    (1x OUTPUT_CAP)
    165 ##   text_buf           64 MB    (1x TEXT_CAP)
    166 ##   source_tokens     256 MB    (1x TOKENS_END)
    167 ##   macros            592 KB    (0x94000 B, just under 2x MACROS_CAP; unchanged)
    168 ##   macro_body_tokens   2 MB
    169 ##   streams          5120 B    (128 streams × 40 B; was 16 × 40 = 640)
    170 ##   expand_pool        16 MB    (1x EXPAND_CAP)
    171 ##   expr_frames     2304 B
        ##   macro_body_param_idx        64 KB
        ##   macro_body_is_local_label   64 KB
        ##   local_lookup_scratch       256 B    (last region; BSS end = 0x1F2B6480)
        ##
        ## Offsets are 64-bit little-endian hex: 8006000200000000 is 0x02000680.
        ## The small scratch regions occupy 0x000..0x680; input_buf starts at 0x680.
    172 DEFINE OFF_paste_scratch 0000000000000000
    173 DEFINE OFF_local_label_scratch 0001000000000000
    174 DEFINE OFF_df_name_scratch 8003000000000000
    175 DEFINE OFF_ebc_str_scratch 8004000000000000
    176 DEFINE OFF_arg_starts 8005000000000000
    177 DEFINE OFF_arg_ends 0006000000000000
    178 DEFINE OFF_input_buf 8006000000000000
    179 DEFINE OFF_output_buf 8006000200000000
    180 DEFINE OFF_text_buf 8006000A00000000
    181 DEFINE OFF_source_tokens 8006000E00000000
    182 DEFINE OFF_macros 8006001E00000000
    183 DEFINE OFF_macro_body_tokens 8046091E00000000
    184 DEFINE OFF_streams 8046291E00000000
    185 DEFINE OFF_expand_pool 805A291E00000000
    186 DEFINE OFF_expr_frames 805A291F00000000
    187 ## local_lookup_scratch — 256-byte working buffer used by
    188 ## expand_local_into_pool to assemble "<frame>_FRAME.<field>" before
    189 ## the macro-table linear search. Placed past macro_body_is_local_label
    190 ## (0x1F2A6380 + 0x10000 = 0x1F2B6380), the true end of BSS, so it
        ## overlaps no other region and does not shift any existing OFF_*.
        ## The previous value (0x00855200) predated the cap bump and fell inside
        ## input_buf (0x680..0x02000680), so any input larger than ~8.3 MiB
        ## shared memory with this scratch buffer.
    191 DEFINE OFF_local_lookup_scratch 80632B1F00000000
    192 ## macro_body_param_idx / macro_body_is_local_label — 1 byte per body
    193 ## token slot (M1PP_MACRO_BODY_CAP / 32 = 65536 slots, so each region
    194 ## is 0x10000 bytes). Populated at %macro definition time so the
    195 ## expand_macro_tokens body loop reads cached classifications instead
    196 ## of re-running find_param + is_local_label_token per body token per
    197 ## expansion. Placed past the existing expr_frames (BSS end ≈ 0x1F296380)
    198 ## so adding them does not shift any other OFF_*.
    199 DEFINE OFF_macro_body_param_idx 8063291F00000000
    200 DEFINE OFF_macro_body_is_local_label 80632A1F00000000
    201 
    202 
    203 ## --- Runtime shell: argv, read input, call pipeline, write output, exit ------
    204 
        ## p1_main(a0=argc, a1=argv) -> a0 = 0 on success. Non-leaf (enter_0 / eret).
        ## Steps, in order:
        ##   1. fill the *_ptr slots named in bss_init_tbl with ELF_end + OFF_*;
        ##   2. require argc >= 3 and stash argv[1] / argv[2];
        ##   3. reset source_end / macros_end / macro_body_end to their array bases;
        ##   4. open and read the whole input file into input_buf, NUL-terminate it;
        ##   5. call lex_source, then process_tokens;
        ##   6. open the output file and write output_buf[0..output_used).
        ## Every failure branches to an err_* label defined outside this section.
        ## Branch idiom used throughout: `la_br &target` names the destination and
        ## the next instruction (b / beq / blt / call / ...) is the transfer to it.
    205 :p1_main
    206     enter_0
    207     # --- init BSS pointer slots from ELF_end via table walk ------------------
    208     # Each bss_init_tbl entry is 16 bytes:
    209     #   +0  slot ptr   (&label + 4 zero pad = 8-byte absolute address)
    210     #   +8  offset     (8-byte OFF_* constant)
    211     # For each entry: *slot_ptr = ELF_end + offset.
            # t0 = ELF_end (base), t1 = table cursor, t2 = table end. a0/a1 still
            # hold argc/argv and are deliberately left untouched by this loop.
    212     la_t0 &ELF_end
    213     la_t1 &bss_init_tbl
    214     la_t2 &bss_init_tbl_end
    215 :bss_init_loop
    216     la_br &bss_init_done
    217     beq_t1,t2
    218     ld_a2,t1,0
    219     ld_a3,t1,8
    220     add_a3,a3,t0
    221     st_a3,a2,0
    222     addi_t1,t1,16
    223     la_br &bss_init_loop
    224     b
    225 :bss_init_done
    226     # --- end BSS init -------------------------------------------------------
    227 
    228     # a0 = argc, a1 = argv (pointer to argv[0]).
    229     # if (argc < 3) usage
    230     li_a2 %3 %0
    231     la_br &err_usage
    232     blt_a0,a2
    233 
    234     # Stash argv[1] and argv[2] into memory before anything clobbers a1.
    235     ld_t0,a1,8
    236     la_a2 &input_path
    237     st_t0,a2,0
    238     ld_t0,a1,16
    239     la_a2 &output_path
    240     st_t0,a2,0
    241 
    242     # source_end = &source_tokens   (running tail pointer)
    243     la_a0 &source_tokens_ptr
    244     ld_a0,a0,0
    245     la_a2 &source_end
    246     st_a0,a2,0
    247 
    248     # macros_end = &macros; macro_body_end = &macro_body_tokens
    249     la_a0 &macros_ptr
    250     ld_a0,a0,0
    251     la_a2 &macros_end
    252     st_a0,a2,0
    253     la_a0 &macro_body_tokens_ptr
    254     ld_a0,a0,0
    255     la_a2 &macro_body_end
    256     st_a0,a2,0
    257 
    258     # input_fd = openat(AT_FDCWD, input_path, O_RDONLY, 0)
            # Syscall convention as used here: a0 = syscall number, then the
            # arguments in a1, a2, a3, t0; the result comes back in a0.
    259     li_a0 sys_openat
    260     li_a1 AT_FDCWD
    261     la_a2 &input_path
    262     ld_a2,a2,0
    263     li_a3 %0 %0
    264     li_t0 %0 %0
    265     syscall
    266     la_br &err_open_input
    267     bltz_a0
    268     la_a1 &input_fd
    269     st_a0,a1,0
    270 
    271 :read_loop
    272     # while (input_len < INPUT_CAP)
            # input_len is never stored to before this loop; assumes its storage
            # starts out as 0 — TODO confirm at its definition.
    273     la_a0 &input_len
    274     ld_t1,a0,0
    275     li_t2 M1PP_INPUT_CAP
    276     la_br &read_done
    277     beq_t1,t2
    278 
    279     # n = read(input_fd, &input_buf[input_len], INPUT_CAP - input_len)
    280     la_a0 &input_fd
    281     ld_a1,a0,0
    282     la_a2 &input_buf_ptr
    283     ld_a2,a2,0
    284     add_a2,a2,t1
    285     sub_a3,t2,t1
    286     li_a0 sys_read
    287     syscall
    288 
    289     # if (n == 0) break;  if (n < 0) fatal
    290     la_br &read_done
    291     beqz_a0
    292     la_br &err_read
    293     bltz_a0
    294 
    295     # input_len += n
            # input_len is reloaded from memory here; nothing after the syscall
            # relies on t1/t2 still holding their pre-syscall values.
    296     la_a1 &input_len
    297     ld_a2,a1,0
    298     add_a2,a2,a0
    299     st_a2,a1,0
    300     la_br &read_loop
    301     b
    302 
    303 :read_done
    304     # if (input_len == INPUT_CAP) fatal  (no room for null terminator)
            # Reached both on EOF and when the buffer filled exactly, so an input
            # of exactly INPUT_CAP bytes is rejected as too big.
    305     la_a0 &input_len
    306     ld_t0,a0,0
    307     li_t1 M1PP_INPUT_CAP
    308     la_br &err_input_too_big
    309     beq_t0,t1
    310 
    311     # input_buf[input_len] = '\0'
            # This NUL is the sentinel the lexer stops on (lex_char_class 11).
    312     la_a0 &input_buf_ptr
    313     ld_a0,a0,0
    314     add_a0,a0,t0
    315     li_t1 %0 %0
    316     sb_t1,a0,0
    317 
    318     # lex_source(); process_tokens()
    319     la_br &lex_source
    320     call
    321     la_br &process_tokens
    322     call
    323 
            # NOTE(review): this branch targets the label immediately below, so it
            # is equivalent to falling through.
    324     la_br &write_output
    325     b
    326 
    327 :write_output
    328     # output_fd = openat(AT_FDCWD, output_path, O_WRONLY|O_CREAT|O_TRUNC, 0644)
    329     la_a0 &output_path
    330     ld_a2,a0,0
    331     li_a0 sys_openat
    332     li_a1 AT_FDCWD
    333     li_a3 O_WRONLY_CREAT_TRUNC
    334     li_t0 MODE_0644
    335     syscall
    336     la_br &err_open_output
    337     bltz_a0
    338     la_a1 &output_fd
    339     st_a0,a1,0
    340 
    341 :write_loop
    342     # while (output_written < output_used)
            # t0 = output_written, t1 = output_used; the loop ends on equality.
    343     la_a0 &output_written
    344     ld_t0,a0,0
    345     la_a1 &output_used
    346     ld_t1,a1,0
    347     la_br &write_done
    348     beq_t0,t1
    349 
    350     # n = write(output_fd, &output_buf[output_written], output_used - output_written)
    351     la_a0 &output_fd
    352     ld_a1,a0,0
    353     la_a2 &output_buf_ptr
    354     ld_a2,a2,0
    355     add_a2,a2,t0
    356     sub_a3,t1,t0
    357     li_a0 sys_write
    358     syscall
    359 
    360     # n <= 0 is fatal (short write or error)
            # A partial write (0 < n < requested) is NOT fatal: it just advances
            # output_written and loops. Only n == 0 and n < 0 reach err_write,
            # which also guarantees the loop cannot spin without progress.
    361     la_br &err_write
    362     bltz_a0
    363     la_br &err_write
    364     beqz_a0
    365 
    366     # output_written += n
    367     la_a1 &output_written
    368     ld_a2,a1,0
    369     add_a2,a2,a0
    370     st_a2,a1,0
    371     la_br &write_loop
    372     b
    373 
    374 :write_done
    375     # return 0 (backend :_start stub sys_exits with a0)
            # Neither input_fd nor output_fd is closed in this function.
    376     li_a0 %0 %0
    377     eret
    378 
    379 ## --- Helpers: text arena + token array + equality ----------------------------
    380 ## append_text copies bytes into text_buf and NUL-terminates the copy. The
    381 ## lexer routes WORD, STRING, `##` and single-char punctuation text through
    382 ## it; only TOK_NEWLINE (see lex_newline) points straight into input_buf.
    383 
    384 ## append_text(a0=src, a1=len) -> a0=text ptr. Leaf.
        ## Copies len bytes from src to the current end of text_buf, appends a NUL,
        ## and advances text_used by len + 1. The returned pointer is the start of
        ## the copy. Branches to err_text_overflow if the copy plus its NUL would
        ## exceed M1PP_TEXT_CAP. a1 is preserved; a2, a3, t0, t1, t2 are clobbered.
    385 :append_text
    386     # a3 = text_used
    387     la_a2 &text_used
    388     ld_a3,a2,0
    389 
    390     # if (text_used + len + 1) > TEXT_CAP: fatal
            # blt_t1,t0 branches when TEXT_CAP < needed, so landing exactly on the
            # cap is still accepted.
    391     add_t0,a3,a1
    392     addi_t0,t0,1
    393     li_t1 M1PP_TEXT_CAP
    394     la_br &err_text_overflow
    395     blt_t1,t0
    396 
    397     # dst = &text_buf[text_used]
    398     la_t0 &text_buf_ptr
    399     ld_t0,t0,0
    400     add_t0,t0,a3
    401 
    402     # for (i = 0; i < len; i++) dst[i] = src[i]
            # t1 = i, t2 = byte in flight, a2 = &dst[i].
    403     li_t1 %0 %0
    404 :append_text_loop
    405     la_br &append_text_done
    406     beq_t1,a1
    407     add_t2,a0,t1
    408     lb_t2,t2,0
    409     add_a2,t0,t1
    410     sb_t2,a2,0
    411     addi_t1,t1,1
    412     la_br &append_text_loop
    413     b
    414 :append_text_done
    415     # dst[len] = '\0'
            # t1 == len here, so t0 + t1 is the byte just past the copy.
    416     add_a2,t0,t1
    417     li_t2 %0 %0
    418     sb_t2,a2,0
    419 
    420     # text_used += len + 1
            # text_used is reloaded because a3 is still the pre-copy value but a2
            # was reused as a scratch pointer inside the loop.
    421     la_a2 &text_used
    422     ld_a3,a2,0
    423     add_a3,a3,a1
    424     addi_a3,a3,1
    425     st_a3,a2,0
    426 
    427     # return dst
    428     mov_a0,t0
    429     ret
    430 
    431 ## push_source_token(a0=kind, a1=text_ptr, a2=text_len, a3=tight). Leaf.
    432 ## Token layout: +0 kind, +8 text_ptr, +16 text_len, +24 tight (32B total).
    433 ## tight=1 means "no whitespace before this token"; consulted only on LPAREN
    434 ## by the paren-call recognizers (%FOO(...), !(...), @(...), %(...), $(...),
    435 ## %select(...), %str(...)). All other kinds carry the bit but ignore it.
        ## Writes one Token at *source_end and advances source_end by 32 bytes.
        ## Branches to err_token_overflow when the array is already full.
        ## a0..a3 are left unchanged; t0, t1, t2 are clobbered. No return value
        ## is set.
    436 :push_source_token
    437     # tok = source_end
    438     la_t2 &source_end
    439     ld_t0,t2,0
    440 
    441     # if (tok == &source_tokens[0] + TOKENS_END) fatal
            # Equality is sufficient only because source_end advances in 32-byte
            # steps from the array base and TOKENS_END is a multiple of 32.
    442     la_t1 &source_tokens_ptr
    443     ld_t1,t1,0
    444     li_t2 M1PP_TOKENS_END
    445     add_t1,t1,t2
    446     la_br &err_token_overflow
    447     beq_t0,t1
    448 
    449     # tok->kind = kind; tok->text_ptr = text_ptr; tok->text_len = text_len;
    450     # tok->tight = tight (8-byte stores zero the rest of each slot).
    451     st_a0,t0,0
    452     st_a1,t0,8
    453     st_a2,t0,16
    454     st_a3,t0,24
    455 
    456     # source_end = tok + 1 (advance 32 bytes)
            # t2 was overwritten by the cap constant above, so &source_end is
            # re-materialised before the store.
    457     la_t2 &source_end
    458     addi_t0,t0,32
    459     st_t0,t2,0
    460     ret
    461 
    462 ## tok_eq_const(a0=token_ptr, a1=const_ptr, a2=const_len) -> a0=0/1. Leaf.
    463 ## Compares a token's text against a constant byte string.
        ## Returns 1 only when the token's text_len (+16) equals const_len and all
        ## const_len bytes at text_ptr (+8) match const_ptr; otherwise 0. A zero
        ## const_len matches a token whose text_len is 0. The token kind is not
        ## examined. a1 and a2 are preserved; a3, t0, t1, t2 are clobbered.
    464 :tok_eq_const
    465     # if (tok->text_len != const_len) return 0
    466     ld_a3,a0,16
    467     la_br &tok_eq_false
    468     bne_a3,a2
    469 
    470     # src = tok->text_ptr; i = 0
    471     ld_t0,a0,8
    472     li_t1 %0 %0
    473 :tok_eq_loop
    474     # if (i == const_len) return 1
    475     la_br &tok_eq_true
    476     beq_t1,a2
    477 
    478     # if (src[i] != const_ptr[i]) return 0
            # Both bytes go through the same lb load, so the comparison is
            # consistent regardless of how lb extends the byte.
    479     add_t2,t0,t1
    480     lb_t2,t2,0
    481     add_a3,a1,t1
    482     lb_a3,a3,0
    483     la_br &tok_eq_false
    484     bne_t2,a3
    485 
    486     # i++
    487     addi_t1,t1,1
    488     la_br &tok_eq_loop
    489     b
    490 :tok_eq_true
    491     li_a0 %1 %0
    492     ret
    493 :tok_eq_false
    494     li_a0 %0 %0
    495     ret
    496 
    497 ## --- Lexer -------------------------------------------------------------------
    498 ## Dispatches on the first byte at lex_ptr:
    499 ##   whitespace (sp/tab/cr/ff/vt) -> lex_skip_one
    500 ##   newline (\n)                 -> lex_newline   -> TOK_NEWLINE
    501 ##   quote (" or ')               -> lex_string    -> TOK_STRING
    502 ##   `#`                          -> lex_hash      -> TOK_PASTE on ##, else comment
    503 ##   `;`                          -> lex_comment   (drop to end of line)
    504 ##   `(` `)` `,` `{` `}`          -> lex_lparen / rparen / comma / lbrace / rbrace
    505 ##   otherwise                    -> lex_word      -> TOK_WORD
    506 ##
    507 ## All branches loop back to lex_loop. lex_done exits once lex_ptr hits
    508 ## the terminating NUL that p1_main stores at input_buf[input_len].
    509 
    510 ## lex_source(): fills source_tokens[] from input_buf.
    511 ## lex_saw_separator tracks whether whitespace (sp/tab/CR/etc., newline,
    512 ## or `;`/`#` line comment) precedes the next token. Each non-newline push
    513 ## passes tight = !lex_saw_separator and then sets lex_saw_separator = 0.
    514 ## Initialized to 1 at start of file so the first token is never tight.
        ## Non-leaf: opens a frame with enter_0; the single exit is the eret at
        ## lex_done. All lex_* labels below run inside this frame and are reached
        ## by branch, never by call.
        ## Handler entry contract (set up by lex_loop): t0 = lex_ptr, a0 = the
        ## byte at lex_ptr, a2 = its lex_char_class value. Handlers that use them
        ## (lex_skip_one, lex_newline, lex_string, lex_hash, lex_word) do so
        ## before making any call.
    515 :lex_source
    516     enter_0
    517     la_a0 &input_buf_ptr
    518     ld_a0,a0,0
    519     la_a1 &lex_ptr
    520     st_a0,a1,0
    521     la_a0 &lex_saw_separator
    522     li_t0 %1 %0
    523     st_t0,a0,0
    524 :lex_loop
    525     # c = *lex_ptr; dispatch on lex_char_class[c].
    526     #   0 word, 1 skip ws, 2 newline, 3 string, 4 hash, 5 comment,
    527     #   6 '(', 7 ')', 8 ',', 9 '{', 10 '}', 11 NUL (fall through to done).
            # NOTE(review): c is used directly as the table index, which assumes
            # lb yields 0..255 (zero-extension) — confirm against the P1 spec.
    528     la_a0 &lex_ptr
    529     ld_t0,a0,0
    530     lb_a0,t0,0
    531     la_a1 &lex_char_class
    532     add_a1,a1,a0
    533     lb_a2,a1,0
    534 
            # Compare chain: a1 is reloaded with each class number in turn.
    535     la_br &lex_word
    536     beqz_a2
    537     li_a1 %1 %0
    538     la_br &lex_skip_one
    539     beq_a2,a1
    540     li_a1 %2 %0
    541     la_br &lex_newline
    542     beq_a2,a1
    543     li_a1 %3 %0
    544     la_br &lex_string
    545     beq_a2,a1
    546     li_a1 %4 %0
    547     la_br &lex_hash
    548     beq_a2,a1
    549     li_a1 %5 %0
    550     la_br &lex_comment
    551     beq_a2,a1
    552     li_a1 %6 %0
    553     la_br &lex_lparen
    554     beq_a2,a1
    555     li_a1 %7 %0
    556     la_br &lex_rparen
    557     beq_a2,a1
    558     li_a1 %8 %0
    559     la_br &lex_comma
    560     beq_a2,a1
    561     li_a1 %9 %0
    562     la_br &lex_lbrace
    563     beq_a2,a1
    564     li_a1 %10 %0
    565     la_br &lex_rbrace
    566     beq_a2,a1
    567     ## class 11 (NUL) — fall through
            # Any class value not matched above (not just 11) also ends the lex.
    568     la_br &lex_done
    569     b
    570 
    571 :lex_skip_one
    572     # whitespace separator: lex_saw_separator = 1; lex_ptr++
            # Entered from lex_loop with t0 = lex_ptr; emits no token.
    573     la_a1 &lex_saw_separator
    574     li_a2 %1 %0
    575     st_a2,a1,0
    576     addi_t0,t0,1
    577     la_a0 &lex_ptr
    578     st_t0,a0,0
    579     la_br &lex_loop
    580     b
    581 
    582 :lex_newline
    583     # push_source_token(TOK_NEWLINE, lex_ptr, 1, tight=0); newline acts as a
    584     # separator for the NEXT token, so lex_saw_separator = 1 afterwards.
            # Unlike the other token kinds, the text pointer is lex_ptr itself
            # (t0 on entry): the token refers to the '\n' byte inside input_buf
            # instead of a copy made by append_text.
    585     mov_a1,t0
    586     li_a0 TOK_NEWLINE
    587     li_a2 %1 %0
    588     li_a3 %0 %0
    589     la_br &push_source_token
    590     call
    591 
    592     la_a0 &lex_saw_separator
    593     li_t0 %1 %0
    594     st_t0,a0,0
    595 
    596     # lex_ptr++
            # Reloaded from memory: push_source_token clobbered t0.
    597     la_a0 &lex_ptr
    598     ld_t0,a0,0
    599     addi_t0,t0,1
    600     st_t0,a0,0
    601     la_br &lex_loop
    602     b
    603 
    604 :lex_string
    605     # lex_start = lex_ptr; lex_quote = c; lex_ptr++
            # Entered from lex_loop with t0 = lex_ptr and a0 = the opening quote
            # byte. The token text produced below includes both quote characters
            # (it spans lex_start .. lex_ptr) and any backslash escapes verbatim.
    606     la_a1 &lex_start
    607     st_t0,a1,0
    608     la_a1 &lex_quote
    609     st_a0,a1,0
    610     addi_t0,t0,1
    611 :lex_string_scan
    612     # c = *lex_ptr
    613     lb_a0,t0,0
    614     # if (c == '\0') finish (unterminated; keep what we have)
    615     la_br &lex_string_finish
    616     beqz_a0
    617     # if (c == '\\' && lex_ptr[1] != '\0') skip both bytes as a unit so
    618     # `\"` and `\\` don't accidentally terminate the string. Decoding
    619     # the escape's *meaning* (e.g. for %bytes) happens later — here we
    620     # only care about token boundaries.
            # 92 is the ASCII code of the backslash.
    621     li_a1 %92 %0
    622     la_br &lex_string_check_backslash
    623     beq_a0,a1
    624     la_br &lex_string_no_escape
    625     b
    626 :lex_string_check_backslash
            # A backslash directly before the NUL terminator is not an escape:
            # it drops into the ordinary path so the scan stops at the NUL.
    627     addi_a1,t0,1
    628     lb_a1,a1,0
    629     la_br &lex_string_no_escape
    630     beqz_a1
    631     addi_t0,t0,2
    632     la_br &lex_string_scan
    633     b
    634 :lex_string_no_escape
    635     # if (c == quote) consume closing quote and finish
            # Only the quote character that opened the string closes it, so a
            # `'` inside a "..." string (and vice versa) is ordinary text.
    636     la_a1 &lex_quote
    637     ld_a1,a1,0
    638     la_br &lex_string_after_quote
    639     beq_a0,a1
    640     # else lex_ptr++
    641     addi_t0,t0,1
    642     la_br &lex_string_scan
    643     b
    644 :lex_string_after_quote
    645     addi_t0,t0,1
    646 :lex_string_finish
    647     # lex_ptr = t0
    648     la_a1 &lex_ptr
    649     st_t0,a1,0
    650 
    651     # text_ptr = append_text(lex_start, lex_ptr - lex_start)
    652     la_a1 &lex_start
    653     ld_a0,a1,0
    654     sub_a1,t0,a0
    655     la_br &append_text
    656     call
    657 
    658     # push_source_token(TOK_STRING, text_ptr, lex_ptr - lex_start,
    659     #                   tight = !lex_saw_separator); then lex_saw_separator = 0.
            # The length is recomputed from lex_ptr / lex_start in memory because
            # append_text clobbered the temporaries. tight is formed as
            # 1 - lex_saw_separator, which relies on the flag being exactly 0 or 1.
    660     la_a1 &lex_ptr
    661     ld_t0,a1,0
    662     la_a1 &lex_start
    663     ld_t1,a1,0
    664     sub_a2,t0,t1
    665     mov_a1,a0
    666     li_a0 TOK_STRING
    667     la_a3 &lex_saw_separator
    668     ld_a3,a3,0
    669     li_t1 %1 %0
    670     sub_a3,t1,a3
    671     la_br &push_source_token
    672     call
    673     la_a0 &lex_saw_separator
    674     li_t0 %0 %0
    675     st_t0,a0,0
    676     la_br &lex_loop
    677     b
    678 
    679 :lex_hash
    680     # if (lex_ptr[1] == '#') goto lex_paste, else lex_comment
            # Entered with t0 = lex_ptr pointing at a '#'. 35 is ASCII '#'.
            # Consequence: in m1pp input a line comment must start with a single
            # '#' (or ';'); a leading "##" is lexed as the paste operator.
    681     addi_a1,t0,1
    682     lb_a1,a1,0
    683     li_a2 %35 %0
    684     la_br &lex_paste
    685     beq_a1,a2
    686     la_br &lex_comment
    687     b
    688 
    689 :lex_paste
    690     # text_ptr = append_text("##", 2)
            # The text comes from the const_paste constant, not from input_buf.
    691     la_a0 &const_paste
    692     li_a1 %2 %0
    693     la_br &append_text
    694     call
    695 
    696     # push_source_token(TOK_PASTE, text_ptr, 2, tight = !lex_saw_separator).
    697     mov_a1,a0
    698     li_a0 TOK_PASTE
    699     li_a2 %2 %0
    700     la_a3 &lex_saw_separator
    701     ld_a3,a3,0
    702     li_t1 %1 %0
    703     sub_a3,t1,a3
    704     la_br &push_source_token
    705     call
    706 
            # A token was emitted, so the next token is tight unless whitespace
            # intervenes.
    707     la_a0 &lex_saw_separator
    708     li_t0 %0 %0
    709     st_t0,a0,0
    710 
    711     # lex_ptr += 2
            # lex_ptr in memory still points at the first '#'; skip both.
    712     la_a0 &lex_ptr
    713     ld_t0,a0,0
    714     addi_t0,t0,2
    715     st_t0,a0,0
    716     la_br &lex_loop
    717     b
    718 
    719 :lex_comment
    720     # skip to end of line: while (c != '\0' && c != '\n') lex_ptr++
            # Reached from lex_loop (class 5) and from lex_hash (single '#').
            # lex_ptr is reloaded rather than trusting t0. The terminating '\n'
            # or NUL is left unconsumed so lex_loop handles it next.
    721     la_a0 &lex_ptr
    722     ld_t0,a0,0
    723 :lex_comment_loop
    724     lb_a0,t0,0
    725     la_br &lex_comment_done
    726     beqz_a0
            # 10 is ASCII '\n'.
    727     li_a1 %10 %0
    728     la_br &lex_comment_done
    729     beq_a0,a1
    730     addi_t0,t0,1
    731     la_br &lex_comment_loop
    732     b
    733 :lex_comment_done
    734     # `;` / `#` line comment counts as a separator for the next token.
    735     la_a0 &lex_ptr
    736     st_t0,a0,0
    737     la_a0 &lex_saw_separator
    738     li_t0 %1 %0
    739     st_t0,a0,0
    740     la_br &lex_loop
    741     b
    742 
    743 ## lex_lparen / lex_rparen / lex_comma / lex_lbrace / lex_rbrace all share
    744 ## the same shape: load the token kind into a0 and the address of the
    745 ## single-char constant into a1, then enter lex_punct1 (by branch, or by
        ## fall-through in the case of lex_rbrace, which sits directly above it).
    746 
    747 :lex_lparen
    748     li_a0 TOK_LPAREN
    749     la_a1 &const_lparen
    750     la_br &lex_punct1
    751     b
    752 :lex_rparen
    753     li_a0 TOK_RPAREN
    754     la_a1 &const_rparen
    755     la_br &lex_punct1
    756     b
    757 :lex_comma
    758     li_a0 TOK_COMMA
    759     la_a1 &const_comma
    760     la_br &lex_punct1
    761     b
    762 :lex_lbrace
    763     li_a0 TOK_LBRACE
    764     la_a1 &const_lbrace
    765     la_br &lex_punct1
    766     b
    767 :lex_rbrace
    768     li_a0 TOK_RBRACE
    769     la_a1 &const_rbrace
    770     ## fall through into lex_punct1
    771 
    772 ## lex_punct1(a0=kind, a1=const_ptr): append 1 byte to text arena, push a
    773 ## 1-byte token of the given kind, advance lex_ptr by 1, branch back to
    774 ## lex_loop. Called by tail-branch from the single-char lex_X blocks, which
    775 ## all share lex_source's frame. Spills `kind` since append_text clobbers
    776 ## a0..a3.
        ## The spill goes to the lex_punct_kind memory slot rather than the stack.
    777 :lex_punct1
    778     la_t0 &lex_punct_kind
    779     st_a0,t0,0
    780     mov_a0,a1
    781     li_a1 %1 %0
    782     la_br &append_text
    783     call
            # a0 = pointer to the copied character; restore kind from the spill.
    784     mov_a1,a0
    785     la_t0 &lex_punct_kind
    786     ld_a0,t0,0
    787     li_a2 %1 %0
    788     # tight = !lex_saw_separator (the load is consumed before push_source_token).
    789     la_a3 &lex_saw_separator
    790     ld_a3,a3,0
    791     li_t1 %1 %0
    792     sub_a3,t1,a3
    793     la_br &push_source_token
    794     call
    795     la_a0 &lex_saw_separator
    796     li_t0 %0 %0
    797     st_t0,a0,0
    798     ## fall through to lex_advance_one_then_loop
    799 
    800 :lex_advance_one_then_loop
    801     # lex_ptr++
            # lex_ptr is read back from memory: the calls above clobbered t0.
    802     la_a0 &lex_ptr
    803     ld_t0,a0,0
    804     addi_t0,t0,1
    805     st_t0,a0,0
    806     la_br &lex_loop
    807     b
    808 
    809 :lex_word
    810     # lex_start = lex_ptr
            # Entered from lex_loop with t0 = lex_ptr on a class-0 byte, so the
            # word is always at least one byte long.
    811     la_a1 &lex_start
    812     st_t0,a1,0
    813 :lex_word_scan
    814     # c = *lex_ptr; terminate the word if lex_char_class[c] is non-WORD,
    815     # but treat class 3 (string-quote `"`/`'`) as part of the word too —
    816     # quotes only start a STRING token at token start, not mid-word.
    817     # That matches M1pp.c, where the WORD scanner ignores `"`/`'` and
    818     # so `\`"hi"\`` (backtick-quote-...-quote-backtick with no spaces)
    819     # lexes as a single WORD.
            # The first iteration re-examines the starting byte (class 0) and
            # simply continues. The NUL terminator is class 11, so it ends the
            # word like any other non-word class.
    820     lb_a2,t0,0
    821     la_a1 &lex_char_class
    822     add_a1,a1,a2
    823     lb_a2,a1,0
    824     la_br &lex_word_continue
    825     beqz_a2
    826     li_a1 %3 %0
    827     la_br &lex_word_finish
    828     bne_a2,a1
    829 :lex_word_continue
    830     addi_t0,t0,1
    831     la_br &lex_word_scan
    832     b
    833 :lex_word_finish
    834     # lex_ptr = t0
            # The terminating byte is left unconsumed for lex_loop.
    835     la_a1 &lex_ptr
    836     st_t0,a1,0
    837 
    838     # text_ptr = append_text(lex_start, lex_ptr - lex_start)
    839     la_a1 &lex_start
    840     ld_a0,a1,0
    841     sub_a1,t0,a0
    842     la_br &append_text
    843     call
    844 
    845     # push_source_token(TOK_WORD, text_ptr, lex_ptr - lex_start,
    846     #                   tight = !lex_saw_separator); lex_saw_separator = 0.
            # Length is recomputed from the memory copies of lex_ptr / lex_start
            # because append_text clobbered the temporaries.
    847     la_a1 &lex_ptr
    848     ld_t0,a1,0
    849     la_a1 &lex_start
    850     ld_t1,a1,0
    851     sub_a2,t0,t1
    852     mov_a1,a0
    853     li_a0 TOK_WORD
    854     la_a3 &lex_saw_separator
    855     ld_a3,a3,0
    856     li_t1 %1 %0
    857     sub_a3,t1,a3
    858     la_br &push_source_token
    859     call
    860     la_a0 &lex_saw_separator
    861     li_t0 %0 %0
    862     st_t0,a0,0
    863     la_br &lex_loop
    864     b
    865 
    866 :lex_done
    867     eret
    868 
    869 ## --- Output: normalized token stream to output_buf ---------------------------
    870 ## emit_newline writes '\n' and clears output_need_space.
    871 ## emit_token prefixes a space when output_need_space is set, copies the
    872 ## token text, then sets output_need_space. This is how source whitespace
    873 ## gets normalized: one '\n' per TOK_NEWLINE, one ' ' between consecutive
    874 ## non-newline tokens.
    875 
    876 ## emit_newline(). Leaf.
        ## Appends one '\n' byte to output_buf at index output_used, increments
        ## output_used, and clears output_need_space. Branches to
        ## err_output_overflow when output_used already equals M1PP_OUTPUT_CAP.
        ## Takes no arguments and makes no calls. Clobbers a0, a1, t0, t1, t2.
    877 :emit_newline
    878     # if (output_used == OUTPUT_CAP) fatal
    879     la_a0 &output_used
    880     ld_t0,a0,0
    881     li_t1 M1PP_OUTPUT_CAP
    882     la_br &err_output_overflow
    883     beq_t0,t1
    884 
    885     # output_buf[output_used] = '\n'; output_used++
            # (a0 still holds &output_used, t0 the old index; 10 is ASCII LF)
    886     la_a1 &output_buf_ptr
    887     ld_a1,a1,0
    888     add_a1,a1,t0
    889     li_t2 %10 %0
    890     sb_t2,a1,0
    891     addi_t0,t0,1
    892     st_t0,a0,0
    893 
    894     # output_need_space = 0
    895     la_a0 &output_need_space
    896     li_a1 %0 %0
    897     st_a1,a0,0
    898     ret
    899 
    900 ## emit_token(a0=token_ptr). Tail-calls emit_string_as_bytes for
    901 ## TOK_STRING (which has its own enter_0/eret frame), so emit_token
    902 ## itself stays leaf for the WORD path.
        ##
        ## Token fields read here: +0 kind, +8 text_ptr, +16 text_len.
        ## TOK_LBRACE / TOK_RBRACE return without writing anything and without
        ## touching output_need_space. Every other non-STRING kind is copied
        ## byte-for-byte into output_buf, preceded by one ' ' when
        ## output_need_space is set, and leaves output_need_space = 1.
        ## Branches to err_output_overflow whenever output_used equals
        ## M1PP_OUTPUT_CAP before a byte is written.
        ## Clobbers a0..a3 and t0..t2 on the copy path.
    903 :emit_token
    904     # brace tokens are no-ops at emit time (belt-and-braces with arg-strip)
    905     ld_t0,a0,0
    906     li_t1 TOK_LBRACE
    907     la_br &emit_token_skip
    908     beq_t0,t1
    909     li_t1 TOK_RBRACE
    910     la_br &emit_token_skip
    911     beq_t0,t1
    912     # Bare TOK_STRING decodes to raw bytes via emit_string_as_bytes.
    913     # Branch (not call): the tail call returns to emit_token's caller.
            # a0 still holds token_ptr at the branch.
    914     li_t1 TOK_STRING
    915     la_br &emit_string_as_bytes
    916     beq_t0,t1
    917 
    918     # if (output_need_space) emit ' '  (skip the space for the first token on a line)
    919     la_a1 &output_need_space
    920     ld_t0,a1,0
    921     la_br &emit_token_copy
    922     beqz_t0
    923 
            # output_buf[output_used++] = ' ' (32), with the same capacity check
            # as the copy loop. a0 (token_ptr) is left untouched here.
    924     la_a1 &output_used
    925     ld_t0,a1,0
    926     li_t1 M1PP_OUTPUT_CAP
    927     la_br &err_output_overflow
    928     beq_t0,t1
    929     la_a2 &output_buf_ptr
    930     ld_a2,a2,0
    931     add_a2,a2,t0
    932     li_t1 %32 %0
    933     sb_t1,a2,0
    934     addi_t0,t0,1
    935     st_t0,a1,0
    936 
    937 :emit_token_copy
    938     # src = tok->text_ptr; len = tok->text_len; i = 0
            # (t0 = src, t1 = len, t2 = i for the whole loop)
    939     ld_t0,a0,8
    940     ld_t1,a0,16
    941     li_t2 %0 %0
    942 :emit_token_loop
    943     # if (i == len) done
    944     la_br &emit_token_done
    945     beq_t2,t1
    946 
    947     # if (output_used == OUTPUT_CAP) fatal
    948     la_a1 &output_used
    949     ld_a2,a1,0
    950     li_a3 M1PP_OUTPUT_CAP
    951     la_br &err_output_overflow
    952     beq_a2,a3
    953 
    954     # output_buf[output_used++] = src[i]
            # a0 is reused as scratch: token_ptr is no longer needed once src/len
            # are in t0/t1.
    955     add_a3,t0,t2
    956     lb_a3,a3,0
    957     la_a0 &output_buf_ptr
    958     ld_a0,a0,0
    959     add_a0,a0,a2
    960     sb_a3,a0,0
    961     addi_a2,a2,1
    962     st_a2,a1,0
    963 
    964     # i++
    965     addi_t2,t2,1
    966     la_br &emit_token_loop
    967     b
    968 :emit_token_done
    969     # output_need_space = 1
    970     la_a0 &output_need_space
    971     li_a1 %1 %0
    972     st_a1,a0,0
    973     ret
    974 :emit_token_skip
            # Brace token: return without emitting and without changing
            # output_need_space.
    975     ret
    976 
    977 
    978 ## --- Main processor ----------------------------------------------------------
    979 ## Stream-driven loop. Pushes source_tokens as the initial stream, then drives
    980 ## the streams[] stack until it empties. Per iteration: pop the stream if
    981 ## exhausted, otherwise dispatch on the current token:
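    982 ##   - %macro                 -> shim into define_macro via proc_pos
        ##                               (line_start is forwarded in
        ##                               proc_line_start, not tested here)
        ##   - %struct / %enum        -> shim into define_fielded via proc_pos
        ##   - %frame / %endframe     -> shim into push_frame / pop_frame
    983 ##   - TOK_NEWLINE            -> emit_newline, advance, set line_start = 1
    984 ##   - WORD + tight LPAREN follow + name in
    985 ##     {! @ % $ %select %str %bytes %local}
        ##                            -> expand_builtin_call(s, tok)
    986 ##   - other '%'-prefixed WORD with a find_macro(tok) hit, and either a
    987 ##     tight LPAREN follow or macro->param_count == 0
        ##                            -> expand_call(s, macro)
    988 ##   - otherwise              -> emit_token, advance, clear line_start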
    989 ##
    990 ## Stack frame: enter_16 reserves two 8-byte slots so we can preserve the
    991 ## current Stream* (sp+16) and the current Token* (sp+24) across calls
    992 ## (a0..a3, t0..t2 are caller-saved).
    993 
    994 ## process_tokens(): stream-driven main loop.
        ##
        ## Frame slots (enter_16): `sp,0` holds the current Stream* s and `sp,8`
        ## the current Token* tok; they are reloaded from the frame after calls
        ## because a0..a3 / t0..t2 are caller-saved.
        ## Stream fields used here: +8 end, +16 pos, +24 line_start.
        ## Token fields used here: +0 kind, +8 text ptr, +16 text len, +24 tight;
        ## consecutive tokens are 32 bytes apart.
        ## Returns via eret once current_stream() yields 0, after checking that
        ## frame_active is clear (otherwise branches to err_frame_not_closed).
    995 :process_tokens
    996     enter_16
    997 
    998     # push_stream_span(source_tokens, source_end, -1)
            # (a2 is built as 0 - 1 via sub + addi neg1)
    999     la_a0 &source_tokens_ptr
   1000     ld_a0,a0,0
   1001     la_a1 &source_end
   1002     ld_a1,a1,0
   1003     sub_a2,a2,a2
   1004     addi_a2,a2,neg1
   1005     la_br &push_stream_span
   1006     call
   1007 
   1008 ## proc_loop dispatch refactor:
   1009 ##
   1010 ## A — first-byte gate: most pass-through tokens (plain identifiers,
   1011 ##     hex literals, the synthetic WORDs emitted by !@%$ evaluation)
   1012 ##     don't begin with %, !, @, or $ and exit through proc_emit after
   1013 ##     a single byte compare. The old cascade ran 5 directive + up to
   1014 ##     7 builtin tok_eq_const probes for *every* WORD.
   1015 ## B — second-byte dispatch: within the c0=='%' branch, a single
   1016 ##     c1-byte switch picks at most one directive/builtin to actually
   1017 ##     compare against (e.g. c1='s' selects %struct/%select/%str only).
   1018 ##     A user macro %FOO with c1 outside {m,s,e,f,b,l} skips every
   1019 ##     tok_eq_const and goes straight to find_macro.
   1020 :proc_loop
   1021     # s = current_stream();  if (s == 0) done
   1022     la_br &current_stream
   1023     call
   1024     la_br &proc_done
   1025     beqz_a0
            # frame slot sp,0 = s
   1026     st_a0,sp,0
   1027 
   1028     # if (s->pos == s->end) pop and continue
   1029     ld_t0,a0,16
   1030     ld_t1,a0,8
   1031     la_br &proc_pop_continue
   1032     beq_t0,t1
   1033 
   1034     # tok = s->pos
            # frame slot sp,8 = tok
   1035     st_t0,sp,8
   1036 
   1037     # ---- TOK_NEWLINE fast path ----
   1038     ld_a1,t0,0
   1039     li_a2 TOK_NEWLINE
   1040     la_br &proc_handle_newline
   1041     beq_a1,a2
   1042 
   1043     # Non-WORD tokens (LPAREN, RPAREN, COMMA, LBRACE, RBRACE, STRING,
   1044     # PASTE) skip the whole dispatch and emit literally.
   1045     li_a2 TOK_WORD
   1046     la_br &proc_emit
   1047     bne_a1,a2
   1048 
   1049     # tok->text.len, ptr — needed for the byte gate.
            # An empty WORD (len == 0) is emitted as-is. From here until the
            # first call: t1 = len, t2 = text ptr, a3 = c0 (first text byte).
   1050     ld_t1,t0,16
   1051     la_br &proc_emit
   1052     beqz_t1
   1053     ld_t2,t0,8
   1054     lb_a3,t2,0
   1055 
   1056     # has_paren = (tok+1 < s->end && (tok+1)->kind == TOK_LPAREN
   1057     #              && (tok+1)->tight). Stash to proc_has_paren.
            # proc_has_paren is first cleared, then set to 1 only if every test
            # passes. Only a0..a2 are used, so a3/t1/t2 stay live.
   1058     li_a0 %0 %0
   1059     la_a1 &proc_has_paren
   1060     st_a0,a1,0
   1061     addi_a2,t0,32
   1062     ld_a1,sp,0
   1063     ld_a1,a1,8
   1064     la_br &proc_byte_gate
   1065     blt_a1,a2
   1066     la_br &proc_byte_gate
   1067     beq_a2,a1
   1068     ld_a0,a2,0
   1069     li_a1 TOK_LPAREN
   1070     la_br &proc_byte_gate
   1071     bne_a0,a1
   1072     ld_a0,a2,24
   1073     la_br &proc_byte_gate
   1074     beqz_a0
   1075     li_a0 %1 %0
   1076     la_a1 &proc_has_paren
   1077     st_a0,a1,0
   1078 
   1079 :proc_byte_gate
   1080     # c0 == '%' (37) -> percent branch
   1081     li_a0 %37 %0
   1082     la_br &proc_c0_pct
   1083     beq_a3,a0
   1084     # Arith builtins ! @ $ require len == 1 + has_paren. All other
   1085     # first-byte values fall through to proc_emit.
            # ASCII: '!' = 33, '@' = 64, '$' = 36.
   1086     li_a0 %1 %0
   1087     la_br &proc_emit
   1088     bne_t1,a0
   1089     la_a1 &proc_has_paren
   1090     ld_a1,a1,0
   1091     la_br &proc_emit
   1092     beqz_a1
   1093     li_a0 %33 %0
   1094     la_br &proc_do_builtin
   1095     beq_a3,a0
   1096     li_a0 %64 %0
   1097     la_br &proc_do_builtin
   1098     beq_a3,a0
   1099     li_a0 %36 %0
   1100     la_br &proc_do_builtin
   1101     beq_a3,a0
   1102     la_br &proc_emit
   1103     b
   1104 
   1105 :proc_c0_pct
   1106     # c0 == '%'. Bare '%' (len == 1) + tight-paren -> arith builtin.
   1107     li_a0 %1 %0
   1108     la_br &proc_c0_pct_one
   1109     beq_t1,a0
   1110     # len >= 2: dispatch on c1 (text[1]).
            # ASCII: 'm' = 109, 's' = 115, 'e' = 101, 'f' = 102, 'b' = 98,
            # 'l' = 108.
   1111     lb_a3,t2,1
   1112 
   1113     li_a0 %109 %0
   1114     la_br &proc_c1_m
   1115     beq_a3,a0
   1116     li_a0 %115 %0
   1117     la_br &proc_c1_s
   1118     beq_a3,a0
   1119     li_a0 %101 %0
   1120     la_br &proc_c1_e
   1121     beq_a3,a0
   1122     li_a0 %102 %0
   1123     la_br &proc_c1_f
   1124     beq_a3,a0
   1125     li_a0 %98 %0
   1126     la_br &proc_c1_b
   1127     beq_a3,a0
   1128     li_a0 %108 %0
   1129     la_br &proc_c1_l
   1130     beq_a3,a0
   1131     # No directive/builtin candidate — try user macro.
   1132     la_br &proc_check_macro
   1133     b
   1134 
   1135 :proc_c0_pct_one
            # Bare '%': builtin only when a tight '(' follows, else plain emit.
   1136     la_a1 &proc_has_paren
   1137     ld_a1,a1,0
   1138     la_br &proc_emit
   1139     beqz_a1
   1140     la_br &proc_do_builtin
   1141     b
   1142 
   1143 :proc_c1_m
   1144     # %macro
            # tok_eq_const(tok, const_macro, 6); on mismatch try a user macro.
            # On match: publish (s, tok) via proc_save_pos_and_ls, run
            # define_macro, then copy proc_pos back into s->pos.
   1145     ld_a0,sp,8
   1146     la_a1 &const_macro
   1147     li_a2 %6 %0
   1148     la_br &tok_eq_const
   1149     call
   1150     la_br &proc_check_macro
   1151     beqz_a0
   1152     ld_a0,sp,0
   1153     ld_a1,sp,8
   1154     la_br &proc_save_pos_and_ls
   1155     call
   1156     la_br &define_macro
   1157     call
   1158     la_br &proc_restore_and_loop
   1159     b
   1160 
   1161 :proc_c1_s
   1162     # %struct (no paren); %select / %str (paren).
   1163     ld_a0,sp,8
   1164     la_a1 &const_struct
   1165     li_a2 %7 %0
   1166     la_br &tok_eq_const
   1167     call
   1168     la_br &proc_c1_s_not_struct
   1169     beqz_a0
   1170     ld_a0,sp,0
   1171     ld_a1,sp,8
   1172     la_br &proc_save_pos_and_ls
   1173     call
            # define_fielded(stride = 8, total_name = const_size, len = 4)
   1174     li_a0 %8 %0
   1175     la_a1 &const_size
   1176     li_a2 %4 %0
   1177     la_br &define_fielded
   1178     call
   1179     la_br &proc_restore_and_loop
   1180     b
   1181 :proc_c1_s_not_struct
            # %select / %str are only builtins when a tight '(' follows.
   1182     la_a1 &proc_has_paren
   1183     ld_a1,a1,0
   1184     la_br &proc_check_macro
   1185     beqz_a1
   1186     ld_a0,sp,8
   1187     la_a1 &const_select
   1188     li_a2 %7 %0
   1189     la_br &tok_eq_const
   1190     call
   1191     la_br &proc_c1_s_not_select
   1192     beqz_a0
   1193     la_br &proc_do_builtin
   1194     b
   1195 :proc_c1_s_not_select
   1196     ld_a0,sp,8
   1197     la_a1 &const_str
   1198     li_a2 %4 %0
   1199     la_br &tok_eq_const
   1200     call
   1201     la_br &proc_check_macro
   1202     beqz_a0
   1203     la_br &proc_do_builtin
   1204     b
   1205 
   1206 :proc_c1_e
   1207     # %enum (no paren); %endframe (no paren).
   1208     ld_a0,sp,8
   1209     la_a1 &const_enum
   1210     li_a2 %5 %0
   1211     la_br &tok_eq_const
   1212     call
   1213     la_br &proc_c1_e_not_enum
   1214     beqz_a0
   1215     ld_a0,sp,0
   1216     ld_a1,sp,8
   1217     la_br &proc_save_pos_and_ls
   1218     call
            # define_fielded(stride = 1, total_name = const_count, len = 5)
   1219     li_a0 %1 %0
   1220     la_a1 &const_count
   1221     li_a2 %5 %0
   1222     la_br &define_fielded
   1223     call
   1224     la_br &proc_restore_and_loop
   1225     b
   1226 :proc_c1_e_not_enum
   1227     ld_a0,sp,8
   1228     la_a1 &const_endframe
   1229     li_a2 %9 %0
   1230     la_br &tok_eq_const
   1231     call
   1232     la_br &proc_check_macro
   1233     beqz_a0
   1234     ld_a0,sp,0
   1235     ld_a1,sp,8
   1236     la_br &proc_save_pos_and_ls
   1237     call
            # pop_frame(a0 = s->end)
   1238     ld_a0,sp,0
   1239     ld_a0,a0,8
   1240     la_br &pop_frame
   1241     call
   1242     la_br &proc_restore_and_loop
   1243     b
   1244 
   1245 :proc_c1_f
   1246     # %frame (no paren).
   1247     ld_a0,sp,8
   1248     la_a1 &const_frame
   1249     li_a2 %6 %0
   1250     la_br &tok_eq_const
   1251     call
   1252     la_br &proc_check_macro
   1253     beqz_a0
   1254     ld_a0,sp,0
   1255     ld_a1,sp,8
   1256     la_br &proc_save_pos_and_ls
   1257     call
            # push_frame(a0 = s->end)
   1258     ld_a0,sp,0
   1259     ld_a0,a0,8
   1260     la_br &push_frame
   1261     call
   1262     la_br &proc_restore_and_loop
   1263     b
   1264 
   1265 :proc_c1_b
   1266     # %bytes (paren).
   1267     la_a1 &proc_has_paren
   1268     ld_a1,a1,0
   1269     la_br &proc_check_macro
   1270     beqz_a1
   1271     ld_a0,sp,8
   1272     la_a1 &const_bytes
   1273     li_a2 %6 %0
   1274     la_br &tok_eq_const
   1275     call
   1276     la_br &proc_check_macro
   1277     beqz_a0
   1278     la_br &proc_do_builtin
   1279     b
   1280 
   1281 :proc_c1_l
   1282     # %local (paren).
   1283     la_a1 &proc_has_paren
   1284     ld_a1,a1,0
   1285     la_br &proc_check_macro
   1286     beqz_a1
   1287     ld_a0,sp,8
   1288     la_a1 &const_local
   1289     li_a2 %6 %0
   1290     la_br &tok_eq_const
   1291     call
   1292     la_br &proc_check_macro
   1293     beqz_a0
   1294     la_br &proc_do_builtin
   1295     b
   1296 
   1297 :proc_handle_newline
            # s->pos = tok + 32; s->line_start = 1; then emit_newline().
   1298     ld_a0,sp,0
   1299     ld_t0,sp,8
   1300     addi_t0,t0,32
   1301     st_t0,a0,16
   1302     li_t1 %1 %0
   1303     st_t1,a0,24
   1304     la_br &emit_newline
   1305     call
   1306     la_br &proc_loop
   1307     b
   1308 
   1309 :proc_do_builtin
   1310     # expand_builtin_call(s, tok)
            # s->pos is not advanced in this function on this path.
   1311     ld_a0,sp,0
   1312     ld_a1,sp,8
   1313     la_br &expand_builtin_call
   1314     call
   1315     la_br &proc_loop
   1316     b
   1317 
   1318 :proc_check_macro
   1319     # macro = find_macro(tok); if non-zero AND
   1320     #   ((tok+1 < s->end AND (tok+1)->kind == TOK_LPAREN AND (tok+1)->tight)
            #    OR macro->param_count == 0)
   1321     # then expand_call. Paren-less form is reserved for 0-arg macros.
            # t2 carries the macro pointer; no call intervenes before expand_call.
   1322     ld_a0,sp,8
   1323     la_br &find_macro
   1324     call
   1325     la_br &proc_emit
   1326     beqz_a0
   1327     mov_t2,a0
   1328     ld_a0,sp,0
   1329     ld_t0,sp,8
   1330     addi_t1,t0,32
   1331     ld_a1,a0,8
   1332     la_br &proc_macro_has_next
   1333     blt_t1,a1
   1334     la_br &proc_macro_zero_arg
   1335     b
   1336 :proc_macro_has_next
   1337     ld_a1,t1,0
   1338     li_a2 TOK_LPAREN
   1339     la_br &proc_macro_zero_arg
   1340     bne_a1,a2
   1341     # require (tok+1)->tight — `%FOO ( ... )` with whitespace is the
   1342     # paren-less form (zero-arg only) followed by a literal `(`.
   1343     ld_a1,t1,24
   1344     la_br &proc_macro_zero_arg
   1345     beqz_a1
            # expand_call(s, macro)
   1346     ld_a0,sp,0
   1347     mov_a1,t2
   1348     la_br &expand_call
   1349     call
   1350     la_br &proc_loop
   1351     b
   1352 :proc_macro_zero_arg
   1353     # No trailing LPAREN (or LPAREN not tight). Expand only if param_count == 0.
            # (macro + 16 is param_count; a non-zero count emits the token literally.)
   1354     ld_t0,t2,16
   1355     la_br &proc_emit
   1356     bnez_t0
   1357     ld_a0,sp,0
   1358     mov_a1,t2
   1359     la_br &expand_call
   1360     call
   1361     la_br &proc_loop
   1362     b
   1363 
   1364 :proc_emit
   1365     # emit_token(tok); s->pos += 32; s->line_start = 0
   1366     ld_a0,sp,8
   1367     la_br &emit_token
   1368     call
   1369     ld_a0,sp,0
   1370     ld_t0,a0,16
   1371     addi_t0,t0,32
   1372     st_t0,a0,16
   1373     li_t1 %0 %0
   1374     st_t1,a0,24
   1375     la_br &proc_loop
   1376     b
   1377 
   1378 :proc_pop_continue
            # Current stream is exhausted: pop_stream() and re-dispatch.
   1379     la_br &pop_stream
   1380     call
   1381     la_br &proc_loop
   1382     b
   1383 
   1384 :proc_done
   1385     # Every %frame must be matched by an %endframe before EOF.
   1386     la_a0 &frame_active
   1387     ld_t0,a0,0
   1388     la_br &err_frame_not_closed
   1389     bnez_t0
   1390     eret
   1391 
   1392 ## proc_save_pos_and_ls(a0=s, a1=tok): publish the directive's position into
   1393 ## proc_pos / proc_line_start so the directive handler can drive against the
   1394 ## source stream. Leaf — preserves the caller's other state.
        ## Stores a1 into proc_pos and s->line_start (the word at s+24) into
        ## proc_line_start. Only t0 and t1 are written; a0 and a1 are unchanged.
   1395 :proc_save_pos_and_ls
            # proc_pos = tok
   1396     la_t0 &proc_pos
   1397     st_a1,t0,0
            # proc_line_start = s->line_start
   1398     ld_t1,a0,24
   1399     la_t0 &proc_line_start
   1400     st_t1,t0,0
   1401     ret
   1402 
   1403 ## proc_restore_and_loop: reached only via `b` (sp must be intact). Reads
   1404 ## sp,0 = s; copies proc_pos into s->pos, sets s->line_start=1, jumps to
   1405 ## proc_loop. Tail of every directive shim above.
        ## Not a callable function: it has no ret/eret of its own and runs inside
        ## the frame of the code that branched here. Note that s->line_start is
        ## set to the constant 1; proc_line_start is not read back.
   1406 :proc_restore_and_loop
            # s->pos = proc_pos   (s+16)
   1407     ld_a0,sp,0
   1408     la_a1 &proc_pos
   1409     ld_t0,a1,0
   1410     st_t0,a0,16
            # s->line_start = 1   (s+24)
   1411     li_t1 %1 %0
   1412     st_t1,a0,24
   1413     la_br &proc_loop
   1414     b
   1415 
   1416 ## --- %frame / %endframe handlers --------------------------------------------
   1417 ## Single-slot frame state used by %local. push_frame(a0=stream_end) parses
   1418 ## `%frame NAME`, stashes name's TextSpan in current_frame_ptr/_len, and
   1419 ## sets frame_active = 1. pop_frame(a0=stream_end) clears frame_active.
   1420 ## Frames do not nest — a second push without an intervening pop is fatal.
   1421 
   1422 :push_frame
            # push_frame(a0 = stream_end): handle `%frame NAME`.
            # On entry proc_pos points at the `%frame` token. Skips that token
            # and any newlines, requires a TOK_WORD name located before
            # stream_end, records its text span in current_frame_ptr /
            # current_frame_len, sets frame_active = 1, then advances proc_pos
            # past the name and any following newlines.
            # Fatal exits: err_bad_frame_header (no WORD name before stream_end),
            # err_frame_already_active (a frame is already open).
            # Non-leaf (calls proc_skip_newlines); stream_end is parked in
            # pf_stream_end across the first call.
   1423     enter_0
   1424 
   1425     # proc_pos += 32 (skip past the `%frame` token).
   1426     la_t0 &proc_pos
   1427     ld_t1,t0,0
   1428     addi_t1,t1,32
   1429     st_t1,t0,0
   1430 
   1431     # Skip newlines between `%frame` and NAME.
            # a0 is caller-saved, so stash stream_end in BSS around the call and
            # reload both it and the (possibly advanced) proc_pos afterwards.
   1432     la_a1 &pf_stream_end
   1433     st_a0,a1,0
   1434     la_br &proc_skip_newlines
   1435     call
   1436     la_a1 &pf_stream_end
   1437     ld_a0,a1,0
   1438     la_t0 &proc_pos
   1439     ld_t1,t0,0
   1440 
   1441     # Require a WORD name token within the stream.
            # Compare the kind against the TOK_WORD constant, like the other
            # WORD checks in this file, instead of relying on TOK_WORD being 0.
            # t0 is free here: it is reloaded before its next use below.
   1442     la_br &err_bad_frame_header
   1443     beq_t1,a0
   1444     ld_t2,t1,0
            li_t0 TOK_WORD
   1445     la_br &err_bad_frame_header
   1446     bne_t2,t0
   1447 
   1448     # !frame_active (cannot nest)
   1449     la_a1 &frame_active
   1450     ld_a2,a1,0
   1451     la_br &err_frame_already_active
   1452     bnez_a2
   1453 
   1454     # current_frame_ptr = name.text_ptr; current_frame_len = name.text_len
   1455     la_a3 &current_frame_ptr
   1456     ld_t2,t1,8
   1457     st_t2,a3,0
   1458     la_a3 &current_frame_len
   1459     ld_t2,t1,16
   1460     st_t2,a3,0
   1461 
   1462     # frame_active = 1
            # (a1 still holds &frame_active from the nesting check)
   1463     li_a2 %1 %0
   1464     st_a2,a1,0
   1465 
   1466     # proc_pos += 32 (past the name).
   1467     la_t0 &proc_pos
   1468     ld_t1,t0,0
   1469     addi_t1,t1,32
   1470     st_t1,t0,0
   1471 
   1472     # Newlines between `%frame NAME` and the body content are insignificant.
   1473     la_br &proc_skip_newlines
   1474     call
   1475     eret
   1476 
   1477 ## pop_frame(a0 = stream_end): consume `%endframe` followed by a strict
   1478 ## TOK_NEWLINE. Fatal if no frame is active.
        ## On entry proc_pos points at the `%endframe` token and proc_line_start
        ## holds the line_start flag published for it. Clears frame_active,
        ## advances proc_pos past `%endframe`, and additionally past the newline
        ## when proc_line_start is non-zero.
        ## Fatal exits: err_frame_underflow (no active frame),
        ## err_bad_frame_header (end of stream or a non-NEWLINE token follows).
        ## Makes no calls; a0 stays live throughout.
   1479 :pop_frame
   1480     enter_0
   1481 
   1482     # frame_active?
            # If set, clear it right away (before the trailing-newline check).
   1483     la_a1 &frame_active
   1484     ld_a2,a1,0
   1485     la_br &err_frame_underflow
   1486     beqz_a2
   1487     li_a2 %0 %0
   1488     st_a2,a1,0
   1489 
   1490     # proc_pos += 32 (past the `%endframe` token).
   1491     la_t0 &proc_pos
   1492     ld_t1,t0,0
   1493     addi_t1,t1,32
   1494     st_t1,t0,0
   1495 
   1496     # Strict: the token immediately after `%endframe` must be TOK_NEWLINE.
            # (t1 = updated proc_pos, a0 = stream_end)
   1497     la_br &err_bad_frame_header
   1498     beq_t1,a0
   1499     ld_t2,t1,0
   1500     li_t0 TOK_NEWLINE
   1501     la_br &err_bad_frame_header
   1502     bne_t2,t0
   1503     # Consume the trailing newline only when %endframe sat at line-start;
   1504     # mid-line %endframe leaves the newline so it can be emitted.
   1505     la_t0 &proc_line_start
   1506     ld_a1,t0,0
   1507     la_br &pop_frame_done
   1508     beqz_a1
   1509     addi_t1,t1,32
   1510     la_t0 &proc_pos
   1511     st_t1,t0,0
   1512 :pop_frame_done
   1513     eret
   1514 
   1515 ## --- %macro storage: parse header + body into macros[] / macro_body_tokens --
   1516 ## Called at proc_pos == line-start `%macro`. Leaves proc_pos past the %endm
   1517 ## line with proc_line_start = 1. Uses BSS scratch (def_m_ptr, def_param_ptr,
   1518 ## def_body_line_start) since P1 enter/eret does not save s* registers.
   1519 ##
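   1520 ## Macro record layout (296 bytes, see M1PP_MACRO_RECORD_SIZE):
   1521 ##   +0   name.ptr        (8)
   1522 ##   +8   name.len        (8)
   1523 ##   +16  param_count     (8)
   1524 ##   +24  params[16].ptr/.len  (16 * 16 = 256)
   1525 ##   +280 body_start      (8)  -> *Token into macro_body_tokens[]
   1526 ##   +288 body_end        (8)  -> exclusive end
        ## NOTE(review): define_macro also stores a has_paste flag at
        ## M1PP_MACRO_HAS_PASTE_OFF, which is not listed above; confirm its
        ## offset and whether the 296-byte figure still matches
        ## M1PP_MACRO_RECORD_SIZE.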
   1527 
   1528 ## define_macro(): consume `%macro NAME(params...)\n ... %endm\n`.
        ##
        ## Inputs (BSS): proc_pos points at the `%macro` token; proc_line_start
        ## says whether that token began a line. No register arguments.
        ## Effects: fills the record at macros_end (name, param_count, params,
        ## body_start / body_end, and has_paste when a TOK_PASTE is seen), appends
        ## the body tokens to macro_body_tokens[] together with a per-token
        ## param index and local-label flag, advances macros_end by
        ## M1PP_MACRO_RECORD_SIZE, leaves proc_pos after `%endm` (and after its
        ## newline when proc_line_start was set), and sets proc_line_start = 1.
        ## Fatal exits: err_too_many_macros, err_bad_macro_header,
        ## err_unterminated_macro, err_macro_body_overflow.
        ## NOTE(review): every end-of-input test compares proc_pos with
        ## source_end, so this presumably expects proc_pos to lie inside
        ## source_tokens[] — confirm for a %macro reached in another stream.
   1529 :define_macro
   1530     enter_0
   1531 
   1532     # macros_end bounds check: if (macros_end == &macros + MACROS_CAP) fatal
   1533     la_a0 &macros_end
   1534     ld_t0,a0,0
   1535     la_a1 &macros_ptr
   1536     ld_a1,a1,0
   1537     li_a2 M1PP_MACROS_CAP
   1538     add_a1,a1,a2
   1539     la_br &err_too_many_macros
   1540     beq_t0,a1
   1541 
   1542     # def_m_ptr = macros_end   (Macro *m = &macros[macro_count])
   1543     la_a1 &def_m_ptr
   1544     st_t0,a1,0
   1545 
   1546     # advance past the %macro token itself
   1547     la_a0 &proc_pos
   1548     ld_t0,a0,0
   1549     addi_t0,t0,32
   1550     st_t0,a0,0
   1551 
   1552     # Header is whitespace-insensitive: newlines between the keyword and
   1553     # any header element (NAME, '(', params, ',', ')') are skipped.
   1554     la_br &proc_skip_newlines
   1555     call
   1556 
   1557     # ---- header: name (WORD) ----
            # proc_pos is re-read after every proc_skip_newlines call.
   1558     la_a0 &proc_pos
   1559     ld_t0,a0,0
   1560     la_a1 &source_end
   1561     ld_t1,a1,0
   1562     la_br &err_bad_macro_header
   1563     beq_t0,t1
   1564     ld_a1,t0,0
   1565     li_a2 TOK_WORD
   1566     la_br &err_bad_macro_header
   1567     bne_a1,a2
   1568 
   1569     # m->name.ptr = tok->text_ptr; m->name.len = tok->text_len
   1570     ld_a2,t0,8
   1571     ld_a3,t0,16
   1572     la_a0 &def_m_ptr
   1573     ld_t2,a0,0
   1574     st_a2,t2,0
   1575     st_a3,t2,8
   1576 
   1577     # m->param_count = 0; def_param_ptr = m + 24 (first TextSpan slot in macro)
   1578     li_a0 %0 %0
   1579     st_a0,t2,16
   1580     addi_t2,t2,24
   1581     la_a0 &def_param_ptr
   1582     st_t2,a0,0
   1583 
   1584     # advance past name
   1585     addi_t0,t0,32
   1586     la_a0 &proc_pos
   1587     st_t0,a0,0
   1588 
   1589     la_br &proc_skip_newlines
   1590     call
   1591 
   1592     # ---- header: LPAREN ----
   1593     la_a0 &proc_pos
   1594     ld_t0,a0,0
   1595     la_a1 &source_end
   1596     ld_t1,a1,0
   1597     la_br &err_bad_macro_header
   1598     beq_t0,t1
   1599     ld_a1,t0,0
   1600     li_a2 TOK_LPAREN
   1601     la_br &err_bad_macro_header
   1602     bne_a1,a2
   1603 
   1604     # advance past '('
   1605     addi_t0,t0,32
   1606     la_a0 &proc_pos
   1607     st_t0,a0,0
   1608 
   1609     la_br &proc_skip_newlines
   1610     call
   1611 
   1612     # ---- header: optional param list ----
   1613     # if at end -> fall through to RPAREN check (which will fail)
   1614     # if next is RPAREN -> skip the param loop
   1615     # else enter param loop
   1616     la_a0 &proc_pos
   1617     ld_t0,a0,0
   1618     la_a1 &source_end
   1619     ld_t1,a1,0
   1620     la_br &def_header_close
   1621     beq_t0,t1
   1622     ld_a1,t0,0
   1623     li_a2 TOK_RPAREN
   1624     la_br &def_header_close
   1625     beq_a1,a2
   1626 
   1627 :def_param_loop
   1628     # reject > 16 params: if (15 < param_count) fail   (param_count capped at 16)
            # The test runs before the new parameter is stored, so a 17th
            # parameter is the first one rejected.
   1629     la_a0 &def_m_ptr
   1630     ld_t2,a0,0
   1631     ld_a1,t2,16
   1632     li_a2 %15 %0
   1633     la_br &err_bad_macro_header
   1634     blt_a2,a1
   1635 
   1636     # tok must be in range and WORD
   1637     la_a0 &proc_pos
   1638     ld_t0,a0,0
   1639     la_a1 &source_end
   1640     ld_t1,a1,0
   1641     la_br &err_bad_macro_header
   1642     beq_t0,t1
   1643     ld_a1,t0,0
   1644     li_a2 TOK_WORD
   1645     la_br &err_bad_macro_header
   1646     bne_a1,a2
   1647 
   1648     # *def_param_ptr = (tok.text_ptr, tok.text_len); def_param_ptr += 16
   1649     ld_a2,t0,8
   1650     ld_a3,t0,16
   1651     la_a0 &def_param_ptr
   1652     ld_t1,a0,0
   1653     st_a2,t1,0
   1654     st_a3,t1,8
   1655     addi_t1,t1,16
   1656     st_t1,a0,0
   1657 
   1658     # m->param_count++
   1659     la_a0 &def_m_ptr
   1660     ld_t2,a0,0
   1661     ld_a1,t2,16
   1662     addi_a1,a1,1
   1663     st_a1,t2,16
   1664 
   1665     # advance past the param word
   1666     addi_t0,t0,32
   1667     la_a0 &proc_pos
   1668     st_t0,a0,0
   1669 
   1670     # Skip newlines between a param and the following ',' or ')'.
   1671     la_br &proc_skip_newlines
   1672     call
   1673 
   1674     # if next is COMMA, consume and loop; else break
            # (end of input also breaks; def_header_close then reports it)
   1675     la_a0 &proc_pos
   1676     ld_t0,a0,0
   1677     la_a1 &source_end
   1678     ld_t1,a1,0
   1679     la_br &def_header_close
   1680     beq_t0,t1
   1681     ld_a1,t0,0
   1682     li_a2 TOK_COMMA
   1683     la_br &def_header_close
   1684     bne_a1,a2
   1685     addi_t0,t0,32
   1686     la_a0 &proc_pos
   1687     st_t0,a0,0
   1688     # Skip newlines after ',' so the next param can be on a new line.
   1689     la_br &proc_skip_newlines
   1690     call
   1691     la_br &def_param_loop
   1692     b
   1693 
   1694 :def_header_close
   1695     # ---- header: RPAREN ----
   1696     la_a0 &proc_pos
   1697     ld_t0,a0,0
   1698     la_a1 &source_end
   1699     ld_t1,a1,0
   1700     la_br &err_bad_macro_header
   1701     beq_t0,t1
   1702     ld_a1,t0,0
   1703     li_a2 TOK_RPAREN
   1704     la_br &err_bad_macro_header
   1705     bne_a1,a2
   1706 
            # advance past ')'
   1707     addi_t0,t0,32
   1708     la_a0 &proc_pos
   1709     st_t0,a0,0
   1710 
   1711     # ---- header self-terminates at ')'. Newlines between the header line
   1712     # ----  and the body content are insignificant — skip them.
   1713     la_br &proc_skip_newlines
   1714     call
   1715 
   1716     # ---- body: m->body_start = macro_body_end ----
   1717     la_a1 &macro_body_end
   1718     ld_t2,a1,0
   1719     la_a0 &def_m_ptr
   1720     ld_t1,a0,0
   1721     li_a0 M1PP_MACRO_BODY_START_OFF
   1722     add_a0,t1,a0
   1723     st_t2,a0,0
   1724 
   1725 :def_body_loop
   1726     # if proc_pos == source_end: unterminated %macro
   1727     la_a0 &proc_pos
   1728     ld_t0,a0,0
   1729     la_a1 &source_end
   1730     ld_t1,a1,0
   1731     la_br &err_unterminated_macro
   1732     beq_t0,t1
   1733 
   1734     # %endm is recognized anywhere in the body (no line-start gating). If
   1735     # (tok.kind == TOK_WORD) and (tok eq "%endm"), break; else copy token.
   1736     ld_a1,t0,0
   1737     li_a2 TOK_WORD
   1738     la_br &def_body_copy
   1739     bne_a1,a2
   1740 
   1741     mov_a0,t0
   1742     la_a1 &const_endm
   1743     li_a2 %5 %0
   1744     la_br &tok_eq_const
   1745     call
   1746     la_br &def_body_copy
   1747     beqz_a0
   1748 
   1749     # matched %endm: advance past it and require a TOK_NEWLINE next.
   1750     la_br &def_endm_after
   1751     b
   1752 
   1753 :def_body_copy
   1754     # bounds: if (macro_body_end - macro_body_tokens + 32 > MACRO_BODY_CAP) fail
   1755     la_a0 &macro_body_end
   1756     ld_t1,a0,0
   1757     la_a2 &macro_body_tokens_ptr
   1758     ld_a2,a2,0
   1759     sub_a3,t1,a2
   1760     addi_a3,a3,32
   1761     li_t2 M1PP_MACRO_BODY_CAP
   1762     la_br &err_macro_body_overflow
   1763     blt_t2,a3
   1764 
   1765     # === C: cache classification for this body token, so expand_macro_tokens
   1766     # === doesn't re-run find_param + is_local_label_token per body token per
   1767     # === expansion. Computed once at definition; written to byte-stride
   1768     # === parallel arrays indexed by (slot - macro_body_tokens_ptr) / 32.
   1769     # offset = (t1 - macro_body_tokens_ptr) / 32   (t1 still = macro_body_end)
            # The index is parked in def_body_meta_idx because the find_param
            # call below may clobber registers.
   1770     la_a0 &macro_body_tokens_ptr
   1771     ld_a0,a0,0
   1772     sub_a0,t1,a0
   1773     shri_a0,a0,5
   1774     la_a1 &def_body_meta_idx
   1775     st_a0,a1,0
   1776 
   1777     # macro_body_param_idx[idx] = find_param(def_m_ptr, proc_pos)
            # Only the low byte of the result is stored (sb).
   1778     la_a0 &def_m_ptr
   1779     ld_a0,a0,0
   1780     la_a1 &proc_pos
   1781     ld_a1,a1,0
   1782     la_br &find_param
   1783     call
   1784     la_a1 &def_body_meta_idx
   1785     ld_a1,a1,0
   1786     la_a2 &macro_body_param_idx_ptr
   1787     ld_a2,a2,0
   1788     add_a2,a2,a1
   1789     sb_a0,a2,0
   1790 
   1791     # macro_body_is_local_label[idx] = (kind==WORD && len>=3 && p[0] in ':&' && p[1]=='@')
   1792     # Inlined — there's no separate is_local_label_token function in P1; the
   1793     # full predicate is replicated here in def-time, replacing the per-expansion
   1794     # emt_check_local_label sequence.
            # a3 accumulates the flag (0 until every test passes).
            # ASCII: ':' = 58, '&' = 38, '@' = 64.
            # NOTE(review): the kind test is `bnez`, i.e. it assumes
            # TOK_WORD == 0 — confirm against the TOK_* definitions.
   1795     la_a0 &proc_pos
   1796     ld_t0,a0,0
   1797     li_a3 %0 %0
   1798     ld_a1,t0,0
   1799     la_br &def_body_copy_ill_store
   1800     bnez_a1
   1801     ld_a1,t0,16
   1802     li_a2 %3 %0
   1803     la_br &def_body_copy_ill_store
   1804     blt_a1,a2
   1805     ld_a2,t0,8
   1806     lb_a1,a2,0
   1807     li_a0 %58 %0
   1808     la_br &def_body_copy_ill_at
   1809     beq_a1,a0
   1810     li_a0 %38 %0
   1811     la_br &def_body_copy_ill_store
   1812     bne_a1,a0
   1813 :def_body_copy_ill_at
   1814     lb_a1,a2,1
   1815     li_a0 %64 %0
   1816     la_br &def_body_copy_ill_store
   1817     bne_a1,a0
   1818     li_a3 %1 %0
   1819 :def_body_copy_ill_store
   1820     la_a1 &def_body_meta_idx
   1821     ld_a1,a1,0
   1822     la_a2 &macro_body_is_local_label_ptr
   1823     ld_a2,a2,0
   1824     add_a2,a2,a1
   1825     sb_a3,a2,0
   1826     # === end C ===
   1827 
   1828     # If the body token is TOK_PASTE, set m->has_paste = 1. expand_macro_tokens
   1829     # uses this (combined with args_have_paste) to skip paste_pool_range when
   1830     # neither the body nor the call's args contribute a `##`. Defaulted to 0
   1831     # by BSS init when macros_end advanced into this slot.
   1832     la_a0 &proc_pos
   1833     ld_t0,a0,0
   1834     ld_a1,t0,0
   1835     li_a2 TOK_PASTE
   1836     la_br &def_body_copy_after_paste_chk
   1837     bne_a1,a2
   1838     la_a0 &def_m_ptr
   1839     ld_a2,a0,0
   1840     li_a3 M1PP_MACRO_HAS_PASTE_OFF
   1841     add_a2,a2,a3
   1842     li_a3 %1 %0
   1843     st_a3,a2,0
   1844 
   1845 :def_body_copy_after_paste_chk
   1846     # Reload t1 = macro_body_end — clobbered by find_param above.
   1847     la_a0 &macro_body_end
   1848     ld_t1,a0,0
   1849 
   1850     # copy 32 bytes from *proc_pos to *macro_body_end (preserves tight at +24)
   1851     la_a0 &proc_pos
   1852     ld_t0,a0,0
   1853     ld_a1,t0,0
   1854     st_a1,t1,0
   1855     ld_a1,t0,8
   1856     st_a1,t1,8
   1857     ld_a1,t0,16
   1858     st_a1,t1,16
   1859     ld_a1,t0,24
   1860     st_a1,t1,24
   1861 
   1862     # macro_body_end += 32
   1863     addi_t1,t1,32
   1864     la_a0 &macro_body_end
   1865     st_t1,a0,0
   1866 
   1867     # proc_pos += 32
   1868     addi_t0,t0,32
   1869     la_a0 &proc_pos
   1870     st_t0,a0,0
   1871     la_br &def_body_loop
   1872     b
   1873 
   1874 :def_endm_after
   1875     # advance past the %endm token; the next token MUST be TOK_NEWLINE.
            # (a missing newline is reported through err_bad_macro_header)
   1876     la_a0 &proc_pos
   1877     ld_t0,a0,0
   1878     addi_t0,t0,32
   1879     st_t0,a0,0
   1880     la_a1 &source_end
   1881     ld_t1,a1,0
   1882     la_br &err_bad_macro_header
   1883     beq_t0,t1
   1884     ld_a1,t0,0
   1885     li_a2 TOK_NEWLINE
   1886     la_br &err_bad_macro_header
   1887     bne_a1,a2
   1888     # Consume the trailing NEWLINE only when the directive started at
   1889     # line-start; mid-line directives leave the newline in the stream so
   1890     # the outer loop emits it (preserving line layout).
   1891     la_a0 &proc_line_start
   1892     ld_a1,a0,0
   1893     la_br &def_finish
   1894     beqz_a1
   1895     addi_t0,t0,32
   1896     la_a0 &proc_pos
   1897     st_t0,a0,0
   1898 
   1899 :def_finish
   1900     # m->body_end = macro_body_end
   1901     la_a1 &macro_body_end
   1902     ld_t2,a1,0
   1903     la_a0 &def_m_ptr
   1904     ld_t1,a0,0
   1905     li_a0 M1PP_MACRO_BODY_END_OFF
   1906     add_a0,t1,a0
   1907     st_t2,a0,0
   1908 
   1909     # macros_end += MACRO_RECORD_SIZE
            # (this is the step that makes the new record part of macros[])
   1910     la_a0 &macros_end
   1911     ld_t0,a0,0
   1912     li_a1 M1PP_MACRO_RECORD_SIZE
   1913     add_t0,t0,a1
   1914     st_t0,a0,0
   1915 
   1916     # caller resumes at line start
   1917     la_a0 &proc_line_start
   1918     li_a1 %1 %0
   1919     st_a1,a0,0
   1920     eret
   1921 
   1922 ## --- %struct / %enum directive ----------------------------------------------
   1923 ## define_fielded(a0=stride, a1=total_name_ptr, a2=total_name_len).
   1924 ## Parses `%struct NAME { f1 f2 ... }` or `%enum NAME { ... }` (caller has
   1925 ## already detected %struct / %enum at line start and primed proc_pos to
   1926 ## that token). Synthesizes N+1 zero-parameter macros — NAME.field_k -> k*stride
   1927 ## and NAME.<total_name> -> N*stride — by appending each {name, body-token}
   1928 ## pair into macros[] / macro_body_tokens[].
   1929 ##
   1930 ## All working state lives in BSS (df_* slots + df_name_scratch / df_digit_scratch)
   1931 ## because df_emit_field calls append_text, which clobbers caller-saved regs.
        ##
        ## Fatal paths visible in this routine: err_bad_directive when the NAME
        ## word, the '{', a field WORD, or the NEWLINE after '}' is missing
        ## (including hitting source_end in the header or after '}');
        ## err_unterminated_directive when source_end is reached inside the
        ## field list. On normal return proc_line_start is set to 1.
   1932 :define_fielded
   1933     enter_0
   1934 
   1935     # Save directive args to BSS.
   1936     la_a3 &df_stride
   1937     st_a0,a3,0
   1938     la_a3 &df_total_name_ptr
   1939     st_a1,a3,0
   1940     la_a3 &df_total_name_len
   1941     st_a2,a3,0
   1942 
   1943     # advance past the %struct / %enum directive token (tokens are 32 bytes)
   1944     la_a0 &proc_pos
   1945     ld_t0,a0,0
   1946     addi_t0,t0,32
   1947     st_t0,a0,0
   1948 
   1949     # Header is whitespace-insensitive: newlines between %struct/%enum and
   1950     # the NAME (and between NAME and '{') are skipped.
   1951     la_br &proc_skip_newlines
   1952     call
   1953 
   1954     # ---- header: name (WORD) ----
   1955     la_a0 &proc_pos
   1956     ld_t0,a0,0
   1957     la_a1 &source_end
   1958     ld_t1,a1,0
   1959     la_br &err_bad_directive
   1960     beq_t0,t1
   1961     ld_a1,t0,0
   1962     li_a2 TOK_WORD
   1963     la_br &err_bad_directive
   1964     bne_a1,a2
   1965 
   1966     # df_base_ptr = tok.text_ptr (+8); df_base_len = tok.text_len (+16)
   1967     ld_a2,t0,8
   1968     la_a3 &df_base_ptr
   1969     st_a2,a3,0
   1970     ld_a2,t0,16
   1971     la_a3 &df_base_len
   1972     st_a2,a3,0
   1973 
   1974     # advance past the base name
   1975     addi_t0,t0,32
   1976     la_a0 &proc_pos
   1977     st_t0,a0,0
   1978 
   1979 ## skip NEWLINE tokens before '{' (tolerates `%struct NAME\n{ ... }`)
   1980 :df_skip_nl_before_lbrace
   1981     la_a0 &proc_pos
   1982     ld_t0,a0,0
   1983     la_a1 &source_end
   1984     ld_t1,a1,0
   1985     la_br &err_bad_directive
   1986     beq_t0,t1
   1987     ld_a1,t0,0
   1988     li_a2 TOK_NEWLINE
   1989     la_br &df_require_lbrace
   1990     bne_a1,a2
   1991     addi_t0,t0,32
   1992     la_a0 &proc_pos
   1993     st_t0,a0,0
   1994     la_br &df_skip_nl_before_lbrace
   1995     b
   1996 
   1997 :df_require_lbrace
   1998     # expect LBRACE (a1 = token kind and t0 = token ptr, still live from the skip loop)
   1999     li_a2 TOK_LBRACE
   2000     la_br &err_bad_directive
   2001     bne_a1,a2
   2002 
   2003     # advance past '{'
   2004     addi_t0,t0,32
   2005     la_a0 &proc_pos
   2006     st_t0,a0,0
   2007 
   2008     # df_index = 0
   2009     li_a0 %0 %0
   2010     la_a1 &df_index
   2011     st_a0,a1,0
   2012 
   2013 ## field loop: skip comma/newline separators, stop at '}', else consume a WORD.
   2014 :df_field_loop
   2015     la_a0 &proc_pos
   2016     ld_t0,a0,0
   2017     la_a1 &source_end
   2018     ld_t1,a1,0
   2019     la_br &err_unterminated_directive
   2020     beq_t0,t1
   2021     ld_a1,t0,0
   2022 
   2023     # separator: COMMA or NEWLINE -> advance and reloop
   2024     li_a2 TOK_COMMA
   2025     la_br &df_field_skip_sep
   2026     beq_a1,a2
   2027     li_a2 TOK_NEWLINE
   2028     la_br &df_field_skip_sep
   2029     beq_a1,a2
   2030 
   2031     # end-of-list marker '}' -> break
   2032     li_a2 TOK_RBRACE
   2033     la_br &df_fields_done
   2034     beq_a1,a2
   2035 
   2036     # else must be a WORD
   2037     li_a2 TOK_WORD
   2038     la_br &err_bad_directive
   2039     bne_a1,a2
   2040 
   2041     # df_suffix_ptr = tok.text_ptr; df_suffix_len = tok.text_len
   2042     ld_a2,t0,8
   2043     la_a3 &df_suffix_ptr
   2044     st_a2,a3,0
   2045     ld_a2,t0,16
   2046     la_a3 &df_suffix_len
   2047     st_a2,a3,0
   2048 
   2049     # df_value = df_index * df_stride
   2050     la_a0 &df_index
   2051     ld_t1,a0,0
   2052     la_a0 &df_stride
   2053     ld_t2,a0,0
   2054     mul_a0,t1,t2
   2055     la_a1 &df_value
   2056     st_a0,a1,0
   2057 
   2058     # synthesize the field macro (inputs are passed through the df_* BSS slots)
   2059     la_br &df_emit_field
   2060     call
   2061 
   2062     # df_index++
   2063     la_a0 &df_index
   2064     ld_t1,a0,0
   2065     addi_t1,t1,1
   2066     st_t1,a0,0
   2067 
   2068     # advance past the field word (proc_pos is reloaded: registers are not trusted across the call)
   2069     la_a0 &proc_pos
   2070     ld_t0,a0,0
   2071     addi_t0,t0,32
   2072     st_t0,a0,0
   2073     la_br &df_field_loop
   2074     b
   2075 
   2076 :df_field_skip_sep
            # t0 = proc_pos as loaded at the top of df_field_loop; step over the separator.
   2077     addi_t0,t0,32
   2078     la_a0 &proc_pos
   2079     st_t0,a0,0
   2080     la_br &df_field_loop
   2081     b
   2082 
   2083 :df_fields_done
   2084     # advance past '}' (t0 = proc_pos as loaded at the top of df_field_loop)
   2085     addi_t0,t0,32
   2086     la_a0 &proc_pos
   2087     st_t0,a0,0
   2088 
   2089     # ---- emit totalizer: df_suffix <- df_total_name; df_value = N * stride ----
   2090     la_a0 &df_total_name_ptr
   2091     ld_t0,a0,0
   2092     la_a1 &df_suffix_ptr
   2093     st_t0,a1,0
   2094     la_a0 &df_total_name_len
   2095     ld_t0,a0,0
   2096     la_a1 &df_suffix_len
   2097     st_t0,a1,0
   2098 
            # df_value = df_index * df_stride, where df_index now equals the field count N
   2099     la_a0 &df_index
   2100     ld_t1,a0,0
   2101     la_a0 &df_stride
   2102     ld_t2,a0,0
   2103     mul_a0,t1,t2
   2104     la_a1 &df_value
   2105     st_a0,a1,0
   2106 
   2107     la_br &df_emit_field
   2108     call
   2109 
   2110     # Strict: the closing '}' must be immediately followed by TOK_NEWLINE.
   2111     # Consume that newline only when the directive started at line-start,
   2112     # mirroring %endm / %endframe.
   2113     la_a0 &proc_pos
   2114     ld_t0,a0,0
   2115     la_a1 &source_end
   2116     ld_t1,a1,0
   2117     la_br &err_bad_directive
   2118     beq_t0,t1
   2119     ld_a1,t0,0
   2120     li_a2 TOK_NEWLINE
   2121     la_br &err_bad_directive
   2122     bne_a1,a2
   2123     la_a0 &proc_line_start
   2124     ld_a1,a0,0
   2125     la_br &df_finish
   2126     beqz_a1
   2127     addi_t0,t0,32
   2128     la_a0 &proc_pos
   2129     st_t0,a0,0
   2130 
   2131 :df_finish
            # proc_line_start = 1 on exit, on both the consumed-newline and the mid-line path
   2132     la_a0 &proc_line_start
   2133     li_a1 %1 %0
   2134     st_a1,a0,0
   2135     eret
   2136 
   2137 ## df_emit_field(): read df_base_*, df_suffix_*, df_value from BSS; synthesize
   2138 ## one macro record + one body token. Builds the "NAME.field" identifier in
   2139 ## df_name_scratch and the decimal body text via df_render_decimal, then
   2140 ## copies both into text_buf via append_text so they outlive the scratch.
        ##
        ## Record fields written here: name.ptr (+0), name.len (+8), param_count (+16),
        ## body_start / body_end (via M1PP_MACRO_BODY_START_OFF / _END_OFF).
        ## Body token written here: kind (+0), text ptr (+8), text len (+16), tight (+24).
        ## Fatal: err_too_many_macros when macros_end == macros_ptr + M1PP_MACROS_CAP;
        ## err_macro_body_overflow when one more 32-byte token would exceed
        ## M1PP_MACRO_BODY_CAP. Advances macros_end and macro_body_end.
        ## NOTE(review): no length check against the size of df_name_scratch is
        ## visible in this routine — confirm the scratch is large enough for
        ## base_len + 1 + suffix_len.
   2141 :df_emit_field
   2142     enter_0
   2143 
   2144     # macros_end capacity check
   2145     la_a0 &macros_end
   2146     ld_t0,a0,0
   2147     la_a1 &macros_ptr
   2148     ld_a1,a1,0
   2149     li_a2 M1PP_MACROS_CAP
   2150     add_a1,a1,a2
   2151     la_br &err_too_many_macros
   2152     beq_t0,a1
   2153 
   2154     # ---- assemble "BASE.SUFFIX" into df_name_scratch ----
   2155     # copy base bytes: t0 = src, t1 = base_len, t2 = scratch base, a3 = byte index
   2156     la_a0 &df_base_ptr
   2157     ld_t0,a0,0
   2158     la_a0 &df_base_len
   2159     ld_t1,a0,0
   2160     la_t2 &df_name_scratch_ptr
   2161     ld_t2,t2,0
   2162     li_a3 %0 %0
   2163 :df_ef_base_loop
   2164     la_br &df_ef_base_done
   2165     beq_a3,t1
   2166     add_a0,t0,a3
   2167     lb_a0,a0,0
   2168     add_a1,t2,a3
   2169     sb_a0,a1,0
   2170     addi_a3,a3,1
   2171     la_br &df_ef_base_loop
   2172     b
   2173 :df_ef_base_done
   2174     # scratch[base_len] = '.'  (46 = ASCII '.')
   2175     add_a1,t2,t1
   2176     li_a0 %46 %0
   2177     sb_a0,a1,0
   2178 
   2179     # copy suffix bytes into scratch[base_len + 1 ..]; a1 becomes the destination base
   2180     la_a0 &df_suffix_ptr
   2181     ld_t0,a0,0
   2182     la_a0 &df_suffix_len
   2183     ld_t1,a0,0
   2184     addi_a1,a1,1
   2185     li_a3 %0 %0
   2186 :df_ef_suffix_loop
   2187     la_br &df_ef_suffix_done
   2188     beq_a3,t1
   2189     add_a0,t0,a3
   2190     lb_a0,a0,0
   2191     add_a2,a1,a3
   2192     sb_a0,a2,0
   2193     addi_a3,a3,1
   2194     la_br &df_ef_suffix_loop
   2195     b
   2196 :df_ef_suffix_done
   2197 
   2198     # name_len = base_len + 1 + suffix_len (kept in df_name_len so it survives the call below)
   2199     la_a0 &df_base_len
   2200     ld_t0,a0,0
   2201     la_a0 &df_suffix_len
   2202     ld_t1,a0,0
   2203     add_t0,t0,t1
   2204     addi_t0,t0,1
   2205     la_a1 &df_name_len
   2206     st_t0,a1,0
   2207 
   2208     # durable_name = append_text(df_name_scratch_ptr value, name_len)
   2209     la_a0 &df_name_scratch_ptr
   2210     ld_a0,a0,0
   2211     mov_a1,t0
   2212     la_br &append_text
   2213     call
   2214     # a0 = durable_name ptr
   2215 
   2216     # m = macros_end; m->name.ptr = durable_name; m->name.len = name_len
   2217     la_a1 &macros_end
   2218     ld_t2,a1,0
   2219     st_a0,t2,0
   2220     la_a0 &df_name_len
   2221     ld_a0,a0,0
   2222     st_a0,t2,8
   2223 
   2224     # m->param_count = 0  (params[] left zeroed; not read when count == 0)
   2225     li_a0 %0 %0
   2226     st_a0,t2,16
   2227 
   2228     # render df_value into df_digit_scratch (reverse fill)
   2229     la_br &df_render_decimal
   2230     call
   2231 
   2232     # durable_digits = append_text(df_digit_cursor value, df_digit_count)
   2233     la_a0 &df_digit_cursor
   2234     ld_a0,a0,0
   2235     la_a1 &df_digit_count
   2236     ld_a1,a1,0
   2237     la_br &append_text
   2238     call
   2239     # a0 = durable_digits (must stay untouched until stored at token +8 below)
   2240 
   2241     # macro_body_end capacity check: fatal if CAP < (bytes used + 32)
   2242     la_a1 &macro_body_end
   2243     ld_t0,a1,0
   2244     la_a2 &macro_body_tokens_ptr
   2245     ld_a2,a2,0
   2246     sub_a3,t0,a2
   2247     addi_a3,a3,32
   2248     li_t2 M1PP_MACRO_BODY_CAP
   2249     la_br &err_macro_body_overflow
   2250     blt_t2,a3
   2251 
   2252     # body_tok = TOK_WORD { durable_digits, df_digit_count, tight=0 }
   2253     li_a1 TOK_WORD
   2254     st_a1,t0,0
   2255     st_a0,t0,8
   2256     la_a2 &df_digit_count
   2257     ld_a2,a2,0
   2258     st_a2,t0,16
   2259     li_a1 %0 %0
   2260     st_a1,t0,24
   2261 
   2262     # m->body_start = macro_body_end (the slot we just wrote)
   2263     la_a0 &macros_end
   2264     ld_t2,a0,0
   2265     li_a1 M1PP_MACRO_BODY_START_OFF
   2266     add_a1,t2,a1
   2267     st_t0,a1,0
   2268 
   2269     # macro_body_end += 32 (one token)
   2270     addi_t0,t0,32
   2271     la_a1 &macro_body_end
   2272     st_t0,a1,0
   2273 
   2274     # m->body_end = macro_body_end
   2275     li_a1 M1PP_MACRO_BODY_END_OFF
   2276     add_a1,t2,a1
   2277     st_t0,a1,0
   2278 
   2279     # macros_end += MACRO_RECORD_SIZE
   2280     li_a0 M1PP_MACRO_RECORD_SIZE
   2281     add_t2,t2,a0
   2282     la_a1 &macros_end
   2283     st_t2,a1,0
   2284 
   2285     eret
   2286 
   2287 ## df_render_decimal(): reads df_value; writes a reverse-filled decimal
   2288 ## rendering into df_digit_scratch[cursor..end) and stores df_digit_count +
   2289 ## df_digit_cursor for a subsequent append_text call. Leaf.
        ## Registers: t0 = remaining value, t2 = write cursor, starting at
        ## df_digit_scratch + 24 and moving down one byte per digit.
        ## NOTE(review): negative df_value is not special-cased; behavior then
        ## depends on rem/div signedness — presumably callers only pass
        ## non-negative values, confirm.
   2290 :df_render_decimal
   2291     la_a0 &df_value
   2292     ld_t0,a0,0
   2293     la_t1 &df_digit_scratch
   2294     li_a2 %24 %0
   2295     add_t1,t1,a2
   2296     mov_t2,t1
   2297 
   2298     # special-case v == 0 -> single '0'  (48 = ASCII '0')
   2299     la_br &df_rd_loop
   2300     bnez_t0
   2301     addi_t2,t2,neg1
   2302     li_a0 %48 %0
   2303     sb_a0,t2,0
   2304     la_br &df_rd_done
   2305     b
   2306 :df_rd_loop
            # while (v != 0): *--cursor = '0' + v % 10; v /= 10
   2307     la_br &df_rd_done
   2308     beqz_t0
   2309     mov_a0,t0
   2310     li_a1 %10 %0
   2311     rem_a2,a0,a1
   2312     addi_a2,a2,48
   2313     addi_t2,t2,neg1
   2314     sb_a2,t2,0
   2315     mov_a0,t0
   2316     li_a1 %10 %0
   2317     div_a0,a0,a1
   2318     mov_t0,a0
   2319     la_br &df_rd_loop
   2320     b
   2321 :df_rd_done
            # df_digit_count = (df_digit_scratch + 24) - cursor; df_digit_cursor = cursor
   2322     la_a1 &df_digit_scratch
   2323     li_a2 %24 %0
   2324     add_a1,a1,a2
   2325     sub_a0,a1,t2
   2326     la_a1 &df_digit_count
   2327     st_a0,a1,0
   2328     la_a1 &df_digit_cursor
   2329     st_t2,a1,0
   2330     ret
   2331 
   2332 ## ============================================================================
   2333 ## --- Stream stack + expansion-pool lifetime ---------------------------------
   2334 ## ============================================================================
   2335 ## process_tokens drives a stack of token streams. The source token array is
   2336 ## pushed first; each macro expansion or %select chosen-branch pushes a fresh
   2337 ## stream backed by a slice of expand_pool, popping rewinds pool_used to the
   2338 ## stream's pool_mark.
   2339 
   2340 ## push_stream_span(a0=start_tok, a1=end_tok, a2=pool_mark) -> void (fatal on overflow)
   2341 ## Push Stream { start = pos = a0, end = a1, line_start = 1, pool_mark = a2 }
   2342 ## onto streams[]. Bumps stream_top. pool_mark is a byte offset into
   2343 ## expand_pool, or -1 for a source-owned stream (pop_stream won't rewind).
   2344 ##
   2345 ## stream_top is maintained as a byte offset into streams[] (count * 40),
   2346 ## matching the running-tail-pointer pattern used by source_end / macros_end.
   2347 ## Reads/writes: streams, stream_top. Leaf.
        ## Stream field offsets as stored below: start +0, end +8, pos +16,
        ## line_start +24, pool_mark +32. Overflow branches to err_token_overflow.
   2348 :push_stream_span
   2349     # new_top = stream_top + STREAM_SIZE; if (cap < new_top) fatal
   2350     la_t0 &stream_top
   2351     ld_t1,t0,0
   2352     li_t2 M1PP_STREAM_SIZE
   2353     add_t2,t1,t2
   2354     li_a3 M1PP_STREAM_STACK_CAP
   2355     la_br &err_token_overflow
   2356     blt_a3,t2
   2357 
   2358     # s = &streams[stream_top]  (streams base + old byte offset in t1)
   2359     la_a3 &streams_ptr
   2360     ld_a3,a3,0
   2361     add_a3,a3,t1
   2362 
   2363     # s->start = a0; s->end = a1; s->pos = a0; s->line_start = 1; s->pool_mark = a2
   2364     st_a0,a3,0
   2365     st_a1,a3,8
   2366     st_a0,a3,16
   2367     li_t1 %1 %0
   2368     st_t1,a3,24
   2369     st_a2,a3,32
   2370 
   2371     # stream_top = new_top  (t0 still holds &stream_top, t2 holds new_top)
   2372     st_t2,t0,0
   2373     ret
   2374 
   2375 ## current_stream() -> a0 = &streams[stream_top-1], or 0 if empty. Leaf.
   2376 ## stream_top is a byte offset, so &streams[top-1] = streams + stream_top - 40.
   2377 ## Reads: streams, stream_top.
        ## The subtraction uses M1PP_STREAM_SIZE; the "40" above is that constant's
        ## presumed value. Uses t0 and t1 as scratch.
   2378 :current_stream
   2379     la_a0 &stream_top
   2380     ld_t0,a0,0
            # stream_top == 0 means the stack is empty -> return 0
   2381     la_br &current_stream_empty
   2382     beqz_t0
            # a0 = streams + stream_top - M1PP_STREAM_SIZE
   2383     la_a0 &streams_ptr
   2384     ld_a0,a0,0
   2385     add_a0,a0,t0
   2386     li_t1 M1PP_STREAM_SIZE
   2387     sub_a0,a0,t1
   2388     ret
   2389 :current_stream_empty
   2390     li_a0 %0 %0
   2391     ret
   2392 
   2393 ## pop_stream() -> void. Leaf.
   2394 ## Decrement stream_top. If the popped stream's pool_mark >= 0, restore
   2395 ## pool_used = pool_mark (reclaim the expansion-pool space it used).
   2396 ## Reads/writes: streams, stream_top, pool_used.
        ## Popping an empty stack (stream_top == 0) is a silent no-op.
   2397 :pop_stream
   2398     la_a0 &stream_top
   2399     ld_t0,a0,0
   2400     la_br &pop_stream_done
   2401     beqz_t0
            # stream_top -= M1PP_STREAM_SIZE; t0 is now the byte offset of the popped entry
   2402     li_t1 M1PP_STREAM_SIZE
   2403     sub_t0,t0,t1
   2404     st_t0,a0,0
   2405 
   2406     # mark = popped->pool_mark  (field at +32)
   2407     la_a1 &streams_ptr
   2408     ld_a1,a1,0
   2409     add_a1,a1,t0
   2410     ld_t0,a1,32
   2411 
   2412     # if (mark < 0) skip; else pool_used = mark
   2413     la_br &pop_stream_done
   2414     bltz_t0
   2415     la_a1 &pool_used
   2416     st_t0,a1,0
   2417 :pop_stream_done
   2418     ret
   2419 
   2420 ## copy_span_to_pool(a0=start_tok, a1=end_tok) -> void (fatal on pool overflow)
   2421 ## Append each 32-byte Token in [start, end) to expand_pool at pool_used,
   2422 ## advancing pool_used accordingly. Preserves tight bit at +24.
   2423 ## Reads/writes: expand_pool, pool_used. Leaf.
        ## a0 is advanced in place and equals a1 on return; a1 is not modified.
        ## pool_used is re-read and stored back on every iteration, so tokens
        ## copied before an overflow remain accounted for. Overflow branches to
        ## err_token_overflow.
   2424 :copy_span_to_pool
   2425 :cstp_loop
   2426     # if (start == end) done
   2427     la_br &cstp_done
   2428     beq_a0,a1
   2429 
   2430     # bounds: pool_used + 32 must fit in EXPAND_CAP (fatal if CAP < pool_used + 32)
   2431     la_a2 &pool_used
   2432     ld_t0,a2,0
   2433     addi_t1,t0,32
   2434     li_t2 M1PP_EXPAND_CAP
   2435     la_br &err_token_overflow
   2436     blt_t2,t1
   2437 
   2438     # dst = &expand_pool[pool_used]  (pool_used is a byte offset)
   2439     la_a3 &expand_pool_ptr
   2440     ld_a3,a3,0
   2441     add_a3,a3,t0
   2442 
   2443     # copy 32 bytes (4 × u64)
   2444     ld_t1,a0,0
   2445     st_t1,a3,0
   2446     ld_t1,a0,8
   2447     st_t1,a3,8
   2448     ld_t1,a0,16
   2449     st_t1,a3,16
   2450     ld_t1,a0,24
   2451     st_t1,a3,24
   2452 
   2453     # pool_used += 32; start += 32  (a2 still holds &pool_used)
   2454     addi_t0,t0,32
   2455     st_t0,a2,0
   2456     addi_a0,a0,32
   2457     la_br &cstp_loop
   2458     b
   2459 :cstp_done
   2460     ret
   2461 
   2462 ## push_pool_stream_from_mark(a0=mark) -> void (fatal on overflow)
   2463 ## If pool_used == mark (empty expansion), do nothing and return.
   2464 ## Otherwise push_stream_span(expand_pool+mark, expand_pool+pool_used, mark).
   2465 ## Reads/writes: expand_pool, pool_used, streams, stream_top. Non-leaf:
   2466 ## needs a frame so the call to push_stream_span doesn't clobber LR.
        ## mark and pool_used are byte offsets into expand_pool; they are turned
        ## into token pointers by adding the expand_pool_ptr value.
   2467 :push_pool_stream_from_mark
   2468     enter_0
   2469     # if (pool_used == mark) return
   2470     la_a1 &pool_used
   2471     ld_t0,a1,0
   2472     la_br &ppsfm_done
   2473     beq_t0,a0
   2474 
   2475     # push_stream_span(expand_pool+mark, expand_pool+pool_used, mark)
            # t1 keeps the raw mark while a0 is overwritten with the start pointer.
   2476     la_a2 &expand_pool_ptr
   2477     ld_a2,a2,0
   2478     mov_t1,a0
   2479     add_a0,a2,a0
   2480     add_a1,a2,t0
   2481     mov_a2,t1
   2482     la_br &push_stream_span
   2483     call
   2484 :ppsfm_done
   2485     eret
   2486 
   2487 ## ============================================================================
   2488 ## --- Argument parsing -------------------------------------------------------
   2489 ## ============================================================================
   2490 
   2491 ## parse_args(a0=lparen_tok, a1=limit_tok) -> void (fatal on unterminated/overflow)
   2492 ## Scan tokens from lparen+1 up to limit, tracking paren depth. At depth 1 each
   2493 ## TOK_COMMA ends one arg and starts the next; the matching TOK_RPAREN at
   2494 ## depth 0 ends the last arg. An empty `()` is arg_count = 0.
        ## A separate brace depth is tracked for TOK_LBRACE / TOK_RBRACE: a comma
        ## only splits when paren depth == 1 AND brace depth == 0.
   2495 ##
   2496 ## Writes globals:
   2497 ##   arg_starts[i]  = first token of arg i
   2498 ##   arg_ends[i]    = one past last token of arg i
   2499 ##   arg_count      = number of args (0..16)
   2500 ##   call_end_pos   = one past the closing RPAREN
        ##   args_have_paste = 1 if any TOK_PASTE token was scanned, else 0
        ##   pa_pos / pa_arg_start / pa_limit / pa_depth / pa_arg_index /
        ##   pa_brace_depth = scan state (BSS scratch)
   2501 ##
   2502 ## Fatal on: > 16 args, reaching limit without matching RPAREN.
        ## The arg limit is M1PP_MAX_PARAMS and is reported via err_bad_macro_header;
        ## hitting limit goes to err_unterminated_macro. Also fatal
        ## (err_unbalanced_braces): an RBRACE with brace depth 0, or the closing
        ## RPAREN arriving while brace depth != 0.
        ## Leaf (no calls); t0 holds the current token pointer throughout the loop.
   2503 :parse_args
   2504     # tok = lparen + 1; arg_start = tok; depth = 1; arg_index = 0; brace_depth = 0
   2505     addi_a0,a0,32
   2506     la_a2 &pa_pos
   2507     st_a0,a2,0
   2508     la_a2 &pa_arg_start
   2509     st_a0,a2,0
   2510     la_a2 &pa_limit
   2511     st_a1,a2,0
   2512     li_a2 %1 %0
   2513     la_a3 &pa_depth
   2514     st_a2,a3,0
   2515     li_a2 %0 %0
   2516     la_a3 &pa_arg_index
   2517     st_a2,a3,0
   2518     li_a2 %0 %0
   2519     la_a3 &pa_brace_depth
   2520     st_a2,a3,0
   2521 
   2522     # args_have_paste = 0 — set to 1 below if any TOK_PASTE appears in the
   2523     # call's argument span. expand_macro_tokens snapshots this right after
   2524     # parse_args returns; bare arg copies preserve embedded ## tokens, and
   2525     # the snapshot tells us whether we still have to run paste_pool_range
   2526     # even when the body itself contains no ##.
   2527     li_a2 %0 %0
   2528     la_a3 &args_have_paste
   2529     st_a2,a3,0
   2530 
   2531 :pa_loop
   2532     # if (tok == limit) fatal unterminated (equality test; tok advances one token per step)
   2533     la_a0 &pa_pos
   2534     ld_t0,a0,0
   2535     la_a1 &pa_limit
   2536     ld_t1,a1,0
   2537     la_br &err_unterminated_macro
   2538     beq_t0,t1
   2539 
   2540     # kind = tok->kind  (a2 stays live through the dispatch below)
   2541     ld_a2,t0,0
   2542 
   2543     # if (kind == TOK_PASTE) { args_have_paste = 1; fall through to default-advance }
   2544     li_a3 TOK_PASTE
   2545     la_br &pa_kind_check
   2546     bne_a2,a3
   2547     li_a3 %1 %0
   2548     la_a0 &args_have_paste
   2549     st_a3,a0,0
   2550 
   2551 :pa_kind_check
   2552     # dispatch on kind: LPAREN / RPAREN / COMMA / LBRACE / RBRACE; anything else just advances
   2553     li_a3 TOK_LPAREN
   2554     la_br &pa_lparen
   2555     beq_a2,a3
   2556     li_a3 TOK_RPAREN
   2557     la_br &pa_rparen
   2558     beq_a2,a3
   2559     li_a3 TOK_COMMA
   2560     la_br &pa_maybe_comma
   2561     beq_a2,a3
   2562     li_a3 TOK_LBRACE
   2563     la_br &pa_lbrace
   2564     beq_a2,a3
   2565     li_a3 TOK_RBRACE
   2566     la_br &pa_rbrace
   2567     beq_a2,a3
   2568 
   2569     # default: tok++
   2570     addi_t0,t0,32
   2571     la_a0 &pa_pos
   2572     st_t0,a0,0
   2573     la_br &pa_loop
   2574     b
   2575 
   2576 :pa_lparen
            # depth++; tok++
   2577     la_a0 &pa_depth
   2578     ld_t1,a0,0
   2579     addi_t1,t1,1
   2580     st_t1,a0,0
   2581     addi_t0,t0,32
   2582     la_a0 &pa_pos
   2583     st_t0,a0,0
   2584     la_br &pa_loop
   2585     b
   2586 
   2587 :pa_rparen
   2588     # depth--
   2589     la_a0 &pa_depth
   2590     ld_t1,a0,0
   2591     addi_t1,t1,neg1
   2592     st_t1,a0,0
   2593     # if (depth != 0) tok++; loop
   2594     la_br &pa_rparen_close
   2595     beqz_t1
   2596     addi_t0,t0,32
   2597     la_a0 &pa_pos
   2598     st_t0,a0,0
   2599     la_br &pa_loop
   2600     b
   2601 
   2602 :pa_rparen_close
   2603     # depth == 0: if brace_depth != 0 -> unbalanced braces
   2604     la_a0 &pa_brace_depth
   2605     ld_t1,a0,0
   2606     la_br &err_unbalanced_braces
   2607     bnez_t1
   2608     # close out the call.
   2609     # a1 = arg_start (BSS), a2 = arg_index (BSS), t0 = tok = current pos (the closing RPAREN).
   2610     la_a0 &pa_arg_start
   2611     ld_a1,a0,0
   2612     la_a0 &pa_arg_index
   2613     ld_a2,a0,0
   2614 
   2615     # if (arg_start == tok && arg_index == 0) -> arg_count = 0
   2616     la_br &pa_close_with_arg
   2617     bne_a1,t0
   2618     la_br &pa_close_with_arg
   2619     bnez_a2
   2620 
   2621     # empty (): arg_count = 0
   2622     li_a3 %0 %0
   2623     la_a0 &arg_count
   2624     st_a3,a0,0
   2625     la_br &pa_finish
   2626     b
   2627 
   2628 :pa_close_with_arg
   2629     # if (arg_index >= M1PP_MAX_PARAMS) fatal: branch to ok only if arg_index < M1PP_MAX_PARAMS
   2630     li_a3 M1PP_MAX_PARAMS
   2631     la_br &pa_close_with_arg_ok
   2632     blt_a2,a3
   2633     la_br &err_bad_macro_header
   2634     b
   2635 :pa_close_with_arg_ok
   2636     # arg_starts[arg_index] = arg_start; arg_ends[arg_index] = tok  (8-byte slots: index << 3)
   2637     la_a3 &arg_starts_ptr
   2638     ld_a3,a3,0
   2639     shli_t1,a2,3
   2640     add_a3,a3,t1
   2641     st_a1,a3,0
   2642     la_a3 &arg_ends_ptr
   2643     ld_a3,a3,0
   2644     add_a3,a3,t1
   2645     st_t0,a3,0
   2646     # arg_count = arg_index + 1
   2647     addi_a2,a2,1
   2648     la_a0 &arg_count
   2649     st_a2,a0,0
   2650 
   2651 :pa_finish
   2652     # call_end_pos = tok + 1 token (32 bytes), i.e. one past the closing RPAREN
   2653     addi_t0,t0,32
   2654     la_a0 &call_end_pos
   2655     st_t0,a0,0
   2656     ret
   2657 
   2658 :pa_maybe_comma
   2659     # only split at depth == 1
   2660     la_a0 &pa_depth
   2661     ld_t1,a0,0
   2662     li_a3 %1 %0
   2663     la_br &pa_default_advance
   2664     bne_t1,a3
   2665     # and only when brace_depth == 0
   2666     la_a0 &pa_brace_depth
   2667     ld_t1,a0,0
   2668     la_br &pa_default_advance
   2669     bnez_t1
   2670 
   2671     # depth == 1 && brace_depth == 0 split: append (arg_start, tok) at arg_index
   2672     la_a0 &pa_arg_index
   2673     ld_a2,a0,0
   2674     li_a3 M1PP_MAX_PARAMS
   2675     la_br &pa_comma_ok
   2676     blt_a2,a3
   2677     la_br &err_bad_macro_header
   2678     b
   2679 :pa_comma_ok
   2680     la_a0 &pa_arg_start
   2681     ld_a1,a0,0
   2682     la_a3 &arg_starts_ptr
   2683     ld_a3,a3,0
   2684     shli_t1,a2,3
   2685     add_a3,a3,t1
   2686     st_a1,a3,0
   2687     la_a3 &arg_ends_ptr
   2688     ld_a3,a3,0
   2689     add_a3,a3,t1
   2690     st_t0,a3,0
   2691     # arg_index++
   2692     addi_a2,a2,1
   2693     la_a0 &pa_arg_index
   2694     st_a2,a0,0
   2695     # arg_start = pa_pos = tok + 1 token (32 bytes), i.e. just past the comma
   2696     addi_t0,t0,32
   2697     la_a0 &pa_arg_start
   2698     st_t0,a0,0
   2699     la_a0 &pa_pos
   2700     st_t0,a0,0
   2701     la_br &pa_loop
   2702     b
   2703 
   2704 :pa_default_advance
   2705     # comma at depth != 1, or inside braces: just advance
   2706     addi_t0,t0,32
   2707     la_a0 &pa_pos
   2708     st_t0,a0,0
   2709     la_br &pa_loop
   2710     b
   2711 
   2712 :pa_lbrace
   2713     # brace_depth++; tok++
   2714     la_a0 &pa_brace_depth
   2715     ld_t1,a0,0
   2716     addi_t1,t1,1
   2717     st_t1,a0,0
   2718     addi_t0,t0,32
   2719     la_a0 &pa_pos
   2720     st_t0,a0,0
   2721     la_br &pa_loop
   2722     b
   2723 
   2724 :pa_rbrace
   2725     # if (brace_depth == 0) fatal unbalanced braces
   2726     la_a0 &pa_brace_depth
   2727     ld_t1,a0,0
   2728     la_br &err_unbalanced_braces
   2729     beqz_t1
   2730     # brace_depth--; tok++  (a0 still holds &pa_brace_depth for the store)
   2731     addi_t1,t1,neg1
   2732     st_t1,a0,0
   2733     addi_t0,t0,32
   2734     la_a0 &pa_pos
   2735     st_t0,a0,0
   2736     la_br &pa_loop
   2737     b
   2738 
   2739 ## ============================================================================
   2740 ## --- Macro lookup + call expansion ------------------------------------------
   2741 ## ============================================================================
   2742 
   2743 ## find_macro(a0=tok) -> a0 = Macro* or 0. Leaf.
   2744 ## Non-zero only if tok is TOK_WORD, text.len >= 2, text[0] == '%', and
   2745 ## (text+1, len-1) equals macros[i].name for some i. First match wins.
   2746 ## Reads: macros, macros_end.
        ## Registers in the search loop: a1 = name_ptr, a2 = name_len, a3 = m
        ## (current record), t0 = macros_end (reused as byte scratch inside the
        ## compare loop and reloaded in find_macro_next), t2 = byte index.
   2747 :find_macro
   2748     # if (tok.kind != TOK_WORD) return 0
   2749     ld_a1,a0,0
   2750     li_a2 TOK_WORD
   2751     la_br &find_macro_zero
   2752     bne_a1,a2
   2753 
   2754     # if (tok.text.len < 2) return 0
   2755     ld_a2,a0,16
   2756     li_a3 %2 %0
   2757     la_br &find_macro_zero
   2758     blt_a2,a3
   2759 
   2760     # if (tok.text[0] != '%') return 0  (37 = ASCII '%')
   2761     ld_a1,a0,8
   2762     lb_a3,a1,0
   2763     li_t0 %37 %0
   2764     la_br &find_macro_zero
   2765     bne_a3,t0
   2766 
   2767     # name_ptr = tok.text + 1; name_len = tok.text.len - 1
   2768     addi_a1,a1,1
   2769     addi_a2,a2,neg1
   2770 
   2771     # m = &macros[0]; m_end = macros_end
   2772     la_a3 &macros_ptr
   2773     ld_a3,a3,0
   2774     la_t0 &macros_end
   2775     ld_t0,t0,0
   2776 
   2777 :find_macro_loop
   2778     # if (m == macros_end) return 0
   2779     la_br &find_macro_zero
   2780     beq_a3,t0
   2781 
   2782     # if (m->name.len != name_len) advance
   2783     ld_t1,a3,8
   2784     la_br &find_macro_next
   2785     bne_t1,a2
   2786 
   2787     # byte-compare m->name.ptr vs name_ptr for name_len bytes
   2788     ld_t1,a3,0
   2789     li_t2 %0 %0
   2790 :find_macro_cmp
            # all name_len bytes matched -> found; a0 and t0 are byte scratch here
   2791     la_br &find_macro_match
   2792     beq_t2,a2
   2793     add_a0,t1,t2
   2794     lb_a0,a0,0
   2795     add_t0,a1,t2
   2796     lb_t0,t0,0
   2797     la_br &find_macro_next
   2798     bne_a0,t0
   2799     addi_t2,t2,1
   2800     la_br &find_macro_cmp
   2801     b
   2802 
   2803 :find_macro_next
   2804     # m += M1PP_MACRO_RECORD_SIZE
   2805     li_t1 M1PP_MACRO_RECORD_SIZE
   2806     add_a3,a3,t1
   2807     # reload macros_end (t0 is clobbered by the byte comparisons)
   2808     la_t0 &macros_end
   2809     ld_t0,t0,0
   2810     la_br &find_macro_loop
   2811     b
   2812 
   2813 :find_macro_match
            # return m
   2814     mov_a0,a3
   2815     ret
   2816 
   2817 :find_macro_zero
   2818     li_a0 %0 %0
   2819     ret
   2820 
   2821 ## find_param(a0=macro_ptr, a1=tok) -> a0 = (index+1) or 0. Leaf.
   2822 ## Linear search over macro->params[0..param_count). Non-WORD tok -> 0, so
   2823 ## callers can test the return against zero without pre-filtering.
        ## Each params[] entry is 16 bytes as read here: text ptr at +0, length
        ## at +8. Writes the BSS scratch slots fp_macro, fp_tok, fp_pcount and
        ## fp_idx, and clobbers the caller-saved temps t0..t2.
   2824 :find_param
   2825     # if (tok.kind != TOK_WORD) return 0
   2826     ld_a2,a1,0
   2827     li_a3 TOK_WORD
   2828     la_br &find_param_zero
   2829     bne_a2,a3
   2830 
   2831     # param_count = macro->param_count; zero params -> return 0
   2832     ld_a2,a0,16
   2833     la_br &find_param_zero
   2834     beqz_a2
   2835 
   2836     # Spill bases into BSS so the cmp loop has free temp regs.
   2837     #   fp_macro     = macro_ptr
   2838     #   fp_tok       = tok ptr
   2839     #   fp_pcount    = param_count
   2840     #   fp_idx       = current param index
   2841     la_a3 &fp_macro
   2842     st_a0,a3,0
   2843     la_a3 &fp_tok
   2844     st_a1,a3,0
   2845     la_a3 &fp_pcount
   2846     st_a2,a3,0
   2847     li_a3 %0 %0
   2848     la_a0 &fp_idx
   2849     st_a3,a0,0
   2850 
   2851 :find_param_outer
   2852     # t0 = idx, a1 = pcount; if (idx == pcount) return 0
   2853     la_a0 &fp_idx
   2854     ld_t0,a0,0
   2855     la_a0 &fp_pcount
   2856     ld_a1,a0,0
   2857     la_br &find_param_zero
   2858     beq_t0,a1
   2859 
   2860     # param_ptr = fp_macro + 24 + idx * 16   (macro record params start at +24)
   2861     la_a0 &fp_macro
   2862     ld_a2,a0,0
   2863     addi_a2,a2,24
   2864     shli_a3,t0,4
   2865     add_a2,a2,a3
   2866 
   2867     # tok ptr
   2868     la_a0 &fp_tok
   2869     ld_a3,a0,0
   2870 
   2871     # Compare lengths: t1 = param.len (+8), t2 = tok.text.len (+16).
   2872     ld_t1,a2,8
   2873     ld_t2,a3,16
   2874     la_br &find_param_next
   2875     bne_t1,t2
   2876 
   2877     # Lengths match. Byte-compare param.ptr vs tok.text.ptr for t1 bytes.
   2878     # After this point we either return or restart the outer loop, so
   2879     # all caller-saved regs are free.
   2880     ld_a0,a2,0
   2881     ld_a1,a3,8
   2882     li_t0 %0 %0
   2883 :find_param_cmp
            # t0 = byte index; reaching t1 means every byte matched
   2884     la_br &find_param_match
   2885     beq_t0,t1
   2886     add_t2,a0,t0
   2887     lb_t2,t2,0
   2888     add_a2,a1,t0
   2889     lb_a2,a2,0
   2890     la_br &find_param_next
   2891     bne_t2,a2
   2892     addi_t0,t0,1
   2893     la_br &find_param_cmp
   2894     b
   2895 
   2896 :find_param_next
   2897     # idx++  (reloaded from fp_idx because t0 was reused as the byte index)
   2898     la_a0 &fp_idx
   2899     ld_t0,a0,0
   2900     addi_t0,t0,1
   2901     st_t0,a0,0
   2902     la_br &find_param_outer
   2903     b
   2904 
   2905 :find_param_match
   2906     # return idx + 1
   2907     la_a0 &fp_idx
   2908     ld_a0,a0,0
   2909     addi_a0,a0,1
   2910     ret
   2911 
   2912 :find_param_zero
   2913     li_a0 %0 %0
   2914     ret
   2915 
   2916 ## arg_is_braced(a0=start, a1=end) -> a0 = 1 if the span wraps in a matching
   2917 ## outer { ... } pair (outer RBRACE is the same-level mate of the leading
   2918 ## LBRACE), else 0. Leaf.
        ## start/end are token pointers with a 32-byte stride; end is exclusive.
        ## Only TOK_LBRACE / TOK_RBRACE affect the depth counter; a1 is not modified.
   2919 :arg_is_braced
   2920     # if (end - start < 2 tokens = 64 bytes) return 0
   2921     sub_a2,a1,a0
   2922     li_a3 %64 %0
   2923     la_br &aib_zero
   2924     blt_a2,a3
   2925 
   2926     # if (start->kind != TOK_LBRACE) return 0
   2927     ld_a2,a0,0
   2928     li_a3 TOK_LBRACE
   2929     la_br &aib_zero
   2930     bne_a2,a3
   2931 
   2932     # if ((end - 32)->kind != TOK_RBRACE) return 0   (end - 32 = last token in the span)
   2933     addi_t0,a1,neg32
   2934     ld_a2,t0,0
   2935     li_a3 TOK_RBRACE
   2936     la_br &aib_zero
   2937     bne_a2,a3
   2938 
   2939     # walk tokens tracking depth; if depth hits 0 before reaching end-32,
   2940     # the leading LBRACE doesn't match the trailing RBRACE -> return 0.
   2941     # t0 = tok, t1 = depth, t2 = last_tok = end - 32
   2942     mov_t0,a0
   2943     li_t1 %0 %0
   2944     addi_t2,a1,neg32
   2945 :aib_loop
   2946     la_br &aib_done
   2947     beq_t0,a1
   2948     ld_a2,t0,0
   2949     li_a3 TOK_LBRACE
   2950     la_br &aib_incr
   2951     beq_a2,a3
   2952     li_a3 TOK_RBRACE
   2953     la_br &aib_decr
   2954     beq_a2,a3
   2955     # non-brace: advance
   2956     addi_t0,t0,32
   2957     la_br &aib_loop
   2958     b
   2959 :aib_incr
   2960     addi_t1,t1,1
   2961     addi_t0,t0,32
   2962     la_br &aib_loop
   2963     b
   2964 :aib_decr
   2965     addi_t1,t1,neg1
   2966     # if (depth == 0 && tok != end - 32) -> not wrapping
   2967     la_br &aib_decr_skip
   2968     bnez_t1
   2969     la_br &aib_zero
   2970     bne_t0,t2
   2971 :aib_decr_skip
   2972     addi_t0,t0,32
   2973     la_br &aib_loop
   2974     b
   2975 :aib_done
   2976     # return (depth == 0) ? 1 : 0
   2977     la_br &aib_zero
   2978     bnez_t1
   2979     li_a0 %1 %0
   2980     ret
   2981 :aib_zero
   2982     li_a0 %0 %0
   2983     ret
   2984 
   2985 ## copy_arg_tokens_to_pool(a0=arg_start, a1=arg_end) -> void (fatal if empty)
   2986 ## Non-leaf (calls arg_is_braced and copy_span_to_pool). Empty arg is an error.
   2987 ## If the span is wrapped in a matching outer { ... } pair, strip the outer
   2988 ## braces before copying; an empty inner span is a no-op.
   2989 :copy_arg_tokens_to_pool
   2990     enter_16
   2991     # if (arg_start == arg_end) fatal (via err_bad_macro_header)
   2992     la_br &err_bad_macro_header
   2993     beq_a0,a1
   2994     # spill a0/a1 to the two frame slots so arg_is_braced can clobber regs
   2995     st_a0,sp,0
   2996     st_a1,sp,8
   2997     la_br &arg_is_braced
   2998     call
   2999     la_br &catp_plain
   3000     beqz_a0
   3001     # braced: strip outer braces (start+32, end-32: one 32-byte token per side)
   3002     ld_a0,sp,0
   3003     ld_a1,sp,8
   3004     addi_a0,a0,32
   3005     addi_a1,a1,neg32
   3006     la_br &catp_done
   3007     beq_a0,a1                  # "{}" -> empty inner span, nothing to copy
   3008     la_br &copy_span_to_pool
   3009     call
   3010     la_br &catp_done
   3011     b
   3012 :catp_plain
   3013     ld_a0,sp,0                 # unbraced: copy the original span unchanged
   3014     ld_a1,sp,8
   3015     la_br &copy_span_to_pool
   3016     call
   3017 :catp_done
   3018     eret
   3019 
   3020 ## copy_paste_arg_to_pool(a0=arg_start, a1=arg_end) -> void (fatal unless len 1)
   3021 ## Enforces the single-token-argument rule for params adjacent to ##.
   3022 ## Braced args are rejected — pasting onto a block is nonsense.
   3023 :copy_paste_arg_to_pool
   3024     enter_16
   3025     # spill a0/a1 for the arg_is_braced call
   3026     st_a0,sp,0
   3027     st_a1,sp,8
   3028     la_br &arg_is_braced
   3029     call
   3030     la_br &err_bad_macro_header
   3031     bnez_a0                    # braced arg next to ## -> fatal
   3032     ld_a0,sp,0
   3033     ld_a1,sp,8
   3034     # if ((arg_end - arg_start) != M1PP_TOK_SIZE) fatal   (exactly one token)
   3035     sub_a2,a1,a0
   3036     li_a3 M1PP_TOK_SIZE
   3037     la_br &err_bad_macro_header
   3038     bne_a2,a3
   3039     la_br &copy_span_to_pool
   3040     call
   3041     eret
   3042 
   3043 ## expand_macro_tokens(a0=call_tok, a1=limit, a2=macro_ptr) -> void (fatal on bad)
   3044 ## If call_tok+1 is a tight TOK_LPAREN below limit: runs parse_args(call_tok+1,
   3045 ## limit) and verifies arg_count == macro->param_count. Otherwise the call is
   3046 ## accepted only when macro->param_count == 0. Then walks the macro body,
   3047 ## substituting each param token via copy_arg_tokens_to_pool (or
   3048 ## copy_paste_arg_to_pool when adjacent to ##), rewriting flagged local labels,
   3049 ## copying other body tokens as-is, and finally runs paste_pool_range over the
   3050 ## new slice when macro->has_paste or the args_have_paste snapshot is nonzero.
   3051 ## Outputs via globals (callers must snapshot before any nested call):
   3052 ##   emt_after_pos = token one past the matching ')' (= call_end_pos), or
   3053 ##                   call_tok + 32 for a paren-less zero-arg call
   3054 ##   emt_mark      = pool_used before the body walk (start of expansion slice)
   3055 :expand_macro_tokens
   3056     enter_0
   3057 
   3058     # Snapshot inputs into BSS (find_param/copy_*/paste_pool_range clobber regs).
   3059     la_a3 &emt_call_tok
   3060     st_a0,a3,0
   3061     la_a3 &emt_limit
   3062     st_a1,a3,0
   3063     la_a3 &emt_macro
   3064     st_a2,a3,0
   3065 
   3066     # lparen = call_tok + 32   (the token after the call name)
   3067     addi_a0,a0,32
   3068 
   3069     # Branch split for paren-less 0-arg calls:
   3070     #   if lparen < limit AND lparen->kind == TOK_LPAREN: parse_args as usual.
   3071     #   else if macro->param_count == 0: synthesize empty arg list, no parse_args.
   3072     #   else: fatal "bad macro call".
   3073 
   3074     # if (lparen >= limit) goto emt_try_zero_arg
   3075     la_br &emt_try_zero_arg
   3076     beq_a0,a1
   3077     la_br &emt_try_zero_arg
   3078     blt_a1,a0
   3079 
   3080     # if (lparen->kind != TOK_LPAREN) goto emt_try_zero_arg
   3081     ld_a2,a0,0
   3082     li_a3 TOK_LPAREN
   3083     la_br &emt_try_zero_arg
   3084     bne_a2,a3
   3085 
   3086     # if (!lparen->tight) goto emt_try_zero_arg — `%FOO ( ... )` with a space
   3087     # is a paren-less zero-arg call followed by a literal `(`.
   3088     ld_a2,a0,24                # tight flag is the token field at +24
   3089     la_br &emt_try_zero_arg
   3090     beqz_a2
   3091 
   3092     # parse_args(lparen, limit)
   3093     # a0 already lparen; a1 already limit
   3094     la_br &parse_args
   3095     call
   3096 
   3097     # Snapshot args_have_paste -> emt_saw_arg_paste BEFORE the body loop
   3098     # potentially runs nested expansions that would clobber the global. This
   3099     # snapshot is OR'd with macro->has_paste at emt_done to decide whether
   3100     # to run paste_pool_range.
   3101     la_a0 &args_have_paste
   3102     ld_t0,a0,0
   3103     la_a1 &emt_saw_arg_paste
   3104     st_t0,a1,0
   3105 
   3106     # Check arg_count == macro->param_count   (param_count is at macro +16)
   3107     la_a0 &arg_count
   3108     ld_t0,a0,0
   3109     la_a0 &emt_macro
   3110     ld_t1,a0,0
   3111     ld_t1,t1,16
   3112     la_br &err_bad_macro_header
   3113     bne_t0,t1
   3114 
   3115     # expansion_id = ++next_expansion_id (monotonic; used by local-label
   3116     # rewriting in the body-copy path to rename :@name / &@name tokens).
   3117     la_a0 &next_expansion_id
   3118     ld_t0,a0,0
   3119     addi_t0,t0,1
   3120     st_t0,a0,0
   3121     la_a1 &emt_expansion_id
   3122     st_t0,a1,0
   3123 
   3124     # Snapshot call_end_pos -> emt_after_pos before the body walk, so
   3125     # nothing in the substitution loop can clobber the resume position.
   3126     la_a0 &call_end_pos
   3127     ld_t0,a0,0
   3128     la_a1 &emt_after_pos
   3129     st_t0,a1,0
   3130     la_br &emt_after_arg_setup
   3131     b
   3132 
   3133 :emt_try_zero_arg
   3134     # No trailing LPAREN. Allowed only if macro->param_count == 0.
   3135     la_a0 &emt_macro
   3136     ld_t1,a0,0
   3137     ld_t1,t1,16
   3138     la_br &err_bad_macro_header
   3139     bnez_t1
   3140 
   3141     # No parse_args ran in this branch; args_have_paste from a stale earlier
   3142     # call MUST NOT leak into emt_saw_arg_paste. Force it to 0 so emt_done's
   3143     # paste-gate uses only macro->has_paste here.
   3144     li_t0 %0 %0
   3145     la_a1 &emt_saw_arg_paste
   3146     st_t0,a1,0
   3147 
   3148     # arg_count = 0
   3149     la_a0 &arg_count
   3150     li_t0 %0 %0
   3151     st_t0,a0,0
   3152 
   3153     # emt_after_pos = call_tok + 32   (resume right after the macro name)
   3154     la_a0 &emt_call_tok
   3155     ld_t0,a0,0
   3156     addi_t0,t0,32
   3157     la_a1 &emt_after_pos
   3158     st_t0,a1,0
   3159 
   3160 :emt_after_arg_setup
   3161 
   3162     # mark = pool_used; emt_mark = mark
   3163     la_a0 &pool_used
   3164     ld_t0,a0,0
   3165     la_a1 &emt_mark
   3166     st_t0,a1,0
   3167 
   3168     # body_pos = macro->body_start; body_end = macro->body_end
   3169     la_a0 &emt_macro
   3170     ld_t1,a0,0
   3171     li_a2 M1PP_MACRO_BODY_START_OFF
   3172     add_a3,t1,a2
   3173     ld_a3,a3,0
   3174     la_a0 &emt_body_pos
   3175     st_a3,a0,0
   3176     la_a0 &emt_body_start
   3177     st_a3,a0,0
   3178     li_a2 M1PP_MACRO_BODY_END_OFF
   3179     add_a3,t1,a2
   3180     ld_a3,a3,0
   3181     la_a0 &emt_body_end
   3182     st_a3,a0,0
   3183 
   3184 :emt_loop
   3185     # if (body_pos == body_end) break
   3186     la_a0 &emt_body_pos
   3187     ld_t0,a0,0
   3188     la_a1 &emt_body_end
   3189     ld_t1,a1,0
   3190     la_br &emt_done
   3191     beq_t0,t1
   3192 
   3193     # Cached param_idx = macro_body_param_idx[(body_pos - macro_body_tokens) / 32].
   3194     # Set at %macro define time so the body loop never has to call find_param.
   3195     la_a1 &macro_body_tokens_ptr
   3196     ld_a1,a1,0
   3197     sub_a0,t0,a1
   3198     shri_a0,a0,5               # byte offset -> token index (32-byte tokens)
   3199     la_a1 &macro_body_param_idx_ptr
   3200     ld_a1,a1,0
   3201     add_a1,a1,a0
   3202     lb_a0,a1,0                 # one byte per body token
   3203     # Spill for emt_do_substitute_paste / _plain (no need to re-derive).
   3204     la_a1 &emt_cached_param_idx
   3205     st_a0,a1,0
   3206 
   3207     # if (param_idx == 0) body-native token: check for local-label rewrite,
   3208     # else fall through to substitute logic.
   3209     la_br &emt_check_local_label
   3210     beqz_a0
   3211 
   3212     # param_idx != 0: substitute. The emt_do_substitute_* paths read
   3213     # emt_cached_param_idx (no re-call to find_param).
   3214 
   3215     # Reload body_pos for the pasted-classification loads.
   3216     la_a0 &emt_body_pos
   3217     ld_t0,a0,0
   3218 
   3219     # Compute pasted = (body_pos > body_start AND (body_pos - 32)->kind == TOK_PASTE)
   3220     #                  OR (body_pos + 32 < body_end AND (body_pos + 32)->kind == TOK_PASTE)
   3221     la_a1 &emt_body_start
   3222     ld_t1,a1,0
   3223 
   3224     # Branch to emt_check_after if body_pos == body_start
   3225     la_br &emt_check_after
   3226     beq_t0,t1
   3227 
   3228     # prev_kind = (body_pos - 32)->kind
   3229     addi_t2,t0,neg32
   3230     ld_a2,t2,0
   3231     li_a3 TOK_PASTE
   3232     la_br &emt_pasted
   3233     beq_a2,a3
   3234 
   3235 :emt_check_after
   3236     # next_pos = body_pos + 32; if (next_pos >= body_end) skip
   3237     addi_t2,t0,32
   3238     la_a1 &emt_body_end
   3239     ld_a3,a1,0
   3240     # if (next_pos == body_end) -> not pasted (need next_pos < body_end)
   3241     la_br &emt_not_pasted
   3242     beq_t2,a3
   3243     # next_kind = next_pos->kind
   3244     ld_a2,t2,0
   3245     li_a3 TOK_PASTE
   3246     la_br &emt_pasted
   3247     beq_a2,a3
   3248     la_br &emt_not_pasted
   3249     b
   3250 
   3251 :emt_pasted
   3252     # body_pos is a param adjacent to ##: substitute one-token arg.
   3253     la_br &emt_do_substitute_paste
   3254     b
   3255 
   3256 :emt_not_pasted
   3257     # body_pos is a param NOT adjacent to ##: substitute arg span.
   3258     la_br &emt_do_substitute_plain
   3259     b
   3260 
   3261 ## emt_check_local_label: read the cached macro_body_is_local_label[]
   3262 ## flag (set at %macro define time). 0 -> emt_copy_literal copies the
   3263 ## body token verbatim; 1 -> falls through to emt_rewrite_local_label.
   3264 ## Replaces the per-expansion ':@' / '&@' / '@' predicate that used to
   3265 ## live inline here.
   3266 :emt_check_local_label
   3267     la_a0 &emt_body_pos
   3268     ld_t0,a0,0
   3269     la_a1 &macro_body_tokens_ptr
   3270     ld_a1,a1,0
   3271     sub_a0,t0,a1
   3272     shri_a0,a0,5               # token index, same indexing as the param_idx cache
   3273     la_a1 &macro_body_is_local_label_ptr
   3274     ld_a1,a1,0
   3275     add_a1,a1,a0
   3276     lb_a0,a1,0
   3277     la_br &emt_copy_literal
   3278     beqz_a0
   3279     # Cached flag is nonzero: fall through to rewrite.
   3280 
   3281 ## emt_rewrite_local_label: build "sigil + tail + __ + decimal(NN)" in
   3282 ## local_label_scratch, stash it into text_buf via append_text, and push
   3283 ## a TOK_WORD to expand_pool.
   3284 :emt_rewrite_local_label
   3285     # Stash body_tok text_ptr / text_len into BSS so they survive
   3286     # function calls (append_text is non-leaf via its arena bump).
   3287     la_a0 &emt_body_pos
   3288     ld_t0,a0,0
   3289     ld_a1,t0,8
   3290     la_a2 &ll_src_ptr
   3291     st_a1,a2,0
   3292     ld_a1,t0,16
   3293     la_a2 &ll_src_len
   3294     st_a1,a2,0
   3295 
   3296     # --- Convert emt_expansion_id to decimal, reverse-fill into
   3297     # --- local_label_digits[0..24). Write right-to-left starting at offset
   3298     # --- 23 so digits are adjacent at [cursor, local_label_digits+24).
   3299     la_a0 &emt_expansion_id
   3300     ld_t0,a0,0                 # t0 = id (mutated)
   3301     la_t1 &local_label_digits
   3302     li_a2 %24 %0
   3303     add_t1,t1,a2               # t1 = end (one past last slot)
   3304     mov_t2,t1                  # t2 = cursor (moves left)
   3305 
   3306     # Special-case id == 0 -> single '0' digit.
   3307     la_br &emt_rldg_loop
   3308     bnez_t0
   3309     addi_t2,t2,neg1
   3310     li_a0 %48 %0               # '0'
   3311     sb_a0,t2,0
   3312     la_br &emt_rldg_done
   3313     b
   3314 :emt_rldg_loop
   3315     la_br &emt_rldg_done
   3316     beqz_t0
   3317     # digit = id % 10
   3318     mov_a0,t0
   3319     li_a1 %10 %0
   3320     rem_a2,a0,a1               # a2 = id % 10
   3321     addi_a2,a2,48              # a2 = '0' + digit
   3322     addi_t2,t2,neg1
   3323     sb_a2,t2,0                 # *--cursor = digit
   3324     # id = id / 10
   3325     mov_a0,t0
   3326     li_a1 %10 %0
   3327     div_a0,a0,a1
   3328     mov_t0,a0
   3329     la_br &emt_rldg_loop
   3330     b
   3331 :emt_rldg_done
   3332     # digit_count = end - cursor
   3333     la_a1 &local_label_digits
   3334     li_a2 %24 %0
   3335     add_a1,a1,a2               # a1 = end
   3336     sub_a0,a1,t2               # a0 = digit_count
   3337     la_a1 &ll_digit_count
   3338     st_a0,a1,0
   3339     # Save cursor (start of digits) for the copy step.
   3340     la_a1 &ll_digit_cursor
   3341     st_t2,a1,0
   3342 
   3343     # --- Build final text in local_label_scratch ---
   3344     # Layout: [0]=sigil, [1..1+tail_len)=tail, then "__", then digits.
   3345     # tail_len = len - 2   (source byte [1] is skipped, see the copy below)
   3346 
   3347     # Write sigil (src_ptr[0]) to scratch[0].
   3348     la_a0 &ll_src_ptr
   3349     ld_a1,a0,0
   3350     lb_a2,a1,0
   3351     la_a3 &local_label_scratch_ptr
   3352     ld_a3,a3,0
   3353     sb_a2,a3,0
   3354 
   3355     # Copy tail: scratch[1..1+tail_len) <- src_ptr[2..2+tail_len).
   3356     la_a0 &ll_src_len
   3357     ld_a1,a0,0
   3358     li_a2 %2 %0
   3359     sub_t0,a1,a2               # t0 = tail_len = src_len - 2
   3360     la_a0 &ll_src_ptr
   3361     ld_a1,a0,0                 # a1 = src_ptr
   3362     addi_a1,a1,2               # a1 = src_ptr + 2 (tail start)
   3363     la_a2 &local_label_scratch_ptr
   3364     ld_a2,a2,0
   3365     addi_a2,a2,1               # a2 = scratch + 1 (dst tail start)
   3366     li_t1 %0 %0                # t1 = i
   3367 :emt_rlbuild_tail_loop
   3368     la_br &emt_rlbuild_tail_done
   3369     beq_t1,t0
   3370     add_a3,a1,t1
   3371     lb_a3,a3,0
   3372     add_t2,a2,t1
   3373     sb_a3,t2,0
   3374     addi_t1,t1,1
   3375     la_br &emt_rlbuild_tail_loop
   3376     b
   3377 :emt_rlbuild_tail_done
   3378     # Save tail_len for later offset math.
   3379     la_a0 &ll_tail_len
   3380     st_t0,a0,0
   3381 
   3382     # Write "__" at scratch[1+tail_len], scratch[2+tail_len].
   3383     la_a2 &local_label_scratch_ptr
   3384     ld_a2,a2,0
   3385     addi_a2,a2,1
   3386     add_a2,a2,t0               # a2 = &scratch[1+tail_len]
   3387     li_a3 %95 %0               # '_'
   3388     sb_a3,a2,0
   3389     addi_a2,a2,1
   3390     sb_a3,a2,0
   3391 
   3392     # Copy digits: scratch[3+tail_len..3+tail_len+digit_count) <- digit_cursor[0..digit_count).
   3393     la_a0 &ll_digit_count
   3394     ld_t1,a0,0                 # t1 = digit_count
   3395     la_a0 &ll_digit_cursor
   3396     ld_a1,a0,0                 # a1 = digit_cursor (src)
   3397     la_a0 &ll_tail_len
   3398     ld_t0,a0,0                 # t0 = tail_len
   3399     la_a2 &local_label_scratch_ptr
   3400     ld_a2,a2,0
   3401     addi_a2,a2,3
   3402     add_a2,a2,t0               # a2 = &scratch[3+tail_len] (dst)
   3403     li_t2 %0 %0                # t2 = i
   3404 :emt_rlbuild_digits_loop
   3405     la_br &emt_rlbuild_digits_done
   3406     beq_t2,t1
   3407     add_a3,a1,t2
   3408     lb_a3,a3,0
   3409     add_a0,a2,t2
   3410     sb_a3,a0,0
   3411     addi_t2,t2,1
   3412     la_br &emt_rlbuild_digits_loop
   3413     b
   3414 :emt_rlbuild_digits_done
   3415 
   3416     # total_len = 1 + tail_len + 2 + digit_count = 3 + tail_len + digit_count
   3417     la_a0 &ll_tail_len
   3418     ld_a1,a0,0
   3419     la_a0 &ll_digit_count
   3420     ld_a2,a0,0
   3421     add_a1,a1,a2
   3422     addi_a1,a1,3
   3423     la_a0 &ll_total_len
   3424     st_a1,a0,0
   3425 
   3426     # durable_ptr = append_text(local_label_scratch, total_len); a1 is expected to still hold total_len
   3427     la_a0 &local_label_scratch_ptr
   3428     ld_a0,a0,0
   3429     la_br &append_text
   3430     call
   3431     # a0 = durable_ptr (into text_buf)
   3432 
   3433     # Push { kind=TOK_WORD, text_ptr=durable_ptr, text_len=total_len, tight=0 } to expand_pool.
   3434     la_a1 &pool_used
   3435     ld_t0,a1,0
   3436     li_a2 M1PP_EXPAND_CAP
   3437     la_br &err_token_overflow
   3438     beq_t0,a2                  # pool already full -> fatal
   3439     la_a3 &expand_pool_ptr
   3440     ld_a3,a3,0
   3441     add_a3,a3,t0               # a3 = dst slot
   3442     # kind = TOK_WORD
   3443     li_a2 TOK_WORD
   3444     st_a2,a3,0
   3445     # text_ptr
   3446     st_a0,a3,8
   3447     # text_len
   3448     la_a0 &ll_total_len
   3449     ld_a2,a0,0
   3450     st_a2,a3,16
   3451     # tight = 0 (synthetic local-label token)
   3452     li_a2 %0 %0
   3453     st_a2,a3,24
   3454     # pool_used += 32
   3455     addi_t0,t0,32
   3456     la_a1 &pool_used
   3457     st_t0,a1,0
   3458 
   3459     # body_pos += 32
   3460     la_a0 &emt_body_pos
   3461     ld_t0,a0,0
   3462     addi_t0,t0,32
   3463     st_t0,a0,0
   3464     la_br &emt_loop
   3465     b
   3466 
   3467 :emt_copy_literal
   3468     # Append *body_pos to expand_pool. Check overflow.
   3469     la_a0 &pool_used
   3470     ld_t0,a0,0
   3471     li_a1 M1PP_EXPAND_CAP
   3472     la_br &err_token_overflow
   3473     beq_t0,a1                  # pool already full -> fatal
   3474     # dst = expand_pool + pool_used   (pool_used is a byte offset)
   3475     la_a2 &expand_pool_ptr
   3476     ld_a2,a2,0
   3477     add_a2,a2,t0
   3478     # src = body_pos
   3479     la_a0 &emt_body_pos
   3480     ld_a3,a0,0
   3481     # copy 32 bytes (4 x 8) — preserves tight at +24
   3482     ld_a0,a3,0
   3483     st_a0,a2,0
   3484     ld_a0,a3,8
   3485     st_a0,a2,8
   3486     ld_a0,a3,16
   3487     st_a0,a2,16
   3488     ld_a0,a3,24
   3489     st_a0,a2,24
   3490     # pool_used += 32
   3491     addi_t0,t0,32
   3492     la_a0 &pool_used
   3493     st_t0,a0,0
   3494     # body_pos += 32
   3495     addi_a3,a3,32
   3496     la_a0 &emt_body_pos
   3497     st_a3,a0,0
   3498     la_br &emt_loop
   3499     b
   3500 
   3501 :emt_do_substitute_paste
   3502     # Use the cached param_idx (set at the top of emt_loop) instead of
   3503     # re-running find_param.
   3504     la_a0 &emt_cached_param_idx
   3505     ld_a0,a0,0
   3506     addi_a0,a0,neg1            # 1-based param_idx -> 0-based arg index
   3507     shli_a0,a0,3               # * 8: byte offset into arg_starts / arg_ends
   3508     la_a1 &arg_starts_ptr
   3509     ld_a1,a1,0
   3510     add_a1,a1,a0
   3511     ld_t0,a1,0                 # t0 = arg_starts[idx]
   3512     la_a1 &arg_ends_ptr
   3513     ld_a1,a1,0
   3514     add_a1,a1,a0
   3515     ld_t1,a1,0                 # t1 = arg_ends[idx]
   3516     mov_a0,t0
   3517     mov_a1,t1
   3518     la_br &copy_paste_arg_to_pool
   3519     call
   3520     # body_pos += 32
   3521     la_a0 &emt_body_pos
   3522     ld_t0,a0,0
   3523     addi_t0,t0,32
   3524     st_t0,a0,0
   3525     la_br &emt_loop
   3526     b
   3527 
   3528 :emt_do_substitute_plain
   3529     # Use the cached param_idx (set at the top of emt_loop) instead of
   3530     # re-running find_param.
   3531     la_a0 &emt_cached_param_idx
   3532     ld_a0,a0,0
   3533     addi_a0,a0,neg1            # 1-based param_idx -> 0-based arg index
   3534     shli_a0,a0,3               # * 8: byte offset into arg_starts / arg_ends
   3535     la_a1 &arg_starts_ptr
   3536     ld_a1,a1,0
   3537     add_a1,a1,a0
   3538     ld_t0,a1,0                 # t0 = arg_starts[idx]
   3539     la_a1 &arg_ends_ptr
   3540     ld_a1,a1,0
   3541     add_a1,a1,a0
   3542     ld_t1,a1,0                 # t1 = arg_ends[idx]
   3543     mov_a0,t0
   3544     mov_a1,t1
   3545     la_br &copy_arg_tokens_to_pool
   3546     call
   3547     # body_pos += 32
   3548     la_a0 &emt_body_pos
   3549     ld_t0,a0,0
   3550     addi_t0,t0,32
   3551     st_t0,a0,0
   3552     la_br &emt_loop
   3553     b
   3554 
   3555 :emt_done
   3556     # Gate paste_pool_range(mark) on (macro->has_paste OR emt_saw_arg_paste).
   3557     # When neither side contains TOK_PASTE the pool sweep is wasted work — the
   3558     # whole point of has_paste / args_have_paste is to skip it for the common
   3559     # case of a `##`-free expansion.
   3560     la_a0 &emt_macro
   3561     ld_t0,a0,0
   3562     li_a1 M1PP_MACRO_HAS_PASTE_OFF
   3563     add_t0,t0,a1
   3564     ld_t0,t0,0                 # t0 = macro->has_paste
   3565     la_a0 &emt_saw_arg_paste
   3566     ld_t1,a0,0
   3567     or_t0,t0,t1
   3568     la_br &emt_done_skip_paste
   3569     beqz_t0
   3570     la_a0 &emt_mark
   3571     ld_a0,a0,0
   3572     la_br &paste_pool_range
   3573     call
   3574 
   3575 :emt_done_skip_paste
   3576     eret
   3577 
   3578 ## expand_call(a0=stream_ptr, a1=macro_ptr) -> void (fatal on bad call)
   3579 ## Calls expand_macro_tokens for the call at stream->pos, sets
   3580 ## stream->pos = emt_after_pos, stream->line_start = 0, and
   3581 ## push_pool_stream_from_mark(emt_mark) to rescan the expansion.
   3582 :expand_call
   3583     enter_8
   3584 
   3585     # spill stream_ptr to the local slot reserved by enter_8, addressed as sp+0.
   3586     # NOTE(review): an older note placed locals at sp+16 — confirm frame layout.
   3587     st_a0,sp,0
   3588 
   3589     # expand_macro_tokens(stream->pos, stream->end, macro)
   3590     # stream->pos at +16, stream->end at +8
   3591     ld_t0,a0,16
   3592     ld_t1,a0,8
   3593     mov_a2,a1
   3594     mov_a0,t0
   3595     mov_a1,t1
   3596     la_br &expand_macro_tokens
   3597     call
   3598 
   3599     # stream->pos = emt_after_pos
   3600     ld_a0,sp,0                 # a0 = stream_ptr (reloaded after the call)
   3601     la_a1 &emt_after_pos
   3602     ld_t0,a1,0
   3603     st_t0,a0,16
   3604 
   3605     # stream->line_start = 0   (field at +24)
   3606     li_t0 %0 %0
   3607     st_t0,a0,24
   3608 
   3609     # push_pool_stream_from_mark(emt_mark)
   3610     la_a0 &emt_mark
   3611     ld_a0,a0,0
   3612     la_br &push_pool_stream_from_mark
   3613     call
   3614 
   3615     eret
   3616 
   3617 ## ============================================================================
   3618 ## --- ## token paste compaction ----------------------------------------------
   3619 ## ============================================================================
   3620 
   3621 ## append_pasted_token(a0=dst_tok, a1=left_tok, a2=right_tok) -> void (fatal)
   3622 ## Concatenate left->text and right->text into paste_scratch, then call
   3623 ## append_text(paste_scratch, total_len) for stable storage in text_buf,
   3624 ## and write *dst = { TOK_WORD, text_ptr, total_len, tight=0 }. paste_scratch
   3625 ## is 256 bytes (M0's quoted-literal cap). Fatal err_text_overflow if combined
   3626 ## length exceeds 256 bytes; append_text handles its own text_buf overflow.
   3627 :append_pasted_token
   3628     enter_0
   3629 
   3630     # ---- Spill operands to BSS to survive append_text; text ptr/len are read now, so dst may alias left. ----
   3631     la_t0 &paste_dst_save
   3632     st_a0,t0,0
   3633     la_t0 &paste_left_ptr
   3634     ld_t1,a1,8                 # left->text_ptr
   3635     st_t1,t0,0
   3636     la_t0 &paste_left_len
   3637     ld_t1,a1,16                # left->text_len
   3638     st_t1,t0,0
   3639     la_t0 &paste_right_ptr
   3640     ld_t1,a2,8                 # right->text_ptr
   3641     st_t1,t0,0
   3642     la_t0 &paste_right_len
   3643     ld_t1,a2,16                # right->text_len
   3644     st_t1,t0,0
   3645 
   3646     # ---- total_len = left.len + right.len; fatal if > 256 ----
   3647     la_t0 &paste_left_len
   3648     ld_t1,t0,0
   3649     la_t0 &paste_right_len
   3650     ld_t2,t0,0
   3651     add_a0,t1,t2
   3652     li_a1 %256 %0
   3653     la_br &err_text_overflow
   3654     blt_a1,a0                  # 256 < total_len -> fatal
   3655     # save total_len for the append_text call below
   3656     la_t0 &paste_total_len
   3657     st_a0,t0,0
   3658 
   3659     # ---- Copy left bytes: paste_scratch[0..left.len) <- left.text_ptr ----
   3660     la_t0 &paste_left_ptr
   3661     ld_t0,t0,0                 # t0 = src
   3662     la_t1 &paste_left_len
   3663     ld_t1,t1,0                 # t1 = byte count
   3664     la_t2 &paste_scratch_ptr
   3665     ld_t2,t2,0                 # t2 = dst
   3666     li_a0 %0 %0                # a0 = i
   3667 :append_pasted_left_loop
   3668     la_br &append_pasted_left_done
   3669     beq_a0,t1
   3670     add_a1,t0,a0
   3671     lb_a1,a1,0
   3672     add_a2,t2,a0
   3673     sb_a1,a2,0
   3674     addi_a0,a0,1
   3675     la_br &append_pasted_left_loop
   3676     b
   3677 :append_pasted_left_done
   3678 
   3679     # ---- Copy right bytes: paste_scratch[left.len..total_len) <- right.text_ptr ----
   3680     la_t0 &paste_right_ptr
   3681     ld_t0,t0,0
   3682     la_t1 &paste_right_len
   3683     ld_t1,t1,0
   3684     la_t2 &paste_scratch_ptr
   3685     ld_t2,t2,0
   3686     la_a3 &paste_left_len
   3687     ld_a3,a3,0
   3688     add_t2,t2,a3              # t2 = &paste_scratch[left.len]
   3689     li_a0 %0 %0
   3690 :append_pasted_right_loop
   3691     la_br &append_pasted_right_done
   3692     beq_a0,t1
   3693     add_a1,t0,a0
   3694     lb_a1,a1,0
   3695     add_a2,t2,a0
   3696     sb_a1,a2,0
   3697     addi_a0,a0,1
   3698     la_br &append_pasted_right_loop
   3699     b
   3700 :append_pasted_right_done
   3701 
   3702     # ---- text_ptr = append_text(paste_scratch, total_len) ----
   3703     la_a0 &paste_scratch_ptr
   3704     ld_a0,a0,0
   3705     la_a1 &paste_total_len
   3706     ld_a1,a1,0
   3707     la_br &append_text
   3708     call
   3709     # a0 = text_ptr (returned)
   3710 
   3711     # ---- *dst = { TOK_WORD, text_ptr, total_len, tight=0 } ----
   3712     la_t0 &paste_dst_save
   3713     ld_t0,t0,0
   3714     li_a2 TOK_WORD
   3715     st_a2,t0,0                 # kind at +0
   3716     st_a0,t0,8                 # text_ptr at +8
   3717     la_a1 &paste_total_len
   3718     ld_a1,a1,0
   3719     st_a1,t0,16                # text_len at +16
   3720     li_a1 %0 %0
   3721     st_a1,t0,24                # tight at +24
   3722 
   3723     eret
   3724 
   3725 ## paste_pool_range(a0=mark) -> void (fatal on bad paste)
   3726 ## In-place compactor over expand_pool[mark..pool_used) (byte offsets). For each
   3727 ## TOK_PASTE, walk back from `out` over already-copied NEWLINE tokens to find
   3728 ## the left operand, walk forward from `in+1` over NEWLINE tokens to find the
   3729 ## right operand, then paste left+right into the left slot. The discarded
   3730 ## newlines on either side are dropped. Other tokens are copied forward.
   3731 ## pool_used is updated to the new end. Fatal if ## has no operand on a side,
   3732 ## or its operand is itself PASTE.
   3733 :paste_pool_range
   3734     enter_0
   3735 
   3736     # ---- start = expand_pool + mark ----
   3737     la_t0 &expand_pool_ptr
   3738     ld_t0,t0,0
   3739     add_t0,t0,a0
   3740     la_t1 &paste_start
   3741     st_t0,t1,0
   3742     # paste_in = start   (read cursor)
   3743     la_t1 &paste_in
   3744     st_t0,t1,0
   3745     # paste_out = start   (write cursor; never ahead of paste_in)
   3746     la_t1 &paste_out
   3747     st_t0,t1,0
   3748 
   3749     # ---- end = expand_pool + pool_used ----
   3750     la_t1 &pool_used
   3751     ld_t2,t1,0
   3752     la_t1 &expand_pool_ptr
   3753     ld_t1,t1,0
   3754     add_t2,t1,t2
   3755     la_t1 &paste_end
   3756     st_t2,t1,0
   3757 
   3758 :paste_pool_loop
   3759     # in = paste_in; end = paste_end; if (in == end) done
   3760     la_a0 &paste_in
   3761     ld_t0,a0,0
   3762     la_a1 &paste_end
   3763     ld_t1,a1,0
   3764     la_br &paste_pool_done
   3765     beq_t0,t1
   3766 
   3767     # kind = in->kind
   3768     ld_a2,t0,0
   3769     li_a3 TOK_PASTE
   3770     la_br &paste_pool_handle_paste
   3771     beq_a2,a3
   3772 
   3773     # ---- non-PASTE: copy *in to *out, advance both by 32 ----
   3774     la_a0 &paste_out
   3775     ld_t2,a0,0
   3776     # if (in == out) skip the copy (nothing has been compacted away yet)
   3777     la_br &paste_pool_skip_copy
   3778     beq_t0,t2
   3779     ld_a3,t0,0
   3780     st_a3,t2,0
   3781     ld_a3,t0,8
   3782     st_a3,t2,8
   3783     ld_a3,t0,16
   3784     st_a3,t2,16
   3785     ld_a3,t0,24
   3786     st_a3,t2,24
   3787 :paste_pool_skip_copy
   3788     addi_t0,t0,32
   3789     addi_t2,t2,32
   3790     la_a0 &paste_in
   3791     st_t0,a0,0
   3792     la_a0 &paste_out
   3793     st_t2,a0,0
   3794     la_br &paste_pool_loop
   3795     b
   3796 
   3797 :paste_pool_handle_paste
   3798     # ---- TOK_PASTE handling (t0 = in, pointing at the ## token) ----
   3799     # Find left operand: start at out, walk back over NEWLINE tokens, then
   3800     # step one more to land on the actual left operand. If we cannot step
   3801     # back past start, the ## has no left operand -> fatal.
   3802     la_a0 &paste_out
   3803     ld_t1,a0,0                   # t1 = left (initially = out)
   3804     la_a1 &paste_start
   3805     ld_t2,a1,0                   # t2 = start
   3806 :paste_pool_left_skip_nl
   3807     la_br &paste_pool_left_step
   3808     beq_t1,t2
   3809     ld_a2,t1,neg32               # a2 = (left - 1 token)->kind
   3810     li_a3 TOK_NEWLINE
   3811     la_br &paste_pool_left_step
   3812     bne_a2,a3
   3813     addi_t1,t1,neg32
   3814     la_br &paste_pool_left_skip_nl
   3815     b
   3816 :paste_pool_left_step
   3817     # left == start? then ## is first (fatal).
   3818     la_br &err_bad_macro_header
   3819     beq_t1,t2
   3820     addi_t1,t1,neg32             # left now points at the operand
   3821     # Validate left->kind != TOK_PASTE.
   3822     ld_a2,t1,0
   3823     li_a3 TOK_PASTE
   3824     la_br &err_bad_macro_header
   3825     beq_a2,a3
   3826 
   3827     # Find right operand: start at in+1, walk forward over NEWLINE tokens.
   3828     # If we run out of tokens, ## is last (fatal). If right is PASTE, fatal.
   3829     addi_t0,t0,32                # t0 = right (initially = in + 1)
   3830     la_a1 &paste_end
   3831     ld_a2,a1,0                   # a2 = end
   3832 :paste_pool_right_skip_nl
   3833     la_br &err_bad_macro_header
   3834     beq_t0,a2
   3835     ld_a3,t0,0                   # a3 = right->kind (reused by the PASTE check below)
   3836     li_a1 TOK_NEWLINE
   3837     la_br &paste_pool_right_step
   3838     bne_a3,a1
   3839     addi_t0,t0,32
   3840     la_br &paste_pool_right_skip_nl
   3841     b
   3842 :paste_pool_right_step
   3843     # Validate right->kind != TOK_PASTE.
   3844     li_a1 TOK_PASTE
   3845     la_br &err_bad_macro_header
   3846     beq_a3,a1
   3847 
   3848     # Pre-publish out = left + 1 and in = right + 1, since the call below
   3849     # clobbers the t* / a* registers we used to track them.
   3850     addi_a3,t1,32
   3851     la_a1 &paste_out
   3852     st_a3,a1,0
   3853     addi_a3,t0,32
   3854     la_a1 &paste_in
   3855     st_a3,a1,0
   3856 
   3857     # append_pasted_token(left, left, right)   (result overwrites the left slot)
   3858     mov_a0,t1
   3859     mov_a1,t1
   3860     mov_a2,t0
   3861     la_br &append_pasted_token
   3862     call
   3863 
   3864     la_br &paste_pool_loop
   3865     b
   3866 
   3867 :paste_pool_done
   3868     # pool_used = (out - expand_pool)   (byte offset of the compacted end)
   3869     la_a0 &paste_out
   3870     ld_t0,a0,0
   3871     la_a1 &expand_pool_ptr
   3872     ld_a1,a1,0
   3873     sub_t0,t0,a1
   3874     la_a1 &pool_used
   3875     st_t0,a1,0
   3876     eret
   3877 
   3878 ## ============================================================================
   3879 ## --- Integer atoms + S-expression evaluator ---------------------------------
   3880 ## ============================================================================
   3881 
   3882 ## parse_int_token(a0=tok) -> a0 = i64 (fatal on bad). Leaf.
   3883 ## Accepts decimal or 0x/0X-prefixed hex; either form may carry one leading
   3884 ## '-' (the sign is consumed before the prefix check). Digits accumulate
   3885 ## with no overflow check, so out-of-range input wraps. Every rejection
   3886 ## (non-WORD, empty text, bare '-', bare "0x", bad digit) -> err_bad_macro_header.
   3887 ## Register usage (leaf, no calls):
   3888 ##   t0 = tok->kind during the kind check, then src ptr (cursor into text)
   3889 ##   t1 = TOK_WORD, then len, then end ptr (text + len)
   3890 ##   t2 = current byte
   3891 ##   a0 = tok on entry, then accumulator (return)
   3892 ##   a1 = negative flag (0/1)
   3893 ##   a2 = scratch (bytes remaining, second prefix byte, digit value)
   3894 ##   a3 = scratch (compare value, decimal multiplier 10)
   3895 :parse_int_token
   3896     # if (tok->kind != TOK_WORD) fatal
   3897     ld_t0,a0,0                   # t0 = tok->kind
   3898     li_t1 TOK_WORD
   3899     la_br &err_bad_macro_header
   3900     bne_t0,t1
   3901 
   3902     # src = tok->text_ptr; len = tok->text_len; end = src + len
   3903     ld_t0,a0,8                   # t0 = src
   3904     ld_t1,a0,16                  # t1 = len
   3905 
   3906     # if (len == 0) fatal; end is computed only after the check
   3907     la_br &err_bad_macro_header
   3908     beqz_t1
   3909     add_t1,t0,t1                 # t1 = end
   3910 
   3911     # negative = 0
   3912     li_a1 %0 %0
   3913 
   3914     # if (*src == '-') { negative = 1; src++; if (src == end) fatal }
   3915     lb_t2,t0,0
   3916     li_a3 %45 %0                 # '-'
   3917     la_br &pit_after_sign
   3918     bne_t2,a3
   3919     li_a1 %1 %0
   3920     addi_t0,t0,1
   3921     la_br &err_bad_macro_header
   3922     beq_t0,t1
   3923 
   3924 :pit_after_sign
   3925     # accumulator = 0 (a0 no longer holds tok from here on)
   3926     li_a0 %0 %0
   3927 
   3928     # check for 0x / 0X prefix: need at least 2 chars left
   3929     mov_a2,t1
   3930     sub_a2,a2,t0                 # a2 = end - src (bytes remaining)
   3931     li_a3 %2 %0
   3932     la_br &pit_decimal
   3933     blt_a2,a3
   3934 
   3935     # if (src[0] == '0' && (src[1] == 'x' || src[1] == 'X')) -> hex
   3936     lb_t2,t0,0
   3937     li_a3 %48 %0                 # '0'
   3938     la_br &pit_decimal
   3939     bne_t2,a3
   3940     addi_a2,t0,1
   3941     lb_a2,a2,0                   # a2 = src[1]
   3942     li_a3 %120 %0                # 'x'
   3943     la_br &pit_hex_start
   3944     beq_a2,a3
   3945     li_a3 %88 %0                 # 'X'
   3946     la_br &pit_hex_start
   3947     beq_a2,a3
   3948     la_br &pit_decimal
   3949     b
   3950 
   3951 :pit_hex_start
   3952     # consume "0x"; require at least one hex digit after
   3953     addi_t0,t0,2
   3954     la_br &err_bad_macro_header
   3955     beq_t0,t1
   3956 :pit_hex_loop
   3957     la_br &pit_finish
   3958     beq_t0,t1                    # src == end: all digits consumed
   3959     lb_t2,t0,0
   3960 
   3961     # 0..9
   3962     li_a3 %48 %0                 # '0'
   3963     la_br &pit_hex_check_lower
   3964     blt_t2,a3
   3965     li_a3 %57 %0                 # '9'
   3966     la_br &pit_hex_check_lower
   3967     blt_a3,t2                    # '9' < c
   3968     # digit = c - '0'
   3969     addi_a2,t2,neg48
   3970     la_br &pit_hex_accum
   3971     b
   3972 
   3973 :pit_hex_check_lower
   3974     # 'a'..'f'
   3975     li_a3 %97 %0                 # 'a'
   3976     la_br &pit_hex_check_upper
   3977     blt_t2,a3
   3978     li_a3 %102 %0                # 'f'
   3979     la_br &pit_hex_check_upper
   3980     blt_a3,t2                    # 'f' < c
   3981     # digit = (c - 'a') + 10
   3982     li_a3 %97 %0
   3983     sub_a2,t2,a3
   3984     addi_a2,a2,8                 # +10 is applied as +8 then +2
   3985     addi_a2,a2,2                 # NOTE(review): presumably 10 is not an available addi immediate -- confirm
   3986     la_br &pit_hex_accum
   3987     b
   3988 
   3989 :pit_hex_check_upper
   3990     # 'A'..'F'; anything outside this last range is fatal
   3991     li_a3 %65 %0                 # 'A'
   3992     la_br &err_bad_macro_header
   3993     blt_t2,a3
   3994     li_a3 %70 %0                 # 'F'
   3995     la_br &err_bad_macro_header
   3996     blt_a3,t2                    # 'F' < c
   3997     # digit = (c - 'A') + 10
   3998     li_a3 %65 %0
   3999     sub_a2,t2,a3
   4000     addi_a2,a2,8                 # +10 as +8 then +2, as in the lower-case path
   4001     addi_a2,a2,2
   4002 
   4003 :pit_hex_accum
   4004     # accum = (accum << 4) | digit
   4005     shli_a0,a0,4
   4006     or_a0,a0,a2
   4007     addi_t0,t0,1
   4008     la_br &pit_hex_loop
   4009     b
   4010 
   4011 :pit_decimal
   4012     # decimal loop: accum = accum * 10 + digit
   4013     # (the entry code above left at least one byte; each byte is validated here)
   4014 :pit_decimal_loop
   4015     la_br &pit_finish
   4016     beq_t0,t1                    # src == end: all digits consumed
   4017     lb_t2,t0,0
   4018     li_a3 %48 %0                 # '0'
   4019     la_br &err_bad_macro_header
   4020     blt_t2,a3
   4021     li_a3 %57 %0                 # '9'
   4022     la_br &err_bad_macro_header
   4023     blt_a3,t2                    # '9' < c
   4024     # accum = accum * 10
   4025     li_a3 %10 %0
   4026     mul_a0,a0,a3
   4027     # digit = c - '0'; accum += digit
   4028     addi_a2,t2,neg48
   4029     add_a0,a0,a2
   4030     addi_t0,t0,1
   4031     la_br &pit_decimal_loop
   4032     b
   4033 
   4034 :pit_finish
   4035     # if (negative) accum = 0 - accum
   4036     la_br &pit_done
   4037     beqz_a1
   4038     li_a3 %0 %0
   4039     sub_a0,a3,a0
   4040 :pit_done
   4041     ret
   4042 
   4043 ## expr_op_code(a0=tok) -> a0 = EXPR_ADD..EXPR_STRLEN, or EXPR_INVALID.
   4044 ## Accepts operator tokens: +  -  *  /  %  <<  >>  &  |  ^  ~  =  !=
   4045 ## <  <=  >  >=  strlen. Non-WORD tok or unknown operator -> EXPR_INVALID.
   4046 ## No fatal path in this routine; unmatched input falls through to eoc_invalid.
   4047 ## tok_eq_const is a leaf but clobbers a0..a3,t0..t2; spill tok to eoc_tok
   4048 ## once, reload before each compare. Needs an enter_0 frame because it
   4049 ## issues `call` instructions (aarch64 CALL writes LR).
   4050 :expr_op_code
   4051     enter_0
   4052     # spill tok; reject non-WORD up front
   4053     la_a1 &eoc_tok
   4054     st_a0,a1,0
   4055     ld_t0,a0,0                   # t0 = tok->kind
   4056     li_t1 TOK_WORD
   4057     la_br &eoc_invalid
   4058     bne_t0,t1
   4059 
   4060     # "+" -> EXPR_ADD  (every stanza below has this same shape)
   4061     la_a0 &eoc_tok
   4062     ld_a0,a0,0                   # a0 = tok, reloaded from the spill slot
   4063     la_a1 &op_plus               # a1 = operator text to compare against
   4064     li_a2 %1 %0                  # a2 = 1; presumably the operator's byte length -- TODO confirm against tok_eq_const
   4065     la_br &tok_eq_const
   4066     call
   4067     la_br &eoc_add
   4068     bnez_a0                      # nonzero result = match
   4069 
   4070     # "-" -> EXPR_SUB
   4071     la_a0 &eoc_tok
   4072     ld_a0,a0,0
   4073     la_a1 &op_minus
   4074     li_a2 %1 %0
   4075     la_br &tok_eq_const
   4076     call
   4077     la_br &eoc_sub
   4078     bnez_a0
   4079 
   4080     # "*" -> EXPR_MUL
   4081     la_a0 &eoc_tok
   4082     ld_a0,a0,0
   4083     la_a1 &op_star
   4084     li_a2 %1 %0
   4085     la_br &tok_eq_const
   4086     call
   4087     la_br &eoc_mul
   4088     bnez_a0
   4089 
   4090     # "/" -> EXPR_DIV
   4091     la_a0 &eoc_tok
   4092     ld_a0,a0,0
   4093     la_a1 &op_slash
   4094     li_a2 %1 %0
   4095     la_br &tok_eq_const
   4096     call
   4097     la_br &eoc_div
   4098     bnez_a0
   4099 
   4100     # "%" -> EXPR_MOD
   4101     la_a0 &eoc_tok
   4102     ld_a0,a0,0
   4103     la_a1 &op_percent
   4104     li_a2 %1 %0
   4105     la_br &tok_eq_const
   4106     call
   4107     la_br &eoc_mod
   4108     bnez_a0
   4109 
   4110     # "<<" -> EXPR_SHL
   4111     la_a0 &eoc_tok
   4112     ld_a0,a0,0
   4113     la_a1 &op_shl
   4114     li_a2 %2 %0
   4115     la_br &tok_eq_const
   4116     call
   4117     la_br &eoc_shl
   4118     bnez_a0
   4119 
   4120     # ">>" -> EXPR_SHR
   4121     la_a0 &eoc_tok
   4122     ld_a0,a0,0
   4123     la_a1 &op_shr
   4124     li_a2 %2 %0
   4125     la_br &tok_eq_const
   4126     call
   4127     la_br &eoc_shr
   4128     bnez_a0
   4129 
   4130     # "&" -> EXPR_AND
   4131     la_a0 &eoc_tok
   4132     ld_a0,a0,0
   4133     la_a1 &op_amp
   4134     li_a2 %1 %0
   4135     la_br &tok_eq_const
   4136     call
   4137     la_br &eoc_and
   4138     bnez_a0
   4139 
   4140     # "|" -> EXPR_OR
   4141     la_a0 &eoc_tok
   4142     ld_a0,a0,0
   4143     la_a1 &op_bar
   4144     li_a2 %1 %0
   4145     la_br &tok_eq_const
   4146     call
   4147     la_br &eoc_or
   4148     bnez_a0
   4149 
   4150     # "^" -> EXPR_XOR
   4151     la_a0 &eoc_tok
   4152     ld_a0,a0,0
   4153     la_a1 &op_caret
   4154     li_a2 %1 %0
   4155     la_br &tok_eq_const
   4156     call
   4157     la_br &eoc_xor
   4158     bnez_a0
   4159 
   4160     # "~" -> EXPR_NOT
   4161     la_a0 &eoc_tok
   4162     ld_a0,a0,0
   4163     la_a1 &op_tilde
   4164     li_a2 %1 %0
   4165     la_br &tok_eq_const
   4166     call
   4167     la_br &eoc_not
   4168     bnez_a0
   4169 
   4170     # "=" -> EXPR_EQ
   4171     la_a0 &eoc_tok
   4172     ld_a0,a0,0
   4173     la_a1 &op_eq
   4174     li_a2 %1 %0
   4175     la_br &tok_eq_const
   4176     call
   4177     la_br &eoc_eq
   4178     bnez_a0
   4179 
   4180     # "!=" -> EXPR_NE
   4181     la_a0 &eoc_tok
   4182     ld_a0,a0,0
   4183     la_a1 &op_ne
   4184     li_a2 %2 %0
   4185     la_br &tok_eq_const
   4186     call
   4187     la_br &eoc_ne
   4188     bnez_a0
   4189 
   4190     # "<=" -> EXPR_LE (check before single "<")
   4191     la_a0 &eoc_tok
   4192     ld_a0,a0,0
   4193     la_a1 &op_le
   4194     li_a2 %2 %0
   4195     la_br &tok_eq_const
   4196     call
   4197     la_br &eoc_le
   4198     bnez_a0
   4199 
   4200     # "<" -> EXPR_LT
   4201     la_a0 &eoc_tok
   4202     ld_a0,a0,0
   4203     la_a1 &op_lt
   4204     li_a2 %1 %0
   4205     la_br &tok_eq_const
   4206     call
   4207     la_br &eoc_lt
   4208     bnez_a0
   4209 
   4210     # ">=" -> EXPR_GE (check before single ">")
   4211     la_a0 &eoc_tok
   4212     ld_a0,a0,0
   4213     la_a1 &op_ge
   4214     li_a2 %2 %0
   4215     la_br &tok_eq_const
   4216     call
   4217     la_br &eoc_ge
   4218     bnez_a0
   4219 
   4220     # ">" -> EXPR_GT
   4221     la_a0 &eoc_tok
   4222     ld_a0,a0,0
   4223     la_a1 &op_gt
   4224     li_a2 %1 %0
   4225     la_br &tok_eq_const
   4226     call
   4227     la_br &eoc_gt
   4228     bnez_a0
   4229 
   4230     # "strlen" -> EXPR_STRLEN; no match here falls through to eoc_invalid
   4231     la_a0 &eoc_tok
   4232     ld_a0,a0,0
   4233     la_a1 &op_strlen
   4234     li_a2 %6 %0
   4235     la_br &tok_eq_const
   4236     call
   4237     la_br &eoc_strlen
   4238     bnez_a0
   4239 
   4240 :eoc_invalid
   4241     li_a0 EXPR_INVALID           # non-WORD token, or no operator matched
   4242     eret
   4243 :eoc_add
   4244     li_a0 EXPR_ADD               # each stub below: load the code, leave the frame
   4245     eret
   4246 :eoc_sub
   4247     li_a0 EXPR_SUB
   4248     eret
   4249 :eoc_mul
   4250     li_a0 EXPR_MUL
   4251     eret
   4252 :eoc_div
   4253     li_a0 EXPR_DIV
   4254     eret
   4255 :eoc_mod
   4256     li_a0 EXPR_MOD
   4257     eret
   4258 :eoc_shl
   4259     li_a0 EXPR_SHL
   4260     eret
   4261 :eoc_shr
   4262     li_a0 EXPR_SHR
   4263     eret
   4264 :eoc_and
   4265     li_a0 EXPR_AND
   4266     eret
   4267 :eoc_or
   4268     li_a0 EXPR_OR
   4269     eret
   4270 :eoc_xor
   4271     li_a0 EXPR_XOR
   4272     eret
   4273 :eoc_not
   4274     li_a0 EXPR_NOT
   4275     eret
   4276 :eoc_eq
   4277     li_a0 EXPR_EQ
   4278     eret
   4279 :eoc_ne
   4280     li_a0 EXPR_NE
   4281     eret
   4282 :eoc_lt
   4283     li_a0 EXPR_LT
   4284     eret
   4285 :eoc_le
   4286     li_a0 EXPR_LE
   4287     eret
   4288 :eoc_gt
   4289     li_a0 EXPR_GT
   4290     eret
   4291 :eoc_ge
   4292     li_a0 EXPR_GE
   4293     eret
   4294 :eoc_strlen
   4295     li_a0 EXPR_STRLEN
   4296     eret
   4297 
   4298 ## apply_expr_op(a0=op_code, a1=args_ptr, a2=argc) -> a0 = i64 result
   4299 ## Reduce args[0..argc) per op:
   4300 ##   + * & | ^      variadic, argc >= 1
   4301 ##   -              argc >= 1 (argc == 1 is negate, else left-assoc subtract)
   4302 ##   / %            binary, div-by-zero fatal
   4303 ##   << >>          binary (>> is arithmetic)
   4304 ##   ~              unary
   4305 ##   = != < <= > >=  binary (signed compares), result is 0 or 1
   4306 ## Fatal on wrong argc, EXPR_INVALID, or any op code with no case below
   4307 ## (EXPR_STRLEN is not dispatched here, so it also takes the fatal path).
   4308 ## Calls aeo_require_* helpers via `call`, so it needs a frame.
   4309 ## State held in BSS scratch (aeo_op/args/argc/acc/i) since loops trash registers.
   4310 :apply_expr_op
   4311     enter_0
   4312     # spill op, args, argc to BSS
   4313     la_a3 &aeo_op
   4314     st_a0,a3,0
   4315     la_a3 &aeo_args
   4316     st_a1,a3,0
   4317     la_a3 &aeo_argc
   4318     st_a2,a3,0
   4319 
   4320     # dispatch: compare op against each EXPR_* and branch to its handler
   4321     li_t0 EXPR_ADD
   4322     la_br &aeo_do_add
   4323     beq_a0,t0                    # a0 still holds op_code; the spills above only used a3
   4324     li_t0 EXPR_SUB
   4325     la_br &aeo_do_sub
   4326     beq_a0,t0
   4327     li_t0 EXPR_MUL
   4328     la_br &aeo_do_mul
   4329     beq_a0,t0
   4330     li_t0 EXPR_DIV
   4331     la_br &aeo_do_div
   4332     beq_a0,t0
   4333     li_t0 EXPR_MOD
   4334     la_br &aeo_do_mod
   4335     beq_a0,t0
   4336     li_t0 EXPR_SHL
   4337     la_br &aeo_do_shl
   4338     beq_a0,t0
   4339     li_t0 EXPR_SHR
   4340     la_br &aeo_do_shr
   4341     beq_a0,t0
   4342     li_t0 EXPR_AND
   4343     la_br &aeo_do_and
   4344     beq_a0,t0
   4345     li_t0 EXPR_OR
   4346     la_br &aeo_do_or
   4347     beq_a0,t0
   4348     li_t0 EXPR_XOR
   4349     la_br &aeo_do_xor
   4350     beq_a0,t0
   4351     li_t0 EXPR_NOT
   4352     la_br &aeo_do_not
   4353     beq_a0,t0
   4354     li_t0 EXPR_EQ
   4355     la_br &aeo_do_eq
   4356     beq_a0,t0
   4357     li_t0 EXPR_NE
   4358     la_br &aeo_do_ne
   4359     beq_a0,t0
   4360     li_t0 EXPR_LT
   4361     la_br &aeo_do_lt
   4362     beq_a0,t0
   4363     li_t0 EXPR_LE
   4364     la_br &aeo_do_le
   4365     beq_a0,t0
   4366     li_t0 EXPR_GT
   4367     la_br &aeo_do_gt
   4368     beq_a0,t0
   4369     li_t0 EXPR_GE
   4370     la_br &aeo_do_ge
   4371     beq_a0,t0
   4372     # EXPR_INVALID or unknown (including EXPR_STRLEN): fatal
   4373     la_br &err_bad_macro_header
   4374     b
   4375 
   4376 ## --- shared helpers for variadic folds ----------------------------------
   4377 ## aeo_require_argc_ge1: branch to err if argc < 1
   4378 ## aeo_require_argc_eq2: branch to err if argc != 2
   4379 ## aeo_load_arg0_to_acc: acc = args[0]; i = 1
   4380 
   4381 :aeo_do_add
   4382     la_br &aeo_require_argc_ge1  # EXPR_ADD handler: acc = args[0] + args[1] + ...
   4383     call
   4384     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4385     call
   4386 :aeo_add_loop
   4387     la_a0 &aeo_i
   4388     ld_t0,a0,0                   # t0 = i
   4389     la_a1 &aeo_argc
   4390     ld_t1,a1,0                   # t1 = argc
   4391     la_br &aeo_finish
   4392     beq_t0,t1                    # i == argc: fold complete
   4393     # acc += args[i]
   4394     la_a0 &aeo_args
   4395     ld_a1,a0,0                   # a1 = args base pointer
   4396     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4397     add_t2,a1,t2
   4398     ld_a2,t2,0                   # a2 = args[i]
   4399     la_a0 &aeo_acc
   4400     ld_a3,a0,0
   4401     add_a3,a3,a2
   4402     st_a3,a0,0
   4403     addi_t0,t0,1
   4404     la_a1 &aeo_i
   4405     st_t0,a1,0                   # i = i + 1, written back to aeo_i
   4406     la_br &aeo_add_loop
   4407     b
   4408 
   4409 :aeo_do_sub
   4410     la_br &aeo_require_argc_ge1  # EXPR_SUB handler: negate (argc == 1) or left-assoc subtract
   4411     call
   4412     # if (argc == 1) acc = -args[0]; else acc = args[0]
   4413     la_a0 &aeo_argc
   4414     ld_t0,a0,0
   4415     li_t1 %1 %0
   4416     la_br &aeo_sub_unary
   4417     beq_t0,t1
   4418     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4419     call
   4420 :aeo_sub_loop
   4421     la_a0 &aeo_i
   4422     ld_t0,a0,0                   # t0 = i
   4423     la_a1 &aeo_argc
   4424     ld_t1,a1,0                   # t1 = argc
   4425     la_br &aeo_finish
   4426     beq_t0,t1                    # i == argc: fold complete
   4427     la_a0 &aeo_args
   4428     ld_a1,a0,0
   4429     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4430     add_t2,a1,t2
   4431     ld_a2,t2,0                   # a2 = args[i]
   4432     la_a0 &aeo_acc
   4433     ld_a3,a0,0
   4434     sub_a3,a3,a2                 # acc -= args[i]
   4435     st_a3,a0,0
   4436     addi_t0,t0,1
   4437     la_a1 &aeo_i
   4438     st_t0,a1,0                   # i = i + 1
   4439     la_br &aeo_sub_loop
   4440     b
   4441 :aeo_sub_unary
   4442     # acc = 0 - args[0]
   4443     la_a0 &aeo_args
   4444     ld_a1,a0,0
   4445     ld_a2,a1,0                   # a2 = args[0]
   4446     li_a3 %0 %0
   4447     sub_a3,a3,a2
   4448     la_a0 &aeo_acc
   4449     st_a3,a0,0
   4450     la_br &aeo_finish
   4451     b
   4452 
   4453 :aeo_do_mul
   4454     la_br &aeo_require_argc_ge1  # EXPR_MUL handler: acc = args[0] * args[1] * ...
   4455     call
   4456     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4457     call
   4458 :aeo_mul_loop
   4459     la_a0 &aeo_i
   4460     ld_t0,a0,0                   # t0 = i
   4461     la_a1 &aeo_argc
   4462     ld_t1,a1,0                   # t1 = argc
   4463     la_br &aeo_finish
   4464     beq_t0,t1                    # i == argc: fold complete
   4465     la_a0 &aeo_args
   4466     ld_a1,a0,0
   4467     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4468     add_t2,a1,t2
   4469     ld_a2,t2,0                   # a2 = args[i]
   4470     la_a0 &aeo_acc
   4471     ld_a3,a0,0
   4472     mul_a3,a3,a2                 # acc *= args[i]
   4473     st_a3,a0,0
   4474     addi_t0,t0,1
   4475     la_a1 &aeo_i
   4476     st_t0,a1,0                   # i = i + 1
   4477     la_br &aeo_mul_loop
   4478     b
   4479 
   4480 :aeo_do_div
   4481     la_br &aeo_require_argc_eq2  # EXPR_DIV handler: acc = args[0] / args[1], exactly 2 args
   4482     call
   4483     la_a0 &aeo_args
   4484     ld_a1,a0,0
   4485     ld_a2,a1,0                   # a2 = args[0] (dividend)
   4486     ld_a3,a1,8                   # a3 = args[1] (divisor)
   4487     # if (args[1] == 0) fatal
   4488     la_br &err_bad_macro_header
   4489     beqz_a3
   4490     div_a2,a2,a3                 # NOTE(review): INT64_MIN / -1 is not guarded here -- confirm P1 div semantics
   4491     la_a0 &aeo_acc
   4492     st_a2,a0,0
   4493     la_br &aeo_finish
   4494     b
   4495 
   4496 :aeo_do_mod
   4497     la_br &aeo_require_argc_eq2  # EXPR_MOD handler: acc = args[0] rem args[1], exactly 2 args
   4498     call
   4499     la_a0 &aeo_args
   4500     ld_a1,a0,0
   4501     ld_a2,a1,0                   # a2 = args[0]
   4502     ld_a3,a1,8                   # a3 = args[1]
   4503     la_br &err_bad_macro_header
   4504     beqz_a3                      # args[1] == 0 -> fatal
   4505     rem_a2,a2,a3                 # NOTE(review): INT64_MIN rem -1 is not guarded here -- confirm P1 rem semantics
   4506     la_a0 &aeo_acc
   4507     st_a2,a0,0
   4508     la_br &aeo_finish
   4509     b
   4510 
   4511 :aeo_do_shl
   4512     la_br &aeo_require_argc_eq2  # EXPR_SHL handler: acc = args[0] << args[1], exactly 2 args
   4513     call
   4514     la_a0 &aeo_args
   4515     ld_a1,a0,0
   4516     ld_a2,a1,0                   # a2 = args[0] (value)
   4517     ld_a3,a1,8                   # a3 = args[1] (shift count)
   4518     shl_a2,a2,a3                 # NOTE(review): shift count is not range-checked here
   4519     la_a0 &aeo_acc
   4520     st_a2,a0,0
   4521     la_br &aeo_finish
   4522     b
   4523 
   4524 :aeo_do_shr
   4525     la_br &aeo_require_argc_eq2  # EXPR_SHR handler: acc = args[0] >> args[1], exactly 2 args
   4526     call
   4527     la_a0 &aeo_args
   4528     ld_a1,a0,0
   4529     ld_a2,a1,0                   # a2 = args[0] (value)
   4530     ld_a3,a1,8                   # a3 = args[1] (shift count)
   4531     sar_a2,a2,a3                 # sar, not a logical shift; count is not range-checked here
   4532     la_a0 &aeo_acc
   4533     st_a2,a0,0
   4534     la_br &aeo_finish
   4535     b
   4536 
   4537 :aeo_do_and
   4538     la_br &aeo_require_argc_ge1  # EXPR_AND handler: acc = args[0] & args[1] & ...
   4539     call
   4540     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4541     call
   4542 :aeo_and_loop
   4543     la_a0 &aeo_i
   4544     ld_t0,a0,0                   # t0 = i
   4545     la_a1 &aeo_argc
   4546     ld_t1,a1,0                   # t1 = argc
   4547     la_br &aeo_finish
   4548     beq_t0,t1                    # i == argc: fold complete
   4549     la_a0 &aeo_args
   4550     ld_a1,a0,0
   4551     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4552     add_t2,a1,t2
   4553     ld_a2,t2,0                   # a2 = args[i]
   4554     la_a0 &aeo_acc
   4555     ld_a3,a0,0
   4556     and_a3,a3,a2                 # acc &= args[i]
   4557     st_a3,a0,0
   4558     addi_t0,t0,1
   4559     la_a1 &aeo_i
   4560     st_t0,a1,0                   # i = i + 1
   4561     la_br &aeo_and_loop
   4562     b
   4563 
   4564 :aeo_do_or
   4565     la_br &aeo_require_argc_ge1  # EXPR_OR handler: acc = args[0] | args[1] | ...
   4566     call
   4567     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4568     call
   4569 :aeo_or_loop
   4570     la_a0 &aeo_i
   4571     ld_t0,a0,0                   # t0 = i
   4572     la_a1 &aeo_argc
   4573     ld_t1,a1,0                   # t1 = argc
   4574     la_br &aeo_finish
   4575     beq_t0,t1                    # i == argc: fold complete
   4576     la_a0 &aeo_args
   4577     ld_a1,a0,0
   4578     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4579     add_t2,a1,t2
   4580     ld_a2,t2,0                   # a2 = args[i]
   4581     la_a0 &aeo_acc
   4582     ld_a3,a0,0
   4583     or_a3,a3,a2                  # acc |= args[i]
   4584     st_a3,a0,0
   4585     addi_t0,t0,1
   4586     la_a1 &aeo_i
   4587     st_t0,a1,0                   # i = i + 1
   4588     la_br &aeo_or_loop
   4589     b
   4590 
   4591 :aeo_do_xor
   4592     la_br &aeo_require_argc_ge1  # EXPR_XOR handler: acc = args[0] ^ args[1] ^ ...
   4593     call
   4594     la_br &aeo_load_arg0_to_acc  # acc = args[0]; i = 1
   4595     call
   4596 :aeo_xor_loop
   4597     la_a0 &aeo_i
   4598     ld_t0,a0,0                   # t0 = i
   4599     la_a1 &aeo_argc
   4600     ld_t1,a1,0                   # t1 = argc
   4601     la_br &aeo_finish
   4602     beq_t0,t1                    # i == argc: fold complete
   4603     la_a0 &aeo_args
   4604     ld_a1,a0,0
   4605     shli_t2,t0,3                 # t2 = i * 8 (8-byte slots)
   4606     add_t2,a1,t2
   4607     ld_a2,t2,0                   # a2 = args[i]
   4608     la_a0 &aeo_acc
   4609     ld_a3,a0,0
   4610     xor_a3,a3,a2                 # acc ^= args[i]
   4611     st_a3,a0,0
   4612     addi_t0,t0,1
   4613     la_a1 &aeo_i
   4614     st_t0,a1,0                   # i = i + 1
   4615     la_br &aeo_xor_loop
   4616     b
   4617 
   4618 :aeo_do_not
   4619     # EXPR_NOT handler: require argc == 1 (checked inline, not via a helper)
   4620     la_a0 &aeo_argc
   4621     ld_t0,a0,0
   4622     li_t1 %1 %0
   4623     la_br &err_bad_macro_header
   4624     bne_t0,t1
   4625     la_a0 &aeo_args
   4626     ld_a1,a0,0
   4627     ld_a2,a1,0                   # a2 = args[0]
   4628     # ~x = x XOR -1
   4629     li_a3 %1 %0
   4630     li_t0 %0 %0
   4631     sub_a3,t0,a3                 # a3 = 0 - 1 = -1 (all bits set)
   4632     xor_a2,a2,a3
   4633     la_a0 &aeo_acc
   4634     st_a2,a0,0
   4635     la_br &aeo_finish
   4636     b
   4637 
   4638 ## --- comparison ops: return 0 or 1 ----------------------------------------
   4639 ## EQ:  args[0] == args[1]
   4640 ## NE:  args[0] != args[1]
   4641 ## LT:  args[0] <  args[1]   (signed)
   4642 ## LE:  args[0] <= args[1]   (signed)
   4643 ## GT:  args[0] >  args[1]   (signed)
   4644 ## GE:  args[0] >= args[1]   (signed)
   4645 ## Each handler leaves the 0/1 result in t0 and branches to aeo_cmp_finish.
   4646 :aeo_do_eq
   4647     la_br &aeo_require_argc_eq2
   4648     call
   4649     la_a0 &aeo_args
   4650     ld_a1,a0,0
   4651     ld_a2,a1,0                   # a2 = args[0]
   4652     ld_a3,a1,8                   # a3 = args[1]
   4653     li_t0 %0 %0                  # result = 0 unless the branch falls through
   4654     la_br &aeo_cmp_finish
   4655     bne_a2,a3
   4656     li_t0 %1 %0                  # equal: result = 1
   4657     la_br &aeo_cmp_finish
   4658     b
   4659 
   4660 :aeo_do_ne
   4661     la_br &aeo_require_argc_eq2  # EXPR_NE handler: t0 = (args[0] != args[1])
   4662     call
   4663     la_a0 &aeo_args
   4664     ld_a1,a0,0
   4665     ld_a2,a1,0                   # a2 = args[0]
   4666     ld_a3,a1,8                   # a3 = args[1]
   4667     li_t0 %0 %0                  # result = 0 unless the branch falls through
   4668     la_br &aeo_cmp_finish
   4669     beq_a2,a3
   4670     li_t0 %1 %0                  # not equal: result = 1
   4671     la_br &aeo_cmp_finish
   4672     b
   4673 
   4674 :aeo_do_lt
   4675     la_br &aeo_require_argc_eq2  # EXPR_LT handler: t0 = (args[0] < args[1])
   4676     call
   4677     la_a0 &aeo_args
   4678     ld_a1,a0,0
   4679     ld_a2,a1,0                   # a2 = args[0]
   4680     ld_a3,a1,8                   # a3 = args[1]
   4681     li_t0 %1 %0                  # result = 1 if the branch is taken
   4682     la_br &aeo_cmp_finish
   4683     blt_a2,a3
   4684     li_t0 %0 %0                  # not less: result = 0
   4685     la_br &aeo_cmp_finish
   4686     b
   4687 
   4688 :aeo_do_le
   4689     # EXPR_LE handler: a[0] <= a[1]   <=>   !(a[1] < a[0])
   4690     la_br &aeo_require_argc_eq2
   4691     call
   4692     la_a0 &aeo_args
   4693     ld_a1,a0,0
   4694     ld_a2,a1,0                   # a2 = args[0]
   4695     ld_a3,a1,8                   # a3 = args[1]
   4696     li_t0 %0 %0                  # result = 0 if a[1] < a[0]
   4697     la_br &aeo_cmp_finish
   4698     blt_a3,a2
   4699     li_t0 %1 %0
   4700     la_br &aeo_cmp_finish
   4701     b
   4702 
   4703 :aeo_do_gt
   4704     # EXPR_GT handler: a[0] > a[1]   <=>   a[1] < a[0]
   4705     la_br &aeo_require_argc_eq2
   4706     call
   4707     la_a0 &aeo_args
   4708     ld_a1,a0,0
   4709     ld_a2,a1,0                   # a2 = args[0]
   4710     ld_a3,a1,8                   # a3 = args[1]
   4711     li_t0 %1 %0                  # result = 1 if a[1] < a[0]
   4712     la_br &aeo_cmp_finish
   4713     blt_a3,a2
   4714     li_t0 %0 %0
   4715     la_br &aeo_cmp_finish
   4716     b
   4717 
   4718 :aeo_do_ge
   4719     # EXPR_GE handler: a[0] >= a[1]   <=>   !(a[0] < a[1])
   4720     la_br &aeo_require_argc_eq2
   4721     call
   4722     la_a0 &aeo_args
   4723     ld_a1,a0,0
   4724     ld_a2,a1,0                   # a2 = args[0]
   4725     ld_a3,a1,8                   # a3 = args[1]
   4726     li_t0 %0 %0                  # result = 0 if a[0] < a[1]
   4727     la_br &aeo_cmp_finish
   4728     blt_a2,a3
   4729     li_t0 %1 %0
   4730     la_br &aeo_cmp_finish
   4731     b
   4732 
   4733 ## Shared tail for the comparison ops: store t0 (the 0/1 result) into
   4734 ## aeo_acc, then jump to aeo_finish. Reached only via `b`.
   4735 :aeo_cmp_finish
   4736     la_a0 &aeo_acc
   4737     st_t0,a0,0                   # acc = t0
   4738     la_br &aeo_finish            # aeo_finish is the next label, so this matches falling through
   4739     b
   4740 
   4741 :aeo_finish
   4742     la_a0 &aeo_acc               # common exit for every apply_expr_op handler
   4743     ld_a0,a0,0                   # a0 = acc (the return value)
   4744     eret                         # pairs with the enter_0 at apply_expr_op
   4745 
   4746 ## helper: validate argc >= 1; fatal otherwise. (Returns to caller.) Reads aeo_argc; clobbers a0, t0, t1.
   4747 :aeo_require_argc_ge1
   4748     la_a0 &aeo_argc
   4749     ld_t0,a0,0                   # t0 = argc
   4750     li_t1 %1 %0
   4751     la_br &err_bad_macro_header
   4752     blt_t0,t1                    # argc < 1 -> fatal
   4753     ret
   4754 
   4755 ## helper: validate argc == 2; fatal otherwise. Reads aeo_argc; clobbers a0, t0, t1.
   4756 :aeo_require_argc_eq2
   4757     la_a0 &aeo_argc
   4758     ld_t0,a0,0                   # t0 = argc
   4759     li_t1 %2 %0
   4760     la_br &err_bad_macro_header
   4761     bne_t0,t1                    # argc != 2 -> fatal
   4762     ret
   4763 
   4764 ## helper: acc = args[0]; i = 1. Does not check argc itself; clobbers a0, a1, a2, t0.
   4765 :aeo_load_arg0_to_acc
   4766     la_a0 &aeo_args
   4767     ld_a1,a0,0                   # a1 = args base pointer
   4768     ld_a2,a1,0                   # a2 = args[0]
   4769     la_a0 &aeo_acc
   4770     st_a2,a0,0                   # acc = args[0]
   4771     li_t0 %1 %0
   4772     la_a0 &aeo_i
   4773     st_t0,a0,0                   # i = 1, so the fold loops start at args[1]
   4774     ret
   4775 
   4776 ## skip_expr_newlines(a0=pos, a1=end) -> a0 = new pos. Leaf.
   4777 ## Advance pos past consecutive TOK_NEWLINE tokens so expressions may span
   4778 ## lines. Also used by directive header parsers to make whitespace
   4779 ## (newlines specifically) insignificant inside %macro/%struct/%frame
   4780 ## headers and around `##` paste operands. Clobbers t0, t1; a1 is unchanged.
   4781 :skip_expr_newlines
   4782 :sen_loop
   4783     # if (pos == end) done
   4784     la_br &sen_done
   4785     beq_a0,a1
   4786     # if (pos->kind != TOK_NEWLINE) done
   4787     ld_t0,a0,0                   # t0 = pos->kind
   4788     li_t1 TOK_NEWLINE
   4789     la_br &sen_done
   4790     bne_t0,t1
   4791     # pos += 32 (advance one token)
   4792     addi_a0,a0,32
   4793     la_br &sen_loop
   4794     b
   4795 :sen_done
   4796     ret
   4797 
   4798 ## proc_skip_newlines(): advance proc_pos past TOK_NEWLINE tokens, bounded by
   4799 ## source_end. Convenience wrapper used by directive header parsers.
   4800 :proc_skip_newlines
   4801 :psn_loop
   4802     la_a0 &proc_pos              # no arguments or result: works on the proc_pos global in place
   4803     ld_t0,a0,0                   # t0 = proc_pos
   4804     la_a1 &source_end
   4805     ld_t1,a1,0                   # t1 = source_end
   4806     la_br &psn_done
   4807     beq_t0,t1                    # reached source_end
   4808     ld_a2,t0,0                   # a2 = proc_pos->kind
   4809     li_a3 TOK_NEWLINE
   4810     la_br &psn_done
   4811     bne_a2,a3                    # not a newline: stop
   4812     addi_t0,t0,32                # advance one token (32 bytes)
   4813     st_t0,a0,0                   # write the new position back to proc_pos
   4814     la_br &psn_loop
   4815     b
   4816 :psn_done
   4817     ret
   4818 
   4819 ## eval_expr_atom(a0=tok, a1=limit) -> void
   4820 ## Outputs via globals:
   4821 ##   eval_after_pos = token one past the consumed atom (elp_after for a
   4822 ##                    %local atom, emt_after_pos for a macro atom)
   4823 ##   eval_value     = the atom's i64 value
   4824 ## Three cases, tried in order:
   4825 ##  1. tok is "%local" followed by a tight TOK_LPAREN: expand_local_into_pool.
   4826 ##  2. tok names a macro (find_macro != 0) that is followed by a tight
   4827 ##     TOK_LPAREN or has param_count == 0: expand_macro_tokens.
   4828 ##     Cases 1-2: fatal if the pool did not grow; eval_expr_range over
   4829 ##     pool[mark..pool_used); then restore pool_used = mark.
   4830 ##  3. Otherwise parse_int_token(tok); eval_after_pos = tok + 32 bytes (one token).
   4831 ## CAVEAT: this path can recurse through eval_expr_range. Callers MUST
   4832 ## snapshot eval_after_pos / eval_value into local stack slots (via
   4833 ## enter_N) before any further call that might overwrite them.
   4834 ##
   4835 ## Stack-local layout (enter_40), offsets as written in the ld_/st_ operands:
   4836 ##   sp,0   saved tok
   4837 ##   sp,8   saved limit
   4838 ##   sp,16  macro_ptr (find_macro result)
   4839 ##   sp,24  saved emt_after_pos (elp_after on the %local path)
   4840 ##   sp,32  saved emt_mark (elp_mark on the %local path)
   4841 :eval_expr_atom
   4842     enter_40
   4843     st_a0,sp,0                   # local: saved tok
   4844     st_a1,sp,8                   # local: saved limit
   4845 
   4846     # ---- tok eq "%local" with tight ( -> expand and recurse over body ----
   4847     # %local is a built-in (not a macro) but expands to an integer-yielding
   4848     # token sequence, so eval_expr_atom must handle it before the find_macro
   4849     # path. Validation (tight LPAREN, arg shape, frame_active, name lookup)
   4850     # is centralized in expand_local_into_pool.
   4851     ld_a0,sp,0
   4852     la_a1 &const_local
   4853     li_a2 %6 %0                  # 6 matches the byte length of "%local"
   4854     la_br &tok_eq_const
   4855     call
   4856     la_br &eea_skip_local
   4857     beqz_a0                      # zero result = not "%local"
   4858 
   4859     # Confirm tight LPAREN follows; otherwise treat %local as opaque text and
   4860     # let the integer-atom path fail with a useful error.
   4861     ld_t0,sp,0
   4862     addi_t0,t0,32                # t0 = tok + 1 (one token = 32 bytes)
   4863     ld_t1,sp,8                   # t1 = limit
   4864     la_br &eea_skip_local
   4865     blt_t1,t0                    # limit < tok + 1
   4866     la_br &eea_skip_local
   4867     beq_t0,t1                    # tok + 1 == limit: nothing follows
   4868     ld_a3,t0,0                   # a3 = (tok + 1)->kind
   4869     li_a2 TOK_LPAREN
   4870     la_br &eea_skip_local
   4871     bne_a3,a2
   4872     ld_a3,t0,24                  # a3 = (tok + 1)->tight
   4873     la_br &eea_skip_local
   4874     beqz_a3                      # not tight: not a %local(...) call
   4875 
   4876     # Dispatch to the local-expansion path.
   4877     ld_a0,sp,0
   4878     ld_a1,sp,8
   4879     la_br &expand_local_into_pool
   4880     call
   4881 
   4882     # Snapshot elp_mark / elp_after before recursing.
   4883     la_a0 &elp_after
   4884     ld_t0,a0,0
   4885     st_t0,sp,24                  # local: saved elp_after
   4886     la_a0 &elp_mark
   4887     ld_t0,a0,0
   4888     st_t0,sp,32                  # local: saved elp_mark
   4889 
   4890     # If pool was not extended (pool_used == mark) -> bad expression.
   4891     la_a0 &pool_used
   4892     ld_t0,a0,0
   4893     ld_t1,sp,32
   4894     la_br &err_bad_macro_header
   4895     beq_t0,t1
   4896 
   4897     # eval_expr_range(expand_pool + mark, expand_pool + pool_used)
   4898     la_a0 &expand_pool_ptr
   4899     ld_a0,a0,0
   4900     ld_t1,sp,32
   4901     add_a0,a0,t1                 # a0 = expand_pool + mark (mark is a byte offset)
   4902     la_a1 &expand_pool_ptr
   4903     ld_a1,a1,0
   4904     la_a2 &pool_used
   4905     ld_a2,a2,0
   4906     add_a1,a1,a2                 # a1 = expand_pool + pool_used
   4907     la_br &eval_expr_range
   4908     call
   4909 
   4910     la_a1 &eval_value
   4911     st_a0,a1,0                   # eval_value = result of the nested evaluation
   4912 
   4913     # restore pool_used = mark
   4914     la_a0 &pool_used
   4915     ld_t0,sp,32
   4916     st_t0,a0,0
   4917 
   4918     # eval_after_pos = saved elp_after
   4919     la_a0 &eval_after_pos
   4920     ld_t0,sp,24
   4921     st_t0,a0,0
   4922 
   4923     eret
   4924 
   4925 :eea_skip_local
   4926     # macro_ptr = find_macro(tok)
   4927     ld_a0,sp,0
   4928     la_br &find_macro
   4929     call
   4930     st_a0,sp,16                  # local: macro_ptr
   4931 
   4932     # if (macro_ptr == 0) -> integer atom branch
   4933     la_br &eea_int_atom
   4934     beqz_a0
   4935 
   4936     # Paren-less 0-arg atom:
   4937     #   Take the macro-call branch if (tok+1 < limit AND (tok+1)->kind == TOK_LPAREN
   4938     #   AND (tok+1)->tight) OR macro->param_count == 0. Otherwise fall through
   4939     #   to int atom (unchanged).
   4940     ld_t0,sp,0
   4941     addi_t0,t0,32                # t0 = tok + 1
   4942     ld_t1,sp,8                   # t1 = limit
   4943     la_br &eea_check_zero_arg
   4944     blt_t1,t0                    # limit < tok + 1
   4945     la_br &eea_check_zero_arg
   4946     beq_t0,t1                    # tok + 1 == limit: nothing follows
   4947     ld_t2,t0,0                   # t2 = (tok + 1)->kind
   4948     li_a3 TOK_LPAREN
   4949     la_br &eea_check_zero_arg
   4950     bne_t2,a3
   4951     ld_t2,t0,24                  # t2 = (tok + 1)->tight
   4952     la_br &eea_check_zero_arg
   4953     beqz_t2
   4954     la_br &eea_do_macro
   4955     b
   4956 
   4957 :eea_check_zero_arg
   4958     # No trailing LPAREN. Take the macro branch only if param_count == 0.
   4959     ld_t0,sp,16                  # t0 = macro_ptr
   4960     ld_t1,t0,16                  # t1 = macro->param_count
   4961     la_br &eea_int_atom
   4962     bnez_t1
   4963 
   4964 :eea_do_macro
   4965     # Macro call branch:
   4966     #   expand_macro_tokens(tok, limit, macro_ptr)
   4967     ld_a0,sp,0
   4968     ld_a1,sp,8
   4969     ld_a2,sp,16
   4970     la_br &expand_macro_tokens
   4971     call
   4972 
   4973     # Snapshot emt outputs immediately.
   4974     la_a0 &emt_after_pos
   4975     ld_t0,a0,0
   4976     st_t0,sp,24                  # local: saved emt_after_pos
   4977     la_a0 &emt_mark
   4978     ld_t0,a0,0
   4979     st_t0,sp,32                  # local: saved emt_mark
   4980 
   4981     # If pool was not extended (pool_used == mark) -> bad expression.
   4982     la_a0 &pool_used
   4983     ld_t0,a0,0
   4984     ld_t1,sp,32
   4985     la_br &err_bad_macro_header
   4986     beq_t0,t1
   4987 
   4988     # eval_expr_range(expand_pool + mark, expand_pool + pool_used)
   4989     la_a0 &expand_pool_ptr
   4990     ld_a0,a0,0
   4991     ld_t1,sp,32
   4992     add_a0,a0,t1                 # a0 = expand_pool + mark (mark is a byte offset)
   4993     la_a1 &expand_pool_ptr
   4994     ld_a1,a1,0
   4995     la_a2 &pool_used
   4996     ld_a2,a2,0
   4997     add_a1,a1,a2                 # a1 = expand_pool + pool_used
   4998     la_br &eval_expr_range
   4999     call
   5000 
   5001     # eval_value = result
   5002     la_a1 &eval_value
   5003     st_a0,a1,0
   5004 
   5005     # restore pool_used = mark
   5006     la_a0 &pool_used
   5007     ld_t0,sp,32
   5008     st_t0,a0,0
   5009 
   5010     # eval_after_pos = saved emt_after_pos
   5011     la_a0 &eval_after_pos
   5012     ld_t0,sp,24
   5013     st_t0,a0,0
   5014 
   5015     eret
   5016 
   5017 :eea_int_atom
   5018     # parse_int_token(tok) -> i64
   5019     ld_a0,sp,0
   5020     la_br &parse_int_token
   5021     call
   5022     la_a1 &eval_value
   5023     st_a0,a1,0
   5024 
   5025     # eval_after_pos = tok + 32 (one token)
   5026     ld_t0,sp,0
   5027     addi_t0,t0,32
   5028     la_a0 &eval_after_pos
   5029     st_t0,a0,0
   5030 
   5031     eret
   5032 
   5033 ## eval_expr_range(a0=start_tok, a1=end_tok) -> a0 = i64 result (fatal on bad)
   5034 ## Main S-expression evaluator loop, driven by the explicit ExprFrame stack
   5035 ## in expr_frames[] / expr_frame_top — NOT by P1 recursion (eval_expr_atom
   5036 ## can re-enter eval_expr_range through expand_macro_tokens, and a P1
   5037 ## recursion would defeat the bounded frame budget). Enforces exactly one
   5038 ## top-level value and no trailing tokens.
   5039 ## Fatal on: unmatched parens, > 16 frames deep, > 16 args per frame,
   5040 ## bad atom, bad operator.
        ## Both limits are compared against the M1PP_MAX_PARAMS constant, and
        ## every fatal path in this function branches to err_bad_macro_header.
   5041 ## Reads/writes: expr_frames, expr_frame_top.
   5042 ##
   5043 ## Stack-local layout (enter_56), offsets as used by the ld/st operands below:
   5044 ##   sp+0   pos              Token*
   5045 ##   sp+8   end              Token*
   5046 ##   sp+16  value            i64 (most recent atom or rparen result)
   5047 ##   sp+24  result           i64 (set when have_result transitions to 1)
   5048 ##   sp+32  have_value       0/1
   5049 ##   sp+40  have_result      0/1
   5050 ##   sp+48  entry_frame_top  i64 (snapshot at entry; compared against on
   5051 ##                                every stack check and at loop exit)
   5052 :eval_expr_range
   5053     enter_56
            # pos = start_tok; end = end_tok
   5054     st_a0,sp,0
   5055     st_a1,sp,8
            # value = result = have_value = have_result = 0
   5056     li_t0 %0 %0
   5057     st_t0,sp,16
   5058     st_t0,sp,24
   5059     st_t0,sp,32
   5060     st_t0,sp,40
   5061     # entry_frame_top = expr_frame_top
   5062     la_a0 &expr_frame_top
   5063     ld_t0,a0,0
   5064     st_t0,sp,48
   5065 
   5066 :eer_loop
   5067     # If have_value, deliver it.
   5068     ld_t0,sp,32
   5069     la_br &eer_no_have_value
   5070     beqz_t0
   5071 
   5072     # have_value: feed into top frame, or set result.
            # No frame opened by this invocation (frame_top == entry_frame_top)
            # means the value is a top-level result.
   5073     la_a0 &expr_frame_top
   5074     ld_t0,a0,0
   5075     ld_t1,sp,48
   5076     la_br &eer_set_result
   5077     beq_t0,t1
   5078     # frame = &expr_frames[frame_top - 1]
   5079     addi_t0,t0,neg1
   5080     li_a1 M1PP_EXPR_FRAME_SIZE
   5081     mul_t0,t0,a1
   5082     la_a0 &expr_frames_ptr
   5083     ld_a0,a0,0
   5084     add_a0,a0,t0
   5085     # if (frame->argc >= MAX_PARAMS) fatal
            # (a1 = &frame->argc is kept for the increment below; the >= test
            # is spelled as MAX_PARAMS < argc OR argc == MAX_PARAMS)
   5086     li_a1 M1PP_EXPR_ARGC_OFF
   5087     add_a1,a0,a1
   5088     ld_t1,a1,0
   5089     li_a2 M1PP_MAX_PARAMS
   5090     la_br &err_bad_macro_header
   5091     blt_a2,t1
   5092     la_br &err_bad_macro_header
   5093     beq_t1,a2
   5094     # frame->args[argc] = value
            # (args are 8-byte slots: index is argc << 3)
   5095     li_a2 M1PP_EXPR_ARGS_OFF
   5096     add_a3,a0,a2
   5097     shli_a2,t1,3
   5098     add_a3,a3,a2
   5099     ld_t2,sp,16
   5100     st_t2,a3,0
   5101     # frame->argc++
   5102     addi_t1,t1,1
   5103     st_t1,a1,0
   5104     # have_value = 0
   5105     li_t0 %0 %0
   5106     st_t0,sp,32
   5107     la_br &eer_loop
   5108     b
   5109 
   5110 :eer_set_result
   5111     # No frame open; this value is the top-level result.
            # A second top-level value (have_result already 1) is fatal.
   5112     ld_t0,sp,40
   5113     la_br &err_bad_macro_header
   5114     bnez_t0
            # result = value; have_result = 1; have_value = 0
   5115     ld_t0,sp,16
   5116     st_t0,sp,24
   5117     li_t0 %1 %0
   5118     st_t0,sp,40
   5119     li_t0 %0 %0
   5120     st_t0,sp,32
   5121     la_br &eer_loop
   5122     b
   5123 
   5124 :eer_no_have_value
   5125     # skip_expr_newlines(pos, end)
   5126     ld_a0,sp,0
   5127     ld_a1,sp,8
   5128     la_br &skip_expr_newlines
   5129     call
   5130     st_a0,sp,0
   5131 
   5132     # if (pos >= end) break
            # (only pos == end is tested; relies on skip_expr_newlines never
            # returning a position past end — TODO confirm)
   5133     ld_t0,sp,0
   5134     ld_t1,sp,8
   5135     la_br &eer_loop_done
   5136     beq_t0,t1
   5137 
   5138     # Dispatch on token kind.
            # t0 still holds pos on entry to eer_lparen.
   5139     ld_t2,t0,0
   5140     li_a3 TOK_LPAREN
   5141     la_br &eer_lparen
   5142     beq_t2,a3
   5143     li_a3 TOK_RPAREN
   5144     la_br &eer_rparen
   5145     beq_t2,a3
   5146 
   5147     # atom: eval_expr_atom(pos, end); value = eval_value; pos = eval_after_pos
            # (eval_expr_atom reports through the eval_value / eval_after_pos
            # globals, which are copied into the frame right after the call)
   5148     ld_a0,sp,0
   5149     ld_a1,sp,8
   5150     la_br &eval_expr_atom
   5151     call
   5152     la_a0 &eval_value
   5153     ld_t0,a0,0
   5154     st_t0,sp,16
   5155     la_a0 &eval_after_pos
   5156     ld_t0,a0,0
   5157     st_t0,sp,0
   5158     li_t0 %1 %0
   5159     st_t0,sp,32
   5160     la_br &eer_loop
   5161     b
   5162 
   5163 :eer_lparen
   5164     # pos++
            # (one Token record is 32 bytes)
   5165     addi_t0,t0,32
   5166     st_t0,sp,0
   5167     # skip_expr_newlines
   5168     ld_a0,sp,0
   5169     ld_a1,sp,8
   5170     la_br &skip_expr_newlines
   5171     call
   5172     st_a0,sp,0
   5173     # if (pos >= end) fatal
            # (tested as pos == end)
   5174     ld_t0,sp,0
   5175     ld_t1,sp,8
   5176     la_br &err_bad_macro_header
   5177     beq_t0,t1
   5178     # op = expr_op_code(pos)
   5179     ld_a0,sp,0
   5180     la_br &expr_op_code
   5181     call
   5182     # if (op == EXPR_INVALID) fatal
   5183     li_t0 EXPR_INVALID
   5184     la_br &err_bad_macro_header
   5185     beq_a0,t0
   5186     # if (op == EXPR_STRLEN) handle inline — strlen's argument is a
   5187     # TOK_STRING atom, not a recursive expression. Yield text.len - 2.
   5188     li_t0 EXPR_STRLEN
   5189     la_br &eer_strlen
   5190     beq_a0,t0
   5191     # frame stack overflow check: if (expr_frame_top >= 16) fatal
   5192     # (the global expr_frames[] array has 16 slots, shared across recursive
   5193     # eval_expr_range calls)
            # The bound actually loaded is M1PP_MAX_PARAMS. a0 (op) must stay
            # live until it is stored into the new frame below.
   5194     la_a1 &expr_frame_top
   5195     ld_t0,a1,0
   5196     li_a2 M1PP_MAX_PARAMS
   5197     la_br &err_bad_macro_header
   5198     blt_a2,t0
   5199     la_br &err_bad_macro_header
   5200     beq_t0,a2
   5201     # frames[frame_top].op = op; frames[frame_top].argc = 0
            # (op lives at frame offset 0)
   5202     li_a2 M1PP_EXPR_FRAME_SIZE
   5203     mul_t2,t0,a2
   5204     la_a3 &expr_frames_ptr
   5205     ld_a3,a3,0
   5206     add_a3,a3,t2
   5207     st_a0,a3,0
   5208     li_a2 M1PP_EXPR_ARGC_OFF
   5209     add_a2,a3,a2
   5210     li_t2 %0 %0
   5211     st_t2,a2,0
   5212     # frame_top++
   5213     addi_t0,t0,1
   5214     st_t0,a1,0
   5215     # pos++ (skip operator token)
   5216     ld_t0,sp,0
   5217     addi_t0,t0,32
   5218     st_t0,sp,0
   5219     la_br &eer_loop
   5220     b
   5221 
   5222 :eer_rparen
   5223     # if (frame_top <= entry_frame_top) fatal
            # (a ')' with no frame opened by this invocation is unmatched)
   5224     la_a0 &expr_frame_top
   5225     ld_t0,a0,0
   5226     ld_t1,sp,48
   5227     la_br &err_bad_macro_header
   5228     beq_t0,t1
   5229     la_br &err_bad_macro_header
   5230     blt_t0,t1
   5231     # frame = &expr_frames[frame_top - 1]
   5232     addi_t0,t0,neg1
   5233     li_a1 M1PP_EXPR_FRAME_SIZE
   5234     mul_t0,t0,a1
   5235     la_a3 &expr_frames_ptr
   5236     ld_a3,a3,0
   5237     add_a3,a3,t0
   5238     # apply_expr_op(op, args, argc) -> a0
            # a0 = frame->op, a1 = &frame->args[0], a2 = frame->argc
   5239     ld_a0,a3,0
   5240     li_a1 M1PP_EXPR_ARGS_OFF
   5241     add_a1,a3,a1
   5242     li_a2 M1PP_EXPR_ARGC_OFF
   5243     add_a2,a3,a2
   5244     ld_a2,a2,0
   5245     la_br &apply_expr_op
   5246     call
   5247     # value = result; frame_top--; pos++; have_value = 1
            # (expr_frame_top is reloaded from memory after the call rather
            # than kept in a register across it)
   5248     st_a0,sp,16
   5249     la_a1 &expr_frame_top
   5250     ld_t0,a1,0
   5251     addi_t0,t0,neg1
   5252     st_t0,a1,0
   5253     ld_t0,sp,0
   5254     addi_t0,t0,32
   5255     st_t0,sp,0
   5256     li_t0 %1 %0
   5257     st_t0,sp,32
   5258     la_br &eer_loop
   5259     b
   5260 
   5261 :eer_strlen
   5262     # (strlen "literal") — degenerate unary op whose argument is a
   5263     # TOK_STRING atom, not a recursive expression.
            # No ExprFrame is pushed for it: the whole form, including its
            # closing ')', is consumed here and yields a value directly.
   5264     # pos++ past the "strlen" operator word.
   5265     ld_t0,sp,0
   5266     addi_t0,t0,32
   5267     st_t0,sp,0
   5268     # skip_expr_newlines(pos, end)
   5269     ld_a0,sp,0
   5270     ld_a1,sp,8
   5271     la_br &skip_expr_newlines
   5272     call
   5273     st_a0,sp,0
   5274     # if (pos >= end) fatal
            # (tested as pos == end)
   5275     ld_t0,sp,0
   5276     ld_t1,sp,8
   5277     la_br &err_bad_macro_header
   5278     beq_t0,t1
   5279     # if (pos->kind != TOK_STRING) fatal
   5280     ld_t2,t0,0
   5281     li_a3 TOK_STRING
   5282     la_br &err_bad_macro_header
   5283     bne_t2,a3
   5284     # if (pos->text.len < 2) fatal
            # (text.len is at token offset 16, text.ptr at offset 8)
   5285     ld_a1,t0,16
   5286     li_a2 %2 %0
   5287     la_br &err_bad_macro_header
   5288     blt_a1,a2
   5289     # if (pos->text.ptr[0] != '"') fatal — rejects single-quoted '..' hex
            # (34 is the ASCII code of '"')
   5290     ld_a2,t0,8
   5291     lb_a3,a2,0
   5292     li_a0 %34 %0
   5293     la_br &err_bad_macro_header
   5294     bne_a3,a0
   5295     # value = pos->text.len - 2
            # (the two quote characters are not counted)
   5296     addi_a1,a1,neg2
   5297     st_a1,sp,16
   5298     # pos++
   5299     addi_t0,t0,32
   5300     st_t0,sp,0
   5301     # skip_expr_newlines(pos, end)
   5302     ld_a0,sp,0
   5303     ld_a1,sp,8
   5304     la_br &skip_expr_newlines
   5305     call
   5306     st_a0,sp,0
   5307     # if (pos >= end) fatal
            # (tested as pos == end)
   5308     ld_t0,sp,0
   5309     ld_t1,sp,8
   5310     la_br &err_bad_macro_header
   5311     beq_t0,t1
   5312     # if (pos->kind != TOK_RPAREN) fatal
   5313     ld_t2,t0,0
   5314     li_a3 TOK_RPAREN
   5315     la_br &err_bad_macro_header
   5316     bne_t2,a3
   5317     # pos++
   5318     addi_t0,t0,32
   5319     st_t0,sp,0
   5320     # have_value = 1
   5321     li_t0 %1 %0
   5322     st_t0,sp,32
   5323     la_br &eer_loop
   5324     b
   5325 
   5326 :eer_loop_done
   5327     # frame_top must equal entry_frame_top
            # (otherwise some '(' opened by this invocation was never closed)
   5328     la_a0 &expr_frame_top
   5329     ld_t0,a0,0
   5330     ld_t1,sp,48
   5331     la_br &err_bad_macro_header
   5332     bne_t0,t1
   5333     # have_result must be 1
   5334     ld_t0,sp,40
   5335     la_br &err_bad_macro_header
   5336     beqz_t0
   5337     # pos must equal end
   5338     ld_t0,sp,0
   5339     ld_t1,sp,8
   5340     la_br &err_bad_macro_header
   5341     bne_t0,t1
   5342     # return result
   5343     ld_a0,sp,24
   5344     eret
   5345 
   5346 ## ============================================================================
   5347 ## --- Hex emit for !@%$ ------------------------------------------------------
   5348 ## ============================================================================
   5349 
   5350 ## emit_hex_value(a0=value_u64, a1=byte_count) -> void (fatal on overflow)
   5351 ## byte_count must be 1, 2, 4, or 8. Serialize value into (2 * byte_count)
   5352 ## uppercase hex chars, little-endian byte order (byte i at char indices
   5353 ## 2i, 2i+1) as bare hex digits. hex2pp's byte-stream parser groups every
   5354 ## two hex digits into one byte; no quoting or separators are needed.
   5355 ## Total emitted text length = 2 * byte_count; emitted as a TOK_WORD via
   5356 ## append_text + emit_token.
        ## NOTE(review): no range check on value is visible in this function —
        ## bits above 8 * byte_count are simply never emitted. The "fatal on
        ## overflow" above presumably refers to the callees' buffer limits;
        ## confirm against append_text / emit_token.
        ## Uses the globals ehv_value, ehv_bytes, ehv_scratch and ehv_token as
        ## working storage, and hex_chars as the digit lookup table.
   5357 :emit_hex_value
   5358     enter_0
   5359 
   5360     # ehv_value = value; ehv_bytes = byte_count
   5361     la_a2 &ehv_value
   5362     st_a0,a2,0
   5363     la_a2 &ehv_bytes
   5364     st_a1,a2,0
   5365 
   5366     # i = 0
            # (i stays in t0 for the whole loop; the loop makes no calls)
   5367     li_t0 %0 %0
   5368 :emit_hex_value_loop
   5369     # if (i == bytes) done
   5370     la_a1 &ehv_bytes
   5371     ld_t1,a1,0
   5372     la_br &emit_hex_value_emit
   5373     beq_t0,t1
   5374 
   5375     # byte = ehv_value & 0xFF
   5376     la_a1 &ehv_value
   5377     ld_t2,a1,0
   5378     andi_a3,t2,255
   5379 
   5380     # high = (byte >> 4) & 0x0F  (byte is already in a3)
   5381     shri_a2,a3,4
   5382     andi_a2,a2,15
   5383 
   5384     # low = byte & 0x0F
            # (this copy in a3 is overwritten by the index computation below,
            # which is why low is recomputed before its own store)
   5385     andi_a3,a3,15
   5386 
   5387     # scratch[2*i] = hex_chars[high]
   5388     la_a1 &hex_chars
   5389     add_a1,a1,a2
   5390     lb_a2,a1,0
   5391     la_a1 &ehv_scratch
   5392     shli_a3,t0,1
   5393     add_a1,a1,a3
   5394     sb_a2,a1,0
   5395 
   5396     # scratch[2*i+1] = hex_chars[low]   (reload low from byte & 0x0F)
   5397     la_a1 &ehv_value
   5398     ld_t2,a1,0
   5399     andi_a3,t2,255
   5400     andi_a3,a3,15
   5401     la_a1 &hex_chars
   5402     add_a1,a1,a3
   5403     lb_a2,a1,0
   5404     la_a1 &ehv_scratch
   5405     shli_a3,t0,1
   5406     add_a1,a1,a3
   5407     addi_a1,a1,1
   5408     sb_a2,a1,0
   5409 
   5410     # ehv_value >>= 8
            # (moves the next byte into the low position for the next pass)
   5411     la_a1 &ehv_value
   5412     ld_t2,a1,0
   5413     shri_t2,t2,8
   5414     st_t2,a1,0
   5415 
   5416     # i++
   5417     addi_t0,t0,1
   5418     la_br &emit_hex_value_loop
   5419     b
   5420 
   5421 :emit_hex_value_emit
   5422     # text_ptr = append_text(&ehv_scratch, 2 * ehv_bytes)
   5423     la_a0 &ehv_scratch
   5424     la_a1 &ehv_bytes
   5425     ld_a1,a1,0
   5426     shli_a1,a1,1
   5427     la_br &append_text
   5428     call
   5429 
   5430     # ehv_token.kind = TOK_WORD; ehv_token.text_ptr = text_ptr;
   5431     # ehv_token.text_len = 2 * ehv_bytes; ehv_token.tight = 0.
            # (token fields at offsets 0 / 8 / 16 / 24 respectively)
   5432     la_a2 &ehv_token
   5433     li_a3 TOK_WORD
   5434     st_a3,a2,0
   5435     st_a0,a2,8
   5436     la_a1 &ehv_bytes
   5437     ld_a1,a1,0
   5438     shli_a1,a1,1
   5439     st_a1,a2,16
   5440     li_a1 %0 %0
   5441     st_a1,a2,24
   5442 
   5443     # emit_token(&ehv_token)
   5444     la_a0 &ehv_token
   5445     la_br &emit_token
   5446     call
   5447 
   5448     eret
   5449 
   5450 ## ============================================================================
   5451 ## --- Builtin dispatcher ( ! @ % $ %select %str %local ) --------------------
   5452 ## ============================================================================
   5453 
   5454 ## expand_builtin_call(a0=stream_ptr, a1=builtin_tok) -> void (fatal on bad)
        ## (The :expand_builtin_call label itself is defined further below,
        ## after expand_local_into_pool.)
   5455 ## Requires builtin_tok+1 is TOK_LPAREN. Runs parse_args(lparen, stream->end),
   5456 ## then dispatches on builtin_tok->text:
   5457 ##
   5458 ##   "!" "@" "%" "$"
   5459 ##     require arg_count == 1
   5460 ##     eval_expr_range(arg_starts[0], arg_ends[0]) -> value
   5461 ##     stream->pos = call_end_pos; stream->line_start = 0
   5462 ##     emit_hex_value(value, 1 / 2 / 4 / 8 respectively)
   5463 ##
   5464 ##   "%select"
   5465 ##     require arg_count == 3
   5466 ##     eval_expr_range(cond_arg) -> value
   5467 ##     chosen = (value != 0) ? arg1 : arg2
   5468 ##     stream->pos = call_end_pos; stream->line_start = 0
   5469 ##     if chosen is empty, return (no stream push)
   5470 ##     else copy_span_to_pool(chosen) and push_pool_stream_from_mark(mark)
   5471 ##     The unchosen branch is NOT evaluated, validated, or expanded.
   5472 ##
        ##   "%str" "%local"
        ##     also dispatched here, to the ebc_str / ebc_local branches.
        ##
   5473 ## Any other text under a builtin slot -> fatal (err_bad_macro_header).
        ##
   5474 ## expand_local_into_pool(a0=call_tok, a1=limit) -> writes elp_after, elp_mark
   5475 ## Resolve %local(NAME) against the current frame: assemble the lookup key
   5476 ## "<frame>_FRAME.<NAME>" in local_lookup_scratch, linear-search macros[]
   5477 ## for that name, and copy the matching body into the pool. Errors:
   5478 ##   - call_tok+1 missing / not tight LPAREN: bad_macro_header
   5479 ##   - parse_args fails: propagated
   5480 ##   - arg_count != 1, arg span != 1 token, arg kind != WORD: bad_macro_header
   5481 ##   - frame not active: local_outside_frame
   5482 ##   - assembled name >= 256 bytes: local_name_too_long
   5483 ##   - no matching macro: unknown_local
   5484 ##
   5485 ## On success, elp_mark = pool_used at entry, elp_after = call_end_pos
   5486 ## (Token* one past the call's `)`). Both expand_builtin_call's %local
   5487 ## branch and eval_expr_atom's %local branch consume those.
        ## The argument checks run before the frame_active check, so a malformed
        ## call is reported as bad_macro_header even when no frame is active.
        ## Token fields as used here: +0 kind, +8 text.ptr, +16 text.len,
        ## +24 tight flag. Macro record fields: +0 name.ptr, +8 name.len.
   5488 :expand_local_into_pool
   5489     enter_0
   5490 
   5491     # --- Validate (call_tok+1) is a tight LPAREN within the stream. ---
            # lparen >= limit is fatal (spelled as limit < lparen OR equal).
   5492     addi_t0,a0,32                  # lparen = call_tok + 32
   5493     la_br &err_bad_macro_header
   5494     blt_a1,t0
   5495     la_br &err_bad_macro_header
   5496     beq_t0,a1
   5497     ld_a2,t0,0
   5498     li_a3 TOK_LPAREN
   5499     la_br &err_bad_macro_header
   5500     bne_a2,a3
            # tight flag must be non-zero
   5501     ld_a2,t0,24
   5502     la_br &err_bad_macro_header
   5503     beqz_a2
   5504 
   5505     # --- parse_args(lparen, limit) ---
            # a1 still holds limit from entry; only a0 needs to be set.
   5506     mov_a0,t0
   5507     la_br &parse_args
   5508     call
   5509 
   5510     # --- Validate arg shape: arg_count == 1, single 32-byte token, WORD kind. ---
   5511     la_a0 &arg_count
   5512     ld_t0,a0,0
   5513     li_t1 %1 %0
   5514     la_br &err_bad_macro_header
   5515     bne_t0,t1
   5516 
   5517     la_a0 &arg_starts_ptr
   5518     ld_a0,a0,0
   5519     ld_t0,a0,0                     # arg_tok = arg_starts[0]
   5520     la_a1 &arg_ends_ptr
   5521     ld_a1,a1,0
   5522     ld_t1,a1,0                     # arg_end = arg_ends[0]
   5523     sub_t2,t1,t0
   5524     li_a2 %32 %0
   5525     la_br &err_bad_macro_header
   5526     bne_t2,a2
   5527 
   5528     ld_a3,t0,0                     # arg_tok->kind
   5529     li_a2 TOK_WORD
   5530     la_br &err_bad_macro_header
   5531     bne_a3,a2
   5532 
   5533     # Stash arg.text.ptr / arg.text.len for the byte-copy loop below.
   5534     ld_a0,t0,8
   5535     la_a1 &elp_arg_ptr
   5536     st_a0,a1,0
   5537     ld_a0,t0,16
   5538     la_a1 &elp_arg_len
   5539     st_a0,a1,0
   5540 
   5541     # --- frame_active? ---
   5542     la_a1 &frame_active
   5543     ld_a2,a1,0
   5544     la_br &err_local_outside_frame
   5545     beqz_a2
   5546 
   5547     # --- name_len = current_frame_len + 7 + arg_len; must be < 256. ---
            # (7 is the length of the "_FRAME." suffix; the length is checked
            # before any byte is written to the scratch buffer)
   5548     la_a1 &current_frame_len
   5549     ld_a2,a1,0
   5550     la_a1 &elp_arg_len
   5551     ld_a3,a1,0
   5552     add_t0,a2,a3
   5553     addi_t0,t0,7
   5554     la_a1 &elp_name_len
   5555     st_t0,a1,0
   5556     li_t1 %256 %0
   5557     la_br &err_local_name_too_long
   5558     blt_t1,t0
   5559     la_br &err_local_name_too_long
   5560     beq_t0,t1
   5561 
   5562     # --- Build lookup name in local_lookup_scratch. ---
   5563     # First: copy current_frame_ptr[0..frame_len] -> scratch[0..]
            # Registers: t0 = src, t1 = count, t2 = dst cursor, a3 = index.
   5564     la_a0 &current_frame_ptr
   5565     ld_t0,a0,0                     # frame_ptr
   5566     la_a0 &current_frame_len
   5567     ld_t1,a0,0                     # frame_len
   5568     la_t2 &local_lookup_scratch_ptr
   5569     ld_t2,t2,0                     # scratch_base
   5570     li_a3 %0 %0
   5571 :elp_copy_frame
   5572     la_br &elp_copy_frame_done
   5573     beq_a3,t1
   5574     add_a0,t0,a3
   5575     lb_a0,a0,0
   5576     add_a1,t2,a3
   5577     sb_a0,a1,0
   5578     addi_a3,a3,1
   5579     la_br &elp_copy_frame
   5580     b
   5581 :elp_copy_frame_done
   5582 
   5583     # Advance scratch cursor to scratch + frame_len for the suffix copy.
   5584     add_t2,t2,t1
   5585 
   5586     # Copy const_frame_suffix (7 bytes "_FRAME.") -> scratch[frame_len..]
            # Registers: a0 = src, t1 = 7, a3 = index; a1 / t0 are byte temps.
   5587     la_a0 &const_frame_suffix
   5588     li_t1 %7 %0
   5589     li_a3 %0 %0
   5590 :elp_copy_suffix
   5591     la_br &elp_copy_suffix_done
   5592     beq_a3,t1
   5593     add_a1,a0,a3
   5594     lb_a1,a1,0
   5595     add_t0,t2,a3
   5596     sb_a1,t0,0
   5597     addi_a3,a3,1
   5598     la_br &elp_copy_suffix
   5599     b
   5600 :elp_copy_suffix_done
   5601 
   5602     # Advance past suffix (7 bytes).
   5603     addi_t2,t2,7
   5604 
   5605     # Copy arg bytes -> scratch[frame_len + 7 ..]
            # Registers: t0 = src, t1 = count, a3 = index.
   5606     la_a0 &elp_arg_ptr
   5607     ld_t0,a0,0
   5608     la_a0 &elp_arg_len
   5609     ld_t1,a0,0
   5610     li_a3 %0 %0
   5611 :elp_copy_arg
   5612     la_br &elp_copy_arg_done
   5613     beq_a3,t1
   5614     add_a0,t0,a3
   5615     lb_a0,a0,0
   5616     add_a1,t2,a3
   5617     sb_a0,a1,0
   5618     addi_a3,a3,1
   5619     la_br &elp_copy_arg
   5620     b
   5621 :elp_copy_arg_done
   5622 
   5623     # --- Linear search macros[] for an exact name match. ---
   5624     # m (a3) walks from macros_ptr to macros_end (each MACRO_RECORD_SIZE).
   5625     # Match criterion: m->name.len == name_len AND first name_len bytes of
   5626     # m->name.ptr equal local_lookup_scratch. Modeled on find_macro: keep
   5627     # m in a3, reload macros_end into t0 after each iteration, and use
   5628     # a0/a1/a2/t1/t2 as scratch within the inner byte-compare.
   5629     la_a3 &macros_ptr
   5630     ld_a3,a3,0
   5631     la_t0 &macros_end
   5632     ld_t0,t0,0
   5633 :elp_search_loop
   5634     la_br &elp_unknown
   5635     beq_a3,t0
   5636 
   5637     # m->name.len == name_len?
            # (a2 keeps name_len as the bound for the compare loop)
   5638     ld_t1,a3,8
   5639     la_a0 &elp_name_len
   5640     ld_a2,a0,0
   5641     la_br &elp_search_next
   5642     bne_t1,a2
   5643 
   5644     # byte-compare m->name.ptr vs scratch for name_len bytes.
   5645     ld_t1,a3,0                     # name_ptr
   5646     la_a0 &local_lookup_scratch_ptr
   5647     ld_a1,a0,0                     # lookup_ptr
   5648     li_t2 %0 %0
   5649 :elp_search_cmp
   5650     la_br &elp_search_match
   5651     beq_t2,a2
   5652     add_a0,t1,t2
   5653     lb_a0,a0,0
            # t0 is reused as a byte temp here, clobbering macros_end; that is
            # why elp_search_next reloads it.
   5654     add_t0,a1,t2
   5655     lb_t0,t0,0
   5656     la_br &elp_search_next
   5657     bne_a0,t0
   5658     addi_t2,t2,1
   5659     la_br &elp_search_cmp
   5660     b
   5661 
   5662 :elp_search_next
   5663     li_t1 M1PP_MACRO_RECORD_SIZE
   5664     add_a3,a3,t1
   5665     la_t0 &macros_end
   5666     ld_t0,t0,0
   5667     la_br &elp_search_loop
   5668     b
   5669 
   5670 :elp_unknown
   5671     la_br &err_unknown_local
   5672     b
   5673 
   5674 :elp_search_match
   5675     # a3 = matched macro pointer. mark = pool_used; copy body span.
   5676     la_a0 &pool_used
   5677     ld_t0,a0,0
   5678     la_a1 &elp_mark
   5679     st_t0,a1,0
   5680 
   5681     li_t0 M1PP_MACRO_BODY_START_OFF
   5682     add_t0,a3,t0
   5683     ld_a0,t0,0                     # body_start
   5684     li_t1 M1PP_MACRO_BODY_END_OFF
   5685     add_t1,a3,t1
   5686     ld_a1,t1,0                     # body_end
   5687     la_br &copy_span_to_pool
   5688     call
   5689 
   5690     # elp_after = call_end_pos
            # (call_end_pos is read only now, after copy_span_to_pool; presumably
            # it still holds the value left by parse_args above — confirm that
            # copy_span_to_pool does not touch it)
   5691     la_a0 &call_end_pos
   5692     ld_t0,a0,0
   5693     la_a1 &elp_after
   5694     st_t0,a1,0
   5695 
   5696     eret
   5697 
   5698 :expand_builtin_call
            # Entry: a0 = stream_ptr, a1 = builtin_tok. This first section
            # validates the "(", runs parse_args, then compares the builtin
            # token's text against each known builtin name in turn and branches
            # to the matching ebc_* handler. Stream fields as used in this
            # function: +8 end, +16 pos, +24 line_start.
   5699     enter_0
   5700 
   5701     # ebc_stream = stream_ptr. builtin_tok (a1) is not saved anywhere; it is
            # re-read from stream->pos after parse_args (see the dispatch note below).
   5702     la_a2 &ebc_stream
   5703     st_a0,a2,0
   5704 
   5705     # lparen = builtin_tok + 32 (next Token); if (lparen >= stream->end) fatal
   5706     addi_t0,a1,32
   5707     ld_t1,a0,8           # stream->end
   5708     la_br &err_bad_macro_header
   5709     beq_t0,t1
   5710     la_br &err_bad_macro_header
   5711     blt_t1,t0
   5712 
   5713     # if (lparen->kind != TOK_LPAREN) fatal
            # (unlike expand_local_into_pool, the tight flag is not checked here)
   5714     ld_a3,t0,0
   5715     li_a2 TOK_LPAREN
   5716     la_br &err_bad_macro_header
   5717     bne_a3,a2
   5718 
   5719     # parse_args(lparen, stream->end)
   5720     mov_a0,t0
   5721     la_a2 &ebc_stream
   5722     ld_a2,a2,0
   5723     ld_a1,a2,8           # stream->end
   5724     la_br &parse_args
   5725     call
   5726 
   5727     # snapshot call_end_pos -> ebc_call_end_pos
   5728     la_a0 &call_end_pos
   5729     ld_t0,a0,0
   5730     la_a1 &ebc_call_end_pos
   5731     st_t0,a1,0
   5732 
   5733     # dispatch on builtin_tok->text. a1 (builtin_tok) is gone after parse_args,
   5734     # but stream->pos still points at the builtin token (we don't advance it
   5735     # until the dispatched branch sets stream->pos = call_end_pos), so reload
   5736     # builtin_tok from stream->pos.
   5737     la_a0 &ebc_stream
   5738     ld_a0,a0,0
   5739     ld_t0,a0,16          # stream->pos -> builtin_tok
   5740 
            # Each test below is tok_eq_const(tok, const_ptr, const_len); a
            # non-zero return takes the branch. The token is re-derived from
            # ebc_stream before every call instead of being held in a register.
   5741     # if tok_eq_const(tok, "!", 1) -> bytes=1
   5742     mov_a0,t0
   5743     la_a1 &const_bang
   5744     li_a2 %1 %0
   5745     la_br &tok_eq_const
   5746     call
   5747     la_br &ebc_arg_set_1
   5748     bnez_a0
   5749 
   5750     # if tok_eq_const(tok, "@", 1) -> bytes=2
   5751     la_a0 &ebc_stream
   5752     ld_a0,a0,0
   5753     ld_a0,a0,16
   5754     la_a1 &const_at
   5755     li_a2 %1 %0
   5756     la_br &tok_eq_const
   5757     call
   5758     la_br &ebc_arg_set_2
   5759     bnez_a0
   5760 
   5761     # if tok_eq_const(tok, "%", 1) -> bytes=4
   5762     la_a0 &ebc_stream
   5763     ld_a0,a0,0
   5764     ld_a0,a0,16
   5765     la_a1 &const_pct
   5766     li_a2 %1 %0
   5767     la_br &tok_eq_const
   5768     call
   5769     la_br &ebc_arg_set_4
   5770     bnez_a0
   5771 
   5772     # if tok_eq_const(tok, "$", 1) -> bytes=8
   5773     la_a0 &ebc_stream
   5774     ld_a0,a0,0
   5775     ld_a0,a0,16
   5776     la_a1 &const_dlr
   5777     li_a2 %1 %0
   5778     la_br &tok_eq_const
   5779     call
   5780     la_br &ebc_arg_set_8
   5781     bnez_a0
   5782 
   5783     # if tok_eq_const(tok, "%select", 7) -> select path
   5784     la_a0 &ebc_stream
   5785     ld_a0,a0,0
   5786     ld_a0,a0,16
   5787     la_a1 &const_select
   5788     li_a2 %7 %0
   5789     la_br &tok_eq_const
   5790     call
   5791     la_br &ebc_select
   5792     bnez_a0
   5793 
   5794     # if tok_eq_const(tok, "%str", 4) -> str path
   5795     la_a0 &ebc_stream
   5796     ld_a0,a0,0
   5797     ld_a0,a0,16
   5798     la_a1 &const_str
   5799     li_a2 %4 %0
   5800     la_br &tok_eq_const
   5801     call
   5802     la_br &ebc_str
   5803     bnez_a0
   5804 
   5805     # if tok_eq_const(tok, "%local", 6) -> local path
   5806     la_a0 &ebc_stream
   5807     ld_a0,a0,0
   5808     ld_a0,a0,16
   5809     la_a1 &const_local
   5810     li_a2 %6 %0
   5811     la_br &tok_eq_const
   5812     call
   5813     la_br &ebc_local
   5814     bnez_a0
   5815 
   5816     # else: fatal
            # (no builtin name matched)
   5817     la_br &err_bad_macro_header
   5818     b
   5819 
   5820 :ebc_arg_set_1
            # The four ebc_arg_set_N stubs record the output width in bytes
            # (1 / 2 / 4 / 8 for "!" / "@" / "%" / "$") in ebc_bytes and then
            # join the shared ebc_arg_path.
   5821     li_a0 %1 %0
   5822     la_a1 &ebc_bytes
   5823     st_a0,a1,0
   5824     la_br &ebc_arg_path
   5825     b
   5826 :ebc_arg_set_2
   5827     li_a0 %2 %0
   5828     la_a1 &ebc_bytes
   5829     st_a0,a1,0
   5830     la_br &ebc_arg_path
   5831     b
   5832 :ebc_arg_set_4
   5833     li_a0 %4 %0
   5834     la_a1 &ebc_bytes
   5835     st_a0,a1,0
   5836     la_br &ebc_arg_path
   5837     b
   5838 :ebc_arg_set_8
   5839     li_a0 %8 %0
   5840     la_a1 &ebc_bytes
   5841     st_a0,a1,0
   5842     la_br &ebc_arg_path
   5843     b
   5844 
   5845 :ebc_arg_path
            # Shared tail for the hex-emitting builtins: evaluate the single
            # argument as an expression, advance the stream past the call, and
            # emit the value as ebc_bytes bytes of hex.
   5846     # require arg_count == 1
   5847     la_a0 &arg_count
   5848     ld_t0,a0,0
   5849     li_t1 %1 %0
   5850     la_br &err_bad_macro_header
   5851     bne_t0,t1
   5852 
   5853     # snapshot arg_starts[0], arg_ends[0]
            # (copied into ebc_arg0_start / ebc_arg0_end before the evaluator runs)
   5854     la_a0 &arg_starts_ptr
   5855     ld_a0,a0,0
   5856     ld_t0,a0,0
   5857     la_a1 &ebc_arg0_start
   5858     st_t0,a1,0
   5859     la_a0 &arg_ends_ptr
   5860     ld_a0,a0,0
   5861     ld_t0,a0,0
   5862     la_a1 &ebc_arg0_end
   5863     st_t0,a1,0
   5864 
   5865     # value = eval_expr_range(arg0_start, arg0_end)
   5866     la_a0 &ebc_arg0_start
   5867     ld_a0,a0,0
   5868     la_a1 &ebc_arg0_end
   5869     ld_a1,a1,0
   5870     la_br &eval_expr_range
   5871     call
   5872 
   5873     # ebc_value = a0
   5874     la_a1 &ebc_value
   5875     st_a0,a1,0
   5876 
   5877     # stream->pos = ebc_call_end_pos; stream->line_start = 0
            # (pos is stream offset 16, line_start is offset 24)
   5878     la_a0 &ebc_stream
   5879     ld_a0,a0,0
   5880     la_a1 &ebc_call_end_pos
   5881     ld_t0,a1,0
   5882     st_t0,a0,16
   5883     li_t1 %0 %0
   5884     st_t1,a0,24
   5885 
   5886     # emit_hex_value(ebc_value, ebc_bytes)
   5887     la_a0 &ebc_value
   5888     ld_a0,a0,0
   5889     la_a1 &ebc_bytes
   5890     ld_a1,a1,0
   5891     la_br &emit_hex_value
   5892     call
   5893 
   5894     eret
   5895 
   5896 :ebc_select
            # %select(cond, then, else): evaluate cond as an expression, then
            # replay only the chosen argument's tokens as a new stream. The
            # other argument's tokens are never read.
   5897     # require arg_count == 3
   5898     la_a0 &arg_count
   5899     ld_t0,a0,0
   5900     li_t1 %3 %0
   5901     la_br &err_bad_macro_header
   5902     bne_t0,t1
   5903 
   5904     # snapshot arg_starts[0..2] / arg_ends[0..2]
            # (all six pointers are copied to ebc_* globals before
            # eval_expr_range is called; entries are 8 bytes apart)
   5905     la_a0 &arg_starts_ptr
   5906     ld_a0,a0,0
   5907     ld_t0,a0,0
   5908     la_a1 &ebc_arg0_start
   5909     st_t0,a1,0
   5910     la_a0 &arg_starts_ptr
   5911     ld_a0,a0,0
   5912     ld_t0,a0,8
   5913     la_a1 &ebc_then_start
   5914     st_t0,a1,0
   5915     la_a0 &arg_starts_ptr
   5916     ld_a0,a0,0
   5917     ld_t0,a0,16
   5918     la_a1 &ebc_else_start
   5919     st_t0,a1,0
   5920 
   5921     la_a0 &arg_ends_ptr
   5922     ld_a0,a0,0
   5923     ld_t0,a0,0
   5924     la_a1 &ebc_arg0_end
   5925     st_t0,a1,0
   5926     la_a0 &arg_ends_ptr
   5927     ld_a0,a0,0
   5928     ld_t0,a0,8
   5929     la_a1 &ebc_then_end
   5930     st_t0,a1,0
   5931     la_a0 &arg_ends_ptr
   5932     ld_a0,a0,0
   5933     ld_t0,a0,16
   5934     la_a1 &ebc_else_end
   5935     st_t0,a1,0
   5936 
   5937     # value = eval_expr_range(arg0_start, arg0_end)
   5938     la_a0 &ebc_arg0_start
   5939     ld_a0,a0,0
   5940     la_a1 &ebc_arg0_end
   5941     ld_a1,a1,0
   5942     la_br &eval_expr_range
   5943     call
   5944 
   5945     # if (value != 0) chosen = then; else chosen = else
            # The chosen span overwrites ebc_arg0_start / ebc_arg0_end, which
            # are no longer needed for the condition once it has been evaluated.
   5946     la_br &ebc_select_then
   5947     bnez_a0
   5948 
   5949     # chosen = else
   5950     la_a0 &ebc_else_start
   5951     ld_t0,a0,0
   5952     la_a1 &ebc_arg0_start
   5953     st_t0,a1,0
   5954     la_a0 &ebc_else_end
   5955     ld_t0,a0,0
   5956     la_a1 &ebc_arg0_end
   5957     st_t0,a1,0
   5958     la_br &ebc_select_after_pick
   5959     b
   5960 
   5961 :ebc_select_then
   5962     # chosen = then
   5963     la_a0 &ebc_then_start
   5964     ld_t0,a0,0
   5965     la_a1 &ebc_arg0_start
   5966     st_t0,a1,0
   5967     la_a0 &ebc_then_end
   5968     ld_t0,a0,0
   5969     la_a1 &ebc_arg0_end
   5970     st_t0,a1,0
   5971 
   5972 :ebc_select_after_pick
   5973     # stream->pos = ebc_call_end_pos; stream->line_start = 0
            # (done before the push below, so the calling stream is already
            # positioned after the call when the new stream is pushed)
   5974     la_a0 &ebc_stream
   5975     ld_a0,a0,0
   5976     la_a1 &ebc_call_end_pos
   5977     ld_t0,a1,0
   5978     st_t0,a0,16
   5979     li_t1 %0 %0
   5980     st_t1,a0,24
   5981 
   5982     # if (chosen_start == chosen_end) return
            # (empty chosen argument: nothing to expand, no stream is pushed)
   5983     la_a0 &ebc_arg0_start
   5984     ld_t0,a0,0
   5985     la_a1 &ebc_arg0_end
   5986     ld_t1,a1,0
   5987     la_br &ebc_select_done
   5988     beq_t0,t1
   5989 
   5990     # mark = pool_used
   5991     la_a0 &pool_used
   5992     ld_t0,a0,0
   5993     la_a1 &ebc_mark
   5994     st_t0,a1,0
   5995 
   5996     # copy_span_to_pool(chosen_start, chosen_end)
   5997     la_a0 &ebc_arg0_start
   5998     ld_a0,a0,0
   5999     la_a1 &ebc_arg0_end
   6000     ld_a1,a1,0
   6001     la_br &copy_span_to_pool
   6002     call
   6003 
   6004     # push_pool_stream_from_mark(mark)
   6005     la_a0 &ebc_mark
   6006     ld_a0,a0,0
   6007     la_br &push_pool_stream_from_mark
   6008     call
   6009 
   6010 :ebc_select_done
   6011     eret
   6012 
   6013 ## %str(IDENT): stringify a single WORD argument into a TOK_STRING literal.
   6014 ## Validation: arg_count == 1, arg span length == 1 token, and that token's
   6015 ## kind is TOK_WORD. Output: a freshly-allocated text span built as
   6016 ## `"` + arg.text + `"` (len = arg.text.len + 2) and a synthesized TOK_STRING
   6017 ## pointing at it. Stream pos advances to call_end_pos; line_start = 0.
        ##
        ## Errors: every validation failure branches to err_bad_macro_header; an
        ## out_len above 256 (the scratch cap checked below) branches to
        ## err_text_overflow.
        ## Token fields as accessed here: +0 kind, +8 text.ptr, +16 text.len,
        ## +24 tight. Stream fields as accessed here: +16 pos, +24 line_start.
        ## NOTE(review): this label has no enter_0 of its own yet ends in eret, so
        ## it presumably runs under a frame opened by the routine that dispatches
        ## to it (not visible in this part of the file) -- confirm.
   6018 :ebc_str
   6019     # require arg_count == 1
   6020     la_a0 &arg_count
   6021     ld_t0,a0,0                     # t0 = arg_count
   6022     li_t1 %1 %0
   6023     la_br &err_bad_macro_header
   6024     bne_t0,t1                      # arg_count != 1 -> fatal
   6025 
   6026     # snapshot arg_starts[0] / arg_ends[0]
   6027     la_a0 &arg_starts_ptr
   6028     ld_a0,a0,0                     # a0 = arg_starts base
   6029     ld_t0,a0,0                     # t0 = arg_starts[0]
   6030     la_a1 &ebc_arg0_start
   6031     st_t0,a1,0
   6032     la_a0 &arg_ends_ptr
   6033     ld_a0,a0,0                     # a0 = arg_ends base
   6034     ld_t0,a0,0                     # t0 = arg_ends[0]
   6035     la_a1 &ebc_arg0_end
   6036     st_t0,a1,0
   6037 
   6038     # require arg0_end - arg0_start == 32 (exactly one token)
   6039     la_a0 &ebc_arg0_start
   6040     ld_t0,a0,0                     # t0 = arg_tok (kept live until the len load)
   6041     la_a1 &ebc_arg0_end
   6042     ld_t1,a1,0
   6043     sub_t2,t1,t0                   # t2 = span size in bytes
   6044     li_a2 %32 %0
   6045     la_br &err_bad_macro_header
   6046     bne_t2,a2
   6047 
   6048     # require arg_tok->kind == TOK_WORD
   6049     ld_a3,t0,0                     # a3 = arg_tok->kind
   6050     li_a2 TOK_WORD
   6051     la_br &err_bad_macro_header
   6052     bne_a3,a2
   6053 
   6054     # orig_len = arg_tok->text.len; out_len = orig_len + 2
   6055     # fatal if out_len > 256 (scratch cap; text_buf cap checked by append_text)
   6056     ld_t1,t0,16                    # t1 = orig_len
   6057     la_a0 &ebc_str_orig_len
   6058     st_t1,a0,0
   6059     addi_t2,t1,2                   # t2 = out_len (room for both quotes)
   6060     la_a0 &ebc_str_out_len
   6061     st_t2,a0,0
   6062     li_a1 %256 %0
   6063     la_br &err_text_overflow
   6064     blt_a1,t2                      # taken when 256 < out_len
   6065 
   6066     # scratch[0] = '"'
   6067     la_t2 &ebc_str_scratch_ptr
   6068     ld_t2,t2,0                     # t2 = scratch base (out_len is already spilled)
   6069     li_a3 %34 %0
   6070     sb_a3,t2,0
   6071 
   6072     # copy arg_tok->text bytes into scratch[1..1+orig_len)
   6073     #   src = arg_tok->text.ptr; i = 0
   6074     la_a0 &ebc_arg0_start
   6075     ld_a0,a0,0
   6076     ld_t0,a0,8                     # t0 = src
   6077     la_a1 &ebc_str_orig_len
   6078     ld_t1,a1,0                     # t1 = orig_len
   6079     li_a0 %0 %0                    # a0 = i
   6080 :ebc_str_copy_loop
   6081     la_br &ebc_str_copy_done
   6082     beq_a0,t1                      # i == orig_len -> done
   6083     add_a1,t0,a0
   6084     lb_a1,a1,0                     # a1 = src[i]
   6085     addi_a2,a0,1
   6086     add_a2,t2,a2                   # a2 = &scratch[i + 1]
   6087     sb_a1,a2,0
   6088     addi_a0,a0,1
   6089     la_br &ebc_str_copy_loop
   6090     b
   6091 :ebc_str_copy_done
   6092 
   6093     # scratch[1 + orig_len] = '"'
   6094     la_t2 &ebc_str_scratch_ptr
   6095     ld_t2,t2,0
   6096     la_a1 &ebc_str_orig_len
   6097     ld_a1,a1,0
   6098     addi_a1,a1,1
   6099     add_a0,t2,a1                   # a0 = &scratch[1 + orig_len]
   6100     li_a3 %34 %0
   6101     sb_a3,a0,0
   6102 
   6103     # text_ptr = append_text(&scratch, out_len)
   6104     la_a0 &ebc_str_scratch_ptr
   6105     ld_a0,a0,0
   6106     la_a1 &ebc_str_out_len
   6107     ld_a1,a1,0
   6108     la_br &append_text
   6109     call                           # result (text_ptr) is used from a0 below
   6110 
   6111     # ebc_str_token = { TOK_STRING, text_ptr, out_len, tight=0 }
   6112     la_a2 &ebc_str_token
   6113     li_a3 TOK_STRING
   6114     st_a3,a2,0                     # kind
   6115     st_a0,a2,8                     # text.ptr
   6116     la_a1 &ebc_str_out_len
   6117     ld_a1,a1,0                     # out_len is reloaded from its spill slot
   6118     st_a1,a2,16                    # text.len
   6119     li_a1 %0 %0
   6120     st_a1,a2,24                    # tight
   6121 
   6122     # stream->pos = ebc_call_end_pos; stream->line_start = 0
   6123     la_a0 &ebc_stream
   6124     ld_a0,a0,0
   6125     la_a1 &ebc_call_end_pos
   6126     ld_t0,a1,0
   6127     st_t0,a0,16
   6128     li_t1 %0 %0
   6129     st_t1,a0,24
   6130 
   6131     # emit_token(&ebc_str_token)
   6132     la_a0 &ebc_str_token
   6133     la_br &emit_token
   6134     call
   6135 
   6136     eret
   6137 
   6138 ## emit_string_as_bytes(a0=tok_ptr) -> void (fatal on bad escape).
   6139 ## Decode the contents of a TOK_STRING (between the surrounding quotes)
   6140 ## and emit each byte as one TOK_WORD via emit_hex_value(byte, 1). The
   6141 ## lexer accepts both "..." and '...'; this routine just strips the
   6142 ## first/last byte of text and decodes the middle. Recognised escapes:
   6143 ##   \n \t \r \0 \\ \"     and \xNN (two hex digits, case-insensitive).
   6144 ## hex2pp's parse_byte_stream coalesces the resulting space-separated
   6145 ## hex bytes back into a contiguous byte sequence at link time.
        ##
        ## Every non-backslash byte is emitted verbatim. err_bad_escape is reached
        ## when the text is shorter than 2 bytes, a backslash is the last byte,
        ## the escape letter is not in the list above (so \' is rejected), fewer
        ## than two bytes follow \x, or either \x digit maps to 0xFF in
        ## hex_digit_table.
        ## Cursor state lives in ebc_b_src_ptr / ebc_b_src_len / ebc_b_src_i and
        ## is reloaded from those slots at the top of every iteration (each
        ## iteration ends in an emit_hex_value call); ebc_b_hex_hi holds the high
        ## nibble while the low one is decoded.
   6146 :emit_string_as_bytes
   6147     enter_0
   6148 
   6149     # require tok->text.len >= 2
   6150     ld_a1,a0,16                    # a1 = tok->text.len
   6151     li_a2 %2 %0
   6152     la_br &err_bad_escape
   6153     blt_a1,a2
   6154 
   6155     # src = tok->text.ptr + 1; src_len = tok->text.len - 2
   6156     ld_a3,a0,8
   6157     addi_a3,a3,1                   # skip the opening quote
   6158     la_a0 &ebc_b_src_ptr
   6159     st_a3,a0,0
   6160     addi_a1,a1,neg2                # drop both quotes from the length
   6161     la_a0 &ebc_b_src_len
   6162     st_a1,a0,0
   6163 
   6164     # ebc_b_src_i = 0
   6165     li_a0 %0 %0
   6166     la_a1 &ebc_b_src_i
   6167     st_a0,a1,0
   6168 
   6169 :ebc_b_loop
   6170     # if (src_i == src_len) done
   6171     la_a0 &ebc_b_src_i
   6172     ld_t0,a0,0                     # t0 = src_i
   6173     la_a1 &ebc_b_src_len
   6174     ld_t1,a1,0                     # t1 = src_len
   6175     la_br &ebc_b_done
   6176     beq_t0,t1
   6177 
   6178     # c = src_ptr[src_i];  src_i++
   6179     # P1 lacks lb_a3,a0,0 — bounce through a1 (mov_a1,a0; lb_a3,a1,0).
   6180     la_a0 &ebc_b_src_ptr
   6181     ld_a0,a0,0
   6182     add_a0,a0,t0
   6183     mov_a1,a0
   6184     lb_a3,a1,0                     # a3 = c
   6185     addi_t0,t0,1
   6186     la_a1 &ebc_b_src_i
   6187     st_t0,a1,0                     # src_i is committed before any call
   6188 
   6189     # if (c == '\\') -> escape path
   6190     li_a2 %92 %0
   6191     la_br &ebc_b_escape
   6192     beq_a3,a2
   6193 
   6194     # literal byte: emit_hex_value(c, 1) and reloop
   6195     mov_a0,a3
   6196     li_a1 %1 %0
   6197     la_br &emit_hex_value
   6198     call
   6199     la_br &ebc_b_loop
   6200     b
   6201 
   6202 :ebc_b_escape
   6203     # Read the escape character; require at least one byte left.
   6204     la_a0 &ebc_b_src_i
   6205     ld_t0,a0,0
   6206     la_a1 &ebc_b_src_len
   6207     ld_t1,a1,0
   6208     la_br &err_bad_escape
   6209     beq_t0,t1                      # backslash was the final byte
   6210     la_a0 &ebc_b_src_ptr
   6211     ld_a0,a0,0
   6212     add_a0,a0,t0
   6213     mov_a1,a0
   6214     lb_a3,a1,0                     # a3 = e
   6215     addi_t0,t0,1
   6216     la_a1 &ebc_b_src_i
   6217     st_t0,a1,0
   6218 
   6219     # Single-char escapes: dispatch via beq chain (matches the existing
   6220     # proc_check_<directive> pattern). Each target loads the resulting
   6221     # byte into a3 and branches to ebc_b_emit_one; \x goes to ebc_b_esc_hex
            # and any other escape letter ends at err_bad_escape.
   6222     li_a2 %110 %0                  # 'n'
   6223     la_br &ebc_b_esc_n
   6224     beq_a3,a2
   6225     li_a2 %116 %0                  # 't'
   6226     la_br &ebc_b_esc_t
   6227     beq_a3,a2
   6228     li_a2 %114 %0                  # 'r'
   6229     la_br &ebc_b_esc_r
   6230     beq_a3,a2
   6231     li_a2 %48 %0                   # '0'
   6232     la_br &ebc_b_esc_zero
   6233     beq_a3,a2
   6234     li_a2 %92 %0                   # '\\'
   6235     la_br &ebc_b_esc_bs
   6236     beq_a3,a2
   6237     li_a2 %34 %0                   # '"'
   6238     la_br &ebc_b_esc_dq
   6239     beq_a3,a2
   6240     li_a2 %120 %0                  # 'x'
   6241     la_br &ebc_b_esc_hex
   6242     beq_a3,a2
   6243     la_br &err_bad_escape
   6244     b
   6245 
   6246 :ebc_b_esc_n
   6247     li_a3 %10 %0                   # 0x0A
   6248     la_br &ebc_b_emit_one
   6249     b
   6250 :ebc_b_esc_t
   6251     li_a3 %9 %0                    # 0x09
   6252     la_br &ebc_b_emit_one
   6253     b
   6254 :ebc_b_esc_r
   6255     li_a3 %13 %0                   # 0x0D
   6256     la_br &ebc_b_emit_one
   6257     b
   6258 :ebc_b_esc_zero
   6259     li_a3 %0 %0                    # 0x00
   6260     la_br &ebc_b_emit_one
   6261     b
   6262 :ebc_b_esc_bs
   6263     li_a3 %92 %0                   # 0x5C
   6264     la_br &ebc_b_emit_one
   6265     b
   6266 :ebc_b_esc_dq
   6267     li_a3 %34 %0                   # 0x22
   6268     la_br &ebc_b_emit_one
   6269     b
   6270 
   6271 :ebc_b_emit_one
   6272     # Common tail for single-char escapes: emit_hex_value(a3, 1), reloop.
            # The \xNN path also lands here with the combined byte in a3.
   6273     mov_a0,a3
   6274     li_a1 %1 %0
   6275     la_br &emit_hex_value
   6276     call
   6277     la_br &ebc_b_loop
   6278     b
   6279 
   6280 :ebc_b_esc_hex
   6281     # \xNN: require two hex chars at src[src_i], src[src_i+1].
   6282     la_a0 &ebc_b_src_i
   6283     ld_t0,a0,0
   6284     la_a1 &ebc_b_src_len
   6285     ld_t1,a1,0
   6286     sub_t2,t1,t0                   # remaining = src_len - src_i
   6287     li_a3 %2 %0
   6288     la_br &err_bad_escape
   6289     blt_t2,a3                      # fewer than two bytes left
   6290 
   6291     # hi char: src[src_i]; decode via hex_digit_table[c]; fail if 0xFF.
   6292     la_a0 &ebc_b_src_ptr
   6293     ld_a0,a0,0
   6294     add_a0,a0,t0
   6295     lb_a0,a0,0                     # a0 = hi char
   6296     la_a1 &hex_digit_table
   6297     add_a1,a1,a0
   6298     lb_a2,a1,0                     # a2 = hi digit (or 0xFF)
   6299     li_a3 %255 %0
   6300     la_br &err_bad_escape
   6301     beq_a2,a3
   6302     # Stash hi digit into ebc_b_hex_hi for the (hi << 4) | lo combine
   6303     # below — the lo-digit lookup clobbers a2.
   6304     la_a0 &ebc_b_hex_hi
   6305     st_a2,a0,0
   6306 
   6307     # advance past hi char
   6308     la_a0 &ebc_b_src_i
   6309     ld_t0,a0,0
   6310     addi_t0,t0,1
   6311     st_t0,a0,0
   6312 
   6313     # lo char: src[src_i]; decode via hex_digit_table[c]; fail if 0xFF.
   6314     la_a0 &ebc_b_src_ptr
   6315     ld_a0,a0,0
   6316     add_a0,a0,t0
   6317     lb_a0,a0,0                     # a0 = lo char
   6318     la_a1 &hex_digit_table
   6319     add_a1,a1,a0
   6320     lb_a2,a1,0                     # a2 = lo digit (or 0xFF)
   6321     li_a3 %255 %0
   6322     la_br &err_bad_escape
   6323     beq_a2,a3
   6324 
   6325     # advance past lo char
            # (only a0 and t0 are written here, so the lo digit in a2 survives)
   6326     la_a0 &ebc_b_src_i
   6327     ld_t0,a0,0
   6328     addi_t0,t0,1
   6329     st_t0,a0,0
   6330 
   6331     # byte = (hi << 4) | lo. shli_a3,t0,4 puts hi<<4 in a3, then or.
   6332     la_a0 &ebc_b_hex_hi
   6333     ld_t0,a0,0
   6334     shli_a3,t0,4
   6335     or_a3,a3,a2
   6336 
   6337     la_br &ebc_b_emit_one
   6338     b
   6339 
   6340 :ebc_b_done
   6341     eret
   6342 
   6343 ## %local(NAME): emit-time variant. expand_builtin_call has already
   6344 ## parse_args'd the call (so arg_starts/arg_ends/arg_count/call_end_pos
   6345 ## are set), but expand_local_into_pool re-parses internally so it can
   6346 ## also be invoked from eval_expr_atom where parse_args wasn't called.
   6347 ## After the helper returns, advance the stream past the call and push
   6348 ## the body slice as a fresh stream for rescan.
        ##
        ## Inputs: ebc_stream (pointer to the current stream record). Results are
        ## picked up from the elp_after / elp_mark slots rather than from a
        ## return register. Stream fields as accessed here: +8 end, +16 pos,
        ## +24 line_start.
        ## NOTE(review): like ebc_str, this label has no enter_0 of its own and
        ## ends in eret, so it presumably shares its dispatcher's frame.
   6349 :ebc_local
   6350     # call_tok = stream->pos; limit = stream->end
   6351     la_a0 &ebc_stream
   6352     ld_a0,a0,0
   6353     ld_t0,a0,16                    # call_tok
   6354     ld_t1,a0,8                     # limit
   6355     mov_a0,t0                      # arg 0 = call_tok
   6356     mov_a1,t1                      # arg 1 = limit
   6357     la_br &expand_local_into_pool
   6358     call
   6359 
   6360     # stream->pos = elp_after; stream->line_start = 0
   6361     la_a0 &ebc_stream
   6362     ld_a0,a0,0                     # stream pointer reloaded after the call
   6363     la_a1 &elp_after
   6364     ld_t0,a1,0
   6365     st_t0,a0,16
   6366     li_t1 %0 %0
   6367     st_t1,a0,24
   6368 
   6369     # push_pool_stream_from_mark(elp_mark)
   6370     la_a0 &elp_mark
   6371     ld_a0,a0,0
   6372     la_br &push_pool_stream_from_mark
   6373     call
   6374 
   6375     eret
   6376 
   6377 ## --- Error paths -------------------------------------------------------------
   6378 ## Each err_* stub loads the address of its NUL-terminated msg_* string into
   6379 ## a0 and branches (not calls) to fatal, which measures the string itself,
   6380 ## reports it on stderr and exits 1, so control never comes back to the
        ## code that branched here. The stubs are the targets of the validation and
        ## overflow checks in the routines above (e.g. err_bad_macro_header and
        ## err_text_overflow from ebc_str, err_bad_escape from
        ## emit_string_as_bytes).
   6381 
   6382 :err_usage
   6383     la_a0 &msg_usage
   6384     la_br &fatal
   6385     b
   6386 :err_open_input
   6387     la_a0 &msg_open_input
   6388     la_br &fatal
   6389     b
   6390 :err_read
   6391     la_a0 &msg_read
   6392     la_br &fatal
   6393     b
   6394 :err_input_too_big
   6395     la_a0 &msg_input_too_big
   6396     la_br &fatal
   6397     b
   6398 :err_open_output
   6399     la_a0 &msg_open_output
   6400     la_br &fatal
   6401     b
   6402 :err_write
   6403     la_a0 &msg_write
   6404     la_br &fatal
   6405     b
   6406 :err_text_overflow
   6407     la_a0 &msg_text_overflow
   6408     la_br &fatal
   6409     b
   6410 :err_token_overflow
   6411     la_a0 &msg_token_overflow
   6412     la_br &fatal
   6413     b
   6414 :err_output_overflow
   6415     la_a0 &msg_output_overflow
   6416     la_br &fatal
   6417     b
   6418 :err_unterminated_macro
   6419     la_a0 &msg_unterminated_macro
   6420     la_br &fatal
   6421     b
   6422 :err_bad_macro_header              # also the target of the %str argument checks in ebc_str
   6423     la_a0 &msg_bad_macro_header
   6424     la_br &fatal
   6425     b
   6426 :err_too_many_macros
   6427     la_a0 &msg_too_many_macros
   6428     la_br &fatal
   6429     b
   6430 :err_macro_body_overflow
   6431     la_a0 &msg_macro_body_overflow
   6432     la_br &fatal
   6433     b
   6434 :err_unbalanced_braces
   6435     la_a0 &msg_unbalanced_braces
   6436     la_br &fatal
   6437     b
   6438 :err_bad_directive
   6439     la_a0 &msg_bad_directive
   6440     la_br &fatal
   6441     b
   6442 :err_unterminated_directive
   6443     la_a0 &msg_unterminated_directive
   6444     la_br &fatal
   6445     b
   6446 :err_bad_escape
   6447     la_a0 &msg_bad_escape
   6448     la_br &fatal
   6449     b
   6450 :err_bad_frame_header
   6451     la_a0 &msg_bad_frame_header
   6452     la_br &fatal
   6453     b
   6454 :err_frame_already_active
   6455     la_a0 &msg_frame_already_active
   6456     la_br &fatal
   6457     b
   6458 :err_frame_underflow
   6459     la_a0 &msg_frame_underflow
   6460     la_br &fatal
   6461     b
   6462 :err_frame_not_closed
   6463     la_a0 &msg_frame_not_closed
   6464     la_br &fatal
   6465     b
   6466 :err_local_outside_frame
   6467     la_a0 &msg_local_outside_frame
   6468     la_br &fatal
   6469     b
   6470 :err_unknown_local
   6471     la_a0 &msg_unknown_local
   6472     la_br &fatal
   6473     b
   6474 :err_local_name_too_long
   6475     la_a0 &msg_local_name_too_long
   6476     la_br &fatal
   6477     b
   6478 
   6479 ## fatal(a0=msg_ptr): writes "m1pp: <msg>\n" to stderr and exits 1.
   6480 ## Length is computed inline via a strlen loop (messages are NUL-terminated).
   6481 ## Reached by unconditional branch from any err_* stub, so no frame is required.
        ##
        ## In:   a0 = pointer to a NUL-terminated message (one of the msg_* strings).
        ## Out:  does not return; the last syscall is exit with status 1.
        ## Uses: a0..a3, t0, t1 and the err_saved_msg / err_saved_len slots, which
        ##       carry the pointer and length across the first write syscall.
        ## Syscall convention as used here: number in a0, arguments in a1..a3.
        ## The results of the write syscalls are not inspected.
   6482 :fatal
   6483     # Stash msg_ptr; compute len inline into err_saved_len.
   6484     la_a1 &err_saved_msg
   6485     st_a0,a1,0
   6486     li_t0 %0 %0                    # t0 = running length
   6487 :fatal_strlen
   6488     add_t1,a0,t0
   6489     lb_t1,t1,0                     # t1 = msg[t0]
   6490     la_br &fatal_strlen_done
   6491     beqz_t1                        # stop at the terminating NUL
   6492     addi_t0,t0,1
   6493     la_br &fatal_strlen
   6494     b
   6495 :fatal_strlen_done
   6496     la_a1 &err_saved_len
   6497     st_t0,a1,0
   6498 
   6499     # write(2, "m1pp: ", 6) -- msg_prefix is six bytes including its trailing
            # space; passing 5 here used to drop the space and glue the prefix
            # onto the message ("m1pp:<msg>").
   6500     li_a0 sys_write
   6501     li_a1 %2 %0
   6502     la_a2 &msg_prefix
   6503     li_a3 %6 %0
   6504     syscall
   6505 
   6506     # write(2, msg, len)
   6507     la_a0 &err_saved_msg
   6508     ld_a2,a0,0
   6509     la_a0 &err_saved_len
   6510     ld_a3,a0,0
   6511     li_a0 sys_write
   6512     li_a1 %2 %0
   6513     syscall
   6514 
   6515     # write(2, "\n", 1)
   6516     li_a0 sys_write
   6517     li_a1 %2 %0
   6518     la_a2 &msg_newline
   6519     li_a3 %1 %0
   6520     syscall
   6521 
   6522     # exit(1)
   6523     li_a0 sys_exit
   6524     li_a1 %1 %0
   6525     syscall
   6526 
   6527 ## Sentinel: marks the boundary between executable text and rodata. Read by
   6528 ## scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound disassembly
   6529 ## so trailing strings don't decode as bogus instructions.
   6530 :_text_end
   6531 
   6532 ## --- Rodata: const tokens (for tok_eq_const) and fatal messages --------------
        ## The const_* literals below carry no trailing '00' (contrast the msg_*
        ## strings further down).
        ## NOTE(review): presumably each use site passes the literal's length to
        ## tok_eq_const explicitly, as the operator-string note below says for the
        ## op_* literals -- confirm at the call sites.
   6533 
   6534 :const_macro "%macro"
   6535 :const_endm "%endm"
   6536 :const_paste "##"
   6537 :const_lparen "("
   6538 :const_rparen ")"
   6539 :const_comma ","
   6540 :const_lbrace "{"
   6541 :const_rbrace "}"
   6542 :const_bang "!"
   6543 :const_at "@"
   6544 :const_pct "%"
   6545 :const_dlr "$"
   6546 :const_select "%select"
   6547 :const_str "%str"
   6548 :const_struct "%struct"
   6549 :const_enum "%enum"
   6550 :const_size "SIZE"
   6551 :const_count "COUNT"
   6552 :const_frame "%frame"
   6553 :const_endframe "%endframe"
   6554 :const_local "%local"
   6555 :const_bytes "%bytes"
   6556 ## Suffix appended to the frame name when looking up <frame>_FRAME.<field>.
   6557 :const_frame_suffix "_FRAME."
   6558 
   6559 ## Operator strings for expr_op_code. Each is a raw byte literal; lengths
   6560 ## are passed separately to tok_eq_const. "<=" must be tested before "<"
   6561 ## so the longer match wins; same for ">=" before ">".
        ## That ordering is a requirement on the code that tests these literals;
        ## the order of the definitions in this list does not encode it (op_lt is
        ## defined before op_le here). Equality is the single byte "=" (op_eq).
   6562 :op_plus "+"
   6563 :op_minus "-"
   6564 :op_star "*"
   6565 :op_slash "/"
   6566 :op_percent "%"
   6567 :op_shl "<<"
   6568 :op_shr ">>"
   6569 :op_amp "&"
   6570 :op_bar "|"
   6571 :op_caret "^"
   6572 :op_tilde "~"
   6573 :op_eq "="
   6574 :op_ne "!="
   6575 :op_lt "<"
   6576 :op_le "<="
   6577 :op_gt ">"
   6578 :op_ge ">="
   6579 :op_strlen "strlen"
   6580 
   6581 ## Nibble-to-hex lookup table for emit_hex_value.
        ## Sixteen ASCII characters, uppercase for A-F; the character at index n is
        ## the hex digit for nibble value n.
   6582 :hex_chars "0123456789ABCDEF"
   6583 
   6584 ## 256-byte hex-digit lookup table for %bytes(\xNN). Indexed by source
   6585 ## byte; value is the digit (0..15) for '0'..'9'/'a'..'f'/'A'..'F', or
   6586 ## 0xFF for any other input. The escape decoder reads two source bytes
   6587 ## and combines (hi << 4) | lo into the emitted byte; either lookup
   6588 ## returning 0xFF triggers err_bad_escape.
        ## Laid out as 16 rows of 16 bytes (32 hex characters per row).
        ## The decoder (ebc_b_esc_hex in emit_string_as_bytes) adds the raw source
        ## byte to this label, loads with lb, and compares the result with 255.
        ## NOTE(review): that comparison, and indexing with bytes >= 0x80, only
        ## work if lb zero-extends -- confirm against the P1 definition of lb.
   6589 :hex_digit_table
   6590 ## 0x00-0x1F: all invalid
   6591 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6592 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6593 ## 0x20-0x2F: invalid
   6594 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6595 ## 0x30-0x39 = '0'..'9' -> 0..9; 0x3A-0x3F invalid
   6596 '00010203040506070809FFFFFFFFFFFF'
   6597 ## 0x40 invalid; 0x41-0x46 = 'A'..'F' -> 10..15; 0x47-0x5F invalid
   6598 'FF0A0B0C0D0E0FFFFFFFFFFFFFFFFFFF'
   6599 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6600 ## 0x60 invalid; 0x61-0x66 = 'a'..'f' -> 10..15; 0x67-0x7F invalid
   6601 'FF0A0B0C0D0E0FFFFFFFFFFFFFFFFFFF'
   6602 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6603 ## 0x80-0xFF: all invalid
   6604 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6605 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6606 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6607 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6608 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6609 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6610 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6611 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
   6612 
   6613 ## 256-byte char-class table for lex_loop / lex_word_scan. Indexed by the
   6614 ## source byte `c`; value is the class code dispatched by lex_loop:
   6615 ##   0  WORD (default; word_scan continues through this byte)
   6616 ##   1  SKIP (non-newline whitespace: 0x09 tab, 0x0B-0x0D vt/ff/cr, 0x20 sp)
   6617 ##   2  NEWLINE (0x0A)
   6618 ##   3  STRING (0x22 ", 0x27 ')
   6619 ##   4  HASH (0x23 #)
   6620 ##   5  COMMENT (0x3B ;)
   6621 ##   6  LPAREN (0x28 ()
   6622 ##   7  RPAREN (0x29 ))
   6623 ##   8  COMMA  (0x2C ,)
   6624 ##   9  LBRACE (0x7B {)
   6625 ##  10  RBRACE (0x7D })
   6626 ##  11  NUL (0x00 — lex_loop fall-through to lex_done)
        ## Laid out as 16 rows of 16 bytes (32 hex characters per row); the class
        ## codes appear in the rows as hex, so class 10 is written 0A and class 11
        ## is written 0B.
   6627 :lex_char_class
   6628 ## bytes 0x00-0x1F: NUL=11, \t=1, \n=2, \v/\f/\r=1, rest=0
   6629 '0B000000000000000001020101010000'
   6630 '00000000000000000000000000000000'
   6631 ## bytes 0x20-0x3F: sp=1, "=3, #=4, '=3, (=6, )=7, ,=8, ;=5
   6632 '01000304000000030607000008000000'
   6633 '00000000000000000000000500000000'
   6634 ## bytes 0x40-0x7F: {=9 (0x7B), }=10 (0x7D)
   6635 '00000000000000000000000000000000'
   6636 '00000000000000000000000000000000'
   6637 '00000000000000000000000000000000'
   6638 '000000000000000000000009000A0000'
   6639 ## bytes 0x80-0xFF: all 0 (word)
   6640 '00000000000000000000000000000000'
   6641 '00000000000000000000000000000000'
   6642 '00000000000000000000000000000000'
   6643 '00000000000000000000000000000000'
   6644 '00000000000000000000000000000000'
   6645 '00000000000000000000000000000000'
   6646 '00000000000000000000000000000000'
   6647 '00000000000000000000000000000000'
   6648 
   6649 ## BSS pointer-slot init table (for p1_main's bss_init_loop).
   6650 ## Each entry: 8-byte slot ptr (&label + 4 pad) + 8-byte OFF_* constant.
   6651 ## Walked linearly; order is irrelevant.
        ## 18 entries, bracketed by :bss_init_tbl and :bss_init_tbl_end. Each line
        ## pairs a *_ptr slot with the OFF_* constant of the same buffer name.
        ## NOTE(review): presumably the loop stores (some base + OFF_*) into each
        ## slot; the loop body is not in this part of the file -- confirm there.
        ## Slots consumed by code in this part of the file: arg_starts_ptr,
        ## arg_ends_ptr and ebc_str_scratch_ptr (all read by ebc_str).
   6652 :bss_init_tbl
   6653 &paste_scratch_ptr ZERO4 OFF_paste_scratch
   6654 &local_label_scratch_ptr ZERO4 OFF_local_label_scratch
   6655 &df_name_scratch_ptr ZERO4 OFF_df_name_scratch
   6656 &ebc_str_scratch_ptr ZERO4 OFF_ebc_str_scratch
   6657 &arg_starts_ptr ZERO4 OFF_arg_starts
   6658 &arg_ends_ptr ZERO4 OFF_arg_ends
   6659 &input_buf_ptr ZERO4 OFF_input_buf
   6660 &output_buf_ptr ZERO4 OFF_output_buf
   6661 &text_buf_ptr ZERO4 OFF_text_buf
   6662 &source_tokens_ptr ZERO4 OFF_source_tokens
   6663 &macros_ptr ZERO4 OFF_macros
   6664 &macro_body_tokens_ptr ZERO4 OFF_macro_body_tokens
   6665 &streams_ptr ZERO4 OFF_streams
   6666 &expand_pool_ptr ZERO4 OFF_expand_pool
   6667 &expr_frames_ptr ZERO4 OFF_expr_frames
   6668 &local_lookup_scratch_ptr ZERO4 OFF_local_lookup_scratch
   6669 &macro_body_param_idx_ptr ZERO4 OFF_macro_body_param_idx
   6670 &macro_body_is_local_label_ptr ZERO4 OFF_macro_body_is_local_label
   6671 :bss_init_tbl_end
   6672 
   6673 :msg_prefix "m1pp: "
   6674 :msg_newline "
   6675 "
   6676 ## All err_* messages below are NUL-terminated (trailing '00'); fatal uses an
   6677 ## inline strlen loop rather than a caller-supplied length.
        ## msg_prefix and msg_newline above carry no '00': fatal hands write an
        ## explicit byte count for those two. msg_prefix is six bytes including
        ## its trailing space. msg_newline is a one-byte literal line feed, which
        ## is why its closing quote sits on the following line.
        ## Each msg_<name> is loaded by the err_<name> stub of the same name.
   6678 :msg_usage "usage: m1pp input.M1 output.M1" '00'
   6679 :msg_open_input "failed to open input file" '00'
   6680 :msg_read "failed to read input" '00'
   6681 :msg_input_too_big "input file too large" '00'
   6682 :msg_open_output "failed to open output file" '00'
   6683 :msg_write "failed to write output" '00'
   6684 :msg_text_overflow "text buffer overflow" '00'
   6685 :msg_token_overflow "token buffer overflow" '00'
   6686 :msg_output_overflow "output buffer overflow" '00'
   6687 :msg_unterminated_macro "unterminated %macro definition" '00'
   6688 :msg_bad_macro_header "bad macro header" '00'
   6689 :msg_too_many_macros "too many macros" '00'
   6690 :msg_macro_body_overflow "macro body overflow" '00'
   6691 :msg_unbalanced_braces "unbalanced braces" '00'
   6692 :msg_bad_directive "bad %struct/%enum directive" '00'
   6693 :msg_unterminated_directive "unterminated %struct/%enum directive" '00'
   6694 :msg_bad_escape "bad escape in %bytes" '00'
   6695 :msg_bad_frame_header "bad frame header" '00'
   6696 :msg_frame_already_active "frame already active" '00'
   6697 :msg_frame_underflow "frame underflow" '00'
   6698 :msg_frame_not_closed "frame not closed" '00'
   6699 :msg_local_outside_frame "local outside frame" '00'
   6700 :msg_unknown_local "unknown local" '00'
   6701 :msg_local_name_too_long "local name too long" '00'
   6702 
   6703 ## --- BSS ---------------------------------------------------------------------
   6704 ## Placed before :ELF_end so filesz/memsz (which this ELF header sets equal)
   6705 ## covers the whole zero-initialized region. Bloats the file by the BSS size,
   6706 ## but avoids a custom ELF header.
   6707 ##
   6708 ## Layout: scalars (pointers, counters, lexer/processor state), then the
   6709 ## four arenas — input_buf, output_buf, text_buf, source_tokens — whose
   6710 ## sizes match the CAP constants above.
        ## NOTE(review): bss_init_tbl pairs 18 *_ptr slots with OFF_* constants,
        ## which is more buffers than the four arenas named here, so this layout
        ## note may predate the pointer-slot scheme -- confirm against the arena
        ## definitions.
   6711 
   6712 ## Scalars (each 8 bytes).
   6713 :input_fd
   6714 ZERO8
   6715 :input_len
   6716 ZERO8
   6717 :output_fd
   6718 ZERO8
   6719 :output_used
   6720 ZERO8
   6721 :output_written
   6722 ZERO8
   6723 :output_need_space
   6724 ZERO8
   6725 :input_path                        # per the pipeline notes at the top of the file, p1_main stashes argv[1..2] in input_path / output_path
   6726 ZERO8
   6727 :output_path
   6728 ZERO8
   6729 :text_used
   6730 ZERO8
   6731 :source_end
   6732 ZERO8
   6733 :lex_ptr
   6734 ZERO8
   6735 :lex_start
   6736 ZERO8
   6737 :lex_quote
   6738 ZERO8
   6739 :lex_punct_kind
   6740 ZERO8
   6741 :lex_saw_separator
   6742 ZERO8
   6743 :proc_pos
   6744 ZERO8
   6745 :proc_line_start
   6746 ZERO8
   6747 ## proc_has_paren — set per iteration of proc_loop to 1 when the next
   6748 ## token is a tight TOK_LPAREN (i.e. the current token is a paren-call
   6749 ## form). Read by directive/builtin sub-handlers that gate on paren form
   6750 ## (%select, %str, %bytes, %local, !@$% arith, user-macro paren call).
   6751 :proc_has_paren
   6752 ZERO8
   6753 :macros_end
   6754 ZERO8
   6755 :macro_body_end
   6756 ZERO8
   6757 :def_m_ptr
   6758 ZERO8
   6759 :def_param_ptr
   6760 ZERO8
   6761 :def_body_line_start
   6762 ZERO8
   6763 ## def_body_meta_idx — slot index of the body token currently being copied
   6764 ## by def_body_copy, i.e. (macro_body_end - macro_body_tokens) / 32. Used
   6765 ## as the parallel-array index for macro_body_param_idx[] / _is_local_label[]
   6766 ## across the find_param call, which clobbers caller-saved registers.
   6767 :def_body_meta_idx
   6768 ZERO8
   6769 :pf_stream_end
   6770 ZERO8
   6771 
   6772 ## --- Frame state -------------------------------------------------------------
   6773 ## Single-slot "current frame" used by %local. current_frame_ptr/_len point
   6774 ## into stable text memory (input_buf or text_buf), borrowed from the WORD
   6775 ## token that named the frame. frame_active is 0 / 1.
   6776 ## elp_after / elp_mark are expand_local_into_pool's outputs (mirroring
   6777 ## emt_after_pos / emt_mark for expand_macro_tokens).
        ## ebc_local reads both after its call: elp_after becomes the stream's new
        ## pos and elp_mark is passed to push_pool_stream_from_mark.
        ## err_saved_msg / err_saved_len at the end of this group are not frame
        ## state: they are fatal's spill slots for the message pointer and its
        ## computed length.
   6778 :current_frame_ptr
   6779 ZERO8
   6780 :current_frame_len
   6781 ZERO8
   6782 :frame_active
   6783 ZERO8
   6784 :elp_after
   6785 ZERO8
   6786 :elp_mark
   6787 ZERO8
   6788 :elp_arg_ptr
   6789 ZERO8
   6790 :elp_arg_len
   6791 ZERO8
   6792 :elp_name_len
   6793 ZERO8
   6794 :err_saved_msg                     # fatal: message pointer, kept across the prefix write
   6795 ZERO8
   6796 :err_saved_len                     # fatal: length found by its strlen loop
   6797 ZERO8
   6798 
   6799 ## Stream / pool / arg / expression scalars. Each is one u64 (ZERO8).
   6800 ## pool_used      — byte offset into expand_pool (i.e. next write slot).
   6801 ## stream_top     — stream stack depth in bytes (count × 40; 0 == empty).
   6802 ## arg_count      — number of args produced by the most recent parse_args.
   6803 ## call_end_pos   — Token* one past the ')' of that call.
   6804 ## expr_frame_top — ExprFrame stack depth inside eval_expr_range.
   6805 ## emt_after_pos, emt_mark — expand_macro_tokens output slots (Token* and
   6806 ##                           byte offset into expand_pool).
   6807 ## eval_after_pos, eval_value — eval_expr_atom output slots (Token* and i64).
   6808 ##                              Callers MUST snapshot these before any nested
   6809 ##                              eval_* call that could overwrite them.
        ## In this part of the file, arg_count is the value ebc_str requires to be
        ## exactly 1. The builtin handlers here resume from ebc_call_end_pos, not
        ## from call_end_pos directly.
        ## NOTE(review): ebc_call_end_pos is presumably a snapshot of call_end_pos
        ## taken by the dispatcher -- confirm.
   6810 :pool_used
   6811 ZERO8
   6812 :stream_top
   6813 ZERO8
   6814 :arg_count
   6815 ZERO8
   6816 :call_end_pos
   6817 ZERO8
   6818 :expr_frame_top
   6819 ZERO8
   6820 :emt_after_pos
   6821 ZERO8
   6822 :emt_mark
   6823 ZERO8
   6824 :eval_after_pos
   6825 ZERO8
   6826 :eval_value
   6827 ZERO8
   6828 
   6829 ## Paste-pass spill slots. Both append_pasted_token and paste_pool_range
   6830 ## call other functions, so all locals must round-trip through BSS
   6831 ## across the call.
   6832 ##   paste_dst_save  — dst Token* spilled across append_text
   6833 ##   paste_left_ptr/_len, paste_right_ptr/_len — operand spans for the
   6834 ##                       byte-copy loops in append_pasted_token
   6835 ##   paste_total_len — left.len + right.len, reused after append_text
   6836 ##   paste_start     — expand_pool + mark; needed to detect "## is first"
   6837 ##                     after registers are clobbered by append_pasted_token
   6838 ##   paste_in        — current read cursor (Token*)
   6839 ##   paste_out       — current write cursor (Token*)
   6840 ##   paste_end       — exclusive end (Token*), = expand_pool + pool_used
   6841 :paste_dst_save
   6842 ZERO8
   6843 :paste_left_ptr
   6844 ZERO8
   6845 :paste_left_len
   6846 ZERO8
   6847 :paste_right_ptr
   6848 ZERO8
   6849 :paste_right_len
   6850 ZERO8
   6851 :paste_total_len
   6852 ZERO8
   6853 :paste_start
   6854 ZERO8
   6855 :paste_in
   6856 ZERO8
   6857 :paste_out
   6858 ZERO8
   6859 :paste_end
   6860 ZERO8
   6861 
   6862 ## paste_scratch — 256-byte working buffer for append_pasted_token.
   6863 ## We assemble left.text ++ right.text here, then call
   6864 ## append_text(&paste_scratch, total_len) to copy into the durable
   6865 ## text_buf arena. 256 bytes is M0's quoted-literal cap.
        ## No storage is declared under this note: bss_init_tbl pairs
        ## paste_scratch_ptr with OFF_paste_scratch, so the buffer is presumably
        ## reached through that pointer slot -- confirm where the bytes live.
   6866 
   6867 ## parse_args + expand_macro_tokens + find_param spill slots (P1 has
   6868 ## no callee-save spill on enter, and find_param's inner byte compare
   6869 ## needs every caller-saved register; parse_args + expand_macro_tokens
   6870 ## carry state across iterations and nested calls). One u64 each (ZERO8).
        ## Only pa_* and emt_* labels follow directly.
        ## NOTE(review): the find_param slots are presumably the fp_* labels
        ## declared further down, after df_digit_scratch -- confirm.
   6871 :pa_pos
   6872 ZERO8
   6873 :pa_arg_start
   6874 ZERO8
   6875 :pa_depth
   6876 ZERO8
   6877 :pa_arg_index
   6878 ZERO8
   6879 :pa_limit
   6880 ZERO8
   6881 :pa_brace_depth
   6882 ZERO8
   6883 ## args_have_paste — sticky 0/1 set by parse_args when the call's argument
   6884 ## span contains TOK_PASTE. expand_macro_tokens snapshots it into
   6885 ## emt_saw_arg_paste right after parse_args, then ORs it with
   6886 ## macro->has_paste to decide whether to run paste_pool_range. Lets us
   6887 ## skip the pool sweep when neither body nor args contribute a `##`.
   6888 :args_have_paste
   6889 ZERO8
   6890 :emt_call_tok
   6891 ZERO8
   6892 :emt_limit
   6893 ZERO8
   6894 :emt_macro
   6895 ZERO8
   6896 :emt_saw_arg_paste
   6897 ZERO8
   6898 ## emt_cached_param_idx — body token's param index (0 = not a param) read
   6899 ## once from macro_body_param_idx[] at the top of emt_loop, then reused in
   6900 ## emt_do_substitute_paste / _plain instead of re-running find_param.
   6901 :emt_cached_param_idx
   6902 ZERO8
   6903 :emt_body_pos
   6904 ZERO8
   6905 :emt_body_end
   6906 ZERO8
   6907 :emt_body_start
   6908 ZERO8
   6909 
   6910 ## Local-label rewrite. next_expansion_id is the monotonic counter
   6911 ## (never reset); emt_expansion_id snapshots it at the start of each
   6912 ## expand_macro_tokens call so nested-call BSS reuse is safe.
   6913 ## ll_* slots hold body-token span + derived sizes while building the
   6914 ## renamed text in local_label_scratch.
   6915 :next_expansion_id
   6916 ZERO8
   6917 :emt_expansion_id
   6918 ZERO8
   6919 :ll_src_ptr
   6920 ZERO8
   6921 :ll_src_len
   6922 ZERO8
   6923 :ll_tail_len
   6924 ZERO8
   6925 :ll_digit_count
   6926 ZERO8
   6927 :ll_digit_cursor
   6928 ZERO8
   6929 :ll_total_len
   6930 ZERO8
   6931 
   6932 ## local_label_digits: 24-byte reverse-fill scratch for the decimal
   6933 ## rendering of emt_expansion_id (fits any u64 value).
        ## (Three ZERO8 words; a u64 needs at most 20 decimal digits.)
   6934 :local_label_digits
   6935 ZERO8 ZERO8 ZERO8
   6936 
   6937 ## local_label_scratch: 128-byte working buffer for the renamed text
   6938 ## (sigil + tail + "__" + digits) before it's copied into text_buf via
   6939 ## append_text. Caps the combined tail + digit length at ~125 bytes,
   6940 ## which is ample for any realistic local-label name.
        ## No storage is declared under this note: bss_init_tbl pairs
        ## local_label_scratch_ptr with OFF_local_label_scratch, so the buffer is
        ## presumably reached through that pointer slot -- confirm.
   6941 
   6942 ## %struct / %enum scratch. define_fielded calls append_text twice
   6943 ## per synthesized macro, so every piece of state that must survive a call
   6944 ## lives here rather than in a register.
   6945 ##   df_stride               — 8 for %struct, 1 for %enum
   6946 ##   df_total_name_ptr/_len  — "SIZE" (4) for struct, "COUNT" (5) for enum
   6947 ##   df_base_ptr/_len        — directive's NAME token span
   6948 ##   df_index                — running field index, 0..N
   6949 ##   df_suffix_ptr/_len      — current synthesized field suffix
   6950 ##   df_value                — index * stride for this macro's body
   6951 ##   df_name_len             — base_len + 1 + suffix_len
   6952 ##   df_digit_count/_cursor  — df_render_decimal output
        ## The "SIZE" / "COUNT" names correspond to the const_size / const_count
        ## literals in the rodata section.
   6953 :df_stride
   6954 ZERO8
   6955 :df_total_name_ptr
   6956 ZERO8
   6957 :df_total_name_len
   6958 ZERO8
   6959 :df_base_ptr
   6960 ZERO8
   6961 :df_base_len
   6962 ZERO8
   6963 :df_index
   6964 ZERO8
   6965 :df_suffix_ptr
   6966 ZERO8
   6967 :df_suffix_len
   6968 ZERO8
   6969 :df_value
   6970 ZERO8
   6971 :df_name_len
   6972 ZERO8
   6973 :df_digit_count
   6974 ZERO8
   6975 :df_digit_cursor
   6976 ZERO8
   6977 
   6978 ## df_name_scratch: 256-byte working buffer for "BASE.SUFFIX" before
   6979 ## append_text copies it to text_buf. 256 B matches paste_scratch /
   6980 ## ebc_str_scratch; df_emit_field asserts nothing explicit, but realistic
   6981 ## struct/enum names stay well under 128 chars.
        ## No storage is declared under this note: bss_init_tbl pairs
        ## df_name_scratch_ptr with OFF_df_name_scratch, so the buffer is
        ## presumably reached through that pointer slot -- confirm.
   6982 
   6983 ## df_digit_scratch: 24-byte reverse-fill buffer for the decimal rendering
   6984 ## of df_value (any u64 fits).
   6985 :df_digit_scratch
   6986 ZERO8 ZERO8 ZERO8
   6987 
   6988 :fp_macro                          # NOTE(review): fp_* are presumably the find_param spill slots named in the note above pa_pos -- confirm
   6989 ZERO8
   6990 :fp_tok
   6991 ZERO8
   6992 :fp_pcount
   6993 ZERO8
   6994 :fp_idx
   6995 ZERO8
   6996 
   6997 ## Expression-evaluator scratch slots. expr_op_code spills its tok
   6998 ## argument to eoc_tok across tok_eq_const calls. apply_expr_op spills
   6999 ## op/args/argc and uses acc/i as the accumulator and loop induction var
   7000 ## inside the variadic folds.
        ## Six 8-byte slots: eoc_tok for expr_op_code, the five aeo_* slots for
        ## apply_expr_op.
   7001 :eoc_tok
   7002 ZERO8
   7003 :aeo_op
   7004 ZERO8
   7005 :aeo_args
   7006 ZERO8
   7007 :aeo_argc
   7008 ZERO8
   7009 :aeo_acc
   7010 ZERO8
   7011 :aeo_i
   7012 ZERO8
   7013 
   7014 ## Builtin scratch.
   7015 ## emit_hex_value: ehv_value/bytes hold the args; ehv_scratch is a 24-byte
   7016 ## buffer (max 16 chars used: 16 hex chars for an 8-byte $-emit; rounded
   7017 ## up to keep the next slot 8-byte aligned); ehv_token is a synthesized
   7018 ## 32-byte Token { kind, text_ptr, text_len, tight }.
        ## As laid out: ehv_value and ehv_bytes are one ZERO8 each, ehv_scratch
        ## is 3 × ZERO8 and ehv_token is 4 × ZERO8 (one word per listed field).
        ## NOTE(review): 16 bytes is already a multiple of 8, so the third ZERO8
        ## in ehv_scratch looks like spare room rather than alignment padding —
        ## confirm before shrinking it.
        ## NOTE(review): the macro_body_tokens / expand_pool size comments in
        ## this file count 24 bytes per token slot, while this Token is 32 bytes
        ## — confirm which figure is current.
   7019 :ehv_value
   7020 ZERO8
   7021 :ehv_bytes
   7022 ZERO8
   7023 :ehv_scratch
   7024 ZERO8 ZERO8 ZERO8
   7025 :ehv_token
   7026 ZERO8 ZERO8 ZERO8 ZERO8
   7027 
   7028 ## expand_builtin_call: snapshots the stream pointer, the post-call resume
   7029 ## position, the byte count for !@%$, the eval_expr_range result, the chosen
   7030 ## arg span (start/end), the unchosen-side spans for %select, and the
   7031 ## pool mark used to push the chosen-stream slice.
        ## Eleven ZERO8 slots, declared in the order the sentence above lists
        ## them: ebc_stream, ebc_call_end_pos, ebc_bytes, ebc_value, the
        ## ebc_arg0_* pair, the ebc_then_* and ebc_else_* pairs, then ebc_mark.
        ## NOTE(review): presumably ebc_then_* / ebc_else_* are the two %select
        ## branch spans and ebc_arg0_* is the first argument's span — confirm in
        ## expand_builtin_call.
   7032 :ebc_stream
   7033 ZERO8
   7034 :ebc_call_end_pos
   7035 ZERO8
   7036 :ebc_bytes
   7037 ZERO8
   7038 :ebc_value
   7039 ZERO8
   7040 :ebc_arg0_start
   7041 ZERO8
   7042 :ebc_arg0_end
   7043 ZERO8
   7044 :ebc_then_start
   7045 ZERO8
   7046 :ebc_then_end
   7047 ZERO8
   7048 :ebc_else_start
   7049 ZERO8
   7050 :ebc_else_end
   7051 ZERO8
   7052 :ebc_mark
   7053 ZERO8
   7054 
   7055 ## %str builtin scratch. ebc_str_orig_len / ebc_str_out_len spill the
   7056 ## argument text length and its +2 output length across append_text;
   7057 ## ebc_str_token is the synthesized TOK_STRING { kind, text_ptr, text_len,
   7058 ## tight } handed to emit_token; ebc_str_scratch is a 256-byte assembly
   7059 ## buffer (matches paste_scratch / M0's quoted-literal cap).
        ## ebc_str_token is 4 × ZERO8, one word per listed field.
        ## ebc_str_scratch has no storage in this block; the ebc_str_scratch_ptr
        ## slot later in the file presumably holds its address — confirm.
        ## NOTE(review): the "+2" presumably accounts for the two surrounding
        ## quote characters — confirm in the %str path.
   7060 :ebc_str_orig_len
   7061 ZERO8
   7062 :ebc_str_out_len
   7063 ZERO8
   7064 :ebc_str_token
   7065 ZERO8 ZERO8 ZERO8 ZERO8
   7066 
   7067 ## %bytes builtin scratch. ebc_b_src_ptr/_len/_i walk the input string
   7068 ## across emit_hex_value calls (which clobber every caller-saved reg).
   7069 ## ebc_b_hex_hi spills the high nibble across the second hex_digit_table
   7070 ## lookup for the low nibble. Each source byte emits independently via
   7071 ## emit_hex_value(byte, 1); hex2pp's parse_byte_stream coalesces the
   7072 ## resulting space-separated runs back into a contiguous byte stream.
        ## Four ZERO8 slots: source pointer, source length, current index, and
        ## the spilled high nibble.
   7073 :ebc_b_src_ptr
   7074 ZERO8
   7075 :ebc_b_src_len
   7076 ZERO8
   7077 :ebc_b_src_i
   7078 ZERO8
   7079 :ebc_b_hex_hi
   7080 ZERO8
   7081 
   7082 ## arg_starts[16] / arg_ends[16]: 16 × 8 = 128 bytes each, i.e. 4 ZERO32.
   7083 ## Written by parse_args; read by expand_macro_tokens and expand_builtin_call.
   7084 
   7085 ## input_buf: 8 KB (M1PP_INPUT_CAP)
   7086 
   7087 ## output_buf: 8 KB (M1PP_OUTPUT_CAP)
   7088 
   7089 ## text_buf: 1 MB (M1PP_TEXT_CAP)
   7090 
   7091 ## source_tokens (M1PP_TOKENS_END)
   7092 
   7093 ## macros: 32 records × 296 bytes = 9472 bytes (M1PP_MACROS_CAP).
   7094 ## 37 lines × 256 bytes = 9472. Each line is 8 × ZERO32 = 256 bytes.
   7095 
   7096 ## macro_body_tokens: 256 slots × 24 bytes = 6 KB (M1PP_MACRO_BODY_CAP).
   7097 ## 24 lines × 256 bytes = 6144. Source tokens are copied in 24 bytes at a
   7098 ## time as macro bodies are recorded.
   7099 
   7100 ## streams: 16 Stream records × 40 bytes = 640 bytes (M1PP_STREAM_STACK_CAP).
   7101 ## 20 ZERO32 = 2 lines of 8 + 1 line of 4.
   7102 
   7103 ## expand_pool: 256 Token slots × 24 bytes = 6144 bytes (M1PP_EXPAND_CAP).
   7104 ## 24 lines × 8 ZERO32 = 192 ZERO32.
   7105 
   7106 ## expr_frames: 16 × 144 bytes = 2304 bytes (M1PP_EXPR_FRAMES_CAP).
   7107 ## 9 lines × 8 ZERO32 = 72 ZERO32.
   7108 
   7109 ## --- BSS pointer slots (set by p1_main; one per BSS buffer) -----------------
        ## Eighteen ZERO8 slots, each named <buffer>_ptr after the buffer it
        ## refers to. They are zero in the image; per the header above, p1_main
        ## fills them in.
        ## NOTE(review): presumably each slot holds the runtime address of its
        ## buffer and must be written before any routine touches that buffer —
        ## confirm against p1_main.
   7110 :paste_scratch_ptr
   7111 ZERO8
   7112 :local_label_scratch_ptr
   7113 ZERO8
   7114 :df_name_scratch_ptr
   7115 ZERO8
   7116 :ebc_str_scratch_ptr
   7117 ZERO8
   7118 :arg_starts_ptr
   7119 ZERO8
   7120 :arg_ends_ptr
   7121 ZERO8
   7122 :input_buf_ptr
   7123 ZERO8
   7124 :output_buf_ptr
   7125 ZERO8
   7126 :text_buf_ptr
   7127 ZERO8
   7128 :source_tokens_ptr
   7129 ZERO8
   7130 :macros_ptr
   7131 ZERO8
   7132 :macro_body_tokens_ptr
   7133 ZERO8
   7134 :streams_ptr
   7135 ZERO8
   7136 :expand_pool_ptr
   7137 ZERO8
   7138 :expr_frames_ptr
   7139 ZERO8
   7140 :local_lookup_scratch_ptr
   7141 ZERO8
   7142 :macro_body_param_idx_ptr
   7143 ZERO8
   7144 :macro_body_is_local_label_ptr
   7145 ZERO8
   7146 
   7147 :ELF_end
        ## ELF_end: last label visible in this listing, placed after every data
        ## slot above.
        ## NOTE(review): presumably the end-of-image marker that the ELF header
        ## uses to compute the file size — confirm against the header definition.