boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit fba0661c4e9928254ae9b1a9c2eb4d243a3af53e
parent f3aa6d050718c9abf3080ed7d08c6a301b121229
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun,  3 May 2026 16:37:10 -0700

M1pp: bring M1pp.P1 to parity with M1pp.c; build M1pp from .P1

- emit_token tail-calls a new emit_string_as_bytes for TOK_STRING, so
  bare "..."/'...' tokens decode to raw hex bytes at emit time
  (matching the new contract; the old %bytes builtin is dropped).
- lex_word_scan no longer terminates on `"`/`'`; quote chars only
  start a STRING token at token start, mirroring M1pp.c's lexer.
- Drop %bytes from proc_check_builtin / expand_builtin_call dispatch
  and remove the const_bytes rodata; refactor the old ebc_bytes_handler
  into the callable emit_string_as_bytes subroutine.
- Makefile: build build/<arch>/M1pp/M1pp from M1pp/M1pp.P1 via
  boot-build-p1.sh (seed M0 + hex2-0 chain), not from M1pp.c via
  alpine-gcc. hex2pp still builds from .c until hex2pp.P1 catches up.

make test SUITE=m1pp ARCH=aarch64 -> 30/30 pass.

Diffstat:
M M1pp/M1pp.P1 | 141 ++++++++++++++++++++++---------------------------------------------------------
M Makefile | 13 ++++++-------
M P1/P1-aarch64.M1 | 3 +++
M P1/P1-amd64.M1 | 3 +++
M P1/P1-riscv64.M1 | 3 +++
5 files changed, 54 insertions(+), 109 deletions(-)

diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1 @@ -13,7 +13,7 @@ ## stream and walks it token-by-token, dispatching to ## define_macro at line-start %macro, emit_newline / ## emit_token for pass-through, expand_builtin_call for -## !@%$, %select, %str, %bytes, %local, and expand_call +## !@%$, %select, %str, %local, and expand_call ## for user macros. Macro expansions and %select push ## fresh streams onto streams[]; popping rewinds the ## expansion pool. @@ -796,13 +796,22 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 la_a1 &lex_start st_t0,a1,0 :lex_word_scan - # c = *lex_ptr; terminate the word if lex_char_class[c] != WORD (0). + # c = *lex_ptr; terminate the word if lex_char_class[c] is non-WORD, + # but treat class 3 (string-quote `"`/`'`) as part of the word too — + # quotes only start a STRING token at token start, not mid-word. + # That matches M1pp.c, where the WORD scanner ignores `"`/`'` and + # so `\`"hi"\`` (backtick-quote-...-quote-backtick with no spaces) + # lexes as a single WORD. lb_a2,t0,0 la_a1 &lex_char_class add_a1,a1,a2 lb_a2,a1,0 + la_br &lex_word_continue + beqz_a2 + li_a1 %3 %0 la_br &lex_word_finish - bnez_a2 + bne_a2,a1 +:lex_word_continue addi_t0,t0,1 la_br &lex_word_scan b @@ -873,7 +882,9 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 st_a1,a0,0 ret -## emit_token(a0=token_ptr). Leaf. +## emit_token(a0=token_ptr). Tail-calls emit_string_as_bytes for +## TOK_STRING (which has its own enter_0/eret frame), so emit_token +## itself stays leaf for the WORD path. :emit_token # brace tokens are no-ops at emit time (belt-and-braces with arg-strip) ld_t0,a0,0 @@ -883,6 +894,11 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 li_t1 TOK_RBRACE la_br &emit_token_skip beq_t0,t1 + # Bare TOK_STRING decodes to raw bytes via emit_string_as_bytes. + # Branch (not call): the tail call returns to emit_token's caller. 
+ li_t1 TOK_STRING + la_br &emit_string_as_bytes + beq_t0,t1 # if (output_need_space) emit ' ' (skip the space for the first token on a line) la_a1 &output_need_space @@ -1152,7 +1168,7 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 la_br &proc_check_macro beqz_a1 - # try the eight builtin names: ! @ % $ %select %str %bytes %local + # try the seven builtin names: ! @ % $ %select %str %local mov_a0,t0 la_a1 &const_bang li_a2 %1 %0 @@ -1202,13 +1218,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 call la_br &proc_do_builtin bnez_a0 - ld_a0,sp,8 - la_a1 &const_bytes - li_a2 %6 %0 - la_br &tok_eq_const - call - la_br &proc_do_builtin - bnez_a0 la_br &proc_check_macro b @@ -5237,7 +5246,7 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 eret ## ============================================================================ -## --- Builtin dispatcher ( ! @ % $ %select %str %bytes %local ) ------------- +## --- Builtin dispatcher ( ! @ % $ %select %str %local ) -------------------- ## ============================================================================ ## expand_builtin_call(a0=stream_ptr, a1=builtin_tok) -> void (fatal on bad) @@ -5602,17 +5611,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 la_br &ebc_local bnez_a0 - # if tok_eq_const(tok, "%bytes", 6) -> bytes path - la_a0 &ebc_stream - ld_a0,a0,0 - ld_a0,a0,16 - la_a1 &const_bytes - li_a2 %6 %0 - la_br &tok_eq_const - call - la_br &ebc_bytes_handler - bnez_a0 - # else: fatal la_br &err_bad_macro_header b @@ -5935,82 +5933,30 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 eret -## %bytes("STR"): emit the raw bytes of a "..."-quoted string as one -## contiguous run of hex bytes. Recognised escapes inside the string: -## \n -> 0x0A \t -> 0x09 \r -> 0x0D \0 -> 0x00 -## \\ -> 0x5C \" -> 0x22 \xNN -> byte NN (two hex digits) -## Any other backslash escape is fatal. No NUL terminator is appended; -## the caller writes one explicitly (e.g. "00") if needed. An empty -## string produces no output. 
-## -## Implementation strategy: each byte is emitted independently via -## emit_hex_value(byte, 1). hex2pp's parse_byte_stream accumulates -## adjacent hex digits across whitespace, so "68 69 0A" reads as the -## same three bytes as "68690A" — there is no need to coalesce them -## into a single output WORD here. -## -## Validation: arg_count == 1, arg span is exactly one token, kind is -## TOK_STRING, len >= 2, ptr[0] == '"'. For \xNN, the next two source -## bytes must both be valid hex digits (0-9, a-f, A-F). -:ebc_bytes_handler - # No enter_0: ebc_bytes_handler is jumped to as a continuation of - # expand_builtin_call's frame (matching the convention used by - # :ebc_str, :ebc_select, etc.). The terminating `eret` at - # :ebc_b_done unwinds expand_builtin_call's frame. - - # require arg_count == 1 - la_a0 &arg_count - ld_t0,a0,0 - li_t1 %1 %0 - la_br &err_bad_macro_header - bne_t0,t1 - - # require arg span is exactly one token (32 bytes) - la_a0 &arg_starts_ptr - ld_a0,a0,0 - ld_t0,a0,0 - la_a1 &arg_ends_ptr - ld_a1,a1,0 - ld_t1,a1,0 - sub_t2,t1,t0 - li_a2 %32 %0 - la_br &err_bad_macro_header - bne_t2,a2 - - # require arg_tok->kind == TOK_STRING - ld_a3,t0,0 - li_a2 TOK_STRING - la_br &err_bad_macro_header - bne_a3,a2 +## emit_string_as_bytes(a0=tok_ptr) -> void (fatal on bad escape). +## Decode the contents of a TOK_STRING (between the surrounding quotes) +## and emit each byte as one TOK_WORD via emit_hex_value(byte, 1). The +## lexer accepts both "..." and '...'; this routine just strips the +## first/last byte of text and decodes the middle. Recognised escapes: +## \n \t \r \0 \\ \" and \xNN (two hex digits, case-insensitive). +## hex2pp's parse_byte_stream coalesces the resulting space-separated +## hex bytes back into a contiguous byte sequence at link time. 
+:emit_string_as_bytes + enter_0 - # require arg_tok->text.len >= 2 - ld_a1,t0,16 + # require tok->text.len >= 2 + ld_a1,a0,16 li_a2 %2 %0 - la_br &err_bad_macro_header + la_br &err_bad_escape blt_a1,a2 - # require arg_tok->text.ptr[0] == '"'. - # Save text_ptr to ebc_b_src_ptr (will += 1 below) and text_len to - # ebc_b_src_len (-= 2 below). Reading the first byte uses lb_a3,a3,0 - # which clobbers a3, so do the save first. - ld_a3,t0,8 - la_a0 &ebc_b_src_ptr - st_a3,a0,0 - la_a0 &ebc_b_src_len - st_a1,a0,0 - lb_a3,a3,0 - li_a2 %34 %0 - la_br &err_bad_macro_header - bne_a3,a2 - - # src_ptr += 1; src_len -= 2 (strip surrounding quotes) - la_a0 &ebc_b_src_ptr - ld_a3,a0,0 + # src = tok->text.ptr + 1; src_len = tok->text.len - 2 + ld_a3,a0,8 addi_a3,a3,1 + la_a0 &ebc_b_src_ptr st_a3,a0,0 - la_a0 &ebc_b_src_len - ld_a1,a0,0 addi_a1,a1,neg2 + la_a0 &ebc_b_src_len st_a1,a0,0 # ebc_b_src_i = 0 @@ -6190,14 +6136,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 b :ebc_b_done - # stream->pos = ebc_call_end_pos; stream->line_start = 0 - la_a0 &ebc_stream - ld_a0,a0,0 - la_a1 &ebc_call_end_pos - ld_t0,a1,0 - st_t0,a0,16 - li_t1 %0 %0 - st_t1,a0,24 eret ## %local(NAME): emit-time variant. expand_builtin_call has already @@ -6416,7 +6354,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 :const_frame "%frame" :const_endframe "%endframe" :const_local "%local" -:const_bytes "%bytes" ## Suffix appended to the frame name when looking up <frame>_FRAME.<field>. :const_frame_suffix "_FRAME." diff --git a/Makefile b/Makefile @@ -210,10 +210,10 @@ P1PP_BUILD_DEPS = scripts/boot-build-p1pp.sh \ vendor/seed/%/ELF.hex2 \ P1/P1-%.M1pp P1/P1.M1pp P1/P1pp.P1pp -# Until M1pp.P1 / hex2pp.P1 catch up to the new string-emission contract -# (bare "..." emits decoded bytes; %bytes is gone), the test pipeline -# uses the M1pp.c / hex2pp.c reference implementations compiled inside -# a per-arch alpine-gcc container. 
+# M1pp is built from its self-hosted .P1 source via the seed M0+hex2-0 +# chain (boot-build-p1.sh). hex2pp.P1 hasn't yet caught up to the new +# string-emission contract (bare "..." emits decoded bytes; %bytes is +# gone), so hex2pp is still compiled from hex2pp.c via alpine-gcc. ALPINE_GCC_IMAGES := $(foreach a,$(ALL_ARCHES),build/$(a)/.image-alpine-gcc) $(ALPINE_GCC_IMAGES): build/%/.image-alpine-gcc: scripts/Containerfile.alpine-gcc @@ -228,9 +228,8 @@ ALPINE_GCC = podman run --rm --pull=never --platform $(PLATFORM_$(1)) \ -e ARCH=$(1) \ -v $(CURDIR):/work -w /work boot2-alpine-gcc:$(1) -$(M1PP_BINS): build/%/M1pp/M1pp: M1pp/M1pp.c build/%/.image-alpine-gcc - mkdir -p $(@D) - $(call ALPINE_GCC,$*) cc -O2 -std=c99 -static M1pp/M1pp.c -o $@ +$(M1PP_BINS): build/%/M1pp/M1pp: M1pp/M1pp.P1 $(P1_BUILD_DEPS) + $(call PODMAN,$*) sh scripts/boot-build-p1.sh M1pp/M1pp.P1 $@ $(HEX2PP_BINS): build/%/hex2pp/hex2pp: hex2pp/hex2pp.c build/%/.image-alpine-gcc mkdir -p $(@D) diff --git a/P1/P1-aarch64.M1 b/P1/P1-aarch64.M1 @@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 E00F40F9 DEFINE ld_a0,sp,24 E01740F9 DEFINE ld_a1,a0,0 010040F9 DEFINE ld_a1,a0,8 010440F9 +DEFINE ld_a1,a0,16 010840F9 DEFINE ld_a1,a1,0 210040F9 DEFINE ld_a1,a2,8 410440F9 DEFINE ld_a1,a3,8 610440F9 @@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 420140F9 DEFINE ld_a2,t2,0 620140F9 DEFINE ld_a2,sp,16 E21340F9 DEFINE ld_a3,a0,0 030040F9 +DEFINE ld_a3,a0,8 030440F9 DEFINE ld_a3,a0,16 030840F9 DEFINE ld_a3,a1,0 230040F9 DEFINE ld_a3,a1,8 230440F9 @@ -452,6 +454,7 @@ DEFINE beq_t2,t1 7F010AEB4100005420021FD6 DEFINE bne_a0,t0 1F0009EB4000005420021FD6 DEFINE bne_a1,a2 3F0002EB4000005420021FD6 DEFINE bne_a1,t0 3F0009EB4000005420021FD6 +DEFINE bne_a2,a1 5F0001EB4000005420021FD6 DEFINE bne_a2,a3 5F0003EB4000005420021FD6 DEFINE bne_a3,a0 7F0000EB4000005420021FD6 DEFINE bne_a3,a1 7F0001EB4000005420021FD6 diff --git a/P1/P1-amd64.M1 b/P1/P1-amd64.M1 @@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 488B7C2418 DEFINE ld_a0,sp,24 488B7C2428 DEFINE 
ld_a1,a0,0 488B7700 DEFINE ld_a1,a0,8 488B7708 +DEFINE ld_a1,a0,16 488B7710 DEFINE ld_a1,a1,0 488B7600 DEFINE ld_a1,a2,8 488B7208 DEFINE ld_a1,a3,8 488B7108 @@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 498B5300 DEFINE ld_a2,t2,0 498B5000 DEFINE ld_a2,sp,16 488B542420 DEFINE ld_a3,a0,0 488B4F00 +DEFINE ld_a3,a0,8 488B4F08 DEFINE ld_a3,a0,16 488B4F10 DEFINE ld_a3,a1,0 488B4E00 DEFINE ld_a3,a1,8 488B4E08 @@ -452,6 +454,7 @@ DEFINE beq_t2,t1 4D39D8750341FFE7 DEFINE bne_a0,t0 4C39D7740341FFE7 DEFINE bne_a1,a2 4839D6740341FFE7 DEFINE bne_a1,t0 4C39D6740341FFE7 +DEFINE bne_a2,a1 4839F2740341FFE7 DEFINE bne_a2,a3 4839CA740341FFE7 DEFINE bne_a3,a0 4839F9740341FFE7 DEFINE bne_a3,a1 4839F1740341FFE7 diff --git a/P1/P1-riscv64.M1 b/P1/P1-riscv64.M1 @@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 03358101 DEFINE ld_a0,sp,24 03358102 DEFINE ld_a1,a0,0 83350500 DEFINE ld_a1,a0,8 83358500 +DEFINE ld_a1,a0,16 83350501 DEFINE ld_a1,a1,0 83B50500 DEFINE ld_a1,a2,8 83358600 DEFINE ld_a1,a3,8 83B58600 @@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 03360300 DEFINE ld_a2,t2,0 03B60300 DEFINE ld_a2,sp,16 03360102 DEFINE ld_a3,a0,0 83360500 +DEFINE ld_a3,a0,8 83368500 DEFINE ld_a3,a0,16 83360501 DEFINE ld_a3,a1,0 83B60500 DEFINE ld_a3,a1,8 83B68500 @@ -452,6 +454,7 @@ DEFINE beq_t2,t1 6394630067800F00 DEFINE bne_a0,t0 6304550067800F00 DEFINE bne_a1,a2 6384C50067800F00 DEFINE bne_a1,t0 6384550067800F00 +DEFINE bne_a2,a1 6304B60067800F00 DEFINE bne_a2,a3 6304D60067800F00 DEFINE bne_a3,a0 6384A60067800F00 DEFINE bne_a3,a1 6384B60067800F00