commit fba0661c4e9928254ae9b1a9c2eb4d243a3af53e
parent f3aa6d050718c9abf3080ed7d08c6a301b121229
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 3 May 2026 16:37:10 -0700
M1pp: bring M1pp.P1 to parity with M1pp.c; build M1pp from .P1
- emit_token tail-calls a new emit_string_as_bytes for TOK_STRING, so
bare "..."/'...' tokens decode to raw hex bytes at emit time
(matching the new contract; the old %bytes builtin is dropped).
- lex_word_scan no longer terminates on `"`/`'`; quote chars only
start a STRING token at token start, mirroring M1pp.c's lexer.
- Drop %bytes from proc_check_builtin / expand_builtin_call dispatch
and remove the const_bytes rodata; refactor the old ebc_bytes_handler
into the callable emit_string_as_bytes subroutine.
- Makefile: build build/<arch>/M1pp/M1pp from M1pp/M1pp.P1 via
boot-build-p1.sh (seed M0 + hex2-0 chain), not from M1pp.c via
alpine-gcc. hex2pp still builds from .c until hex2pp.P1 catches up.
make test SUITE=m1pp ARCH=aarch64 -> 30/30 pass.
Diffstat:
5 files changed, 54 insertions(+), 109 deletions(-)
diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1
@@ -13,7 +13,7 @@
## stream and walks it token-by-token, dispatching to
## define_macro at line-start %macro, emit_newline /
## emit_token for pass-through, expand_builtin_call for
-## !@%$, %select, %str, %bytes, %local, and expand_call
+## !@%$, %select, %str, %local, and expand_call
## for user macros. Macro expansions and %select push
## fresh streams onto streams[]; popping rewinds the
## expansion pool.
@@ -796,13 +796,22 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
la_a1 &lex_start
st_t0,a1,0
:lex_word_scan
- # c = *lex_ptr; terminate the word if lex_char_class[c] != WORD (0).
+ # c = *lex_ptr. The word continues while lex_char_class[c] is WORD (0)
+ # or class 3 (string-quote `"`/`'`); any other class ends the word.
+ # Quotes only start a STRING token at token start, not mid-word.
+ # That matches M1pp.c, whose WORD scanner ignores `"`/`'`: a backtick,
+ # "hi", and a closing backtick written with no spaces between them
+ # lex as a single WORD.
lb_a2,t0,0
la_a1 &lex_char_class
add_a1,a1,a2
lb_a2,a1,0
+ la_br &lex_word_continue
+ beqz_a2
+ li_a1 %3 %0
la_br &lex_word_finish
- bnez_a2
+ bne_a2,a1
+:lex_word_continue
addi_t0,t0,1
la_br &lex_word_scan
b
@@ -873,7 +882,9 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
st_a1,a0,0
ret
-## emit_token(a0=token_ptr). Leaf.
+## emit_token(a0=token_ptr). Tail-calls emit_string_as_bytes for
+## TOK_STRING (which has its own enter_0/eret frame), so emit_token
+## itself stays leaf for the WORD path.
:emit_token
# brace tokens are no-ops at emit time (belt-and-braces with arg-strip)
ld_t0,a0,0
@@ -883,6 +894,11 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
li_t1 TOK_RBRACE
la_br &emit_token_skip
beq_t0,t1
+ # Bare TOK_STRING decodes to raw bytes via emit_string_as_bytes.
+ # Branch (not call): the tail call returns to emit_token's caller.
+ li_t1 TOK_STRING
+ la_br &emit_string_as_bytes
+ beq_t0,t1
# if (output_need_space) emit ' ' (skip the space for the first token on a line)
la_a1 &output_need_space
@@ -1152,7 +1168,7 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
la_br &proc_check_macro
beqz_a1
- # try the eight builtin names: ! @ % $ %select %str %bytes %local
+ # try the seven builtin names: ! @ % $ %select %str %local
mov_a0,t0
la_a1 &const_bang
li_a2 %1 %0
@@ -1202,13 +1218,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
call
la_br &proc_do_builtin
bnez_a0
- ld_a0,sp,8
- la_a1 &const_bytes
- li_a2 %6 %0
- la_br &tok_eq_const
- call
- la_br &proc_do_builtin
- bnez_a0
la_br &proc_check_macro
b
@@ -5237,7 +5246,7 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
eret
## ============================================================================
-## --- Builtin dispatcher ( ! @ % $ %select %str %bytes %local ) -------------
+## --- Builtin dispatcher ( ! @ % $ %select %str %local ) --------------------
## ============================================================================
## expand_builtin_call(a0=stream_ptr, a1=builtin_tok) -> void (fatal on bad)
@@ -5602,17 +5611,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
la_br &ebc_local
bnez_a0
- # if tok_eq_const(tok, "%bytes", 6) -> bytes path
- la_a0 &ebc_stream
- ld_a0,a0,0
- ld_a0,a0,16
- la_a1 &const_bytes
- li_a2 %6 %0
- la_br &tok_eq_const
- call
- la_br &ebc_bytes_handler
- bnez_a0
-
# else: fatal
la_br &err_bad_macro_header
b
@@ -5935,82 +5933,30 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
eret
-## %bytes("STR"): emit the raw bytes of a "..."-quoted string as one
-## contiguous run of hex bytes. Recognised escapes inside the string:
-## \n -> 0x0A \t -> 0x09 \r -> 0x0D \0 -> 0x00
-## \\ -> 0x5C \" -> 0x22 \xNN -> byte NN (two hex digits)
-## Any other backslash escape is fatal. No NUL terminator is appended;
-## the caller writes one explicitly (e.g. "00") if needed. An empty
-## string produces no output.
-##
-## Implementation strategy: each byte is emitted independently via
-## emit_hex_value(byte, 1). hex2pp's parse_byte_stream accumulates
-## adjacent hex digits across whitespace, so "68 69 0A" reads as the
-## same three bytes as "68690A" — there is no need to coalesce them
-## into a single output WORD here.
-##
-## Validation: arg_count == 1, arg span is exactly one token, kind is
-## TOK_STRING, len >= 2, ptr[0] == '"'. For \xNN, the next two source
-## bytes must both be valid hex digits (0-9, a-f, A-F).
-:ebc_bytes_handler
- # No enter_0: ebc_bytes_handler is jumped to as a continuation of
- # expand_builtin_call's frame (matching the convention used by
- # :ebc_str, :ebc_select, etc.). The terminating `eret` at
- # :ebc_b_done unwinds expand_builtin_call's frame.
-
- # require arg_count == 1
- la_a0 &arg_count
- ld_t0,a0,0
- li_t1 %1 %0
- la_br &err_bad_macro_header
- bne_t0,t1
-
- # require arg span is exactly one token (32 bytes)
- la_a0 &arg_starts_ptr
- ld_a0,a0,0
- ld_t0,a0,0
- la_a1 &arg_ends_ptr
- ld_a1,a1,0
- ld_t1,a1,0
- sub_t2,t1,t0
- li_a2 %32 %0
- la_br &err_bad_macro_header
- bne_t2,a2
-
- # require arg_tok->kind == TOK_STRING
- ld_a3,t0,0
- li_a2 TOK_STRING
- la_br &err_bad_macro_header
- bne_a3,a2
+## emit_string_as_bytes(a0=tok_ptr) -> void. Fatal (err_bad_escape) on a
+## bad escape, or when the token text is shorter than its two quote bytes.
+## Decode the contents of a TOK_STRING (between the surrounding quotes)
+## and emit each byte as one TOK_WORD via emit_hex_value(byte, 1). The
+## lexer accepts both "..." and '...'; this routine does not inspect the
+## quote characters, it just strips the first/last byte of text and decodes
+## the middle. Recognised escapes: \n \t \r \0 \\ \" and \xNN (two hex
+## digits, case-insensitive). hex2pp's parse_byte_stream coalesces the resulting space-separated hex bytes back into a contiguous byte sequence at link time.
+:emit_string_as_bytes
+ enter_0
- # require arg_tok->text.len >= 2
- ld_a1,t0,16
+ # require tok->text.len >= 2
+ ld_a1,a0,16
li_a2 %2 %0
- la_br &err_bad_macro_header
+ la_br &err_bad_escape
blt_a1,a2
- # require arg_tok->text.ptr[0] == '"'.
- # Save text_ptr to ebc_b_src_ptr (will += 1 below) and text_len to
- # ebc_b_src_len (-= 2 below). Reading the first byte uses lb_a3,a3,0
- # which clobbers a3, so do the save first.
- ld_a3,t0,8
- la_a0 &ebc_b_src_ptr
- st_a3,a0,0
- la_a0 &ebc_b_src_len
- st_a1,a0,0
- lb_a3,a3,0
- li_a2 %34 %0
- la_br &err_bad_macro_header
- bne_a3,a2
-
- # src_ptr += 1; src_len -= 2 (strip surrounding quotes)
- la_a0 &ebc_b_src_ptr
- ld_a3,a0,0
+ # src = tok->text.ptr + 1; src_len = tok->text.len - 2
+ ld_a3,a0,8
addi_a3,a3,1
+ la_a0 &ebc_b_src_ptr
st_a3,a0,0
- la_a0 &ebc_b_src_len
- ld_a1,a0,0
addi_a1,a1,neg2
+ la_a0 &ebc_b_src_len
st_a1,a0,0
# ebc_b_src_i = 0
@@ -6190,14 +6136,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
b
:ebc_b_done
- # stream->pos = ebc_call_end_pos; stream->line_start = 0
- la_a0 &ebc_stream
- ld_a0,a0,0
- la_a1 &ebc_call_end_pos
- ld_t0,a1,0
- st_t0,a0,16
- li_t1 %0 %0
- st_t1,a0,24
eret
## %local(NAME): emit-time variant. expand_builtin_call has already
@@ -6416,7 +6354,6 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
:const_frame "%frame"
:const_endframe "%endframe"
:const_local "%local"
-:const_bytes "%bytes"
## Suffix appended to the frame name when looking up <frame>_FRAME.<field>.
:const_frame_suffix "_FRAME."
diff --git a/Makefile b/Makefile
@@ -210,10 +210,10 @@ P1PP_BUILD_DEPS = scripts/boot-build-p1pp.sh \
vendor/seed/%/ELF.hex2 \
P1/P1-%.M1pp P1/P1.M1pp P1/P1pp.P1pp
-# Until M1pp.P1 / hex2pp.P1 catch up to the new string-emission contract
-# (bare "..." emits decoded bytes; %bytes is gone), the test pipeline
-# uses the M1pp.c / hex2pp.c reference implementations compiled inside
-# a per-arch alpine-gcc container.
+# M1pp is built from its self-hosted .P1 source via the seed M0+hex2-0
+# chain (boot-build-p1.sh). hex2pp.P1 hasn't yet caught up to the new
+# string-emission contract (bare "..." emits decoded bytes; %bytes is
+# gone), so hex2pp is still compiled from hex2pp.c via alpine-gcc.
ALPINE_GCC_IMAGES := $(foreach a,$(ALL_ARCHES),build/$(a)/.image-alpine-gcc)
$(ALPINE_GCC_IMAGES): build/%/.image-alpine-gcc: scripts/Containerfile.alpine-gcc
@@ -228,9 +228,8 @@ ALPINE_GCC = podman run --rm --pull=never --platform $(PLATFORM_$(1)) \
-e ARCH=$(1) \
-v $(CURDIR):/work -w /work boot2-alpine-gcc:$(1)
-$(M1PP_BINS): build/%/M1pp/M1pp: M1pp/M1pp.c build/%/.image-alpine-gcc
- mkdir -p $(@D)
- $(call ALPINE_GCC,$*) cc -O2 -std=c99 -static M1pp/M1pp.c -o $@
+$(M1PP_BINS): build/%/M1pp/M1pp: M1pp/M1pp.P1 $(P1_BUILD_DEPS)
+ $(call PODMAN,$*) sh scripts/boot-build-p1.sh M1pp/M1pp.P1 $@
$(HEX2PP_BINS): build/%/hex2pp/hex2pp: hex2pp/hex2pp.c build/%/.image-alpine-gcc
mkdir -p $(@D)
diff --git a/P1/P1-aarch64.M1 b/P1/P1-aarch64.M1
@@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 E00F40F9
DEFINE ld_a0,sp,24 E01740F9
DEFINE ld_a1,a0,0 010040F9
DEFINE ld_a1,a0,8 010440F9
+DEFINE ld_a1,a0,16 010840F9
DEFINE ld_a1,a1,0 210040F9
DEFINE ld_a1,a2,8 410440F9
DEFINE ld_a1,a3,8 610440F9
@@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 420140F9
DEFINE ld_a2,t2,0 620140F9
DEFINE ld_a2,sp,16 E21340F9
DEFINE ld_a3,a0,0 030040F9
+DEFINE ld_a3,a0,8 030440F9
DEFINE ld_a3,a0,16 030840F9
DEFINE ld_a3,a1,0 230040F9
DEFINE ld_a3,a1,8 230440F9
@@ -452,6 +454,7 @@ DEFINE beq_t2,t1 7F010AEB4100005420021FD6
DEFINE bne_a0,t0 1F0009EB4000005420021FD6
DEFINE bne_a1,a2 3F0002EB4000005420021FD6
DEFINE bne_a1,t0 3F0009EB4000005420021FD6
+DEFINE bne_a2,a1 5F0001EB4000005420021FD6
DEFINE bne_a2,a3 5F0003EB4000005420021FD6
DEFINE bne_a3,a0 7F0000EB4000005420021FD6
DEFINE bne_a3,a1 7F0001EB4000005420021FD6
diff --git a/P1/P1-amd64.M1 b/P1/P1-amd64.M1
@@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 488B7C2418
DEFINE ld_a0,sp,24 488B7C2428
DEFINE ld_a1,a0,0 488B7700
DEFINE ld_a1,a0,8 488B7708
+DEFINE ld_a1,a0,16 488B7710
DEFINE ld_a1,a1,0 488B7600
DEFINE ld_a1,a2,8 488B7208
DEFINE ld_a1,a3,8 488B7108
@@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 498B5300
DEFINE ld_a2,t2,0 498B5000
DEFINE ld_a2,sp,16 488B542420
DEFINE ld_a3,a0,0 488B4F00
+DEFINE ld_a3,a0,8 488B4F08
DEFINE ld_a3,a0,16 488B4F10
DEFINE ld_a3,a1,0 488B4E00
DEFINE ld_a3,a1,8 488B4E08
@@ -452,6 +454,7 @@ DEFINE beq_t2,t1 4D39D8750341FFE7
DEFINE bne_a0,t0 4C39D7740341FFE7
DEFINE bne_a1,a2 4839D6740341FFE7
DEFINE bne_a1,t0 4C39D6740341FFE7
+DEFINE bne_a2,a1 4839F2740341FFE7
DEFINE bne_a2,a3 4839CA740341FFE7
DEFINE bne_a3,a0 4839F9740341FFE7
DEFINE bne_a3,a1 4839F1740341FFE7
diff --git a/P1/P1-riscv64.M1 b/P1/P1-riscv64.M1
@@ -225,6 +225,7 @@ DEFINE ld_a0,sp,8 03358101
DEFINE ld_a0,sp,24 03358102
DEFINE ld_a1,a0,0 83350500
DEFINE ld_a1,a0,8 83358500
+DEFINE ld_a1,a0,16 83350501
DEFINE ld_a1,a1,0 83B50500
DEFINE ld_a1,a2,8 83358600
DEFINE ld_a1,a3,8 83B58600
@@ -250,6 +251,7 @@ DEFINE ld_a2,t1,0 03360300
DEFINE ld_a2,t2,0 03B60300
DEFINE ld_a2,sp,16 03360102
DEFINE ld_a3,a0,0 83360500
+DEFINE ld_a3,a0,8 83368500
DEFINE ld_a3,a0,16 83360501
DEFINE ld_a3,a1,0 83B60500
DEFINE ld_a3,a1,8 83B68500
@@ -452,6 +454,7 @@ DEFINE beq_t2,t1 6394630067800F00
DEFINE bne_a0,t0 6304550067800F00
DEFINE bne_a1,a2 6384C50067800F00
DEFINE bne_a1,t0 6384550067800F00
+DEFINE bne_a2,a1 6304B60067800F00
DEFINE bne_a2,a3 6304D60067800F00
DEFINE bne_a3,a0 6384A60067800F00
DEFINE bne_a3,a1 6384B60067800F00