boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 48cc13f8851f4264f2459311c53108e6b62fd5e0
parent 0b8e9e9a5fea1e7a28489feccb4f6b827be64292
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat, 25 Apr 2026 15:31:45 -0700

scheme1: add symbol?, #u8(...) literal, number?/bytevector? aliases

symbol? checks TAG.SYM directly. #u8(...) reads the element list via
parse_list and packs it into a fresh HDR.BV, sharing parse_one's atom
dispatch so #x.. element values come for free. number? and bytevector?
land in the prelude as exact aliases since scheme1 has only one numeric
and one byte-string repr today.

Diffstat:
Mscheme1/prelude.scm | 5+++++
Mscheme1/scheme1.P1pp | 261+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Atests/scheme1/78-symbolq.scm | 15+++++++++++++++
Atests/scheme1/79-bv-num-aliases.scm | 19+++++++++++++++++++
Atests/scheme1/80-u8-literal.scm | 30++++++++++++++++++++++++++++++
5 files changed, 310 insertions(+), 20 deletions(-)

diff --git a/scheme1/prelude.scm b/scheme1/prelude.scm @@ -14,6 +14,11 @@ (define (negative? x) (< x 0)) (define (positive? x) (> x 0)) +;; scheme1 has only one numeric and one byte-string repr today, so these +;; predicates are exact aliases. They exist so callers can spell intent. +(define number? integer?) +(define bytevector? string?) + (define (abs x) (if (< x 0) (- 0 x) x)) (define (min a b) (if (< a b) a b)) diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp @@ -649,6 +649,8 @@ %beqz(a1, &::hex_lit) %addi(a1, a0, -92) ; '\\' %beqz(a1, &::char_lit) + %addi(a1, a0, -117) ; 'u' + %beqz(a1, &::u8_lit) %die(msg_bad_hash) ::true_lit @@ -716,6 +718,29 @@ # a tagged fixnum (the u8 char value). %tail(&parse_char) + ::u8_lit + # Cursor is past '#u'. Demand '8' then '('; consume both and tail to + # parse_u8_body, which reads the element list and packs it into a bv. + %la(t2, &readbuf_pos) + %ld(t0, t2, 0) + %la(t1, &readbuf_len) + %ld(t1, t1, 0) + %beq(t0, t1, &::u8_bad) + %readbuf_byte(a0, t0) + %addi(a1, a0, -56) ; '8' + %bnez(a1, &::u8_bad) + %addi(t0, t0, 1) + %beq(t0, t1, &::u8_bad) + %readbuf_byte(a0, t0) + %addi(a1, a0, -40) ; '(' + %bnez(a1, &::u8_bad) + %addi(t0, t0, 1) + %st(t0, t2, 0) + %tail(&parse_u8_body) + + ::u8_bad + %die(msg_bad_hash) + ::eof %die(msg_unexp_eof) }) @@ -815,13 +840,105 @@ %die(msg_unterm_list) }) +# parse_u8_body() -> tagged HDR.BV in a0. Cursor sits past '#u8(' on +# entry. Reads elements via parse_list (each must be a fixnum byte 0..255; +# range is unchecked, matching make-bytevector's lax stance) and packs +# them into a fresh bytevector. +# +# Frame: 16 bytes +# +0 list head (cursor during fill pass) +# +8 result bv +%fn(parse_u8_body, 16, { + %call(&parse_list) + %st(a0, sp, 0) + + %call(&list_length) ; clobbers a0 -> count + %call(&bv_alloc) ; a0 = bv + %st(a0, sp, 8) + + %ld(t0, a0, 5) ; data ptr + %ld(a0, sp, 0) ; list cursor + + ::loop + %if_nil(t1, a0, &::done) + %car(t1, a0) + %sari(t1, t1, 3) ; untag fixnum -> raw byte + %sb(t1, t0, 0) + %addi(t0, t0, 1) + %cdr(a0, a0) + %b(&::loop) + ::done + %ld(a0, sp, 8) +}) + +# is_ident_byte(c=a0) -> a1 (1 if c is a valid identifier byte, else 0). +# Leaf. Allowed bytes are R7RS-Small's identifier set: ASCII letters, +# digits, and the extended chars ! $ % & * + - . / : < = > ? @ ^ _ ~ . +# Clobbers t0, t1, a1. +:is_ident_byte +%scope is_ident_byte + %addi(t0, a0, -48) ; '0' + %li(t1, 10) + %bltu(t0, t1, &::ok) ; '0'..'9' + %li(t1, 26) + %addi(t0, a0, -65) ; 'A' + %bltu(t0, t1, &::ok) ; 'A'..'Z' + %addi(t0, a0, -97) ; 'a' + %bltu(t0, t1, &::ok) ; 'a'..'z' + %addi(t0, a0, -33) ; '!' + %beqz(t0, &::ok) + %addi(t0, a0, -36) ; '$' + %beqz(t0, &::ok) + %addi(t0, a0, -37) ; '%' + %beqz(t0, &::ok) + %addi(t0, a0, -38) ; '&' + %beqz(t0, &::ok) + %addi(t0, a0, -42) ; '*' + %beqz(t0, &::ok) + %addi(t0, a0, -43) ; '+' + %beqz(t0, &::ok) + %addi(t0, a0, -45) ; '-' + %beqz(t0, &::ok) + %addi(t0, a0, -46) ; '.' + %beqz(t0, &::ok) + %addi(t0, a0, -47) ; '/' + %beqz(t0, &::ok) + %addi(t0, a0, -58) ; ':' + %beqz(t0, &::ok) + %addi(t0, a0, -60) ; '<' + %beqz(t0, &::ok) + %addi(t0, a0, -61) ; '=' + %beqz(t0, &::ok) + %addi(t0, a0, -62) ; '>' + %beqz(t0, &::ok) + %addi(t0, a0, -63) ; '?' + %beqz(t0, &::ok) + %addi(t0, a0, -64) ; '@' + %beqz(t0, &::ok) + %addi(t0, a0, -94) ; '^' + %beqz(t0, &::ok) + %addi(t0, a0, -95) ; '_' + %beqz(t0, &::ok) + %addi(t0, a0, -126) ; '~' + %beqz(t0, &::ok) + %li(a1, 0) + %ret + ::ok + %li(a1, 1) + %ret +%endscope + # parse_atom() -> tagged value (fixnum or symbol) in a0. # Reads until whitespace or paren or EOF, then dispatches by first byte. +# A token whose first byte is a digit (or sign-then-digit) commits to +# parse_dec and any non-numeric byte aborts; otherwise the token is a +# symbol and every byte is checked against is_ident_byte before intern. # -# Frame: 16 bytes +# Frame: 24 bytes # +0 start cursor (byte offset) # +8 end cursor (byte offset) -%fn(parse_atom, 16, { +# +16 cursor (scratch slot for the symbol-validation loop) +%fn(parse_atom, 24, { %la(t0, &readbuf_pos) %ld(t1, t0, 0) %st(t1, sp, 0) @@ -859,10 +976,14 @@ %addi(a1, t1, -48) %li(a2, 10) %bltu(a1, a2, &::is_int) - # '-' followed by digit -> int + # '-' or '+' followed by digit -> int. A lone '+' or '-' falls + # through to is_sym (those tokens stay valid identifiers). %addi(a1, t1, -45) - %bnez(a1, &::is_sym) - # there must be at least one more byte for it to be a number + %beqz(a1, &::sign) + %addi(a1, t1, -43) + %beqz(a1, &::sign) + %b(&::is_sym) + ::sign %ld(t2, sp, 8) %addi(t0, t0, 1) %beq(t0, t2, &::is_sym) @@ -872,6 +993,25 @@ # fall through to is_sym ::is_sym + # Validate every byte; abort on the first non-ident byte. + %ld(t0, sp, 0) + %st(t0, sp, 16) + ::sym_loop + %ld(t0, sp, 16) + %ld(t1, sp, 8) + %beq(t0, t1, &::sym_intern) + %readbuf_byte(a0, t0) + %call(&is_ident_byte) + %beqz(a1, &::sym_bad) + %ld(t0, sp, 16) + %addi(t0, t0, 1) + %st(t0, sp, 16) + %b(&::sym_loop) + + ::sym_bad + %die(msg_bad_ident) + + ::sym_intern %ld(a0, sp, 0) %la(t0, &readbuf_buf_ptr) %ld(t0, t0, 0) @@ -888,20 +1028,26 @@ %ld(a0, a0, 0) %add(a0, a0, t0) ; ptr = base + start_off %sub(a1, t1, t0) ; len = end_off - start_off - %tail(&parse_dec) + %call(&parse_dec) ; -> (a0=value, a1=ok) + %beqz(a1, &::int_bad) + %eret + ::int_bad + %die(msg_bad_number) }) # parse_string() -> tagged bytevector in a0. Cursor sits past the # opening '"' (consumed by parse_one). Two-pass: pass 1 walks the body, # counting decoded bytes in a0 and locating the closing '"'; pass 2 -# allocates the bv and decodes into its data buffer. Each escape -# (\n \t \r \\ \") yields one byte. +# allocates the bv and decodes into its data buffer. Each named escape +# (\n \t \r \\ \") yields one byte; an inline-hex escape \xHEX; (1+ +# hex digits, value 0..255, terminated by ';') also yields one byte. # -# Frame: 24 bytes +# Frame: 32 bytes # +0 start cursor (first content byte) # +8 end cursor (closing '"' position) # +16 bv wrapper (saved across the data fill loop) -%fn(parse_string, 24, { +# +24 spill slot (write ptr saved across parse_hex in \x escape) +%fn(parse_string, 32, { %la(t0, &readbuf_pos) %ld(t1, t0, 0) %st(t1, sp, 0) @@ -922,14 +1068,35 @@ %b(&::scan) ::scan_esc - # Backslash plus the next byte yield one decoded byte (validated in - # pass 2). Need at least one more byte after the backslash. + # Backslash plus the next byte yield one decoded byte. \xHEX; runs + # until the terminating ';' (validated in pass 2); every other escape + # is exactly two source bytes. %addi(t1, t1, 1) %beq(t1, t2, &::eof) + %readbuf_byte(a3, t1) + %addi(a1, a3, -120) ; 'x' + %beqz(a1, &::scan_hex) %addi(t1, t1, 1) %addi(a0, a0, 1) %b(&::scan) + ::scan_hex + # Skip past 'x' and scan to the terminating ';'. EOF before ';' + # falls into the unterminated-string path below, matching how an + # unterminated body is reported. + %addi(t1, t1, 1) + ::scan_hex_loop + %beq(t1, t2, &::eof) + %readbuf_byte(a3, t1) + %addi(a1, a3, -59) ; ';' + %beqz(a1, &::scan_hex_done) + %addi(t1, t1, 1) + %b(&::scan_hex_loop) + ::scan_hex_done + %addi(t1, t1, 1) ; consume ';' + %addi(a0, a0, 1) ; +1 output byte + %b(&::scan) + ::scan_done %st(t1, sp, 8) %call(&bv_alloc) @@ -963,6 +1130,8 @@ %beqz(a2, &::write_byte) %addi(a2, a1, -34) ; '"' %beqz(a2, &::write_byte) + %addi(a2, a1, -120) ; 'x' + %beqz(a2, &::esc_hex) %die(msg_bad_escape) ::esc_n @@ -979,6 +1148,38 @@ %addi(t1, t1, 1) %b(&::fill) + ::esc_hex + # Skip past 'x'. parse_hex consumes hex digits; demand at least one, + # value <= 255, and an immediate ';' terminator. parse_hex clobbers + # t0/t1/t2 and a2/a3, so spill the cursor (t1) and write ptr (a3) + # across the call. sp+0 is free once pass 1 finishes. + %addi(t1, t1, 1) ; t1 -> first hex digit + %st(t1, sp, 0) + %st(a3, sp, 24) + %la(t0, &readbuf_buf_ptr) + %ld(t0, t0, 0) + %add(a0, t0, t1) ; ptr to first hex digit + %sub(a1, t2, t1) ; max len (bytes left in body) + %call(&parse_hex) ; -> (a0=value, a1=consumed) + %beqz(a1, &::hex_bad) + %li(t0, 255) + %bltu(t0, a0, &::hex_bad) + %ld(t1, sp, 0) + %add(t1, t1, a1) ; t1 = position of expected ';' + %ld(t2, sp, 8) + %beq(t1, t2, &::hex_bad) + %readbuf_byte(t0, t1) + %addi(t0, t0, -59) ; ';' + %bnez(t0, &::hex_bad) + %addi(t1, t1, 1) ; consume ';' + %ld(a3, sp, 24) + %sb(a0, a3, 0) + %addi(a3, a3, 1) + %b(&::fill) + + ::hex_bad + %die(msg_bad_escape) + ::fill_done %addi(t1, t1, 1) ; consume closing '"' %la(t0, &readbuf_pos) @@ -1131,12 +1332,12 @@ }) # parse_dec(data_ptr=a0, len=a1) -> (tagged fixnum=a0, ok=a1). Leaf. -# Accepts an optional leading '-' followed by one or more decimal digits. -# ok=1 on a fully-consumed valid input; ok=0 on empty input, lone "-", -# or any non-digit byte. On failure a0 still holds the best-effort -# accumulator (sign-applied, retagged) so reader-style callers that -# don't check ok get the same garbage they got before this routine -# validated. UB on integer overflow. Uses only t0..t2. +# Accepts an optional leading '+' or '-' followed by one or more decimal +# digits. ok=1 on a fully-consumed valid input; ok=0 on empty input, a +# lone sign, or any non-digit byte. On failure a0 still holds the +# best-effort accumulator (sign-applied, retagged) so reader-style +# callers that don't check ok get the same garbage they got before this +# routine validated. UB on integer overflow. Uses only t0..t2. # # Register map: # t0 = current ptr a0 = byte / digit / final value @@ -1152,9 +1353,14 @@ %beqz(t1, &::fail) %lb(a0, t0, 0) - %addi(a0, a0, -45) ; '-' - %bnez(a0, &::loop_init) + %addi(t2, a0, -45) ; '-' + %beqz(t2, &::neg) + %addi(t2, a0, -43) ; '+' + %bnez(t2, &::loop_init) + %b(&::skip_sign) + ::neg %li(a2, 1) + ::skip_sign %addi(t0, t0, 1) %addi(t1, t1, -1) %beqz(t1, &::fail) @@ -2444,6 +2650,19 @@ %ret %endscope +# (symbol? x) -- #t iff x is TAG.SYM (interned symbol index, not a heap obj). +:prim_symbolq_entry +%scope prim_symbolq + %car(t0, a0) + %tagof(t1, t0) + %li(t2, %TAG.SYM) + %li(a0, %imm_val(%IMM.FALSE)) + %bne(t1, t2, &::end) + %li(a0, %imm_val(%IMM.TRUE)) + ::end + %ret +%endscope + # (procedure? x) -- #t iff x is HEAP-tagged with header HDR.CLOSURE or HDR.PRIM. :prim_procedureq_entry %scope prim_procedureq @@ -4359,6 +4578,7 @@ :name_bv_append "bytevector-append" :name_booleanq "boolean?" :name_integerq "integer?" +:name_symbolq "symbol?" :name_procedureq "procedure?" :name_zeroq "zero?" :name_not "not" @@ -4438,6 +4658,7 @@ &name_bv_append %(0) $(17) &prim_bv_append_entry %(0) &name_booleanq %(0) $(8) &prim_booleanq_entry %(0) &name_integerq %(0) $(8) &prim_integerq_entry %(0) +&name_symbolq %(0) $(7) &prim_symbolq_entry %(0) &name_procedureq %(0) $(10) &prim_procedureq_entry %(0) &name_zeroq %(0) $(5) &prim_zeroq_entry %(0) &name_not %(0) $(3) &prim_not_entry %(0) diff --git a/tests/scheme1/78-symbolq.scm b/tests/scheme1/78-symbolq.scm @@ -0,0 +1,15 @@ +; symbol? — #t iff x is a HEAP-tagged HDR.SYM (an interned symbol). + +(if (symbol? 'foo) 0 (sys-exit 1)) +(if (symbol? (quote bar)) 0 (sys-exit 2)) +(if (symbol? (string->symbol "baz")) 0 (sys-exit 3)) + +(if (not (symbol? "foo")) 0 (sys-exit 4)) +(if (not (symbol? 0)) 0 (sys-exit 5)) +(if (not (symbol? '())) 0 (sys-exit 6)) +(if (not (symbol? '(a b))) 0 (sys-exit 7)) +(if (not (symbol? #t)) 0 (sys-exit 8)) +(if (not (symbol? car)) 0 (sys-exit 9)) +(if (not (symbol? (make-bytevector 1 0))) 0 (sys-exit 10)) + +(sys-exit 0) diff --git a/tests/scheme1/79-bv-num-aliases.scm b/tests/scheme1/79-bv-num-aliases.scm @@ -0,0 +1,19 @@ +; bytevector? and number? — prelude aliases. bytevector? mirrors string? +; (one HDR.BV repr); number? mirrors integer? (only fixnums today). + +; bytevector? +(if (bytevector? "abc") 0 (sys-exit 1)) +(if (bytevector? (make-bytevector 3 0)) 0 (sys-exit 2)) +(if (not (bytevector? 0)) 0 (sys-exit 3)) +(if (not (bytevector? '())) 0 (sys-exit 4)) +(if (not (bytevector? 'foo)) 0 (sys-exit 5)) + +; number? +(if (number? 0) 0 (sys-exit 6)) +(if (number? -7) 0 (sys-exit 7)) +(if (number? 42) 0 (sys-exit 8)) +(if (not (number? "0")) 0 (sys-exit 9)) +(if (not (number? '())) 0 (sys-exit 10)) +(if (not (number? 'sym)) 0 (sys-exit 11)) + +(sys-exit 0) diff --git a/tests/scheme1/80-u8-literal.scm b/tests/scheme1/80-u8-literal.scm @@ -0,0 +1,30 @@ +; #u8(...) — bytevector literal. Each element is a fixnum byte (0..255); +; result is a HDR.BV with those exact bytes. + +; Empty literal. +(define e #u8()) +(if (bytevector=? e (make-bytevector 0 0)) 0 (sys-exit 1)) + +; Length and per-byte access. +(define a #u8(1 2 3)) +(if (= (bytevector-length a) 3) 0 (sys-exit 2)) +(if (= (bytevector-u8-ref a 0) 1) 0 (sys-exit 3)) +(if (= (bytevector-u8-ref a 1) 2) 0 (sys-exit 4)) +(if (= (bytevector-u8-ref a 2) 3) 0 (sys-exit 5)) + +; Equality with a same-bytes string literal (both HDR.BV). +(if (bytevector=? #u8(65 66 67) "ABC") 0 (sys-exit 6)) + +; Hex element values are accepted (parser shares atom dispatch). +(define b #u8(#x00 #xff)) +(if (= (bytevector-length b) 2) 0 (sys-exit 7)) +(if (= (bytevector-u8-ref b 0) 0) 0 (sys-exit 8)) +(if (= (bytevector-u8-ref b 1) 255) 0 (sys-exit 8)) + +; Whitespace and comments inside the literal. +(define c #u8( 10 20 ; trailing comment + 30 )) +(if (= (bytevector-length c) 3) 0 (sys-exit 9)) +(if (= (bytevector-u8-ref c 2) 30) 0 (sys-exit 10)) + +(sys-exit 0)