boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 9666d50f74fff919f734d35670d133c66cdc8847
parent 8411ede10e4eafe904ea36d7aac44be7b927a325
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat, 25 Apr 2026 13:39:45 -0700

scheme1: string literal, char literal

Diffstat:
Mscheme1/scheme1.P1pp | 267++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mtests/scheme1/06-comment.scm | 8+++++++-
Mtests/scheme1/07-hex.scm | 11++++++++++-
Atests/scheme1/52-string-literal.expected-exit | 1+
Atests/scheme1/52-string-literal.scm | 9+++++++++
Atests/scheme1/53-string-escapes.expected-exit | 1+
Atests/scheme1/53-string-escapes.scm | 13+++++++++++++
Atests/scheme1/54-char-literal.expected-exit | 1+
Atests/scheme1/54-char-literal.scm | 11+++++++++++
Atests/scheme1/55-char-named.expected-exit | 1+
Atests/scheme1/55-char-named.scm | 7+++++++
11 files changed, 327 insertions(+), 3 deletions(-)

diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp @@ -603,6 +603,8 @@ %beqz(a1, &::hash) %addi(a1, a0, -39) ; '\'' %beqz(a1, &::quote) + %addi(a1, a0, -34) ; '"' + %beqz(a1, &::string) %tail(&parse_atom) @@ -617,9 +619,17 @@ ::rparen %die(msg_unexp_rparen) + ::string + # Consume opening '"' and tail to parse_string. parse_string scans + # through the matching '"' (consuming it) and returns a tagged bv. + %la(t0, &readbuf_pos) + %ld(t1, t0, 0) + %addi(t1, t1, 1) + %st(t1, t0, 0) + %tail(&parse_string) + ::hash # Consume '#' plus its type byte; dispatch on the type byte. - # Other #-prefixed forms (#xHEX, #\char) come later. %la(t2, &readbuf_pos) %ld(t0, t2, 0) %addi(t0, t0, 1) @@ -637,6 +647,8 @@ %beqz(a1, &::hex_lit) %addi(a1, a0, -88) ; 'X' %beqz(a1, &::hex_lit) + %addi(a1, a0, -92) ; '\\' + %beqz(a1, &::char_lit) %die(msg_bad_hash) ::true_lit @@ -699,6 +711,11 @@ %mov(a0, t0) %tail(&cons) + ::char_lit + # Cursor is already past '#\\'; parse_char scans the body and returns + # a tagged fixnum (the u8 char value). + %tail(&parse_char) + ::eof %die(msg_unexp_eof) }) @@ -870,6 +887,245 @@ %tail(&parse_int) }) +# parse_string() -> tagged bytevector in a0. Cursor sits past the +# opening '"' (consumed by parse_one). Two-pass: pass 1 walks the body, +# counting decoded bytes in a0 and locating the closing '"'; pass 2 +# allocates the bv and decodes into its data buffer. Each escape +# (\n \t \r \\ \") yields one byte. 
+# +# Frame: 24 bytes +# +0 start cursor (first content byte) +# +8 end cursor (closing '"' position) +# +16 bv wrapper (saved across the data fill loop) +%fn(parse_string, 24, { + %la(t0, &readbuf_pos) + %ld(t1, t0, 0) + %st(t1, sp, 0) + + %la(t2, &readbuf_len) + %ld(t2, t2, 0) + + %li(a0, 0) + ::scan + %beq(t1, t2, &::eof) + %readbuf_byte(a3, t1) + %addi(a1, a3, -34) ; '"' + %beqz(a1, &::scan_done) + %addi(a1, a3, -92) ; '\\' + %beqz(a1, &::scan_esc) + %addi(t1, t1, 1) + %addi(a0, a0, 1) + %b(&::scan) + + ::scan_esc + # Backslash plus the next byte yield one decoded byte (validated in + # pass 2). Need at least one more byte after the backslash. + %addi(t1, t1, 1) + %beq(t1, t2, &::eof) + %addi(t1, t1, 1) + %addi(a0, a0, 1) + %b(&::scan) + + ::scan_done + %st(t1, sp, 8) + %call(&bv_alloc) + %st(a0, sp, 16) + + # Pass 2: decode into the freshly allocated data buffer. + %ld(t1, sp, 0) ; start + %ld(t2, sp, 8) ; end + %ld(a3, a0, 5) ; data_ptr + + ::fill + %beq(t1, t2, &::fill_done) + %readbuf_byte(a1, t1) + %addi(a2, a1, -92) ; '\\' + %beqz(a2, &::fill_esc) + %sb(a1, a3, 0) + %addi(a3, a3, 1) + %addi(t1, t1, 1) + %b(&::fill) + + ::fill_esc + %addi(t1, t1, 1) ; consume backslash + %readbuf_byte(a1, t1) + %addi(a2, a1, -110) ; 'n' + %beqz(a2, &::esc_n) + %addi(a2, a1, -116) ; 't' + %beqz(a2, &::esc_t) + %addi(a2, a1, -114) ; 'r' + %beqz(a2, &::esc_r) + %addi(a2, a1, -92) ; '\\' + %beqz(a2, &::write_byte) + %addi(a2, a1, -34) ; '"' + %beqz(a2, &::write_byte) + %die(msg_bad_escape) + + ::esc_n + %li(a1, 10) + %b(&::write_byte) + ::esc_t + %li(a1, 9) + %b(&::write_byte) + ::esc_r + %li(a1, 13) + ::write_byte + %sb(a1, a3, 0) + %addi(a3, a3, 1) + %addi(t1, t1, 1) + %b(&::fill) + + ::fill_done + %addi(t1, t1, 1) ; consume closing '"' + %la(t0, &readbuf_pos) + %st(t1, t0, 0) + %ld(a0, sp, 16) + %eret + + ::eof + %die(msg_unterm_string) +}) + +# parse_char() -> tagged fixnum (the u8 char value) in a0. Cursor sits +# past '#\\' (consumed by parse_one's hash dispatch). 
Always consumes +# at least one byte; then continues until ws/paren/EOF. Single-byte +# bodies yield that byte; multi-byte bodies dispatch to hex (#\xNN) or +# named (#\space, #\newline, #\tab, #\return, #\null) forms. +# +# Frame: 16 bytes +# +0 start cursor +# +8 end cursor +%fn(parse_char, 16, { + %la(t0, &readbuf_pos) + %ld(t1, t0, 0) + %st(t1, sp, 0) + + %la(t2, &readbuf_len) + %ld(t2, t2, 0) + + %beq(t1, t2, &::short) + + # Always consume the first byte unconditionally — it might itself be + # a delimiter (e.g., '(' in `#\(`) and is still the character value. + %addi(t1, t1, 1) + + ::scan + %beq(t1, t2, &::scan_done) + %readbuf_byte(a0, t1) + %is_ws_branch(a1, a0, &::scan_done) + %addi(a1, a0, -40) ; '(' + %beqz(a1, &::scan_done) + %addi(a1, a0, -41) ; ')' + %beqz(a1, &::scan_done) + %addi(t1, t1, 1) + %b(&::scan) + + ::scan_done + %st(t1, sp, 8) + %la(t0, &readbuf_pos) + %st(t1, t0, 0) + + %ld(t0, sp, 0) + %ld(t1, sp, 8) + %sub(a2, t1, t0) ; length + + %li(a3, 1) + %beq(a2, a3, &::single) + + %la(t2, &readbuf_buf_ptr) + %ld(t2, t2, 0) + %add(t2, t2, t0) ; t2 = slice ptr + + # Hex form: first byte is 'x'. + %lb(a0, t2, 0) + %addi(a1, a0, -120) ; 'x' + %beqz(a1, &::hex_form) + + # Named form: dispatch on length. 
+ %li(a3, 3) + %beq(a2, a3, &::try_tab) + %li(a3, 4) + %beq(a2, a3, &::try_null) + %li(a3, 5) + %beq(a2, a3, &::try_space) + %li(a3, 6) + %beq(a2, a3, &::try_return) + %li(a3, 7) + %beq(a2, a3, &::try_newline) + %b(&::bad) + + ::single + %la(t2, &readbuf_buf_ptr) + %ld(t2, t2, 0) + %add(t2, t2, t0) + %lb(a0, t2, 0) + %mkfix(a0, a0) + %eret + + ::hex_form + %addi(a0, t2, 1) + %addi(a1, a2, -1) + %call(&parse_hex) + %mkfix(a0, a0) + %eret + + ::try_tab + %mov(a0, t2) + %la(a1, &name_ch_tab) + %li(a2, 3) + %call(&memcmp) + %bnez(a0, &::bad) + %li(a0, 9) + %mkfix(a0, a0) + %eret + + ::try_null + %mov(a0, t2) + %la(a1, &name_ch_null) + %li(a2, 4) + %call(&memcmp) + %bnez(a0, &::bad) + %li(a0, 0) + %mkfix(a0, a0) + %eret + + ::try_space + %mov(a0, t2) + %la(a1, &name_ch_space) + %li(a2, 5) + %call(&memcmp) + %bnez(a0, &::bad) + %li(a0, 32) + %mkfix(a0, a0) + %eret + + ::try_return + %mov(a0, t2) + %la(a1, &name_ch_return) + %li(a2, 6) + %call(&memcmp) + %bnez(a0, &::bad) + %li(a0, 13) + %mkfix(a0, a0) + %eret + + ::try_newline + %mov(a0, t2) + %la(a1, &name_ch_newline) + %li(a2, 7) + %call(&memcmp) + %bnez(a0, &::bad) + %li(a0, 10) + %mkfix(a0, a0) + %eret + + ::bad + %die(msg_bad_char) + + ::short + %die(msg_bad_char) +}) + # parse_int(start_off=a0, end_off=a1) -> tagged fixnum in a0. Leaf. 
:parse_int %scope parse_int @@ -3804,6 +4060,15 @@ :msg_heap_full "scheme1: heap exhausted" '0a' '00' :msg_readbuf_full "scheme1: source buffer overflow" '0a' '00' :msg_bv_oob "scheme1: bytevector index out of range" '0a' '00' +:msg_unterm_string "scheme1: unterminated string literal" '0a' '00' +:msg_bad_escape "scheme1: bad string escape" '0a' '00' +:msg_bad_char "scheme1: bad #\\ character literal" '0a' '00' + +:name_ch_tab "tab" +:name_ch_null "null" +:name_ch_space "space" +:name_ch_return "return" +:name_ch_newline "newline" # ========================================================================= # BSS pointer-init table diff --git a/tests/scheme1/06-comment.scm b/tests/scheme1/06-comment.scm @@ -1,5 +1,11 @@ ; A comment at top of file. ;; double-semicolon, ignored. + +; Full-line comment between top-level forms (no form on this line). +(define x 17) +; Another full-line comment between two top-level forms. +(define y x) ; trailing comment after a complete top-level form (sys-exit ; trailing comment after head ; another full-line comment between args - 17) ; trailing tail + y) ; trailing tail +; Trailing line-comment as the last content of the file. diff --git a/tests/scheme1/07-hex.scm b/tests/scheme1/07-hex.scm @@ -1 +1,10 @@ -(sys-exit #x2a) +; #x and #X dispatch, lowercase / uppercase / mixed-case digits, and the +; negative-literal form #x-NN. 
+(if (= #x2a 42) 0 (sys-exit 1)) ; lowercase #x, lowercase digits +(if (= #X2a 42) 0 (sys-exit 2)) ; uppercase #X +(if (= #xff 255) 0 (sys-exit 3)) ; lowercase digits +(if (= #xFF 255) 0 (sys-exit 4)) ; uppercase digits +(if (= #xFf 255) 0 (sys-exit 5)) ; mixed-case digits +(if (= #x-2a -42) 0 (sys-exit 6)) ; negative +(if (= #X-Ff -255) 0 (sys-exit 7)) ; negative + uppercase #X + uppercase digits +(sys-exit 42) diff --git a/tests/scheme1/52-string-literal.expected-exit b/tests/scheme1/52-string-literal.expected-exit @@ -0,0 +1 @@ +0 diff --git a/tests/scheme1/52-string-literal.scm b/tests/scheme1/52-string-literal.scm @@ -0,0 +1,9 @@ +; "..." string literals are bytevectors. Basic content + length. +(define s "abc") +(if (= (bytevector-length s) 3) 0 (sys-exit 1)) +(if (= (bytevector-u8-ref s 0) 97) 0 (sys-exit 2)) ; 'a' +(if (= (bytevector-u8-ref s 1) 98) 0 (sys-exit 3)) ; 'b' +(if (= (bytevector-u8-ref s 2) 99) 0 (sys-exit 4)) ; 'c' +; Empty string is a 0-length bytevector. +(if (= (bytevector-length "") 0) 0 (sys-exit 5)) +(sys-exit 0) diff --git a/tests/scheme1/53-string-escapes.expected-exit b/tests/scheme1/53-string-escapes.expected-exit @@ -0,0 +1 @@ +0 diff --git a/tests/scheme1/53-string-escapes.scm b/tests/scheme1/53-string-escapes.scm @@ -0,0 +1,13 @@ +; All five string escapes: \n \t \r \\ \" — each yields one byte. 
+; "x\ny\tz\r\\\"" -> 'x', LF, 'y', TAB, 'z', CR, '\\', '"' (8 bytes) +(define s "x\ny\tz\r\\\"") +(if (= (bytevector-length s) 8) 0 (sys-exit 1)) +(if (= (bytevector-u8-ref s 0) 120) 0 (sys-exit 2)) ; 'x' +(if (= (bytevector-u8-ref s 1) 10) 0 (sys-exit 3)) ; \n +(if (= (bytevector-u8-ref s 2) 121) 0 (sys-exit 4)) ; 'y' +(if (= (bytevector-u8-ref s 3) 9) 0 (sys-exit 5)) ; \t +(if (= (bytevector-u8-ref s 4) 122) 0 (sys-exit 6)) ; 'z' +(if (= (bytevector-u8-ref s 5) 13) 0 (sys-exit 7)) ; \r +(if (= (bytevector-u8-ref s 6) 92) 0 (sys-exit 8)) ; \\ +(if (= (bytevector-u8-ref s 7) 34) 0 (sys-exit 9)) ; \" +(sys-exit 0) diff --git a/tests/scheme1/54-char-literal.expected-exit b/tests/scheme1/54-char-literal.expected-exit @@ -0,0 +1 @@ +0 diff --git a/tests/scheme1/54-char-literal.scm b/tests/scheme1/54-char-literal.scm @@ -0,0 +1,11 @@ +; #\<c> for printable ASCII evaluates to the u8 of <c>. +; #\xNN evaluates to the byte with that hex value. +(if (= #\a 97) 0 (sys-exit 1)) ; lowercase +(if (= #\Z 90) 0 (sys-exit 2)) ; uppercase +(if (= #\! 33) 0 (sys-exit 3)) ; punctuation +(if (= #\~ 126) 0 (sys-exit 4)) ; last printable +(if (= #\x41 65) 0 (sys-exit 5)) ; hex form -> 'A' +(if (= #\x7f 127) 0 (sys-exit 6)) ; hex form -> DEL +(if (= #\xff 255) 0 (sys-exit 7)) ; hex, 8-bit max +(if (= #\xFF 255) 0 (sys-exit 8)) ; hex, uppercase digits +(sys-exit 0) diff --git a/tests/scheme1/55-char-named.expected-exit b/tests/scheme1/55-char-named.expected-exit @@ -0,0 +1 @@ +0 diff --git a/tests/scheme1/55-char-named.scm b/tests/scheme1/55-char-named.scm @@ -0,0 +1,7 @@ +; Named character forms: #\space #\newline #\tab #\return #\null. +(if (= #\space 32) 0 (sys-exit 1)) +(if (= #\newline 10) 0 (sys-exit 2)) +(if (= #\tab 9) 0 (sys-exit 3)) +(if (= #\return 13) 0 (sys-exit 4)) +(if (= #\null 0) 0 (sys-exit 5)) +(sys-exit 0)