commit 9666d50f74fff919f734d35670d133c66cdc8847
parent 8411ede10e4eafe904ea36d7aac44be7b927a325
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 25 Apr 2026 13:39:45 -0700
scheme1: string literal, char literal
Diffstat:
11 files changed, 327 insertions(+), 3 deletions(-)
diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp
@@ -603,6 +603,8 @@
%beqz(a1, &::hash)
%addi(a1, a0, -39) ; '\''
%beqz(a1, &::quote)
+ %addi(a1, a0, -34) ; '"'
+ %beqz(a1, &::string)
%tail(&parse_atom)
@@ -617,9 +619,17 @@
::rparen
%die(msg_unexp_rparen)
+ ::string
+ # Consume opening '"' and tail to parse_string. parse_string scans
+ # through the matching '"' (consuming it) and returns a tagged bv.
+ %la(t0, &readbuf_pos)
+ %ld(t1, t0, 0)
+ %addi(t1, t1, 1)
+ %st(t1, t0, 0)
+ %tail(&parse_string)
+
::hash
# Consume '#' plus its type byte; dispatch on the type byte.
- # Other #-prefixed forms (#xHEX, #\char) come later.
%la(t2, &readbuf_pos)
%ld(t0, t2, 0)
%addi(t0, t0, 1)
@@ -637,6 +647,8 @@
%beqz(a1, &::hex_lit)
%addi(a1, a0, -88) ; 'X'
%beqz(a1, &::hex_lit)
+ %addi(a1, a0, -92) ; '\\'
+ %beqz(a1, &::char_lit)
%die(msg_bad_hash)
::true_lit
@@ -699,6 +711,11 @@
%mov(a0, t0)
%tail(&cons)
+ ::char_lit
+ # Cursor is already past '#\\'; parse_char scans the body and returns
+ # a tagged fixnum (the u8 char value).
+ %tail(&parse_char)
+
::eof
%die(msg_unexp_eof)
})
@@ -870,6 +887,245 @@
%tail(&parse_int)
})
+# parse_string() -> a0 = tagged bytevector holding the decoded literal.
+# Entry: readbuf_pos is at the first content byte; parse_one has
+# already stepped over the opening '"'. Exit: readbuf_pos is one past
+# the closing '"'.
+#
+# The body is walked twice. Walk 1 sizes the result: each plain byte
+# and each backslash pair counts as one output byte, and the walk stops
+# at the first unescaped '"'. Walk 2 runs after bv_alloc and writes the
+# decoded bytes into the new buffer. Recognised escapes are
+# \n \t \r \\ \" ; any other escape dies with msg_bad_escape, and
+# running out of input before the closing quote dies with
+# msg_unterm_string.
+#
+# Frame: 24 bytes
+# +0 offset of the first content byte
+# +8 offset of the closing '"'
+# +16 value returned by bv_alloc (reloaded into a0 on exit)
+%fn(parse_string, 24, {
+ %la(t0, &readbuf_pos)
+ %ld(t1, t0, 0)
+ %st(t1, sp, 0) ; remember where the body starts
+
+ %la(t2, &readbuf_len)
+ %ld(t2, t2, 0)
+
+ # Walk 1: t1 = cursor, t2 = input length, a0 = decoded byte count.
+ %li(a0, 0)
+ ::count
+ %beq(t1, t2, &::unterminated)
+ %readbuf_byte(a3, t1)
+ %addi(a1, a3, -34) ; '"'
+ %beqz(a1, &::counted)
+ %addi(a1, a3, -92) ; '\\'
+ %bnez(a1, &::count_one)
+ # Backslash: step over it and require one more byte. The pair is
+ # counted as a single output byte; the escape letter itself is only
+ # checked in walk 2.
+ %addi(t1, t1, 1)
+ %beq(t1, t2, &::unterminated)
+ ::count_one
+ %addi(t1, t1, 1)
+ %addi(a0, a0, 1)
+ %b(&::count)
+
+ ::counted
+ %st(t1, sp, 8)
+ %call(&bv_alloc) ; a0 still holds the decoded byte count
+ %st(a0, sp, 16)
+
+ # Walk 2: t1 = cursor, t2 = closing-quote offset, a3 = write pointer.
+ # Both cursors are reloaded from the frame after the call.
+ %ld(t1, sp, 0)
+ %ld(t2, sp, 8)
+ %ld(a3, a0, 5) ; data_ptr
+
+ ::decode
+ %beq(t1, t2, &::finish)
+ %readbuf_byte(a1, t1)
+ %addi(a2, a1, -92) ; '\\'
+ %bnez(a2, &::emit) ; plain byte: store it unchanged
+
+ # Escape: walk 1 already proved a byte follows the backslash.
+ %addi(t1, t1, 1)
+ %readbuf_byte(a1, t1)
+ %addi(a2, a1, -92) ; '\\' stands for itself
+ %beqz(a2, &::emit)
+ %addi(a2, a1, -34) ; '"' stands for itself
+ %beqz(a2, &::emit)
+ %addi(a2, a1, -110) ; 'n'
+ %beqz(a2, &::emit_lf)
+ %addi(a2, a1, -116) ; 't'
+ %beqz(a2, &::emit_tab)
+ %addi(a2, a1, -114) ; 'r'
+ %beqz(a2, &::emit_cr)
+ %die(msg_bad_escape)
+
+ ::emit_lf
+ %li(a1, 10)
+ %b(&::emit)
+ ::emit_tab
+ %li(a1, 9)
+ %b(&::emit)
+ ::emit_cr
+ %li(a1, 13)
+ # falls through into the store
+ ::emit
+ %sb(a1, a3, 0)
+ %addi(a3, a3, 1)
+ %addi(t1, t1, 1)
+ %b(&::decode)
+
+ ::finish
+ %addi(t1, t1, 1) ; step past the closing '"'
+ %la(t0, &readbuf_pos)
+ %st(t1, t0, 0)
+ %ld(a0, sp, 16)
+ %eret
+
+ ::unterminated
+ %die(msg_unterm_string)
+})
+
+# parse_char() -> a0 = tagged fixnum for a `#\` character literal.
+# Entry: readbuf_pos is just past the two-byte `#\` prefix, which
+# parse_one's hash dispatch consumed. The body is the first byte after
+# the prefix, taken unconditionally, plus every following byte up to
+# whitespace, a paren, or end of input. readbuf_pos is left on that
+# terminator.
+#
+# one-byte body                  -> that byte
+# longer body starting with 'x'  -> parse_hex over the bytes after 'x'
+# tab null space return newline  -> 9 0 32 13 10
+# Any other body, or no body at all, dies with msg_bad_char.
+#
+# Frame: 16 bytes
+# +0 start cursor; reused to hold a named char's value across memcmp
+# +8 end cursor
+%fn(parse_char, 16, {
+ %la(t0, &readbuf_pos)
+ %ld(t1, t0, 0)
+ %st(t1, sp, 0)
+
+ %la(t2, &readbuf_len)
+ %ld(t2, t2, 0)
+
+ # Nothing at all after the prefix is an error.
+ %beq(t1, t2, &::empty)
+
+ # The first body byte is taken whatever it is, so a delimiter placed
+ # right after the prefix is itself the character.
+ %addi(t1, t1, 1)
+
+ ::extend
+ %beq(t1, t2, &::extent_known)
+ %readbuf_byte(a0, t1)
+ %is_ws_branch(a1, a0, &::extent_known)
+ %addi(a1, a0, -40) ; '('
+ %beqz(a1, &::extent_known)
+ %addi(a1, a0, -41) ; ')'
+ %beqz(a1, &::extent_known)
+ %addi(t1, t1, 1)
+ %b(&::extend)
+
+ ::extent_known
+ %st(t1, sp, 8)
+ %la(t0, &readbuf_pos)
+ %st(t1, t0, 0) ; cursor now rests on the terminator
+
+ %ld(t0, sp, 0)
+ %sub(a2, t1, t0) ; a2 = body length
+
+ # t2 = address of the first body byte, a0 = that byte. Every form
+ # below starts from these two values.
+ %la(t2, &readbuf_buf_ptr)
+ %ld(t2, t2, 0)
+ %add(t2, t2, t0)
+ %lb(a0, t2, 0)
+
+ %li(a3, 1)
+ %beq(a2, a3, &::literal)
+
+ %addi(a1, a0, -120) ; 'x'
+ %beqz(a1, &::hex)
+
+ # Named form. The body length alone selects the single candidate
+ # name; the bytes are compared afterwards at ::compare.
+ %addi(a0, a2, -3)
+ %beqz(a0, &::want_tab)
+ %addi(a0, a2, -4)
+ %beqz(a0, &::want_null)
+ %addi(a0, a2, -5)
+ %beqz(a0, &::want_space)
+ %addi(a0, a2, -6)
+ %beqz(a0, &::want_return)
+ %addi(a0, a2, -7)
+ %beqz(a0, &::want_newline)
+ %b(&::reject)
+
+ ::literal
+ %mkfix(a0, a0)
+ %eret
+
+ ::hex
+ %addi(a0, t2, 1) ; skip the leading 'x'
+ %addi(a1, a2, -1) ; remaining byte count
+ %call(&parse_hex)
+ %mkfix(a0, a0)
+ %eret
+
+ # Each want_* picks a1 = candidate name and a3 = its character value.
+ ::want_tab
+ %la(a1, &name_ch_tab)
+ %li(a3, 9)
+ %b(&::compare)
+ ::want_null
+ %la(a1, &name_ch_null)
+ %li(a3, 0)
+ %b(&::compare)
+ ::want_space
+ %la(a1, &name_ch_space)
+ %li(a3, 32)
+ %b(&::compare)
+ ::want_return
+ %la(a1, &name_ch_return)
+ %li(a3, 13)
+ %b(&::compare)
+ ::want_newline
+ %la(a1, &name_ch_newline)
+ %li(a3, 10)
+ # falls through into the comparison
+ ::compare
+ # memcmp gets a0 = body, a1 = candidate name, a2 = their common
+ # length. The value is parked in the frame for the call.
+ %st(a3, sp, 0)
+ %mov(a0, t2)
+ %call(&memcmp)
+ %bnez(a0, &::reject)
+ %ld(a0, sp, 0)
+ %mkfix(a0, a0)
+ %eret
+
+ ::reject
+ %die(msg_bad_char)
+
+ ::empty
+ %die(msg_bad_char)
+})
+
# parse_int(start_off=a0, end_off=a1) -> tagged fixnum in a0. Leaf.
:parse_int
%scope parse_int
@@ -3804,6 +4060,15 @@
:msg_heap_full "scheme1: heap exhausted" '0a' '00'
:msg_readbuf_full "scheme1: source buffer overflow" '0a' '00'
:msg_bv_oob "scheme1: bytevector index out of range" '0a' '00'
+:msg_unterm_string "scheme1: unterminated string literal" '0a' '00'
+:msg_bad_escape "scheme1: bad string escape" '0a' '00'
+:msg_bad_char "scheme1: bad #\\ character literal" '0a' '00'
+
+:name_ch_tab "tab"
+:name_ch_null "null"
+:name_ch_space "space"
+:name_ch_return "return"
+:name_ch_newline "newline"
# =========================================================================
# BSS pointer-init table
diff --git a/tests/scheme1/06-comment.scm b/tests/scheme1/06-comment.scm
@@ -1,5 +1,11 @@
; A comment at top of file.
;; double-semicolon, ignored.
+
+; Full-line comment between top-level forms (no form on this line).
+(define x 17)
+; Another full-line comment between two top-level forms.
+(define y x) ; trailing comment after a complete top-level form
(sys-exit ; trailing comment after head
; another full-line comment between args
- 17) ; trailing tail
+ y) ; trailing tail
+; Trailing line-comment as the last content of the file.
diff --git a/tests/scheme1/07-hex.scm b/tests/scheme1/07-hex.scm
@@ -1 +1,10 @@
-(sys-exit #x2a)
+; #x and #X dispatch, lowercase / uppercase / mixed-case digits, and the
+; negative-literal form #x-NN.
+(if (= #x2a 42) 0 (sys-exit 1)) ; lowercase #x, lowercase digits
+(if (= #X2a 42) 0 (sys-exit 2)) ; uppercase #X
+(if (= #xff 255) 0 (sys-exit 3)) ; lowercase digits
+(if (= #xFF 255) 0 (sys-exit 4)) ; uppercase digits
+(if (= #xFf 255) 0 (sys-exit 5)) ; mixed-case digits
+(if (= #x-2a -42) 0 (sys-exit 6)) ; negative
+(if (= #X-Ff -255) 0 (sys-exit 7)) ; negative + uppercase #X + uppercase digits
+(sys-exit 42)
diff --git a/tests/scheme1/52-string-literal.expected-exit b/tests/scheme1/52-string-literal.expected-exit
@@ -0,0 +1 @@
+0
diff --git a/tests/scheme1/52-string-literal.scm b/tests/scheme1/52-string-literal.scm
@@ -0,0 +1,9 @@
+; A "..." literal reads as a bytevector. Checks its length and every byte.
+(define s "abc") ; three plain bytes, no escapes
+(if (= (bytevector-length s) 3) 0 (sys-exit 1)) ; a failed check exits with its own code
+(if (= (bytevector-u8-ref s 0) 97) 0 (sys-exit 2)) ; 'a'
+(if (= (bytevector-u8-ref s 1) 98) 0 (sys-exit 3)) ; 'b'
+(if (= (bytevector-u8-ref s 2) 99) 0 (sys-exit 4)) ; 'c'
+; The empty literal "" must read as a 0-length bytevector.
+(if (= (bytevector-length "") 0) 0 (sys-exit 5))
+(sys-exit 0) ; reached only when every check above passed
diff --git a/tests/scheme1/53-string-escapes.expected-exit b/tests/scheme1/53-string-escapes.expected-exit
@@ -0,0 +1 @@
+0
diff --git a/tests/scheme1/53-string-escapes.scm b/tests/scheme1/53-string-escapes.scm
@@ -0,0 +1,13 @@
+; All five string escapes: \n \t \r \\ \" — each decodes to exactly one byte.
+; "x\ny\tz\r\\\"" -> 'x', LF, 'y', TAB, 'z', CR, '\\', '"' (8 bytes)
+(define s "x\ny\tz\r\\\"") ; plain bytes interleaved with every escape
+(if (= (bytevector-length s) 8) 0 (sys-exit 1)) ; length counts decoded bytes, not source bytes
+(if (= (bytevector-u8-ref s 0) 120) 0 (sys-exit 2)) ; 'x'
+(if (= (bytevector-u8-ref s 1) 10) 0 (sys-exit 3)) ; \n
+(if (= (bytevector-u8-ref s 2) 121) 0 (sys-exit 4)) ; 'y'
+(if (= (bytevector-u8-ref s 3) 9) 0 (sys-exit 5)) ; \t
+(if (= (bytevector-u8-ref s 4) 122) 0 (sys-exit 6)) ; 'z'
+(if (= (bytevector-u8-ref s 5) 13) 0 (sys-exit 7)) ; \r
+(if (= (bytevector-u8-ref s 6) 92) 0 (sys-exit 8)) ; \\
+(if (= (bytevector-u8-ref s 7) 34) 0 (sys-exit 9)) ; \"
+(sys-exit 0) ; reached only when every check above passed
diff --git a/tests/scheme1/54-char-literal.expected-exit b/tests/scheme1/54-char-literal.expected-exit
@@ -0,0 +1 @@
+0
diff --git a/tests/scheme1/54-char-literal.scm b/tests/scheme1/54-char-literal.scm
@@ -0,0 +1,11 @@
+; #\<c> for a printable ASCII <c> evaluates to the byte value of <c>.
+; #\xNN evaluates to the byte whose value is the hex number NN.
+(if (= #\a 97) 0 (sys-exit 1)) ; lowercase
+(if (= #\Z 90) 0 (sys-exit 2)) ; uppercase
+(if (= #\! 33) 0 (sys-exit 3)) ; punctuation
+(if (= #\~ 126) 0 (sys-exit 4)) ; last printable
+(if (= #\x41 65) 0 (sys-exit 5)) ; hex form -> 'A'
+(if (= #\x7f 127) 0 (sys-exit 6)) ; hex form -> DEL
+(if (= #\xff 255) 0 (sys-exit 7)) ; hex, 8-bit max
+(if (= #\xFF 255) 0 (sys-exit 8)) ; hex, uppercase digits
+(sys-exit 0) ; reached only when every check above passed
diff --git a/tests/scheme1/55-char-named.expected-exit b/tests/scheme1/55-char-named.expected-exit
@@ -0,0 +1 @@
+0
diff --git a/tests/scheme1/55-char-named.scm b/tests/scheme1/55-char-named.scm
@@ -0,0 +1,7 @@
+; Named character forms: #\space #\newline #\tab #\return #\null.
+(if (= #\space 32) 0 (sys-exit 1)) ; 5-byte name
+(if (= #\newline 10) 0 (sys-exit 2)) ; 7-byte name, LF
+(if (= #\tab 9) 0 (sys-exit 3)) ; 3-byte name
+(if (= #\return 13) 0 (sys-exit 4)) ; 6-byte name, CR
+(if (= #\null 0) 0 (sys-exit 5)) ; 4-byte name
+(sys-exit 0) ; reached only when every check above passed