commit 48cc13f8851f4264f2459311c53108e6b62fd5e0
parent 0b8e9e9a5fea1e7a28489feccb4f6b827be64292
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 25 Apr 2026 15:31:45 -0700
scheme1: add symbol?, #u8(...) literal, number?/bytevector? aliases
symbol? checks TAG.SYM directly. #u8(...) reads the element list via
parse_list and packs it into a fresh HDR.BV, sharing parse_one's atom
dispatch so #x.. element values come for free. number? and bytevector?
land in the prelude as exact aliases since scheme1 has only one numeric
and one byte-string repr today.
Diffstat:
5 files changed, 310 insertions(+), 20 deletions(-)
diff --git a/scheme1/prelude.scm b/scheme1/prelude.scm
@@ -14,6 +14,11 @@
(define (negative? x) (< x 0))
(define (positive? x) (> x 0))
+;; scheme1 has only one numeric and one byte-string repr today, so these
+;; predicates are exact aliases. They exist so callers can spell intent.
+(define number? integer?)
+(define bytevector? string?)
+
(define (abs x) (if (< x 0) (- 0 x) x))
(define (min a b) (if (< a b) a b))
diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp
@@ -649,6 +649,8 @@
%beqz(a1, &::hex_lit)
%addi(a1, a0, -92) ; '\\'
%beqz(a1, &::char_lit)
+ %addi(a1, a0, -117) ; 'u'
+ %beqz(a1, &::u8_lit)
%die(msg_bad_hash)
::true_lit
@@ -716,6 +718,29 @@
# a tagged fixnum (the u8 char value).
%tail(&parse_char)
+ ::u8_lit
+ # Cursor is past '#u'. Demand '8' then '('; consume both and tail to
+ # parse_u8_body, which reads the element list and packs it into a bv.
+ %la(t2, &readbuf_pos)
+ %ld(t0, t2, 0)
+ %la(t1, &readbuf_len)
+ %ld(t1, t1, 0)
+ %beq(t0, t1, &::u8_bad)
+ %readbuf_byte(a0, t0)
+ %addi(a1, a0, -56) ; '8'
+ %bnez(a1, &::u8_bad)
+ %addi(t0, t0, 1)
+ %beq(t0, t1, &::u8_bad)
+ %readbuf_byte(a0, t0)
+ %addi(a1, a0, -40) ; '('
+ %bnez(a1, &::u8_bad)
+ %addi(t0, t0, 1)
+ %st(t0, t2, 0)
+ %tail(&parse_u8_body)
+
+ ::u8_bad
+ %die(msg_bad_hash)
+
::eof
%die(msg_unexp_eof)
})
@@ -815,13 +840,105 @@
%die(msg_unterm_list)
})
+# parse_u8_body() -> tagged HDR.BV in a0. Cursor sits past '#u8(' on
+# entry. Reads elements via parse_list (each must be a fixnum byte 0..255;
+# range is unchecked, matching make-bytevector's lax stance) and packs
+# them into a fresh bytevector.
+#
+# Frame: 16 bytes
+# +0 list head (cursor during fill pass)
+# +8 result bv
+%fn(parse_u8_body, 16, {
+ %call(&parse_list)
+ %st(a0, sp, 0)
+
+ %call(&list_length) ; clobbers a0 -> count
+ %call(&bv_alloc) ; a0 = bv
+ %st(a0, sp, 8)
+
+ %ld(t0, a0, 5) ; data ptr
+ %ld(a0, sp, 0) ; list cursor
+
+ ::loop
+ %if_nil(t1, a0, &::done)
+ %car(t1, a0)
+ %sari(t1, t1, 3) ; untag fixnum -> raw byte
+ %sb(t1, t0, 0)
+ %addi(t0, t0, 1)
+ %cdr(a0, a0)
+ %b(&::loop)
+ ::done
+ %ld(a0, sp, 8)
+})
+
+# is_ident_byte(c=a0) -> a1 (1 if c is a valid identifier byte, else 0).
+# Leaf. Allowed bytes are R7RS-Small's identifier set: ASCII letters,
+# digits, and the extended chars ! $ % & * + - . / : < = > ? @ ^ _ ~ .
+# Clobbers t0, t1, a1.
+:is_ident_byte
+%scope is_ident_byte
+ %addi(t0, a0, -48) ; '0'
+ %li(t1, 10)
+ %bltu(t0, t1, &::ok) ; '0'..'9'
+ %li(t1, 26)
+ %addi(t0, a0, -65) ; 'A'
+ %bltu(t0, t1, &::ok) ; 'A'..'Z'
+ %addi(t0, a0, -97) ; 'a'
+ %bltu(t0, t1, &::ok) ; 'a'..'z'
+ %addi(t0, a0, -33) ; '!'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -36) ; '$'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -37) ; '%'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -38) ; '&'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -42) ; '*'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -43) ; '+'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -45) ; '-'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -46) ; '.'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -47) ; '/'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -58) ; ':'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -60) ; '<'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -61) ; '='
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -62) ; '>'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -63) ; '?'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -64) ; '@'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -94) ; '^'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -95) ; '_'
+ %beqz(t0, &::ok)
+ %addi(t0, a0, -126) ; '~'
+ %beqz(t0, &::ok)
+ %li(a1, 0)
+ %ret
+ ::ok
+ %li(a1, 1)
+ %ret
+%endscope
+
# parse_atom() -> tagged value (fixnum or symbol) in a0.
# Reads until whitespace or paren or EOF, then dispatches by first byte.
+# A token whose first byte is a digit (or sign-then-digit) commits to
+# parse_dec and any non-numeric byte aborts; otherwise the token is a
+# symbol and every byte is checked against is_ident_byte before intern.
#
-# Frame: 16 bytes
+# Frame: 24 bytes
# +0 start cursor (byte offset)
# +8 end cursor (byte offset)
-%fn(parse_atom, 16, {
+# +16 cursor (scratch slot for the symbol-validation loop)
+%fn(parse_atom, 24, {
%la(t0, &readbuf_pos)
%ld(t1, t0, 0)
%st(t1, sp, 0)
@@ -859,10 +976,14 @@
%addi(a1, t1, -48)
%li(a2, 10)
%bltu(a1, a2, &::is_int)
- # '-' followed by digit -> int
+ # '-' or '+' followed by digit -> int. A lone '+' or '-' falls
+ # through to is_sym (those tokens stay valid identifiers).
%addi(a1, t1, -45)
- %bnez(a1, &::is_sym)
- # there must be at least one more byte for it to be a number
+ %beqz(a1, &::sign)
+ %addi(a1, t1, -43)
+ %beqz(a1, &::sign)
+ %b(&::is_sym)
+ ::sign
%ld(t2, sp, 8)
%addi(t0, t0, 1)
%beq(t0, t2, &::is_sym)
@@ -872,6 +993,25 @@
# fall through to is_sym
::is_sym
+ # Validate every byte; abort on the first non-ident byte.
+ %ld(t0, sp, 0)
+ %st(t0, sp, 16)
+ ::sym_loop
+ %ld(t0, sp, 16)
+ %ld(t1, sp, 8)
+ %beq(t0, t1, &::sym_intern)
+ %readbuf_byte(a0, t0)
+ %call(&is_ident_byte)
+ %beqz(a1, &::sym_bad)
+ %ld(t0, sp, 16)
+ %addi(t0, t0, 1)
+ %st(t0, sp, 16)
+ %b(&::sym_loop)
+
+ ::sym_bad
+ %die(msg_bad_ident)
+
+ ::sym_intern
%ld(a0, sp, 0)
%la(t0, &readbuf_buf_ptr)
%ld(t0, t0, 0)
@@ -888,20 +1028,26 @@
%ld(a0, a0, 0)
%add(a0, a0, t0) ; ptr = base + start_off
%sub(a1, t1, t0) ; len = end_off - start_off
- %tail(&parse_dec)
+ %call(&parse_dec) ; -> (a0=value, a1=ok)
+ %beqz(a1, &::int_bad)
+ %eret
+ ::int_bad
+ %die(msg_bad_number)
})
# parse_string() -> tagged bytevector in a0. Cursor sits past the
# opening '"' (consumed by parse_one). Two-pass: pass 1 walks the body,
# counting decoded bytes in a0 and locating the closing '"'; pass 2
-# allocates the bv and decodes into its data buffer. Each escape
-# (\n \t \r \\ \") yields one byte.
+# allocates the bv and decodes into its data buffer. Each named escape
+# (\n \t \r \\ \") yields one byte; an inline-hex escape \xHEX; (1+
+# hex digits, value 0..255, terminated by ';') also yields one byte.
#
-# Frame: 24 bytes
+# Frame: 32 bytes
# +0 start cursor (first content byte)
# +8 end cursor (closing '"' position)
# +16 bv wrapper (saved across the data fill loop)
-%fn(parse_string, 24, {
+# +24 spill slot (write ptr saved across parse_hex in \x escape)
+%fn(parse_string, 32, {
%la(t0, &readbuf_pos)
%ld(t1, t0, 0)
%st(t1, sp, 0)
@@ -922,14 +1068,35 @@
%b(&::scan)
::scan_esc
- # Backslash plus the next byte yield one decoded byte (validated in
- # pass 2). Need at least one more byte after the backslash.
+ # Backslash plus the next byte yield one decoded byte. \xHEX; runs
+ # until the terminating ';' (validated in pass 2); every other escape
+ # is exactly two source bytes.
%addi(t1, t1, 1)
%beq(t1, t2, &::eof)
+ %readbuf_byte(a3, t1)
+ %addi(a1, a3, -120) ; 'x'
+ %beqz(a1, &::scan_hex)
%addi(t1, t1, 1)
%addi(a0, a0, 1)
%b(&::scan)
+ ::scan_hex
+ # Skip past 'x' and scan to the terminating ';'. EOF before ';'
+ # falls into the unterminated-string path below, matching how an
+ # unterminated body is reported.
+ %addi(t1, t1, 1)
+ ::scan_hex_loop
+ %beq(t1, t2, &::eof)
+ %readbuf_byte(a3, t1)
+ %addi(a1, a3, -59) ; ';'
+ %beqz(a1, &::scan_hex_done)
+ %addi(t1, t1, 1)
+ %b(&::scan_hex_loop)
+ ::scan_hex_done
+ %addi(t1, t1, 1) ; consume ';'
+ %addi(a0, a0, 1) ; +1 output byte
+ %b(&::scan)
+
::scan_done
%st(t1, sp, 8)
%call(&bv_alloc)
@@ -963,6 +1130,8 @@
%beqz(a2, &::write_byte)
%addi(a2, a1, -34) ; '"'
%beqz(a2, &::write_byte)
+ %addi(a2, a1, -120) ; 'x'
+ %beqz(a2, &::esc_hex)
%die(msg_bad_escape)
::esc_n
@@ -979,6 +1148,38 @@
%addi(t1, t1, 1)
%b(&::fill)
+ ::esc_hex
+ # Skip past 'x'. parse_hex consumes hex digits; demand at least one,
+ # value <= 255, and an immediate ';' terminator. parse_hex clobbers
+ # t0/t1/t2 and a2/a3, so spill the cursor (t1) and write ptr (a3)
+ # across the call. sp+0 is free once pass 1 finishes.
+ %addi(t1, t1, 1) ; t1 -> first hex digit
+ %st(t1, sp, 0)
+ %st(a3, sp, 24)
+ %la(t0, &readbuf_buf_ptr)
+ %ld(t0, t0, 0)
+ %add(a0, t0, t1) ; ptr to first hex digit
+ %sub(a1, t2, t1) ; max len (bytes left in body)
+ %call(&parse_hex) ; -> (a0=value, a1=consumed)
+ %beqz(a1, &::hex_bad)
+ %li(t0, 255)
+ %bltu(t0, a0, &::hex_bad)
+ %ld(t1, sp, 0)
+ %add(t1, t1, a1) ; t1 = position of expected ';'
+ %ld(t2, sp, 8)
+ %beq(t1, t2, &::hex_bad)
+ %readbuf_byte(t0, t1)
+ %addi(t0, t0, -59) ; ';'
+ %bnez(t0, &::hex_bad)
+ %addi(t1, t1, 1) ; consume ';'
+ %ld(a3, sp, 24)
+ %sb(a0, a3, 0)
+ %addi(a3, a3, 1)
+ %b(&::fill)
+
+ ::hex_bad
+ %die(msg_bad_escape)
+
::fill_done
%addi(t1, t1, 1) ; consume closing '"'
%la(t0, &readbuf_pos)
@@ -1131,12 +1332,12 @@
})
# parse_dec(data_ptr=a0, len=a1) -> (tagged fixnum=a0, ok=a1). Leaf.
-# Accepts an optional leading '-' followed by one or more decimal digits.
-# ok=1 on a fully-consumed valid input; ok=0 on empty input, lone "-",
-# or any non-digit byte. On failure a0 still holds the best-effort
-# accumulator (sign-applied, retagged) so reader-style callers that
-# don't check ok get the same garbage they got before this routine
-# validated. UB on integer overflow. Uses only t0..t2.
+# Accepts an optional leading '+' or '-' followed by one or more decimal
+# digits. ok=1 on a fully-consumed valid input; ok=0 on empty input, a
+# lone sign, or any non-digit byte. On failure a0 still holds the
+# best-effort accumulator (sign-applied, retagged) so reader-style
+# callers that don't check ok get the same garbage they got before this
+# routine validated. UB on integer overflow. Uses only t0..t2.
#
# Register map:
# t0 = current ptr a0 = byte / digit / final value
@@ -1152,9 +1353,14 @@
%beqz(t1, &::fail)
%lb(a0, t0, 0)
- %addi(a0, a0, -45) ; '-'
- %bnez(a0, &::loop_init)
+ %addi(t2, a0, -45) ; '-'
+ %beqz(t2, &::neg)
+ %addi(t2, a0, -43) ; '+'
+ %bnez(t2, &::loop_init)
+ %b(&::skip_sign)
+ ::neg
%li(a2, 1)
+ ::skip_sign
%addi(t0, t0, 1)
%addi(t1, t1, -1)
%beqz(t1, &::fail)
@@ -2444,6 +2650,19 @@
%ret
%endscope
+# (symbol? x) -- #t iff x is TAG.SYM (interned symbol index, not a heap obj).
+:prim_symbolq_entry
+%scope prim_symbolq
+ %car(t0, a0)
+ %tagof(t1, t0)
+ %li(t2, %TAG.SYM)
+ %li(a0, %imm_val(%IMM.FALSE))
+ %bne(t1, t2, &::end)
+ %li(a0, %imm_val(%IMM.TRUE))
+ ::end
+ %ret
+%endscope
+
# (procedure? x) -- #t iff x is HEAP-tagged with header HDR.CLOSURE or HDR.PRIM.
:prim_procedureq_entry
%scope prim_procedureq
@@ -4359,6 +4578,7 @@
:name_bv_append "bytevector-append"
:name_booleanq "boolean?"
:name_integerq "integer?"
+:name_symbolq "symbol?"
:name_procedureq "procedure?"
:name_zeroq "zero?"
:name_not "not"
@@ -4438,6 +4658,7 @@
&name_bv_append %(0) $(17) &prim_bv_append_entry %(0)
&name_booleanq %(0) $(8) &prim_booleanq_entry %(0)
&name_integerq %(0) $(8) &prim_integerq_entry %(0)
+&name_symbolq %(0) $(7) &prim_symbolq_entry %(0)
&name_procedureq %(0) $(10) &prim_procedureq_entry %(0)
&name_zeroq %(0) $(5) &prim_zeroq_entry %(0)
&name_not %(0) $(3) &prim_not_entry %(0)
diff --git a/tests/scheme1/78-symbolq.scm b/tests/scheme1/78-symbolq.scm
@@ -0,0 +1,15 @@
+; symbol? — #t iff x is a HEAP-tagged HDR.SYM (an interned symbol).
+
+(if (symbol? 'foo) 0 (sys-exit 1))
+(if (symbol? (quote bar)) 0 (sys-exit 2))
+(if (symbol? (string->symbol "baz")) 0 (sys-exit 3))
+
+(if (not (symbol? "foo")) 0 (sys-exit 4))
+(if (not (symbol? 0)) 0 (sys-exit 5))
+(if (not (symbol? '())) 0 (sys-exit 6))
+(if (not (symbol? '(a b))) 0 (sys-exit 7))
+(if (not (symbol? #t)) 0 (sys-exit 8))
+(if (not (symbol? car)) 0 (sys-exit 9))
+(if (not (symbol? (make-bytevector 1 0))) 0 (sys-exit 10))
+
+(sys-exit 0)
diff --git a/tests/scheme1/79-bv-num-aliases.scm b/tests/scheme1/79-bv-num-aliases.scm
@@ -0,0 +1,19 @@
+; bytevector? and number? — prelude aliases. bytevector? mirrors string?
+; (one HDR.BV repr); number? mirrors integer? (only fixnums today).
+
+; bytevector?
+(if (bytevector? "abc") 0 (sys-exit 1))
+(if (bytevector? (make-bytevector 3 0)) 0 (sys-exit 2))
+(if (not (bytevector? 0)) 0 (sys-exit 3))
+(if (not (bytevector? '())) 0 (sys-exit 4))
+(if (not (bytevector? 'foo)) 0 (sys-exit 5))
+
+; number?
+(if (number? 0) 0 (sys-exit 6))
+(if (number? -7) 0 (sys-exit 7))
+(if (number? 42) 0 (sys-exit 8))
+(if (not (number? "0")) 0 (sys-exit 9))
+(if (not (number? '())) 0 (sys-exit 10))
+(if (not (number? 'sym)) 0 (sys-exit 11))
+
+(sys-exit 0)
diff --git a/tests/scheme1/80-u8-literal.scm b/tests/scheme1/80-u8-literal.scm
@@ -0,0 +1,30 @@
+; #u8(...) — bytevector literal. Each element is a fixnum byte (0..255);
+; result is a HDR.BV with those exact bytes.
+
+; Empty literal.
+(define e #u8())
+(if (bytevector=? e (make-bytevector 0 0)) 0 (sys-exit 1))
+
+; Length and per-byte access.
+(define a #u8(1 2 3))
+(if (= (bytevector-length a) 3) 0 (sys-exit 2))
+(if (= (bytevector-u8-ref a 0) 1) 0 (sys-exit 3))
+(if (= (bytevector-u8-ref a 1) 2) 0 (sys-exit 4))
+(if (= (bytevector-u8-ref a 2) 3) 0 (sys-exit 5))
+
+; Equality with a same-bytes string literal (both HDR.BV).
+(if (bytevector=? #u8(65 66 67) "ABC") 0 (sys-exit 6))
+
+; Hex element values are accepted (parser shares atom dispatch).
+(define b #u8(#x00 #xff))
+(if (= (bytevector-length b) 2) 0 (sys-exit 7))
+(if (= (bytevector-u8-ref b 0) 0) 0 (sys-exit 8))
+(if (= (bytevector-u8-ref b 1) 255) 0 (sys-exit 8))
+
+; Whitespace and comments inside the literal.
+(define c #u8( 10 20 ; trailing comment
+ 30 ))
+(if (= (bytevector-length c) 3) 0 (sys-exit 9))
+(if (= (bytevector-u8-ref c 2) 30) 0 (sys-exit 10))
+
+(sys-exit 0)