boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 772b42d44f685524177de9acef5b2318064ec4fa
parent 89426553d1861d00e272ec9706e6b064eb4547e6
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 28 Apr 2026 17:51:13 -0700

cc: quoted-hex data emission and signed→unsigned cast canonicalisation

Two correctness fixes uncovered by the tcc-boot2 path through cc.scm:

1. cg-intern-string and %cg-init-piece->bv now render every byte of
   string literals and global initializer bytevectors as `'XXXX...'`
   M0 quoted-hex chunks (≤128 hex chars / 64 bytes per line, matching
   M0's per-line quoted-literal buffer). The previous `"..."` form
   blew up on tcc.c's `"'%c' expected (got \"%s\")"`-style messages —
   m1pp's quoted-text lex has no escape mechanism, so embedded `"`,
   `\`, control chars and high-bit bytes broke the token stream.
   Per-byte `!(N)` was the other option, but it produces ~5× larger output.

2. cg-cast retag fast-path (same-size or widening, no codegen) was
   incorrectly used for signed→unsigned conversions, leaving the
   sign-extended high bits of the slot to leak into later 64-bit
   compares and wider casts. Same problem in the reverse direction
   at narrow widths (u8→i8 must sign-extend the slot to canonical
   form). The retag branch now requires `not (from-signed AND
   to-unsigned)` and `not (same-size AND from-unsigned AND
   to-signed)`; mismatches fall through to the existing narrowing
   path which canonicalises per to-kind.

Tests:
  127-string-escapes.c — `\"`, `\\`, control bytes and high-bit
                         bytes round-trip through the data section.
  128-cast-signedness.c — i8 → u8/u16/u32/u64, u8 → i8 round-trip,
                          i32 → u32 of negative value, intermediate
                          unsigned-char locals.

Diffstat:
M cc/cc.scm | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
A tests/cc/127-string-escapes.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tests/cc/127-string-escapes.expected-exit | 1 +
A tests/cc/128-cast-signedness.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tests/cc/128-cast-signedness.expected-exit | 1 +
5 files changed, 251 insertions(+), 20 deletions(-)

diff --git a/cc/cc.scm b/cc/cc.scm @@ -3358,13 +3358,28 @@ (or (eq? (ctype-kind from-ty) 'ptr) (eq? (ctype-kind from-ty) 'arr)))) (cg-push cg (%opnd (opnd-kind p) to-type (opnd-ext p) (opnd-lval? p)))) - ((>= to-sz from-sz) + ;; Same-size or widening cast — retag only when the canonical + ;; 64-bit slot form for FROM-TY is also canonical for TO-TYPE. + ;; That holds unless we're crossing from a signed type into an + ;; unsigned one of the same or wider width: the source's + ;; sign-extended high bits would leak past the unsigned width + ;; and corrupt later 64-bit operands (compares, wider casts). + ;; Same applies to same-size unsigned→signed at narrow widths + ;; (the narrow branch's sign-extend turns 0xCA back into the + ;; canonical i8 slot 0xFF…FFCA). + ((and (>= to-sz from-sz) + (not (and (not (%ctype-unsigned? from-ty)) + (%ctype-unsigned? to-type))) + (not (and (= to-sz from-sz) + (%ctype-unsigned? from-ty) + (not (%ctype-unsigned? to-type))))) (cg-push cg (%opnd (opnd-kind p) to-type (opnd-ext p) (opnd-lval? p)))) (else - ;; Narrowing cast. Signed targets (i8/i16/i32) shli/sari to - ;; truncate-and-sign-extend in one step, so the slot holds the - ;; canonical 64-bit form and a subsequent widening cast (which - ;; is relabel-only) restores the value. Unsigned targets mask + ;; Narrowing cast OR same/widening with signedness flip. + ;; Signed targets (i8/i16/i32) shli/sari to truncate-and- + ;; sign-extend in one step, so the slot holds the canonical + ;; 64-bit form and a subsequent widening cast (which is + ;; relabel-only) restores the value. Unsigned targets mask ;; off high bits to zero-extend. (%cg-load-opnd-into cg p 't0) (cond @@ -3911,24 +3926,16 @@ ;; (piece ...) — initialized in .data; pieces concatenated. ;; ;; Each piece is either: -;; <bytevector> — raw bytes; emitted as N×!(byte) entries. +;; <bytevector> — raw bytes; emitted as `'XXXX...'` M0 +;; quoted-hex chunks (64 bytes / 128 hex +;; chars per line). ;; (label-ref . 
<label-bv>) — 8-byte pointer slot containing &label; ;; emitted as `&<label> %(0)` (4B label ref + ;; 4B zero pad). (define (%cg-init-piece->bv piece) (cond ((bytevector? piece) - (let ((n (bytevector-length piece))) - (let loop ((i 0) (acc '())) - (cond - ((= i n) (bv-cat (reverse acc))) - (else - (loop (+ i 1) - (cons (bv-cat (list "!(" - (number->string - (bytevector-u8-ref piece i) 10) - ")\n")) - acc))))))) + (bv-cat (%cg-bv->hex-lines piece #f))) ((and (pair? piece) (eq? (car piece) 'label-ref)) (bv-cat (list "&" (cdr piece) " %(0)\n"))) (else (die #f "cg-emit-global: bad init piece" piece)))) @@ -3999,11 +4006,62 @@ (cg-str-pool-set! cg (alist-set bv-content lbl (cg-str-pool cg))) (buf-push! (cg-data cg) - (bv-cat (list "\n:" lbl "\n" - "\"" bv-content "\"\n" - "!(0)\n"))) + (bv-cat (append (list "\n:" lbl "\n") + (%cg-bv->hex-lines bv-content #t)))) lbl))))) +;; Render BV's bytes as `'XXXXXX'` quoted-hex M0 literals — uniform +;; format for every byte, regardless of whether it would otherwise be +;; printable. Avoids the `"..."` lex path entirely (m1pp's quoted-text +;; lex has no escape mechanism, so embedded `"`, `\`, control chars, +;; and high-bit bytes can't ride raw between the quotes), and avoids +;; per-byte `!(N)` lines (5+× larger output). Lines are chunked to +;; ≤128 hex chars (= 64 bytes) — M0's per-line quoted-literal buffer +;; is 256 bytes on amd64 and overflows otherwise. +;; +;; If TRAILING-NUL? is #t, an extra 0x00 byte is appended to terminate +;; a C string. Returns a list of bytevectors ready for bv-cat. +(define %CG-HEX-CHUNK-BYTES 64) + +(define (%cg-bv->hex-lines bv trailing-nul?) + (let* ((len (bytevector-length bv)) + (total (cond (trailing-nul? 
(+ len 1)) (else len)))) + (cond + ((= total 0) '()) + (else + (let loop ((i 0) (acc '())) + (cond + ((>= i total) (reverse acc)) + (else + (let ((end (cond ((< (+ i %CG-HEX-CHUNK-BYTES) total) + (+ i %CG-HEX-CHUNK-BYTES)) + (else total)))) + (loop end (cons (%cg-hex-line bv i end len) acc)))))))))) + +;; One `'XXXX...XX'\n` line covering BV bytes [START, END). Indices +;; >= LEN render as 0x00 (used for the trailing NUL terminator). +(define (%cg-hex-line bv start end len) + (let* ((nbytes (- end start)) + (out (make-bytevector (+ 1 (* 2 nbytes) 1 1)))) + (bytevector-u8-set! out 0 (char->integer #\')) + (let loop ((j start) (k 1)) + (cond + ((= j end) + (bytevector-u8-set! out k (char->integer #\')) + (bytevector-u8-set! out (+ k 1) (char->integer #\newline)) + out) + (else + (let ((b (cond ((< j len) (bytevector-u8-ref bv j)) + (else 0)))) + (bytevector-u8-set! out k (%cg-hex-digit + (arithmetic-shift b -4))) + (bytevector-u8-set! out (+ k 1) (%cg-hex-digit (bit-and b 15))) + (loop (+ j 1) (+ k 2)))))))) + +(define (%cg-hex-digit n) + (cond ((< n 10) (+ n (char->integer #\0))) + (else (+ (- n 10) (char->integer #\A))))) + ;; -------------------------------------------------------------------- ;; Frame ;; -------------------------------------------------------------------- diff --git a/tests/cc/127-string-escapes.c b/tests/cc/127-string-escapes.c @@ -0,0 +1,112 @@ +/* String literals containing characters that need escaping when emitted + * into P1pp. m1pp's lex treats `"` as a string delimiter with no escape + * mechanism, so the raw byte cannot ride inside `"..."` in cg-intern- + * string's data emission. Same problem for `\\` (backslash) — also for + * embedded control bytes and high-bit bytes which would confuse M0's + * line-oriented tokenizer downstream. + * + * The fix is to always emit as single-quoted hex bytes. + * + * tcc.c hits this in messages like: + * "'%c' expected (got \"%s\")" + * which in C-source-level bytes is `'%c' expected (got "%s")` (i.e. 
with + * literal `"` characters around the %s) — the exact bytes that broke + * the m1pp pipeline before the fix. + */ + +int strlen_(const char *s) { + int n = 0; + while (s[n] != 0) n = n + 1; + return n; +} + +int memeq_(const char *a, const char *b, int n) { + int i = 0; + while (i < n) { + if (a[i] != b[i]) return 0; + i = i + 1; + } + return 1; +} + +int test_dquote(void) { + /* Embedded " — the tcc.c case. */ + const char *s = "'%c' expected (got \"%s\")"; + /* 24 bytes: '%c' expected (got "%s") */ + if (strlen_(s) != 24) return 1; + if (s[0] != '\'') return 2; + if (s[1] != '%') return 3; + if (s[2] != 'c') return 4; + if (s[3] != '\'') return 5; + if (s[19] != '"') return 6; + if (s[20] != '%') return 7; + if (s[21] != 's') return 8; + if (s[22] != '"') return 9; + if (s[23] != ')') return 10; + if (s[24] != 0) return 11; + return 0; +} + +int test_backslash(void) { + /* Embedded \\ — also unsafe to ride raw between "..." in P1pp. */ + const char *s = "a\\b\\c"; + if (strlen_(s) != 5) return 1; + if (s[0] != 'a') return 2; + if (s[1] != '\\') return 3; + if (s[2] != 'b') return 4; + if (s[3] != '\\') return 5; + if (s[4] != 'c') return 6; + if (s[5] != 0) return 7; + return 0; +} + +int test_controls(void) { + /* Embedded control bytes — \n / \t / \r round-trip through m1pp's + * line-oriented tokenizer if not emitted as !(N). */ + const char *s = "x\ny\tz\r"; + if (strlen_(s) != 6) return 1; + if (s[0] != 'x') return 2; + if (s[1] != '\n') return 3; + if (s[2] != 'y') return 4; + if (s[3] != '\t') return 5; + if (s[4] != 'z') return 6; + if (s[5] != '\r') return 7; + if (s[6] != 0) return 8; + return 0; +} + +int test_highbit(void) { + /* Embedded byte >= 0x80. m1pp's `"..."` token lex treats the line as + * text and is fragile with non-ASCII / >0x7F bytes; encode as !(N). + * Octal escapes avoid the `\xCAb` C ambiguity (b is a hex digit). 
*/ + const char *s = "a\312b\377c"; + if (strlen_(s) != 5) return 1; + if ((unsigned char)s[0] != 'a') return 2; + if ((unsigned char)s[1] != 0xCA) return 3; + if ((unsigned char)s[2] != 'b') return 4; + if ((unsigned char)s[3] != 0xFF) return 5; + if ((unsigned char)s[4] != 'c') return 6; + if ((unsigned char)s[5] != 0) return 7; + return 0; +} + +int test_combined(void) { + /* All categories at once, in the sort of pattern tcc.c uses. */ + const char *s = "got \"\312\\n\""; + /* g o t SP " 0xCA \ n " : 9 bytes (the `\\n` here is backslash + n, + * not a newline — `\\` decodes to one `\`, then a literal `n`). */ + if (strlen_(s) != 9) return 1; + const char want[] = {'g', 'o', 't', ' ', '"', (char)0xCA, '\\', 'n', '"', 0}; + if (!memeq_(s, want, 10)) return 2; + return 0; +} + +int main(int argc, char **argv) { + int r; + if ((r = test_dquote())) return 10 + r; + if ((r = test_backslash())) return 20 + r; + if ((r = test_controls())) return 30 + r; + if ((r = test_highbit())) return 40 + r; + if ((r = test_combined())) return 50 + r; + return 0; +} diff --git a/tests/cc/127-string-escapes.expected-exit b/tests/cc/127-string-escapes.expected-exit @@ -0,0 +1 @@ +0 diff --git a/tests/cc/128-cast-signedness.c b/tests/cc/128-cast-signedness.c @@ -0,0 +1,59 @@ +/* Same-size and widening casts that flip signedness must canonicalize + * the slot for the destination type — i8 (sign-extended in the slot) + * cast to u8 / u16 / u32 / u64 needs the high bits cleared so a + * subsequent comparison or wider arithmetic sees the C-semantic value. + * + * The classic case (which broke the tcc-boot2 string-roundtrip test): + * const char *s = "...\xCA..."; + * if ((unsigned char)s[1] != 0xCA) ... // this MUST take the false branch + * + * On aarch64, s[1] sign-extends to int -54. Slot = 0xFFFFFFFFFFFFFFCA. + * The naive "same-size cast = relabel" path leaves the high bits set, + * so a 64-bit compare against 0x00000000000000CA returns "not equal". 
+ */ + +int main(int argc, char **argv) { + const char *s = "a\312\377"; /* bytes 'a', 0xCA, 0xFF, NUL */ + + /* --- signed → unsigned, same size (i8 → u8) ------------------- */ + if ((unsigned char) s[1] != 202) return 1; + if ((unsigned char) s[1] != 0xCA) return 2; + /* Must NOT equal -54 once cast to u8. */ + if ((unsigned char) s[1] == -54) return 3; + + /* --- signed → unsigned, widening (i8 → u16/u32/u64) ----------- */ + /* C: (unsigned int)(signed char)(-54) = 4294967242 = 0xFFFFFFCA. */ + if ((unsigned int) s[1] != 0xFFFFFFCAu) return 4; + /* Comparison at u32 width: -54 (slot=…FFCA) ≠ 4294967242 only + * if the wider compare reads high bits correctly. */ + + /* (unsigned long)(signed char)(-54) = 0xFFFFFFFFFFFFFFCA. */ + if ((unsigned long) s[1] != 0xFFFFFFFFFFFFFFCAul) return 5; + + /* --- unsigned → signed, same size (u8 → i8) ------------------- */ + { + unsigned char uc = 0xCA; + signed char sc = (signed char) uc; + if (sc != -54) return 10; + /* And the reverse round-trip: */ + if ((unsigned char) sc != 202) return 11; + } + + /* --- intermediate-cast doesn't lose the value ----------------- */ + { + unsigned char tmp = (unsigned char) s[1]; + if (tmp != 202) return 20; + if ((int) tmp != 202) return 21; + } + + /* --- i32 → u32 same-size: the high bits matter for 64-bit ops. */ + { + int neg = -1; /* slot = 0xFFFFFFFFFFFFFFFF */ + unsigned int u = (unsigned int) neg; /* slot must zero high 32 */ + /* cast result canonical: 0x00000000FFFFFFFF = 4294967295 */ + if (u != 4294967295u) return 30; + if ((unsigned long) u != 4294967295ul) return 31; + } + + return 0; +} diff --git a/tests/cc/128-cast-signedness.expected-exit b/tests/cc/128-cast-signedness.expected-exit @@ -0,0 +1 @@ +0