commit 772b42d44f685524177de9acef5b2318064ec4fa
parent 89426553d1861d00e272ec9706e6b064eb4547e6
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 28 Apr 2026 17:51:13 -0700
cc: quoted-hex data emission and signed→unsigned cast canonicalisation
Two correctness fixes uncovered by the tcc-boot2 path through cc.scm:
1. cg-intern-string and %cg-init-piece->bv now render every byte of
string literals and global initializer bytevectors as `'XXXX...'`
M0 quoted-hex chunks (≤128 hex chars / 64 bytes per line, matching
M0's per-line quoted-literal buffer). The previous `"..."` form
blew up on tcc.c's `"'%c' expected (got \"%s\")"`-style messages —
m1pp's quoted-text lex has no escape mechanism, so embedded `"`,
`\`, control chars and high-bit bytes broke the token stream.
Per-byte `!(N)` was the other option but is ~5× larger output.
2. cg-cast retag fast-path (same-size or widening, no codegen) was
incorrectly used for signed→unsigned conversions, leaving the
sign-extended high bits of the slot to leak into later 64-bit
compares and wider casts. Same problem in the reverse direction
at narrow widths (u8→i8 must sign-extend the slot to canonical
form). The retag branch now requires `not (from-signed AND
to-unsigned)` and `not (same-size AND from-unsigned AND
to-signed)`; mismatches fall through to the existing narrowing
path which canonicalises per to-kind.
Tests:
127-string-escapes.c — `\"`, `\\`, control bytes and high-bit
bytes round-trip through the data section.
128-cast-signedness.c — i8 → u8/u16/u32/u64, u8 → i8 round-trip,
i32 → u32 of negative value, intermediate
unsigned-char locals.
Diffstat:
5 files changed, 251 insertions(+), 20 deletions(-)
diff --git a/cc/cc.scm b/cc/cc.scm
@@ -3358,13 +3358,28 @@
(or (eq? (ctype-kind from-ty) 'ptr)
(eq? (ctype-kind from-ty) 'arr))))
(cg-push cg (%opnd (opnd-kind p) to-type (opnd-ext p) (opnd-lval? p))))
- ((>= to-sz from-sz)
+ ;; Same-size or widening cast — retag only when the canonical
+ ;; 64-bit slot form for FROM-TY is also canonical for TO-TYPE.
+ ;; That holds unless we're crossing from a signed type into an
+ ;; unsigned one of the same or wider width: the source's
+ ;; sign-extended high bits would leak past the unsigned width
+ ;; and corrupt later 64-bit operands (compares, wider casts).
+ ;; Same applies to same-size unsigned→signed, e.g. u8→i8 (the
+ ;; narrowing branch's sign-extend turns 0xCA back into the
+ ;; canonical i8 slot 0xFF…FFCA).
+ ((and (>= to-sz from-sz)
+ (not (and (not (%ctype-unsigned? from-ty))
+ (%ctype-unsigned? to-type)))
+ (not (and (= to-sz from-sz)
+ (%ctype-unsigned? from-ty)
+ (not (%ctype-unsigned? to-type)))))
(cg-push cg (%opnd (opnd-kind p) to-type (opnd-ext p) (opnd-lval? p))))
(else
- ;; Narrowing cast. Signed targets (i8/i16/i32) shli/sari to
- ;; truncate-and-sign-extend in one step, so the slot holds the
- ;; canonical 64-bit form and a subsequent widening cast (which
- ;; is relabel-only) restores the value. Unsigned targets mask
+ ;; Narrowing cast OR same/widening with signedness flip.
+ ;; Signed targets (i8/i16/i32) shli/sari to truncate-and-
+ ;; sign-extend in one step, so the slot holds the canonical
+ ;; 64-bit form and a subsequent widening cast (which is
+ ;; relabel-only) restores the value. Unsigned targets mask
;; off high bits to zero-extend.
(%cg-load-opnd-into cg p 't0)
(cond
@@ -3911,24 +3926,16 @@
;; (piece ...) — initialized in .data; pieces concatenated.
;;
;; Each piece is either:
-;; <bytevector> — raw bytes; emitted as N×!(byte) entries.
+;; <bytevector> — raw bytes; emitted as `'XXXX...'` M0
+;; quoted-hex chunks (64 bytes / 128 hex
+;; chars per line).
;; (label-ref . <label-bv>) — 8-byte pointer slot containing &label;
;; emitted as `&<label> %(0)` (4B label ref +
;; 4B zero pad).
(define (%cg-init-piece->bv piece)
(cond
((bytevector? piece)
- (let ((n (bytevector-length piece)))
- (let loop ((i 0) (acc '()))
- (cond
- ((= i n) (bv-cat (reverse acc)))
- (else
- (loop (+ i 1)
- (cons (bv-cat (list "!("
- (number->string
- (bytevector-u8-ref piece i) 10)
- ")\n"))
- acc)))))))
+ (bv-cat (%cg-bv->hex-lines piece #f)))
((and (pair? piece) (eq? (car piece) 'label-ref))
(bv-cat (list "&" (cdr piece) " %(0)\n")))
(else (die #f "cg-emit-global: bad init piece" piece))))
@@ -3999,11 +4006,62 @@
(cg-str-pool-set! cg
(alist-set bv-content lbl (cg-str-pool cg)))
(buf-push! (cg-data cg)
- (bv-cat (list "\n:" lbl "\n"
- "\"" bv-content "\"\n"
- "!(0)\n")))
+ (bv-cat (append (list "\n:" lbl "\n")
+ (%cg-bv->hex-lines bv-content #t))))
lbl)))))
+;; Render BV's bytes as `'XXXXXX'` quoted-hex M0 literals — uniform
+;; format for every byte, regardless of whether it would otherwise be
+;; printable. Avoids the `"..."` lex path entirely (m1pp's quoted-text
+;; lex has no escape mechanism, so embedded `"`, `\`, control chars,
+;; and high-bit bytes can't ride raw between the quotes), and avoids
+;; per-byte `!(N)` lines (5+× larger output). Lines are chunked to
+;; ≤128 hex chars (= 64 bytes) — M0's per-line quoted-literal buffer
+;; is 256 bytes on amd64 and overflows otherwise.
+;;
+;; If TRAILING-NUL? is #t, an extra 0x00 byte is appended to terminate
+;; a C string. Returns a list of bytevectors ready for bv-cat.
+(define %CG-HEX-CHUNK-BYTES 64)
+
+(define (%cg-bv->hex-lines bv trailing-nul?)
+ (let* ((len (bytevector-length bv))
+ (total (cond (trailing-nul? (+ len 1)) (else len))))
+ (cond
+ ((= total 0) '())
+ (else
+ (let loop ((i 0) (acc '()))
+ (cond
+ ((>= i total) (reverse acc))
+ (else
+ (let ((end (cond ((< (+ i %CG-HEX-CHUNK-BYTES) total)
+ (+ i %CG-HEX-CHUNK-BYTES))
+ (else total))))
+ (loop end (cons (%cg-hex-line bv i end len) acc))))))))))
+
+;; One `'XXXX...XX'\n` line covering BV bytes [START, END). Indices
+;; >= LEN render as 0x00 (used for the trailing NUL terminator).
+(define (%cg-hex-line bv start end len)
+ (let* ((nbytes (- end start))
+ (out (make-bytevector (+ 1 (* 2 nbytes) 1 1))))
+ (bytevector-u8-set! out 0 (char->integer #\'))
+ (let loop ((j start) (k 1))
+ (cond
+ ((= j end)
+ (bytevector-u8-set! out k (char->integer #\'))
+ (bytevector-u8-set! out (+ k 1) (char->integer #\newline))
+ out)
+ (else
+ (let ((b (cond ((< j len) (bytevector-u8-ref bv j))
+ (else 0))))
+ (bytevector-u8-set! out k (%cg-hex-digit
+ (arithmetic-shift b -4)))
+ (bytevector-u8-set! out (+ k 1) (%cg-hex-digit (bit-and b 15)))
+ (loop (+ j 1) (+ k 2))))))))
+
+(define (%cg-hex-digit n)
+ (cond ((< n 10) (+ n (char->integer #\0)))
+ (else (+ (- n 10) (char->integer #\A)))))
+
;; --------------------------------------------------------------------
;; Frame
;; --------------------------------------------------------------------
diff --git a/tests/cc/127-string-escapes.c b/tests/cc/127-string-escapes.c
@@ -0,0 +1,112 @@
+/* String literals containing characters that need escaping when emitted
+ * into P1pp. m1pp's lex treats `"` as a string delimiter with no escape
+ * mechanism, so the raw byte cannot ride inside `"..."` in cg-intern-
+ * string's data emission. Same problem for `\\` (backslash) — also for
+ * embedded control bytes and high-bit bytes which would confuse M0's
+ * line-oriented tokenizer downstream.
+ *
+ * The fix is to always emit as single-quoted hex bytes.
+ *
+ * tcc.c hits this in messages like:
+ * "'%c' expected (got \"%s\")"
+ * which in C-source-level bytes is `'%c' expected (got "%s")` (i.e. with
+ * literal `"` characters around the %s) — the exact bytes that broke
+ * the m1pp pipeline before the fix.
+ */
+
+int strlen_(const char *s) {
+ int n = 0;
+ while (s[n] != 0) n = n + 1;
+ return n;
+}
+
+int memeq_(const char *a, const char *b, int n) {
+ int i = 0;
+ while (i < n) {
+ if (a[i] != b[i]) return 0;
+ i = i + 1;
+ }
+ return 1;
+}
+
+int test_dquote(void) {
+ /* Embedded " — the tcc.c case. */
+ const char *s = "'%c' expected (got \"%s\")";
+ /* 24 bytes: '%c' expected (got "%s") */
+ if (strlen_(s) != 24) return 1;
+ if (s[0] != '\'') return 2;
+ if (s[1] != '%') return 3;
+ if (s[2] != 'c') return 4;
+ if (s[3] != '\'') return 5;
+ if (s[19] != '"') return 6;
+ if (s[20] != '%') return 7;
+ if (s[21] != 's') return 8;
+ if (s[22] != '"') return 9;
+ if (s[23] != ')') return 10;
+ if (s[24] != 0) return 11;
+ return 0;
+}
+
+int test_backslash(void) {
+ /* Embedded \\ — also unsafe to ride raw between "..." in P1pp. */
+ const char *s = "a\\b\\c";
+ if (strlen_(s) != 5) return 1;
+ if (s[0] != 'a') return 2;
+ if (s[1] != '\\') return 3;
+ if (s[2] != 'b') return 4;
+ if (s[3] != '\\') return 5;
+ if (s[4] != 'c') return 6;
+ if (s[5] != 0) return 7;
+ return 0;
+}
+
+int test_controls(void) {
+ /* Embedded control bytes — \n / \t / \r would break m1pp's
+ * line-oriented tokenizer if emitted raw; they go out as quoted hex. */
+ const char *s = "x\ny\tz\r";
+ if (strlen_(s) != 6) return 1;
+ if (s[0] != 'x') return 2;
+ if (s[1] != '\n') return 3;
+ if (s[2] != 'y') return 4;
+ if (s[3] != '\t') return 5;
+ if (s[4] != 'z') return 6;
+ if (s[5] != '\r') return 7;
+ if (s[6] != 0) return 8;
+ return 0;
+}
+
+int test_highbit(void) {
+ /* Embedded byte >= 0x80. m1pp's `"..."` token lex treats the line as
+ * text and is fragile with non-ASCII / >0x7F bytes; emit as quoted hex.
+ * Octal escapes avoid the `\xCAb` C ambiguity (b is a hex digit). */
+ const char *s = "a\312b\377c";
+ if (strlen_(s) != 5) return 1;
+ if ((unsigned char)s[0] != 'a') return 2;
+ if ((unsigned char)s[1] != 0xCA) return 3;
+ if ((unsigned char)s[2] != 'b') return 4;
+ if ((unsigned char)s[3] != 0xFF) return 5;
+ if ((unsigned char)s[4] != 'c') return 6;
+ if ((unsigned char)s[5] != 0) return 7;
+ return 0;
+}
+
+int test_combined(void) {
+ /* All categories at once, in the sort of pattern tcc.c uses. */
+ const char *s = "got \"\312\\n\"";
+ /* g o t SP " 0xCA \ n " : 9 bytes (the `\\n` here is backslash + n,
+ * not a newline — `\\` decodes to one `\`, then a literal `n`). */
+ if (strlen_(s) != 9) return 1;
+ const char want[] = {'g', 'o', 't', ' ', '"', (char)0xCA, '\\', 'n', '"', 0};
+ if (!memeq_(s, want, 10)) return 2;
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int r;
+ if ((r = test_dquote())) return 10 + r;
+ if ((r = test_backslash())) return 20 + r;
+ if ((r = test_controls())) return 30 + r;
+ if ((r = test_highbit())) return 40 + r;
+ if ((r = test_combined())) return 50 + r;
+ return 0;
+}
diff --git a/tests/cc/127-string-escapes.expected-exit b/tests/cc/127-string-escapes.expected-exit
@@ -0,0 +1 @@
+0
diff --git a/tests/cc/128-cast-signedness.c b/tests/cc/128-cast-signedness.c
@@ -0,0 +1,59 @@
+/* Same-size and widening casts that flip signedness must canonicalize
+ * the slot for the destination type — i8 (sign-extended in the slot)
+ * cast to u8 / u16 / u32 / u64 needs the high bits cleared so a
+ * subsequent comparison or wider arithmetic sees the C-semantic value.
+ *
+ * The classic case (which broke the tcc-boot2 string-roundtrip test):
+ * const char *s = "...\xCA...";
+ * if ((unsigned char)s[1] != 0xCA) ... // this MUST take the false branch
+ *
+ * On aarch64 (plain `char` signed, as this test assumes), s[1] sign-extends to int -54. Slot = 0xFFFFFFFFFFFFFFCA.
+ * The naive "same-size cast = relabel" path leaves the high bits set,
+ * so a 64-bit compare against 0x00000000000000CA returns "not equal".
+ */
+
+int main(int argc, char **argv) {
+ const char *s = "a\312\377"; /* bytes 'a', 0xCA, 0xFF, NUL */
+
+ /* --- signed → unsigned, same size (i8 → u8) ------------------- */
+ if ((unsigned char) s[1] != 202) return 1;
+ if ((unsigned char) s[1] != 0xCA) return 2;
+ /* Must NOT equal -54 once cast to u8. */
+ if ((unsigned char) s[1] == -54) return 3;
+
+ /* --- signed → unsigned, widening (i8 → u16/u32/u64) ----------- */
+ /* C: (unsigned int)(signed char)(-54) = 4294967242 = 0xFFFFFFCA. */
+ if ((unsigned int) s[1] != 0xFFFFFFCAu) return 4;
+ /* The cast must turn the i8 slot …FFCA into 0x00000000FFFFFFCA, or the
+ * 64-bit compare above sees the leaked high bits and mismatches. */
+
+ /* (unsigned long)(signed char)(-54) = 0xFFFFFFFFFFFFFFCA. */
+ if ((unsigned long) s[1] != 0xFFFFFFFFFFFFFFCAul) return 5;
+
+ /* --- unsigned → signed, same size (u8 → i8) ------------------- */
+ {
+ unsigned char uc = 0xCA;
+ signed char sc = (signed char) uc;
+ if (sc != -54) return 10;
+ /* And the reverse round-trip: */
+ if ((unsigned char) sc != 202) return 11;
+ }
+
+ /* --- intermediate-cast doesn't lose the value ----------------- */
+ {
+ unsigned char tmp = (unsigned char) s[1];
+ if (tmp != 202) return 20;
+ if ((int) tmp != 202) return 21;
+ }
+
+ /* --- i32 → u32 same-size: the high bits matter for 64-bit ops. */
+ {
+ int neg = -1; /* slot = 0xFFFFFFFFFFFFFFFF */
+ unsigned int u = (unsigned int) neg; /* slot must zero high 32 */
+ /* cast result canonical: 0x00000000FFFFFFFF = 4294967295 */
+ if (u != 4294967295u) return 30;
+ if ((unsigned long) u != 4294967295ul) return 31;
+ }
+
+ return 0;
+}
diff --git a/tests/cc/128-cast-signedness.expected-exit b/tests/cc/128-cast-signedness.expected-exit
@@ -0,0 +1 @@
+0