127-string-escapes.c (3738B)
/* String literals containing characters that need escaping when emitted
 * into P1pp. m1pp's lexer treats `"` as a string delimiter with no escape
 * mechanism, so the raw byte cannot ride inside `"..."` in
 * cg-intern-string's data emission. The same problem applies to `\\`
 * (backslash), and to embedded control bytes and high-bit bytes, which
 * would confuse M0's line-oriented tokenizer downstream.
 *
 * The fix is to always emit as single-quoted hex bytes.
 *
 * tcc.c hits this in messages like:
 *     "'%c' expected (got \"%s\")"
 * which decodes to the bytes `'%c' expected (got "%s")` (i.e. with
 * literal `"` characters around the %s) — the exact bytes that broke
 * the m1pp pipeline before the fix.
 *
 * Exit status: 0 when every check passes. Otherwise 20 * k + r, where k
 * (1..5) is the position of the failing test in the order main runs them
 * and r is the nonzero code that test returned.
 */

/* Returns the number of bytes in s before its terminating 0 byte. */
int strlen_(const char *s) {
    int n = 0;
    while (s[n] != 0) n = n + 1;
    return n;
}

/* Returns 1 if the first n bytes of a and b are equal, 0 at the first
 * mismatch. With n <= 0 nothing is compared and the result is 1. */
int memeq_(const char *a, const char *b, int n) {
    int i = 0;
    while (i < n) {
        if (a[i] != b[i]) return 0;
        i = i + 1;
    }
    return 1;
}

/* Checks a literal with embedded double quotes.
 * Returns 0 on success, or 1..11 identifying the first failed check. */
int test_dquote(void) {
    /* Embedded " — the tcc.c case. */
    const char *s = "'%c' expected (got \"%s\")";
    /* 24 bytes: '%c' expected (got "%s") */
    if (strlen_(s) != 24) return 1;
    if (s[0] != '\'') return 2;
    if (s[1] != '%') return 3;
    if (s[2] != 'c') return 4;
    if (s[3] != '\'') return 5;
    if (s[19] != '"') return 6;
    if (s[20] != '%') return 7;
    if (s[21] != 's') return 8;
    if (s[22] != '"') return 9;
    if (s[23] != ')') return 10;
    if (s[24] != 0) return 11;
    return 0;
}

/* Checks a literal with embedded backslashes.
 * Returns 0 on success, or 1..7 identifying the first failed check. */
int test_backslash(void) {
    /* Embedded \\ — also unsafe to ride raw between "..." in P1pp. */
    const char *s = "a\\b\\c";
    if (strlen_(s) != 5) return 1;
    if (s[0] != 'a') return 2;
    if (s[1] != '\\') return 3;
    if (s[2] != 'b') return 4;
    if (s[3] != '\\') return 5;
    if (s[4] != 'c') return 6;
    if (s[5] != 0) return 7;
    return 0;
}

/* Checks a literal with embedded newline, tab and carriage return.
 * Returns 0 on success, or 1..8 identifying the first failed check. */
int test_controls(void) {
    /* Embedded control bytes — \n / \t / \r. Per the file header, raw
     * control bytes would confuse the line-oriented tokenizer downstream,
     * so they must be emitted in encoded form.
     * NOTE(review): the original comment here named the encoding `!(N)`,
     * while the file header says "single-quoted hex bytes" — confirm which
     * describes the current emitter. */
    const char *s = "x\ny\tz\r";
    if (strlen_(s) != 6) return 1;
    if (s[0] != 'x') return 2;
    if (s[1] != '\n') return 3;
    if (s[2] != 'y') return 4;
    if (s[3] != '\t') return 5;
    if (s[4] != 'z') return 6;
    if (s[5] != '\r') return 7;
    if (s[6] != 0) return 8;
    return 0;
}

/* Checks a literal with embedded bytes >= 0x80.
 * Returns 0 on success, or 1..7 identifying the first failed check. */
int test_highbit(void) {
    /* Embedded byte >= 0x80. m1pp's `"..."` token lex treats the line as
     * text and is fragile with non-ASCII / >0x7F bytes; encode as !(N)
     * (NOTE(review): the file header calls this "single-quoted hex
     * bytes" — confirm the notation).
     * Octal escapes avoid the `\xCAb` C ambiguity (b is a hex digit).
     * Bytes are compared through unsigned char so the checks do not
     * depend on whether plain char is signed. */
    const char *s = "a\312b\377c";
    if (strlen_(s) != 5) return 1;
    if ((unsigned char)s[0] != 'a') return 2;
    if ((unsigned char)s[1] != 0xCA) return 3;
    if ((unsigned char)s[2] != 'b') return 4;
    if ((unsigned char)s[3] != 0xFF) return 5;
    if ((unsigned char)s[4] != 'c') return 6;
    if ((unsigned char)s[5] != 0) return 7;
    return 0;
}

/* Checks one literal mixing a double quote, a high-bit byte and a
 * backslash. Returns 0 on success, 1 on a length mismatch, 2 on a
 * content mismatch. */
int test_combined(void) {
    /* All categories at once, in the sort of pattern tcc.c uses. */
    const char *s = "got \"\312\\n\"";
    /* g o t SP " 0xCA \ n " : 9 bytes (the `\\n` here is backslash + n,
     * not a newline — `\\` decodes to one `\`, then a literal `n`). */
    if (strlen_(s) != 9) return 1;
    const char want[] = {'g', 'o', 't', ' ', '"', (char)0xCA, '\\', 'n', '"', 0};
    /* Compare 10 bytes so the terminating 0 is checked as well. */
    if (!memeq_(s, want, 10)) return 2;
    return 0;
}

/* Runs each test in turn and stops at the first failure.
 *
 * Returns 0 if all tests pass, otherwise base + r, where r is the failing
 * test's own code and base is 20, 40, 60, 80 or 100 for test_dquote,
 * test_backslash, test_controls, test_highbit and test_combined.
 *
 * The stride is 20 rather than 10 because test_dquote can return up to
 * 11: with a stride of 10, test_dquote's code 11 (10 + 11) and
 * test_backslash's code 1 (20 + 1) both produced exit status 21. The
 * largest value is now 102, well inside the 0..255 range a process exit
 * status can carry. argc and argv are unused.
 */
int main(int argc, char **argv) {
    int r;
    if ((r = test_dquote())) return 20 + r;
    if ((r = test_backslash())) return 40 + r;
    if ((r = test_controls())) return 60 + r;
    if ((r = test_highbit())) return 80 + r;
    if ((r = test_combined())) return 100 + r;
    return 0;
}