boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

127-string-escapes.c (3738B)


      1 /* String literals containing characters that need escaping when emitted
      2  * into P1pp. m1pp's lex treats `"` as a string delimiter with no escape
      3  * mechanism, so the raw byte cannot ride inside `"..."` in cg-intern-
      4  * string's data emission. Same problem for `\\` (backslash) — also for
      5  * embedded control bytes and high-bit bytes which would confuse M0's
      6  * line-oriented tokenizer downstream.
      7  *
      8  * The fix is to always emit as single-quoted hex bytes.
      9  *
     10  * tcc.c hits this in messages like:
     11  *     "'%c' expected (got \"%s\")"
     12  * which in C-source-level bytes is `'%c' expected (got "%s")` (i.e. with
     13  * literal `"` characters around the %s) — the exact bytes that broke
     14  * the m1pp pipeline before the fix.
     15  */
     16 
     17 int strlen_(const char *s) {
     18     int n = 0;
     19     while (s[n] != 0) n = n + 1;
     20     return n;
     21 }
     22 
     23 int memeq_(const char *a, const char *b, int n) {
     24     int i = 0;
     25     while (i < n) {
     26         if (a[i] != b[i]) return 0;
     27         i = i + 1;
     28     }
     29     return 1;
     30 }
     31 
     32 int test_dquote(void) {
     33     /* Embedded " — the tcc.c case. */
     34     const char *s = "'%c' expected (got \"%s\")";
     35     /* 24 bytes: '%c' expected (got "%s") */
     36     if (strlen_(s) != 24) return 1;
     37     if (s[0]  != '\'') return 2;
     38     if (s[1]  != '%')  return 3;
     39     if (s[2]  != 'c')  return 4;
     40     if (s[3]  != '\'') return 5;
     41     if (s[19] != '"')  return 6;
     42     if (s[20] != '%')  return 7;
     43     if (s[21] != 's')  return 8;
     44     if (s[22] != '"')  return 9;
     45     if (s[23] != ')')  return 10;
     46     if (s[24] != 0)    return 11;
     47     return 0;
     48 }
     49 
     50 int test_backslash(void) {
     51     /* Embedded \\ — also unsafe to ride raw between "..." in P1pp. */
     52     const char *s = "a\\b\\c";
     53     if (strlen_(s) != 5) return 1;
     54     if (s[0] != 'a')  return 2;
     55     if (s[1] != '\\') return 3;
     56     if (s[2] != 'b')  return 4;
     57     if (s[3] != '\\') return 5;
     58     if (s[4] != 'c')  return 6;
     59     if (s[5] != 0)    return 7;
     60     return 0;
     61 }
     62 
     63 int test_controls(void) {
     64     /* Embedded control bytes — \n / \t / \r round-trip through m1pp's
     65      * line-oriented tokenizer if not emitted as !(N). */
     66     const char *s = "x\ny\tz\r";
     67     if (strlen_(s) != 6) return 1;
     68     if (s[0] != 'x')  return 2;
     69     if (s[1] != '\n') return 3;
     70     if (s[2] != 'y')  return 4;
     71     if (s[3] != '\t') return 5;
     72     if (s[4] != 'z')  return 6;
     73     if (s[5] != '\r') return 7;
     74     if (s[6] != 0)    return 8;
     75     return 0;
     76 }
     77 
     78 int test_highbit(void) {
     79     /* Embedded byte >= 0x80. m1pp's `"..."` token lex treats the line as
     80      * text and is fragile with non-ASCII / >0x7F bytes; encode as !(N).
     81      * Octal escapes avoid the `\xCAb` C ambiguity (b is a hex digit). */
     82     const char *s = "a\312b\377c";
     83     if (strlen_(s) != 5) return 1;
     84     if ((unsigned char)s[0] != 'a')  return 2;
     85     if ((unsigned char)s[1] != 0xCA) return 3;
     86     if ((unsigned char)s[2] != 'b')  return 4;
     87     if ((unsigned char)s[3] != 0xFF) return 5;
     88     if ((unsigned char)s[4] != 'c')  return 6;
     89     if ((unsigned char)s[5] != 0)    return 7;
     90     return 0;
     91 }
     92 
     93 int test_combined(void) {
     94     /* All categories at once, in the sort of pattern tcc.c uses. */
     95     const char *s = "got \"\312\\n\"";
     96     /* g o t SP " 0xCA \ n " : 9 bytes (the `\\n` here is backslash + n,
     97      * not a newline — `\\` decodes to one `\`, then a literal `n`). */
     98     if (strlen_(s) != 9) return 1;
     99     const char want[] = {'g', 'o', 't', ' ', '"', (char)0xCA, '\\', 'n', '"', 0};
    100     if (!memeq_(s, want, 10)) return 2;
    101     return 0;
    102 }
    103 
    104 int main(int argc, char **argv) {
    105     int r;
    106     if ((r = test_dquote()))     return 10 + r;
    107     if ((r = test_backslash()))  return 20 + r;
    108     if ((r = test_controls()))   return 30 + r;
    109     if ((r = test_highbit()))    return 40 + r;
    110     if ((r = test_combined()))   return 50 + r;
    111     return 0;
    112 }