kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

literal_unicode.c (7074B)


      1 #include "parse/literal_unicode.h"
      2 
      3 static int hex_value(int c) {
      4   if (c >= '0' && c <= '9') return c - '0';
      5   if (c >= 'a' && c <= 'f') return c - 'a' + 10;
      6   if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      7   return -1;
      8 }
      9 
     10 static u32 unit_mask(u32 v, u32 bits) {
     11   if (bits >= 32u) return v;
     12   return v & ((1u << bits) - 1u);
     13 }
     14 
     15 static int valid_scalar(u32 cp) {
     16   return cp <= 0x10ffffu && !(cp >= 0xd800u && cp <= 0xdfffu);
     17 }
     18 
     19 static int valid_ucn(u32 cp) {
     20   if (!valid_scalar(cp)) return 0;
     21   if (cp < 0x00a0u && cp != 0x24u && cp != 0x40u && cp != 0x60u) return 0;
     22   return 1;
     23 }
     24 
     25 static int decode_utf8(const char* s, size_t len, size_t* pi, u32* out,
     26                        const char** err_out) {
     27   size_t i = *pi;
     28   unsigned char c0;
     29   u32 v;
     30   u32 minv;
     31   u32 need;
     32 
     33   if (i >= len) {
     34     *err_out = "truncated character literal";
     35     return 0;
     36   }
     37   c0 = (unsigned char)s[i++];
     38   if (c0 < 0x80u) {
     39     *pi = i;
     40     *out = c0;
     41     return 1;
     42   }
     43   if ((c0 & 0xe0u) == 0xc0u) {
     44     v = c0 & 0x1fu;
     45     minv = 0x80u;
     46     need = 1;
     47   } else if ((c0 & 0xf0u) == 0xe0u) {
     48     v = c0 & 0x0fu;
     49     minv = 0x800u;
     50     need = 2;
     51   } else if ((c0 & 0xf8u) == 0xf0u) {
     52     v = c0 & 0x07u;
     53     minv = 0x10000u;
     54     need = 3;
     55   } else {
     56     *err_out = "malformed UTF-8 in literal";
     57     return 0;
     58   }
     59   while (need--) {
     60     unsigned char cx;
     61     if (i >= len) {
     62       *err_out = "truncated UTF-8 character";
     63       return 0;
     64     }
     65     cx = (unsigned char)s[i++];
     66     if ((cx & 0xc0u) != 0x80u) {
     67       *err_out = "malformed UTF-8 in literal";
     68       return 0;
     69     }
     70     v = (v << 6) | (u32)(cx & 0x3fu);
     71   }
     72   if (v < minv || !valid_scalar(v)) {
     73     *err_out = "invalid Unicode scalar value in literal";
     74     return 0;
     75   }
     76   *pi = i;
     77   *out = v;
     78   return 1;
     79 }
     80 
     81 int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out,
     82                       const char** err_out) {
     83   size_t i = *pi;
     84   int c;
     85   if (i >= len) {
     86     *err_out = "truncated character literal";
     87     return 0;
     88   }
     89   out->kind = C_LIT_UNIT_CODEPOINT;
     90   if (s[i] != '\\') {
     91     if (!decode_utf8(s, len, &i, &out->value, err_out)) return 0;
     92     *pi = i;
     93     return 1;
     94   }
     95   ++i;
     96   if (i >= len) {
     97     *err_out = "trailing '\\' in literal";
     98     return 0;
     99   }
    100   c = (unsigned char)s[i++];
    101   switch (c) {
    102     case 'n':
    103       out->value = '\n';
    104       break;
    105     case 't':
    106       out->value = '\t';
    107       break;
    108     case 'r':
    109       out->value = '\r';
    110       break;
    111     case 'b':
    112       out->value = '\b';
    113       break;
    114     case 'f':
    115       out->value = '\f';
    116       break;
    117     case 'v':
    118       out->value = '\v';
    119       break;
    120     case 'a':
    121       out->value = '\a';
    122       break;
    123     case '\\':
    124       out->value = '\\';
    125       break;
    126     case '\'':
    127       out->value = '\'';
    128       break;
    129     case '"':
    130       out->value = '"';
    131       break;
    132     case '?':
    133       out->value = '?';
    134       break;
    135     case 'x': {
    136       u32 hex = 0;
    137       int any = 0;
    138       out->kind = C_LIT_UNIT_NUMERIC;
    139       while (i < len) {
    140         int dv = hex_value((unsigned char)s[i]);
    141         if (dv < 0) break;
    142         hex = hex * 16u + (u32)dv;
    143         any = 1;
    144         ++i;
    145       }
    146       if (!any) {
    147         *err_out = "\\x with no hex digits";
    148         return 0;
    149       }
    150       out->value = hex;
    151       break;
    152     }
    153     case 'u':
    154     case 'U': {
    155       int n = c == 'u' ? 4 : 8;
    156       u32 ucn = 0;
    157       int j;
    158       for (j = 0; j < n; ++j) {
    159         int dv;
    160         if (i >= len) {
    161           *err_out = "truncated UCN";
    162           return 0;
    163         }
    164         dv = hex_value((unsigned char)s[i++]);
    165         if (dv < 0) {
    166           *err_out = "malformed UCN";
    167           return 0;
    168         }
    169         ucn = ucn * 16u + (u32)dv;
    170       }
    171       if (!valid_ucn(ucn)) {
    172         *err_out = "invalid UCN scalar value";
    173         return 0;
    174       }
    175       out->value = ucn;
    176       break;
    177     }
    178     default:
    179       if (c >= '0' && c <= '7') {
    180         u32 oct = (u32)(c - '0');
    181         int n = 1;
    182         out->kind = C_LIT_UNIT_NUMERIC;
    183         while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') {
    184           oct = oct * 8u + (u32)(s[i] - '0');
    185           ++i;
    186           ++n;
    187         }
    188         out->value = oct;
    189       } else {
    190         out->value = (u32)(unsigned char)c;
    191       }
    192       break;
    193   }
    194   *pi = i;
    195   return 1;
    196 }
    197 
    198 void c_lit_encode_uint_le(u8* dst, u32 size, u32 value) {
    199   u32 i;
    200   for (i = 0; i < size; ++i) dst[i] = (u8)((value >> (8u * i)) & 0xffu);
    201 }
    202 
    203 static void append_unit(u8* dst, size_t* pk, u32 elem_size, u32 value) {
    204   c_lit_encode_uint_le(dst + *pk, elem_size, value);
    205   *pk += elem_size;
    206 }
    207 
    208 static int append_utf8(u8* dst, size_t* pk, u32 cp, const char** err_out) {
    209   if (!valid_scalar(cp)) {
    210     *err_out = "invalid Unicode scalar value";
    211     return 0;
    212   }
    213   if (cp <= 0x7fu) {
    214     dst[(*pk)++] = (u8)cp;
    215   } else if (cp <= 0x7ffu) {
    216     dst[(*pk)++] = (u8)(0xc0u | (cp >> 6));
    217     dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu));
    218   } else if (cp <= 0xffffu) {
    219     dst[(*pk)++] = (u8)(0xe0u | (cp >> 12));
    220     dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu));
    221     dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu));
    222   } else {
    223     dst[(*pk)++] = (u8)(0xf0u | (cp >> 18));
    224     dst[(*pk)++] = (u8)(0x80u | ((cp >> 12) & 0x3fu));
    225     dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu));
    226     dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu));
    227   }
    228   return 1;
    229 }
    230 
    231 static int append_utf16(u8* dst, size_t* pk, u32 elem_size, u32 cp,
    232                         const char** err_out) {
    233   if (!valid_scalar(cp)) {
    234     *err_out = "invalid Unicode scalar value";
    235     return 0;
    236   }
    237   if (cp <= 0xffffu) {
    238     append_unit(dst, pk, elem_size, cp);
    239   } else {
    240     u32 v = cp - 0x10000u;
    241     append_unit(dst, pk, elem_size, 0xd800u | (v >> 10));
    242     append_unit(dst, pk, elem_size, 0xdc00u | (v & 0x3ffu));
    243   }
    244   return 1;
    245 }
    246 
    247 int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc,
    248                              u32 elem_size, CLitUnit unit,
    249                              const char** err_out) {
    250   u32 bits = elem_size >= 4u ? 32u : elem_size * 8u;
    251   if (unit.kind == C_LIT_UNIT_NUMERIC) {
    252     append_unit(dst, pk, elem_size, unit_mask(unit.value, bits));
    253     return 1;
    254   }
    255   switch (enc) {
    256     case C_LIT_STR_UTF8:
    257       return append_utf8(dst, pk, unit.value, err_out);
    258     case C_LIT_STR_UTF16:
    259       return append_utf16(dst, pk, elem_size, unit.value, err_out);
    260     case C_LIT_STR_UTF32:
    261       append_unit(dst, pk, elem_size, unit.value);
    262       return 1;
    263     case C_LIT_STR_ORDINARY:
    264     default:
    265       append_unit(dst, pk, elem_size, unit_mask(unit.value, bits));
    266       return 1;
    267   }
    268 }
    269 
    270 int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit,
    271                            u32* out, const char** err_out) {
    272   if (unit.kind == C_LIT_UNIT_NUMERIC) {
    273     *out = unit_mask(unit.value, elem_bits);
    274     return 1;
    275   }
    276   if (enc == C_LIT_STR_UTF16 && unit.value > 0xffffu) {
    277     *err_out = "UTF-16 character constant does not fit in one code unit";
    278     return 0;
    279   }
    280   if (enc == C_LIT_STR_UTF8 && unit.value > 0x7fu) {
    281     *err_out = "UTF-8 character constant does not fit in one code unit";
    282     return 0;
    283   }
    284   *out = unit_mask(unit.value, elem_bits);
    285   return 1;
    286 }