literal_unicode.c (7074B)
1 #include "parse/literal_unicode.h" 2 3 static int hex_value(int c) { 4 if (c >= '0' && c <= '9') return c - '0'; 5 if (c >= 'a' && c <= 'f') return c - 'a' + 10; 6 if (c >= 'A' && c <= 'F') return c - 'A' + 10; 7 return -1; 8 } 9 10 static u32 unit_mask(u32 v, u32 bits) { 11 if (bits >= 32u) return v; 12 return v & ((1u << bits) - 1u); 13 } 14 15 static int valid_scalar(u32 cp) { 16 return cp <= 0x10ffffu && !(cp >= 0xd800u && cp <= 0xdfffu); 17 } 18 19 static int valid_ucn(u32 cp) { 20 if (!valid_scalar(cp)) return 0; 21 if (cp < 0x00a0u && cp != 0x24u && cp != 0x40u && cp != 0x60u) return 0; 22 return 1; 23 } 24 25 static int decode_utf8(const char* s, size_t len, size_t* pi, u32* out, 26 const char** err_out) { 27 size_t i = *pi; 28 unsigned char c0; 29 u32 v; 30 u32 minv; 31 u32 need; 32 33 if (i >= len) { 34 *err_out = "truncated character literal"; 35 return 0; 36 } 37 c0 = (unsigned char)s[i++]; 38 if (c0 < 0x80u) { 39 *pi = i; 40 *out = c0; 41 return 1; 42 } 43 if ((c0 & 0xe0u) == 0xc0u) { 44 v = c0 & 0x1fu; 45 minv = 0x80u; 46 need = 1; 47 } else if ((c0 & 0xf0u) == 0xe0u) { 48 v = c0 & 0x0fu; 49 minv = 0x800u; 50 need = 2; 51 } else if ((c0 & 0xf8u) == 0xf0u) { 52 v = c0 & 0x07u; 53 minv = 0x10000u; 54 need = 3; 55 } else { 56 *err_out = "malformed UTF-8 in literal"; 57 return 0; 58 } 59 while (need--) { 60 unsigned char cx; 61 if (i >= len) { 62 *err_out = "truncated UTF-8 character"; 63 return 0; 64 } 65 cx = (unsigned char)s[i++]; 66 if ((cx & 0xc0u) != 0x80u) { 67 *err_out = "malformed UTF-8 in literal"; 68 return 0; 69 } 70 v = (v << 6) | (u32)(cx & 0x3fu); 71 } 72 if (v < minv || !valid_scalar(v)) { 73 *err_out = "invalid Unicode scalar value in literal"; 74 return 0; 75 } 76 *pi = i; 77 *out = v; 78 return 1; 79 } 80 81 int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out, 82 const char** err_out) { 83 size_t i = *pi; 84 int c; 85 if (i >= len) { 86 *err_out = "truncated character literal"; 87 return 0; 88 } 89 out->kind = C_LIT_UNIT_CODEPOINT; 90 if (s[i] != '\\') { 91 if (!decode_utf8(s, len, &i, &out->value, err_out)) return 0; 92 *pi = i; 93 return 1; 94 } 95 ++i; 96 if (i >= len) { 97 *err_out = "trailing '\\' in literal"; 98 return 0; 99 } 100 c = (unsigned char)s[i++]; 101 switch (c) { 102 case 'n': 103 out->value = '\n'; 104 break; 105 case 't': 106 out->value = '\t'; 107 break; 108 case 'r': 109 out->value = '\r'; 110 break; 111 case 'b': 112 out->value = '\b'; 113 break; 114 case 'f': 115 out->value = '\f'; 116 break; 117 case 'v': 118 out->value = '\v'; 119 break; 120 case 'a': 121 out->value = '\a'; 122 break; 123 case '\\': 124 out->value = '\\'; 125 break; 126 case '\'': 127 out->value = '\''; 128 break; 129 case '"': 130 out->value = '"'; 131 break; 132 case '?': 133 out->value = '?'; 134 break; 135 case 'x': { 136 u32 hex = 0; 137 int any = 0; 138 out->kind = C_LIT_UNIT_NUMERIC; 139 while (i < len) { 140 int dv = hex_value((unsigned char)s[i]); 141 if (dv < 0) break; 142 hex = hex * 16u + (u32)dv; 143 any = 1; 144 ++i; 145 } 146 if (!any) { 147 *err_out = "\\x with no hex digits"; 148 return 0; 149 } 150 out->value = hex; 151 break; 152 } 153 case 'u': 154 case 'U': { 155 int n = c == 'u' ? 4 : 8; 156 u32 ucn = 0; 157 int j; 158 for (j = 0; j < n; ++j) { 159 int dv; 160 if (i >= len) { 161 *err_out = "truncated UCN"; 162 return 0; 163 } 164 dv = hex_value((unsigned char)s[i++]); 165 if (dv < 0) { 166 *err_out = "malformed UCN"; 167 return 0; 168 } 169 ucn = ucn * 16u + (u32)dv; 170 } 171 if (!valid_ucn(ucn)) { 172 *err_out = "invalid UCN scalar value"; 173 return 0; 174 } 175 out->value = ucn; 176 break; 177 } 178 default: 179 if (c >= '0' && c <= '7') { 180 u32 oct = (u32)(c - '0'); 181 int n = 1; 182 out->kind = C_LIT_UNIT_NUMERIC; 183 while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') { 184 oct = oct * 8u + (u32)(s[i] - '0'); 185 ++i; 186 ++n; 187 } 188 out->value = oct; 189 } else { 190 out->value = (u32)(unsigned char)c; 191 } 192 break; 193 } 194 *pi = i; 195 return 1; 196 } 197 198 void c_lit_encode_uint_le(u8* dst, u32 size, u32 value) { 199 u32 i; 200 for (i = 0; i < size; ++i) dst[i] = (u8)((value >> (8u * i)) & 0xffu); 201 } 202 203 static void append_unit(u8* dst, size_t* pk, u32 elem_size, u32 value) { 204 c_lit_encode_uint_le(dst + *pk, elem_size, value); 205 *pk += elem_size; 206 } 207 208 static int append_utf8(u8* dst, size_t* pk, u32 cp, const char** err_out) { 209 if (!valid_scalar(cp)) { 210 *err_out = "invalid Unicode scalar value"; 211 return 0; 212 } 213 if (cp <= 0x7fu) { 214 dst[(*pk)++] = (u8)cp; 215 } else if (cp <= 0x7ffu) { 216 dst[(*pk)++] = (u8)(0xc0u | (cp >> 6)); 217 dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); 218 } else if (cp <= 0xffffu) { 219 dst[(*pk)++] = (u8)(0xe0u | (cp >> 12)); 220 dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu)); 221 dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); 222 } else { 223 dst[(*pk)++] = (u8)(0xf0u | (cp >> 18)); 224 dst[(*pk)++] = (u8)(0x80u | ((cp >> 12) & 0x3fu)); 225 dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu)); 226 dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); 227 } 228 return 1; 229 } 230 231 static int append_utf16(u8* dst, size_t* pk, u32 elem_size, u32 cp, 232 const char** err_out) { 233 if (!valid_scalar(cp)) { 234 *err_out = "invalid Unicode scalar value"; 235 return 0; 236 } 237 if (cp <= 0xffffu) { 238 append_unit(dst, pk, elem_size, cp); 239 } else { 240 u32 v = cp - 0x10000u; 241 append_unit(dst, pk, elem_size, 0xd800u | (v >> 10)); 242 append_unit(dst, pk, elem_size, 0xdc00u | (v & 0x3ffu)); 243 } 244 return 1; 245 } 246 247 int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc, 248 u32 elem_size, CLitUnit unit, 249 const char** err_out) { 250 u32 bits = elem_size >= 4u ? 32u : elem_size * 8u; 251 if (unit.kind == C_LIT_UNIT_NUMERIC) { 252 append_unit(dst, pk, elem_size, unit_mask(unit.value, bits)); 253 return 1; 254 } 255 switch (enc) { 256 case C_LIT_STR_UTF8: 257 return append_utf8(dst, pk, unit.value, err_out); 258 case C_LIT_STR_UTF16: 259 return append_utf16(dst, pk, elem_size, unit.value, err_out); 260 case C_LIT_STR_UTF32: 261 append_unit(dst, pk, elem_size, unit.value); 262 return 1; 263 case C_LIT_STR_ORDINARY: 264 default: 265 append_unit(dst, pk, elem_size, unit_mask(unit.value, bits)); 266 return 1; 267 } 268 } 269 270 int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit, 271 u32* out, const char** err_out) { 272 if (unit.kind == C_LIT_UNIT_NUMERIC) { 273 *out = unit_mask(unit.value, elem_bits); 274 return 1; 275 } 276 if (enc == C_LIT_STR_UTF16 && unit.value > 0xffffu) { 277 *err_out = "UTF-16 character constant does not fit in one code unit"; 278 return 0; 279 } 280 if (enc == C_LIT_STR_UTF8 && unit.value > 0x7fu) { 281 *err_out = "UTF-8 character constant does not fit in one code unit"; 282 return 0; 283 } 284 *out = unit_mask(unit.value, elem_bits); 285 return 1; 286 }