commit cb323097a119ba2428b1eb1457cff882a572a43a
parent 2bd5ad377b0a0ce27256ec64bfb3fecab5538c70
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 19 May 2026 12:00:12 -0700
Support Unicode transcoding in C literals
Diffstat:
11 files changed, 370 insertions(+), 156 deletions(-)
diff --git a/lang/c/parse/literal_unicode.c b/lang/c/parse/literal_unicode.c
@@ -0,0 +1,286 @@
+#include "parse/literal_unicode.h"
+
+/* Map one hexadecimal digit character to its numeric value (0..15).
+ * Any character that is not a hex digit yields -1. */
+static int hex_value(int c) {
+  int digit = -1;
+  if ('0' <= c && c <= '9') {
+    digit = c - '0';
+  } else if ('a' <= c && c <= 'f') {
+    digit = 10 + (c - 'a');
+  } else if ('A' <= c && c <= 'F') {
+    digit = 10 + (c - 'A');
+  }
+  return digit;
+}
+
+/* Keep only the low `bits` bits of `v`. A width of 32 or more returns `v`
+ * unchanged, so the mask is never built with a shift count of 32 or more. */
+static u32 unit_mask(u32 v, u32 bits) {
+  u32 keep = bits < 32u ? (1u << bits) - 1u : ~(u32)0;
+  return v & keep;
+}
+
+/* A code point is a Unicode scalar value when it is at most U+10FFFF and
+ * lies outside the surrogate range U+D800..U+DFFF. Returns 1 or 0. */
+static int valid_scalar(u32 cp) {
+  if (cp > 0x10ffffu) return 0;
+  if (cp >= 0xd800u && cp <= 0xdfffu) return 0;
+  return 1;
+}
+
+/* Decide whether `cp` may be spelled as a universal character name: it must
+ * be a Unicode scalar value, and below U+00A0 only 0x24 ('$'), 0x40 ('@')
+ * and 0x60 ('`') are accepted. Returns 1 or 0. */
+static int valid_ucn(u32 cp) {
+  int low_exception = cp == 0x24u || cp == 0x40u || cp == 0x60u;
+  return valid_scalar(cp) && (cp >= 0x00a0u || low_exception);
+}
+
+/* Decode one UTF-8 encoded character from s[*pi..len).
+ *
+ * On success the code point is stored in *out, *pi is advanced past the
+ * bytes consumed, and 1 is returned. On failure 0 is returned with *err_out
+ * pointing at a static message; *pi and *out are left untouched. Rejected
+ * inputs: an empty range, a bad lead or continuation byte, a sequence cut
+ * short by `len`, an overlong encoding, and any result that is not a
+ * Unicode scalar value. */
+static int decode_utf8(const char* s, size_t len, size_t* pi, u32* out,
+                       const char** err_out) {
+  size_t pos = *pi;
+  unsigned char lead;
+  u32 cp;
+  u32 floor_cp; /* smallest code point that needs a sequence this long */
+  u32 tail;     /* continuation bytes still expected */
+
+  if (pos >= len) {
+    *err_out = "truncated character literal";
+    return 0;
+  }
+  lead = (unsigned char)s[pos];
+  pos += 1;
+  if (lead < 0x80u) {
+    /* ASCII: the single byte is the whole character. */
+    *out = lead;
+    *pi = pos;
+    return 1;
+  }
+
+  /* The lead byte fixes the sequence length and supplies the top bits. */
+  if ((lead & 0xe0u) == 0xc0u) {
+    tail = 1;
+    floor_cp = 0x80u;
+    cp = lead & 0x1fu;
+  } else if ((lead & 0xf0u) == 0xe0u) {
+    tail = 2;
+    floor_cp = 0x800u;
+    cp = lead & 0x0fu;
+  } else if ((lead & 0xf8u) == 0xf0u) {
+    tail = 3;
+    floor_cp = 0x10000u;
+    cp = lead & 0x07u;
+  } else {
+    *err_out = "malformed UTF-8 in literal";
+    return 0;
+  }
+
+  for (; tail > 0; --tail) {
+    unsigned char cont;
+    if (pos >= len) {
+      *err_out = "truncated UTF-8 character";
+      return 0;
+    }
+    cont = (unsigned char)s[pos];
+    pos += 1;
+    if ((cont & 0xc0u) != 0x80u) {
+      *err_out = "malformed UTF-8 in literal";
+      return 0;
+    }
+    cp = (cp << 6) | (u32)(cont & 0x3fu);
+  }
+
+  /* cp < floor_cp means the value was encoded with more bytes than needed. */
+  if (cp < floor_cp || !valid_scalar(cp)) {
+    *err_out = "invalid Unicode scalar value in literal";
+    return 0;
+  }
+  *out = cp;
+  *pi = pos;
+  return 1;
+}
+
+/* Decode the next unit of a character or string literal body from
+ * s[*pi..len): either one UTF-8 encoded source character or one backslash
+ * escape sequence.
+ *
+ * out->kind is C_LIT_UNIT_NUMERIC for \x and octal escapes (out->value is
+ * the raw number) and C_LIT_UNIT_CODEPOINT for everything else (out->value
+ * is a code point). A backslash followed by an unrecognized character yields
+ * that character itself.
+ *
+ * Returns 1 and advances *pi on success. Returns 0 with *err_out set to a
+ * static message on failure, leaving *pi unchanged; out->kind may already
+ * have been written in that case. */
+int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out,
+                      const char** err_out) {
+  size_t i = *pi;
+  int c;
+  if (i >= len) {
+    *err_out = "truncated character literal";
+    return 0;
+  }
+  out->kind = C_LIT_UNIT_CODEPOINT;
+  if (s[i] != '\\') {
+    /* Not an escape: the unit is one UTF-8 encoded source character. */
+    if (!decode_utf8(s, len, &i, &out->value, err_out)) return 0;
+    *pi = i;
+    return 1;
+  }
+  ++i;
+  if (i >= len) {
+    *err_out = "trailing '\\' in literal";
+    return 0;
+  }
+  c = (unsigned char)s[i++];
+  switch (c) {
+    case 'n':
+      out->value = '\n';
+      break;
+    case 't':
+      out->value = '\t';
+      break;
+    case 'r':
+      out->value = '\r';
+      break;
+    case 'b':
+      out->value = '\b';
+      break;
+    case 'f':
+      out->value = '\f';
+      break;
+    case 'v':
+      out->value = '\v';
+      break;
+    case 'a':
+      out->value = '\a';
+      break;
+    case '\\':
+      out->value = '\\';
+      break;
+    case '\'':
+      out->value = '\'';
+      break;
+    case '"':
+      out->value = '"';
+      break;
+    case '?':
+      out->value = '?';
+      break;
+    case 'x': {
+      /* \x takes every hex digit that follows, with no length limit. */
+      u32 hex = 0;
+      int any = 0;
+      out->kind = C_LIT_UNIT_NUMERIC;
+      while (i < len) {
+        int dv = hex_value((unsigned char)s[i]);
+        if (dv < 0) break;
+        /* Another digit would push set bits out of the 32-bit accumulator;
+         * report that instead of silently wrapping to a different value. */
+        if (hex > 0x0fffffffu) {
+          *err_out = "hex escape sequence out of range";
+          return 0;
+        }
+        hex = hex * 16u + (u32)dv;
+        any = 1;
+        ++i;
+      }
+      if (!any) {
+        *err_out = "\\x with no hex digits";
+        return 0;
+      }
+      out->value = hex;
+      break;
+    }
+    case 'u':
+    case 'U': {
+      /* \u takes exactly 4 hex digits, \U exactly 8. */
+      int n = c == 'u' ? 4 : 8;
+      u32 ucn = 0;
+      int j;
+      for (j = 0; j < n; ++j) {
+        int dv;
+        if (i >= len) {
+          *err_out = "truncated UCN";
+          return 0;
+        }
+        dv = hex_value((unsigned char)s[i++]);
+        if (dv < 0) {
+          *err_out = "malformed UCN";
+          return 0;
+        }
+        ucn = ucn * 16u + (u32)dv;
+      }
+      if (!valid_ucn(ucn)) {
+        *err_out = "invalid UCN scalar value";
+        return 0;
+      }
+      out->value = ucn;
+      break;
+    }
+    default:
+      if (c >= '0' && c <= '7') {
+        /* Octal escape: one to three digits, so the value is at most 0777. */
+        u32 oct = (u32)(c - '0');
+        int n = 1;
+        out->kind = C_LIT_UNIT_NUMERIC;
+        while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') {
+          oct = oct * 8u + (u32)(s[i] - '0');
+          ++i;
+          ++n;
+        }
+        out->value = oct;
+      } else {
+        /* Unknown escape: pass the escaped character through unchanged. */
+        out->value = (u32)(unsigned char)c;
+      }
+      break;
+  }
+  *pi = i;
+  return 1;
+}
+
+/* Store the low `size` bytes of `value` at dst[0..size) in little-endian
+ * order. When `size` exceeds the width of `value`, the extra high bytes are
+ * written as zero: the value is consumed 8 bits at a time, so no shift
+ * count ever reaches the operand width (which C leaves undefined). */
+void c_lit_encode_uint_le(u8* dst, u32 size, u32 value) {
+  u32 i;
+  for (i = 0; i < size; ++i) {
+    dst[i] = (u8)(value & 0xffu);
+    value >>= 8;
+  }
+}
+
+/* Write `value` as one little-endian element of `elem_size` bytes at
+ * dst + *pk, and advance *pk by `elem_size`. */
+static void append_unit(u8* dst, size_t* pk, u32 elem_size, u32 value) {
+  u8* slot = dst + *pk;
+  *pk += elem_size;
+  c_lit_encode_uint_le(slot, elem_size, value);
+}
+
+/* Append the UTF-8 encoding of `cp` (one to four bytes) at dst + *pk and
+ * advance *pk by the number of bytes written. Returns 1 on success; if `cp`
+ * is not a Unicode scalar value, returns 0 with *err_out set and writes
+ * nothing. */
+static int append_utf8(u8* dst, size_t* pk, u32 cp, const char** err_out) {
+  u8* w;
+  size_t count;
+  if (!valid_scalar(cp)) {
+    *err_out = "invalid Unicode scalar value";
+    return 0;
+  }
+  w = dst + *pk;
+  if (cp < 0x80u) {
+    w[0] = (u8)cp;
+    count = 1;
+  } else if (cp < 0x800u) {
+    w[0] = (u8)(0xc0u | (cp >> 6));
+    w[1] = (u8)(0x80u | (cp & 0x3fu));
+    count = 2;
+  } else if (cp < 0x10000u) {
+    w[0] = (u8)(0xe0u | (cp >> 12));
+    w[1] = (u8)(0x80u | ((cp >> 6) & 0x3fu));
+    w[2] = (u8)(0x80u | (cp & 0x3fu));
+    count = 3;
+  } else {
+    w[0] = (u8)(0xf0u | (cp >> 18));
+    w[1] = (u8)(0x80u | ((cp >> 12) & 0x3fu));
+    w[2] = (u8)(0x80u | ((cp >> 6) & 0x3fu));
+    w[3] = (u8)(0x80u | (cp & 0x3fu));
+    count = 4;
+  }
+  *pk += count;
+  return 1;
+}
+
+/* Append the UTF-16 encoding of `cp` as elements of `elem_size` bytes: one
+ * element for code points up to U+FFFF, otherwise a high surrogate followed
+ * by a low surrogate. Returns 1 on success; if `cp` is not a Unicode scalar
+ * value, returns 0 with *err_out set and writes nothing. */
+static int append_utf16(u8* dst, size_t* pk, u32 elem_size, u32 cp,
+                        const char** err_out) {
+  u32 offset;
+  if (!valid_scalar(cp)) {
+    *err_out = "invalid Unicode scalar value";
+    return 0;
+  }
+  if (cp > 0xffffu) {
+    /* Split the 20-bit offset above U+10000 into two 10-bit halves. */
+    offset = cp - 0x10000u;
+    append_unit(dst, pk, elem_size, 0xd800u | (offset >> 10));
+    append_unit(dst, pk, elem_size, 0xdc00u | (offset & 0x3ffu));
+    return 1;
+  }
+  append_unit(dst, pk, elem_size, cp);
+  return 1;
+}
+
+/* Append one decoded literal unit to a string buffer.
+ *
+ * dst/pk:    output buffer and byte offset into it; *pk is advanced by the
+ *            number of bytes written. The caller must have reserved enough
+ *            room: a single code point can produce up to four bytes in
+ *            UTF-8 or two elements in UTF-16.
+ * enc:       target encoding of the literal.
+ * elem_size: size in bytes of one array element.
+ * unit:      NUMERIC units (\x and octal escapes) are stored as exactly one
+ *            element, masked to the element width, whatever `enc` is.
+ *            CODEPOINT units are transcoded according to `enc`.
+ *
+ * Returns 1 on success, or 0 with *err_out set when the code point cannot
+ * be encoded. */
+int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc,
+                             u32 elem_size, CLitUnit unit,
+                             const char** err_out) {
+  u32 bits = elem_size >= 4u ? 32u : elem_size * 8u;
+  if (unit.kind == C_LIT_UNIT_NUMERIC) {
+    append_unit(dst, pk, elem_size, unit_mask(unit.value, bits));
+    return 1;
+  }
+  switch (enc) {
+    case C_LIT_STR_UTF8:
+      return append_utf8(dst, pk, unit.value, err_out);
+    case C_LIT_STR_UTF16:
+      return append_utf16(dst, pk, elem_size, unit.value, err_out);
+    case C_LIT_STR_UTF32:
+      append_unit(dst, pk, elem_size, unit.value);
+      return 1;
+    case C_LIT_STR_ORDINARY:
+    default:
+      /* Byte-wide ordinary strings are emitted as UTF-8. Masking the code
+       * point to 8 bits would turn a multi-byte source character into one
+       * unrelated byte (U+20AC would become 0xAC); re-encoding reproduces
+       * the UTF-8 source bytes and gives UCNs the same treatment. */
+      if (elem_size == 1u) return append_utf8(dst, pk, unit.value, err_out);
+      append_unit(dst, pk, elem_size, unit_mask(unit.value, bits));
+      return 1;
+  }
+}
+
+/* Convert one decoded unit into the value of a character constant whose
+ * element is `elem_bits` wide.
+ *
+ * NUMERIC units (\x and octal escapes) are masked to `elem_bits`.
+ * CODEPOINT units must fit in a single code unit: above U+FFFF is rejected
+ * for UTF-16, above U+007F is rejected for UTF-8, and for any encoding a
+ * code point wider than `elem_bits` is rejected rather than truncated to an
+ * unrelated value.
+ *
+ * Returns 1 and stores the value in *out on success, or 0 with *err_out set
+ * to a static message on failure. */
+int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit,
+                           u32* out, const char** err_out) {
+  u32 masked = unit_mask(unit.value, elem_bits);
+  if (unit.kind == C_LIT_UNIT_NUMERIC) {
+    *out = masked;
+    return 1;
+  }
+  if (enc == C_LIT_STR_UTF16 && unit.value > 0xffffu) {
+    *err_out = "UTF-16 character constant does not fit in one code unit";
+    return 0;
+  }
+  if (enc == C_LIT_STR_UTF8 && unit.value > 0x7fu) {
+    *err_out = "UTF-8 character constant does not fit in one code unit";
+    return 0;
+  }
+  if (masked != unit.value) {
+    /* Masking changed the value, so the code point has bits above
+     * elem_bits and cannot be represented by this constant. */
+    *err_out = "character constant does not fit in one code unit";
+    return 0;
+  }
+  *out = masked;
+  return 1;
+}
diff --git a/lang/c/parse/literal_unicode.h b/lang/c/parse/literal_unicode.h
@@ -0,0 +1,32 @@
+#ifndef CFREE_C_PARSE_LITERAL_UNICODE_H
+#define CFREE_C_PARSE_LITERAL_UNICODE_H
+
+#include "c_support.h"
+
+/* How the value of a decoded literal unit is to be interpreted. */
+typedef enum CLitUnitKind {
+ /* A code point: a source character, a simple escape, or a \u / \U name. */
+ C_LIT_UNIT_CODEPOINT,
+ /* A raw number written as a \x or octal escape. */
+ C_LIT_UNIT_NUMERIC,
+} CLitUnitKind;
+
+/* One unit decoded from the body of a character or string literal. */
+typedef struct CLitUnit {
+ /* Code point or raw escape value, depending on `kind`. */
+ u32 value;
+ /* Holds a CLitUnitKind constant. */
+ u8 kind;
+} CLitUnit;
+
+/* Target encoding used when a decoded unit is written out. */
+typedef enum CLitStringEnc {
+ /* Literal carrying none of the UTF-8/UTF-16/UTF-32 markings below. */
+ C_LIT_STR_ORDINARY,
+ /* One code point becomes one to four bytes. */
+ C_LIT_STR_UTF8,
+ /* One code point becomes one element, or a surrogate pair. */
+ C_LIT_STR_UTF16,
+ /* One code point becomes exactly one element. */
+ C_LIT_STR_UTF32,
+} CLitStringEnc;
+
+/* Decode the next unit (a UTF-8 source character or a backslash escape)
+ * from s[*pi..len) into *out. Returns 1 and advances *pi on success;
+ * returns 0 and points *err_out at a message on failure. */
+int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out,
+ const char** err_out);
+/* Append `unit` to `dst` at byte offset *pk using encoding `enc` and
+ * elements of `elem_size` bytes, advancing *pk by the bytes written.
+ * Returns 1 on success, or 0 with *err_out set on failure. */
+int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc,
+ u32 elem_size, CLitUnit unit,
+ const char** err_out);
+/* Compute the value of a character constant of `elem_bits` bits from
+ * `unit` and store it in *out. Returns 1 on success, or 0 with *err_out
+ * set when the unit does not fit in one code unit of `enc`. */
+int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit,
+ u32* out, const char** err_out);
+/* Write the low `size` bytes of `value` to dst[0..size), least significant
+ * byte first. */
+void c_lit_encode_uint_le(u8* dst, u32 size, u32 value);
+
+#endif
diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c
@@ -2,6 +2,7 @@
* constant evaluation. */
#include "parse/parse_priv.h"
+#include "parse/literal_unicode.h"
static const Type* ty_int(Parser* p) { return type_prim(p->pool, TY_INT); }
static const Type* ty_size_t(Parser* p) {
@@ -314,159 +315,19 @@ static const Type* float_literal_type(Parser* p, const Tok* t) {
return type_prim(p->pool, TY_DOUBLE);
}
-static int hex_value(int c) {
- if (c >= '0' && c <= '9') return c - '0';
- if (c >= 'a' && c <= 'f') return c - 'a' + 10;
- if (c >= 'A' && c <= 'F') return c - 'A' + 10;
- return -1;
-}
-
-static i64 char_mask_value(i64 v, u32 bits) {
- if (bits >= 64u) return v;
- return v & (i64)((1ull << bits) - 1ull);
-}
-
-static i64 decode_utf8_char(Parser* p, const char* s, size_t len, size_t* pi,
- SrcLoc loc) {
- size_t i = *pi;
- unsigned char c0;
- u32 v;
- u32 need;
- if (i >= len) compiler_panic(p->c, loc, "truncated character literal");
- c0 = (unsigned char)s[i++];
- if (c0 < 0x80u) {
- *pi = i;
- return c0;
- }
- if ((c0 & 0xe0u) == 0xc0u) {
- v = c0 & 0x1fu;
- need = 1;
- } else if ((c0 & 0xf0u) == 0xe0u) {
- v = c0 & 0x0fu;
- need = 2;
- } else if ((c0 & 0xf8u) == 0xf0u) {
- v = c0 & 0x07u;
- need = 3;
- } else {
- perr(p, "malformed UTF-8 in wide character literal");
- }
- while (need--) {
- unsigned char cx;
- if (i >= len) compiler_panic(p->c, loc, "truncated UTF-8 character");
- cx = (unsigned char)s[i++];
- if ((cx & 0xc0u) != 0x80u) perr(p, "malformed UTF-8 in wide literal");
- v = (v << 6) | (u32)(cx & 0x3fu);
- }
- *pi = i;
- return (i64)v;
-}
-
-static i64 decode_one_char(Parser* p, const char* s, size_t len, size_t* pi,
- SrcLoc loc, u32 unit_bits) {
- size_t i = *pi;
- i64 v;
- int c;
- if (i >= len) compiler_panic(p->c, loc, "truncated character literal");
- if (s[i] != '\\') {
- if (unit_bits > 8u)
- v = decode_utf8_char(p, s, len, &i, loc);
- else
- v = (unsigned char)s[i++];
- *pi = i;
- return v;
- }
- i++;
- if (i >= len) compiler_panic(p->c, loc, "trailing '\\' in literal");
- c = (unsigned char)s[i++];
- switch (c) {
- case 'n':
- v = '\n';
- break;
- case 't':
- v = '\t';
- break;
- case 'r':
- v = '\r';
- break;
- case 'b':
- v = '\b';
- break;
- case 'f':
- v = '\f';
- break;
- case 'v':
- v = '\v';
- break;
- case 'a':
- v = '\a';
- break;
- case '\\':
- v = '\\';
- break;
- case '\'':
- v = '\'';
- break;
- case '"':
- v = '"';
- break;
- case '?':
- v = '?';
- break;
- case 'x': {
- i64 hex = 0;
- int any = 0;
- while (i < len) {
- int dv = hex_value((unsigned char)s[i]);
- if (dv < 0)
- break;
- hex = hex * 16 + dv;
- any = 1;
- i++;
- }
- if (!any) compiler_panic(p->c, loc, "\\x with no hex digits");
- v = char_mask_value(hex, unit_bits);
- break;
- }
- case 'u':
- case 'U': {
- int n = c == 'u' ? 4 : 8;
- i64 ucn = 0;
- int j;
- for (j = 0; j < n; ++j) {
- int dv;
- if (i >= len) compiler_panic(p->c, loc, "truncated UCN");
- dv = hex_value((unsigned char)s[i++]);
- if (dv < 0) compiler_panic(p->c, loc, "malformed UCN");
- ucn = ucn * 16 + dv;
- }
- v = char_mask_value(ucn, unit_bits);
- break;
- }
- default:
- if (c >= '0' && c <= '7') {
- i64 oct = c - '0';
- int n = 1;
- while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') {
- oct = oct * 8 + (s[i] - '0');
- i++;
- n++;
- }
- v = char_mask_value(oct, unit_bits);
- } else {
- v = c;
- }
- break;
- }
- *pi = i;
- return v;
-}
-
const Type* char_literal_type(Parser* p, const Tok* t) {
if (t->flags & TF_STR_U16) return ty_char16(p);
if (t->flags & TF_STR_U32) return ty_char32(p);
return ty_int(p);
}
+/* Choose the literal encoding from the token's flags. TF_STR_U8 takes
+ * precedence over TF_STR_U16, which takes precedence over TF_STR_WIDE and
+ * TF_STR_U32 (both map to UTF-32); a token with none of these flags is an
+ * ordinary literal. */
+static CLitStringEnc literal_string_encoding(const Tok* t) {
+  CLitStringEnc enc = C_LIT_STR_ORDINARY;
+  if (t->flags & TF_STR_U8) {
+    enc = C_LIT_STR_UTF8;
+  } else if (t->flags & TF_STR_U16) {
+    enc = C_LIT_STR_UTF16;
+  } else if (t->flags & (TF_STR_WIDE | TF_STR_U32)) {
+    enc = C_LIT_STR_UTF32;
+  }
+  return enc;
+}
+
const Type* string_literal_elem_type(Parser* p, const Tok* t) {
if (t->flags & TF_STR_WIDE) return ty_int(p);
if (t->flags & TF_STR_U16) return ty_char16(p);
@@ -489,8 +350,11 @@ i64 decode_char_literal(Parser* p, const Tok* t) {
size_t len = 0;
const char* s = pool_str(p->pool, t->spelling, &len);
size_t i = 0;
- i64 v;
+ CLitUnit unit;
+ const char* err = NULL;
+ u32 v;
u32 bits = 8;
+ CLitStringEnc enc = literal_string_encoding(t);
if (!s) perr(p, "bad char literal");
if (t->flags & TF_STR_U8)
i = 2;
@@ -503,11 +367,16 @@ i64 decode_char_literal(Parser* p, const Tok* t) {
if (i >= len || s[i] != '\'') perr(p, "malformed character literal");
i++;
if (i >= len || s[i] == '\'') perr(p, "empty character literal");
- v = decode_one_char(p, s, len, &i, t->loc, bits);
+ if (!c_lit_decode_unit(s, len, &i, &unit, &err)) {
+ compiler_panic(p->c, t->loc, "%s", err ? err : "bad character literal");
+ }
+ if (!c_lit_encode_char_unit(enc, bits, unit, &v, &err)) {
+ compiler_panic(p->c, t->loc, "%s", err ? err : "bad character literal");
+ }
if (i >= len || s[i] != '\'') {
perr(p, "multi-character constants are not supported");
}
- return v;
+ return (i64)v;
}
u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) {
@@ -519,7 +388,8 @@ u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) {
size_t k = 0;
const Type* elem_ty;
u32 elem_size;
- u32 bits;
+ CLitStringEnc enc = literal_string_encoding(t);
+ const char* err = NULL;
if (!s) perr(p, "bad string literal");
if (t->flags & TF_STR_U8)
i = 2;
@@ -527,17 +397,20 @@ u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) {
i = 1;
elem_ty = string_literal_elem_type(p, t);
elem_size = c_abi_sizeof(p->abi, elem_ty);
- bits = elem_size >= 4u ? 32u : elem_size * 8u;
if (i >= len || s[i] != '"') perr(p, "malformed string literal");
i++;
buf = (u8*)h->alloc(h, (len + 1u) * elem_size, 1);
if (!buf) perr(p, "out of memory in string literal");
while (i < len && s[i] != '"') {
- i64 ch = decode_one_char(p, s, len, &i, t->loc, bits);
- encode_int_le(buf + k, elem_size, ch);
- k += elem_size;
+ CLitUnit unit;
+ if (!c_lit_decode_unit(s, len, &i, &unit, &err)) {
+ compiler_panic(p->c, t->loc, "%s", err ? err : "bad string literal");
+ }
+ if (!c_lit_append_string_unit(buf, &k, enc, elem_size, unit, &err)) {
+ compiler_panic(p->c, t->loc, "%s", err ? err : "bad string literal");
+ }
}
- encode_int_le(buf + k, elem_size, 0);
+ c_lit_encode_uint_le(buf + k, elem_size, 0);
k += elem_size;
*nlen_out = k;
return buf;
diff --git a/test/parse/cases/6_4_5_11_source_utf8_wide_string.c b/test/parse/cases/6_4_5_11_source_utf8_wide_string.c
@@ -0,0 +1,10 @@
+#include <stddef.h>
+
+/* L"..." and U"..." literals containing the non-ASCII source character 'é'
+ * must hold the single code point U+00E9 followed by a zero terminator,
+ * i.e. two 4-byte elements (sizeof == 8). Returns 42 when every check
+ * holds and 0 otherwise. */
+int test_main(void) {
+ const wchar_t* w = L"é";
+ const unsigned int* u = U"é";
+ return sizeof(L"é") == 8 && w[0] == 0x00e9 && w[1] == 0 &&
+ sizeof(U"é") == 8 && u[0] == 0x00e9 && u[1] == 0
+ ? 42
+ : 0;
+}
diff --git a/test/parse/cases/6_4_5_11_source_utf8_wide_string.expected b/test/parse/cases/6_4_5_11_source_utf8_wide_string.expected
@@ -0,0 +1 @@
+42
diff --git a/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.c b/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.c
@@ -0,0 +1,3 @@
+/* Must be rejected: \u0041 is below U+00A0 and is not one of the three
+ * permitted exceptions (0x24, 0x40, 0x60). */
+int test_main(void) {
+ return L'\u0041';
+}
diff --git a/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.errpat b/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.errpat
@@ -0,0 +1 @@
+invalid UCN scalar value
diff --git a/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.c b/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.c
@@ -0,0 +1,3 @@
+/* Must be rejected: 0x110000 is above U+10FFFF, so it is not a Unicode
+ * scalar value. */
+int test_main(void) {
+ return U'\U00110000';
+}
diff --git a/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.errpat b/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.errpat
@@ -0,0 +1 @@
+invalid UCN scalar value
diff --git a/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.c b/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.c
@@ -0,0 +1,3 @@
+/* Must be rejected: U+1F600 is above U+FFFF, so a UTF-16 character constant
+ * cannot hold it in a single code unit. */
+int test_main(void) {
+ return u'\U0001f600';
+}
diff --git a/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.errpat b/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.errpat
@@ -0,0 +1 @@
+UTF-16 character constant does not fit in one code unit