kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit cb323097a119ba2428b1eb1457cff882a572a43a
parent 2bd5ad377b0a0ce27256ec64bfb3fecab5538c70
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 19 May 2026 12:00:12 -0700

Support Unicode transcoding in C literals

Diffstat:
A lang/c/parse/literal_unicode.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lang/c/parse/literal_unicode.h | 32 ++++++++++++++++++++++++++++++++
M lang/c/parse/parse_expr.c | 185 +++++++++++++------------------------------------------------------------------
A test/parse/cases/6_4_5_11_source_utf8_wide_string.c | 10 ++++++++++
A test/parse/cases/6_4_5_11_source_utf8_wide_string.expected | 1 +
A test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.c | 3 +++
A test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.errpat | 1 +
A test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.c | 3 +++
A test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.errpat | 1 +
A test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.c | 3 +++
A test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.errpat | 1 +
11 files changed, 370 insertions(+), 156 deletions(-)

diff --git a/lang/c/parse/literal_unicode.c b/lang/c/parse/literal_unicode.c @@ -0,0 +1,286 @@ +#include "parse/literal_unicode.h" + +static int hex_value(int c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return -1; +} + +static u32 unit_mask(u32 v, u32 bits) { + if (bits >= 32u) return v; + return v & ((1u << bits) - 1u); +} + +static int valid_scalar(u32 cp) { + return cp <= 0x10ffffu && !(cp >= 0xd800u && cp <= 0xdfffu); +} + +static int valid_ucn(u32 cp) { + if (!valid_scalar(cp)) return 0; + if (cp < 0x00a0u && cp != 0x24u && cp != 0x40u && cp != 0x60u) return 0; + return 1; +} + +static int decode_utf8(const char* s, size_t len, size_t* pi, u32* out, + const char** err_out) { + size_t i = *pi; + unsigned char c0; + u32 v; + u32 minv; + u32 need; + + if (i >= len) { + *err_out = "truncated character literal"; + return 0; + } + c0 = (unsigned char)s[i++]; + if (c0 < 0x80u) { + *pi = i; + *out = c0; + return 1; + } + if ((c0 & 0xe0u) == 0xc0u) { + v = c0 & 0x1fu; + minv = 0x80u; + need = 1; + } else if ((c0 & 0xf0u) == 0xe0u) { + v = c0 & 0x0fu; + minv = 0x800u; + need = 2; + } else if ((c0 & 0xf8u) == 0xf0u) { + v = c0 & 0x07u; + minv = 0x10000u; + need = 3; + } else { + *err_out = "malformed UTF-8 in literal"; + return 0; + } + while (need--) { + unsigned char cx; + if (i >= len) { + *err_out = "truncated UTF-8 character"; + return 0; + } + cx = (unsigned char)s[i++]; + if ((cx & 0xc0u) != 0x80u) { + *err_out = "malformed UTF-8 in literal"; + return 0; + } + v = (v << 6) | (u32)(cx & 0x3fu); + } + if (v < minv || !valid_scalar(v)) { + *err_out = "invalid Unicode scalar value in literal"; + return 0; + } + *pi = i; + *out = v; + return 1; +} + +int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out, + const char** err_out) { + size_t i = *pi; + int c; + if (i >= len) { + *err_out = "truncated character literal"; + return 0; + } + out->kind 
= C_LIT_UNIT_CODEPOINT; + if (s[i] != '\\') { + if (!decode_utf8(s, len, &i, &out->value, err_out)) return 0; + *pi = i; + return 1; + } + ++i; + if (i >= len) { + *err_out = "trailing '\\' in literal"; + return 0; + } + c = (unsigned char)s[i++]; + switch (c) { + case 'n': + out->value = '\n'; + break; + case 't': + out->value = '\t'; + break; + case 'r': + out->value = '\r'; + break; + case 'b': + out->value = '\b'; + break; + case 'f': + out->value = '\f'; + break; + case 'v': + out->value = '\v'; + break; + case 'a': + out->value = '\a'; + break; + case '\\': + out->value = '\\'; + break; + case '\'': + out->value = '\''; + break; + case '"': + out->value = '"'; + break; + case '?': + out->value = '?'; + break; + case 'x': { + u32 hex = 0; + int any = 0; + out->kind = C_LIT_UNIT_NUMERIC; + while (i < len) { + int dv = hex_value((unsigned char)s[i]); + if (dv < 0) break; + hex = hex * 16u + (u32)dv; + any = 1; + ++i; + } + if (!any) { + *err_out = "\\x with no hex digits"; + return 0; + } + out->value = hex; + break; + } + case 'u': + case 'U': { + int n = c == 'u' ? 
4 : 8; + u32 ucn = 0; + int j; + for (j = 0; j < n; ++j) { + int dv; + if (i >= len) { + *err_out = "truncated UCN"; + return 0; + } + dv = hex_value((unsigned char)s[i++]); + if (dv < 0) { + *err_out = "malformed UCN"; + return 0; + } + ucn = ucn * 16u + (u32)dv; + } + if (!valid_ucn(ucn)) { + *err_out = "invalid UCN scalar value"; + return 0; + } + out->value = ucn; + break; + } + default: + if (c >= '0' && c <= '7') { + u32 oct = (u32)(c - '0'); + int n = 1; + out->kind = C_LIT_UNIT_NUMERIC; + while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') { + oct = oct * 8u + (u32)(s[i] - '0'); + ++i; + ++n; + } + out->value = oct; + } else { + out->value = (u32)(unsigned char)c; + } + break; + } + *pi = i; + return 1; +} + +void c_lit_encode_uint_le(u8* dst, u32 size, u32 value) { + u32 i; + for (i = 0; i < size; ++i) dst[i] = (u8)((value >> (8u * i)) & 0xffu); +} + +static void append_unit(u8* dst, size_t* pk, u32 elem_size, u32 value) { + c_lit_encode_uint_le(dst + *pk, elem_size, value); + *pk += elem_size; +} + +static int append_utf8(u8* dst, size_t* pk, u32 cp, const char** err_out) { + if (!valid_scalar(cp)) { + *err_out = "invalid Unicode scalar value"; + return 0; + } + if (cp <= 0x7fu) { + dst[(*pk)++] = (u8)cp; + } else if (cp <= 0x7ffu) { + dst[(*pk)++] = (u8)(0xc0u | (cp >> 6)); + dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); + } else if (cp <= 0xffffu) { + dst[(*pk)++] = (u8)(0xe0u | (cp >> 12)); + dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu)); + dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); + } else { + dst[(*pk)++] = (u8)(0xf0u | (cp >> 18)); + dst[(*pk)++] = (u8)(0x80u | ((cp >> 12) & 0x3fu)); + dst[(*pk)++] = (u8)(0x80u | ((cp >> 6) & 0x3fu)); + dst[(*pk)++] = (u8)(0x80u | (cp & 0x3fu)); + } + return 1; +} + +static int append_utf16(u8* dst, size_t* pk, u32 elem_size, u32 cp, + const char** err_out) { + if (!valid_scalar(cp)) { + *err_out = "invalid Unicode scalar value"; + return 0; + } + if (cp <= 0xffffu) { + append_unit(dst, pk, elem_size, cp); + } 
else { + u32 v = cp - 0x10000u; + append_unit(dst, pk, elem_size, 0xd800u | (v >> 10)); + append_unit(dst, pk, elem_size, 0xdc00u | (v & 0x3ffu)); + } + return 1; +} + +int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc, + u32 elem_size, CLitUnit unit, + const char** err_out) { + u32 bits = elem_size >= 4u ? 32u : elem_size * 8u; + if (unit.kind == C_LIT_UNIT_NUMERIC) { + append_unit(dst, pk, elem_size, unit_mask(unit.value, bits)); + return 1; + } + switch (enc) { + case C_LIT_STR_UTF8: + return append_utf8(dst, pk, unit.value, err_out); + case C_LIT_STR_UTF16: + return append_utf16(dst, pk, elem_size, unit.value, err_out); + case C_LIT_STR_UTF32: + append_unit(dst, pk, elem_size, unit.value); + return 1; + case C_LIT_STR_ORDINARY: + default: + append_unit(dst, pk, elem_size, unit_mask(unit.value, bits)); + return 1; + } +} + +int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit, + u32* out, const char** err_out) { + if (unit.kind == C_LIT_UNIT_NUMERIC) { + *out = unit_mask(unit.value, elem_bits); + return 1; + } + if (enc == C_LIT_STR_UTF16 && unit.value > 0xffffu) { + *err_out = "UTF-16 character constant does not fit in one code unit"; + return 0; + } + if (enc == C_LIT_STR_UTF8 && unit.value > 0x7fu) { + *err_out = "UTF-8 character constant does not fit in one code unit"; + return 0; + } + *out = unit_mask(unit.value, elem_bits); + return 1; +} diff --git a/lang/c/parse/literal_unicode.h b/lang/c/parse/literal_unicode.h @@ -0,0 +1,32 @@ +#ifndef CFREE_C_PARSE_LITERAL_UNICODE_H +#define CFREE_C_PARSE_LITERAL_UNICODE_H + +#include "c_support.h" + +typedef enum CLitUnitKind { + C_LIT_UNIT_CODEPOINT, + C_LIT_UNIT_NUMERIC, +} CLitUnitKind; + +typedef struct CLitUnit { + u32 value; + u8 kind; +} CLitUnit; + +typedef enum CLitStringEnc { + C_LIT_STR_ORDINARY, + C_LIT_STR_UTF8, + C_LIT_STR_UTF16, + C_LIT_STR_UTF32, +} CLitStringEnc; + +int c_lit_decode_unit(const char* s, size_t len, size_t* pi, CLitUnit* out, + const char** 
err_out); +int c_lit_append_string_unit(u8* dst, size_t* pk, CLitStringEnc enc, + u32 elem_size, CLitUnit unit, + const char** err_out); +int c_lit_encode_char_unit(CLitStringEnc enc, u32 elem_bits, CLitUnit unit, + u32* out, const char** err_out); +void c_lit_encode_uint_le(u8* dst, u32 size, u32 value); + +#endif diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c @@ -2,6 +2,7 @@ * constant evaluation. */ #include "parse/parse_priv.h" +#include "parse/literal_unicode.h" static const Type* ty_int(Parser* p) { return type_prim(p->pool, TY_INT); } static const Type* ty_size_t(Parser* p) { @@ -314,159 +315,19 @@ static const Type* float_literal_type(Parser* p, const Tok* t) { return type_prim(p->pool, TY_DOUBLE); } -static int hex_value(int c) { - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - return -1; -} - -static i64 char_mask_value(i64 v, u32 bits) { - if (bits >= 64u) return v; - return v & (i64)((1ull << bits) - 1ull); -} - -static i64 decode_utf8_char(Parser* p, const char* s, size_t len, size_t* pi, - SrcLoc loc) { - size_t i = *pi; - unsigned char c0; - u32 v; - u32 need; - if (i >= len) compiler_panic(p->c, loc, "truncated character literal"); - c0 = (unsigned char)s[i++]; - if (c0 < 0x80u) { - *pi = i; - return c0; - } - if ((c0 & 0xe0u) == 0xc0u) { - v = c0 & 0x1fu; - need = 1; - } else if ((c0 & 0xf0u) == 0xe0u) { - v = c0 & 0x0fu; - need = 2; - } else if ((c0 & 0xf8u) == 0xf0u) { - v = c0 & 0x07u; - need = 3; - } else { - perr(p, "malformed UTF-8 in wide character literal"); - } - while (need--) { - unsigned char cx; - if (i >= len) compiler_panic(p->c, loc, "truncated UTF-8 character"); - cx = (unsigned char)s[i++]; - if ((cx & 0xc0u) != 0x80u) perr(p, "malformed UTF-8 in wide literal"); - v = (v << 6) | (u32)(cx & 0x3fu); - } - *pi = i; - return (i64)v; -} - -static i64 decode_one_char(Parser* p, const char* s, size_t len, size_t* pi, - 
SrcLoc loc, u32 unit_bits) { - size_t i = *pi; - i64 v; - int c; - if (i >= len) compiler_panic(p->c, loc, "truncated character literal"); - if (s[i] != '\\') { - if (unit_bits > 8u) - v = decode_utf8_char(p, s, len, &i, loc); - else - v = (unsigned char)s[i++]; - *pi = i; - return v; - } - i++; - if (i >= len) compiler_panic(p->c, loc, "trailing '\\' in literal"); - c = (unsigned char)s[i++]; - switch (c) { - case 'n': - v = '\n'; - break; - case 't': - v = '\t'; - break; - case 'r': - v = '\r'; - break; - case 'b': - v = '\b'; - break; - case 'f': - v = '\f'; - break; - case 'v': - v = '\v'; - break; - case 'a': - v = '\a'; - break; - case '\\': - v = '\\'; - break; - case '\'': - v = '\''; - break; - case '"': - v = '"'; - break; - case '?': - v = '?'; - break; - case 'x': { - i64 hex = 0; - int any = 0; - while (i < len) { - int dv = hex_value((unsigned char)s[i]); - if (dv < 0) - break; - hex = hex * 16 + dv; - any = 1; - i++; - } - if (!any) compiler_panic(p->c, loc, "\\x with no hex digits"); - v = char_mask_value(hex, unit_bits); - break; - } - case 'u': - case 'U': { - int n = c == 'u' ? 
4 : 8; - i64 ucn = 0; - int j; - for (j = 0; j < n; ++j) { - int dv; - if (i >= len) compiler_panic(p->c, loc, "truncated UCN"); - dv = hex_value((unsigned char)s[i++]); - if (dv < 0) compiler_panic(p->c, loc, "malformed UCN"); - ucn = ucn * 16 + dv; - } - v = char_mask_value(ucn, unit_bits); - break; - } - default: - if (c >= '0' && c <= '7') { - i64 oct = c - '0'; - int n = 1; - while (n < 3 && i < len && s[i] >= '0' && s[i] <= '7') { - oct = oct * 8 + (s[i] - '0'); - i++; - n++; - } - v = char_mask_value(oct, unit_bits); - } else { - v = c; - } - break; - } - *pi = i; - return v; -} - const Type* char_literal_type(Parser* p, const Tok* t) { if (t->flags & TF_STR_U16) return ty_char16(p); if (t->flags & TF_STR_U32) return ty_char32(p); return ty_int(p); } +static CLitStringEnc literal_string_encoding(const Tok* t) { + if (t->flags & TF_STR_U8) return C_LIT_STR_UTF8; + if (t->flags & TF_STR_U16) return C_LIT_STR_UTF16; + if (t->flags & (TF_STR_WIDE | TF_STR_U32)) return C_LIT_STR_UTF32; + return C_LIT_STR_ORDINARY; +} + const Type* string_literal_elem_type(Parser* p, const Tok* t) { if (t->flags & TF_STR_WIDE) return ty_int(p); if (t->flags & TF_STR_U16) return ty_char16(p); @@ -489,8 +350,11 @@ i64 decode_char_literal(Parser* p, const Tok* t) { size_t len = 0; const char* s = pool_str(p->pool, t->spelling, &len); size_t i = 0; - i64 v; + CLitUnit unit; + const char* err = NULL; + u32 v; u32 bits = 8; + CLitStringEnc enc = literal_string_encoding(t); if (!s) perr(p, "bad char literal"); if (t->flags & TF_STR_U8) i = 2; @@ -503,11 +367,16 @@ i64 decode_char_literal(Parser* p, const Tok* t) { if (i >= len || s[i] != '\'') perr(p, "malformed character literal"); i++; if (i >= len || s[i] == '\'') perr(p, "empty character literal"); - v = decode_one_char(p, s, len, &i, t->loc, bits); + if (!c_lit_decode_unit(s, len, &i, &unit, &err)) { + compiler_panic(p->c, t->loc, "%s", err ? 
err : "bad character literal"); + } + if (!c_lit_encode_char_unit(enc, bits, unit, &v, &err)) { + compiler_panic(p->c, t->loc, "%s", err ? err : "bad character literal"); + } if (i >= len || s[i] != '\'') { perr(p, "multi-character constants are not supported"); } - return v; + return (i64)v; } u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) { @@ -519,7 +388,8 @@ u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) { size_t k = 0; const Type* elem_ty; u32 elem_size; - u32 bits; + CLitStringEnc enc = literal_string_encoding(t); + const char* err = NULL; if (!s) perr(p, "bad string literal"); if (t->flags & TF_STR_U8) i = 2; @@ -527,17 +397,20 @@ u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out) { i = 1; elem_ty = string_literal_elem_type(p, t); elem_size = c_abi_sizeof(p->abi, elem_ty); - bits = elem_size >= 4u ? 32u : elem_size * 8u; if (i >= len || s[i] != '"') perr(p, "malformed string literal"); i++; buf = (u8*)h->alloc(h, (len + 1u) * elem_size, 1); if (!buf) perr(p, "out of memory in string literal"); while (i < len && s[i] != '"') { - i64 ch = decode_one_char(p, s, len, &i, t->loc, bits); - encode_int_le(buf + k, elem_size, ch); - k += elem_size; + CLitUnit unit; + if (!c_lit_decode_unit(s, len, &i, &unit, &err)) { + compiler_panic(p->c, t->loc, "%s", err ? err : "bad string literal"); + } + if (!c_lit_append_string_unit(buf, &k, enc, elem_size, unit, &err)) { + compiler_panic(p->c, t->loc, "%s", err ? 
err : "bad string literal"); + } } - encode_int_le(buf + k, elem_size, 0); + c_lit_encode_uint_le(buf + k, elem_size, 0); k += elem_size; *nlen_out = k; return buf; diff --git a/test/parse/cases/6_4_5_11_source_utf8_wide_string.c b/test/parse/cases/6_4_5_11_source_utf8_wide_string.c @@ -0,0 +1,10 @@ +#include <stddef.h> + +int test_main(void) { + const wchar_t* w = L"é"; + const unsigned int* u = U"é"; + return sizeof(L"é") == 8 && w[0] == 0x00e9 && w[1] == 0 && + sizeof(U"é") == 8 && u[0] == 0x00e9 && u[1] == 0 + ? 42 + : 0; +} diff --git a/test/parse/cases/6_4_5_11_source_utf8_wide_string.expected b/test/parse/cases/6_4_5_11_source_utf8_wide_string.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.c b/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.c @@ -0,0 +1,3 @@ +int test_main(void) { + return L'\u0041'; +} diff --git a/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.errpat b/test/parse/cases_err/6_4_4_4_invalid_basic_char_ucn.errpat @@ -0,0 +1 @@ +invalid UCN scalar value diff --git a/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.c b/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.c @@ -0,0 +1,3 @@ +int test_main(void) { + return U'\U00110000'; +} diff --git a/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.errpat b/test/parse/cases_err/6_4_4_4_invalid_ucn_scalar.errpat @@ -0,0 +1 @@ +invalid UCN scalar value diff --git a/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.c b/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.c @@ -0,0 +1,3 @@ +int test_main(void) { + return u'\U0001f600'; +} diff --git a/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.errpat b/test/parse/cases_err/6_4_4_4_utf16_char_surrogate_pair.errpat @@ -0,0 +1 @@ +UTF-16 character constant does not fit in one code unit