From bf25b38c0af6262b7873bacc7bd5c5eb70a69c56 Mon Sep 17 00:00:00 2001 From: Joseph Sutton Date: Mon, 14 Aug 2023 14:19:03 +1200 Subject: [PATCH] =?UTF-8?q?wind:=20Support=20UTF-8=E2=80=93encoding=20non-?= =?UTF-8?q?BMP=20codepoints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Joseph Sutton --- lib/wind/utf8.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/lib/wind/utf8.c b/lib/wind/utf8.c index 7b72e8ce8..4dcc15f22 100644 --- a/lib/wind/utf8.c +++ b/lib/wind/utf8.c @@ -478,7 +478,7 @@ wind_utf8ucs2_length(const char *in, size_t *out_len) int wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) { - uint16_t ch; + uint32_t ch; size_t i, len, o; for (o = 0, i = 0; i < in_len; i++) { @@ -488,8 +488,36 @@ wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) len = 1; } else if (ch < 0x800) { len = 2; - } else + } else if (ch < 0xd800 || ch >= 0xe000) { len = 3; + } else if (ch < 0xdc00) { + /* A high surrogate. */ + if (i < in_len - 1) { + uint16_t ch2 = in[i + 1]; + + if (ch2 >= 0xdc00 && ch2 < 0xe000) { + uint16_t high_ten_bits; + uint16_t low_ten_bits; + + /* A surrogate pair. */ + high_ten_bits = ch & 0x3ff; + low_ten_bits = ch2 & 0x3ff; + + ch = 0x10000 + ((uint32_t)high_ten_bits << 10 | low_ten_bits); + len = 4; + ++i; + } else { + /* An unpaired high surrogate. */ + len = 3; + } + } else { + /* An unpaired high surrogate. */ + len = 3; + } + } else { + /* An unpaired low surrogate. */ + len = 3; + } o += len; @@ -498,6 +526,10 @@ wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) return WIND_ERR_OVERRUN; switch(len) { + case 4: + out[3] = (ch | 0x80) & 0xbf; + ch = ch >> 6; + HEIM_FALLTHROUGH; case 3: out[2] = (ch | 0x80) & 0xbf; ch = ch >> 6;