wind: Deny invalid UTF-8 encodings

Codepoints above U+10FFFF and overlong encodings are considered invalid.
Unpaired surrogates are not, as these are known to be generated on
occasion — by Windows, for example.

Signed-off-by: Joseph Sutton <josephsutton@catalyst.net.nz>
This commit is contained in:
Joseph Sutton
2023-07-12 13:04:56 +12:00
committed by Nico Williams
parent bf25b38c0a
commit a3878d3e9d
2 changed files with 32 additions and 15 deletions

View File

@@ -50,21 +50,32 @@ static const char *failing_testcases[] = {
"\xF7",
"\xC0\x01",
"\xC0\x7F",
"\xC0\x80",
"\xC0\x81",
"\xC0\xFF",
"\xC0\x80\x80",
"\xC1\x80",
"\xE0\x01",
"\xE0\x7F",
"\xE0\x80",
"\xE0\xFF",
"\xE0\x80\x20",
"\xE0\x80\xFF",
"\xE0\x80\x80",
"\xE0\x80\x81",
"\xE0\x80\x80\x80",
"\xE0\x81\x80",
"\xF0\x01",
"\xF0\x80",
"\xF0\x80\x01",
"\xF0\x80\x80",
"\xF0\x80\x80\x01",
"\xF0\x80\x80\x80",
"\xF0\x80\x80\x81",
"\xF0\x80\x80\xFF",
"\xF0\x80\x81\x80",
"\xF0\x81\x80\x80",
"\xF7\xBF\xBF\xBF",
NULL
};
@@ -82,21 +93,10 @@ static const struct testcase testcases[] = {
{"\x01", 1, {1}, 0},
{"\x7F", 1, {0x7F}, 0},
{"\x01\x7F", 2, {0x01, 0x7F}, 0},
{"\xC0\x80", 1, {0}, 0},
{"\xC0\x81", 1, {1}, 0},
{"\xC1\x80", 1, {0x40}, 0},
{"\xDF\xBF", 1, {0x7FF}, 0},
{"\xE0\x80\x80", 1, {0}, 0},
{"\xE0\x80\x81", 1, {1}, 0},
{"\xE0\x81\x80", 1, {0x40}, 0},
{"\xE1\x80\x80", 1, {0x1000}, 0},
{"\xEF\xBF\xBF", 1, {0xFFFF}, 0},
{"\xF0\x80\x80\x80", 1, {0}, 0},
{"\xF0\x80\x80\x81", 1, {1}, 0},
{"\xF0\x80\x81\x80", 1, {0x40}, 0},
{"\xF0\x81\x80\x80", 1, {0x1000}, 0},
{"\xF1\x80\x80\x80", 1, {0x40000}, 0},
{"\xF7\xBF\xBF\xBF", 1, {0X1FFFFF}, 1},
};
int

View File

@@ -39,13 +39,17 @@ utf8toutf32(const unsigned char **pp, uint32_t *out)
{
const unsigned char *p = *pp;
uint32_t c = *p;
uint32_t out_val;
if (c & 0x80) {
if ((c & 0xE0) == 0xC0) {
const uint32_t c2 = *++p;
if ((c2 & 0xC0) == 0x80) {
*out = ((c & 0x1F) << 6)
out_val = ((c & 0x1F) << 6)
| (c2 & 0x3F);
if (out_val < 0x80) {
return WIND_ERR_INVALID_UTF8;
}
} else {
return WIND_ERR_INVALID_UTF8;
}
@@ -54,9 +58,12 @@ utf8toutf32(const unsigned char **pp, uint32_t *out)
if ((c2 & 0xC0) == 0x80) {
const uint32_t c3 = *++p;
if ((c3 & 0xC0) == 0x80) {
*out = ((c & 0x0F) << 12)
out_val = ((c & 0x0F) << 12)
| ((c2 & 0x3F) << 6)
| (c3 & 0x3F);
if (out_val < 0x800) {
return WIND_ERR_INVALID_UTF8;
}
} else {
return WIND_ERR_INVALID_UTF8;
}
@@ -70,10 +77,13 @@ utf8toutf32(const unsigned char **pp, uint32_t *out)
if ((c3 & 0xC0) == 0x80) {
const uint32_t c4 = *++p;
if ((c4 & 0xC0) == 0x80) {
*out = ((c & 0x07) << 18)
out_val = ((c & 0x07) << 18)
| ((c2 & 0x3F) << 12)
| ((c3 & 0x3F) << 6)
| (c4 & 0x3F);
if (out_val < 0x10000) {
return WIND_ERR_INVALID_UTF8;
}
} else {
return WIND_ERR_INVALID_UTF8;
}
@@ -87,9 +97,16 @@ utf8toutf32(const unsigned char **pp, uint32_t *out)
return WIND_ERR_INVALID_UTF8;
}
} else {
*out = c;
out_val = c;
}
/* Allow unpaired surrogates (in the range 0xd800-0xdfff). */
if (out_val > 0x10ffff) {
return WIND_ERR_INVALID_UTF8;
}
*out = out_val;
*pp = p;
return 0;