2023-03-06 14:42:04 +01:00
|
|
|
// SPDX-License-Identifier: BSD-2-Clause
|
|
|
|
// author: Max Kellermann <max.kellermann@gmail.com>
|
2014-10-10 20:42:32 +02:00
|
|
|
|
|
|
|
#include "UTF8.hxx"
|
|
|
|
#include "CharUtil.hxx"
|
2021-11-26 16:23:16 +01:00
|
|
|
#include "Compiler.h"
|
2014-10-10 20:42:32 +02:00
|
|
|
|
|
|
|
#include <algorithm>
|
2020-03-13 01:08:53 +01:00
|
|
|
#include <cstdint>
|
2019-10-14 13:41:59 +02:00
|
|
|
|
2014-10-10 20:42:32 +02:00
|
|
|
/**
 * Is this a leading byte that is followed by 1 continuation byte?
 *
 * Such a byte has the bit pattern 110xxxxx.
 */
static constexpr bool
IsLeading1(uint8_t ch) noexcept
{
	/* compare only the three prefix bits */
	return (ch >> 5) == 0b110;
}
|
|
|
|
|
2019-10-14 13:41:59 +02:00
|
|
|
/**
 * Generate a leading byte announcing 1 continuation byte
 * (prefix 110xxxxx).
 *
 * The value is OR-ed into the prefix without masking, so bits
 * outside the low 5 would alter the prefix.
 */
static constexpr uint8_t
MakeLeading1(uint8_t value) noexcept
{
	constexpr unsigned prefix = 0xc0;
	return static_cast<uint8_t>(value | prefix);
}
|
|
|
|
|
|
|
|
/**
 * Is this a leading byte that is followed by 2 continuation bytes?
 *
 * Such a byte has the bit pattern 1110xxxx.
 */
static constexpr bool
IsLeading2(uint8_t ch) noexcept
{
	/* compare only the four prefix bits */
	return (ch >> 4) == 0b1110;
}
|
|
|
|
|
2019-10-14 13:41:59 +02:00
|
|
|
/**
 * Generate a leading byte announcing 2 continuation bytes
 * (prefix 1110xxxx).
 *
 * The value is OR-ed into the prefix without masking, so bits
 * outside the low 4 would alter the prefix.
 */
static constexpr uint8_t
MakeLeading2(uint8_t value) noexcept
{
	constexpr unsigned prefix = 0xe0;
	return static_cast<uint8_t>(value | prefix);
}
|
|
|
|
|
|
|
|
/**
 * Is this a leading byte that is followed by 3 continuation bytes?
 *
 * Such a byte has the bit pattern 11110xxx.
 */
static constexpr bool
IsLeading3(uint8_t ch) noexcept
{
	/* compare only the five prefix bits */
	return (ch >> 3) == 0b11110;
}
|
|
|
|
|
2019-10-14 13:41:59 +02:00
|
|
|
/**
 * Generate a leading byte announcing 3 continuation bytes
 * (prefix 11110xxx).
 *
 * The value is OR-ed into the prefix without masking, so bits
 * outside the low 3 would alter the prefix.
 */
static constexpr uint8_t
MakeLeading3(uint8_t value) noexcept
{
	constexpr unsigned prefix = 0xf0;
	return static_cast<uint8_t>(value | prefix);
}
|
|
|
|
|
|
|
|
/**
 * Is this a leading byte that is followed by 4 continuation bytes?
 *
 * Such a byte has the bit pattern 111110xx.
 */
static constexpr bool
IsLeading4(uint8_t ch) noexcept
{
	/* compare only the six prefix bits */
	return (ch >> 2) == 0b111110;
}
|
|
|
|
|
2019-10-14 13:41:59 +02:00
|
|
|
/**
 * Generate a leading byte announcing 4 continuation bytes
 * (prefix 111110xx).
 *
 * The value is OR-ed into the prefix without masking, so bits
 * outside the low 2 would alter the prefix.
 */
static constexpr uint8_t
MakeLeading4(uint8_t value) noexcept
{
	constexpr unsigned prefix = 0xf8;
	return static_cast<uint8_t>(value | prefix);
}
|
|
|
|
|
|
|
|
/**
 * Is this a leading byte that is followed by 5 continuation bytes?
 *
 * Such a byte has the bit pattern 1111110x.
 */
static constexpr bool
IsLeading5(uint8_t ch) noexcept
{
	/* compare only the seven prefix bits */
	return (ch >> 1) == 0b1111110;
}
|
|
|
|
|
2019-10-14 13:41:59 +02:00
|
|
|
/**
 * Generate a leading byte announcing 5 continuation bytes
 * (prefix 1111110x).
 *
 * The value is OR-ed into the prefix without masking, so bits
 * outside the lowest one would alter the prefix.
 */
static constexpr uint8_t
MakeLeading5(uint8_t value) noexcept
{
	constexpr unsigned prefix = 0xfc;
	return static_cast<uint8_t>(value | prefix);
}
|
|
|
|
|
|
|
|
/**
 * Is this a continuation byte, i.e. does it have the bit pattern
 * 10xxxxxx?
 */
static constexpr bool
IsContinuation(uint8_t ch) noexcept
{
	/* compare only the two prefix bits */
	return (ch >> 6) == 0b10;
}
|
|
|
|
|
|
|
|
/**
 * Generate a continuation byte (10xxxxxx) from the low 6 bits of
 * the given value; the upper two bits of the value are discarded.
 */
static constexpr uint8_t
MakeContinuation(uint8_t value) noexcept
{
	constexpr unsigned payload_mask = 0x3f;
	constexpr unsigned prefix = 0x80;
	return static_cast<uint8_t>((value & payload_mask) | prefix);
}
|
|
|
|
|
|
|
|
bool
|
2017-05-08 14:44:49 +02:00
|
|
|
ValidateUTF8(const char *p) noexcept
|
2014-10-10 20:42:32 +02:00
|
|
|
{
|
|
|
|
for (; *p != 0; ++p) {
|
2019-10-14 13:41:59 +02:00
|
|
|
uint8_t ch = *p;
|
2014-10-10 20:42:32 +02:00
|
|
|
if (IsASCII(ch))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (IsContinuation(ch))
|
|
|
|
/* continuation without a prefix */
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (IsLeading1(ch)) {
|
|
|
|
/* 1 continuation */
|
|
|
|
if (!IsContinuation(*++p))
|
|
|
|
return false;
|
|
|
|
} else if (IsLeading2(ch)) {
|
|
|
|
/* 2 continuations */
|
|
|
|
if (!IsContinuation(*++p) || !IsContinuation(*++p))
|
|
|
|
return false;
|
|
|
|
} else if (IsLeading3(ch)) {
|
|
|
|
/* 3 continuations */
|
|
|
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
|
|
|
!IsContinuation(*++p))
|
|
|
|
return false;
|
|
|
|
} else if (IsLeading4(ch)) {
|
|
|
|
/* 4 continuations */
|
|
|
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
|
|
|
!IsContinuation(*++p) || !IsContinuation(*++p))
|
|
|
|
return false;
|
|
|
|
} else if (IsLeading5(ch)) {
|
|
|
|
/* 5 continuations */
|
|
|
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
|
|
|
!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
|
|
|
!IsContinuation(*++p))
|
|
|
|
return false;
|
|
|
|
} else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
std::size_t
|
2017-05-08 14:44:49 +02:00
|
|
|
SequenceLengthUTF8(char ch) noexcept
|
2014-10-10 21:17:40 +02:00
|
|
|
{
|
|
|
|
if (IsASCII(ch))
|
|
|
|
return 1;
|
|
|
|
else if (IsLeading1(ch))
|
|
|
|
/* 1 continuation */
|
|
|
|
return 2;
|
|
|
|
else if (IsLeading2(ch))
|
|
|
|
/* 2 continuations */
|
|
|
|
return 3;
|
|
|
|
else if (IsLeading3(ch))
|
|
|
|
/* 3 continuations */
|
|
|
|
return 4;
|
|
|
|
else if (IsLeading4(ch))
|
|
|
|
/* 4 continuations */
|
|
|
|
return 5;
|
|
|
|
else if (IsLeading5(ch))
|
|
|
|
/* 5 continuations */
|
|
|
|
return 6;
|
|
|
|
else
|
|
|
|
/* continuation without a prefix or some other illegal
|
|
|
|
start byte */
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
template<std::size_t L>
|
2014-10-10 21:17:40 +02:00
|
|
|
struct CheckSequenceUTF8 {
|
2021-11-26 16:23:16 +01:00
|
|
|
[[gnu::pure]]
|
2017-05-08 14:44:49 +02:00
|
|
|
bool operator()(const char *p) const noexcept {
|
2014-10-10 21:17:40 +02:00
|
|
|
return IsContinuation(*p) && CheckSequenceUTF8<L-1>()(p + 1);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<>
|
2020-02-02 23:31:45 +01:00
|
|
|
struct CheckSequenceUTF8<0U> {
|
2020-03-12 20:56:11 +01:00
|
|
|
constexpr bool operator()([[maybe_unused]] const char *p) const noexcept {
|
2014-10-10 21:17:40 +02:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
template<std::size_t L>
|
2021-11-26 16:23:16 +01:00
|
|
|
[[gnu::pure]]
|
2021-02-09 06:21:58 +01:00
|
|
|
static std::size_t
|
2017-05-08 14:44:49 +02:00
|
|
|
InnerSequenceLengthUTF8(const char *p) noexcept
|
2014-10-10 21:17:40 +02:00
|
|
|
{
|
|
|
|
return CheckSequenceUTF8<L>()(p)
|
|
|
|
? L + 1
|
2020-02-02 23:31:45 +01:00
|
|
|
: 0U;
|
2014-10-10 21:17:40 +02:00
|
|
|
}
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
std::size_t
|
2017-05-08 14:44:49 +02:00
|
|
|
SequenceLengthUTF8(const char *p) noexcept
|
2014-10-10 21:17:40 +02:00
|
|
|
{
|
2019-10-14 13:41:59 +02:00
|
|
|
const uint8_t ch = *p++;
|
2014-10-10 21:17:40 +02:00
|
|
|
|
|
|
|
if (IsASCII(ch))
|
|
|
|
return 1;
|
|
|
|
else if (IsLeading1(ch))
|
|
|
|
/* 1 continuation */
|
|
|
|
return InnerSequenceLengthUTF8<1>(p);
|
|
|
|
else if (IsLeading2(ch))
|
|
|
|
/* 2 continuations */
|
|
|
|
return InnerSequenceLengthUTF8<2>(p);
|
|
|
|
else if (IsLeading3(ch))
|
|
|
|
/* 3 continuations */
|
|
|
|
return InnerSequenceLengthUTF8<3>(p);
|
|
|
|
else if (IsLeading4(ch))
|
|
|
|
/* 4 continuations */
|
|
|
|
return InnerSequenceLengthUTF8<4>(p);
|
|
|
|
else if (IsLeading5(ch))
|
|
|
|
/* 5 continuations */
|
|
|
|
return InnerSequenceLengthUTF8<5>(p);
|
|
|
|
else
|
|
|
|
/* continuation without a prefix or some other illegal
|
|
|
|
start byte */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-11-26 16:23:16 +01:00
|
|
|
[[gnu::pure]]
|
2014-10-10 20:42:32 +02:00
|
|
|
static const char *
|
2017-05-08 14:44:49 +02:00
|
|
|
FindNonASCIIOrZero(const char *p) noexcept
|
2014-10-10 20:42:32 +02:00
|
|
|
{
|
|
|
|
while (*p != 0 && IsASCII(*p))
|
|
|
|
++p;
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *
|
|
|
|
Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer,
|
2021-02-09 06:21:58 +01:00
|
|
|
std::size_t buffer_size) noexcept
|
2014-10-10 20:42:32 +02:00
|
|
|
{
|
|
|
|
const char *p = FindNonASCIIOrZero(src);
|
|
|
|
if (*p == 0)
|
|
|
|
/* everything is plain ASCII, we don't need to convert anything */
|
|
|
|
return src;
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
if ((std::size_t)(p - src) >= buffer_size)
|
2014-10-10 20:42:32 +02:00
|
|
|
/* buffer too small */
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
const char *const end = buffer + buffer_size;
|
|
|
|
char *q = std::copy(src, p, buffer);
|
|
|
|
|
|
|
|
while (*p != 0) {
|
2019-10-14 13:41:59 +02:00
|
|
|
uint8_t ch = *p++;
|
2014-10-10 20:42:32 +02:00
|
|
|
|
|
|
|
if (IsASCII(ch)) {
|
|
|
|
*q++ = ch;
|
|
|
|
|
|
|
|
if (q >= end)
|
|
|
|
/* buffer too small */
|
|
|
|
return nullptr;
|
|
|
|
} else {
|
|
|
|
if (q + 2 >= end)
|
|
|
|
/* buffer too small */
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
*q++ = MakeLeading1(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*q = 0;
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
char *
|
2017-05-08 14:44:49 +02:00
|
|
|
UnicodeToUTF8(unsigned ch, char *q) noexcept
|
2014-10-10 20:42:32 +02:00
|
|
|
{
|
|
|
|
if (gcc_likely(ch < 0x80)) {
|
|
|
|
*q++ = (char)ch;
|
|
|
|
} else if (gcc_likely(ch < 0x800)) {
|
|
|
|
*q++ = MakeLeading1(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
} else if (ch < 0x10000) {
|
|
|
|
*q++ = MakeLeading2(ch >> 12);
|
|
|
|
*q++ = MakeContinuation(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
} else if (ch < 0x200000) {
|
|
|
|
*q++ = MakeLeading3(ch >> 18);
|
|
|
|
*q++ = MakeContinuation(ch >> 12);
|
|
|
|
*q++ = MakeContinuation(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
} else if (ch < 0x4000000) {
|
|
|
|
*q++ = MakeLeading4(ch >> 24);
|
|
|
|
*q++ = MakeContinuation(ch >> 18);
|
|
|
|
*q++ = MakeContinuation(ch >> 12);
|
|
|
|
*q++ = MakeContinuation(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
} else if (ch < 0x80000000) {
|
|
|
|
*q++ = MakeLeading5(ch >> 30);
|
|
|
|
*q++ = MakeContinuation(ch >> 24);
|
|
|
|
*q++ = MakeContinuation(ch >> 18);
|
|
|
|
*q++ = MakeContinuation(ch >> 12);
|
|
|
|
*q++ = MakeContinuation(ch >> 6);
|
|
|
|
*q++ = MakeContinuation(ch);
|
|
|
|
} else {
|
|
|
|
// error
|
|
|
|
}
|
|
|
|
|
|
|
|
return q;
|
|
|
|
}
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
std::size_t
|
2017-05-08 14:44:49 +02:00
|
|
|
LengthUTF8(const char *p) noexcept
|
2014-10-10 20:42:32 +02:00
|
|
|
{
|
|
|
|
/* this is a very naive implementation: it does not do any
|
|
|
|
verification, it just counts the bytes that are not a UTF-8
|
|
|
|
continuation */
|
|
|
|
|
2021-02-09 06:21:58 +01:00
|
|
|
std::size_t n = 0;
|
2014-10-10 20:42:32 +02:00
|
|
|
for (; *p != 0; ++p)
|
|
|
|
if (!IsContinuation(*p))
|
|
|
|
++n;
|
|
|
|
return n;
|
|
|
|
}
|