mpd/src/util/UTF8.cxx
2014-10-10 22:11:38 +02:00

346 lines
7.5 KiB
C++

/*
* Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
* http://www.musicpd.org
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "UTF8.hxx"
#include "CharUtil.hxx"
#include <algorithm>
/**
 * Does this byte introduce a 2-byte sequence, i.e. is it a leading
 * byte (bit pattern 110xxxxx) followed by 1 continuation byte?
 */
static constexpr bool
IsLeading1(unsigned char ch)
{
	/* the top three bits must be "110" */
	return (ch >> 5) == 0x06;
}
/**
 * Build the leading byte of a 2-byte sequence by combining the
 * "110xxxxx" prefix with the given payload bits.  The value is not
 * masked here; the caller passes only the bits that belong into the
 * leading byte.
 */
static constexpr unsigned char
MakeLeading1(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xc0);
}
/**
 * Does this byte introduce a 3-byte sequence, i.e. is it a leading
 * byte (bit pattern 1110xxxx) followed by 2 continuation bytes?
 */
static constexpr bool
IsLeading2(unsigned char ch)
{
	/* the top four bits must be "1110" */
	return (ch >> 4) == 0x0e;
}
/**
 * Build the leading byte of a 3-byte sequence by combining the
 * "1110xxxx" prefix with the given payload bits.  The value is not
 * masked here; the caller passes only the bits that belong into the
 * leading byte.
 */
static constexpr unsigned char
MakeLeading2(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xe0);
}
/**
 * Does this byte introduce a 4-byte sequence, i.e. is it a leading
 * byte (bit pattern 11110xxx) followed by 3 continuation bytes?
 */
static constexpr bool
IsLeading3(unsigned char ch)
{
	/* the top five bits must be "11110" */
	return (ch >> 3) == 0x1e;
}
/**
 * Build the leading byte of a 4-byte sequence by combining the
 * "11110xxx" prefix with the given payload bits.  The value is not
 * masked here; the caller passes only the bits that belong into the
 * leading byte.
 */
static constexpr unsigned char
MakeLeading3(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xf0);
}
/**
 * Does this byte introduce a 5-byte sequence, i.e. is it a leading
 * byte (bit pattern 111110xx) followed by 4 continuation bytes?
 */
static constexpr bool
IsLeading4(unsigned char ch)
{
	/* the top six bits must be "111110" */
	return (ch >> 2) == 0x3e;
}
/**
 * Build the leading byte of a 5-byte sequence by combining the
 * "111110xx" prefix with the given payload bits.  The value is not
 * masked here; the caller passes only the bits that belong into the
 * leading byte.
 */
static constexpr unsigned char
MakeLeading4(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xf8);
}
/**
 * Does this byte introduce a 6-byte sequence, i.e. is it a leading
 * byte (bit pattern 1111110x) followed by 5 continuation bytes?
 */
static constexpr bool
IsLeading5(unsigned char ch)
{
	/* the top seven bits must be "1111110" */
	return (ch >> 1) == 0x7e;
}
/**
 * Build the leading byte of a 6-byte sequence by combining the
 * "1111110x" prefix with the given payload bits.  The value is not
 * masked here; the caller passes only the bits that belong into the
 * leading byte.
 */
static constexpr unsigned char
MakeLeading5(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xfc);
}
/**
 * Is this a continuation byte (bit pattern 10xxxxxx), i.e. a byte
 * that may only appear after a leading byte?
 */
static constexpr bool
IsContinuation(unsigned char ch)
{
	/* the top two bits must be "10" */
	return (ch >> 6) == 0x02;
}
/**
 * Generate a continuation byte (10xxxxxx) carrying the low 6 bits of
 * the given value; the upper two bits of the value are discarded.
 */
static constexpr unsigned char
MakeContinuation(unsigned char value)
{
	return static_cast<unsigned char>((value & 0x3f) | 0x80);
}
bool
ValidateUTF8(const char *p)
{
for (; *p != 0; ++p) {
unsigned char ch = *p;
if (IsASCII(ch))
continue;
if (IsContinuation(ch))
/* continuation without a prefix */
return false;
if (IsLeading1(ch)) {
/* 1 continuation */
if (!IsContinuation(*++p))
return false;
} else if (IsLeading2(ch)) {
/* 2 continuations */
if (!IsContinuation(*++p) || !IsContinuation(*++p))
return false;
} else if (IsLeading3(ch)) {
/* 3 continuations */
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
!IsContinuation(*++p))
return false;
} else if (IsLeading4(ch)) {
/* 4 continuations */
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
!IsContinuation(*++p) || !IsContinuation(*++p))
return false;
} else if (IsLeading5(ch)) {
/* 5 continuations */
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
!IsContinuation(*++p) || !IsContinuation(*++p) ||
!IsContinuation(*++p))
return false;
} else
return false;
}
return true;
}
/**
 * Determine the total length (in bytes) of the UTF-8 sequence that is
 * introduced by the given first byte.  The bytes following it are not
 * looked at.
 *
 * @param ch the first byte of the sequence
 * @return the sequence length (1 for ASCII, 2 to 6 for a leading
 * byte), or 0 if the byte is a continuation without a prefix or some
 * other illegal start byte
 */
size_t
SequenceLengthUTF8(char ch)
{
	return IsASCII(ch)
		? 1
		: IsLeading1(ch)
		? 2 /* 1 continuation */
		: IsLeading2(ch)
		? 3 /* 2 continuations */
		: IsLeading3(ch)
		? 4 /* 3 continuations */
		: IsLeading4(ch)
		? 5 /* 4 continuations */
		: IsLeading5(ch)
		? 6 /* 5 continuations */
		: 0; /* not a valid start byte */
}
/**
 * Function object which checks whether the L bytes starting at the
 * given pointer are all UTF-8 continuation bytes.
 *
 * Evaluation stops at the first byte that is not a continuation byte
 * (which includes a null terminator), so no byte beyond that one is
 * read.
 */
template<size_t L>
struct CheckSequenceUTF8 {
	gcc_pure
	bool operator()(const char *p) const {
		for (size_t i = 0; i < L; ++i)
			if (!IsContinuation(p[i]))
				return false;

		return true;
	}
};

/**
 * Zero remaining continuation bytes: nothing left to check, the
 * pointer is not dereferenced.
 */
template<>
struct CheckSequenceUTF8<0u> {
	constexpr bool operator()(gcc_unused const char *p) const {
		return true;
	}
};
/**
 * Verify the L continuation bytes which follow a leading byte.
 *
 * @param p points to the byte right after the leading byte
 * @return the total sequence length (leading byte plus L
 * continuation bytes), or 0 if one of the L bytes is not a
 * continuation byte
 */
template<size_t L>
gcc_pure
static size_t
InnerSequenceLengthUTF8(const char *p)
{
	if (!CheckSequenceUTF8<L>()(p))
		return 0u;

	return L + 1;
}
/**
 * Determine the length (in bytes) of the first UTF-8 sequence in the
 * given string, verifying that the leading byte is followed by the
 * announced number of continuation bytes.
 *
 * @param p points to the first byte of the sequence
 * @return the sequence length (1 for ASCII, 2 to 6 for a multi-byte
 * sequence), or 0 if the first byte is not a valid start byte or a
 * continuation byte is missing
 */
size_t
SequenceLengthUTF8(const char *p)
{
	const unsigned char lead = *p;
	const char *const rest = p + 1;

	if (IsASCII(lead))
		return 1;

	if (IsLeading1(lead))
		/* 1 continuation */
		return InnerSequenceLengthUTF8<1>(rest);

	if (IsLeading2(lead))
		/* 2 continuations */
		return InnerSequenceLengthUTF8<2>(rest);

	if (IsLeading3(lead))
		/* 3 continuations */
		return InnerSequenceLengthUTF8<3>(rest);

	if (IsLeading4(lead))
		/* 4 continuations */
		return InnerSequenceLengthUTF8<4>(rest);

	if (IsLeading5(lead))
		/* 5 continuations */
		return InnerSequenceLengthUTF8<5>(rest);

	/* continuation without a prefix or some other illegal
	   start byte */
	return 0;
}
/**
 * Skip the leading run of ASCII characters.
 *
 * @param p a null-terminated string
 * @return a pointer to the first byte which is either the null
 * terminator or not an ASCII character
 */
static const char *
FindNonASCIIOrZero(const char *p)
{
	for (;; ++p)
		if (*p == 0 || !IsASCII(*p))
			return p;
}
/**
 * Convert a null-terminated Latin-1 string to UTF-8.
 *
 * Each non-ASCII input byte is expanded to a 2-byte sequence (leading
 * byte carrying the upper two bits, continuation byte carrying the
 * lower six bits).
 *
 * @param src the null-terminated source string
 * @param buffer the destination buffer; it is only written to if the
 * source contains at least one non-ASCII byte
 * @param buffer_size the size of the destination buffer in bytes,
 * including room for the null terminator
 * @return the source pointer if the string is plain ASCII (no
 * conversion needed), the buffer pointer if the string was converted,
 * or nullptr if the buffer is too small (the buffer may have been
 * partially written to in that case)
 */
const char *
Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer,
	     size_t buffer_size)
{
	const char *in = FindNonASCIIOrZero(src);
	if (*in == 0)
		/* everything is plain ASCII, we don't need to
		   convert anything */
		return src;

	const size_t ascii_prefix_length = static_cast<size_t>(in - src);
	if (ascii_prefix_length >= buffer_size)
		/* buffer too small */
		return nullptr;

	const char *const limit = buffer + buffer_size;

	/* the ASCII prefix can be copied verbatim */
	char *out = std::copy(src, in, buffer);

	for (; *in != 0; ++in) {
		const unsigned char ch = *in;

		if (!IsASCII(ch)) {
			/* needs two output bytes, plus room for the
			   null terminator */
			if (out + 2 >= limit)
				/* buffer too small */
				return nullptr;

			*out++ = MakeLeading1(ch >> 6);
			*out++ = MakeContinuation(ch);
			continue;
		}

		*out++ = ch;

		/* there must still be room for the null terminator */
		if (out >= limit)
			/* buffer too small */
			return nullptr;
	}

	*out = 0;
	return buffer;
}
/**
 * Encode one code point as UTF-8 and write the resulting bytes (1 to
 * 6 of them, no null terminator) to the given buffer.
 *
 * @param ch the code point to be encoded
 * @param q the destination buffer; this function does not check its
 * size
 * @return a pointer to the byte after the last one written; equal to
 * q if the value is 0x80000000 or larger, in which case nothing is
 * written
 */
char *
UnicodeToUTF8(unsigned ch, char *q)
{
	if (gcc_likely(ch < 0x80)) {
		/* plain ASCII: a single byte */
		*q = (char)ch;
		return q + 1;
	}

	/* emit the leading byte and remember how many payload bits
	   are left for the continuation bytes */
	unsigned remaining_bits;
	if (gcc_likely(ch < 0x800)) {
		*q++ = MakeLeading1(ch >> 6);
		remaining_bits = 6;
	} else if (ch < 0x10000) {
		*q++ = MakeLeading2(ch >> 12);
		remaining_bits = 12;
	} else if (ch < 0x200000) {
		*q++ = MakeLeading3(ch >> 18);
		remaining_bits = 18;
	} else if (ch < 0x4000000) {
		*q++ = MakeLeading4(ch >> 24);
		remaining_bits = 24;
	} else if (ch < 0x80000000) {
		*q++ = MakeLeading5(ch >> 30);
		remaining_bits = 30;
	} else {
		// error: cannot be encoded, nothing is written
		return q;
	}

	/* each continuation byte carries the next 6 bits, most
	   significant first */
	while (remaining_bits > 0) {
		remaining_bits -= 6;
		*q++ = MakeContinuation(ch >> remaining_bits);
	}

	return q;
}
/**
 * Count the characters in a null-terminated UTF-8 string.
 *
 * This is a very naive implementation: it performs no validation at
 * all, it merely counts every byte that is not a UTF-8 continuation
 * byte.
 *
 * @param p the null-terminated string
 * @return the number of non-continuation bytes before the null
 * terminator
 */
size_t
LengthUTF8(const char *p)
{
	size_t count = 0;

	while (*p != 0) {
		const unsigned char byte = *p++;
		if (IsContinuation(byte))
			continue;

		++count;
	}

	return count;
}