util/UTF8: new library
This commit is contained in:
parent
b7a1954c33
commit
d5cf41e043
@ -366,6 +366,7 @@ libutil_a_SOURCES = \
|
|||||||
src/util/Domain.hxx \
|
src/util/Domain.hxx \
|
||||||
src/util/ReusableArray.hxx \
|
src/util/ReusableArray.hxx \
|
||||||
src/util/ASCII.hxx \
|
src/util/ASCII.hxx \
|
||||||
|
src/util/UTF8.cxx src/util/UTF8.hxx \
|
||||||
src/util/CharUtil.hxx \
|
src/util/CharUtil.hxx \
|
||||||
src/util/NumberParser.hxx \
|
src/util/NumberParser.hxx \
|
||||||
src/util/StringUtil.cxx src/util/StringUtil.hxx \
|
src/util/StringUtil.cxx src/util/StringUtil.hxx \
|
||||||
|
265
src/util/UTF8.cxx
Normal file
265
src/util/UTF8.cxx
Normal file
@ -0,0 +1,265 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
|
||||||
|
* http://www.musicpd.org
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* - Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* - Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||||
|
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "UTF8.hxx"
|
||||||
|
#include "CharUtil.hxx"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
/**
 * Does this byte start a two-byte sequence, i.e. is it a leading
 * byte (bit pattern 110xxxxx) that is followed by exactly 1
 * continuation byte?
 */
static constexpr bool
IsLeading1(unsigned char ch)
{
	/* the three most significant bits must be "110" */
	return (ch >> 5) == 0x06;
}
|
||||||
|
|
||||||
|
/**
 * Build the leading byte of a two-byte sequence (110xxxxx) by
 * combining the prefix with the given payload bits.  The value is
 * not masked, so bits above the low 5 would alter the prefix.
 */
static constexpr unsigned char
MakeLeading1(unsigned char value)
{
	return (unsigned char)(value | 0xc0);
}
|
||||||
|
|
||||||
|
/**
 * Does this byte start a three-byte sequence, i.e. is it a leading
 * byte (bit pattern 1110xxxx) that is followed by exactly 2
 * continuation bytes?
 */
static constexpr bool
IsLeading2(unsigned char ch)
{
	/* the four most significant bits must be "1110" */
	return (ch >> 4) == 0x0e;
}
|
||||||
|
|
||||||
|
/**
 * Build the leading byte of a three-byte sequence (1110xxxx) by
 * combining the prefix with the given payload bits.  The value is
 * not masked, so bits above the low 4 would alter the prefix.
 */
static constexpr unsigned char
MakeLeading2(unsigned char value)
{
	return (unsigned char)(value | 0xe0);
}
|
||||||
|
|
||||||
|
/**
 * Does this byte start a four-byte sequence, i.e. is it a leading
 * byte (bit pattern 11110xxx) that is followed by exactly 3
 * continuation bytes?
 */
static constexpr bool
IsLeading3(unsigned char ch)
{
	/* the five most significant bits must be "11110" */
	return (ch >> 3) == 0x1e;
}
|
||||||
|
|
||||||
|
/**
 * Build the leading byte of a four-byte sequence (11110xxx) by
 * combining the prefix with the given payload bits.  The value is
 * not masked, so bits above the low 3 would alter the prefix.
 */
static constexpr unsigned char
MakeLeading3(unsigned char value)
{
	return (unsigned char)(value | 0xf0);
}
|
||||||
|
|
||||||
|
/**
 * Does this byte start a five-byte sequence, i.e. is it a leading
 * byte (bit pattern 111110xx) that is followed by exactly 4
 * continuation bytes?
 */
static constexpr bool
IsLeading4(unsigned char ch)
{
	/* the six most significant bits must be "111110" */
	return (ch >> 2) == 0x3e;
}
|
||||||
|
|
||||||
|
/**
 * Build the leading byte of a five-byte sequence (111110xx) by
 * combining the prefix with the given payload bits.  The value is
 * not masked, so bits above the low 2 would alter the prefix.
 */
static constexpr unsigned char
MakeLeading4(unsigned char value)
{
	return (unsigned char)(value | 0xf8);
}
|
||||||
|
|
||||||
|
/**
 * Does this byte start a six-byte sequence, i.e. is it a leading
 * byte (bit pattern 1111110x) that is followed by exactly 5
 * continuation bytes?
 */
static constexpr bool
IsLeading5(unsigned char ch)
{
	/* the seven most significant bits must be "1111110" */
	return (ch >> 1) == 0x7e;
}
|
||||||
|
|
||||||
|
/**
 * Build the leading byte of a six-byte sequence (1111110x) by
 * combining the prefix with the given payload bit.  The value is
 * not masked, so bits above the lowest one would alter the prefix.
 */
static constexpr unsigned char
MakeLeading5(unsigned char value)
{
	return (unsigned char)(value | 0xfc);
}
|
||||||
|
|
||||||
|
/**
 * Is this a continuation byte (bit pattern 10xxxxxx), i.e. a byte
 * that may only appear after a leading byte?
 */
static constexpr bool
IsContinuation(unsigned char ch)
{
	/* the two most significant bits must be "10" */
	return (ch >> 6) == 0x02;
}
|
||||||
|
|
||||||
|
/**
 * Generate a continuation byte (10xxxxxx) carrying the low 6 bits
 * of the given value; all higher bits of the value are discarded.
 */
static constexpr unsigned char
MakeContinuation(unsigned char value)
{
	return (unsigned char)((value & 0x3f) | 0x80);
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ValidateUTF8(const char *p)
|
||||||
|
{
|
||||||
|
for (; *p != 0; ++p) {
|
||||||
|
unsigned char ch = *p;
|
||||||
|
if (IsASCII(ch))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (IsContinuation(ch))
|
||||||
|
/* continuation without a prefix */
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (IsLeading1(ch)) {
|
||||||
|
/* 1 continuation */
|
||||||
|
if (!IsContinuation(*++p))
|
||||||
|
return false;
|
||||||
|
} else if (IsLeading2(ch)) {
|
||||||
|
/* 2 continuations */
|
||||||
|
if (!IsContinuation(*++p) || !IsContinuation(*++p))
|
||||||
|
return false;
|
||||||
|
} else if (IsLeading3(ch)) {
|
||||||
|
/* 3 continuations */
|
||||||
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
||||||
|
!IsContinuation(*++p))
|
||||||
|
return false;
|
||||||
|
} else if (IsLeading4(ch)) {
|
||||||
|
/* 4 continuations */
|
||||||
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
||||||
|
!IsContinuation(*++p) || !IsContinuation(*++p))
|
||||||
|
return false;
|
||||||
|
} else if (IsLeading5(ch)) {
|
||||||
|
/* 5 continuations */
|
||||||
|
if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
||||||
|
!IsContinuation(*++p) || !IsContinuation(*++p) ||
|
||||||
|
!IsContinuation(*++p))
|
||||||
|
return false;
|
||||||
|
} else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char *
|
||||||
|
FindNonASCIIOrZero(const char *p)
|
||||||
|
{
|
||||||
|
while (*p != 0 && IsASCII(*p))
|
||||||
|
++p;
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *
|
||||||
|
Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer,
|
||||||
|
size_t buffer_size)
|
||||||
|
{
|
||||||
|
const char *p = FindNonASCIIOrZero(src);
|
||||||
|
if (*p == 0)
|
||||||
|
/* everything is plain ASCII, we don't need to convert anything */
|
||||||
|
return src;
|
||||||
|
|
||||||
|
if ((size_t)(p - src) >= buffer_size)
|
||||||
|
/* buffer too small */
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
const char *const end = buffer + buffer_size;
|
||||||
|
char *q = std::copy(src, p, buffer);
|
||||||
|
|
||||||
|
while (*p != 0) {
|
||||||
|
unsigned char ch = *p++;
|
||||||
|
|
||||||
|
if (IsASCII(ch)) {
|
||||||
|
*q++ = ch;
|
||||||
|
|
||||||
|
if (q >= end)
|
||||||
|
/* buffer too small */
|
||||||
|
return nullptr;
|
||||||
|
} else {
|
||||||
|
if (q + 2 >= end)
|
||||||
|
/* buffer too small */
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
*q++ = MakeLeading1(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*q = 0;
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *
|
||||||
|
UnicodeToUTF8(unsigned ch, char *q)
|
||||||
|
{
|
||||||
|
if (gcc_likely(ch < 0x80)) {
|
||||||
|
*q++ = (char)ch;
|
||||||
|
} else if (gcc_likely(ch < 0x800)) {
|
||||||
|
*q++ = MakeLeading1(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
} else if (ch < 0x10000) {
|
||||||
|
*q++ = MakeLeading2(ch >> 12);
|
||||||
|
*q++ = MakeContinuation(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
} else if (ch < 0x200000) {
|
||||||
|
*q++ = MakeLeading3(ch >> 18);
|
||||||
|
*q++ = MakeContinuation(ch >> 12);
|
||||||
|
*q++ = MakeContinuation(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
} else if (ch < 0x4000000) {
|
||||||
|
*q++ = MakeLeading4(ch >> 24);
|
||||||
|
*q++ = MakeContinuation(ch >> 18);
|
||||||
|
*q++ = MakeContinuation(ch >> 12);
|
||||||
|
*q++ = MakeContinuation(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
} else if (ch < 0x80000000) {
|
||||||
|
*q++ = MakeLeading5(ch >> 30);
|
||||||
|
*q++ = MakeContinuation(ch >> 24);
|
||||||
|
*q++ = MakeContinuation(ch >> 18);
|
||||||
|
*q++ = MakeContinuation(ch >> 12);
|
||||||
|
*q++ = MakeContinuation(ch >> 6);
|
||||||
|
*q++ = MakeContinuation(ch);
|
||||||
|
} else {
|
||||||
|
// error
|
||||||
|
}
|
||||||
|
|
||||||
|
return q;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t
|
||||||
|
LengthUTF8(const char *p)
|
||||||
|
{
|
||||||
|
/* this is a very naive implementation: it does not do any
|
||||||
|
verification, it just counts the bytes that are not a UTF-8
|
||||||
|
continuation */
|
||||||
|
|
||||||
|
size_t n = 0;
|
||||||
|
for (; *p != 0; ++p)
|
||||||
|
if (!IsContinuation(*p))
|
||||||
|
++n;
|
||||||
|
return n;
|
||||||
|
}
|
74
src/util/UTF8.hxx
Normal file
74
src/util/UTF8.hxx
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
|
||||||
|
* http://www.musicpd.org
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* - Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* - Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||||
|
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef UTF8_HXX
|
||||||
|
#define UTF8_HXX
|
||||||
|
|
||||||
|
#include "Compiler.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this a valid UTF-8 string?
|
||||||
|
*/
|
||||||
|
gcc_pure gcc_nonnull_all
|
||||||
|
bool
|
||||||
|
ValidateUTF8(const char *p);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the specified string from ISO-8859-1 to UTF-8.
|
||||||
|
*
|
||||||
|
* @return the UTF-8 version of the source string; may return #src if
|
||||||
|
* there are no non-ASCII characters; returns nullptr if the destination
|
||||||
|
* buffer is too small
|
||||||
|
*/
|
||||||
|
gcc_pure gcc_nonnull_all
|
||||||
|
const char *
|
||||||
|
Latin1ToUTF8(const char *src, char *buffer, size_t buffer_size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the specified Unicode character to UTF-8 and write it to
|
||||||
|
* the buffer. buffer must have a length of at least 6!
|
||||||
|
*
|
||||||
|
* @return a pointer to the buffer plus the added bytes(s)
|
||||||
|
*/
|
||||||
|
gcc_nonnull_all
|
||||||
|
char *
|
||||||
|
UnicodeToUTF8(unsigned ch, char *buffer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of characters in the string. This is different
|
||||||
|
* from strlen(), which counts the number of bytes.
|
||||||
|
*/
|
||||||
|
gcc_pure gcc_nonnull_all
|
||||||
|
size_t
|
||||||
|
LengthUTF8(const char *p);
|
||||||
|
|
||||||
|
#endif
|
Loading…
Reference in New Issue
Block a user