From d5cf41e0431db4fd7d28848899015d8b9225eb45 Mon Sep 17 00:00:00 2001
From: Max Kellermann
Date: Fri, 10 Oct 2014 20:42:32 +0200
Subject: [PATCH] util/UTF8: new library

---
 Makefile.am       |   1 +
 src/util/UTF8.cxx | 265 ++++++++++++++++++++++++++++++++++++++++++++++
 src/util/UTF8.hxx |  74 +++++++++++++
 3 files changed, 340 insertions(+)
 create mode 100644 src/util/UTF8.cxx
 create mode 100644 src/util/UTF8.hxx

diff --git a/Makefile.am b/Makefile.am
index e263f9c61..96c387f21 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -366,6 +366,7 @@ libutil_a_SOURCES = \
 	src/util/Domain.hxx \
 	src/util/ReusableArray.hxx \
 	src/util/ASCII.hxx \
+	src/util/UTF8.cxx src/util/UTF8.hxx \
 	src/util/CharUtil.hxx \
 	src/util/NumberParser.hxx \
 	src/util/StringUtil.cxx src/util/StringUtil.hxx \
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx
new file mode 100644
index 000000000..273cbac1e
--- /dev/null
+++ b/src/util/UTF8.cxx
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2011-2014 Max Kellermann
+ * http://www.musicpd.org
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "UTF8.hxx"
+#include "CharUtil.hxx"
+
+#include <algorithm>
+
+/**
+ * Is this a leading byte that is followed by 1 continuation byte?
+ */
+static constexpr bool
+IsLeading1(unsigned char ch)
+{
+	return (ch & 0xe0) == 0xc0;
+}
+
+static constexpr unsigned char
+MakeLeading1(unsigned char value)
+{
+	return 0xc0 | value;
+}
+
+/**
+ * Is this a leading byte that is followed by 2 continuation bytes?
+ */
+static constexpr bool
+IsLeading2(unsigned char ch)
+{
+	return (ch & 0xf0) == 0xe0;
+}
+
+static constexpr unsigned char
+MakeLeading2(unsigned char value)
+{
+	return 0xe0 | value;
+}
+
+/**
+ * Is this a leading byte that is followed by 3 continuation bytes?
+ */
+static constexpr bool
+IsLeading3(unsigned char ch)
+{
+	return (ch & 0xf8) == 0xf0;
+}
+
+static constexpr unsigned char
+MakeLeading3(unsigned char value)
+{
+	return 0xf0 | value;
+}
+
+/**
+ * Is this a leading byte that is followed by 4 continuation bytes?
+ */
+static constexpr bool
+IsLeading4(unsigned char ch)
+{
+	return (ch & 0xfc) == 0xf8;
+}
+
+static constexpr unsigned char
+MakeLeading4(unsigned char value)
+{
+	return 0xf8 | value;
+}
+
+/**
+ * Is this a leading byte that is followed by 5 continuation bytes?
+ */
+static constexpr bool
+IsLeading5(unsigned char ch)
+{
+	return (ch & 0xfe) == 0xfc;
+}
+
+static constexpr unsigned char
+MakeLeading5(unsigned char value)
+{
+	return 0xfc | value;
+}
+
+static constexpr bool
+IsContinuation(unsigned char ch)
+{
+	return (ch & 0xc0) == 0x80;
+}
+
+/**
+ * Generate a continuation byte from the low 6 bits of the value.
+ */
+static constexpr unsigned char
+MakeContinuation(unsigned char value)
+{
+	return 0x80 | (value & 0x3f);
+}
+
+bool
+ValidateUTF8(const char *p)
+{
+	for (; *p != 0; ++p) {
+		unsigned char ch = *p;
+		if (IsASCII(ch))
+			continue;
+
+		if (IsContinuation(ch))
+			/* continuation without a prefix */
+			return false;
+
+		if (IsLeading1(ch)) {
+			/* 1 continuation */
+			if (!IsContinuation(*++p))
+				return false;
+		} else if (IsLeading2(ch)) {
+			/* 2 continuations */
+			if (!IsContinuation(*++p) || !IsContinuation(*++p))
+				return false;
+		} else if (IsLeading3(ch)) {
+			/* 3 continuations */
+			if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
+			    !IsContinuation(*++p))
+				return false;
+		} else if (IsLeading4(ch)) {
+			/* 4 continuations */
+			if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
+			    !IsContinuation(*++p) || !IsContinuation(*++p))
+				return false;
+		} else if (IsLeading5(ch)) {
+			/* 5 continuations */
+			if (!IsContinuation(*++p) || !IsContinuation(*++p) ||
+			    !IsContinuation(*++p) || !IsContinuation(*++p) ||
+			    !IsContinuation(*++p))
+				return false;
+		} else
+			return false;
+	}
+
+	return true;
+}
+
+static const char *
+FindNonASCIIOrZero(const char *p)
+{
+	while (*p != 0 && IsASCII(*p))
+		++p;
+	return p;
+}
+
+const char *
+Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer,
+	     size_t buffer_size)
+{
+	const char *p = FindNonASCIIOrZero(src);
+	if (*p == 0)
+		/* everything is plain ASCII, we don't need to convert anything */
+		return src;
+
+	if ((size_t)(p - src) >= buffer_size)
+		/* buffer too small */
+		return nullptr;
+
+	const char *const end = buffer + buffer_size;
+	char *q = std::copy(src, p, buffer);
+
+	while (*p != 0) {
+		unsigned char ch = *p++;
+
+		if (IsASCII(ch)) {
+			*q++ = ch;
+
+			if (q >= end)
+				/* buffer too small */
+				return nullptr;
+		} else {
+			if (q + 2 >= end)
+				/* buffer too small */
+				return nullptr;
+
+			*q++ = MakeLeading1(ch >> 6);
+			*q++ = MakeContinuation(ch);
+		}
+	}
+
+	*q = 0;
+	return buffer;
+}
+
+char *
+UnicodeToUTF8(unsigned ch, char *q)
+{
+	if (gcc_likely(ch < 0x80)) {
+		*q++ = (char)ch;
+	} else if (gcc_likely(ch < 0x800)) {
+		*q++ = MakeLeading1(ch >> 6);
+		*q++ = MakeContinuation(ch);
+	} else if (ch < 0x10000) {
+		*q++ = MakeLeading2(ch >> 12);
+		*q++ = MakeContinuation(ch >> 6);
+		*q++ = MakeContinuation(ch);
+	} else if (ch < 0x200000) {
+		*q++ = MakeLeading3(ch >> 18);
+		*q++ = MakeContinuation(ch >> 12);
+		*q++ = MakeContinuation(ch >> 6);
+		*q++ = MakeContinuation(ch);
+	} else if (ch < 0x4000000) {
+		*q++ = MakeLeading4(ch >> 24);
+		*q++ = MakeContinuation(ch >> 18);
+		*q++ = MakeContinuation(ch >> 12);
+		*q++ = MakeContinuation(ch >> 6);
+		*q++ = MakeContinuation(ch);
+	} else if (ch < 0x80000000) {
+		*q++ = MakeLeading5(ch >> 30);
+		*q++ = MakeContinuation(ch >> 24);
+		*q++ = MakeContinuation(ch >> 18);
+		*q++ = MakeContinuation(ch >> 12);
+		*q++ = MakeContinuation(ch >> 6);
+		*q++ = MakeContinuation(ch);
+	} else {
+		// error: value needs more than 31 bits; nothing is written
+	}
+
+	return q;
+}
+
+size_t
+LengthUTF8(const char *p)
+{
+	/* this is a very naive implementation: it does not do any
+	   verification, it just counts the bytes that are not a UTF-8
+	   continuation */
+
+	size_t n = 0;
+	for (; *p != 0; ++p)
+		if (!IsContinuation(*p))
+			++n;
+	return n;
+}
diff --git a/src/util/UTF8.hxx b/src/util/UTF8.hxx
new file mode 100644
index 000000000..d3d694f6b
--- /dev/null
+++ b/src/util/UTF8.hxx
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2011-2014 Max Kellermann
+ * http://www.musicpd.org
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef UTF8_HXX
+#define UTF8_HXX
+
+#include "Compiler.h"
+
+#include <stddef.h>
+
+/**
+ * Is this a valid UTF-8 string?
+ */
+gcc_pure gcc_nonnull_all
+bool
+ValidateUTF8(const char *p);
+
+/**
+ * Convert the specified string from ISO-8859-1 to UTF-8.
+ *
+ * @return the UTF-8 version of the source string; may return #src if
+ * there are no non-ASCII characters; returns nullptr if the destination
+ * buffer is too small
+ */
+gcc_nonnull_all
+const char *
+Latin1ToUTF8(const char *src, char *buffer, size_t buffer_size);
+
+/**
+ * Convert the specified Unicode character to UTF-8 and write it to
+ * the buffer. buffer must have a length of at least 6!
+ *
+ * @return a pointer to the buffer plus the added byte(s)
+ */
+gcc_nonnull_all
+char *
+UnicodeToUTF8(unsigned ch, char *buffer);
+
+/**
+ * Returns the number of characters in the string. This is different
+ * from strlen(), which counts the number of bytes.
+ */
+gcc_pure gcc_nonnull_all
+size_t
+LengthUTF8(const char *p);
+
+#endif