lib/icu/Canonicalize: convert all punctuation to ASCII

To make tag matching easier.

Closes https://github.com/MusicPlayerDaemon/MPD/issues/1669
This commit is contained in:
Max Kellermann
2022-11-15 18:49:33 +01:00
parent d7f545721b
commit 53530bd1d5
5 changed files with 62 additions and 0 deletions

View File

@@ -26,15 +26,48 @@
#ifdef HAVE_ICU
#include "Normalize.hxx"
#include "Transliterator.hxx"
#include "Util.hxx"
#include "util/AllocatedArray.hxx"
#include "util/SpanCast.hxx"
#endif
#ifdef HAVE_ICU
/* bring the ""sv literal suffix into scope for std::string_view
   literals */
using std::string_view_literals::operator""sv;
/**
 * The global transliterator instance used by IcuCanonicalize().  It
 * is created by IcuCanonicalizeInit() and destroyed (and reset to
 * nullptr) by IcuCanonicalizeFinish().
 */
static IcuTransliterator *transliterator;
void
IcuCanonicalizeInit()
{
assert(transliterator == nullptr);
const auto id =
/* convert all punctuation to ASCII equivalents */
"[:Punctuation:] Latin-ASCII;"sv;
transliterator = new IcuTransliterator(ToStringView(std::span{UCharFromUTF8(id)}),
{});
}
void
IcuCanonicalizeFinish() noexcept
{
assert(transliterator != nullptr);
delete transliterator;
transliterator = nullptr;
}
#endif
AllocatedString
IcuCanonicalize(std::string_view src, bool fold_case) noexcept
try {
#ifdef HAVE_ICU
assert(transliterator != nullptr);
auto u = UCharFromUTF8(src);
if (u.data() == nullptr)
return {src};
@@ -45,6 +78,10 @@ try {
n != nullptr)
u = std::move(n);
if (auto t = transliterator->Transliterate(ToStringView(std::span{u}));
t != nullptr)
u = std::move(t);
return UCharToUTF8(ToStringView(std::span{u}));
#else
#error not implemented

View File

@@ -28,6 +28,15 @@
class AllocatedString;
/**
 * Initialize the global state needed by IcuCanonicalize(), i.e. the
 * transliterator which converts punctuation to its ASCII
 * equivalents.  Call this before the first IcuCanonicalize() call;
 * IcuInit() does this.
 *
 * Throws on error.
 */
void
IcuCanonicalizeInit();
/**
 * Release the global state created by IcuCanonicalizeInit().  After
 * this call, IcuCanonicalize() must not be used until
 * IcuCanonicalizeInit() has been called again.  IcuFinish() calls
 * this.
 */
void
IcuCanonicalizeFinish() noexcept;
/**
* Transform the given string to "canonical" form to allow fuzzy
* string comparisons. The full set of features (if ICU is being

View File

@@ -19,6 +19,7 @@
#include "Init.hxx"
#include "Collate.hxx"
#include "Canonicalize.hxx"
#include "util/RuntimeError.hxx"
#include <unicode/uclean.h>
@@ -33,11 +34,13 @@ IcuInit()
u_errorName(code));
IcuCollateInit();
IcuCanonicalizeInit();
}
void
IcuFinish() noexcept
{
IcuCanonicalizeFinish();
IcuCollateFinish();
u_cleanup();