lib/icu/Canonicalize: convert all punctuation to ASCII

To make tag matching easier.

Closes https://github.com/MusicPlayerDaemon/MPD/issues/1669
This commit is contained in:
Max Kellermann 2022-11-15 18:49:33 +01:00
parent d7f545721b
commit 53530bd1d5
5 changed files with 62 additions and 0 deletions

View File

@ -237,6 +237,7 @@ of:
The :command:`find` commands are case sensitive, while
:command:`search` and related commands ignore case. The latter also
applies `Unicode normalization <https://unicode.org/reports/tr15/>`__
and converts all punctuation to ASCII equivalents
if MPD was compiled with `ICU <https://icu.unicode.org/>`__ support.
Prior to MPD 0.21, the syntax looked like this::

View File

@ -26,15 +26,48 @@
#ifdef HAVE_ICU #ifdef HAVE_ICU
#include "Normalize.hxx" #include "Normalize.hxx"
#include "Transliterator.hxx"
#include "Util.hxx" #include "Util.hxx"
#include "util/AllocatedArray.hxx" #include "util/AllocatedArray.hxx"
#include "util/SpanCast.hxx" #include "util/SpanCast.hxx"
#endif #endif
#ifdef HAVE_ICU
using std::string_view_literals::operator""sv;
static IcuTransliterator *transliterator;
void
IcuCanonicalizeInit()
{
assert(transliterator == nullptr);
const auto id =
/* convert all punctuation to ASCII equivalents */
"[:Punctuation:] Latin-ASCII;"sv;
transliterator = new IcuTransliterator(ToStringView(std::span{UCharFromUTF8(id)}),
{});
}
void
IcuCanonicalizeFinish() noexcept
{
assert(transliterator != nullptr);
delete transliterator;
transliterator = nullptr;
}
#endif
AllocatedString AllocatedString
IcuCanonicalize(std::string_view src, bool fold_case) noexcept IcuCanonicalize(std::string_view src, bool fold_case) noexcept
try { try {
#ifdef HAVE_ICU #ifdef HAVE_ICU
assert(transliterator != nullptr);
auto u = UCharFromUTF8(src); auto u = UCharFromUTF8(src);
if (u.data() == nullptr) if (u.data() == nullptr)
return {src}; return {src};
@ -45,6 +78,10 @@ try {
n != nullptr) n != nullptr)
u = std::move(n); u = std::move(n);
if (auto t = transliterator->Transliterate(ToStringView(std::span{u}));
t != nullptr)
u = std::move(t);
return UCharToUTF8(ToStringView(std::span{u})); return UCharToUTF8(ToStringView(std::span{u}));
#else #else
#error not implemented #error not implemented

View File

@ -28,6 +28,15 @@
class AllocatedString; class AllocatedString;
/**
 * Create the global transliterator which IcuCanonicalize() uses to
 * convert all punctuation to ASCII equivalents.  Must be called
 * before IcuCanonicalize(), and only while no transliterator exists
 * (both are checked with assert()).
 *
 * Throws on error.
 */
void
IcuCanonicalizeInit();
/**
 * Destroy the global transliterator created by
 * IcuCanonicalizeInit().  IcuCanonicalizeInit() must have been
 * called before (checked with assert()).
 */
void
IcuCanonicalizeFinish() noexcept;
/** /**
* Transform the given string to "canonical" form to allow fuzzy * Transform the given string to "canonical" form to allow fuzzy
* string comparisons. The full set of features (if ICU is being * string comparisons. The full set of features (if ICU is being

View File

@ -19,6 +19,7 @@
#include "Init.hxx" #include "Init.hxx"
#include "Collate.hxx" #include "Collate.hxx"
#include "Canonicalize.hxx"
#include "util/RuntimeError.hxx" #include "util/RuntimeError.hxx"
#include <unicode/uclean.h> #include <unicode/uclean.h>
@ -33,11 +34,13 @@ IcuInit()
u_errorName(code)); u_errorName(code));
IcuCollateInit(); IcuCollateInit();
IcuCanonicalizeInit();
} }
void void
IcuFinish() noexcept IcuFinish() noexcept
{ {
IcuCanonicalizeFinish();
IcuCollateFinish(); IcuCollateFinish();
u_cleanup(); u_cleanup();

View File

@ -143,6 +143,18 @@ TEST_F(StringFilterTest, Normalize)
#endif #endif
#ifdef HAVE_ICU
/**
 * Verify that punctuation is converted to its ASCII equivalent
 * before matching: a filter for the plain ASCII apostrophe must
 * match the typographic apostrophe, but not unrelated punctuation.
 */
TEST_F(StringFilterTest, Transliterate)
{
	/* the filter value is the plain ASCII apostrophe (U+0027) */
	const StringFilter f{"'", true, false, false, false};

	/* U+2019 RIGHT SINGLE QUOTATION MARK (the typographic
	   apostrophe) is expected to be converted to the ASCII
	   apostrophe and thus to match */
	EXPECT_TRUE(f.Match("’"));

	/* a different punctuation character must still not match */
	EXPECT_FALSE(f.Match("\""));
}
#endif
TEST_F(StringFilterTest, FoldCase) TEST_F(StringFilterTest, FoldCase)
{ {
const StringFilter f{"nëedlé", true, false, false, false}; const StringFilter f{"nëedlé", true, false, false, false};