lib/icu/Canonicalize: convert all punctuation to ASCII
This makes tag matching easier. Closes https://github.com/MusicPlayerDaemon/MPD/issues/1669
This commit is contained in:
parent
d7f545721b
commit
53530bd1d5
@ -237,6 +237,7 @@ of:
|
||||
The :command:`find` commands are case sensitive, while
|
||||
:command:`search` and related commands ignore case. The latter also
|
||||
apply `Unicode normalization <https://unicode.org/reports/tr15/>`__
|
||||
and convert all punctuation to ASCII equivalents
|
||||
if MPD was compiled with `ICU <https://icu.unicode.org/>`__ support.
|
||||
|
||||
Prior to MPD 0.21, the syntax looked like this::
|
||||
|
@ -26,15 +26,48 @@
|
||||
|
||||
#ifdef HAVE_ICU
|
||||
#include "Normalize.hxx"
|
||||
#include "Transliterator.hxx"
|
||||
#include "Util.hxx"
|
||||
#include "util/AllocatedArray.hxx"
|
||||
#include "util/SpanCast.hxx"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ICU
|
||||
|
||||
using std::string_view_literals::operator""sv;
|
||||
|
||||
/**
 * The transliterator instance shared by all IcuCanonicalize() calls.
 * It is allocated by IcuCanonicalizeInit() and freed (and reset to
 * nullptr) by IcuCanonicalizeFinish().
 */
static IcuTransliterator *transliterator;
|
||||
|
||||
void
|
||||
IcuCanonicalizeInit()
|
||||
{
|
||||
assert(transliterator == nullptr);
|
||||
|
||||
const auto id =
|
||||
/* convert all punctuation to ASCII equivalents */
|
||||
"[:Punctuation:] Latin-ASCII;"sv;
|
||||
|
||||
transliterator = new IcuTransliterator(ToStringView(std::span{UCharFromUTF8(id)}),
|
||||
{});
|
||||
}
|
||||
|
||||
void
|
||||
IcuCanonicalizeFinish() noexcept
|
||||
{
|
||||
assert(transliterator != nullptr);
|
||||
|
||||
delete transliterator;
|
||||
transliterator = nullptr;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
AllocatedString
|
||||
IcuCanonicalize(std::string_view src, bool fold_case) noexcept
|
||||
try {
|
||||
#ifdef HAVE_ICU
|
||||
assert(transliterator != nullptr);
|
||||
|
||||
auto u = UCharFromUTF8(src);
|
||||
if (u.data() == nullptr)
|
||||
return {src};
|
||||
@ -45,6 +78,10 @@ try {
|
||||
n != nullptr)
|
||||
u = std::move(n);
|
||||
|
||||
if (auto t = transliterator->Transliterate(ToStringView(std::span{u}));
|
||||
t != nullptr)
|
||||
u = std::move(t);
|
||||
|
||||
return UCharToUTF8(ToStringView(std::span{u}));
|
||||
#else
|
||||
#error not implemented
|
||||
|
@ -28,6 +28,15 @@
|
||||
|
||||
class AllocatedString;
|
||||
|
||||
/**
 * Prepare the global state required by IcuCanonicalize().  Call
 * IcuCanonicalizeFinish() to release it again.
 *
 * Throws on error.
 */
void
IcuCanonicalizeInit();

/**
 * Release the global state set up by IcuCanonicalizeInit().
 */
void
IcuCanonicalizeFinish() noexcept;
|
||||
|
||||
/**
|
||||
* Transform the given string to "canonical" form to allow fuzzy
|
||||
* string comparisons. The full set of features (if ICU is being
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
#include "Init.hxx"
|
||||
#include "Collate.hxx"
|
||||
#include "Canonicalize.hxx"
|
||||
#include "util/RuntimeError.hxx"
|
||||
|
||||
#include <unicode/uclean.h>
|
||||
@ -33,11 +34,13 @@ IcuInit()
|
||||
u_errorName(code));
|
||||
|
||||
IcuCollateInit();
|
||||
IcuCanonicalizeInit();
|
||||
}
|
||||
|
||||
void
|
||||
IcuFinish() noexcept
|
||||
{
|
||||
IcuCanonicalizeFinish();
|
||||
IcuCollateFinish();
|
||||
|
||||
u_cleanup();
|
||||
|
@ -143,6 +143,18 @@ TEST_F(StringFilterTest, Normalize)
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ICU
|
||||
|
||||
TEST_F(StringFilterTest, Transliterate)
{
	/* the filter value is a plain ASCII apostrophe */
	const StringFilter apostrophe_filter{"'", true, false, false, false};

	/* a typographic apostrophe (U+2019) is expected to match ... */
	EXPECT_TRUE(apostrophe_filter.Match("’"));

	/* ... but a different punctuation character must not */
	EXPECT_FALSE(apostrophe_filter.Match("\""));
}
|
||||
|
||||
#endif
|
||||
|
||||
TEST_F(StringFilterTest, FoldCase)
|
||||
{
|
||||
const StringFilter f{"nëedlé", true, false, false, false};
|
||||
|
Loading…
Reference in New Issue
Block a user