lib/icu/Canonicalize: convert all punctuation to ASCII
To make tag matching easier. Closes https://github.com/MusicPlayerDaemon/MPD/issues/1669
This commit is contained in:
parent
d7f545721b
commit
53530bd1d5
|
@ -237,6 +237,7 @@ of:
|
||||||
The :command:`find` commands are case sensitive, while
|
The :command:`find` commands are case sensitive, while
|
||||||
:command:`search` and related commands ignore case. The latter also
|
:command:`search` and related commands ignore case. The latter also
|
||||||
applies `Unicode normalization <https://unicode.org/reports/tr15/>`__
|
applies `Unicode normalization <https://unicode.org/reports/tr15/>`__
|
||||||
|
and converts all punctuation to ASCII equivalents
|
||||||
if MPD was compiled with `ICU <https://icu.unicode.org/>`__ support.
|
if MPD was compiled with `ICU <https://icu.unicode.org/>`__ support.
|
||||||
|
|
||||||
Prior to MPD 0.21, the syntax looked like this::
|
Prior to MPD 0.21, the syntax looked like this::
|
||||||
|
|
|
@ -26,15 +26,48 @@
|
||||||
|
|
||||||
#ifdef HAVE_ICU
|
#ifdef HAVE_ICU
|
||||||
#include "Normalize.hxx"
|
#include "Normalize.hxx"
|
||||||
|
#include "Transliterator.hxx"
|
||||||
#include "Util.hxx"
|
#include "Util.hxx"
|
||||||
#include "util/AllocatedArray.hxx"
|
#include "util/AllocatedArray.hxx"
|
||||||
#include "util/SpanCast.hxx"
|
#include "util/SpanCast.hxx"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_ICU
|
||||||
|
|
||||||
|
using std::string_view_literals::operator""sv;
|
||||||
|
|
||||||
|
/**
 * The transliterator instance shared by all IcuCanonicalize() calls
 * in this file.  It is allocated by IcuCanonicalizeInit(), released
 * (and reset to nullptr) by IcuCanonicalizeFinish().
 */
static IcuTransliterator *transliterator;
|
||||||
|
|
||||||
|
void
|
||||||
|
IcuCanonicalizeInit()
|
||||||
|
{
|
||||||
|
assert(transliterator == nullptr);
|
||||||
|
|
||||||
|
const auto id =
|
||||||
|
/* convert all punctuation to ASCII equivalents */
|
||||||
|
"[:Punctuation:] Latin-ASCII;"sv;
|
||||||
|
|
||||||
|
transliterator = new IcuTransliterator(ToStringView(std::span{UCharFromUTF8(id)}),
|
||||||
|
{});
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
IcuCanonicalizeFinish() noexcept
|
||||||
|
{
|
||||||
|
assert(transliterator != nullptr);
|
||||||
|
|
||||||
|
delete transliterator;
|
||||||
|
transliterator = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
AllocatedString
|
AllocatedString
|
||||||
IcuCanonicalize(std::string_view src, bool fold_case) noexcept
|
IcuCanonicalize(std::string_view src, bool fold_case) noexcept
|
||||||
try {
|
try {
|
||||||
#ifdef HAVE_ICU
|
#ifdef HAVE_ICU
|
||||||
|
assert(transliterator != nullptr);
|
||||||
|
|
||||||
auto u = UCharFromUTF8(src);
|
auto u = UCharFromUTF8(src);
|
||||||
if (u.data() == nullptr)
|
if (u.data() == nullptr)
|
||||||
return {src};
|
return {src};
|
||||||
|
@ -45,6 +78,10 @@ try {
|
||||||
n != nullptr)
|
n != nullptr)
|
||||||
u = std::move(n);
|
u = std::move(n);
|
||||||
|
|
||||||
|
if (auto t = transliterator->Transliterate(ToStringView(std::span{u}));
|
||||||
|
t != nullptr)
|
||||||
|
u = std::move(t);
|
||||||
|
|
||||||
return UCharToUTF8(ToStringView(std::span{u}));
|
return UCharToUTF8(ToStringView(std::span{u}));
|
||||||
#else
|
#else
|
||||||
#error not implemented
|
#error not implemented
|
||||||
|
|
|
@ -28,6 +28,15 @@
|
||||||
|
|
||||||
class AllocatedString;
|
class AllocatedString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
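 * Create the transliterator that IcuCanonicalize() relies on; call
 * this before the first IcuCanonicalize() call.  Throws on error.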
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
IcuCanonicalizeInit();
|
||||||
|
|
||||||
|
void
|
||||||
|
IcuCanonicalizeFinish() noexcept;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Transform the given string to "canonical" form to allow fuzzy
|
* Transform the given string to "canonical" form to allow fuzzy
|
||||||
* string comparisons. The full set of features (if ICU is being
|
* string comparisons. The full set of features (if ICU is being
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include "Init.hxx"
|
#include "Init.hxx"
|
||||||
#include "Collate.hxx"
|
#include "Collate.hxx"
|
||||||
|
#include "Canonicalize.hxx"
|
||||||
#include "util/RuntimeError.hxx"
|
#include "util/RuntimeError.hxx"
|
||||||
|
|
||||||
#include <unicode/uclean.h>
|
#include <unicode/uclean.h>
|
||||||
|
@ -33,11 +34,13 @@ IcuInit()
|
||||||
u_errorName(code));
|
u_errorName(code));
|
||||||
|
|
||||||
IcuCollateInit();
|
IcuCollateInit();
|
||||||
|
IcuCanonicalizeInit();
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
IcuFinish() noexcept
|
IcuFinish() noexcept
|
||||||
{
|
{
|
||||||
|
IcuCanonicalizeFinish();
|
||||||
IcuCollateFinish();
|
IcuCollateFinish();
|
||||||
|
|
||||||
u_cleanup();
|
u_cleanup();
|
||||||
|
|
|
@ -143,6 +143,18 @@ TEST_F(StringFilterTest, Normalize)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_ICU
|
||||||
|
|
||||||
|
/**
 * Punctuation is expected to be converted to its ASCII equivalent
 * before matching.
 */
TEST_F(StringFilterTest, Transliterate)
{
	/* the needle is a plain ASCII apostrophe */
	const StringFilter apostrophe_filter{"'", true, false, false, false};

	/* a typographic apostrophe must be treated like the ASCII one */
	EXPECT_TRUE(apostrophe_filter.Match("’"));

	/* a double quote is a different punctuation character and
	   must not match */
	EXPECT_FALSE(apostrophe_filter.Match("\""));
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
TEST_F(StringFilterTest, FoldCase)
|
TEST_F(StringFilterTest, FoldCase)
|
||||||
{
|
{
|
||||||
const StringFilter f{"nëedlé", true, false, false, false};
|
const StringFilter f{"nëedlé", true, false, false, false};
|
||||||
|
|
Loading…
Reference in New Issue