lib/icu/Canonicalize: convert all punctuation to ASCII
To make tag matching easier. Closes https://github.com/MusicPlayerDaemon/MPD/issues/1669
This commit is contained in:
		@@ -237,6 +237,7 @@ of:
 | 
			
		||||
The :command:`find` commands are case sensitive, while
 | 
			
		||||
:command:`search` and related commands ignore case.  The latter also
 | 
			
		||||
applies `Unicode normalization <https://unicode.org/reports/tr15/>`__
 | 
			
		||||
and converts all punctuation to ASCII equivalents
 | 
			
		||||
if MPD was compiled with `ICU <https://icu.unicode.org/>`__ support.
 | 
			
		||||
 | 
			
		||||
Prior to MPD 0.21, the syntax looked like this::
 | 
			
		||||
 
 | 
			
		||||
@@ -26,15 +26,48 @@
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_ICU
 | 
			
		||||
#include "Normalize.hxx"
 | 
			
		||||
#include "Transliterator.hxx"
 | 
			
		||||
#include "Util.hxx"
 | 
			
		||||
#include "util/AllocatedArray.hxx"
 | 
			
		||||
#include "util/SpanCast.hxx"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_ICU
 | 
			
		||||
 | 
			
		||||
using std::string_view_literals::operator""sv;
 | 
			
		||||
 | 
			
		||||
static IcuTransliterator *transliterator;
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
IcuCanonicalizeInit()
 | 
			
		||||
{
 | 
			
		||||
	assert(transliterator == nullptr);
 | 
			
		||||
 | 
			
		||||
	const auto id =
 | 
			
		||||
		/* convert all punctuation to ASCII equivalents */
 | 
			
		||||
		"[:Punctuation:] Latin-ASCII;"sv;
 | 
			
		||||
 | 
			
		||||
	transliterator = new IcuTransliterator(ToStringView(std::span{UCharFromUTF8(id)}),
 | 
			
		||||
					       {});
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
IcuCanonicalizeFinish() noexcept
 | 
			
		||||
{
 | 
			
		||||
	assert(transliterator != nullptr);
 | 
			
		||||
 | 
			
		||||
	delete transliterator;
 | 
			
		||||
	transliterator = nullptr;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
AllocatedString
 | 
			
		||||
IcuCanonicalize(std::string_view src, bool fold_case) noexcept
 | 
			
		||||
try {
 | 
			
		||||
#ifdef HAVE_ICU
 | 
			
		||||
	assert(transliterator != nullptr);
 | 
			
		||||
 | 
			
		||||
	auto u = UCharFromUTF8(src);
 | 
			
		||||
	if (u.data() == nullptr)
 | 
			
		||||
		return {src};
 | 
			
		||||
@@ -45,6 +78,10 @@ try {
 | 
			
		||||
	    n != nullptr)
 | 
			
		||||
		u = std::move(n);
 | 
			
		||||
 | 
			
		||||
	if (auto t = transliterator->Transliterate(ToStringView(std::span{u}));
 | 
			
		||||
	    t != nullptr)
 | 
			
		||||
		u = std::move(t);
 | 
			
		||||
 | 
			
		||||
	return UCharToUTF8(ToStringView(std::span{u}));
 | 
			
		||||
#else
 | 
			
		||||
#error not implemented
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,15 @@
 | 
			
		||||
 | 
			
		||||
class AllocatedString;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Throws on error.
 | 
			
		||||
 */
 | 
			
		||||
void
 | 
			
		||||
IcuCanonicalizeInit();
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
IcuCanonicalizeFinish() noexcept;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Transform the given string to "canonical" form to allow fuzzy
 | 
			
		||||
 * string comparisons.  The full set of features (if ICU is being
 | 
			
		||||
 
 | 
			
		||||
@@ -19,6 +19,7 @@
 | 
			
		||||
 | 
			
		||||
#include "Init.hxx"
 | 
			
		||||
#include "Collate.hxx"
 | 
			
		||||
#include "Canonicalize.hxx"
 | 
			
		||||
#include "util/RuntimeError.hxx"
 | 
			
		||||
 | 
			
		||||
#include <unicode/uclean.h>
 | 
			
		||||
@@ -33,11 +34,13 @@ IcuInit()
 | 
			
		||||
					 u_errorName(code));
 | 
			
		||||
 | 
			
		||||
	IcuCollateInit();
 | 
			
		||||
	IcuCanonicalizeInit();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
IcuFinish() noexcept
 | 
			
		||||
{
 | 
			
		||||
	IcuCanonicalizeFinish();
 | 
			
		||||
	IcuCollateFinish();
 | 
			
		||||
 | 
			
		||||
	u_cleanup();
 | 
			
		||||
 
 | 
			
		||||
@@ -143,6 +143,18 @@ TEST_F(StringFilterTest, Normalize)
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_ICU
 | 
			
		||||
 | 
			
		||||
/* A filter for the plain ASCII apostrophe must also match its
   typographic counterpart, but not an unrelated punctuation
   character. */
TEST_F(StringFilterTest, Transliterate)
{
	const StringFilter apostrophe_filter{"'", true, false, false, false};

	/* U+2019 (right single quotation mark) is expected to match */
	EXPECT_TRUE(apostrophe_filter.Match("’"));

	/* an ASCII double quote is a different character and must
	   not match */
	EXPECT_FALSE(apostrophe_filter.Match("\""));
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
TEST_F(StringFilterTest, FoldCase)
 | 
			
		||||
{
 | 
			
		||||
	const StringFilter f{"nëedlé", true, false, false, false};
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user