util/UTF8: add SequenceLengthUTF8()

This commit is contained in:
Max Kellermann 2014-10-10 21:17:40 +02:00
parent d5cf41e043
commit b70bf938c2
2 changed files with 96 additions and 0 deletions

View File

@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
return true; return true;
} }
/**
 * Classify a single byte as the start of a UTF-8 sequence.
 *
 * Only the given byte is looked at; continuation bytes are not
 * available to this overload and therefore are not verified.
 *
 * @param ch the candidate start byte
 * @return the total number of bytes (start byte plus announced
 * continuations) of the sequence, or 0 if the byte cannot start a
 * sequence
 */
size_t
SequenceLengthUTF8(char ch)
{
	/* plain ASCII: the byte is a complete sequence by itself */
	if (IsASCII(ch))
		return 1;

	/* lead bytes: the result is 1 (for the lead byte) plus the
	   number of continuation bytes it announces */

	if (IsLeading1(ch))
		return 2;

	if (IsLeading2(ch))
		return 3;

	if (IsLeading3(ch))
		return 4;

	if (IsLeading4(ch))
		return 5;

	if (IsLeading5(ch))
		return 6;

	/* a stray continuation byte or some other byte which is not
	   allowed at the start of a sequence */
	return 0;
}
/**
 * Function object which checks that the L bytes starting at the
 * given pointer are all UTF-8 continuation bytes.  The check is
 * unrolled at compile time by recursing into CheckSequenceUTF8<L-1>.
 *
 * Evaluation stops at the first byte which is not a continuation, so
 * nothing beyond that byte is read.
 */
template<size_t L>
struct CheckSequenceUTF8 {
	gcc_pure
	bool operator()(const char *p) const {
		if (!IsContinuation(*p))
			return false;

		/* this byte is fine; L-1 more to go */
		return CheckSequenceUTF8<L-1>()(p + 1);
	}
};

/**
 * End of the recursion: zero remaining continuation bytes are
 * trivially valid; the pointer is not dereferenced.
 */
template<>
struct CheckSequenceUTF8<0u> {
	constexpr bool operator()(gcc_unused const char *p) const {
		return true;
	}
};
/**
 * Helper for SequenceLengthUTF8(const char *): verify the L
 * continuation bytes which follow an already-consumed lead byte.
 *
 * @param p pointer to the first byte after the lead byte
 * @return L + 1 (lead byte plus continuations) if all L bytes are
 * continuation bytes, 0 otherwise
 */
template<size_t L>
gcc_pure
static size_t
InnerSequenceLengthUTF8(const char *p)
{
	if (!CheckSequenceUTF8<L>()(p))
		return 0u;

	return L + 1;
}
/**
 * Determine the length of the first UTF-8 sequence in a string,
 * verifying the continuation bytes announced by its lead byte.
 *
 * @param p pointer to the first byte of the sequence
 * @return the total number of bytes of the sequence, or 0 if the
 * first byte is not a valid start byte or one of the expected
 * continuation bytes is missing
 */
size_t
SequenceLengthUTF8(const char *p)
{
	const unsigned char lead = *p;

	/* everything after the lead byte */
	const char *const rest = p + 1;

	/* plain ASCII needs no continuation bytes */
	if (IsASCII(lead))
		return 1;

	/* multi-byte sequences: check as many continuation bytes as
	   the lead byte announces */

	if (IsLeading1(lead))
		return InnerSequenceLengthUTF8<1>(rest);

	if (IsLeading2(lead))
		return InnerSequenceLengthUTF8<2>(rest);

	if (IsLeading3(lead))
		return InnerSequenceLengthUTF8<3>(rest);

	if (IsLeading4(lead))
		return InnerSequenceLengthUTF8<4>(rest);

	if (IsLeading5(lead))
		return InnerSequenceLengthUTF8<5>(rest);

	/* a stray continuation byte or some other byte which is not
	   allowed at the start of a sequence */
	return 0;
}
static const char * static const char *
FindNonASCIIOrZero(const char *p) FindNonASCIIOrZero(const char *p)
{ {

View File

@ -42,6 +42,22 @@ gcc_pure gcc_nonnull_all
bool bool
ValidateUTF8(const char *p); ValidateUTF8(const char *p);
/**
 * Determine the length of a UTF-8 sequence from its first byte
 * alone.  Continuation bytes are not examined by this overload.
 *
 * @param ch the first byte of the sequence
 * @return the number of bytes in the sequence beginning with the
 * given character (including that character), or 0 if the character
 * is not a valid start byte
 */
gcc_const
size_t
SequenceLengthUTF8(char ch);
/**
 * Determine the length of the first UTF-8 sequence in the given
 * string, checking that the start byte is followed by the expected
 * continuation bytes.
 *
 * @param p pointer to the first byte of the sequence
 * @return the number of bytes in the first sequence in the given
 * string, or 0 if the sequence is malformed
 */
gcc_pure
size_t
SequenceLengthUTF8(const char *p);
/** /**
* Convert the specified string from ISO-8859-1 to UTF-8. * Convert the specified string from ISO-8859-1 to UTF-8.
* *