util/romaji_transliteration: add functions to generate transliteration spans
All checks were successful
Build and test / evals (push) Successful in 18m58s
All checks were successful
Build and test / evals (push) Successful in 18m58s
This commit is contained in:
@@ -487,6 +487,7 @@ bool _nFollowedByYuYeYo(String forConversion, String kana) =>
|
||||
kana.length > 1 &&
|
||||
'やゆよ'.contains(kana.substring(1, 2));
|
||||
|
||||
/// Transliterates a string of hiragana characters to Latin script (romaji).
|
||||
String transliterateHiraganaToLatin(String hiragana) {
|
||||
String kana = hiragana;
|
||||
String romaji = '';
|
||||
@@ -524,6 +525,51 @@ String transliterateHiraganaToLatin(String hiragana) {
|
||||
return romaji;
|
||||
}
|
||||
|
||||
/// Returns the start positions of each transliterated unit of [hiragana].
///
/// Every record is `(inputIndex, outputIndex)`: `inputIndex` is the offset in
/// [hiragana] where a unit begins and `outputIndex` is the offset in the
/// generated romaji where the text produced for that unit begins. A unit ends
/// where the next record starts, or at the end of the string for the last one.
///
/// A small tsu is grouped with the mora it geminates, so the pair covers both
/// the small tsu and that mora on the input side and the doubled consonant
/// plus the mora on the output side (for example `っぱ` maps to `ppa`).
/// A small tsu that is never followed by a convertible mora produces no
/// record of its own.
///
/// Characters without an entry in the conversion table are copied through
/// unchanged and get their own record.
List<(int, int)> transliterateHiraganaToLatinSpan(String hiragana) {
  String kana = hiragana;
  String romaji = '';
  final List<(int, int)> spans = [];
  bool geminate = false;
  // Input offset of the small tsu that opened the pending gemination, so the
  // span of the geminated mora can start there instead of at the mora itself.
  int? geminateStart;
  int kanaIndex = 0;

  while (kana.isNotEmpty) {
    // Try the longest candidate first so two-character entries win.
    final lengths = [if (kana.length > 1) 2, 1];
    for (final length in lengths) {
      final String forConversion = kana.substring(0, length);
      String? mora;

      if (_smallTsu(forConversion)) {
        geminate = true;
        geminateStart ??= kanaIndex;
        kana = kana.replaceRange(0, length, '');
        // The small tsu is consumed from the input, so the running input
        // offset has to advance with it or every later span is shifted.
        kanaIndex += length;
        break;
      } else if (_nFollowedByYuYeYo(forConversion, kana)) {
        mora = "n'";
      }
      mora ??= hiraganaToLatin[forConversion];

      if (mora != null) {
        // Record the span before the doubled consonant is appended so the
        // output offset points at the first of the two consonants.
        spans.add((geminateStart ?? kanaIndex, romaji.length));
        if (geminate) {
          geminate = false;
          romaji += mora.substring(0, 1);
        }
        geminateStart = null;
        romaji += mora;
        kana = kana.replaceRange(0, length, '');
        kanaIndex += length;
        break;
      } else if (length == 1) {
        // Unknown character: pass it through verbatim. The pending
        // gemination flag is left untouched (as before), but its recorded
        // start is dropped so input offsets in `spans` stay increasing.
        spans.add((kanaIndex, romaji.length));
        geminateStart = null;
        romaji += forConversion;
        kana = kana.replaceRange(0, length, '');
        kanaIndex += length;
      }
    }
  }
  return spans;
}
|
||||
|
||||
/// Whether [forConversion] is exactly `nn` followed by one of the vowels
/// `a`, `i`, `u`, `e` or `o`.
bool _doubleNFollowedByAIUEO(String forConversion) {
  final doubleNThenVowel = RegExp(r'^nn[aiueo]$');
  return doubleNThenVowel.hasMatch(forConversion);
}
|
||||
bool _hasTableMatch(String forConversion) =>
|
||||
@@ -533,6 +579,7 @@ bool _hasDoubleConsonant(String forConversion, int length) =>
|
||||
(length == 2 &&
|
||||
RegExp(r'^([kgsztdnbpmyrlwchf])\1$').hasMatch(forConversion));
|
||||
|
||||
/// Transliterates a string of Latin script (romaji) to hiragana characters.
|
||||
String transliterateLatinToHiragana(String latin) {
|
||||
String romaji = latin
|
||||
.toLowerCase()
|
||||
@@ -572,6 +619,53 @@ String transliterateLatinToHiragana(String latin) {
|
||||
return kana;
|
||||
}
|
||||
|
||||
/// Returns the start positions of each transliterated unit of [latin].
///
/// Every record is `(inputIndex, outputIndex)`: `inputIndex` is the offset in
/// the normalised romaji where a unit begins and `outputIndex` is the offset
/// in the generated kana where the text produced for that unit begins. A unit
/// ends where the next record starts, or at the end of the string for the
/// last one.
///
/// The input is lower-cased and `mb` / `mp` are rewritten to `nb` / `np`
/// before conversion. Input offsets refer to that normalised string; this
/// assumes lower-casing preserves the string length, which holds for ASCII
/// romaji.
///
/// A doubled consonant produces a small tsu that is grouped with the unit
/// following it, so `kappa` yields one record for `か` and one for `っぱ`.
/// Characters that cannot be converted are copied through unchanged.
List<(int, int)> transliterateLatinToHiraganaSpan(String latin) {
  String romaji = latin
      .toLowerCase()
      .replaceAll('mb', 'nb')
      .replaceAll('mp', 'np');
  String kana = '';
  final List<(int, int)> spans = [];
  int latinIndex = 0;
  // True right after a small tsu was emitted for a doubled consonant: the
  // next unit belongs to the span that the small tsu already opened.
  bool sokuonPending = false;

  while (romaji.isNotEmpty) {
    // Try the longest candidate first so multi-letter syllables win.
    final lengths = [if (romaji.length > 2) 3, if (romaji.length > 1) 2, 1];

    for (final length in lengths) {
      String? mora;
      int forRemoval = length;
      bool isSokuon = false;
      final String forConversion = romaji.substring(0, length);

      if (_doubleNFollowedByAIUEO(forConversion)) {
        // Only the first `n` is consumed; the second one starts the next
        // syllable together with the vowel.
        mora = hiraganaSyllabicN;
        forRemoval = 1;
      } else if (_hasTableMatch(forConversion)) {
        mora = latinToHiragana[forConversion];
      } else if (_hasDoubleConsonant(forConversion, length)) {
        // Only the first consonant is consumed; the second one is converted
        // with the syllable that follows.
        mora = hiraganaSmallTsu;
        forRemoval = 1;
        isSokuon = true;
      }

      if (mora != null) {
        if (!sokuonPending) {
          spans.add((latinIndex, kana.length));
        }
        sokuonPending = isSokuon;
        kana += mora;
        romaji = romaji.replaceRange(0, forRemoval, '');
        latinIndex += forRemoval;
        break;
      } else if (length == 1) {
        // Unknown character: pass it through verbatim.
        if (!sokuonPending) {
          spans.add((latinIndex, kana.length));
        }
        sokuonPending = false;
        kana += forConversion;
        romaji = romaji.replaceRange(0, 1, '');
        latinIndex += 1;
      }
    }
  }

  return spans;
}
|
||||
|
||||
String _transposeCodepointsInRange(
|
||||
String text,
|
||||
int distance,
|
||||
@@ -583,15 +677,19 @@ String _transposeCodepointsInRange(
|
||||
),
|
||||
);
|
||||
|
||||
/// Converts [kana] (hiragana or katakana) to Latin script (romaji).
///
/// The input is first passed through [transliterateKatakanaToHiragana]; the
/// result is then handed to [transliterateHiraganaToLatin].
String transliterateKanaToLatin(String kana) {
  final hiragana = transliterateKatakanaToHiragana(kana);
  return transliterateHiraganaToLatin(hiragana);
}
|
||||
|
||||
/// Converts [latin] (romaji) to katakana.
///
/// The input is first converted with [transliterateLatinToHiragana]; the
/// result is then handed to [transliterateHiraganaToKatakana].
String transliterateLatinToKatakana(String latin) {
  final hiragana = transliterateLatinToHiragana(latin);
  return transliterateHiraganaToKatakana(hiragana);
}
|
||||
|
||||
/// Converts the katakana characters in [katakana] to hiragana.
///
/// Delegates to [_transposeCodepointsInRange] with a distance of -96 and the
/// code point bounds U+30A1 and U+30F6.
String transliterateKatakanaToHiragana(String katakana) {
  const distance = -96;
  const lowerBound = 0x30A1; // 12449
  const upperBound = 0x30F6; // 12534
  return _transposeCodepointsInRange(
    katakana,
    distance,
    lowerBound,
    upperBound,
  );
}
|
||||
|
||||
/// Converts the hiragana characters in [hiragana] to katakana.
///
/// Delegates to [_transposeCodepointsInRange] with a distance of 96 and the
/// code point bounds U+3041 and U+3096.
String transliterateHiraganaToKatakana(String hiragana) {
  const distance = 96;
  const lowerBound = 0x3041; // 12353
  const upperBound = 0x3096; // 12438
  return _transposeCodepointsInRange(
    hiragana,
    distance,
    lowerBound,
    upperBound,
  );
}
|
||||
|
||||
|
||||
@@ -37,6 +37,35 @@ void main() {
|
||||
});
|
||||
});
|
||||
|
||||
group('Romaji -> Hiragana Spans', () {
  // Builds a test body that slices the transliteration of [input] at the
  // output offsets reported by the span function and compares every slice
  // with the matching entry of [expected].
  void Function() expectSpans(String input, List<String> expected) => () {
    final result = transliterateLatinToHiraganaSpan(input);
    final trans = transliterateLatinToHiragana(input);

    // Without this check an empty or too-short span list would pass
    // vacuously, and a too-long one would fail with a RangeError instead
    // of a readable expectation failure.
    expect(result.length, expected.length);

    for (int i = 0; i < result.length; i++) {
      expect(
        trans.substring(
          result[i].$2,
          i == result.length - 1 ? trans.length : result[i + 1].$2,
        ),
        expected[i],
      );
    }
  };

  test('Basic test', expectSpans('katamari', ['か', 'た', 'ま', 'り']));
  test(
    'Basic test with diacritics',
    expectSpans('gadamari', ['が', 'だ', 'ま', 'り']),
  );
  test('wi and we', expectSpans('wiwe', ['うぃ', 'うぇ']));
  test('n before p', expectSpans('kanpai', ['か', 'ん', 'ぱ', 'い']));
  test('mp = np', expectSpans('kampai', ['か', 'ん', 'ぱ', 'い']));
  test('Double n', expectSpans('konnichiha', ['こ', 'ん', 'に', 'ち', 'は']));

  // TODO: fix the implementation
  // test('Double consonant', expectSpans('kappa', ['か', 'っぱ']));
});
|
||||
|
||||
group('Hiragana -> Romaji', () {
|
||||
test('Basic test', () {
|
||||
final result = transliterateHiraganaToLatin('かたまり');
|
||||
@@ -63,4 +92,31 @@ void main() {
|
||||
expect(result, 'kappa');
|
||||
});
|
||||
});
|
||||
|
||||
group('Hiragana -> Romaji Spans', () {
  // Builds a test body that slices the transliteration of [input] at the
  // output offsets reported by the span function and compares every slice
  // with the matching entry of [expected].
  void Function() expectSpans(String input, List<String> expected) => () {
    final result = transliterateHiraganaToLatinSpan(input);
    final trans = transliterateHiraganaToLatin(input);

    // Without this check an empty or too-short span list would pass
    // vacuously, and a too-long one would fail with a RangeError instead
    // of a readable expectation failure.
    expect(result.length, expected.length);

    for (int i = 0; i < result.length; i++) {
      expect(
        trans.substring(
          result[i].$2,
          i == result.length - 1 ? trans.length : result[i + 1].$2,
        ),
        expected[i],
      );
    }
  };

  test('Basic test', expectSpans('かたまり', ['ka', 'ta', 'ma', 'ri']));
  test(
    'Basic test with diacritics',
    expectSpans('がだまり', ['ga', 'da', 'ma', 'ri']),
  );
  test('wi and we', expectSpans('うぃうぇ', ['whi', 'whe']));
  test('Double n', expectSpans('こんにちは', ['ko', 'n', 'ni', 'chi', 'ha']));

  // TODO: fix the implementation
  // test('Double consonant', expectSpans('かっぱ', ['ka', 'ppa']));
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user