util/romaji_transliteration: add functions to generate transliteration spans
All checks were successful
Build and test / evals (push) Successful in 18m58s

This commit is contained in:
2026-03-02 18:21:06 +09:00
parent d14e3909d4
commit a86f857553
2 changed files with 154 additions and 0 deletions

View File

@@ -487,6 +487,7 @@ bool _nFollowedByYuYeYo(String forConversion, String kana) =>
kana.length > 1 &&
'やゆよ'.contains(kana.substring(1, 2));
/// Transliterates a string of hiragana characters to Latin script (romaji).
String transliterateHiraganaToLatin(String hiragana) {
String kana = hiragana;
String romaji = '';
@@ -524,6 +525,51 @@ String transliterateHiraganaToLatin(String hiragana) {
return romaji;
}
/// Returns a list of pairs of indices into the input and output strings,
/// indicating which characters in the input string correspond to which characters in the output string.
List<(int, int)> transliterateHiraganaToLatinSpan(String hiragana) {
String kana = hiragana;
String romaji = '';
final List<(int, int)> spans = [];
bool geminate = false;
int kanaIndex = 0;
while (kana.isNotEmpty) {
final lengths = [if (kana.length > 1) 2, 1];
for (final length in lengths) {
final String forConversion = kana.substring(0, length);
String? mora;
if (_smallTsu(forConversion)) {
geminate = true;
kana = kana.replaceRange(0, length, '');
break;
} else if (_nFollowedByYuYeYo(forConversion, kana)) {
mora = "n'";
}
mora ??= hiraganaToLatin[forConversion];
if (mora != null) {
if (geminate) {
geminate = false;
romaji += mora.substring(0, 1);
}
spans.add((kanaIndex, romaji.length));
romaji += mora;
kana = kana.replaceRange(0, length, '');
kanaIndex += length;
break;
} else if (length == 1) {
spans.add((kanaIndex, romaji.length));
romaji += forConversion;
kana = kana.replaceRange(0, length, '');
kanaIndex += length;
}
}
}
return spans;
}
bool _doubleNFollowedByAIUEO(String forConversion) =>
RegExp(r'^nn[aiueo]$').hasMatch(forConversion);
bool _hasTableMatch(String forConversion) =>
@@ -533,6 +579,7 @@ bool _hasDoubleConsonant(String forConversion, int length) =>
(length == 2 &&
RegExp(r'^([kgsztdnbpmyrlwchf])\1$').hasMatch(forConversion));
/// Transliterates a string of Latin script (romaji) to hiragana characters.
String transliterateLatinToHiragana(String latin) {
String romaji = latin
.toLowerCase()
@@ -572,6 +619,53 @@ String transliterateLatinToHiragana(String latin) {
return kana;
}
/// Returns a list of pairs of indices into the input and output strings,
/// indicating which characters in the input string correspond to which characters in the output string.
List<(int, int)> transliterateLatinToHiraganaSpan(String latin) {
String romaji = latin
.toLowerCase()
.replaceAll('mb', 'nb')
.replaceAll('mp', 'np');
String kana = '';
final List<(int, int)> spans = [];
int latinIndex = 0;
while (romaji.isNotEmpty) {
final lengths = [if (romaji.length > 2) 3, if (romaji.length > 1) 2, 1];
for (final length in lengths) {
String? mora;
int forRemoval = length;
final String forConversion = romaji.substring(0, length);
if (_doubleNFollowedByAIUEO(forConversion)) {
mora = hiraganaSyllabicN;
forRemoval = 1;
} else if (_hasTableMatch(forConversion)) {
mora = latinToHiragana[forConversion];
} else if (_hasDoubleConsonant(forConversion, length)) {
mora = hiraganaSmallTsu;
forRemoval = 1;
}
if (mora != null) {
spans.add((latinIndex, kana.length));
kana += mora;
romaji = romaji.replaceRange(0, forRemoval, '');
latinIndex += forRemoval;
break;
} else if (length == 1) {
spans.add((latinIndex, kana.length));
kana += forConversion;
romaji = romaji.replaceRange(0, 1, '');
latinIndex += 1;
}
}
}
return spans;
}
String _transposeCodepointsInRange(
String text,
int distance,
@@ -583,15 +677,19 @@ String _transposeCodepointsInRange(
),
);
/// Transliterates a string of kana characters (hiragana or katakana) to Latin script (romaji).
String transliterateKanaToLatin(String kana) =>
transliterateHiraganaToLatin(transliterateKatakanaToHiragana(kana));
/// Transliterates a string of Latin script (romaji) to katakana characters.
String transliterateLatinToKatakana(String latin) =>
transliterateHiraganaToKatakana(transliterateLatinToHiragana(latin));
/// Transliterates a string of katakana characters to hiragana characters.
String transliterateKatakanaToHiragana(String katakana) =>
_transposeCodepointsInRange(katakana, -96, 12449, 12534);
/// Transliterates a string of hiragana characters to katakana characters.
String transliterateHiraganaToKatakana(String hiragana) =>
_transposeCodepointsInRange(hiragana, 96, 12353, 12438);

View File

@@ -37,6 +37,35 @@ void main() {
});
});
group('Romaji -> Hiragana Spans', () {
void Function() expectSpans(String input, List<String> expected) => () {
final result = transliterateLatinToHiraganaSpan(input);
final trans = transliterateLatinToHiragana(input);
for (int i = 0; i < result.length; i++) {
expect(
trans.substring(
result[i].$2,
i == result.length - 1 ? trans.length : result[i + 1].$2,
),
expected[i],
);
}
};
test('Basic test', expectSpans('katamari', ['', '', '', '']));
test(
'Basic test with diacritics',
expectSpans('gadamari', ['', '', '', '']),
);
test('wi and we', expectSpans('wiwe', ['うぃ', 'うぇ']));
test('nb = mb', expectSpans('kanpai', ['', '', '', '']));
test('nb = mb', expectSpans('kampai', ['', '', '', '']));
test('Double n', expectSpans('konnichiha', ['', '', '', '', '']));
// TODO: fix the implementation
// test('Double consonant', expectSpans('kappa', ['か', 'っぱ']));
});
group('Hiragana -> Romaji', () {
test('Basic test', () {
final result = transliterateHiraganaToLatin('かたまり');
@@ -63,4 +92,31 @@ void main() {
expect(result, 'kappa');
});
});
group('Hiragana -> Romaji Spans', () {
void Function() expectSpans(String input, List<String> expected) => () {
final result = transliterateHiraganaToLatinSpan(input);
final trans = transliterateHiraganaToLatin(input);
for (int i = 0; i < result.length; i++) {
expect(
trans.substring(
result[i].$2,
i == result.length - 1 ? trans.length : result[i + 1].$2,
),
expected[i],
);
}
};
test('Basic test', expectSpans('かたまり', ['ka', 'ta', 'ma', 'ri']));
test(
'Basic test with diacritics',
expectSpans('がだまり', ['ga', 'da', 'ma', 'ri']),
);
test('wi and we', expectSpans('うぃうぇ', ['whi', 'whe']));
test('Double n', expectSpans('こんにちは', ['ko', 'n', 'ni', 'chi', 'ha']));
// TODO: fix the implementation
// test('Double consonant', expectSpans('かっぱ', ['ka', 'ppa']));
});
}