diff --git a/lib/util/romaji_transliteration.dart b/lib/util/romaji_transliteration.dart
index 6f56267..0070e2d 100644
--- a/lib/util/romaji_transliteration.dart
+++ b/lib/util/romaji_transliteration.dart
@@ -487,6 +487,7 @@ bool _nFollowedByYuYeYo(String forConversion, String kana) =>
     kana.length > 1 &&
     'やゆよ'.contains(kana.substring(1, 2));
 
+/// Transliterates a string of hiragana characters to Latin script (romaji).
 String transliterateHiraganaToLatin(String hiragana) {
   String kana = hiragana;
   String romaji = '';
@@ -524,6 +525,70 @@ String transliterateHiraganaToLatin(String hiragana) {
   return romaji;
 }
 
+/// Maps every transliterated unit of [hiragana] to its position in the romaji.
+///
+/// Returns one `(inputIndex, outputIndex)` record per unit, in order:
+/// `inputIndex` is where the unit starts in [hiragana] and `outputIndex` is
+/// where its transliteration starts in the string that this function builds
+/// alongside the spans. A unit ends where the next one starts.
+///
+/// A small tsu belongs to the unit of the mora it doubles, so `かっぱ` yields
+/// `[(0, 0), (1, 2)]` (`ka`, `ppa`). Characters without a table entry are
+/// copied through unchanged as units of their own.
+List<(int, int)> transliterateHiraganaToLatinSpan(String hiragana) {
+  String kana = hiragana;
+  String romaji = '';
+  final List<(int, int)> spans = [];
+  bool geminate = false;
+  // Input index of the small tsu whose doubled consonant is still pending.
+  int? geminateStart;
+  int kanaIndex = 0;
+
+  while (kana.isNotEmpty) {
+    final lengths = [if (kana.length > 1) 2, 1];
+    for (final length in lengths) {
+      final String forConversion = kana.substring(0, length);
+      String? mora;
+
+      if (_smallTsu(forConversion)) {
+        geminate = true;
+        geminateStart ??= kanaIndex;
+        kana = kana.replaceRange(0, length, '');
+        // The small tsu is consumed here, so the input index must advance too.
+        kanaIndex += length;
+        break;
+      } else if (_nFollowedByYuYeYo(forConversion, kana)) {
+        mora = "n'";
+      }
+      mora ??= hiraganaToLatin[forConversion];
+
+      if (mora != null) {
+        // Open the span before the doubled consonant is appended, so that the
+        // consonant and the small tsu that produced it land in this unit.
+        spans.add((geminateStart ?? kanaIndex, romaji.length));
+        geminateStart = null;
+        if (geminate) {
+          geminate = false;
+          romaji += mora.substring(0, 1);
+        }
+        romaji += mora;
+        kana = kana.replaceRange(0, length, '');
+        kanaIndex += length;
+        break;
+      } else if (length == 1) {
+        // Pass-through character: it gets a unit of its own, so a pending
+        // small tsu can no longer start the unit of the mora it will double.
+        geminateStart = null;
+        spans.add((kanaIndex, romaji.length));
+        romaji += forConversion;
+        kana = kana.replaceRange(0, length, '');
+        kanaIndex += length;
+      }
+    }
+  }
+  return spans;
+}
+
 bool _doubleNFollowedByAIUEO(String forConversion) =>
     RegExp(r'^nn[aiueo]$').hasMatch(forConversion);
 bool _hasTableMatch(String forConversion) =>
@@ -533,6 +598,7 @@ bool _hasDoubleConsonant(String forConversion, int length) =>
     (length == 2 &&
         RegExp(r'^([kgsztdnbpmyrlwchf])\1$').hasMatch(forConversion));
 
+/// Transliterates a string of Latin script (romaji) to hiragana characters.
 String transliterateLatinToHiragana(String latin) {
   String romaji = latin
       .toLowerCase()
@@ -572,6 +638,72 @@ String transliterateLatinToHiragana(String latin) {
   return kana;
 }
 
+/// Maps every transliterated unit of [latin] to its position in the kana.
+///
+/// Returns one `(inputIndex, outputIndex)` record per unit, in order:
+/// `inputIndex` is where the unit starts in the lower-cased [latin] and
+/// `outputIndex` is where its transliteration starts in the string that this
+/// function builds alongside the spans. A unit ends where the next one starts.
+///
+/// The small tsu written for a doubled consonant belongs to the unit of the
+/// mora that follows it, so `kappa` yields `[(0, 0), (2, 1)]` (`か`, `っぱ`).
+/// Characters without a table entry are copied through as units of their own.
+List<(int, int)> transliterateLatinToHiraganaSpan(String latin) {
+  String romaji = latin
+      .toLowerCase()
+      .replaceAll('mb', 'nb')
+      .replaceAll('mp', 'np');
+  String kana = '';
+  final List<(int, int)> spans = [];
+  // True while a small tsu has opened a unit that its mora has yet to finish.
+  bool geminate = false;
+  int latinIndex = 0;
+
+  while (romaji.isNotEmpty) {
+    final lengths = [if (romaji.length > 2) 3, if (romaji.length > 1) 2, 1];
+
+    for (final length in lengths) {
+      String? mora;
+      int forRemoval = length;
+      bool doubledConsonant = false;
+      final String forConversion = romaji.substring(0, length);
+
+      if (_doubleNFollowedByAIUEO(forConversion)) {
+        mora = hiraganaSyllabicN;
+        forRemoval = 1;
+      } else if (_hasTableMatch(forConversion)) {
+        mora = latinToHiragana[forConversion];
+      } else if (_hasDoubleConsonant(forConversion, length)) {
+        mora = hiraganaSmallTsu;
+        forRemoval = 1;
+        doubledConsonant = true;
+      }
+
+      if (mora != null) {
+        // A mora that follows a small tsu continues the unit the small tsu
+        // opened instead of starting a new one.
+        if (!geminate) {
+          spans.add((latinIndex, kana.length));
+        }
+        geminate = doubledConsonant;
+        kana += mora;
+        romaji = romaji.replaceRange(0, forRemoval, '');
+        latinIndex += forRemoval;
+        break;
+      } else if (length == 1) {
+        // Pass-through character: always a unit of its own.
+        geminate = false;
+        spans.add((latinIndex, kana.length));
+        kana += forConversion;
+        romaji = romaji.replaceRange(0, 1, '');
+        latinIndex += 1;
+      }
+    }
+  }
+
+  return spans;
+}
+
 String _transposeCodepointsInRange(
   String text,
   int distance,
@@ -583,15 +715,19 @@ String _transposeCodepointsInRange(
   ),
 );
 
+/// Transliterates a string of kana characters (hiragana or katakana) to Latin script (romaji).
 String transliterateKanaToLatin(String kana) =>
     transliterateHiraganaToLatin(transliterateKatakanaToHiragana(kana));
 
+/// Transliterates a string of Latin script (romaji) to katakana characters.
 String transliterateLatinToKatakana(String latin) =>
     transliterateHiraganaToKatakana(transliterateLatinToHiragana(latin));
 
+/// Transliterates a string of katakana characters to hiragana characters.
 String transliterateKatakanaToHiragana(String katakana) =>
     _transposeCodepointsInRange(katakana, -96, 12449, 12534);
 
+/// Transliterates a string of hiragana characters to katakana characters.
 String transliterateHiraganaToKatakana(String hiragana) =>
     _transposeCodepointsInRange(hiragana, 96, 12353, 12438);
 
diff --git a/test/util/romaji_transliteration_test.dart b/test/util/romaji_transliteration_test.dart
index 0324876..7723d6d 100644
--- a/test/util/romaji_transliteration_test.dart
+++ b/test/util/romaji_transliteration_test.dart
@@ -37,6 +37,36 @@ void main() {
     });
   });
 
+  group('Romaji -> Hiragana Spans', () {
+    // Slices the transliteration at the reported output offsets and compares
+    // the pieces, and their number, with [expected].
+    void Function() expectSpans(String input, List<String> expected) => () {
+      final result = transliterateLatinToHiraganaSpan(input);
+      final trans = transliterateLatinToHiragana(input);
+      expect(result.length, expected.length);
+      for (int i = 0; i < result.length; i++) {
+        expect(
+          trans.substring(
+            result[i].$2,
+            i == result.length - 1 ? trans.length : result[i + 1].$2,
+          ),
+          expected[i],
+        );
+      }
+    };
+
+    test('Basic test', expectSpans('katamari', ['か', 'た', 'ま', 'り']));
+    test(
+      'Basic test with diacritics',
+      expectSpans('gadamari', ['が', 'だ', 'ま', 'り']),
+    );
+    test('wi and we', expectSpans('wiwe', ['うぃ', 'うぇ']));
+    test('np', expectSpans('kanpai', ['か', 'ん', 'ぱ', 'い']));
+    test('mp = np', expectSpans('kampai', ['か', 'ん', 'ぱ', 'い']));
+    test('Double n', expectSpans('konnichiha', ['こ', 'ん', 'に', 'ち', 'は']));
+    test('Double consonant', expectSpans('kappa', ['か', 'っぱ']));
+  });
+
   group('Hiragana -> Romaji', () {
     test('Basic test', () {
       final result = transliterateHiraganaToLatin('かたまり');
@@ -63,4 +93,32 @@ void main() {
       expect(result, 'kappa');
     });
   });
+
+  group('Hiragana -> Romaji Spans', () {
+    // Slices the transliteration at the reported output offsets and compares
+    // the pieces, and their number, with [expected].
+    void Function() expectSpans(String input, List<String> expected) => () {
+      final result = transliterateHiraganaToLatinSpan(input);
+      final trans = transliterateHiraganaToLatin(input);
+      expect(result.length, expected.length);
+      for (int i = 0; i < result.length; i++) {
+        expect(
+          trans.substring(
+            result[i].$2,
+            i == result.length - 1 ? trans.length : result[i + 1].$2,
+          ),
+          expected[i],
+        );
+      }
+    };
+
+    test('Basic test', expectSpans('かたまり', ['ka', 'ta', 'ma', 'ri']));
+    test(
+      'Basic test with diacritics',
+      expectSpans('がだまり', ['ga', 'da', 'ma', 'ri']),
+    );
+    test('wi and we', expectSpans('うぃうぇ', ['whi', 'whe']));
+    test('Double n', expectSpans('こんにちは', ['ko', 'n', 'ni', 'chi', 'ha']));
+    test('Double consonant', expectSpans('かっぱ', ['ka', 'ppa']));
+  });
 }