Files
jadb/lib/util/romaji_transliteration.dart
h7x4 a86f857553
All checks were successful
Build and test / evals (push) Successful in 18m58s
util/romaji_transliteration: add functions to generate transliteration spans
2026-03-02 18:23:36 +09:00

711 lines
15 KiB
Dart
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Source: https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
/// The moraic nasal "n" (ん), emitted for `nn` + vowel and given special
/// treatment before や/ゆ/よ when romanising.
const hiraganaSyllabicN = 'ん';

/// The small tsu (っ, sokuon), which doubles the consonant of the mora that
/// follows it.
const hiraganaSmallTsu = 'っ';
/// Romanisation table from hiragana to Latin script.
///
/// Keys are one or two hiragana characters; [transliterateHiraganaToLatin]
/// tries the two-character key before the one-character key, so digraphs such
/// as `きゃ` take precedence over their first character.
///
/// The small tsu (っ) is intentionally absent: gemination is handled by the
/// transliteration functions rather than by table lookup.
const Map<String, String> hiraganaToLatin = {
  'あ': 'a',
  'い': 'i',
  'う': 'u',
  'え': 'e',
  'お': 'o',
  'か': 'ka',
  'き': 'ki',
  'く': 'ku',
  'け': 'ke',
  'こ': 'ko',
  'が': 'ga',
  'ぎ': 'gi',
  'ぐ': 'gu',
  'げ': 'ge',
  'ご': 'go',
  'さ': 'sa',
  'し': 'shi',
  'す': 'su',
  'せ': 'se',
  'そ': 'so',
  'ざ': 'za',
  'じ': 'ji',
  'ず': 'zu',
  'ぜ': 'ze',
  'ぞ': 'zo',
  'た': 'ta',
  'ち': 'chi',
  'つ': 'tsu',
  'て': 'te',
  'と': 'to',
  'だ': 'da',
  'ぢ': 'ji',
  'づ': 'zu',
  'で': 'de',
  'ど': 'do',
  'な': 'na',
  'に': 'ni',
  'ぬ': 'nu',
  'ね': 'ne',
  'の': 'no',
  'は': 'ha',
  'ひ': 'hi',
  'ふ': 'fu',
  'へ': 'he',
  'ほ': 'ho',
  'ば': 'ba',
  'び': 'bi',
  'ぶ': 'bu',
  'べ': 'be',
  'ぼ': 'bo',
  'ぱ': 'pa',
  'ぴ': 'pi',
  'ぷ': 'pu',
  'ぺ': 'pe',
  'ぽ': 'po',
  'ま': 'ma',
  'み': 'mi',
  'む': 'mu',
  'め': 'me',
  'も': 'mo',
  'や': 'ya',
  'ゆ': 'yu',
  'よ': 'yo',
  'ら': 'ra',
  'り': 'ri',
  'る': 'ru',
  'れ': 're',
  'ろ': 'ro',
  'わ': 'wa',
  'うぃ': 'whi',
  'うぇ': 'whe',
  'を': 'wo',
  'ゑ': 'we',
  'ゐ': 'wi',
  // Prolonged sound mark; shared by hiragana and katakana text.
  'ー': '-',
  'ん': 'n',
  'きゃ': 'kya',
  'きゅ': 'kyu',
  'きょ': 'kyo',
  'きぇ': 'kye',
  'きぃ': 'kyi',
  'ぎゃ': 'gya',
  'ぎゅ': 'gyu',
  'ぎょ': 'gyo',
  'ぎぇ': 'gye',
  'ぎぃ': 'gyi',
  'くぁ': 'kwa',
  'くぃ': 'kwi',
  'くぅ': 'kwu',
  'くぇ': 'kwe',
  'くぉ': 'kwo',
  // Was 'qwa', inconsistent with the other ぐ+small-vowel entries below.
  'ぐぁ': 'gwa',
  'ぐぃ': 'gwi',
  'ぐぅ': 'gwu',
  'ぐぇ': 'gwe',
  'ぐぉ': 'gwo',
  'しゃ': 'sha',
  'しぃ': 'syi',
  'しゅ': 'shu',
  'しぇ': 'she',
  'しょ': 'sho',
  'じゃ': 'ja',
  'じゅ': 'ju',
  'じぇ': 'jye',
  'じょ': 'jo',
  'じぃ': 'jyi',
  'すぁ': 'swa',
  'すぃ': 'swi',
  'すぅ': 'swu',
  'すぇ': 'swe',
  'すぉ': 'swo',
  'ちゃ': 'cha',
  'ちゅ': 'chu',
  'ちぇ': 'tye',
  'ちょ': 'cho',
  'ちぃ': 'tyi',
  'ぢゃ': 'ja',
  'ぢぃ': 'dyi',
  'ぢゅ': 'ju',
  'ぢぇ': 'dye',
  'ぢょ': 'jo',
  'つぁ': 'tsa',
  'つぃ': 'tsi',
  'つぇ': 'tse',
  'つぉ': 'tso',
  'てゃ': 'tha',
  'てぃ': 'thi',
  'てゅ': 'thu',
  'てぇ': 'the',
  'てょ': 'tho',
  'とぁ': 'twa',
  'とぃ': 'twi',
  'とぅ': 'twu',
  'とぇ': 'twe',
  'とぉ': 'two',
  'でゃ': 'dha',
  'でぃ': 'dhi',
  'でゅ': 'dhu',
  'でぇ': 'dhe',
  'でょ': 'dho',
  'どぁ': 'dwa',
  'どぃ': 'dwi',
  'どぅ': 'dwu',
  'どぇ': 'dwe',
  'どぉ': 'dwo',
  'にゃ': 'nya',
  'にゅ': 'nyu',
  'にょ': 'nyo',
  'にぇ': 'nye',
  'にぃ': 'nyi',
  'ひゃ': 'hya',
  'ひぃ': 'hyi',
  'ひゅ': 'hyu',
  'ひぇ': 'hye',
  'ひょ': 'hyo',
  'びゃ': 'bya',
  'びぃ': 'byi',
  'びゅ': 'byu',
  'びぇ': 'bye',
  'びょ': 'byo',
  'ぴゃ': 'pya',
  'ぴぃ': 'pyi',
  'ぴゅ': 'pyu',
  'ぴぇ': 'pye',
  'ぴょ': 'pyo',
  'ふぁ': 'fwa',
  'ふぃ': 'fyi',
  'ふぇ': 'fye',
  'ふぉ': 'fwo',
  'ふぅ': 'fwu',
  'ふゃ': 'fya',
  'ふゅ': 'fyu',
  'ふょ': 'fyo',
  'みゃ': 'mya',
  'みぃ': 'myi',
  'みゅ': 'myu',
  'みぇ': 'mye',
  'みょ': 'myo',
  'りゃ': 'rya',
  'りぃ': 'ryi',
  'りゅ': 'ryu',
  'りぇ': 'rye',
  'りょ': 'ryo',
  'ゔぁ': 'va',
  'ゔぃ': 'vyi',
  'ゔ': 'vu',
  'ゔぇ': 'vye',
  'ゔぉ': 'vo',
  'ゔゃ': 'vya',
  'ゔゅ': 'vyu',
  'ゔょ': 'vyo',
  'うぁ': 'wha',
  'いぇ': 'ye',
  'うぉ': 'who',
  // Stand-alone small kana use the `x` prefix, mirroring [latinToHiragana].
  'ぁ': 'xa',
  'ぃ': 'xi',
  'ぅ': 'xu',
  'ぇ': 'xe',
  'ぉ': 'xo',
  'ゕ': 'xka',
  'ゖ': 'xke',
  'ゎ': 'xwa',
  // Hiragana digraph "yori" (U+309F).
  'ゟ': 'yori',
};
/// Conversion table from romaji to hiragana.
///
/// Keys are one to three Latin characters; [transliterateLatinToHiragana]
/// tries the longest candidate first. Several spellings map to the same kana
/// (for example `si`/`shi`, `tu`/`tsu`, `ra`/`la`) so that multiple
/// romanisation styles are accepted as input.
const Map<String, String> latinToHiragana = {
  'a': 'あ',
  'i': 'い',
  'u': 'う',
  'e': 'え',
  'o': 'お',
  'ka': 'か',
  'ki': 'き',
  'ku': 'く',
  'ke': 'け',
  'ko': 'こ',
  'ga': 'が',
  'gi': 'ぎ',
  'gu': 'ぐ',
  'ge': 'げ',
  'go': 'ご',
  'sa': 'さ',
  'si': 'し',
  'shi': 'し',
  'su': 'す',
  'se': 'せ',
  'so': 'そ',
  'za': 'ざ',
  'zi': 'じ',
  'ji': 'じ',
  'zu': 'ず',
  'ze': 'ぜ',
  'zo': 'ぞ',
  'ta': 'た',
  'ti': 'ち',
  'chi': 'ち',
  'tu': 'つ',
  'tsu': 'つ',
  'te': 'て',
  'to': 'と',
  'da': 'だ',
  'di': 'ぢ',
  'du': 'づ',
  'dzu': 'づ',
  'de': 'で',
  'do': 'ど',
  'na': 'な',
  'ni': 'に',
  'nu': 'ぬ',
  'ne': 'ね',
  'no': 'の',
  'ha': 'は',
  'hi': 'ひ',
  'hu': 'ふ',
  'fu': 'ふ',
  'he': 'へ',
  'ho': 'ほ',
  'ba': 'ば',
  'bi': 'び',
  'bu': 'ぶ',
  'be': 'べ',
  'bo': 'ぼ',
  'pa': 'ぱ',
  'pi': 'ぴ',
  'pu': 'ぷ',
  'pe': 'ぺ',
  'po': 'ぽ',
  'ma': 'ま',
  'mi': 'み',
  'mu': 'む',
  'me': 'め',
  'mo': 'も',
  'ya': 'や',
  'yu': 'ゆ',
  'yo': 'よ',
  'ra': 'ら',
  'ri': 'り',
  'ru': 'る',
  're': 'れ',
  'ro': 'ろ',
  'la': 'ら',
  'li': 'り',
  'lu': 'る',
  'le': 'れ',
  'lo': 'ろ',
  'wa': 'わ',
  'wi': 'うぃ',
  'we': 'うぇ',
  'wo': 'を',
  'wye': 'ゑ',
  'wyi': 'ゐ',
  '-': 'ー',
  'n': 'ん',
  'nn': 'ん',
  "n'": 'ん',
  'kya': 'きゃ',
  'kyu': 'きゅ',
  'kyo': 'きょ',
  'kye': 'きぇ',
  'kyi': 'きぃ',
  'gya': 'ぎゃ',
  'gyu': 'ぎゅ',
  'gyo': 'ぎょ',
  'gye': 'ぎぇ',
  'gyi': 'ぎぃ',
  'kwa': 'くぁ',
  'kwi': 'くぃ',
  'kwu': 'くぅ',
  'kwe': 'くぇ',
  'kwo': 'くぉ',
  'gwa': 'ぐぁ',
  'gwi': 'ぐぃ',
  'gwu': 'ぐぅ',
  'gwe': 'ぐぇ',
  'gwo': 'ぐぉ',
  'qwa': 'ぐぁ',
  'qwi': 'ぐぃ',
  'qwu': 'ぐぅ',
  'qwe': 'ぐぇ',
  'qwo': 'ぐぉ',
  'sya': 'しゃ',
  'syi': 'しぃ',
  'syu': 'しゅ',
  'sye': 'しぇ',
  'syo': 'しょ',
  'sha': 'しゃ',
  'shu': 'しゅ',
  'she': 'しぇ',
  'sho': 'しょ',
  'ja': 'じゃ',
  'ju': 'じゅ',
  'je': 'じぇ',
  'jo': 'じょ',
  'jya': 'じゃ',
  'jyi': 'じぃ',
  'jyu': 'じゅ',
  'jye': 'じぇ',
  'jyo': 'じょ',
  'zya': 'じゃ',
  'zyu': 'じゅ',
  'zyo': 'じょ',
  'zye': 'じぇ',
  'zyi': 'じぃ',
  'swa': 'すぁ',
  'swi': 'すぃ',
  'swu': 'すぅ',
  'swe': 'すぇ',
  'swo': 'すぉ',
  'cha': 'ちゃ',
  'chu': 'ちゅ',
  'che': 'ちぇ',
  'cho': 'ちょ',
  'cya': 'ちゃ',
  'cyi': 'ちぃ',
  'cyu': 'ちゅ',
  'cye': 'ちぇ',
  'cyo': 'ちょ',
  'tya': 'ちゃ',
  'tyi': 'ちぃ',
  'tyu': 'ちゅ',
  'tye': 'ちぇ',
  'tyo': 'ちょ',
  'dya': 'ぢゃ',
  'dyi': 'ぢぃ',
  'dyu': 'ぢゅ',
  'dye': 'ぢぇ',
  'dyo': 'ぢょ',
  'tsa': 'つぁ',
  'tsi': 'つぃ',
  'tse': 'つぇ',
  'tso': 'つぉ',
  'tha': 'てゃ',
  'thi': 'てぃ',
  'thu': 'てゅ',
  'the': 'てぇ',
  'tho': 'てょ',
  'twa': 'とぁ',
  'twi': 'とぃ',
  'twu': 'とぅ',
  'twe': 'とぇ',
  'two': 'とぉ',
  'dha': 'でゃ',
  'dhi': 'でぃ',
  'dhu': 'でゅ',
  'dhe': 'でぇ',
  'dho': 'でょ',
  'dwa': 'どぁ',
  'dwi': 'どぃ',
  'dwu': 'どぅ',
  'dwe': 'どぇ',
  'dwo': 'どぉ',
  'nya': 'にゃ',
  'nyu': 'にゅ',
  'nyo': 'にょ',
  'nye': 'にぇ',
  'nyi': 'にぃ',
  'hya': 'ひゃ',
  'hyi': 'ひぃ',
  'hyu': 'ひゅ',
  'hye': 'ひぇ',
  'hyo': 'ひょ',
  'bya': 'びゃ',
  'byi': 'びぃ',
  'byu': 'びゅ',
  'bye': 'びぇ',
  'byo': 'びょ',
  'pya': 'ぴゃ',
  'pyi': 'ぴぃ',
  'pyu': 'ぴゅ',
  'pye': 'ぴぇ',
  'pyo': 'ぴょ',
  'fa': 'ふぁ',
  'fi': 'ふぃ',
  'fe': 'ふぇ',
  'fo': 'ふぉ',
  'fwa': 'ふぁ',
  'fwi': 'ふぃ',
  'fwu': 'ふぅ',
  'fwe': 'ふぇ',
  'fwo': 'ふぉ',
  'fya': 'ふゃ',
  'fyi': 'ふぃ',
  'fyu': 'ふゅ',
  'fye': 'ふぇ',
  'fyo': 'ふょ',
  'mya': 'みゃ',
  'myi': 'みぃ',
  'myu': 'みゅ',
  'mye': 'みぇ',
  'myo': 'みょ',
  'rya': 'りゃ',
  'ryi': 'りぃ',
  'ryu': 'りゅ',
  'rye': 'りぇ',
  'ryo': 'りょ',
  'lya': 'りゃ',
  'lyu': 'りゅ',
  'lyo': 'りょ',
  'lye': 'りぇ',
  'lyi': 'りぃ',
  'va': 'ゔぁ',
  'vi': 'ゔぃ',
  'vu': 'ゔ',
  've': 'ゔぇ',
  'vo': 'ゔぉ',
  'vya': 'ゔゃ',
  'vyi': 'ゔぃ',
  'vyu': 'ゔゅ',
  'vye': 'ゔぇ',
  'vyo': 'ゔょ',
  'wha': 'うぁ',
  'whi': 'うぃ',
  'ye': 'いぇ',
  'whe': 'うぇ',
  'who': 'うぉ',
  // `x`-prefixed spellings produce stand-alone small kana.
  'xa': 'ぁ',
  'xi': 'ぃ',
  'xu': 'ぅ',
  'xe': 'ぇ',
  'xo': 'ぉ',
  'xya': 'ゃ',
  'xyu': 'ゅ',
  'xyo': 'ょ',
  'xtu': 'っ',
  'xtsu': 'っ',
  'xka': 'ゕ',
  'xke': 'ゖ',
  'xwa': 'ゎ',
  // NOTE(review): presumably meant to be the ideographic space (U+3000) rather
  // than an ASCII space - confirm before changing.
  '@@': ' ',
  // Japanese punctuation shortcuts.
  '#[': '「',
  '#]': '」',
  '#,': '、',
  '#.': '。',
  '#/': '・',
  // NOTE(review): the intended value is not recoverable here (wave dash '〜'
  // or full-width tilde '～' are both plausible) - TODO confirm and fill in.
  '#~': '',
};
/// Whether [forConversion] is exactly the small tsu ([hiraganaSmallTsu]).
bool _smallTsu(String forConversion) {
  return hiraganaSmallTsu == forConversion;
}
/// Whether [forConversion] is the syllabic n and the second character of
/// [kana] is one of や, ゆ or よ.
///
/// Despite the name, the characters checked are "ya", "yu" and "yo". Callers
/// use this to decide when to emit `n'` instead of a bare `n`.
bool _nFollowedByYuYeYo(String forConversion, String kana) {
  if (forConversion != hiraganaSyllabicN) return false;
  if (kana.length < 2) return false;
  final following = kana[1];
  return 'やゆよ'.contains(following);
}
/// Transliterates a string of hiragana characters to Latin script (romaji).
///
/// The input is scanned left to right. At each position a two-character
/// candidate is tried before a one-character one, so digraphs in
/// [hiraganaToLatin] win over their first character.
///
/// * A small tsu produces no output of its own; it doubles the first letter
///   of the next mora that is found in the table.
/// * A syllabic n directly before や/ゆ/よ is written as `n'`.
/// * Characters with no table entry are copied through unchanged.
String transliterateHiraganaToLatin(String hiragana) {
  // A cursor plus a buffer avoids re-copying the remaining input and the
  // accumulated output on every mora.
  final romaji = StringBuffer();
  var geminate = false;
  var position = 0;

  while (position < hiragana.length) {
    final windowEnd = position + 2 <= hiragana.length
        ? position + 2
        : hiragana.length;
    // At most two characters are ever inspected from the current position.
    final window = hiragana.substring(position, windowEnd);

    for (var length = window.length; length >= 1; length--) {
      final forConversion = window.substring(0, length);

      if (_smallTsu(forConversion)) {
        geminate = true;
        position += length;
        break;
      }

      final mora = _nFollowedByYuYeYo(forConversion, window)
          ? "n'"
          : hiraganaToLatin[forConversion];

      if (mora != null) {
        if (geminate) {
          geminate = false;
          // Double the leading consonant of the mora after a small tsu.
          romaji.write(mora.substring(0, 1));
        }
        romaji.write(mora);
        position += length;
        break;
      }

      if (length == 1) {
        // Unknown character: pass it through untouched.
        romaji.write(forConversion);
        position += 1;
      }
    }
  }

  return romaji.toString();
}
/// Returns a list of pairs of indices into the input and output strings,
/// indicating which characters in the input string correspond to which characters in the output string.
///
/// Each record is `(inputIndex, outputIndex)`: the UTF-16 offset in
/// [hiragana] where a unit starts, and the offset in the romaji produced by
/// [transliterateHiraganaToLatin] where its transliteration starts.
///
/// A small tsu gets no span of its own. It still advances the input index,
/// and the consonant it doubles sits immediately before the output offset of
/// the following mora's span.
List<(int, int)> transliterateHiraganaToLatinSpan(String hiragana) {
  final spans = <(int, int)>[];
  var geminate = false;
  var kanaIndex = 0;
  // Only the length of the output is needed, so the romaji is never built.
  var romajiLength = 0;

  while (kanaIndex < hiragana.length) {
    final windowEnd = kanaIndex + 2 <= hiragana.length
        ? kanaIndex + 2
        : hiragana.length;
    final window = hiragana.substring(kanaIndex, windowEnd);

    for (var length = window.length; length >= 1; length--) {
      final forConversion = window.substring(0, length);

      if (_smallTsu(forConversion)) {
        geminate = true;
        // The small tsu consumes input too; without this step every later
        // span would point one character too early.
        kanaIndex += length;
        break;
      }

      final mora = _nFollowedByYuYeYo(forConversion, window)
          ? "n'"
          : hiraganaToLatin[forConversion];

      if (mora != null) {
        if (geminate) {
          geminate = false;
          // Account for the doubled leading consonant.
          romajiLength += 1;
        }
        spans.add((kanaIndex, romajiLength));
        romajiLength += mora.length;
        kanaIndex += length;
        break;
      }

      if (length == 1) {
        // Unknown character: copied through, so it maps onto itself.
        spans.add((kanaIndex, romajiLength));
        romajiLength += forConversion.length;
        kanaIndex += 1;
      }
    }
  }

  return spans;
}
/// Whether [forConversion] is exactly `nn` followed by one of the vowels
/// `a`, `i`, `u`, `e` or `o` (three characters in total).
bool _doubleNFollowedByAIUEO(String forConversion) {
  if (forConversion.length != 3) return false;
  if (!forConversion.startsWith('nn')) return false;
  return 'aiueo'.contains(forConversion[2]);
}
/// Whether [forConversion] is a key of [latinToHiragana].
bool _hasTableMatch(String forConversion) {
  return latinToHiragana.containsKey(forConversion);
}
/// Whether [forConversion] starts a geminated (doubled) consonant.
///
/// True for the literal `tch`, or - when [length] is 2 - for two identical
/// letters drawn from `kgsztdnbpmyrlwchf`.
bool _hasDoubleConsonant(String forConversion, int length) {
  if (forConversion == 'tch') return true;
  if (length != 2 || forConversion.length != 2) return false;

  final first = forConversion[0];
  final second = forConversion[1];
  return first == second && 'kgsztdnbpmyrlwchf'.contains(first);
}
/// Transliterates a string of Latin script (romaji) to hiragana characters.
///
/// The input is lower-cased and `mb`/`mp` are rewritten to `nb`/`np` before
/// conversion. Scanning is left to right; at each position candidates of
/// three, two and then one character are tried in that order:
///
/// * `nn` + vowel emits a syllabic n and consumes only the first `n`;
/// * a [latinToHiragana] key emits its kana and consumes the whole key;
/// * a doubled consonant (or `tch`) emits a small tsu and consumes one
///   letter.
///
/// Characters that match none of these are copied through unchanged.
String transliterateLatinToHiragana(String latin) {
  final romaji = latin
      .toLowerCase()
      .replaceAll('mb', 'nb')
      .replaceAll('mp', 'np');

  // A cursor plus a buffer avoids re-copying the remaining input and the
  // accumulated output on every mora.
  final kana = StringBuffer();
  var position = 0;

  while (position < romaji.length) {
    final windowEnd = position + 3 <= romaji.length
        ? position + 3
        : romaji.length;
    final window = romaji.substring(position, windowEnd);

    for (var length = window.length; length >= 1; length--) {
      final forConversion = window.substring(0, length);
      String? mora;
      var forRemoval = length;

      if (_doubleNFollowedByAIUEO(forConversion)) {
        // Keep the second `n` so it can combine with the vowel next round.
        mora = hiraganaSyllabicN;
        forRemoval = 1;
      } else if (_hasTableMatch(forConversion)) {
        mora = latinToHiragana[forConversion];
      } else if (_hasDoubleConsonant(forConversion, length)) {
        // Consume only the first consonant; the rest starts the next mora.
        mora = hiraganaSmallTsu;
        forRemoval = 1;
      }

      if (mora != null) {
        kana.write(mora);
        position += forRemoval;
        break;
      }

      if (length == 1) {
        // Unknown character: pass it through untouched.
        kana.write(forConversion);
        position += 1;
      }
    }
  }

  return kana.toString();
}
/// Returns a list of pairs of indices into the input and output strings,
/// indicating which characters in the input string correspond to which characters in the output string.
///
/// Each record is `(inputIndex, outputIndex)`: the UTF-16 offset where a unit
/// starts in the normalised romaji, and the offset in the kana produced by
/// [transliterateLatinToHiragana] where its transliteration starts.
///
/// Input offsets are measured on the lower-cased, `mb`/`mp`-rewritten string.
// NOTE(review): this equals offsets into [latin] only if lower-casing keeps
// the string length unchanged - confirm for non-ASCII input.
List<(int, int)> transliterateLatinToHiraganaSpan(String latin) {
  final romaji = latin
      .toLowerCase()
      .replaceAll('mb', 'nb')
      .replaceAll('mp', 'np');

  final spans = <(int, int)>[];
  var latinIndex = 0;
  // Only the length of the output is needed, so the kana is never built.
  var kanaLength = 0;

  while (latinIndex < romaji.length) {
    final windowEnd = latinIndex + 3 <= romaji.length
        ? latinIndex + 3
        : romaji.length;
    final window = romaji.substring(latinIndex, windowEnd);

    for (var length = window.length; length >= 1; length--) {
      final forConversion = window.substring(0, length);
      String? mora;
      var forRemoval = length;

      if (_doubleNFollowedByAIUEO(forConversion)) {
        // Keep the second `n` so it can combine with the vowel next round.
        mora = hiraganaSyllabicN;
        forRemoval = 1;
      } else if (_hasTableMatch(forConversion)) {
        mora = latinToHiragana[forConversion];
      } else if (_hasDoubleConsonant(forConversion, length)) {
        // Consume only the first consonant; the rest starts the next mora.
        mora = hiraganaSmallTsu;
        forRemoval = 1;
      }

      if (mora != null) {
        spans.add((latinIndex, kanaLength));
        kanaLength += mora.length;
        latinIndex += forRemoval;
        break;
      }

      if (length == 1) {
        // Unknown character: copied through, so it maps onto itself.
        spans.add((latinIndex, kanaLength));
        kanaLength += forConversion.length;
        latinIndex += 1;
      }
    }
  }

  return spans;
}
/// Shifts every UTF-16 code unit of [text] that lies in the inclusive range
/// [rangeStart]..[rangeEnd] by [distance]; all other code units are kept.
///
/// The comparison is done per code unit (not per full code point).
String _transposeCodepointsInRange(
  String text,
  int distance,
  int rangeStart,
  int rangeEnd,
) {
  final shifted = [
    for (final unit in text.codeUnits)
      if (unit < rangeStart || unit > rangeEnd) unit else unit + distance,
  ];
  return String.fromCharCodes(shifted);
}
/// Transliterates kana (hiragana, katakana or a mix of both) to romaji.
///
/// Katakana is first folded to hiragana, then the result is romanised.
String transliterateKanaToLatin(String kana) {
  final asHiragana = transliterateKatakanaToHiragana(kana);
  return transliterateHiraganaToLatin(asHiragana);
}
/// Transliterates romaji to katakana.
///
/// The text is converted to hiragana first and then shifted to katakana.
String transliterateLatinToKatakana(String latin) {
  final asHiragana = transliterateLatinToHiragana(latin);
  return transliterateHiraganaToKatakana(asHiragana);
}
/// Converts katakana to hiragana, leaving every other character untouched.
///
/// Code units U+30A1..U+30F6 are moved down by 0x60.
String transliterateKatakanaToHiragana(String katakana) {
  const firstKatakana = 0x30A1;
  const lastKatakana = 0x30F6;
  const offsetToHiragana = -0x60;
  return _transposeCodepointsInRange(
    katakana,
    offsetToHiragana,
    firstKatakana,
    lastKatakana,
  );
}
/// Converts hiragana to katakana, leaving every other character untouched.
///
/// Code units U+3041..U+3096 are moved up by 0x60.
String transliterateHiraganaToKatakana(String hiragana) {
  const firstHiragana = 0x3041;
  const lastHiragana = 0x3096;
  const offsetToKatakana = 0x60;
  return _transposeCodepointsInRange(
    hiragana,
    offsetToKatakana,
    firstHiragana,
    lastHiragana,
  );
}
/// Converts full-width ASCII variants to their half-width (plain ASCII)
/// counterparts.
///
/// Code units U+FF01..U+FF5E become U+0021..U+007E, and the ideographic
/// space U+3000 becomes an ASCII space. Note that, despite its name, the
/// [halfwidth] parameter holds the full-width input text.
String transliterateFullwidthRomajiToHalfwidth(String halfwidth) {
  const ideographicSpace = 0x3000;
  final printable = _transposeCodepointsInRange(
    halfwidth,
    -0xFEE0,
    0xFF01,
    0xFF5E,
  );
  // The space lives outside the full-width block and needs its own offset.
  return _transposeCodepointsInRange(
    printable,
    -12256,
    ideographicSpace,
    ideographicSpace,
  );
}
/// Converts printable ASCII to its full-width variants.
///
/// Code units U+0021..U+007E become U+FF01..U+FF5E, and the ASCII space
/// becomes the ideographic space U+3000.
String transliterateHalfwidthRomajiToFullwidth(String halfwidth) {
  const asciiSpace = 0x20;
  final printable = _transposeCodepointsInRange(
    halfwidth,
    0xFEE0,
    0x21,
    0x7E,
  );
  // The space maps outside the full-width block and needs its own offset.
  return _transposeCodepointsInRange(
    printable,
    12256,
    asciiSpace,
    asciiSpace,
  );
}