text_normalization: implement iteration mark flattening
This commit is contained in:
+119
-7
@@ -4,9 +4,105 @@ use crate::kana_transliteration::{
|
||||
transliterate_katakana_to_hiragana_script_only_in_place,
|
||||
};
|
||||
|
||||
/// Reports whether `c` is one of the iteration marks this module knows about:
/// the ideographic marks `々` / `〻`, the hiragana marks `ゝ` / `ゞ`, and the
/// katakana marks `ヽ` / `ヾ`.
fn is_iteration_mark(c: char) -> bool {
    const ITERATION_MARKS: [char; 6] = ['々', '〻', 'ゝ', 'ゞ', 'ヽ', 'ヾ'];

    ITERATION_MARKS.contains(&c)
}
|
||||
|
||||
/// Reports whether `c` lies in the inclusive range `ぁ` (U+3041) ..= `ゖ` (U+3096).
///
/// The hiragana iteration marks `ゝ` / `ゞ` sit outside that range and are
/// therefore not treated as hiragana here.
fn is_hiragana(c: char) -> bool {
    matches!(c, 'ぁ'..='ゖ')
}
|
||||
|
||||
fn strip_hiragana_voicing(hiragana: char) -> Option<char> {
|
||||
Some(match hiragana {
|
||||
'が' => 'か',
|
||||
'ぎ' => 'き',
|
||||
'ぐ' => 'く',
|
||||
'げ' => 'け',
|
||||
'ご' => 'こ',
|
||||
'ざ' => 'さ',
|
||||
'じ' => 'し',
|
||||
'ず' => 'す',
|
||||
'ぜ' => 'せ',
|
||||
'ぞ' => 'そ',
|
||||
'だ' => 'た',
|
||||
'ぢ' => 'ち',
|
||||
'づ' => 'つ',
|
||||
'で' => 'て',
|
||||
'ど' => 'と',
|
||||
'ば' | 'ぱ' => 'は',
|
||||
'び' | 'ぴ' => 'ひ',
|
||||
'ぶ' | 'ぷ' => 'ふ',
|
||||
'べ' | 'ぺ' => 'へ',
|
||||
'ぼ' | 'ぽ' => 'ほ',
|
||||
'ゔ' => 'う',
|
||||
c if is_hiragana(c) => c,
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Maps a plain hiragana character to its dakuten (voiced) counterpart.
///
/// Returns `None` when `hiragana` has no entry in the table below; that
/// includes characters that are already voiced.
fn add_hiragana_dakuten(hiragana: char) -> Option<char> {
    // (plain, voiced) pairs.
    const VOICED_PAIRS: [(char, char); 21] = [
        ('う', 'ゔ'),
        ('か', 'が'),
        ('き', 'ぎ'),
        ('く', 'ぐ'),
        ('け', 'げ'),
        ('こ', 'ご'),
        ('さ', 'ざ'),
        ('し', 'じ'),
        ('す', 'ず'),
        ('せ', 'ぜ'),
        ('そ', 'ぞ'),
        ('た', 'だ'),
        ('ち', 'ぢ'),
        ('つ', 'づ'),
        ('て', 'で'),
        ('と', 'ど'),
        ('は', 'ば'),
        ('ひ', 'び'),
        ('ふ', 'ぶ'),
        ('へ', 'べ'),
        ('ほ', 'ぼ'),
    ];

    VOICED_PAIRS
        .iter()
        .find(|&&(plain, _)| plain == hiragana)
        .map(|&(_, voiced)| voiced)
}
|
||||
|
||||
fn replace_iteration_marks(text: &mut String) {
|
||||
let mut byte_index = 0;
|
||||
let mut previous_char = None;
|
||||
|
||||
while byte_index < text.len() {
|
||||
let c = text[byte_index..].chars().next().unwrap();
|
||||
let replacement = match c {
|
||||
'々' | '〻' => previous_char.unwrap_or(c),
|
||||
'ゝ' => previous_char.and_then(strip_hiragana_voicing).unwrap_or(c),
|
||||
'ゞ' => previous_char
|
||||
.and_then(strip_hiragana_voicing)
|
||||
.and_then(|hiragana| add_hiragana_dakuten(hiragana).or(Some(hiragana)))
|
||||
.unwrap_or(c),
|
||||
_ => c,
|
||||
};
|
||||
|
||||
if replacement != c {
|
||||
let mut buffer = [0; 4];
|
||||
text.replace_range(
|
||||
byte_index..byte_index + c.len_utf8(),
|
||||
replacement.encode_utf8(&mut buffer),
|
||||
);
|
||||
}
|
||||
|
||||
if !is_iteration_mark(replacement) {
|
||||
previous_char = Some(replacement);
|
||||
}
|
||||
|
||||
byte_index += replacement.len_utf8();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_japanese_text(text: &str) -> String {
|
||||
let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
|
||||
transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
|
||||
replace_iteration_marks(&mut text);
|
||||
expand_hiragana_long_vowel_marks_in_place(&mut text);
|
||||
|
||||
transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
|
||||
@@ -51,7 +147,6 @@ mod tests {
|
||||
assert_eq!(normalize_japanese_text(input), expected);
|
||||
}
|
||||
|
||||
#[ignore = "FIX ME"]
|
||||
#[test]
|
||||
fn test_normalize_jp_replaces_iteration_mark_kanji() {
|
||||
let input = "佐々木";
|
||||
@@ -59,7 +154,13 @@ mod tests {
|
||||
assert_eq!(normalize_japanese_text(input), expected);
|
||||
}
|
||||
|
||||
#[ignore = "FIX ME"]
|
||||
#[test]
fn test_normalize_jp_replaces_iteration_mark_kanji_alternative_variant() {
    // `〻` must be flattened the same way as `々`: it repeats the preceding kanji.
    assert_eq!(normalize_japanese_text("佐〻木"), "佐佐木");
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_jp_replaces_iteration_mark_hiragana() {
|
||||
let input = "さゝき";
|
||||
@@ -67,7 +168,6 @@ mod tests {
|
||||
assert_eq!(normalize_japanese_text(input), expected);
|
||||
}
|
||||
|
||||
#[ignore = "FIX ME"]
|
||||
#[test]
|
||||
fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
|
||||
let input = "さゞき";
|
||||
@@ -75,22 +175,34 @@ mod tests {
|
||||
assert_eq!(normalize_japanese_text(input), expected);
|
||||
}
|
||||
|
||||
#[ignore = "FIX ME"]
|
||||
#[test]
fn test_normalize_jp_replaces_iteration_mark_katakana() {
    // The expected output is all hiragana: the katakana input is expected to
    // come back transliterated, with the mark expanded to the preceding kana.
    let input = "サゝキ";
    let expected = "ささき";
    assert_eq!(normalize_japanese_text(input), expected);
}
|
||||
|
||||
#[ignore = "FIX ME"]
|
||||
#[test]
fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
    // The expected output is all hiragana: the katakana input is expected to
    // come back transliterated, with the voiced mark expanded to the voiced
    // form of the preceding kana.
    let input = "サゞキ";
    let expected = "さざき";
    assert_eq!(normalize_japanese_text(input), expected);
}
|
||||
|
||||
#[test]
fn test_normalize_jp_replaces_long_vowel_after_iteration_mark() {
    // The voiced katakana mark `ヾ` should come out as `ざ`, and the long-vowel
    // mark `ー` that follows it should then come out as `あ`.
    assert_eq!(normalize_japanese_text("サヾーエ"), "さざあえ");
}
|
||||
|
||||
#[test]
fn test_replace_iteration_marks_handles_variable_width_replacements_in_place() {
    // `🙂` takes 4 bytes in UTF-8 while `々` takes 3, so expanding the mark
    // changes the byte length of the string.
    let mut buffer = String::from("🙂々");
    replace_iteration_marks(&mut buffer);
    assert_eq!(buffer, "🙂🙂");
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_jp_normalizes_fullwidth_digits() {
|
||||
let input = "1234567890";
|
||||
|
||||
Reference in New Issue
Block a user