diff --git a/src/text_normalization.rs b/src/text_normalization.rs
index 228d6ae..a81aaf8 100644
--- a/src/text_normalization.rs
+++ b/src/text_normalization.rs
@@ -4,9 +4,132 @@ use crate::kana_transliteration::{
     transliterate_katakana_to_hiragana_script_only_in_place,
 };
 
+/// Returns `true` if `c` is one of the iteration marks this module recognizes:
+/// `々`, `〻`, `ゝ`, `ゞ`, `ヽ` or `ヾ`.
+fn is_iteration_mark(c: char) -> bool {
+    matches!(c, '々' | '〻' | 'ゝ' | 'ゞ' | 'ヽ' | 'ヾ')
+}
+
+/// Returns `true` if `c` lies in the inclusive range `'ぁ'..='ゖ'`.
+fn is_hiragana(c: char) -> bool {
+    ('ぁ'..='ゖ').contains(&c)
+}
+
+/// Maps a voiced or semi-voiced hiragana to its unvoiced base (`が` → `か`, `ぱ` → `は`).
+///
+/// Other characters accepted by [`is_hiragana`] are returned unchanged; anything outside
+/// that range yields `None`.
+fn strip_hiragana_voicing(hiragana: char) -> Option<char> {
+    Some(match hiragana {
+        'が' => 'か',
+        'ぎ' => 'き',
+        'ぐ' => 'く',
+        'げ' => 'け',
+        'ご' => 'こ',
+        'ざ' => 'さ',
+        'じ' => 'し',
+        'ず' => 'す',
+        'ぜ' => 'せ',
+        'ぞ' => 'そ',
+        'だ' => 'た',
+        'ぢ' => 'ち',
+        'づ' => 'つ',
+        'で' => 'て',
+        'ど' => 'と',
+        'ば' | 'ぱ' => 'は',
+        'び' | 'ぴ' => 'ひ',
+        'ぶ' | 'ぷ' => 'ふ',
+        'べ' | 'ぺ' => 'へ',
+        'ぼ' | 'ぽ' => 'ほ',
+        'ゔ' => 'う',
+        c if is_hiragana(c) => c,
+        _ => return None,
+    })
+}
+
+/// Maps an unvoiced hiragana to its dakuten form (`か` → `が`, `う` → `ゔ`).
+///
+/// Returns `None` for any character without an entry in the table.
+fn add_hiragana_dakuten(hiragana: char) -> Option<char> {
+    Some(match hiragana {
+        'う' => 'ゔ',
+        'か' => 'が',
+        'き' => 'ぎ',
+        'く' => 'ぐ',
+        'け' => 'げ',
+        'こ' => 'ご',
+        'さ' => 'ざ',
+        'し' => 'じ',
+        'す' => 'ず',
+        'せ' => 'ぜ',
+        'そ' => 'ぞ',
+        'た' => 'だ',
+        'ち' => 'ぢ',
+        'つ' => 'づ',
+        'て' => 'で',
+        'と' => 'ど',
+        'は' => 'ば',
+        'ひ' => 'び',
+        'ふ' => 'ぶ',
+        'へ' => 'べ',
+        'ほ' => 'ぼ',
+        _ => return None,
+    })
+}
+
+/// Expands iteration marks in `text` in place by repeating the preceding character.
+///
+/// "Preceding character" means the last character written that is not itself an
+/// iteration mark.
+///
+/// - `々` / `〻` become a copy of the preceding character, whatever it is.
+/// - `ゝ` becomes the preceding hiragana with any voicing removed.
+/// - `ゞ` becomes the preceding hiragana with a dakuten added when it has a voiced form,
+///   and the unvoiced hiragana otherwise.
+///
+/// A mark with nothing usable before it (no preceding character, or a non-hiragana one
+/// for `ゝ` / `ゞ`) is left as is. `ヽ` and `ヾ` have no arm here and are never rewritten
+/// by this function.
+fn replace_iteration_marks(text: &mut String) {
+    let mut byte_index = 0;
+    let mut previous_char = None;
+
+    while byte_index < text.len() {
+        // `byte_index` only ever advances by whole encoded chars, so a char starts here.
+        let c = text[byte_index..].chars().next().unwrap();
+        let replacement = match c {
+            '々' | '〻' => previous_char.unwrap_or(c),
+            'ゝ' => previous_char.and_then(strip_hiragana_voicing).unwrap_or(c),
+            'ゞ' => previous_char
+                .and_then(strip_hiragana_voicing)
+                .and_then(|hiragana| add_hiragana_dakuten(hiragana).or(Some(hiragana)))
+                .unwrap_or(c),
+            _ => c,
+        };
+
+        if replacement != c {
+            // `replace_range` copes with the two chars having different UTF-8 widths.
+            let mut buffer = [0; 4];
+            text.replace_range(
+                byte_index..byte_index + c.len_utf8(),
+                replacement.encode_utf8(&mut buffer),
+            );
+        }
+
+        // A mark that was left unexpanded must not become what a later mark repeats.
+        if !is_iteration_mark(replacement) {
+            previous_char = Some(replacement);
+        }
+
+        // `text` now holds `replacement` at `byte_index`; step over its encoded length.
+        byte_index += replacement.len_utf8();
+    }
+}
+
 pub fn normalize_japanese_text(text: &str) -> String {
     let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
     transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
+    replace_iteration_marks(&mut text);
     expand_hiragana_long_vowel_marks_in_place(&mut text);
 
     transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
@@ -51,7 +174,6 @@ mod tests {
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
-    #[ignore = "FIX ME"]
     #[test]
     fn test_normalize_jp_replaces_iteration_mark_kanji() {
         let input = "佐々木";
@@ -59,7 +181,13 @@ mod tests {
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
-    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_kanji_alternative_variant() {
+        let input = "佐〻木";
+        let expected = "佐佐木";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
     #[test]
     fn test_normalize_jp_replaces_iteration_mark_hiragana() {
         let input = "さゝき";
@@ -67,7 +195,6 @@ mod tests {
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
-    #[ignore = "FIX ME"]
     #[test]
     fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
         let input = "さゞき";
@@ -75,22 +202,34 @@ mod tests {
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
-    #[ignore = "FIX ME"]
     #[test]
     fn test_normalize_jp_replaces_iteration_mark_katakana() {
         let input = "サゝキ";
-        let expected = "ササキ";
+        let expected = "ささき";
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
-    #[ignore = "FIX ME"]
     #[test]
     fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
         let input = "サゞキ";
-        let expected = "サザキ";
+        let expected = "さざき";
         assert_eq!(normalize_japanese_text(input), expected);
     }
 
+    #[test]
+    fn test_normalize_jp_replaces_long_vowel_after_iteration_mark() {
+        let input = "サヾーエ";
+        let expected = "さざあえ";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_replace_iteration_marks_handles_variable_width_replacements_in_place() {
+        let mut text = "🙂々".to_string();
+        replace_iteration_marks(&mut text);
+        assert_eq!(text, "🙂🙂");
+    }
+
     #[test]
     fn test_normalize_jp_normalizes_fullwidth_digits() {
         let input = "1234567890";