diff --git a/src/text_normalization.rs b/src/text_normalization.rs
index 0978b47..f55e313 100644
--- a/src/text_normalization.rs
+++ b/src/text_normalization.rs
@@ -1,8 +1,142 @@
 use crate::kana_transliteration::{
+    transliterate_fullwidth_romaji_to_halfwidth_romaji,
     transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
 };
 
+/// Normalizes Japanese text by chaining three transliteration passes.
+///
+/// Each pass consumes the output of the previous one. Going by the helper names
+/// (the helpers are defined in `crate::kana_transliteration`), the passes are:
+///
+/// 1. half-width katakana to full-width katakana,
+/// 2. katakana to hiragana,
+/// 3. full-width romaji to half-width romaji.
+///
+/// The input is only borrowed; the normalized text is returned as a new `String`.
 pub fn normalize_japanese_text(text: &str) -> String {
     let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
-    transliterate_katakana_to_hiragana(&text)
+    let text = transliterate_katakana_to_hiragana(&text);
+    transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // NOTE(review): tests marked `#[ignore = "FIX ME"]` are skipped by default; presumably they
+    // describe normalizations that are not implemented yet -- confirm before enabling them.
+
+    #[test]
+    fn test_normalize_jp_katakana_to_hiragana() {
+        let input = "カタカナ";
+        let expected = "かたかな";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_mixed_katakana_and_hiragana() {
+        let input = "カタカナとひらがな";
+        let expected = "かたかなとひらがな";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_halfwidth_katakana_to_hiragana() {
+        // The input has to be genuine half-width katakana, otherwise this duplicates the
+        // plain katakana test above.
+        let input = "ｶﾀｶﾅ";
+        let expected = "かたかな";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_replaces_long_vowel_dash() {
+        let input = "コーヒー";
+        let expected = "こおひい";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_replaces_multiple_long_vowel_dashes() {
+        let input = "コーーヒーー";
+        let expected = "こおおひいい";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_kanji() {
+        let input = "佐々木";
+        let expected = "佐佐木";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_hiragana() {
+        let input = "さゝき";
+        let expected = "ささき";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
+        let input = "さゞき";
+        let expected = "さざき";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    // NOTE(review): the input below uses the hiragana iteration mark after a katakana
+    // character, and the expected value stays in katakana although the enabled tests above
+    // expect katakana to come out as hiragana -- confirm both before enabling this test.
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_katakana() {
+        let input = "サゝキ";
+        let expected = "ササキ";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    // NOTE(review): same concerns as the previous test (hiragana voiced iteration mark in the
+    // input, katakana in the expected value) -- confirm both before enabling this test.
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
+        let input = "サゞキ";
+        let expected = "サザキ";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_normalizes_fullwidth_digits() {
+        // The input has to be genuine full-width digits; ASCII input would pass trivially.
+        let input = "１２３４５６７８９０";
+        let expected = "1234567890";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[test]
+    fn test_normalize_jp_normalizes_fullwidth_romaji() {
+        // The input has to be genuine full-width letters; ASCII input would pass trivially.
+        let input = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ";
+        let expected = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_hiragana_yori() {
+        let input = "ゟ";
+        let expected = "より";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
+
+    #[ignore = "FIX ME"]
+    #[test]
+    fn test_normalize_jp_katakana_koto() {
+        let input = "ヿ";
+        let expected = "こと";
+        assert_eq!(normalize_japanese_text(input), expected);
+    }
 }