text_normalization: normalize fullwidth romaji, add more tests
This commit is contained in:
+114
-1
@@ -1,8 +1,121 @@
|
||||
use crate::kana_transliteration::{
|
||||
transliterate_fullwidth_romaji_to_halfwidth_romaji,
|
||||
transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
|
||||
};
|
||||
|
||||
pub fn normalize_japanese_text(text: &str) -> String {
|
||||
let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
|
||||
transliterate_katakana_to_hiragana(&text)
|
||||
let text = transliterate_katakana_to_hiragana(&text);
|
||||
let text = transliterate_fullwidth_romaji_to_halfwidth_romaji(&text);
|
||||
text
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for `normalize_japanese_text`.
    //!
    //! Tests marked `#[ignore = "FIX ME"]` describe normalizations that are
    //! not expected to pass yet.

    use super::*;

    #[test]
    fn test_normalize_jp_katakana_to_hiragana() {
        assert_eq!(normalize_japanese_text("カタカナ"), "かたかな");
    }

    #[test]
    fn test_normalize_jp_mixed_katakana_and_hiragana() {
        assert_eq!(
            normalize_japanese_text("カタカナとひらがな"),
            "かたかなとひらがな"
        );
    }

    #[test]
    fn test_normalize_jp_halfwidth_katakana_to_hiragana() {
        // The input must be *halfwidth* katakana (U+FF76 U+FF80 U+FF76 U+FF85);
        // with fullwidth input this test would merely duplicate
        // `test_normalize_jp_katakana_to_hiragana`.
        assert_eq!(normalize_japanese_text("ｶﾀｶﾅ"), "かたかな");
    }

    #[test]
    fn test_normalize_jp_replaces_long_vowel_dash() {
        assert_eq!(normalize_japanese_text("コーヒー"), "こおひい");
    }

    #[test]
    fn test_normalize_jp_replaces_multiple_long_vowel_dashes() {
        assert_eq!(normalize_japanese_text("コーーヒーー"), "こおおひいい");
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_kanji() {
        assert_eq!(normalize_japanese_text("佐々木"), "佐佐木");
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_hiragana() {
        assert_eq!(normalize_japanese_text("さゝき"), "ささき");
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
        assert_eq!(normalize_japanese_text("さゞき"), "さざき");
    }

    // NOTE(review): the input below uses the hiragana iteration mark ゝ
    // (U+309D); the katakana iteration mark is ヽ (U+30FD). The expected value
    // is also katakana, whereas the passing tests in this module expect
    // katakana to come out as hiragana — confirm both before un-ignoring.
    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_katakana() {
        assert_eq!(normalize_japanese_text("サゝキ"), "ササキ");
    }

    // NOTE(review): same concerns as the test above — the input uses the
    // hiragana voiced iteration mark ゞ (U+309E) rather than the katakana one
    // ヾ (U+30FE), and the expected value is katakana. Confirm before
    // un-ignoring.
    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
        assert_eq!(normalize_japanese_text("サゞキ"), "サザキ");
    }

    #[test]
    fn test_normalize_jp_normalizes_fullwidth_digits() {
        // Fullwidth digits (U+FF10..=U+FF19) in, ASCII digits out. With ASCII
        // input the assertion would hold even if no normalization happened.
        assert_eq!(
            normalize_japanese_text("１２３４５６７８９０"),
            "1234567890"
        );
    }

    #[test]
    fn test_normalize_jp_normalizes_fullwidth_romaji() {
        // Fullwidth Latin letters (U+FF21..=U+FF3A, U+FF41..=U+FF5A) in,
        // ASCII letters out.
        let input = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ";
        let expected = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_hiragana_yori() {
        // ゟ is the single-codepoint digraph U+309F.
        assert_eq!(normalize_japanese_text("ゟ"), "より");
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_katakana_koto() {
        // ヿ is the single-codepoint digraph U+30FF.
        assert_eq!(normalize_japanese_text("ヿ"), "こと");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user