text_normalization: normalize fullwidth romaji, add more tests
Build and test / check (push) Failing after 44s
Build and test / test (push) Successful in 55s
Build and test / build-static-library (push) Successful in 56s
Build and test / build-dynamic-library (push) Successful in 1m36s

This commit is contained in:
2026-06-04 15:51:10 +09:00
parent df2df5b4ac
commit b78c642a45
+114 -1
View File
@@ -1,8 +1,121 @@
use crate::kana_transliteration::{
transliterate_fullwidth_romaji_to_halfwidth_romaji,
transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
};
/// Normalizes Japanese text by running it through a fixed chain of
/// transliterations.
///
/// The steps are applied in this order:
///
/// 1. halfwidth katakana → fullwidth katakana
/// 2. katakana → hiragana
/// 3. fullwidth romaji → halfwidth romaji
///
/// Returns the normalized text as a newly allocated `String`; the input is
/// not modified.
pub fn normalize_japanese_text(text: &str) -> String {
    // Widen halfwidth katakana before the katakana → hiragana step, so that
    // halfwidth input also ends up as hiragana.
    let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
    let text = transliterate_katakana_to_hiragana(&text);
    // Return the last step directly instead of binding it first
    // (avoids clippy::let_and_return).
    transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
}
#[cfg(test)]
mod tests {
    //! Tests for `normalize_japanese_text`.
    //!
    //! Several inputs below are deliberately halfwidth/fullwidth compatibility
    //! characters. Do not run this file through a Unicode width normalizer
    //! (e.g. NFKC), or those tests silently become no-ops.

    use super::*;

    #[test]
    fn test_normalize_jp_katakana_to_hiragana() {
        let input = "カタカナ";
        let expected = "かたかな";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_mixed_katakana_and_hiragana() {
        let input = "カタカナとひらがな";
        let expected = "かたかなとひらがな";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_halfwidth_katakana_to_hiragana() {
        // Halfwidth katakana (U+FF76, U+FF80, U+FF76, U+FF85), not the
        // fullwidth form already covered by the first test.
        let input = "ｶﾀｶﾅ";
        let expected = "かたかな";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_replaces_long_vowel_dash() {
        let input = "コーヒー";
        let expected = "こおひい";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_replaces_multiple_long_vowel_dashes() {
        let input = "コーーヒーー";
        let expected = "こおおひいい";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_kanji() {
        let input = "佐々木";
        let expected = "佐佐木";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_hiragana() {
        let input = "さゝき";
        let expected = "ささき";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
        let input = "さゞき";
        let expected = "さざき";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_katakana() {
        // Uses the katakana iteration mark ヽ (U+30FD). The expectation is
        // hiragana because normalization transliterates katakana to hiragana.
        let input = "サヽキ";
        let expected = "ささき";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
        // Uses the voiced katakana iteration mark ヾ (U+30FE). The expectation
        // is hiragana because normalization transliterates katakana to hiragana.
        let input = "サヾキ";
        let expected = "さざき";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_normalizes_fullwidth_digits() {
        // Fullwidth digits (U+FF10..=U+FF19); the expected value is ASCII.
        let input = "１２３４５６７８９０";
        let expected = "1234567890";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[test]
    fn test_normalize_jp_normalizes_fullwidth_romaji() {
        // Fullwidth Latin letters (U+FF21..=U+FF3A, U+FF41..=U+FF5A); the
        // expected value is ASCII.
        let input = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ";
        let expected = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_hiragana_yori() {
        // HIRAGANA DIGRAPH YORI (U+309F).
        let input = "ゟ";
        let expected = "より";
        assert_eq!(normalize_japanese_text(input), expected);
    }

    #[ignore = "FIX ME"]
    #[test]
    fn test_normalize_jp_katakana_koto() {
        // KATAKANA DIGRAPH KOTO (U+30FF).
        let input = "ヿ";
        let expected = "こと";
        assert_eq!(normalize_japanese_text(input), expected);
    }
}