text_normalization: implement iteration mark flattening

This commit is contained in:
2026-06-04 18:49:56 +09:00
parent 23450274d1
commit b8ab2ad2fc
+119 -7
View File
@@ -4,9 +4,105 @@ use crate::kana_transliteration::{
transliterate_katakana_to_hiragana_script_only_in_place,
};
/// Returns `true` when `c` is one of the six iteration (repetition) marks
/// recognised by this module: `々`, `〻`, `ゝ`, `ゞ`, `ヽ` or `ヾ`.
fn is_iteration_mark(c: char) -> bool {
    const ITERATION_MARKS: [char; 6] = ['々', '〻', 'ゝ', 'ゞ', 'ヽ', 'ヾ'];
    ITERATION_MARKS.contains(&c)
}
/// Returns `true` when `c` lies in the inclusive range `ぁ`..=`ゖ`.
///
/// The hiragana iteration marks `ゝ` and `ゞ` sit outside that range and are
/// therefore not reported as hiragana.
fn is_hiragana(c: char) -> bool {
    matches!(c, 'ぁ'..='ゖ')
}
/// Maps a hiragana character to its unvoiced base form.
///
/// Characters carrying a dakuten or handakuten (e.g. `が`, `ぱ`, `ゔ`) are
/// mapped to their plain counterpart (`か`, `は`, `う`); any other hiragana is
/// returned unchanged. Returns `None` when `hiragana` is not a hiragana
/// character according to `is_hiragana`.
fn strip_hiragana_voicing(hiragana: char) -> Option<char> {
    // Every voiced character listed below is itself hiragana, so rejecting
    // non-hiragana input up front does not hide any of the mappings.
    if !is_hiragana(hiragana) {
        return None;
    }

    let unvoiced = match hiragana {
        'ゔ' => 'う',
        // k-row
        'が' => 'か',
        'ぎ' => 'き',
        'ぐ' => 'く',
        'げ' => 'け',
        'ご' => 'こ',
        // s-row
        'ざ' => 'さ',
        'じ' => 'し',
        'ず' => 'す',
        'ぜ' => 'せ',
        'ぞ' => 'そ',
        // t-row
        'だ' => 'た',
        'ぢ' => 'ち',
        'づ' => 'つ',
        'で' => 'て',
        'ど' => 'と',
        // h-row: both dakuten and handakuten collapse to the plain form
        'ば' | 'ぱ' => 'は',
        'び' | 'ぴ' => 'ひ',
        'ぶ' | 'ぷ' => 'ふ',
        'べ' | 'ぺ' => 'へ',
        'ぼ' | 'ぽ' => 'ほ',
        already_plain => already_plain,
    };
    Some(unvoiced)
}
/// Returns the dakuten (voiced) form of an unvoiced hiragana character.
///
/// Only the characters that have a voiced counterpart in the table below are
/// accepted; every other input (already-voiced kana, vowels other than `う`,
/// non-hiragana characters, ...) yields `None`.
fn add_hiragana_dakuten(hiragana: char) -> Option<char> {
    /// `(plain, voiced)` pairs.
    const DAKUTEN_PAIRS: &[(char, char)] = &[
        ('う', 'ゔ'),
        ('か', 'が'),
        ('き', 'ぎ'),
        ('く', 'ぐ'),
        ('け', 'げ'),
        ('こ', 'ご'),
        ('さ', 'ざ'),
        ('し', 'じ'),
        ('す', 'ず'),
        ('せ', 'ぜ'),
        ('そ', 'ぞ'),
        ('た', 'だ'),
        ('ち', 'ぢ'),
        ('つ', 'づ'),
        ('て', 'で'),
        ('と', 'ど'),
        ('は', 'ば'),
        ('ひ', 'び'),
        ('ふ', 'ぶ'),
        ('へ', 'べ'),
        ('ほ', 'ぼ'),
    ];

    DAKUTEN_PAIRS
        .iter()
        .find(|&&(plain, _)| plain == hiragana)
        .map(|&(_, voiced)| voiced)
}
/// Expands iteration marks in `text`, in place, using the character that
/// precedes each mark.
///
/// * `々` / `〻` are replaced by the preceding character verbatim.
/// * `ゝ` is replaced by the unvoiced form of the preceding hiragana.
/// * `ゞ` is replaced by the dakuten form of the preceding hiragana, or by its
///   unvoiced form when no dakuten form exists.
///
/// A mark that cannot be resolved (nothing precedes it, or `ゝ`/`ゞ` follows a
/// non-hiragana character) is left untouched. The katakana marks `ヽ`/`ヾ`
/// have no arm here and are never rewritten by this function; presumably the
/// caller's katakana-to-hiragana pass has already folded them — confirm.
fn replace_iteration_marks(text: &mut String) {
    // Byte offset of the character currently being examined.
    let mut cursor = 0;
    // Most recent character that is not itself an iteration mark.
    let mut last_concrete: Option<char> = None;

    // Slicing at `cursor == text.len()` yields "", which ends the loop.
    while let Some(current) = text[cursor..].chars().next() {
        let resolved = match current {
            '々' | '〻' => last_concrete,
            'ゝ' => last_concrete.and_then(strip_hiragana_voicing),
            'ゞ' => last_concrete
                .and_then(strip_hiragana_voicing)
                .map(|base| add_hiragana_dakuten(base).unwrap_or(base)),
            _ => None,
        }
        .unwrap_or(current);

        if resolved != current {
            // The replacement may have a different UTF-8 width than the mark.
            let mark_end = cursor + current.len_utf8();
            text.replace_range(cursor..mark_end, resolved.encode_utf8(&mut [0; 4]));
        }

        // An unresolved mark must not become the source for later marks.
        if !is_iteration_mark(resolved) {
            last_concrete = Some(resolved);
        }

        // Advance by what is now stored at `cursor`, not by the mark's width.
        cursor += resolved.len_utf8();
    }
}
pub fn normalize_japanese_text(text: &str) -> String {
let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
replace_iteration_marks(&mut text);
expand_hiragana_long_vowel_marks_in_place(&mut text);
transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
@@ -51,7 +147,6 @@ mod tests {
assert_eq!(normalize_japanese_text(input), expected);
}
#[ignore = "FIX ME"]
#[test]
fn test_normalize_jp_replaces_iteration_mark_kanji() {
let input = "佐々木";
@@ -59,7 +154,13 @@ mod tests {
assert_eq!(normalize_japanese_text(input), expected);
}
/// The alternative kanji iteration mark `〻` is expanded to the preceding
/// character, exactly like `々`.
#[test]
fn test_normalize_jp_replaces_iteration_mark_kanji_alternative_variant() {
    let input = "佐〻木";
    let expected = "佐佐木";
    assert_eq!(normalize_japanese_text(input), expected);
}
#[test]
fn test_normalize_jp_replaces_iteration_mark_hiragana() {
let input = "さゝき";
@@ -67,7 +168,6 @@ mod tests {
assert_eq!(normalize_japanese_text(input), expected);
}
#[ignore = "FIX ME"]
#[test]
fn test_normalize_jp_replaces_iteration_mark_hiragana_dakuten() {
let input = "さゞき";
@@ -75,22 +175,34 @@ mod tests {
assert_eq!(normalize_japanese_text(input), expected);
}
/// A hiragana iteration mark following katakana is expanded, and the result
/// is expected in hiragana (`ささき`), not in the original katakana script.
#[test]
fn test_normalize_jp_replaces_iteration_mark_katakana() {
    let input = "サゝキ";
    let expected = "ささき";
    assert_eq!(normalize_japanese_text(input), expected);
}
/// A voiced hiragana iteration mark following katakana is expanded to the
/// voiced kana, and the result is expected in hiragana (`さざき`).
#[test]
fn test_normalize_jp_replaces_iteration_mark_katakana_dakuten() {
    let input = "サゞキ";
    let expected = "さざき";
    assert_eq!(normalize_japanese_text(input), expected);
}
/// A long-vowel mark that follows an expanded iteration mark picks up the
/// vowel of the expanded kana: `サヾーエ` normalizes to `さざあえ`.
#[test]
fn test_normalize_jp_replaces_long_vowel_after_iteration_mark() {
    assert_eq!(normalize_japanese_text("サヾーエ"), "さざあえ");
}
/// Replacing the 3-byte mark `々` with a 4-byte character must keep the
/// in-place byte bookkeeping consistent.
#[test]
fn test_replace_iteration_marks_handles_variable_width_replacements_in_place() {
    let mut buffer = String::from("🙂々");
    replace_iteration_marks(&mut buffer);
    assert_eq!(buffer, "🙂🙂");
}
#[test]
fn test_normalize_jp_normalizes_fullwidth_digits() {
let input = "1234567890";