diff --git a/src/kana_transliteration.rs b/src/kana_transliteration.rs index 645a80f..efe53e3 100644 --- a/src/kana_transliteration.rs +++ b/src/kana_transliteration.rs @@ -1,4 +1,6 @@ -use crate::util::transpose_codepoints_in_range; +use crate::util::{ + replace_char_in_place, transpose_codepoints_in_range, transpose_codepoints_in_range_in_place, +}; fn hiragana_vowel_map(hiragana: char) -> Option { match hiragana { @@ -114,52 +116,80 @@ fn katakana_vowel_map(katakana: char) -> Option { // - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf // - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf -pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String { - let first_pass = transpose_codepoints_in_range(katakana, -96, 'ァ'.into(), 'ヶ'.into()); +pub(crate) fn transliterate_katakana_to_hiragana_script_only_in_place(katakana: &mut String) { + transpose_codepoints_in_range_in_place(katakana, -96, 'ァ'.into(), 'ヶ'.into()); - // TODO: don't create a new result, just mutate the first pass in-place. - let chars = first_pass.chars().collect::>(); - let mut result = String::with_capacity(chars.len()); + let mut byte_index = 0; + let mut previous_result_char = None; - for (i, c) in chars.iter().enumerate() { - match *c { - 'ー' if i > 0 => { - let prev_char = result.chars().last().unwrap(); - if let Some(vowel) = hiragana_vowel_map(prev_char) { - result.push(vowel); - } else { - result.push('ー'); - } - } - 'ヽ' if i > 0 => result.push('ゝ'), - 'ヾ' if i > 0 => result.push('ゞ'), - _ => result.push(*c), - } + while byte_index < katakana.len() { + let c = katakana[byte_index..].chars().next().unwrap(); + let replacement = match c { + 'ヽ' if previous_result_char.is_some() => 'ゝ', + 'ヾ' if previous_result_char.is_some() => 'ゞ', + _ => c, + }; + + replace_char_in_place(katakana, byte_index, c, replacement); + previous_result_char = Some(replacement); + byte_index += c.len_utf8(); } +} +pub(crate) fn expand_hiragana_long_vowel_marks_in_place(text: &mut String) { + let mut byte_index = 0; + let mut previous_result_char = None; + + while byte_index < text.len() { + let c = text[byte_index..].chars().next().unwrap(); + let replacement = match c { + 'ー' => previous_result_char + .and_then(hiragana_vowel_map) + .unwrap_or('ー'), + _ => c, + }; + + replace_char_in_place(text, byte_index, c, replacement); + previous_result_char = Some(replacement); + byte_index += c.len_utf8(); + } +} + +pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String { + let mut result = katakana.to_string(); + transliterate_katakana_to_hiragana_script_only_in_place(&mut result); + expand_hiragana_long_vowel_marks_in_place(&mut result); result } -pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String { - let first_pass = transpose_codepoints_in_range(hiragana, 96, 12353, 12438); +pub(crate) fn transliterate_hiragana_to_katakana_in_place(hiragana: &mut String) { + transpose_codepoints_in_range_in_place(hiragana, 96, 12353, 12438); + + let mut byte_index = 0; + let mut previous_original_char = None; - // TODO: don't create a new result, just mutate the first pass in-place. // Find instances of long vowels, and convert them into long vowel marks (ー). - let chars = first_pass.chars().collect::>(); - let mut result = String::with_capacity(chars.len()); + while byte_index < hiragana.len() { + let c = hiragana[byte_index..].chars().next().unwrap(); + let replacement = match c { + 'ア' | 'イ' | 'ウ' | 'エ' | 'オ' + if previous_original_char + .is_some_and(|prev| katakana_vowel_map(prev) == Some(c) || prev == 'ー') => + { + 'ー' + } + _ => c, + }; - for i in 0..chars.len() { - if i == 0 { - result.push(chars[i]); - } else if ['ア', 'イ', 'ウ', 'エ', 'オ'].contains(&chars[i]) - && (katakana_vowel_map(chars[i - 1]) == Some(chars[i]) || chars[i - 1] == 'ー') - { - result.push('ー'); - } else { - result.push(chars[i]); - } + replace_char_in_place(hiragana, byte_index, c, replacement); + previous_original_char = Some(c); + byte_index += c.len_utf8(); } +} +pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String { + let mut result = hiragana.to_string(); + transliterate_hiragana_to_katakana_in_place(&mut result); result } diff --git a/src/text_normalization.rs b/src/text_normalization.rs index e9792d6..228d6ae 100644 --- a/src/text_normalization.rs +++ b/src/text_normalization.rs @@ -1,12 +1,14 @@ use crate::kana_transliteration::{ - transliterate_fullwidth_romaji_to_halfwidth_romaji, - transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana, + expand_hiragana_long_vowel_marks_in_place, transliterate_fullwidth_romaji_to_halfwidth_romaji, + transliterate_halfwidth_katakana_to_fullwidth_katakana, + transliterate_katakana_to_hiragana_script_only_in_place, }; pub fn normalize_japanese_text(text: &str) -> String { - let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text); - let text = transliterate_katakana_to_hiragana(&text); - + let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text); + transliterate_katakana_to_hiragana_script_only_in_place(&mut text); + expand_hiragana_long_vowel_marks_in_place(&mut text); + transliterate_fullwidth_romaji_to_halfwidth_romaji(&text) } diff --git a/src/util.rs b/src/util.rs index 60df67a..0894c80 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,12 +1,47 @@ +#[cfg(debug_assertions)] +fn debug_assert_transpose_codepoints_range(distance: i32, range_start: u32, range_end: u32) { + debug_assert!(range_start <= range_end); + debug_assert!(i64::from(range_start) + i64::from(distance) >= 0); + debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX)); +} + +#[cfg(debug_assertions)] +fn debug_codepoint_utf8_width(codepoint: u32) -> usize { + std::char::from_u32(codepoint) + .expect("transpose range endpoints must be valid Unicode scalar values") + .len_utf8() +} + +#[cfg(debug_assertions)] +fn debug_assert_transpose_codepoints_range_same_width( + distance: i32, + range_start: u32, + range_end: u32, +) { + let shifted_range_start = range_start.strict_add_signed(distance); + let shifted_range_end = range_end.strict_add_signed(distance); + + let source_width = debug_codepoint_utf8_width(range_start); + debug_assert_eq!(source_width, debug_codepoint_utf8_width(range_end)); + + let shifted_width = debug_codepoint_utf8_width(shifted_range_start); + debug_assert_eq!(shifted_width, debug_codepoint_utf8_width(shifted_range_end)); + debug_assert_eq!(source_width, shifted_width); +} + +/// Transposes codepoints in the specified range by the specified distance. +/// +/// Codepoints outside the range are left unchanged. +/// Codepoints that would be transposed to invalid Unicode scalar values are left unchanged. +/// +/// This produces a new string on the heap, see [transpose_codepoints_in_range_in_place_same_width] for an in-place variant. pub(crate) fn transpose_codepoints_in_range( text: &str, distance: i32, range_start: u32, range_end: u32, ) -> String { - debug_assert!(range_start <= range_end); - debug_assert!(i64::from(range_start) + i64::from(distance) >= 0); - debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX)); + debug_assert_transpose_codepoints_range(distance, range_start, range_end); text.chars() .map(|c| { @@ -19,3 +54,68 @@ pub(crate) fn transpose_codepoints_in_range( }) .collect() } + +#[inline] +pub(crate) fn replace_char_in_place(text: &mut String, byte_index: usize, old: char, new: char) { + debug_assert_eq!(old.len_utf8(), new.len_utf8()); + + if old == new { + return; + } + + let mut buffer = [0; 4]; + text.replace_range( + byte_index..byte_index + old.len_utf8(), + new.encode_utf8(&mut buffer), + ); +} + +/// Variant of [transpose_codepoints_in_range] that modifies the input string in place. +/// +/// The input and output range must consist of codepoints with the same UTF-8 width, +/// otherwise the resulting bytes will overlap with each other, or there will be gaps between them. +pub(crate) fn transpose_codepoints_in_range_in_place( + text: &mut String, + distance: i32, + range_start: u32, + range_end: u32, +) { + debug_assert_transpose_codepoints_range(distance, range_start, range_end); + debug_assert_transpose_codepoints_range_same_width(distance, range_start, range_end); + + let mut byte_index = 0; + + while byte_index < text.len() { + let c = text[byte_index..].chars().next().unwrap(); + let codepoint = c as u32; + let replacement = if range_start <= codepoint && codepoint <= range_end { + std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c) + } else { + c + }; + + replace_char_in_place(text, byte_index, c, replacement); + + byte_index += c.len_utf8(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_transpose_codepoints_in_range_in_place_same_width() { + let mut text = "カタカナ".to_string(); + transpose_codepoints_in_range_in_place(&mut text, -96, 'ァ'.into(), 'ヶ'.into()); + assert_eq!(text, "かたかな"); + } + + #[cfg(debug_assertions)] + #[test] + #[should_panic] + fn test_transpose_codepoints_in_range_in_place_same_width_rejects_mixed_width_range() { + let mut text = "A".to_string(); + transpose_codepoints_in_range_in_place(&mut text, 0, 'A'.into(), 'Ā'.into()); + } +}