kana_transliteration: substitute characters in place where possible

2026-06-04 18:48:33 +09:00
parent fb2b4b86dd
commit 7ddfe08849
3 changed files with 175 additions and 43 deletions
@@ -1,4 +1,6 @@
-use crate::util::transpose_codepoints_in_range;
+use crate::util::{
+    replace_char_in_place, transpose_codepoints_in_range, transpose_codepoints_in_range_in_place,
+};

 fn hiragana_vowel_map(hiragana: char) -> Option<char> {
    match hiragana {
@@ -114,52 +116,80 @@ fn katakana_vowel_map(katakana: char) -> Option<char> {
 // - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
 // - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf

-pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
-    let first_pass = transpose_codepoints_in_range(katakana, -96, 'ァ'.into(), 'ヶ'.into());
+pub(crate) fn transliterate_katakana_to_hiragana_script_only_in_place(katakana: &mut String) {
+    transpose_codepoints_in_range_in_place(katakana, -96, 'ァ'.into(), 'ヶ'.into());

-    // TODO: don't create a new result, just mutate the first pass in-place.
-    let chars = first_pass.chars().collect::<Vec<_>>();
-    let mut result = String::with_capacity(chars.len());
+    let mut byte_index = 0;
+    let mut previous_result_char = None;

-    for (i, c) in chars.iter().enumerate() {
-        match *c {
-            'ー' if i > 0 => {
-                let prev_char = result.chars().last().unwrap();
-                if let Some(vowel) = hiragana_vowel_map(prev_char) {
-                    result.push(vowel);
-                } else {
-                    result.push('ー');
-                }
-            }
-            'ヽ' if i > 0 => result.push('ゝ'),
-            'ヾ' if i > 0 => result.push('ゞ'),
-            _ => result.push(*c),
-        }
+    while byte_index < katakana.len() {
+        let c = katakana[byte_index..].chars().next().unwrap();
+        let replacement = match c {
+            'ヽ' if previous_result_char.is_some() => 'ゝ',
+            'ヾ' if previous_result_char.is_some() => 'ゞ',
+            _ => c,
+        };
+
+        replace_char_in_place(katakana, byte_index, c, replacement);
+        previous_result_char = Some(replacement);
+        byte_index += c.len_utf8();
    }
+}

+pub(crate) fn expand_hiragana_long_vowel_marks_in_place(text: &mut String) {
+    let mut byte_index = 0;
+    let mut previous_result_char = None;
+
+    while byte_index < text.len() {
+        let c = text[byte_index..].chars().next().unwrap();
+        let replacement = match c {
+            'ー' => previous_result_char
+                .and_then(hiragana_vowel_map)
+                .unwrap_or('ー'),
+            _ => c,
+        };
+
+        replace_char_in_place(text, byte_index, c, replacement);
+        previous_result_char = Some(replacement);
+        byte_index += c.len_utf8();
+    }
+}
+
+pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
+    let mut result = katakana.to_string();
+    transliterate_katakana_to_hiragana_script_only_in_place(&mut result);
+    expand_hiragana_long_vowel_marks_in_place(&mut result);
    result
 }

-pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
-    let first_pass = transpose_codepoints_in_range(hiragana, 96, 12353, 12438);
+pub(crate) fn transliterate_hiragana_to_katakana_in_place(hiragana: &mut String) {
+    transpose_codepoints_in_range_in_place(hiragana, 96, 12353, 12438);
+
+    let mut byte_index = 0;
+    let mut previous_original_char = None;

-    // TODO: don't create a new result, just mutate the first pass in-place.
    // Find instances of long vowels, and convert them into long vowel marks (ー).
-    let chars = first_pass.chars().collect::<Vec<_>>();
-    let mut result = String::with_capacity(chars.len());
+    while byte_index < hiragana.len() {
+        let c = hiragana[byte_index..].chars().next().unwrap();
+        let replacement = match c {
+            'ア' | 'イ' | 'ウ' | 'エ' | 'オ'
+                if previous_original_char
+                    .is_some_and(|prev| katakana_vowel_map(prev) == Some(c) || prev == 'ー') =>
+            {
+                'ー'
+            }
+            _ => c,
+        };

-    for i in 0..chars.len() {
-        if i == 0 {
-            result.push(chars[i]);
-        } else if ['ア', 'イ', 'ウ', 'エ', 'オ'].contains(&chars[i])
-            && (katakana_vowel_map(chars[i - 1]) == Some(chars[i]) || chars[i - 1] == 'ー')
-        {
-            result.push('ー');
-        } else {
-            result.push(chars[i]);
-        }
+        replace_char_in_place(hiragana, byte_index, c, replacement);
+        previous_original_char = Some(c);
+        byte_index += c.len_utf8();
    }
+}

+pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
+    let mut result = hiragana.to_string();
+    transliterate_hiragana_to_katakana_in_place(&mut result);
    result
 }

@@ -1,12 +1,14 @@
 use crate::kana_transliteration::{
-    transliterate_fullwidth_romaji_to_halfwidth_romaji,
-    transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
+    expand_hiragana_long_vowel_marks_in_place, transliterate_fullwidth_romaji_to_halfwidth_romaji,
+    transliterate_halfwidth_katakana_to_fullwidth_katakana,
+    transliterate_katakana_to_hiragana_script_only_in_place,
 };

 pub fn normalize_japanese_text(text: &str) -> String {
-    let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
-    let text = transliterate_katakana_to_hiragana(&text);
-    
+    let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
+    transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
+    expand_hiragana_long_vowel_marks_in_place(&mut text);
+
    transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
 }

@@ -1,12 +1,47 @@
+#[cfg(debug_assertions)]
+fn debug_assert_transpose_codepoints_range(distance: i32, range_start: u32, range_end: u32) {
+    debug_assert!(range_start <= range_end);
+    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
+    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
+}
+
+#[cfg(debug_assertions)]
+fn debug_codepoint_utf8_width(codepoint: u32) -> usize {
+    std::char::from_u32(codepoint)
+        .expect("transpose range endpoints must be valid Unicode scalar values")
+        .len_utf8()
+}
+
+#[cfg(debug_assertions)]
+fn debug_assert_transpose_codepoints_range_same_width(
+    distance: i32,
+    range_start: u32,
+    range_end: u32,
+) {
+    let shifted_range_start = range_start.strict_add_signed(distance);
+    let shifted_range_end = range_end.strict_add_signed(distance);
+
+    let source_width = debug_codepoint_utf8_width(range_start);
+    debug_assert_eq!(source_width, debug_codepoint_utf8_width(range_end));
+
+    let shifted_width = debug_codepoint_utf8_width(shifted_range_start);
+    debug_assert_eq!(shifted_width, debug_codepoint_utf8_width(shifted_range_end));
+    debug_assert_eq!(source_width, shifted_width);
+}
+
+/// Transposes codepoints in the specified range by the specified distance.
+///
+/// Codepoints outside the range are left unchanged.
+/// Codepoints that would be transposed to invalid Unicode scalar values are left unchanged.
+///
+/// This produces a new string on the heap; see [`transpose_codepoints_in_range_in_place`] for an in-place variant.
 pub(crate) fn transpose_codepoints_in_range(
    text: &str,
    distance: i32,
    range_start: u32,
    range_end: u32,
 ) -> String {
-    debug_assert!(range_start <= range_end);
-    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
-    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
+    debug_assert_transpose_codepoints_range(distance, range_start, range_end);

    text.chars()
        .map(|c| {
@@ -19,3 +54,68 @@ pub(crate) fn transpose_codepoints_in_range(
        })
        .collect()
 }
+
+#[inline]
+pub(crate) fn replace_char_in_place(text: &mut String, byte_index: usize, old: char, new: char) {
+    debug_assert_eq!(old.len_utf8(), new.len_utf8());
+
+    if old == new {
+        return;
+    }
+
+    let mut buffer = [0; 4];
+    text.replace_range(
+        byte_index..byte_index + old.len_utf8(),
+        new.encode_utf8(&mut buffer),
+    );
+}
+
+/// Variant of [transpose_codepoints_in_range] that modifies the input string in place.
+///
+/// The input and output range must consist of codepoints with the same UTF-8 width:
+/// the scan advances by the original character's width, so a different-width replacement would desynchronize it.
+pub(crate) fn transpose_codepoints_in_range_in_place(
+    text: &mut String,
+    distance: i32,
+    range_start: u32,
+    range_end: u32,
+) {
+    debug_assert_transpose_codepoints_range(distance, range_start, range_end);
+    debug_assert_transpose_codepoints_range_same_width(distance, range_start, range_end);
+
+    let mut byte_index = 0;
+
+    while byte_index < text.len() {
+        let c = text[byte_index..].chars().next().unwrap();
+        let codepoint = c as u32;
+        let replacement = if range_start <= codepoint && codepoint <= range_end {
+            std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c)
+        } else {
+            c
+        };
+
+        replace_char_in_place(text, byte_index, c, replacement);
+
+        byte_index += c.len_utf8();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_transpose_codepoints_in_range_in_place_same_width() {
+        let mut text = "カタカナ".to_string();
+        transpose_codepoints_in_range_in_place(&mut text, -96, 'ァ'.into(), 'ヶ'.into());
+        assert_eq!(text, "かたかな");
+    }
+
+    #[cfg(debug_assertions)]
+    #[test]
+    #[should_panic]
+    fn test_transpose_codepoints_in_range_in_place_same_width_rejects_mixed_width_range() {
+        let mut text = "A".to_string();
+        transpose_codepoints_in_range_in_place(&mut text, 0, 'A'.into(), 'Ā'.into());
+    }
+}