kana_transliteration: substitute characters in place where possible

This commit is contained in:
2026-06-04 18:48:33 +09:00
parent fb2b4b86dd
commit 7ddfe08849
3 changed files with 175 additions and 43 deletions
+65 -35
View File
@@ -1,4 +1,6 @@
use crate::util::transpose_codepoints_in_range;
use crate::util::{
replace_char_in_place, transpose_codepoints_in_range, transpose_codepoints_in_range_in_place,
};
fn hiragana_vowel_map(hiragana: char) -> Option<char> {
match hiragana {
@@ -114,52 +116,80 @@ fn katakana_vowel_map(katakana: char) -> Option<char> {
// - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
// - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
let first_pass = transpose_codepoints_in_range(katakana, -96, 'ァ'.into(), 'ヶ'.into());
pub(crate) fn transliterate_katakana_to_hiragana_script_only_in_place(katakana: &mut String) {
transpose_codepoints_in_range_in_place(katakana, -96, 'ァ'.into(), 'ヶ'.into());
// TODO: don't create a new result, just mutate the first pass in-place.
let chars = first_pass.chars().collect::<Vec<_>>();
let mut result = String::with_capacity(chars.len());
let mut byte_index = 0;
let mut previous_result_char = None;
for (i, c) in chars.iter().enumerate() {
match *c {
'ー' if i > 0 => {
let prev_char = result.chars().last().unwrap();
if let Some(vowel) = hiragana_vowel_map(prev_char) {
result.push(vowel);
} else {
result.push('ー');
}
}
'ヽ' if i > 0 => result.push('ゝ'),
'ヾ' if i > 0 => result.push('ゞ'),
_ => result.push(*c),
}
while byte_index < katakana.len() {
let c = katakana[byte_index..].chars().next().unwrap();
let replacement = match c {
'ヽ' if previous_result_char.is_some() => 'ゝ',
'ヾ' if previous_result_char.is_some() => 'ゞ',
_ => c,
};
replace_char_in_place(katakana, byte_index, c, replacement);
previous_result_char = Some(replacement);
byte_index += c.len_utf8();
}
}
/// Replaces long vowel marks (ー) in `text` with the vowel of the preceding character, in place.
///
/// The lookup uses the previous *output* character, so a mark that follows a mark which was
/// just expanded is resolved against the expanded vowel. A mark at the very start of the text,
/// or after a character for which `hiragana_vowel_map` returns `None`, is left unchanged.
pub(crate) fn expand_hiragana_long_vowel_marks_in_place(text: &mut String) {
    let mut offset = 0;
    let mut last_written: Option<char> = None;

    while offset < text.len() {
        let current = text[offset..].chars().next().unwrap();

        let output = if current == 'ー' {
            match last_written.and_then(hiragana_vowel_map) {
                Some(vowel) => vowel,
                None => 'ー',
            }
        } else {
            current
        };

        replace_char_in_place(text, offset, current, output);
        last_written = Some(output);
        // Advance by the width of the character that was read at this offset.
        offset += current.len_utf8();
    }
}
/// Returns a copy of `katakana` transliterated to hiragana.
///
/// The input is copied into an owned `String`, which is then rewritten in place in two passes:
/// the script-only katakana-to-hiragana conversion, followed by long vowel mark expansion.
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
    let mut hiragana = String::from(katakana);
    transliterate_katakana_to_hiragana_script_only_in_place(&mut hiragana);
    expand_hiragana_long_vowel_marks_in_place(&mut hiragana);
    hiragana
}
pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
let first_pass = transpose_codepoints_in_range(hiragana, 96, 12353, 12438);
pub(crate) fn transliterate_hiragana_to_katakana_in_place(hiragana: &mut String) {
transpose_codepoints_in_range_in_place(hiragana, 96, 12353, 12438);
let mut byte_index = 0;
let mut previous_original_char = None;
// TODO: don't create a new result, just mutate the first pass in-place.
// Find instances of long vowels, and convert them into long vowel marks (ー).
let chars = first_pass.chars().collect::<Vec<_>>();
let mut result = String::with_capacity(chars.len());
while byte_index < hiragana.len() {
let c = hiragana[byte_index..].chars().next().unwrap();
let replacement = match c {
'ア' | 'イ' | 'ウ' | 'エ' | 'オ'
if previous_original_char
.is_some_and(|prev| katakana_vowel_map(prev) == Some(c) || prev == 'ー') =>
{
'ー'
}
_ => c,
};
for i in 0..chars.len() {
if i == 0 {
result.push(chars[i]);
} else if ['ア', 'イ', 'ウ', 'エ', 'オ'].contains(&chars[i])
&& (katakana_vowel_map(chars[i - 1]) == Some(chars[i]) || chars[i - 1] == 'ー')
{
result.push('ー');
} else {
result.push(chars[i]);
}
replace_char_in_place(hiragana, byte_index, c, replacement);
previous_original_char = Some(c);
byte_index += c.len_utf8();
}
}
/// Returns a copy of `hiragana` transliterated to katakana.
///
/// The input is copied into an owned `String`, which is then rewritten by
/// `transliterate_hiragana_to_katakana_in_place`.
pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
    let mut katakana = String::from(hiragana);
    transliterate_hiragana_to_katakana_in_place(&mut katakana);
    katakana
}
+7 -5
View File
@@ -1,12 +1,14 @@
use crate::kana_transliteration::{
transliterate_fullwidth_romaji_to_halfwidth_romaji,
transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
expand_hiragana_long_vowel_marks_in_place, transliterate_fullwidth_romaji_to_halfwidth_romaji,
transliterate_halfwidth_katakana_to_fullwidth_katakana,
transliterate_katakana_to_hiragana_script_only_in_place,
};
pub fn normalize_japanese_text(text: &str) -> String {
let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
let text = transliterate_katakana_to_hiragana(&text);
let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
expand_hiragana_long_vowel_marks_in_place(&mut text);
transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
}
+103 -3
View File
@@ -1,12 +1,47 @@
/// Sanity-checks the arguments shared by the codepoint transposition helpers.
///
/// Asserts that `range_start <= range_end` and that shifting either endpoint by `distance`
/// stays within the bounds of `u32`.
///
/// This function is intentionally NOT gated behind `#[cfg(debug_assertions)]`: it is called
/// unconditionally, so removing the definition when debug assertions are disabled would leave
/// those call sites unresolved. Every check is a `debug_assert!`, which is not evaluated when
/// debug assertions are disabled.
fn debug_assert_transpose_codepoints_range(distance: i32, range_start: u32, range_end: u32) {
    debug_assert!(range_start <= range_end);
    // Widen to i64 so the shifted endpoints can be compared without overflowing.
    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
}
/// Returns the number of bytes `codepoint` occupies when encoded as UTF-8.
///
/// Panics if `codepoint` is not a valid Unicode scalar value.
#[cfg(debug_assertions)]
fn debug_codepoint_utf8_width(codepoint: u32) -> usize {
    let scalar = char::from_u32(codepoint)
        .expect("transpose range endpoints must be valid Unicode scalar values");
    scalar.len_utf8()
}
/// Asserts that a transposition keeps every affected codepoint at the same UTF-8 width.
///
/// Checks that both endpoints of the source range share one width, that both endpoints of the
/// shifted range share one width, and that the two widths are equal.
#[cfg(debug_assertions)]
fn debug_assert_transpose_codepoints_range_same_width(
    distance: i32,
    range_start: u32,
    range_end: u32,
) {
    // Shift both endpoints up front; `strict_add_signed` panics if the shift overflows.
    let target_start = range_start.strict_add_signed(distance);
    let target_end = range_end.strict_add_signed(distance);

    let input_width = debug_codepoint_utf8_width(range_start);
    debug_assert_eq!(input_width, debug_codepoint_utf8_width(range_end));

    let output_width = debug_codepoint_utf8_width(target_start);
    debug_assert_eq!(output_width, debug_codepoint_utf8_width(target_end));

    debug_assert_eq!(input_width, output_width);
}
/// Transposes codepoints in the specified range by the specified distance.
///
/// Codepoints outside the range are left unchanged.
/// Codepoints that would be transposed to invalid Unicode scalar values are left unchanged.
///
/// This produces a new string on the heap; see [`transpose_codepoints_in_range_in_place`] for an in-place variant.
pub(crate) fn transpose_codepoints_in_range(
text: &str,
distance: i32,
range_start: u32,
range_end: u32,
) -> String {
debug_assert!(range_start <= range_end);
debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
debug_assert_transpose_codepoints_range(distance, range_start, range_end);
text.chars()
.map(|c| {
@@ -19,3 +54,68 @@ pub(crate) fn transpose_codepoints_in_range(
})
.collect()
}
/// Overwrites the character `old` starting at `byte_index` in `text` with `new`.
///
/// Both characters are expected to have the same UTF-8 width (checked in debug builds), so
/// the replacement covers exactly the bytes of `old`. Does nothing when `old == new`.
#[inline]
pub(crate) fn replace_char_in_place(text: &mut String, byte_index: usize, old: char, new: char) {
    let width = old.len_utf8();
    debug_assert_eq!(width, new.len_utf8());

    if old != new {
        // A `char` encodes to at most four UTF-8 bytes.
        let mut scratch = [0; 4];
        let encoded: &str = new.encode_utf8(&mut scratch);
        text.replace_range(byte_index..byte_index + width, encoded);
    }
}
/// Variant of [`transpose_codepoints_in_range`] that modifies the input string in place.
///
/// Every codepoint in `range_start..=range_end` is shifted by `distance`; codepoints outside
/// the range, and codepoints whose shifted value is not a valid Unicode scalar value, are kept.
///
/// The input and output range must consist of codepoints with the same UTF-8 width,
/// otherwise the resulting bytes will overlap with each other, or there will be gaps between them.
/// This precondition is only verified when debug assertions are enabled.
pub(crate) fn transpose_codepoints_in_range_in_place(
    text: &mut String,
    distance: i32,
    range_start: u32,
    range_end: u32,
) {
    // The validation helpers may only exist when debug assertions are enabled, so the calls
    // are gated the same way to keep builds without debug assertions compiling.
    #[cfg(debug_assertions)]
    debug_assert_transpose_codepoints_range(distance, range_start, range_end);
    #[cfg(debug_assertions)]
    debug_assert_transpose_codepoints_range_same_width(distance, range_start, range_end);

    let mut byte_index = 0;
    while byte_index < text.len() {
        let c = text[byte_index..].chars().next().unwrap();
        let codepoint = u32::from(c);

        let replacement = if (range_start..=range_end).contains(&codepoint) {
            // Fall back to the original character if the shifted value is not a valid scalar.
            std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c)
        } else {
            c
        };

        replace_char_in_place(text, byte_index, c, replacement);
        // Same-width replacement: the next character starts right after the bytes of `c`.
        byte_index += c.len_utf8();
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shifting the katakana range down by 96 turns katakana into the matching hiragana.
    #[test]
    fn test_transpose_codepoints_in_range_in_place_same_width() {
        let (range_start, range_end) = (u32::from('ァ'), u32::from('ヶ'));
        let mut buffer = String::from("カタカナ");

        transpose_codepoints_in_range_in_place(&mut buffer, -96, range_start, range_end);

        assert_eq!(buffer, "かたかな");
    }

    /// 'A' is one UTF-8 byte and 'Ā' is two, so the same-width debug assertion must fire.
    #[cfg(debug_assertions)]
    #[test]
    #[should_panic]
    fn test_transpose_codepoints_in_range_in_place_same_width_rejects_mixed_width_range() {
        let mut buffer = String::from("A");
        transpose_codepoints_in_range_in_place(&mut buffer, 0, u32::from('A'), u32::from('Ā'));
    }
}