kana_transliteration: substitute characters in place where possible
This commit is contained in:
+65
-35
@@ -1,4 +1,6 @@
|
||||
use crate::util::transpose_codepoints_in_range;
|
||||
use crate::util::{
|
||||
replace_char_in_place, transpose_codepoints_in_range, transpose_codepoints_in_range_in_place,
|
||||
};
|
||||
|
||||
fn hiragana_vowel_map(hiragana: char) -> Option<char> {
|
||||
match hiragana {
|
||||
@@ -114,52 +116,80 @@ fn katakana_vowel_map(katakana: char) -> Option<char> {
|
||||
// - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
|
||||
// - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf
|
||||
|
||||
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
|
||||
let first_pass = transpose_codepoints_in_range(katakana, -96, 'ァ'.into(), 'ヶ'.into());
|
||||
pub(crate) fn transliterate_katakana_to_hiragana_script_only_in_place(katakana: &mut String) {
|
||||
transpose_codepoints_in_range_in_place(katakana, -96, 'ァ'.into(), 'ヶ'.into());
|
||||
|
||||
// TODO: don't create a new result, just mutate the first pass in-place.
|
||||
let chars = first_pass.chars().collect::<Vec<_>>();
|
||||
let mut result = String::with_capacity(chars.len());
|
||||
let mut byte_index = 0;
|
||||
let mut previous_result_char = None;
|
||||
|
||||
for (i, c) in chars.iter().enumerate() {
|
||||
match *c {
|
||||
'ー' if i > 0 => {
|
||||
let prev_char = result.chars().last().unwrap();
|
||||
if let Some(vowel) = hiragana_vowel_map(prev_char) {
|
||||
result.push(vowel);
|
||||
} else {
|
||||
result.push('ー');
|
||||
}
|
||||
}
|
||||
'ヽ' if i > 0 => result.push('ゝ'),
|
||||
'ヾ' if i > 0 => result.push('ゞ'),
|
||||
_ => result.push(*c),
|
||||
}
|
||||
while byte_index < katakana.len() {
|
||||
let c = katakana[byte_index..].chars().next().unwrap();
|
||||
let replacement = match c {
|
||||
'ヽ' if previous_result_char.is_some() => 'ゝ',
|
||||
'ヾ' if previous_result_char.is_some() => 'ゞ',
|
||||
_ => c,
|
||||
};
|
||||
|
||||
replace_char_in_place(katakana, byte_index, c, replacement);
|
||||
previous_result_char = Some(replacement);
|
||||
byte_index += c.len_utf8();
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn expand_hiragana_long_vowel_marks_in_place(text: &mut String) {
|
||||
let mut byte_index = 0;
|
||||
let mut previous_result_char = None;
|
||||
|
||||
while byte_index < text.len() {
|
||||
let c = text[byte_index..].chars().next().unwrap();
|
||||
let replacement = match c {
|
||||
'ー' => previous_result_char
|
||||
.and_then(hiragana_vowel_map)
|
||||
.unwrap_or('ー'),
|
||||
_ => c,
|
||||
};
|
||||
|
||||
replace_char_in_place(text, byte_index, c, replacement);
|
||||
previous_result_char = Some(replacement);
|
||||
byte_index += c.len_utf8();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
|
||||
let mut result = katakana.to_string();
|
||||
transliterate_katakana_to_hiragana_script_only_in_place(&mut result);
|
||||
expand_hiragana_long_vowel_marks_in_place(&mut result);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
|
||||
let first_pass = transpose_codepoints_in_range(hiragana, 96, 12353, 12438);
|
||||
pub(crate) fn transliterate_hiragana_to_katakana_in_place(hiragana: &mut String) {
|
||||
transpose_codepoints_in_range_in_place(hiragana, 96, 12353, 12438);
|
||||
|
||||
let mut byte_index = 0;
|
||||
let mut previous_original_char = None;
|
||||
|
||||
// TODO: don't create a new result, just mutate the first pass in-place.
|
||||
// Find instances of long vowels, and convert them into long vowel marks (ー).
|
||||
let chars = first_pass.chars().collect::<Vec<_>>();
|
||||
let mut result = String::with_capacity(chars.len());
|
||||
while byte_index < hiragana.len() {
|
||||
let c = hiragana[byte_index..].chars().next().unwrap();
|
||||
let replacement = match c {
|
||||
'ア' | 'イ' | 'ウ' | 'エ' | 'オ'
|
||||
if previous_original_char
|
||||
.is_some_and(|prev| katakana_vowel_map(prev) == Some(c) || prev == 'ー') =>
|
||||
{
|
||||
'ー'
|
||||
}
|
||||
_ => c,
|
||||
};
|
||||
|
||||
for i in 0..chars.len() {
|
||||
if i == 0 {
|
||||
result.push(chars[i]);
|
||||
} else if ['ア', 'イ', 'ウ', 'エ', 'オ'].contains(&chars[i])
|
||||
&& (katakana_vowel_map(chars[i - 1]) == Some(chars[i]) || chars[i - 1] == 'ー')
|
||||
{
|
||||
result.push('ー');
|
||||
} else {
|
||||
result.push(chars[i]);
|
||||
}
|
||||
replace_char_in_place(hiragana, byte_index, c, replacement);
|
||||
previous_original_char = Some(c);
|
||||
byte_index += c.len_utf8();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
|
||||
let mut result = hiragana.to_string();
|
||||
transliterate_hiragana_to_katakana_in_place(&mut result);
|
||||
result
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
use crate::kana_transliteration::{
|
||||
transliterate_fullwidth_romaji_to_halfwidth_romaji,
|
||||
transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
|
||||
expand_hiragana_long_vowel_marks_in_place, transliterate_fullwidth_romaji_to_halfwidth_romaji,
|
||||
transliterate_halfwidth_katakana_to_fullwidth_katakana,
|
||||
transliterate_katakana_to_hiragana_script_only_in_place,
|
||||
};
|
||||
|
||||
pub fn normalize_japanese_text(text: &str) -> String {
|
||||
let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
|
||||
let text = transliterate_katakana_to_hiragana(&text);
|
||||
|
||||
let mut text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
|
||||
transliterate_katakana_to_hiragana_script_only_in_place(&mut text);
|
||||
expand_hiragana_long_vowel_marks_in_place(&mut text);
|
||||
|
||||
transliterate_fullwidth_romaji_to_halfwidth_romaji(&text)
|
||||
}
|
||||
|
||||
|
||||
+103
-3
@@ -1,12 +1,47 @@
|
||||
/// Checks the arguments of a codepoint transposition when debug assertions are enabled.
///
/// Verifies that the range is not inverted and that shifting either end of it by
/// `distance` stays within `0..=u32::MAX`.
///
/// The function is deliberately not gated on `cfg(debug_assertions)`: its callers invoke
/// it unconditionally, and the body consists solely of `debug_assert!`, which already
/// performs no checks when debug assertions are disabled.
fn debug_assert_transpose_codepoints_range(distance: i32, range_start: u32, range_end: u32) {
    // Widen to i64 so the shifted bounds can be computed without overflowing.
    let shifted_start = i64::from(range_start) + i64::from(distance);
    let shifted_end = i64::from(range_end) + i64::from(distance);

    debug_assert!(
        range_start <= range_end,
        "transpose range start must not exceed its end"
    );
    debug_assert!(
        shifted_start >= 0,
        "shifted transpose range start must not be negative"
    );
    debug_assert!(
        shifted_end <= i64::from(u32::MAX),
        "shifted transpose range end must fit in u32"
    );
}
|
||||
|
||||
/// Returns the number of bytes `codepoint` occupies when encoded as UTF-8.
///
/// Only compiled when debug assertions are enabled. Panics if `codepoint` is not a
/// valid Unicode scalar value.
#[cfg(debug_assertions)]
fn debug_codepoint_utf8_width(codepoint: u32) -> usize {
    std::char::from_u32(codepoint)
        .expect("transpose range endpoints must be valid Unicode scalar values")
        .len_utf8()
}

/// Checks that a transposition keeps the UTF-8 width of the range endpoints.
///
/// Both endpoints of the source range must encode to the same number of bytes, both
/// endpoints of the shifted range must as well, and the two widths must be equal.
/// Only the endpoints are inspected. Panics when any of these conditions fails, when
/// shifting an endpoint overflows `u32`, or when an endpoint is not a valid Unicode
/// scalar value.
#[cfg(debug_assertions)]
fn debug_assert_transpose_codepoints_range_same_width(
    distance: i32,
    range_start: u32,
    range_end: u32,
) {
    let shifted_range_start = range_start
        .checked_add_signed(distance)
        .expect("shifted transpose range start must fit in u32");
    let shifted_range_end = range_end
        .checked_add_signed(distance)
        .expect("shifted transpose range end must fit in u32");

    let source_width = debug_codepoint_utf8_width(range_start);
    debug_assert_eq!(source_width, debug_codepoint_utf8_width(range_end));

    let shifted_width = debug_codepoint_utf8_width(shifted_range_start);
    debug_assert_eq!(shifted_width, debug_codepoint_utf8_width(shifted_range_end));
    debug_assert_eq!(source_width, shifted_width);
}

/// Counterpart used when debug assertions are disabled: performs no checks.
///
/// It exists so that callers can invoke the check unconditionally and still compile
/// in builds without debug assertions.
#[cfg(not(debug_assertions))]
#[inline(always)]
fn debug_assert_transpose_codepoints_range_same_width(
    _distance: i32,
    _range_start: u32,
    _range_end: u32,
) {
}
|
||||
|
||||
/// Transposes codepoints in the specified range by the specified distance.
|
||||
///
|
||||
/// Codepoints outside the range are left unchanged.
|
||||
/// Codepoints that would be transposed to invalid Unicode scalar values are left unchanged.
|
||||
///
|
||||
/// This produces a new string on the heap, see [transpose_codepoints_in_range_in_place] for an in-place variant.
|
||||
pub(crate) fn transpose_codepoints_in_range(
|
||||
text: &str,
|
||||
distance: i32,
|
||||
range_start: u32,
|
||||
range_end: u32,
|
||||
) -> String {
|
||||
debug_assert!(range_start <= range_end);
|
||||
debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
|
||||
debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
|
||||
debug_assert_transpose_codepoints_range(distance, range_start, range_end);
|
||||
|
||||
text.chars()
|
||||
.map(|c| {
|
||||
@@ -19,3 +54,68 @@ pub(crate) fn transpose_codepoints_in_range(
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Overwrites the character `old`, which starts at `byte_index` in `text`, with `new`.
///
/// Nothing is written when the two characters are equal. With debug assertions
/// enabled, the call panics if `old` and `new` differ in UTF-8 width.
#[inline]
pub(crate) fn replace_char_in_place(text: &mut String, byte_index: usize, old: char, new: char) {
    debug_assert_eq!(old.len_utf8(), new.len_utf8());

    if old != new {
        // Four bytes is the longest UTF-8 encoding of a single `char`.
        let mut scratch = [0u8; 4];
        let encoded: &str = new.encode_utf8(&mut scratch);
        let end = byte_index + old.len_utf8();
        text.replace_range(byte_index..end, encoded);
    }
}
|
||||
|
||||
/// Variant of [transpose_codepoints_in_range] that modifies the input string in place.
|
||||
///
|
||||
/// The input and output range must consist of codepoints with the same UTF-8 width,
|
||||
/// otherwise the resulting bytes will overlap with each other, or there will be gaps between them.
|
||||
pub(crate) fn transpose_codepoints_in_range_in_place(
|
||||
text: &mut String,
|
||||
distance: i32,
|
||||
range_start: u32,
|
||||
range_end: u32,
|
||||
) {
|
||||
debug_assert_transpose_codepoints_range(distance, range_start, range_end);
|
||||
debug_assert_transpose_codepoints_range_same_width(distance, range_start, range_end);
|
||||
|
||||
let mut byte_index = 0;
|
||||
|
||||
while byte_index < text.len() {
|
||||
let c = text[byte_index..].chars().next().unwrap();
|
||||
let codepoint = c as u32;
|
||||
let replacement = if range_start <= codepoint && codepoint <= range_end {
|
||||
std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c)
|
||||
} else {
|
||||
c
|
||||
};
|
||||
|
||||
replace_char_in_place(text, byte_index, c, replacement);
|
||||
|
||||
byte_index += c.len_utf8();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Katakana shifted down by 96 codepoints comes out as the matching hiragana.
    #[test]
    fn test_transpose_codepoints_in_range_in_place_same_width() {
        let (range_start, range_end) = (u32::from('ァ'), u32::from('ヶ'));
        let mut text = String::from("カタカナ");

        transpose_codepoints_in_range_in_place(&mut text, -96, range_start, range_end);

        assert_eq!(text, "かたかな");
    }

    /// A range whose endpoints have different UTF-8 widths is expected to panic.
    /// The test is only compiled when debug assertions are enabled.
    #[cfg(debug_assertions)]
    #[test]
    #[should_panic]
    fn test_transpose_codepoints_in_range_in_place_same_width_rejects_mixed_width_range() {
        let (range_start, range_end) = (u32::from('A'), u32::from('Ā'));
        let mut text = String::from("A");

        transpose_codepoints_in_range_in_place(&mut text, 0, range_start, range_end);
    }
}
|
||||
|
||||
Reference in New Issue
Block a user