From df2df5b4ac62d9d9846b39d3354f9d110f9a78df Mon Sep 17 00:00:00 2001 From: h7x4 Date: Thu, 4 Jun 2026 15:49:30 +0900 Subject: [PATCH] kana_transliteration: handle k->h itermark translit, add more tests --- src/kana_transliteration.rs | 77 +++++++++++++++++++++---------------- src/lib.rs | 1 + src/util.rs | 21 ++++++++++ 3 files changed, 66 insertions(+), 33 deletions(-) create mode 100644 src/util.rs diff --git a/src/kana_transliteration.rs b/src/kana_transliteration.rs index 325657c..a01f317 100644 --- a/src/kana_transliteration.rs +++ b/src/kana_transliteration.rs @@ -1,24 +1,4 @@ -fn transpose_codepoints_in_range( - text: &str, - distance: i32, - range_start: u32, - range_end: u32, -) -> String { - debug_assert!(range_start <= range_end); - debug_assert!(i64::from(range_start) + i64::from(distance) >= 0); - debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX)); - - text.chars() - .map(|c| { - let codepoint = c as u32; - if range_start <= codepoint && codepoint <= range_end { - std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c) - } else { - c - } - }) - .collect() -} +use crate::util::transpose_codepoints_in_range; fn hiragana_vowel_map(hiragana: char) -> Option { match hiragana { @@ -132,22 +112,18 @@ fn katakana_vowel_map(katakana: char) -> Option { // - Basic Latin: https://www.unicode.org/charts/PDF/U0000.pdf // - Hiragana: https://www.unicode.org/charts/PDF/U3040.pdf // - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf -// - Halfwidth characters: https://www.unicode.org/charts/PDF/UFF00.pdf +// - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String { - let first_pass = transpose_codepoints_in_range(katakana, -96, 12449, 12534); - - // Find instances of long vowel marks (ー), and convert them into long vowels. + let first_pass = transpose_codepoints_in_range(katakana, -96, 'ア'.into(), 'ヶ'.into()); // TODO: don't create a new result, just mutate the first pass in-place. let chars = first_pass.chars().collect::>(); let mut result = String::with_capacity(chars.len()); for (i, c) in chars.iter().enumerate() { - if *c == 'ー' { - if i == 0 { - result.push('ー'); - } else { + match *c { + 'ー' if i > 0 => { let prev_char = result.chars().last().unwrap(); if let Some(vowel) = hiragana_vowel_map(prev_char) { result.push(vowel); @@ -155,8 +131,9 @@ pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String { result.push('ー'); } } - } else { - result.push(*c); + 'ヽ' if i > 0 => result.push('ゝ'), + 'ヾ' if i > 0 => result.push('ゞ'), + _ => result.push(*c), } } @@ -186,9 +163,9 @@ pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String { result } -pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(halfwidth: &str) -> String { +pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(fullwidth: &str) -> String { transpose_codepoints_in_range( - &transpose_codepoints_in_range(halfwidth, -65248, 0xFF01, 0xFF5E), + &transpose_codepoints_in_range(fullwidth, -65248, 0xFF01, 0xFF5E), -12256, 12288, 12288, @@ -502,6 +479,35 @@ mod tests { ); } + #[test] + fn test_katakana_to_hiragana_dont_convert_standalone_dash() { + assert_eq!( + transliterate_katakana_to_hiragana("ーかたかな"), + "ーかたかな" + ); + } + + #[test] + fn test_katakana_to_hiragana_iteration_markers() { + assert_eq!(transliterate_katakana_to_hiragana("コヽロ"), "こゝろ"); + } + + #[test] + fn test_katakana_to_hiragana_iteration_markers_dakuten() { + assert_eq!(transliterate_katakana_to_hiragana("サヾエ"), "さゞえ"); + } + + #[ignore = "FIX ME"] + #[test] + fn test_katakana_to_hiragana_dash_following_iteration_markers() { + assert_eq!(transliterate_katakana_to_hiragana("サヾーエ"), "さゞあえ"); + } + + #[test] + fn test_katakana_to_hiragana_wi_we() { + assert_eq!(transliterate_katakana_to_hiragana("ヰヱ"), "ゐゑ"); + } + #[test] fn test_hiragana_to_katakana() { assert_eq!(transliterate_hiragana_to_katakana("ひらがな"), "ヒラガナ"); @@ -523,6 +529,11 @@ mod tests { ); } + #[test] + fn test_hiragana_to_katakana_wi_we() { + assert_eq!(transliterate_hiragana_to_katakana("ゐゑ"), "ヰヱ"); + } + #[test] fn test_fullwidth_romaji_to_halfwidth_romaji() { assert_eq!( diff --git a/src/lib.rs b/src/lib.rs index 1e6f3d3..7705a3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ mod kana_transliteration; mod text_normalization; +mod util; use sqlite3_ext::{ Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn, diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..60df67a --- /dev/null +++ b/src/util.rs @@ -0,0 +1,21 @@ +pub(crate) fn transpose_codepoints_in_range( + text: &str, + distance: i32, + range_start: u32, + range_end: u32, +) -> String { + debug_assert!(range_start <= range_end); + debug_assert!(i64::from(range_start) + i64::from(distance) >= 0); + debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX)); + + text.chars() + .map(|c| { + let codepoint = c as u32; + if range_start <= codepoint && codepoint <= range_end { + std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c) + } else { + c + } + }) + .collect() +}