kana_transliteration: handle k->h itermark translit, add more tests

This commit is contained in:
2026-06-04 15:49:30 +09:00
parent fb6a018f77
commit df2df5b4ac
3 changed files with 66 additions and 33 deletions
+44 -33
View File
@@ -1,24 +1,4 @@
/// Returns a copy of `text` in which every character whose code point falls
/// within the inclusive range `range_start..=range_end` is moved by `distance`
/// code points; all other characters are copied through untouched.
///
/// If a shifted value is not a valid Unicode scalar value, the original
/// character is kept instead.
///
/// Debug builds assert that the range is well-formed and that shifting either
/// end of it stays within `u32`.
fn transpose_codepoints_in_range(
    text: &str,
    distance: i32,
    range_start: u32,
    range_end: u32,
) -> String {
    debug_assert!(range_start <= range_end);
    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));

    let affected = range_start..=range_end;
    let mut shifted = String::new();
    for original in text.chars() {
        let scalar = u32::from(original);
        let replacement = if affected.contains(&scalar) {
            // Fall back to the untouched character when the shifted value is
            // not a valid scalar value (e.g. it lands on a surrogate).
            char::from_u32(scalar.strict_add_signed(distance)).unwrap_or(original)
        } else {
            original
        };
        shifted.push(replacement);
    }
    shifted
}
use crate::util::transpose_codepoints_in_range;
fn hiragana_vowel_map(hiragana: char) -> Option<char> {
match hiragana {
@@ -132,22 +112,18 @@ fn katakana_vowel_map(katakana: char) -> Option<char> {
// - Basic Latin: https://www.unicode.org/charts/PDF/U0000.pdf
// - Hiragana: https://www.unicode.org/charts/PDF/U3040.pdf
// - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
// - Halfwidth characters: https://www.unicode.org/charts/PDF/UFF00.pdf
// - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
let first_pass = transpose_codepoints_in_range(katakana, -96, 12449, 12534);
// Find instances of long vowel marks (ー), and convert them into long vowels.
let first_pass = transpose_codepoints_in_range(katakana, -96, 'ア'.into(), 'ヶ'.into());
// TODO: don't create a new result, just mutate the first pass in-place.
let chars = first_pass.chars().collect::<Vec<_>>();
let mut result = String::with_capacity(chars.len());
for (i, c) in chars.iter().enumerate() {
if *c == 'ー' {
if i == 0 {
result.push('ー');
} else {
match *c {
'ー' if i > 0 => {
let prev_char = result.chars().last().unwrap();
if let Some(vowel) = hiragana_vowel_map(prev_char) {
result.push(vowel);
@@ -155,8 +131,9 @@ pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
result.push('ー');
}
}
} else {
result.push(*c);
'ヽ' if i > 0 => result.push('ゝ'),
'ヾ' if i > 0 => result.push('ゞ'),
_ => result.push(*c),
}
}
@@ -186,9 +163,9 @@ pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
result
}
pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(halfwidth: &str) -> String {
pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(fullwidth: &str) -> String {
transpose_codepoints_in_range(
&transpose_codepoints_in_range(halfwidth, -65248, 0xFF01, 0xFF5E),
&transpose_codepoints_in_range(fullwidth, -65248, 0xFF01, 0xFF5E),
-12256,
12288,
12288,
@@ -502,6 +479,35 @@ mod tests {
);
}
#[test]
fn test_katakana_to_hiragana_dont_convert_standalone_dash() {
    // A long vowel mark at the very start has no preceding kana, so it is
    // expected to come back unchanged.
    let input = "ーかたかな";
    let converted = transliterate_katakana_to_hiragana(input);
    assert_eq!(converted, "ーかたかな");
}
#[test]
fn test_katakana_to_hiragana_iteration_markers() {
    // The katakana iteration mark ヽ should become its hiragana counterpart ゝ.
    let converted = transliterate_katakana_to_hiragana("コヽロ");
    assert_eq!(converted, "こゝろ");
}
#[test]
fn test_katakana_to_hiragana_iteration_markers_dakuten() {
    // The voiced katakana iteration mark ヾ should become the voiced hiragana
    // mark ゞ.
    let converted = transliterate_katakana_to_hiragana("サヾエ");
    assert_eq!(converted, "さゞえ");
}
// NOTE(review): this test is ignored pending a fix; presumably a long vowel
// mark that follows an iteration mark is not yet expanded into a vowel —
// confirm against the transliteration loop before enabling.
#[ignore = "FIX ME"]
#[test]
fn test_katakana_to_hiragana_dash_following_iteration_markers() {
    let converted = transliterate_katakana_to_hiragana("サヾーエ");
    assert_eq!(converted, "さゞあえ");
}
#[test]
fn test_katakana_to_hiragana_wi_we() {
    // The historical kana ヰ and ヱ should map to ゐ and ゑ.
    let converted = transliterate_katakana_to_hiragana("ヰヱ");
    assert_eq!(converted, "ゐゑ");
}
#[test]
fn test_hiragana_to_katakana() {
assert_eq!(transliterate_hiragana_to_katakana("ひらがな"), "ヒラガナ");
@@ -523,6 +529,11 @@ mod tests {
);
}
#[test]
fn test_hiragana_to_katakana_wi_we() {
    // The historical kana ゐ and ゑ should map to ヰ and ヱ.
    let converted = transliterate_hiragana_to_katakana("ゐゑ");
    assert_eq!(converted, "ヰヱ");
}
#[test]
fn test_fullwidth_romaji_to_halfwidth_romaji() {
assert_eq!(
+1
View File
@@ -1,5 +1,6 @@
mod kana_transliteration;
mod text_normalization;
mod util;
use sqlite3_ext::{
Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn,
+21
View File
@@ -0,0 +1,21 @@
/// Shifts the characters of `text` that lie in the inclusive code point range
/// `range_start..=range_end` by `distance` code points and returns the result
/// as a new `String`.
///
/// Characters outside the range are left alone, as is any in-range character
/// whose shifted value would not be a valid Unicode scalar value.
///
/// In debug builds the arguments are checked: the range must not be inverted,
/// and shifting either end of it must stay within the bounds of `u32`.
pub(crate) fn transpose_codepoints_in_range(
    text: &str,
    distance: i32,
    range_start: u32,
    range_end: u32,
) -> String {
    debug_assert!(range_start <= range_end);
    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));

    let shift = |c: char| -> char {
        match u32::from(c) {
            // Outside the requested range: pass the character through.
            cp if cp < range_start || cp > range_end => c,
            // Inside the range: move it, keeping the original when the target
            // is not a valid scalar value.
            cp => char::from_u32(cp.strict_add_signed(distance)).unwrap_or(c),
        }
    };
    text.chars().map(shift).collect()
}