kana_transliteration: handle k->h itermark translit, add more tests
This commit is contained in:
+44
-33
@@ -1,24 +1,4 @@
|
||||
/// Shifts every character of `text` whose code point lies in the inclusive
/// range `range_start..=range_end` by `distance` code points.
///
/// Characters outside the range are copied unchanged. A character inside the
/// range is also copied unchanged when the shifted value cannot be represented
/// as a `char` (the addition leaves the `u32` domain, or the result is not a
/// valid Unicode scalar value such as a surrogate).
///
/// # Panics
///
/// In builds with debug assertions enabled, panics if `range_start` is greater
/// than `range_end`, or if shifting either bound by `distance` would leave the
/// `u32` domain.
fn transpose_codepoints_in_range(
    text: &str,
    distance: i32,
    range_start: u32,
    range_end: u32,
) -> String {
    debug_assert!(range_start <= range_end);
    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));

    let range = range_start..=range_end;

    text.chars()
        .map(|c| {
            let codepoint = u32::from(c);
            if range.contains(&codepoint) {
                // Use checked arithmetic so that an out-of-domain shift falls
                // back to the original character (just like an invalid scalar
                // value does) instead of panicking when the debug assertions
                // above are compiled out.
                codepoint
                    .checked_add_signed(distance)
                    .and_then(char::from_u32)
                    .unwrap_or(c)
            } else {
                c
            }
        })
        .collect()
}
|
||||
use crate::util::transpose_codepoints_in_range;
|
||||
|
||||
fn hiragana_vowel_map(hiragana: char) -> Option<char> {
|
||||
match hiragana {
|
||||
@@ -132,22 +112,18 @@ fn katakana_vowel_map(katakana: char) -> Option<char> {
|
||||
// - Basic Latin: https://www.unicode.org/charts/PDF/U0000.pdf
|
||||
// - Hiragana: https://www.unicode.org/charts/PDF/U3040.pdf
|
||||
// - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
|
||||
// - Halfwidth characters: https://www.unicode.org/charts/PDF/UFF00.pdf
|
||||
// - Halfwidth and Fullwidth Forms: https://www.unicode.org/charts/PDF/UFF00.pdf
|
||||
|
||||
pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
|
||||
let first_pass = transpose_codepoints_in_range(katakana, -96, 12449, 12534);
|
||||
|
||||
// Find instances of long vowel marks (ー), and convert them into long vowels.
|
||||
let first_pass = transpose_codepoints_in_range(katakana, -96, 'ア'.into(), 'ヶ'.into());
|
||||
|
||||
// TODO: don't create a new result, just mutate the first pass in-place.
|
||||
let chars = first_pass.chars().collect::<Vec<_>>();
|
||||
let mut result = String::with_capacity(chars.len());
|
||||
|
||||
for (i, c) in chars.iter().enumerate() {
|
||||
if *c == 'ー' {
|
||||
if i == 0 {
|
||||
result.push('ー');
|
||||
} else {
|
||||
match *c {
|
||||
'ー' if i > 0 => {
|
||||
let prev_char = result.chars().last().unwrap();
|
||||
if let Some(vowel) = hiragana_vowel_map(prev_char) {
|
||||
result.push(vowel);
|
||||
@@ -155,8 +131,9 @@ pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
|
||||
result.push('ー');
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result.push(*c);
|
||||
'ヽ' if i > 0 => result.push('ゝ'),
|
||||
'ヾ' if i > 0 => result.push('ゞ'),
|
||||
_ => result.push(*c),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,9 +163,9 @@ pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
|
||||
result
|
||||
}
|
||||
|
||||
pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(halfwidth: &str) -> String {
|
||||
pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(fullwidth: &str) -> String {
|
||||
transpose_codepoints_in_range(
|
||||
&transpose_codepoints_in_range(halfwidth, -65248, 0xFF01, 0xFF5E),
|
||||
&transpose_codepoints_in_range(fullwidth, -65248, 0xFF01, 0xFF5E),
|
||||
-12256,
|
||||
12288,
|
||||
12288,
|
||||
@@ -502,6 +479,35 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_katakana_to_hiragana_dont_convert_standalone_dash() {
    // A long vowel mark at the very start of the input has no preceding kana,
    // so the expected output keeps it verbatim.
    let input = "ーかたかな";
    let converted = transliterate_katakana_to_hiragana(input);
    assert_eq!(converted, "ーかたかな");
}
|
||||
|
||||
#[test]
fn test_katakana_to_hiragana_iteration_markers() {
    // The katakana iteration mark (ヽ) is expected to become its hiragana
    // counterpart (ゝ).
    let converted = transliterate_katakana_to_hiragana("コヽロ");
    assert_eq!(converted, "こゝろ");
}
|
||||
|
||||
#[test]
fn test_katakana_to_hiragana_iteration_markers_dakuten() {
    // The voiced katakana iteration mark (ヾ) is expected to become its
    // hiragana counterpart (ゞ).
    let converted = transliterate_katakana_to_hiragana("サヾエ");
    assert_eq!(converted, "さゞえ");
}
|
||||
|
||||
#[ignore = "FIX ME"]
#[test]
fn test_katakana_to_hiragana_dash_following_iteration_markers() {
    // Currently ignored: a long vowel mark directly after an iteration mark
    // is expected to expand into a vowel.
    let converted = transliterate_katakana_to_hiragana("サヾーエ");
    assert_eq!(converted, "さゞあえ");
}
|
||||
|
||||
#[test]
fn test_katakana_to_hiragana_wi_we() {
    // The historical kana ヰ and ヱ are expected to map to ゐ and ゑ.
    let converted = transliterate_katakana_to_hiragana("ヰヱ");
    assert_eq!(converted, "ゐゑ");
}
|
||||
|
||||
#[test]
|
||||
fn test_hiragana_to_katakana() {
|
||||
assert_eq!(transliterate_hiragana_to_katakana("ひらがな"), "ヒラガナ");
|
||||
@@ -523,6 +529,11 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_hiragana_to_katakana_wi_we() {
    // The historical kana ゐ and ゑ are expected to map to ヰ and ヱ.
    let converted = transliterate_hiragana_to_katakana("ゐゑ");
    assert_eq!(converted, "ヰヱ");
}
|
||||
|
||||
#[test]
|
||||
fn test_fullwidth_romaji_to_halfwidth_romaji() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
mod kana_transliteration;
|
||||
mod text_normalization;
|
||||
mod util;
|
||||
|
||||
use sqlite3_ext::{
|
||||
Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn,
|
||||
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
/// Shifts every character of `text` whose code point lies in the inclusive
/// range `range_start..=range_end` by `distance` code points.
///
/// Characters outside the range are copied unchanged. A character inside the
/// range is also copied unchanged when the shifted value cannot be represented
/// as a `char` (the addition leaves the `u32` domain, or the result is not a
/// valid Unicode scalar value such as a surrogate).
///
/// # Panics
///
/// In builds with debug assertions enabled, panics if `range_start` is greater
/// than `range_end`, or if shifting either bound by `distance` would leave the
/// `u32` domain.
pub(crate) fn transpose_codepoints_in_range(
    text: &str,
    distance: i32,
    range_start: u32,
    range_end: u32,
) -> String {
    debug_assert!(range_start <= range_end);
    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));

    let range = range_start..=range_end;

    text.chars()
        .map(|c| {
            let codepoint = u32::from(c);
            if range.contains(&codepoint) {
                // Use checked arithmetic so that an out-of-domain shift falls
                // back to the original character (just like an invalid scalar
                // value does) instead of panicking when the debug assertions
                // above are compiled out.
                codepoint
                    .checked_add_signed(distance)
                    .and_then(char::from_u32)
                    .unwrap_or(c)
            } else {
                c
            }
        })
        .collect()
}
|
||||
Reference in New Issue
Block a user