From 0d21cfa4a15e2434fe50ddd35f2ef6a6d541ec20 Mon Sep 17 00:00:00 2001
From: h7x4
Date: Thu, 7 May 2026 16:32:01 +0900
Subject: [PATCH] Add basic kana transliteration functions

---
Review notes (text in this section is not part of the commit message):
* The halfwidth katakana (U+FF66..U+FF9F) and fullwidth romaji
  (U+FF01..U+FF5E) literals below must be kept byte-exact. Unicode
  compatibility normalization (NFKC) folds them into their counterparts,
  which turns the conversion tables into identity mappings and makes the
  width tests meaningless.
* NOTE(review): the author e-mail is missing from the From: header, and
  the indentation and blank context lines of the src/lib.rs hunks were
  re-inferred from the hunk line counts -- confirm against commit 0d21cfa4
  before applying.

 src/kana_transliteration.rs | 381 ++++++++++++++++++++++++++++++++++++
 src/lib.rs                  | 186 ++++++++++++++----
 2 files changed, 524 insertions(+), 43 deletions(-)
 create mode 100644 src/kana_transliteration.rs

diff --git a/src/kana_transliteration.rs b/src/kana_transliteration.rs
new file mode 100644
index 0000000..f542206
--- /dev/null
+++ b/src/kana_transliteration.rs
@@ -0,0 +1,381 @@
+fn transpose_codepoints_in_range(
+    text: &str,
+    distance: i32,
+    range_start: u32,
+    range_end: u32,
+) -> String {
+    debug_assert!(range_start <= range_end);
+    debug_assert!(i64::from(range_start) + i64::from(distance) >= 0);
+    debug_assert!(i64::from(range_end) + i64::from(distance) <= i64::from(u32::MAX));
+
+    text.chars()
+        .map(|c| {
+            let codepoint = c as u32;
+            if range_start <= codepoint && codepoint <= range_end {
+                std::char::from_u32(codepoint.strict_add_signed(distance)).unwrap_or(c)
+            } else {
+                c
+            }
+        })
+        .collect()
+}
+
+// See charts:
+// - Basic Latin: https://www.unicode.org/charts/PDF/U0000.pdf
+// - Hiragana: https://www.unicode.org/charts/PDF/U3040.pdf
+// - Katakana: https://www.unicode.org/charts/PDF/U30A0.pdf
+// - Halfwidth characters: https://www.unicode.org/charts/PDF/UFF00.pdf
+
+pub fn transliterate_katakana_to_hiragana(katakana: &str) -> String {
+    transpose_codepoints_in_range(katakana, -96, 12449, 12534)
+}
+
+pub fn transliterate_hiragana_to_katakana(hiragana: &str) -> String {
+    transpose_codepoints_in_range(hiragana, 96, 12353, 12438)
+}
+
+pub fn transliterate_fullwidth_romaji_to_halfwidth_romaji(halfwidth: &str) -> String {
+    transpose_codepoints_in_range(
+        &transpose_codepoints_in_range(halfwidth, -65248, 0xFF01, 0xFF5E),
+        -12256,
+        12288,
+        12288,
+    )
+}
+
+pub fn transliterate_halfwidth_romaji_to_fullwidth_romaji(halfwidth: &str) -> String {
+    transpose_codepoints_in_range(
+        &transpose_codepoints_in_range(halfwidth, 65248, 0x21, 0x7E),
+        12256,
+        32,
+        32,
+    )
+}
+
+pub fn transliterate_halfwidth_katakana_to_fullwidth_katakana(halfwidth: &str) -> String {
+    let mut result = String::with_capacity(halfwidth.len());
+    let mut chars = halfwidth.chars().peekable();
+    while let Some(c) = chars.next() {
+        let out: &[char] = match c {
+            'ｦ' => &['ヲ'],
+            'ｧ' => &['ァ'],
+            'ｨ' => &['ィ'],
+            'ｩ' => &['ゥ'],
+            'ｪ' => &['ェ'],
+            'ｫ' => &['ォ'],
+            'ｬ' => &['ャ'],
+            'ｭ' => &['ュ'],
+            'ｮ' => &['ョ'],
+            'ｯ' => &['ッ'],
+            'ｰ' => &['ー'],
+            'ｱ' => &['ア'],
+            'ｲ' => &['イ'],
+            'ｳ' => &['ウ'],
+            'ｴ' => &['エ'],
+            'ｵ' => &['オ'],
+            'ｶ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ガ']
+            }
+            'ｶ' => &['カ'],
+            'ｷ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ギ']
+            }
+            'ｷ' => &['キ'],
+            'ｸ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['グ']
+            }
+            'ｸ' => &['ク'],
+            'ｹ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ゲ']
+            }
+            'ｹ' => &['ケ'],
+            'ｺ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ゴ']
+            }
+            'ｺ' => &['コ'],
+            'ｻ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ザ']
+            }
+            'ｻ' => &['サ'],
+            'ｼ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ジ']
+            }
+            'ｼ' => &['シ'],
+            'ｽ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ズ']
+            }
+            'ｽ' => &['ス'],
+            'ｾ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ゼ']
+            }
+            'ｾ' => &['セ'],
+            'ｿ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ゾ']
+            }
+            'ｿ' => &['ソ'],
+            'ﾀ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ダ']
+            }
+            'ﾀ' => &['タ'],
+            'ﾁ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ヂ']
+            }
+            'ﾁ' => &['チ'],
+            'ﾂ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ヅ']
+            }
+            'ﾂ' => &['ツ'],
+            'ﾃ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['デ']
+            }
+            'ﾃ' => &['テ'],
+            'ﾄ' if chars.peek() == Some(&'ﾞ') => {
+                chars.next();
+                &['ド']
+            }
+            'ﾄ' => &['ト'],
+            'ﾅ' => &['ナ'],
+            'ﾆ' => &['ニ'],
+            'ﾇ' => &['ヌ'],
+            'ﾈ' => &['ネ'],
+            'ﾉ' => &['ノ'],
+            'ﾊ' => match chars.peek() {
+                Some(&'ﾞ') => {
+                    chars.next();
+                    &['バ']
+                }
+                Some(&'ﾟ') => {
+                    chars.next();
+                    &['パ']
+                }
+                _ => &['ハ'],
+            },
+            'ﾋ' => match chars.peek() {
+                Some(&'ﾞ') => {
+                    chars.next();
+                    &['ビ']
+                }
+                Some(&'ﾟ') => {
+                    chars.next();
+                    &['ピ']
+                }
+                _ => &['ヒ'],
+            },
+            'ﾌ' => match chars.peek() {
+                Some(&'ﾞ') => {
+                    chars.next();
+                    &['ブ']
+                }
+                Some(&'ﾟ') => {
+                    chars.next();
+                    &['プ']
+                }
+                _ => &['フ'],
+            },
+            'ﾍ' => match chars.peek() {
+                Some(&'ﾞ') => {
+                    chars.next();
+                    &['ベ']
+                }
+                Some(&'ﾟ') => {
+                    chars.next();
+                    &['ペ']
+                }
+                _ => &['ヘ'],
+            },
+            'ﾎ' => match chars.peek() {
+                Some(&'ﾞ') => {
+                    chars.next();
+                    &['ボ']
+                }
+                Some(&'ﾟ') => {
+                    chars.next();
+                    &['ポ']
+                }
+                _ => &['ホ'],
+            },
+            'ﾏ' => &['マ'],
+            'ﾐ' => &['ミ'],
+            'ﾑ' => &['ム'],
+            'ﾒ' => &['メ'],
+            'ﾓ' => &['モ'],
+            'ﾔ' => &['ヤ'],
+            'ﾕ' => &['ユ'],
+            'ﾖ' => &['ヨ'],
+            'ﾗ' => &['ラ'],
+            'ﾘ' => &['リ'],
+            'ﾙ' => &['ル'],
+            'ﾚ' => &['レ'],
+            'ﾛ' => &['ロ'],
+            'ﾜ' => &['ワ'],
+            'ﾝ' => &['ン'],
+            _ => &[c],
+        };
+        result.push_str(out.iter().collect::<String>().as_str());
+    }
+
+    return result;
+}
+
+pub fn transliterate_fullwidth_katakana_to_halfwidth_katakana(fullwidth: &str) -> String {
+    let mut result = String::with_capacity(fullwidth.len() * 2);
+    for c in fullwidth.chars() {
+        let out: &[char] = match c {
+            'ヲ' => &['ｦ'],
+            'ァ' => &['ｧ'],
+            'ィ' => &['ｨ'],
+            'ゥ' => &['ｩ'],
+            'ェ' => &['ｪ'],
+            'ォ' => &['ｫ'],
+            'ャ' => &['ｬ'],
+            'ュ' => &['ｭ'],
+            'ョ' => &['ｮ'],
+            'ッ' => &['ｯ'],
+            'ー' => &['ｰ'],
+            'ア' => &['ｱ'],
+            'イ' => &['ｲ'],
+            'ウ' => &['ｳ'],
+            'エ' => &['ｴ'],
+            'オ' => &['ｵ'],
+            'カ' => &['ｶ'],
+            'キ' => &['ｷ'],
+            'ク' => &['ｸ'],
+            'ケ' => &['ｹ'],
+            'コ' => &['ｺ'],
+            'サ' => &['ｻ'],
+            'シ' => &['ｼ'],
+            'ス' => &['ｽ'],
+            'セ' => &['ｾ'],
+            'ソ' => &['ｿ'],
+            'タ' => &['ﾀ'],
+            'チ' => &['ﾁ'],
+            'ツ' => &['ﾂ'],
+            'テ' => &['ﾃ'],
+            'ト' => &['ﾄ'],
+            'ナ' => &['ﾅ'],
+            'ニ' => &['ﾆ'],
+            'ヌ' => &['ﾇ'],
+            'ネ' => &['ﾈ'],
+            'ノ' => &['ﾉ'],
+            'ハ' => &['ﾊ'],
+            'ヒ' => &['ﾋ'],
+            'フ' => &['ﾌ'],
+            'ヘ' => &['ﾍ'],
+            'ホ' => &['ﾎ'],
+            'マ' => &['ﾏ'],
+            'ミ' => &['ﾐ'],
+            'ム' => &['ﾑ'],
+            'メ' => &['ﾒ'],
+            'モ' => &['ﾓ'],
+            'ヤ' => &['ﾔ'],
+            'ユ' => &['ﾕ'],
+            'ヨ' => &['ﾖ'],
+            'ラ' => &['ﾗ'],
+            'リ' => &['ﾘ'],
+            'ル' => &['ﾙ'],
+            'レ' => &['ﾚ'],
+            'ロ' => &['ﾛ'],
+            'ワ' => &['ﾜ'],
+            'ン' => &['ﾝ'],
+            'ガ' => &['ｶ', 'ﾞ'],
+            'ギ' => &['ｷ', 'ﾞ'],
+            'グ' => &['ｸ', 'ﾞ'],
+            'ゲ' => &['ｹ', 'ﾞ'],
+            'ゴ' => &['ｺ', 'ﾞ'],
+            'ザ' => &['ｻ', 'ﾞ'],
+            'ジ' => &['ｼ', 'ﾞ'],
+            'ズ' => &['ｽ', 'ﾞ'],
+            'ゼ' => &['ｾ', 'ﾞ'],
+            'ゾ' => &['ｿ', 'ﾞ'],
+            'ダ' => &['ﾀ', 'ﾞ'],
+            'ヂ' => &['ﾁ', 'ﾞ'],
+            'ヅ' => &['ﾂ', 'ﾞ'],
+            'デ' => &['ﾃ', 'ﾞ'],
+            'ド' => &['ﾄ', 'ﾞ'],
+            'バ' => &['ﾊ', 'ﾞ'],
+            'ビ' => &['ﾋ', 'ﾞ'],
+            'ブ' => &['ﾌ', 'ﾞ'],
+            'ベ' => &['ﾍ', 'ﾞ'],
+            'ボ' => &['ﾎ', 'ﾞ'],
+            'パ' => &['ﾊ', 'ﾟ'],
+            'ピ' => &['ﾋ', 'ﾟ'],
+            'プ' => &['ﾌ', 'ﾟ'],
+            'ペ' => &['ﾍ', 'ﾟ'],
+            'ポ' => &['ﾎ', 'ﾟ'],
+            _ => &[c],
+        };
+
+        result.push_str(out.iter().collect::<String>().as_str());
+    }
+
+    return result;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_katakana_to_hiragana() {
+        assert_eq!(transliterate_katakana_to_hiragana("カタカナ"), "かたかな");
+    }
+
+    #[test]
+    fn test_hiragana_to_katakana() {
+        assert_eq!(transliterate_hiragana_to_katakana("ひらがな"), "ヒラガナ");
+    }
+
+    #[test]
+    fn test_fullwidth_romaji_to_halfwidth_romaji() {
+        assert_eq!(
+            transliterate_fullwidth_romaji_to_halfwidth_romaji("ＡＢＣ１２３"),
+            "ABC123"
+        );
+    }
+
+    #[test]
+    fn test_halfwidth_romaji_to_fullwidth_romaji() {
+        assert_eq!(
+            transliterate_halfwidth_romaji_to_fullwidth_romaji("ABC123"),
+            "ＡＢＣ１２３"
+        );
+    }
+
+    #[test]
+    fn test_halfwidth_katakana_to_fullwidth_katakana() {
+        assert_eq!(
+            transliterate_halfwidth_katakana_to_fullwidth_katakana("ｶﾀｶﾅ"),
+            "カタカナ"
+        );
+        assert_eq!(
+            transliterate_halfwidth_katakana_to_fullwidth_katakana("ｶﾞﾀﾞｶﾞﾅﾋﾟ"),
+            "ガダガナピ"
+        );
+    }
+
+    #[test]
+    fn test_fullwidth_katakana_to_halfwidth_katakana() {
+        assert_eq!(
+            transliterate_fullwidth_katakana_to_halfwidth_katakana("カタカナ"),
+            "ｶﾀｶﾅ"
+        );
+        assert_eq!(
+            transliterate_fullwidth_katakana_to_halfwidth_katakana("ガダガナピ"),
+            "ｶﾞﾀﾞｶﾞﾅﾋﾟ"
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5b2725c..b0e1c80 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,30 +1,151 @@
+mod kana_transliteration;
+
 use sqlite3_ext::{
     Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn,
     sqlite3_ext_main,
 };
 
-fn add(a: i32, b: i32) -> i32 {
-    a + b
-}
+use crate::kana_transliteration::{
+    transliterate_fullwidth_katakana_to_halfwidth_katakana,
+    transliterate_fullwidth_romaji_to_halfwidth_romaji,
+    transliterate_halfwidth_katakana_to_fullwidth_katakana,
+    transliterate_halfwidth_romaji_to_fullwidth_romaji, transliterate_hiragana_to_katakana,
+    transliterate_katakana_to_hiragana,
+};
 
-#[sqlite3_ext_fn(n_args = 2, deterministic)]
-fn add_sqlite(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
-    if args[0].value_type() != ValueType::Integer || args[1].value_type() != ValueType::Integer {
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn hiragana_to_katakana(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
         return Err(sqlite3_ext::Error::Module(
-            "myadd requires both arguments to be an integer".to_string(),
+            "hiragana_to_katakana requires a string argument".to_string(),
         ));
     }
 
-    let a = args[0].get_i32();
-    let b = args[1].get_i32();
-    let result = add(a, b);
+    let arg = args[0].get_str()?;
+    let result = transliterate_hiragana_to_katakana(arg);
+    ctx.set_result(result)?;
+    Ok(())
+}
+
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn katakana_to_hiragana(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
+        return Err(sqlite3_ext::Error::Module(
+            "katakana_to_hiragana requires a string argument".to_string(),
+        ));
+    }
+
+    let arg = args[0].get_str()?;
+    let result = transliterate_katakana_to_hiragana(arg);
+    ctx.set_result(result)?;
+    Ok(())
+}
+
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn fullwidth_romaji_to_halfwidth_romaji(
+    ctx: &mut Context,
+    args: &mut [&mut ValueRef],
+) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
+        return Err(sqlite3_ext::Error::Module(
+            "fullwidth_romaji_to_halfwidth_romaji requires a string argument".to_string(),
+        ));
+    }
+
+    let arg = args[0].get_str()?;
+    let result = transliterate_fullwidth_romaji_to_halfwidth_romaji(arg);
+    ctx.set_result(result)?;
+    Ok(())
+}
+
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn halfwidth_romaji_to_fullwidth_romaji(
+    ctx: &mut Context,
+    args: &mut [&mut ValueRef],
+) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
+        return Err(sqlite3_ext::Error::Module(
+            "halfwidth_romaji_to_fullwidth_romaji requires a string argument".to_string(),
+        ));
+    }
+
+    let arg = args[0].get_str()?;
+    let result = transliterate_halfwidth_romaji_to_fullwidth_romaji(arg);
+    ctx.set_result(result)?;
+    Ok(())
+}
+
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn fullwidth_katakana_to_halfwidth_katakana(
+    ctx: &mut Context,
+    args: &mut [&mut ValueRef],
+) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
+        return Err(sqlite3_ext::Error::Module(
+            "fullwidth_katakana_to_halfwidth_katakana requires a string argument".to_string(),
+        ));
+    }
+
+    let arg = args[0].get_str()?;
+    let result = transliterate_fullwidth_katakana_to_halfwidth_katakana(arg);
+    ctx.set_result(result)?;
+    Ok(())
+}
+
+#[sqlite3_ext_fn(n_args = 1, deterministic)]
+fn halfwidth_katakana_to_fullwidth_katakana(
+    ctx: &mut Context,
+    args: &mut [&mut ValueRef],
+) -> Result<()> {
+    if args[0].value_type() != ValueType::Text {
+        return Err(sqlite3_ext::Error::Module(
+            "halfwidth_katakana_to_fullwidth_katakana requires a string argument".to_string(),
+        ));
+    }
+
+    let arg = args[0].get_str()?;
+    let result = transliterate_halfwidth_katakana_to_fullwidth_katakana(arg);
     ctx.set_result(result)?;
     Ok(())
 }
 
 #[sqlite3_ext_main(persistent)]
 fn init(db: &Connection) -> Result<()> {
-    db.create_scalar_function("myadd", &ADD_SQLITE_OPTS, add_sqlite)?;
+    db.create_scalar_function(
+        "hiragana_to_katakana",
+        &HIRAGANA_TO_KATAKANA_OPTS,
+        hiragana_to_katakana,
+    )?;
+
+    db.create_scalar_function(
+        "katakana_to_hiragana",
+        &KATAKANA_TO_HIRAGANA_OPTS,
+        katakana_to_hiragana,
+    )?;
+
+    db.create_scalar_function(
+        "fullwidth_romaji_to_halfwidth_romaji",
+        &FULLWIDTH_ROMAJI_TO_HALFWIDTH_ROMAJI_OPTS,
+        fullwidth_romaji_to_halfwidth_romaji,
+    )?;
+
+    db.create_scalar_function(
+        "halfwidth_romaji_to_fullwidth_romaji",
+        &HALFWIDTH_ROMAJI_TO_FULLWIDTH_ROMAJI_OPTS,
+        halfwidth_romaji_to_fullwidth_romaji,
+    )?;
+
+    db.create_scalar_function(
+        "fullwidth_katakana_to_halfwidth_katakana",
+        &FULLWIDTH_KATAKANA_TO_HALFWIDTH_KATAKANA_OPTS,
+        fullwidth_katakana_to_halfwidth_katakana,
+    )?;
+
+    db.create_scalar_function(
+        "halfwidth_katakana_to_fullwidth_katakana",
+        &HALFWIDTH_KATAKANA_TO_FULLWIDTH_KATAKANA_OPTS,
+        halfwidth_katakana_to_fullwidth_katakana,
+    )?;
     Ok(())
 }
 
@@ -41,44 +162,23 @@ mod test {
     }
 
     #[test]
-    fn test_working() -> Result<()> {
+    fn test_basic_sqlite_query() -> Result<()> {
         let conn = setup()?;
-        let results: Vec<i64> = conn
-            .prepare("SELECT myadd(10, 20)")?
+        let results: Vec<_> = conn
+            .prepare(
+                "SELECT
+                  hiragana_to_katakana('ひらがな'),
+                  katakana_to_hiragana('カタカナ')
+                ",
+            )?
             .query(())?
-            .map(|row| Ok(row[0].get_i64()))
+            .map(|row| Ok((row[0].get_str()?.to_string(), row[1].get_str()?.to_string())))
             .collect()?;
-        assert_eq!(results, vec![30]);
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_wrong_arg_number() -> Result<()> {
-        let conn = setup()?;
-        let result = conn.prepare("SELECT myadd(10)");
         assert_eq!(
-            result.unwrap_err(),
-            Error::Sqlite(
-                1,
-                Some("wrong number of arguments to function myadd()".to_string()),
-            )
+            results,
+            vec![("ヒラガナ".to_string(), "かたかな".to_string())]
         );
-        Ok(())
-    }
 
-    #[test]
-    fn test_wrong_arg_type() -> Result<()> {
-        let conn = setup()?;
-        let mut statement = conn.prepare("SELECT myadd(10, 'hello!')")?;
-        let result = statement.query(())?.next();
-        assert_eq!(
-            result.unwrap_err(),
-            Error::Sqlite(
-                1,
-                Some("myadd requires both arguments to be an integer".to_string()),
-            )
-        );
         Ok(())
     }
 }