diff --git a/src/lib.rs b/src/lib.rs
index b0e1c80..237a3f7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 mod kana_transliteration;
+mod text_normalization; // Module providing `normalize_japanese_text`, used by the `normalize_jp` SQL function.
 
 use sqlite3_ext::{
     Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn,
@@ -13,6 +14,8 @@ use crate::kana_transliteration::{
     transliterate_katakana_to_hiragana,
 };
 
+use crate::text_normalization::normalize_japanese_text;
+
 #[sqlite3_ext_fn(n_args = 1, deterministic)]
 fn hiragana_to_katakana(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
     if args[0].value_type() != ValueType::Text {
@@ -109,6 +112,20 @@ fn halfwidth_katakana_to_fullwidth_katakana(
     Ok(())
 }
 
+#[sqlite3_ext_fn(n_args = 1, deterministic)] // SQL scalar function with exactly one argument, declared deterministic.
+fn normalize_jp(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> { // Sets the SQL result to the normalized form of the TEXT argument.
+    if args[0].value_type() != ValueType::Text { // Every non-TEXT value type is rejected up front with a module error.
+        return Err(sqlite3_ext::Error::Module(
+            "normalize_jp requires a string argument".to_string(),
+        ));
+    } // NOTE(review): a NULL argument presumably takes the error path too - confirm that is intended.
+
+    let arg = args[0].get_str()?; // Propagates the error if the value cannot be read as a string.
+    let result = normalize_japanese_text(arg); // Owned String produced by the text_normalization module.
+    ctx.set_result(result)?;
+    Ok(())
+}
+
 #[sqlite3_ext_main(persistent)]
 fn init(db: &Connection) -> Result<()> {
     db.create_scalar_function(
@@ -147,6 +164,8 @@ fn init(db: &Connection) -> Result<()> {
         halfwidth_katakana_to_fullwidth_katakana,
     )?;
 
+    db.create_scalar_function("normalize_jp", &NORMALIZE_JP_OPTS, normalize_jp)?; // NORMALIZE_JP_OPTS is presumably generated by #[sqlite3_ext_fn] - TODO confirm.
+
     Ok(())
 }
 
@@ -181,4 +200,28 @@ mod test {
 
         Ok(())
     }
+
+    #[test]
+    fn test_text_normalization() -> Result<()> { // Exercises normalize_jp inside an expression index and inside a WHERE clause.
+        let conn = setup()?; // `setup` is defined outside this hunk.
+        conn.execute("CREATE TABLE test(value TEXT)", Vec::<&str>::new())?;
+        conn.execute(
+            "CREATE INDEX idx_normalized_jp ON test(normalize_jp(value))",
+            Vec::<&str>::new(),
+        )?; // Index on an expression: SQLite accepts only deterministic functions here.
+        conn.execute(
+            "INSERT INTO test(value) VALUES ('あ'), ('ア'), ('い'), ('イ')",
+            Vec::<&str>::new(),
+        )?; // Two hiragana/katakana pairs: 'a' and 'i'.
+
+        let results: Vec<_> = conn
+            .prepare("SELECT value FROM test WHERE normalize_jp(value) = 'あ'")?
+            .query(())?
+            .map(|row| Ok(row[0].get_str()?.to_string())) // Copy each matching `value` out of the row.
+            .collect()?;
+
+        assert_eq!(results, vec!["あ".to_string(), "ア".to_string(),]); // Both spellings of 'a' normalize to hiragana; the 'i' rows must not match.
+
+        Ok(())
+    }
 }
diff --git a/src/text_normalization.rs b/src/text_normalization.rs
new file mode 100644
index 0000000..0978b47
--- /dev/null
+++ b/src/text_normalization.rs
@@ -0,0 +1,8 @@
+use crate::kana_transliteration::{
+    transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
+};
+
+pub fn normalize_japanese_text(text: &str) -> String { // Returns `text` after applying the two transliteration passes below, in order.
+    let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text); // Pass 1: half-width katakana to full-width (per the helper's name; helper not shown here).
+    transliterate_katakana_to_hiragana(&text) // Pass 2: katakana to hiragana, applied to the output of pass 1.
+}