Add text normalization function
This commit is contained in:
+43
@@ -1,4 +1,5 @@
|
||||
mod kana_transliteration;
|
||||
mod text_normalization;
|
||||
|
||||
use sqlite3_ext::{
|
||||
Connection, FromValue, Result, ValueRef, ValueType, function::Context, sqlite3_ext_fn,
|
||||
@@ -13,6 +14,8 @@ use crate::kana_transliteration::{
|
||||
transliterate_katakana_to_hiragana,
|
||||
};
|
||||
|
||||
use crate::text_normalization::normalize_japanese_text;
|
||||
|
||||
#[sqlite3_ext_fn(n_args = 1, deterministic)]
|
||||
fn hiragana_to_katakana(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
|
||||
if args[0].value_type() != ValueType::Text {
|
||||
@@ -109,6 +112,20 @@ fn halfwidth_katakana_to_fullwidth_katakana(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[sqlite3_ext_fn(n_args = 1, deterministic)]
|
||||
fn normalize_jp(ctx: &mut Context, args: &mut [&mut ValueRef]) -> Result<()> {
|
||||
if args[0].value_type() != ValueType::Text {
|
||||
return Err(sqlite3_ext::Error::Module(
|
||||
"normalize_jp requires a string argument".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let arg = args[0].get_str()?;
|
||||
let result = normalize_japanese_text(arg);
|
||||
ctx.set_result(result)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[sqlite3_ext_main(persistent)]
|
||||
fn init(db: &Connection) -> Result<()> {
|
||||
db.create_scalar_function(
|
||||
@@ -147,6 +164,8 @@ fn init(db: &Connection) -> Result<()> {
|
||||
halfwidth_katakana_to_fullwidth_katakana,
|
||||
)?;
|
||||
|
||||
db.create_scalar_function("normalize_jp", &NORMALIZE_JP_OPTS, normalize_jp)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -181,4 +200,28 @@ mod test {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Verifies that `normalize_jp` can be used in an expression index and that
/// a lookup on the normalized value matches both the hiragana and the
/// katakana spelling of the same kana.
#[test]
fn test_text_normalization() -> Result<()> {
    let conn = setup()?;
    conn.execute("CREATE TABLE test(value TEXT)", Vec::<&str>::new())?;
    // Creating this index exercises `normalize_jp` as an indexed expression.
    conn.execute(
        "CREATE INDEX idx_normalized_jp ON test(normalize_jp(value))",
        Vec::<&str>::new(),
    )?;
    conn.execute(
        "INSERT INTO test(value) VALUES ('あ'), ('ア'), ('い'), ('イ')",
        Vec::<&str>::new(),
    )?;

    // Without ORDER BY, SQLite leaves the row order undefined. Sorting by
    // rowid (insertion order here) makes the exact-order assertion below
    // independent of the query plan.
    let results: Vec<_> = conn
        .prepare("SELECT value FROM test WHERE normalize_jp(value) = 'あ' ORDER BY rowid")?
        .query(())?
        .map(|row| Ok(row[0].get_str()?.to_string()))
        .collect()?;

    assert_eq!(results, vec!["あ".to_string(), "ア".to_string(),]);

    Ok(())
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::kana_transliteration::{
|
||||
transliterate_halfwidth_katakana_to_fullwidth_katakana, transliterate_katakana_to_hiragana,
|
||||
};
|
||||
|
||||
pub fn normalize_japanese_text(text: &str) -> String {
|
||||
let text = transliterate_halfwidth_katakana_to_fullwidth_katakana(text);
|
||||
transliterate_katakana_to_hiragana(&text)
|
||||
}
|
||||
Reference in New Issue
Block a user