diff --git a/bin/jadb.dart b/bin/jadb.dart index cb01112..2d6dcc0 100644 --- a/bin/jadb.dart +++ b/bin/jadb.dart @@ -1,6 +1,7 @@ import 'package:args/command_runner.dart'; import 'package:jadb/cli/commands/create_db.dart'; +import 'package:jadb/cli/commands/create_tanos_jlpt_mappings.dart'; import 'package:jadb/cli/commands/query_kanji.dart'; import 'package:jadb/cli/commands/query_word.dart'; @@ -13,6 +14,7 @@ Future main(List args) async { runner.addCommand(CreateDb()); runner.addCommand(QueryKanji()); runner.addCommand(QueryWord()); + runner.addCommand(CreateTanosJlptMappings()); runner.run(args); } diff --git a/flake.lock b/flake.lock index d2445bd..ec8bd8e 100644 --- a/flake.lock +++ b/flake.lock @@ -3,7 +3,7 @@ "jmdict-src": { "flake": false, "locked": { - "narHash": "sha256-qQL58YVurB+EH84PM1m+RbfijWdXVRKkyxvo93e1210=", + "narHash": "sha256-aCHFgCZh21CPiLWRPnZRieaDq5+dG9BnOA9Pcv6G1IY=", "type": "file", "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz" }, @@ -15,7 +15,7 @@ "jmdict-with-examples-src": { "flake": false, "locked": { - "narHash": "sha256-DqxYTNU0p+J08Mh440Bh82IBSmDITDg0mlj+U4gPDBU=", + "narHash": "sha256-B+9l2E1nTUqQQeXXUtQ1oYdR8TL3sVqDUpLqGZdpgv4=", "type": "file", "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e_examp.gz" }, @@ -27,7 +27,7 @@ "kanjidic2-src": { "flake": false, "locked": { - "narHash": "sha256-XYfpEJBmlhYMDcz7Qi7FgMTTkGbd91uiBitVS5ajbOM=", + "narHash": "sha256-XQ6j4XSqipQbaa1TBwu+6iIQRb4eaeqi7W5PSCpf4c0=", "type": "file", "url": "https://www.edrdg.org/kanjidic/kanjidic2.xml.gz" }, @@ -38,11 +38,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1745526057, - "narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=", + "lastModified": 1746904237, + "narHash": "sha256-3e+AVBczosP5dCLQmMoMEogM57gmZ2qrVSrmq9aResQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "f771eb401a46846c1aebd20552521b233dd7e18b", + "rev": "d89fc19e405cb2d55ce7cc114356846a0ee5e956", "type": "github" }, "original": { diff --git a/lib/_data_ingestion/seed_database.dart b/lib/_data_ingestion/seed_database.dart index 2eb2acf..e55fa58 100644 --- a/lib/_data_ingestion/seed_database.dart +++ b/lib/_data_ingestion/seed_database.dart @@ -6,6 +6,8 @@ import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart'; import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart'; import 'package:jadb/_data_ingestion/radkfile/parser.dart'; import 'package:jadb/_data_ingestion/radkfile/seed_data.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/seed_data.dart'; import 'package:sqflite_common/sqlite_api.dart'; import 'package:xml/xml.dart'; @@ -13,6 +15,7 @@ Future seedData(Database db) async { await parseAndSeedDataFromJMdict(db); await parseAndSeedDataFromRADKFILE(db); await parseAndSeedDataFromKANJIDIC(db); + await parseAndSeedDataFromTanosJLPT(db); } Future parseAndSeedDataFromJMdict(Database db) async { @@ -51,5 +54,22 @@ Future parseAndSeedDataFromRADKFILE(Database db) async { final blocks = parseRADKFILEBlocks(raw); print('[RADKFILE] Writing to database...'); - seedRADKFILEData(blocks, db); + await seedRADKFILEData(blocks, db); +} + +Future parseAndSeedDataFromTanosJLPT(Database db) async { + print('[TANOS-JLPT] Reading files...'); + Map files = { + 'N1': File('data/tanos-jlpt/n1.csv'), + 'N2': File('data/tanos-jlpt/n2.csv'), + 'N3': File('data/tanos-jlpt/n3.csv'), + 'N4': File('data/tanos-jlpt/n4.csv'), + 'N5': File('data/tanos-jlpt/n5.csv'), + }; + + print('[TANOS-JLPT] Parsing content...'); + final rankedWords = await parseJLPTRankedWords(files); + + print('[TANOS-JLPT] Writing to database...'); + await seedTanosJLPTData(rankedWords, db); } diff --git a/lib/_data_ingestion/tanos-jlpt/csv_parser.dart b/lib/_data_ingestion/tanos-jlpt/csv_parser.dart new file mode 100644 index 0000000..108aefb --- /dev/null +++ b/lib/_data_ingestion/tanos-jlpt/csv_parser.dart @@ -0,0 +1,61 @@ +import 'dart:convert'; +import 'dart:io'; + +import 'package:csv/csv.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart'; + +Future> parseJLPTRankedWords( + Map files, +) async { + final List result = []; + + for (final entry in files.entries) { + final jlptLevel = entry.key; + final file = entry.value; + + if (!file.existsSync()) { + throw Exception("File $jlptLevel does not exist"); + } + + final rows = await file + .openRead() + .transform(utf8.decoder) + .transform(CsvToListConverter()) + .toList(); + + for (final row in rows) { + if (row.length != 3) { + throw Exception("Invalid line in $jlptLevel: $row"); + } + + final kanji = (row[0] as String).isEmpty + ? null + : (row[0] as String) + .replaceFirst(RegExp('^お・'), '') + .replaceAll(RegExp(r'(.*)'), ''); + + final readings = (row[1] as String) + .split(RegExp('[・/、(:?\s+)]')) + .map((e) => e.trim()) + .toList(); + + final meanings = + (row[2] as String).split(',').expand(cleanMeaning).toList(); + + result.add(JLPTRankedWord( + readings: readings, + kanji: kanji, + jlptLevel: jlptLevel, + meanings: meanings, + )); + } + } + + return result; +} + +List cleanMeaning(String meaning) { + final initialTrim = meaning.trim().replaceAll(RegExp(r'^\d.\s+'), ''); + final woParens = initialTrim.replaceAll(RegExp(r'\s*\(.*?\)\s*'), ''); + return {initialTrim, woParens}.toList(); +} diff --git a/lib/_data_ingestion/tanos-jlpt/objects.dart b/lib/_data_ingestion/tanos-jlpt/objects.dart new file mode 100644 index 0000000..4deaac2 --- /dev/null +++ b/lib/_data_ingestion/tanos-jlpt/objects.dart @@ -0,0 +1,17 @@ +class JLPTRankedWord { + final List readings; + final String? kanji; + final String jlptLevel; + final List meanings; + + const JLPTRankedWord({ + required this.readings, + this.kanji, + required this.jlptLevel, + required this.meanings, + }); + + @override + String toString() => + '(${jlptLevel},${kanji},"${readings.join(",")}","${meanings.join(",")})'; +} diff --git a/lib/_data_ingestion/tanos-jlpt/overrides.dart b/lib/_data_ingestion/tanos-jlpt/overrides.dart new file mode 100644 index 0000000..ca52b47 --- /dev/null +++ b/lib/_data_ingestion/tanos-jlpt/overrides.dart @@ -0,0 +1,314 @@ +const Map<(String?, String), int?> TANOS_JLPT_OVERRIDES = { + // N5: + (null, 'あなた'): 1223615, + (null, 'あの'): 1000430, + (null, 'ある'): 1296400, + (null, 'あれ'): 2847612, + (null, 'いい'): 2820690, + (null, 'いつ'): 1188760, + (null, 'かかる'): 1207590, + (null, 'かぎ'): 1260490, + (null, 'かける'): 1207610, + (null, 'かばん'): 1208910, + ('九', 'きゅう'): 1578150, + (null, 'キロ'): 1042610, + (null, 'コート'): 1049000, + (null, 'ここ'): 1288810, + (null, 'こっち'): 1004500, + (null, 'コップ'): 2846389, + (null, 'この'): 1582920, + (null, 'コピーする'): 1050590, + (null, 'これ'): 2216120, + ('散歩', 'さんぽする'): 1303620, + ('四', 'し'): 1579470, + (null, 'しかし'): 1505990, + ('七', 'しち'): 1319210, + (null, 'じゃ'): 1005900, + ('十', 'じゅう とお'): 1579840, + (null, 'する'): 1157170, + (null, 'せっけん'): 1382590, + ('掃除', 'そうじする'): 1399790, + (null, 'そうして'): 1612860, + (null, 'そこ'): 1006670, + (null, 'その'): 1006830, + (null, 'そば'): 1403830, + (null, 'それ'): 1006970, + (null, 'たいへん'): 1415000, + (null, 'たて'): 1335640, + (null, 'たぶん'): 1407980, + (null, 'だんだん'): 2546180, + (null, 'ちょうど'): 1427340, + (null, 'つける'): 1495770, + (null, 'できる'): 1340450, + (null, 'では'): 1008450, + (null, 'どう'): 1008910, + (null, 'どの'): 1920240, + (null, 'なる'): 2138260, + ('何', 'なん'): 2846738, + (null, 'はい'): 1010080, + (null, 'はく'): 1607260, + (null, 'はし'): 1237410, + (null, 'バス'): 1098390, + (null, 'パン'): 1103090, + (null, 'フォーク'): 1110110, + (null, 'ふろ'): 1500100, + (null, 'ペット'): 1120990, + (null, 'ペン'): 1121380, + ('勉強', 'べんきょうする'): 1512670, + (null, 'ボタン'): 1182880, + (null, 'ほんとう'): 1523060, + ('毎月', 'まいげつ'): 1584350, + ('毎年', 'まいねん'): 1584360, + (null, 'まずい'): 1495000, + (null, 'また'): 1524930, + (null, 'マッチ'): 2784220, + (null, 'もう'): 1012480, + (null, 'やる'): 1012980, + (null, 'ゆっくりと'): 1013050, + (null, 'よく'): 1605870, + (null, 'ラジカセ'): 1138960, + (null, 'りっぱ'): 1551790, + ('練習', 'れんしゅうする'): 1559160, + (null, 'より'): 1013190, + + // N4: + (null, 'あ'): 2394370, + (null, 'ああ'): 2085080, + ('明日', 'あす'): 1584660, + (null, 'あんな'): 1000590, + (null, 'いっぱい'): 1165670, + (null, 'うち'): 1457730, + (null, 'うん'): 1001090, + ('鏡', 'かがみ'): 1238550, + ('堅/硬/固い', 'かたい'): 1257110, + (null, 'くれる'): 1269130, + (null, 'けれど'): 2853889, + (null, 'けんか'): 1257040, + (null, 'こう'): 1004310, + ('工場', 'こうじょう'): 1578700, + (null, 'さっき'): 1005180, + (null, 'すり'): 1567450, + (null, 'そう'): 1006610, + (null, 'たいてい'): 1414580, + (null, 'チェック'): 1077550, + (null, 'ちゃん'): 1007660, + ('妻', 'つま'): 1294330, + (null, 'つもり'): 1382980, + (null, 'とうとう'): 1449890, + (null, 'はず'): 1476430, + (null, 'ひげ'): 1601810, + (null, 'ぶどう'): 1499230, + (null, 'もし'): 2607730, + (null, 'アルバイト'): 1019420, + (null, 'ビル'): 1106010, + (null, 'ベル'): 1120010, + (null, 'レポート'): 1145990, + + // N3: + ('いえ', 'いえ'): 1583250, + ('行き', 'いき'): 1578790, + (null, 'いたずら'): 1151580, + (null, 'いつか'): 1188790, + (null, 'うがい'): 1577660, + ('撃つ', 'うつ'): 1253570, + (null, 'おめでとう'): 1270700, + ('河', 'かわ'): 1390020, + // ('九', 'きゅう'): 1578150, + ('共同', 'きょうどう'): 1591660, + ('腰', 'こし'): 1288340, + // ('四', 'し'): 1579470, + ('お', 'じき'): 1581790, + // ('七', 'しち'): 1319210, + (null, 'しばしば'): 2179090, + (null, 'そっと'): 1006810, + // (null, 'それ'): 1006970, + (null, 'ただ'): 1538900, + (null, 'たとえ'): 1597120, + (null, 'ちょうだい'): 1430230, + // (null, 'できる'): 1340450, + ('度', 'ど'): 1445160, + ('解ける', 'とける'): 1198910, + ('土曜/土曜日', 'どよう'): 1445580, + (null, 'なお'): 1430580, + ('日本', 'にっぽん'): 1582710, + ('熱中する', 'ねっちゅうする'): 1467950, + (null, 'はさみ'): 1573820, + (null, 'ふと'): 1493240, + (null, 'ほぼ'): 1551940, + ('本当', 'ほんとう'): 1523060, + (null, 'まさに'): 1376640, + ('密な', 'みつな'): 2014380, + ('名人', 'めいじん'): 1531680, + ('木曜/木曜日', 'もくよう'): 1534880, + ('尤も', 'もっとも'): 1535810, + ('元', 'もと'): 2219590, + (null, 'やや'): 2771700, + (null, 'ゆっくり'): 1013050, + (null, 'よい'): 1605820, + (null, 'よると'): 1219680, + (null, 'よろしく'): 1224890, + (null, 'カード'): 1036400, + (null, 'グラス'): 1046430, + (null, 'グランド'): 1046840, + (null, 'ケース'): 1047880, + (null, 'コード'): 1049010, + (null, 'ゴール'): 1054230, + (null, 'サイン'): 1056230, + (null, 'ジュース'): 1065950, + (null, 'スイッチ'): 1067210, + (null, 'ダイヤ'): 1076860, + (null, 'トラック'): 1085760, + (null, 'トン'): 1457320, + (null, 'パス'): 1101440, + (null, 'バン'): 1100090, + (null, 'ビール'): 2796520, + (null, 'ピン'): 1107060, + (null, 'プロ'): 1117030, + (null, 'ホーム'): 1121740, + (null, 'ボール'): 1123550, + (null, 'ママ'): 1129240, + (null, 'ミス'): 1130650, + (null, 'ライター'): 1137880, + (null, 'ロケット'): 1147220, + + // N2: + (null, 'アイデア'): 1014210, + (null, 'あいまい'): 1567920, + (null, 'あくび'): 1254010, + ('あひら', 'あひら'): null, + ('一段と', 'いちだんと'): 1164690, + (null, 'いっていらっしゃい'): 2088750, + ('恨む', 'うらむ'): 1289780, + (null, 'おげんきで'): 1260720, + (null, 'おまちください'): 1002360, + (null, 'カーブ'): 1036560, + (null, 'ガム'): 1040350, + (null, 'カラー'): 1038500, + ('基盤', 'きばん'): 1219170, + ('けれど/けれども', ''): 2853889, + (null, 'ごぞんじですか'): 1593570, + (null, 'コック'): 1050310, + (null, 'こぼれる'): 1557650, + ('しいんと', '(する)'): 1631970, + ('じゅうたん', '(カーペット)'): 1595370, + ('滑れる', 'ずれる'): 1006460, + (null, 'せめて'): 1006570, + ('だいいち', '(副)'): 1415270, + (null, 'ちぎる'): 1389020, + (null, 'チップ'): 1077740, + (null, 'チョーク'): 1078240, + (null, 'ついで'): 1345470, + ('統一', 'とういつ'): 1449670, + ('傾らか', 'なだらか'): 1632290, + ('虹', 'にじ'): 1463740, + ('×', 'ばつ'): 2197150, + (null, 'バック'): 1098760, + (null, 'ブラシ'): 1611450, + (null, 'ブローチ'): 1114910, + (null, 'へそ'): 1571170, + (null, 'へる'): 1263120, + (null, 'ぺん'): 1121380, + ('坊っちゃん', 'ぼっちゃん'): 1603720, + (null, 'ぼろ'): 1572500, + (null, 'マラソン'): 1129290, + ('ミリ', '(メートル)'): 1131830, + (null, 'メーター'): 1132530, + (null, 'やたらに'): 1537780, + (null, 'ランチ'): 1140100, + (null, 'レベル'): 1145910, + (null, 'ロッカー'): 1147560, + (null, 'ワンピース'): 1149240, + + // N1: + ('愛憎', 'あいにく'): 1379210, + ('灰', 'あく'): 1201960, + (null, 'あら'): 1000520, + ('依', 'い'): 2252790, + ('伊井', 'いい'): 2252790, + (null, 'いく'): 1578850, + (null, 'えい'): 1001130, + ('於', 'お'): 1485770, + (null, 'おおい'): 2853873, + ('音色', 'おんいろ'): 1576910, + ('仮', 'か'): 1187290, + ('割', 'かつ'): 1208840, + ('日付', 'かづけ'): 1464340, + ('下品', 'かひん'): null, + ('乾', 'かん'): null, + ('蓋', 'がい'): 1204570, + ('学芸', 'がくげい'): 1206720, + ('共', 'きょう'): 1578040, + ('供', 'きょう'): 1233550, + ('僅', 'きん'): null, + ('傾', 'けい'): null, + ('巨', 'こ'): null, + ('煌々と', 'こうこうと'): 1569020, + ('濠', 'ごう'): 1956270, + ('佐', 'さ'): null, + ('真実', 'さな'): 1363780, + (null, 'し'): 2542020, + ('次', 'し'): 1579580, + (null, 'しいんと'): 1631970, + ('湿気る', 'しける'): 1320420, + ('傷', 'しょう'): null, + ('前', 'せん'): 1387100, + ('誰', 'たれ'): 1370860, + ('著', 'ちゃく'): 1597470, + ('中腹', 'ちゅうっぱら'): 1425430, + ('長大', 'ちょうだい'): 1610340, + ('沈黙', 'ちんもく'): 1431810, + ('途中', 'つちゅう'): 1582200, + ('伝言', 'つてごと'): 2545840, + (null, 'と'): 1008490, + ('働', 'どう'): null, + ('同盟', 'どうめい'): 1599290, + ('南', 'なん'): 1460830, + (null, 'なんか'): 1009500, + ('悪い', 'にくい'): 1403390, + ('乗っ取る', 'のっとる'): 1354700, + ('灰皿', 'はいさら'): 1201940, + ('伐', 'ばつ'): null, + ('一筋', 'ひとすき'): 1575970, + ('復旧', 'ふくきゅう'): 1500730, + ('保育', 'ほいく'): 1513290, + ('倣', 'ほう'): null, + (null, 'ぼつぼつ'): 1011840, + ('哉', 'や'): 1537760, + ('夜行', 'やぎょう'): 1584820, + ('矢鱈に', 'やたらに'): 1537780, + (null, 'ゆとり'): 1013070, + ('流', 'りゅう'): 1552090, + ('了', 'りょう'): 1553290, + ('藁', 'わら'): 1562710, + (null, 'アクセル'): 1015290, + (null, 'アップ'): 1016610, + (null, 'イエス'): 1021010, + (null, 'キャリア'): 1041980, + (null, 'クラブ'): 1243910, + (null, 'シート'): 1059450, + (null, 'シック'): 1060610, + (null, 'ソース'): 1075270, + (null, 'ソックス'): 1075420, + (null, 'タイム'): 1076010, + (null, 'ダース'): 1076650, + (null, 'デザート'): 1082850, + (null, 'ニュー'): 1091340, + (null, 'ハンガー'): 1096430, + (null, 'バット'): 1099230, + (null, 'パンク'): 1103110, + (null, 'ファン'): 1108540, + (null, 'フォーム'): 1110340, + (null, 'ベース'): 1119210, + (null, 'ベスト'): 1119530, + (null, 'ペア'): 1120690, + (null, 'ホース'): 1121630, + (null, 'ホール'): 1122130, + (null, 'ボルト'): 1124390, + (null, 'ポーズ'): 1124650, + (null, 'ポンプ'): 1126040, + (null, 'マーク'): 1126140, + (null, 'ランプ'): 1140360, + (null, 'レース'): 1144380, + (null, 'レディー'): 1145590, + (null, 'レバー'): 1145720, +}; diff --git a/lib/_data_ingestion/tanos-jlpt/resolve.dart b/lib/_data_ingestion/tanos-jlpt/resolve.dart new file mode 100644 index 0000000..68a3270 --- /dev/null +++ b/lib/_data_ingestion/tanos-jlpt/resolve.dart @@ -0,0 +1,106 @@ +import 'package:jadb/_data_ingestion/jmdict/table_names.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/overrides.dart'; +import 'package:jadb/util/sqlite_utils.dart'; +import 'package:sqflite_common/sqlite_api.dart'; + +Future> _findReadingCandidates( + JLPTRankedWord word, + Database db, +) => + db + .query( + JMdictTableNames.readingElement, + columns: ['entryId'], + where: + 'reading IN (${word.readings.map((e) => escapeStringValue(e)).join(',')})', + ) + .then((rows) => rows.map((row) => row['entryId'] as int).toList()); + +Future> _findKanjiCandidates( + JLPTRankedWord word, + Database db, +) => + db + .query( + JMdictTableNames.kanjiElement, + columns: ['entryId'], + where: 'reading = ?', + whereArgs: [word.kanji], + ) + .then((rows) => rows.map((row) => row['entryId'] as int).toList()); + +Future> _findSenseCandidates( + JLPTRankedWord word, + Database db, +) => + db + .rawQuery('SELECT entryId, phrase ' + 'FROM ${JMdictTableNames.senseGlossary} ' + 'JOIN ${JMdictTableNames.sense} ON ${JMdictTableNames.senseGlossary}.senseId = ${JMdictTableNames.sense}.id ' + 'WHERE phrase IN (${word.meanings.map((e) => escapeStringValue(e)).join(',')})') + .then((rows) => rows + .map((row) => (row['entryId'] as int, row['phrase'] as String)) + .toList()); + +Future findEntry( + JLPTRankedWord word, + Database db, { + bool useOverrides = true, +}) async { + final List readingCandidates = await _findReadingCandidates(word, db); + final List<(int, String)> senseCandidates = + await _findSenseCandidates(word, db); + + List entryIds; + + if (word.kanji != null) { + final List kanjiCandidates = await _findKanjiCandidates(word, db); + + entryIds = readingCandidates + .where((readingId) => kanjiCandidates.contains(readingId)) + .toList(); + + if (entryIds.isEmpty) { + print('No entry found, trying to combine with senses'); + + entryIds = readingCandidates + .where((readingId) => + senseCandidates.any((sense) => sense.$1 == readingId)) + .toList(); + } + } else { + entryIds = readingCandidates; + } + + if ((entryIds.isEmpty || entryIds.length > 1) && useOverrides) { + print('No entry found, trying to fetch from overrides'); + final overrideEntries = word.readings + .map((reading) => TANOS_JLPT_OVERRIDES[(word.kanji, reading)]) + .whereType() + .toSet(); + + if (overrideEntries.length > 1) { + throw Exception( + 'Multiple override entries found for ${word.toString()}: $entryIds'); + } else if (overrideEntries.length == 0 && + !word.readings.any((reading) => + TANOS_JLPT_OVERRIDES.containsKey((word.kanji, reading)))) { + throw Exception( + 'No override entry found for ${word.toString()}: $entryIds'); + } + + print('Found override: ${overrideEntries.firstOrNull}'); + + return overrideEntries.firstOrNull; + } + + if (entryIds.length > 1) { + throw Exception( + 'Multiple override entries found for ${word.toString()}: $entryIds'); + } else if (entryIds.isEmpty) { + throw Exception('No entry found for ${word.toString()}'); + } + + return entryIds.first; +} diff --git a/lib/_data_ingestion/tanos-jlpt/seed_data.dart b/lib/_data_ingestion/tanos-jlpt/seed_data.dart new file mode 100644 index 0000000..ee13bd7 --- /dev/null +++ b/lib/_data_ingestion/tanos-jlpt/seed_data.dart @@ -0,0 +1,11 @@ +import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart'; +import 'package:sqflite_common/sqlite_api.dart'; + +Future seedTanosJLPTData( + List rankedWords, + Database db, +) async { + throw UnimplementedError( + 'This function is not implemented yet. Please implement it.', + ); +} diff --git a/lib/cli/commands/create_tanos_jlpt_mappings.dart b/lib/cli/commands/create_tanos_jlpt_mappings.dart new file mode 100644 index 0000000..f9e10ae --- /dev/null +++ b/lib/cli/commands/create_tanos_jlpt_mappings.dart @@ -0,0 +1,94 @@ +import 'dart:io'; + +import 'package:jadb/_data_ingestion/open_local_db.dart'; + +import 'package:args/command_runner.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart'; +import 'package:jadb/_data_ingestion/tanos-jlpt/resolve.dart'; +import 'package:jadb/cli/args.dart'; +import 'package:sqflite_common/sqlite_api.dart'; + +class CreateTanosJlptMappings extends Command { + final name = "create-tanos-jlpt-mappings"; + final description = + "Resolve Tanos JLPT data against JMDict. This tool is useful to create overrides for ambiguous references"; + + CreateTanosJlptMappings() { + addLibsqliteArg(argParser); + addJadbArg(argParser); + + argParser.addFlag( + 'overrides', + abbr: 'o', + help: 'Whether to use existing overrides when resolving', + defaultsTo: false, + ); + } + + Future run() async { + if (argResults!.option('libsqlite') == null || + argResults!.option('jadb') == null) { + print(argParser.usage); + exit(64); + } + + final db = await openLocalDb( + jadbPath: argResults!.option('jadb')!, + libsqlitePath: argResults!.option('libsqlite')!, + ); + + final useOverrides = argResults!.flag('overrides'); + + Map files = { + 'N1': File('data/tanos-jlpt/n1.csv'), + 'N2': File('data/tanos-jlpt/n2.csv'), + 'N3': File('data/tanos-jlpt/n3.csv'), + 'N4': File('data/tanos-jlpt/n4.csv'), + 'N5': File('data/tanos-jlpt/n5.csv'), + }; + + final rankedWords = await parseJLPTRankedWords(files); + + await resolveExisting(rankedWords, db, useOverrides); + } +} + +Future resolveExisting( + List rankedWords, + Database db, + bool useOverrides, +) async { + List missingWords = []; + for (final (i, word) in rankedWords.indexed) { + try { + print( + '[${(i + 1).toString().padLeft(4, '0')}/${rankedWords.length}] ${word.toString()}'); + await findEntry(word, db, useOverrides: useOverrides); + } catch (e) { + print(e); + missingWords.add(word); + } + } + + print('Missing entries:'); + for (final word in missingWords) { + print(word.toString()); + } + + print('Statistics:'); + for (final jlptLevel in ['N5', 'N4', 'N3', 'N2', 'N1']) { + final missingWordCount = + missingWords.where((e) => e.jlptLevel == jlptLevel).length; + final totalWordCount = + rankedWords.where((e) => e.jlptLevel == jlptLevel).length; + + final failureRate = + ((missingWordCount / totalWordCount) * 100).toStringAsFixed(2); + + print( + '${jlptLevel} failures: [${missingWordCount}/${totalWordCount}] (${failureRate}%)'); + } + + print('Not able to determine the entry for ${missingWords.length} words'); +} diff --git a/lib/search/word_search.dart b/lib/search/word_search.dart index 65e9854..6910c40 100644 --- a/lib/search/word_search.dart +++ b/lib/search/word_search.dart @@ -10,6 +10,7 @@ import 'package:jadb/models/word_search/word_search_ruby.dart'; import 'package:jadb/models/word_search/word_search_sense.dart'; import 'package:jadb/models/word_search/word_search_sources.dart'; import 'package:jadb/models/word_search/word_search_xref_entry.dart'; +import 'package:jadb/util/sqlite_utils.dart'; import 'package:sqflite_common/sqlite_api.dart'; // TODO: Support globs @@ -20,10 +21,6 @@ import 'package:sqflite_common/sqlite_api.dart'; // TODO: Support mixing kana and romaji -String _escapeStringValue(String value) { - return "'" + value.replaceAll("'", "''") + "'"; -} - Future?> searchWordWithDbConnection( DatabaseExecutor connection, String word, { @@ -171,7 +168,7 @@ Future?> searchWordWithDbConnection( final readingIds = readingElements .map((element) => ( element['entryId'] as int, - _escapeStringValue(element['reading'] as String) + escapeStringValue(element['reading'] as String) )) .toList(); @@ -194,7 +191,7 @@ Future?> searchWordWithDbConnection( final kanjiIds = kanjiElements .map((element) => ( element['entryId'] as int, - _escapeStringValue(element['reading'] as String) + escapeStringValue(element['reading'] as String) )) .toList(); diff --git a/lib/util/sqlite_utils.dart b/lib/util/sqlite_utils.dart new file mode 100644 index 0000000..08ad633 --- /dev/null +++ b/lib/util/sqlite_utils.dart @@ -0,0 +1,3 @@ +String escapeStringValue(String value) { + return "'" + value.replaceAll("'", "''") + "'"; +} diff --git a/migrations/0001_JMDict.sql b/migrations/0001_JMDict.sql index 8e904b3..c6b29ab 100644 --- a/migrations/0001_JMDict.sql +++ b/migrations/0001_JMDict.sql @@ -50,6 +50,8 @@ CREATE TABLE "JMdict_KanjiElement" ( PRIMARY KEY ("entryId", "reading") ) WITHOUT ROWID; +CREATE INDEX "JMdict_KanjiElement_byReading" ON "JMdict_KanjiElement"("reading"); + CREATE TABLE "JMdict_KanjiElementInfo" ( "entryId" INTEGER NOT NULL, "reading" TEXT NOT NULL, @@ -74,6 +76,8 @@ CREATE TABLE "JMdict_ReadingElement" ( PRIMARY KEY ("entryId", "reading") ) WITHOUT ROWID; +CREATE INDEX "JMdict_ReadingElement_byReading" ON "JMdict_ReadingElement"("reading"); + CREATE TABLE "JMdict_ReadingElementRestriction" ( "entryId" INTEGER NOT NULL, "reading" TEXT NOT NULL, @@ -216,6 +220,8 @@ CREATE TABLE "JMdict_SenseGlossary" ( PRIMARY KEY ("senseId", "language", "phrase") ) WITHOUT ROWID; +CREATE INDEX "JMdict_SenseGlossary_byPhrase" ON JMdict_SenseGlossary("phrase"); + CREATE TABLE "JMdict_SenseInfo" ( "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("id"), "info" TEXT NOT NULL, diff --git a/pubspec.lock b/pubspec.lock index b00f476..51db131 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -81,6 +81,14 @@ packages: url: "https://pub.dev" source: hosted version: "3.0.6" + csv: + dependency: "direct main" + description: + name: csv + sha256: c6aa2679b2a18cb57652920f674488d89712efaf4d3fdf2e537215b35fc19d6c + url: "https://pub.dev" + source: hosted + version: "6.0.0" equatable: dependency: "direct main" description: diff --git a/pubspec.yaml b/pubspec.yaml index b97300f..832f101 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -9,6 +9,7 @@ environment: dependencies: args: ^2.7.0 collection: ^1.19.1 + csv: ^6.0.0 equatable: ^2.0.7 sqflite_common: ^2.5.5 sqflite_common_ffi: ^2.3.5