WIP: Add tanos jlpt tags to database

This commit is contained in:
2025-04-29 21:11:09 +02:00
parent adb2d687fb
commit fa4353bae0
14 changed files with 653 additions and 13 deletions

View File

@@ -1,6 +1,7 @@
import 'package:args/command_runner.dart';
import 'package:jadb/cli/commands/create_db.dart';
import 'package:jadb/cli/commands/create_tanos_jlpt_mappings.dart';
import 'package:jadb/cli/commands/query_kanji.dart';
import 'package:jadb/cli/commands/query_word.dart';
@@ -13,6 +14,7 @@ Future<void> main(List<String> args) async {
runner.addCommand(CreateDb());
runner.addCommand(QueryKanji());
runner.addCommand(QueryWord());
runner.addCommand(CreateTanosJlptMappings());
runner.run(args);
}

12
flake.lock generated
View File

@@ -3,7 +3,7 @@
"jmdict-src": {
"flake": false,
"locked": {
"narHash": "sha256-qQL58YVurB+EH84PM1m+RbfijWdXVRKkyxvo93e1210=",
"narHash": "sha256-aCHFgCZh21CPiLWRPnZRieaDq5+dG9BnOA9Pcv6G1IY=",
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz"
},
@@ -15,7 +15,7 @@
"jmdict-with-examples-src": {
"flake": false,
"locked": {
"narHash": "sha256-DqxYTNU0p+J08Mh440Bh82IBSmDITDg0mlj+U4gPDBU=",
"narHash": "sha256-B+9l2E1nTUqQQeXXUtQ1oYdR8TL3sVqDUpLqGZdpgv4=",
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e_examp.gz"
},
@@ -27,7 +27,7 @@
"kanjidic2-src": {
"flake": false,
"locked": {
"narHash": "sha256-XYfpEJBmlhYMDcz7Qi7FgMTTkGbd91uiBitVS5ajbOM=",
"narHash": "sha256-XQ6j4XSqipQbaa1TBwu+6iIQRb4eaeqi7W5PSCpf4c0=",
"type": "file",
"url": "https://www.edrdg.org/kanjidic/kanjidic2.xml.gz"
},
@@ -38,11 +38,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1745526057,
"narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=",
"lastModified": 1746904237,
"narHash": "sha256-3e+AVBczosP5dCLQmMoMEogM57gmZ2qrVSrmq9aResQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "f771eb401a46846c1aebd20552521b233dd7e18b",
"rev": "d89fc19e405cb2d55ce7cc114356846a0ee5e956",
"type": "github"
},
"original": {

View File

@@ -6,6 +6,8 @@ import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/seed_data.dart';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
@@ -13,6 +15,7 @@ Future<void> seedData(Database db) async {
await parseAndSeedDataFromJMdict(db);
await parseAndSeedDataFromRADKFILE(db);
await parseAndSeedDataFromKANJIDIC(db);
await parseAndSeedDataFromTanosJLPT(db);
}
Future<void> parseAndSeedDataFromJMdict(Database db) async {
@@ -51,5 +54,22 @@ Future<void> parseAndSeedDataFromRADKFILE(Database db) async {
final blocks = parseRADKFILEBlocks(raw);
print('[RADKFILE] Writing to database...');
seedRADKFILEData(blocks, db);
await seedRADKFILEData(blocks, db);
}
/// Reads the Tanos JLPT word lists (N1 through N5) from `data/tanos-jlpt/`,
/// parses them with [parseJLPTRankedWords] and passes the result on to
/// [seedTanosJLPTData] together with [db].
Future<void> parseAndSeedDataFromTanosJLPT(Database db) async {
  print('[TANOS-JLPT] Reading files...');
  // One CSV file per JLPT level, keyed by the level label.
  final csvFilesByLevel = <String, File>{
    'N1': File('data/tanos-jlpt/n1.csv'),
    'N2': File('data/tanos-jlpt/n2.csv'),
    'N3': File('data/tanos-jlpt/n3.csv'),
    'N4': File('data/tanos-jlpt/n4.csv'),
    'N5': File('data/tanos-jlpt/n5.csv'),
  };

  print('[TANOS-JLPT] Parsing content...');
  final parsedWords = await parseJLPTRankedWords(csvFilesByLevel);

  print('[TANOS-JLPT] Writing to database...');
  await seedTanosJLPTData(parsedWords, db);
}

View File

@@ -0,0 +1,61 @@
import 'dart:convert';
import 'dart:io';
import 'package:csv/csv.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
/// Parses the Tanos JLPT CSV word lists into [JLPTRankedWord]s.
///
/// [files] maps a JLPT level label (for example `'N5'`) to the CSV file for
/// that level. Every row must have exactly three columns, read here as kanji,
/// reading(s) and meaning(s). Readings are split on `・`, `/`, `、` and
/// whitespace; meanings are split on `,` and expanded by [cleanMeaning].
///
/// Throws an [Exception] when a file does not exist or a row does not have
/// exactly three columns.
Future<List<JLPTRankedWord>> parseJLPTRankedWords(
  Map<String, File> files,
) async {
  // Must be a raw string: in a regular literal `\s` is just the letter `s`,
  // and the previous `(:?\s+)` lived inside a character class, so readings
  // were split on `(`, `)`, `:`, `?`, `+` and `s` instead of on whitespace.
  final readingSeparator = RegExp(r'[・/、]|\s+');

  final List<JLPTRankedWord> result = [];

  for (final entry in files.entries) {
    final jlptLevel = entry.key;
    final file = entry.value;

    if (!file.existsSync()) {
      throw Exception("File $jlptLevel does not exist");
    }

    final rows = await file
        .openRead()
        .transform(utf8.decoder)
        .transform(CsvToListConverter())
        .toList();

    for (final row in rows) {
      if (row.length != 3) {
        throw Exception("Invalid line in $jlptLevel: $row");
      }

      // NOTE(review): `.*` matches the whole string, so as written every
      // non-empty kanji column collapses to ''. Presumably a narrower pattern
      // (such as a bracketed annotation) was intended -- confirm against the
      // source data before relying on `kanji`.
      final kanji = (row[0] as String).isEmpty
          ? null
          : (row[0] as String)
              .replaceFirst(RegExp('^お・'), '')
              .replaceAll(RegExp(r'.*'), '');

      final readings = (row[1] as String)
          .split(readingSeparator)
          .map((e) => e.trim())
          .toList();

      final meanings =
          (row[2] as String).split(',').expand(cleanMeaning).toList();

      result.add(JLPTRankedWord(
        readings: readings,
        kanji: kanji,
        jlptLevel: jlptLevel,
        meanings: meanings,
      ));
    }
  }

  return result;
}
/// Normalises a single meaning taken from the CSV.
///
/// The text is trimmed and a leading "digit + any character + whitespace"
/// prefix (such as `1. `) is removed. The result is returned first, followed
/// by a variant with every parenthesised part removed when that variant
/// differs from the first.
List<String> cleanMeaning(String meaning) {
  final numberingPrefix = RegExp(r'^\d.\s+');
  final parenthesised = RegExp(r'\s*\(.*?\)\s*');

  final stripped = meaning.trim().replaceAll(numberingPrefix, '');
  final bare = stripped.replaceAll(parenthesised, '');

  // Same as de-duplicating the pair while keeping insertion order.
  return [stripped, if (bare != stripped) bare];
}

View File

@@ -0,0 +1,17 @@
/// A single word from a Tanos JLPT word list.
class JLPTRankedWord {
  /// The readings listed for the word.
  final List<String> readings;

  /// The kanji spelling, or `null` when none is given.
  final String? kanji;

  /// The JLPT level label the word was listed under (for example `'N5'`).
  final String jlptLevel;

  /// The meanings listed for the word.
  final List<String> meanings;

  const JLPTRankedWord({
    required this.readings,
    this.kanji,
    required this.jlptLevel,
    required this.meanings,
  });

  /// Renders the word as `(level,kanji,"readings","meanings")`.
  ///
  /// The closing quote after the meanings was previously missing, which left
  /// the output with unbalanced quotes.
  @override
  String toString() =>
      '($jlptLevel,$kanji,"${readings.join(",")}","${meanings.join(",")}")';
}

View File

@@ -0,0 +1,314 @@
/// Manual overrides used when resolving Tanos JLPT words, keyed by
/// `(kanji, reading)` where `kanji` may be `null`.
///
/// `findEntry` looks a word up here (once per reading) when automatic
/// matching does not produce exactly one entry. A non-null value is the entry
/// id to use. A `null` value makes `findEntry` return `null` instead of
/// throwing -- presumably marking words with no suitable entry; confirm.
///
/// NOTE(review): several keys occur more than once as written here (for
/// example `('', '')`, `(null, '')` and `('', 'きょう')`), and a const map
/// literal may not contain equal keys. Some key characters may have been lost
/// along the way -- verify these entries against the original word lists.
const Map<(String?, String), int?> TANOS_JLPT_OVERRIDES = {
// N5:
(null, 'あなた'): 1223615,
(null, 'あの'): 1000430,
(null, 'ある'): 1296400,
(null, 'あれ'): 2847612,
(null, 'いい'): 2820690,
(null, 'いつ'): 1188760,
(null, 'かかる'): 1207590,
(null, 'かぎ'): 1260490,
(null, 'かける'): 1207610,
(null, 'かばん'): 1208910,
('', 'きゅう'): 1578150,
(null, 'キロ'): 1042610,
(null, 'コート'): 1049000,
(null, 'ここ'): 1288810,
(null, 'こっち'): 1004500,
(null, 'コップ'): 2846389,
(null, 'この'): 1582920,
(null, 'コピーする'): 1050590,
(null, 'これ'): 2216120,
('散歩', 'さんぽする'): 1303620,
('', ''): 1579470,
(null, 'しかし'): 1505990,
('', 'しち'): 1319210,
(null, 'じゃ'): 1005900,
('', 'じゅう とお'): 1579840,
(null, 'する'): 1157170,
(null, 'せっけん'): 1382590,
('掃除', 'そうじする'): 1399790,
(null, 'そうして'): 1612860,
(null, 'そこ'): 1006670,
(null, 'その'): 1006830,
(null, 'そば'): 1403830,
(null, 'それ'): 1006970,
(null, 'たいへん'): 1415000,
(null, 'たて'): 1335640,
(null, 'たぶん'): 1407980,
(null, 'だんだん'): 2546180,
(null, 'ちょうど'): 1427340,
(null, 'つける'): 1495770,
(null, 'できる'): 1340450,
(null, 'では'): 1008450,
(null, 'どう'): 1008910,
(null, 'どの'): 1920240,
(null, 'なる'): 2138260,
('', 'なん'): 2846738,
(null, 'はい'): 1010080,
(null, 'はく'): 1607260,
(null, 'はし'): 1237410,
(null, 'バス'): 1098390,
(null, 'パン'): 1103090,
(null, 'フォーク'): 1110110,
(null, 'ふろ'): 1500100,
(null, 'ペット'): 1120990,
(null, 'ペン'): 1121380,
('勉強', 'べんきょうする'): 1512670,
(null, 'ボタン'): 1182880,
(null, 'ほんとう'): 1523060,
('毎月', 'まいげつ'): 1584350,
('毎年', 'まいねん'): 1584360,
(null, 'まずい'): 1495000,
(null, 'また'): 1524930,
(null, 'マッチ'): 2784220,
(null, 'もう'): 1012480,
(null, 'やる'): 1012980,
(null, 'ゆっくりと'): 1013050,
(null, 'よく'): 1605870,
(null, 'ラジカセ'): 1138960,
(null, 'りっぱ'): 1551790,
('練習', 'れんしゅうする'): 1559160,
(null, 'より'): 1013190,
// N4:
(null, ''): 2394370,
(null, 'ああ'): 2085080,
('明日', 'あす'): 1584660,
(null, 'あんな'): 1000590,
(null, 'いっぱい'): 1165670,
(null, 'うち'): 1457730,
(null, 'うん'): 1001090,
('', 'かがみ'): 1238550,
('堅/硬/固い', 'かたい'): 1257110,
(null, 'くれる'): 1269130,
(null, 'けれど'): 2853889,
(null, 'けんか'): 1257040,
(null, 'こう'): 1004310,
('工場', 'こうじょう'): 1578700,
(null, 'さっき'): 1005180,
(null, 'すり'): 1567450,
(null, 'そう'): 1006610,
(null, 'たいてい'): 1414580,
(null, 'チェック'): 1077550,
(null, 'ちゃん'): 1007660,
('', 'つま'): 1294330,
(null, 'つもり'): 1382980,
(null, 'とうとう'): 1449890,
(null, 'はず'): 1476430,
(null, 'ひげ'): 1601810,
(null, 'ぶどう'): 1499230,
(null, 'もし'): 2607730,
(null, 'アルバイト'): 1019420,
(null, 'ビル'): 1106010,
(null, 'ベル'): 1120010,
(null, 'レポート'): 1145990,
// N3:
('いえ', 'いえ'): 1583250,
('行き', 'いき'): 1578790,
(null, 'いたずら'): 1151580,
(null, 'いつか'): 1188790,
(null, 'うがい'): 1577660,
('撃つ', 'うつ'): 1253570,
(null, 'おめでとう'): 1270700,
('', 'かわ'): 1390020,
// ('九', 'きゅう'): 1578150,
('共同', 'きょうどう'): 1591660,
('', 'こし'): 1288340,
// ('四', 'し'): 1579470,
('', 'じき'): 1581790,
// ('七', 'しち'): 1319210,
(null, 'しばしば'): 2179090,
(null, 'そっと'): 1006810,
// (null, 'それ'): 1006970,
(null, 'ただ'): 1538900,
(null, 'たとえ'): 1597120,
(null, 'ちょうだい'): 1430230,
// (null, 'できる'): 1340450,
('', ''): 1445160,
('解ける', 'とける'): 1198910,
('土曜/土曜日', 'どよう'): 1445580,
(null, 'なお'): 1430580,
('日本', 'にっぽん'): 1582710,
('熱中する', 'ねっちゅうする'): 1467950,
(null, 'はさみ'): 1573820,
(null, 'ふと'): 1493240,
(null, 'ほぼ'): 1551940,
('本当', 'ほんとう'): 1523060,
(null, 'まさに'): 1376640,
('密な', 'みつな'): 2014380,
('名人', 'めいじん'): 1531680,
('木曜/木曜日', 'もくよう'): 1534880,
('尤も', 'もっとも'): 1535810,
('', 'もと'): 2219590,
(null, 'やや'): 2771700,
(null, 'ゆっくり'): 1013050,
(null, 'よい'): 1605820,
(null, 'よると'): 1219680,
(null, 'よろしく'): 1224890,
(null, 'カード'): 1036400,
(null, 'グラス'): 1046430,
(null, 'グランド'): 1046840,
(null, 'ケース'): 1047880,
(null, 'コード'): 1049010,
(null, 'ゴール'): 1054230,
(null, 'サイン'): 1056230,
(null, 'ジュース'): 1065950,
(null, 'スイッチ'): 1067210,
(null, 'ダイヤ'): 1076860,
(null, 'トラック'): 1085760,
(null, 'トン'): 1457320,
(null, 'パス'): 1101440,
(null, 'バン'): 1100090,
(null, 'ビール'): 2796520,
(null, 'ピン'): 1107060,
(null, 'プロ'): 1117030,
(null, 'ホーム'): 1121740,
(null, 'ボール'): 1123550,
(null, 'ママ'): 1129240,
(null, 'ミス'): 1130650,
(null, 'ライター'): 1137880,
(null, 'ロケット'): 1147220,
// N2:
(null, 'アイデア'): 1014210,
(null, 'あいまい'): 1567920,
(null, 'あくび'): 1254010,
('あひら', 'あひら'): null,
('一段と', 'いちだんと'): 1164690,
(null, 'いっていらっしゃい'): 2088750,
('恨む', 'うらむ'): 1289780,
(null, 'おげんきで'): 1260720,
(null, 'おまちください'): 1002360,
(null, 'カーブ'): 1036560,
(null, 'ガム'): 1040350,
(null, 'カラー'): 1038500,
('基盤', 'きばん'): 1219170,
('けれど/けれども', ''): 2853889,
(null, 'ごぞんじですか'): 1593570,
(null, 'コック'): 1050310,
(null, 'こぼれる'): 1557650,
('しいんと', '(する)'): 1631970,
('じゅうたん', '(カーペット)'): 1595370,
('滑れる', 'ずれる'): 1006460,
(null, 'せめて'): 1006570,
('だいいち', '(副)'): 1415270,
(null, 'ちぎる'): 1389020,
(null, 'チップ'): 1077740,
(null, 'チョーク'): 1078240,
(null, 'ついで'): 1345470,
('統一', 'とういつ'): 1449670,
('傾らか', 'なだらか'): 1632290,
('', 'にじ'): 1463740,
('×', 'ばつ'): 2197150,
(null, 'バック'): 1098760,
(null, 'ブラシ'): 1611450,
(null, 'ブローチ'): 1114910,
(null, 'へそ'): 1571170,
(null, 'へる'): 1263120,
(null, 'ぺん'): 1121380,
('坊っちゃん', 'ぼっちゃん'): 1603720,
(null, 'ぼろ'): 1572500,
(null, 'マラソン'): 1129290,
('ミリ', '(メートル)'): 1131830,
(null, 'メーター'): 1132530,
(null, 'やたらに'): 1537780,
(null, 'ランチ'): 1140100,
(null, 'レベル'): 1145910,
(null, 'ロッカー'): 1147560,
(null, 'ワンピース'): 1149240,
// N1:
('愛憎', 'あいにく'): 1379210,
('', 'あく'): 1201960,
(null, 'あら'): 1000520,
('', ''): 2252790,
('伊井', 'いい'): 2252790,
(null, 'いく'): 1578850,
(null, 'えい'): 1001130,
('', ''): 1485770,
(null, 'おおい'): 2853873,
('音色', 'おんいろ'): 1576910,
('', ''): 1187290,
('', 'かつ'): 1208840,
('日付', 'かづけ'): 1464340,
('下品', 'かひん'): null,
('', 'かん'): null,
('', 'がい'): 1204570,
('学芸', 'がくげい'): 1206720,
('', 'きょう'): 1578040,
('', 'きょう'): 1233550,
('', 'きん'): null,
('', 'けい'): null,
('', ''): null,
('煌々と', 'こうこうと'): 1569020,
('', 'ごう'): 1956270,
('', ''): null,
('真実', 'さな'): 1363780,
(null, ''): 2542020,
('', ''): 1579580,
(null, 'しいんと'): 1631970,
('湿気る', 'しける'): 1320420,
('', 'しょう'): null,
('', 'せん'): 1387100,
('', 'たれ'): 1370860,
('', 'ちゃく'): 1597470,
('中腹', 'ちゅうっぱら'): 1425430,
('長大', 'ちょうだい'): 1610340,
('沈黙', 'ちんもく'): 1431810,
('途中', 'つちゅう'): 1582200,
('伝言', 'つてごと'): 2545840,
(null, ''): 1008490,
('', 'どう'): null,
('同盟', 'どうめい'): 1599290,
('', 'なん'): 1460830,
(null, 'なんか'): 1009500,
('悪い', 'にくい'): 1403390,
('乗っ取る', 'のっとる'): 1354700,
('灰皿', 'はいさら'): 1201940,
('', 'ばつ'): null,
('一筋', 'ひとすき'): 1575970,
('復旧', 'ふくきゅう'): 1500730,
('保育', 'ほいく'): 1513290,
('', 'ほう'): null,
(null, 'ぼつぼつ'): 1011840,
('', ''): 1537760,
('夜行', 'やぎょう'): 1584820,
('矢鱈に', 'やたらに'): 1537780,
(null, 'ゆとり'): 1013070,
('', 'りゅう'): 1552090,
('', 'りょう'): 1553290,
('', 'わら'): 1562710,
(null, 'アクセル'): 1015290,
(null, 'アップ'): 1016610,
(null, 'イエス'): 1021010,
(null, 'キャリア'): 1041980,
(null, 'クラブ'): 1243910,
(null, 'シート'): 1059450,
(null, 'シック'): 1060610,
(null, 'ソース'): 1075270,
(null, 'ソックス'): 1075420,
(null, 'タイム'): 1076010,
(null, 'ダース'): 1076650,
(null, 'デザート'): 1082850,
(null, 'ニュー'): 1091340,
(null, 'ハンガー'): 1096430,
(null, 'バット'): 1099230,
(null, 'パンク'): 1103110,
(null, 'ファン'): 1108540,
(null, 'フォーム'): 1110340,
(null, 'ベース'): 1119210,
(null, 'ベスト'): 1119530,
(null, 'ペア'): 1120690,
(null, 'ホース'): 1121630,
(null, 'ホール'): 1122130,
(null, 'ボルト'): 1124390,
(null, 'ポーズ'): 1124650,
(null, 'ポンプ'): 1126040,
(null, 'マーク'): 1126140,
(null, 'ランプ'): 1140360,
(null, 'レース'): 1144380,
(null, 'レディー'): 1145590,
(null, 'レバー'): 1145720,
};

View File

@@ -0,0 +1,106 @@
import 'package:jadb/_data_ingestion/jmdict/table_names.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/overrides.dart';
import 'package:jadb/util/sqlite_utils.dart';
import 'package:sqflite_common/sqlite_api.dart';
/// Returns the `entryId` of every reading-element row whose `reading` equals
/// one of the readings of [word].
///
/// The same id can appear more than once when several readings of [word]
/// match rows of the same entry.
Future<List<int>> _findReadingCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  // Readings are inlined as escaped SQL string literals.
  final quotedReadings = word.readings.map(escapeStringValue).join(',');
  final rows = await db.query(
    JMdictTableNames.readingElement,
    columns: ['entryId'],
    where: 'reading IN ($quotedReadings)',
  );
  return [for (final row in rows) row['entryId'] as int];
}
/// Returns the `entryId` of every kanji-element row whose `reading` column
/// equals the kanji of [word].
Future<List<int>> _findKanjiCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  final rows = await db.query(
    JMdictTableNames.kanjiElement,
    columns: ['entryId'],
    where: 'reading = ?',
    whereArgs: [word.kanji],
  );
  return [for (final row in rows) row['entryId'] as int];
}
/// Returns `(entryId, phrase)` pairs for every glossary phrase that equals
/// one of the meanings of [word], joining the glossary table to the sense
/// table to obtain the entry id.
Future<List<(int, String)>> _findSenseCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  // Meanings are inlined as escaped SQL string literals.
  final quotedMeanings = word.meanings.map(escapeStringValue).join(',');
  final sql = 'SELECT entryId, phrase '
      'FROM ${JMdictTableNames.senseGlossary} '
      'JOIN ${JMdictTableNames.sense} ON ${JMdictTableNames.senseGlossary}.senseId = ${JMdictTableNames.sense}.id '
      'WHERE phrase IN ($quotedMeanings)';

  final rows = await db.rawQuery(sql);
  return [
    for (final row in rows) (row['entryId'] as int, row['phrase'] as String),
  ];
}
/// Resolves [word] to a single entry id.
///
/// Resolution order:
///  1. Entries matching one of the word's readings; when the word has a
///     kanji, only those that also match the kanji.
///  2. If the word has a kanji and step 1 found nothing, entries matching a
///     reading and at least one of the word's meanings.
///  3. If the result is still not exactly one entry and [useOverrides] is
///     set, the `(kanji, reading)` pairs are looked up in
///     [TANOS_JLPT_OVERRIDES].
///
/// Returns the entry id, or `null` when an override key exists whose value
/// is `null`.
///
/// Throws an [Exception] when no entry or more than one distinct entry is
/// found, when the overrides yield more than one distinct id, or when
/// overrides are consulted but contain no matching key.
Future<int?> findEntry(
  JLPTRankedWord word,
  Database db, {
  bool useOverrides = true,
}) async {
  // De-duplicate: one entry that matches several readings of the word comes
  // back once per reading and must not be mistaken for an ambiguous match.
  final readingCandidates = (await _findReadingCandidates(word, db)).toSet();

  List<int> entryIds;
  if (word.kanji != null) {
    final kanjiCandidates = (await _findKanjiCandidates(word, db)).toSet();
    entryIds = readingCandidates.intersection(kanjiCandidates).toList();

    if (entryIds.isEmpty) {
      print('No entry found, trying to combine with senses');
      // Only query the glossary when the fallback is actually needed.
      final senseCandidates = await _findSenseCandidates(word, db);
      final senseEntryIds = {
        for (final (entryId, _) in senseCandidates) entryId,
      };
      entryIds = readingCandidates.intersection(senseEntryIds).toList();
    }
  } else {
    entryIds = readingCandidates.toList();
  }

  if (entryIds.length != 1 && useOverrides) {
    print('No unique entry found, trying to fetch from overrides');
    final overrideKeys = [
      for (final reading in word.readings) (word.kanji, reading),
    ];
    final overrideEntries = overrideKeys
        .map((key) => TANOS_JLPT_OVERRIDES[key])
        .whereType<int>()
        .toSet();

    if (overrideEntries.length > 1) {
      throw Exception(
          'Multiple override entries found for $word: $overrideEntries');
    }
    // A key that maps to null is a valid override, so only a missing key is
    // an error.
    if (overrideEntries.isEmpty &&
        !overrideKeys.any(TANOS_JLPT_OVERRIDES.containsKey)) {
      throw Exception('No override entry found for $word: $entryIds');
    }

    print('Found override: ${overrideEntries.firstOrNull}');
    return overrideEntries.firstOrNull;
  }

  if (entryIds.length > 1) {
    throw Exception('Multiple entries found for $word: $entryIds');
  } else if (entryIds.isEmpty) {
    throw Exception('No entry found for $word');
  }

  return entryIds.first;
}

View File

@@ -0,0 +1,11 @@
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
import 'package:sqflite_common/sqlite_api.dart';
/// Placeholder for writing [rankedWords] to [db].
///
/// Not implemented yet: the returned future always completes with an
/// [UnimplementedError].
Future<void> seedTanosJLPTData(
  List<JLPTRankedWord> rankedWords,
  Database db,
) async =>
    throw UnimplementedError(
      'This function is not implemented yet. Please implement it.',
    );

View File

@@ -0,0 +1,94 @@
import 'dart:io';
import 'package:jadb/_data_ingestion/open_local_db.dart';
import 'package:args/command_runner.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/resolve.dart';
import 'package:jadb/cli/args.dart';
import 'package:sqflite_common/sqlite_api.dart';
/// CLI command that parses the Tanos JLPT word lists and tries to resolve
/// every word against the database via [resolveExisting], printing the words
/// that could not be resolved.
class CreateTanosJlptMappings extends Command {
  @override
  final name = "create-tanos-jlpt-mappings";

  @override
  final description =
      "Resolve Tanos JLPT data against JMDict. This tool is useful to create overrides for ambiguous references";

  CreateTanosJlptMappings() {
    addLibsqliteArg(argParser);
    addJadbArg(argParser);

    argParser.addFlag(
      'overrides',
      abbr: 'o',
      help: 'Whether to use existing overrides when resolving',
      defaultsTo: false,
    );
  }

  /// Runs the command.
  ///
  /// Prints the usage text and exits with status 64 when either the
  /// `libsqlite` or the `jadb` option is missing.
  @override
  Future<void> run() async {
    final args = argResults!;
    final libsqlitePath = args.option('libsqlite');
    final jadbPath = args.option('jadb');

    if (libsqlitePath == null || jadbPath == null) {
      print(argParser.usage);
      exit(64);
    }

    final db = await openLocalDb(
      jadbPath: jadbPath,
      libsqlitePath: libsqlitePath,
    );

    try {
      final useOverrides = args.flag('overrides');

      final files = <String, File>{
        'N1': File('data/tanos-jlpt/n1.csv'),
        'N2': File('data/tanos-jlpt/n2.csv'),
        'N3': File('data/tanos-jlpt/n3.csv'),
        'N4': File('data/tanos-jlpt/n4.csv'),
        'N5': File('data/tanos-jlpt/n5.csv'),
      };

      final rankedWords = await parseJLPTRankedWords(files);
      await resolveExisting(rankedWords, db, useOverrides);
    } finally {
      // Release the database handle even when parsing or resolving fails.
      await db.close();
    }
  }
}
/// Tries to resolve every word in [rankedWords] with [findEntry] and prints
/// a report.
///
/// Progress is printed per word. A word counts as missing when [findEntry]
/// throws; the thrown object is printed and processing continues. Afterwards
/// the missing words and per-level failure statistics are printed.
Future<void> resolveExisting(
  List<JLPTRankedWord> rankedWords,
  Database db,
  bool useOverrides,
) async {
  final missingWords = <JLPTRankedWord>[];

  for (final (i, word) in rankedWords.indexed) {
    try {
      print(
          '[${(i + 1).toString().padLeft(4, '0')}/${rankedWords.length}] $word');
      await findEntry(word, db, useOverrides: useOverrides);
    } catch (e) {
      // Best effort: any failure to resolve is recorded and reported below
      // rather than aborting the whole run.
      print(e);
      missingWords.add(word);
    }
  }

  print('Missing entries:');
  for (final word in missingWords) {
    print('$word');
  }

  print('Statistics:');
  for (final jlptLevel in ['N5', 'N4', 'N3', 'N2', 'N1']) {
    final missingWordCount =
        missingWords.where((e) => e.jlptLevel == jlptLevel).length;
    final totalWordCount =
        rankedWords.where((e) => e.jlptLevel == jlptLevel).length;

    // Avoid 0/0 (which would print "NaN") for a level without any words.
    final failureRate = totalWordCount == 0
        ? '0.00'
        : ((missingWordCount / totalWordCount) * 100).toStringAsFixed(2);

    print(
        '$jlptLevel failures: [$missingWordCount/$totalWordCount] ($failureRate%)');
  }

  print('Not able to determine the entry for ${missingWords.length} words');
}

View File

@@ -10,6 +10,7 @@ import 'package:jadb/models/word_search/word_search_ruby.dart';
import 'package:jadb/models/word_search/word_search_sense.dart';
import 'package:jadb/models/word_search/word_search_sources.dart';
import 'package:jadb/models/word_search/word_search_xref_entry.dart';
import 'package:jadb/util/sqlite_utils.dart';
import 'package:sqflite_common/sqlite_api.dart';
// TODO: Support globs
@@ -20,10 +21,6 @@ import 'package:sqflite_common/sqlite_api.dart';
// TODO: Support mixing kana and romaji
String _escapeStringValue(String value) {
return "'" + value.replaceAll("'", "''") + "'";
}
Future<List<WordSearchResult>?> searchWordWithDbConnection(
DatabaseExecutor connection,
String word, {
@@ -171,7 +168,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
final readingIds = readingElements
.map((element) => (
element['entryId'] as int,
_escapeStringValue(element['reading'] as String)
escapeStringValue(element['reading'] as String)
))
.toList();
@@ -194,7 +191,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
final kanjiIds = kanjiElements
.map((element) => (
element['entryId'] as int,
_escapeStringValue(element['reading'] as String)
escapeStringValue(element['reading'] as String)
))
.toList();

View File

@@ -0,0 +1,3 @@
/// Wraps [value] in single quotes, doubling every single quote inside it, so
/// that it can be embedded in a SQL statement as a string literal.
String escapeStringValue(String value) {
  final doubledQuotes = value.replaceAll("'", "''");
  return "'$doubledQuotes'";
}

View File

@@ -50,6 +50,8 @@ CREATE TABLE "JMdict_KanjiElement" (
PRIMARY KEY ("entryId", "reading")
) WITHOUT ROWID;
CREATE INDEX "JMdict_KanjiElement_byReading" ON "JMdict_KanjiElement"("reading");
CREATE TABLE "JMdict_KanjiElementInfo" (
"entryId" INTEGER NOT NULL,
"reading" TEXT NOT NULL,
@@ -74,6 +76,8 @@ CREATE TABLE "JMdict_ReadingElement" (
PRIMARY KEY ("entryId", "reading")
) WITHOUT ROWID;
CREATE INDEX "JMdict_ReadingElement_byReading" ON "JMdict_ReadingElement"("reading");
CREATE TABLE "JMdict_ReadingElementRestriction" (
"entryId" INTEGER NOT NULL,
"reading" TEXT NOT NULL,
@@ -216,6 +220,8 @@ CREATE TABLE "JMdict_SenseGlossary" (
PRIMARY KEY ("senseId", "language", "phrase")
) WITHOUT ROWID;
-- Index for looking up glossary rows by phrase. The table name is quoted to
-- match the other index definitions in this migration.
CREATE INDEX "JMdict_SenseGlossary_byPhrase" ON "JMdict_SenseGlossary"("phrase");
CREATE TABLE "JMdict_SenseInfo" (
"senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("id"),
"info" TEXT NOT NULL,

View File

@@ -81,6 +81,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.0.6"
csv:
dependency: "direct main"
description:
name: csv
sha256: c6aa2679b2a18cb57652920f674488d89712efaf4d3fdf2e537215b35fc19d6c
url: "https://pub.dev"
source: hosted
version: "6.0.0"
equatable:
dependency: "direct main"
description:

View File

@@ -9,6 +9,7 @@ environment:
dependencies:
args: ^2.7.0
collection: ^1.19.1
csv: ^6.0.0
equatable: ^2.0.7
sqflite_common: ^2.5.5
sqflite_common_ffi: ^2.3.5