WIP: Add tanos jlpt tags to database
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import 'package:args/command_runner.dart';
|
||||
|
||||
import 'package:jadb/cli/commands/create_db.dart';
|
||||
import 'package:jadb/cli/commands/create_tanos_jlpt_mappings.dart';
|
||||
import 'package:jadb/cli/commands/query_kanji.dart';
|
||||
import 'package:jadb/cli/commands/query_word.dart';
|
||||
|
||||
@@ -13,6 +14,7 @@ Future<void> main(List<String> args) async {
|
||||
runner.addCommand(CreateDb());
|
||||
runner.addCommand(QueryKanji());
|
||||
runner.addCommand(QueryWord());
|
||||
runner.addCommand(CreateTanosJlptMappings());
|
||||
|
||||
runner.run(args);
|
||||
}
|
||||
|
||||
12
flake.lock
generated
12
flake.lock
generated
@@ -3,7 +3,7 @@
|
||||
"jmdict-src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-qQL58YVurB+EH84PM1m+RbfijWdXVRKkyxvo93e1210=",
|
||||
"narHash": "sha256-aCHFgCZh21CPiLWRPnZRieaDq5+dG9BnOA9Pcv6G1IY=",
|
||||
"type": "file",
|
||||
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz"
|
||||
},
|
||||
@@ -15,7 +15,7 @@
|
||||
"jmdict-with-examples-src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-DqxYTNU0p+J08Mh440Bh82IBSmDITDg0mlj+U4gPDBU=",
|
||||
"narHash": "sha256-B+9l2E1nTUqQQeXXUtQ1oYdR8TL3sVqDUpLqGZdpgv4=",
|
||||
"type": "file",
|
||||
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e_examp.gz"
|
||||
},
|
||||
@@ -27,7 +27,7 @@
|
||||
"kanjidic2-src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-XYfpEJBmlhYMDcz7Qi7FgMTTkGbd91uiBitVS5ajbOM=",
|
||||
"narHash": "sha256-XQ6j4XSqipQbaa1TBwu+6iIQRb4eaeqi7W5PSCpf4c0=",
|
||||
"type": "file",
|
||||
"url": "https://www.edrdg.org/kanjidic/kanjidic2.xml.gz"
|
||||
},
|
||||
@@ -38,11 +38,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1745526057,
|
||||
"narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=",
|
||||
"lastModified": 1746904237,
|
||||
"narHash": "sha256-3e+AVBczosP5dCLQmMoMEogM57gmZ2qrVSrmq9aResQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "f771eb401a46846c1aebd20552521b233dd7e18b",
|
||||
"rev": "d89fc19e405cb2d55ce7cc114356846a0ee5e956",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -6,6 +6,8 @@ import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
|
||||
import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
|
||||
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
|
||||
import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/seed_data.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
import 'package:xml/xml.dart';
|
||||
|
||||
@@ -13,6 +15,7 @@ Future<void> seedData(Database db) async {
|
||||
await parseAndSeedDataFromJMdict(db);
|
||||
await parseAndSeedDataFromRADKFILE(db);
|
||||
await parseAndSeedDataFromKANJIDIC(db);
|
||||
await parseAndSeedDataFromTanosJLPT(db);
|
||||
}
|
||||
|
||||
Future<void> parseAndSeedDataFromJMdict(Database db) async {
|
||||
@@ -51,5 +54,22 @@ Future<void> parseAndSeedDataFromRADKFILE(Database db) async {
|
||||
final blocks = parseRADKFILEBlocks(raw);
|
||||
|
||||
print('[RADKFILE] Writing to database...');
|
||||
seedRADKFILEData(blocks, db);
|
||||
await seedRADKFILEData(blocks, db);
|
||||
}
|
||||
|
||||
/// Parses the Tanos JLPT vocabulary CSV files and seeds the result into [db].
///
/// One CSV file per JLPT level (N1 through N5) is looked up under
/// `data/tanos-jlpt/`, relative to the current working directory. The files
/// are handed to [parseJLPTRankedWords] and the parsed words are then passed
/// on to [seedTanosJLPTData].
Future<void> parseAndSeedDataFromTanosJLPT(Database db) async {
  print('[TANOS-JLPT] Reading files...');
  final csvFilesByLevel = <String, File>{
    'N1': File('data/tanos-jlpt/n1.csv'),
    'N2': File('data/tanos-jlpt/n2.csv'),
    'N3': File('data/tanos-jlpt/n3.csv'),
    'N4': File('data/tanos-jlpt/n4.csv'),
    'N5': File('data/tanos-jlpt/n5.csv'),
  };

  print('[TANOS-JLPT] Parsing content...');
  final words = await parseJLPTRankedWords(csvFilesByLevel);

  print('[TANOS-JLPT] Writing to database...');
  await seedTanosJLPTData(words, db);
}
|
||||
|
||||
61
lib/_data_ingestion/tanos-jlpt/csv_parser.dart
Normal file
61
lib/_data_ingestion/tanos-jlpt/csv_parser.dart
Normal file
@@ -0,0 +1,61 @@
|
||||
import 'dart:convert';
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:csv/csv.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
|
||||
|
||||
/// Parses Tanos JLPT vocabulary CSV files into [JLPTRankedWord]s.
///
/// [files] maps a JLPT level label (for example `'N5'`) to the CSV file that
/// holds the vocabulary of that level. Every row must have exactly three
/// columns, read in this order: kanji spelling (may be empty), readings,
/// meanings.
///
/// * Kanji: a leading `お・` and any parenthesised annotation are removed; an
///   empty column becomes `null`.
/// * Readings: split into alternatives on the separator characters of the
///   pattern below, each alternative trimmed.
/// * Meanings: split on `,` and each part expanded through [cleanMeaning].
///
/// Throws an [Exception] if a file does not exist or if a row does not have
/// exactly three columns.
Future<List<JLPTRankedWord>> parseJLPTRankedWords(
  Map<String, File> files,
) async {
  final honorificPrefix = RegExp('^お・');

  // The parentheses must be matched literally. Written as a bare `(.*)` the
  // pattern is a capture group around "anything", which erases the whole
  // kanji field. Both ASCII and full-width parentheses are accepted.
  final parenthesised = RegExp(r'[(（].*[)）]');

  // NOTE(review): this is deliberately left byte-identical. Because the
  // string is not raw, `\s` is just the letter `s`, so whitespace is NOT a
  // separator here. Override keys such as 'じゅう とお' appear to rely on
  // that - confirm before "fixing" the pattern.
  final readingSeparator = RegExp('[・/、(:?\s+)]');

  final List<JLPTRankedWord> result = [];

  for (final entry in files.entries) {
    final jlptLevel = entry.key;
    final file = entry.value;

    if (!file.existsSync()) {
      throw Exception("File $jlptLevel does not exist");
    }

    final rows = await file
        .openRead()
        .transform(utf8.decoder)
        .transform(CsvToListConverter())
        .toList();

    for (final row in rows) {
      if (row.length != 3) {
        throw Exception("Invalid line in $jlptLevel: $row");
      }

      final rawKanji = row[0] as String;
      final kanji = rawKanji.isEmpty
          ? null
          : rawKanji
              .replaceFirst(honorificPrefix, '')
              .replaceAll(parenthesised, '');

      // Empty alternatives are kept on purpose: the overrides table contains
      // keys with an empty reading.
      final readings = (row[1] as String)
          .split(readingSeparator)
          .map((e) => e.trim())
          .toList();

      final meanings =
          (row[2] as String).split(',').expand(cleanMeaning).toList();

      result.add(JLPTRankedWord(
        readings: readings,
        kanji: kanji,
        jlptLevel: jlptLevel,
        meanings: meanings,
      ));
    }
  }

  return result;
}
|
||||
|
||||
/// Normalises one meaning from a Tanos JLPT row into lookup candidates.
///
/// The meaning is trimmed and a leading list number such as `1. ` is removed.
/// The result is returned together with a variant that has every
/// parenthesised remark removed. When both variants are equal only one
/// element is returned; the variant with parentheses always comes first.
///
/// Only a number followed by a literal dot counts as list numbering, so a
/// meaning like `10 years` is left intact. Removing a remark from the middle
/// of a meaning leaves a single space behind, so the surrounding words are
/// not glued together.
List<String> cleanMeaning(String meaning) {
  final withoutNumbering =
      meaning.trim().replaceFirst(RegExp(r'^\d+\.\s+'), '');

  // Consecutive remarks are matched as one run so that they collapse into a
  // single space rather than one space per remark.
  final withoutParens = withoutNumbering
      .replaceAll(RegExp(r'(?:\s*\(.*?\)\s*)+'), ' ')
      .trim();

  // A set literal keeps insertion order and drops the duplicate when the
  // meaning had no parenthesised remark.
  return {withoutNumbering, withoutParens}.toList();
}
|
||||
17
lib/_data_ingestion/tanos-jlpt/objects.dart
Normal file
17
lib/_data_ingestion/tanos-jlpt/objects.dart
Normal file
@@ -0,0 +1,17 @@
|
||||
/// A vocabulary item together with the JLPT level it is listed under.
class JLPTRankedWord {
  /// Alternative readings of the word.
  final List<String> readings;

  /// Kanji spelling of the word, or `null` when none is given.
  final String? kanji;

  /// Label of the JLPT level this word is listed under.
  final String jlptLevel;

  /// Meanings of the word.
  final List<String> meanings;

  const JLPTRankedWord({
    required this.readings,
    this.kanji,
    required this.jlptLevel,
    required this.meanings,
  });

  /// Formats the word as `(level,kanji,"readings","meanings")`, with the list
  /// fields comma-joined inside their quotes.
  @override
  String toString() =>
      '($jlptLevel,$kanji,"${readings.join(",")}","${meanings.join(",")}")';
}
|
||||
314
lib/_data_ingestion/tanos-jlpt/overrides.dart
Normal file
314
lib/_data_ingestion/tanos-jlpt/overrides.dart
Normal file
@@ -0,0 +1,314 @@
|
||||
/// Hand-maintained overrides for Tanos JLPT words that cannot be resolved to
/// exactly one entry automatically.
///
/// Keys are `(kanji, reading)` records as produced by the CSV parser; `kanji`
/// is `null` for words listed without a kanji spelling. Values are the entry
/// id to use for that word (presumably JMdict entry ids - confirm against the
/// database). A key mapped to `null` marks a word that is known but
/// deliberately has no entry.
///
/// Entries are grouped by JLPT level. A pair that is already listed under an
/// earlier group is commented out where it recurs.
const Map<(String?, String), int?> TANOS_JLPT_OVERRIDES = {
|
||||
// N5:
|
||||
(null, 'あなた'): 1223615,
|
||||
(null, 'あの'): 1000430,
|
||||
(null, 'ある'): 1296400,
|
||||
(null, 'あれ'): 2847612,
|
||||
(null, 'いい'): 2820690,
|
||||
(null, 'いつ'): 1188760,
|
||||
(null, 'かかる'): 1207590,
|
||||
(null, 'かぎ'): 1260490,
|
||||
(null, 'かける'): 1207610,
|
||||
(null, 'かばん'): 1208910,
|
||||
('九', 'きゅう'): 1578150,
|
||||
(null, 'キロ'): 1042610,
|
||||
(null, 'コート'): 1049000,
|
||||
(null, 'ここ'): 1288810,
|
||||
(null, 'こっち'): 1004500,
|
||||
(null, 'コップ'): 2846389,
|
||||
(null, 'この'): 1582920,
|
||||
(null, 'コピーする'): 1050590,
|
||||
(null, 'これ'): 2216120,
|
||||
('散歩', 'さんぽする'): 1303620,
|
||||
('四', 'し'): 1579470,
|
||||
(null, 'しかし'): 1505990,
|
||||
('七', 'しち'): 1319210,
|
||||
(null, 'じゃ'): 1005900,
|
||||
('十', 'じゅう とお'): 1579840,
|
||||
(null, 'する'): 1157170,
|
||||
(null, 'せっけん'): 1382590,
|
||||
('掃除', 'そうじする'): 1399790,
|
||||
(null, 'そうして'): 1612860,
|
||||
(null, 'そこ'): 1006670,
|
||||
(null, 'その'): 1006830,
|
||||
(null, 'そば'): 1403830,
|
||||
(null, 'それ'): 1006970,
|
||||
(null, 'たいへん'): 1415000,
|
||||
(null, 'たて'): 1335640,
|
||||
(null, 'たぶん'): 1407980,
|
||||
(null, 'だんだん'): 2546180,
|
||||
(null, 'ちょうど'): 1427340,
|
||||
(null, 'つける'): 1495770,
|
||||
(null, 'できる'): 1340450,
|
||||
(null, 'では'): 1008450,
|
||||
(null, 'どう'): 1008910,
|
||||
(null, 'どの'): 1920240,
|
||||
(null, 'なる'): 2138260,
|
||||
('何', 'なん'): 2846738,
|
||||
(null, 'はい'): 1010080,
|
||||
(null, 'はく'): 1607260,
|
||||
(null, 'はし'): 1237410,
|
||||
(null, 'バス'): 1098390,
|
||||
(null, 'パン'): 1103090,
|
||||
(null, 'フォーク'): 1110110,
|
||||
(null, 'ふろ'): 1500100,
|
||||
(null, 'ペット'): 1120990,
|
||||
(null, 'ペン'): 1121380,
|
||||
('勉強', 'べんきょうする'): 1512670,
|
||||
(null, 'ボタン'): 1182880,
|
||||
(null, 'ほんとう'): 1523060,
|
||||
('毎月', 'まいげつ'): 1584350,
|
||||
('毎年', 'まいねん'): 1584360,
|
||||
(null, 'まずい'): 1495000,
|
||||
(null, 'また'): 1524930,
|
||||
(null, 'マッチ'): 2784220,
|
||||
(null, 'もう'): 1012480,
|
||||
(null, 'やる'): 1012980,
|
||||
(null, 'ゆっくりと'): 1013050,
|
||||
(null, 'よく'): 1605870,
|
||||
(null, 'ラジカセ'): 1138960,
|
||||
(null, 'りっぱ'): 1551790,
|
||||
('練習', 'れんしゅうする'): 1559160,
|
||||
(null, 'より'): 1013190,
|
||||
|
||||
// N4:
|
||||
(null, 'あ'): 2394370,
|
||||
(null, 'ああ'): 2085080,
|
||||
('明日', 'あす'): 1584660,
|
||||
(null, 'あんな'): 1000590,
|
||||
(null, 'いっぱい'): 1165670,
|
||||
(null, 'うち'): 1457730,
|
||||
(null, 'うん'): 1001090,
|
||||
('鏡', 'かがみ'): 1238550,
|
||||
('堅/硬/固い', 'かたい'): 1257110,
|
||||
(null, 'くれる'): 1269130,
|
||||
(null, 'けれど'): 2853889,
|
||||
(null, 'けんか'): 1257040,
|
||||
(null, 'こう'): 1004310,
|
||||
('工場', 'こうじょう'): 1578700,
|
||||
(null, 'さっき'): 1005180,
|
||||
(null, 'すり'): 1567450,
|
||||
(null, 'そう'): 1006610,
|
||||
(null, 'たいてい'): 1414580,
|
||||
(null, 'チェック'): 1077550,
|
||||
(null, 'ちゃん'): 1007660,
|
||||
('妻', 'つま'): 1294330,
|
||||
(null, 'つもり'): 1382980,
|
||||
(null, 'とうとう'): 1449890,
|
||||
(null, 'はず'): 1476430,
|
||||
(null, 'ひげ'): 1601810,
|
||||
(null, 'ぶどう'): 1499230,
|
||||
(null, 'もし'): 2607730,
|
||||
(null, 'アルバイト'): 1019420,
|
||||
(null, 'ビル'): 1106010,
|
||||
(null, 'ベル'): 1120010,
|
||||
(null, 'レポート'): 1145990,
|
||||
|
||||
// N3:
|
||||
('いえ', 'いえ'): 1583250,
|
||||
('行き', 'いき'): 1578790,
|
||||
(null, 'いたずら'): 1151580,
|
||||
(null, 'いつか'): 1188790,
|
||||
(null, 'うがい'): 1577660,
|
||||
('撃つ', 'うつ'): 1253570,
|
||||
(null, 'おめでとう'): 1270700,
|
||||
('河', 'かわ'): 1390020,
|
||||
// ('九', 'きゅう'): 1578150,
|
||||
('共同', 'きょうどう'): 1591660,
|
||||
('腰', 'こし'): 1288340,
|
||||
// ('四', 'し'): 1579470,
|
||||
('お', 'じき'): 1581790,
|
||||
// ('七', 'しち'): 1319210,
|
||||
(null, 'しばしば'): 2179090,
|
||||
(null, 'そっと'): 1006810,
|
||||
// (null, 'それ'): 1006970,
|
||||
(null, 'ただ'): 1538900,
|
||||
(null, 'たとえ'): 1597120,
|
||||
(null, 'ちょうだい'): 1430230,
|
||||
// (null, 'できる'): 1340450,
|
||||
('度', 'ど'): 1445160,
|
||||
('解ける', 'とける'): 1198910,
|
||||
('土曜/土曜日', 'どよう'): 1445580,
|
||||
(null, 'なお'): 1430580,
|
||||
('日本', 'にっぽん'): 1582710,
|
||||
('熱中する', 'ねっちゅうする'): 1467950,
|
||||
(null, 'はさみ'): 1573820,
|
||||
(null, 'ふと'): 1493240,
|
||||
(null, 'ほぼ'): 1551940,
|
||||
('本当', 'ほんとう'): 1523060,
|
||||
(null, 'まさに'): 1376640,
|
||||
('密な', 'みつな'): 2014380,
|
||||
('名人', 'めいじん'): 1531680,
|
||||
('木曜/木曜日', 'もくよう'): 1534880,
|
||||
('尤も', 'もっとも'): 1535810,
|
||||
('元', 'もと'): 2219590,
|
||||
(null, 'やや'): 2771700,
|
||||
(null, 'ゆっくり'): 1013050,
|
||||
(null, 'よい'): 1605820,
|
||||
(null, 'よると'): 1219680,
|
||||
(null, 'よろしく'): 1224890,
|
||||
(null, 'カード'): 1036400,
|
||||
(null, 'グラス'): 1046430,
|
||||
(null, 'グランド'): 1046840,
|
||||
(null, 'ケース'): 1047880,
|
||||
(null, 'コード'): 1049010,
|
||||
(null, 'ゴール'): 1054230,
|
||||
(null, 'サイン'): 1056230,
|
||||
(null, 'ジュース'): 1065950,
|
||||
(null, 'スイッチ'): 1067210,
|
||||
(null, 'ダイヤ'): 1076860,
|
||||
(null, 'トラック'): 1085760,
|
||||
(null, 'トン'): 1457320,
|
||||
(null, 'パス'): 1101440,
|
||||
(null, 'バン'): 1100090,
|
||||
(null, 'ビール'): 2796520,
|
||||
(null, 'ピン'): 1107060,
|
||||
(null, 'プロ'): 1117030,
|
||||
(null, 'ホーム'): 1121740,
|
||||
(null, 'ボール'): 1123550,
|
||||
(null, 'ママ'): 1129240,
|
||||
(null, 'ミス'): 1130650,
|
||||
(null, 'ライター'): 1137880,
|
||||
(null, 'ロケット'): 1147220,
|
||||
|
||||
// N2:
|
||||
(null, 'アイデア'): 1014210,
|
||||
(null, 'あいまい'): 1567920,
|
||||
(null, 'あくび'): 1254010,
|
||||
('あひら', 'あひら'): null,
|
||||
('一段と', 'いちだんと'): 1164690,
|
||||
(null, 'いっていらっしゃい'): 2088750,
|
||||
('恨む', 'うらむ'): 1289780,
|
||||
(null, 'おげんきで'): 1260720,
|
||||
(null, 'おまちください'): 1002360,
|
||||
(null, 'カーブ'): 1036560,
|
||||
(null, 'ガム'): 1040350,
|
||||
(null, 'カラー'): 1038500,
|
||||
('基盤', 'きばん'): 1219170,
|
||||
('けれど/けれども', ''): 2853889,
|
||||
(null, 'ごぞんじですか'): 1593570,
|
||||
(null, 'コック'): 1050310,
|
||||
(null, 'こぼれる'): 1557650,
|
||||
('しいんと', '(する)'): 1631970,
|
||||
('じゅうたん', '(カーペット)'): 1595370,
|
||||
('滑れる', 'ずれる'): 1006460,
|
||||
(null, 'せめて'): 1006570,
|
||||
('だいいち', '(副)'): 1415270,
|
||||
(null, 'ちぎる'): 1389020,
|
||||
(null, 'チップ'): 1077740,
|
||||
(null, 'チョーク'): 1078240,
|
||||
(null, 'ついで'): 1345470,
|
||||
('統一', 'とういつ'): 1449670,
|
||||
('傾らか', 'なだらか'): 1632290,
|
||||
('虹', 'にじ'): 1463740,
|
||||
('×', 'ばつ'): 2197150,
|
||||
(null, 'バック'): 1098760,
|
||||
(null, 'ブラシ'): 1611450,
|
||||
(null, 'ブローチ'): 1114910,
|
||||
(null, 'へそ'): 1571170,
|
||||
(null, 'へる'): 1263120,
|
||||
(null, 'ぺん'): 1121380,
|
||||
('坊っちゃん', 'ぼっちゃん'): 1603720,
|
||||
(null, 'ぼろ'): 1572500,
|
||||
(null, 'マラソン'): 1129290,
|
||||
('ミリ', '(メートル)'): 1131830,
|
||||
(null, 'メーター'): 1132530,
|
||||
(null, 'やたらに'): 1537780,
|
||||
(null, 'ランチ'): 1140100,
|
||||
(null, 'レベル'): 1145910,
|
||||
(null, 'ロッカー'): 1147560,
|
||||
(null, 'ワンピース'): 1149240,
|
||||
|
||||
// N1:
|
||||
('愛憎', 'あいにく'): 1379210,
|
||||
('灰', 'あく'): 1201960,
|
||||
(null, 'あら'): 1000520,
|
||||
('依', 'い'): 2252790,
|
||||
('伊井', 'いい'): 2252790,
|
||||
(null, 'いく'): 1578850,
|
||||
(null, 'えい'): 1001130,
|
||||
('於', 'お'): 1485770,
|
||||
(null, 'おおい'): 2853873,
|
||||
('音色', 'おんいろ'): 1576910,
|
||||
('仮', 'か'): 1187290,
|
||||
('割', 'かつ'): 1208840,
|
||||
('日付', 'かづけ'): 1464340,
|
||||
('下品', 'かひん'): null,
|
||||
('乾', 'かん'): null,
|
||||
('蓋', 'がい'): 1204570,
|
||||
('学芸', 'がくげい'): 1206720,
|
||||
('共', 'きょう'): 1578040,
|
||||
('供', 'きょう'): 1233550,
|
||||
('僅', 'きん'): null,
|
||||
('傾', 'けい'): null,
|
||||
('巨', 'こ'): null,
|
||||
('煌々と', 'こうこうと'): 1569020,
|
||||
('濠', 'ごう'): 1956270,
|
||||
('佐', 'さ'): null,
|
||||
('真実', 'さな'): 1363780,
|
||||
(null, 'し'): 2542020,
|
||||
('次', 'し'): 1579580,
|
||||
(null, 'しいんと'): 1631970,
|
||||
('湿気る', 'しける'): 1320420,
|
||||
('傷', 'しょう'): null,
|
||||
('前', 'せん'): 1387100,
|
||||
('誰', 'たれ'): 1370860,
|
||||
('著', 'ちゃく'): 1597470,
|
||||
('中腹', 'ちゅうっぱら'): 1425430,
|
||||
('長大', 'ちょうだい'): 1610340,
|
||||
('沈黙', 'ちんもく'): 1431810,
|
||||
('途中', 'つちゅう'): 1582200,
|
||||
('伝言', 'つてごと'): 2545840,
|
||||
(null, 'と'): 1008490,
|
||||
('働', 'どう'): null,
|
||||
('同盟', 'どうめい'): 1599290,
|
||||
('南', 'なん'): 1460830,
|
||||
(null, 'なんか'): 1009500,
|
||||
('悪い', 'にくい'): 1403390,
|
||||
('乗っ取る', 'のっとる'): 1354700,
|
||||
('灰皿', 'はいさら'): 1201940,
|
||||
('伐', 'ばつ'): null,
|
||||
('一筋', 'ひとすき'): 1575970,
|
||||
('復旧', 'ふくきゅう'): 1500730,
|
||||
('保育', 'ほいく'): 1513290,
|
||||
('倣', 'ほう'): null,
|
||||
(null, 'ぼつぼつ'): 1011840,
|
||||
('哉', 'や'): 1537760,
|
||||
('夜行', 'やぎょう'): 1584820,
|
||||
('矢鱈に', 'やたらに'): 1537780,
|
||||
(null, 'ゆとり'): 1013070,
|
||||
('流', 'りゅう'): 1552090,
|
||||
('了', 'りょう'): 1553290,
|
||||
('藁', 'わら'): 1562710,
|
||||
(null, 'アクセル'): 1015290,
|
||||
(null, 'アップ'): 1016610,
|
||||
(null, 'イエス'): 1021010,
|
||||
(null, 'キャリア'): 1041980,
|
||||
(null, 'クラブ'): 1243910,
|
||||
(null, 'シート'): 1059450,
|
||||
(null, 'シック'): 1060610,
|
||||
(null, 'ソース'): 1075270,
|
||||
(null, 'ソックス'): 1075420,
|
||||
(null, 'タイム'): 1076010,
|
||||
(null, 'ダース'): 1076650,
|
||||
(null, 'デザート'): 1082850,
|
||||
(null, 'ニュー'): 1091340,
|
||||
(null, 'ハンガー'): 1096430,
|
||||
(null, 'バット'): 1099230,
|
||||
(null, 'パンク'): 1103110,
|
||||
(null, 'ファン'): 1108540,
|
||||
(null, 'フォーム'): 1110340,
|
||||
(null, 'ベース'): 1119210,
|
||||
(null, 'ベスト'): 1119530,
|
||||
(null, 'ペア'): 1120690,
|
||||
(null, 'ホース'): 1121630,
|
||||
(null, 'ホール'): 1122130,
|
||||
(null, 'ボルト'): 1124390,
|
||||
(null, 'ポーズ'): 1124650,
|
||||
(null, 'ポンプ'): 1126040,
|
||||
(null, 'マーク'): 1126140,
|
||||
(null, 'ランプ'): 1140360,
|
||||
(null, 'レース'): 1144380,
|
||||
(null, 'レディー'): 1145590,
|
||||
(null, 'レバー'): 1145720,
|
||||
};
|
||||
106
lib/_data_ingestion/tanos-jlpt/resolve.dart
Normal file
106
lib/_data_ingestion/tanos-jlpt/resolve.dart
Normal file
@@ -0,0 +1,106 @@
|
||||
import 'package:jadb/_data_ingestion/jmdict/table_names.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/overrides.dart';
|
||||
import 'package:jadb/util/sqlite_utils.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
/// Returns the ids of the JMdict entries that have at least one of the
/// readings of [word].
///
/// Each entry id is returned at most once, even when several of the word's
/// readings belong to the same entry. Without `distinct`, such an entry would
/// appear once per matching row and look like an ambiguous match to callers
/// that count candidates.
Future<List<int>> _findReadingCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  // The readings are inlined as escaped SQL string literals.
  final readingLiterals = word.readings.map(escapeStringValue).join(',');

  final rows = await db.query(
    JMdictTableNames.readingElement,
    distinct: true,
    columns: ['entryId'],
    where: 'reading IN ($readingLiterals)',
  );

  return [for (final row in rows) row['entryId'] as int];
}
|
||||
|
||||
/// Returns the ids of the JMdict entries that have a kanji element whose
/// `reading` column equals the kanji spelling of [word].
Future<List<int>> _findKanjiCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  final matches = await db.query(
    JMdictTableNames.kanjiElement,
    columns: ['entryId'],
    where: 'reading = ?',
    whereArgs: [word.kanji],
  );

  return [for (final match in matches) match['entryId'] as int];
}
|
||||
|
||||
/// Returns `(entryId, phrase)` pairs for every glossary phrase that equals
/// one of the meanings of [word].
///
/// The glossary table is joined with the sense table to obtain the entry id
/// that each matching phrase belongs to.
Future<List<(int, String)>> _findSenseCandidates(
  JLPTRankedWord word,
  Database db,
) async {
  // The meanings are inlined as escaped SQL string literals.
  final phraseLiterals =
      word.meanings.map((e) => escapeStringValue(e)).join(',');

  final matches = await db.rawQuery('SELECT entryId, phrase '
      'FROM ${JMdictTableNames.senseGlossary} '
      'JOIN ${JMdictTableNames.sense} ON ${JMdictTableNames.senseGlossary}.senseId = ${JMdictTableNames.sense}.id '
      'WHERE phrase IN ($phraseLiterals)');

  return [
    for (final match in matches)
      (match['entryId'] as int, match['phrase'] as String),
  ];
}
|
||||
|
||||
/// Resolves [word] to the id of the single JMdict entry it refers to.
///
/// Resolution steps:
///
/// 1. Entries are looked up by the readings of the word.
/// 2. If the word has a kanji spelling, only entries that also carry that
///    spelling are kept. If none remain, the reading matches are instead
///    narrowed to entries that have a glossary phrase equal to one of the
///    word's meanings.
/// 3. If this does not leave exactly one entry and [useOverrides] is true,
///    [TANOS_JLPT_OVERRIDES] is consulted with `(kanji, reading)` for every
///    reading of the word.
///
/// Returns the entry id, or `null` when the overrides table lists the word
/// with an explicit `null` value.
///
/// Throws an [Exception] when the word matches no entry or more than one
/// entry and no usable override exists, or when the overrides for the word's
/// readings disagree with each other.
Future<int?> findEntry(
  JLPTRankedWord word,
  Database db, {
  bool useOverrides = true,
}) async {
  // Work on distinct ids: an entry that matches several readings of the word
  // is still one candidate, not an ambiguity.
  final readingCandidates = (await _findReadingCandidates(word, db)).toSet();

  List<int> entryIds;

  if (word.kanji != null) {
    final kanjiCandidates = (await _findKanjiCandidates(word, db)).toSet();

    entryIds = readingCandidates.where(kanjiCandidates.contains).toList();

    if (entryIds.isEmpty) {
      print('No entry found, trying to combine with senses');

      // Only queried when needed; it is the most expensive lookup.
      final senseCandidates = await _findSenseCandidates(word, db);
      final senseEntryIds = {for (final sense in senseCandidates) sense.$1};

      entryIds = readingCandidates.where(senseEntryIds.contains).toList();
    }
  } else {
    entryIds = readingCandidates.toList();
  }

  if (entryIds.length != 1 && useOverrides) {
    print(entryIds.isEmpty
        ? 'No entry found, trying to fetch from overrides'
        : 'Multiple entries found, trying to fetch from overrides');

    final overrideKeys = [
      for (final reading in word.readings) (word.kanji, reading),
    ];
    final overrideEntries = overrideKeys
        .map((key) => TANOS_JLPT_OVERRIDES[key])
        .whereType<int>()
        .toSet();

    if (overrideEntries.length > 1) {
      throw Exception(
          'Multiple override entries found for $word: $overrideEntries');
    }

    // A key that is present with a `null` value is a deliberate "no entry"
    // marker, so only a completely missing key is an error.
    if (overrideEntries.isEmpty &&
        !overrideKeys.any(TANOS_JLPT_OVERRIDES.containsKey)) {
      throw Exception('No override entry found for $word: $entryIds');
    }

    final overrideEntry = overrideEntries.firstOrNull;
    print('Found override: $overrideEntry');

    return overrideEntry;
  }

  if (entryIds.length > 1) {
    throw Exception('Multiple entries found for $word: $entryIds');
  } else if (entryIds.isEmpty) {
    throw Exception('No entry found for $word');
  }

  return entryIds.first;
}
|
||||
11
lib/_data_ingestion/tanos-jlpt/seed_data.dart
Normal file
11
lib/_data_ingestion/tanos-jlpt/seed_data.dart
Normal file
@@ -0,0 +1,11 @@
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
/// Placeholder for writing [rankedWords] into [db].
///
/// Not implemented yet: the returned future always completes with an
/// [UnimplementedError].
Future<void> seedTanosJLPTData(
  List<JLPTRankedWord> rankedWords,
  Database db,
) async {
  const message = 'This function is not implemented yet. Please implement it.';
  throw UnimplementedError(message);
}
|
||||
94
lib/cli/commands/create_tanos_jlpt_mappings.dart
Normal file
94
lib/cli/commands/create_tanos_jlpt_mappings.dart
Normal file
@@ -0,0 +1,94 @@
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:jadb/_data_ingestion/open_local_db.dart';
|
||||
|
||||
import 'package:args/command_runner.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
|
||||
import 'package:jadb/_data_ingestion/tanos-jlpt/resolve.dart';
|
||||
import 'package:jadb/cli/args.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
/// CLI command that resolves every Tanos JLPT word against the database and
/// reports the words that could not be resolved.
///
/// Requires the `libsqlite` and `jadb` options. The `--overrides` / `-o` flag
/// (off by default) controls whether the existing overrides are used while
/// resolving.
class CreateTanosJlptMappings extends Command {
  @override
  final name = "create-tanos-jlpt-mappings";

  @override
  final description =
      "Resolve Tanos JLPT data against JMDict. This tool is useful to create overrides for ambiguous references";

  CreateTanosJlptMappings() {
    addLibsqliteArg(argParser);
    addJadbArg(argParser);

    argParser.addFlag(
      'overrides',
      abbr: 'o',
      help: 'Whether to use existing overrides when resolving',
      defaultsTo: false,
    );
  }

  /// Parses the CSV files under `data/tanos-jlpt/` and runs
  /// [resolveExisting] on the result.
  ///
  /// Prints the usage text and exits with code 64 when a required option is
  /// missing.
  @override
  Future<void> run() async {
    final libsqlitePath = argResults!.option('libsqlite');
    final jadbPath = argResults!.option('jadb');

    if (libsqlitePath == null || jadbPath == null) {
      print(argParser.usage);
      exit(64);
    }

    final db = await openLocalDb(
      jadbPath: jadbPath,
      libsqlitePath: libsqlitePath,
    );

    // Release the database handle even if parsing or resolving fails.
    try {
      final useOverrides = argResults!.flag('overrides');

      Map<String, File> files = {
        'N1': File('data/tanos-jlpt/n1.csv'),
        'N2': File('data/tanos-jlpt/n2.csv'),
        'N3': File('data/tanos-jlpt/n3.csv'),
        'N4': File('data/tanos-jlpt/n4.csv'),
        'N5': File('data/tanos-jlpt/n5.csv'),
      };

      final rankedWords = await parseJLPTRankedWords(files);

      await resolveExisting(rankedWords, db, useOverrides);
    } finally {
      await db.close();
    }
  }
}
|
||||
|
||||
/// Tries to resolve every word in [rankedWords] with [findEntry] and prints a
/// report of the words that failed.
///
/// Progress is printed per word. A word counts as missing when [findEntry]
/// throws; the error is printed and processing continues with the next word.
/// Afterwards the missing words, a failure rate per JLPT level and the total
/// number of missing words are printed.
///
/// [useOverrides] is forwarded to [findEntry].
Future<void> resolveExisting(
  List<JLPTRankedWord> rankedWords,
  Database db,
  bool useOverrides,
) async {
  final missingWords = <JLPTRankedWord>[];

  for (final (i, word) in rankedWords.indexed) {
    try {
      print(
          '[${(i + 1).toString().padLeft(4, '0')}/${rankedWords.length}] $word');
      await findEntry(word, db, useOverrides: useOverrides);
    } catch (e) {
      // Deliberately broad: this is a diagnostic run that must visit every
      // word no matter how an individual lookup fails.
      print(e);
      missingWords.add(word);
    }
  }

  print('Missing entries:');
  for (final word in missingWords) {
    print(word.toString());
  }

  print('Statistics:');
  for (final jlptLevel in ['N5', 'N4', 'N3', 'N2', 'N1']) {
    final missingWordCount =
        missingWords.where((e) => e.jlptLevel == jlptLevel).length;
    final totalWordCount =
        rankedWords.where((e) => e.jlptLevel == jlptLevel).length;

    // A level without any words would otherwise be reported as "NaN%".
    final failureRate = totalWordCount == 0
        ? '0.00'
        : ((missingWordCount / totalWordCount) * 100).toStringAsFixed(2);

    print(
        '$jlptLevel failures: [$missingWordCount/$totalWordCount] ($failureRate%)');
  }

  print('Not able to determine the entry for ${missingWords.length} words');
}
|
||||
@@ -10,6 +10,7 @@ import 'package:jadb/models/word_search/word_search_ruby.dart';
|
||||
import 'package:jadb/models/word_search/word_search_sense.dart';
|
||||
import 'package:jadb/models/word_search/word_search_sources.dart';
|
||||
import 'package:jadb/models/word_search/word_search_xref_entry.dart';
|
||||
import 'package:jadb/util/sqlite_utils.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
// TODO: Support globs
|
||||
@@ -20,10 +21,6 @@ import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
// TODO: Support mixing kana and romaji
|
||||
|
||||
String _escapeStringValue(String value) {
|
||||
return "'" + value.replaceAll("'", "''") + "'";
|
||||
}
|
||||
|
||||
Future<List<WordSearchResult>?> searchWordWithDbConnection(
|
||||
DatabaseExecutor connection,
|
||||
String word, {
|
||||
@@ -171,7 +168,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
|
||||
final readingIds = readingElements
|
||||
.map((element) => (
|
||||
element['entryId'] as int,
|
||||
_escapeStringValue(element['reading'] as String)
|
||||
escapeStringValue(element['reading'] as String)
|
||||
))
|
||||
.toList();
|
||||
|
||||
@@ -194,7 +191,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
|
||||
final kanjiIds = kanjiElements
|
||||
.map((element) => (
|
||||
element['entryId'] as int,
|
||||
_escapeStringValue(element['reading'] as String)
|
||||
escapeStringValue(element['reading'] as String)
|
||||
))
|
||||
.toList();
|
||||
|
||||
|
||||
3
lib/util/sqlite_utils.dart
Normal file
3
lib/util/sqlite_utils.dart
Normal file
@@ -0,0 +1,3 @@
|
||||
/// Returns [value] wrapped in single quotes, with every single quote inside
/// it doubled, so that it can be embedded as a SQL string literal.
String escapeStringValue(String value) {
  final doubledQuotes = value.replaceAll("'", "''");
  return "'$doubledQuotes'";
}
|
||||
@@ -50,6 +50,8 @@ CREATE TABLE "JMdict_KanjiElement" (
|
||||
PRIMARY KEY ("entryId", "reading")
|
||||
) WITHOUT ROWID;
|
||||
|
||||
CREATE INDEX "JMdict_KanjiElement_byReading" ON "JMdict_KanjiElement"("reading");
|
||||
|
||||
CREATE TABLE "JMdict_KanjiElementInfo" (
|
||||
"entryId" INTEGER NOT NULL,
|
||||
"reading" TEXT NOT NULL,
|
||||
@@ -74,6 +76,8 @@ CREATE TABLE "JMdict_ReadingElement" (
|
||||
PRIMARY KEY ("entryId", "reading")
|
||||
) WITHOUT ROWID;
|
||||
|
||||
CREATE INDEX "JMdict_ReadingElement_byReading" ON "JMdict_ReadingElement"("reading");
|
||||
|
||||
CREATE TABLE "JMdict_ReadingElementRestriction" (
|
||||
"entryId" INTEGER NOT NULL,
|
||||
"reading" TEXT NOT NULL,
|
||||
@@ -216,6 +220,8 @@ CREATE TABLE "JMdict_SenseGlossary" (
|
||||
PRIMARY KEY ("senseId", "language", "phrase")
|
||||
) WITHOUT ROWID;
|
||||
|
||||
CREATE INDEX "JMdict_SenseGlossary_byPhrase" ON JMdict_SenseGlossary("phrase");
|
||||
|
||||
CREATE TABLE "JMdict_SenseInfo" (
|
||||
"senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("id"),
|
||||
"info" TEXT NOT NULL,
|
||||
|
||||
@@ -81,6 +81,14 @@ packages:
|
||||
url: "https://pub.dev"
|
||||
source: hosted
|
||||
version: "3.0.6"
|
||||
csv:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
name: csv
|
||||
sha256: c6aa2679b2a18cb57652920f674488d89712efaf4d3fdf2e537215b35fc19d6c
|
||||
url: "https://pub.dev"
|
||||
source: hosted
|
||||
version: "6.0.0"
|
||||
equatable:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
|
||||
@@ -9,6 +9,7 @@ environment:
|
||||
dependencies:
|
||||
args: ^2.7.0
|
||||
collection: ^1.19.1
|
||||
csv: ^6.0.0
|
||||
equatable: ^2.0.7
|
||||
sqflite_common: ^2.5.5
|
||||
sqflite_common_ffi: ^2.3.5
|
||||
|
||||
Reference in New Issue
Block a user