diff --git a/bin/jadb.dart b/bin/jadb.dart index 214e133..d05ffd8 100644 --- a/bin/jadb.dart +++ b/bin/jadb.dart @@ -4,9 +4,7 @@ import 'dart:io'; import 'package:args/args.dart'; import 'package:args/command_runner.dart'; -import 'package:jadb/_data_ingestion/jmdict/parser.dart'; -import 'package:jadb/_data_ingestion/kanjidic/parser.dart'; -import 'package:jadb/_data_ingestion/radkfile/parser.dart'; +import 'package:jadb/_data_ingestion/seed_database.dart'; import 'package:jadb/search.dart'; import 'package:sqflite_common_ffi/sqflite_ffi.dart'; import 'package:sqlite3/open.dart'; @@ -58,9 +56,13 @@ class CreateDb extends Command { ), ); - await addDataFromJMdict(db); - await addDataFromRADKFILE(db); - await addDataFromKANJIDIC(db); + seedData(db).then((_) { + print("Database created successfully"); + }).catchError((error) { + print("Error creating database: $error"); + }).whenComplete(() { + db.close(); + }); } } diff --git a/lib/_data_ingestion/jmdict/parser.dart b/lib/_data_ingestion/jmdict/seed_data.dart similarity index 55% rename from lib/_data_ingestion/jmdict/parser.dart rename to lib/_data_ingestion/jmdict/seed_data.dart index 7c299c8..867f6d2 100644 --- a/lib/_data_ingestion/jmdict/parser.dart +++ b/lib/_data_ingestion/jmdict/seed_data.dart @@ -1,80 +1,9 @@ import 'dart:collection'; -import 'dart:io'; import 'package:collection/collection.dart'; +import 'package:jadb/_data_ingestion/jmdict/objects.dart'; import 'package:jadb/util/romaji_transliteration.dart'; import 'package:sqflite_common/sqlite_api.dart'; -import 'package:xml/xml.dart'; - -import 'objects.dart'; - -/// parse priority values from r_ele and k_ele xml elements -/// -/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq -List getPriorityValues(XmlElement e, String prefix) { - int? news, ichi, spec, gai, nf; - for (final pri in e.findElements('${prefix}_pri')) { - final txt = pri.innerText; - if (txt.startsWith('news')) - news = int.parse(txt.substring(4)); - else if (txt.startsWith('ichi')) - ichi = int.parse(txt.substring(4)); - else if (txt.startsWith('spec')) - spec = int.parse(txt.substring(4)); - else if (txt.startsWith('gai')) - gai = int.parse(txt.substring(3)); - else if (txt.startsWith('nf')) nf = int.parse(txt.substring(2)); - } - return [news, ichi, spec, gai, nf]; -} - -// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref -XRefParts parseXrefParts(String s) { - final parts = s.split('・'); - late final XRefParts result; - switch (parts.length) { - case 1: - result = parts[0].contains(kanaRegex) - ? XRefParts(readingRef: parts[0]) - : XRefParts(kanjiRef: parts[0]); - break; - - case 2: - if (int.tryParse(parts[1]) != null) { - if (parts[0].contains(kanaRegex)) { - result = XRefParts( - readingRef: parts[0], - senseOrderNum: int.parse(parts[1]), - ); - } else { - result = XRefParts( - kanjiRef: parts[0], - senseOrderNum: int.parse(parts[1]), - ); - } - } else { - result = XRefParts( - kanjiRef: parts[0], - readingRef: parts[1], - ); - } - break; - - case 3: - result = XRefParts( - kanjiRef: parts[0], - readingRef: parts[1], - senseOrderNum: int.parse(parts[2]), - ); - break; - - default: - result = XRefParts(); - break; - } - - return result; -} class ResolvedXref { Entry entry; @@ -139,138 +68,7 @@ ResolvedXref resolveXref( } } -List parseXML(XmlElement root) { - final List entries = []; - - int senseId = 0; - - for (final entry in root.childElements) { - final entryId = int.parse(entry.findElements('ent_seq').first.innerText); - - final List kanjiEls = []; - final List readingEls = []; - final List senses = []; - - for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) { - final ke_pri = getPriorityValues(k_ele, 'ke'); - kanjiEls.add( - KanjiElement( - orderNum: kanjiNum + 1, - info: k_ele - .findElements('ke_inf') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - reading: k_ele.findElements('keb').first.innerText, - news: ke_pri[0], - ichi: ke_pri[1], - spec: ke_pri[2], - gai: ke_pri[3], - nf: ke_pri[4], - ), - ); - } - - for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) { - final re_pri = getPriorityValues(r_ele, 're'); - final readingDoesNotMatchKanji = - r_ele.findElements('re_nokanji').isNotEmpty; - readingEls.add( - ReadingElement( - orderNum: orderNum + 1, - readingDoesNotMatchKanji: readingDoesNotMatchKanji, - info: r_ele - .findElements('re_inf') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - restrictions: - r_ele.findElements('re_restr').map((e) => e.innerText).toList(), - reading: r_ele.findElements('reb').first.innerText, - news: re_pri[0], - ichi: re_pri[1], - spec: re_pri[2], - gai: re_pri[3], - nf: re_pri[4], - ), - ); - } - - for (final (orderNum, sense) in entry.findElements('sense').indexed) { - senseId++; - final result = Sense( - id: senseId, - orderNum: orderNum + 1, - restrictedToKanji: - sense.findElements('stagk').map((e) => e.innerText).toList(), - restrictedToReading: - sense.findElements('stagr').map((e) => e.innerText).toList(), - pos: sense - .findElements('pos') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - misc: sense - .findElements('misc') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - dialects: sense - .findElements('dial') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - info: sense.findElements('s_inf').map((e) => e.innerText).toList(), - languageSource: sense - .findElements('lsource') - .map( - (e) => LanguageSource( - language: e.getAttribute('xml:lang') ?? 'eng', - fullyDescribesSense: e.getAttribute('ls_type') == 'part', - constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y', - ), - ) - .toList(), - glossary: sense - .findElements('gloss') - .map( - (e) => Glossary( - language: e.getAttribute('xml:lang') ?? 'eng', - phrase: e.innerText, - type: e.getAttribute('g_type'), - ), - ) - .toList(), - antonyms: sense - .findElements('ant') - .map((e) => parseXrefParts(e.innerText)) - .toList(), - seeAlso: sense - .findElements('xref') - .map((e) => parseXrefParts(e.innerText)) - .toList(), - ); - - if (result.isEmpty) { - print( - 'WARNING: Sense $senseId for entry $entryId is empty, ' - 'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, ' - 'reading: ${readingEls.map((e) => e.reading).join(', ')}', - ); - } else { - senses.add(result); - } - } - - entries.add( - Entry( - id: entryId, - kanji: kanjiEls, - readings: readingEls, - senses: senses, - ), - ); - } - - return entries; -} - -Future insertIntoDB(List entries, Database db) async { +Future seedJMDictData(List entries, Database db) async { print(' [JMdict] Batch 1 - Kanji and readings'); Batch b = db.batch(); for (final e in entries) { @@ -471,17 +269,3 @@ Future insertIntoDB(List entries, Database db) async { await b.commit(); } - -Future addDataFromJMdict(Database db) async { - print('[JMdict] Reading file content...'); - String rawXML = File('data/tmp/JMdict.xml').readAsStringSync(); - - print('[JMdict] Parsing XML tags...'); - XmlElement root = XmlDocument.parse(rawXML).getElement('JMdict')!; - - print('[JMdict] Parsing XML content...'); - final entries = parseXML(root); - - print('[JMdict] Writing to database...'); - await insertIntoDB(entries, db); -} diff --git a/lib/_data_ingestion/jmdict/xml_parser.dart b/lib/_data_ingestion/jmdict/xml_parser.dart new file mode 100644 index 0000000..a218068 --- /dev/null +++ b/lib/_data_ingestion/jmdict/xml_parser.dart @@ -0,0 +1,201 @@ +import 'package:jadb/_data_ingestion/jmdict/objects.dart'; +import 'package:xml/xml.dart'; + +/// parse priority values from r_ele and k_ele xml elements +/// +/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq +List getPriorityValues(XmlElement e, String prefix) { + int? news, ichi, spec, gai, nf; + for (final pri in e.findElements('${prefix}_pri')) { + final txt = pri.innerText; + if (txt.startsWith('news')) + news = int.parse(txt.substring(4)); + else if (txt.startsWith('ichi')) + ichi = int.parse(txt.substring(4)); + else if (txt.startsWith('spec')) + spec = int.parse(txt.substring(4)); + else if (txt.startsWith('gai')) + gai = int.parse(txt.substring(3)); + else if (txt.startsWith('nf')) nf = int.parse(txt.substring(2)); + } + return [news, ichi, spec, gai, nf]; +} + +// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref +XRefParts parseXrefParts(String s) { + final parts = s.split('・'); + late final XRefParts result; + switch (parts.length) { + case 1: + result = parts[0].contains(kanaRegex) + ? XRefParts(readingRef: parts[0]) + : XRefParts(kanjiRef: parts[0]); + break; + + case 2: + if (int.tryParse(parts[1]) != null) { + if (parts[0].contains(kanaRegex)) { + result = XRefParts( + readingRef: parts[0], + senseOrderNum: int.parse(parts[1]), + ); + } else { + result = XRefParts( + kanjiRef: parts[0], + senseOrderNum: int.parse(parts[1]), + ); + } + } else { + result = XRefParts( + kanjiRef: parts[0], + readingRef: parts[1], + ); + } + break; + + case 3: + result = XRefParts( + kanjiRef: parts[0], + readingRef: parts[1], + senseOrderNum: int.parse(parts[2]), + ); + break; + + default: + result = XRefParts(); + break; + } + + return result; +} + +List parseJMDictData(XmlElement root) { + final List entries = []; + + int senseId = 0; + + for (final entry in root.childElements) { + final entryId = int.parse(entry.findElements('ent_seq').first.innerText); + + final List kanjiEls = []; + final List readingEls = []; + final List senses = []; + + for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) { + final ke_pri = getPriorityValues(k_ele, 'ke'); + kanjiEls.add( + KanjiElement( + orderNum: kanjiNum + 1, + info: k_ele + .findElements('ke_inf') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + reading: k_ele.findElements('keb').first.innerText, + news: ke_pri[0], + ichi: ke_pri[1], + spec: ke_pri[2], + gai: ke_pri[3], + nf: ke_pri[4], + ), + ); + } + + for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) { + final re_pri = getPriorityValues(r_ele, 're'); + final readingDoesNotMatchKanji = + r_ele.findElements('re_nokanji').isNotEmpty; + readingEls.add( + ReadingElement( + orderNum: orderNum + 1, + readingDoesNotMatchKanji: readingDoesNotMatchKanji, + info: r_ele + .findElements('re_inf') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + restrictions: + r_ele.findElements('re_restr').map((e) => e.innerText).toList(), + reading: r_ele.findElements('reb').first.innerText, + news: re_pri[0], + ichi: re_pri[1], + spec: re_pri[2], + gai: re_pri[3], + nf: re_pri[4], + ), + ); + } + + for (final (orderNum, sense) in entry.findElements('sense').indexed) { + senseId++; + final result = Sense( + id: senseId, + orderNum: orderNum + 1, + restrictedToKanji: + sense.findElements('stagk').map((e) => e.innerText).toList(), + restrictedToReading: + sense.findElements('stagr').map((e) => e.innerText).toList(), + pos: sense + .findElements('pos') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + misc: sense + .findElements('misc') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + dialects: sense + .findElements('dial') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + info: sense.findElements('s_inf').map((e) => e.innerText).toList(), + languageSource: sense + .findElements('lsource') + .map( + (e) => LanguageSource( + language: e.getAttribute('xml:lang') ?? 'eng', + fullyDescribesSense: e.getAttribute('ls_type') == 'part', + constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y', + ), + ) + .toList(), + glossary: sense + .findElements('gloss') + .map( + (e) => Glossary( + language: e.getAttribute('xml:lang') ?? 'eng', + phrase: e.innerText, + type: e.getAttribute('g_type'), + ), + ) + .toList(), + antonyms: sense + .findElements('ant') + .map((e) => parseXrefParts(e.innerText)) + .toList(), + seeAlso: sense + .findElements('xref') + .map((e) => parseXrefParts(e.innerText)) + .toList(), + ); + + if (result.isEmpty) { + print( + 'WARNING: Sense $senseId for entry $entryId is empty, ' + 'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, ' + 'reading: ${readingEls.map((e) => e.reading).join(', ')}', + ); + } else { + senses.add(result); + } + } + + entries.add( + Entry( + id: entryId, + kanji: kanjiEls, + readings: readingEls, + senses: senses, + ), + ); + } + + return entries; +} diff --git a/lib/_data_ingestion/kanjidic/seed_data.dart b/lib/_data_ingestion/kanjidic/seed_data.dart new file mode 100644 index 0000000..6f5797a --- /dev/null +++ b/lib/_data_ingestion/kanjidic/seed_data.dart @@ -0,0 +1,74 @@ +import 'package:sqflite_common/sqlite_api.dart'; + +import 'objects.dart'; + +Future seedKANJIDICData(List characters, Database db) async { + final b = db.batch(); + for (final c in characters) { + // if (c.dictionaryReferences.any((e) => + // c.dictionaryReferences + // .where((e2) => e.kanji == e2.kanji && e.type == e2.type) + // .length > + // 1)) { + // print(c.dictionaryReferences.map((e) => e.sqlValue).toList()); + // } + b.insert(TableNames.character, c.sqlValue); + for (final n in c.radicalName) { + b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n}); + } + for (final cp in c.codepoints) { + b.insert(TableNames.codepoint, cp.sqlValue); + } + for (final r in c.radicals) { + b.insert(TableNames.radical, r.sqlValue); + } + for (final sm in c.strokeMiscounts) { + b.insert( + TableNames.strokeMiscount, + { + 'kanji': c.literal, + 'strokeCount': sm, + }, + ); + } + for (final v in c.variants) { + b.insert(TableNames.variant, v.sqlValue); + } + for (final dr in c.dictionaryReferences) { + // There are duplicate entries here + b.insert( + TableNames.dictionaryReference, + dr.sqlValue, + conflictAlgorithm: ConflictAlgorithm.ignore, + ); + } + for (final drm in c.dictionaryReferencesMoro) { + b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue); + } + for (final q in c.querycodes) { + b.insert(TableNames.queryCode, q.sqlValue); + } + for (final r in c.readings) { + b.insert(TableNames.reading, r.sqlValue); + } + for (final k in c.kunyomi) { + b.insert(TableNames.kunyomi, k.sqlValue); + } + for (final o in c.onyomi) { + b.insert(TableNames.onyomi, o.sqlValue); + } + for (final m in c.meanings) { + b.insert(TableNames.meaning, m.sqlValue); + } + for (final n in c.nanori) { + b.insert( + TableNames.nanori, + { + 'kanji': c.literal, + 'nanori': n, + }, + ); + } + } + b.commit(); +} diff --git a/lib/_data_ingestion/kanjidic/parser.dart b/lib/_data_ingestion/kanjidic/xml_parser.dart similarity index 67% rename from lib/_data_ingestion/kanjidic/parser.dart rename to lib/_data_ingestion/kanjidic/xml_parser.dart index dea0910..09e152e 100644 --- a/lib/_data_ingestion/kanjidic/parser.dart +++ b/lib/_data_ingestion/kanjidic/xml_parser.dart @@ -1,12 +1,7 @@ -import 'dart:io'; - -import 'package:sqflite_common/sqlite_api.dart'; +import 'package:jadb/_data_ingestion/kanjidic/objects.dart'; import 'package:xml/xml.dart'; -import 'package:collection/collection.dart'; -import 'objects.dart'; - -List transformXML(XmlElement root) { +List parseKANJIDICData(XmlElement root) { final List result = []; for (final c in root.findElements('character')) { final kanji = c.findElements('literal').first.innerText; @@ -170,88 +165,3 @@ List transformXML(XmlElement root) { } return result; } - -Future insertIntoDB(List characters, Database db) async { - final b = db.batch(); - for (final c in characters) { - // if (c.dictionaryReferences.any((e) => - // c.dictionaryReferences - // .where((e2) => e.kanji == e2.kanji && e.type == e2.type) - // .length > - // 1)) { - // print(c.dictionaryReferences.map((e) => e.sqlValue).toList()); - // } - b.insert(TableNames.character, c.sqlValue); - for (final n in c.radicalName) { - b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n}); - } - for (final cp in c.codepoints) { - b.insert(TableNames.codepoint, cp.sqlValue); - } - for (final r in c.radicals) { - b.insert(TableNames.radical, r.sqlValue); - } - for (final sm in c.strokeMiscounts) { - b.insert( - TableNames.strokeMiscount, - { - 'kanji': c.literal, - 'strokeCount': sm, - }, - ); - } - for (final v in c.variants) { - b.insert(TableNames.variant, v.sqlValue); - } - for (final dr in c.dictionaryReferences) { - // There are duplicate entries here - b.insert( - TableNames.dictionaryReference, - dr.sqlValue, - conflictAlgorithm: ConflictAlgorithm.ignore, - ); - } - for (final drm in c.dictionaryReferencesMoro) { - b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue); - } - for (final q in c.querycodes) { - b.insert(TableNames.queryCode, q.sqlValue); - } - for (final r in c.readings) { - b.insert(TableNames.reading, r.sqlValue); - } - for (final k in c.kunyomi) { - b.insert(TableNames.kunyomi, k.sqlValue); - } - for (final o in c.onyomi) { - b.insert(TableNames.onyomi, o.sqlValue); - } - for (final m in c.meanings) { - b.insert(TableNames.meaning, m.sqlValue); - } - for (final n in c.nanori) { - b.insert( - TableNames.nanori, - { - 'kanji': c.literal, - 'nanori': n, - }, - ); - } - } - b.commit(); -} - -Future addDataFromKANJIDIC(Database db) async { - print('[KANJIDIC2] Reading file...'); - String rawXML = File('data/tmp/kanjidic2.xml').readAsStringSync(); - - print('[KANJIDIC2] Parsing XML...'); - XmlElement root = XmlDocument.parse(rawXML).getElement('kanjidic2')!; - - print('[KANJIDIC2] Transforming data...'); - final entries = transformXML(root); - - print('[KANJIDIC2] Writing to database...'); - await insertIntoDB(entries, db); -} diff --git a/lib/_data_ingestion/radkfile/parser.dart b/lib/_data_ingestion/radkfile/parser.dart index 1c57509..4c44385 100644 --- a/lib/_data_ingestion/radkfile/parser.dart +++ b/lib/_data_ingestion/radkfile/parser.dart @@ -1,32 +1,10 @@ import 'dart:io'; -import 'package:sqflite_common/sqlite_api.dart'; - -Future addDataFromRADKFILE(Database db) async { +Iterable parseRADKFILEBlocks(File radkfile) { final String content = File('data/tmp/radkfile_utf8').readAsStringSync(); + final Iterable blocks = content.replaceAll(RegExp(r'^#.*$'), '').split(r'$').skip(2); - print('[RADKFILE] Writing to database...'); - final b = db.batch(); - - for (final block in blocks) { - final String radical = block[1]; - final List kanjiList = block - .replaceFirst(RegExp(r'.*\n'), '') - .split('') - ..removeWhere((e) => e == '' || e == '\n'); - - for (final kanji in kanjiList.toSet()) { - b.insert( - 'RADKFILE', - { - 'radical': radical, - 'kanji': kanji, - }, - ); - } - } - - b.commit(); + return blocks; } diff --git a/lib/_data_ingestion/radkfile/seed_data.dart b/lib/_data_ingestion/radkfile/seed_data.dart new file mode 100644 index 0000000..35596d8 --- /dev/null +++ b/lib/_data_ingestion/radkfile/seed_data.dart @@ -0,0 +1,28 @@ +import 'package:sqflite_common/sqlite_api.dart'; + +Future seedRADKFILEData( + Iterable blocks, + Database db, +) async { + final b = db.batch(); + + for (final block in blocks) { + final String radical = block[1]; + final List kanjiList = block + .replaceFirst(RegExp(r'.*\n'), '') + .split('') + ..removeWhere((e) => e == '' || e == '\n'); + + for (final kanji in kanjiList.toSet()) { + b.insert( + 'RADKFILE', + { + 'radical': radical, + 'kanji': kanji, + }, + ); + } + } + + b.commit(); +} diff --git a/lib/_data_ingestion/seed_database.dart b/lib/_data_ingestion/seed_database.dart new file mode 100644 index 0000000..2eb2acf --- /dev/null +++ b/lib/_data_ingestion/seed_database.dart @@ -0,0 +1,55 @@ +import 'dart:io'; + +import 'package:jadb/_data_ingestion/jmdict/seed_data.dart'; +import 'package:jadb/_data_ingestion/jmdict/xml_parser.dart'; +import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart'; +import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart'; +import 'package:jadb/_data_ingestion/radkfile/parser.dart'; +import 'package:jadb/_data_ingestion/radkfile/seed_data.dart'; +import 'package:sqflite_common/sqlite_api.dart'; +import 'package:xml/xml.dart'; + +Future seedData(Database db) async { + await parseAndSeedDataFromJMdict(db); + await parseAndSeedDataFromRADKFILE(db); + await parseAndSeedDataFromKANJIDIC(db); +} + +Future parseAndSeedDataFromJMdict(Database db) async { + print('[JMdict] Reading file content...'); + String rawXML = File('data/tmp/JMdict.xml').readAsStringSync(); + + print('[JMdict] Parsing XML tags...'); + XmlElement root = XmlDocument.parse(rawXML).getElement('JMdict')!; + + print('[JMdict] Parsing XML content...'); + final entries = parseJMDictData(root); + + print('[JMdict] Writing to database...'); + await seedJMDictData(entries, db); +} + +Future parseAndSeedDataFromKANJIDIC(Database db) async { + print('[KANJIDIC2] Reading file...'); + String rawXML = File('data/tmp/kanjidic2.xml').readAsStringSync(); + + print('[KANJIDIC2] Parsing XML...'); + XmlElement root = XmlDocument.parse(rawXML).getElement('kanjidic2')!; + + print('[KANJIDIC2] Parsing XML content...'); + final entries = parseKANJIDICData(root); + + print('[KANJIDIC2] Writing to database...'); + await seedKANJIDICData(entries, db); +} + +Future parseAndSeedDataFromRADKFILE(Database db) async { + print('[RADKFILE] Reading file...'); + File raw = File('data/tmp/RADKFILE'); + + print('[RADKFILE] Parsing content...'); + final blocks = parseRADKFILEBlocks(raw); + + print('[RADKFILE] Writing to database...'); + seedRADKFILEData(blocks, db); +}