import 'dart:collection';
import 'dart:io';

import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';

import '../romaji_transliteration.dart';
import 'objects.dart';

/// Reads the numeric priority markers of a kanji or reading element.
///
/// Looks at every direct `<prefix>_pri` child of [e] (e.g. `ke_pri` for
/// `prefix == 'ke'`) and parses the integer that follows the `news`, `ichi`,
/// `spec`, `gai` or `nf` tag.
///
/// Returns a five element list ordered `[news, ichi, spec, gai, nf]`. A slot
/// is `null` when no child carried that tag; if a tag occurs more than once
/// the last occurrence wins.
///
/// Throws a [FormatException] if the text after a recognised tag is not an
/// integer.
List<int?> getPriNums(XmlElement e, String prefix) {
  int? news, ichi, spec, gai, nf;
  for (final pri in e.findElements('${prefix}_pri')) {
    final txt = pri.innerText;
    if (txt.startsWith('news')) {
      news = int.parse(txt.substring(4));
    } else if (txt.startsWith('ichi')) {
      ichi = int.parse(txt.substring(4));
    } else if (txt.startsWith('spec')) {
      spec = int.parse(txt.substring(4));
    } else if (txt.startsWith('gai')) {
      gai = int.parse(txt.substring(3));
    } else if (txt.startsWith('nf')) {
      nf = int.parse(txt.substring(2));
    }
  }
  return [news, ichi, spec, gai, nf];
}

/// Returns the inner text of every direct child of [parent] named [name].
List<String> _innerTexts(XmlElement parent, String name) =>
    parent.findElements(name).map((e) => e.innerText).toList();

/// Returns the inner text of every direct child of [parent] named [name],
/// with the first and last character of each text removed.
///
/// NOTE(review): presumably the text is an unexpanded entity such as `&n;`
/// and this drops the `&` and `;` - confirm against the parser configuration.
List<String> _entityNames(XmlElement parent, String name) => parent
    .findElements(name)
    .map((e) => e.innerText.substring(1, e.innerText.length - 1))
    .toList();

/// Converts the children of the JMdict [root] element into [Entry] objects.
///
/// Every child element of [root] is treated as one dictionary entry. Sense ids
/// are assigned from a single counter that runs across all entries, starting
/// at 1, in document order.
///
/// Throws a [StateError] if an entry lacks `ent_seq`, or a kanji/reading
/// element lacks `keb`/`reb` (from `Iterable.first`).
List<Entry> transformXML(XmlElement root) {
  final entries = <Entry>[];
  // Shared across entries so that every sense gets a globally unique id.
  var senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final kanjiEls = <KanjiElement>[];
    for (final kEle in entry.findAllElements('k_ele')) {
      final kePri = getPriNums(kEle, 'ke');
      kanjiEls.add(
        KanjiElement(
          // NOTE(review): unlike `re_inf`, `pos`, `misc` and `dial`, the
          // `ke_inf` text is stored without stripping its first and last
          // character. Looks inconsistent - confirm what the kanjiInfo table
          // expects before changing it.
          info: _innerTexts(kEle, 'ke_inf'),
          reading: kEle.findElements('keb').first.innerText,
          news: kePri[0],
          ichi: kePri[1],
          spec: kePri[2],
          gai: kePri[3],
          nf: kePri[4],
        ),
      );
    }

    final readingEls = <ReadingElement>[];
    for (final rEle in entry.findAllElements('r_ele')) {
      final rePri = getPriNums(rEle, 're');
      readingEls.add(
        ReadingElement(
          info: _entityNames(rEle, 're_inf'),
          restrictions: _innerTexts(rEle, 're_restr'),
          reading: rEle.findElements('reb').first.innerText,
          news: rePri[0],
          ichi: rePri[1],
          spec: rePri[2],
          gai: rePri[3],
          nf: rePri[4],
        ),
      );
    }

    final senses = <Sense>[];
    for (final sense in entry.findAllElements('sense')) {
      senseId++;
      senses.add(
        Sense(
          id: senseId,
          restrictedToKanji: _innerTexts(sense, 'stagk'),
          restrictedToReading: _innerTexts(sense, 'stagr'),
          pos: _entityNames(sense, 'pos'),
          misc: _entityNames(sense, 'misc'),
          dialects: _entityNames(sense, 'dial'),
          info: _innerTexts(sense, 's_inf'),
          // NOTE(review): `field` elements are never read here, although
          // insertIntoDB writes `Sense.fields` - confirm whether the Sense
          // constructor should be given them.
          languageSource: sense
              .findElements('lsource')
              .map(
                (e) => LanguageSource(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  // The JMdict DTD defines `ls_type` as "full" when absent
                  // and "part" otherwise, so the source fully describes the
                  // sense unless the attribute says "part".
                  fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                  constructedFromSmallerWords:
                      e.getAttribute('ls_wasei') == 'y',
                ),
              )
              .toList(),
          glossary: sense
              .findElements('gloss')
              .map(
                (e) => Glossary(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  phrase: e.innerText,
                  type: e.getAttribute('g_type'),
                ),
              )
              .toList(),
          antonyms: sense
              .findElements('ant')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
          seeAlso: sense
              .findElements('xref')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
        ),
      );
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}

/// Returns the entries a cross reference points at.
///
/// With both [kanjiRef] and [readingRef] given, only entries that carry the
/// kanji *and* the reading match. With just one of them, every entry carrying
/// it matches.
///
/// Fails fast (null assertion) when a referenced kanji or reading is not in
/// the lookup maps, or when both references are `null`.
Set<Entry> _resolveReference(
  String? kanjiRef,
  String? readingRef,
  Map<String, Set<Entry>> entriesByKanji,
  Map<String, Set<Entry>> entriesByReading,
) {
  if (kanjiRef != null && readingRef != null) {
    return entriesByKanji[kanjiRef]!
        .intersection(entriesByReading[readingRef]!);
  }
  if (kanjiRef != null) {
    return entriesByKanji[kanjiRef]!;
  }
  return entriesByReading[readingRef]!;
}

/// Writes [entries] to [db] in two batches.
///
/// The first batch inserts entries, kanji/reading elements with their info
/// and restrictions, and the romanised-kana and English lookup rows. The
/// second batch inserts senses and everything hanging off them, including
/// cross references and antonyms, which are resolved against in-memory
/// kanji/reading lookup maps built between the two batches.
///
/// Only glossary rows whose language is `eng` are written.
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1');
  var b = db.batch();

  for (final e in entries) {
    b.insert(TableNames.entry, e.sqlValue);

    for (final k in e.kanji) {
      b.insert(TableNames.kanjiElement, {...k.sqlValue, 'entryId': e.id});
      // b.insert(
      //   TableNames.entryByKana,
      //   {'entryId': e.id, 'kana': transliterateKatakanaToHiragana(k.reading)},
      //   // Some entries have the same reading twice with difference in katakana and hiragana
      //   conflictAlgorithm: ConflictAlgorithm.ignore,
      // );
      for (final i in k.info) {
        b.insert(
          TableNames.kanjiInfo,
          {'entryId': e.id, 'reading': k.reading, 'info': i},
        );
      }
    }

    for (final r in e.readings) {
      b.insert(TableNames.readingElement, {...r.sqlValue, 'entryId': e.id});
      b.insert(
        TableNames.entryByKana,
        {'entryId': e.id, 'kana': transliterateKanaToLatin(r.reading)},
        // Some entries have the same reading twice with difference in
        // katakana and hiragana.
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
      for (final i in r.info) {
        b.insert(
          TableNames.readingInfo,
          {'entryId': e.id, 'reading': r.reading, 'info': i},
        );
      }
      for (final res in r.restrictions) {
        b.insert(
          TableNames.readingRestriction,
          {'entryId': e.id, 'reading': r.reading, 'restriction': res},
        );
      }
    }

    for (final s in e.senses) {
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.entryByEnglish,
            {'entryId': e.id, 'english': g.phrase},
            // The same English phrase can occur in several senses of one
            // entry; skip the repeats.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }
    }
  }

  // The per-statement results are never read, so don't collect them.
  await b.commit(noResult: true);

  print(' [JMdict] Building trees');
  final entriesByKanji = SplayTreeMap<String, Set<Entry>>();
  final entriesByReading = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final kanji in entry.kanji) {
      entriesByKanji.putIfAbsent(kanji.reading, () => <Entry>{}).add(entry);
    }
    for (final reading in entry.readings) {
      entriesByReading
          .putIfAbsent(reading.reading, () => <Entry>{})
          .add(entry);
    }
  }

  print(' [JMdict] Batch 2');
  b = db.batch();

  for (final e in entries) {
    for (final s in e.senses) {
      b.insert(
        TableNames.sense,
        {...s.sqlValue, 'id': s.id, 'entryId': e.id},
      );

      for (final d in s.dialects) {
        b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
      }
      for (final f in s.fields) {
        b.insert(TableNames.senseField, {'senseId': s.id, 'field': f});
      }
      for (final i in s.info) {
        b.insert(TableNames.senseInfo, {'senseId': s.id, 'info': i});
      }
      for (final m in s.misc) {
        b.insert(TableNames.senseMisc, {'senseId': s.id, 'misc': m});
      }
      for (final p in s.pos) {
        b.insert(TableNames.sensePOS, {'senseId': s.id, 'pos': p});
      }
      // Inserted exactly once per sense; a second identical loop used to
      // write every language source row twice.
      for (final ls in s.languageSource) {
        b.insert(
          TableNames.senseLanguageSource,
          {...ls.sqlValue, 'senseId': s.id},
        );
      }
      for (final rk in s.restrictedToKanji) {
        b.insert(
          TableNames.senseRestrictedToKanji,
          {'entryId': e.id, 'senseId': s.id, 'kanji': rk},
        );
      }
      for (final rr in s.restrictedToReading) {
        b.insert(
          TableNames.senseRestrictedToReading,
          {'entryId': e.id, 'senseId': s.id, 'reading': rr},
        );
      }
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.senseGlossary,
            {...g.sqlValue, 'senseId': s.id},
            // There are some duplicate glossary, especially in
            // the other languages.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }

      for (final xref in s.seeAlso) {
        final targets = _resolveReference(
          xref.kanjiRef,
          xref.readingRef,
          entriesByKanji,
          entriesByReading,
        );
        final senseNum = xref.senseNum;
        for (final target in targets) {
          // A candidate with fewer senses than the referenced sense number
          // cannot be the intended target.
          if (senseNum != null && senseNum > target.senses.length) continue;
          b.insert(
            TableNames.senseSeeAlso,
            {
              'senseId': s.id,
              'xrefEntryId': target.id,
              'seeAlsoKanji': xref.kanjiRef,
              'seeAlsoReading': xref.readingRef,
              'seeAlsoSense': xref.senseNum,
            },
          );
        }
      }

      for (final ant in s.antonyms) {
        final targets = _resolveReference(
          ant.kanjiRef,
          ant.readingRef,
          entriesByKanji,
          entriesByReading,
        );
        final senseNum = ant.senseNum;
        for (final target in targets) {
          // Same sense-number filter as for the see-also references.
          if (senseNum != null && senseNum > target.senses.length) continue;
          b.insert(
            TableNames.senseAntonyms,
            {
              'senseId': s.id,
              'xrefEntryId': target.id,
              'antonymKanji': ant.kanjiRef,
              'antonymReading': ant.readingRef,
              'antonymSense': ant.senseNum,
            },
          );
        }
      }
    }
  }

  await b.commit(noResult: true);
}

/// Loads `data/JMdict.xml`, converts it to [Entry] objects and stores them in
/// [db].
///
/// The file is read and parsed in one go, so the whole document is held in
/// memory. Throws if the file cannot be read or has no `JMdict` root element.
Future<void> addDataFromJMdict(Database db) async {
  print('[JMdict] Reading file...');
  final rawXML = File('data/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML...');
  final root = XmlDocument.parse(rawXML).getElement('JMdict')!;

  print('[JMdict] Transforming data...');
  final entries = transformXML(root);

  print('[JMdict] Writing to database...');
  await insertIntoDB(entries, db);
}