import 'dart:collection';

import 'package:collection/collection.dart';
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:jadb/table_names/jmdict.dart';
import 'package:sqflite_common/sqlite_api.dart';

/// The outcome of resolving a cross-reference to a concrete [Entry].
class ResolvedXref {
  /// The entry the cross-reference was resolved to.
  Entry entry;

  /// Whether more than one candidate entry was still left when [entry] was
  /// chosen.
  bool ambiguous;

  ResolvedXref(this.entry, this.ambiguous);
}

/// Resolves [xref] to the entry it most likely refers to.
///
/// Candidates are looked up in [entriesByKanji] and/or [entriesByReading]
/// (the intersection of both when [xref] carries both references) and are
/// then narrowed down:
///
/// 1. If [xref] names a sense number, entries with fewer senses than that
///    number are dropped.
/// 2. If [xref] only has a reading reference and several candidates remain,
///    entries without any kanji are preferred when there are any.
/// 3. Of the remaining candidates, the one with the most senses is chosen.
///
/// The returned [ResolvedXref.ambiguous] is `true` when more than one
/// candidate was left at step 3; a warning is printed in that case.
///
/// Throws an [Exception] if [xref] has neither a kanji nor a reading
/// reference, or if no candidate entry remains.
ResolvedXref resolveXref(
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
  XRefParts xref,
) {
  // A reference to a kanji/reading that no entry carries yields an empty
  // candidate list, so it is reported through the descriptive exception
  // further down instead of failing on a null assertion.
  List<Entry> candidateEntries = switch ((xref.kanjiRef, xref.readingRef)) {
    (null, null) => throw Exception(
      'Xref $xref has no kanji or reading reference',
    ),
    (final String k, null) => entriesByKanji[k]?.toList() ?? [],
    (null, final String r) => entriesByReading[r]?.toList() ?? [],
    (final String k, final String r) =>
      (entriesByKanji[k] ?? const <Entry>{})
          .intersection(entriesByReading[r] ?? const <Entry>{})
          .toList(),
  };

  // Filter out entries that don't have the number of senses specified in the
  // xref.
  final senseOrderNum = xref.senseOrderNum;
  if (senseOrderNum != null) {
    candidateEntries.retainWhere(
      (entry) => entry.senses.length >= senseOrderNum,
    );
  }

  // If the xref has a reading ref but no kanji ref, and there are multiple
  // entries to choose from, prefer entries without any kanji if possible.
  if (xref.kanjiRef == null &&
      xref.readingRef != null &&
      candidateEntries.length > 1) {
    final candidatesWithEmptyKanji = candidateEntries
        .where((entry) => entry.kanji.isEmpty)
        .toList();

    if (candidatesWithEmptyKanji.isNotEmpty) {
      candidateEntries = candidatesWithEmptyKanji;
    }
  }

  if (candidateEntries.isEmpty) {
    throw Exception(
      'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  // Having more senses is a cheap way to choose the most likely correct
  // entry in case there are multiple candidates left, so sort in descending
  // order of sense count and take the first element.
  candidateEntries.sortBy<num>((entry) => -entry.senses.length);

  final ambiguous = candidateEntries.length > 1;
  if (ambiguous) {
    print(
      'WARNING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  return ResolvedXref(candidateEntries.first, ambiguous);
}

/// Writes all JMdict [entries] into [db].
///
/// The data is written in three batches, each committed before the next one
/// starts:
///
/// 1. Entries with their kanji and reading elements.
/// 2. Senses and their per-sense attributes.
/// 3. Cross-references ("see also" and antonyms), resolved to entries with
///    [resolveXref].
///
/// Any exception thrown by [resolveXref] propagates to the caller.
Future<void> seedJMDictData(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1 - Kanji and readings');
  await _seedKanjiAndReadings(entries, db);

  print(' [JMdict] Batch 2 - Senses');
  await _seedSenses(entries, db);

  print(' [JMdict] Building xref trees');
  final entriesByKanji = SplayTreeMap<String, Set<Entry>>();
  final entriesByReading = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final kanji in entry.kanji) {
      entriesByKanji.putIfAbsent(kanji.reading, () => <Entry>{}).add(entry);
    }
    for (final reading in entry.readings) {
      entriesByReading
          .putIfAbsent(reading.reading, () => <Entry>{})
          .add(entry);
    }
  }

  print(' [JMdict] Batch 3 - Xrefs');
  await _seedXrefs(entries, db, entriesByKanji, entriesByReading);
}

/// Inserts every entry together with its kanji and reading elements.
Future<void> _seedKanjiAndReadings(List<Entry> entries, Database db) async {
  final batch = db.batch();

  // Kanji and reading elements draw their ids from the same counter. It is
  // incremented before each insert, so the first element gets id 1.
  var elementId = 0;

  for (final entry in entries) {
    batch.insert(JMdictTableNames.entry, entry.sqlValue);

    for (final kanji in entry.kanji) {
      elementId++;
      // Spread into a new map rather than mutating the map that `sqlValue`
      // returned.
      batch.insert(JMdictTableNames.kanjiElement, {
        ...kanji.sqlValue,
        'entryId': entry.entryId,
        'elementId': elementId,
      });

      for (final info in kanji.info) {
        batch.insert(JMdictTableNames.kanjiInfo, {
          'elementId': elementId,
          'info': info,
        });
      }
    }

    for (final reading in entry.readings) {
      elementId++;
      batch.insert(JMdictTableNames.readingElement, {
        ...reading.sqlValue,
        'entryId': entry.entryId,
        'elementId': elementId,
      });

      for (final info in reading.info) {
        batch.insert(JMdictTableNames.readingInfo, {
          'elementId': elementId,
          'info': info,
        });
      }
      for (final restriction in reading.restrictions) {
        batch.insert(JMdictTableNames.readingRestriction, {
          'elementId': elementId,
          'restriction': restriction,
        });
      }
    }
  }

  await batch.commit(noResult: true);
}

/// Inserts every sense of every entry together with its attribute rows.
Future<void> _seedSenses(List<Entry> entries, Database db) async {
  final batch = db.batch();

  for (final entry in entries) {
    for (final sense in entry.senses) {
      final senseId = sense.senseId;

      batch.insert(JMdictTableNames.sense, {
        ...sense.sqlValue,
        'entryId': entry.entryId,
      });

      for (final dialect in sense.dialects) {
        batch.insert(JMdictTableNames.senseDialect, {
          'senseId': senseId,
          'dialect': dialect,
        });
      }
      for (final field in sense.fields) {
        batch.insert(JMdictTableNames.senseField, {
          'senseId': senseId,
          'field': field,
        });
      }
      for (final info in sense.info) {
        batch.insert(JMdictTableNames.senseInfo, {
          'senseId': senseId,
          'info': info,
        });
      }
      for (final misc in sense.misc) {
        batch.insert(JMdictTableNames.senseMisc, {
          'senseId': senseId,
          'misc': misc,
        });
      }
      for (final pos in sense.pos) {
        batch.insert(JMdictTableNames.sensePOS, {
          'senseId': senseId,
          'pos': pos,
        });
      }
      for (final kanji in sense.restrictedToKanji) {
        batch.insert(JMdictTableNames.senseRestrictedToKanji, {
          'entryId': entry.entryId,
          'senseId': senseId,
          'kanji': kanji,
        });
      }
      for (final reading in sense.restrictedToReading) {
        batch.insert(JMdictTableNames.senseRestrictedToReading, {
          'entryId': entry.entryId,
          'senseId': senseId,
          'reading': reading,
        });
      }
      for (final source in sense.languageSource) {
        batch.insert(JMdictTableNames.senseLanguageSource, {
          ...source.sqlValue,
          'senseId': senseId,
        });
      }
      for (final gloss in sense.glossary) {
        batch.insert(JMdictTableNames.senseGlossary, {
          ...gloss.sqlValue,
          'senseId': senseId,
        });
      }
    }
  }

  await batch.commit(noResult: true);
}

/// Resolves and inserts the "see also" and antonym references of every sense.
Future<void> _seedXrefs(
  List<Entry> entries,
  Database db,
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
) async {
  final batch = db.batch();

  for (final entry in entries) {
    for (final sense in entry.senses) {
      for (final xref in sense.seeAlso) {
        final resolved = resolveXref(entriesByKanji, entriesByReading, xref);

        batch.insert(JMdictTableNames.senseSeeAlso, {
          'senseId': sense.senseId,
          'xrefEntryId': resolved.entry.entryId,
          'seeAlsoKanji': xref.kanjiRef,
          'seeAlsoReading': xref.readingRef,
          'seeAlsoSense': xref.senseOrderNum,
          // Stored as 0/1 because sqflite's supported value types do not
          // include bool.
          'ambiguous': resolved.ambiguous ? 1 : 0,
        });
      }

      for (final antonym in sense.antonyms) {
        final resolved = resolveXref(
          entriesByKanji,
          entriesByReading,
          antonym,
        );

        batch.insert(JMdictTableNames.senseAntonyms, {
          'senseId': sense.senseId,
          'xrefEntryId': resolved.entry.entryId,
          'antonymKanji': antonym.kanjiRef,
          'antonymReading': antonym.readingRef,
          'antonymSense': antonym.senseOrderNum,
          // Stored as 0/1 because sqflite's supported value types do not
          // include bool.
          'ambiguous': resolved.ambiguous ? 1 : 0,
        });
      }
    }
  }

  await batch.commit(noResult: true);
}