diff --git a/lib/_data_ingestion/jmdict/seed_data.dart b/lib/_data_ingestion/jmdict/seed_data.dart index d84a929..03b3208 100644 --- a/lib/_data_ingestion/jmdict/seed_data.dart +++ b/lib/_data_ingestion/jmdict/seed_data.dart @@ -5,6 +5,8 @@ import 'package:jadb/_data_ingestion/jmdict/objects.dart'; import 'package:jadb/table_names/jmdict.dart'; import 'package:sqflite_common/sqlite_api.dart'; +/// A wrapper for the result of resolving an xref, which includes the resolved entry and a flag +/// indicating whether the xref was ambiguous (i.e. could refer to multiple entries). class ResolvedXref { Entry entry; bool ambiguous; @@ -12,6 +14,13 @@ class ResolvedXref { ResolvedXref(this.entry, this.ambiguous); } +/// Resolves an xref (pair of kanji, optionally reading, and optionally sense number) to a specific +/// JMdict entry, if possible. +/// +/// If the xref is ambiguous (i.e. it could refer to multiple entries), the +/// first entry is returned, and the returned value is marked as ambiguous. +/// +/// If the xref cannot be resolved to any entry at all, an exception is thrown. 
ResolvedXref resolveXref( SplayTreeMap> entriesByKanji, SplayTreeMap> entriesByReading, @@ -181,24 +190,17 @@ Future seedJMDictData(List entries, Database db) async { print(' [JMdict] Building xref trees'); final SplayTreeMap> entriesByKanji = SplayTreeMap(); + final SplayTreeMap> entriesByReading = SplayTreeMap(); for (final entry in entries) { for (final kanji in entry.kanji) { - if (entriesByKanji.containsKey(kanji.reading)) { - entriesByKanji.update(kanji.reading, (list) => list..add(entry)); - } else { - entriesByKanji.putIfAbsent(kanji.reading, () => {entry}); - } + entriesByKanji.putIfAbsent(kanji.reading, () => {}); + entriesByKanji.update(kanji.reading, (set) => set..add(entry)); } - } - final SplayTreeMap> entriesByReading = SplayTreeMap(); - for (final entry in entries) { + for (final reading in entry.readings) { - if (entriesByReading.containsKey(reading.reading)) { - entriesByReading.update(reading.reading, (list) => list..add(entry)); - } else { - entriesByReading.putIfAbsent(reading.reading, () => {entry}); - } + entriesByReading.putIfAbsent(reading.reading, () => {}); + entriesByReading.update(reading.reading, (set) => set..add(entry)); } } @@ -207,6 +209,7 @@ Future seedJMDictData(List entries, Database db) async { for (final e in entries) { for (final s in e.senses) { + final seenSeeAlsoXrefs = {}; for (final xref in s.seeAlso) { final resolvedEntry = resolveXref( entriesByKanji, @@ -214,6 +217,15 @@ Future seedJMDictData(List entries, Database db) async { xref, ); + if (seenSeeAlsoXrefs.contains(resolvedEntry.entry.entryId)) { + print( + 'WARNING: Skipping duplicate seeAlso xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n' + ' (kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, senseOrderNum: ${xref.senseOrderNum})', + ); + continue; + } + seenSeeAlsoXrefs.add(resolvedEntry.entry.entryId); + b.insert(JMdictTableNames.senseSeeAlso, { 'senseId': s.senseId, 'xrefEntryId': resolvedEntry.entry.entryId, @@ -224,6 +236,7 
@@ Future seedJMDictData(List entries, Database db) async { }); } + final seenAntonymXrefs = {}; for (final ant in s.antonyms) { final resolvedEntry = resolveXref( entriesByKanji, @@ -231,6 +244,15 @@ Future seedJMDictData(List entries, Database db) async { ant, ); + if (seenAntonymXrefs.contains(resolvedEntry.entry.entryId)) { + print( + 'WARNING: Skipping duplicate antonym xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n' + ' (kanjiRef: ${ant.kanjiRef}, readingRef: ${ant.readingRef}, senseOrderNum: ${ant.senseOrderNum})', + ); + continue; + } + seenAntonymXrefs.add(resolvedEntry.entry.entryId); + b.insert(JMdictTableNames.senseAntonyms, { 'senseId': s.senseId, 'xrefEntryId': resolvedEntry.entry.entryId,