jmdict: skip inserting duplicate xrefs
This commit is contained in:
@@ -5,6 +5,8 @@ import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
||||
import 'package:jadb/table_names/jmdict.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
/// A wrapper for the result of resolving an xref, which includes the resolved entry and a flag
|
||||
/// indicating whether the xref was ambiguous (i.e. could refer to multiple entries).
|
||||
class ResolvedXref {
|
||||
Entry entry;
|
||||
bool ambiguous;
|
||||
@@ -12,6 +14,13 @@ class ResolvedXref {
|
||||
ResolvedXref(this.entry, this.ambiguous);
|
||||
}
|
||||
|
||||
/// Resolves an xref (pair of kanji, optionally reading, and optionally sense number) to a specific
|
||||
/// JMdict entry, if possible.
|
||||
///
|
||||
/// If the xref is ambiguous (i.e. it could refer to multiple entries), the
|
||||
/// first entry is returned, and the returned value is marked as ambiguous.
|
||||
///
|
||||
/// If the xref cannot be resolved to any entry at all, an exception is thrown.
|
||||
ResolvedXref resolveXref(
|
||||
SplayTreeMap<String, Set<Entry>> entriesByKanji,
|
||||
SplayTreeMap<String, Set<Entry>> entriesByReading,
|
||||
@@ -181,24 +190,17 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
|
||||
print(' [JMdict] Building xref trees');
|
||||
final SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
|
||||
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
|
||||
|
||||
for (final entry in entries) {
|
||||
for (final kanji in entry.kanji) {
|
||||
if (entriesByKanji.containsKey(kanji.reading)) {
|
||||
entriesByKanji.update(kanji.reading, (list) => list..add(entry));
|
||||
} else {
|
||||
entriesByKanji.putIfAbsent(kanji.reading, () => {entry});
|
||||
}
|
||||
entriesByKanji.putIfAbsent(kanji.reading, () => {});
|
||||
entriesByKanji.update(kanji.reading, (set) => set..add(entry));
|
||||
}
|
||||
}
|
||||
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
|
||||
for (final entry in entries) {
|
||||
|
||||
for (final reading in entry.readings) {
|
||||
if (entriesByReading.containsKey(reading.reading)) {
|
||||
entriesByReading.update(reading.reading, (list) => list..add(entry));
|
||||
} else {
|
||||
entriesByReading.putIfAbsent(reading.reading, () => {entry});
|
||||
}
|
||||
entriesByReading.putIfAbsent(reading.reading, () => {});
|
||||
entriesByReading.update(reading.reading, (set) => set..add(entry));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,6 +209,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
|
||||
for (final e in entries) {
|
||||
for (final s in e.senses) {
|
||||
final seenSeeAlsoXrefs = <int>{};
|
||||
for (final xref in s.seeAlso) {
|
||||
final resolvedEntry = resolveXref(
|
||||
entriesByKanji,
|
||||
@@ -214,6 +217,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
xref,
|
||||
);
|
||||
|
||||
if (seenSeeAlsoXrefs.contains(resolvedEntry.entry.entryId)) {
|
||||
print(
|
||||
'WARNING: Skipping duplicate seeAlso xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
|
||||
' (kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, senseOrderNum: ${xref.senseOrderNum})',
|
||||
);
|
||||
continue;
|
||||
}
|
||||
seenSeeAlsoXrefs.add(resolvedEntry.entry.entryId);
|
||||
|
||||
b.insert(JMdictTableNames.senseSeeAlso, {
|
||||
'senseId': s.senseId,
|
||||
'xrefEntryId': resolvedEntry.entry.entryId,
|
||||
@@ -224,6 +236,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
});
|
||||
}
|
||||
|
||||
final seenAntonymXrefs = <int>{};
|
||||
for (final ant in s.antonyms) {
|
||||
final resolvedEntry = resolveXref(
|
||||
entriesByKanji,
|
||||
@@ -231,6 +244,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
ant,
|
||||
);
|
||||
|
||||
if (seenAntonymXrefs.contains(resolvedEntry.entry.entryId)) {
|
||||
print(
|
||||
'WARNING: Skipping duplicate antonym xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
|
||||
' (kanjiRef: ${ant.kanjiRef}, readingRef: ${ant.readingRef}, senseOrderNum: ${ant.senseOrderNum})',
|
||||
);
|
||||
continue;
|
||||
}
|
||||
seenAntonymXrefs.add(resolvedEntry.entry.entryId);
|
||||
|
||||
b.insert(JMdictTableNames.senseAntonyms, {
|
||||
'senseId': s.senseId,
|
||||
'xrefEntryId': resolvedEntry.entry.entryId,
|
||||
|
||||
Reference in New Issue
Block a user