jmdict: skip inserting duplicate xrefs
This commit is contained in:
@@ -5,6 +5,8 @@ import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
|||||||
import 'package:jadb/table_names/jmdict.dart';
|
import 'package:jadb/table_names/jmdict.dart';
|
||||||
import 'package:sqflite_common/sqlite_api.dart';
|
import 'package:sqflite_common/sqlite_api.dart';
|
||||||
|
|
||||||
|
/// A wrapper for the result of resolving an xref, which includes the resolved entry and a flag
|
||||||
|
/// indicating whether the xref was ambiguous (i.e. could refer to multiple entries).
|
||||||
class ResolvedXref {
|
class ResolvedXref {
|
||||||
Entry entry;
|
Entry entry;
|
||||||
bool ambiguous;
|
bool ambiguous;
|
||||||
@@ -12,6 +14,13 @@ class ResolvedXref {
|
|||||||
ResolvedXref(this.entry, this.ambiguous);
|
ResolvedXref(this.entry, this.ambiguous);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Resolves an xref (pair of kanji, optionally reading, and optionally sense number) to a specific
|
||||||
|
/// JMdict entry, if possible.
|
||||||
|
///
|
||||||
|
/// If the xref is ambiguous (i.e. it could refer to multiple entries), the
|
||||||
|
/// first entry is returned, and the returned value is marked as ambiguous.
|
||||||
|
///
|
||||||
|
/// If the xref cannot be resolved to any entry at all, an exception is thrown.
|
||||||
ResolvedXref resolveXref(
|
ResolvedXref resolveXref(
|
||||||
SplayTreeMap<String, Set<Entry>> entriesByKanji,
|
SplayTreeMap<String, Set<Entry>> entriesByKanji,
|
||||||
SplayTreeMap<String, Set<Entry>> entriesByReading,
|
SplayTreeMap<String, Set<Entry>> entriesByReading,
|
||||||
@@ -181,24 +190,17 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
|||||||
|
|
||||||
print(' [JMdict] Building xref trees');
|
print(' [JMdict] Building xref trees');
|
||||||
final SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
|
final SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
|
||||||
|
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
|
||||||
|
|
||||||
for (final entry in entries) {
|
for (final entry in entries) {
|
||||||
for (final kanji in entry.kanji) {
|
for (final kanji in entry.kanji) {
|
||||||
if (entriesByKanji.containsKey(kanji.reading)) {
|
entriesByKanji.putIfAbsent(kanji.reading, () => {});
|
||||||
entriesByKanji.update(kanji.reading, (list) => list..add(entry));
|
entriesByKanji.update(kanji.reading, (set) => set..add(entry));
|
||||||
} else {
|
|
||||||
entriesByKanji.putIfAbsent(kanji.reading, () => {entry});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
|
|
||||||
for (final entry in entries) {
|
|
||||||
for (final reading in entry.readings) {
|
for (final reading in entry.readings) {
|
||||||
if (entriesByReading.containsKey(reading.reading)) {
|
entriesByReading.putIfAbsent(reading.reading, () => {});
|
||||||
entriesByReading.update(reading.reading, (list) => list..add(entry));
|
entriesByReading.update(reading.reading, (set) => set..add(entry));
|
||||||
} else {
|
|
||||||
entriesByReading.putIfAbsent(reading.reading, () => {entry});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -207,6 +209,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
|||||||
|
|
||||||
for (final e in entries) {
|
for (final e in entries) {
|
||||||
for (final s in e.senses) {
|
for (final s in e.senses) {
|
||||||
|
final seenSeeAlsoXrefs = <int>{};
|
||||||
for (final xref in s.seeAlso) {
|
for (final xref in s.seeAlso) {
|
||||||
final resolvedEntry = resolveXref(
|
final resolvedEntry = resolveXref(
|
||||||
entriesByKanji,
|
entriesByKanji,
|
||||||
@@ -214,6 +217,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
|||||||
xref,
|
xref,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (seenSeeAlsoXrefs.contains(resolvedEntry.entry.entryId)) {
|
||||||
|
print(
|
||||||
|
'WARNING: Skipping duplicate seeAlso xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
|
||||||
|
' (kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, senseOrderNum: ${xref.senseOrderNum})',
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seenSeeAlsoXrefs.add(resolvedEntry.entry.entryId);
|
||||||
|
|
||||||
b.insert(JMdictTableNames.senseSeeAlso, {
|
b.insert(JMdictTableNames.senseSeeAlso, {
|
||||||
'senseId': s.senseId,
|
'senseId': s.senseId,
|
||||||
'xrefEntryId': resolvedEntry.entry.entryId,
|
'xrefEntryId': resolvedEntry.entry.entryId,
|
||||||
@@ -224,6 +236,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final seenAntonymXrefs = <int>{};
|
||||||
for (final ant in s.antonyms) {
|
for (final ant in s.antonyms) {
|
||||||
final resolvedEntry = resolveXref(
|
final resolvedEntry = resolveXref(
|
||||||
entriesByKanji,
|
entriesByKanji,
|
||||||
@@ -231,6 +244,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
|||||||
ant,
|
ant,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (seenAntonymXrefs.contains(resolvedEntry.entry.entryId)) {
|
||||||
|
print(
|
||||||
|
'WARNING: Skipping duplicate antonym xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
|
||||||
|
' (kanjiRef: ${ant.kanjiRef}, readingRef: ${ant.readingRef}, senseOrderNum: ${ant.senseOrderNum})',
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seenAntonymXrefs.add(resolvedEntry.entry.entryId);
|
||||||
|
|
||||||
b.insert(JMdictTableNames.senseAntonyms, {
|
b.insert(JMdictTableNames.senseAntonyms, {
|
||||||
'senseId': s.senseId,
|
'senseId': s.senseId,
|
||||||
'xrefEntryId': resolvedEntry.entry.entryId,
|
'xrefEntryId': resolvedEntry.entry.entryId,
|
||||||
|
|||||||
Reference in New Issue
Block a user