jmdict: skip inserting duplicate xrefs

This commit is contained in:
2026-04-01 16:03:56 +09:00
parent 5aa068eaec
commit f40825de65

View File

@@ -5,6 +5,8 @@ import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:jadb/table_names/jmdict.dart';
import 'package:sqflite_common/sqlite_api.dart';
/// A wrapper for the result of resolving an xref, which includes the resolved entry and a flag
/// indicating whether the xref was ambiguous (i.e. could refer to multiple entries).
class ResolvedXref {
Entry entry;
bool ambiguous;
@@ -12,6 +14,13 @@ class ResolvedXref {
ResolvedXref(this.entry, this.ambiguous);
}
/// Resolves an xref (a kanji, optionally a reading, and optionally a sense number) to a specific
/// JMdict entry, if possible.
///
/// If the xref is ambiguous (i.e. it could refer to multiple entries), the
/// first entry is returned, and the returned value is marked as ambiguous.
///
/// If the xref cannot be resolved to any entry at all, an exception is thrown.
ResolvedXref resolveXref(
SplayTreeMap<String, Set<Entry>> entriesByKanji,
SplayTreeMap<String, Set<Entry>> entriesByReading,
@@ -181,24 +190,17 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
print(' [JMdict] Building xref trees');
final SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
for (final entry in entries) {
for (final kanji in entry.kanji) {
if (entriesByKanji.containsKey(kanji.reading)) {
entriesByKanji.update(kanji.reading, (list) => list..add(entry));
} else {
entriesByKanji.putIfAbsent(kanji.reading, () => {entry});
}
entriesByKanji.putIfAbsent(kanji.reading, () => {});
entriesByKanji.update(kanji.reading, (set) => set..add(entry));
}
}
final SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
for (final entry in entries) {
for (final reading in entry.readings) {
if (entriesByReading.containsKey(reading.reading)) {
entriesByReading.update(reading.reading, (list) => list..add(entry));
} else {
entriesByReading.putIfAbsent(reading.reading, () => {entry});
}
entriesByReading.putIfAbsent(reading.reading, () => {});
entriesByReading.update(reading.reading, (set) => set..add(entry));
}
}
@@ -207,6 +209,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
for (final e in entries) {
for (final s in e.senses) {
final seenSeeAlsoXrefs = <int>{};
for (final xref in s.seeAlso) {
final resolvedEntry = resolveXref(
entriesByKanji,
@@ -214,6 +217,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
xref,
);
if (seenSeeAlsoXrefs.contains(resolvedEntry.entry.entryId)) {
print(
'WARNING: Skipping duplicate seeAlso xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
' (kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, senseOrderNum: ${xref.senseOrderNum})',
);
continue;
}
seenSeeAlsoXrefs.add(resolvedEntry.entry.entryId);
b.insert(JMdictTableNames.senseSeeAlso, {
'senseId': s.senseId,
'xrefEntryId': resolvedEntry.entry.entryId,
@@ -224,6 +236,7 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
});
}
final seenAntonymXrefs = <int>{};
for (final ant in s.antonyms) {
final resolvedEntry = resolveXref(
entriesByKanji,
@@ -231,6 +244,15 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
ant,
);
if (seenAntonymXrefs.contains(resolvedEntry.entry.entryId)) {
print(
'WARNING: Skipping duplicate antonym xref from sense ${s.senseId} to entry ${resolvedEntry.entry.entryId}\n'
' (kanjiRef: ${ant.kanjiRef}, readingRef: ${ant.readingRef}, senseOrderNum: ${ant.senseOrderNum})',
);
continue;
}
seenAntonymXrefs.add(resolvedEntry.entry.entryId);
b.insert(JMdictTableNames.senseAntonyms, {
'senseId': s.senseId,
'xrefEntryId': resolvedEntry.entry.entryId,