Files
jadb/lib/_data_ingestion/jmdict/seed_data.dart

248 lines
7.1 KiB
Dart

import 'dart:collection';
import 'package:collection/collection.dart';
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:jadb/table_names/jmdict.dart';
import 'package:sqflite_common/sqlite_api.dart';
/// The outcome of resolving a cross-reference with [resolveXref].
class ResolvedXref {
  ResolvedXref(this.entry, this.ambiguous);

  /// The entry that was selected as the target of the cross-reference.
  Entry entry;

  /// Whether more than one candidate entry was still left when [entry] was
  /// selected.
  bool ambiguous;
}
/// Resolves the cross-reference [xref] to the entry it most likely refers to.
///
/// Candidates are looked up in [entriesByKanji] and/or [entriesByReading],
/// depending on which parts of [xref] are set. When both a kanji and a
/// reading reference are given, only entries found under both keys are
/// considered. The candidates are then narrowed down as follows:
///
/// 1. If [xref] names a sense number, entries with fewer senses than that
///    number are dropped.
/// 2. If [xref] only has a reading reference and several candidates remain,
///    entries without any kanji are preferred when there are any.
/// 3. Of the remaining candidates, the one with the most senses is chosen.
///
/// The returned [ResolvedXref] is flagged as ambiguous when more than one
/// candidate was left at step 3; a warning is printed in that case.
///
/// Throws an [Exception] if [xref] has neither a kanji nor a reading
/// reference, or if no candidate entry is left (including when the
/// referenced kanji or reading is not present in the lookup maps).
ResolvedXref resolveXref(
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
  XRefParts xref,
) {
  // A reference that is missing from a lookup map simply yields no
  // candidates, so that it is reported through the descriptive exception
  // further down instead of failing on a null assertion.
  List<Entry> candidateEntries = switch ((xref.kanjiRef, xref.readingRef)) {
    (null, null) => throw Exception(
      'Xref $xref has no kanji or reading reference',
    ),
    (final String k, null) => [...?entriesByKanji[k]],
    (null, final String r) => [...?entriesByReading[r]],
    (final String k, final String r) => [
      ...?entriesByKanji[k]?.intersection(
        entriesByReading[r] ?? const <Entry>{},
      ),
    ],
  };

  // Filter out entries that don't have the number of senses specified in
  // the xref.
  final senseOrderNum = xref.senseOrderNum;
  if (senseOrderNum != null) {
    candidateEntries.retainWhere(
      (entry) => entry.senses.length >= senseOrderNum,
    );
  }

  // If the xref has a reading ref but no kanji ref, and there are multiple
  // entries to choose from, prefer entries with empty kanji readings
  // if possible.
  if (xref.kanjiRef == null &&
      xref.readingRef != null &&
      candidateEntries.length > 1) {
    final candidatesWithEmptyKanji = candidateEntries
        .where((entry) => entry.kanji.isEmpty)
        .toList();
    if (candidatesWithEmptyKanji.isNotEmpty) {
      candidateEntries = candidatesWithEmptyKanji;
    }
  }

  if (candidateEntries.isEmpty) {
    throw Exception(
      'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  // Having more senses is a cheap way to choose the most likely correct
  // entry in case there are multiple candidates left. Sort in descending
  // order of sense count so that `first` is the entry with the most senses.
  candidateEntries.sortBy<num>((entry) => -entry.senses.length);

  final ambiguous = candidateEntries.length > 1;
  if (ambiguous) {
    print(
      'WARNING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }
  return ResolvedXref(candidateEntries.first, ambiguous);
}
/// Seeds the JMdict tables of [db] from the parsed [entries].
///
/// The data is written in three batches, each committed before the next one
/// is started:
///
/// 1. entries together with their kanji and reading elements,
/// 2. senses and their per-sense attribute rows,
/// 3. cross-references (see-also and antonyms), each resolved to a concrete
///    entry through [resolveXref].
///
/// Kanji and reading elements share a single running `elementId` counter
/// that starts at 1.
///
/// Anything thrown by [resolveXref] for an unresolvable cross-reference
/// propagates to the caller; in that case the third batch is not committed.
Future<void> seedJMDictData(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1 - Kanji and readings');
  var b = db.batch();

  var elementId = 0;
  for (final e in entries) {
    b.insert(JMdictTableNames.entry, e.sqlValue);

    for (final k in e.kanji) {
      elementId++;
      // Build a fresh row map rather than mutating the map handed out by
      // `sqlValue`.
      b.insert(JMdictTableNames.kanjiElement, {
        ...k.sqlValue,
        'entryId': e.entryId,
        'elementId': elementId,
      });
      for (final i in k.info) {
        b.insert(JMdictTableNames.kanjiInfo, {
          'elementId': elementId,
          'info': i,
        });
      }
    }

    for (final r in e.readings) {
      elementId++;
      b.insert(JMdictTableNames.readingElement, {
        ...r.sqlValue,
        'entryId': e.entryId,
        'elementId': elementId,
      });
      for (final i in r.info) {
        b.insert(JMdictTableNames.readingInfo, {
          'elementId': elementId,
          'info': i,
        });
      }
      for (final res in r.restrictions) {
        b.insert(JMdictTableNames.readingRestriction, {
          'elementId': elementId,
          'restriction': res,
        });
      }
    }
  }
  await b.commit(noResult: true);

  print(' [JMdict] Batch 2 - Senses');
  b = db.batch();
  for (final e in entries) {
    for (final s in e.senses) {
      b.insert(JMdictTableNames.sense, {...s.sqlValue, 'entryId': e.entryId});
      for (final d in s.dialects) {
        b.insert(JMdictTableNames.senseDialect, {
          'senseId': s.senseId,
          'dialect': d,
        });
      }
      for (final f in s.fields) {
        b.insert(JMdictTableNames.senseField, {
          'senseId': s.senseId,
          'field': f,
        });
      }
      for (final i in s.info) {
        b.insert(JMdictTableNames.senseInfo, {'senseId': s.senseId, 'info': i});
      }
      for (final m in s.misc) {
        b.insert(JMdictTableNames.senseMisc, {'senseId': s.senseId, 'misc': m});
      }
      for (final p in s.pos) {
        b.insert(JMdictTableNames.sensePOS, {'senseId': s.senseId, 'pos': p});
      }
      for (final rk in s.restrictedToKanji) {
        b.insert(JMdictTableNames.senseRestrictedToKanji, {
          'entryId': e.entryId,
          'senseId': s.senseId,
          'kanji': rk,
        });
      }
      for (final rr in s.restrictedToReading) {
        b.insert(JMdictTableNames.senseRestrictedToReading, {
          'entryId': e.entryId,
          'senseId': s.senseId,
          'reading': rr,
        });
      }
      for (final ls in s.languageSource) {
        b.insert(JMdictTableNames.senseLanguageSource, {
          ...ls.sqlValue,
          'senseId': s.senseId,
        });
      }
      for (final g in s.glossary) {
        b.insert(JMdictTableNames.senseGlossary, {
          ...g.sqlValue,
          'senseId': s.senseId,
        });
      }
    }
  }
  await b.commit(noResult: true);

  print(' [JMdict] Building xref trees');
  // Lookup indexes used to resolve cross-references: every kanji/reading
  // string maps to the set of entries that contain it.
  final entriesByKanji = SplayTreeMap<String, Set<Entry>>();
  final entriesByReading = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final kanji in entry.kanji) {
      entriesByKanji.putIfAbsent(kanji.reading, () => <Entry>{}).add(entry);
    }
    for (final reading in entry.readings) {
      entriesByReading
          .putIfAbsent(reading.reading, () => <Entry>{})
          .add(entry);
    }
  }

  print(' [JMdict] Batch 3 - Xrefs');
  b = db.batch();
  for (final e in entries) {
    for (final s in e.senses) {
      for (final xref in s.seeAlso) {
        final resolved = resolveXref(entriesByKanji, entriesByReading, xref);
        b.insert(JMdictTableNames.senseSeeAlso, {
          'senseId': s.senseId,
          'xrefEntryId': resolved.entry.entryId,
          'seeAlsoKanji': xref.kanjiRef,
          'seeAlsoReading': xref.readingRef,
          'seeAlsoSense': xref.senseOrderNum,
          // sqflite documents bool as an unsupported value type, so the
          // flag is stored as an integer 1/0.
          'ambiguous': resolved.ambiguous ? 1 : 0,
        });
      }
      for (final ant in s.antonyms) {
        final resolved = resolveXref(entriesByKanji, entriesByReading, ant);
        b.insert(JMdictTableNames.senseAntonyms, {
          'senseId': s.senseId,
          'xrefEntryId': resolved.entry.entryId,
          'antonymKanji': ant.kanjiRef,
          'antonymReading': ant.readingRef,
          'antonymSense': ant.senseOrderNum,
          // See the note on the see-also rows: stored as an integer 1/0.
          'ambiguous': resolved.ambiguous ? 1 : 0,
        });
      }
    }
  }
  await b.commit(noResult: true);
}