248 lines
7.1 KiB
Dart
248 lines
7.1 KiB
Dart
import 'dart:collection';
|
|
|
|
import 'package:collection/collection.dart';
|
|
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
|
import 'package:jadb/table_names/jmdict.dart';
|
|
import 'package:sqflite_common/sqlite_api.dart';
|
|
|
|
/// The outcome of resolving a cross reference to a dictionary entry.
class ResolvedXref {
  /// Creates a result pointing at [entry], flagged with [ambiguous].
  ResolvedXref(this.entry, this.ambiguous);

  /// The entry the cross reference was resolved to.
  Entry entry;

  /// Whether more than one entry was still a candidate when [entry] was
  /// picked.
  bool ambiguous;
}
|
|
|
|
/// Resolves [xref] to the single most plausible [Entry] it refers to.
///
/// Candidates are looked up in [entriesByKanji] and/or [entriesByReading],
/// depending on which references [xref] carries; when both are present only
/// entries found under both keys are considered. The candidates are then
/// narrowed down:
///
/// 1. If the xref specifies a sense number, entries with fewer senses than
///    that number are dropped.
/// 2. If only a reading reference is given and several candidates remain,
///    entries without any kanji are preferred when at least one exists.
/// 3. Of whatever is left, the entry with the most senses is chosen.
///
/// The returned [ResolvedXref.ambiguous] is `true` when more than one
/// candidate remained after step 2; a warning is printed in that case.
///
/// Throws an [Exception] if [xref] has neither a kanji nor a reading
/// reference, or if no candidate entry remains.
ResolvedXref resolveXref(
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
  XRefParts xref,
) {
  // A key that is missing from an index yields no candidates instead of
  // tripping a null assertion, so the descriptive exception below is thrown.
  // The sets are copied into a fresh list; the indexes are never mutated.
  List<Entry> candidateEntries = switch ((xref.kanjiRef, xref.readingRef)) {
    (null, null) => throw Exception(
      'Xref $xref has no kanji or reading reference',
    ),
    (final String k, null) => [...?entriesByKanji[k]],
    (null, final String r) => [...?entriesByReading[r]],
    (final String k, final String r) => [
      ...(entriesByKanji[k] ?? const <Entry>{}).intersection(
        entriesByReading[r] ?? const <Entry>{},
      ),
    ],
  };

  // Drop entries that have fewer senses than the sense number the xref
  // points at.
  final senseOrderNum = xref.senseOrderNum;
  if (senseOrderNum != null) {
    candidateEntries.retainWhere(
      (entry) => entry.senses.length >= senseOrderNum,
    );
  }

  // If the xref has a reading ref but no kanji ref, and there are multiple
  // entries to choose from, prefer entries without kanji if possible.
  if (xref.kanjiRef == null &&
      xref.readingRef != null &&
      candidateEntries.length > 1) {
    final candidatesWithEmptyKanji = candidateEntries
        .where((entry) => entry.kanji.isEmpty)
        .toList();

    if (candidatesWithEmptyKanji.isNotEmpty) {
      candidateEntries = candidatesWithEmptyKanji;
    }
  }

  if (candidateEntries.isEmpty) {
    throw Exception(
      'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  // Having more senses is a cheap way to choose the most likely correct
  // entry in case there are multiple candidates left, so sort in descending
  // order of sense count and take the first one.
  candidateEntries.sort(
    (a, b) => b.senses.length.compareTo(a.senses.length),
  );

  final ambiguous = candidateEntries.length > 1;
  if (ambiguous) {
    print(
      'WARNING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  return ResolvedXref(candidateEntries.first, ambiguous);
}
|
|
|
|
/// Writes [entries] into the JMdict tables of [db] in three batches.
///
/// 1. Entries with their kanji and reading elements, including element info
///    and reading restrictions.
/// 2. Senses together with all their per-sense attribute tables.
/// 3. Cross references (see-also and antonyms), resolved with [resolveXref].
///
/// Each batch is committed before the next one is built. If a cross
/// reference cannot be resolved, the exception from [resolveXref]
/// propagates; the first two batches are already committed at that point
/// while the third one is not.
Future<void> seedJMDictData(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1 - Kanji and readings');
  Batch b = db.batch();

  // Running counter shared by kanji and reading elements. It is written as
  // the `elementId` of each element row and of its info/restriction rows.
  int elementId = 0;
  for (final e in entries) {
    b.insert(JMdictTableNames.entry, e.sqlValue);

    for (final k in e.kanji) {
      elementId++;
      // Build a fresh map rather than mutating whatever `sqlValue` returns.
      b.insert(JMdictTableNames.kanjiElement, {
        ...k.sqlValue,
        'entryId': e.entryId,
        'elementId': elementId,
      });

      for (final i in k.info) {
        b.insert(JMdictTableNames.kanjiInfo, {
          'elementId': elementId,
          'info': i,
        });
      }
    }

    for (final r in e.readings) {
      elementId++;
      b.insert(JMdictTableNames.readingElement, {
        ...r.sqlValue,
        'entryId': e.entryId,
        'elementId': elementId,
      });

      for (final i in r.info) {
        b.insert(JMdictTableNames.readingInfo, {
          'elementId': elementId,
          'info': i,
        });
      }
      for (final res in r.restrictions) {
        b.insert(JMdictTableNames.readingRestriction, {
          'elementId': elementId,
          'restriction': res,
        });
      }
    }
  }

  await b.commit(noResult: true);

  print(' [JMdict] Batch 2 - Senses');
  b = db.batch();

  for (final e in entries) {
    for (final s in e.senses) {
      b.insert(JMdictTableNames.sense, {...s.sqlValue, 'entryId': e.entryId});

      for (final d in s.dialects) {
        b.insert(JMdictTableNames.senseDialect, {
          'senseId': s.senseId,
          'dialect': d,
        });
      }
      for (final f in s.fields) {
        b.insert(JMdictTableNames.senseField, {
          'senseId': s.senseId,
          'field': f,
        });
      }
      for (final i in s.info) {
        b.insert(JMdictTableNames.senseInfo, {'senseId': s.senseId, 'info': i});
      }
      for (final m in s.misc) {
        b.insert(JMdictTableNames.senseMisc, {'senseId': s.senseId, 'misc': m});
      }
      for (final p in s.pos) {
        b.insert(JMdictTableNames.sensePOS, {'senseId': s.senseId, 'pos': p});
      }
      for (final rk in s.restrictedToKanji) {
        b.insert(JMdictTableNames.senseRestrictedToKanji, {
          'entryId': e.entryId,
          'senseId': s.senseId,
          'kanji': rk,
        });
      }
      for (final rr in s.restrictedToReading) {
        b.insert(JMdictTableNames.senseRestrictedToReading, {
          'entryId': e.entryId,
          'senseId': s.senseId,
          'reading': rr,
        });
      }
      for (final ls in s.languageSource) {
        b.insert(JMdictTableNames.senseLanguageSource, {
          ...ls.sqlValue,
          'senseId': s.senseId,
        });
      }
      for (final g in s.glossary) {
        b.insert(JMdictTableNames.senseGlossary, {
          ...g.sqlValue,
          'senseId': s.senseId,
        });
      }
    }
  }

  await b.commit(noResult: true);

  print(' [JMdict] Building xref trees');
  final entriesByKanji = _groupEntriesByKey(
    entries,
    (entry) => entry.kanji.map((kanji) => kanji.reading),
  );
  final entriesByReading = _groupEntriesByKey(
    entries,
    (entry) => entry.readings.map((reading) => reading.reading),
  );

  print(' [JMdict] Batch 3 - Xrefs');
  b = db.batch();

  for (final e in entries) {
    for (final s in e.senses) {
      for (final xref in s.seeAlso) {
        final resolvedEntry = resolveXref(
          entriesByKanji,
          entriesByReading,
          xref,
        );

        b.insert(JMdictTableNames.senseSeeAlso, {
          'senseId': s.senseId,
          'xrefEntryId': resolvedEntry.entry.entryId,
          'seeAlsoKanji': xref.kanjiRef,
          'seeAlsoReading': xref.readingRef,
          'seeAlsoSense': xref.senseOrderNum,
          // sqflite does not list bool among its supported value types, so
          // the flag is stored as 0/1.
          'ambiguous': resolvedEntry.ambiguous ? 1 : 0,
        });
      }

      for (final ant in s.antonyms) {
        final resolvedEntry = resolveXref(
          entriesByKanji,
          entriesByReading,
          ant,
        );

        b.insert(JMdictTableNames.senseAntonyms, {
          'senseId': s.senseId,
          'xrefEntryId': resolvedEntry.entry.entryId,
          'antonymKanji': ant.kanjiRef,
          'antonymReading': ant.readingRef,
          'antonymSense': ant.senseOrderNum,
          // Stored as 0/1 for the same reason as in the see-also rows.
          'ambiguous': resolvedEntry.ambiguous ? 1 : 0,
        });
      }
    }
  }

  await b.commit(noResult: true);
}

/// Groups [entries] under every key that [keysOf] yields for them.
///
/// An entry that yields several keys appears in the set of each of them.
SplayTreeMap<String, Set<Entry>> _groupEntriesByKey(
  Iterable<Entry> entries,
  Iterable<String> Function(Entry entry) keysOf,
) {
  final index = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final key in keysOf(entry)) {
      index.putIfAbsent(key, () => <Entry>{}).add(entry);
    }
  }
  return index;
}
|