lib/_data_ingestion/jmdict: mark ambiguity in xrefs

This commit is contained in:
2025-04-23 13:26:20 +02:00
parent 9ac4283edd
commit 19cdfb8434
2 changed files with 28 additions and 12 deletions

View File

@@ -76,7 +76,14 @@ XRefParts parseXrefParts(String s) {
return result;
}
Entry resolveXref(
/// The outcome of resolving a cross-reference to a dictionary [Entry].
///
/// Pairs the entry that was picked with a flag recording whether that pick
/// was ambiguous, so callers can persist the uncertainty alongside the
/// reference instead of silently treating the pick as authoritative.
class ResolvedXref {
  /// The entry the cross-reference was resolved to.
  final Entry entry;

  /// Whether the resolution was ambiguous, i.e. [entry] was selected but is
  /// not guaranteed to be the entry the cross-reference actually meant.
  final bool ambiguous;

  /// Creates a resolution result for [entry], flagged as [ambiguous] or not.
  const ResolvedXref(this.entry, this.ambiguous);
}
ResolvedXref resolveXref(
SplayTreeMap<String, Set<Entry>> entriesByKanji,
SplayTreeMap<String, Set<Entry>> entriesByReading,
XRefParts xref,
@@ -126,16 +133,17 @@ Entry resolveXref(
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
'senseNum: ${xref.senseNum}',
);
return candidateEntries.first;
return ResolvedXref(candidateEntries.first, true);
} else {
return candidateEntries.first;
return ResolvedXref(candidateEntries.first, false);
}
}
List<Entry> transformXML(XmlElement root) {
List<Entry> parseXML(XmlElement root) {
final List<Entry> entries = [];
int senseId = 0;
for (final entry in root.childElements) {
final entryId = int.parse(entry.findElements('ent_seq').first.innerText);
@@ -423,7 +431,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
for (final e in entries) {
for (final s in e.senses) {
for (final xref in s.seeAlso) {
final entry = resolveXref(
final resolvedEntry = resolveXref(
entriesByKanji,
entriesByReading,
xref,
@@ -433,16 +441,17 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
TableNames.senseSeeAlso,
{
'senseId': s.id,
'xrefEntryId': entry.id,
'xrefEntryId': resolvedEntry.entry.id,
'seeAlsoKanji': xref.kanjiRef,
'seeAlsoReading': xref.readingRef,
'seeAlsoSense': xref.senseNum,
'ambiguous': resolvedEntry.ambiguous,
},
);
}
for (final ant in s.antonyms) {
final entry = resolveXref(
final resolvedEntry = resolveXref(
entriesByKanji,
entriesByReading,
ant,
@@ -450,10 +459,11 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
b.insert(TableNames.senseAntonyms, {
'senseId': s.id,
'xrefEntryId': entry.id,
'xrefEntryId': resolvedEntry.entry.id,
'antonymKanji': ant.kanjiRef,
'antonymReading': ant.readingRef,
'antonymSense': ant.senseNum,
'ambiguous': resolvedEntry.ambiguous,
});
}
}
@@ -463,14 +473,14 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
}
Future<void> addDataFromJMdict(Database db) async {
print('[JMdict] Reading file...');
print('[JMdict] Reading file content...');
String rawXML = File('data/JMdict.xml').readAsStringSync();
print('[JMdict] Parsing XML...');
print('[JMdict] Parsing XML tags...');
XmlElement root = XmlDocument.parse(rawXML).getElement('JMdict')!;
print('[JMdict] Transforming data...');
final entries = transformXML(root);
print('[JMdict] Parsing XML content...');
final entries = parseXML(root);
print('[JMdict] Writing to database...');
await insertIntoDB(entries, db);

View File

@@ -133,6 +133,9 @@ CREATE TABLE "JMdict_SenseSeeAlso" (
"seeAlsoReading" TEXT,
"seeAlsoKanji" TEXT,
"seeAlsoSense" INTEGER,
-- For some entries, the cross reference is ambiguous. This means that while the ingestion
-- has determined some xrefEntryId, it is not guaranteed to be the correct one.
"ambiguous" BOOLEAN,
CHECK ("seeAlsoReading" = NULL <> "seeAlsoKanji" = NULL),
-- CHECK("seeAlsoSense" = NULL OR "seeAlsoSense")
-- TODO: Check that if seeAlsoSense is present, it refers to a sense connected to xrefEntryId.
@@ -148,6 +151,9 @@ CREATE TABLE "JMdict_SenseAntonym" (
"antonymReading" TEXT,
"antonymKanji" TEXT,
"antonymSense" INTEGER,
-- For some entries, the cross reference is ambiguous. This means that while the ingestion
-- has determined some xrefEntryId, it is not guaranteed to be the correct one.
"ambiguous" BOOLEAN,
CHECK ("antonymReading" = NULL <> "antonymKanji" = NULL),
FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),