lib/_data_ingestion/jmdict: mark ambiguity in xrefs
This commit is contained in:
@@ -76,7 +76,14 @@ XRefParts parseXrefParts(String s) {
|
||||
return result;
|
||||
}
|
||||
|
||||
Entry resolveXref(
|
||||
/// The outcome of resolving a JMdict cross-reference to a concrete entry.
///
/// Bundles the chosen [entry] with a flag telling whether the choice was
/// ambiguous, so that callers can persist that uncertainty alongside the
/// reference instead of silently treating the match as certain.
class ResolvedXref {
  /// The entry the cross-reference was resolved to.
  final Entry entry;

  /// Whether the cross-reference was ambiguous.
  ///
  /// When `true`, [entry] is the entry that was picked, but it is not
  /// guaranteed to be the correct target of the cross-reference.
  final bool ambiguous;

  /// Creates a resolution result for [entry], marked as [ambiguous] or not.
  ResolvedXref(this.entry, this.ambiguous);
}
|
||||
|
||||
ResolvedXref resolveXref(
|
||||
SplayTreeMap<String, Set<Entry>> entriesByKanji,
|
||||
SplayTreeMap<String, Set<Entry>> entriesByReading,
|
||||
XRefParts xref,
|
||||
@@ -126,16 +133,17 @@ Entry resolveXref(
|
||||
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
|
||||
'senseNum: ${xref.senseNum}',
|
||||
);
|
||||
return candidateEntries.first;
|
||||
return ResolvedXref(candidateEntries.first, true);
|
||||
} else {
|
||||
return candidateEntries.first;
|
||||
return ResolvedXref(candidateEntries.first, false);
|
||||
}
|
||||
}
|
||||
|
||||
List<Entry> transformXML(XmlElement root) {
|
||||
List<Entry> parseXML(XmlElement root) {
|
||||
final List<Entry> entries = [];
|
||||
|
||||
int senseId = 0;
|
||||
|
||||
for (final entry in root.childElements) {
|
||||
final entryId = int.parse(entry.findElements('ent_seq').first.innerText);
|
||||
|
||||
@@ -423,7 +431,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
for (final e in entries) {
|
||||
for (final s in e.senses) {
|
||||
for (final xref in s.seeAlso) {
|
||||
final entry = resolveXref(
|
||||
final resolvedEntry = resolveXref(
|
||||
entriesByKanji,
|
||||
entriesByReading,
|
||||
xref,
|
||||
@@ -433,16 +441,17 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
TableNames.senseSeeAlso,
|
||||
{
|
||||
'senseId': s.id,
|
||||
'xrefEntryId': entry.id,
|
||||
'xrefEntryId': resolvedEntry.entry.id,
|
||||
'seeAlsoKanji': xref.kanjiRef,
|
||||
'seeAlsoReading': xref.readingRef,
|
||||
'seeAlsoSense': xref.senseNum,
|
||||
'ambiguous': resolvedEntry.ambiguous,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for (final ant in s.antonyms) {
|
||||
final entry = resolveXref(
|
||||
final resolvedEntry = resolveXref(
|
||||
entriesByKanji,
|
||||
entriesByReading,
|
||||
ant,
|
||||
@@ -450,10 +459,11 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
|
||||
b.insert(TableNames.senseAntonyms, {
|
||||
'senseId': s.id,
|
||||
'xrefEntryId': entry.id,
|
||||
'xrefEntryId': resolvedEntry.entry.id,
|
||||
'antonymKanji': ant.kanjiRef,
|
||||
'antonymReading': ant.readingRef,
|
||||
'antonymSense': ant.senseNum,
|
||||
'ambiguous': resolvedEntry.ambiguous,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -463,14 +473,14 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
}
|
||||
|
||||
Future<void> addDataFromJMdict(Database db) async {
|
||||
print('[JMdict] Reading file...');
|
||||
print('[JMdict] Reading file content...');
|
||||
String rawXML = File('data/JMdict.xml').readAsStringSync();
|
||||
|
||||
print('[JMdict] Parsing XML...');
|
||||
print('[JMdict] Parsing XML tags...');
|
||||
XmlElement root = XmlDocument.parse(rawXML).getElement('JMdict')!;
|
||||
|
||||
print('[JMdict] Transforming data...');
|
||||
final entries = transformXML(root);
|
||||
print('[JMdict] Parsing XML content...');
|
||||
final entries = parseXML(root);
|
||||
|
||||
print('[JMdict] Writing to database...');
|
||||
await insertIntoDB(entries, db);
|
||||
|
||||
@@ -133,6 +133,9 @@ CREATE TABLE "JMdict_SenseSeeAlso" (
|
||||
"seeAlsoReading" TEXT,
|
||||
"seeAlsoKanji" TEXT,
|
||||
"seeAlsoSense" INTEGER,
|
||||
-- For some entries, the cross-reference is ambiguous. In that case the ingestion still stores
|
||||
-- an xrefEntryId (the first matching candidate), but it is not guaranteed to be the correct one.
|
||||
"ambiguous" BOOLEAN,
|
||||
CHECK ("seeAlsoReading" = NULL <> "seeAlsoKanji" = NULL),
|
||||
-- CHECK("seeAlsoSense" = NULL OR "seeAlsoSense")
|
||||
-- TODO: Check that if seeAlsoSense is present, it refers to a sense connected to xrefEntryId.
|
||||
@@ -148,6 +151,9 @@ CREATE TABLE "JMdict_SenseAntonym" (
|
||||
"antonymReading" TEXT,
|
||||
"antonymKanji" TEXT,
|
||||
"antonymSense" INTEGER,
|
||||
-- For some entries, the cross-reference is ambiguous. In that case the ingestion still stores
|
||||
-- an xrefEntryId (the first matching candidate), but it is not guaranteed to be the correct one.
|
||||
"ambiguous" BOOLEAN,
|
||||
CHECK ("antonymReading" = NULL <> "antonymKanji" = NULL),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
|
||||
|
||||
Reference in New Issue
Block a user