diff --git a/lib/_data_ingestion/jmdict/objects.dart b/lib/_data_ingestion/jmdict/objects.dart index d76aad7..97526c0 100644 --- a/lib/_data_ingestion/jmdict/objects.dart +++ b/lib/_data_ingestion/jmdict/objects.dart @@ -51,10 +51,12 @@ abstract class Element extends SQLWritable { } class KanjiElement extends Element { + int orderNum; List info; KanjiElement({ this.info = const [], + required this.orderNum, required String reading, int? news, int? ichi, @@ -69,13 +71,23 @@ class KanjiElement extends Element { gai: gai, nf: nf, ); + + @override + Map get sqlValue => { + ...super.sqlValue, + 'orderNum': orderNum, + }; } class ReadingElement extends Element { + int orderNum; + bool readingDoesNotMatchKanji; List info; List restrictions; ReadingElement({ + required this.orderNum, + required this.readingDoesNotMatchKanji, this.info = const [], this.restrictions = const [], required String reading, @@ -92,6 +104,13 @@ class ReadingElement extends Element { gai: gai, nf: nf, ); + + @override + Map get sqlValue => { + ...super.sqlValue, + 'orderNum': orderNum, + 'readingDoesNotMatchKanji': readingDoesNotMatchKanji, + }; } class LanguageSource extends SQLWritable { @@ -140,18 +159,18 @@ final kanaRegex = class XRefParts { final String? kanjiRef; final String? readingRef; - final int? senseNum; + final int? senseOrderNum; const XRefParts({ this.kanjiRef, this.readingRef, - this.senseNum, + this.senseOrderNum, }) : assert(kanjiRef != null || readingRef != null); Map toJson() => { 'kanjiRef': kanjiRef, 'readingRef': readingRef, - 'senseNum': senseNum, + 'senseOrderNum': senseOrderNum, }; } @@ -167,7 +186,7 @@ class XRef { class Sense extends SQLWritable { final int id; - final int senseNum; + final int orderNum; final List antonyms; final List dialects; final List fields; @@ -182,7 +201,7 @@ class Sense extends SQLWritable { const Sense({ required this.id, - required this.senseNum, + required this.orderNum, this.antonyms = const [], this.dialects = const [], this.fields = const [], diff --git a/lib/_data_ingestion/jmdict/parser.dart b/lib/_data_ingestion/jmdict/parser.dart index 633dde1..937db9b 100644 --- a/lib/_data_ingestion/jmdict/parser.dart +++ b/lib/_data_ingestion/jmdict/parser.dart @@ -44,12 +44,12 @@ XRefParts parseXrefParts(String s) { if (parts[0].contains(kanaRegex)) { result = XRefParts( readingRef: parts[0], - senseNum: int.parse(parts[1]), + senseOrderNum: int.parse(parts[1]), ); } else { result = XRefParts( kanjiRef: parts[0], - senseNum: int.parse(parts[1]), + senseOrderNum: int.parse(parts[1]), ); } } else { @@ -64,7 +64,7 @@ XRefParts parseXrefParts(String s) { result = XRefParts( kanjiRef: parts[0], readingRef: parts[1], - senseNum: int.parse(parts[2]), + senseOrderNum: int.parse(parts[2]), ); break; @@ -98,9 +98,9 @@ ResolvedXref resolveXref( }; // Filter out entries that don't have the number of senses specified in the xref - if (xref.senseNum != null) { + if (xref.senseOrderNum != null) { candidateEntries - .retainWhere((entry) => entry.senses.length >= xref.senseNum!); + .retainWhere((entry) => entry.senses.length >= xref.senseOrderNum!); } // If the xref has a reading ref but no kanji ref, and there are multiple @@ -125,13 +125,13 @@ ResolvedXref resolveXref( throw Exception( 'SKIPPING: Xref $xref has ${candidateEntries.length} entries, ' 'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, ' - 'senseNum: ${xref.senseNum}', + 'senseOrderNum: ${xref.senseOrderNum}', ); } else if (candidateEntries.length > 1) { print( 'WARNING: Xref $xref has ${candidateEntries.length} entries, ' 'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, ' - 'senseNum: ${xref.senseNum}', + 'senseOrderNum: ${xref.senseOrderNum}', ); return ResolvedXref(candidateEntries.first, true); } else { @@ -151,13 +151,14 @@ List parseXML(XmlElement root) { final List readingEls = []; final List senses = []; - for (final k_ele in entry.findElements('k_ele')) { + for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) { final ke_pri = getPriorityValues(k_ele, 'ke'); kanjiEls.add( KanjiElement( + orderNum: kanjiNum + 1, info: k_ele .findElements('ke_inf') - .map((e) => e.innerText.replaceAll(RegExp('[&;]'), '')) + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) .toList(), reading: k_ele.findElements('keb').first.innerText, news: ke_pri[0], @@ -169,10 +170,14 @@ List parseXML(XmlElement root) { ); } - for (final r_ele in entry.findElements('r_ele')) { + for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) { final re_pri = getPriorityValues(r_ele, 're'); + final readingDoesNotMatchKanji = + r_ele.findElements('re_nokanji').isNotEmpty; readingEls.add( ReadingElement( + orderNum: orderNum + 1, + readingDoesNotMatchKanji: readingDoesNotMatchKanji, info: r_ele .findElements('re_inf') .map((e) => e.innerText.substring(1, e.innerText.length - 1)) @@ -189,14 +194,12 @@ List parseXML(XmlElement root) { ); } - int senseNum = 0; - for (final sense in entry.findElements('sense')) { + for (final (orderNum, sense) in entry.findElements('sense').indexed) { senseId++; - senseNum++; senses.add( Sense( id: senseId, - senseNum: senseNum, + orderNum: orderNum + 1, restrictedToKanji: sense.findElements('stagk').map((e) => e.innerText).toList(), restrictedToReading: @@ -347,7 +350,7 @@ Future insertIntoDB(List entries, Database db) async { b.insert( TableNames.sense, s.sqlValue - ..addAll({'id': s.id, 'entryId': e.id, 'senseNum': s.senseNum})); + ..addAll({'id': s.id, 'entryId': e.id, 'orderNum': s.orderNum})); for (final d in s.dialects) { b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d}); @@ -444,7 +447,7 @@ Future insertIntoDB(List entries, Database db) async { 'xrefEntryId': resolvedEntry.entry.id, 'seeAlsoKanji': xref.kanjiRef, 'seeAlsoReading': xref.readingRef, - 'seeAlsoSense': xref.senseNum, + 'seeAlsoSense': xref.senseOrderNum, 'ambiguous': resolvedEntry.ambiguous, }, ); @@ -462,7 +465,7 @@ Future insertIntoDB(List entries, Database db) async { 'xrefEntryId': resolvedEntry.entry.id, 'antonymKanji': ant.kanjiRef, 'antonymReading': ant.readingRef, - 'antonymSense': ant.senseNum, + 'antonymSense': ant.senseOrderNum, 'ambiguous': resolvedEntry.ambiguous, }); } diff --git a/migrations/0001_JMDict.sql b/migrations/0001_JMDict.sql index 1845d82..8e904b3 100644 --- a/migrations/0001_JMDict.sql +++ b/migrations/0001_JMDict.sql @@ -40,6 +40,7 @@ CREATE TABLE "JMdict_Entry" ( CREATE TABLE "JMdict_KanjiElement" ( "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"), + "orderNum" INTEGER, "reading" TEXT NOT NULL, "news" INTEGER CHECK ("news" BETWEEN 1 AND 2), "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2), @@ -62,13 +63,14 @@ CREATE TABLE "JMdict_KanjiElementInfo" ( CREATE TABLE "JMdict_ReadingElement" ( "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"), + "orderNum" INTEGER, "reading" TEXT NOT NULL, "readingDoesNotMatchKanji" BOOLEAN NOT NULL DEFAULT FALSE, "news" INTEGER CHECK ("news" BETWEEN 1 AND 2), "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2), "spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2), "gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2), - "nf" INTEGER, + "nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48), PRIMARY KEY ("entryId", "reading") ) WITHOUT ROWID; @@ -95,11 +97,11 @@ CREATE TABLE "JMdict_ReadingElementInfo" ( CREATE TABLE "JMdict_Sense" ( "id" INTEGER PRIMARY KEY AUTOINCREMENT, "entryId" INTEGER REFERENCES "JMdict_Entry"("id"), - "senseNum" INTEGER, - UNIQUE("entryId", "senseNum") + "orderNum" INTEGER, + UNIQUE("entryId", "orderNum") ); -CREATE INDEX "JMdict_Sense_byEntryId_bySenseNum" ON "JMdict_Sense"("entryId", "senseNum"); +CREATE INDEX "JMdict_Sense_byEntryId_byOrderNum" ON "JMdict_Sense"("entryId", "orderNum"); CREATE TABLE "JMdict_SenseRestrictedToKanji" ( "entryId" INTEGER, @@ -141,7 +143,7 @@ CREATE TABLE "JMdict_SenseSeeAlso" ( -- TODO: Check that if seeAlsoSense is present, it refers to a sense connected to xrefEntryId. FOREIGN KEY ("xrefEntryId", "seeAlsoKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"), FOREIGN KEY ("xrefEntryId", "seeAlsoReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"), - FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"), + FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"), PRIMARY KEY ("senseId", "xrefEntryId", "seeAlsoReading", "seeAlsoKanji", "seeAlsoSense") ); @@ -157,7 +159,7 @@ CREATE TABLE "JMdict_SenseAntonym" ( CHECK ("antonymReading" = NULL <> "antonymKanji" = NULL), FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"), FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"), - FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"), + FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"), PRIMARY KEY ("senseId", "xrefEntryId", "antonymReading", "antonymKanji", "antonymSense") );