_data_ingestion: add order numbers for readings and kanji

This commit is contained in:
Oystein Kristoffer Tveit 2025-04-26 14:44:53 +02:00
parent d79f89e2e2
commit 191594f0bc
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146
3 changed files with 52 additions and 28 deletions
lib/_data_ingestion/jmdict
migrations

@ -51,10 +51,12 @@ abstract class Element extends SQLWritable {
}
class KanjiElement extends Element {
int orderNum;
List<String> info;
KanjiElement({
this.info = const [],
required this.orderNum,
required String reading,
int? news,
int? ichi,
@ -69,13 +71,23 @@ class KanjiElement extends Element {
gai: gai,
nf: nf,
);
@override
Map<String, Object?> get sqlValue => {
...super.sqlValue,
'orderNum': orderNum,
};
}
class ReadingElement extends Element {
int orderNum;
bool readingDoesNotMatchKanji;
List<String> info;
List<String> restrictions;
ReadingElement({
required this.orderNum,
required this.readingDoesNotMatchKanji,
this.info = const [],
this.restrictions = const [],
required String reading,
@ -92,6 +104,13 @@ class ReadingElement extends Element {
gai: gai,
nf: nf,
);
@override
Map<String, Object?> get sqlValue => {
...super.sqlValue,
'orderNum': orderNum,
'readingDoesNotMatchKanji': readingDoesNotMatchKanji,
};
}
class LanguageSource extends SQLWritable {
@ -140,18 +159,18 @@ final kanaRegex =
class XRefParts {
final String? kanjiRef;
final String? readingRef;
final int? senseNum;
final int? senseOrderNum;
const XRefParts({
this.kanjiRef,
this.readingRef,
this.senseNum,
this.senseOrderNum,
}) : assert(kanjiRef != null || readingRef != null);
Map<String, Object?> toJson() => {
'kanjiRef': kanjiRef,
'readingRef': readingRef,
'senseNum': senseNum,
'senseOrderNum': senseOrderNum,
};
}
@ -167,7 +186,7 @@ class XRef {
class Sense extends SQLWritable {
final int id;
final int senseNum;
final int orderNum;
final List<XRefParts> antonyms;
final List<String> dialects;
final List<String> fields;
@ -182,7 +201,7 @@ class Sense extends SQLWritable {
const Sense({
required this.id,
required this.senseNum,
required this.orderNum,
this.antonyms = const [],
this.dialects = const [],
this.fields = const [],

@ -44,12 +44,12 @@ XRefParts parseXrefParts(String s) {
if (parts[0].contains(kanaRegex)) {
result = XRefParts(
readingRef: parts[0],
senseNum: int.parse(parts[1]),
senseOrderNum: int.parse(parts[1]),
);
} else {
result = XRefParts(
kanjiRef: parts[0],
senseNum: int.parse(parts[1]),
senseOrderNum: int.parse(parts[1]),
);
}
} else {
@ -64,7 +64,7 @@ XRefParts parseXrefParts(String s) {
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
senseNum: int.parse(parts[2]),
senseOrderNum: int.parse(parts[2]),
);
break;
@ -98,9 +98,9 @@ ResolvedXref resolveXref(
};
// Filter out entries that have fewer senses than the sense number specified in the xref
if (xref.senseNum != null) {
if (xref.senseOrderNum != null) {
candidateEntries
.retainWhere((entry) => entry.senses.length >= xref.senseNum!);
.retainWhere((entry) => entry.senses.length >= xref.senseOrderNum!);
}
// If the xref has a reading ref but no kanji ref, and there are multiple
@ -125,13 +125,13 @@ ResolvedXref resolveXref(
throw Exception(
'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
'senseNum: ${xref.senseNum}',
'senseOrderNum: ${xref.senseOrderNum}',
);
} else if (candidateEntries.length > 1) {
print(
'WARNING: Xref $xref has ${candidateEntries.length} entries, '
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
'senseNum: ${xref.senseNum}',
'senseOrderNum: ${xref.senseOrderNum}',
);
return ResolvedXref(candidateEntries.first, true);
} else {
@ -151,13 +151,14 @@ List<Entry> parseXML(XmlElement root) {
final List<ReadingElement> readingEls = [];
final List<Sense> senses = [];
for (final k_ele in entry.findElements('k_ele')) {
for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) {
final ke_pri = getPriorityValues(k_ele, 'ke');
kanjiEls.add(
KanjiElement(
orderNum: kanjiNum + 1,
info: k_ele
.findElements('ke_inf')
.map((e) => e.innerText.replaceAll(RegExp('[&;]'), ''))
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
reading: k_ele.findElements('keb').first.innerText,
news: ke_pri[0],
@ -169,10 +170,14 @@ List<Entry> parseXML(XmlElement root) {
);
}
for (final r_ele in entry.findElements('r_ele')) {
for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) {
final re_pri = getPriorityValues(r_ele, 're');
final readingDoesNotMatchKanji =
r_ele.findElements('re_nokanji').isNotEmpty;
readingEls.add(
ReadingElement(
orderNum: orderNum + 1,
readingDoesNotMatchKanji: readingDoesNotMatchKanji,
info: r_ele
.findElements('re_inf')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
@ -189,14 +194,12 @@ List<Entry> parseXML(XmlElement root) {
);
}
int senseNum = 0;
for (final sense in entry.findElements('sense')) {
for (final (orderNum, sense) in entry.findElements('sense').indexed) {
senseId++;
senseNum++;
senses.add(
Sense(
id: senseId,
senseNum: senseNum,
orderNum: orderNum + 1,
restrictedToKanji:
sense.findElements('stagk').map((e) => e.innerText).toList(),
restrictedToReading:
@ -347,7 +350,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
b.insert(
TableNames.sense,
s.sqlValue
..addAll({'id': s.id, 'entryId': e.id, 'senseNum': s.senseNum}));
..addAll({'id': s.id, 'entryId': e.id, 'orderNum': s.orderNum}));
for (final d in s.dialects) {
b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
@ -444,7 +447,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
'xrefEntryId': resolvedEntry.entry.id,
'seeAlsoKanji': xref.kanjiRef,
'seeAlsoReading': xref.readingRef,
'seeAlsoSense': xref.senseNum,
'seeAlsoSense': xref.senseOrderNum,
'ambiguous': resolvedEntry.ambiguous,
},
);
@ -462,7 +465,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
'xrefEntryId': resolvedEntry.entry.id,
'antonymKanji': ant.kanjiRef,
'antonymReading': ant.readingRef,
'antonymSense': ant.senseNum,
'antonymSense': ant.senseOrderNum,
'ambiguous': resolvedEntry.ambiguous,
});
}

@ -40,6 +40,7 @@ CREATE TABLE "JMdict_Entry" (
CREATE TABLE "JMdict_KanjiElement" (
"entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"),
"orderNum" INTEGER,
"reading" TEXT NOT NULL,
"news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
"ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
@ -62,13 +63,14 @@ CREATE TABLE "JMdict_KanjiElementInfo" (
CREATE TABLE "JMdict_ReadingElement" (
"entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"),
"orderNum" INTEGER,
"reading" TEXT NOT NULL,
"readingDoesNotMatchKanji" BOOLEAN NOT NULL DEFAULT FALSE,
"news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
"ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
"spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
"gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
"nf" INTEGER,
"nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
PRIMARY KEY ("entryId", "reading")
) WITHOUT ROWID;
@ -95,11 +97,11 @@ CREATE TABLE "JMdict_ReadingElementInfo" (
CREATE TABLE "JMdict_Sense" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"entryId" INTEGER REFERENCES "JMdict_Entry"("id"),
"senseNum" INTEGER,
UNIQUE("entryId", "senseNum")
"orderNum" INTEGER,
UNIQUE("entryId", "orderNum")
);
CREATE INDEX "JMdict_Sense_byEntryId_bySenseNum" ON "JMdict_Sense"("entryId", "senseNum");
CREATE INDEX "JMdict_Sense_byEntryId_byOrderNum" ON "JMdict_Sense"("entryId", "orderNum");
CREATE TABLE "JMdict_SenseRestrictedToKanji" (
"entryId" INTEGER,
@ -141,7 +143,7 @@ CREATE TABLE "JMdict_SenseSeeAlso" (
-- TODO: Check that if seeAlsoSense is present, it refers to a sense connected to xrefEntryId.
FOREIGN KEY ("xrefEntryId", "seeAlsoKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
FOREIGN KEY ("xrefEntryId", "seeAlsoReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"),
FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
PRIMARY KEY ("senseId", "xrefEntryId", "seeAlsoReading", "seeAlsoKanji", "seeAlsoSense")
);
@ -157,7 +159,7 @@ CREATE TABLE "JMdict_SenseAntonym" (
CHECK ("antonymReading" = NULL <> "antonymKanji" = NULL),
FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"),
FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
PRIMARY KEY ("senseId", "xrefEntryId", "antonymReading", "antonymKanji", "antonymSense")
);