_data_ingestion: add order numbers for readings and kanji
This commit is contained in:
parent
d79f89e2e2
commit
191594f0bc
@ -51,10 +51,12 @@ abstract class Element extends SQLWritable {
|
||||
}
|
||||
|
||||
class KanjiElement extends Element {
|
||||
int orderNum;
|
||||
List<String> info;
|
||||
|
||||
KanjiElement({
|
||||
this.info = const [],
|
||||
required this.orderNum,
|
||||
required String reading,
|
||||
int? news,
|
||||
int? ichi,
|
||||
@ -69,13 +71,23 @@ class KanjiElement extends Element {
|
||||
gai: gai,
|
||||
nf: nf,
|
||||
);
|
||||
|
||||
@override
|
||||
Map<String, Object?> get sqlValue => {
|
||||
...super.sqlValue,
|
||||
'orderNum': orderNum,
|
||||
};
|
||||
}
|
||||
|
||||
class ReadingElement extends Element {
|
||||
int orderNum;
|
||||
bool readingDoesNotMatchKanji;
|
||||
List<String> info;
|
||||
List<String> restrictions;
|
||||
|
||||
ReadingElement({
|
||||
required this.orderNum,
|
||||
required this.readingDoesNotMatchKanji,
|
||||
this.info = const [],
|
||||
this.restrictions = const [],
|
||||
required String reading,
|
||||
@ -92,6 +104,13 @@ class ReadingElement extends Element {
|
||||
gai: gai,
|
||||
nf: nf,
|
||||
);
|
||||
|
||||
@override
|
||||
Map<String, Object?> get sqlValue => {
|
||||
...super.sqlValue,
|
||||
'orderNum': orderNum,
|
||||
'readingDoesNotMatchKanji': readingDoesNotMatchKanji,
|
||||
};
|
||||
}
|
||||
|
||||
class LanguageSource extends SQLWritable {
|
||||
@ -140,18 +159,18 @@ final kanaRegex =
|
||||
/// The parsed components of a cross-reference: an optional kanji reference,
/// an optional reading reference and an optional sense number.
///
/// NOTE(review): this listing shows both `senseNum` and `senseOrderNum`
/// everywhere; they look like the before/after sides of a rename — confirm
/// which one the real class carries.
class XRefParts {
|
||||
final String? kanjiRef;
|
||||
final String? readingRef;
|
||||
final int? senseNum;
|
||||
final int? senseOrderNum;
|
||||
|
||||
// Every field is optional, but at least one of kanjiRef / readingRef must be
// non-null. This is only checked by an assert, so it is not enforced when
// assertions are disabled.
const XRefParts({
|
||||
this.kanjiRef,
|
||||
this.readingRef,
|
||||
this.senseNum,
|
||||
this.senseOrderNum,
|
||||
}) : assert(kanjiRef != null || readingRef != null);
|
||||
|
||||
// Returns the fields as a map keyed by field name; null fields are kept as
// null values rather than omitted.
Map<String, Object?> toJson() => {
|
||||
'kanjiRef': kanjiRef,
|
||||
'readingRef': readingRef,
|
||||
'senseNum': senseNum,
|
||||
'senseOrderNum': senseOrderNum,
|
||||
};
|
||||
}
|
||||
|
||||
@ -167,7 +186,7 @@ class XRef {
|
||||
|
||||
class Sense extends SQLWritable {
|
||||
final int id;
|
||||
final int senseNum;
|
||||
final int orderNum;
|
||||
final List<XRefParts> antonyms;
|
||||
final List<String> dialects;
|
||||
final List<String> fields;
|
||||
@ -182,7 +201,7 @@ class Sense extends SQLWritable {
|
||||
|
||||
const Sense({
|
||||
required this.id,
|
||||
required this.senseNum,
|
||||
required this.orderNum,
|
||||
this.antonyms = const [],
|
||||
this.dialects = const [],
|
||||
this.fields = const [],
|
||||
|
@ -44,12 +44,12 @@ XRefParts parseXrefParts(String s) {
|
||||
if (parts[0].contains(kanaRegex)) {
|
||||
result = XRefParts(
|
||||
readingRef: parts[0],
|
||||
senseNum: int.parse(parts[1]),
|
||||
senseOrderNum: int.parse(parts[1]),
|
||||
);
|
||||
} else {
|
||||
result = XRefParts(
|
||||
kanjiRef: parts[0],
|
||||
senseNum: int.parse(parts[1]),
|
||||
senseOrderNum: int.parse(parts[1]),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
@ -64,7 +64,7 @@ XRefParts parseXrefParts(String s) {
|
||||
result = XRefParts(
|
||||
kanjiRef: parts[0],
|
||||
readingRef: parts[1],
|
||||
senseNum: int.parse(parts[2]),
|
||||
senseOrderNum: int.parse(parts[2]),
|
||||
);
|
||||
break;
|
||||
|
||||
@ -98,9 +98,9 @@ ResolvedXref resolveXref(
|
||||
};
|
||||
|
||||
// Filter out entries that have fewer senses than the sense number given in the xref
|
||||
if (xref.senseNum != null) {
|
||||
if (xref.senseOrderNum != null) {
|
||||
candidateEntries
|
||||
.retainWhere((entry) => entry.senses.length >= xref.senseNum!);
|
||||
.retainWhere((entry) => entry.senses.length >= xref.senseOrderNum!);
|
||||
}
|
||||
|
||||
// If the xref has a reading ref but no kanji ref, and there are multiple
|
||||
@ -125,13 +125,13 @@ ResolvedXref resolveXref(
|
||||
throw Exception(
|
||||
'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
|
||||
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
|
||||
'senseNum: ${xref.senseNum}',
|
||||
'senseOrderNum: ${xref.senseOrderNum}',
|
||||
);
|
||||
} else if (candidateEntries.length > 1) {
|
||||
print(
|
||||
'WARNING: Xref $xref has ${candidateEntries.length} entries, '
|
||||
'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
|
||||
'senseNum: ${xref.senseNum}',
|
||||
'senseOrderNum: ${xref.senseOrderNum}',
|
||||
);
|
||||
return ResolvedXref(candidateEntries.first, true);
|
||||
} else {
|
||||
@ -151,13 +151,14 @@ List<Entry> parseXML(XmlElement root) {
|
||||
final List<ReadingElement> readingEls = [];
|
||||
final List<Sense> senses = [];
|
||||
|
||||
for (final k_ele in entry.findElements('k_ele')) {
|
||||
for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) {
|
||||
final ke_pri = getPriorityValues(k_ele, 'ke');
|
||||
kanjiEls.add(
|
||||
KanjiElement(
|
||||
orderNum: kanjiNum + 1,
|
||||
info: k_ele
|
||||
.findElements('ke_inf')
|
||||
.map((e) => e.innerText.replaceAll(RegExp('[&;]'), ''))
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
reading: k_ele.findElements('keb').first.innerText,
|
||||
news: ke_pri[0],
|
||||
@ -169,10 +170,14 @@ List<Entry> parseXML(XmlElement root) {
|
||||
);
|
||||
}
|
||||
|
||||
for (final r_ele in entry.findElements('r_ele')) {
|
||||
for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) {
|
||||
final re_pri = getPriorityValues(r_ele, 're');
|
||||
final readingDoesNotMatchKanji =
|
||||
r_ele.findElements('re_nokanji').isNotEmpty;
|
||||
readingEls.add(
|
||||
ReadingElement(
|
||||
orderNum: orderNum + 1,
|
||||
readingDoesNotMatchKanji: readingDoesNotMatchKanji,
|
||||
info: r_ele
|
||||
.findElements('re_inf')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
@ -189,14 +194,12 @@ List<Entry> parseXML(XmlElement root) {
|
||||
);
|
||||
}
|
||||
|
||||
int senseNum = 0;
|
||||
for (final sense in entry.findElements('sense')) {
|
||||
for (final (orderNum, sense) in entry.findElements('sense').indexed) {
|
||||
senseId++;
|
||||
senseNum++;
|
||||
senses.add(
|
||||
Sense(
|
||||
id: senseId,
|
||||
senseNum: senseNum,
|
||||
orderNum: orderNum + 1,
|
||||
restrictedToKanji:
|
||||
sense.findElements('stagk').map((e) => e.innerText).toList(),
|
||||
restrictedToReading:
|
||||
@ -347,7 +350,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
b.insert(
|
||||
TableNames.sense,
|
||||
s.sqlValue
|
||||
..addAll({'id': s.id, 'entryId': e.id, 'senseNum': s.senseNum}));
|
||||
..addAll({'id': s.id, 'entryId': e.id, 'orderNum': s.orderNum}));
|
||||
|
||||
for (final d in s.dialects) {
|
||||
b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
|
||||
@ -444,7 +447,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
'xrefEntryId': resolvedEntry.entry.id,
|
||||
'seeAlsoKanji': xref.kanjiRef,
|
||||
'seeAlsoReading': xref.readingRef,
|
||||
'seeAlsoSense': xref.senseNum,
|
||||
'seeAlsoSense': xref.senseOrderNum,
|
||||
'ambiguous': resolvedEntry.ambiguous,
|
||||
},
|
||||
);
|
||||
@ -462,7 +465,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
'xrefEntryId': resolvedEntry.entry.id,
|
||||
'antonymKanji': ant.kanjiRef,
|
||||
'antonymReading': ant.readingRef,
|
||||
'antonymSense': ant.senseNum,
|
||||
'antonymSense': ant.senseOrderNum,
|
||||
'ambiguous': resolvedEntry.ambiguous,
|
||||
});
|
||||
}
|
||||
|
@ -40,6 +40,7 @@ CREATE TABLE "JMdict_Entry" (
|
||||
|
||||
CREATE TABLE "JMdict_KanjiElement" (
|
||||
"entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"),
|
||||
"orderNum" INTEGER,
|
||||
"reading" TEXT NOT NULL,
|
||||
"news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
|
||||
"ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
|
||||
@ -62,13 +63,14 @@ CREATE TABLE "JMdict_KanjiElementInfo" (
|
||||
|
||||
-- Reading elements of a dictionary entry, one row per (entryId, reading).
-- The composite primary key plus WITHOUT ROWID means rows are stored by that key.
-- NOTE(review): "nf" is listed twice below (with and without the 1..48 CHECK);
-- these look like the before/after sides of a change — a valid schema can
-- contain only one of them, confirm which.
CREATE TABLE "JMdict_ReadingElement" (
|
||||
"entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("id"),
|
||||
-- Nullable; presumably the 1-based position of the reading within its entry — verify against the ingestion code.
"orderNum" INTEGER,
|
||||
"reading" TEXT NOT NULL,
|
||||
"readingDoesNotMatchKanji" BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
-- The four columns below are nullable and restricted to the values 1 and 2.
"news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
|
||||
"ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
|
||||
"spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
|
||||
"gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
|
||||
"nf" INTEGER,
|
||||
"nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
|
||||
PRIMARY KEY ("entryId", "reading")
|
||||
) WITHOUT ROWID;
|
||||
|
||||
@ -95,11 +97,11 @@ CREATE TABLE "JMdict_ReadingElementInfo" (
|
||||
-- Senses of a dictionary entry: "id" is the autoincrement primary key and
-- "entryId" references the owning JMdict_Entry row.
-- NOTE(review): both a "senseNum" and an "orderNum" column (each with its own
-- UNIQUE constraint and index) appear below; they look like the before/after
-- sides of a rename. As listed there is also no comma between the first UNIQUE
-- clause and "orderNum" — confirm that only one variant exists in the real schema.
CREATE TABLE "JMdict_Sense" (
|
||||
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
"entryId" INTEGER REFERENCES "JMdict_Entry"("id"),
|
||||
"senseNum" INTEGER,
|
||||
UNIQUE("entryId", "senseNum")
|
||||
"orderNum" INTEGER,
|
||||
-- A sense number may appear at most once per entry.
UNIQUE("entryId", "orderNum")
|
||||
);
|
||||
|
||||
-- Index over (entryId, sense number), the same columns as the UNIQUE constraint above.
CREATE INDEX "JMdict_Sense_byEntryId_bySenseNum" ON "JMdict_Sense"("entryId", "senseNum");
|
||||
CREATE INDEX "JMdict_Sense_byEntryId_byOrderNum" ON "JMdict_Sense"("entryId", "orderNum");
|
||||
|
||||
CREATE TABLE "JMdict_SenseRestrictedToKanji" (
|
||||
"entryId" INTEGER,
|
||||
@ -141,7 +143,7 @@ CREATE TABLE "JMdict_SenseSeeAlso" (
|
||||
-- TODO: Check that if seeAlsoSense is present, it refers to a sense connected to xrefEntryId.
|
||||
FOREIGN KEY ("xrefEntryId", "seeAlsoKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
|
||||
FOREIGN KEY ("xrefEntryId", "seeAlsoReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
|
||||
FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"),
|
||||
FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
|
||||
PRIMARY KEY ("senseId", "xrefEntryId", "seeAlsoReading", "seeAlsoKanji", "seeAlsoSense")
|
||||
);
|
||||
|
||||
@ -157,7 +159,7 @@ CREATE TABLE "JMdict_SenseAntonym" (
|
||||
CHECK ("antonymReading" = NULL <> "antonymKanji" = NULL),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "senseNum"),
|
||||
FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
|
||||
PRIMARY KEY ("senseId", "xrefEntryId", "antonymReading", "antonymKanji", "antonymSense")
|
||||
);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user