lib: improve jmdict ingestion, update constants

This commit is contained in:
2025-04-22 19:10:53 +02:00
parent 84e80fe876
commit 7544013ffd
4 changed files with 288 additions and 133 deletions

View File

@@ -148,32 +148,12 @@ class XRefParts {
this.readingRef,
this.senseNum,
}) : assert(kanjiRef != null || readingRef != null);
factory XRefParts.fromString(String s) {
final parts = s.split('');
if (parts.length == 1) {
if (parts[0].contains(kanaRegex)) {
return XRefParts(readingRef: parts[0]);
}
return XRefParts(kanjiRef: parts[0]);
} else if (parts.length == 2) {
if (int.tryParse(parts[1]) != null) {
if (parts[0].contains(kanaRegex)) {
return XRefParts(readingRef: parts[0], senseNum: int.parse(parts[1]));
}
return XRefParts(kanjiRef: parts[0], senseNum: int.parse(parts[1]));
}
return XRefParts(kanjiRef: parts[0], readingRef: parts[1]);
} else if (parts.length == 3) {
return XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
senseNum: int.parse(parts[2]),
);
}
return XRefParts();
}
/// Serialises this cross-reference into a JSON-compatible map.
///
/// The map always has the keys `kanjiRef`, `readingRef` and `senseNum`,
/// each holding the value of the field of the same name.
Map<String, Object?> toJson() {
  return <String, Object?>{
    'kanjiRef': kanjiRef,
    'readingRef': readingRef,
    'senseNum': senseNum,
  };
}
}
class XRef {
@@ -188,6 +168,7 @@ class XRef {
class Sense extends SQLWritable {
final int id;
final int senseNum;
final List<XRefParts> antonyms;
final List<String> dialects;
final List<String> fields;
@@ -202,6 +183,7 @@ class Sense extends SQLWritable {
const Sense({
required this.id,
required this.senseNum,
this.antonyms = const [],
this.dialects = const [],
this.fields = const [],

View File

@@ -1,13 +1,17 @@
import 'dart:collection';
import 'dart:io';
import 'package:collection/collection.dart';
import 'package:jadb/romaji_transliteration.dart';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import 'objects.dart';
List<int?> getPriNums(XmlElement e, String prefix) {
/// Parses the priority values from `r_ele` and `k_ele` XML elements.
///
/// Source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
int? news, ichi, spec, gai, nf;
for (final pri in e.findElements('${prefix}_pri')) {
final txt = pri.innerText;
@@ -24,6 +28,110 @@ List<int?> getPriNums(XmlElement e, String prefix) {
return [news, ichi, spec, gai, nf];
}
// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
final parts = s.split('');
late final XRefParts result;
switch (parts.length) {
case 1:
result = parts[0].contains(kanaRegex)
? XRefParts(readingRef: parts[0])
: XRefParts(kanjiRef: parts[0]);
break;
case 2:
if (int.tryParse(parts[1]) != null) {
if (parts[0].contains(kanaRegex)) {
result = XRefParts(
readingRef: parts[0],
senseNum: int.parse(parts[1]),
);
} else {
result = XRefParts(
kanjiRef: parts[0],
senseNum: int.parse(parts[1]),
);
}
} else {
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
);
}
break;
case 3:
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
senseNum: int.parse(parts[2]),
);
break;
default:
result = XRefParts();
break;
}
return result;
}
/// Resolves [xref] to the single [Entry] it most likely refers to.
///
/// Candidates are looked up in [entriesByKanji] and/or [entriesByReading],
/// depending on which references [xref] carries. When both are present,
/// only entries found under both keys are considered. The candidates are
/// then narrowed down as follows:
///
/// 1. If [xref] names a sense number, entries with fewer senses than that
///    number are discarded.
/// 2. If [xref] only has a reading reference and several candidates remain,
///    entries without any kanji elements are preferred when there are any.
/// 3. Of the remaining candidates, the one with the most senses is returned
///    (the earliest one on a tie). A warning is printed when more than one
///    candidate was left to choose from.
///
/// Throws an [Exception] if [xref] has neither a kanji nor a reading
/// reference, or if no entry matches.
Entry resolveXref(
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
  XRefParts xref,
) {
  // A missing key yields no candidates, so unknown references end up in the
  // descriptive exception below instead of a bare null-check failure.
  Set<Entry> lookup(SplayTreeMap<String, Set<Entry>> index, String key) =>
      index[key] ?? const <Entry>{};

  List<Entry> candidates = switch ((xref.kanjiRef, xref.readingRef)) {
    (null, null) =>
      throw Exception('Xref $xref has no kanji or reading reference'),
    (String k, null) => lookup(entriesByKanji, k).toList(),
    (null, String r) => lookup(entriesByReading, r).toList(),
    (String k, String r) =>
      lookup(entriesByKanji, k).intersection(lookup(entriesByReading, r)).toList(),
  };

  // Drop entries that do not have as many senses as the xref asks for.
  final senseNum = xref.senseNum;
  if (senseNum != null) {
    candidates.retainWhere((entry) => entry.senses.length >= senseNum);
  }

  // A reading-only xref with several candidates prefers entries that have
  // no kanji elements, if there are any.
  if (xref.kanjiRef == null &&
      xref.readingRef != null &&
      candidates.length > 1) {
    final withoutKanji =
        candidates.where((entry) => entry.kanji.isEmpty).toList();
    if (withoutKanji.isNotEmpty) {
      candidates = withoutKanji;
    }
  }

  if (candidates.isEmpty) {
    throw Exception(
      'SKIPPING: Xref $xref has ${candidates.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseNum: ${xref.senseNum}',
    );
  }

  if (candidates.length > 1) {
    print(
      'WARNING: Xref $xref has ${candidates.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseNum: ${xref.senseNum}',
    );
  }

  // Having more senses is a cheap way to choose the most likely correct
  // entry, so keep the candidate with the highest sense count. The strict
  // comparison keeps the earliest candidate on a tie.
  return candidates.reduce(
    (best, entry) => entry.senses.length > best.senses.length ? entry : best,
  );
}
List<Entry> transformXML(XmlElement root) {
final List<Entry> entries = [];
@@ -36,10 +144,13 @@ List<Entry> transformXML(XmlElement root) {
final List<Sense> senses = [];
for (final k_ele in entry.findAllElements('k_ele')) {
final ke_pri = getPriNums(k_ele, 'ke');
final ke_pri = getPriorityValues(k_ele, 'ke');
kanjiEls.add(
KanjiElement(
info: k_ele.findElements('ke_inf').map((e) => e.innerText).toList(),
info: k_ele
.findElements('ke_inf')
.map((e) => e.innerText.replaceAll(RegExp('[&;]'), ''))
.toList(),
reading: k_ele.findElements('keb').first.innerText,
news: ke_pri[0],
ichi: ke_pri[1],
@@ -51,7 +162,7 @@ List<Entry> transformXML(XmlElement root) {
}
for (final r_ele in entry.findAllElements('r_ele')) {
final re_pri = getPriNums(r_ele, 're');
final re_pri = getPriorityValues(r_ele, 're');
readingEls.add(
ReadingElement(
info: r_ele
@@ -70,11 +181,14 @@ List<Entry> transformXML(XmlElement root) {
);
}
int senseNum = 0;
for (final sense in entry.findAllElements('sense')) {
senseId++;
senseNum++;
senses.add(
Sense(
id: senseId,
senseNum: senseNum,
restrictedToKanji:
sense.findElements('stagk').map((e) => e.innerText).toList(),
restrictedToReading:
@@ -115,11 +229,11 @@ List<Entry> transformXML(XmlElement root) {
.toList(),
antonyms: sense
.findElements('ant')
.map((e) => XRefParts.fromString(e.innerText))
.map((e) => parseXrefParts(e.innerText))
.toList(),
seeAlso: sense
.findElements('xref')
.map((e) => XRefParts.fromString(e.innerText))
.map((e) => parseXrefParts(e.innerText))
.toList(),
),
);
@@ -154,7 +268,11 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
for (final i in k.info) {
b.insert(
TableNames.kanjiInfo,
{'entryId': e.id, 'reading': k.reading, 'info': i},
{
'entryId': e.id,
'reading': k.reading,
'info': i,
},
);
}
}
@@ -166,20 +284,31 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
b.insert(
TableNames.entryByKana,
{'entryId': e.id, 'kana': transliterateKanaToLatin(r.reading)},
{
'entryId': e.id,
'kana': transliterateKanaToLatin(r.reading),
},
// Some entries list the same reading twice, once in katakana and once in hiragana
conflictAlgorithm: ConflictAlgorithm.ignore,
);
for (final i in r.info) {
b.insert(
TableNames.readingInfo,
{'entryId': e.id, 'reading': r.reading, 'info': i},
{
'entryId': e.id,
'reading': r.reading,
'info': i,
},
);
}
for (final res in r.restrictions) {
b.insert(
TableNames.readingRestriction,
{'entryId': e.id, 'reading': r.reading, 'restriction': res},
{
'entryId': e.id,
'reading': r.reading,
'restriction': res,
},
);
}
}
@@ -189,7 +318,10 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
if (g.language == "eng")
b.insert(
TableNames.entryByEnglish,
{'entryId': e.id, 'english': g.phrase},
{
'entryId': e.id,
'english': g.phrase,
},
// NOTE(review): presumably an entry can contain the same English phrase more than once, so duplicates are ignored — confirm
conflictAlgorithm: ConflictAlgorithm.ignore,
);
@@ -199,35 +331,15 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
await b.commit();
print(' [JMdict] Building trees');
SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
for (final entry in entries) {
for (final kanji in entry.kanji) {
if (entriesByKanji.containsKey(kanji.reading)) {
entriesByKanji.update(kanji.reading, (list) => list..add(entry));
} else {
entriesByKanji.putIfAbsent(kanji.reading, () => {entry});
}
}
}
SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
for (final entry in entries) {
for (final reading in entry.readings) {
if (entriesByReading.containsKey(reading.reading)) {
entriesByReading.update(reading.reading, (list) => list..add(entry));
} else {
entriesByReading.putIfAbsent(reading.reading, () => {entry});
}
}
}
print(' [JMdict] Batch 2');
b = db.batch();
for (final e in entries) {
for (final s in e.senses) {
b.insert(
TableNames.sense, s.sqlValue..addAll({'id': s.id, 'entryId': e.id}));
TableNames.sense,
s.sqlValue
..addAll({'id': s.id, 'entryId': e.id, 'senseNum': s.senseNum}));
for (final d in s.dialects) {
b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
@@ -278,52 +390,71 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
conflictAlgorithm: ConflictAlgorithm.ignore,
);
}
}
}
for (final xref in s.seeAlso) {
final Set<Entry> entries;
if (xref.kanjiRef != null && xref.readingRef != null) {
entries = entriesByKanji[xref.kanjiRef]!
.difference(entriesByReading[xref.readingRef]!);
} else if (xref.kanjiRef != null) {
entries = entriesByKanji[xref.kanjiRef]!;
} else {
entries = entriesByReading[xref.readingRef]!;
}
for (final ex in entries)
if (!(xref.senseNum != null && xref.senseNum! > ex.senses.length)) {
b.insert(
TableNames.senseSeeAlso,
{
'senseId': s.id,
'xrefEntryId': ex.id,
'seeAlsoKanji': xref.kanjiRef,
'seeAlsoReading': xref.readingRef,
'seeAlsoSense': xref.senseNum,
},
);
}
await b.commit();
print(' [JMdict] Building trees');
SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
for (final entry in entries) {
for (final kanji in entry.kanji) {
if (entriesByKanji.containsKey(kanji.reading)) {
entriesByKanji.update(kanji.reading, (list) => list..add(entry));
} else {
entriesByKanji.putIfAbsent(kanji.reading, () => {entry});
}
}
}
SplayTreeMap<String, Set<Entry>> entriesByReading = SplayTreeMap();
for (final entry in entries) {
for (final reading in entry.readings) {
if (entriesByReading.containsKey(reading.reading)) {
entriesByReading.update(reading.reading, (list) => list..add(entry));
} else {
entriesByReading.putIfAbsent(reading.reading, () => {entry});
}
}
}
print(' [JMdict] Batch 3');
b = db.batch();
for (final e in entries) {
for (final s in e.senses) {
for (final xref in s.seeAlso) {
final entry = resolveXref(
entriesByKanji,
entriesByReading,
xref,
);
b.insert(
TableNames.senseSeeAlso,
{
'senseId': s.id,
'xrefEntryId': entry.id,
'seeAlsoKanji': xref.kanjiRef,
'seeAlsoReading': xref.readingRef,
'seeAlsoSense': xref.senseNum,
},
);
}
for (final ant in s.antonyms) {
final Set<Entry> entries;
if (ant.kanjiRef != null && ant.readingRef != null) {
entries = entriesByKanji[ant.kanjiRef]!
.difference(entriesByReading[ant.readingRef]!);
} else if (ant.kanjiRef != null) {
entries = entriesByKanji[ant.kanjiRef]!;
} else {
entries = entriesByReading[ant.readingRef]!;
}
for (final ex in entries) {
if (!(ant.senseNum != null && ant.senseNum! > ex.senses.length)) {
b.insert(TableNames.senseAntonyms, {
'senseId': s.id,
'xrefEntryId': ex.id,
'antonymKanji': ant.kanjiRef,
'antonymReading': ant.readingRef,
'antonymSense': ant.senseNum,
});
}
}
final entry = resolveXref(
entriesByKanji,
entriesByReading,
ant,
);
b.insert(TableNames.senseAntonyms, {
'senseId': s.id,
'xrefEntryId': entry.id,
'antonymKanji': ant.kanjiRef,
'antonymReading': ant.readingRef,
'antonymSense': ant.senseNum,
});
}
}
}