// Files: jadb/lib/_data_ingestion/jmdict/parser.dart
// 2025-04-28 21:53:09 +02:00
//
// 488 lines, 14 KiB, Dart

import 'dart:collection';
import 'dart:io';
import 'package:collection/collection.dart';
import 'package:jadb/romaji_transliteration.dart';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import 'objects.dart';
/// Collects the priority markers attached to a `k_ele` or `r_ele` element.
///
/// Reads every `<{prefix}_pri>` child of [e] (callers pass `ke` or `re` as
/// [prefix]) and parses the number that follows a `news`, `ichi`, `spec`,
/// `gai` or `nf` tag. The returned list always has five slots, in the order
/// `[news, ichi, spec, gai, nf]`; a slot is `null` when no marker of that kind
/// was present. Markers with any other tag are ignored, and a later marker of
/// the same kind overwrites an earlier one.
///
/// Throws a [FormatException] if the text after a recognised tag is not an
/// integer.
///
/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  // The position of a tag in this list is also its slot in the result.
  const tags = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final values = <int?>[null, null, null, null, null];

  for (final marker in e.findElements('${prefix}_pri')) {
    final text = marker.innerText;
    final slot = tags.indexWhere((tag) => text.startsWith(tag));
    if (slot != -1) {
      values[slot] = int.parse(text.substring(tags[slot].length));
    }
  }

  return values;
}
/// Splits the text of an `<xref>` or `<ant>` element into its components.
///
/// A JMdict cross-reference consists of one to three parts separated by a
/// katakana middle dot (`・`, U+30FB), for example `彼・かれ・1`:
///
/// * one part: a kanji reference, or a reading reference if the part matches
///   `kanaRegex`;
/// * two parts: either `kanji/reading` + sense number (when the second part
///   is an integer), or `kanji` + `reading`;
/// * three parts: `kanji`, `reading` and sense number.
///
/// Any other number of parts yields an empty [XRefParts].
///
/// Throws a [FormatException] if the third of three parts is not an integer.
///
/// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
  // Written as an escape on purpose: the literal middle dot is easy to lose
  // in copy/paste or re-encoding, and splitting on an empty string would
  // break the reference into single characters.
  final parts = s.split('\u30FB');

  bool isReading(String ref) => ref.contains(kanaRegex);

  return switch (parts) {
    [final ref] when isReading(ref) => XRefParts(readingRef: ref),
    [final ref] => XRefParts(kanjiRef: ref),
    [final ref, final tail] => switch (int.tryParse(tail)) {
        final senseNum? when isReading(ref) => XRefParts(
            readingRef: ref,
            senseOrderNum: senseNum,
          ),
        final senseNum? => XRefParts(
            kanjiRef: ref,
            senseOrderNum: senseNum,
          ),
        // Second part is not a number, so it must be the reading.
        null => XRefParts(kanjiRef: ref, readingRef: tail),
      },
    [final kanji, final reading, final senseNum] => XRefParts(
        kanjiRef: kanji,
        readingRef: reading,
        senseOrderNum: int.parse(senseNum),
      ),
    _ => XRefParts(),
  };
}
/// The outcome of resolving a cross-reference to a dictionary entry.
class ResolvedXref {
  /// Creates a result pointing at [entry], flagged with [ambiguous].
  ResolvedXref(this.entry, this.ambiguous);

  /// The entry the cross-reference was resolved to.
  Entry entry;

  /// Whether more than one candidate entry matched, so that [entry] is only
  /// a best guess.
  bool ambiguous;
}
/// Resolves [xref] to the entry it most likely refers to.
///
/// Candidates are looked up in [entriesByKanji] and/or [entriesByReading]
/// (the intersection of both when the xref names a kanji and a reading) and
/// then narrowed down:
///
/// 1. if the xref names a sense number, entries with fewer senses than that
///    are dropped;
/// 2. if the xref only names a reading, entries without any kanji are
///    preferred when there are any;
/// 3. of the remaining candidates, the one with the most senses is chosen.
///
/// The result is marked [ResolvedXref.ambiguous] (and a warning is printed)
/// when more than one candidate is left after step 2.
///
/// Throws an [Exception] if [xref] has neither a kanji nor a reading
/// reference, or if no entry matches it.
ResolvedXref resolveXref(
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
  XRefParts xref,
) {
  // A reference to an unknown kanji/reading yields no candidates instead of
  // a bare null-check failure, so it ends up in the descriptive "SKIPPING"
  // exception below.
  List<Entry> candidateEntries = switch ((xref.kanjiRef, xref.readingRef)) {
    (null, null) =>
      throw Exception('Xref $xref has no kanji or reading reference'),
    (String k, null) => [...?entriesByKanji[k]],
    (null, String r) => [...?entriesByReading[r]],
    (String k, String r) => [
        ...?entriesByKanji[k]
            ?.intersection(entriesByReading[r] ?? const <Entry>{}),
      ],
  };

  // Filter out entries that don't have the number of senses specified in the
  // xref.
  final senseOrderNum = xref.senseOrderNum;
  if (senseOrderNum != null) {
    candidateEntries
        .retainWhere((entry) => entry.senses.length >= senseOrderNum);
  }

  // If the xref has a reading ref but no kanji ref, and there are multiple
  // entries to choose from, prefer entries with empty kanji readings
  // if possible.
  if (xref.kanjiRef == null &&
      xref.readingRef != null &&
      candidateEntries.length > 1) {
    final candidatesWithEmptyKanji =
        candidateEntries.where((entry) => entry.kanji.isEmpty).toList();
    if (candidatesWithEmptyKanji.isNotEmpty) {
      candidateEntries = candidatesWithEmptyKanji;
    }
  }

  // Having more senses is a cheap way to choose the most likely correct
  // entry in case there are multiple candidates left. The key is negated so
  // that the entry with the MOST senses sorts first.
  candidateEntries.sortBy<num>((entry) => -entry.senses.length);

  if (candidateEntries.isEmpty) {
    throw Exception(
      'SKIPPING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }

  final ambiguous = candidateEntries.length > 1;
  if (ambiguous) {
    print(
      'WARNING: Xref $xref has ${candidateEntries.length} entries, '
      'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
      'senseOrderNum: ${xref.senseOrderNum}',
    );
  }
  return ResolvedXref(candidateEntries.first, ambiguous);
}
/// Returns the inner text of every direct [tag] child of [parent].
List<String> _innerTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((child) => child.innerText).toList();

/// Returns the inner text of every direct [tag] child of [parent] with its
/// first and last character removed.
///
/// NOTE(review): presumably this strips the `&` and `;` of an unexpanded
/// JMdict entity reference such as `&n;` — confirm against the input file.
List<String> _unwrappedTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((child) {
      final text = child.innerText;
      return text.substring(1, text.length - 1);
    }).toList();

/// Builds the [KanjiElement] for one `<k_ele>` with 1-based [orderNum].
KanjiElement _parseKanjiElement(XmlElement kEle, int orderNum) {
  final [news, ichi, spec, gai, nf] = getPriorityValues(kEle, 'ke');
  return KanjiElement(
    orderNum: orderNum,
    info: _unwrappedTexts(kEle, 'ke_inf'),
    reading: kEle.findElements('keb').first.innerText,
    news: news,
    ichi: ichi,
    spec: spec,
    gai: gai,
    nf: nf,
  );
}

/// Builds the [ReadingElement] for one `<r_ele>` with 1-based [orderNum].
ReadingElement _parseReadingElement(XmlElement rEle, int orderNum) {
  final [news, ichi, spec, gai, nf] = getPriorityValues(rEle, 're');
  return ReadingElement(
    orderNum: orderNum,
    // The mere presence of <re_nokanji> is the flag; its content is unused.
    readingDoesNotMatchKanji: rEle.findElements('re_nokanji').isNotEmpty,
    info: _unwrappedTexts(rEle, 're_inf'),
    restrictions: _innerTexts(rEle, 're_restr'),
    reading: rEle.findElements('reb').first.innerText,
    news: news,
    ichi: ichi,
    spec: spec,
    gai: gai,
    nf: nf,
  );
}

/// Builds the [Sense] for one `<sense>` element.
///
/// NOTE(review): `<field>` elements are not read here although insertIntoDB
/// iterates `Sense.fields` — confirm whether that is intentional.
Sense _parseSense(
  XmlElement sense, {
  required int id,
  required int orderNum,
}) =>
    Sense(
      id: id,
      orderNum: orderNum,
      restrictedToKanji: _innerTexts(sense, 'stagk'),
      restrictedToReading: _innerTexts(sense, 'stagr'),
      pos: _unwrappedTexts(sense, 'pos'),
      misc: _unwrappedTexts(sense, 'misc'),
      dialects: _unwrappedTexts(sense, 'dial'),
      info: _innerTexts(sense, 's_inf'),
      languageSource: sense
          .findElements('lsource')
          .map(
            (e) => LanguageSource(
              language: e.getAttribute('xml:lang') ?? 'eng',
              // Per the JMdict DTD, `ls_type` is implied to be "full" when
              // absent and is "part" otherwise, so the source word fully
              // describes the sense unless the attribute says "part".
              fullyDescribesSense: e.getAttribute('ls_type') != 'part',
              constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
            ),
          )
          .toList(),
      glossary: sense
          .findElements('gloss')
          .map(
            (e) => Glossary(
              language: e.getAttribute('xml:lang') ?? 'eng',
              phrase: e.innerText,
              type: e.getAttribute('g_type'),
            ),
          )
          .toList(),
      antonyms: sense
          .findElements('ant')
          .map((e) => parseXrefParts(e.innerText))
          .toList(),
      seeAlso: sense
          .findElements('xref')
          .map((e) => parseXrefParts(e.innerText))
          .toList(),
    );

/// Converts the children of the JMdict [root] element into [Entry] objects.
///
/// Every child element of [root] is treated as one entry. Kanji elements,
/// reading elements and senses get 1-based order numbers in document order.
/// Sense ids are assigned from a single counter that runs across all
/// entries; a sense that turns out empty is reported with a warning and
/// dropped, but still consumes its id and order number.
///
/// Throws if an entry lacks `<ent_seq>`, or a kanji/reading element lacks
/// its `<keb>`/`<reb>` child.
List<Entry> parseXML(XmlElement root) {
  final entries = <Entry>[];
  var senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final kanjiEls = [
      for (final (index, kEle) in entry.findElements('k_ele').indexed)
        _parseKanjiElement(kEle, index + 1),
    ];
    final readingEls = [
      for (final (index, rEle) in entry.findElements('r_ele').indexed)
        _parseReadingElement(rEle, index + 1),
    ];

    final senses = <Sense>[];
    for (final (index, senseEl) in entry.findElements('sense').indexed) {
      senseId++;
      final result = _parseSense(senseEl, id: senseId, orderNum: index + 1);
      if (result.isEmpty) {
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(result);
      }
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
/// Queues the entry row plus its kanji/reading elements and search indices.
void _queueEntryElements(Batch b, Entry e) {
  b.insert(TableNames.entry, e.sqlValue);

  for (final k in e.kanji) {
    b.insert(TableNames.kanjiElement, {...k.sqlValue, 'entryId': e.id});
    // b.insert(
    //   TableNames.entryByKana,
    //   {'entryId': e.id, 'kana': transliterateKatakanaToHiragana(k.reading)},
    //   // Some entries have the same reading twice with difference in katakana and hiragana
    //   conflictAlgorithm: ConflictAlgorithm.ignore,
    // );
    for (final i in k.info) {
      b.insert(
        TableNames.kanjiInfo,
        {'entryId': e.id, 'reading': k.reading, 'info': i},
      );
    }
  }

  for (final r in e.readings) {
    b.insert(TableNames.readingElement, {...r.sqlValue, 'entryId': e.id});
    b.insert(
      TableNames.entryByKana,
      {'entryId': e.id, 'kana': transliterateKanaToLatin(r.reading)},
      // Some entries have the same reading twice with difference in katakana
      // and hiragana
      conflictAlgorithm: ConflictAlgorithm.ignore,
    );
    for (final i in r.info) {
      b.insert(
        TableNames.readingInfo,
        {'entryId': e.id, 'reading': r.reading, 'info': i},
      );
    }
    for (final res in r.restrictions) {
      b.insert(
        TableNames.readingRestriction,
        {'entryId': e.id, 'reading': r.reading, 'restriction': res},
      );
    }
  }

  for (final s in e.senses) {
    for (final g in s.glossary) {
      b.insert(
        TableNames.entryByEnglish,
        {'entryId': e.id, 'english': g.phrase},
        // Presumably the same phrase can occur more than once within an
        // entry; repeated (entry, phrase) pairs are silently skipped.
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
  }
}

/// Queues every sense of [e] together with its per-sense attribute rows.
void _queueSenses(Batch b, Entry e) {
  for (final s in e.senses) {
    b.insert(TableNames.sense, {...s.sqlValue, 'entryId': e.id});
    for (final d in s.dialects) {
      b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
    }
    for (final f in s.fields) {
      b.insert(TableNames.senseField, {'senseId': s.id, 'field': f});
    }
    for (final i in s.info) {
      b.insert(TableNames.senseInfo, {'senseId': s.id, 'info': i});
    }
    for (final m in s.misc) {
      b.insert(TableNames.senseMisc, {'senseId': s.id, 'misc': m});
    }
    for (final p in s.pos) {
      b.insert(TableNames.sensePOS, {'senseId': s.id, 'pos': p});
    }
    // Language sources are inserted exactly once per sense.
    for (final ls in s.languageSource) {
      b.insert(
        TableNames.senseLanguageSource,
        {...ls.sqlValue, 'senseId': s.id},
      );
    }
    for (final rk in s.restrictedToKanji) {
      b.insert(
        TableNames.senseRestrictedToKanji,
        {'entryId': e.id, 'senseId': s.id, 'kanji': rk},
      );
    }
    for (final rr in s.restrictedToReading) {
      b.insert(
        TableNames.senseRestrictedToReading,
        {'entryId': e.id, 'senseId': s.id, 'reading': rr},
      );
    }
    for (final g in s.glossary) {
      b.insert(TableNames.senseGlossary, {...g.sqlValue, 'senseId': s.id});
    }
  }
}

/// Groups [entries] under every key that [keysOf] yields for them.
SplayTreeMap<String, Set<Entry>> _groupEntriesBy(
  List<Entry> entries,
  Iterable<String> Function(Entry entry) keysOf,
) {
  final index = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final key in keysOf(entry)) {
      index.putIfAbsent(key, () => <Entry>{}).add(entry);
    }
  }
  return index;
}

/// Queues the resolved "see also" and antonym references of [e]'s senses.
void _queueXrefs(
  Batch b,
  Entry e,
  SplayTreeMap<String, Set<Entry>> entriesByKanji,
  SplayTreeMap<String, Set<Entry>> entriesByReading,
) {
  for (final s in e.senses) {
    for (final xref in s.seeAlso) {
      final resolved = resolveXref(entriesByKanji, entriesByReading, xref);
      b.insert(TableNames.senseSeeAlso, {
        'senseId': s.id,
        'xrefEntryId': resolved.entry.id,
        'seeAlsoKanji': xref.kanjiRef,
        'seeAlsoReading': xref.readingRef,
        'seeAlsoSense': xref.senseOrderNum,
        'ambiguous': resolved.ambiguous,
      });
    }
    for (final ant in s.antonyms) {
      final resolved = resolveXref(entriesByKanji, entriesByReading, ant);
      b.insert(TableNames.senseAntonyms, {
        'senseId': s.id,
        'xrefEntryId': resolved.entry.id,
        'antonymKanji': ant.kanjiRef,
        'antonymReading': ant.readingRef,
        'antonymSense': ant.senseOrderNum,
        'ambiguous': resolved.ambiguous,
      });
    }
  }
}

/// Writes [entries] to [db] in three consecutive batches.
///
/// 1. Entries, their kanji and reading elements, and the kana/English
///    lookup tables.
/// 2. Senses and all per-sense attribute tables.
/// 3. Cross-references ("see also" and antonyms), resolved against lookup
///    trees built from [entries].
///
/// Each batch is committed before the next one is assembled. An exception
/// thrown by `resolveXref` while assembling the third batch propagates to
/// the caller; the first two batches have already been committed by then.
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1 - Kanji and readings');
  final elementBatch = db.batch();
  for (final e in entries) {
    _queueEntryElements(elementBatch, e);
  }
  await elementBatch.commit();

  print(' [JMdict] Batch 2 - Senses');
  final senseBatch = db.batch();
  for (final e in entries) {
    _queueSenses(senseBatch, e);
  }
  await senseBatch.commit();

  print(' [JMdict] Building xref trees');
  final entriesByKanji = _groupEntriesBy(
    entries,
    (entry) => entry.kanji.map((k) => k.reading),
  );
  final entriesByReading = _groupEntriesBy(
    entries,
    (entry) => entry.readings.map((r) => r.reading),
  );

  print(' [JMdict] Batch 3 - Xrefs');
  final xrefBatch = db.batch();
  for (final e in entries) {
    _queueXrefs(xrefBatch, e, entriesByKanji, entriesByReading);
  }
  await xrefBatch.commit();
}
/// Loads `data/JMdict.xml`, parses it and stores the result in [db].
///
/// The file is read synchronously and parsed in one go; progress is reported
/// on stdout before each step. Fails if the file cannot be read or the
/// document has no `JMdict` root element.
Future<void> addDataFromJMdict(Database db) async {
  print('[JMdict] Reading file content...');
  final xmlText = File('data/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML tags...');
  final document = XmlDocument.parse(xmlText);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Parsing XML content...');
  final parsedEntries = parseXML(jmdictRoot);

  print('[JMdict] Writing to database...');
  await insertIntoDB(parsedEntries, db);
}