347 lines
11 KiB
Dart
347 lines
11 KiB
Dart
|
import 'dart:collection';
|
||
|
import 'dart:io';
|
||
|
|
||
|
import 'package:sqflite_common/sqlite_api.dart';
|
||
|
import 'package:xml/xml.dart';
|
||
|
|
||
|
import '../romaji_transliteration.dart';
|
||
|
import 'objects.dart';
|
||
|
|
||
|
/// Collects the numeric suffixes of the `<prefix>_pri` children of [e].
///
/// Each direct child element named `'${prefix}_pri'` is matched against the
/// tags `news`, `ichi`, `spec`, `gai` and `nf`; the text following the tag is
/// parsed as an integer. If the same tag occurs several times, the last
/// occurrence wins.
///
/// Returns a five-element list ordered `[news, ichi, spec, gai, nf]`, with
/// `null` for every tag that was not present.
///
/// Throws a [FormatException] if the text after a recognised tag is not an
/// integer.
List<int?> getPriNums(XmlElement e, String prefix) {
  // The position of a tag in this list is its position in the result.
  const tags = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final ranks = List<int?>.filled(tags.length, null, growable: true);

  for (final priority in e.findElements('${prefix}_pri')) {
    final text = priority.innerText;
    final slot = tags.indexWhere((tag) => text.startsWith(tag));
    if (slot >= 0) {
      ranks[slot] = int.parse(text.substring(tags[slot].length));
    }
  }

  return ranks;
}
|
||
|
|
||
|
/// Returns the inner text of every direct child of [parent] named [tag].
List<String> _innerTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((e) => e.innerText).toList();

/// Like [_innerTexts], but drops the first and last character of each text.
///
/// NOTE(review): presumably this removes the leading `&` and trailing `;` of
/// an unexpanded entity reference — confirm against the input data. Throws a
/// [RangeError] if an element's text is empty.
List<String> _trimmedInnerTexts(XmlElement parent, String tag) => parent
    .findElements(tag)
    .map((e) => e.innerText.substring(1, e.innerText.length - 1))
    .toList();

/// Converts every child element of [root] into an [Entry].
///
/// For each child, the `ent_seq` text becomes the entry id, and the `k_ele`,
/// `r_ele` and `sense` descendants become [KanjiElement]s, [ReadingElement]s
/// and [Sense]s respectively. Sense ids are assigned from a single counter
/// that starts at 1 and keeps increasing across all entries, so they are
/// unique within the returned list.
///
/// Throws a [StateError] if a required child (`ent_seq`, `keb`, `reb`) is
/// missing, and a [FormatException] if `ent_seq` or a priority suffix is not
/// an integer.
List<Entry> transformXML(XmlElement root) {
  final List<Entry> entries = [];

  // Shared across entries: sense ids are global, not per entry.
  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final List<KanjiElement> kanjiEls = [];
    final List<ReadingElement> readingEls = [];
    final List<Sense> senses = [];

    for (final kEle in entry.findAllElements('k_ele')) {
      final kePri = getPriNums(kEle, 'ke');
      kanjiEls.add(
        KanjiElement(
          // NOTE(review): unlike `re_inf`, `pos`, `misc` and `dial`, the
          // `ke_inf` text is stored untrimmed — confirm this is intended.
          info: _innerTexts(kEle, 'ke_inf'),
          reading: kEle.findElements('keb').first.innerText,
          news: kePri[0],
          ichi: kePri[1],
          spec: kePri[2],
          gai: kePri[3],
          nf: kePri[4],
        ),
      );
    }

    for (final rEle in entry.findAllElements('r_ele')) {
      final rePri = getPriNums(rEle, 're');
      readingEls.add(
        ReadingElement(
          info: _trimmedInnerTexts(rEle, 're_inf'),
          restrictions: _innerTexts(rEle, 're_restr'),
          reading: rEle.findElements('reb').first.innerText,
          news: rePri[0],
          ichi: rePri[1],
          spec: rePri[2],
          gai: rePri[3],
          nf: rePri[4],
        ),
      );
    }

    for (final sense in entry.findAllElements('sense')) {
      senseId++;
      senses.add(
        Sense(
          id: senseId,
          restrictedToKanji: _innerTexts(sense, 'stagk'),
          restrictedToReading: _innerTexts(sense, 'stagr'),
          pos: _trimmedInnerTexts(sense, 'pos'),
          misc: _trimmedInnerTexts(sense, 'misc'),
          dialects: _trimmedInnerTexts(sense, 'dial'),
          info: _innerTexts(sense, 's_inf'),
          languageSource: sense
              .findElements('lsource')
              .map(
                (e) => LanguageSource(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  // Per the JMdict DTD, `ls_type` is only present with the
                  // value `part` (partial description); when absent the
                  // implied value is `full`. The source word therefore fully
                  // describes the sense unless the attribute says `part`.
                  fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                  constructedFromSmallerWords:
                      e.getAttribute('ls_wasei') == 'y',
                ),
              )
              .toList(),
          glossary: sense
              .findElements('gloss')
              .map(
                (e) => Glossary(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  phrase: e.innerText,
                  type: e.getAttribute('g_type'),
                ),
              )
              .toList(),
          antonyms: sense
              .findElements('ant')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
          seeAlso: sense
              .findElements('xref')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
        ),
      );
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
|
||
|
|
||
|
/// Resolves a cross-reference to the entries it points at.
///
/// When [ref] names both a kanji form and a reading, only entries that have
/// BOTH are returned (set intersection); otherwise the entries matching the
/// single given form are returned. Candidates with fewer senses than
/// `ref.senseNum` are filtered out.
///
/// Fails on the null assertion if the referenced kanji or reading is not a
/// key of the corresponding lookup map.
Iterable<Entry> _resolveXRef(
  XRefParts ref,
  Map<String, Set<Entry>> byKanji,
  Map<String, Set<Entry>> byReading,
) {
  final kanjiRef = ref.kanjiRef;
  final readingRef = ref.readingRef;
  final senseNum = ref.senseNum;

  final Set<Entry> candidates;
  if (kanjiRef != null && readingRef != null) {
    candidates = byKanji[kanjiRef]!.intersection(byReading[readingRef]!);
  } else if (kanjiRef != null) {
    candidates = byKanji[kanjiRef]!;
  } else {
    candidates = byReading[readingRef]!;
  }

  return candidates.where(
    (candidate) => senseNum == null || senseNum <= candidate.senses.length,
  );
}

/// Writes [entries] to [db] in two batches.
///
/// Batch 1 inserts the entries together with their kanji elements, reading
/// elements, the per-element info/restriction rows and the `entryByKana` /
/// `entryByEnglish` lookup rows. Batch 2 inserts the senses with all their
/// attribute rows, and resolves `seeAlso` / `antonyms` cross-references to
/// concrete entries using in-memory lookup maps built between the batches.
///
/// Each batch is committed with `await` before the function continues.
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1');
  Batch b = db.batch();
  for (final e in entries) {
    b.insert(TableNames.entry, e.sqlValue);

    for (final k in e.kanji) {
      b.insert(TableNames.kanjiElement, {...k.sqlValue, 'entryId': e.id});
      // b.insert(
      //   TableNames.entryByKana,
      //   {'entryId': e.id, 'kana': transliterateKatakanaToHiragana(k.reading)},
      //   // Some entries have the same reading twice with difference in katakana and hiragana
      //   conflictAlgorithm: ConflictAlgorithm.ignore,
      // );
      for (final i in k.info) {
        b.insert(
          TableNames.kanjiInfo,
          {'entryId': e.id, 'reading': k.reading, 'info': i},
        );
      }
    }

    for (final r in e.readings) {
      b.insert(TableNames.readingElement, {...r.sqlValue, 'entryId': e.id});

      b.insert(
        TableNames.entryByKana,
        {'entryId': e.id, 'kana': transliterateKanaToLatin(r.reading)},
        // Some entries have the same reading twice with difference in
        // katakana and hiragana.
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
      for (final i in r.info) {
        b.insert(
          TableNames.readingInfo,
          {'entryId': e.id, 'reading': r.reading, 'info': i},
        );
      }
      for (final res in r.restrictions) {
        b.insert(
          TableNames.readingRestriction,
          {'entryId': e.id, 'reading': r.reading, 'restriction': res},
        );
      }
    }

    for (final s in e.senses) {
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.entryByEnglish,
            {'entryId': e.id, 'english': g.phrase},
            // Conflicting rows are skipped rather than failing the batch;
            // presumably the same phrase can occur in several senses of one
            // entry — confirm against the table's uniqueness constraint.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }
    }
  }

  await b.commit();

  print(' [JMdict] Building trees');
  // Lookup maps from a kanji form / reading to every entry containing it,
  // used below to resolve cross-references.
  final entriesByKanji = SplayTreeMap<String, Set<Entry>>();
  final entriesByReading = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final kanji in entry.kanji) {
      entriesByKanji.putIfAbsent(kanji.reading, () => <Entry>{}).add(entry);
    }
    for (final reading in entry.readings) {
      entriesByReading
          .putIfAbsent(reading.reading, () => <Entry>{})
          .add(entry);
    }
  }

  print(' [JMdict] Batch 2');
  b = db.batch();

  for (final e in entries) {
    for (final s in e.senses) {
      b.insert(
        TableNames.sense,
        {...s.sqlValue, 'id': s.id, 'entryId': e.id},
      );

      for (final d in s.dialects) {
        b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
      }
      for (final f in s.fields) {
        b.insert(TableNames.senseField, {'senseId': s.id, 'field': f});
      }
      for (final i in s.info) {
        b.insert(TableNames.senseInfo, {'senseId': s.id, 'info': i});
      }
      for (final m in s.misc) {
        b.insert(TableNames.senseMisc, {'senseId': s.id, 'misc': m});
      }
      for (final p in s.pos) {
        b.insert(TableNames.sensePOS, {'senseId': s.id, 'pos': p});
      }
      // Each language source is inserted exactly once per sense.
      for (final l in s.languageSource) {
        b.insert(
          TableNames.senseLanguageSource,
          {...l.sqlValue, 'senseId': s.id},
        );
      }
      for (final rk in s.restrictedToKanji) {
        b.insert(
          TableNames.senseRestrictedToKanji,
          {'entryId': e.id, 'senseId': s.id, 'kanji': rk},
        );
      }
      for (final rr in s.restrictedToReading) {
        b.insert(
          TableNames.senseRestrictedToReading,
          {'entryId': e.id, 'senseId': s.id, 'reading': rr},
        );
      }
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.senseGlossary,
            {...g.sqlValue, 'senseId': s.id},
            // There are some duplicate glossary, especially in
            // the other languages.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }

      for (final xref in s.seeAlso) {
        final targets = _resolveXRef(xref, entriesByKanji, entriesByReading);
        for (final target in targets) {
          b.insert(
            TableNames.senseSeeAlso,
            {
              'senseId': s.id,
              'xrefEntryId': target.id,
              'seeAlsoKanji': xref.kanjiRef,
              'seeAlsoReading': xref.readingRef,
              'seeAlsoSense': xref.senseNum,
            },
          );
        }
      }

      for (final ant in s.antonyms) {
        final targets = _resolveXRef(ant, entriesByKanji, entriesByReading);
        for (final target in targets) {
          b.insert(TableNames.senseAntonyms, {
            'senseId': s.id,
            'xrefEntryId': target.id,
            'antonymKanji': ant.kanjiRef,
            'antonymReading': ant.readingRef,
            'antonymSense': ant.senseNum,
          });
        }
      }
    }
  }

  await b.commit();
}
|
||
|
|
||
|
/// Loads `data/JMdict.xml`, converts it to entries and stores them in [db].
///
/// The file is read synchronously and parsed as a whole; the `JMdict`
/// element of the document is handed to [transformXML], and the resulting
/// entries are written with [insertIntoDB]. Progress is printed before each
/// stage.
///
/// Fails on the null assertion if the document has no `JMdict` element.
Future<void> addDataFromJMdict(Database db) async {
  print('[JMdict] Reading file...');
  final xmlSource = File('data/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML...');
  final document = XmlDocument.parse(xmlSource);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Transforming data...');
  final parsedEntries = transformXML(jmdictRoot);

  print('[JMdict] Writing to database...');
  await insertIntoDB(parsedEntries, db);
}
|