202 lines
5.9 KiB
Dart
202 lines
5.9 KiB
Dart
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
|
import 'package:xml/xml.dart';
|
|
|
|
/// Reads the priority markers attached to a `k_ele` or `r_ele` element.
///
/// [prefix] selects which child tags are inspected: the children named
/// `${prefix}_pri` of [e] are read (the callers in this file pass `'ke'` and
/// `'re'`). The text of each such tag is matched against the marker names
/// `news`, `ichi`, `spec`, `gai` and `nf`, in that order, and the remainder
/// of the text is parsed as an integer.
///
/// Returns a five-element list ordered `[news, ichi, spec, gai, nf]`. A slot
/// is `null` when no tag carried that marker; when a marker occurs more than
/// once, the last occurrence wins. Tags with an unknown marker are ignored.
///
/// Throws a [FormatException] if the text following a recognized marker is
/// not a valid integer.
///
/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  // Slot order doubles as the match order and as the layout of the result.
  const markers = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final values = List<int?>.filled(markers.length, null, growable: true);

  for (final tag in e.findElements('${prefix}_pri')) {
    final text = tag.innerText;
    final slot = markers.indexWhere((marker) => text.startsWith(marker));
    if (slot < 0) {
      continue;
    }
    // Everything after the marker name is the numeric rank.
    values[slot] = int.parse(text.substring(markers[slot].length));
  }

  return values;
}
|
|
|
|
/// Splits the text of a cross-reference into its components.
///
/// The text [s] is split on `'・'` and interpreted by the number of parts:
///
/// * one part: a reading reference if it matches [kanaRegex], otherwise a
///   kanji reference;
/// * two parts: if the second part parses as an integer it is the sense
///   number and the first part is a reading or kanji reference (decided with
///   [kanaRegex] as above); otherwise the parts are kanji and reading;
/// * three parts: kanji, reading and sense number, in that order;
/// * anything else: an [XRefParts] with no fields set.
///
/// Throws a [FormatException] if the third of three parts is not an integer.
///
/// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
  // NOTE(review): a headword that itself contains '・' would be split here
  // like a separator - confirm whether such references occur in the data.
  final segments = s.split('・');

  switch (segments) {
    case [final only]:
      return only.contains(kanaRegex)
          ? XRefParts(readingRef: only)
          : XRefParts(kanjiRef: only);

    case [final head, final tail]:
      final senseNum = int.tryParse(tail);
      if (senseNum == null) {
        // Second part is not a number, so it is the reading.
        return XRefParts(kanjiRef: head, readingRef: tail);
      }
      if (head.contains(kanaRegex)) {
        return XRefParts(readingRef: head, senseOrderNum: senseNum);
      }
      return XRefParts(kanjiRef: head, senseOrderNum: senseNum);

    case [final kanji, final reading, final sense]:
      return XRefParts(
        kanjiRef: kanji,
        readingRef: reading,
        senseOrderNum: int.parse(sense),
      );

    default:
      return XRefParts();
  }
}
|
|
|
|
/// Converts a parsed JMdict document into a list of [Entry] objects.
///
/// Every child element of [root] is treated as one dictionary entry, from
/// which the `ent_seq`, `k_ele`, `r_ele` and `sense` children are read.
/// Kanji elements, reading elements and senses are numbered from 1 in
/// document order within their entry.
///
/// Sense ids come from a single counter that runs across all entries,
/// starting at 1. The counter also advances for senses that turn out to be
/// empty; those are reported with a printed warning and left out of the
/// result, so the ids of the returned senses may have gaps.
///
/// Throws a [StateError] if an entry has no `ent_seq`, a `k_ele` has no
/// `keb`, or an `r_ele` has no `reb`, and a [FormatException] if `ent_seq`
/// (or a priority / cross-reference number) is not a valid integer.
List<Entry> parseJMDictData(XmlElement root) {
  // Drops the first and last character of the element's text.
  // NOTE(review): presumably these are the '&' and ';' of an unexpanded
  // entity reference - confirm against how the XML document is loaded.
  String stripDelimiters(XmlElement e) {
    final text = e.innerText;
    return text.substring(1, text.length - 1);
  }

  final List<Entry> entries = [];

  // Running id shared by the senses of all entries.
  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final List<KanjiElement> kanjiEls = [];
    final List<ReadingElement> readingEls = [];
    final List<Sense> senses = [];

    for (final (kanjiNum, kEle) in entry.findElements('k_ele').indexed) {
      final kePri = getPriorityValues(kEle, 'ke');
      kanjiEls.add(
        KanjiElement(
          orderNum: kanjiNum + 1,
          info: kEle.findElements('ke_inf').map(stripDelimiters).toList(),
          reading: kEle.findElements('keb').first.innerText,
          news: kePri[0],
          ichi: kePri[1],
          spec: kePri[2],
          gai: kePri[3],
          nf: kePri[4],
        ),
      );
    }

    for (final (orderNum, rEle) in entry.findElements('r_ele').indexed) {
      final rePri = getPriorityValues(rEle, 're');
      // The mere presence of the tag is the signal; its content is not read.
      final readingDoesNotMatchKanji =
          rEle.findElements('re_nokanji').isNotEmpty;
      readingEls.add(
        ReadingElement(
          orderNum: orderNum + 1,
          readingDoesNotMatchKanji: readingDoesNotMatchKanji,
          info: rEle.findElements('re_inf').map(stripDelimiters).toList(),
          restrictions:
              rEle.findElements('re_restr').map((e) => e.innerText).toList(),
          reading: rEle.findElements('reb').first.innerText,
          news: rePri[0],
          ichi: rePri[1],
          spec: rePri[2],
          gai: rePri[3],
          nf: rePri[4],
        ),
      );
    }

    for (final (orderNum, sense) in entry.findElements('sense').indexed) {
      // Advance before the emptiness check so ids stay tied to the position
      // of the sense in the document, whether or not it is kept.
      senseId++;
      final result = Sense(
        id: senseId,
        orderNum: orderNum + 1,
        restrictedToKanji:
            sense.findElements('stagk').map((e) => e.innerText).toList(),
        restrictedToReading:
            sense.findElements('stagr').map((e) => e.innerText).toList(),
        pos: sense.findElements('pos').map(stripDelimiters).toList(),
        misc: sense.findElements('misc').map(stripDelimiters).toList(),
        dialects: sense.findElements('dial').map(stripDelimiters).toList(),
        info: sense.findElements('s_inf').map((e) => e.innerText).toList(),
        languageSource: sense
            .findElements('lsource')
            .map(
              (e) => LanguageSource(
                language: e.getAttribute('xml:lang') ?? 'eng',
                // `ls_type` is either absent (implied "full") or "part", so
                // the source word is fully described unless it says "part".
                fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
              ),
            )
            .toList(),
        glossary: sense
            .findElements('gloss')
            .map(
              (e) => Glossary(
                language: e.getAttribute('xml:lang') ?? 'eng',
                phrase: e.innerText,
                type: e.getAttribute('g_type'),
              ),
            )
            .toList(),
        antonyms: sense
            .findElements('ant')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
        seeAlso: sense
            .findElements('xref')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
      );

      if (result.isEmpty) {
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(result);
      }
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
|