import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:xml/xml.dart';

/// Parses the priority values from an `r_ele` or `k_ele` XML element.
///
/// Every `'${prefix}_pri'` child of [e] is inspected (so [prefix] is `'ke'`
/// for kanji elements and `'re'` for reading elements). The text of each
/// child is matched against the prefixes `news`, `ichi`, `spec`, `gai` and
/// `nf`, and the remainder of the text is parsed as an integer.
///
/// Returns a five-element list in the fixed order
/// `[news, ichi, spec, gai, nf]`. A slot is `null` when no child carried that
/// prefix; if several children carry the same prefix, the last one wins.
///
/// Throws a [FormatException] if the text following a recognised prefix is
/// not a valid integer.
///
/// Source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  int? news, ichi, spec, gai, nf;

  for (final pri in e.findElements('${prefix}_pri')) {
    final txt = pri.innerText;
    if (txt.startsWith('news')) {
      news = int.parse(txt.substring(4));
    } else if (txt.startsWith('ichi')) {
      ichi = int.parse(txt.substring(4));
    } else if (txt.startsWith('spec')) {
      spec = int.parse(txt.substring(4));
    } else if (txt.startsWith('gai')) {
      gai = int.parse(txt.substring(3));
    } else if (txt.startsWith('nf')) {
      nf = int.parse(txt.substring(2));
    }
  }

  return [news, ichi, spec, gai, nf];
}

/// Splits a cross-reference string into its kanji, reading and sense parts.
///
/// [s] is split on `'・'` and interpreted by the number of parts:
///
/// * one part: a reading reference if it matches [kanaRegex], otherwise a
///   kanji reference;
/// * two parts: if the second part is an integer it is the sense number of
///   the first part (classified as above); otherwise the parts are the kanji
///   and the reading;
/// * three parts: kanji, reading and sense number, in that order;
/// * any other number of parts: an empty [XRefParts].
///
/// Throws a [FormatException] if the third part of a three-part reference is
/// not a valid integer.
///
/// Source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
  final parts = s.split('・');

  return switch (parts) {
    [final word] => word.contains(kanaRegex)
        ? XRefParts(readingRef: word)
        : XRefParts(kanjiRef: word),
    [final word, final second] => switch (int.tryParse(second)) {
        // The second part is a sense number, so the first part is a lone
        // word that still has to be classified as kanji or reading.
        final senseNum? => word.contains(kanaRegex)
            ? XRefParts(readingRef: word, senseOrderNum: senseNum)
            : XRefParts(kanjiRef: word, senseOrderNum: senseNum),
        null => XRefParts(kanjiRef: word, readingRef: second),
      },
    [final kanji, final reading, final senseNum] => XRefParts(
        kanjiRef: kanji,
        readingRef: reading,
        senseOrderNum: int.parse(senseNum),
      ),
    _ => XRefParts(),
  };
}

/// Returns the inner text of [e] without its first and last characters.
///
/// Presumably this strips the `&` and `;` of an unexpanded entity reference
/// such as `&n;` (TODO confirm against how the document is loaded).
///
/// Throws a [RangeError] if the inner text is empty.
String _stripDelimiters(XmlElement e) {
  final text = e.innerText;
  return text.substring(1, text.length - 1);
}

/// Collects the delimiter-stripped inner text of every [tag] child of
/// [parent].
List<String> _strippedTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map(_stripDelimiters).toList();

/// Collects the inner text of every [tag] child of [parent].
List<String> _innerTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((e) => e.innerText).toList();

/// Builds a [KanjiElement] from a `k_ele` element.
KanjiElement _parseKanjiElement(XmlElement kEle, {required int orderNum}) {
  final pri = getPriorityValues(kEle, 'ke');
  return KanjiElement(
    orderNum: orderNum,
    info: _strippedTexts(kEle, 'ke_inf'),
    reading: kEle.findElements('keb').first.innerText,
    news: pri[0],
    ichi: pri[1],
    spec: pri[2],
    gai: pri[3],
    nf: pri[4],
  );
}

/// Builds a [ReadingElement] from an `r_ele` element.
ReadingElement _parseReadingElement(XmlElement rEle, {required int orderNum}) {
  final pri = getPriorityValues(rEle, 're');
  return ReadingElement(
    orderNum: orderNum,
    // The mere presence of a `re_nokanji` child sets the flag.
    readingDoesNotMatchKanji: rEle.findElements('re_nokanji').isNotEmpty,
    info: _strippedTexts(rEle, 're_inf'),
    restrictions: _innerTexts(rEle, 're_restr'),
    reading: rEle.findElements('reb').first.innerText,
    news: pri[0],
    ichi: pri[1],
    spec: pri[2],
    gai: pri[3],
    nf: pri[4],
  );
}

/// Builds a [LanguageSource] from an `lsource` element.
LanguageSource _parseLanguageSource(XmlElement e) {
  final text = e.innerText;
  return LanguageSource(
    language: e.getAttribute('xml:lang') ?? 'eng',
    phrase: text.isNotEmpty ? text : null,
    // Per the JMdict DTD an absent `ls_type` implies "full"; only the value
    // "part" marks a partial description of the source word.
    fullyDescribesSense: e.getAttribute('ls_type') != 'part',
    constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
  );
}

/// Builds a [Sense] from a `sense` element.
Sense _parseSense(
  XmlElement sense, {
  required int senseId,
  required int orderNum,
}) {
  return Sense(
    senseId: senseId,
    orderNum: orderNum,
    restrictedToKanji: _innerTexts(sense, 'stagk'),
    restrictedToReading: _innerTexts(sense, 'stagr'),
    pos: _strippedTexts(sense, 'pos'),
    misc: _strippedTexts(sense, 'misc'),
    dialects: _strippedTexts(sense, 'dial'),
    info: _innerTexts(sense, 's_inf'),
    languageSource:
        sense.findElements('lsource').map(_parseLanguageSource).toList(),
    glossary: sense
        .findElements('gloss')
        .map(
          (e) => Glossary(
            language: e.getAttribute('xml:lang') ?? 'eng',
            phrase: e.innerText,
            type: e.getAttribute('g_type'),
          ),
        )
        .toList(),
    antonyms: sense
        .findElements('ant')
        .map((e) => parseXrefParts(e.innerText))
        .toList(),
    seeAlso: sense
        .findElements('xref')
        .map((e) => parseXrefParts(e.innerText))
        .toList(),
  );
}

/// Parses every child element of [root] into an [Entry].
///
/// Each child must contain an `ent_seq` element holding the integer entry id.
/// Its `k_ele`, `r_ele` and `sense` children become the entry's kanji
/// elements, reading elements and senses, each numbered from 1 in document
/// order.
///
/// Sense ids come from a single counter that runs across all entries and
/// starts at 1. A sense that reports itself as empty is left out of its entry
/// and a warning is printed, but it still consumes an id.
///
/// Throws a [StateError] if a required child (`ent_seq`, `keb`, `reb`) is
/// missing, and a [FormatException] if a numeric field cannot be parsed.
List<Entry> parseJMDictData(XmlElement root) {
  final entries = <Entry>[];
  var senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final kanjiEls = [
      for (final (index, kEle) in entry.findElements('k_ele').indexed)
        _parseKanjiElement(kEle, orderNum: index + 1),
    ];
    final readingEls = [
      for (final (index, rEle) in entry.findElements('r_ele').indexed)
        _parseReadingElement(rEle, orderNum: index + 1),
    ];

    final senses = <Sense>[];
    for (final (index, senseEl) in entry.findElements('sense').indexed) {
      // Incremented before the emptiness check, so skipped senses leave a gap.
      senseId++;
      final sense = _parseSense(
        senseEl,
        senseId: senseId,
        orderNum: index + 1,
      );

      if (sense.isEmpty) {
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(sense);
      }
    }

    entries.add(
      Entry(
        entryId: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}