Files
jadb/lib/_data_ingestion/kanjidic/xml_parser.dart

176 lines
5.7 KiB
Dart

import 'package:jadb/_data_ingestion/kanjidic/objects.dart';
import 'package:jadb/util/romaji_transliteration.dart';
import 'package:xml/xml.dart';
/// Converts every direct `<character>` child of [root] into a [Character].
///
/// Entries are returned in document order. For each entry:
///
/// * `<literal>` and `<misc>` (containing at least one `<stroke_count>`) are
///   required; a [StateError] is thrown when one of them is missing.
/// * `<codepoint>`, `<radical>`, `<dic_number>`, `<query_code>` and
///   `<reading_meaning>` are optional. When absent, the corresponding lists
///   are empty and the radical is `null`.
/// * The first `<stroke_count>` is the accepted stroke count; any further
///   ones are collected as stroke miscounts.
/// * Only the `rad_value` whose `rad_type` is `classical` is kept as the
///   radical.
/// * `dic_ref` elements with `dr_type="moro"` are stored separately from the
///   other dictionary references, together with their `m_vol` / `m_page`
///   attributes.
/// * `reading` elements are split by `r_type` into on'yomi (`ja_on`),
///   kun'yomi (`ja_kun`) and all remaining readings.
///
/// Throws a [FormatException] if a stroke count or classical radical value is
/// not an integer. A missing `cp_type`, `var_type`, `dr_type`, `qc_type` or
/// `r_type` attribute on an element that is being converted fails the
/// corresponding null assertion.
List<Character> parseKANJIDICData(XmlElement root) {
  final List<Character> result = [];

  for (final c in root.findElements('character')) {
    final kanji = c.findElements('literal').first.innerText;
    final codepoint = c.findElements('codepoint').firstOrNull;
    final radical = c.findElements('radical').firstOrNull;
    final misc = c.findElements('misc').first;
    final dicNumber = c.findElements('dic_number').firstOrNull;
    // `query_code` is optional in the KANJIDIC2 DTD, so it is looked up the
    // same way as the other optional sections instead of with `.first`.
    final queryCode = c.findElements('query_code').firstOrNull;
    final readingMeaning = c.findElements('reading_meaning').firstOrNull;

    // Parse all stroke counts once: the first is the accepted count, the
    // rest are known miscounts. `.first` below still throws a StateError when
    // the entry has no `<stroke_count>` at all.
    final strokeCounts = misc
        .findElements('stroke_count')
        .map((e) => int.parse(e.innerText))
        .toList();

    // Collect the `<reading>` descendants once; they are partitioned by
    // `r_type` three times below.
    final readingElements =
        readingMeaning?.findAllElements('reading').toList() ??
        const <XmlElement>[];

    // TODO: Group readings and meanings by their rmgroup parent node.
    result.add(
      Character(
        literal: kanji,
        strokeCount: strokeCounts.first,
        // `int.tryParse('')` yields null, so a missing element maps to null.
        grade: int.tryParse(
          misc.findElements('grade').firstOrNull?.innerText ?? '',
        ),
        frequency: int.tryParse(
          misc.findElements('freq').firstOrNull?.innerText ?? '',
        ),
        jlpt: int.tryParse(
          misc.findElements('jlpt').firstOrNull?.innerText ?? '',
        ),
        radicalName: misc
            .findElements('rad_name')
            .map((e) => e.innerText)
            .toList(),
        codepoints:
            codepoint
                ?.findElements('cp_value')
                .map(
                  (e) => CodePoint(
                    kanji: kanji,
                    type: e.getAttribute('cp_type')!,
                    codepoint: e.innerText,
                  ),
                )
                .toList() ??
            [],
        radical: radical
            ?.findElements('rad_value')
            .where((e) => e.getAttribute('rad_type') == 'classical')
            .map(
              (e) => Radical(kanji: kanji, radicalId: int.parse(e.innerText)),
            )
            .firstOrNull,
        strokeMiscounts: strokeCounts.skip(1).toList(),
        variants: misc
            .findElements('variant')
            .map(
              (e) => Variant(
                kanji: kanji,
                type: e.getAttribute('var_type')!,
                variant: e.innerText,
              ),
            )
            .toList(),
        dictionaryReferences:
            dicNumber
                ?.findElements('dic_ref')
                .where((e) => e.getAttribute('dr_type') != 'moro')
                .map(
                  (e) => DictionaryReference(
                    kanji: kanji,
                    type: e.getAttribute('dr_type')!,
                    ref: e.innerText,
                  ),
                )
                .toList() ??
            [],
        dictionaryReferencesMoro:
            dicNumber
                ?.findElements('dic_ref')
                .where((e) => e.getAttribute('dr_type') == 'moro')
                .map(
                  (e) => DictionaryReferenceMoro(
                    kanji: kanji,
                    ref: e.innerText,
                    page: int.tryParse(e.getAttribute('m_page') ?? ''),
                    volume: int.tryParse(e.getAttribute('m_vol') ?? ''),
                  ),
                )
                .toList() ??
            [],
        querycodes:
            queryCode
                ?.findElements('q_code')
                .map(
                  (e) => QueryCode(
                    kanji: kanji,
                    code: e.innerText,
                    type: e.getAttribute('qc_type')!,
                    skipMisclassification: e.getAttribute('skip_misclass'),
                  ),
                )
                .toList() ??
            [],
        readings: readingElements
            .where(
              (e) => !['ja_on', 'ja_kun'].contains(e.getAttribute('r_type')),
            )
            .map(
              (e) => Reading(
                kanji: kanji,
                type: e.getAttribute('r_type')!,
                reading: e.innerText,
              ),
            )
            .toList(),
        kunyomi: readingElements
            .where((e) => e.getAttribute('r_type') == 'ja_kun')
            .map(
              (e) => Kunyomi(
                kanji: kanji,
                yomi: e.innerText,
                isJouyou: e.getAttribute('r_status') == 'jy',
              ),
            )
            .toList(),
        onyomi: readingElements
            .where((e) => e.getAttribute('r_type') == 'ja_on')
            .map(
              (e) => Onyomi(
                kanji: kanji,
                // The element text is run through the katakana-to-hiragana
                // helper before being stored.
                yomi: transliterateKatakanaToHiragana(e.innerText),
                isJouyou: e.getAttribute('r_status') == 'jy',
                type: e.getAttribute('on_type'),
              ),
            )
            .toList(),
        meanings:
            readingMeaning
                ?.findAllElements('meaning')
                .map(
                  (e) => Meaning(
                    kanji: kanji,
                    // Meanings without an `m_lang` attribute are stored as
                    // 'eng'.
                    language: e.getAttribute('m_lang') ?? 'eng',
                    meaning: e.innerText,
                  ),
                )
                .toList() ??
            [],
        nanori:
            readingMeaning
                ?.findElements('nanori')
                .map((e) => e.innerText)
                .toList() ??
            [],
      ),
    );
  }

  return result;
}