data_ingestion: fix kanji grade, jlpt, newspaper rank ingestion

This commit is contained in:
2025-04-19 01:18:10 +02:00
parent ab3a5a92db
commit 23f90c1127

View File

@@ -10,40 +10,51 @@ List<Character> transformXML(XmlElement root) {
final List<Character> result = [];
for (final c in root.findElements('character')) {
final kanji = c.findElements('literal').first.innerText;
final codepoint = c.findElements('codepoint').firstOrNull;
final radical = c.findElements('radical').firstOrNull;
final misc = c.findElements('misc').first;
final dic_number = c.findElements('dic_number').firstOrNull;
result.add(
Character(
literal: kanji,
strokeCount:
int.parse(c.findAllElements('stroke_count').first.innerText),
grade:
int.tryParse(c.findElements('grade').firstOrNull?.innerText ?? ''),
frequency:
int.tryParse(c.findElements('freq').firstOrNull?.innerText ?? ''),
int.parse(misc.findElements('stroke_count').first.innerText),
grade: int.tryParse(
misc.findElements('grade').firstOrNull?.innerText ?? ''),
frequency: int.tryParse(
misc.findElements('freq').firstOrNull?.innerText ?? ''),
jlpt: int.tryParse(
c.findElements('rad_name').firstOrNull?.innerText ?? '',
misc.findElements('jlpt').firstOrNull?.innerText ?? '',
),
radicalName:
c.findElements('rad_name').map((e) => e.innerText).toList(),
codepoints: c
.findAllElements('cp_value')
.map(
(e) => CodePoint(
kanji: kanji,
type: e.getAttribute('cp_type')!,
codepoint: e.innerText,
),
)
.toList(),
radicals: c
.findAllElements('rad_value')
.map(
(e) => Radical(
kanji: kanji,
type: e.getAttribute('rad_type')!,
radical: e.innerText,
),
)
.toList(),
radicalName: radical
?.findElements('rad_name')
.map((e) => e.innerText)
.toList() ??
[],
codepoints: codepoint
?.findElements('cp_value')
.map(
(e) => CodePoint(
kanji: kanji,
type: e.getAttribute('cp_type')!,
codepoint: e.innerText,
),
)
.toList() ??
[],
radicals: radical
?.findElements('rad_value')
.map(
(e) => Radical(
kanji: kanji,
type: e.getAttribute('rad_type')!,
radical: e.innerText,
),
)
.toList() ??
[],
strokeMiscounts: c
.findAllElements('stroke_count')
.skip(1)
@@ -59,29 +70,31 @@ List<Character> transformXML(XmlElement root) {
),
)
.toList(),
dictionaryReferences: c
.findAllElements('dic_ref')
.where((e) => e.getAttribute('dr_type') != 'moro')
.map(
(e) => DictionaryReference(
kanji: kanji,
type: e.getAttribute('dr_type')!,
ref: e.innerText,
),
)
.toList(),
dictionaryReferencesMoro: c
.findAllElements('dic_ref')
.where((e) => e.getAttribute('dr_type') == 'moro')
.map(
(e) => DictionaryReferenceMoro(
kanji: kanji,
ref: e.innerText,
page: int.tryParse(e.getAttribute('m_page') ?? ''),
volume: int.tryParse(e.getAttribute('m_vol') ?? ''),
),
)
.toList(),
dictionaryReferences: dic_number
?.findElements('dic_ref')
.where((e) => e.getAttribute('dr_type') != 'moro')
.map(
(e) => DictionaryReference(
kanji: kanji,
type: e.getAttribute('dr_type')!,
ref: e.innerText,
),
)
.toList() ??
[],
dictionaryReferencesMoro: dic_number
?.findElements('dic_ref')
.where((e) => e.getAttribute('dr_type') == 'moro')
.map(
(e) => DictionaryReferenceMoro(
kanji: kanji,
ref: e.innerText,
page: int.tryParse(e.getAttribute('m_page') ?? ''),
volume: int.tryParse(e.getAttribute('m_vol') ?? ''),
),
)
.toList() ??
[],
querycodes: c
.findAllElements('q_code')
.map(