jadb/bin/kanjidic/parser.dart

232 lines
6.9 KiB
Dart
Raw Normal View History

2022-06-20 20:06:07 +02:00
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import 'package:collection/collection.dart';
import 'objects.dart';
/// Converts the parsed KANJIDIC2 document [root] into a list of [Character]s.
///
/// One [Character] is produced for every `<character>` element that is a
/// direct child of [root]. Optional numeric fields (`grade`, `freq`, `jlpt`)
/// become `null` when the element is absent or not an integer.
///
/// Throws a [StateError] if a character has no `<literal>` or no
/// `<stroke_count>`, and a [FormatException] if a stroke count is not an
/// integer. The `!` assertions on type attributes throw if such an attribute
/// is missing.
List<Character> transformXML(XmlElement root) {
  final List<Character> result = [];

  for (final c in root.findElements('character')) {
    final kanji = c.findElements('literal').first.innerText;

    // `grade`, `freq`, `jlpt` and `rad_name` are nested inside `<misc>` in
    // KANJIDIC2, so they must be looked up among all descendants; a
    // direct-children search on `<character>` never finds them.
    int? firstInt(String tag) =>
        int.tryParse(c.findAllElements(tag).firstOrNull?.innerText ?? '');

    // The first stroke count is the accepted one; any further values are
    // stored as miscounts.
    final strokeCounts = c
        .findAllElements('stroke_count')
        .map((e) => int.parse(e.innerText))
        .toList();

    // Scanned once, then split by type below.
    final dicRefs = c.findAllElements('dic_ref').toList();
    final readingElements = c.findAllElements('reading').toList();

    result.add(
      Character(
        literal: kanji,
        strokeCount: strokeCounts.first,
        grade: firstInt('grade'),
        frequency: firstInt('freq'),
        // Previously read from `rad_name`, which never parses as an integer.
        jlpt: firstInt('jlpt'),
        radicalName:
            c.findAllElements('rad_name').map((e) => e.innerText).toList(),
        codepoints: c
            .findAllElements('cp_value')
            .map(
              (e) => CodePoint(
                kanji: kanji,
                type: e.getAttribute('cp_type')!,
                codepoint: e.innerText,
              ),
            )
            .toList(),
        radicals: c
            .findAllElements('rad_value')
            .map(
              (e) => Radical(
                kanji: kanji,
                type: e.getAttribute('rad_type')!,
                radical: e.innerText,
              ),
            )
            .toList(),
        strokeMiscounts: strokeCounts.skip(1).toList(),
        variants: c
            .findAllElements('variant')
            .map(
              (e) => Variant(
                kanji: kanji,
                type: e.getAttribute('var_type')!,
                variant: e.innerText,
              ),
            )
            .toList(),
        // `moro` references carry extra volume/page attributes and are
        // therefore kept in a separate list.
        dictionaryReferences: dicRefs
            .where((e) => e.getAttribute('dr_type') != 'moro')
            .map(
              (e) => DictionaryReference(
                kanji: kanji,
                type: e.getAttribute('dr_type')!,
                ref: e.innerText,
              ),
            )
            .toList(),
        dictionaryReferencesMoro: dicRefs
            .where((e) => e.getAttribute('dr_type') == 'moro')
            .map(
              (e) => DictionaryReferenceMoro(
                kanji: kanji,
                ref: e.innerText,
                page: int.tryParse(e.getAttribute('m_page') ?? ''),
                volume: int.tryParse(e.getAttribute('m_vol') ?? ''),
              ),
            )
            .toList(),
        querycodes: c
            .findAllElements('q_code')
            .map(
              (e) => QueryCode(
                kanji: kanji,
                code: e.innerText,
                type: e.getAttribute('qc_type')!,
                skipMisclassification: e.getAttribute('skip_misclass'),
              ),
            )
            .toList(),
        // Every reading that is neither on'yomi nor kun'yomi.
        readings: readingElements
            .where(
              (e) => !['ja_on', 'ja_kun'].contains(e.getAttribute('r_type')),
            )
            .map(
              (e) => Reading(
                kanji: kanji,
                type: e.getAttribute('r_type')!,
                reading: e.innerText,
              ),
            )
            .toList(),
        kunyomi: readingElements
            .where((e) => e.getAttribute('r_type') == 'ja_kun')
            .map(
              (e) => Kunyomi(
                kanji: kanji,
                yomi: e.innerText,
                isJouyou: e.getAttribute('r_status') == 'jy',
              ),
            )
            .toList(),
        onyomi: readingElements
            .where((e) => e.getAttribute('r_type') == 'ja_on')
            .map(
              (e) => Onyomi(
                kanji: kanji,
                yomi: e.innerText,
                isJouyou: e.getAttribute('r_status') == 'jy',
                type: e.getAttribute('on_type'),
              ),
            )
            .toList(),
        meanings: c
            .findAllElements('meaning')
            .map(
              (e) => Meaning(
                kanji: kanji,
                // A meaning without `m_lang` is treated as English.
                language: e.getAttribute('m_lang') ?? 'eng',
                meaning: e.innerText,
              ),
            )
            .toList(),
        nanori: c.findAllElements('nanori').map((e) => e.innerText).toList(),
      ),
    );
  }

  return result;
}
/// Writes all [characters] and their related rows into [db].
///
/// Every insert is queued on a single batch, which is committed once at the
/// end. The returned future completes only after the commit has finished, so
/// commit errors reach the caller.
Future<void> insertIntoDB(List<Character> characters, Database db) async {
  final b = db.batch();

  for (final c in characters) {
    b.insert(TableNames.character, c.sqlValue);

    for (final n in c.radicalName) {
      b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
    }
    for (final cp in c.codepoints) {
      b.insert(TableNames.codepoint, cp.sqlValue);
    }
    for (final r in c.radicals) {
      b.insert(TableNames.radical, r.sqlValue);
    }
    for (final sm in c.strokeMiscounts) {
      b.insert(
        TableNames.strokeMiscount,
        {
          'kanji': c.literal,
          'strokeCount': sm,
        },
      );
    }
    for (final v in c.variants) {
      b.insert(TableNames.variant, v.sqlValue);
    }
    for (final dr in c.dictionaryReferences) {
      // There are duplicate entries here, so conflicting rows are ignored
      // instead of aborting the batch.
      b.insert(
        TableNames.dictionaryReference,
        dr.sqlValue,
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
    for (final drm in c.dictionaryReferencesMoro) {
      b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
    }
    for (final q in c.querycodes) {
      b.insert(TableNames.queryCode, q.sqlValue);
    }
    for (final r in c.readings) {
      b.insert(TableNames.reading, r.sqlValue);
    }
    for (final k in c.kunyomi) {
      b.insert(TableNames.kunyomi, k.sqlValue);
    }
    for (final o in c.onyomi) {
      b.insert(TableNames.onyomi, o.sqlValue);
    }
    for (final m in c.meanings) {
      b.insert(TableNames.meaning, m.sqlValue);
    }
    for (final n in c.nanori) {
      b.insert(
        TableNames.nanori,
        {
          'kanji': c.literal,
          'nanori': n,
        },
      );
    }
  }

  // Await the commit so the write has finished (and any failure has been
  // thrown) before this function returns. The per-statement results are not
  // used, so they are not collected.
  await b.commit(noResult: true);
}
/// Reads `data/kanjidic2.xml`, transforms it and stores the result in [db].
///
/// Each stage (read, parse, transform, write) is announced with `print`.
/// The file is read synchronously; the path is relative to the current
/// working directory.
Future<void> addDataFromKANJIDIC(Database db) async {
  print('[KANJIDIC2] Reading file...');
  final xmlSource = File('data/kanjidic2.xml').readAsStringSync();

  print('[KANJIDIC2] Parsing XML...');
  final document = XmlDocument.parse(xmlSource);
  // The `!` asserts that the document has a `kanjidic2` root element.
  final kanjidicRoot = document.getElement('kanjidic2')!;

  print('[KANJIDIC2] Transforming data...');
  final characters = transformXML(kanjidicRoot);

  print('[KANJIDIC2] Writing to database...');
  await insertIntoDB(characters, db);
}