lib/_data_ingestion: split parsing and seeding steps
This commit is contained in:
@@ -4,9 +4,7 @@ import 'dart:io';
|
||||
|
||||
import 'package:args/args.dart';
|
||||
import 'package:args/command_runner.dart';
|
||||
import 'package:jadb/_data_ingestion/jmdict/parser.dart';
|
||||
import 'package:jadb/_data_ingestion/kanjidic/parser.dart';
|
||||
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
|
||||
import 'package:jadb/_data_ingestion/seed_database.dart';
|
||||
import 'package:jadb/search.dart';
|
||||
import 'package:sqflite_common_ffi/sqflite_ffi.dart';
|
||||
import 'package:sqlite3/open.dart';
|
||||
@@ -58,9 +56,13 @@ class CreateDb extends Command {
|
||||
),
|
||||
);
|
||||
|
||||
await addDataFromJMdict(db);
|
||||
await addDataFromRADKFILE(db);
|
||||
await addDataFromKANJIDIC(db);
|
||||
seedData(db).then((_) {
|
||||
print("Database created successfully");
|
||||
}).catchError((error) {
|
||||
print("Error creating database: $error");
|
||||
}).whenComplete(() {
|
||||
db.close();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,80 +1,9 @@
|
||||
import 'dart:collection';
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:collection/collection.dart';
|
||||
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
||||
import 'package:jadb/util/romaji_transliteration.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
import 'package:xml/xml.dart';
|
||||
|
||||
import 'objects.dart';
|
||||
|
||||
/// parse priority values from r_ele and k_ele xml elements
|
||||
///
|
||||
/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
|
||||
List<int?> getPriorityValues(XmlElement e, String prefix) {
|
||||
int? news, ichi, spec, gai, nf;
|
||||
for (final pri in e.findElements('${prefix}_pri')) {
|
||||
final txt = pri.innerText;
|
||||
if (txt.startsWith('news'))
|
||||
news = int.parse(txt.substring(4));
|
||||
else if (txt.startsWith('ichi'))
|
||||
ichi = int.parse(txt.substring(4));
|
||||
else if (txt.startsWith('spec'))
|
||||
spec = int.parse(txt.substring(4));
|
||||
else if (txt.startsWith('gai'))
|
||||
gai = int.parse(txt.substring(3));
|
||||
else if (txt.startsWith('nf')) nf = int.parse(txt.substring(2));
|
||||
}
|
||||
return [news, ichi, spec, gai, nf];
|
||||
}
|
||||
|
||||
// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
|
||||
XRefParts parseXrefParts(String s) {
|
||||
final parts = s.split('・');
|
||||
late final XRefParts result;
|
||||
switch (parts.length) {
|
||||
case 1:
|
||||
result = parts[0].contains(kanaRegex)
|
||||
? XRefParts(readingRef: parts[0])
|
||||
: XRefParts(kanjiRef: parts[0]);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
if (int.tryParse(parts[1]) != null) {
|
||||
if (parts[0].contains(kanaRegex)) {
|
||||
result = XRefParts(
|
||||
readingRef: parts[0],
|
||||
senseOrderNum: int.parse(parts[1]),
|
||||
);
|
||||
} else {
|
||||
result = XRefParts(
|
||||
kanjiRef: parts[0],
|
||||
senseOrderNum: int.parse(parts[1]),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
result = XRefParts(
|
||||
kanjiRef: parts[0],
|
||||
readingRef: parts[1],
|
||||
);
|
||||
}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
result = XRefParts(
|
||||
kanjiRef: parts[0],
|
||||
readingRef: parts[1],
|
||||
senseOrderNum: int.parse(parts[2]),
|
||||
);
|
||||
break;
|
||||
|
||||
default:
|
||||
result = XRefParts();
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
class ResolvedXref {
|
||||
Entry entry;
|
||||
@@ -139,138 +68,7 @@ ResolvedXref resolveXref(
|
||||
}
|
||||
}
|
||||
|
||||
List<Entry> parseXML(XmlElement root) {
|
||||
final List<Entry> entries = [];
|
||||
|
||||
int senseId = 0;
|
||||
|
||||
for (final entry in root.childElements) {
|
||||
final entryId = int.parse(entry.findElements('ent_seq').first.innerText);
|
||||
|
||||
final List<KanjiElement> kanjiEls = [];
|
||||
final List<ReadingElement> readingEls = [];
|
||||
final List<Sense> senses = [];
|
||||
|
||||
for (final (kanjiNum, k_ele) in entry.findElements('k_ele').indexed) {
|
||||
final ke_pri = getPriorityValues(k_ele, 'ke');
|
||||
kanjiEls.add(
|
||||
KanjiElement(
|
||||
orderNum: kanjiNum + 1,
|
||||
info: k_ele
|
||||
.findElements('ke_inf')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
reading: k_ele.findElements('keb').first.innerText,
|
||||
news: ke_pri[0],
|
||||
ichi: ke_pri[1],
|
||||
spec: ke_pri[2],
|
||||
gai: ke_pri[3],
|
||||
nf: ke_pri[4],
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
for (final (orderNum, r_ele) in entry.findElements('r_ele').indexed) {
|
||||
final re_pri = getPriorityValues(r_ele, 're');
|
||||
final readingDoesNotMatchKanji =
|
||||
r_ele.findElements('re_nokanji').isNotEmpty;
|
||||
readingEls.add(
|
||||
ReadingElement(
|
||||
orderNum: orderNum + 1,
|
||||
readingDoesNotMatchKanji: readingDoesNotMatchKanji,
|
||||
info: r_ele
|
||||
.findElements('re_inf')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
restrictions:
|
||||
r_ele.findElements('re_restr').map((e) => e.innerText).toList(),
|
||||
reading: r_ele.findElements('reb').first.innerText,
|
||||
news: re_pri[0],
|
||||
ichi: re_pri[1],
|
||||
spec: re_pri[2],
|
||||
gai: re_pri[3],
|
||||
nf: re_pri[4],
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
for (final (orderNum, sense) in entry.findElements('sense').indexed) {
|
||||
senseId++;
|
||||
final result = Sense(
|
||||
id: senseId,
|
||||
orderNum: orderNum + 1,
|
||||
restrictedToKanji:
|
||||
sense.findElements('stagk').map((e) => e.innerText).toList(),
|
||||
restrictedToReading:
|
||||
sense.findElements('stagr').map((e) => e.innerText).toList(),
|
||||
pos: sense
|
||||
.findElements('pos')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
misc: sense
|
||||
.findElements('misc')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
dialects: sense
|
||||
.findElements('dial')
|
||||
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
|
||||
.toList(),
|
||||
info: sense.findElements('s_inf').map((e) => e.innerText).toList(),
|
||||
languageSource: sense
|
||||
.findElements('lsource')
|
||||
.map(
|
||||
(e) => LanguageSource(
|
||||
language: e.getAttribute('xml:lang') ?? 'eng',
|
||||
fullyDescribesSense: e.getAttribute('ls_type') == 'part',
|
||||
constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
|
||||
),
|
||||
)
|
||||
.toList(),
|
||||
glossary: sense
|
||||
.findElements('gloss')
|
||||
.map(
|
||||
(e) => Glossary(
|
||||
language: e.getAttribute('xml:lang') ?? 'eng',
|
||||
phrase: e.innerText,
|
||||
type: e.getAttribute('g_type'),
|
||||
),
|
||||
)
|
||||
.toList(),
|
||||
antonyms: sense
|
||||
.findElements('ant')
|
||||
.map((e) => parseXrefParts(e.innerText))
|
||||
.toList(),
|
||||
seeAlso: sense
|
||||
.findElements('xref')
|
||||
.map((e) => parseXrefParts(e.innerText))
|
||||
.toList(),
|
||||
);
|
||||
|
||||
if (result.isEmpty) {
|
||||
print(
|
||||
'WARNING: Sense $senseId for entry $entryId is empty, '
|
||||
'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
|
||||
'reading: ${readingEls.map((e) => e.reading).join(', ')}',
|
||||
);
|
||||
} else {
|
||||
senses.add(result);
|
||||
}
|
||||
}
|
||||
|
||||
entries.add(
|
||||
Entry(
|
||||
id: entryId,
|
||||
kanji: kanjiEls,
|
||||
readings: readingEls,
|
||||
senses: senses,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
return entries;
|
||||
}
|
||||
|
||||
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
Future<void> seedJMDictData(List<Entry> entries, Database db) async {
|
||||
print(' [JMdict] Batch 1 - Kanji and readings');
|
||||
Batch b = db.batch();
|
||||
for (final e in entries) {
|
||||
@@ -471,17 +269,3 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
|
||||
|
||||
await b.commit();
|
||||
}
|
||||
|
||||
Future<void> addDataFromJMdict(Database db) async {
|
||||
print('[JMdict] Reading file content...');
|
||||
String rawXML = File('data/tmp/JMdict.xml').readAsStringSync();
|
||||
|
||||
print('[JMdict] Parsing XML tags...');
|
||||
XmlElement root = XmlDocument.parse(rawXML).getElement('JMdict')!;
|
||||
|
||||
print('[JMdict] Parsing XML content...');
|
||||
final entries = parseXML(root);
|
||||
|
||||
print('[JMdict] Writing to database...');
|
||||
await insertIntoDB(entries, db);
|
||||
}
|
||||
201
lib/_data_ingestion/jmdict/xml_parser.dart
Normal file
201
lib/_data_ingestion/jmdict/xml_parser.dart
Normal file
@@ -0,0 +1,201 @@
|
||||
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
|
||||
import 'package:xml/xml.dart';
|
||||
|
||||
/// Extracts the priority markers from an `r_ele` or `k_ele` XML element.
///
/// [prefix] selects which child tag is read: every `<${prefix}_pri>` child of
/// [e] is inspected, and the integer that follows a known category name is
/// stored in that category's slot.
///
/// Returns `[news, ichi, spec, gai, nf]`. A slot stays `null` when no marker
/// of that category is present; if a category occurs more than once, the last
/// occurrence wins.
///
/// Throws a [FormatException] when the text after a category name is not an
/// integer.
///
/// Source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  // The order here is the order of the slots in the returned list.
  const categories = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final values = List<int?>.filled(categories.length, null, growable: true);

  for (final marker in e.findElements('${prefix}_pri')) {
    final text = marker.innerText;
    final slot = categories.indexWhere((name) => text.startsWith(name));
    if (slot == -1) continue; // Unknown category: ignored.
    values[slot] = int.parse(text.substring(categories[slot].length));
  }

  return values;
}
|
||||
|
||||
/// Splits a cross-reference string into its kanji, reading and sense-number
/// parts.
///
/// The string [s] is split on `・`, and the number of segments decides how
/// they are interpreted:
///
/// * one segment: a reading if it matches `kanaRegex`, otherwise a kanji.
/// * two segments: if the second is an integer it is the sense number and the
///   first is classified as above; otherwise they are kanji and reading.
/// * three segments: kanji, reading and sense number, in that order.
/// * anything else: an [XRefParts] with no fields set.
///
/// Throws a [FormatException] when the third of three segments is not an
/// integer.
///
/// Source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
  final segments = s.split('・');

  switch (segments) {
    case [final only]:
      return only.contains(kanaRegex)
          ? XRefParts(readingRef: only)
          : XRefParts(kanjiRef: only);

    case [final first, final second]:
      final senseOrderNum = int.tryParse(second);
      if (senseOrderNum == null) {
        // The second segment is not a number, so it must be a reading.
        return XRefParts(kanjiRef: first, readingRef: second);
      }
      return first.contains(kanaRegex)
          ? XRefParts(readingRef: first, senseOrderNum: senseOrderNum)
          : XRefParts(kanjiRef: first, senseOrderNum: senseOrderNum);

    case [final kanji, final reading, final sense]:
      return XRefParts(
        kanjiRef: kanji,
        readingRef: reading,
        senseOrderNum: int.parse(sense),
      );

    default:
      return XRefParts();
  }
}
|
||||
|
||||
/// Returns the text content of every `<tag>` element found by
/// `findElements` on [parent].
List<String> _childTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((e) => e.innerText).toList();

/// Like [_childTexts], but with the first and last character of each text
/// removed.
///
/// NOTE(review): presumably these values arrive as unexpanded entity
/// references such as `&n;`, so this strips the `&` and `;` — confirm against
/// how the document is parsed. An empty text would make `substring` throw.
List<String> _childTextsWithoutDelimiters(XmlElement parent, String tag) =>
    parent
        .findElements(tag)
        .map((e) => e.innerText)
        .map((text) => text.substring(1, text.length - 1))
        .toList();

/// Converts the children of the JMdict [root] element into [Entry] objects.
///
/// For every child element of [root] this reads the `ent_seq` id, then the
/// `k_ele`, `r_ele` and `sense` children in document order. Order numbers are
/// 1-based. Sense ids are assigned from a single counter that runs across the
/// whole document; a sense that turns out to be empty still consumes an id,
/// is reported with a warning on stdout, and is left out of the entry.
///
/// Throws a [StateError] when an entry lacks `ent_seq`, or a `k_ele`/`r_ele`
/// lacks its `keb`/`reb` child, and a [FormatException] when a numeric field
/// cannot be parsed.
List<Entry> parseJMDictData(XmlElement root) {
  final List<Entry> entries = [];

  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final List<KanjiElement> kanjiEls = [];
    final List<ReadingElement> readingEls = [];
    final List<Sense> senses = [];

    for (final (index, kEle) in entry.findElements('k_ele').indexed) {
      final priorities = getPriorityValues(kEle, 'ke');
      kanjiEls.add(
        KanjiElement(
          orderNum: index + 1,
          info: _childTextsWithoutDelimiters(kEle, 'ke_inf'),
          reading: kEle.findElements('keb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, rEle) in entry.findElements('r_ele').indexed) {
      final priorities = getPriorityValues(rEle, 're');
      // The mere presence of `re_nokanji` is the flag; its content is unused.
      final readingDoesNotMatchKanji =
          rEle.findElements('re_nokanji').isNotEmpty;
      readingEls.add(
        ReadingElement(
          orderNum: index + 1,
          readingDoesNotMatchKanji: readingDoesNotMatchKanji,
          info: _childTextsWithoutDelimiters(rEle, 're_inf'),
          restrictions: _childTexts(rEle, 're_restr'),
          reading: rEle.findElements('reb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, sense) in entry.findElements('sense').indexed) {
      senseId++;
      final result = Sense(
        id: senseId,
        orderNum: index + 1,
        restrictedToKanji: _childTexts(sense, 'stagk'),
        restrictedToReading: _childTexts(sense, 'stagr'),
        pos: _childTextsWithoutDelimiters(sense, 'pos'),
        misc: _childTextsWithoutDelimiters(sense, 'misc'),
        dialects: _childTextsWithoutDelimiters(sense, 'dial'),
        info: _childTexts(sense, 's_inf'),
        languageSource: sense
            .findElements('lsource')
            .map(
              (e) => LanguageSource(
                language: e.getAttribute('xml:lang') ?? 'eng',
                // Per the JMdict DTD, `ls_type` is either absent (implied
                // "full") or "part", so the source fully describes the sense
                // exactly when the attribute is not "part".
                fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
              ),
            )
            .toList(),
        glossary: sense
            .findElements('gloss')
            .map(
              (e) => Glossary(
                language: e.getAttribute('xml:lang') ?? 'eng',
                phrase: e.innerText,
                type: e.getAttribute('g_type'),
              ),
            )
            .toList(),
        antonyms: sense
            .findElements('ant')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
        seeAlso: sense
            .findElements('xref')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
      );

      if (result.isEmpty) {
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(result);
      }
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
|
||||
74
lib/_data_ingestion/kanjidic/seed_data.dart
Normal file
74
lib/_data_ingestion/kanjidic/seed_data.dart
Normal file
@@ -0,0 +1,74 @@
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
import 'objects.dart';
|
||||
|
||||
/// Writes every parsed KANJIDIC character in [characters], together with all
/// of its child rows, to [db] in a single batch.
///
/// For each character one row goes into the character table, followed by one
/// row per radical name, codepoint, radical, stroke miscount, variant,
/// dictionary reference, Morohashi reference, query code, reading, kunyomi,
/// onyomi, meaning and nanori.
///
/// The returned future completes once the batch has been committed, and
/// completes with an error if the commit fails.
Future<void> seedKANJIDICData(List<Character> characters, Database db) async {
  final b = db.batch();
  for (final c in characters) {
    // Debugging aid for locating duplicated dictionary references:
    // if (c.dictionaryReferences.any((e) =>
    //     c.dictionaryReferences
    //         .where((e2) => e.kanji == e2.kanji && e.type == e2.type)
    //         .length >
    //     1)) {
    //   print(c.dictionaryReferences.map((e) => e.sqlValue).toList());
    // }
    b.insert(TableNames.character, c.sqlValue);
    for (final n in c.radicalName) {
      b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
    }
    for (final cp in c.codepoints) {
      b.insert(TableNames.codepoint, cp.sqlValue);
    }
    for (final r in c.radicals) {
      b.insert(TableNames.radical, r.sqlValue);
    }
    for (final sm in c.strokeMiscounts) {
      b.insert(
        TableNames.strokeMiscount,
        {
          'kanji': c.literal,
          'strokeCount': sm,
        },
      );
    }
    for (final v in c.variants) {
      b.insert(TableNames.variant, v.sqlValue);
    }
    for (final dr in c.dictionaryReferences) {
      // There are duplicate entries here, so conflicting rows are skipped
      // instead of failing the batch.
      b.insert(
        TableNames.dictionaryReference,
        dr.sqlValue,
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
    for (final drm in c.dictionaryReferencesMoro) {
      b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
    }
    for (final q in c.querycodes) {
      b.insert(TableNames.queryCode, q.sqlValue);
    }
    for (final r in c.readings) {
      b.insert(TableNames.reading, r.sqlValue);
    }
    for (final k in c.kunyomi) {
      b.insert(TableNames.kunyomi, k.sqlValue);
    }
    for (final o in c.onyomi) {
      b.insert(TableNames.onyomi, o.sqlValue);
    }
    for (final m in c.meanings) {
      b.insert(TableNames.meaning, m.sqlValue);
    }
    for (final n in c.nanori) {
      b.insert(
        TableNames.nanori,
        {
          'kanji': c.literal,
          'nanori': n,
        },
      );
    }
  }

  // The commit must be awaited: otherwise this function's future completes
  // before anything is written, a caller may close the database mid-commit,
  // and a commit failure would surface as an unhandled asynchronous error.
  await b.commit();
}
|
||||
@@ -1,12 +1,7 @@
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
import 'package:jadb/_data_ingestion/kanjidic/objects.dart';
|
||||
import 'package:xml/xml.dart';
|
||||
import 'package:collection/collection.dart';
|
||||
|
||||
import 'objects.dart';
|
||||
|
||||
List<Character> transformXML(XmlElement root) {
|
||||
List<Character> parseKANJIDICData(XmlElement root) {
|
||||
final List<Character> result = [];
|
||||
for (final c in root.findElements('character')) {
|
||||
final kanji = c.findElements('literal').first.innerText;
|
||||
@@ -170,88 +165,3 @@ List<Character> transformXML(XmlElement root) {
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
Future<void> insertIntoDB(List<Character> characters, Database db) async {
|
||||
final b = db.batch();
|
||||
for (final c in characters) {
|
||||
// if (c.dictionaryReferences.any((e) =>
|
||||
// c.dictionaryReferences
|
||||
// .where((e2) => e.kanji == e2.kanji && e.type == e2.type)
|
||||
// .length >
|
||||
// 1)) {
|
||||
// print(c.dictionaryReferences.map((e) => e.sqlValue).toList());
|
||||
// }
|
||||
b.insert(TableNames.character, c.sqlValue);
|
||||
for (final n in c.radicalName) {
|
||||
b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
|
||||
}
|
||||
for (final cp in c.codepoints) {
|
||||
b.insert(TableNames.codepoint, cp.sqlValue);
|
||||
}
|
||||
for (final r in c.radicals) {
|
||||
b.insert(TableNames.radical, r.sqlValue);
|
||||
}
|
||||
for (final sm in c.strokeMiscounts) {
|
||||
b.insert(
|
||||
TableNames.strokeMiscount,
|
||||
{
|
||||
'kanji': c.literal,
|
||||
'strokeCount': sm,
|
||||
},
|
||||
);
|
||||
}
|
||||
for (final v in c.variants) {
|
||||
b.insert(TableNames.variant, v.sqlValue);
|
||||
}
|
||||
for (final dr in c.dictionaryReferences) {
|
||||
// There are duplicate entries here
|
||||
b.insert(
|
||||
TableNames.dictionaryReference,
|
||||
dr.sqlValue,
|
||||
conflictAlgorithm: ConflictAlgorithm.ignore,
|
||||
);
|
||||
}
|
||||
for (final drm in c.dictionaryReferencesMoro) {
|
||||
b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
|
||||
}
|
||||
for (final q in c.querycodes) {
|
||||
b.insert(TableNames.queryCode, q.sqlValue);
|
||||
}
|
||||
for (final r in c.readings) {
|
||||
b.insert(TableNames.reading, r.sqlValue);
|
||||
}
|
||||
for (final k in c.kunyomi) {
|
||||
b.insert(TableNames.kunyomi, k.sqlValue);
|
||||
}
|
||||
for (final o in c.onyomi) {
|
||||
b.insert(TableNames.onyomi, o.sqlValue);
|
||||
}
|
||||
for (final m in c.meanings) {
|
||||
b.insert(TableNames.meaning, m.sqlValue);
|
||||
}
|
||||
for (final n in c.nanori) {
|
||||
b.insert(
|
||||
TableNames.nanori,
|
||||
{
|
||||
'kanji': c.literal,
|
||||
'nanori': n,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
b.commit();
|
||||
}
|
||||
|
||||
Future<void> addDataFromKANJIDIC(Database db) async {
|
||||
print('[KANJIDIC2] Reading file...');
|
||||
String rawXML = File('data/tmp/kanjidic2.xml').readAsStringSync();
|
||||
|
||||
print('[KANJIDIC2] Parsing XML...');
|
||||
XmlElement root = XmlDocument.parse(rawXML).getElement('kanjidic2')!;
|
||||
|
||||
print('[KANJIDIC2] Transforming data...');
|
||||
final entries = transformXML(root);
|
||||
|
||||
print('[KANJIDIC2] Writing to database...');
|
||||
await insertIntoDB(entries, db);
|
||||
}
|
||||
@@ -1,32 +1,10 @@
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
Future<void> addDataFromRADKFILE(Database db) async {
|
||||
/// Reads [radkfile] and returns its raw `$`-delimited blocks.
///
/// The file is decoded with the default encoding of
/// [File.readAsStringSync] (UTF-8), split on the `$` character, and the first
/// two chunks of that split are dropped. Each returned string is the text that
/// followed one `$` marker, up to the next marker or the end of the file.
///
/// Throws a [FileSystemException] when the file cannot be read or decoded.
Iterable<String> parseRADKFILEBlocks(File radkfile) {
  // Read the file the caller handed in rather than a hard-coded path, so the
  // parameter actually controls what gets parsed.
  final String content = radkfile.readAsStringSync();

  // NOTE(review): without `multiLine: true`, `^` and `$` only anchor at the
  // start and end of the whole input, so this replacement only fires for a
  // single-line file and comment lines are otherwise left in place. The
  // `skip(2)` presumably relies on that (e.g. a `$` inside the header
  // comments) — confirm against the real file before changing either one.
  final Iterable<String> blocks =
      content.replaceAll(RegExp(r'^#.*$'), '').split(r'$').skip(2);

  return blocks;
}
|
||||
|
||||
28
lib/_data_ingestion/radkfile/seed_data.dart
Normal file
28
lib/_data_ingestion/radkfile/seed_data.dart
Normal file
@@ -0,0 +1,28 @@
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
|
||||
/// Writes the radical-to-kanji pairs contained in [blocks] to the `RADKFILE`
/// table of [db] in a single batch.
///
/// For each block, the character at index 1 is taken as the radical. The
/// block's first line is then removed and every remaining character other
/// than a newline is treated as a kanji; duplicates within a block are
/// inserted once.
///
/// The returned future completes once the batch has been committed, and
/// completes with an error if the commit fails. Throws a [RangeError] for a
/// block shorter than two characters.
Future<void> seedRADKFILEData(
  Iterable<String> blocks,
  Database db,
) async {
  final b = db.batch();

  for (final block in blocks) {
    // NOTE(review): presumably each block starts with a space followed by the
    // radical (the text right after a `$` marker) — confirm against the file.
    final String radical = block[1];
    final List<String> kanjiList = block
        .replaceFirst(RegExp(r'.*\n'), '')
        .split('')
      ..removeWhere((e) => e == '' || e == '\n');

    for (final kanji in kanjiList.toSet()) {
      b.insert(
        'RADKFILE',
        {
          'radical': radical,
          'kanji': kanji,
        },
      );
    }
  }

  // The commit must be awaited: otherwise this function's future completes
  // before anything is written, a caller may close the database mid-commit,
  // and a commit failure would surface as an unhandled asynchronous error.
  await b.commit();
}
|
||||
55
lib/_data_ingestion/seed_database.dart
Normal file
55
lib/_data_ingestion/seed_database.dart
Normal file
@@ -0,0 +1,55 @@
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:jadb/_data_ingestion/jmdict/seed_data.dart';
|
||||
import 'package:jadb/_data_ingestion/jmdict/xml_parser.dart';
|
||||
import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
|
||||
import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
|
||||
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
|
||||
import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
|
||||
import 'package:sqflite_common/sqlite_api.dart';
|
||||
import 'package:xml/xml.dart';
|
||||
|
||||
/// Populates [db] from all data sources, one after another.
///
/// The sources are processed strictly in sequence: JMdict, then RADKFILE,
/// then KANJIDIC. A failure in one step stops the remaining steps and is
/// reported through the returned future.
Future<void> seedData(Database db) async {
  final steps = <Future<void> Function(Database)>[
    parseAndSeedDataFromJMdict,
    parseAndSeedDataFromRADKFILE,
    parseAndSeedDataFromKANJIDIC,
  ];

  for (final step in steps) {
    await step(db);
  }
}
|
||||
|
||||
/// Loads `data/tmp/JMdict.xml`, parses it, and writes the entries to [db].
///
/// Progress is reported on stdout before each stage. The root `JMdict`
/// element is required: the null assertion throws if the document has none.
Future<void> parseAndSeedDataFromJMdict(Database db) async {
  print('[JMdict] Reading file content...');
  final source = File('data/tmp/JMdict.xml');
  final xmlText = source.readAsStringSync();

  print('[JMdict] Parsing XML tags...');
  final document = XmlDocument.parse(xmlText);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Parsing XML content...');
  final parsedEntries = parseJMDictData(jmdictRoot);

  print('[JMdict] Writing to database...');
  await seedJMDictData(parsedEntries, db);
}
|
||||
|
||||
/// Loads `data/tmp/kanjidic2.xml`, parses it, and writes the characters to
/// [db].
///
/// Progress is reported on stdout before each stage. The root `kanjidic2`
/// element is required: the null assertion throws if the document has none.
Future<void> parseAndSeedDataFromKANJIDIC(Database db) async {
  print('[KANJIDIC2] Reading file...');
  final source = File('data/tmp/kanjidic2.xml');
  final xmlText = source.readAsStringSync();

  print('[KANJIDIC2] Parsing XML...');
  final document = XmlDocument.parse(xmlText);
  final kanjidicRoot = document.getElement('kanjidic2')!;

  print('[KANJIDIC2] Parsing XML content...');
  final characters = parseKANJIDICData(kanjidicRoot);

  print('[KANJIDIC2] Writing to database...');
  await seedKANJIDICData(characters, db);
}
|
||||
|
||||
/// Loads the RADKFILE data, splits it into blocks, and writes the
/// radical-to-kanji pairs to [db].
///
/// Progress is reported on stdout before each stage. The returned future
/// completes only after the seeding step has finished, and completes with an
/// error if that step fails.
Future<void> parseAndSeedDataFromRADKFILE(Database db) async {
  print('[RADKFILE] Reading file...');
  // NOTE(review): this used to name `data/tmp/RADKFILE`, while the ingestion
  // has been reading `data/tmp/radkfile_utf8`. The UTF-8 copy is named here
  // so the path matches the data actually being loaded — confirm that this is
  // the intended input.
  File raw = File('data/tmp/radkfile_utf8');

  print('[RADKFILE] Parsing content...');
  final blocks = parseRADKFILEBlocks(raw);

  print('[RADKFILE] Writing to database...');
  // Awaited so that the next ingestion step (and any later `db.close()`)
  // cannot start while these rows are still being written, and so that
  // failures reach the caller instead of being dropped.
  await seedRADKFILEData(blocks, db);
}
|
||||
Reference in New Issue
Block a user