lib/_data_ingestion: split parsing and seeding steps

This commit is contained in:
2025-04-29 13:29:31 +02:00
parent 38706c0532
commit 0214473120
8 changed files with 373 additions and 341 deletions

View File

@@ -4,9 +4,7 @@ import 'dart:io';
import 'package:args/args.dart';
import 'package:args/command_runner.dart';
import 'package:jadb/_data_ingestion/jmdict/parser.dart';
import 'package:jadb/_data_ingestion/kanjidic/parser.dart';
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
import 'package:jadb/_data_ingestion/seed_database.dart';
import 'package:jadb/search.dart';
import 'package:sqflite_common_ffi/sqflite_ffi.dart';
import 'package:sqlite3/open.dart';
@@ -58,9 +56,13 @@ class CreateDb extends Command {
),
);
await addDataFromJMdict(db);
await addDataFromRADKFILE(db);
await addDataFromKANJIDIC(db);
seedData(db).then((_) {
print("Database created successfully");
}).catchError((error) {
print("Error creating database: $error");
}).whenComplete(() {
db.close();
});
}
}

View File

@@ -1,80 +1,9 @@
import 'dart:collection';
import 'dart:io';
import 'package:collection/collection.dart';
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:jadb/util/romaji_transliteration.dart';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import 'objects.dart';
/// Collects the priority markers found on an `r_ele` or `k_ele` element.
///
/// Looks at every `<{prefix}_pri>` child of [e] and returns the numeric
/// suffix of each recognised marker as `[news, ichi, spec, gai, nf]`.
/// A slot is `null` when no child carries that marker; when a marker occurs
/// more than once, the last occurrence wins.
///
/// Throws a [FormatException] if the text after a recognised marker name is
/// not an integer.
///
/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  // The order here is both the match order and the order of the result slots.
  const markerNames = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final values = List<int?>.filled(markerNames.length, null, growable: true);

  for (final priorityElement in e.findElements('${prefix}_pri')) {
    final text = priorityElement.innerText;
    final slot = markerNames.indexWhere((name) => text.startsWith(name));
    if (slot == -1) continue;
    values[slot] = int.parse(text.substring(markerNames[slot].length));
  }

  return values;
}
// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
final parts = s.split('');
late final XRefParts result;
switch (parts.length) {
case 1:
result = parts[0].contains(kanaRegex)
? XRefParts(readingRef: parts[0])
: XRefParts(kanjiRef: parts[0]);
break;
case 2:
if (int.tryParse(parts[1]) != null) {
if (parts[0].contains(kanaRegex)) {
result = XRefParts(
readingRef: parts[0],
senseOrderNum: int.parse(parts[1]),
);
} else {
result = XRefParts(
kanjiRef: parts[0],
senseOrderNum: int.parse(parts[1]),
);
}
} else {
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
);
}
break;
case 3:
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
senseOrderNum: int.parse(parts[2]),
);
break;
default:
result = XRefParts();
break;
}
return result;
}
class ResolvedXref {
Entry entry;
@@ -139,138 +68,7 @@ ResolvedXref resolveXref(
}
}
/// Converts the child elements of the JMdict [root] element into [Entry]
/// objects.
///
/// Every child of [root] is treated as one entry and must contain an
/// `ent_seq` element with an integer id. Kanji elements, reading elements and
/// senses are numbered from 1 in document order. Sense ids are assigned from
/// one counter shared by all entries; the counter also advances for senses
/// that are dropped because they are empty, so ids may have gaps.
List<Entry> parseXML(XmlElement root) {
  // Text of every direct `tag` child of `parent`.
  List<String> texts(XmlElement parent, String tag) =>
      parent.findElements(tag).map((e) => e.innerText).toList();

  // Same as `texts`, but with the first and last character of each value
  // removed.
  // NOTE(review): presumably strips the `&` and `;` of an unexpanded entity
  // reference - confirm against the input file.
  List<String> strippedTexts(XmlElement parent, String tag) =>
      parent.findElements(tag).map((e) {
        final text = e.innerText;
        return text.substring(1, text.length - 1);
      }).toList();

  final List<Entry> entries = [];
  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);
    final List<KanjiElement> kanjiEls = [];
    final List<ReadingElement> readingEls = [];
    final List<Sense> senses = [];

    for (final (index, kanjiElement) in entry.findElements('k_ele').indexed) {
      final priorities = getPriorityValues(kanjiElement, 'ke');
      kanjiEls.add(
        KanjiElement(
          orderNum: index + 1,
          info: strippedTexts(kanjiElement, 'ke_inf'),
          reading: kanjiElement.findElements('keb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, readingElement)
        in entry.findElements('r_ele').indexed) {
      final priorities = getPriorityValues(readingElement, 're');
      readingEls.add(
        ReadingElement(
          orderNum: index + 1,
          // The mere presence of the tag is the flag.
          readingDoesNotMatchKanji:
              readingElement.findElements('re_nokanji').isNotEmpty,
          info: strippedTexts(readingElement, 're_inf'),
          restrictions: texts(readingElement, 're_restr'),
          reading: readingElement.findElements('reb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, sense) in entry.findElements('sense').indexed) {
      senseId++;
      final result = Sense(
        id: senseId,
        orderNum: index + 1,
        restrictedToKanji: texts(sense, 'stagk'),
        restrictedToReading: texts(sense, 'stagr'),
        pos: strippedTexts(sense, 'pos'),
        misc: strippedTexts(sense, 'misc'),
        dialects: strippedTexts(sense, 'dial'),
        info: texts(sense, 's_inf'),
        languageSource: sense
            .findElements('lsource')
            .map(
              (e) => LanguageSource(
                language: e.getAttribute('xml:lang') ?? 'eng',
                // `ls_type` is only present (as 'part') for partial
                // descriptions; an absent attribute means 'full'.
                fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
              ),
            )
            .toList(),
        glossary: sense
            .findElements('gloss')
            .map(
              (e) => Glossary(
                language: e.getAttribute('xml:lang') ?? 'eng',
                phrase: e.innerText,
                type: e.getAttribute('g_type'),
              ),
            )
            .toList(),
        antonyms: sense
            .findElements('ant')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
        seeAlso: sense
            .findElements('xref')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
      );

      if (result.isEmpty) {
        // Empty senses are reported and left out of the entry.
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(result);
      }
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
Future<void> seedJMDictData(List<Entry> entries, Database db) async {
print(' [JMdict] Batch 1 - Kanji and readings');
Batch b = db.batch();
for (final e in entries) {
@@ -471,17 +269,3 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
await b.commit();
}
/// Loads `data/tmp/JMdict.xml`, parses it and writes the entries to [db].
///
/// Progress is reported on stdout. The file is read synchronously and the
/// document must contain a top-level `JMdict` element.
Future<void> addDataFromJMdict(Database db) async {
  print('[JMdict] Reading file content...');
  final xmlSource = File('data/tmp/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML tags...');
  final document = XmlDocument.parse(xmlSource);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Parsing XML content...');
  final parsedEntries = parseXML(jmdictRoot);

  print('[JMdict] Writing to database...');
  await insertIntoDB(parsedEntries, db);
}

View File

@@ -0,0 +1,201 @@
import 'package:jadb/_data_ingestion/jmdict/objects.dart';
import 'package:xml/xml.dart';
/// Collects the priority markers found on an `r_ele` or `k_ele` element.
///
/// Looks at every `<{prefix}_pri>` child of [e] and returns the numeric
/// suffix of each recognised marker as `[news, ichi, spec, gai, nf]`.
/// A slot is `null` when no child carries that marker; when a marker occurs
/// more than once, the last occurrence wins.
///
/// Throws a [FormatException] if the text after a recognised marker name is
/// not an integer.
///
/// source: http://www.edrdg.org/jmwsgi/edhelp.py?sid=#kw_freq
List<int?> getPriorityValues(XmlElement e, String prefix) {
  // The order here is both the match order and the order of the result slots.
  const markerNames = ['news', 'ichi', 'spec', 'gai', 'nf'];
  final values = List<int?>.filled(markerNames.length, null, growable: true);

  for (final priorityElement in e.findElements('${prefix}_pri')) {
    final text = priorityElement.innerText;
    final slot = markerNames.indexWhere((name) => text.startsWith(name));
    if (slot == -1) continue;
    values[slot] = int.parse(text.substring(markerNames[slot].length));
  }

  return values;
}
// source: www.edrdg.org/jmwsgi/edhelp.py?sid=#syn_xref
XRefParts parseXrefParts(String s) {
final parts = s.split('');
late final XRefParts result;
switch (parts.length) {
case 1:
result = parts[0].contains(kanaRegex)
? XRefParts(readingRef: parts[0])
: XRefParts(kanjiRef: parts[0]);
break;
case 2:
if (int.tryParse(parts[1]) != null) {
if (parts[0].contains(kanaRegex)) {
result = XRefParts(
readingRef: parts[0],
senseOrderNum: int.parse(parts[1]),
);
} else {
result = XRefParts(
kanjiRef: parts[0],
senseOrderNum: int.parse(parts[1]),
);
}
} else {
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
);
}
break;
case 3:
result = XRefParts(
kanjiRef: parts[0],
readingRef: parts[1],
senseOrderNum: int.parse(parts[2]),
);
break;
default:
result = XRefParts();
break;
}
return result;
}
/// Converts the child elements of the JMdict [root] element into [Entry]
/// objects.
///
/// Every child of [root] is treated as one entry and must contain an
/// `ent_seq` element with an integer id. Kanji elements, reading elements and
/// senses are numbered from 1 in document order. Sense ids are assigned from
/// one counter shared by all entries; the counter also advances for senses
/// that are dropped because they are empty, so ids may have gaps.
List<Entry> parseJMDictData(XmlElement root) {
  // Text of every direct `tag` child of `parent`.
  List<String> texts(XmlElement parent, String tag) =>
      parent.findElements(tag).map((e) => e.innerText).toList();

  // Same as `texts`, but with the first and last character of each value
  // removed.
  // NOTE(review): presumably strips the `&` and `;` of an unexpanded entity
  // reference - confirm against the input file.
  List<String> strippedTexts(XmlElement parent, String tag) =>
      parent.findElements(tag).map((e) {
        final text = e.innerText;
        return text.substring(1, text.length - 1);
      }).toList();

  final List<Entry> entries = [];
  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);
    final List<KanjiElement> kanjiEls = [];
    final List<ReadingElement> readingEls = [];
    final List<Sense> senses = [];

    for (final (index, kanjiElement) in entry.findElements('k_ele').indexed) {
      final priorities = getPriorityValues(kanjiElement, 'ke');
      kanjiEls.add(
        KanjiElement(
          orderNum: index + 1,
          info: strippedTexts(kanjiElement, 'ke_inf'),
          reading: kanjiElement.findElements('keb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, readingElement)
        in entry.findElements('r_ele').indexed) {
      final priorities = getPriorityValues(readingElement, 're');
      readingEls.add(
        ReadingElement(
          orderNum: index + 1,
          // The mere presence of the tag is the flag.
          readingDoesNotMatchKanji:
              readingElement.findElements('re_nokanji').isNotEmpty,
          info: strippedTexts(readingElement, 're_inf'),
          restrictions: texts(readingElement, 're_restr'),
          reading: readingElement.findElements('reb').first.innerText,
          news: priorities[0],
          ichi: priorities[1],
          spec: priorities[2],
          gai: priorities[3],
          nf: priorities[4],
        ),
      );
    }

    for (final (index, sense) in entry.findElements('sense').indexed) {
      senseId++;
      final result = Sense(
        id: senseId,
        orderNum: index + 1,
        restrictedToKanji: texts(sense, 'stagk'),
        restrictedToReading: texts(sense, 'stagr'),
        pos: strippedTexts(sense, 'pos'),
        misc: strippedTexts(sense, 'misc'),
        dialects: strippedTexts(sense, 'dial'),
        info: texts(sense, 's_inf'),
        languageSource: sense
            .findElements('lsource')
            .map(
              (e) => LanguageSource(
                language: e.getAttribute('xml:lang') ?? 'eng',
                // `ls_type` is only present (as 'part') for partial
                // descriptions; an absent attribute means 'full'.
                fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
              ),
            )
            .toList(),
        glossary: sense
            .findElements('gloss')
            .map(
              (e) => Glossary(
                language: e.getAttribute('xml:lang') ?? 'eng',
                phrase: e.innerText,
                type: e.getAttribute('g_type'),
              ),
            )
            .toList(),
        antonyms: sense
            .findElements('ant')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
        seeAlso: sense
            .findElements('xref')
            .map((e) => parseXrefParts(e.innerText))
            .toList(),
      );

      if (result.isEmpty) {
        // Empty senses are reported and left out of the entry.
        print(
          'WARNING: Sense $senseId for entry $entryId is empty, '
          'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
          'reading: ${readingEls.map((e) => e.reading).join(', ')}',
        );
      } else {
        senses.add(result);
      }
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}

View File

@@ -0,0 +1,74 @@
import 'package:sqflite_common/sqlite_api.dart';
import 'objects.dart';
/// Writes every KANJIDIC [characters] record and its dependent rows to [db].
///
/// All inserts are queued on a single batch. The returned future completes
/// only after that batch has been committed, so callers that await it can
/// safely use or close [db] afterwards, and commit errors reach the caller.
Future<void> seedKANJIDICData(List<Character> characters, Database db) async {
  final b = db.batch();
  for (final c in characters) {
    // if (c.dictionaryReferences.any((e) =>
    //     c.dictionaryReferences
    //         .where((e2) => e.kanji == e2.kanji && e.type == e2.type)
    //         .length >
    //     1)) {
    //   print(c.dictionaryReferences.map((e) => e.sqlValue).toList());
    // }
    b.insert(TableNames.character, c.sqlValue);

    for (final n in c.radicalName) {
      b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
    }
    for (final cp in c.codepoints) {
      b.insert(TableNames.codepoint, cp.sqlValue);
    }
    for (final r in c.radicals) {
      b.insert(TableNames.radical, r.sqlValue);
    }
    for (final sm in c.strokeMiscounts) {
      b.insert(
        TableNames.strokeMiscount,
        {
          'kanji': c.literal,
          'strokeCount': sm,
        },
      );
    }
    for (final v in c.variants) {
      b.insert(TableNames.variant, v.sqlValue);
    }
    for (final dr in c.dictionaryReferences) {
      // There are duplicate entries here, so conflicting rows are skipped
      // instead of failing the batch.
      b.insert(
        TableNames.dictionaryReference,
        dr.sqlValue,
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
    for (final drm in c.dictionaryReferencesMoro) {
      b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
    }
    for (final q in c.querycodes) {
      b.insert(TableNames.queryCode, q.sqlValue);
    }
    for (final r in c.readings) {
      b.insert(TableNames.reading, r.sqlValue);
    }
    for (final k in c.kunyomi) {
      b.insert(TableNames.kunyomi, k.sqlValue);
    }
    for (final o in c.onyomi) {
      b.insert(TableNames.onyomi, o.sqlValue);
    }
    for (final m in c.meanings) {
      b.insert(TableNames.meaning, m.sqlValue);
    }
    for (final n in c.nanori) {
      b.insert(
        TableNames.nanori,
        {
          'kanji': c.literal,
          'nanori': n,
        },
      );
    }
  }

  // Without the await the future would complete before the rows are written.
  await b.commit();
}

View File

@@ -1,12 +1,7 @@
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:jadb/_data_ingestion/kanjidic/objects.dart';
import 'package:xml/xml.dart';
import 'package:collection/collection.dart';
import 'objects.dart';
List<Character> transformXML(XmlElement root) {
List<Character> parseKANJIDICData(XmlElement root) {
final List<Character> result = [];
for (final c in root.findElements('character')) {
final kanji = c.findElements('literal').first.innerText;
@@ -170,88 +165,3 @@ List<Character> transformXML(XmlElement root) {
}
return result;
}
/// Writes every KANJIDIC [characters] record and its dependent rows to [db].
///
/// All inserts are queued on a single batch. The returned future completes
/// only after that batch has been committed, so callers that await it can
/// safely use or close [db] afterwards, and commit errors reach the caller.
Future<void> insertIntoDB(List<Character> characters, Database db) async {
  final b = db.batch();
  for (final c in characters) {
    // if (c.dictionaryReferences.any((e) =>
    //     c.dictionaryReferences
    //         .where((e2) => e.kanji == e2.kanji && e.type == e2.type)
    //         .length >
    //     1)) {
    //   print(c.dictionaryReferences.map((e) => e.sqlValue).toList());
    // }
    b.insert(TableNames.character, c.sqlValue);

    for (final n in c.radicalName) {
      b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
    }
    for (final cp in c.codepoints) {
      b.insert(TableNames.codepoint, cp.sqlValue);
    }
    for (final r in c.radicals) {
      b.insert(TableNames.radical, r.sqlValue);
    }
    for (final sm in c.strokeMiscounts) {
      b.insert(
        TableNames.strokeMiscount,
        {
          'kanji': c.literal,
          'strokeCount': sm,
        },
      );
    }
    for (final v in c.variants) {
      b.insert(TableNames.variant, v.sqlValue);
    }
    for (final dr in c.dictionaryReferences) {
      // There are duplicate entries here, so conflicting rows are skipped
      // instead of failing the batch.
      b.insert(
        TableNames.dictionaryReference,
        dr.sqlValue,
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
    for (final drm in c.dictionaryReferencesMoro) {
      b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
    }
    for (final q in c.querycodes) {
      b.insert(TableNames.queryCode, q.sqlValue);
    }
    for (final r in c.readings) {
      b.insert(TableNames.reading, r.sqlValue);
    }
    for (final k in c.kunyomi) {
      b.insert(TableNames.kunyomi, k.sqlValue);
    }
    for (final o in c.onyomi) {
      b.insert(TableNames.onyomi, o.sqlValue);
    }
    for (final m in c.meanings) {
      b.insert(TableNames.meaning, m.sqlValue);
    }
    for (final n in c.nanori) {
      b.insert(
        TableNames.nanori,
        {
          'kanji': c.literal,
          'nanori': n,
        },
      );
    }
  }

  // Without the await the future would complete before the rows are written.
  await b.commit();
}
/// Loads `data/tmp/kanjidic2.xml`, transforms it and writes the characters
/// to [db].
///
/// Progress is reported on stdout. The file is read synchronously and the
/// document must contain a top-level `kanjidic2` element.
Future<void> addDataFromKANJIDIC(Database db) async {
  print('[KANJIDIC2] Reading file...');
  final xmlSource = File('data/tmp/kanjidic2.xml').readAsStringSync();

  print('[KANJIDIC2] Parsing XML...');
  final document = XmlDocument.parse(xmlSource);
  final kanjidicRoot = document.getElement('kanjidic2')!;

  print('[KANJIDIC2] Transforming data...');
  final characters = transformXML(kanjidicRoot);

  print('[KANJIDIC2] Writing to database...');
  await insertIntoDB(characters, db);
}

View File

@@ -1,32 +1,10 @@
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
Future<void> addDataFromRADKFILE(Database db) async {
Iterable<String> parseRADKFILEBlocks(File radkfile) {
final String content = File('data/tmp/radkfile_utf8').readAsStringSync();
final Iterable<String> blocks =
content.replaceAll(RegExp(r'^#.*$'), '').split(r'$').skip(2);
print('[RADKFILE] Writing to database...');
final b = db.batch();
for (final block in blocks) {
final String radical = block[1];
final List<String> kanjiList = block
.replaceFirst(RegExp(r'.*\n'), '')
.split('')
..removeWhere((e) => e == '' || e == '\n');
for (final kanji in kanjiList.toSet()) {
b.insert(
'RADKFILE',
{
'radical': radical,
'kanji': kanji,
},
);
}
}
b.commit();
return blocks;
}

View File

@@ -0,0 +1,28 @@
import 'package:sqflite_common/sqlite_api.dart';
/// Inserts one `RADKFILE` row per (radical, kanji) pair found in [blocks].
///
/// For each block the character at index 1 is taken as the radical, the
/// first line is discarded, and every remaining character other than a
/// newline is treated as a kanji. Duplicate kanji within a block are
/// inserted once.
///
/// The returned future completes only after the batch has been committed,
/// so commit errors reach the caller.
Future<void> seedRADKFILEData(
  Iterable<String> blocks,
  Database db,
) async {
  final b = db.batch();

  for (final block in blocks) {
    // NOTE(review): assumes every block has at least two characters, with
    // the radical in the second position - confirm against the parser output.
    final String radical = block[1];

    // Drop the first line, then split what is left into single characters.
    final List<String> kanjiList = block
        .replaceFirst(RegExp(r'.*\n'), '')
        .split('')
      ..removeWhere((e) => e == '' || e == '\n');

    for (final kanji in kanjiList.toSet()) {
      b.insert(
        'RADKFILE',
        {
          'radical': radical,
          'kanji': kanji,
        },
      );
    }
  }

  // Without the await the future would complete before the rows are written.
  await b.commit();
}

View File

@@ -0,0 +1,55 @@
import 'dart:io';
import 'package:jadb/_data_ingestion/jmdict/seed_data.dart';
import 'package:jadb/_data_ingestion/jmdict/xml_parser.dart';
import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
/// Populates [db] from all data sources, one after another.
///
/// The sources are processed in the order JMdict, RADKFILE, KANJIDIC; each
/// step is awaited before the next one starts.
Future<void> seedData(Database db) async {
  final steps = <Future<void> Function(Database)>[
    parseAndSeedDataFromJMdict,
    parseAndSeedDataFromRADKFILE,
    parseAndSeedDataFromKANJIDIC,
  ];

  for (final step in steps) {
    await step(db);
  }
}
/// Loads `data/tmp/JMdict.xml`, parses it and writes the entries to [db].
///
/// Progress is reported on stdout. The file is read synchronously and the
/// document must contain a top-level `JMdict` element.
Future<void> parseAndSeedDataFromJMdict(Database db) async {
  print('[JMdict] Reading file content...');
  final xmlSource = File('data/tmp/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML tags...');
  final document = XmlDocument.parse(xmlSource);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Parsing XML content...');
  final parsedEntries = parseJMDictData(jmdictRoot);

  print('[JMdict] Writing to database...');
  await seedJMDictData(parsedEntries, db);
}
/// Loads `data/tmp/kanjidic2.xml`, parses it and writes the characters to
/// [db].
///
/// Progress is reported on stdout. The file is read synchronously and the
/// document must contain a top-level `kanjidic2` element.
Future<void> parseAndSeedDataFromKANJIDIC(Database db) async {
  print('[KANJIDIC2] Reading file...');
  final xmlSource = File('data/tmp/kanjidic2.xml').readAsStringSync();

  print('[KANJIDIC2] Parsing XML...');
  final document = XmlDocument.parse(xmlSource);
  final kanjidicRoot = document.getElement('kanjidic2')!;

  print('[KANJIDIC2] Parsing XML content...');
  final characters = parseKANJIDICData(kanjidicRoot);

  print('[KANJIDIC2] Writing to database...');
  await seedKANJIDICData(characters, db);
}
/// Parses the RADKFILE and writes its radical/kanji pairs to [db].
///
/// Progress is reported on stdout. The returned future completes only after
/// the seeding step has finished, so callers that await it can safely use or
/// close [db] afterwards, and seeding errors reach the caller.
Future<void> parseAndSeedDataFromRADKFILE(Database db) async {
  print('[RADKFILE] Reading file...');
  // NOTE(review): confirm that parseRADKFILEBlocks actually reads from the
  // file passed here and that this is the intended (UTF-8) input path.
  File raw = File('data/tmp/RADKFILE');

  print('[RADKFILE] Parsing content...');
  final blocks = parseRADKFILEBlocks(raw);

  print('[RADKFILE] Writing to database...');
  // Awaited so the rows are written before this function completes.
  await seedRADKFILEData(blocks, db);
}