init commit

This commit is contained in:
2022-06-20 20:06:07 +02:00
commit 5cf0b95d8b
21 changed files with 3043 additions and 0 deletions

0
bin/common.dart Normal file
View File

16
bin/ja_db.dart Normal file
View File

@@ -0,0 +1,16 @@
import 'dart:io';
import 'package:sqflite_common_ffi/sqflite_ffi.dart';
import 'jmdict/parser.dart';
import 'kanjidic/parser.dart';
import 'radkfile/parser.dart';
/// Builds the dictionary database `main.db` in the current working directory.
///
/// Opens the database through the FFI database factory and then imports
/// JMdict, RADKFILE and KANJIDIC, in that order. [arguments] is unused.
Future<void> main(List<String> arguments) async {
  // `Uri.path` is percent-encoded (a space becomes `%20`) and keeps a leading
  // slash in front of Windows drive letters; `toFilePath()` yields a path the
  // file system actually understands.
  final dbPath = Directory.current.uri.resolve('main.db').toFilePath();
  final db = await databaseFactoryFfi.openDatabase(dbPath);
  await addDataFromJMdict(db);
  await addDataFromRADKFILE(db);
  await addDataFromKANJIDIC(db);
}

235
bin/jmdict/objects.dart Normal file
View File

@@ -0,0 +1,235 @@
import '../common.dart';
import '../objects.dart';
/// Names of the SQL tables that the JMdict importer writes to.
class TableNames {
  // Entry-level tables.
  static const entry = 'JMdict_Entry';
  static const entryByKana = 'JMdict_EntryByKana';
  static const entryByEnglish = 'JMdict_EntryByEnglish';

  // Kanji and reading element tables.
  static const kanjiElement = 'JMdict_KanjiElement';
  static const kanjiInfo = 'JMdict_KanjiElementInfo';
  static const readingElement = 'JMdict_ReadingElement';
  static const readingInfo = 'JMdict_ReadingElementInfo';
  static const readingRestriction = 'JMdict_ReadingElementRestriction';

  // Sense tables.
  static const sense = 'JMdict_Sense';
  static const senseAntonyms = 'JMdict_SenseAntonym';
  static const senseDialect = 'JMdict_SenseDialect';
  static const senseField = 'JMdict_SenseField';
  static const senseGlossary = 'JMdict_SenseGlossary';
  static const senseInfo = 'JMdict_SenseInfo';
  static const senseLanguageSource = 'JMdict_SenseLanguageSource';
  static const senseMisc = 'JMdict_SenseMisc';
  static const sensePOS = 'JMdict_SensePOS';
  static const senseRestrictedToKanji = 'JMdict_SenseRestrictedToKanji';
  static const senseRestrictedToReading = 'JMdict_SenseRestrictedToReading';
  static const senseSeeAlso = 'JMdict_SenseSeeAlso';
}
/// Fields shared by the kanji and reading elements of a JMdict entry.
///
/// Holds the element text in [reading] plus the optional numeric suffixes of
/// the `news`, `ichi`, `spec`, `gai` and `nf` priority tags.
abstract class Element extends SQLWritable {
  /// Text of the element.
  final String reading;

  /// Number following a `news` priority tag, if one was present.
  final int? news;

  /// Number following an `ichi` priority tag, if one was present.
  final int? ichi;

  /// Number following a `spec` priority tag, if one was present.
  final int? spec;

  /// Number following a `gai` priority tag, if one was present.
  final int? gai;

  /// Number following an `nf` priority tag, if one was present.
  final int? nf;

  const Element({
    required this.reading,
    this.news,
    this.ichi,
    this.spec,
    this.gai,
    this.nf,
  });

  /// Column values for this element; a fresh map is built on every access.
  @override
  Map<String, Object?> get sqlValue => {
        'reading': reading,
        'news': news,
        'ichi': ichi,
        'spec': spec,
        'gai': gai,
        'nf': nf,
      };
}
/// A kanji element of an entry: the shared [Element] fields plus a list of
/// info strings.
class KanjiElement extends Element {
  /// Info strings attached to this element; empty by default.
  List<String> info;

  KanjiElement({
    required String reading,
    this.info = const [],
    int? news,
    int? ichi,
    int? spec,
    int? gai,
    int? nf,
  }) : super(
          reading: reading,
          news: news,
          ichi: ichi,
          spec: spec,
          gai: gai,
          nf: nf,
        );
}
/// A reading element of an entry: the shared [Element] fields plus info
/// strings and restriction strings.
class ReadingElement extends Element {
  /// Info strings attached to this reading; empty by default.
  List<String> info;

  /// Restriction strings attached to this reading; empty by default.
  List<String> restrictions;

  ReadingElement({
    required String reading,
    this.info = const [],
    this.restrictions = const [],
    int? news,
    int? ichi,
    int? spec,
    int? gai,
    int? nf,
  }) : super(
          reading: reading,
          news: news,
          ichi: ichi,
          spec: spec,
          gai: gai,
          nf: nf,
        );
}
/// Source-language information attached to a sense.
class LanguageSource extends SQLWritable {
  /// Language code of the source word.
  final String language;

  /// The source word or phrase, when known.
  final String? phrase;

  /// Whether the source phrase describes the whole sense; defaults to true.
  final bool fullyDescribesSense;

  /// Whether the word was constructed from smaller words; defaults to false.
  final bool constructedFromSmallerWords;

  const LanguageSource({
    required this.language,
    this.phrase,
    this.fullyDescribesSense = true,
    this.constructedFromSmallerWords = false,
  });

  /// Column values for this language source; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'language': language,
      'phrase': phrase,
      'fullyDescribesSense': fullyDescribesSense,
      'constructedFromSmallerWords': constructedFromSmallerWords,
    };
  }
}
/// One gloss (translation) of a sense.
class Glossary extends SQLWritable {
  /// Language code of the gloss.
  final String language;

  /// The gloss text.
  final String phrase;

  /// Optional gloss type.
  final String? type;

  const Glossary({
    required this.language,
    required this.phrase,
    this.type,
  });

  /// Column values for this gloss; a fresh map is built on every access.
  @override
  Map<String, Object?> get sqlValue => {
        'language': language,
        'phrase': phrase,
        'type': type,
      };
}
/// Matches strings that consist only of katakana, hiragana and `ー`.
final kanaRegex =
    RegExp(r'^[\p{Script=Katakana}\p{Script=Hiragana}ー]+$', unicode: true);

/// The components of a textual cross-reference: an optional kanji form, an
/// optional reading and an optional sense number.
///
/// At least one of [kanjiRef] and [readingRef] must be set.
class XRefParts {
  /// Kanji form the reference points at, if given.
  final String? kanjiRef;

  /// Reading the reference points at, if given.
  final String? readingRef;

  /// Sense number within the target entry, if given.
  final int? senseNum;

  const XRefParts({
    this.kanjiRef,
    this.readingRef,
    this.senseNum,
  }) : assert(kanjiRef != null || readingRef != null);

  /// Parses a cross-reference of one to three parts separated by `・`.
  ///
  /// Accepted shapes:
  ///  * `word` - a reading if it is all kana, otherwise a kanji form;
  ///  * `word・N` - as above, with sense number `N`;
  ///  * `kanji・reading`;
  ///  * `kanji・reading・N`.
  ///
  /// Throws a [FormatException] when [s] has more than three parts or when
  /// the third part is not an integer.
  factory XRefParts.fromString(String s) {
    // The parts of a cross-reference are separated by a middle dot. Splitting
    // on the empty string would instead break the text into single characters.
    final parts = s.split('・');
    final head = parts[0];
    final headIsKana = kanaRegex.hasMatch(head);

    if (parts.length == 1) {
      return headIsKana
          ? XRefParts(readingRef: head)
          : XRefParts(kanjiRef: head);
    }

    if (parts.length == 2) {
      final senseNum = int.tryParse(parts[1]);
      if (senseNum == null) {
        return XRefParts(kanjiRef: head, readingRef: parts[1]);
      }
      return headIsKana
          ? XRefParts(readingRef: head, senseNum: senseNum)
          : XRefParts(kanjiRef: head, senseNum: senseNum);
    }

    if (parts.length == 3) {
      return XRefParts(
        kanjiRef: head,
        readingRef: parts[1],
        senseNum: int.parse(parts[2]),
      );
    }

    // An instance with neither kanji nor reading would violate the class
    // invariant, so reject the input instead of constructing one.
    throw FormatException('Unrecognised cross-reference format', s);
  }
}
/// Pairs an entry id with a reading.
///
/// NOTE(review): nothing in the visible code constructs this class; it
/// presumably represents a resolved cross-reference - confirm before relying
/// on that.
class XRef {
/// Id of the entry, kept as a string.
final String entryId;
/// Reading associated with the entry.
final String reading;
const XRef({
required this.entryId,
required this.reading,
});
}
/// One sense of an entry together with everything attached to it.
///
/// All list fields default to empty lists.
class Sense extends SQLWritable {
  /// Identifier of this sense.
  final int id;

  /// Cross-references to antonyms.
  final List<XRefParts> antonyms;

  /// Dialect tags.
  final List<String> dialects;

  /// Field-of-application tags.
  final List<String> fields;

  /// Free-form info strings.
  final List<String> info;

  /// Source-language information.
  final List<LanguageSource> languageSource;

  /// Glosses of this sense.
  final List<Glossary> glossary;

  /// Miscellaneous tags.
  final List<String> misc;

  /// Part-of-speech tags.
  final List<String> pos;

  /// Kanji forms this sense is restricted to.
  final List<String> restrictedToKanji;

  /// Readings this sense is restricted to.
  final List<String> restrictedToReading;

  /// "See also" cross-references.
  final List<XRefParts> seeAlso;

  const Sense({
    required this.id,
    this.antonyms = const [],
    this.dialects = const [],
    this.fields = const [],
    this.info = const [],
    this.languageSource = const [],
    this.glossary = const [],
    this.misc = const [],
    this.pos = const [],
    this.restrictedToKanji = const [],
    this.restrictedToReading = const [],
    this.seeAlso = const [],
  });

  /// Returns a new, empty, mutable map; callers add the columns themselves.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{};
  }
}
/// A dictionary entry: its id plus its kanji elements, reading elements and
/// senses.
class Entry extends SQLWritable {
  /// Identifier of the entry.
  final int id;

  /// Kanji elements of the entry.
  final List<KanjiElement> kanji;

  /// Reading elements of the entry.
  final List<ReadingElement> readings;

  /// Senses of the entry.
  final List<Sense> senses;

  const Entry({
    required this.id,
    required this.kanji,
    required this.readings,
    required this.senses,
  });

  /// Column values for the entry row itself; only the id is stored here.
  @override
  Map<String, Object?> get sqlValue => {'id': id};
}

346
bin/jmdict/parser.dart Normal file
View File

@@ -0,0 +1,346 @@
import 'dart:collection';
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import '../romaji_transliteration.dart';
import 'objects.dart';
/// Extracts the numeric suffixes of the `<prefix>_pri` children of [e].
///
/// Returns a five-element list in the fixed order
/// `[news, ichi, spec, gai, nf]`; a slot is `null` when no tag with that
/// prefix was found. Throws a [FormatException] if the text after a
/// recognised prefix is not an integer.
List<int?> getPriNums(XmlElement e, String prefix) {
  int? news;
  int? ichi;
  int? spec;
  int? gai;
  int? nf;

  final tags = e.findElements('${prefix}_pri').map((pri) => pri.innerText);
  for (final tag in tags) {
    if (tag.startsWith('news')) {
      news = int.parse(tag.substring(4));
    } else if (tag.startsWith('ichi')) {
      ichi = int.parse(tag.substring(4));
    } else if (tag.startsWith('spec')) {
      spec = int.parse(tag.substring(4));
    } else if (tag.startsWith('gai')) {
      gai = int.parse(tag.substring(3));
    } else if (tag.startsWith('nf')) {
      nf = int.parse(tag.substring(2));
    }
  }

  return [news, ichi, spec, gai, nf];
}
/// Returns the text of every direct child of [parent] named [tag].
List<String> _childTexts(XmlElement parent, String tag) =>
    parent.findElements(tag).map((e) => e.innerText).toList();

/// Like [_childTexts], but drops the first and last character of each text.
///
/// NOTE(review): presumably this removes the `&` and `;` of an unexpanded
/// entity reference - confirm against the input file.
List<String> _childEntityNames(XmlElement parent, String tag) => parent
    .findElements(tag)
    .map((e) => e.innerText.substring(1, e.innerText.length - 1))
    .toList();

/// Converts the children of [root] into [Entry] objects.
///
/// Sense ids are assigned from a running counter that starts at 1 and spans
/// all entries, so they are unique across the returned list.
List<Entry> transformXML(XmlElement root) {
  final List<Entry> entries = [];
  int senseId = 0;

  for (final entry in root.childElements) {
    final entryId = int.parse(entry.findElements('ent_seq').first.innerText);

    final List<KanjiElement> kanjiEls = [];
    for (final kEle in entry.findAllElements('k_ele')) {
      final kePri = getPriNums(kEle, 'ke');
      kanjiEls.add(
        KanjiElement(
          // NOTE(review): unlike `re_inf`, this text is stored untrimmed.
          info: _childTexts(kEle, 'ke_inf'),
          reading: kEle.findElements('keb').first.innerText,
          news: kePri[0],
          ichi: kePri[1],
          spec: kePri[2],
          gai: kePri[3],
          nf: kePri[4],
        ),
      );
    }

    final List<ReadingElement> readingEls = [];
    for (final rEle in entry.findAllElements('r_ele')) {
      final rePri = getPriNums(rEle, 're');
      readingEls.add(
        ReadingElement(
          info: _childEntityNames(rEle, 're_inf'),
          restrictions: _childTexts(rEle, 're_restr'),
          reading: rEle.findElements('reb').first.innerText,
          news: rePri[0],
          ichi: rePri[1],
          spec: rePri[2],
          gai: rePri[3],
          nf: rePri[4],
        ),
      );
    }

    final List<Sense> senses = [];
    for (final sense in entry.findAllElements('sense')) {
      senseId++;
      senses.add(
        Sense(
          id: senseId,
          restrictedToKanji: _childTexts(sense, 'stagk'),
          restrictedToReading: _childTexts(sense, 'stagr'),
          pos: _childEntityNames(sense, 'pos'),
          // `Sense.fields` is written to the database, so the `field`
          // elements have to be collected here as well.
          fields: _childEntityNames(sense, 'field'),
          misc: _childEntityNames(sense, 'misc'),
          dialects: _childEntityNames(sense, 'dial'),
          info: _childTexts(sense, 's_inf'),
          languageSource: sense
              .findElements('lsource')
              .map(
                (e) => LanguageSource(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  // An `lsource` element may be empty; store null then.
                  phrase: e.innerText.isEmpty ? null : e.innerText,
                  // `ls_type="part"` marks a source that only partially
                  // describes the sense; anything else means "fully".
                  fullyDescribesSense: e.getAttribute('ls_type') != 'part',
                  constructedFromSmallerWords:
                      e.getAttribute('ls_wasei') == 'y',
                ),
              )
              .toList(),
          glossary: sense
              .findElements('gloss')
              .map(
                (e) => Glossary(
                  language: e.getAttribute('xml:lang') ?? 'eng',
                  phrase: e.innerText,
                  type: e.getAttribute('g_type'),
                ),
              )
              .toList(),
          antonyms: sense
              .findElements('ant')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
          seeAlso: sense
              .findElements('xref')
              .map((e) => XRefParts.fromString(e.innerText))
              .toList(),
        ),
      );
    }

    entries.add(
      Entry(
        id: entryId,
        kanji: kanjiEls,
        readings: readingEls,
        senses: senses,
      ),
    );
  }

  return entries;
}
/// Returns the entries that [ref] points at and that have enough senses for
/// its sense number (when it carries one).
///
/// A reference with both a kanji form and a reading resolves to the entries
/// that contain both. As before, a reference to a kanji form or reading that
/// no entry contains is fatal: the null assertions throw.
Iterable<Entry> _xrefTargets(
  XRefParts ref,
  Map<String, Set<Entry>> byKanji,
  Map<String, Set<Entry>> byReading,
) {
  final kanjiRef = ref.kanjiRef;
  final readingRef = ref.readingRef;
  final senseNum = ref.senseNum;

  final Set<Entry> candidates;
  if (kanjiRef != null && readingRef != null) {
    // The target must match the kanji form AND the reading.
    candidates = byKanji[kanjiRef]!.intersection(byReading[readingRef]!);
  } else if (kanjiRef != null) {
    candidates = byKanji[kanjiRef]!;
  } else {
    candidates = byReading[readingRef]!;
  }

  return candidates.where(
    (target) => senseNum == null || senseNum <= target.senses.length,
  );
}

/// Writes [entries] to [db] in two batches.
///
/// The first batch stores the entries with their kanji and reading elements
/// and the kana/English lookup rows. The second batch stores the senses and
/// everything attached to them, including cross-references, which are
/// resolved through in-memory indexes of entries by kanji form and by
/// reading.
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
  print(' [JMdict] Batch 1');
  Batch b = db.batch();
  for (final e in entries) {
    b.insert(TableNames.entry, e.sqlValue);

    for (final k in e.kanji) {
      b.insert(TableNames.kanjiElement, {...k.sqlValue, 'entryId': e.id});
      // Kanji forms are intentionally not added to `entryByKana`; the
      // original code had that insert commented out.
      for (final i in k.info) {
        b.insert(
          TableNames.kanjiInfo,
          {'entryId': e.id, 'reading': k.reading, 'info': i},
        );
      }
    }

    for (final r in e.readings) {
      b.insert(TableNames.readingElement, {...r.sqlValue, 'entryId': e.id});
      b.insert(
        TableNames.entryByKana,
        {'entryId': e.id, 'kana': transliterateKanaToLatin(r.reading)},
        // Some entries have the same reading twice with difference in
        // katakana and hiragana.
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
      for (final i in r.info) {
        b.insert(
          TableNames.readingInfo,
          {'entryId': e.id, 'reading': r.reading, 'info': i},
        );
      }
      for (final res in r.restrictions) {
        b.insert(
          TableNames.readingRestriction,
          {'entryId': e.id, 'reading': r.reading, 'restriction': res},
        );
      }
    }

    for (final s in e.senses) {
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.entryByEnglish,
            {'entryId': e.id, 'english': g.phrase},
            // The same English phrase can occur in several senses of one
            // entry; conflicting rows are ignored.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }
    }
  }
  await b.commit();

  print(' [JMdict] Building trees');
  final entriesByKanji = SplayTreeMap<String, Set<Entry>>();
  final entriesByReading = SplayTreeMap<String, Set<Entry>>();
  for (final entry in entries) {
    for (final kanji in entry.kanji) {
      entriesByKanji.putIfAbsent(kanji.reading, () => <Entry>{}).add(entry);
    }
    for (final reading in entry.readings) {
      entriesByReading
          .putIfAbsent(reading.reading, () => <Entry>{})
          .add(entry);
    }
  }

  print(' [JMdict] Batch 2');
  b = db.batch();
  for (final e in entries) {
    for (final s in e.senses) {
      b.insert(
        TableNames.sense,
        {...s.sqlValue, 'id': s.id, 'entryId': e.id},
      );
      for (final d in s.dialects) {
        b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
      }
      for (final f in s.fields) {
        b.insert(TableNames.senseField, {'senseId': s.id, 'field': f});
      }
      for (final i in s.info) {
        b.insert(TableNames.senseInfo, {'senseId': s.id, 'info': i});
      }
      for (final m in s.misc) {
        b.insert(TableNames.senseMisc, {'senseId': s.id, 'misc': m});
      }
      for (final p in s.pos) {
        b.insert(TableNames.sensePOS, {'senseId': s.id, 'pos': p});
      }
      // Each language source is inserted exactly once.
      for (final ls in s.languageSource) {
        b.insert(
          TableNames.senseLanguageSource,
          {...ls.sqlValue, 'senseId': s.id},
        );
      }
      for (final rk in s.restrictedToKanji) {
        b.insert(
          TableNames.senseRestrictedToKanji,
          {'entryId': e.id, 'senseId': s.id, 'kanji': rk},
        );
      }
      for (final rr in s.restrictedToReading) {
        b.insert(
          TableNames.senseRestrictedToReading,
          {'entryId': e.id, 'senseId': s.id, 'reading': rr},
        );
      }
      for (final g in s.glossary) {
        if (g.language == 'eng') {
          b.insert(
            TableNames.senseGlossary,
            {...g.sqlValue, 'senseId': s.id},
            // There are some duplicate glossary, especially in
            // the other languages.
            conflictAlgorithm: ConflictAlgorithm.ignore,
          );
        }
      }
      for (final xref in s.seeAlso) {
        final targets = _xrefTargets(xref, entriesByKanji, entriesByReading);
        for (final target in targets) {
          b.insert(
            TableNames.senseSeeAlso,
            {
              'senseId': s.id,
              'xrefEntryId': target.id,
              'seeAlsoKanji': xref.kanjiRef,
              'seeAlsoReading': xref.readingRef,
              'seeAlsoSense': xref.senseNum,
            },
          );
        }
      }
      for (final ant in s.antonyms) {
        final targets = _xrefTargets(ant, entriesByKanji, entriesByReading);
        for (final target in targets) {
          b.insert(
            TableNames.senseAntonyms,
            {
              'senseId': s.id,
              'xrefEntryId': target.id,
              'antonymKanji': ant.kanjiRef,
              'antonymReading': ant.readingRef,
              'antonymSense': ant.senseNum,
            },
          );
        }
      }
    }
  }
  await b.commit();
}
/// Imports `data/JMdict.xml` into [db].
///
/// Reads the file, parses it, takes the `JMdict` element as the root,
/// converts it into entries and writes them to the database, printing a
/// progress line before each step.
Future<void> addDataFromJMdict(Database db) async {
  print('[JMdict] Reading file...');
  final xmlText = File('data/JMdict.xml').readAsStringSync();

  print('[JMdict] Parsing XML...');
  final document = XmlDocument.parse(xmlText);
  final jmdictRoot = document.getElement('JMdict')!;

  print('[JMdict] Transforming data...');
  final parsedEntries = transformXML(jmdictRoot);

  print('[JMdict] Writing to database...');
  await insertIntoDB(parsedEntries, db);
}

284
bin/kanjidic/objects.dart Normal file
View File

@@ -0,0 +1,284 @@
import '../objects.dart';
/// Names of the SQL tables that the KANJIDIC importer writes to.
class TableNames {
  static const character = 'KANJIDIC_Character';
  static const radicalName = 'KANJIDIC_RadicalName';
  static const codepoint = 'KANJIDIC_Codepoint';
  static const radical = 'KANJIDIC_Radical';
  static const strokeMiscount = 'KANJIDIC_StrokeMiscount';
  static const variant = 'KANJIDIC_Variant';

  // The two dictionary-reference tables carry a leading underscore.
  static const dictionaryReference = '_KANJIDIC_DictionaryReference_Part1';
  static const dictionaryReferenceMoro = '_KANJIDIC_DictionaryReference_Moro';

  static const queryCode = 'KANJIDIC_QueryCode';
  static const reading = 'KANJIDIC_Reading';
  static const kunyomi = 'KANJIDIC_Kunyomi';
  static const onyomi = 'KANJIDIC_Onyomi';
  static const meaning = 'KANJIDIC_Meaning';
  static const nanori = 'KANJIDIC_Nanori';
}
/// A code point value of a kanji in one encoding scheme.
class CodePoint extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The kind of code point.
  final String type;

  /// The code point value, kept as text.
  final String codepoint;

  const CodePoint({
    required this.kanji,
    required this.type,
    required this.codepoint,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'type': type,
      'codepoint': codepoint,
    };
  }
}
/// A radical value of a kanji under one radical classification.
class Radical extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The kind of radical classification.
  final String type;

  /// The radical value, kept as text.
  final String radical;

  const Radical({
    required this.kanji,
    required this.type,
    required this.radical,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'type': type,
      'radical': radical,
    };
  }
}
/// An additional stroke count recorded for a kanji.
class StrokeMiscount extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The stroke count.
  final int strokeCount;

  const StrokeMiscount({
    required this.kanji,
    required this.strokeCount,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'strokeCount': strokeCount,
    };
  }
}
/// A variant reference of a kanji.
class Variant extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The kind of variant reference.
  final String type;

  /// The variant value, kept as text.
  final String variant;

  const Variant({
    required this.kanji,
    required this.type,
    required this.variant,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'type': type,
      'variant': variant,
    };
  }
}
/// A reference to a kanji in one dictionary.
class DictionaryReference extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The dictionary the reference belongs to.
  final String type;

  /// The reference value, kept as text.
  final String ref;

  const DictionaryReference({
    required this.kanji,
    required this.type,
    required this.ref,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'type': type,
      'ref': ref,
    };
  }
}
/// A dictionary reference that additionally carries a volume and a page.
class DictionaryReferenceMoro extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The reference value, kept as text.
  final String ref;

  /// Volume number, when available.
  final int? volume;

  /// Page number, when available.
  final int? page;

  const DictionaryReferenceMoro({
    required this.kanji,
    required this.ref,
    required this.volume,
    required this.page,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'ref': ref,
      'volume': volume,
      'page': page,
    };
  }
}
/// A query code of a kanji in one lookup system.
class QueryCode extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The code itself, kept as text.
  final String code;

  /// The lookup system the code belongs to.
  final String type;

  /// Misclassification marker, when the code carries one.
  final String? skipMisclassification;

  const QueryCode({
    required this.kanji,
    required this.code,
    required this.type,
    required this.skipMisclassification,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'code': code,
      'type': type,
      'skipMisclassification': skipMisclassification,
    };
  }
}
/// A reading of a kanji, tagged with the kind of reading.
class Reading extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The kind of reading.
  final String type;

  /// The reading text.
  final String reading;

  const Reading({
    required this.kanji,
    required this.type,
    required this.reading,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'type': type,
      'reading': reading,
    };
  }
}
/// A kun reading of a kanji.
class Kunyomi extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The reading text.
  final String yomi;

  /// Whether the reading is flagged as jouyou.
  final bool isJouyou;

  const Kunyomi({
    required this.kanji,
    required this.yomi,
    required this.isJouyou,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'yomi': yomi,
      'isJouyou': isJouyou,
    };
  }
}
/// An on reading of a kanji.
class Onyomi extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// The reading text.
  final String yomi;

  /// Whether the reading is flagged as jouyou.
  final bool isJouyou;

  /// Sub-type of the on reading, when given.
  final String? type;

  const Onyomi({
    required this.kanji,
    required this.yomi,
    required this.isJouyou,
    required this.type,
  });

  /// Column values for this row; a new map on every access.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'kanji': kanji,
      'yomi': yomi,
      'isJouyou': isJouyou,
      'type': type,
    };
  }
}
/// A meaning of a kanji in one language.
class Meaning extends SQLWritable {
  /// The kanji this row belongs to.
  final String kanji;

  /// Language code of the meaning; defaults to `'eng'`.
  final String language;

  /// The meaning text.
  final String meaning;

  // The `'eng'` default belongs to the language, not to the meaning text: a
  // meaning that silently defaults to the literal string "eng" is never
  // useful. Call sites that pass both arguments are unaffected.
  const Meaning({
    required this.kanji,
    required this.meaning,
    this.language = 'eng',
  });

  /// Column values for this row; a fresh map is built on every access.
  @override
  Map<String, Object?> get sqlValue => {
        'kanji': kanji,
        'language': language,
        'meaning': meaning,
      };
}
/// A kanji character with its scalar properties and all attached lists.
///
/// Only the scalar properties are part of [sqlValue]; the lists are written
/// to their own tables by the importer. All list fields default to empty.
class Character extends SQLWritable {
  /// The kanji itself.
  final String literal;

  /// Stroke count of the kanji.
  final int strokeCount;

  /// School grade, when given.
  final int? grade;

  /// Frequency rank, when given.
  final int? frequency;

  /// JLPT level, when given.
  final int? jlpt;

  /// Names of the kanji when used as a radical.
  final List<String> radicalName;

  /// Code points of the kanji.
  final List<CodePoint> codepoints;

  /// Radical classifications of the kanji.
  final List<Radical> radicals;

  /// Additional stroke counts recorded for the kanji.
  final List<int> strokeMiscounts;

  /// Variant references.
  final List<Variant> variants;

  /// Dictionary references without volume/page information.
  final List<DictionaryReference> dictionaryReferences;

  /// Dictionary references that carry volume/page information.
  final List<DictionaryReferenceMoro> dictionaryReferencesMoro;

  /// Query codes.
  final List<QueryCode> querycodes;

  /// Readings other than on and kun readings.
  final List<Reading> readings;

  /// On readings.
  final List<Onyomi> onyomi;

  /// Kun readings.
  final List<Kunyomi> kunyomi;

  /// Meanings.
  final List<Meaning> meanings;

  /// Name readings.
  final List<String> nanori;

  const Character({
    required this.literal,
    required this.strokeCount,
    this.grade,
    this.frequency,
    this.jlpt,
    this.radicalName = const [],
    this.codepoints = const [],
    this.radicals = const [],
    this.strokeMiscounts = const [],
    this.variants = const [],
    this.dictionaryReferences = const [],
    this.dictionaryReferencesMoro = const [],
    this.querycodes = const [],
    this.readings = const [],
    this.onyomi = const [],
    this.kunyomi = const [],
    this.meanings = const [],
    this.nanori = const [],
  });

  /// Column values for the character row; a fresh map on every access.
  @override
  Map<String, Object?> get sqlValue => {
        'literal': literal,
        'grade': grade,
        'strokeCount': strokeCount,
        'frequency': frequency,
        'jlpt': jlpt,
      };
}

231
bin/kanjidic/parser.dart Normal file
View File

@@ -0,0 +1,231 @@
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
import 'package:xml/xml.dart';
import 'package:collection/collection.dart';
import 'objects.dart';
/// Parses the text of the first descendant of [c] named [tag] as an integer.
///
/// Returns `null` when there is no such element or its text is not a number.
int? _firstDescendantInt(XmlElement c, String tag) =>
    int.tryParse(c.findAllElements(tag).firstOrNull?.innerText ?? '');

/// Converts every `character` child of [root] into a [Character].
///
/// The first `stroke_count` is the character's stroke count; any further
/// ones are collected as stroke miscounts. `reading` elements are split into
/// on readings (`ja_on`), kun readings (`ja_kun`) and all others, and
/// `dic_ref` elements into `moro` references and the rest.
List<Character> transformXML(XmlElement root) {
  final List<Character> result = [];
  for (final c in root.findElements('character')) {
    final kanji = c.findElements('literal').first.innerText;
    result.add(
      Character(
        literal: kanji,
        strokeCount:
            int.parse(c.findAllElements('stroke_count').first.innerText),
        // `grade`, `freq`, `jlpt` and `rad_name` are nested inside the `misc`
        // element, not direct children of `character`, so a descendant
        // search is required, as for the other fields below.
        grade: _firstDescendantInt(c, 'grade'),
        frequency: _firstDescendantInt(c, 'freq'),
        jlpt: _firstDescendantInt(c, 'jlpt'),
        radicalName:
            c.findAllElements('rad_name').map((e) => e.innerText).toList(),
        codepoints: c
            .findAllElements('cp_value')
            .map(
              (e) => CodePoint(
                kanji: kanji,
                type: e.getAttribute('cp_type')!,
                codepoint: e.innerText,
              ),
            )
            .toList(),
        radicals: c
            .findAllElements('rad_value')
            .map(
              (e) => Radical(
                kanji: kanji,
                type: e.getAttribute('rad_type')!,
                radical: e.innerText,
              ),
            )
            .toList(),
        strokeMiscounts: c
            .findAllElements('stroke_count')
            .skip(1)
            .map((e) => int.parse(e.innerText))
            .toList(),
        variants: c
            .findAllElements('variant')
            .map(
              (e) => Variant(
                kanji: kanji,
                type: e.getAttribute('var_type')!,
                variant: e.innerText,
              ),
            )
            .toList(),
        dictionaryReferences: c
            .findAllElements('dic_ref')
            .where((e) => e.getAttribute('dr_type') != 'moro')
            .map(
              (e) => DictionaryReference(
                kanji: kanji,
                type: e.getAttribute('dr_type')!,
                ref: e.innerText,
              ),
            )
            .toList(),
        dictionaryReferencesMoro: c
            .findAllElements('dic_ref')
            .where((e) => e.getAttribute('dr_type') == 'moro')
            .map(
              (e) => DictionaryReferenceMoro(
                kanji: kanji,
                ref: e.innerText,
                page: int.tryParse(e.getAttribute('m_page') ?? ''),
                volume: int.tryParse(e.getAttribute('m_vol') ?? ''),
              ),
            )
            .toList(),
        querycodes: c
            .findAllElements('q_code')
            .map(
              (e) => QueryCode(
                kanji: kanji,
                code: e.innerText,
                type: e.getAttribute('qc_type')!,
                skipMisclassification: e.getAttribute('skip_misclass'),
              ),
            )
            .toList(),
        readings: c
            .findAllElements('reading')
            .where(
              (e) => !['ja_on', 'ja_kun'].contains(e.getAttribute('r_type')),
            )
            .map(
              (e) => Reading(
                kanji: kanji,
                type: e.getAttribute('r_type')!,
                reading: e.innerText,
              ),
            )
            .toList(),
        kunyomi: c
            .findAllElements('reading')
            .where((e) => e.getAttribute('r_type') == 'ja_kun')
            .map(
              (e) => Kunyomi(
                kanji: kanji,
                yomi: e.innerText,
                isJouyou: e.getAttribute('r_status') == 'jy',
              ),
            )
            .toList(),
        onyomi: c
            .findAllElements('reading')
            .where((e) => e.getAttribute('r_type') == 'ja_on')
            .map(
              (e) => Onyomi(
                kanji: kanji,
                yomi: e.innerText,
                isJouyou: e.getAttribute('r_status') == 'jy',
                type: e.getAttribute('on_type'),
              ),
            )
            .toList(),
        meanings: c
            .findAllElements('meaning')
            .map(
              (e) => Meaning(
                kanji: kanji,
                language: e.getAttribute('m_lang') ?? 'eng',
                meaning: e.innerText,
              ),
            )
            .toList(),
        nanori: c.findAllElements('nanori').map((e) => e.innerText).toList(),
      ),
    );
  }
  return result;
}
/// Writes [characters] and all of their attached lists to [db] in one batch.
///
/// The returned future completes only after the batch has been committed.
Future<void> insertIntoDB(List<Character> characters, Database db) async {
  final b = db.batch();
  for (final c in characters) {
    b.insert(TableNames.character, c.sqlValue);
    for (final n in c.radicalName) {
      b.insert(TableNames.radicalName, {'kanji': c.literal, 'name': n});
    }
    for (final cp in c.codepoints) {
      b.insert(TableNames.codepoint, cp.sqlValue);
    }
    for (final r in c.radicals) {
      b.insert(TableNames.radical, r.sqlValue);
    }
    for (final sm in c.strokeMiscounts) {
      b.insert(
        TableNames.strokeMiscount,
        {'kanji': c.literal, 'strokeCount': sm},
      );
    }
    for (final v in c.variants) {
      b.insert(TableNames.variant, v.sqlValue);
    }
    for (final dr in c.dictionaryReferences) {
      // There are duplicate entries here
      b.insert(
        TableNames.dictionaryReference,
        dr.sqlValue,
        conflictAlgorithm: ConflictAlgorithm.ignore,
      );
    }
    for (final drm in c.dictionaryReferencesMoro) {
      b.insert(TableNames.dictionaryReferenceMoro, drm.sqlValue);
    }
    for (final q in c.querycodes) {
      b.insert(TableNames.queryCode, q.sqlValue);
    }
    for (final r in c.readings) {
      b.insert(TableNames.reading, r.sqlValue);
    }
    for (final k in c.kunyomi) {
      b.insert(TableNames.kunyomi, k.sqlValue);
    }
    for (final o in c.onyomi) {
      b.insert(TableNames.onyomi, o.sqlValue);
    }
    for (final m in c.meanings) {
      b.insert(TableNames.meaning, m.sqlValue);
    }
    for (final n in c.nanori) {
      b.insert(TableNames.nanori, {'kanji': c.literal, 'nanori': n});
    }
  }
  // Await the commit: otherwise this function returns before anything has
  // been written, and a failing insert turns into an unhandled async error
  // that the caller cannot observe.
  await b.commit();
}
/// Imports `data/kanjidic2.xml` into [db].
///
/// Reads the file, parses it, takes the `kanjidic2` element as the root,
/// converts it into characters and writes them to the database, printing a
/// progress line before each step.
Future<void> addDataFromKANJIDIC(Database db) async {
  print('[KANJIDIC2] Reading file...');
  final xmlText = File('data/kanjidic2.xml').readAsStringSync();

  print('[KANJIDIC2] Parsing XML...');
  final document = XmlDocument.parse(xmlText);
  final kanjidicRoot = document.getElement('kanjidic2')!;

  print('[KANJIDIC2] Transforming data...');
  final characters = transformXML(kanjidicRoot);

  print('[KANJIDIC2] Writing to database...');
  await insertIntoDB(characters, db);
}

5
bin/objects.dart Normal file
View File

@@ -0,0 +1,5 @@
/// Base class for objects that expose their database columns as a map.
abstract class SQLWritable {
/// Const constructor, so that subclasses may declare const constructors.
const SQLWritable();
/// Maps column names to the values to store for this object.
Map<String, Object?> get sqlValue;
}

13
bin/radkfile/objects.dart Normal file
View File

@@ -0,0 +1,13 @@
/// Associates a radical with a kanji.
///
/// NOTE(review): nothing in the visible importer code constructs this class.
class Radical {
/// The radical.
final String radical;
/// The kanji associated with the radical.
final String kanji;
// TODO: the purpose of `something` is not documented - give it a meaningful
// name or remove it.
final String something;
const Radical({
required this.radical,
required this.kanji,
required this.something,
});
}

32
bin/radkfile/parser.dart Normal file
View File

@@ -0,0 +1,32 @@
import 'dart:io';
import 'package:sqflite_common/sqlite_api.dart';
/// Imports `data/radkfile_utf8` into the `RADKFILE` table of [db].
///
/// The file content is split on `$`; the first two chunks are skipped. For
/// every remaining chunk, the character at index 1 is taken as the radical
/// and every character after the chunk's first line (except newlines) as a
/// kanji that contains it. One row is inserted per distinct radical/kanji
/// pair. The returned future completes once the batch has been committed.
Future<void> addDataFromRADKFILE(Database db) async {
  final String content = File('data/radkfile_utf8').readAsStringSync();
  // NOTE(review): without `multiLine: true`, `^`/`$` anchor to the whole
  // input, so this only removes anything when the file is a single `#` line.
  // It is left unchanged on purpose: `skip(2)` below may rely on the header
  // comments still being present - confirm against the data file before
  // touching either.
  final Iterable<String> blocks =
      content.replaceAll(RegExp(r'^#.*$'), '').split(r'$').skip(2);

  print('[RADKFILE] Writing to database...');
  final b = db.batch();
  for (final block in blocks) {
    final String radical = block[1];
    final List<String> kanjiList = block
        .replaceFirst(RegExp(r'.*\n'), '')
        .split('')
      ..removeWhere((e) => e == '' || e == '\n');
    // `toSet()` drops repeated kanji within one block.
    for (final kanji in kanjiList.toSet()) {
      b.insert(
        'RADKFILE',
        {
          'radical': radical,
          'kanji': kanji,
        },
      );
    }
  }
  // Await the commit so that the caller only continues once the rows are
  // written and so that insert errors reach the caller.
  await b.commit();
}

View File

@@ -0,0 +1,622 @@
// Source: https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
/// The syllabic n kana. An empty string here would make the "n before
/// ya/yu/yo" check and the `nn` + vowel conversion meaningless.
const hiragana_syllabic_n = 'ん';

/// The small tsu kana, which marks a doubled consonant.
const hiragana_small_tsu = 'っ';
/// Maps hiragana (single kana and two-kana combinations) to romaji.
///
/// Every key must be a distinct, non-empty kana string: a const map with
/// repeated keys does not compile.
const Map<String, String> hiragana_to_latin = {
  'あ': 'a',
  'い': 'i',
  'う': 'u',
  'え': 'e',
  'お': 'o',
  'か': 'ka',
  'き': 'ki',
  'く': 'ku',
  'け': 'ke',
  'こ': 'ko',
  'が': 'ga',
  'ぎ': 'gi',
  'ぐ': 'gu',
  'げ': 'ge',
  'ご': 'go',
  'さ': 'sa',
  'し': 'shi',
  'す': 'su',
  'せ': 'se',
  'そ': 'so',
  'ざ': 'za',
  'じ': 'ji',
  'ず': 'zu',
  'ぜ': 'ze',
  'ぞ': 'zo',
  'た': 'ta',
  'ち': 'chi',
  'つ': 'tsu',
  'て': 'te',
  'と': 'to',
  'だ': 'da',
  'ぢ': 'ji',
  'づ': 'zu',
  'で': 'de',
  'ど': 'do',
  'な': 'na',
  'に': 'ni',
  'ぬ': 'nu',
  'ね': 'ne',
  'の': 'no',
  'は': 'ha',
  'ひ': 'hi',
  'ふ': 'fu',
  'へ': 'he',
  'ほ': 'ho',
  'ば': 'ba',
  'び': 'bi',
  'ぶ': 'bu',
  'べ': 'be',
  'ぼ': 'bo',
  'ぱ': 'pa',
  'ぴ': 'pi',
  'ぷ': 'pu',
  'ぺ': 'pe',
  'ぽ': 'po',
  'ま': 'ma',
  'み': 'mi',
  'む': 'mu',
  'め': 'me',
  'も': 'mo',
  'や': 'ya',
  'ゆ': 'yu',
  'よ': 'yo',
  'ら': 'ra',
  'り': 'ri',
  'る': 'ru',
  'れ': 're',
  'ろ': 'ro',
  'わ': 'wa',
  'うぃ': 'whi',
  'うぇ': 'whe',
  'を': 'wo',
  'ゑ': 'we',
  'ゐ': 'wi',
  'ー': '-',
  'ん': 'n',
  'きゃ': 'kya',
  'きゅ': 'kyu',
  'きょ': 'kyo',
  'きぇ': 'kye',
  'きぃ': 'kyi',
  'ぎゃ': 'gya',
  'ぎゅ': 'gyu',
  'ぎょ': 'gyo',
  'ぎぇ': 'gye',
  'ぎぃ': 'gyi',
  'くぁ': 'kwa',
  'くぃ': 'kwi',
  'くぅ': 'kwu',
  'くぇ': 'kwe',
  'くぉ': 'kwo',
  // NOTE(review): the rest of this row uses `gw-`; `qwa` looks like a typo
  // for `gwa` - confirm before changing, since stored romaji depends on it.
  'ぐぁ': 'qwa',
  'ぐぃ': 'gwi',
  'ぐぅ': 'gwu',
  'ぐぇ': 'gwe',
  'ぐぉ': 'gwo',
  'しゃ': 'sha',
  'しぃ': 'syi',
  'しゅ': 'shu',
  'しぇ': 'she',
  'しょ': 'sho',
  'じゃ': 'ja',
  'じゅ': 'ju',
  'じぇ': 'jye',
  'じょ': 'jo',
  'じぃ': 'jyi',
  'すぁ': 'swa',
  'すぃ': 'swi',
  'すぅ': 'swu',
  'すぇ': 'swe',
  'すぉ': 'swo',
  'ちゃ': 'cha',
  'ちゅ': 'chu',
  'ちぇ': 'tye',
  'ちょ': 'cho',
  'ちぃ': 'tyi',
  'ぢゃ': 'ja',
  'ぢぃ': 'dyi',
  'ぢゅ': 'ju',
  'ぢぇ': 'dye',
  'ぢょ': 'jo',
  'つぁ': 'tsa',
  'つぃ': 'tsi',
  'つぇ': 'tse',
  'つぉ': 'tso',
  'てゃ': 'tha',
  'てぃ': 'thi',
  'てゅ': 'thu',
  'てぇ': 'the',
  'てょ': 'tho',
  'とぁ': 'twa',
  'とぃ': 'twi',
  'とぅ': 'twu',
  'とぇ': 'twe',
  'とぉ': 'two',
  'でゃ': 'dha',
  'でぃ': 'dhi',
  'でゅ': 'dhu',
  'でぇ': 'dhe',
  'でょ': 'dho',
  'どぁ': 'dwa',
  'どぃ': 'dwi',
  'どぅ': 'dwu',
  'どぇ': 'dwe',
  'どぉ': 'dwo',
  'にゃ': 'nya',
  'にゅ': 'nyu',
  'にょ': 'nyo',
  'にぇ': 'nye',
  'にぃ': 'nyi',
  'ひゃ': 'hya',
  'ひぃ': 'hyi',
  'ひゅ': 'hyu',
  'ひぇ': 'hye',
  'ひょ': 'hyo',
  'びゃ': 'bya',
  'びぃ': 'byi',
  'びゅ': 'byu',
  'びぇ': 'bye',
  'びょ': 'byo',
  'ぴゃ': 'pya',
  'ぴぃ': 'pyi',
  'ぴゅ': 'pyu',
  'ぴぇ': 'pye',
  'ぴょ': 'pyo',
  'ふぁ': 'fwa',
  'ふぃ': 'fyi',
  'ふぇ': 'fye',
  'ふぉ': 'fwo',
  'ふぅ': 'fwu',
  'ふゃ': 'fya',
  'ふゅ': 'fyu',
  'ふょ': 'fyo',
  'みゃ': 'mya',
  'みぃ': 'myi',
  'みゅ': 'myu',
  'みぇ': 'mye',
  'みょ': 'myo',
  'りゃ': 'rya',
  'りぃ': 'ryi',
  'りゅ': 'ryu',
  'りぇ': 'rye',
  'りょ': 'ryo',
  'ゔぁ': 'va',
  'ゔぃ': 'vyi',
  'ゔ': 'vu',
  'ゔぇ': 'vye',
  'ゔぉ': 'vo',
  'ゔゃ': 'vya',
  'ゔゅ': 'vyu',
  'ゔょ': 'vyo',
  'うぁ': 'wha',
  'いぇ': 'ye',
  'うぉ': 'who',
  'ぁ': 'xa',
  'ぃ': 'xi',
  'ぅ': 'xu',
  'ぇ': 'xe',
  'ぉ': 'xo',
  'ゕ': 'xka',
  'ゖ': 'xke',
  'ゎ': 'xwa'
};
/// Maps romaji syllables (several spelling variants per kana) and a few
/// punctuation shortcuts to hiragana.
///
/// Every value must be non-empty: an empty value would make the matching
/// romaji silently disappear during conversion.
const Map<String, String> latin_to_hiragana = {
  'a': 'あ',
  'i': 'い',
  'u': 'う',
  'e': 'え',
  'o': 'お',
  'ka': 'か',
  'ki': 'き',
  'ku': 'く',
  'ke': 'け',
  'ko': 'こ',
  'ga': 'が',
  'gi': 'ぎ',
  'gu': 'ぐ',
  'ge': 'げ',
  'go': 'ご',
  'sa': 'さ',
  'si': 'し',
  'shi': 'し',
  'su': 'す',
  'se': 'せ',
  'so': 'そ',
  'za': 'ざ',
  'zi': 'じ',
  'ji': 'じ',
  'zu': 'ず',
  'ze': 'ぜ',
  'zo': 'ぞ',
  'ta': 'た',
  'ti': 'ち',
  'chi': 'ち',
  'tu': 'つ',
  'tsu': 'つ',
  'te': 'て',
  'to': 'と',
  'da': 'だ',
  'di': 'ぢ',
  'du': 'づ',
  'dzu': 'づ',
  'de': 'で',
  'do': 'ど',
  'na': 'な',
  'ni': 'に',
  'nu': 'ぬ',
  'ne': 'ね',
  'no': 'の',
  'ha': 'は',
  'hi': 'ひ',
  'hu': 'ふ',
  'fu': 'ふ',
  'he': 'へ',
  'ho': 'ほ',
  'ba': 'ば',
  'bi': 'び',
  'bu': 'ぶ',
  'be': 'べ',
  'bo': 'ぼ',
  'pa': 'ぱ',
  'pi': 'ぴ',
  'pu': 'ぷ',
  'pe': 'ぺ',
  'po': 'ぽ',
  'ma': 'ま',
  'mi': 'み',
  'mu': 'む',
  'me': 'め',
  'mo': 'も',
  'ya': 'や',
  'yu': 'ゆ',
  'yo': 'よ',
  'ra': 'ら',
  'ri': 'り',
  'ru': 'る',
  're': 'れ',
  'ro': 'ろ',
  'la': 'ら',
  'li': 'り',
  'lu': 'る',
  'le': 'れ',
  'lo': 'ろ',
  'wa': 'わ',
  'wi': 'うぃ',
  'we': 'うぇ',
  'wo': 'を',
  'wye': 'ゑ',
  'wyi': 'ゐ',
  '-': 'ー',
  'n': 'ん',
  'nn': 'ん',
  "n'": 'ん',
  'kya': 'きゃ',
  'kyu': 'きゅ',
  'kyo': 'きょ',
  'kye': 'きぇ',
  'kyi': 'きぃ',
  'gya': 'ぎゃ',
  'gyu': 'ぎゅ',
  'gyo': 'ぎょ',
  'gye': 'ぎぇ',
  'gyi': 'ぎぃ',
  'kwa': 'くぁ',
  'kwi': 'くぃ',
  'kwu': 'くぅ',
  'kwe': 'くぇ',
  'kwo': 'くぉ',
  'gwa': 'ぐぁ',
  'gwi': 'ぐぃ',
  'gwu': 'ぐぅ',
  'gwe': 'ぐぇ',
  'gwo': 'ぐぉ',
  'qwa': 'ぐぁ',
  'qwi': 'ぐぃ',
  'qwu': 'ぐぅ',
  'qwe': 'ぐぇ',
  'qwo': 'ぐぉ',
  'sya': 'しゃ',
  'syi': 'しぃ',
  'syu': 'しゅ',
  'sye': 'しぇ',
  'syo': 'しょ',
  'sha': 'しゃ',
  'shu': 'しゅ',
  'she': 'しぇ',
  'sho': 'しょ',
  'ja': 'じゃ',
  'ju': 'じゅ',
  'je': 'じぇ',
  'jo': 'じょ',
  'jya': 'じゃ',
  'jyi': 'じぃ',
  'jyu': 'じゅ',
  'jye': 'じぇ',
  'jyo': 'じょ',
  'zya': 'じゃ',
  'zyu': 'じゅ',
  'zyo': 'じょ',
  'zye': 'じぇ',
  'zyi': 'じぃ',
  'swa': 'すぁ',
  'swi': 'すぃ',
  'swu': 'すぅ',
  'swe': 'すぇ',
  'swo': 'すぉ',
  'cha': 'ちゃ',
  'chu': 'ちゅ',
  'che': 'ちぇ',
  'cho': 'ちょ',
  'cya': 'ちゃ',
  'cyi': 'ちぃ',
  'cyu': 'ちゅ',
  'cye': 'ちぇ',
  'cyo': 'ちょ',
  'tya': 'ちゃ',
  'tyi': 'ちぃ',
  'tyu': 'ちゅ',
  'tye': 'ちぇ',
  'tyo': 'ちょ',
  'dya': 'ぢゃ',
  'dyi': 'ぢぃ',
  'dyu': 'ぢゅ',
  'dye': 'ぢぇ',
  'dyo': 'ぢょ',
  'tsa': 'つぁ',
  'tsi': 'つぃ',
  'tse': 'つぇ',
  'tso': 'つぉ',
  'tha': 'てゃ',
  'thi': 'てぃ',
  'thu': 'てゅ',
  'the': 'てぇ',
  'tho': 'てょ',
  'twa': 'とぁ',
  'twi': 'とぃ',
  'twu': 'とぅ',
  'twe': 'とぇ',
  'two': 'とぉ',
  'dha': 'でゃ',
  'dhi': 'でぃ',
  'dhu': 'でゅ',
  'dhe': 'でぇ',
  'dho': 'でょ',
  'dwa': 'どぁ',
  'dwi': 'どぃ',
  'dwu': 'どぅ',
  'dwe': 'どぇ',
  'dwo': 'どぉ',
  'nya': 'にゃ',
  'nyu': 'にゅ',
  'nyo': 'にょ',
  'nye': 'にぇ',
  'nyi': 'にぃ',
  'hya': 'ひゃ',
  'hyi': 'ひぃ',
  'hyu': 'ひゅ',
  'hye': 'ひぇ',
  'hyo': 'ひょ',
  'bya': 'びゃ',
  'byi': 'びぃ',
  'byu': 'びゅ',
  'bye': 'びぇ',
  'byo': 'びょ',
  'pya': 'ぴゃ',
  'pyi': 'ぴぃ',
  'pyu': 'ぴゅ',
  'pye': 'ぴぇ',
  'pyo': 'ぴょ',
  'fa': 'ふぁ',
  'fi': 'ふぃ',
  'fe': 'ふぇ',
  'fo': 'ふぉ',
  'fwa': 'ふぁ',
  'fwi': 'ふぃ',
  'fwu': 'ふぅ',
  'fwe': 'ふぇ',
  'fwo': 'ふぉ',
  'fya': 'ふゃ',
  'fyi': 'ふぃ',
  'fyu': 'ふゅ',
  'fye': 'ふぇ',
  'fyo': 'ふょ',
  'mya': 'みゃ',
  'myi': 'みぃ',
  'myu': 'みゅ',
  'mye': 'みぇ',
  'myo': 'みょ',
  'rya': 'りゃ',
  'ryi': 'りぃ',
  'ryu': 'りゅ',
  'rye': 'りぇ',
  'ryo': 'りょ',
  'lya': 'りゃ',
  'lyu': 'りゅ',
  'lyo': 'りょ',
  'lye': 'りぇ',
  'lyi': 'りぃ',
  'va': 'ゔぁ',
  'vi': 'ゔぃ',
  'vu': 'ゔ',
  've': 'ゔぇ',
  'vo': 'ゔぉ',
  'vya': 'ゔゃ',
  'vyi': 'ゔぃ',
  'vyu': 'ゔゅ',
  'vye': 'ゔぇ',
  'vyo': 'ゔょ',
  'wha': 'うぁ',
  'whi': 'うぃ',
  'ye': 'いぇ',
  'whe': 'うぇ',
  'who': 'うぉ',
  'xa': 'ぁ',
  'xi': 'ぃ',
  'xu': 'ぅ',
  'xe': 'ぇ',
  'xo': 'ぉ',
  'xya': 'ゃ',
  'xyu': 'ゅ',
  'xyo': 'ょ',
  'xtu': 'っ',
  'xtsu': 'っ',
  'xka': 'ゕ',
  'xke': 'ゖ',
  'xwa': 'ゎ',
  // NOTE(review): possibly meant to be a full-width space - confirm.
  '@@': ' ',
  '#[': '「',
  '#]': '」',
  '#,': '、',
  '#.': '。',
  '#/': '・',
};
/// Whether [for_conversion] is exactly [hiragana_small_tsu].
bool _smallTsu(String for_conversion) {
  return hiragana_small_tsu == for_conversion;
}
/// Whether [for_conversion] is [hiragana_syllabic_n] and the second
/// character of [kana] is one of `や`, `ゆ`, `よ`.
bool _nFollowedByYuYeYo(String for_conversion, String kana) {
  if (for_conversion != hiragana_syllabic_n) return false;
  if (kana.length <= 1) return false;
  final following = kana.substring(1, 2);
  return 'やゆよ'.contains(following);
}
/// Converts [hiragana] to romaji using [hiragana_to_latin].
///
/// At every position the two-character chunk is tried before the
/// one-character chunk. A small tsu doubles the first letter of the next
/// converted mora; a syllabic n directly before `や`/`ゆ`/`よ` becomes `n'`;
/// characters without a table entry are copied through unchanged.
String transliterateHiraganaToLatin(String hiragana) {
  final romaji = StringBuffer();
  var cursor = 0;
  var doubleNextConsonant = false;

  while (cursor < hiragana.length) {
    final rest = hiragana.substring(cursor);
    for (var width = rest.length > 1 ? 2 : 1; width >= 1; width--) {
      final chunk = rest.substring(0, width);

      if (_smallTsu(chunk)) {
        doubleNextConsonant = true;
        cursor += width;
        break;
      }

      final mora =
          _nFollowedByYuYeYo(chunk, rest) ? "n'" : hiragana_to_latin[chunk];
      if (mora != null) {
        if (doubleNextConsonant) {
          doubleNextConsonant = false;
          romaji.write(mora.substring(0, 1));
        }
        romaji.write(mora);
        cursor += width;
        break;
      }

      // Nothing matched even for a single character: pass it through.
      if (width == 1) {
        romaji.write(chunk);
        cursor += width;
      }
    }
  }

  return romaji.toString();
}
/// Whether [for_conversion] is `nn` followed by exactly one of the vowels
/// `a`, `i`, `u`, `e`, `o`.
bool _doubleNFollowedByAIUEO(String for_conversion) {
  final pattern = RegExp(r'^nn[aiueo]$');
  return pattern.hasMatch(for_conversion);
}
/// Whether [latin_to_hiragana] has an entry for [for_conversion].
bool _hasTableMatch(String for_conversion) {
  final match = latin_to_hiragana[for_conversion];
  return match != null;
}
/// Whether [for_conversion] starts a doubled consonant: either the literal
/// `tch`, or, when [length] is 2, the same listed consonant twice.
bool _hasDoubleConsonant(String for_conversion, int length) {
  if (for_conversion == 'tch') return true;
  if (length != 2) return false;
  return RegExp(r'^([kgsztdnbpmyrlwchf])\1$').hasMatch(for_conversion);
}
/// Converts romaji in [latin] to hiragana using [latin_to_hiragana].
///
/// The input is lower-cased and `mb`/`mp` are rewritten to `nb`/`np` first.
/// At every position chunks of three, two and one characters are tried in
/// that order. `nn` + vowel emits a syllabic n and consumes one character, a
/// doubled consonant emits a small tsu and consumes one character, and
/// characters that match nothing are copied through unchanged.
String transliterateLatinToHiragana(String latin) {
  final source =
      latin.toLowerCase().replaceAll('mb', 'nb').replaceAll('mp', 'np');
  final kana = StringBuffer();
  var cursor = 0;

  while (cursor < source.length) {
    final remaining = source.length - cursor;
    for (var width = remaining < 3 ? remaining : 3; width >= 1; width--) {
      final chunk = source.substring(cursor, cursor + width);
      String? mora;
      var consumed = width;

      if (_doubleNFollowedByAIUEO(chunk)) {
        mora = hiragana_syllabic_n;
        consumed = 1;
      } else if (_hasTableMatch(chunk)) {
        mora = latin_to_hiragana[chunk];
      } else if (_hasDoubleConsonant(chunk, width)) {
        mora = hiragana_small_tsu;
        consumed = 1;
      }

      if (mora != null) {
        kana.write(mora);
        cursor += consumed;
        break;
      }

      // Nothing matched even for a single character: pass it through.
      if (width == 1) {
        kana.write(chunk);
        cursor += 1;
      }
    }
  }

  return kana.toString();
}
/// Shifts every UTF-16 code unit of [text] that lies within
/// [rangeStart]..[rangeEnd] (inclusive) by [distance]; all other code units
/// are kept as they are.
String _transposeCodepointsInRange(
  String text,
  int distance,
  int rangeStart,
  int rangeEnd,
) {
  final shifted = <int>[
    for (final unit in text.codeUnits)
      if (unit >= rangeStart && unit <= rangeEnd) unit + distance else unit,
  ];
  return String.fromCharCodes(shifted);
}
/// Converts [kana] to romaji: katakana is first mapped to hiragana, then the
/// result is transliterated.
String transliterateKanaToLatin(String kana) {
  final asHiragana = transliterateKatakanaToHiragana(kana);
  return transliterateHiraganaToLatin(asHiragana);
}
/// Converts romaji in [latin] to katakana by going through hiragana.
String transliterateLatinToKatakana(String latin) {
  final asHiragana = transliterateLatinToHiragana(latin);
  return transliterateHiraganaToKatakana(asHiragana);
}
/// Shifts code units 12449..12534 (U+30A1..U+30F6) of [katakana] down by 96;
/// everything else is left untouched.
String transliterateKatakanaToHiragana(String katakana) {
  return _transposeCodepointsInRange(katakana, -96, 12449, 12534);
}
/// Shifts code units 12353..12438 (U+3041..U+3096) of [hiragana] up by 96;
/// everything else is left untouched.
String transliterateHiraganaToKatakana(String hiragana) {
  return _transposeCodepointsInRange(hiragana, 96, 12353, 12438);
}
/// Maps code units 65281..65374 (U+FF01..U+FF5E) down by 65248 and then code
/// unit 12288 (U+3000) down by 12256, i.e. onto 33..126 and 32.
///
/// The parameter holds the text to convert, despite its name.
String transliterateFullwidthRomajiToHalfwidth(String halfwidth) {
  final symbolsShifted =
      _transposeCodepointsInRange(halfwidth, -65248, 65281, 65374);
  return _transposeCodepointsInRange(symbolsShifted, -12256, 12288, 12288);
}
/// Maps code units 33..126 up by 65248 and then code unit 32 up by 12256,
/// i.e. onto 65281..65374 (U+FF01..U+FF5E) and 12288 (U+3000).
String transliterateHalfwidthRomajiToFullwidth(String halfwidth) {
  final symbolsShifted = _transposeCodepointsInRange(halfwidth, 65248, 33, 126);
  return _transposeCodepointsInRange(symbolsShifted, 12256, 32, 32);
}