Implement word search

This commit is contained in:
2025-04-25 22:47:06 +02:00
parent 1c2f90a617
commit b6410c717f
24 changed files with 1352 additions and 325 deletions

View File

@@ -15,9 +15,9 @@ abstract class TableNames {
static const String senseField = 'JMdict_SenseField';
static const String senseGlossary = 'JMdict_SenseGlossary';
static const String senseInfo = 'JMdict_SenseInfo';
static const String senseLanguageSource = 'JMdict_SenseLanguageSource';
static const String senseMisc = 'JMdict_SenseMisc';
static const String sensePOS = 'JMdict_SensePOS';
static const String senseLanguageSource = 'JMdict_SenseLanguageSource';
static const String senseRestrictedToKanji = 'JMdict_SenseRestrictedToKanji';
static const String senseRestrictedToReading =
'JMdict_SenseRestrictedToReading';
@@ -216,7 +216,22 @@ class Sense extends SQLWritable {
});
@override
Map<String, Object?> get sqlValue => {};
Map<String, Object?> get sqlValue => {
'id': id,
'orderNum': orderNum,
};
bool get isEmpty => antonyms.isEmpty &&
dialects.isEmpty &&
fields.isEmpty &&
info.isEmpty &&
languageSource.isEmpty &&
glossary.isEmpty &&
misc.isEmpty &&
pos.isEmpty &&
restrictedToKanji.isEmpty &&
restrictedToReading.isEmpty &&
seeAlso.isEmpty;
}
class Entry extends SQLWritable {

View File

@@ -196,58 +196,65 @@ List<Entry> parseXML(XmlElement root) {
for (final (orderNum, sense) in entry.findElements('sense').indexed) {
senseId++;
senses.add(
Sense(
id: senseId,
orderNum: orderNum + 1,
restrictedToKanji:
sense.findElements('stagk').map((e) => e.innerText).toList(),
restrictedToReading:
sense.findElements('stagr').map((e) => e.innerText).toList(),
pos: sense
.findElements('pos')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
misc: sense
.findElements('misc')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
dialects: sense
.findElements('dial')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
info: sense.findElements('s_inf').map((e) => e.innerText).toList(),
languageSource: sense
.findElements('lsource')
.map(
(e) => LanguageSource(
language: e.getAttribute('xml:lang') ?? 'eng',
fullyDescribesSense: e.getAttribute('ls_type') == 'part',
constructedFromSmallerWords:
e.getAttribute('ls_wasei') == 'y',
),
)
.toList(),
glossary: sense
.findElements('gloss')
.map(
(e) => Glossary(
language: e.getAttribute('xml:lang') ?? 'eng',
phrase: e.innerText,
type: e.getAttribute('g_type'),
),
)
.toList(),
antonyms: sense
.findElements('ant')
.map((e) => parseXrefParts(e.innerText))
.toList(),
seeAlso: sense
.findElements('xref')
.map((e) => parseXrefParts(e.innerText))
.toList(),
),
final result = Sense(
id: senseId,
orderNum: orderNum + 1,
restrictedToKanji:
sense.findElements('stagk').map((e) => e.innerText).toList(),
restrictedToReading:
sense.findElements('stagr').map((e) => e.innerText).toList(),
pos: sense
.findElements('pos')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
misc: sense
.findElements('misc')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
dialects: sense
.findElements('dial')
.map((e) => e.innerText.substring(1, e.innerText.length - 1))
.toList(),
info: sense.findElements('s_inf').map((e) => e.innerText).toList(),
languageSource: sense
.findElements('lsource')
.map(
(e) => LanguageSource(
language: e.getAttribute('xml:lang') ?? 'eng',
fullyDescribesSense: e.getAttribute('ls_type') == 'part',
constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y',
),
)
.toList(),
glossary: sense
.findElements('gloss')
.map(
(e) => Glossary(
language: e.getAttribute('xml:lang') ?? 'eng',
phrase: e.innerText,
type: e.getAttribute('g_type'),
),
)
.toList(),
antonyms: sense
.findElements('ant')
.map((e) => parseXrefParts(e.innerText))
.toList(),
seeAlso: sense
.findElements('xref')
.map((e) => parseXrefParts(e.innerText))
.toList(),
);
if (result.isEmpty) {
print(
'WARNING: Sense $senseId for entry $entryId is empty, '
'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, '
'reading: ${readingEls.map((e) => e.reading).join(', ')}',
);
} else {
senses.add(result);
}
}
entries.add(
@@ -264,7 +271,7 @@ List<Entry> parseXML(XmlElement root) {
}
Future<void> insertIntoDB(List<Entry> entries, Database db) async {
print(' [JMdict] Batch 1');
print(' [JMdict] Batch 1 - Kanji and readings');
Batch b = db.batch();
for (final e in entries) {
b.insert(TableNames.entry, e.sqlValue);
@@ -326,32 +333,26 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
for (final s in e.senses) {
for (final g in s.glossary) {
if (g.language == "eng")
b.insert(
TableNames.entryByEnglish,
{
'entryId': e.id,
'english': g.phrase,
},
// Some entries have the same reading twice with difference in katakana and hiragana
conflictAlgorithm: ConflictAlgorithm.ignore,
);
b.insert(
TableNames.entryByEnglish,
{
'entryId': e.id,
'english': g.phrase,
},
// Some entries have the same reading twice with difference in katakana and hiragana
conflictAlgorithm: ConflictAlgorithm.ignore,
);
}
}
}
await b.commit();
print(' [JMdict] Batch 2');
print(' [JMdict] Batch 2 - Senses');
b = db.batch();
for (final e in entries) {
for (final s in e.senses) {
b.insert(
TableNames.sense,
s.sqlValue
..addAll({'id': s.id, 'entryId': e.id, 'orderNum': s.orderNum}));
b.insert(TableNames.sense, s.sqlValue..addAll({'entryId': e.id}));
for (final d in s.dialects) {
b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d});
}
@@ -392,21 +393,17 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
);
}
for (final g in s.glossary) {
if (g.language == 'eng')
b.insert(
TableNames.senseGlossary,
g.sqlValue..addAll({'senseId': s.id}),
// There are some duplicate glossary, especially in
// the other languages.
conflictAlgorithm: ConflictAlgorithm.ignore,
);
b.insert(
TableNames.senseGlossary,
g.sqlValue..addAll({'senseId': s.id}),
);
}
}
}
await b.commit();
print(' [JMdict] Building trees');
print(' [JMdict] Building xref trees');
SplayTreeMap<String, Set<Entry>> entriesByKanji = SplayTreeMap();
for (final entry in entries) {
for (final kanji in entry.kanji) {
@@ -428,7 +425,7 @@ Future<void> insertIntoDB(List<Entry> entries, Database db) async {
}
}
print(' [JMdict] Batch 3');
print(' [JMdict] Batch 3 - Xrefs');
b = db.batch();
for (final e in entries) {

View File

@@ -1,5 +1,11 @@
/// Interface for objects which are meant to be written to a table in a SQL database.
abstract class SQLWritable {
/// Const constructor, so that subclasses are able to declare `const`
/// constructors of their own.
const SQLWritable();
/// Returns a map of the object's properties and their values.
///
/// Note that there might be properties in the object which are meant to be
/// inserted into a different table. Such properties should be excluded
/// from this map.
///
/// NOTE(review): presumably the keys are the column names of the target
/// table, since the map is handed to a batch insert as-is; confirm against
/// the table schema.
Map<String, Object?> get sqlValue;
}