1 Commit

Author SHA1 Message Date
781e650f0b WIP: use ids for {kanji,reading}Element tables 2025-06-24 01:01:07 +02:00
15 changed files with 135 additions and 198 deletions

12
flake.lock generated
View File

@@ -3,7 +3,7 @@
"jmdict-src": {
"flake": false,
"locked": {
"narHash": "sha256-sLl+OrVBgc4QCOZ2cvWGLZBerHDLuApyQOQyDyLUHtk=",
"narHash": "sha256-84P7r/fFlBnawy6yChrD9WMHmOWcEGWUmoK70N4rdGQ=",
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz"
},
@@ -15,7 +15,7 @@
"jmdict-with-examples-src": {
"flake": false,
"locked": {
"narHash": "sha256-FQvkYXwgmCJ+ChVkoFzamlG8kyczHAgsJ3zJ6OvRLZc=",
"narHash": "sha256-PM0sv7VcsCya2Ek02CI7hVwB3Jawn6bICSI+dsJK0yo=",
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e_examp.gz"
},
@@ -27,7 +27,7 @@
"kanjidic2-src": {
"flake": false,
"locked": {
"narHash": "sha256-vyMpRnN9O3vCpvfVDACKdTlapBVx6yXg0X2tgXF2t+U=",
"narHash": "sha256-Lc0wUPpuDKuMDv2t87//w3z20RX8SMJI2iIRtUJ8fn0=",
"type": "file",
"url": "https://www.edrdg.org/kanjidic/kanjidic2.xml.gz"
},
@@ -38,11 +38,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1751792365,
"narHash": "sha256-J1kI6oAj25IG4EdVlg2hQz8NZTBNYvIS0l4wpr9KcUo=",
"lastModified": 1746904237,
"narHash": "sha256-3e+AVBczosP5dCLQmMoMEogM57gmZ2qrVSrmq9aResQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "1fd8bada0b6117e6c7eb54aad5813023eed37ccb",
"rev": "d89fc19e405cb2d55ce7cc114356846a0ee5e956",
"type": "github"
},
"original": {

View File

@@ -104,24 +104,10 @@
platforms = lib.platforms.all;
};
src = builtins.filterSource (path: type: let
baseName = baseNameOf (toString path);
in !(lib.any (b: b) [
(!(lib.cleanSourceFilter path type))
(baseName == ".github" && type == "directory")
(baseName == "nix" && type == "directory")
(baseName == ".envrc" && type == "regular")
(baseName == "flake.lock" && type == "regular")
(baseName == "flake.nix" && type == "regular")
])) ./.;
src = lib.cleanSource ./.;
in forAllSystems (system: pkgs: {
default = self.packages.${system}.database;
filteredSource = pkgs.runCommandLocal "filtered-source" { } ''
ln -s ${src} $out
'';
jmdict = pkgs.callPackage ./nix/jmdict.nix {
inherit jmdict-src jmdict-with-examples-src edrdgMetadata;
};

View File

@@ -72,25 +72,20 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
print(' [JMdict] Batch 1 - Kanji and readings');
Batch b = db.batch();
int elementId = 0;
for (final e in entries) {
b.insert(JMdictTableNames.entry, e.sqlValue);
for (final k in e.kanji) {
elementId++;
b.insert(
JMdictTableNames.kanjiElement,
k.sqlValue..addAll({
'entryId': e.entryId,
'elementId': elementId,
}),
k.sqlValue..addAll({'entryId': e.entryId}),
);
for (final i in k.info) {
b.insert(
JMdictTableNames.kanjiInfo,
{
'elementId': elementId,
'entryId': e.entryId,
'reading': k.reading,
'info': i,
},
);
@@ -98,20 +93,17 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
}
for (final r in e.readings) {
elementId++;
b.insert(
JMdictTableNames.readingElement,
r.sqlValue..addAll({
'entryId': e.entryId,
'elementId': elementId,
}),
r.sqlValue..addAll({'entryId': e.entryId}),
);
for (final i in r.info) {
b.insert(
JMdictTableNames.readingInfo,
{
'elementId': elementId,
'entryId': e.entryId,
'reading': r.reading,
'info': i,
},
);
@@ -120,7 +112,8 @@ Future<void> seedJMDictData(List<Entry> entries, Database db) async {
b.insert(
JMdictTableNames.readingRestriction,
{
'elementId': elementId,
'entryId': e.entryId,
'reading': r.reading,
'restriction': res,
},
);

View File

@@ -1,5 +1,4 @@
import 'package:jadb/_data_ingestion/kanjidic/objects.dart';
import 'package:jadb/util/romaji_transliteration.dart';
import 'package:xml/xml.dart';
List<Character> parseKANJIDICData(XmlElement root) {
@@ -135,7 +134,7 @@ List<Character> parseKANJIDICData(XmlElement root) {
.map(
(e) => Onyomi(
kanji: kanji,
yomi: transliterateKatakanaToHiragana(e.innerText),
yomi: e.innerText,
isJouyou: e.getAttribute('r_status') == 'jy',
type: e.getAttribute('on_type')),
)

View File

@@ -35,19 +35,12 @@ class CreateDb extends Command {
readWrite: true,
);
bool failed = false;
await seedData(db).then((_) {
print("Database created successfully");
}).catchError((error) {
print("Error creating database: $error");
failed = true;
}).whenComplete(() {
db.close();
});
if (failed) {
exit(1);
} else {
exit(0);
}
}
}

View File

@@ -10,11 +10,16 @@ import 'package:args/command_runner.dart';
class QueryKanji extends Command {
final name = "query-kanji";
final description = "Query the database for kanji data";
final invocation = "jadb query-kanji [options] <kanji>";
QueryKanji() {
addLibsqliteArg(argParser);
addJadbArg(argParser);
argParser.addOption(
'kanji',
abbr: 'k',
help: 'The kanji to search for.',
valueHelp: 'KANJI',
);
}
Future<void> run() async {
@@ -29,17 +34,10 @@ class QueryKanji extends Command {
libsqlitePath: argResults!.option('libsqlite')!,
);
if (argResults!.rest.length != 1) {
print('You need to provide exactly one kanji character to search for.');
print('');
printUsage();
exit(64);
}
final String kanji = argResults!.rest.first.trim();
final time = Stopwatch()..start();
final result = await JaDBConnection(db).jadbSearchKanji(kanji);
final result = await JaDBConnection(db).jadbSearchKanji(
argResults!.option('kanji') ?? '',
);
time.stop();
if (result == null) {

View File

@@ -5,24 +5,26 @@ import 'package:jadb/cli/args.dart';
import 'package:jadb/search.dart';
import 'package:args/command_runner.dart';
import 'package:sqflite_common/sqflite.dart';
class QueryWord extends Command {
final name = "query-word";
final description = "Query the database for word data";
final invocation = "jadb query-word [options] (<word> | <ID>)";
QueryWord() {
addLibsqliteArg(argParser);
addJadbArg(argParser);
argParser.addOption(
'word',
abbr: 'w',
help: 'The word to search for.',
valueHelp: 'WORD',
);
}
Future<void> run() async {
if (argResults!.option('libsqlite') == null ||
argResults!.option('jadb') == null) {
print("You need to provide both libsqlite and jadb paths.");
print('');
printUsage();
print(argParser.usage);
exit(64);
}
@@ -31,38 +33,8 @@ class QueryWord extends Command {
libsqlitePath: argResults!.option('libsqlite')!,
);
if (argResults!.rest.isEmpty) {
print('You need to provide a word or ID to search for.');
print('');
printUsage();
exit(64);
}
final String searchWord = argResults!.option('word') ?? 'かな';
final String searchWord = argResults!.rest.join(" ");
final int? maybeId = int.tryParse(searchWord);
if (maybeId != null && maybeId >= 1000000) {
await _searchId(db, maybeId);
} else {
await _searchWord(db, searchWord);
}
}
Future<void> _searchId(DatabaseExecutor db, int id) async {
final time = Stopwatch()..start();
final result = await JaDBConnection(db).jadbGetWordById(id);
time.stop();
if (result == null) {
print("Invalid ID");
} else {
print(result.toString());
}
print("Query took ${time.elapsedMilliseconds}ms");
}
Future<void> _searchWord(DatabaseExecutor db, String searchWord) async {
final time = Stopwatch()..start();
final count = await JaDBConnection(db).jadbSearchWordCount(searchWord);
time.stop();

View File

@@ -3,7 +3,6 @@ import 'package:jadb/table_names/kanjidic.dart';
import 'package:jadb/table_names/radkfile.dart';
import 'package:jadb/models/kanji_search/kanji_search_radical.dart';
import 'package:jadb/models/kanji_search/kanji_search_result.dart';
import 'package:jadb/util/romaji_transliteration.dart';
import 'package:sqflite_common/sqflite.dart';
Future<KanjiSearchResult?> searchKanjiWithDbConnection(
@@ -202,10 +201,7 @@ Future<KanjiSearchResult?> searchKanjiWithDbConnection(
meanings: meanings.map((item) => item['meaning'] as String).toList(),
kunyomi: kunyomis.map((item) => item['yomi'] as String).toList(),
parts: parts.map((item) => item['radical'] as String).toList(),
onyomi: onyomis
.map((item) => item['yomi'] as String)
.map(transliterateHiraganaToKatakana)
.toList(),
onyomi: onyomis.map((item) => item['yomi'] as String).toList(),
radical: radical,
codepoints: {
for (final codepoint in codepoints)

View File

@@ -230,37 +230,40 @@ Future<LinearWordQueryData> fetchLinearWordQueryData(
// Reading queries
final readingIds = readingElements
.map((element) => element['elementId'] as int)
.map((element) => (
element['entryId'] as int,
escapeStringValue(element['reading'] as String)
))
.toList();
late final List<Map<String, Object?>> readingElementInfos;
final Future<List<Map<String, Object?>>> readingElementInfos_query =
connection.query(
JMdictTableNames.readingInfo,
where: '(elementId) IN (${List.filled(readingIds.length, '?').join(',')})',
whereArgs: readingIds,
where: '(entryId, reading) IN (${readingIds.join(',')})',
);
late final List<Map<String, Object?>> readingElementRestrictions;
final Future<List<Map<String, Object?>>> readingElementRestrictions_query =
connection.query(
JMdictTableNames.readingRestriction,
where: '(elementId) IN (${List.filled(readingIds.length, '?').join(',')})',
whereArgs: readingIds,
where: '(entryId, reading) IN (${readingIds.join(',')})',
);
// Kanji queries
final kanjiIds = kanjiElements
.map((element) => element['elementId'] as int)
.map((element) => (
element['entryId'] as int,
escapeStringValue(element['reading'] as String)
))
.toList();
late final List<Map<String, Object?>> kanjiElementInfos;
final Future<List<Map<String, Object?>>> kanjiElementInfos_query =
connection.query(
JMdictTableNames.kanjiInfo,
where: '(elementId) IN (${List.filled(kanjiIds.length, '?').join(',')})',
whereArgs: kanjiIds,
where: '(entryId, reading) IN (${kanjiIds.join(',')})',
);
await Future.wait([

View File

@@ -51,16 +51,18 @@ String _filterFTSSensitiveCharacters(String word) {
WITH
fts_results AS (
SELECT DISTINCT
"${tableName}"."entryId",
"${tableName}FTS"."entryId",
100
+ (("${tableName}FTS"."reading" = ?) * 10000)
+ (("${tableName}FTS"."reading" = ?) * 50)
+ "JMdict_EntryScore"."score"
AS "score"
FROM "${tableName}FTS"
JOIN "${tableName}" USING ("elementId")
JOIN "JMdict_EntryScore" USING ("elementId")
JOIN "${tableName}" USING ("entryId", "reading")
JOIN "JMdict_EntryScore" USING ("entryId", "reading")
WHERE "${tableName}FTS"."reading" MATCH ? || '*'
AND "JMdict_EntryScore"."type" = '${tableName == JMdictTableNames.kanjiElement ? 'k' : 'r'}'
AND "JMdict_EntryScore"."type" = '${tableName == JMdictTableNames.kanjiElement ? 'kanji' : 'reading'}'
ORDER BY
"JMdict_EntryScore"."score" DESC
${!countOnly ? 'LIMIT ?' : ''}
),
non_fts_results AS (
@@ -70,23 +72,22 @@ String _filterFTSSensitiveCharacters(String word) {
+ "JMdict_EntryScore"."score"
AS "score"
FROM "${tableName}"
JOIN "JMdict_EntryScore" USING ("elementId")
JOIN "JMdict_EntryScore" USING ("entryId", "reading")
WHERE "reading" LIKE '%' || ? || '%'
AND "${tableName}"."entryId" NOT IN (SELECT "entryId" FROM "fts_results")
AND "JMdict_EntryScore"."type" = '${tableName == JMdictTableNames.kanjiElement ? 'k' : 'r'}'
AND "entryId" NOT IN (SELECT "entryId" FROM "fts_results")
AND "JMdict_EntryScore"."type" = '${tableName == JMdictTableNames.kanjiElement ? 'kanji' : 'reading'}'
ORDER BY
"JMdict_EntryScore"."score" DESC,
"${tableName}"."entryId" ASC
${!countOnly ? 'LIMIT ?' : ''}
)
${countOnly ? 'SELECT COUNT("entryId") AS count' : 'SELECT "entryId", MAX("score") AS "score"'}
${countOnly ? 'SELECT COUNT("entryId") AS count' : 'SELECT "entryId", "score"'}
FROM (
SELECT * FROM fts_results
UNION
UNION ALL
SELECT * FROM non_fts_results
)
GROUP BY "entryId"
ORDER BY
"score" DESC,
"entryId" ASC
'''
.trim(),
[

View File

@@ -182,35 +182,16 @@ GroupedWordResult _regroup_words({
'No readings found for entryId: $entryId',
);
final Map<int, String> readingElementIdsToReading = {
for (final element in readingElements_)
element['elementId'] as int: element['reading'] as String,
};
final Map<int, String> kanjiElementIdsToReading = {
for (final element in kanjiElements_)
element['elementId'] as int: element['reading'] as String,
};
final readingElementInfos_ = readingElementInfos
.where((element) => element['entryId'] == entryId)
.toList();
final kanjiElementInfos_ = kanjiElementInfos
.where((element) => element['entryId'] == entryId)
.toList();
return GroupedWordResult(
rubys: rubys,
readingInfos: {
for (final rei in readingElementInfos_)
readingElementIdsToReading[rei['elementId'] as int]!:
for (final rei in readingElementInfos)
rei['reading'] as String:
JMdictReadingInfo.fromId(rei['info'] as String),
},
kanjiInfos: {
for (final kei in kanjiElementInfos_)
kanjiElementIdsToReading[kei['elementId'] as int]!:
JMdictKanjiInfo.fromId(kei['info'] as String),
for (final kei in kanjiElementInfos)
kei['reading'] as String: JMdictKanjiInfo.fromId(kei['info'] as String),
},
);
}

View File

@@ -43,7 +43,6 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
);
if (entryIds.isEmpty) {
// TODO: try conjugation search
return [];
}

View File

@@ -1,6 +1,5 @@
CREATE TABLE "JMdict_EntryScore" (
"type" CHAR(1) NOT NULL CHECK ("type" IN ('r', 'k')),
"entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
"type" TEXT NOT NULL CHECK ("type" IN ('reading', 'kanji')),
"elementId" INTEGER NOT NULL,
"score" INTEGER NOT NULL DEFAULT 0,
"common" BOOLEAN NOT NULL DEFAULT FALSE,
@@ -20,8 +19,7 @@ CREATE INDEX "JMdict_EntryScore_byType_byCommon" ON "JMdict_EntryScore"("type",
CREATE VIEW "JMdict_EntryScoreView_Reading" AS
SELECT
'r' AS "type",
"JMdict_ReadingElement"."entryId",
'reading' AS "type",
"JMdict_ReadingElement"."elementId",
(
"news" IS 1
@@ -52,8 +50,7 @@ LEFT JOIN "JMdict_JLPTTag" USING ("entryId");
CREATE VIEW "JMdict_EntryScoreView_Kanji" AS
SELECT
'k' AS "type",
"JMdict_KanjiElement"."entryId",
'kanji' AS "type",
"JMdict_KanjiElement"."elementId",
(
"news" IS 1
@@ -97,12 +94,11 @@ AFTER INSERT ON "JMdict_ReadingElement"
BEGIN
INSERT INTO "JMdict_EntryScore" (
"type",
"entryId",
"elementId",
"score",
"common"
)
SELECT "type", "entryId", "elementId", "score", "common"
SELECT "type", "elementId", "score", "common"
FROM "JMdict_EntryScoreView_Reading"
WHERE "elementId" = NEW."elementId";
END;
@@ -123,7 +119,7 @@ CREATE TRIGGER "JMdict_EntryScore_Delete_JMdict_ReadingElement"
AFTER DELETE ON "JMdict_ReadingElement"
BEGIN
DELETE FROM "JMdict_EntryScore"
WHERE "type" = 'r'
WHERE "type" = 'reading'
AND "elementId" = OLD."elementId";
END;
@@ -134,12 +130,11 @@ AFTER INSERT ON "JMdict_KanjiElement"
BEGIN
INSERT INTO "JMdict_EntryScore" (
"type",
"entryId",
"elementId",
"score",
"common"
)
SELECT "type", "entryId", "elementId", "score", "common"
SELECT "type", "elementId", "score", "common"
FROM "JMdict_EntryScoreView_Kanji"
WHERE "elementId" = NEW."elementId";
END;
@@ -160,7 +155,7 @@ CREATE TRIGGER "JMdict_EntryScore_Delete_JMdict_KanjiElement"
AFTER DELETE ON "JMdict_KanjiElement"
BEGIN
DELETE FROM "JMdict_EntryScore"
WHERE "type" = 'k'
WHERE "type" = 'kanji'
AND "elementId" = OLD."elementId";
END;
@@ -174,9 +169,26 @@ BEGIN
"score" = "JMdict_EntryScoreView"."score",
"common" = "JMdict_EntryScoreView"."common"
FROM "JMdict_EntryScoreView"
WHERE "JMdict_EntryScoreView"."entryId" = NEW."entryId"
AND "JMdict_EntryScore"."entryId" = NEW."entryId"
AND "JMdict_EntryScoreView"."elementId" = "JMdict_EntryScore"."elementId";
WHERE
(
(
"JMdict_EntryScoreView"."type" = 'kanji'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_KanjiElement" WHERE "entryId" = NEW."entryId"
)
)
OR
(
"JMdict_EntryScoreView"."type" = 'reading'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_ReadingElement" WHERE "entryId" = NEW."entryId"
)
)
)
AND "JMdict_EntryScoreView"."entryId" = "JMdict_EntryScore"."entryId"
AND "JMdict_EntryScoreView"."reading" = "JMdict_EntryScore"."reading";
END;
CREATE TRIGGER "JMdict_EntryScore_Update_JMdict_JLPTTag"
@@ -188,9 +200,26 @@ BEGIN
"score" = "JMdict_EntryScoreView"."score",
"common" = "JMdict_EntryScoreView"."common"
FROM "JMdict_EntryScoreView"
WHERE "JMdict_EntryScoreView"."entryId" = NEW."entryId"
AND "JMdict_EntryScore"."entryId" = NEW."entryId"
AND "JMdict_EntryScoreView"."elementId" = "JMdict_EntryScore"."elementId";
WHERE
(
(
"JMdict_EntryScoreView"."type" = 'kanji'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_KanjiElement" WHERE "entryId" = NEW."entryId"
)
)
OR
(
"JMdict_EntryScoreView"."type" = 'reading'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_ReadingElement" WHERE "entryId" = NEW."entryId"
)
)
)
AND "JMdict_EntryScoreView"."entryId" = "JMdict_EntryScore"."entryId"
AND "JMdict_EntryScoreView"."reading" = "JMdict_EntryScore"."reading";
END;
CREATE TRIGGER "JMdict_EntryScore_Delete_JMdict_JLPTTag"
@@ -201,7 +230,24 @@ BEGIN
"score" = "JMdict_EntryScoreView"."score",
"common" = "JMdict_EntryScoreView"."common"
FROM "JMdict_EntryScoreView"
WHERE "JMdict_EntryScoreView"."entryId" = OLD."entryId"
AND "JMdict_EntryScore"."entryId" = OLD."entryId"
AND "JMdict_EntryScoreView"."elementId" = "JMdict_EntryScore"."elementId";
WHERE
(
(
"JMdict_EntryScoreView"."type" = 'kanji'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_KanjiElement" WHERE "entryId" = OLD."entryId"
)
)
OR
(
"JMdict_EntryScoreView"."type" = 'reading'
AND
"JMdict_EntryScoreView"."elementId" IN (
SELECT "elementId" FROM "JMdict_ReadingElement" WHERE "entryId" = OLD."entryId"
)
)
)
AND "JMdict_EntryScoreView"."entryId" = "JMdict_EntryScore"."entryId"
AND "JMdict_EntryScoreView"."reading" = "JMdict_EntryScore"."reading";
END;

View File

@@ -65,7 +65,7 @@ JOIN "JMdict_KanjiElement"
ON "JMdict_KanjiElementFTS"."entryId" = "JMdict_KanjiElement"."entryId"
AND "JMdict_KanjiElementFTS"."reading" LIKE '%' || "JMdict_KanjiElement"."reading"
JOIN "JMdict_EntryScore"
ON "JMdict_EntryScore"."type" = 'k'
ON "JMdict_EntryScore"."type" = 'kanji'
AND "JMdict_KanjiElement"."entryId" = "JMdict_EntryScore"."entryId"
AND "JMdict_KanjiElement"."reading" = "JMdict_EntryScore"."reading"
WHERE "JMdict_EntryScore"."common" = 1;
@@ -78,9 +78,9 @@ CREATE VIEW "JMdict_CombinedEntryScore"
AS
SELECT
CASE
WHEN "JMdict_EntryScore"."type" = 'k'
WHEN "JMdict_EntryScore"."type" = 'kanji'
THEN (SELECT entryId FROM "JMdict_KanjiElement" WHERE "elementId" = "JMdict_EntryScore"."elementId")
WHEN "JMdict_EntryScore"."type" = 'r'
WHEN "JMdict_EntryScore"."type" = 'reading'
THEN (SELECT entryId FROM "JMdict_ReadingElement" WHERE "elementId" = "JMdict_EntryScore"."elementId")
END AS "entryId",
MAX("JMdict_EntryScore"."score") AS "score",

View File

@@ -4,42 +4,12 @@ import 'package:test/test.dart';
import 'setup_database_connection.dart';
void main() {
test("Search a word - english - auto", () async {
test("Search a word", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWord("kana");
expect(result, isNotNull);
});
test("Get word search count - english - auto", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWordCount("kana");
expect(result, isNotNull);
});
test("Search a word - japanese kana - auto", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWord("かな");
expect(result, isNotNull);
});
test("Get word search count - japanese kana - auto", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWordCount("かな");
expect(result, isNotNull);
});
test("Search a word - japanese kanji - auto", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWord("仮名");
expect(result, isNotNull);
});
test("Get word search count - japanese kanji - auto", () async {
final connection = await setup_database_connection();
final result = await connection.jadbSearchWordCount("仮名");
expect(result, isNotNull);
});
test("Get a word by id", () async {
final connection = await setup_database_connection();
final result = await connection.jadbGetWordById(1577090);