From 7bacfc39a898c70097136afd5701e974bda37bfd Mon Sep 17 00:00:00 2001
From: h7x4
Date: Mon, 6 Apr 2026 12:55:40 +0900
Subject: [PATCH] WIP

---
Reviewer notes (free text in this area, before the first file diff, is not
part of the commit message or of the change itself):

 * seed_data.dart: the ResolvedXref constructor now uses the initializing
   formal `this.senseOrderNum`. With a plain `senseOrderNum` parameter the
   new field was never assigned, so `xrefSenseOrderNum` was always inserted
   as NULL. The post-image blob id on that file's `index` line predates
   this one-token fix.
 * Line breaks were restored from the hunk line counts; indentation was
   reconstructed by convention, so applying may need
   `git apply --ignore-whitespace`.
 * NOTE(review): the default dev shell still sets LIBSQLITE_PATH from
   `pkgs.sqlite`, while the overlay no longer replaces `sqlite` itself.
   Confirm the library loaded at runtime provides icu_load_collation(),
   which openLocalDb now calls.
 * NOTE(review): `indexWhere(...) + 1` evaluates to 0 when the restriction
   text is not found; the new CHECK ("...OrderNum" > 0) then rejects the
   row at insert time. Confirm that failing there is the intended handling.

 flake.nix                                 | 18 ++++++---
 lib/_data_ingestion/jmdict/seed_data.dart | 19 ++++------
 lib/_data_ingestion/open_local_db.dart    |  1 +
 migrations/0001_JMDict.sql                | 45 ++++++++++-------------
 nix/database_tool.nix                     |  4 ++
 5 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/flake.nix b/flake.nix
index 591a6aa..5237c88 100644
--- a/flake.nix
+++ b/flake.nix
@@ -87,7 +87,7 @@
           gnumake
           lcov
           sqldiff
-          sqlite-interactive
+          sqlite-interactive-icu-ext
         ];
         env = {
           LIBSQLITE_PATH = "${pkgs.sqlite.out}/lib/libsqlite3.so";
@@ -98,7 +98,7 @@
 
       sqlite-debugging = pkgs.mkShell {
         packages = with pkgs; [
-          sqlite-sqlite-interactive
+          sqlite-interactive-icu-ext
           sqlite-analyzer
           sqlite-web
           sqlint
@@ -107,8 +107,8 @@
       };
     });
 
-    overlays.sqlite-icu-ext = final: prev: {
-      sqlite = prev.sqlite.overrideAttrs (prev': {
+    overlays.sqlite-icu-ext = final: prev: let
+      overrideArgs = prev': {
         configureFlags = prev'.configureFlags ++ [
           "--with-icu-config=${lib.getExe' prev.icu.dev "icu-config"}"
           "--enable-icu-collations"
@@ -117,7 +117,10 @@
         buildInputs = prev'.buildInputs ++ [
           prev.icu
         ];
-      });
+      };
+    in {
+      sqlite-icu-ext = prev.sqlite.overrideAttrs overrideArgs;
+      sqlite-interactive-icu-ext = prev.sqlite-interactive.overrideAttrs overrideArgs;
     };
 
     packages = let
@@ -153,7 +156,7 @@
         ln -s ${src} $out
       '';
 
-      inherit (pkgs) dart sqlite sqlite-interactive;
+      inherit (pkgs) sqlite-icu-ext sqlite-interactive-icu-ext;
 
       jmdict = pkgs.callPackage ./nix/jmdict.nix {
         inherit jmdict-src jmdict-with-examples-src edrdgMetadata;
@@ -169,17 +172,20 @@
 
       database-tool = pkgs.callPackage ./nix/database_tool.nix {
         inherit src;
+        sqlite = pkgs.sqlite-icu-ext;
       };
 
       database = pkgs.callPackage ./nix/database.nix {
         inherit (self.packages.${system}) database-tool jmdict radkfile kanjidic2;
         inherit src;
+        sqlite = pkgs.sqlite-icu-ext;
       };
 
       database-wal = pkgs.callPackage ./nix/database.nix {
         inherit (self.packages.${system}) database-tool jmdict radkfile kanjidic2;
         inherit src;
         wal = true;
+        sqlite = pkgs.sqlite-icu-ext;
       };
 
       docs = pkgs.callPackage ./nix/docs.nix {
diff --git a/lib/_data_ingestion/jmdict/seed_data.dart b/lib/_data_ingestion/jmdict/seed_data.dart
index 03b3208..0347500 100644
--- a/lib/_data_ingestion/jmdict/seed_data.dart
+++ b/lib/_data_ingestion/jmdict/seed_data.dart
@@ -10,8 +10,9 @@ import 'package:sqflite_common/sqlite_api.dart';
 class ResolvedXref {
   Entry entry;
   bool ambiguous;
+  int? senseOrderNum;
 
-  ResolvedXref(this.entry, this.ambiguous);
+  ResolvedXref(this.entry, this.ambiguous, this.senseOrderNum);
 }
 
 /// Resolves an xref (pair of kanji, optionally reading, and optionally sense number) to an a specific
@@ -74,9 +75,10 @@ ResolvedXref resolveXref(
       'kanjiRef: ${xref.kanjiRef}, readingRef: ${xref.readingRef}, '
       'senseOrderNum: ${xref.senseOrderNum}',
     );
-    return ResolvedXref(candidateEntries.first, true);
+
+    return ResolvedXref(candidateEntries.first, true, xref.senseOrderNum);
   } else {
-    return ResolvedXref(candidateEntries.first, false);
+    return ResolvedXref(candidateEntries.first, false, xref.senseOrderNum);
   }
 }
 
@@ -161,14 +163,14 @@ Future seedJMDictData(List entries, Database db) async {
         b.insert(JMdictTableNames.senseRestrictedToKanji, {
           'entryId': e.entryId,
           'senseId': s.senseId,
-          'kanji': rk,
+          'kanjiOrderNum': e.kanji.indexWhere((k) => k.reading == rk) + 1,
         });
       }
       for (final rr in s.restrictedToReading) {
         b.insert(JMdictTableNames.senseRestrictedToReading, {
           'entryId': e.entryId,
           'senseId': s.senseId,
-          'reading': rr,
+          'readingOrderNum': e.readings.indexWhere((r) => r.reading == rr) + 1,
         });
       }
       for (final ls in s.languageSource) {
@@ -229,9 +231,7 @@ Future seedJMDictData(List entries, Database db) async {
         b.insert(JMdictTableNames.senseSeeAlso, {
           'senseId': s.senseId,
           'xrefEntryId': resolvedEntry.entry.entryId,
-          'seeAlsoKanji': xref.kanjiRef,
-          'seeAlsoReading': xref.readingRef,
-          'seeAlsoSense': xref.senseOrderNum,
+          'xrefSenseOrderNum': resolvedEntry.senseOrderNum,
           'ambiguous': resolvedEntry.ambiguous,
         });
       }
@@ -256,9 +256,6 @@ Future seedJMDictData(List entries, Database db) async {
         b.insert(JMdictTableNames.senseAntonyms, {
           'senseId': s.senseId,
           'xrefEntryId': resolvedEntry.entry.entryId,
-          'antonymKanji': ant.kanjiRef,
-          'antonymReading': ant.readingRef,
-          'antonymSense': ant.senseOrderNum,
           'ambiguous': resolvedEntry.ambiguous,
         });
       }
diff --git a/lib/_data_ingestion/open_local_db.dart b/lib/_data_ingestion/open_local_db.dart
index ac56def..7b00b79 100644
--- a/lib/_data_ingestion/open_local_db.dart
+++ b/lib/_data_ingestion/open_local_db.dart
@@ -27,6 +27,7 @@ Future openLocalDb({
           await db.execute('PRAGMA journal_mode = WAL');
         }
         await db.execute('PRAGMA foreign_keys = ON');
+        await db.execute("SELECT icu_load_collation('ja_JP', 'japanese')");
       },
       readOnly: !readWrite,
     ),
diff --git a/migrations/0001_JMDict.sql b/migrations/0001_JMDict.sql
index 28d448e..1faf4fc 100644
--- a/migrations/0001_JMDict.sql
+++ b/migrations/0001_JMDict.sql
@@ -1,3 +1,5 @@
+SELECT icu_load_collation('ja_JP', 'japanese');
+
 CREATE TABLE "JMdict_Version" (
   "version" VARCHAR(10) PRIMARY KEY NOT NULL,
   "date" DATE NOT NULL,
@@ -55,13 +57,13 @@ CREATE TABLE "JMdict_KanjiElement" (
   "elementId" INTEGER PRIMARY KEY,
   "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
   "orderNum" INTEGER NOT NULL,
-  "reading" TEXT NOT NULL,
+  "reading" TEXT NOT NULL COLLATE japanese,
   "news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
   "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
   "spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
   "gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
   "nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
-  UNIQUE("entryId", "reading"),
+  -- UNIQUE("entryId", "reading"),
   UNIQUE("entryId", "orderNum")
 ) WITHOUT ROWID;
 
@@ -80,14 +82,14 @@ CREATE TABLE "JMdict_ReadingElement" (
   "elementId" INTEGER PRIMARY KEY,
   "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
   "orderNum" INTEGER NOT NULL,
-  "reading" TEXT NOT NULL,
+  "reading" TEXT NOT NULL COLLATE japanese,
   "readingDoesNotMatchKanji" BOOLEAN NOT NULL DEFAULT FALSE,
   "news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
   "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
   "spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
   "gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
   "nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
-  UNIQUE("entryId", "reading"),
+  -- UNIQUE("entryId", "reading"),
   UNIQUE("entryId", "orderNum")
 ) WITHOUT ROWID;
 
@@ -120,17 +122,17 @@ CREATE INDEX "JMdict_Sense_byEntryId_byOrderNum" ON "JMdict_Sense"("entryId", "o
 CREATE TABLE "JMdict_SenseRestrictedToKanji" (
   "entryId" INTEGER NOT NULL,
   "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
-  "kanji" TEXT NOT NULL,
-  FOREIGN KEY ("entryId", "kanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
-  PRIMARY KEY ("entryId", "senseId", "kanji")
+  "kanjiOrderNum" INTEGER NOT NULL CHECK ("kanjiOrderNum" > 0),
+  FOREIGN KEY ("entryId", "kanjiOrderNum") REFERENCES "JMdict_KanjiElement"("entryId", "orderNum"),
+  PRIMARY KEY ("entryId", "senseId", "kanjiOrderNum")
 ) WITHOUT ROWID;
 
 CREATE TABLE "JMdict_SenseRestrictedToReading" (
   "entryId" INTEGER NOT NULL,
   "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
-  "reading" TEXT NOT NULL,
-  FOREIGN KEY ("entryId", "reading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
-  PRIMARY KEY ("entryId", "senseId", "reading")
+  "readingOrderNum" INTEGER NOT NULL CHECK ("readingOrderNum" > 0),
+  FOREIGN KEY ("entryId", "readingOrderNum") REFERENCES "JMdict_ReadingElement"("entryId", "orderNum"),
+  PRIMARY KEY ("entryId", "senseId", "readingOrderNum")
 ) WITHOUT ROWID;
 
 -- In order to add xrefs, you will need to have added the entry to xref to.
@@ -145,32 +147,23 @@ CREATE TABLE "JMdict_SenseRestrictedToReading" (
 
 CREATE TABLE "JMdict_SenseSeeAlso" (
   "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
-  "xrefEntryId" INTEGER NOT NULL,
-  "seeAlsoReading" TEXT,
-  "seeAlsoKanji" TEXT,
-  "seeAlsoSense" INTEGER,
+  "xrefEntryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
+  -- Sometimes the cross reference is to a specific sense
+  "xrefSenseOrderNum" INTEGER,
   -- For some entries, the cross reference is ambiguous. This means that while the ingestion
   -- has determined some xrefEntryId, it is not guaranteed to be the correct one.
   "ambiguous" BOOLEAN NOT NULL DEFAULT FALSE,
-  FOREIGN KEY ("xrefEntryId", "seeAlsoKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
-  FOREIGN KEY ("xrefEntryId", "seeAlsoReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
-  FOREIGN KEY ("xrefEntryId", "seeAlsoSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
-  UNIQUE("senseId", "xrefEntryId", "seeAlsoReading", "seeAlsoKanji", "seeAlsoSense")
+  FOREIGN KEY ("xrefEntryId", "xrefSenseOrderNum") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
+  UNIQUE("senseId", "xrefEntryId", "xrefSenseOrderNum")
 );
 
 CREATE TABLE "JMdict_SenseAntonym" (
   "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
-  "xrefEntryId" INTEGER NOT NULL,
-  "antonymReading" TEXT,
-  "antonymKanji" TEXT,
-  "antonymSense" INTEGER,
+  "xrefEntryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
   -- For some entries, the cross reference is ambiguous. This means that while the ingestion
   -- has determined some xrefEntryId, it is not guaranteed to be the correct one.
   "ambiguous" BOOLEAN NOT NULL DEFAULT FALSE,
-  FOREIGN KEY ("xrefEntryId", "antonymKanji") REFERENCES "JMdict_KanjiElement"("entryId", "reading"),
-  FOREIGN KEY ("xrefEntryId", "antonymReading") REFERENCES "JMdict_ReadingElement"("entryId", "reading"),
-  FOREIGN KEY ("xrefEntryId", "antonymSense") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
-  UNIQUE("senseId", "xrefEntryId", "antonymReading", "antonymKanji", "antonymSense")
+  UNIQUE("senseId", "xrefEntryId")
 );
 
 -- These cross references are going to be mostly accessed from a sense
diff --git a/nix/database_tool.nix b/nix/database_tool.nix
index 3fffb74..11c0c03 100644
--- a/nix/database_tool.nix
+++ b/nix/database_tool.nix
@@ -1,6 +1,7 @@
 {
   src,
   buildDartApplication,
+  sqlite,
 }:
 buildDartApplication {
   pname = "jadb-database-tool";
@@ -9,6 +10,9 @@ buildDartApplication {
 
   dartEntryPoints."bin/jadb" = "bin/jadb.dart";
 
+  # NOTE: here we are overriding the implicitly added runtimeDependency from the package fixup in pub2nix.
+  runtimeDependencies = [ sqlite ];
+
   # NOTE: the default dart hooks are using `dart compile`, which is not able to call the
   #       new dart build hooks required to use package:sqlite3 >= 3.0.0. So we override
   #       these phases to use `dart build` instead.