9 Commits

Author SHA1 Message Date
oysteikt 41d990a447 WIP
Build and test / build (push) Failing after 17s
2026-05-06 02:00:10 +09:00
oysteikt c43285c78a .gitea/workflows/build-and-test: don't autoupdate datasources
Build and test / build (push) Successful in 5m6s
2026-05-06 00:19:59 +09:00
oysteikt c22e96b7f9 flake.lock: bump nixpkgs
Build and test / build (push) Failing after 3m54s
2026-05-06 00:13:32 +09:00
oysteikt d3516495ab data_ingestion/jmdict: throw proper errors on invalid xrefs 2026-05-05 23:48:05 +09:00
oysteikt be4dd72dcb migrations/JMdict: no default value for EntryScore.score
Build and test / build (push) Failing after 4m4s
2026-05-05 23:23:01 +09:00
oysteikt 32ec34a150 migrations/JMdict: only index common entries in EntryScore_byCommon 2026-05-05 23:22:37 +09:00
oysteikt bd0822b740 .gitea/workflows/build-and-test: remove duplicate nix config
Build and test / build (push) Successful in 5m32s
2026-04-29 08:39:52 +09:00
oysteikt 8827893101 search: prefer Iterable over List in some public APIs
Build and test / build (push) Successful in 7m51s
2026-04-14 17:55:40 +09:00
oysteikt 28c4403e2d test/romaji_transliteration: add tests for iteration marks and yori ligature
Build and test / build (push) Successful in 6m56s
2026-04-13 22:06:03 +09:00
11 changed files with 143 additions and 27 deletions
+2 -3
View File
@@ -16,15 +16,14 @@ jobs:
uses: https://github.com/cachix/install-nix-action@v31
with:
extra_nix_config: |
experimental-features = nix-command flakes
show-trace = true
max-jobs = auto
trusted-users = root
experimental-features = nix-command flakes
build-users-group =
- name: Update database inputs
run: nix flake update datasources
# - name: Update database inputs
# run: nix flake update datasources
- name: Build database
run: nix build .#database -L
Generated
+3 -3
View File
@@ -22,11 +22,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1775423009,
"narHash": "sha256-vPKLpjhIVWdDrfiUM8atW6YkIggCEKdSAlJPzzhkQlw=",
"lastModified": 1777954456,
"narHash": "sha256-hGdgeU2Nk87RAuZyYjyDjFL6LK7dAZN5RE9+hrDTkDU=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "68d8aa3d661f0e6bd5862291b5bb263b2a6595c9",
"rev": "549bd84d6279f9852cae6225e372cc67fb91a4c1",
"type": "github"
},
"original": {
+2
View File
@@ -116,6 +116,8 @@
ln -s ${src} $out
'';
fts5-icu-tokenizer = pkgs.callPackage ./nix/fts5-icu-tokenizer/package.nix { };
inherit (datasources.packages.${system}) jmdict radkfile kanjidic2;
database-tool = pkgs.callPackage ./nix/database_tool.nix {
+38 -9
View File
@@ -27,15 +27,44 @@ ResolvedXref resolveXref(
SplayTreeMap<String, Set<Entry>> entriesByReading,
XRefParts xref,
) {
List<Entry> candidateEntries = switch ((xref.kanjiRef, xref.readingRef)) {
(null, null) => throw Exception(
'Xref $xref has no kanji or reading reference',
),
(final String k, null) => entriesByKanji[k]!.toList(),
(null, final String r) => entriesByReading[r]!.toList(),
(final String k, final String r) =>
entriesByKanji[k]!.intersection(entriesByReading[r]!).toList(),
};
late List<Entry> candidateEntries;
switch ((xref.kanjiRef, xref.readingRef)) {
case (null, null):
throw Exception('Xref $xref has no kanji or reading reference');
case (final String k, null):
if (!entriesByKanji.containsKey(k)) {
throw Exception(
'Xref $xref has kanji reference "$k" but no entries found with that kanji',
);
}
candidateEntries = entriesByKanji[k]!.toList();
break;
case (null, final String r):
if (!entriesByReading.containsKey(r)) {
throw Exception(
'Xref $xref has reading reference "$r" but no entries found with that reading',
);
}
candidateEntries = entriesByReading[r]!.toList();
break;
case (final String k, final String r):
if (!entriesByKanji.containsKey(k)) {
throw Exception(
'Xref $xref has kanji reference "$k" but no entries found with that kanji',
);
}
if (!entriesByReading.containsKey(r)) {
throw Exception(
'Xref $xref has reading reference "$r" but no entries found with that reading',
);
}
candidateEntries = entriesByKanji[k]!
.intersection(entriesByReading[r]!)
.toList();
}
// Filter out entries that don't have the number of senses specified in the xref
if (xref.senseOrderNum != null) {
+2 -2
View File
@@ -18,12 +18,12 @@ extension JaDBConnection on DatabaseExecutor {
searchKanjiWithDbConnection(this, kanji);
/// Search for multiple kanji in the database at once, returning a map from each kanji to its search result.
Future<Map<String, KanjiSearchResult>> jadbGetManyKanji(Set<String> kanji) =>
Future<Map<String, KanjiSearchResult>> jadbGetManyKanji(Iterable<String> kanji) =>
searchManyKanjiWithDbConnection(this, kanji);
/// Filter a list of characters, and return the ones that are listed in the kanji dictionary.
Future<List<String>> filterKanji(
List<String> kanji, {
Iterable<String> kanji, {
bool deduplicate = false,
}) => filterKanjiWithDbConnection(this, kanji, deduplicate);
+2 -2
View File
@@ -6,7 +6,7 @@ import 'package:sqflite_common/sqflite.dart';
/// If [deduplicate] is true, the input kanji list is deduplicated before the filtered results are returned.
Future<List<String>> filterKanjiWithDbConnection(
DatabaseExecutor connection,
List<String> kanji,
Iterable<String> kanji,
bool deduplicate,
) async {
final Set<String> filteredKanji = await connection
@@ -14,7 +14,7 @@ Future<List<String>> filterKanjiWithDbConnection(
SELECT "literal"
FROM "${KANJIDICTableNames.character}"
WHERE "literal" IN (${kanji.map((_) => '?').join(',')})
''', kanji)
''', kanji.toList())
.then((value) => value.map((e) => e['literal'] as String).toSet());
if (deduplicate) {
+1 -1
View File
@@ -274,7 +274,7 @@ Future<KanjiSearchResult?> searchKanjiWithDbConnection(
/// Searches for multiple kanji at once, returning a map of kanji to their search results.
Future<Map<String, KanjiSearchResult>> searchManyKanjiWithDbConnection(
DatabaseExecutor connection,
Set<String> kanji,
Iterable<String> kanji,
) async {
if (kanji.isEmpty) {
return {};
@@ -1,6 +1,6 @@
CREATE TABLE "JMdict_EntryScore" (
"elementId" INTEGER PRIMARY KEY,
"score" INTEGER NOT NULL DEFAULT 0,
"score" INTEGER NOT NULL,
"common" BOOLEAN NOT NULL DEFAULT FALSE,
"entryId" INTEGER NOT NULL GENERATED ALWAYS AS (("elementId" / 100) % 10000000) STORED,
@@ -15,8 +15,7 @@ CREATE TABLE "JMdict_EntryScore" (
CREATE INDEX "JMdict_EntryScore_byElementId_byScore" ON "JMdict_EntryScore"("elementId", "score");
CREATE INDEX "JMdict_EntryScore_byScore" ON "JMdict_EntryScore"("score");
CREATE INDEX "JMdict_EntryScore_byElementId_byCommon" ON "JMdict_EntryScore"("elementId", "common");
CREATE INDEX "JMdict_EntryScore_byCommon" ON "JMdict_EntryScore"("common");
CREATE INDEX "JMdict_EntryScore_byCommon" ON "JMdict_EntryScore"("common") WHERE "common" = TRUE;
-- NOTE: these views are deduplicated in order not to perform an unnecessary
-- UNION on every trigger
@@ -0,0 +1,20 @@
diff --git i/CMakeLists.txt w/CMakeLists.txt
index 9d99543..11ce4a4 100644
--- i/CMakeLists.txt
+++ w/CMakeLists.txt
@@ -131,6 +131,15 @@ if(NOT SQLite3_FOUND)
endif()
endif()
+if(SQLite3_FOUND AND NOT TARGET SQLite3::SQLite3)
+ add_library(SQLite3::SQLite3 UNKNOWN IMPORTED)
+
+ set_target_properties(SQLite3::SQLite3 PROPERTIES
+ IMPORTED_LOCATION "${SQLite3_LIBRARIES}"
+ INTERFACE_INCLUDE_DIRECTORIES "${SQLite3_INCLUDE_DIRS}"
+ )
+endif()
+
# --- Configure the Library ---
# Select source files based on API version
+40
View File
@@ -0,0 +1,40 @@
# Package definition for fts5-icu-tokenizer, built with CMake against the
# `icu` and `sqlite` packages supplied by the caller (normally `callPackage`).
{
  stdenv,
  lib,
  fetchFromSourcehut,
  sqlite,
  icu,
  cmake,
  pkg-config,
}:

stdenv.mkDerivation (self: {
  pname = "fts5-icu-tokenizer";
  version = "5.5";

  # The source is a Mercurial (`hg`) repository on SourceHut; the fetched
  # revision is derived from `version`, so bumping the version also requires
  # updating `hash`.
  src = fetchFromSourcehut {
    owner = "~cwt";
    repo = "fts5-icu-tokenizer";
    vc = "hg";
    rev = "v${self.version}";
    hash = "sha256-7Klsu9d1sY+W0buo6kwYdCyDA/u2dBTgu6WuttomTBo=";
  };

  # Build-time tools.
  nativeBuildInputs = [
    cmake
    pkg-config
  ];

  # Libraries the tokenizer is built against.
  buildInputs = [
    icu
    sqlite
  ];

  # Handed to CMake as the `LOCALE` and `API_VERSION` cache entries.
  # NOTE(review): presumably selects the Japanese tokenizer rules and the
  # upstream "v2" API sources -- confirm against upstream's CMakeLists.txt.
  cmakeFlags = [
    (lib.cmakeFeature "LOCALE" "ja")
    (lib.cmakeFeature "API_VERSION" "v2")
  ];

  # NOTE(review): judging by its name, this patch lets the build use the
  # sqlite passed in via `buildInputs` -- confirm against the patch contents.
  patches = [
    ./0001-provide-sqlite-externally.patch
  ];
})
+31 -4
View File
@@ -8,7 +8,7 @@ void main() {
expect(result, 'かたまり');
});
test('Basic test with diacritics', () {
test('Basic test with dakuten', () {
final result = transliterateLatinToHiragana('gadamari');
expect(result, 'がだまり');
});
@@ -54,7 +54,7 @@ void main() {
test('Basic test', expectSpans('katamari', ['', '', '', '']));
test(
'Basic test with diacritics',
'Basic test with dakuten',
expectSpans('gadamari', ['', '', '', '']),
);
test('wi and we', expectSpans('wiwe', ['うぃ', 'うぇ']));
@@ -72,7 +72,7 @@ void main() {
expect(result, 'katamari');
});
test('Basic test with diacritics', () {
test('Basic test with dakuten', () {
final result = transliterateHiraganaToLatin('がだまり');
expect(result, 'gadamari');
});
@@ -91,6 +91,21 @@ void main() {
final result = transliterateHiraganaToLatin('かっぱ');
expect(result, 'kappa');
});
// Iteration marks repeat the preceding kana: 'ゝ' as-is, 'ゞ' with dakuten
// (voiced). Both cases are skipped until the transliterator supports them.
test('Iteration mark', () {
  final result = transliterateHiraganaToLatin('さゝき');
  expect(result, 'sasaki');
}, skip: 'Not yet implemented');
test('Iteration mark with dakuten', () {
  final result = transliterateHiraganaToLatin('あひゞき');
  expect(result, 'ahibiki');
}, skip: 'Not yet implemented');
// 'ゟ' (U+309F, HIRAGANA DIGRAPH YORI) is a single code point that reads
// "yori". The input must be that ligature; an empty string would make the
// test assert '' -> 'yori' once it is un-skipped.
test('Yori', () {
  final result = transliterateHiraganaToLatin('ゟ');
  expect(result, 'yori');
}, skip: 'Not yet implemented');
});
group('Hiragana -> Romaji Spans', () {
@@ -110,7 +125,7 @@ void main() {
test('Basic test', expectSpans('かたまり', ['ka', 'ta', 'ma', 'ri']));
test(
'Basic test with diacritics',
'Basic test with dakuten',
expectSpans('がだまり', ['ga', 'da', 'ma', 'ri']),
);
test('wi and we', expectSpans('うぃうぇ', ['whi', 'whe']));
@@ -118,5 +133,17 @@ void main() {
// TODO: fix the implementation
// test('Double consonant', expectSpans('かっぱ', ['ka', 'ppa']));
// Span variants of the iteration-mark cases: each iteration mark should
// yield its own romaji span, repeating the preceding kana ('ゞ' voiced).
test(
  'Iteration mark',
  expectSpans('さゝき', ['sa', 'sa', 'ki']),
  skip: 'Not yet implemented',
);
test(
  'Iteration mark with dakuten',
  expectSpans('あひゞき', ['a', 'hi', 'bi', 'ki']),
  skip: 'Not yet implemented',
);
// 'ゟ' (U+309F, HIRAGANA DIGRAPH YORI) is one code point and should map to
// the single span 'yori'. An empty input string could never produce it.
test('Yori', expectSpans('ゟ', ['yori']), skip: 'Not yet implemented');
});
}