diff --git a/flake.lock b/flake.lock
index 23f9256..4095788 100644
--- a/flake.lock
+++ b/flake.lock
@@ -35,6 +35,22 @@
         "url": "https://git.pvv.ntnu.no/Mugiten/datasources.git"
       }
     },
+    "kanjivg-src": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1778620714,
+        "narHash": "sha256-LwNcY5A6XPGI+DASZfmP7OeYe8IFesShhSrE7Go2ux8=",
+        "ref": "refs/heads/master",
+        "rev": "1957802840a6f059d1e27dcb5755722955cc7dbb",
+        "revCount": 2217,
+        "type": "git",
+        "url": "https://git.pvv.ntnu.no/mugiten/kanjivg.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.pvv.ntnu.no/mugiten/kanjivg.git"
+      }
+    },
     "nix-sqlite": {
       "inputs": {
         "nixpkgs": [
@@ -75,6 +91,7 @@
     "root": {
       "inputs": {
         "datasources": "datasources",
+        "kanjivg-src": "kanjivg-src",
         "nixpkgs": "nixpkgs",
         "tamerye": "tamerye"
       }
diff --git a/flake.nix b/flake.nix
index 09c2019..6dc69d9 100644
--- a/flake.nix
+++ b/flake.nix
@@ -13,6 +13,11 @@
       url = "git+https://git.pvv.ntnu.no/Mugiten/datasources.git";
       inputs.nixpkgs.follows = "nixpkgs";
     };
+
+    kanjivg-src = {
+      url = "git+https://git.pvv.ntnu.no/mugiten/kanjivg.git";
+      flake = false;
+    };
   };
 
   outputs = {
@@ -20,6 +25,7 @@
     nixpkgs,
     tamerye,
     datasources,
+    kanjivg-src,
   }: let
     inherit (nixpkgs) lib;
     systems = [
@@ -139,6 +145,7 @@
       database = pkgs.callPackage ./nix/database.nix {
         sqlite = pkgs.tamerye-sqlite-cli;
         inherit (datasources.packages.${system}) jmdict radkfile kanjidic2 tanos-jlpt;
+        kanjivg = kanjivg-src;
         inherit (self.packages.${system}) database-tool;
         inherit src;
       };
@@ -146,6 +153,7 @@
       database-wal = pkgs.callPackage ./nix/database.nix {
         sqlite = pkgs.tamerye-sqlite-cli;
         inherit (datasources.packages.${system}) jmdict radkfile kanjidic2 tanos-jlpt;
+        kanjivg = kanjivg-src;
         inherit (self.packages.${system}) database-tool;
         inherit src;
         wal = true;
diff --git a/lib/_data_ingestion/kanjivg/objects.dart b/lib/_data_ingestion/kanjivg/objects.dart
new file mode 100644
index 0000000..504589d
--- /dev/null
+++ b/lib/_data_ingestion/kanjivg/objects.dart
@@ -0,0 +1,172 @@
+import 'package:jadb/_data_ingestion/sql_writable.dart';
+
+/// Enum set in the kvg:position attribute, used by `<g>` elements in the KanjiVG SVG files.
+enum KanjiPathGroupPosition {
+  upperA,
+  upperB,
+  lower1,
+  lower2,
+  bottom,
+  kamae,
+  kamaec,
+  left,
+  middle,
+  nyo,
+  nyoc,
+  right,
+  tare,
+  tarec,
+  top;
+
+  /// Maps an attribute value to its enum value; `null` stays `null`, unknown values throw [ArgumentError].
+  static KanjiPathGroupPosition? fromString(String? str) {
+    if (str == null) return null;
+    switch (str) {
+      case '⿵A':
+        return KanjiPathGroupPosition.upperA;
+      case '⿵B':
+        return KanjiPathGroupPosition.upperB;
+      case '⿶1':
+        return KanjiPathGroupPosition.lower1;
+      case '⿶2':
+        return KanjiPathGroupPosition.lower2;
+      case 'bottom':
+        return KanjiPathGroupPosition.bottom;
+      case 'kamae':
+        return KanjiPathGroupPosition.kamae;
+      case 'kamaec':
+        return KanjiPathGroupPosition.kamaec;
+      case 'left':
+        return KanjiPathGroupPosition.left;
+      case 'middle':
+        return KanjiPathGroupPosition.middle;
+      case 'nyo':
+        return KanjiPathGroupPosition.nyo;
+      case 'nyoc':
+        return KanjiPathGroupPosition.nyoc;
+      case 'right':
+        return KanjiPathGroupPosition.right;
+      case 'tare':
+        return KanjiPathGroupPosition.tare;
+      case 'tarec':
+        return KanjiPathGroupPosition.tarec;
+      case 'top':
+        return KanjiPathGroupPosition.top;
+      default:
+        throw ArgumentError('Unknown position: $str');
+    }
+  }
+}
+
+/// Enum set in the kvg:radical attribute, used by `<g>` elements in the KanjiVG SVG files.
+enum KanjiVGRadical {
+  general,
+  jis,
+  nelson,
+  tradit;
+
+  /// Maps an attribute value to its enum value; `null` stays `null`, unknown values throw [ArgumentError].
+  static KanjiVGRadical? fromString(String? str) {
+    if (str == null) return null;
+    switch (str) {
+      case 'general':
+        return KanjiVGRadical.general;
+      case 'jis':
+        return KanjiVGRadical.jis;
+      case 'nelson':
+        return KanjiVGRadical.nelson;
+      case 'tradit':
+        return KanjiVGRadical.tradit;
+      default:
+        throw ArgumentError('Unknown radical: $str');
+    }
+  }
+}
+
+/// Contents of a `<g>` element in the KanjiVG SVG files.
+class KanjiPathGroupTreeNode extends SQLWritable {
+  final int id;
+  final List<KanjiPathGroupTreeNode> children;
+  final String? element;
+  final String? original;
+  final KanjiPathGroupPosition? position;
+  final KanjiVGRadical? radical;
+  final int? part;
+
+  // Currently unused data.
+  final bool radicalForm;
+  final bool tradForm;
+  final bool partial;
+  final String? variant;
+
+  KanjiPathGroupTreeNode({
+    required this.id,
+    this.children = const [],
+    this.element,
+    this.original,
+    this.position,
+    this.radical,
+    this.part,
+
+    this.variant,
+    this.radicalForm = false,
+    this.tradForm = false,
+    this.partial = false,
+  });
+
+  @override
+  Map<String, Object?> get sqlValue => {
+    'groupId': id,
+    'element': element,
+    'original': original,
+    'position': position?.name,
+    'radical': radical?.name,
+    'part': part,
+  };
+}
+
+/// Contents of a `<text>` element in the StrokeNumber's group in the KanjiVG SVG files
+class KanjiStrokeNumber extends SQLWritable {
+  final int num;
+  final double x;
+  final double y;
+
+  KanjiStrokeNumber(this.num, this.x, this.y);
+
+  @override
+  Map<String, Object?> get sqlValue => {'strokeNum': num, 'x': x, 'y': y};
+}
+
+/// Contents of a `<path>` element in the KanjiVG SVG files
+class KanjiVGPath extends SQLWritable {
+  final int id;
+  final String? type;
+  final String svgPath;
+
+  KanjiVGPath({required this.id, required this.type, required this.svgPath});
+
+  @override
+  Map<String, Object?> get sqlValue => {
+    'pathId': id,
+    'type': type,
+    'svgPath': svgPath,
+  };
+}
+
+/// Everything parsed from a single KanjiVG SVG file.
+class KanjiVGItem extends SQLWritable {
+  final String character;
+  final List<KanjiVGPath> paths;
+  final List<KanjiStrokeNumber> strokeNumbers;
+  final List<KanjiPathGroupTreeNode> pathGroups;
+
+  KanjiVGItem({
+    required this.character,
+    required this.paths,
+    required this.strokeNumbers,
+    required this.pathGroups,
+  });
+
+  @override
+  Map<String, Object?> get sqlValue => {'character': character};
+}
diff --git a/lib/_data_ingestion/kanjivg/parser.dart b/lib/_data_ingestion/kanjivg/parser.dart
new file mode 100644
index 0000000..71792b6
--- /dev/null
+++ b/lib/_data_ingestion/kanjivg/parser.dart
@@ -0,0 +1,109 @@
+import 'dart:io';
+
+import 'package:collection/collection.dart';
+import 'package:jadb/_data_ingestion/kanjivg/objects.dart';
+import 'package:xml/xml.dart';
+
+/// Parses every `.svg` file directly inside [rootDir] into a [KanjiVGItem].
+///
+/// Subdirectories are not visited. A file without a stroke-path or
+/// stroke-number group yields empty lists for the corresponding fields.
+List<KanjiVGItem> parseKanjiVGData(Directory rootDir) {
+  final List<KanjiVGItem> items = [];
+
+  for (final file in rootDir.listSync()) {
+    if (file is File && file.path.endsWith('.svg')) {
+      final String rawSVG = file.readAsStringSync();
+      final XmlDocument doc = XmlDocument.parse(rawSVG);
+
+      final strokePathsGroup = doc
+          .findAllElements('g')
+          .firstWhereOrNull(
+            (e) => e.getAttribute('id')?.startsWith('kvg:StrokePaths') ?? false,
+          );
+
+      final strokeNumbersGroup = doc
+          .findAllElements('g')
+          .firstWhereOrNull(
+            (e) =>
+                e.getAttribute('id')?.startsWith('kvg:StrokeNumbers') ?? false,
+          );
+
+      final pathGroups = strokePathsGroup != null
+          ? _parsePathGroups(strokePathsGroup)
+          : <KanjiPathGroupTreeNode>[];
+
+      final strokeNumbers = strokeNumbersGroup != null
+          ? _parseStrokeNumbers(strokeNumbersGroup)
+          : <KanjiStrokeNumber>[];
+
+      final paths = strokePathsGroup != null
+          ? _parsePaths(strokePathsGroup)
+          : <KanjiVGPath>[];
+
+      items.add(
+        KanjiVGItem(
+          // NOTE(review): this is the file name up to the first '.', not a
+          // decoded glyph; confirm it matches the CHAR(1) "character" column.
+          character: file.uri.pathSegments.last.split('.').first,
+          paths: paths,
+          strokeNumbers: strokeNumbers,
+          pathGroups: pathGroups,
+        ),
+      );
+    }
+  }
+
+  return items;
+}
+
+/// Parses the child elements of the stroke-number group into [KanjiStrokeNumber]s.
+List<KanjiStrokeNumber> _parseStrokeNumbers(XmlElement group) => group
+    .childElements
+    .map((e) {
+      final num = int.parse(e.innerText);
+      final xy = e
+          .getAttribute('transform')!
+          .split('matrix(1 0 0 1 ')[1]
+          .split(')')[0]
+          .split(' ')
+          .map(double.parse)
+          .toList();
+      return KanjiStrokeNumber(num, xy[0], xy[1]);
+    })
+    .toList(growable: false);
+
+/// Recursively parses the direct `<g>` children of [group] into a tree.
+List<KanjiPathGroupTreeNode> _parsePathGroups(XmlElement group) => group
+    .findElements('g')
+    .map((e) {
+      return KanjiPathGroupTreeNode(
+        // NOTE: the outermost group does not have a number
+        id: int.tryParse(e.getAttribute('id')!.split('-').last.substring(1)) ?? 0,
+        element: e.getAttribute('kvg:element'),
+        original: e.getAttribute('kvg:original'),
+        variant: e.getAttribute('kvg:variant'),
+        position: KanjiPathGroupPosition.fromString(
+          e.getAttribute('kvg:position'),
+        ),
+        radical: KanjiVGRadical.fromString(e.getAttribute('kvg:radical')),
+        part: int.tryParse(e.getAttribute('kvg:part') ?? ''),
+        radicalForm: e.getAttribute('kvg:radicalForm') == 'true',
+        tradForm: e.getAttribute('kvg:tradForm') == 'true',
+        partial: e.getAttribute('kvg:partial') == 'true',
+        children: _parsePathGroups(e),
+      );
+    })
+    .toList(growable: false);
+
+/// Collects every `<path>` element below [group], at any depth.
+List<KanjiVGPath> _parsePaths(XmlElement group) => group
+    .findAllElements('path')
+    .map(
+      (e) => KanjiVGPath(
+        id: int.parse(e.getAttribute('id')!.split('-').last.substring(1)),
+        type: e.getAttribute('kvg:type'),
+        svgPath: e.getAttribute('d')!,
+      ),
+    )
+    .toList(growable: false);
diff --git a/lib/_data_ingestion/kanjivg/seed_data.dart b/lib/_data_ingestion/kanjivg/seed_data.dart
new file mode 100644
index 0000000..5d2fe29
--- /dev/null
+++ b/lib/_data_ingestion/kanjivg/seed_data.dart
@@ -0,0 +1,56 @@
+import 'package:jadb/_data_ingestion/kanjivg/objects.dart';
+import 'package:jadb/table_names/kanjivg.dart';
+import 'package:sqflite_common/sqflite.dart';
+
+/// Writes [items] with their paths, stroke numbers and path groups to [db]
+/// as one batch inside a single transaction.
+Future<void> seedKanjiVGData(Iterable<KanjiVGItem> items, Database db) {
+  return db.transaction((txn) async {
+    // Defer foreign key checks to commit so rows can be queued in any order.
+    await txn.execute('PRAGMA defer_foreign_keys = ON');
+
+    final b = txn.batch();
+
+    for (final item in items) {
+      b.insert(KanjiVGTableNames.entry, item.sqlValue);
+
+      for (final path in item.paths) {
+        b.insert(
+          KanjiVGTableNames.path,
+          path.sqlValue..addAll({'character': item.character}),
+        );
+      }
+
+      for (final strokeNumber in item.strokeNumbers) {
+        b.insert(
+          KanjiVGTableNames.strokeNumber,
+          strokeNumber.sqlValue..addAll({'character': item.character}),
+        );
+      }
+
+      for (final pathGroup in item.pathGroups) {
+        _insertPathGroup(b, null, pathGroup, item.character);
+      }
+    }
+
+    await b.commit(noResult: true);
+  });
+}
+
+/// Recursively insert path groups and their children
+void _insertPathGroup(
+  Batch b,
+  int? parentGroupId,
+  KanjiPathGroupTreeNode node,
+  String character,
+) {
+  b.insert(
+    KanjiVGTableNames.pathGroup,
+    node.sqlValue
+      ..addAll({'character': character, 'parentGroupId': parentGroupId}),
+  );
+
+  for (final child in node.children) {
+    _insertPathGroup(b, node.id, child, character);
+  }
+}
diff --git a/lib/_data_ingestion/seed_database.dart b/lib/_data_ingestion/seed_database.dart
index 610a292..d176917 100644
--- a/lib/_data_ingestion/seed_database.dart
+++ b/lib/_data_ingestion/seed_database.dart
@@ -4,6 +4,8 @@ import 'package:jadb/_data_ingestion/jmdict/seed_data.dart';
 import 'package:jadb/_data_ingestion/jmdict/xml_parser.dart';
 import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
 import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
+import 'package:jadb/_data_ingestion/kanjivg/parser.dart';
+import 'package:jadb/_data_ingestion/kanjivg/seed_data.dart';
 import 'package:jadb/_data_ingestion/radkfile/parser.dart';
 import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
 import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
@@ -17,6 +19,7 @@ Future<void> seedData(Database db) async {
   await parseAndSeedDataFromRADKFILE(db);
   await parseAndSeedDataFromKANJIDIC(db);
   await parseAndSeedDataFromTanosJLPT(db);
+  await parseAndSeedDataFromKanjiVG(db);
 
   print('Performing VACUUM');
   await db.execute('VACUUM');
@@ -102,3 +105,19 @@ Future<void> parseAndSeedDataFromTanosJLPT(Database db) async {
   print('[TANOS-JLPT] Writing to database...');
   await seedTanosJLPTData(resolvedEntries, db);
 }
+
+/// Parses the KanjiVG SVG directory (`KANJIVG_PATH`, defaulting to
+/// `data/kanjivg`) and writes the result to [db]; throws if it is missing.
+Future<void> parseAndSeedDataFromKanjiVG(Database db) async {
+  final kanjivgPath =
+      Platform.environment['KANJIVG_PATH'] ?? 'data/kanjivg';
+  if (!Directory(kanjivgPath).existsSync()) {
+    throw Exception('KANJIVG directory not found at $kanjivgPath');
+  }
+
+  print('[KANJIVG] Parsing content...');
+  final items = parseKanjiVGData(Directory(kanjivgPath));
+
+  print('[KANJIVG] Writing to database...');
+  await seedKanjiVGData(items, db);
+}
diff --git a/lib/models/verify_tables.dart b/lib/models/verify_tables.dart
index fe5c955..23a5559 100644
--- a/lib/models/verify_tables.dart
+++ b/lib/models/verify_tables.dart
@@ -1,5 +1,6 @@
 import 'package:jadb/table_names/jmdict.dart';
 import 'package:jadb/table_names/kanjidic.dart';
+import 'package:jadb/table_names/kanjivg.dart';
 import 'package:jadb/table_names/radkfile.dart';
 import 'package:jadb/table_names/tanos_jlpt.dart';
 import 'package:sqflite_common/sqlite_api.dart';
@@ -21,6 +22,7 @@ Future<void> verifyTablesWithDbConnection(DatabaseExecutor db) async {
     ...KANJIDICTableNames.allTables,
     ...RADKFILETableNames.allTables,
     ...TanosJLPTTableNames.allTables,
+    ...KanjiVGTableNames.allTables,
   };
 
   final missingTables = expectedTables.difference(tables);
diff --git a/lib/table_names/kanjivg.dart b/lib/table_names/kanjivg.dart
new file mode 100644
index 0000000..cee9e41
--- /dev/null
+++ b/lib/table_names/kanjivg.dart
@@ -0,0 +1,10 @@
+/// Names of the SQL tables that hold the KanjiVG data.
+abstract class KanjiVGTableNames {
+  static const String version = 'KanjiVG_Version';
+  static const String entry = 'KanjiVG_Entry';
+  static const String path = 'KanjiVG_Path';
+  static const String strokeNumber = 'KanjiVG_StrokeNumber';
+  static const String pathGroup = 'KanjiVG_PathGroup';
+
+  static Set<String> get allTables => {version, entry, path, strokeNumber, pathGroup};
+}
diff --git a/migrations/0011_KanjiVG.sql b/migrations/0011_KanjiVG.sql
new file mode 100644
index 0000000..acb4904
--- /dev/null
+++ b/migrations/0011_KanjiVG.sql
@@ -0,0 +1,67 @@
+CREATE TABLE "KanjiVG_Version" (
+  "version" VARCHAR(10) PRIMARY KEY NOT NULL,
+  "date" DATE NOT NULL,
+  "hash" VARCHAR(64) NOT NULL
+) WITHOUT ROWID;
+
+CREATE TRIGGER "KanjiVG_Version_SingleRow"
+BEFORE INSERT ON "KanjiVG_Version"
+WHEN (SELECT COUNT(*) FROM "KanjiVG_Version") >= 1
+BEGIN
+  SELECT RAISE(FAIL, 'Only one row allowed in KanjiVG_Version');
+END;
+
+CREATE TABLE "KanjiVG_Entry" (
+  "character" CHAR(1) PRIMARY KEY NOT NULL
+) WITHOUT ROWID;
+
+CREATE TABLE "KanjiVG_StrokeNumber" (
+  "character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
+  "strokeNum" INTEGER NOT NULL,
+  "x" REAL NOT NULL,
+  "y" REAL NOT NULL,
+  PRIMARY KEY ("character", "strokeNum"),
+  FOREIGN KEY ("character", "strokeNum") REFERENCES "KanjiVG_Path"("character", "pathId")
+) WITHOUT ROWID;
+
+CREATE TABLE "KanjiVG_Path" (
+  "character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
+  "pathId" INTEGER NOT NULL,
+  "type" VARCHAR(10),
+  "svgPath" TEXT NOT NULL,
+  PRIMARY KEY ("character", "pathId")
+) WITHOUT ROWID;
+
+CREATE TABLE "KanjiVG_PathGroup" (
+  "character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
+  "groupId" INTEGER NOT NULL,
+  "parentGroupId" INTEGER,
+  "element" TEXT,
+  "original" TEXT,
+  "position" VARCHAR(6),
+  "radical" TEXT,
+  "part" INTEGER,
+  PRIMARY KEY ("character", "groupId"),
+  CHECK (
+    "position" IN (
+      'upperA',
+      'upperB',
+      'lower1',
+      'lower2',
+      'bottom',
+      'kamae',
+      'kamaec',
+      'left',
+      'middle',
+      'nyo',
+      'nyoc',
+      'right',
+      'tare',
+      'tarec',
+      'top'
+    )
+    OR
+    "position" IS NULL
+  ),
+  FOREIGN KEY ("character", "parentGroupId") REFERENCES "KanjiVG_PathGroup"("character", "groupId")
+) WITHOUT ROWID;
diff --git a/nix/database.nix b/nix/database.nix
index 21439e1..4545a22 100644
--- a/nix/database.nix
+++ b/nix/database.nix
@@ -7,6 +7,7 @@
   radkfile,
   kanjidic2,
   tanos-jlpt,
+  kanjivg,
   sqlite,
   wal ? false,
 }:
@@ -39,6 +40,8 @@ stdenvNoCC.mkDerivation {
     TANOS_JLPT_VERSION = tanos-jlpt.version;
     TANOS_JLPT_DATE = tanos-jlpt.date;
     TANOS_JLPT_HASH = tanos-jlpt.hash;
+
+    KANJIVG_PATH = "${kanjivg}/kanji";
   };
 
   buildPhase = ''