WIP: add kanjivg data
Build and test / build (push) Successful in 10m1s

This commit is contained in:
2026-03-03 13:47:59 +09:00
parent 8ba7c66e67
commit 31bacb7476
10 changed files with 444 additions and 0 deletions
Generated
+17
View File
@@ -35,6 +35,22 @@
"url": "https://git.pvv.ntnu.no/Mugiten/datasources.git"
}
},
"kanjivg-src": {
"flake": false,
"locked": {
"lastModified": 1778620714,
"narHash": "sha256-LwNcY5A6XPGI+DASZfmP7OeYe8IFesShhSrE7Go2ux8=",
"ref": "refs/heads/master",
"rev": "1957802840a6f059d1e27dcb5755722955cc7dbb",
"revCount": 2217,
"type": "git",
"url": "https://git.pvv.ntnu.no/mugiten/kanjivg.git"
},
"original": {
"type": "git",
"url": "https://git.pvv.ntnu.no/mugiten/kanjivg.git"
}
},
"nix-sqlite": {
"inputs": {
"nixpkgs": [
@@ -75,6 +91,7 @@
"root": {
"inputs": {
"datasources": "datasources",
"kanjivg-src": "kanjivg-src",
"nixpkgs": "nixpkgs",
"tamerye": "tamerye"
}
+8
View File
@@ -13,6 +13,11 @@
url = "git+https://git.pvv.ntnu.no/Mugiten/datasources.git";
inputs.nixpkgs.follows = "nixpkgs";
};
kanjivg-src = {
url = "git+https://git.pvv.ntnu.no/mugiten/kanjivg.git";
flake = false;
};
};
outputs = {
@@ -20,6 +25,7 @@
nixpkgs,
tamerye,
datasources,
kanjivg-src,
}: let
inherit (nixpkgs) lib;
systems = [
@@ -139,6 +145,7 @@
database = pkgs.callPackage ./nix/database.nix {
sqlite = pkgs.tamerye-sqlite-cli;
inherit (datasources.packages.${system}) jmdict radkfile kanjidic2 tanos-jlpt;
kanjivg = kanjivg-src;
inherit (self.packages.${system}) database-tool;
inherit src;
};
@@ -146,6 +153,7 @@
database-wal = pkgs.callPackage ./nix/database.nix {
sqlite = pkgs.tamerye-sqlite-cli;
inherit (datasources.packages.${system}) jmdict radkfile kanjidic2 tanos-jlpt;
kanjivg = kanjivg-src;
inherit (self.packages.${system}) database-tool;
inherit src;
wal = true;
+168
View File
@@ -0,0 +1,168 @@
import 'package:jadb/_data_ingestion/sql_writable.dart';
/// Values of the `kvg:position` attribute found on `<g>` elements in the
/// KanjiVG SVG files.
enum KanjiPathGroupPosition {
  upperA,
  upperB,
  lower1,
  lower2,
  bottom,
  kamae,
  kamaec,
  left,
  middle,
  nyo,
  nyoc,
  right,
  tare,
  tarec,
  top;

  /// Attribute spellings accepted by [fromString], mapped to their enum value.
  ///
  /// Most spellings equal the enum name; the first four do not, which is why
  /// an explicit table is used instead of a lookup by name.
  static const Map<String, KanjiPathGroupPosition> _byAttributeValue = {
    '⿵A': upperA,
    '⿵B': upperB,
    '⿶1': lower1,
    '⿶2': lower2,
    'bottom': bottom,
    'kamae': kamae,
    'kamaec': kamaec,
    'left': left,
    'middle': middle,
    'nyo': nyo,
    'nyoc': nyoc,
    'right': right,
    'tare': tare,
    'tarec': tarec,
    'top': top,
  };

  /// Converts a raw `kvg:position` attribute value into its enum value.
  ///
  /// Returns `null` when [str] is `null` (attribute absent) and throws an
  /// [ArgumentError] when [str] is not one of the known spellings.
  static KanjiPathGroupPosition? fromString(String? str) {
    if (str == null) return null;

    final position = _byAttributeValue[str];
    if (position == null) {
      throw ArgumentError('Unknown position: $str');
    }
    return position;
  }
}
/// Values accepted for the radical attribute of a KanjiVG `<g>` element.
enum KanjiVGRadical {
  general,
  jis,
  nelson,
  tradit;

  /// Converts a raw attribute value into its enum value.
  ///
  /// Every accepted spelling is identical to the enum value's [name], so the
  /// lookup is a case-sensitive match against [values].
  ///
  /// Returns `null` when [str] is `null` and throws an [ArgumentError] when
  /// [str] matches no value.
  static KanjiVGRadical? fromString(String? str) {
    if (str == null) return null;

    for (final radical in values) {
      if (radical.name == str) return radical;
    }
    throw ArgumentError('Unknown radical: $str');
  }
}
/// Contents of a `<g>` element in the KanjiVG SVG files.
///
/// Groups nest, so every node carries its own [children]. Only the fields
/// returned by [sqlValue] are exposed for writing; the rest are parsed but
/// currently unused.
class KanjiPathGroupTreeNode extends SQLWritable {
  /// Identifier of this group, emitted as `groupId`.
  final int id;

  /// Groups nested directly inside this one.
  final List<KanjiPathGroupTreeNode> children;

  /// Emitted as the `element` column; may be absent.
  final String? element;

  /// Emitted as the `original` column; may be absent.
  final String? original;

  /// Emitted by enum name as the `position` column; may be absent.
  final KanjiPathGroupPosition? position;

  /// Emitted by enum name as the `radical` column; may be absent.
  final KanjiVGRadical? radical;

  /// Emitted as the `part` column; may be absent.
  final int? part;

  // Currently unused data: none of the fields below are part of [sqlValue].
  final String? variant;
  final bool radicalForm;
  final bool tradForm;
  final bool partial;

  KanjiPathGroupTreeNode({
    required this.id,
    this.children = const [],
    this.element,
    this.original,
    this.position,
    this.radical,
    this.part,
    this.variant,
    this.radicalForm = false,
    this.tradForm = false,
    this.partial = false,
  });

  /// Column values for this node alone; [children] are not included.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'groupId': id,
      'element': element,
      'original': original,
      'position': position?.name,
      'radical': radical?.name,
      'part': part,
    };
  }
}
/// Contents of a `<text>` element in the StrokeNumber's group in the KanjiVG
/// SVG files.
class KanjiStrokeNumber extends SQLWritable {
  /// The stroke number, emitted as `strokeNum`.
  final int num;

  /// Horizontal coordinate stored alongside the number.
  final double x;

  /// Vertical coordinate stored alongside the number.
  final double y;

  KanjiStrokeNumber(this.num, this.x, this.y);

  /// Column values for this stroke number.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'strokeNum': num,
      'x': x,
      'y': y,
    };
  }
}
/// Contents of a `<path>` element in the KanjiVG SVG files.
class KanjiVGPath extends SQLWritable {
  /// Identifier of the path, emitted as `pathId`.
  final int id;

  /// Optional type label of the path.
  final String? type;

  /// The SVG path data string.
  final String svgPath;

  KanjiVGPath({required this.id, required this.type, required this.svgPath});

  /// Column values for this path.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{
      'pathId': id,
      'type': type,
      'svgPath': svgPath,
    };
  }
}
/// Everything collected for one KanjiVG entry: its paths, stroke numbers and
/// path-group tree.
class KanjiVGItem extends SQLWritable {
  /// Key identifying the entry; the only field emitted by [sqlValue].
  final String character;

  /// All paths belonging to this entry.
  final List<KanjiVGPath> paths;

  /// All stroke-number labels belonging to this entry.
  final List<KanjiStrokeNumber> strokeNumbers;

  /// Top-level path groups; nested groups hang off their `children`.
  final List<KanjiPathGroupTreeNode> pathGroups;

  KanjiVGItem({
    required this.character,
    required this.paths,
    required this.strokeNumbers,
    required this.pathGroups,
  });

  /// Column values for the entry row itself; the lists are not included.
  @override
  Map<String, Object?> get sqlValue {
    return <String, Object?>{'character': character};
  }
}
+100
View File
@@ -0,0 +1,100 @@
import 'dart:io';
import 'package:collection/collection.dart';
import 'package:jadb/_data_ingestion/kanjivg/objects.dart';
import 'package:xml/xml.dart';
/// Parses every `.svg` file directly inside [rootDir] into a [KanjiVGItem].
///
/// Entries of [rootDir] that are not files, or whose path does not end in
/// `.svg`, are ignored. Sub-directories are not descended into. A file that
/// lacks the stroke-paths or stroke-numbers group yields empty lists for the
/// corresponding fields instead of failing.
///
/// NOTE(review): `character` is set to the file name up to its first `.`. If
/// the data set names files after a code point or a variant label rather
/// than the literal character, that stem is what gets stored; confirm this
/// is intended.
List<KanjiVGItem> parseKanjiVGData(Directory rootDir) {
  // First `<g>` anywhere in the document whose id starts with [idPrefix].
  XmlElement? groupByIdPrefix(XmlDocument svg, String idPrefix) {
    return svg.findAllElements('g').firstWhereOrNull(
      (g) => g.getAttribute('id')?.startsWith(idPrefix) ?? false,
    );
  }

  KanjiVGItem parseFile(File svgFile) {
    final svg = XmlDocument.parse(svgFile.readAsStringSync());

    final strokePaths = groupByIdPrefix(svg, 'kvg:StrokePaths');
    final strokeLabels = groupByIdPrefix(svg, 'kvg:StrokeNumbers');

    final groups = strokePaths == null
        ? <KanjiPathGroupTreeNode>[]
        : _parsePathGroups(strokePaths);
    final numbers = strokeLabels == null
        ? <KanjiStrokeNumber>[]
        : _parseStrokeNumbers(strokeLabels);
    final strokes =
        strokePaths == null ? <KanjiVGPath>[] : _parsePaths(strokePaths);

    final fileName = svgFile.uri.pathSegments.last;
    return KanjiVGItem(
      character: fileName.split('.').first,
      paths: strokes,
      strokeNumbers: numbers,
      pathGroups: groups,
    );
  }

  return rootDir
      .listSync()
      .whereType<File>()
      .where((candidate) => candidate.path.endsWith('.svg'))
      .map(parseFile)
      .toList();
}
/// Reads one [KanjiStrokeNumber] from each child element of [group].
///
/// The element's text is parsed as the stroke number. Its `transform`
/// attribute must contain `matrix(1 0 0 1 <x> <y>)`; the two values after
/// that fixed prefix become the coordinates. A missing attribute or any other
/// format makes the parse throw.
List<KanjiStrokeNumber> _parseStrokeNumbers(XmlElement group) {
  KanjiStrokeNumber toStrokeNumber(XmlElement label) {
    final strokeNum = int.parse(label.innerText);

    final transform = label.getAttribute('transform')!;
    final afterPrefix = transform.split('matrix(1 0 0 1 ')[1];
    final beforeParen = afterPrefix.split(')')[0];
    final coordinates = beforeParen.split(' ').map(double.parse).toList();

    return KanjiStrokeNumber(strokeNum, coordinates[0], coordinates[1]);
  }

  return group.childElements.map(toStrokeNumber).toList(growable: false);
}
/// Builds the tree of `<g>` elements found directly beneath [group].
///
/// Only direct `<g>` children are visited at each level; deeper groups are
/// reached by recursing into every child. Each visited element must carry an
/// `id` attribute.
List<KanjiPathGroupTreeNode> _parsePathGroups(XmlElement group) {
  final nodes = <KanjiPathGroupTreeNode>[];

  for (final child in group.findElements('g')) {
    // Boolean `kvg:` attributes count as set only when exactly 'true'.
    bool flag(String attribute) => child.getAttribute(attribute) == 'true';

    // The id is the number following the first character of the last
    // '-'-separated piece of the `id` attribute.
    // NOTE: the outermost group does not have a number, so it gets id 0.
    final idSuffix = child.getAttribute('id')!.split('-').last;
    final groupId = int.tryParse(idSuffix.substring(1)) ?? 0;

    final position = KanjiPathGroupPosition.fromString(
      child.getAttribute('kvg:position'),
    );
    final radical = KanjiVGRadical.fromString(
      child.getAttribute('kvg:radical'),
    );

    nodes.add(
      KanjiPathGroupTreeNode(
        id: groupId,
        element: child.getAttribute('kvg:element'),
        original: child.getAttribute('kvg:original'),
        variant: child.getAttribute('kvg:variant'),
        position: position,
        radical: radical,
        // A missing or non-numeric part becomes null.
        part: int.tryParse(child.getAttribute('kvg:part') ?? ''),
        radicalForm: flag('kvg:radicalForm'),
        tradForm: flag('kvg:tradForm'),
        partial: flag('kvg:partial'),
        children: _parsePathGroups(child),
      ),
    );
  }

  return List<KanjiPathGroupTreeNode>.of(nodes, growable: false);
}
/// Collects every `<path>` element below [group], at any nesting depth.
///
/// Each path must carry an `id` whose last '-'-separated piece is one
/// character followed by a number, and a `d` attribute holding the path data;
/// otherwise the parse throws.
List<KanjiVGPath> _parsePaths(XmlElement group) {
  final strokes = <KanjiVGPath>[];

  for (final pathElement in group.findAllElements('path')) {
    final idSuffix = pathElement.getAttribute('id')!.split('-').last;
    strokes.add(
      KanjiVGPath(
        id: int.parse(idSuffix.substring(1)),
        type: pathElement.getAttribute('kvg:type'),
        svgPath: pathElement.getAttribute('d')!,
      ),
    );
  }

  return List<KanjiVGPath>.of(strokes, growable: false);
}
@@ -0,0 +1,53 @@
import 'package:jadb/_data_ingestion/kanjivg/objects.dart';
import 'package:jadb/table_names/kanjivg.dart';
import 'package:sqflite_common/sqflite.dart';
/// Writes all [items] to [db] inside a single transaction.
///
/// Foreign-key checks are deferred for the duration of the transaction, so
/// the rows can be queued in one batch regardless of the order in which the
/// tables reference each other.
Future<void> seedKanjiVGData(Iterable<KanjiVGItem> items, Database db) =>
    db.transaction((txn) async {
      await txn.execute('PRAGMA defer_foreign_keys = ON');

      final batch = txn.batch();
      for (final entry in items) {
        final owner = entry.character;

        batch.insert(KanjiVGTableNames.entry, entry.sqlValue);

        // Child rows carry the owning entry's character as an extra column.
        for (final stroke in entry.paths) {
          batch.insert(
            KanjiVGTableNames.path,
            {...stroke.sqlValue, 'character': owner},
          );
        }
        for (final label in entry.strokeNumbers) {
          batch.insert(
            KanjiVGTableNames.strokeNumber,
            {...label.sqlValue, 'character': owner},
          );
        }
        for (final rootGroup in entry.pathGroups) {
          // Top-level groups have no parent.
          _insertPathGroup(batch, null, rootGroup, owner);
        }
      }

      await batch.commit(noResult: true);
    });
/// Queues [node] and, recursively, all of its descendants on batch [b].
///
/// Each row is tagged with [character] and with [parentGroupId], which is
/// `null` for a top-level group. A node is queued before its children.
void _insertPathGroup(
  Batch b,
  int? parentGroupId,
  KanjiPathGroupTreeNode node,
  String character,
) {
  final row = <String, Object?>{
    ...node.sqlValue,
    'character': character,
    'parentGroupId': parentGroupId,
  };
  b.insert(KanjiVGTableNames.pathGroup, row);

  for (final descendant in node.children) {
    _insertPathGroup(b, node.id, descendant, character);
  }
}
+17
View File
@@ -4,6 +4,8 @@ import 'package:jadb/_data_ingestion/jmdict/seed_data.dart';
import 'package:jadb/_data_ingestion/jmdict/xml_parser.dart';
import 'package:jadb/_data_ingestion/kanjidic/seed_data.dart';
import 'package:jadb/_data_ingestion/kanjidic/xml_parser.dart';
import 'package:jadb/_data_ingestion/kanjivg/parser.dart';
import 'package:jadb/_data_ingestion/kanjivg/seed_data.dart';
import 'package:jadb/_data_ingestion/radkfile/parser.dart';
import 'package:jadb/_data_ingestion/radkfile/seed_data.dart';
import 'package:jadb/_data_ingestion/tanos-jlpt/csv_parser.dart';
@@ -17,6 +19,7 @@ Future<void> seedData(Database db) async {
await parseAndSeedDataFromRADKFILE(db);
await parseAndSeedDataFromKANJIDIC(db);
await parseAndSeedDataFromTanosJLPT(db);
await parseAndSeedDataFromKanjiVG(db);
print('Performing VACUUM');
await db.execute('VACUUM');
@@ -102,3 +105,17 @@ Future<void> parseAndSeedDataFromTanosJLPT(Database db) async {
print('[TANOS-JLPT] Writing to database...');
await seedTanosJLPTData(resolvedEntries, db);
}
/// Parses the KanjiVG SVG directory and writes the result to [db].
///
/// The directory is read from the `KANJIVG_PATH` environment variable and
/// falls back to `data/kanjivg`. Throws an [Exception] when the directory
/// does not exist.
Future<void> parseAndSeedDataFromKanjiVG(Database db) async {
  final kanjivgPath = Platform.environment['KANJIVG_PATH'] ?? 'data/kanjivg';
  final kanjivgDir = Directory(kanjivgPath);

  if (!kanjivgDir.existsSync()) {
    throw Exception('KANJIVG directory not found at $kanjivgPath');
  }

  print('[KANJIVG] Parsing content...');
  final items = parseKanjiVGData(kanjivgDir);

  print('[KANJIVG] Writing to database...');
  await seedKanjiVGData(items, db);
}
+2
View File
@@ -1,5 +1,6 @@
import 'package:jadb/table_names/jmdict.dart';
import 'package:jadb/table_names/kanjidic.dart';
import 'package:jadb/table_names/kanjivg.dart';
import 'package:jadb/table_names/radkfile.dart';
import 'package:jadb/table_names/tanos_jlpt.dart';
import 'package:sqflite_common/sqlite_api.dart';
@@ -21,6 +22,7 @@ Future<void> verifyTablesWithDbConnection(DatabaseExecutor db) async {
...KANJIDICTableNames.allTables,
...RADKFILETableNames.allTables,
...TanosJLPTTableNames.allTables,
...KanjiVGTableNames.allTables,
};
final missingTables = expectedTables.difference(tables);
+9
View File
@@ -0,0 +1,9 @@
/// Names of the database tables that hold KanjiVG data.
abstract class KanjiVGTableNames {
  /// Table holding the data set's version information.
  static const String version = 'KanjiVG_Version';

  /// Table with one row per entry.
  static const String entry = 'KanjiVG_Entry';

  /// Table with the paths of each entry.
  static const String path = 'KanjiVG_Path';

  /// Table with the stroke-number labels of each entry.
  static const String strokeNumber = 'KanjiVG_StrokeNumber';

  /// Table with the path-group tree of each entry.
  static const String pathGroup = 'KanjiVG_PathGroup';

  /// Every table name declared above, in declaration order.
  static Set<String> get allTables {
    return <String>{
      version,
      entry,
      path,
      strokeNumber,
      pathGroup,
    };
  }
}
+67
View File
@@ -0,0 +1,67 @@
-- Version record for the KanjiVG data: a version string, a date and a hash.
-- The "KanjiVG_Version_SingleRow" trigger limits this table to one row.
CREATE TABLE "KanjiVG_Version" (
"version" VARCHAR(10) PRIMARY KEY NOT NULL,
"date" DATE NOT NULL,
"hash" VARCHAR(64) NOT NULL
) WITHOUT ROWID;
-- Rejects an INSERT into "KanjiVG_Version" when the table already holds at
-- least one row, so that it never contains more than a single version record.
CREATE TRIGGER "KanjiVG_Version_SingleRow"
BEFORE INSERT ON "KanjiVG_Version"
WHEN (SELECT COUNT(*) FROM "KanjiVG_Version") >= 1
BEGIN
SELECT RAISE(FAIL, 'Only one row allowed in KanjiVG_Version');
END;
-- One row per KanjiVG entry. The other KanjiVG tables reference "character"
-- as their owning entry.
CREATE TABLE "KanjiVG_Entry" (
"character" CHAR(1) PRIMARY KEY NOT NULL
) WITHOUT ROWID;
-- Stroke-number labels of an entry: the number and its x/y coordinates.
-- Every ("character", "strokeNum") pair must match a row in "KanjiVG_Path"
-- with the same "character" and "pathId" = "strokeNum".
-- NOTE: "KanjiVG_Path" is created further down in this file.
CREATE TABLE "KanjiVG_StrokeNumber" (
"character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
"strokeNum" INTEGER NOT NULL,
"x" REAL NOT NULL,
"y" REAL NOT NULL,
PRIMARY KEY ("character", "strokeNum"),
FOREIGN KEY ("character", "strokeNum") REFERENCES "KanjiVG_Path"("character", "pathId")
) WITHOUT ROWID;
-- Paths of an entry, keyed by ("character", "pathId").
-- "svgPath" holds the path data string; "type" is optional.
CREATE TABLE "KanjiVG_Path" (
"character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
"pathId" INTEGER NOT NULL,
"type" VARCHAR(10),
"svgPath" TEXT NOT NULL,
PRIMARY KEY ("character", "pathId")
) WITHOUT ROWID;
-- Tree of path groups of an entry, keyed by ("character", "groupId").
-- "parentGroupId" points at another group of the same entry and is NULL for
-- a top-level group. It is declared INTEGER so that it has the same type as
-- the "groupId" column it references.
-- "position" is limited to the listed names, or NULL when absent.
CREATE TABLE "KanjiVG_PathGroup" (
  "character" CHAR(1) NOT NULL REFERENCES "KanjiVG_Entry"("character"),
  "groupId" INTEGER NOT NULL,
  "parentGroupId" INTEGER,
  "element" TEXT,
  "original" TEXT,
  "position" VARCHAR(6),
  "radical" TEXT,
  "part" INTEGER,
  PRIMARY KEY ("character", "groupId"),
  CHECK (
    "position" IN (
      'upperA',
      'upperB',
      'lower1',
      'lower2',
      'bottom',
      'kamae',
      'kamaec',
      'left',
      'middle',
      'nyo',
      'nyoc',
      'right',
      'tare',
      'tarec',
      'top'
    )
    OR
    "position" IS NULL
  ),
  FOREIGN KEY ("character", "parentGroupId") REFERENCES "KanjiVG_PathGroup"("character", "groupId")
) WITHOUT ROWID;
+3
View File
@@ -7,6 +7,7 @@
radkfile,
kanjidic2,
tanos-jlpt,
kanjivg,
sqlite,
wal ? false,
}:
@@ -39,6 +40,8 @@ stdenvNoCC.mkDerivation {
TANOS_JLPT_VERSION = tanos-jlpt.version;
TANOS_JLPT_DATE = tanos-jlpt.date;
TANOS_JLPT_HASH = tanos-jlpt.hash;
KANJIVG_PATH = "${kanjivg}/kanji";
};
buildPhase = ''