// Jisho-Study-Tool/lib/migrations/tools/update_0002.dart
//
// Warning carried over from the repository viewer: this file contains
// ambiguous Unicode characters, i.e. characters that might be confused with
// other characters. If this is intentional, the warning can be ignored.

import 'dart:convert';
import 'dart:io';
import 'package:html/parser.dart';
import 'package:http/http.dart' as http;
import 'package:unofficial_jisho_api/api.dart';
/// A single radical together with the data stored about it.
///
/// Instances can be round-tripped through JSON with [toJson] and
/// [Radical.fromJson], and rendered as one row of an SQL `VALUES` list with
/// [sql_tuple].
///
/// The snake_case member names ([search_symbol], [sql_tuple]) are kept as-is
/// because they are part of this class' public interface.
class Radical {
  /// Numeric identifier of the radical.
  final int id;

  /// The character used to write the radical.
  final String symbol;

  /// Optional alternative symbol; rendered as SQL `NULL` when absent.
  final String? search_symbol;

  /// Meaning of the radical.
  final String meaning;

  /// Number of strokes in the radical.
  final int strokes;

  const Radical({
    required this.id,
    required this.symbol,
    required this.strokes,
    required this.meaning,
    this.search_symbol,
  });

  /// Returns a short human-readable summary, e.g. `1 - (一, 1)`.
  ///
  /// The search symbol is appended only when it is set.
  @override
  String toString() {
    final searchSymbol = search_symbol;
    final suffix = searchSymbol != null ? ', $searchSymbol' : '';
    return '$id - ($symbol, $strokes$suffix)';
  }

  /// Wraps [value] in single quotes, doubling any embedded single quote so
  /// that the result is a well-formed SQL string literal.
  static String _sqlString(String value) =>
      "'${value.replaceAll("'", "''")}'";

  /// This radical as one SQL `VALUES` row:
  /// ` (id, 'symbol', strokes, 'meaning', 'search_symbol' | NULL)`.
  ///
  /// String fields are escaped, so a value containing `'` no longer produces
  /// a broken statement. The leading space is part of the output.
  String get sql_tuple {
    final searchSymbol = search_symbol;
    final searchSymbolSql =
        searchSymbol != null ? _sqlString(searchSymbol) : 'NULL';
    return ' ('
        '$id, '
        '${_sqlString(symbol)}, '
        '$strokes, '
        '${_sqlString(meaning)}, '
        '$searchSymbolSql'
        ')';
  }

  /// Builds a [Radical] from a map with the keys written by [toJson].
  ///
  /// Throws a [TypeError] if a required key is missing or has the wrong type.
  factory Radical.fromJson(Map<String, dynamic> json) {
    return Radical(
      id: json['id'] as int,
      symbol: json['symbol'] as String,
      strokes: json['strokes'] as int,
      meaning: json['meaning'] as String,
      search_symbol: json['search_symbol'] as String?,
    );
  }

  /// Returns a JSON-encodable map of all fields.
  ///
  /// `search_symbol` is always present and is `null` when unset.
  Map<String, Object?> toJson() => {
        'id': id,
        'symbol': symbol,
        'strokes': strokes,
        'meaning': meaning,
        'search_symbol': search_symbol,
      };
}
/// Converts a hexadecimal code point such as `4E00` into the one-character
/// string it denotes.
///
/// Throws a [FormatException] if [code] is not valid hexadecimal.
String hexToUnicode(String code) {
  final codePoint = int.parse(code, radix: 16);
  return String.fromCharCode(codePoint);
}
/// Downloads Unicode's `EquivalentUnifiedIdeograph.txt` and returns it as a
/// map from each listed radical character to its equivalent ideograph.
///
/// Some of the radicals in jisho are written using katakana,
/// and some are written using either the symbols from the
/// Kangxi radical block (U+2F00-U+2FDF) or the
/// Unified CJK Character block (U+4E00-U+9FFF). These have been
/// used without care, and therefore some of the radicals are not
/// easily searchable. This conversion table helps solve this issue.
///
/// Throws an [HttpException] if the server does not answer with HTTP 200
/// (so an error page is never parsed as data), and a [FormatException] if a
/// data line cannot be parsed.
///
/// See:
/// https://en.wikipedia.org/wiki/List_of_radicals_in_Unicode
/// https://second.wiki/wiki/unicodeblock_kangxi-radikale
/// https://wiki.contextgarden.net/List_of_Unicode_blocks
Future<Map<String, String>> fetchEquivalentUCJKIdeographs() async {
  final uri = Uri.parse(
    'https://www.unicode.org/Public/UNIDATA/EquivalentUnifiedIdeograph.txt',
  );
  final response = await http.get(uri);
  if (response.statusCode != 200) {
    throw HttpException(
      'Could not download the equivalent ideograph table '
      '(HTTP ${response.statusCode})',
      uri: uri,
    );
  }
  return _parseEquivalentIdeographs(response.body);
}

/// Parses the text of `EquivalentUnifiedIdeograph.txt` into a character map.
///
/// Lines starting with `#` and whitespace-only lines are skipped. Every other
/// line is split on whitespace: the first field is a hexadecimal code point
/// or a `start..end` range, and the third field is the hexadecimal code point
/// of the equivalent ideograph. Every code point of a range maps to the same
/// ideograph.
Map<String, String> _parseEquivalentIdeographs(String body) {
  // Built once instead of once per line.
  final blankLine = RegExp(r'^\s*$');
  final whitespace = RegExp(r'\s+');

  final result = <String, String>{};
  for (final line in body.split('\n')) {
    if (line.startsWith('#') || blankLine.hasMatch(line)) continue;

    final items = line.split(whitespace);
    if (items.length < 3) {
      throw FormatException('Expected at least three fields', line);
    }
    final source = items[0];
    final ideograph = hexToUnicode(items[2]);

    if (source.contains('.')) {
      final bounds = source.split('..');
      if (bounds.length != 2) {
        throw FormatException('Malformed code point range', line);
      }
      final start = int.parse(bounds[0], radix: 16);
      final end = int.parse(bounds[1], radix: 16);
      // Inclusive range: both `start` and `end` are mapped.
      for (var codePoint = start; codePoint <= end; codePoint++) {
        result[String.fromCharCode(codePoint)] = ideograph;
      }
    } else {
      result[hexToUnicode(source)] = ideograph;
    }
  }
  return result;
}
/// JSON file in which the scraped radicals are cached between runs.
///
/// The path is relative, so it is resolved against the current working
/// directory.
final File cacheFile = File('data/0002_radicals.json');
/// Scrapes the radical table from jisho.org, looks up a meaning for every
/// radical, and writes the resulting list as JSON to [cacheFile].
///
/// For each radical the lookup is tried with the symbol itself, then with its
/// equivalent from [fetchEquivalentUCJKIdeographs] (plus the manual entries
/// below), then with every symbol that maps to it; the first hit wins.
Future<void> cacheRadicals() async {
final Map<String, String> equivalentSymbols =
await fetchEquivalentUCJKIdeographs();
// Manual additions on top of the downloaded table.
// NOTE(review): most of the literals below appear as empty strings in this
// view; presumably they contained characters that were lost or are rendered
// invisibly - verify against the repository before editing these lines.
equivalentSymbols[''] = '';
equivalentSymbols[''] = '丿';
equivalentSymbols[''] = '';
equivalentSymbols[''] = '';
equivalentSymbols[''] = '';
// Reverse lookup: equivalent symbol -> every symbol that maps to it.
final Map<String, List<String>> inverseEquivalentSymbols = {};
for (final entry in equivalentSymbols.entries) {
if (inverseEquivalentSymbols.containsKey(entry.value)) {
inverseEquivalentSymbols[entry.value]!.add(entry.key);
continue;
}
inverseEquivalentSymbols[entry.value] = [entry.key];
}
// NOTE(review): the HTTP status of this response is not checked.
final response = await http.get(Uri.parse('https://jisho.org/'));
final document = parse(response.body);
// Throws if the page has no element with class `radical_table`.
final table = document.querySelector('.radical_table')!;
final List<Radical> radicals = [];
// Value of the most recent `number` element; used as the stroke count of the
// radicals that follow it.
int i = 0;
for (final node in table.children) {
if (node.className == 'reset_icon_list_item') continue;
if (node.className == 'number') {
i = int.parse(node.innerHtml);
continue;
}
// Any other child is treated as a radical entry.
final String radical = node.innerHtml;
print('Caching: $radical');
KanjiResult? result;
// Candidates in priority order; stop at the first one that is found.
for (final item in [
radical,
equivalentSymbols[radical],
...inverseEquivalentSymbols[radical] ?? [],
]) {
if (item == null) continue;
result = await searchForKanji(item);
if (result.found) break;
}
final Radical radicalData = Radical(
id: int.parse(node.attributes['data-radical']!),
symbol: radical,
strokes: i,
search_symbol: node.attributes['data-radk'],
// The listed symbols get a fixed meaning instead of a looked-up one.
// NOTE(review): both literals in this list appear empty in this view - see
// the note on the manual additions above.
// For every other radical the `!` chain throws when no candidate yielded a
// result that carries radical data.
meaning: ['', ''].contains(radical)
? 'katakana, jisho search radical'
: result!.data!.radical!.meaning,
);
radicals.add(radicalData);
}
// Only evaluated when assertions are enabled; otherwise an incomplete list is
// written without complaint.
assert(radicals.length == 252, '[ERROR] Missing radicals!');
final encoder = JsonEncoder.withIndent(' ');
// Radical objects are not directly encodable; the encoder's default
// behaviour is to call their toJson().
cacheFile.writeAsStringSync(encoder.convert(radicals));
}
/// Generates `0002_populate_radicals.sql` from the cached radical list.
///
/// The cache is built with [cacheRadicals] only when [cacheFile] does not
/// exist yet; otherwise the existing file is reused. [args] is ignored.
Future<void> main(List<String> args) async {
  final cacheMissing = !cacheFile.existsSync();
  if (cacheMissing) {
    await cacheRadicals();
  }

  final decoded = jsonDecode(cacheFile.readAsStringSync()) as List;
  final List<Radical> radicals = [
    for (final entry in decoded) Radical.fromJson(entry),
  ];

  // One `VALUES` row per radical, separated by a comma and a newline.
  final valueRows = radicals.map((radical) => radical.sql_tuple).join(',\n');

  final sqlFile = File('0002_populate_radicals.sql');
  sqlFile.writeAsStringSync(
    '''
INSERT INTO Kanji_Radical(id, symbol, strokes, meaning, searchSymbol) VALUES
$valueRows;
''',
  );
}