74 lines
1.9 KiB
Dart
74 lines
1.9 KiB
Dart
import 'dart:convert';
|
||
import 'dart:io';
|
||
|
||
import 'package:csv/csv.dart';
|
||
import 'package:jadb/_data_ingestion/tanos-jlpt/objects.dart';
|
||
import 'package:xml/xml_events.dart';
|
||
|
||
/// Matches the `お・` honorific marker at the very start of a kanji field.
final _kanjiHonorificPrefix = RegExp('^お・');

/// Matches a parenthesised annotation inside a kanji field.
///
/// Both ASCII and full-width parentheses are matched literally. They must not
/// be written as a bare `(.*)` group, which would match (and erase) the whole
/// field.
final _kanjiAnnotation = RegExp(r'[(（].*?[)）]');

/// Matches a run of characters that separate alternative readings.
///
/// Covers `・`, `/`, `、`, colons, question marks and parentheses (ASCII and
/// full-width forms) as well as any whitespace.
final _readingSeparators = RegExp(r'[・/、(:?)（：？）\s]+');

/// Parses the JLPT word lists in [files] into [JLPTRankedWord]s.
///
/// [files] maps a JLPT level label to the CSV file holding that level's words.
/// Every row must have exactly three columns: kanji, readings and meanings.
/// Words are returned in the iteration order of [files], and within a file in
/// row order.
///
/// Throws an [Exception] if a file does not exist or a row does not have
/// exactly three columns.
Future<List<JLPTRankedWord>> parseJLPTRankedWords(
  Map<String, File> files,
) async {
  final result = <JLPTRankedWord>[];

  final codec = CsvCodec(
    fieldDelimiter: ',',
    lineDelimiter: '\n',
    quoteMode: QuoteMode.strings,
    escapeCharacter: '\\',
  );

  for (final entry in files.entries) {
    final jlptLevel = entry.key;
    final file = entry.value;

    if (!await file.exists()) {
      throw Exception(
        'File for JLPT level $jlptLevel does not exist: ${file.path}',
      );
    }

    final rows = file
        .openRead()
        .transform(utf8.decoder)
        .transform(codec.decoder)
        .flatten();

    await for (final rawRow in rows) {
      if (rawRow.length != 3) {
        throw Exception('Invalid line in $jlptLevel: $rawRow');
      }

      // Every cell is expected to be a string; a cell of any other type
      // fails here with a TypeError instead of deep inside the row parsing.
      final row = List<String>.from(rawRow);
      result.add(_parseJlptRow(row, jlptLevel));
    }
  }

  return result;
}

/// Builds a [JLPTRankedWord] from one three-column CSV [row].
JLPTRankedWord _parseJlptRow(List<String> row, String jlptLevel) {
  final kanjiField = row[0];

  // An empty kanji column means the word has no kanji form.
  final kanji = kanjiField.isEmpty
      ? null
      : kanjiField
          .replaceFirst(_kanjiHonorificPrefix, '')
          .replaceAll(_kanjiAnnotation, '');

  // Splitting can leave empty fragments (e.g. after a trailing parenthesis);
  // those are not readings, so drop them.
  final readings = row[1]
      .split(_readingSeparators)
      .map((reading) => reading.trim())
      .where((reading) => reading.isNotEmpty)
      .toList();

  final meanings = row[2].split(',').expand(cleanMeaning).toList();

  return JLPTRankedWord(
    readings: readings,
    kanji: kanji,
    jlptLevel: jlptLevel,
    meanings: meanings,
  );
}
|
||
|
||
/// Matches a leading enumeration marker such as `1. ` or `12. `.
///
/// The dot is matched literally and must be followed by whitespace, so plain
/// numbers (`10 days`) and decimals (`1.5 m`) are left untouched.
final _meaningEnumerationPrefix = RegExp(r'^\d+\.\s+');

/// Matches a parenthesised remark together with the whitespace around it.
final _meaningParenthesisedRemark = RegExp(r'\s*\(.*?\)\s*');

/// Returns the distinct, non-empty cleaned variants of a raw [meaning].
///
/// The first variant is [meaning] trimmed and stripped of a leading
/// enumeration marker (`1. `). The second variant additionally has every
/// parenthesised remark removed; it is omitted when it equals the first.
/// Variants that end up empty are dropped, so the result may be empty.
List<String> cleanMeaning(String meaning) {
  final withoutNumbering =
      meaning.trim().replaceFirst(_meaningEnumerationPrefix, '');

  // Replace each remark with a single space so the words on either side of
  // it stay separated, then trim what is left at the edges.
  final withoutRemarks =
      withoutNumbering.replaceAll(_meaningParenthesisedRemark, ' ').trim();

  // A set literal keeps insertion order and removes the duplicate when the
  // meaning had no parenthesised remark.
  return {withoutNumbering, withoutRemarks}
      .where((variant) => variant.isNotEmpty)
      .toList();
}
|