Update code
This commit is contained in:
parent
624fcf5d2c
commit
0509ac1f61
|
@ -36,11 +36,24 @@ class PhraseScrapeMeaning {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class KanjiKanaPair {
|
||||||
|
String kanji;
|
||||||
|
String kana;
|
||||||
|
|
||||||
|
KanjiKanaPair({
|
||||||
|
String kanji,
|
||||||
|
String kana
|
||||||
|
}){
|
||||||
|
this.kanji = kanji;
|
||||||
|
this.kana = kana;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class PhrasePageScrapeResult {
|
class PhrasePageScrapeResult {
|
||||||
bool found;
|
bool found;
|
||||||
String query;
|
String query;
|
||||||
String uri;
|
String uri;
|
||||||
List<String> otherForms;
|
List<KanjiKanaPair> otherForms;
|
||||||
List<PhraseScrapeMeaning> meanings;
|
List<PhraseScrapeMeaning> meanings;
|
||||||
List<String> tags;
|
List<String> tags;
|
||||||
List<String> notes;
|
List<String> notes;
|
||||||
|
@ -49,7 +62,7 @@ class PhrasePageScrapeResult {
|
||||||
bool found,
|
bool found,
|
||||||
String query,
|
String query,
|
||||||
String uri,
|
String uri,
|
||||||
List<String> otherForms,
|
List<KanjiKanaPair> otherForms,
|
||||||
List<PhraseScrapeMeaning> meanings,
|
List<PhraseScrapeMeaning> meanings,
|
||||||
List<String> tags,
|
List<String> tags,
|
||||||
List<String> notes,
|
List<String> notes,
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
import 'package:unofficial_jisho_api/src/objects.dart';
|
import 'package:unofficial_jisho_api/src/objects.dart';
|
||||||
import 'package:http/http.dart' as http;
|
import 'package:http/http.dart' as http;
|
||||||
import 'package:xml/xml.dart' as xml;
|
|
||||||
import 'package:html_unescape/html_unescape.dart' as html_entities;
|
import 'package:html_unescape/html_unescape.dart' as html_entities;
|
||||||
import 'dart:convert';
|
import 'dart:convert';
|
||||||
|
import 'dart:html';
|
||||||
|
|
||||||
final htmlUnescape = html_entities.HtmlUnescape();
|
final htmlUnescape = html_entities.HtmlUnescape();
|
||||||
|
|
||||||
|
@ -230,33 +230,35 @@ KanjiResult parseKanjiPageData(String pageHtml, String kanji) {
|
||||||
|
|
||||||
/* EXAMPLE SEARCH FUNCTIONS START */
|
/* EXAMPLE SEARCH FUNCTIONS START */
|
||||||
|
|
||||||
RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]');
|
final RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]');
|
||||||
|
|
||||||
String uriForExampleSearch(String phrase) {
|
String uriForExampleSearch(String phrase) {
|
||||||
return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences';
|
return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences';
|
||||||
}
|
}
|
||||||
|
|
||||||
ExampleResultData getKanjiAndKana(xml.XmlNode div) {
|
ExampleResultData getKanjiAndKana(Element div) {
|
||||||
final ul = div.find('ul').eq(0);
|
final ul = div.querySelector('ul');
|
||||||
final contents = ul.contents();
|
final contents = ul.children;
|
||||||
|
|
||||||
|
|
||||||
var kanji = '';
|
var kanji = '';
|
||||||
var kana = '';
|
var kana = '';
|
||||||
for (var i = 0; i < contents.length; i += 1) {
|
for (var i = 0; i < contents.length; i += 1) {
|
||||||
final content = contents.eq(i);
|
final content = contents[i];
|
||||||
if (content[0].name == 'li') {
|
if (content.tagName == 'li') {
|
||||||
final li = content;
|
final li = content;
|
||||||
final furigana = li.find('.furigana').text();
|
final furigana = li.querySelector('.furigana').text;
|
||||||
final unlifted = li.find('.unlinked').text();
|
final unlifted = li.querySelector('.unlinked').text;
|
||||||
|
|
||||||
if (furigana) {
|
if (furigana != null) {
|
||||||
kanji += unlifted;
|
kanji += unlifted;
|
||||||
kana += furigana;
|
kana += furigana;
|
||||||
|
|
||||||
final kanaEnding = [];
|
final kanaEnding = [];
|
||||||
for (var j = unlifted.length - 1; j > 0; j -= 1) {
|
for (var j = unlifted.length - 1; j > 0; j -= 1) {
|
||||||
if (!unlifted[j].match(kanjiRegex)) {
|
final char = unlifted[j];
|
||||||
kanaEnding.add(unlifted[j]);
|
if (!kanjiRegex.hasMatch(char)) {
|
||||||
|
kanaEnding.add(char);
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -268,8 +270,8 @@ ExampleResultData getKanjiAndKana(xml.XmlNode div) {
|
||||||
kana += unlifted;
|
kana += unlifted;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
final text = content.text().trim();
|
final text = content.text.trim();
|
||||||
if (text) {
|
if (text != null) {
|
||||||
kanji += text;
|
kanji += text;
|
||||||
kana += text;
|
kana += text;
|
||||||
}
|
}
|
||||||
|
@ -282,31 +284,32 @@ ExampleResultData getKanjiAndKana(xml.XmlNode div) {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<ExampleSentencePiece> getPieces(xml.XmlNode sentenceElement) {
|
List<ExampleSentencePiece> getPieces(Element sentenceElement) {
|
||||||
final pieceElements = sentenceElement.find('li.clearfix');
|
final pieceElements = sentenceElement.querySelectorAll('li.clearfix');
|
||||||
final pieces = [];
|
final pieces = [];
|
||||||
for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) {
|
for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) {
|
||||||
final pieceElement = pieceElements.eq(pieceIndex);
|
final pieceElement = pieceElements[pieceIndex];
|
||||||
pieces.add(ExampleSentencePiece(
|
pieces.add(ExampleSentencePiece(
|
||||||
lifted: pieceElement.children('.furigana').text(),
|
lifted: pieceElement.querySelector('.furigana').text,
|
||||||
unlifted: pieceElement.children('.unlinked').text(),
|
unlifted: pieceElement.querySelector('.unlinked').text,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
return pieces;
|
return pieces;
|
||||||
}
|
}
|
||||||
|
|
||||||
ExampleResultData parseExampleDiv(xml.XmlNode div) {
|
ExampleResultData parseExampleDiv(Element div) {
|
||||||
final result = getKanjiAndKana(div);
|
final result = getKanjiAndKana(div);
|
||||||
result.english = div.find('.english').text();
|
result.english = div.querySelector('.english').text;
|
||||||
result.pieces = getPieces(div);
|
result.pieces = getPieces(div);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
ExampleResults parseExamplePageData(String pageHtml, String phrase) {
|
ExampleResults parseExamplePageData(String pageHtml, String phrase) {
|
||||||
final document = xml.parse(pageHtml);
|
final parser = DomParser();
|
||||||
final divs = document.descendants.where((node) => node.attributes[0].value == 'sentence_content').toList();
|
final document = parser.parseFromString(pageHtml, 'text/html');
|
||||||
|
final divs = document.querySelectorAll('.sentence_content');
|
||||||
|
|
||||||
final results = divs.map((div) => parseExampleDiv(div));
|
final results = divs.map((div) => parseExampleDiv(div));
|
||||||
|
|
||||||
|
@ -323,77 +326,72 @@ ExampleResults parseExamplePageData(String pageHtml, String phrase) {
|
||||||
|
|
||||||
/* PHRASE SCRAPE FUNCTIONS START */
|
/* PHRASE SCRAPE FUNCTIONS START */
|
||||||
|
|
||||||
List<String> getTags(xml.XmlDocument document) {
|
List<String> getTags(Document document) {
|
||||||
final tags = [];
|
final tags = [];
|
||||||
final tagElements = document.descendants.where((node) => node.attributes[0].value == 'concept_light-tag').toList();
|
final tagElements = document.querySelectorAll('.concept_light-tag');
|
||||||
|
|
||||||
for (var i = 0; i < tagElements.length; i += 1) {
|
for (var i = 0; i < tagElements.length; i += 1) {
|
||||||
final tagText = tagElements.eq(i).text();
|
final tagText = tagElements[i].text;
|
||||||
tags.add(tagText);
|
tags.add(tagText);
|
||||||
}
|
}
|
||||||
|
|
||||||
return tags;
|
return tags;
|
||||||
}
|
}
|
||||||
|
|
||||||
PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(xml.XmlDocument document) {
|
PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(Document document) {
|
||||||
final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] );
|
final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] );
|
||||||
|
|
||||||
//TODO: Fix
|
|
||||||
// const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div');
|
// const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div');
|
||||||
final meaningsWrapper = document.descendants.where((node) => node.attributes[0].value == 'page_container').toList();
|
final meaningsWrapper = document.querySelector('.meanings-wrapper');
|
||||||
|
|
||||||
|
final meaningsChildren = meaningsWrapper.children;
|
||||||
|
|
||||||
final meaningsChildren = meaningsWrapper.children();
|
|
||||||
final meanings = [];
|
final meanings = [];
|
||||||
|
|
||||||
var mostRecentWordTypes = [];
|
var mostRecentWordTypes = [];
|
||||||
for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) {
|
for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) {
|
||||||
final child = meaningsChildren.eq(meaningIndex);
|
final child = meaningsChildren[meaningIndex];
|
||||||
if (child.hasClass('meaning-tags')) {
|
if (child.className.contains('meaning-tags')) {
|
||||||
mostRecentWordTypes = child.text().split(',').map((s) => s.trim().toLowerCase());
|
mostRecentWordTypes = child.text.split(',').map((s) => s.trim().toLowerCase()).toList();
|
||||||
} else if (mostRecentWordTypes[0] == 'other forms') {
|
} else if (mostRecentWordTypes[0] == 'other forms') {
|
||||||
returnValues.otherForms = child.text().split('、')
|
|
||||||
.map((s) => s.replaceAll('【', '').replaceAll('】', '').split(' '))
|
|
||||||
.map((a) => (ExampleResultData( kanji: a[0], kana: a[1] )));
|
|
||||||
} else if (mostRecentWordTypes[0] == 'notes') {
|
|
||||||
returnValues.notes = child.text().split('\n');
|
|
||||||
} else {
|
|
||||||
final meaning = child.find('.meaning-meaning').text();
|
|
||||||
final meaningAbstract = child.find('.meaning-abstract')
|
|
||||||
.find('a')
|
|
||||||
.remove()
|
|
||||||
.end()
|
|
||||||
.text();
|
|
||||||
|
|
||||||
final supplemental = child.find('.supplemental_info').text().split(',')
|
returnValues.otherForms = child.text.split('、')
|
||||||
|
.map((s) => s.replaceAll('【', '').replaceAll('】', '').split(' '))
|
||||||
|
.map((a) => (KanjiKanaPair( kanji: a[0], kana: a[1] )));
|
||||||
|
|
||||||
|
} else if (mostRecentWordTypes[0] == 'notes') {
|
||||||
|
returnValues.notes = child.text.split('\n');
|
||||||
|
} else {
|
||||||
|
final meaning = child.querySelector('.meaning-meaning').text;
|
||||||
|
child.querySelector('.meaning-abstract')
|
||||||
|
.querySelector('a')
|
||||||
|
.remove();
|
||||||
|
final meaningAbstract = child.querySelector('.meaning-abstract').text;
|
||||||
|
|
||||||
|
final supplemental = child.querySelector('.supplemental_info').text.split(',')
|
||||||
.map((s) => s.trim())
|
.map((s) => s.trim())
|
||||||
.filter((s) => s);
|
.toList();
|
||||||
|
|
||||||
final seeAlsoTerms = [];
|
final seeAlsoTerms = [];
|
||||||
for (var i = supplemental.length - 1; i >= 0; i -= 1) {
|
for (var i = supplemental.length - 1; i >= 0; i -= 1) {
|
||||||
final supplementalEntry = supplemental[i];
|
final supplementalEntry = supplemental[i];
|
||||||
if (supplementalEntry.startsWith('See also')) {
|
if (supplementalEntry.startsWith('See also')) {
|
||||||
seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', ''));
|
seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', ''));
|
||||||
supplemental.splice(i, 1);
|
supplemental.removeAt(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final sentences = [];
|
final sentences = [];
|
||||||
final sentenceElements = child.find('.sentences').children('.sentence');
|
final sentenceElements = child.querySelector('.sentences').querySelectorAll('.sentence');
|
||||||
|
|
||||||
for (var sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) {
|
for (var sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) {
|
||||||
final sentenceElement = sentenceElements.eq(sentenceIndex);
|
final sentenceElement = sentenceElements[sentenceIndex];
|
||||||
|
|
||||||
final english = sentenceElement.find('.english').text();
|
final english = sentenceElement.querySelector('.english').text;
|
||||||
final pieces = getPieces(sentenceElement);
|
final pieces = getPieces(sentenceElement);
|
||||||
|
|
||||||
final japanese = sentenceElement
|
sentenceElement.querySelector('.english').remove();
|
||||||
.find('.english').remove().end()
|
sentenceElement.querySelector('.furigana').remove();
|
||||||
.find('.furigana')
|
final japanese = sentenceElement.text;
|
||||||
.remove()
|
|
||||||
.end()
|
|
||||||
.text();
|
|
||||||
|
|
||||||
sentences.add(PhraseScrapeSentence(english: english, japanese: japanese, pieces: pieces));
|
sentences.add(PhraseScrapeSentence(english: english, japanese: japanese, pieces: pieces));
|
||||||
}
|
}
|
||||||
|
@ -419,7 +417,8 @@ String uriForPhraseScrape(String searchTerm) {
|
||||||
}
|
}
|
||||||
|
|
||||||
PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) {
|
PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) {
|
||||||
final document = xml.parse(pageHtml);
|
final parser = DomParser();
|
||||||
|
final document = parser.parseFromString(pageHtml, 'text/html');
|
||||||
final result = getMeaningsOtherFormsAndNotes(document);
|
final result = getMeaningsOtherFormsAndNotes(document);
|
||||||
|
|
||||||
result.found = true;
|
result.found = true;
|
||||||
|
|
|
@ -14,29 +14,13 @@ List<String> getFilePaths(String dirname) {
|
||||||
return filenames.map((filename) => path.join(currentdir, 'test', dirname, filename.path)).toList();
|
return filenames.map((filename) => path.join(currentdir, 'test', dirname, filename.path)).toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
void runTestCases(List<String> testCaseFiles, String apiFunction) async {
|
void runTestCases(List<String> testCaseFiles, Function apiFunction) async {
|
||||||
for (var testCount = 0; testCount < testCaseFiles.length; testCount++) {
|
for (var testCount = 0; testCount < testCaseFiles.length; testCount++) {
|
||||||
final file = await File(testCaseFiles[testCount]).readAsString();
|
final file = await File(testCaseFiles[testCount]).readAsString();
|
||||||
final testCase = jsonDecode(file);
|
final testCase = jsonDecode(file);
|
||||||
await test('Test ${testCount}', () async {
|
await test('Test ${testCount}', () async {
|
||||||
switch(apiFunction) {
|
final result = await apiFunction(testCase['query']);
|
||||||
case 'searchForKanji': {
|
|
||||||
final result = await jisho.searchForKanji(testCase['query']);
|
|
||||||
expect(result.toJson(), testCase['expectedResult']);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 'searchForExamples': {
|
|
||||||
final result = await jisho.searchForExamples(testCase['query']);
|
|
||||||
expect(result, testCase['expectedResult']);
|
expect(result, testCase['expectedResult']);
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 'scrapeForPhrase': {
|
|
||||||
final result = await jisho.scrapeForPhrase(testCase['query']);
|
|
||||||
expect(result, testCase['expectedResult']);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
throw 'No API function provided';
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,10 +29,8 @@ void main() async {
|
||||||
|
|
||||||
await test_local_functions();
|
await test_local_functions();
|
||||||
|
|
||||||
await runTestCases(getFilePaths('kanji_test_cases'), 'searchForKanji');
|
await runTestCases(getFilePaths('kanji_test_cases'), jisho.searchForKanji);
|
||||||
|
await runTestCases(getFilePaths('example_test_cases'), jisho.searchForExamples);
|
||||||
await runTestCases(getFilePaths('example_test_cases'), 'searchForExamples');
|
await runTestCases(getFilePaths('phrase_scrape_test_cases'), jisho.scrapeForPhrase);
|
||||||
|
|
||||||
await runTestCases(getFilePaths('phrase_scrape_test_cases'), 'scrapeForPhrase');
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue