Update code

This commit is contained in:
Oystein Kristoffer Tveit 2020-06-14 12:48:53 +02:00
parent 624fcf5d2c
commit 0509ac1f61
3 changed files with 80 additions and 86 deletions

View File

@ -36,11 +36,24 @@ class PhraseScrapeMeaning {
} }
} }
/// A written form of a phrase paired with its reading.
///
/// Holds a [kanji] string and the matching [kana] string. Both values are
/// optional at construction time and the fields stay mutable, so callers may
/// assign or overwrite them after the object has been created.
class KanjiKanaPair {
  /// The kanji spelling of this form.
  String kanji;

  /// The kana reading of this form.
  String kana;

  /// Creates a pair from the optional named [kanji] and [kana] arguments.
  ///
  /// Initializing formals assign the arguments straight to the fields; an
  /// omitted argument leaves the corresponding field unset.
  KanjiKanaPair({this.kanji, this.kana});
}
class PhrasePageScrapeResult { class PhrasePageScrapeResult {
bool found; bool found;
String query; String query;
String uri; String uri;
List<String> otherForms; List<KanjiKanaPair> otherForms;
List<PhraseScrapeMeaning> meanings; List<PhraseScrapeMeaning> meanings;
List<String> tags; List<String> tags;
List<String> notes; List<String> notes;
@ -49,7 +62,7 @@ class PhrasePageScrapeResult {
bool found, bool found,
String query, String query,
String uri, String uri,
List<String> otherForms, List<KanjiKanaPair> otherForms,
List<PhraseScrapeMeaning> meanings, List<PhraseScrapeMeaning> meanings,
List<String> tags, List<String> tags,
List<String> notes, List<String> notes,

View File

@ -1,8 +1,8 @@
import 'package:unofficial_jisho_api/src/objects.dart'; import 'package:unofficial_jisho_api/src/objects.dart';
import 'package:http/http.dart' as http; import 'package:http/http.dart' as http;
import 'package:xml/xml.dart' as xml;
import 'package:html_unescape/html_unescape.dart' as html_entities; import 'package:html_unescape/html_unescape.dart' as html_entities;
import 'dart:convert'; import 'dart:convert';
import 'dart:html';
final htmlUnescape = html_entities.HtmlUnescape(); final htmlUnescape = html_entities.HtmlUnescape();
@ -230,33 +230,35 @@ KanjiResult parseKanjiPageData(String pageHtml, String kanji) {
/* EXAMPLE SEARCH FUNCTIONS START */ /* EXAMPLE SEARCH FUNCTIONS START */
RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]'); final RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]');
String uriForExampleSearch(String phrase) { String uriForExampleSearch(String phrase) {
return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences'; return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences';
} }
ExampleResultData getKanjiAndKana(xml.XmlNode div) { ExampleResultData getKanjiAndKana(Element div) {
final ul = div.find('ul').eq(0); final ul = div.querySelector('ul');
final contents = ul.contents(); final contents = ul.children;
var kanji = ''; var kanji = '';
var kana = ''; var kana = '';
for (var i = 0; i < contents.length; i += 1) { for (var i = 0; i < contents.length; i += 1) {
final content = contents.eq(i); final content = contents[i];
if (content[0].name == 'li') { if (content.tagName == 'li') {
final li = content; final li = content;
final furigana = li.find('.furigana').text(); final furigana = li.querySelector('.furigana').text;
final unlifted = li.find('.unlinked').text(); final unlifted = li.querySelector('.unlinked').text;
if (furigana) { if (furigana != null) {
kanji += unlifted; kanji += unlifted;
kana += furigana; kana += furigana;
final kanaEnding = []; final kanaEnding = [];
for (var j = unlifted.length - 1; j > 0; j -= 1) { for (var j = unlifted.length - 1; j > 0; j -= 1) {
if (!unlifted[j].match(kanjiRegex)) { final char = unlifted[j];
kanaEnding.add(unlifted[j]); if (!kanjiRegex.hasMatch(char)) {
kanaEnding.add(char);
} else { } else {
break; break;
} }
@ -268,8 +270,8 @@ ExampleResultData getKanjiAndKana(xml.XmlNode div) {
kana += unlifted; kana += unlifted;
} }
} else { } else {
final text = content.text().trim(); final text = content.text.trim();
if (text) { if (text != null) {
kanji += text; kanji += text;
kana += text; kana += text;
} }
@ -282,31 +284,32 @@ ExampleResultData getKanjiAndKana(xml.XmlNode div) {
); );
} }
List<ExampleSentencePiece> getPieces(xml.XmlNode sentenceElement) { List<ExampleSentencePiece> getPieces(Element sentenceElement) {
final pieceElements = sentenceElement.find('li.clearfix'); final pieceElements = sentenceElement.querySelectorAll('li.clearfix');
final pieces = []; final pieces = [];
for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) { for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) {
final pieceElement = pieceElements.eq(pieceIndex); final pieceElement = pieceElements[pieceIndex];
pieces.add(ExampleSentencePiece( pieces.add(ExampleSentencePiece(
lifted: pieceElement.children('.furigana').text(), lifted: pieceElement.querySelector('.furigana').text,
unlifted: pieceElement.children('.unlinked').text(), unlifted: pieceElement.querySelector('.unlinked').text,
)); ));
} }
return pieces; return pieces;
} }
ExampleResultData parseExampleDiv(xml.XmlNode div) { ExampleResultData parseExampleDiv(Element div) {
final result = getKanjiAndKana(div); final result = getKanjiAndKana(div);
result.english = div.find('.english').text(); result.english = div.querySelector('.english').text;
result.pieces = getPieces(div); result.pieces = getPieces(div);
return result; return result;
} }
ExampleResults parseExamplePageData(String pageHtml, String phrase) { ExampleResults parseExamplePageData(String pageHtml, String phrase) {
final document = xml.parse(pageHtml); final parser = DomParser();
final divs = document.descendants.where((node) => node.attributes[0].value == 'sentence_content').toList(); final document = parser.parseFromString(pageHtml, 'text/html');
final divs = document.querySelectorAll('.sentence_content');
final results = divs.map((div) => parseExampleDiv(div)); final results = divs.map((div) => parseExampleDiv(div));
@ -323,77 +326,72 @@ ExampleResults parseExamplePageData(String pageHtml, String phrase) {
/* PHRASE SCRAPE FUNCTIONS START */ /* PHRASE SCRAPE FUNCTIONS START */
List<String> getTags(xml.XmlDocument document) { List<String> getTags(Document document) {
final tags = []; final tags = [];
final tagElements = document.descendants.where((node) => node.attributes[0].value == 'concept_light-tag').toList(); final tagElements = document.querySelectorAll('.concept_light-tag');
for (var i = 0; i < tagElements.length; i += 1) { for (var i = 0; i < tagElements.length; i += 1) {
final tagText = tagElements.eq(i).text(); final tagText = tagElements[i].text;
tags.add(tagText); tags.add(tagText);
} }
return tags; return tags;
} }
PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(xml.XmlDocument document) { PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(Document document) {
final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] ); final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] );
//TODO: Fix
// const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div'); // const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div');
final meaningsWrapper = document.descendants.where((node) => node.attributes[0].value == 'page_container').toList(); final meaningsWrapper = document.querySelector('.meanings-wrapper');
final meaningsChildren = meaningsWrapper.children;
final meaningsChildren = meaningsWrapper.children();
final meanings = []; final meanings = [];
var mostRecentWordTypes = []; var mostRecentWordTypes = [];
for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) { for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) {
final child = meaningsChildren.eq(meaningIndex); final child = meaningsChildren[meaningIndex];
if (child.hasClass('meaning-tags')) { if (child.className.contains('meaning-tags')) {
mostRecentWordTypes = child.text().split(',').map((s) => s.trim().toLowerCase()); mostRecentWordTypes = child.text.split(',').map((s) => s.trim().toLowerCase()).toList();
} else if (mostRecentWordTypes[0] == 'other forms') { } else if (mostRecentWordTypes[0] == 'other forms') {
returnValues.otherForms = child.text().split('')
.map((s) => s.replaceAll('', '').replaceAll('', '').split(' '))
.map((a) => (ExampleResultData( kanji: a[0], kana: a[1] )));
} else if (mostRecentWordTypes[0] == 'notes') {
returnValues.notes = child.text().split('\n');
} else {
final meaning = child.find('.meaning-meaning').text();
final meaningAbstract = child.find('.meaning-abstract')
.find('a')
.remove()
.end()
.text();
final supplemental = child.find('.supplemental_info').text().split(',') returnValues.otherForms = child.text.split('')
.map((s) => s.replaceAll('', '').replaceAll('', '').split(' '))
.map((a) => (KanjiKanaPair( kanji: a[0], kana: a[1] )));
} else if (mostRecentWordTypes[0] == 'notes') {
returnValues.notes = child.text.split('\n');
} else {
final meaning = child.querySelector('.meaning-meaning').text;
child.querySelector('.meaning-abstract')
.querySelector('a')
.remove();
final meaningAbstract = child.querySelector('.meaning-abstract').text;
final supplemental = child.querySelector('.supplemental_info').text.split(',')
.map((s) => s.trim()) .map((s) => s.trim())
.filter((s) => s); .toList();
final seeAlsoTerms = []; final seeAlsoTerms = [];
for (var i = supplemental.length - 1; i >= 0; i -= 1) { for (var i = supplemental.length - 1; i >= 0; i -= 1) {
final supplementalEntry = supplemental[i]; final supplementalEntry = supplemental[i];
if (supplementalEntry.startsWith('See also')) { if (supplementalEntry.startsWith('See also')) {
seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', '')); seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', ''));
supplemental.splice(i, 1); supplemental.removeAt(i);
} }
} }
final sentences = []; final sentences = [];
final sentenceElements = child.find('.sentences').children('.sentence'); final sentenceElements = child.querySelector('.sentences').querySelectorAll('.sentence');
for (var sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) { for (var sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) {
final sentenceElement = sentenceElements.eq(sentenceIndex); final sentenceElement = sentenceElements[sentenceIndex];
final english = sentenceElement.find('.english').text(); final english = sentenceElement.querySelector('.english').text;
final pieces = getPieces(sentenceElement); final pieces = getPieces(sentenceElement);
final japanese = sentenceElement sentenceElement.querySelector('.english').remove();
.find('.english').remove().end() sentenceElement.querySelector('.furigana').remove();
.find('.furigana') final japanese = sentenceElement.text;
.remove()
.end()
.text();
sentences.add(PhraseScrapeSentence(english: english, japanese: japanese, pieces: pieces)); sentences.add(PhraseScrapeSentence(english: english, japanese: japanese, pieces: pieces));
} }
@ -419,7 +417,8 @@ String uriForPhraseScrape(String searchTerm) {
} }
PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) { PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) {
final document = xml.parse(pageHtml); final parser = DomParser();
final document = parser.parseFromString(pageHtml, 'text/html');
final result = getMeaningsOtherFormsAndNotes(document); final result = getMeaningsOtherFormsAndNotes(document);
result.found = true; result.found = true;

View File

@ -14,29 +14,13 @@ List<String> getFilePaths(String dirname) {
return filenames.map((filename) => path.join(currentdir, 'test', dirname, filename.path)).toList(); return filenames.map((filename) => path.join(currentdir, 'test', dirname, filename.path)).toList();
} }
void runTestCases(List<String> testCaseFiles, String apiFunction) async { void runTestCases(List<String> testCaseFiles, Function apiFunction) async {
for (var testCount = 0; testCount < testCaseFiles.length; testCount++) { for (var testCount = 0; testCount < testCaseFiles.length; testCount++) {
final file = await File(testCaseFiles[testCount]).readAsString(); final file = await File(testCaseFiles[testCount]).readAsString();
final testCase = jsonDecode(file); final testCase = jsonDecode(file);
await test('Test ${testCount}', () async { await test('Test ${testCount}', () async {
switch(apiFunction) { final result = await apiFunction(testCase['query']);
case 'searchForKanji': { expect(result, testCase['expectedResult']);
final result = await jisho.searchForKanji(testCase['query']);
expect(result.toJson(), testCase['expectedResult']);
break;
}
case 'searchForExamples': {
final result = await jisho.searchForExamples(testCase['query']);
expect(result, testCase['expectedResult']);
break;
}
case 'scrapeForPhrase': {
final result = await jisho.scrapeForPhrase(testCase['query']);
expect(result, testCase['expectedResult']);
break;
}
throw 'No API function provided';
}
}); });
} }
} }
@ -45,10 +29,8 @@ void main() async {
await test_local_functions(); await test_local_functions();
await runTestCases(getFilePaths('kanji_test_cases'), 'searchForKanji'); await runTestCases(getFilePaths('kanji_test_cases'), jisho.searchForKanji);
await runTestCases(getFilePaths('example_test_cases'), jisho.searchForExamples);
await runTestCases(getFilePaths('example_test_cases'), 'searchForExamples'); await runTestCases(getFilePaths('phrase_scrape_test_cases'), jisho.scrapeForPhrase);
await runTestCases(getFilePaths('phrase_scrape_test_cases'), 'scrapeForPhrase');
} }