From 86b8ccfdbb14cf7023e7098bbf1d27676ef05555 Mon Sep 17 00:00:00 2001 From: h7x4abk3g Date: Mon, 22 Jun 2020 15:18:48 +0200 Subject: [PATCH] Split into files --- lib/src/baseURI.dart | 3 + lib/src/exampleSearch.dart | 108 ++++++ lib/src/kanjiSearch.dart | 214 +++++++++++ lib/src/phraseScrape.dart | 154 ++++++++ lib/src/phraseSearch.dart | 5 + lib/src/unofficial_jisho_api_base.dart | 485 +------------------------ test/local_function_test_cases.dart | 293 --------------- test/unofficial_jisho_api_test.dart | 5 - 8 files changed, 488 insertions(+), 779 deletions(-) create mode 100644 lib/src/baseURI.dart create mode 100644 lib/src/exampleSearch.dart create mode 100644 lib/src/kanjiSearch.dart create mode 100644 lib/src/phraseScrape.dart create mode 100644 lib/src/phraseSearch.dart delete mode 100644 test/local_function_test_cases.dart diff --git a/lib/src/baseURI.dart b/lib/src/baseURI.dart new file mode 100644 index 0000000..c63a529 --- /dev/null +++ b/lib/src/baseURI.dart @@ -0,0 +1,3 @@ +const String JISHO_API = 'https://jisho.org/api/v1/search/words'; +const String SCRAPE_BASE_URI = 'https://jisho.org/search/'; +const String STROKE_ORDER_DIAGRAM_BASE_URI = 'https://classic.jisho.org/static/images/stroke_diagrams/'; \ No newline at end of file diff --git a/lib/src/exampleSearch.dart b/lib/src/exampleSearch.dart new file mode 100644 index 0000000..dacbd4b --- /dev/null +++ b/lib/src/exampleSearch.dart @@ -0,0 +1,108 @@ +import './baseURI.dart'; +import './objects.dart'; + +import 'package:html/parser.dart'; +import 'package:html/dom.dart'; + +final RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]'); + +String uriForExampleSearch(String phrase) { + return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences'; +} + +/* TODO: This is the wrong approach. + * Symbols such as 、「」。 are missing in mid sentence + * Maybe also JP fullwidth numbers? + */ + +String getEndSymbolsOfExampleSentence(Element ul) { + final endSymbols = RegExp(r'<\/li>([^<>]+)$'); + return endSymbols.firstMatch(ul.innerHtml).group(1); +} + +ExampleResultData getKanjiAndKana(Element div) { + final ul = div.querySelector('ul'); + final contents = ul.children; + + var kanji = ''; + var kana = ''; + for (var i = 0; i < contents.length; i += 1) { + final content = contents[i]; + if (content.localName == 'li') { + final li = content; + final furigana = li.querySelector('.furigana')?.text; + final unlifted = li.querySelector('.unlinked')?.text; + + if (furigana != null) { + kanji += unlifted; + kana += furigana; + + final kanaEnding = []; + for (var j = unlifted.length - 1; j > 0; j -= 1) { + final char = unlifted[j]; + if (!kanjiRegex.hasMatch(char)) { + kanaEnding.add(char); + } else { + break; + } + } + + kana += kanaEnding.reversed.join(''); + } else { + kanji += unlifted; + kana += unlifted; + } + } else { + final text = content.text.trim(); + if (text != null) { + kanji += text; + kana += text; + } + } + } + final endSymbols = getEndSymbolsOfExampleSentence(ul).trim(); + kanji+= endSymbols; + kana += endSymbols; + + return ExampleResultData( + kanji: kanji, + kana: kana, + ); +} + +List getPieces(Element sentenceElement) { + final pieceElements = sentenceElement.querySelectorAll('li.clearfix'); + final List pieces = []; + for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) { + final pieceElement = pieceElements[pieceIndex]; + pieces.add(ExampleSentencePiece( + lifted: pieceElement.querySelector('.furigana')?.text, + unlifted: pieceElement.querySelector('.unlinked')?.text, + )); + } + + return pieces; +} + +ExampleResultData parseExampleDiv(Element div) { + final result = getKanjiAndKana(div); + result.english = div.querySelector('.english').text; + result.pieces = getPieces(div) ?? []; + + return result; +} + +ExampleResults parseExamplePageData(String pageHtml, String phrase) { + final document = parse(pageHtml); + final divs = document.querySelectorAll('.sentence_content'); + + final results = divs.map((div) => parseExampleDiv(div)).toList(); + + return ExampleResults( + query: phrase, + found: results.isNotEmpty, + results: results ?? [], + uri: uriForExampleSearch(phrase), + phrase: phrase, + ); +} \ No newline at end of file diff --git a/lib/src/kanjiSearch.dart b/lib/src/kanjiSearch.dart new file mode 100644 index 0000000..53adcee --- /dev/null +++ b/lib/src/kanjiSearch.dart @@ -0,0 +1,214 @@ +import './baseURI.dart'; +import './objects.dart'; + + +import 'package:html_unescape/html_unescape.dart' as html_entities; +final htmlUnescape = html_entities.HtmlUnescape(); + +const String ONYOMI_LOCATOR_SYMBOL = 'On'; +const String KUNYOMI_LOCATOR_SYMBOL = 'Kun'; + +String removeNewlines(String str) { + return str.replaceAll(RegExp(r'(?:\r|\n)') , '').trim(); +} + +String uriForKanjiSearch(String kanji) { + return '${SCRAPE_BASE_URI}${Uri.encodeComponent(kanji)}%23kanji'; +} + +String getUriForStrokeOrderDiagram(String kanji) { + return '${STROKE_ORDER_DIAGRAM_BASE_URI}${kanji.codeUnitAt(0)}_frames.png'; +} + +bool containsKanjiGlyph(String pageHtml, String kanji) { + final kanjiGlyphToken = '

${kanji}

'; + return pageHtml.contains(kanjiGlyphToken); +} + +String getStringBetweenIndicies(String data, int startIndex, int endIndex) { + final result = data.substring(startIndex, endIndex); + return removeNewlines(result).trim(); +} + +String getStringBetweenStrings(String data, String startString, String endString) { + final regex = RegExp('${RegExp.escape(startString)}(.*?)${RegExp.escape(endString)}', dotAll: true); + final match = regex.allMatches(data).toList(); + + return match.isNotEmpty ? match[0].group(1).toString() : null; +} + +int getIntBetweenStrings(String pageHtml, String startString, String endString) { + final stringBetweenStrings = getStringBetweenStrings(pageHtml, startString, endString); + return int.parse(stringBetweenStrings); +} + +List getAllGlobalGroupMatches(String str, RegExp regex) { + var regexResults = regex.allMatches(str).toList(); + List results = []; + for (var match in regexResults) { + results.add(match.group(1)); + } + + return results; +} + +List parseAnchorsToArray(String str) { + final regex = RegExp(r'(.*?)<\/a>'); + return getAllGlobalGroupMatches(str, regex); +} + +List getYomi(String pageHtml, String yomiLocatorSymbol) { + final yomiSection = getStringBetweenStrings(pageHtml, '
${yomiLocatorSymbol}:
', ''); + return parseAnchorsToArray(yomiSection ?? ''); +} + +List getKunyomi(String pageHtml) { + return getYomi(pageHtml, KUNYOMI_LOCATOR_SYMBOL); +} + +List getOnyomi(String pageHtml) { + return getYomi(pageHtml, ONYOMI_LOCATOR_SYMBOL); +} + +List getYomiExamples(String pageHtml, String yomiLocatorSymbol) { + final locatorString = '

${yomiLocatorSymbol} reading compounds

'; + final exampleSection = getStringBetweenStrings(pageHtml, locatorString, ''); + if (exampleSection==null) { + return null; + } + + final regex = RegExp(r'
  • (.*?)<\/li>', dotAll: true); + final regexResults = getAllGlobalGroupMatches(exampleSection, regex).map((s) => s.trim()); + + final examples = regexResults.map((regexResult) { + final examplesLines = regexResult.split('\n').map((s) => s.trim()).toList(); + return YomiExample( + example: examplesLines[0], + reading: examplesLines[1].replaceAll('【', '').replaceAll('】', ''), + meaning: htmlUnescape.convert(examplesLines[2]), + ); + }); + + return examples.toList(); +} + +List getOnyomiExamples(String pageHtml) { + return getYomiExamples(pageHtml, ONYOMI_LOCATOR_SYMBOL); +} + +List getKunyomiExamples(String pageHtml) { + return getYomiExamples(pageHtml, KUNYOMI_LOCATOR_SYMBOL); +} + +Radical getRadical(String pageHtml) { + const radicalMeaningStartString = ''; + const radicalMeaningEndString = ''; + + var radicalMeaning = getStringBetweenStrings( + pageHtml, + radicalMeaningStartString, + radicalMeaningEndString, + ).trim(); + + if (radicalMeaning!=null) { + final radicalMeaningStartIndex = pageHtml.indexOf(radicalMeaningStartString); + + final radicalMeaningEndIndex = pageHtml.indexOf( + radicalMeaningEndString, + radicalMeaningStartIndex, + ); + + final radicalSymbolStartIndex = radicalMeaningEndIndex + radicalMeaningEndString.length; + const radicalSymbolEndString = ''; + final radicalSymbolEndIndex = pageHtml.indexOf(radicalSymbolEndString, radicalSymbolStartIndex); + + final radicalSymbolsString = getStringBetweenIndicies( + pageHtml, + radicalSymbolStartIndex, + radicalSymbolEndIndex, + ); + + if (radicalSymbolsString.length > 1) { + final radicalForms = radicalSymbolsString + .substring(1) + .replaceAll('(', '') + .replaceAll(')', '') + .trim() + .split(', '); + + return Radical( + symbol: radicalSymbolsString[0], + forms: radicalForms ?? [], + meaning: radicalMeaning + ); + } + + return Radical ( + symbol: radicalSymbolsString, + meaning: radicalMeaning + ); + } + + return null; +} + +List getParts(String pageHtml) { + const partsSectionStartString = '
    Parts:
    '; + const partsSectionEndString = ''; + + final partsSection = getStringBetweenStrings( + pageHtml, + partsSectionStartString, + partsSectionEndString, + ); + + var result = parseAnchorsToArray(partsSection); + result.sort(); + + return (result); +} + +String getSvgUri(String pageHtml) { + var svgRegex = RegExp('\/\/.*?.cloudfront.net\/.*?.svg'); + final regexResult = svgRegex.firstMatch(pageHtml).group(0).toString(); + return regexResult.isNotEmpty ? 'https:${regexResult}' : null; +} + +String getGifUri(String kanji) { + final unicodeString = kanji.codeUnitAt(0).toRadixString(16); + final fileName = '${unicodeString}.gif'; + final animationUri = 'https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/${fileName}'; + + return animationUri; +} + +int getNewspaperFrequencyRank(String pageHtml) { + final frequencySection = getStringBetweenStrings(pageHtml, '
    ', '
    '); + return (frequencySection != null) ? int.parse(getStringBetweenStrings(frequencySection, '', '')) : null; +} + +KanjiResult parseKanjiPageData(String pageHtml, String kanji) { + final result = KanjiResult(); + result.query = kanji; + result.found = containsKanjiGlyph(pageHtml, kanji); + if (result.found==false) { + return result; + } + + result.taughtIn = getStringBetweenStrings(pageHtml, 'taught in ', ''); + result.jlptLevel = getStringBetweenStrings(pageHtml, 'JLPT level ', ''); + result.newspaperFrequencyRank = getNewspaperFrequencyRank(pageHtml); + result.strokeCount = getIntBetweenStrings(pageHtml, '', ' strokes'); + result.meaning = htmlUnescape.convert(removeNewlines(getStringBetweenStrings(pageHtml, '
    ', '
    ')).trim()); + result.kunyomi = getKunyomi(pageHtml) ?? []; + result.onyomi = getOnyomi(pageHtml) ?? []; + result.onyomiExamples = getOnyomiExamples(pageHtml) ?? []; + result.kunyomiExamples = getKunyomiExamples(pageHtml) ?? []; + result.radical = getRadical(pageHtml); + result.parts = getParts(pageHtml) ?? []; + result.strokeOrderDiagramUri = getUriForStrokeOrderDiagram(kanji); + result.strokeOrderSvgUri = getSvgUri(pageHtml); + result.strokeOrderGifUri = getGifUri(kanji); + result.uri = uriForKanjiSearch(kanji); + return result; +} \ No newline at end of file diff --git a/lib/src/phraseScrape.dart b/lib/src/phraseScrape.dart new file mode 100644 index 0000000..3324cf9 --- /dev/null +++ b/lib/src/phraseScrape.dart @@ -0,0 +1,154 @@ +import './objects.dart'; +import './exampleSearch.dart'; + +import 'package:html/parser.dart'; +import 'package:html/dom.dart'; + +List getTags(Document document) { + final List tags = []; + final tagElements = document.querySelectorAll('.concept_light-tag'); + + for (var i = 0; i < tagElements.length; i += 1) { + final tagText = tagElements[i].text; + tags.add(tagText); + } + + return tags; +} + +List getMostRecentWordTypes(Element child) { + return child.text.split(',').map((s) => s.trim().toLowerCase()).toList(); +} + +List getOtherForms(Element child) { + return child.text.split('、') + .map((s) => s.replaceAll('【', '').replaceAll('】', '').split(' ')) + .map((a) => (KanjiKanaPair( kanji: a[0], kana: (a.length == 2) ? a[1] : null ))).toList(); +} + +List getNotes(Element child) => child.text.split('\n'); + +String getMeaning(Element child) => child.querySelector('.meaning-meaning').text; + +String getMeaningAbstract(Element child) { + final meaningAbstract = child.querySelector('.meaning-abstract'); + if (meaningAbstract == null) return null; + + for (var element in meaningAbstract.querySelectorAll('a')) { + element.remove(); + } + + return child.querySelector('.meaning-abstract')?.text; +} + +List getSupplemental(Element child) { + final supplemental = child.querySelector('.supplemental_info'); + if (supplemental == null) return []; + return supplemental.text.split(',').map((s) => s.trim()).toList(); +} + +List getSeeAlsoTerms(List supplemental) { + if (supplemental == null) return []; + + final List seeAlsoTerms = []; + for (var i = supplemental.length - 1; i >= 0; i -= 1) { + final supplementalEntry = supplemental[i]; + if (supplementalEntry.startsWith('See also')) { + seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', '')); + supplemental.removeAt(i); + } + } + return seeAlsoTerms; +} + +List getSentences(Element child) { + final sentenceElements = child.querySelector('.sentences')?.querySelectorAll('.sentence'); + if (sentenceElements == null) return []; + + final List sentences = []; + for (var sentenceIndex = 0; sentenceIndex < (sentenceElements?.length ?? 0); sentenceIndex += 1) { + final sentenceElement = sentenceElements[sentenceIndex]; + + final english = sentenceElement.querySelector('.english').text; + final pieces = getPieces(sentenceElement); + + sentenceElement.querySelector('.english').remove(); + for (var element in sentenceElement.children[0].children) { + element.querySelector('.furigana')?.remove(); + } + + final japanese = sentenceElement.text; + + sentences.add( + PhraseScrapeSentence( + english: english, + japanese: japanese, + pieces: pieces ?? [] + ) + ); + } + + return sentences; +} + +PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(Document document) { + final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] ); + + final meaningsWrapper = document.querySelector('.meanings-wrapper'); + if (meaningsWrapper == null) return PhrasePageScrapeResult(found: false); + returnValues.found = true; + + final meaningsChildren = meaningsWrapper.children; + + final List meanings = []; + var mostRecentWordTypes = []; + for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) { + final child = meaningsChildren[meaningIndex]; + + if (child.className.contains('meaning-tags')) { + mostRecentWordTypes = getMostRecentWordTypes(child); + + } else if (mostRecentWordTypes[0] == 'other forms') { + returnValues.otherForms = getOtherForms(child); + + } else if (mostRecentWordTypes[0] == 'notes') { + returnValues.notes = getNotes(child); + + } else { + final meaning = getMeaning(child); + final meaningAbstract = getMeaningAbstract(child); + final supplemental = getSupplemental(child); + final seeAlsoTerms = getSeeAlsoTerms(supplemental); + final sentences = getSentences(child); + + meanings.add(PhraseScrapeMeaning( + seeAlsoTerms: seeAlsoTerms ?? [], + sentences: sentences ?? [], + definition: meaning, + supplemental: supplemental ?? [], + definitionAbstract: meaningAbstract, + tags: mostRecentWordTypes ?? [], + )); + } + } + + returnValues.meanings = meanings; + + return returnValues; +} + +String uriForPhraseScrape(String searchTerm) { + return 'https://jisho.org/word/${Uri.encodeComponent(searchTerm)}'; +} + +PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) { + final document = parse(pageHtml); + final result = getMeaningsOtherFormsAndNotes(document); + + result.query = query; + if (!result.found) return result; + result.uri = uriForPhraseScrape(query); + result.tags = getTags(document); + + return result; +} \ No newline at end of file diff --git a/lib/src/phraseSearch.dart b/lib/src/phraseSearch.dart new file mode 100644 index 0000000..211ceb3 --- /dev/null +++ b/lib/src/phraseSearch.dart @@ -0,0 +1,5 @@ +import './baseURI.dart'; + +String uriForPhraseSearch(String phrase) { + return '${JISHO_API}?keyword=${Uri.encodeComponent(phrase)}'; +} \ No newline at end of file diff --git a/lib/src/unofficial_jisho_api_base.dart b/lib/src/unofficial_jisho_api_base.dart index 46eaf94..a964493 100644 --- a/lib/src/unofficial_jisho_api_base.dart +++ b/lib/src/unofficial_jisho_api_base.dart @@ -1,489 +1,12 @@ import 'package:unofficial_jisho_api/src/objects.dart'; import 'package:http/http.dart' as http; -import 'package:html_unescape/html_unescape.dart' as html_entities; import 'dart:convert'; -import 'package:html/parser.dart'; -import 'package:html/dom.dart'; -final htmlUnescape = html_entities.HtmlUnescape(); +import './phraseSearch.dart'; +import './kanjiSearch.dart'; +import './exampleSearch.dart'; +import './phraseScrape.dart'; -const String JISHO_API = 'https://jisho.org/api/v1/search/words'; -const String SCRAPE_BASE_URI = 'https://jisho.org/search/'; -const String STROKE_ORDER_DIAGRAM_BASE_URI = 'https://classic.jisho.org/static/images/stroke_diagrams/'; - -/* KANJI SEARCH FUNCTIONS START */ - -const String ONYOMI_LOCATOR_SYMBOL = 'On'; -const KUNYOMI_LOCATOR_SYMBOL = 'Kun'; - -String removeNewlines(String str) { - return str.replaceAll(RegExp(r'(?:\r|\n)') , '').trim(); -} - -String uriForKanjiSearch(String kanji) { - return '${SCRAPE_BASE_URI}${Uri.encodeComponent(kanji)}%23kanji'; -} - -String getUriForStrokeOrderDiagram(String kanji) { - return '${STROKE_ORDER_DIAGRAM_BASE_URI}${kanji.codeUnitAt(0)}_frames.png'; -} - -String uriForPhraseSearch(String phrase) { - return '${JISHO_API}?keyword=${Uri.encodeComponent(phrase)}'; -} - -bool containsKanjiGlyph(String pageHtml, String kanji) { - final kanjiGlyphToken = '

    ${kanji}

    '; - return pageHtml.contains(kanjiGlyphToken); -} - -String getStringBetweenIndicies(String data, int startIndex, int endIndex) { - final result = data.substring(startIndex, endIndex); - return removeNewlines(result).trim(); -} - -String getStringBetweenStrings(String data, String startString, String endString) { - final regex = RegExp('${RegExp.escape(startString)}(.*?)${RegExp.escape(endString)}', dotAll: true); - final match = regex.allMatches(data).toList(); - - return match.isNotEmpty ? match[0].group(1).toString() : null; -} - -int getIntBetweenStrings(String pageHtml, String startString, String endString) { - final stringBetweenStrings = getStringBetweenStrings(pageHtml, startString, endString); - return int.parse(stringBetweenStrings); -} - -List getAllGlobalGroupMatches(String str, RegExp regex) { - var regexResults = regex.allMatches(str).toList(); - List results = []; - for (var match in regexResults) { - results.add(match.group(1)); - } - - return results; -} - -List parseAnchorsToArray(String str) { - final regex = RegExp(r'
    (.*?)<\/a>'); - return getAllGlobalGroupMatches(str, regex); -} - -List getYomi(String pageHtml, String yomiLocatorSymbol) { - final yomiSection = getStringBetweenStrings(pageHtml, '
    ${yomiLocatorSymbol}:
    ', ''); - return parseAnchorsToArray(yomiSection ?? ''); -} - -List getKunyomi(String pageHtml) { - return getYomi(pageHtml, KUNYOMI_LOCATOR_SYMBOL); -} - -List getOnyomi(String pageHtml) { - return getYomi(pageHtml, ONYOMI_LOCATOR_SYMBOL); -} - -List getYomiExamples(String pageHtml, String yomiLocatorSymbol) { - final locatorString = '

    ${yomiLocatorSymbol} reading compounds

    '; - final exampleSection = getStringBetweenStrings(pageHtml, locatorString, ''); - if (exampleSection==null) { - return null; - } - - final regex = RegExp(r'
  • (.*?)<\/li>', dotAll: true); - final regexResults = getAllGlobalGroupMatches(exampleSection, regex).map((s) => s.trim()); - - final examples = regexResults.map((regexResult) { - final examplesLines = regexResult.split('\n').map((s) => s.trim()).toList(); - return YomiExample( - example: examplesLines[0], - reading: examplesLines[1].replaceAll('【', '').replaceAll('】', ''), - meaning: htmlUnescape.convert(examplesLines[2]), - ); - }); - - return examples.toList(); -} - -List getOnyomiExamples(String pageHtml) { - return getYomiExamples(pageHtml, ONYOMI_LOCATOR_SYMBOL); -} - -List getKunyomiExamples(String pageHtml) { - return getYomiExamples(pageHtml, KUNYOMI_LOCATOR_SYMBOL); -} - -Radical getRadical(String pageHtml) { - const radicalMeaningStartString = ''; - const radicalMeaningEndString = ''; - - var radicalMeaning = getStringBetweenStrings( - pageHtml, - radicalMeaningStartString, - radicalMeaningEndString, - ).trim(); - - if (radicalMeaning!=null) { - final radicalMeaningStartIndex = pageHtml.indexOf(radicalMeaningStartString); - - final radicalMeaningEndIndex = pageHtml.indexOf( - radicalMeaningEndString, - radicalMeaningStartIndex, - ); - - final radicalSymbolStartIndex = radicalMeaningEndIndex + radicalMeaningEndString.length; - const radicalSymbolEndString = ''; - final radicalSymbolEndIndex = pageHtml.indexOf(radicalSymbolEndString, radicalSymbolStartIndex); - - final radicalSymbolsString = getStringBetweenIndicies( - pageHtml, - radicalSymbolStartIndex, - radicalSymbolEndIndex, - ); - - if (radicalSymbolsString.length > 1) { - final radicalForms = radicalSymbolsString - .substring(1) - .replaceAll('(', '') - .replaceAll(')', '') - .trim() - .split(', '); - - return Radical( - symbol: radicalSymbolsString[0], - forms: radicalForms ?? [], - meaning: radicalMeaning - ); - } - - return Radical ( - symbol: radicalSymbolsString, - meaning: radicalMeaning - ); - } - - return null; -} - -List getParts(String pageHtml) { - const partsSectionStartString = '
    Parts:
    '; - const partsSectionEndString = ''; - - final partsSection = getStringBetweenStrings( - pageHtml, - partsSectionStartString, - partsSectionEndString, - ); - - var result = parseAnchorsToArray(partsSection); - result.sort(); - - return (result); -} - -String getSvgUri(String pageHtml) { - var svgRegex = RegExp('\/\/.*?.cloudfront.net\/.*?.svg'); - final regexResult = svgRegex.firstMatch(pageHtml).group(0).toString(); - return regexResult.isNotEmpty ? 'https:${regexResult}' : null; -} - -String getGifUri(String kanji) { - final unicodeString = kanji.codeUnitAt(0).toRadixString(16); - final fileName = '${unicodeString}.gif'; - final animationUri = 'https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/${fileName}'; - - return animationUri; -} - -int getNewspaperFrequencyRank(String pageHtml) { - final frequencySection = getStringBetweenStrings(pageHtml, '
    ', '
    '); - return (frequencySection != null) ? int.parse(getStringBetweenStrings(frequencySection, '', '')) : null; -} - -KanjiResult parseKanjiPageData(String pageHtml, String kanji) { - final result = KanjiResult(); - result.query = kanji; - result.found = containsKanjiGlyph(pageHtml, kanji); - if (result.found==false) { - return result; - } - - result.taughtIn = getStringBetweenStrings(pageHtml, 'taught in ', ''); - result.jlptLevel = getStringBetweenStrings(pageHtml, 'JLPT level ', ''); - result.newspaperFrequencyRank = getNewspaperFrequencyRank(pageHtml); - result.strokeCount = getIntBetweenStrings(pageHtml, '', ' strokes'); - result.meaning = htmlUnescape.convert(removeNewlines(getStringBetweenStrings(pageHtml, '
    ', '
    ')).trim()); - result.kunyomi = getKunyomi(pageHtml) ?? []; - result.onyomi = getOnyomi(pageHtml) ?? []; - result.onyomiExamples = getOnyomiExamples(pageHtml) ?? []; - result.kunyomiExamples = getKunyomiExamples(pageHtml) ?? []; - result.radical = getRadical(pageHtml); - result.parts = getParts(pageHtml) ?? []; - result.strokeOrderDiagramUri = getUriForStrokeOrderDiagram(kanji); - result.strokeOrderSvgUri = getSvgUri(pageHtml); - result.strokeOrderGifUri = getGifUri(kanji); - result.uri = uriForKanjiSearch(kanji); - return result; -} - -/* KANJI SEARCH FUNCTIONS END */ - -/* EXAMPLE SEARCH FUNCTIONS START */ - -final RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]'); - -String uriForExampleSearch(String phrase) { - return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences'; -} - -/* TODO: This is the wrong approach. - * Symbols such as 、「」。 are missing in mid sentence - * Maybe also JP fullwidth numbers? - */ - -String getEndSymbolsOfExampleSentence(Element ul) { - final endSymbols = RegExp(r'<\/li>([^<>]+)$'); - return endSymbols.firstMatch(ul.innerHtml).group(1); -} - -ExampleResultData getKanjiAndKana(Element div) { - final ul = div.querySelector('ul'); - final contents = ul.children; - - var kanji = ''; - var kana = ''; - for (var i = 0; i < contents.length; i += 1) { - final content = contents[i]; - if (content.localName == 'li') { - final li = content; - final furigana = li.querySelector('.furigana')?.text; - final unlifted = li.querySelector('.unlinked')?.text; - - if (furigana != null) { - kanji += unlifted; - kana += furigana; - - final kanaEnding = []; - for (var j = unlifted.length - 1; j > 0; j -= 1) { - final char = unlifted[j]; - if (!kanjiRegex.hasMatch(char)) { - kanaEnding.add(char); - } else { - break; - } - } - - kana += kanaEnding.reversed.join(''); - } else { - kanji += unlifted; - kana += unlifted; - } - } else { - final text = content.text.trim(); - if (text != null) { - kanji += text; - kana += text; - } - } - } - final endSymbols = getEndSymbolsOfExampleSentence(ul).trim(); - kanji+= endSymbols; - kana += endSymbols; - - return ExampleResultData( - kanji: kanji, - kana: kana, - ); -} - -List getPieces(Element sentenceElement) { - final pieceElements = sentenceElement.querySelectorAll('li.clearfix'); - final List pieces = []; - for (var pieceIndex = 0; pieceIndex < pieceElements.length; pieceIndex += 1) { - final pieceElement = pieceElements[pieceIndex]; - pieces.add(ExampleSentencePiece( - lifted: pieceElement.querySelector('.furigana')?.text, - unlifted: pieceElement.querySelector('.unlinked')?.text, - )); - } - - return pieces; -} - -ExampleResultData parseExampleDiv(Element div) { - final result = getKanjiAndKana(div); - result.english = div.querySelector('.english').text; - result.pieces = getPieces(div) ?? []; - - return result; -} - -ExampleResults parseExamplePageData(String pageHtml, String phrase) { - final document = parse(pageHtml); - final divs = document.querySelectorAll('.sentence_content'); - - final results = divs.map((div) => parseExampleDiv(div)).toList(); - - return ExampleResults( - query: phrase, - found: results.isNotEmpty, - results: results ?? [], - uri: uriForExampleSearch(phrase), - phrase: phrase, - ); -} - -/* EXAMPLE SEARCH FUNCTIONS END */ - -/* PHRASE SCRAPE FUNCTIONS START */ - -List getTags(Document document) { - final List tags = []; - final tagElements = document.querySelectorAll('.concept_light-tag'); - - for (var i = 0; i < tagElements.length; i += 1) { - final tagText = tagElements[i].text; - tags.add(tagText); - } - - return tags; -} - -List getMostRecentWordTypes(Element child) { - return child.text.split(',').map((s) => s.trim().toLowerCase()).toList(); -} - -List getOtherForms(Element child) { - return child.text.split('、') - .map((s) => s.replaceAll('【', '').replaceAll('】', '').split(' ')) - .map((a) => (KanjiKanaPair( kanji: a[0], kana: (a.length == 2) ? a[1] : null ))).toList(); -} - -List getNotes(Element child) => child.text.split('\n'); - -String getMeaning(Element child) => child.querySelector('.meaning-meaning').text; - -String getMeaningAbstract(Element child) { - final meaningAbstract = child.querySelector('.meaning-abstract'); - if (meaningAbstract == null) return null; - - for (var element in meaningAbstract.querySelectorAll('a')) { - element.remove(); - } - - return child.querySelector('.meaning-abstract')?.text; -} - -List getSupplemental(Element child) { - final supplemental = child.querySelector('.supplemental_info'); - if (supplemental == null) return []; - return supplemental.text.split(',').map((s) => s.trim()).toList(); -} - -List getSeeAlsoTerms(List supplemental) { - if (supplemental == null) return []; - - final List seeAlsoTerms = []; - for (var i = supplemental.length - 1; i >= 0; i -= 1) { - final supplementalEntry = supplemental[i]; - if (supplementalEntry.startsWith('See also')) { - seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', '')); - supplemental.removeAt(i); - } - } - return seeAlsoTerms; -} - -List getSentences(Element child) { - final sentenceElements = child.querySelector('.sentences')?.querySelectorAll('.sentence'); - if (sentenceElements == null) return []; - - final List sentences = []; - for (var sentenceIndex = 0; sentenceIndex < (sentenceElements?.length ?? 0); sentenceIndex += 1) { - final sentenceElement = sentenceElements[sentenceIndex]; - - final english = sentenceElement.querySelector('.english').text; - final pieces = getPieces(sentenceElement); - - sentenceElement.querySelector('.english').remove(); - for (var element in sentenceElement.children[0].children) { - element.querySelector('.furigana')?.remove(); - } - - final japanese = sentenceElement.text; - - sentences.add( - PhraseScrapeSentence( - english: english, - japanese: japanese, - pieces: pieces ?? [] - ) - ); - } - - return sentences; -} - -PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(Document document) { - final returnValues = PhrasePageScrapeResult( otherForms: [], notes: [] ); - - final meaningsWrapper = document.querySelector('.meanings-wrapper'); - if (meaningsWrapper == null) return PhrasePageScrapeResult(found: false); - returnValues.found = true; - - final meaningsChildren = meaningsWrapper.children; - - final List meanings = []; - var mostRecentWordTypes = []; - for (var meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) { - final child = meaningsChildren[meaningIndex]; - - if (child.className.contains('meaning-tags')) { - mostRecentWordTypes = getMostRecentWordTypes(child); - - } else if (mostRecentWordTypes[0] == 'other forms') { - returnValues.otherForms = getOtherForms(child); - - } else if (mostRecentWordTypes[0] == 'notes') { - returnValues.notes = getNotes(child); - - } else { - final meaning = getMeaning(child); - final meaningAbstract = getMeaningAbstract(child); - final supplemental = getSupplemental(child); - final seeAlsoTerms = getSeeAlsoTerms(supplemental); - final sentences = getSentences(child); - - meanings.add(PhraseScrapeMeaning( - seeAlsoTerms: seeAlsoTerms ?? [], - sentences: sentences ?? [], - definition: meaning, - supplemental: supplemental ?? [], - definitionAbstract: meaningAbstract, - tags: mostRecentWordTypes ?? [], - )); - } - } - - returnValues.meanings = meanings; - - return returnValues; -} - -String uriForPhraseScrape(String searchTerm) { - return 'https://jisho.org/word/${Uri.encodeComponent(searchTerm)}'; -} - -PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) { - final document = parse(pageHtml); - final result = getMeaningsOtherFormsAndNotes(document); - - result.query = query; - if (!result.found) return result; - result.uri = uriForPhraseScrape(query); - result.tags = getTags(document); - - return result; -} class JishoApi { diff --git a/test/local_function_test_cases.dart b/test/local_function_test_cases.dart deleted file mode 100644 index 62b6919..0000000 --- a/test/local_function_test_cases.dart +++ /dev/null @@ -1,293 +0,0 @@ -import 'package:unofficial_jisho_api/src/objects.dart'; -import 'package:unofficial_jisho_api/unofficial_jisho_api.dart'; - -import 'package:test/test.dart'; -import 'dart:convert'; -import 'package:http/http.dart' as http; - -void test_local_functions() async { - - -/* KANJI SEARCH FUNCTION TESTS START */ - - test('removeNewLines', () { - final result = removeNewlines('Line \nwith\r\n Newlines and spaces\n'); - expect(result, 'Line with Newlines and spaces'); - }); - - test('uriForKanjiSearch', () { - final result = uriForKanjiSearch('時'); - expect(result, 'https://jisho.org/search/%E6%99%82%23kanji'); - }); - - test('getUriForStrokeOrderDiagram', () { - final result = getUriForStrokeOrderDiagram('時'); - expect(result, 'https://classic.jisho.org/static/images/stroke_diagrams/26178_frames.png'); - }); - - test('uriForPhraseSearch', () { - final result = uriForPhraseSearch('時間'); - expect(result, 'https://jisho.org/api/v1/search/words?keyword=%E6%99%82%E9%96%93'); - }); - - final kanjiPage = (await http.get('https://jisho.org/search/%E6%99%82%23kanji')).body; - - test('containsKanjiGlyph', () { - final result = containsKanjiGlyph(kanjiPage, '時'); - expect(result, true); - }); - - test('getStringBetweenIndicies', () { - final result = getStringBetweenIndicies('String\n\rwith\nNewlines', 3, 9); - expect(result, 'ingw'); - }); - - test('getStringBetweenStrings', () { - const data = 'STArT I want this string END'; - final result = getStringBetweenStrings(data, 'STArT', 'END'); - expect(result, ' I want this string '); - }); - - test('getIntBetweenStrings', () { - final result = getIntBetweenStrings(kanjiPage, '', ' strokes'); - expect(result, 10); - }); - - test('getAllGlobalGroupMatches', () { - - }); - - test('parseAnchorsToArray', () { - const htmlCode = - ''' -
    - '''; - - final result = parseAnchorsToArray(htmlCode); - expect(result, [ - 'Hello', 'Hi', 'How are you doing']); - }); - - test('getYomi', () { - final result = getYomi(kanjiPage, 'On'); - expect(result, ['ジ']); - - }); - - test('getKunyomi', () { - final result = getKunyomi(kanjiPage); - expect(result, ['とき', '-どき']); - }); - - test('getOnyomi', () { - final result = getOnyomi(kanjiPage); - expect(result, ['ジ']); - }); - - test('getYomiExamples', () { - final result = getYomiExamples(kanjiPage, 'On'); - expect( - json.encode(result), - json.encode([ - YomiExample( - example: '時', - reading: 'ジ', - meaning: '''hour, o'clock, (specified) time, when ..., during ...''' - ), - YomiExample( - example: '時価', - reading: 'ジカ', - meaning: 'current value, price, market value' - ), - YomiExample( - example: '零時', - reading: 'レイジ', - meaning: '''twelve o'clock, midnight, noon''' - ), - YomiExample( - example: '平時', - reading: 'ヘイジ', - meaning: 'peacetime, time of peace, ordinary times, normal times' - ), - ]) - ); - }); - - test('getOnyomiExamples', () { - final result = getOnyomiExamples(kanjiPage); - expect( - json.encode(result), - json.encode([ - YomiExample( - example: '時', - reading: 'ジ', - meaning: '''hour, o'clock, (specified) time, when ..., during ...''' - ), - YomiExample( - example: '時価', - reading: 'ジカ', - meaning: 'current value, price, market value' - ), - YomiExample( - example: '零時', - reading: 'レイジ', - meaning: '''twelve o'clock, midnight, noon''' - ), - YomiExample( - example: '平時', - reading: 'ヘイジ', - meaning: 'peacetime, time of peace, ordinary times, normal times' - ), - ]) - ); - }); - - test('getKunyomiExamples', () { - final result = getKunyomiExamples(kanjiPage); - expect( - json.encode(result), - json.encode([ - YomiExample( - example: '時', - reading: 'とき', - meaning: 'time, hour, moment, occasion, case, chance, opportunity, season, the times, the age, the day, tense' - ), - YomiExample( - example: '時折', - reading: 'ときおり', - meaning: 'sometimes' - ), - YomiExample( - example: '切り替え時', - reading: 'きりかえとき', - meaning: 'time to switch over, response time' - ), - YomiExample( - example: '逢魔が時', - reading: 'おうまがとき', - meaning: '''twilight, time for disasters (similar to 'the witching hour' but not midnight)''' - ), - ]) - ); - }); - - test('getRadical', () { - final result = getRadical(kanjiPage); - expect( - json.encode(result), - json.encode(Radical( - symbol: '日', - meaning: 'sun, day' - )) - ); - }); - - test('getParts', () { - final result = getParts(kanjiPage); - expect(result, ['土', '寸', '日']); - }); - - test('getSvgUri', () { - final result = getSvgUri(kanjiPage); - expect(result, 'https://d1w6u4xc3l95km.cloudfront.net/kanji-2015-03/06642.svg'); - }); - - test('getGifUri', () { - final result = getGifUri(kanjiPage); - expect(result, 'https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/3c.gif'); - }); - - test('getNewspaperFrequencyRank', () { - final result = getNewspaperFrequencyRank(kanjiPage); - expect(result, 16); - }); - - test('parseKanjiPageData', () { - final result = parseKanjiPageData(kanjiPage, '時'); - - final expectedResult = KanjiResult(); - expectedResult.query = '時'; - expectedResult.found = true; - expectedResult.taughtIn = 'grade 2'; - expectedResult.jlptLevel = 'N5'; - expectedResult.newspaperFrequencyRank = 16; - expectedResult.strokeCount = 10; - expectedResult.meaning = 'time, hour'; - expectedResult.kunyomi = ['とき', '-どき']; - expectedResult.onyomi = ['ジ']; - expectedResult.onyomiExamples = - [ - YomiExample( - example: '時', - reading: 'ジ', - meaning: '''hour, o'clock, (specified) time, when ..., during ...''' - ), - YomiExample( - example: '時価', - reading: 'ジカ', - meaning: 'current value, price, market value' - ), - YomiExample( - example: '零時', - reading: 'レイジ', - meaning: '''twelve o'clock, midnight, noon''' - ), - YomiExample( - example: '平時', - reading: 'ヘイジ', - meaning: 'peacetime, time of peace, ordinary times, normal times' - ), - ]; - expectedResult.kunyomiExamples = - [ - YomiExample( - example: '時', - reading: 'とき', - meaning: 'time, hour, moment, occasion, case, chance, opportunity, season, the times, the age, the day, tense' - ), - YomiExample( - example: '時折', - reading: 'ときおり', - meaning: 'sometimes' - ), - YomiExample( - example: '切り替え時', - reading: 'きりかえとき', - meaning: 'time to switch over, response time' - ), - YomiExample( - example: '逢魔が時', - reading: 'おうまがとき', - meaning: '''twilight, time for disasters (similar to 'the witching hour' but not midnight)''' - ), - ]; - expectedResult.radical = - Radical( - symbol: '日', - meaning: 'sun, day' - ); - expectedResult.parts = ['土', '寸', '日']; - expectedResult.strokeOrderDiagramUri = 'https://classic.jisho.org/static/images/stroke_diagrams/26178_frames.png'; - expectedResult.strokeOrderSvgUri = 'https://d1w6u4xc3l95km.cloudfront.net/kanji-2015-03/06642.svg'; - expectedResult.strokeOrderGifUri = 'https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/6642.gif'; - expectedResult.uri = 'https://jisho.org/search/%E6%99%82%23kanji'; - - expect( - json.encode(result), - json.encode(expectedResult) - ); - }); - - /* KANJI SEARCH FUNCTION TESTS END */ - -} \ No newline at end of file diff --git a/test/unofficial_jisho_api_test.dart b/test/unofficial_jisho_api_test.dart index 97bc250..cfed7fb 100644 --- a/test/unofficial_jisho_api_test.dart +++ b/test/unofficial_jisho_api_test.dart @@ -3,7 +3,6 @@ import 'package:path/path.dart' as path; import 'dart:convert'; import 'package:unofficial_jisho_api/unofficial_jisho_api.dart'; -import 'local_function_test_cases.dart' show test_local_functions; import 'package:test/test.dart'; final jisho = JishoApi(); @@ -26,11 +25,7 @@ void runTestCases(List testCaseFiles, Function apiFunction) async { } void main() async { - - await test_local_functions(); - await runTestCases(getFilePaths('kanji_test_cases'), jisho.searchForKanji); await runTestCases(getFilePaths('example_test_cases'), jisho.searchForExamples); await runTestCases(getFilePaths('phrase_scrape_test_cases'), jisho.scrapeForPhrase); - }