import 'dart:convert';

import 'package:html/parser.dart';
import 'package:html_unescape/html_unescape.dart' as html_entities;
import 'package:http/http.dart' as http;
import 'package:unofficial_jisho_api/src/objects.dart';

/// Shared decoder for HTML entities found in scraped text.
final htmlUnescape = html_entities.HtmlUnescape();

// TODO: Put public facing types in this file.

/// Endpoint of the official Jisho word-search API.
const String JISHO_API = 'http://jisho.org/api/v1/search/words';

/// Base URI of the Jisho search pages that are scraped.
const String SCRAPE_BASE_URI = 'http://jisho.org/search/';

/// Base URI of the stroke order diagram images.
const String STROKE_ORDER_DIAGRAM_BASE_URI =
    'http://classic.jisho.org/static/images/stroke_diagrams/';

/* KANJI SEARCH FUNCTIONS START */

// NOTE(review): the HTML delimiters used by the locator strings in this
// section were missing from the reviewed copy of this file (the tags appear to
// have been stripped, leaving multi-line and empty string literals that could
// neither compile nor match anything). They have been restored to the markup
// this scraper presumably targets on jisho.org kanji pages - confirm against a
// live page before release.

const String ONYOMI_LOCATOR_SYMBOL = 'On';
const String KUNYOMI_LOCATOR_SYMBOL = 'Kun';

/// Returns [str] with every CR and LF removed and surrounding whitespace
/// trimmed.
String removeNewlines(String str) {
  return str.replaceAll(RegExp(r'(?:\r|\n)'), '').trim();
}

/// Builds the URI of the kanji search page for [kanji].
String uriForKanjiSearch(String kanji) {
  return '${SCRAPE_BASE_URI}${Uri.encodeComponent(kanji)}%23kanji';
}

/// Builds the URI of the stroke order diagram for the first character of
/// [kanji].
///
/// The first rune (not the first UTF-16 code unit) is used so that characters
/// outside the Basic Multilingual Plane yield their real code point instead of
/// a surrogate value.
String getUriForStrokeOrderDiagram(String kanji) {
  return '${STROKE_ORDER_DIAGRAM_BASE_URI}${kanji.runes.first}_frames.png';
}

/// Builds the official API URI that searches for [phrase].
String uriForPhraseSearch(String phrase) {
  return '${JISHO_API}?keyword=${Uri.encodeComponent(phrase)}';
}

/// Whether [pageHtml] contains the heading element that displays [kanji].
bool containsKanjiGlyph(String pageHtml, String kanji) {
  final kanjiGlyphToken =
      '<h1 class="character" data-area-name="print" lang="ja">$kanji</h1>';
  return pageHtml.contains(kanjiGlyphToken);
}

/// Returns the text of [data] between [startIndex] and [endIndex] with
/// newlines removed and whitespace trimmed.
String getStringBetweenIndicies(String data, int startIndex, int endIndex) {
  final result = data.substring(startIndex, endIndex);
  return removeNewlines(result).trim();
}

/// Returns the shortest text found between the first [startString] and the
/// next [endString] in [data], or `null` when there is no such span.
///
/// Both delimiters are matched literally; the span may cross line breaks.
String getStringBetweenStrings(
    String data, String startString, String endString) {
  final regex = RegExp(
    '${RegExp.escape(startString)}(.*?)${RegExp.escape(endString)}',
    dotAll: true,
  );
  return regex.firstMatch(data)?.group(1);
}

/// Returns the integer found between [startString] and [endString] in
/// [pageHtml], or `null` when the span is missing or is not an integer.
int getIntBetweenStrings(
    String pageHtml, String startString, String endString) {
  final stringBetweenStrings =
      getStringBetweenStrings(pageHtml, startString, endString);
  if (stringBetweenStrings == null) {
    return null;
  }
  return int.tryParse(stringBetweenStrings.trim());
}

/// Returns the first capture group of every match of [regex] in [str].
List<String> getAllGlobalGroupMatches(String str, RegExp regex) {
  return regex.allMatches(str).map((match) => match.group(1)).toList();
}

/// Returns the inner text of every anchor element found in [str].
List<String> parseAnchorsToArray(String str) {
  final regex = RegExp(r'<a href=".*?">(.*?)<\/a>');
  return getAllGlobalGroupMatches(str, regex);
}

/// Returns the readings listed under the [yomiLocatorSymbol] heading, or an
/// empty list when the page has no such section.
List<String> getYomi(String pageHtml, String yomiLocatorSymbol) {
  final yomiSection = getStringBetweenStrings(
    pageHtml,
    '<dt>$yomiLocatorSymbol:</dt>',
    '</dl>',
  );
  return parseAnchorsToArray(yomiSection ?? '');
}

/// Returns the kun readings found in [pageHtml].
List<String> getKunyomi(String pageHtml) {
  return getYomi(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
}

/// Returns the on readings found in [pageHtml].
List<String> getOnyomi(String pageHtml) {
  return getYomi(pageHtml, ONYOMI_LOCATOR_SYMBOL);
}

/// Returns the example compounds listed for the given reading type, or `null`
/// when the page has no "reading compounds" section for it.
List<YomiExample> getYomiExamples(String pageHtml, String yomiLocatorSymbol) {
  final locatorString = '<h2>$yomiLocatorSymbol reading compounds</h2>';
  final exampleSection =
      getStringBetweenStrings(pageHtml, locatorString, '</ul>');
  if (exampleSection == null) {
    return null;
  }

  final regex = RegExp(r'<li>(.*?)<\/li>', dotAll: true);
  final examples = <YomiExample>[];
  for (final item in getAllGlobalGroupMatches(exampleSection, regex)) {
    // Each list item is expected to hold three lines:
    // the example, its bracketed reading and its meaning.
    final lines = item.trim().split('\n').map((s) => s.trim()).toList();
    if (lines.length < 3) {
      // Malformed entry: skip it instead of failing the whole page.
      continue;
    }
    examples.add(YomiExample(
      example: lines[0],
      reading: lines[1].replaceAll('【', '').replaceAll('】', ''),
      meaning: htmlUnescape.convert(lines[2]),
    ));
  }
  return examples;
}

/// Returns the on reading example compounds found in [pageHtml].
List<YomiExample> getOnyomiExamples(String pageHtml) {
  return getYomiExamples(pageHtml, ONYOMI_LOCATOR_SYMBOL);
}

/// Returns the kun reading example compounds found in [pageHtml].
List<YomiExample> getKunyomiExamples(String pageHtml) {
  return getYomiExamples(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
}

/// Extracts the radical (symbol, optional alternative forms and meaning) from
/// [pageHtml], or returns `null` when it cannot be located.
Radical getRadical(String pageHtml) {
  const radicalMeaningStartString = '<span class="radical_meaning">';
  const radicalMeaningEndString = '</span>';

  // Null-aware trim: the section may be missing from the page.
  final radicalMeaning = getStringBetweenStrings(
    pageHtml,
    radicalMeaningStartString,
    radicalMeaningEndString,
  )?.trim();
  if (radicalMeaning == null) {
    return null;
  }

  // The radical symbol sits right after the element that holds the meaning.
  final radicalMeaningStartIndex = pageHtml.indexOf(radicalMeaningStartString);
  final radicalMeaningEndIndex = pageHtml.indexOf(
    radicalMeaningEndString,
    radicalMeaningStartIndex,
  );
  final radicalSymbolStartIndex =
      radicalMeaningEndIndex + radicalMeaningEndString.length;
  const radicalSymbolEndString = '</span>';
  final radicalSymbolEndIndex =
      pageHtml.indexOf(radicalSymbolEndString, radicalSymbolStartIndex);
  if (radicalSymbolEndIndex < 0) {
    // No terminator for the symbol: substring() would throw.
    return null;
  }

  final radicalSymbolsString = getStringBetweenIndicies(
    pageHtml,
    radicalSymbolStartIndex,
    radicalSymbolEndIndex,
  );

  if (radicalSymbolsString.length > 1) {
    // Anything after the first character is a parenthesised,
    // comma-separated list of alternative forms.
    final radicalForms = radicalSymbolsString
        .substring(1)
        .replaceAll('(', '')
        .replaceAll(')', '')
        .trim()
        .split(', ');
    return Radical(
      symbol: radicalSymbolsString[0],
      forms: radicalForms,
      meaning: radicalMeaning,
    );
  }

  return Radical(symbol: radicalSymbolsString, meaning: radicalMeaning);
}

/// Returns the sorted parts of the kanji, or an empty list when the page has
/// no parts section.
List<String> getParts(String pageHtml) {
  const partsSectionStartString = '<dt>Parts:</dt>';
  const partsSectionEndString = '</dl>';

  final partsSection = getStringBetweenStrings(
    pageHtml,
    partsSectionStartString,
    partsSectionEndString,
  );

  final result = parseAnchorsToArray(partsSection ?? '');
  result.sort();
  return result;
}

/// Returns the URI of the stroke order SVG referenced by [pageHtml], or `null`
/// when the page references none.
String getSvgUri(String pageHtml) {
  final svgRegex = RegExp(r'//.*?.cloudfront.net/.*?.svg');
  final match = svgRegex.firstMatch(pageHtml);
  if (match == null) {
    return null;
  }
  final regexResult = match.group(0);
  return regexResult.isNotEmpty ? 'http:$regexResult' : null;
}

/// Builds the URI of the stroke order animation for the first character of
/// [kanji]. The file name is the character's code point in hexadecimal.
String getGifUri(String kanji) {
  final unicodeString = kanji.runes.first.toRadixString(16);
  final fileName = '$unicodeString.gif';
  return 'https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/$fileName';
}

/// Returns the newspaper frequency rank found in [pageHtml], or `null` when it
/// is missing or is not an integer.
int getNewspaperFrequencyRank(String pageHtml) {
  final frequencySection = getStringBetweenStrings(
    pageHtml,
    '<div class="frequency">',
    '</div>',
  );
  if (frequencySection == null) {
    return null;
  }
  final rank =
      getStringBetweenStrings(frequencySection, '<strong>', '</strong>');
  return rank == null ? null : int.tryParse(rank.trim());
}

/// Builds a [KanjiResult] for [kanji] from the HTML of its search page.
///
/// When the page does not show [kanji], only `query` and `found` are set.
KanjiResult parseKanjiPageData(String pageHtml, String kanji) {
  final result = KanjiResult();
  result.query = kanji;
  result.found = containsKanjiGlyph(pageHtml, kanji);
  if (!result.found) {
    return result;
  }

  result.taughtIn =
      getStringBetweenStrings(pageHtml, 'taught in <strong>', '</strong>');
  result.jlptLevel =
      getStringBetweenStrings(pageHtml, 'JLPT level <strong>', '</strong>');
  result.newspaperFrequencyRank = getNewspaperFrequencyRank(pageHtml);
  result.strokeCount =
      getIntBetweenStrings(pageHtml, '<strong>', '</strong> strokes');

  final meaning = getStringBetweenStrings(
    pageHtml,
    '<div class="kanji-details__main-meanings">',
    '</div>',
  );
  // Left null when the section is missing, rather than crashing.
  result.meaning =
      meaning == null ? null : htmlUnescape.convert(removeNewlines(meaning));

  result.kunyomi = getKunyomi(pageHtml);
  result.onyomi = getOnyomi(pageHtml);
  result.onyomiExamples = getOnyomiExamples(pageHtml);
  result.kunyomiExamples = getKunyomiExamples(pageHtml);
  result.radical = getRadical(pageHtml);
  result.parts = getParts(pageHtml);
  result.strokeOrderDiagramUri = getUriForStrokeOrderDiagram(kanji);
  result.strokeOrderSvgUri = getSvgUri(pageHtml);
  result.strokeOrderGifUri = getGifUri(kanji);
  result.uri = uriForKanjiSearch(kanji);
  return result;
}

/* KANJI SEARCH FUNCTIONS END */

/* EXAMPLE SEARCH FUNCTIONS START */

/// Matches a single character in the two CJK ranges treated as kanji here.
final RegExp kanjiRegex = RegExp(r'[\u4e00-\u9faf\u3400-\u4dbf]');

/// Builds the URI of the example sentence search page for [phrase].
String uriForExampleSearch(String phrase) {
  return '${SCRAPE_BASE_URI}${Uri.encodeComponent(phrase)}%23sentences';
}

/// Assembles the kanji and kana renderings of the sentence contained in [div].
///
/// [div] is used dynamically; it must offer `querySelector` (presumably a
/// `package:html` element - confirm against callers).
ExampleResultData getKanjiAndKana(div) {
  final ul = div.querySelector('ul');
  final contents = ul.children;

  var kanji = '';
  var kana = '';
  for (final content in contents) {
    // NOTE(review): `localName` replaces the original `tagName`, which
    // package:html elements do not appear to provide - confirm.
    if (content.localName == 'li') {
      final String furigana = content.querySelector('.furigana')?.text;
      final String unlifted = content.querySelector('.unlinked')?.text ?? '';

      if (furigana != null && furigana.isNotEmpty) {
        kanji += unlifted;
        kana += furigana;

        // The furigana only covers the kanji, so append the trailing
        // non-kanji characters of the word (index 0 is never inspected).
        final kanaEnding = <String>[];
        for (var j = unlifted.length - 1; j > 0; j -= 1) {
          final char = unlifted[j];
          if (kanjiRegex.hasMatch(char)) {
            break;
          }
          kanaEnding.add(char);
        }
        kana += kanaEnding.reversed.join('');
      } else {
        // No reading available: the surface form is used for both.
        kanji += unlifted;
        kana += unlifted;
      }
    } else {
      final String text = content.text.trim();
      kanji += text;
      kana += text;
    }
  }

  return ExampleResultData(
    kanji: kanji,
    kana: kana,
  );
}

/// Returns one [ExampleSentencePiece] per `li.clearfix` element inside
/// [sentenceElement].
///
/// `lifted` is `null` for a piece that has no `.furigana` element.
List<ExampleSentencePiece> getPieces(sentenceElement) {
  final pieces = <ExampleSentencePiece>[];
  for (final pieceElement in sentenceElement.querySelectorAll('li.clearfix')) {
    final String lifted = pieceElement.querySelector('.furigana')?.text;
    final String unlifted = pieceElement.querySelector('.unlinked')?.text;
    pieces.add(ExampleSentencePiece(
      lifted: lifted,
      unlifted: unlifted,
    ));
  }
  return pieces;
}

/// Parses a single `.sentence_content` element into an [ExampleResultData].
ExampleResultData parseExampleDiv(div) {
  final result = getKanjiAndKana(div);
  result.english = div.querySelector('.english')?.text;
  result.pieces = getPieces(div);
  return result;
}

/// Parses every example sentence found in [pageHtml] for the search [phrase].
ExampleResults parseExamplePageData(String pageHtml, String phrase) {
  final document = parse(pageHtml);
  final divs = document.querySelectorAll('.sentence_content');

  // Materialised once; a lazy iterable would re-parse on every traversal.
  final results = divs.map(parseExampleDiv).toList();

  return ExampleResults(
    query: phrase,
    found: results.isNotEmpty,
    results: results,
    uri: uriForExampleSearch(phrase),
    phrase: phrase,
  );
}

/* EXAMPLE SEARCH FUNCTIONS END */

/* PHRASE SCRAPE FUNCTIONS START */

/// Returns the text of every `.concept_light-tag` element in [document].
List<String> getTags(document) {
  final tags = <String>[];
  for (final tagElement in document.querySelectorAll('.concept_light-tag')) {
    final String tagText = tagElement.text;
    tags.add(tagText);
  }
  return tags;
}

/// Extracts the meanings, other forms and notes from the `.meanings-wrapper`
/// element of [document].
///
/// The children of the wrapper are walked in order: a `meaning-tags` element
/// sets the word types that apply to the entries that follow it.
PhrasePageScrapeResult getMeaningsOtherFormsAndNotes(document) {
  final returnValues = PhrasePageScrapeResult(otherForms: [], notes: []);

  final meaningsWrapper = document.querySelector('.meanings-wrapper');
  final meaningsChildren = meaningsWrapper?.children ?? [];

  final meanings = <PhraseScrapeMeaning>[];
  var mostRecentWordTypes = <String>[];
  for (final child in meaningsChildren) {
    final String className = child.className;
    final currentType =
        mostRecentWordTypes.isNotEmpty ? mostRecentWordTypes[0] : null;

    if (className.contains('meaning-tags')) {
      final String tagText = child.text;
      mostRecentWordTypes =
          tagText.split(',').map((s) => s.trim().toLowerCase()).toList();
    } else if (currentType == 'other forms') {
      final String formsText = child.text;
      returnValues.otherForms = formsText.split('、').map((form) {
        final parts = form.replaceAll('【', '').replaceAll('】', '').split(' ');
        return KanjiKanaPair(
          kanji: parts[0],
          // A form may be listed without a reading.
          kana: parts.length > 1 ? parts[1] : null,
        );
      }).toList();
    } else if (currentType == 'notes') {
      final String notesText = child.text;
      returnValues.notes = notesText.split('\n');
    } else {
      final String meaning =
          child.querySelector('.meaning-meaning')?.text ?? '';

      // Drop the link inside the abstract before reading its text.
      final abstractElement = child.querySelector('.meaning-abstract');
      abstractElement?.querySelector('a')?.remove();
      final String meaningAbstract = abstractElement?.text ?? '';

      final String supplementalText =
          child.querySelector('.supplemental_info')?.text;
      final supplemental = supplementalText == null
          ? <String>[]
          : supplementalText.split(',').map((s) => s.trim()).toList();

      // Walk backwards so entries can be removed while iterating.
      final seeAlsoTerms = <String>[];
      for (var i = supplemental.length - 1; i >= 0; i -= 1) {
        final supplementalEntry = supplemental[i];
        if (supplementalEntry.startsWith('See also')) {
          seeAlsoTerms.add(supplementalEntry.replaceAll('See also ', ''));
          supplemental.removeAt(i);
        }
      }

      final sentences = <PhraseScrapeSentence>[];
      final sentenceElements =
          child.querySelector('.sentences')?.querySelectorAll('.sentence') ??
              [];
      for (final sentenceElement in sentenceElements) {
        final String english = sentenceElement.querySelector('.english')?.text;
        final pieces = getPieces(sentenceElement);

        // Strip the translation and every reading so that only the
        // Japanese sentence text remains.
        sentenceElement.querySelector('.english')?.remove();
        for (final furigana in sentenceElement.querySelectorAll('.furigana')) {
          furigana.remove();
        }
        final String japanese = sentenceElement.text;

        sentences.add(PhraseScrapeSentence(
          english: english,
          japanese: japanese,
          pieces: pieces,
        ));
      }

      meanings.add(PhraseScrapeMeaning(
        seeAlsoTerms: seeAlsoTerms,
        sentences: sentences,
        definition: meaning,
        supplemental: supplemental,
        definitionAbstract: meaningAbstract,
        tags: mostRecentWordTypes,
      ));
    }
  }

  returnValues.meanings = meanings;
  return returnValues;
}

/// Builds the URI of the word page for [searchTerm].
String uriForPhraseScrape(String searchTerm) {
  return 'https://jisho.org/word/${Uri.encodeComponent(searchTerm)}';
}

/// Parses the HTML of a word page into a [PhrasePageScrapeResult] for [query].
PhrasePageScrapeResult parsePhrasePageData(String pageHtml, String query) {
  final document = parse(pageHtml);
  final result = getMeaningsOtherFormsAndNotes(document);

  result.found = true;
  result.query = query;
  result.uri = uriForPhraseScrape(query);
  result.tags = getTags(document);

  return result;
}

/* PHRASE SCRAPE FUNCTIONS END */

/// Entry points for querying and scraping Jisho.org.
class JishoApi {
  /// Queries the official Jisho API for a word or phrase.
  ///
  /// See
  /// https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api
  /// for discussion about the official API.
  ///
  /// Returns the decoded JSON body of the response. Its format is somewhat
  /// complex and is not documented, so put on your trial-and-error hat.
  ///
  /// NOTE(review): the original read a `data` getter on the decoded value,
  /// which a decoded JSON map does not have and which therefore always threw.
  /// The whole body is returned instead - confirm that callers expect it
  /// rather than only its `data` entry.
  Future<dynamic> searchForPhrase(String phrase) async {
    final uri = uriForPhraseSearch(phrase);
    final response = await http.get(uri);
    return jsonDecode(response.body);
  }

  /// Scrapes the word page for a word/phrase.
  ///
  /// This allows you to get some information that isn't provided by the
  /// official API, such as part-of-speech and JLPT level. However, the
  /// official API should be preferred if it has the information you need.
  /// This function scrapes https://jisho.org/word/XXX. In general, you'll want
  /// to include kanji in your search term, for example 掛かる instead of かかる
  /// (no results).
  ///
  /// Returns a result with `found` set to `false` when the page responds with
  /// HTTP 404. Network errors propagate to the caller.
  Future<PhrasePageScrapeResult> scrapeForPhrase(String phrase) async {
    final uri = uriForPhraseScrape(phrase);
    final response = await http.get(uri);

    // A 404 arrives as a regular response, not as an exception.
    if (response.statusCode == 404) {
      return PhrasePageScrapeResult(
        query: phrase,
        found: false,
      );
    }

    return parsePhrasePageData(response.body, phrase);
  }

  /// Scrapes Jisho.org for information about the kanji character [kanji].
  Future<KanjiResult> searchForKanji(String kanji) async {
    final uri = uriForKanjiSearch(kanji);
    final response = await http.get(uri);
    return parseKanjiPageData(response.body, kanji);
  }

  /// Scrapes Jisho.org for example sentences that contain [phrase].
  Future<ExampleResults> searchForExamples(String phrase) async {
    final uri = uriForExampleSearch(phrase);
    final response = await http.get(uri);
    return parseExamplePageData(response.body, phrase);
  }
}