/*
* This module encapsulates the official Jisho.org API
* and also provides kanji and example search features that scrape Jisho.org.
* Permission to scrape granted by Jisho's admin Kimtaro:
* http://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api
*/
const axios = require('axios').create({ timeout: 10000 });
const cheerio = require('cheerio');
const escapeStringRegexp = require('escape-string-regexp');
const { XmlEntities } = require('html-entities');
// Endpoint of the official Jisho.org word search API; the search term is appended as `?keyword=`.
const JISHO_API = 'http://jisho.org/api/v1/search/words';
// Prefix shared by the scraped kanji and example sentence search pages.
const SCRAPE_BASE_URI = 'http://jisho.org/search/';
// Prefix of the stroke order diagram images; the file name is built from the kanji's character code.
const STROKE_ORDER_DIAGRAM_BASE_URI = 'http://classic.jisho.org/static/images/stroke_diagrams/';
// Shared decoder used to turn entities in scraped text back into plain characters.
const htmlEntities = new XmlEntities();
/* KANJI SEARCH FUNCTIONS START */
// Labels used to build the markers ("<label>:" and "<label> reading compounds") that
// locate the on'yomi and kun'yomi sections in the scraped kanji page.
const ONYOMI_LOCATOR_SYMBOL = 'On';
const KUNYOMI_LOCATOR_SYMBOL = 'Kun';
/**
 * Strip every carriage return and line feed from a string, then trim the result.
 * @param {string} str The text to clean up.
 * @returns {string} The text without line breaks and without outer whitespace.
 */
function removeNewlines(str) {
  const withoutLineBreaks = str.replace(/(?:\r|\n)/g, '');
  return withoutLineBreaks.trim();
}
/**
 * Build the URI of the Jisho.org kanji search page for a kanji.
 * @param {string} kanji The kanji to look up.
 * @returns {string} The scrape base URI followed by the URI-encoded kanji and `%23kanji`.
 */
function uriForKanjiSearch(kanji) {
  const encodedKanji = encodeURIComponent(kanji);
  return `${SCRAPE_BASE_URI}${encodedKanji}%23kanji`;
}
/**
 * Build the URI of the stroke order diagram image for a kanji.
 * The image file name is the decimal code of the kanji's first character.
 * @param {string} kanji The kanji to build the diagram URI for.
 * @returns {string} The diagram URI.
 */
function getUriForStrokeOrderDiagram(kanji) {
  // codePointAt reads the whole code point, so kanji stored as a surrogate pair
  // are not reduced to the code of their high surrogate (as charCodeAt would do).
  const codePoint = kanji.codePointAt(0);
  return `${STROKE_ORDER_DIAGRAM_BASE_URI}${codePoint}_frames.png`;
}
/**
 * Build the URI that queries the official Jisho.org API for a phrase.
 * @param {string} phrase The search term.
 * @returns {string} The API endpoint with the URI-encoded phrase as its `keyword` parameter.
 */
function uriForPhraseSearch(phrase) {
  const keyword = encodeURIComponent(phrase);
  return `${JISHO_API}?keyword=${keyword}`;
}
/**
 * Tell whether the scraped page contains the glyph token for the given kanji.
 * The token is the kanji preceded and followed by a line break.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} kanji The kanji that was searched for.
 * @returns {boolean} True when the token occurs anywhere in the page.
 */
function containsKanjiGlyph(pageHtml, kanji) {
  const glyphMarker = `
${kanji}
`;
  return pageHtml.includes(glyphMarker);
}
/**
 * Cut the text between two indices out of a string and clean it up.
 * @param {string} data The text to cut from.
 * @param {number} startIndex Index of the first character to include.
 * @param {number} endIndex Index just past the last character to include.
 * @returns {string} The substring with line breaks removed and outer whitespace trimmed.
 */
function getStringBetweenIndicies(data, startIndex, endIndex) {
  return removeNewlines(data.substring(startIndex, endIndex)).trim();
}
/**
 * Find the shortest text that sits between the first occurrence of a start marker and
 * the end marker following it. Both markers are matched literally and the text in
 * between may span several lines.
 * @param {string} data The text to search in.
 * @param {string} startString The literal marker that precedes the wanted text.
 * @param {string} endString The literal marker that follows the wanted text.
 * @returns {string|undefined} The text between the markers, or undefined when there is no match.
 */
function getStringBetweenStrings(data, startString, endString) {
  const startPattern = escapeStringRegexp(startString);
  const endPattern = escapeStringRegexp(endString);
  const betweenRegex = new RegExp(`${startPattern}(.*?)${endPattern}`, 's');
  const found = data.match(betweenRegex);
  if (!found) {
    return undefined;
  }
  return found[1];
}
/**
 * Find the text between two markers and parse it as a base-10 integer.
 * @param {string} pageHtml The text to search in.
 * @param {string} startString The marker that precedes the number.
 * @param {string} endString The marker that follows the number.
 * @returns {number|undefined} The parsed integer, or undefined when nothing (or an
 * empty string) was found between the markers.
 */
function getIntBetweenStrings(pageHtml, startString, endString) {
  const digits = getStringBetweenStrings(pageHtml, startString, endString);
  return digits ? parseInt(digits, 10) : undefined;
}
/**
 * Collect the first capture group of every match of a regular expression.
 * The expression is executed repeatedly until it stops matching, so it has to carry
 * the global flag to advance through the string.
 * @param {string} str The text to search in.
 * @param {RegExp} regex A global regular expression with at least one capture group.
 * @returns {Array.<string>} The first capture group of each match, in match order.
 */
function getAllGlobalGroupMatches(str, regex) {
  const captures = [];
  for (let match = regex.exec(str); match; match = regex.exec(str)) {
    captures.push(match[1]);
  }
  return captures;
}
/**
 * Split a chunk of HTML into the pieces of text that each end at a closing anchor tag.
 * @param {string} str The HTML chunk to split.
 * @returns {Array.<string>} The text found in front of each closing anchor tag.
 */
function parseAnchorsToArray(str) {
  const anchorRegex = /(.*?)<\/a>/g;
  const anchorTexts = getAllGlobalGroupMatches(str, anchorRegex);
  return anchorTexts;
}
/**
 * Extract the readings listed after a "<label>:" marker in the kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} yomiLocatorSymbol The label of the reading section to look for.
 * @returns {Array.<string>} The readings found in the section; empty when the section is missing.
 */
function getYomi(pageHtml, yomiLocatorSymbol) {
  const sectionStart = `${yomiLocatorSymbol}:`;
  // NOTE(review): the end marker is an empty string here, so the lazy capture is
  // always empty - confirm the intended closing marker.
  const section = getStringBetweenStrings(pageHtml, sectionStart, '');
  return parseAnchorsToArray(section || '');
}
/**
 * Extract the kun'yomi readings from the kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The kun'yomi readings.
 */
function getKunyomi(pageHtml) {
  const kunyomi = getYomi(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
  return kunyomi;
}
/**
 * Extract the on'yomi readings from the kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The on'yomi readings.
 */
function getOnyomi(pageHtml) {
  const onyomi = getYomi(pageHtml, ONYOMI_LOCATOR_SYMBOL);
  return onyomi;
}
/**
 * Extract the example compounds listed under "<label> reading compounds".
 * Each list item is expected to hold three lines: the example, its reading wrapped in
 * 【】 brackets, and its meaning.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} yomiLocatorSymbol The label of the reading section to look for.
 * @returns {Array.<YomiExample>} The parsed examples; empty when the section is missing or empty.
 */
function getYomiExamples(pageHtml, yomiLocatorSymbol) {
  const sectionHeading = `${yomiLocatorSymbol} reading compounds
`;
  // NOTE(review): the end marker is an empty string here, so the captured section is
  // always empty - confirm the intended closing marker.
  const section = getStringBetweenStrings(pageHtml, sectionHeading, '');
  if (!section) {
    return [];
  }
  const listItemRegex = /(.*?)<\/li>/gs;
  return getAllGlobalGroupMatches(section, listItemRegex).map((itemHtml) => {
    const [example, reading, meaning] = itemHtml.trim().split('\n').map(line => line.trim());
    return {
      example,
      reading: reading.replace('【', '').replace('】', ''),
      meaning: htmlEntities.decode(meaning),
    };
  });
}
/**
 * Extract the on'yomi example compounds from the kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<YomiExample>} The on'yomi examples.
 */
function getOnyomiExamples(pageHtml) {
  const onyomiExamples = getYomiExamples(pageHtml, ONYOMI_LOCATOR_SYMBOL);
  return onyomiExamples;
}
/**
 * Extract the kun'yomi example compounds from the kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<YomiExample>} The kun'yomi examples.
 */
function getKunyomiExamples(pageHtml) {
  const kunyomiExamples = getYomiExamples(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
  return kunyomiExamples;
}
/**
 * Extract the radical of a kanji from the scraped page.
 * The radical's meaning is located first; the radical symbol (optionally followed by a
 * parenthesised, comma separated list of alternative forms) is read from the text that
 * follows the meaning.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {{symbol: string, forms: (Array.<string>|undefined), meaning: string}|undefined}
 * The radical information (`forms` is only present when the symbol text is longer than
 * one character), or undefined when no radical meaning could be located.
 */
function getRadical(pageHtml) {
  const radicalMeaningStartString = '';
  const radicalMeaningEndString = '';
  const rawRadicalMeaning = getStringBetweenStrings(
    pageHtml,
    radicalMeaningStartString,
    radicalMeaningEndString,
  );
  // getStringBetweenStrings yields undefined when the markers are missing; trimming that
  // value directly would throw instead of reporting "no radical".
  const radicalMeaning = rawRadicalMeaning ? rawRadicalMeaning.trim() : '';
  if (!radicalMeaning) {
    return undefined;
  }

  // The symbol text starts right after the end marker of the meaning.
  const radicalMeaningStartIndex = pageHtml.indexOf(radicalMeaningStartString);
  const radicalMeaningEndIndex = pageHtml.indexOf(
    radicalMeaningEndString,
    radicalMeaningStartIndex,
  );
  const radicalSymbolStartIndex = radicalMeaningEndIndex + radicalMeaningEndString.length;
  const radicalSymbolEndString = '';
  const radicalSymbolEndIndex = pageHtml.indexOf(radicalSymbolEndString, radicalSymbolStartIndex);
  const radicalSymbolsString = getStringBetweenIndicies(
    pageHtml,
    radicalSymbolStartIndex,
    radicalSymbolEndIndex,
  );

  if (radicalSymbolsString.length > 1) {
    // First character is the radical itself; the remainder looks like "(form, form)".
    const radicalForms = radicalSymbolsString
      .substring(1)
      .replace('(', '')
      .replace(')', '')
      .trim()
      .split(', ');
    return { symbol: radicalSymbolsString[0], forms: radicalForms, meaning: radicalMeaning };
  }
  return { symbol: radicalSymbolsString, meaning: radicalMeaning };
}
/**
 * Extract the parts of a kanji listed after the "Parts:" marker in the page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The parts, sorted with the default string ordering.
 */
function getParts(pageHtml) {
  const partsSection = getStringBetweenStrings(pageHtml, 'Parts:', '');
  const parts = parseAnchorsToArray(partsSection);
  parts.sort();
  return parts;
}
/**
 * Find the protocol-relative cloudfront.net URI of the stroke order SVG in the page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {string|undefined} The SVG URI prefixed with `http:`, or undefined when the
 * page contains no such URI.
 */
function getSvgUri(pageHtml) {
  // The dots are escaped so that they only match literal dots; unescaped they would
  // accept any character in place of the "." in ".cloudfront.net" and ".svg".
  const svgRegex = /\/\/.*?\.cloudfront\.net\/.*?\.svg/;
  const regexResult = svgRegex.exec(pageHtml);
  return regexResult ? `http:${regexResult[0]}` : undefined;
}
/**
 * Build the URI of the animated stroke order gif for a kanji.
 * The file name is the hexadecimal code point of the kanji's first character.
 * @param {string} kanji The kanji to build the gif URI for.
 * @returns {string} The gif URI.
 */
function getGifUri(kanji) {
  const hexCodePoint = kanji.codePointAt(0).toString(16);
  return `https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/${hexCodePoint}.gif`;
}
/**
 * Extract the newspaper frequency rank of a kanji from the scraped page.
 * The frequency section is located first, then the rank is read from inside it.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {string|undefined} The rank as the text captured from the page (it is not
 * converted to a number), or undefined when the section or the rank is not found.
 */
function getNewspaperFrequencyRank(pageHtml) {
// NOTE(review): the end marker below spans a raw line break inside a single-quoted
// literal - confirm the intended delimiters.
const frequencySection = getStringBetweenStrings(pageHtml, '', '
');
return frequencySection ? getStringBetweenStrings(frequencySection, '', '') : undefined;
}
/**
 * Build the kanji search result from a scraped kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} kanji The kanji that was searched for.
 * @returns {KanjiResult} An object holding only `query` and `found: false` when the
 * page does not contain the kanji glyph token; otherwise the full set of scraped fields.
 */
function parseKanjiPageData(pageHtml, kanji) {
const result = {};
result.query = kanji;
result.found = containsKanjiGlyph(pageHtml, kanji);
// Nothing else is scraped when the page does not show the kanji.
if (!result.found) {
return result;
}
result.taughtIn = getStringBetweenStrings(pageHtml, 'taught in ', '');
result.jlptLevel = getStringBetweenStrings(pageHtml, 'JLPT level ', '');
result.newspaperFrequencyRank = getNewspaperFrequencyRank(pageHtml);
result.strokeCount = getIntBetweenStrings(pageHtml, '', ' strokes');
// NOTE(review): removeNewlines receives undefined when the meaning markers are not
// found, which would throw - confirm the markers are always present on a found page.
result.meaning = htmlEntities.decode(removeNewlines(getStringBetweenStrings(pageHtml, '', '
')).trim());
result.kunyomi = getKunyomi(pageHtml);
result.onyomi = getOnyomi(pageHtml);
result.onyomiExamples = getOnyomiExamples(pageHtml);
result.kunyomiExamples = getKunyomiExamples(pageHtml);
result.radical = getRadical(pageHtml);
result.parts = getParts(pageHtml);
// These three URIs are derived from the kanji or the page rather than read as fields.
result.strokeOrderDiagramUri = getUriForStrokeOrderDiagram(kanji);
result.strokeOrderSvgUri = getSvgUri(pageHtml);
result.strokeOrderGifUri = getGifUri(kanji);
result.uri = uriForKanjiSearch(kanji);
return result;
}
/* KANJI SEARCH FUNCTIONS END */
/* EXAMPLE SEARCH FUNCTIONS START */
// Matches characters in the ranges U+4E00-U+9FAF and U+3400-U+4DBF; getKanjiAndKana
// treats any character matching it as kanji.
const kanjiRegex = /[\u4e00-\u9faf\u3400-\u4dbf]/g;
/**
 * Build the URI of the Jisho.org example sentence search page for a phrase.
 * @param {string} phrase The word or phrase to look up.
 * @returns {string} The scrape base URI followed by the URI-encoded phrase and `%23sentences`.
 */
function uriForExampleSearch(phrase) {
  const encodedPhrase = encodeURIComponent(phrase);
  return `${SCRAPE_BASE_URI}${encodedPhrase}%23sentences`;
}
/**
 * Assemble the kanji and kana renderings of an example sentence.
 * The child nodes of the first `ul` inside the element are walked in order: `li` nodes
 * contribute their `.unlinked` text to the kanji rendering and either their furigana or
 * (when there is none) the same `.unlinked` text to the kana rendering; any other node
 * contributes its trimmed text to both.
 * @param {Object} div The cheerio selection of one example sentence.
 * @returns {{kanji: string, kana: string}} The two renderings of the sentence.
 */
function getKanjiAndKana(div) {
  const nodes = div.find('ul').eq(0).contents();
  let kanji = '';
  let kana = '';
  for (let i = 0; i < nodes.length; i += 1) {
    const node = nodes.eq(i);
    if (node[0].name !== 'li') {
      const text = node.text().trim();
      if (text) {
        kanji += text;
        kana += text;
      }
      continue;
    }
    const furigana = node.find('.furigana').text();
    const unlifted = node.find('.unlinked').text();
    kanji += unlifted;
    if (!furigana) {
      kana += unlifted;
      continue;
    }
    kana += furigana;
    // The furigana only covers the kanji, so the run of non-kanji characters at the end
    // of the unlifted text is appended as well. The first character is never inspected.
    let tailStart = unlifted.length;
    while (tailStart > 1 && !unlifted[tailStart - 1].match(kanjiRegex)) {
      tailStart -= 1;
    }
    kana += unlifted.slice(tailStart);
  }
  return { kanji, kana };
}
/**
 * Split a sentence element into its lifted/unlifted pairs.
 * @param {Object} sentenceElement The cheerio selection of one sentence.
 * @returns {Array.<ExampleSentencePiece>} One entry per `li.clearfix` element, holding
 * the text of its `.furigana` child (lifted) and of its `.unlinked` child (unlifted).
 */
function getPieces(sentenceElement) {
  const pieceElements = sentenceElement.find('li.clearfix');
  return Array.from({ length: pieceElements.length }, (_, pieceIndex) => {
    const pieceElement = pieceElements.eq(pieceIndex);
    const lifted = pieceElement.children('.furigana').text();
    const unlifted = pieceElement.children('.unlinked').text();
    return { lifted, unlifted };
  });
}
/**
 * Turn one example sentence element into its result entry.
 * @param {Object} div The cheerio selection of one example sentence.
 * @returns {ExampleResultData} The English text, the kanji and kana renderings and the
 * lifted/unlifted pieces of the sentence.
 */
function parseExampleDiv(div) {
  const english = div.find('.english').text();
  const sentence = getKanjiAndKana(div);
  const pieces = getPieces(div);
  return {
    english,
    kanji: sentence.kanji,
    kana: sentence.kana,
    pieces,
  };
}
/**
 * Build the example search result from a scraped example sentence page.
 * @param {string} pageHtml The HTML of the example search page.
 * @param {string} phrase The phrase that was searched for.
 * @returns {ExampleResults} The parsed examples (one per `.sentence_content` element)
 * together with the query, the scraped URI and a `found` flag that is true when at
 * least one example was parsed. The phrase is also repeated under the `phrase` key.
 */
function parseExamplePageData(pageHtml, phrase) {
  const $ = cheerio.load(pageHtml);
  const sentenceDivs = $('.sentence_content');
  const results = Array.from(
    { length: sentenceDivs.length },
    (_, i) => parseExampleDiv(sentenceDivs.eq(i)),
  );
  const found = results.length > 0;
  return {
    query: phrase,
    found,
    results,
    uri: uriForExampleSearch(phrase),
    phrase,
  };
}
/* EXAMPLE SEARCH FUNCTIONS END */
/* PHRASE SCRAPE FUNCTIONS START */
/**
 * Collect the text of every `.concept_light-tag` element of a word page.
 * @param {Function} $ The cheerio instance loaded with the word page.
 * @returns {Array.<string>} The tag texts in document order.
 */
function getTags($) {
  const tagElements = $('.concept_light-tag');
  return Array.from(
    { length: tagElements.length },
    (_, i) => tagElements.eq(i).text(),
  );
}
/**
 * Scrape the meanings, other forms and notes from a word page.
 * The children of the meanings wrapper are walked in order. A `meaning-tags` child sets
 * the word types that apply to the children following it; depending on the first of
 * those types, a following child is parsed as the "other forms" list, as the "notes",
 * or as a regular meaning entry.
 * @param {Function} $ The cheerio instance loaded with the word page.
 * @returns {{meanings: Array.<PhraseScrapeMeaning>, otherForms: Array.<Object>, notes: Array.<string>}}
 * The scraped data; `otherForms` and `notes` stay empty arrays when the page has none.
 */
function getMeaningsOtherFormsAndNotes($) {
const returnValues = { otherForms: [], notes: [] };
const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div');
const meaningsChildren = meaningsWrapper.children();
const meanings = [];
// Word types announced by the most recent `meaning-tags` child.
let mostRecentWordTypes = [];
for (let meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) {
const child = meaningsChildren.eq(meaningIndex);
if (child.hasClass('meaning-tags')) {
mostRecentWordTypes = child.text().split(',').map(s => s.trim().toLowerCase());
} else if (mostRecentWordTypes[0] === 'other forms') {
// Entries are separated by '、'; each one is split on a space into kanji and kana
// after its 【】 brackets are removed.
returnValues.otherForms = child.text().split('、')
.map(s => s.replace('【', '').replace('】', '').split(' '))
.map(a => ({ kanji: a[0], kana: a[1] }));
} else if (mostRecentWordTypes[0] === 'notes') {
returnValues.notes = child.text().split('\n');
} else {
const meaning = child.find('.meaning-meaning').text();
// The anchors inside the abstract are dropped before its text is read.
const meaningAbstract = child.find('.meaning-abstract')
.find('a')
.remove()
.end()
.text();
const supplemental = child.find('.supplemental_info').text().split(',')
.map(s => s.trim())
.filter(s => s);
const seeAlsoTerms = [];
// Iterate backwards so that splicing an entry out does not skip the next one.
// As a consequence the "see also" terms are collected in reverse order.
for (let i = supplemental.length - 1; i >= 0; i -= 1) {
const supplementalEntry = supplemental[i];
if (supplementalEntry.startsWith('See also')) {
seeAlsoTerms.push(supplementalEntry.replace('See also ', ''));
supplemental.splice(i, 1);
}
}
const sentences = [];
const sentenceElements = child.find('.sentences').children('.sentence');
for (let sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) {
const sentenceElement = sentenceElements.eq(sentenceIndex);
// The English text and the pieces must be read before the `.english` and
// `.furigana` elements are removed below.
const english = sentenceElement.find('.english').text();
const pieces = getPieces(sentenceElement);
// With the English text and the furigana removed, the remaining text is the
// Japanese sentence.
const japanese = sentenceElement
.find('.english').remove().end()
.find('.furigana')
.remove()
.end()
.text();
sentences.push({ english, japanese, pieces });
}
meanings.push({
seeAlsoTerms,
sentences,
definition: meaning,
supplemental,
definitionAbstract: meaningAbstract,
tags: mostRecentWordTypes,
});
}
}
returnValues.meanings = meanings;
return returnValues;
}
/**
 * Build the URI of the Jisho.org word page for a search term.
 * @param {string} searchTerm The word or phrase to look up.
 * @returns {string} The word page URI with the URI-encoded search term.
 */
function uriForPhraseScrape(searchTerm) {
  const encodedTerm = encodeURIComponent(searchTerm);
  return `https://jisho.org/word/${encodedTerm}`;
}
/**
 * Build the phrase scrape result from a scraped word page.
 * @param {string} pageHtml The HTML of the word page.
 * @param {string} query The term that was searched for.
 * @returns {PhrasePageScrapeResult} The scraped tags, meanings, other forms and notes
 * together with the query and the scraped URI; `found` is always true.
 */
function parsePhrasePageData(pageHtml, query) {
  const $ = cheerio.load(pageHtml);
  const scraped = getMeaningsOtherFormsAndNotes($);
  return {
    found: true,
    query,
    uri: uriForPhraseScrape(query),
    tags: getTags($),
    meanings: scraped.meanings,
    otherForms: scraped.otherForms,
    notes: scraped.notes,
  };
}
/* PHRASE SCRAPE FUNCTIONS END */
/**
* @typedef {Object} PhraseScrapeSentence
* @property {string} english The English meaning of the sentence.
* @property {string} japanese The Japanese text of the sentence.
* @property {Array.<ExampleSentencePiece>} pieces The lifted/unlifted pairs
* that make up the sentence. Lifted text is furigana, unlifted is the text below the furigana.
*/
/**
* @typedef {Object} PhraseScrapeMeaning
* @property {Array.<string>} seeAlsoTerms The words that Jisho lists as "see also".
* @property {Array.<PhraseScrapeSentence>} sentences Example sentences for this meaning.
* @property {string} definition The definition.
* @property {Array.<string>} supplemental Supplemental information.
* For example "usually written using kana alone".
* @property {string} definitionAbstract An "abstract" definition.
* Often this is a Wikipedia definition.
* @property {Array.<string>} tags Tags associated with this meaning.
*/
/**
* @typedef {Object} PhrasePageScrapeResult
* @property {boolean} found True if a result was found.
* @property {string} query The term that you searched for.
* @property {string} [uri] The URI that these results were scraped from, if a result was found.
* @property {Array.<{kanji: string, kana: string}>} [otherForms] Other forms of the search term,
* if a result was found.
* @property {Array.<PhraseScrapeMeaning>} [meanings] Information about the meanings associated
* with the result.
* @property {Array.<string>} [tags] Tags associated with this search result.
* @property {Array.<string>} [notes] Notes associated with the search result.
*/
/**
* @typedef {Object} YomiExample
* @property {string} example The original text of the example.
* @property {string} reading The reading of the example.
* @property {string} meaning The meaning of the example.
*/
/**
* @typedef {Object} KanjiResult
* @property {boolean} found True if results were found.
* @property {string} query The term that you searched for.
* @property {string} [taughtIn] The school level that the kanji is taught in, if applicable.
* @property {string} [jlptLevel] The lowest JLPT exam that this kanji is likely to
* appear in, if applicable. 'N5' or 'N4' or 'N3' or 'N2' or 'N1'.
* @property {string} [newspaperFrequencyRank] This kanji's frequency rank in newspapers,
* as the text scraped from the page, if applicable.
* @property {number} [strokeCount] How many strokes this kanji is typically drawn in,
* if applicable.
* @property {string} [meaning] The meaning of the kanji, if applicable.
* @property {Array.<string>} [kunyomi] This character's kunyomi, if applicable.
* @property {Array.<YomiExample>} [kunyomiExamples] Examples of this character's kunyomi
* being used, if applicable.
* @property {Array.<string>} [onyomi] This character's onyomi, if applicable.
* @property {Array.<YomiExample>} [onyomiExamples] Examples of this character's onyomi
* being used, if applicable.
* @property {Object} [radical] Information about this character's radical, if applicable.
* @property {string} [radical.symbol] The radical symbol, if applicable.
* @property {Array.<string>} [radical.forms] The radical forms used in this kanji, if applicable.
* @property {string} [radical.meaning] The meaning of the radical, if applicable.
* @property {Array.<string>} [parts] The parts used in this kanji, if applicable.
* @property {string} [strokeOrderDiagramUri] The URL to a diagram showing how to draw this kanji
* step by step, if applicable.
* @property {string} [strokeOrderSvgUri] The URL to an SVG describing how to draw this kanji,
* if applicable.
* @property {string} [strokeOrderGifUri] The URL to a gif showing the kanji being drawn and its
* stroke order, if applicable.
* @property {string} [uri] The URI that these results were scraped from, if applicable.
*/
/**
* @typedef {Object} ExampleSentencePiece
* @property {string} unlifted Baseline text shown on Jisho.org (below the lifted text / furigana)
* @property {string} lifted Furigana text shown on Jisho.org (above the unlifted text)
*/
/**
* @typedef {Object} ExampleResultData
* @property {string} kanji The example sentence including kanji.
* @property {string} kana The example sentence without kanji (only kana). Sometimes this may
* include some kanji, as furigana is not always available from Jisho.org.
* @property {string} english An English translation of the example.
* @property {Array.<ExampleSentencePiece>} pieces The lifted/unlifted pairs
* that make up the sentence. Lifted text is furigana, unlifted is the text below the furigana.
*/
/**
* @typedef {Object} ExampleResults
* @property {string} query The term that you searched for.
* @property {boolean} found True if results were found.
* @property {string} uri The URI that these results were scraped from.
* @property {Array.<ExampleResultData>} results The examples that were found, if any.
*/
/**
* A wrapper around the Jisho search functions.
*/
class API {
  /**
   * Query the official Jisho API for a word or phrase. See
   * [here]{@link https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api}
   * for discussion about the official API.
   * @param {string} phrase The search term to search for.
   * @returns {Object} The response data from the official Jisho.org API. Its format is somewhat
   *   complex and is not documented, so put on your trial-and-error hat.
   * @async
   */
  searchForPhrase(phrase) {
    const uri = uriForPhraseSearch(phrase);
    return axios.get(uri).then(response => response.data);
  }

  /**
   * Scrape the word page for a word/phrase. This allows you to
   * get some information that isn't provided by the official API, such as
   * part-of-speech and JLPT level. However, the official API should be preferred
   * if it has the information you need. This function scrapes https://jisho.org/word/XXX.
   * In general, you'll want to include kanji in your search term, for example 掛かる
   * instead of かかる (no results).
   * @param {string} phrase The search term to search for.
   * @returns {PhrasePageScrapeResult} Information about the searched query. When the word
   *   page responds with HTTP 404, only `query` and `found: false` are returned.
   * @throws Any other error (non-404 response, timeout, network or parse failure) is
   *   rethrown unchanged.
   * @async
   */
  async scrapeForPhrase(phrase) {
    const uri = uriForPhraseScrape(phrase);
    try {
      const response = await axios.get(uri);
      return parsePhrasePageData(response.data, phrase);
    } catch (err) {
      // Errors without a response (timeouts, network failures, parse errors) carry no
      // `response` object; check for it first so they are rethrown as-is instead of
      // being masked by a TypeError.
      if (err.response && err.response.status === 404) {
        return {
          query: phrase,
          found: false,
        };
      }
      throw err;
    }
  }

  /**
   * Scrape Jisho.org for information about a kanji character.
   * @param {string} kanji The kanji to search for.
   * @returns {KanjiResult} Information about the searched kanji.
   * @async
   */
  searchForKanji(kanji) {
    const uri = uriForKanjiSearch(kanji);
    return axios.get(uri).then(response => parseKanjiPageData(response.data, kanji));
  }

  /**
   * Scrape Jisho.org for examples.
   * @param {string} phrase The word or phrase to search for.
   * @returns {ExampleResults} The examples found for the phrase.
   * @async
   */
  searchForExamples(phrase) {
    const uri = uriForExampleSearch(phrase);
    return axios.get(uri).then(response => parseExamplePageData(response.data, phrase));
  }
}
// Expose the URI builders and the page parsers as instance methods, so that callers can
// build URIs or parse HTML they fetched themselves.
Object.assign(API.prototype, {
  getUriForKanjiSearch: uriForKanjiSearch,
  getUriForExampleSearch: uriForExampleSearch,
  getUriForPhraseSearch: uriForPhraseSearch,
  getUriForPhraseScrape: uriForPhraseScrape,
  parseExamplePageHtml: parseExamplePageData,
  parseKanjiPageHtml: parseKanjiPageData,
  parsePhraseScrapeHtml: parsePhrasePageData,
});

module.exports = API;