diff --git a/flake.lock b/flake.lock index ac24e90..9e35b30 100644 --- a/flake.lock +++ b/flake.lock @@ -3,19 +3,19 @@ "jmdict-src": { "flake": false, "locked": { - "narHash": "sha256-+hXdq4aUDuxpCoLJ4zYuKg6zQLHas+/M6+sEDA84RgQ=", + "narHash": "sha256-J7GywcE/5LD6YFhdIXxQ2kvAamYkMpannRQyj5yU/nA=", "type": "file", - "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz" + "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz" }, "original": { "type": "file", - "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz" + "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz" } }, "jmdict-with-examples-src": { "flake": false, "locked": { - "narHash": "sha256-H3a5XoSJUvzTReP5g3jL7JGTIEsGyMrLopV4Yfxewig=", + "narHash": "sha256-cQtnDLqSHP1be6rkK2ceHL3HmX4YnfvUQi/af3uBQDc=", "type": "file", "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict_e_examp.gz" }, @@ -27,7 +27,7 @@ "kanjidic2-src": { "flake": false, "locked": { - "narHash": "sha256-sQS2knH7D/qqrn1BchAvFwjajp9HXv/5r4jx1OEuJWs=", + "narHash": "sha256-ml7ZfAXmjDD1sVKhHMS66ytFItIjXRIppqSaYu8IAws=", "type": "file", "url": "https://www.edrdg.org/kanjidic/kanjidic2.xml.gz" }, @@ -38,11 +38,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1745391562, - "narHash": "sha256-sPwcCYuiEopaafePqlG826tBhctuJsLx/mhKKM5Fmjo=", + "lastModified": 1745526057, + "narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "8a2f738d9d1f1d986b5a4cd2fd2061a7127237d7", + "rev": "f771eb401a46846c1aebd20552521b233dd7e18b", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 97e33dc..924f425 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,8 @@ nixpkgs.url = "nixpkgs/nixos-unstable"; jmdict-src = { - url = "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"; + # url = "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"; + url = "http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz"; flake = false; }; diff --git a/lib/_data_ingestion/jmdict/objects.dart b/lib/_data_ingestion/jmdict/objects.dart index 
97526c0..c01f08e 100644 --- a/lib/_data_ingestion/jmdict/objects.dart +++ b/lib/_data_ingestion/jmdict/objects.dart @@ -15,9 +15,9 @@ abstract class TableNames { static const String senseField = 'JMdict_SenseField'; static const String senseGlossary = 'JMdict_SenseGlossary'; static const String senseInfo = 'JMdict_SenseInfo'; - static const String senseLanguageSource = 'JMdict_SenseLanguageSource'; static const String senseMisc = 'JMdict_SenseMisc'; static const String sensePOS = 'JMdict_SensePOS'; + static const String senseLanguageSource = 'JMdict_SenseLanguageSource'; static const String senseRestrictedToKanji = 'JMdict_SenseRestrictedToKanji'; static const String senseRestrictedToReading = 'JMdict_SenseRestrictedToReading'; @@ -216,7 +216,22 @@ class Sense extends SQLWritable { }); @override - Map get sqlValue => {}; + Map get sqlValue => { + 'id': id, + 'orderNum': orderNum, + }; + + bool get isEmpty => antonyms.isEmpty && + dialects.isEmpty && + fields.isEmpty && + info.isEmpty && + languageSource.isEmpty && + glossary.isEmpty && + misc.isEmpty && + pos.isEmpty && + restrictedToKanji.isEmpty && + restrictedToReading.isEmpty && + seeAlso.isEmpty; } class Entry extends SQLWritable { diff --git a/lib/_data_ingestion/jmdict/parser.dart b/lib/_data_ingestion/jmdict/parser.dart index 937db9b..f94b749 100644 --- a/lib/_data_ingestion/jmdict/parser.dart +++ b/lib/_data_ingestion/jmdict/parser.dart @@ -196,58 +196,65 @@ List parseXML(XmlElement root) { for (final (orderNum, sense) in entry.findElements('sense').indexed) { senseId++; - senses.add( - Sense( - id: senseId, - orderNum: orderNum + 1, - restrictedToKanji: - sense.findElements('stagk').map((e) => e.innerText).toList(), - restrictedToReading: - sense.findElements('stagr').map((e) => e.innerText).toList(), - pos: sense - .findElements('pos') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - misc: sense - .findElements('misc') - .map((e) => e.innerText.substring(1, 
e.innerText.length - 1)) - .toList(), - dialects: sense - .findElements('dial') - .map((e) => e.innerText.substring(1, e.innerText.length - 1)) - .toList(), - info: sense.findElements('s_inf').map((e) => e.innerText).toList(), - languageSource: sense - .findElements('lsource') - .map( - (e) => LanguageSource( - language: e.getAttribute('xml:lang') ?? 'eng', - fullyDescribesSense: e.getAttribute('ls_type') == 'part', - constructedFromSmallerWords: - e.getAttribute('ls_wasei') == 'y', - ), - ) - .toList(), - glossary: sense - .findElements('gloss') - .map( - (e) => Glossary( - language: e.getAttribute('xml:lang') ?? 'eng', - phrase: e.innerText, - type: e.getAttribute('g_type'), - ), - ) - .toList(), - antonyms: sense - .findElements('ant') - .map((e) => parseXrefParts(e.innerText)) - .toList(), - seeAlso: sense - .findElements('xref') - .map((e) => parseXrefParts(e.innerText)) - .toList(), - ), + final result = Sense( + id: senseId, + orderNum: orderNum + 1, + restrictedToKanji: + sense.findElements('stagk').map((e) => e.innerText).toList(), + restrictedToReading: + sense.findElements('stagr').map((e) => e.innerText).toList(), + pos: sense + .findElements('pos') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + misc: sense + .findElements('misc') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + dialects: sense + .findElements('dial') + .map((e) => e.innerText.substring(1, e.innerText.length - 1)) + .toList(), + info: sense.findElements('s_inf').map((e) => e.innerText).toList(), + languageSource: sense + .findElements('lsource') + .map( + (e) => LanguageSource( + language: e.getAttribute('xml:lang') ?? 'eng', + fullyDescribesSense: e.getAttribute('ls_type') == 'part', + constructedFromSmallerWords: e.getAttribute('ls_wasei') == 'y', + ), + ) + .toList(), + glossary: sense + .findElements('gloss') + .map( + (e) => Glossary( + language: e.getAttribute('xml:lang') ?? 
'eng', + phrase: e.innerText, + type: e.getAttribute('g_type'), + ), + ) + .toList(), + antonyms: sense + .findElements('ant') + .map((e) => parseXrefParts(e.innerText)) + .toList(), + seeAlso: sense + .findElements('xref') + .map((e) => parseXrefParts(e.innerText)) + .toList(), ); + + if (result.isEmpty) { + print( + 'WARNING: Sense $senseId for entry $entryId is empty, ' + 'kanji: ${kanjiEls.map((e) => e.reading).join(', ')}, ' + 'reading: ${readingEls.map((e) => e.reading).join(', ')}', + ); + } else { + senses.add(result); + } } entries.add( @@ -264,7 +271,7 @@ List parseXML(XmlElement root) { } Future insertIntoDB(List entries, Database db) async { - print(' [JMdict] Batch 1'); + print(' [JMdict] Batch 1 - Kanji and readings'); Batch b = db.batch(); for (final e in entries) { b.insert(TableNames.entry, e.sqlValue); @@ -326,32 +333,26 @@ Future insertIntoDB(List entries, Database db) async { for (final s in e.senses) { for (final g in s.glossary) { - if (g.language == "eng") - b.insert( - TableNames.entryByEnglish, - { - 'entryId': e.id, - 'english': g.phrase, - }, - // Some entries have the same reading twice with difference in katakana and hiragana - conflictAlgorithm: ConflictAlgorithm.ignore, - ); + b.insert( + TableNames.entryByEnglish, + { + 'entryId': e.id, + 'english': g.phrase, + }, + // Some entries have the same reading twice with difference in katakana and hiragana + conflictAlgorithm: ConflictAlgorithm.ignore, + ); } } } await b.commit(); - print(' [JMdict] Batch 2'); + print(' [JMdict] Batch 2 - Senses'); b = db.batch(); - for (final e in entries) { for (final s in e.senses) { - b.insert( - TableNames.sense, - s.sqlValue - ..addAll({'id': s.id, 'entryId': e.id, 'orderNum': s.orderNum})); - + b.insert(TableNames.sense, s.sqlValue..addAll({'entryId': e.id})); for (final d in s.dialects) { b.insert(TableNames.senseDialect, {'senseId': s.id, 'dialect': d}); } @@ -392,21 +393,17 @@ Future insertIntoDB(List entries, Database db) async { ); } for (final 
g in s.glossary) { - if (g.language == 'eng') - b.insert( - TableNames.senseGlossary, - g.sqlValue..addAll({'senseId': s.id}), - // There are some duplicate glossary, especially in - // the other languages. - conflictAlgorithm: ConflictAlgorithm.ignore, - ); + b.insert( + TableNames.senseGlossary, + g.sqlValue..addAll({'senseId': s.id}), + ); } } } await b.commit(); - print(' [JMdict] Building trees'); + print(' [JMdict] Building xref trees'); SplayTreeMap> entriesByKanji = SplayTreeMap(); for (final entry in entries) { for (final kanji in entry.kanji) { @@ -428,7 +425,7 @@ Future insertIntoDB(List entries, Database db) async { } } - print(' [JMdict] Batch 3'); + print(' [JMdict] Batch 3 - Xrefs'); b = db.batch(); for (final e in entries) { diff --git a/lib/_data_ingestion/sql_writable.dart b/lib/_data_ingestion/sql_writable.dart index 942af34..d6426c5 100644 --- a/lib/_data_ingestion/sql_writable.dart +++ b/lib/_data_ingestion/sql_writable.dart @@ -1,5 +1,11 @@ +/// Interface for objects which are meant to be written to a table in a SQL database. abstract class SQLWritable { const SQLWritable(); + /// Returns a map of the object's properties and their values. + /// + /// Note that there might be properties in the object which is meant to be + /// inserted into a different table. These properties will/should be excluded + /// from this map. 
Map get sqlValue; } diff --git a/lib/models/jmdict/jmdict_dialect.dart b/lib/models/jmdict/jmdict_dialect.dart new file mode 100644 index 0000000..3674cec --- /dev/null +++ b/lib/models/jmdict/jmdict_dialect.dart @@ -0,0 +1,43 @@ +/// Dialect tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_dial +enum JMdictDialect { + brazilian(id: 'bra', description: 'Brazilian'), + hokkaido(id: 'hob', description: 'Hokkaido-ben'), + kansai(id: 'ksb', description: 'Kansai-ben'), + kantou(id: 'ktb', description: 'Kantou-ben'), + kyoto(id: 'kyb', description: 'Kyoto-ben'), + kyushu(id: 'kyu', description: 'Kyuushuu-ben'), + nagano(id: 'nab', description: 'Nagano-ben'), + osaka(id: 'osb', description: 'Osaka-ben'), + ryukyu(id: 'rkb', description: 'Ryuukyuu-ben'), + tokyo(id: 'std', description: 'Tokyo-ben (std)'), + tohoku(id: 'thb', description: 'Touhoku-ben'), + tosa(id: 'tsb', description: 'Tosa-ben'), + tsugaru(id: 'tsug', description: 'Tsugaru-ben'); + + final String id; + final String description; + + const JMdictDialect({ + required this.id, + required this.description, + }); + + static JMdictDialect fromId(String id) => + JMdictDialect.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 'description': description, + }; + + static JMdictDialect fromJson(Map json) => + JMdictDialect.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/jmdict_field.dart b/lib/models/jmdict/jmdict_field.dart new file mode 100644 index 0000000..24a30a9 --- /dev/null +++ b/lib/models/jmdict/jmdict_field.dart @@ -0,0 +1,126 @@ +/// Domain / Field of expertise tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_fld +enum JMdictField { + agriculture(id: 'agric', description: 'agriculture'), + anatomy(id: 'anat', description: 'anatomy'), + archeology(id: 'archeol', description: 
'archeology'), + architecture(id: 'archit', description: 'architecture'), + art(id: 'art', description: 'art, aesthetics'), + astronomy(id: 'astron', description: 'astronomy'), + audiovisual(id: 'audvid', description: 'audiovisual'), + aviation(id: 'aviat', description: 'aviation'), + baseball(id: 'baseb', description: 'baseball'), + biochemistry(id: 'biochem', description: 'biochemistry'), + biology(id: 'biol', description: 'biology'), + botany(id: 'bot', description: 'botany'), + boxing(id: 'boxing', description: 'boxing'), + buddhism(id: 'Buddh', description: 'Buddhism'), + business(id: 'bus', description: 'business'), + cardGames(id: 'cards', description: 'card games'), + chemistry(id: 'chem', description: 'chemistry'), + chineseMythology(id: 'chmyth', description: 'Chinese mythology'), + christianity(id: 'Christn', description: 'Christianity'), + civilEngineering(id: 'civeng', description: 'civil engineering'), + clothing(id: 'cloth', description: 'clothing'), + computing(id: 'comp', description: 'computing'), + crystallography(id: 'cryst', description: 'crystallography'), + dentistry(id: 'dent', description: 'dentistry'), + ecology(id: 'ecol', description: 'ecology'), + economics(id: 'econ', description: 'economics'), + electricEngineering(id: 'elec', description: 'electricity, elec. 
eng.'), + electronics(id: 'electr', description: 'electronics'), + embryology(id: 'embryo', description: 'embryology'), + engineering(id: 'engr', description: 'engineering'), + entomology(id: 'ent', description: 'entomology'), + figureSkating(id: 'figskt', description: 'figure skating'), + film(id: 'film', description: 'film'), + finance(id: 'finc', description: 'finance'), + fishing(id: 'fish', description: 'fishing'), + food(id: 'food', description: 'food, cooking'), + gardening(id: 'gardn', description: 'gardening, horticulture'), + genetics(id: 'genet', description: 'genetics'), + geography(id: 'geogr', description: 'geography'), + geology(id: 'geol', description: 'geology'), + geometry(id: 'geom', description: 'geometry'), + go(id: 'go', description: 'go (game)'), + golf(id: 'golf', description: 'golf'), + grammar(id: 'gramm', description: 'grammar'), + greekMythology(id: 'grmyth', description: 'Greek mythology'), + hanafuda(id: 'hanaf', description: 'hanafuda'), + horseRacing(id: 'horse', description: 'horse racing'), + internet(id: 'internet', description: 'Internet'), + japaneseMythology(id: 'jpmyth', description: 'Japanese mythology'), + kabuki(id: 'kabuki', description: 'kabuki'), + law(id: 'law', description: 'law'), + linguistics(id: 'ling', description: 'linguistics'), + logic(id: 'logic', description: 'logic'), + martialArts(id: 'MA', description: 'martial arts'), + mahjong(id: 'mahj', description: 'mahjong'), + manga(id: 'manga', description: 'manga'), + mathematics(id: 'math', description: 'mathematics'), + mechanicalEngineering(id: 'mech', description: 'mechanical engineering'), + medicine(id: 'med', description: 'medicine'), + meteorology(id: 'met', description: 'meteorology'), + military(id: 'mil', description: 'military'), + mineralogy(id: 'min', description: 'mineralogy'), + mining(id: 'mining', description: 'mining'), + motorsport(id: 'motor', description: 'motorsport'), + music(id: 'music', description: 'music'), + noh(id: 'noh', description: 
'noh (theatre)'), + ornithology(id: 'ornith', description: 'ornithology'), + paleontology(id: 'paleo', description: 'paleontology'), + pathology(id: 'pathol', description: 'pathology'), + pharmacology(id: 'pharm', description: 'pharmacology'), + philosophy(id: 'phil', description: 'philosophy'), + photography(id: 'photo', description: 'photography'), + physics(id: 'physics', description: 'physics'), + physiology(id: 'physiol', description: 'physiology'), + politics(id: 'politics', description: 'politics'), + printing(id: 'print', description: 'printing (press)'), + professionalWrestling(id: 'prowres', description: 'professional wrestling'), + psychiatry(id: 'psy', description: 'psychiatry'), + psychoanalysis(id: 'psyanal', description: 'psychoanalysis'), + psychology(id: 'psych', description: 'psychology'), + railway(id: 'rail', description: 'railway'), + romanMythology(id: 'rommyth', description: 'Roman mythology'), + shinto(id: 'Shinto', description: 'Shinto'), + shogi(id: 'shogi', description: 'shogi (game)'), + skiing(id: 'ski', description: 'skiing'), + sports(id: 'sports', description: 'sports'), + statistics(id: 'stat', description: 'statistics'), + stockMarket(id: 'stockm', description: 'stock market'), + sumo(id: 'sumo', description: 'sumo (wrestling)'), + surgery(id: 'surg', description: 'surgery'), + telecommunications(id: 'telec', description: 'telecommunications'), + trademark(id: 'tradem', description: 'trademark'), + television(id: 'tv', description: 'television'), + veterinaryTerms(id: 'vet', description: 'veterinary terms'), + videoGames(id: 'vidg', description: 'video games'), + zoology(id: 'zool', description: 'zoology'); + + final String id; + final String description; + + const JMdictField({ + required this.id, + required this.description, + }); + + static JMdictField fromId(String id) => + JMdictField.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 
'description': description, + }; + + static JMdictField fromJson(Map json) => + JMdictField.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/jmdict_kanji_info.dart b/lib/models/jmdict/jmdict_kanji_info.dart new file mode 100644 index 0000000..e0f1f78 --- /dev/null +++ b/lib/models/jmdict/jmdict_kanji_info.dart @@ -0,0 +1,37 @@ +/// Kanji info tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_kinf +enum JMdictKanjiInfo { + ateji(id: 'ateji', description: 'ateji (phonetic) reading'), + iK(id: 'iK', description: 'word containing irregular kanji usage'), + ik(id: 'ik', description: 'word containing irregular kana usage'), + io(id: 'io', description: 'irregular okurigana usage'), + oK(id: 'oK', description: 'word containing out-dated kanji or kanji usage'), + rK(id: 'rK', description: 'rarely-used kanji form'), + sK(id: 'sK', description: 'search-only kanji form'); + + final String id; + final String description; + + const JMdictKanjiInfo({ + required this.id, + required this.description, + }); + + static JMdictKanjiInfo fromId(String id) => + JMdictKanjiInfo.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 'description': description, + }; + + static JMdictKanjiInfo fromJson(Map json) => + JMdictKanjiInfo.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/jmdict_misc.dart b/lib/models/jmdict/jmdict_misc.dart new file mode 100644 index 0000000..bf78492 --- /dev/null +++ b/lib/models/jmdict/jmdict_misc.dart @@ -0,0 +1,98 @@ +/// Miscellaneous sense tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_misc +enum JMdictMisc { + abbreviation(id: 'abbr', description: 'abbreviation'), + aphorism(id: 'aphorism', description: 'aphorism (pithy saying)'), + 
archaic(id: 'arch', description: 'archaic'), + character(id: 'char', description: 'character'), + childrensLanguage(id: 'chn', description: 'children\'s language'), + colloquial(id: 'col', description: 'colloquial'), + company(id: 'company', description: 'company name'), + creature(id: 'creat', description: 'creature'), + datedTerm(id: 'dated', description: 'dated term'), + deity(id: 'dei', description: 'deity'), + derogatory(id: 'derog', description: 'derogatory'), + document(id: 'doc', description: 'document'), + euphemistic(id: 'euph', description: 'euphemistic'), + event(id: 'ev', description: 'event'), + familiarLanguage(id: 'fam', description: 'familiar language'), + female(id: 'fem', description: 'female term, language, or name'), + fiction(id: 'fict', description: 'fiction'), + formal(id: 'form', description: 'formal or literary term'), + givenName( + id: 'given', + description: 'given name or forename, gender not specified', + ), + group(id: 'group', description: 'group'), + historical(id: 'hist', description: 'historical term'), + honorific( + id: 'hon', + description: 'honorific or respectful (sonkeigo) language', + ), + humble(id: 'hum', description: 'humble (kenjougo) language'), + idiomatic(id: 'id', description: 'idiomatic expression'), + jocular(id: 'joc', description: 'jocular, humorous term'), + legend(id: 'leg', description: 'legend'), + mangaSlang(id: 'm-sl', description: 'manga slang'), + male(id: 'male', description: 'male term, language, or name'), + mythology(id: 'myth', description: 'mythology'), + internetSlang(id: 'net-sl', description: 'Internet slang'), + object(id: 'obj', description: 'object'), + obsolete(id: 'obs', description: 'obsolete term'), + onomatopoeic(id: 'on-mim', description: 'onomatopoeic or mimetic word'), + organizationName(id: 'organization', description: 'organization name'), + other(id: 'oth', description: 'other'), + personName(id: 'person', description: 'full name of a particular person'), + placeName(id: 'place', 
description: 'place name'), + poetical(id: 'poet', description: 'poetical term'), + polite(id: 'pol', description: 'polite (teineigo) language'), + product(id: 'product', description: 'product name'), + proverb(id: 'proverb', description: 'proverb'), + quotation(id: 'quote', description: 'quotation'), + rare(id: 'rare', description: 'rare term'), + religion(id: 'relig', description: 'religion'), + sensitive(id: 'sens', description: 'sensitive'), + service(id: 'serv', description: 'service'), + ship(id: 'ship', description: 'ship name'), + slang(id: 'sl', description: 'slang'), + railwayStation(id: 'station', description: 'railway station'), + surname(id: 'surname', description: 'family or surname'), + onlyKana(id: 'uk', description: 'word usually written using kana alone'), + unclassifiedName(id: 'unclass', description: 'unclassified name'), + vulgar(id: 'vulg', description: 'vulgar expression or word'), + workOfArt( + id: 'work', + description: 'work of art, literature, music, etc. name', + ), + xRated( + id: 'X', + description: 'rude or X-rated term (not displayed in educational software)', + ), + yojijukugo(id: 'yoji', description: 'yojijukugo'); + + final String id; + final String description; + + const JMdictMisc({ + required this.id, + required this.description, + }); + + static JMdictMisc fromId(String id) => + JMdictMisc.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 'description': description, + }; + + static JMdictMisc fromJson(Map json) => + JMdictMisc.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/jmdict_pos.dart b/lib/models/jmdict/jmdict_pos.dart new file mode 100644 index 0000000..3f02444 --- /dev/null +++ b/lib/models/jmdict/jmdict_pos.dart @@ -0,0 +1,122 @@ +/// Part-of-speech tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_pos +enum 
JMdictPOS { + adjF(id: 'adj-f', description: 'noun or verb acting prenominally'), + adjI(id: 'adj-i', description: 'adjective (keiyoushi)'), + adjIx(id: 'adj-ix', description: 'adjective (keiyoushi) - yoi/ii class'), + adjKari(id: 'adj-kari', description: '\'kari\' adjective (archaic)'), + adjKu(id: 'adj-ku', description: '\'ku\' adjective (archaic)'), + adjNa(id: 'adj-na', description: 'adjectival nouns or quasi-adjectives (keiyodoshi)'), + adjNari(id: 'adj-nari', description: 'archaic/formal form of na-adjective'), + adjNo(id: 'adj-no', description: 'nouns which may take the genitive case particle ''no'''), + adjPn(id: 'adj-pn', description: 'pre-noun adjectival (rentaishi)'), + adjShiku(id: 'adj-shiku', description: '\'shiku\' adjective (archaic)'), + adjT(id: 'adj-t', description: '\'taru\' adjective'), + adv(id: 'adv', description: 'adverb (fukushi)'), + advTo(id: 'adv-to', description: 'adverb taking the ''to'' particle'), + aux(id: 'aux', description: 'auxiliary'), + auxAdj(id: 'aux-adj', description: 'auxiliary adjective'), + auxV(id: 'aux-v', description: 'auxiliary verb'), + conj(id: 'conj', description: 'conjunction'), + cop(id: 'cop', description: 'copula'), + ctr(id: 'ctr', description: 'counter'), + exp(id: 'exp', description: 'expressions (phrases, clauses, etc.)'), + int(id: 'int', description: 'interjection (kandoushi)'), + n(id: 'n', description: 'noun (common) (futsuumeishi)'), + nAdv(id: 'n-adv', description: 'adverbial noun (fukushitekimeishi)'), + nPr(id: 'n-pr', description: 'proper noun'), + nPref(id: 'n-pref', description: 'noun, used as a prefix'), + nSuf(id: 'n-suf', description: 'noun, used as a suffix'), + nT(id: 'n-t', description: 'noun (temporal) (jisoumeishi)'), + num(id: 'num', description: 'numeric'), + pn(id: 'pn', description: 'pronoun'), + pref(id: 'pref', description: 'prefix'), + prt(id: 'prt', description: 'particle'), + suf(id: 'suf', description: 'suffix'), + unc(id: 'unc', description: 'unclassified'), + vUnspec(id: 
'v-unspec', description: 'verb unspecified'), + v1(id: 'v1', description: 'Ichidan verb'), + v1S(id: 'v1-s', description: 'Ichidan verb - kureru special class'), + v2aS(id: 'v2a-s', description: 'Nidan verb with ''u'' ending (archaic)'), + v2bK(id: 'v2b-k', description: 'Nidan verb (upper class) with ''bu'' ending (archaic)'), + v2bS(id: 'v2b-s', description: 'Nidan verb (lower class) with ''bu'' ending (archaic)'), + v2dK(id: 'v2d-k', description: 'Nidan verb (upper class) with ''dzu'' ending (archaic)'), + v2dS(id: 'v2d-s', description: 'Nidan verb (lower class) with ''dzu'' ending (archaic)'), + v2gK(id: 'v2g-k', description: 'Nidan verb (upper class) with ''gu'' ending (archaic)'), + v2gS(id: 'v2g-s', description: 'Nidan verb (lower class) with ''gu'' ending (archaic)'), + v2hK(id: 'v2h-k', description: 'Nidan verb (upper class) with ''hu/fu'' ending (archaic)'), + v2hS(id: 'v2h-s', description: 'Nidan verb (lower class) with ''hu/fu'' ending (archaic)'), + v2kK(id: 'v2k-k', description: 'Nidan verb (upper class) with ''ku'' ending (archaic)'), + v2kS(id: 'v2k-s', description: 'Nidan verb (lower class) with ''ku'' ending (archaic)'), + v2mK(id: 'v2m-k', description: 'Nidan verb (upper class) with ''mu'' ending (archaic)'), + v2mS(id: 'v2m-s', description: 'Nidan verb (lower class) with ''mu'' ending (archaic)'), + v2nS(id: 'v2n-s', description: 'Nidan verb (lower class) with ''nu'' ending (archaic)'), + v2rK(id: 'v2r-k', description: 'Nidan verb (upper class) with ''ru'' ending (archaic)'), + v2rS(id: 'v2r-s', description: 'Nidan verb (lower class) with ''ru'' ending (archaic)'), + v2sS(id: 'v2s-s', description: 'Nidan verb (lower class) with ''su'' ending (archaic)'), + v2tK(id: 'v2t-k', description: 'Nidan verb (upper class) with ''tsu'' ending (archaic)'), + v2tS(id: 'v2t-s', description: 'Nidan verb (lower class) with ''tsu'' ending (archaic)'), + v2wS(id: 'v2w-s', description: 'Nidan verb (lower class) with ''u'' ending and ''we'' conjugation (archaic)'), 
+ v2yK(id: 'v2y-k', description: 'Nidan verb (upper class) with ''yu'' ending (archaic)'), + v2yS(id: 'v2y-s', description: 'Nidan verb (lower class) with ''yu'' ending (archaic)'), + v2zS(id: 'v2z-s', description: 'Nidan verb (lower class) with ''zu'' ending (archaic)'), + v4b(id: 'v4b', description: 'Yodan verb with ''bu'' ending (archaic)'), + v4g(id: 'v4g', description: 'Yodan verb with ''gu'' ending (archaic)'), + v4h(id: 'v4h', description: 'Yodan verb with ''hu/fu'' ending (archaic)'), + v4k(id: 'v4k', description: 'Yodan verb with ''ku'' ending (archaic)'), + v4m(id: 'v4m', description: 'Yodan verb with ''mu'' ending (archaic)'), + v4n(id: 'v4n', description: 'Yodan verb with ''nu'' ending (archaic)'), + v4r(id: 'v4r', description: 'Yodan verb with ''ru'' ending (archaic)'), + v4s(id: 'v4s', description: 'Yodan verb with ''su'' ending (archaic)'), + v4t(id: 'v4t', description: 'Yodan verb with ''tsu'' ending (archaic)'), + v5aru(id: 'v5aru', description: 'Godan verb - -aru special class'), + v5b(id: 'v5b', description: 'Godan verb with ''bu'' ending'), + v5g(id: 'v5g', description: 'Godan verb with ''gu'' ending'), + v5k(id: 'v5k', description: 'Godan verb with ''ku'' ending'), + v5kS(id: 'v5k-s', description: 'Godan verb - Iku/Yuku special class'), + v5m(id: 'v5m', description: 'Godan verb with ''mu'' ending'), + v5n(id: 'v5n', description: 'Godan verb with ''nu'' ending'), + v5r(id: 'v5r', description: 'Godan verb with ''ru'' ending'), + v5rI(id: 'v5r-i', description: 'Godan verb with ''ru'' ending (irregular verb)'), + v5s(id: 'v5s', description: 'Godan verb with ''su'' ending'), + v5t(id: 'v5t', description: 'Godan verb with ''tsu'' ending'), + v5u(id: 'v5u', description: 'Godan verb with ''u'' ending'), + v5uS(id: 'v5u-s', description: 'Godan verb with ''u'' ending (special class)'), + v5uru(id: 'v5uru', description: 'Godan verb - Uru old class verb (old form of Eru)'), + vi(id: 'vi', description: 'intransitive verb'), + vk(id: 'vk', description: 'Kuru 
verb - special class'), + vn(id: 'vn', description: 'irregular nu verb'), + vr(id: 'vr', description: 'irregular ru verb, plain form ends with -ri'), + vs(id: 'vs', description: 'noun or participle which takes the aux. verb suru'), + vsC(id: 'vs-c', description: 'suru verb - precursor to the modern suru'), + vsI(id: 'vs-i', description: 'suru verb - included'), + vsS(id: 'vs-s', description: 'suru verb - special class'), + vt(id: 'vt', description: 'transitive verb'), + vz(id: 'vz', description: 'Ichidan verb - zuru verb (alternative form of -jiru verbs)'); + + final String id; + final String description; + + const JMdictPOS({ + required this.id, + required this.description, + }); + + static JMdictPOS fromId(String id) => + JMdictPOS.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 'description': description, + }; + + static JMdictPOS fromJson(Map json) => + JMdictPOS.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/jmdict_reading_info.dart b/lib/models/jmdict/jmdict_reading_info.dart new file mode 100644 index 0000000..a5e722a --- /dev/null +++ b/lib/models/jmdict/jmdict_reading_info.dart @@ -0,0 +1,39 @@ +/// Reading info tags from JMdict +/// +/// See https://www.edrdg.org/jmwsgi/edhelp.py#kw_rinf +enum JMdictReadingInfo { + gikun( + id: 'gikun', + description: + 'gikun (meaning as reading) or jukujikun (special kanji reading)', + ), + ik(id: 'ik', description: 'word containing irregular kana usage'), + ok(id: 'ok', description: 'out-dated or obsolete kana usage'), + rk(id: 'rk', description: 'rarely used kana form'), + sk(id: 'sk', description: 'search-only kana form'); + + final String id; + final String description; + + const JMdictReadingInfo({ + required this.id, + required this.description, + }); + + static JMdictReadingInfo fromId(String id) => + 
JMdictReadingInfo.values.firstWhere( + (e) => e.id == id, + orElse: () => throw Exception('Unknown id: $id'), + ); + + Map toJson() => { + 'id': id, + 'description': description, + }; + + static JMdictReadingInfo fromJson(Map json) => + JMdictReadingInfo.values.firstWhere( + (e) => e.id == json['id'], + orElse: () => throw Exception('Unknown id: ${json['id']}'), + ); +} diff --git a/lib/models/jmdict/word_search_result.dart b/lib/models/jmdict/word_search_result.dart deleted file mode 100644 index 1cd012b..0000000 --- a/lib/models/jmdict/word_search_result.dart +++ /dev/null @@ -1,5 +0,0 @@ -class WordSearchResult { - // TODO: implement me - - Map toJson() => {}; -} diff --git a/lib/models/kanjidic/kanji_search_radical.dart b/lib/models/kanji_search/kanji_search_radical.dart similarity index 100% rename from lib/models/kanjidic/kanji_search_radical.dart rename to lib/models/kanji_search/kanji_search_radical.dart diff --git a/lib/models/kanjidic/kanji_search_result.dart b/lib/models/kanji_search/kanji_search_result.dart similarity index 100% rename from lib/models/kanjidic/kanji_search_result.dart rename to lib/models/kanji_search/kanji_search_result.dart diff --git a/lib/models/word_search/word_search_result.dart b/lib/models/word_search/word_search_result.dart new file mode 100644 index 0000000..dcc17a0 --- /dev/null +++ b/lib/models/word_search/word_search_result.dart @@ -0,0 +1,64 @@ +import 'package:jadb/models/jmdict/jmdict_kanji_info.dart'; +import 'package:jadb/models/jmdict/jmdict_reading_info.dart'; +import 'package:jadb/models/word_search/word_search_ruby.dart'; +import 'package:jadb/models/word_search/word_search_sense.dart'; +import 'package:jadb/models/word_search/word_search_sources.dart'; + +/// A class representing a single dictionary entry from a word search. +class WordSearchResult { + /// The ID of the entry in the database. + final int entryId; + + /// The variants of the word in Japanese. 
+ final List japanese; + + /// Extra information about the kanji used in the word. + final Map kanjiInfo; + + /// Extra information about the kana used in the word. + final Map readingInfo; + + /// The meanings of the word, including parts of speech and other information. + final List senses; + + /// A class listing the sources used to make up the data for this word search result. + final WordSearchSources sources; + + const WordSearchResult({ + required this.entryId, + required this.japanese, + required this.kanjiInfo, + required this.readingInfo, + required this.senses, + required this.sources, + }); + + Map toJson() => { + 'entryId': entryId, + 'japanese': japanese.map((e) => e.toJson()).toList(), + 'kanjiInfo': + kanjiInfo.map((key, value) => MapEntry(key, value.toJson())), + 'readingInfo': + readingInfo.map((key, value) => MapEntry(key, value.toJson())), + 'senses': senses.map((e) => e.toJson()).toList(), + 'sources': sources.toJson(), + }; + + factory WordSearchResult.fromJson(Map json) => + WordSearchResult( + entryId: json['entryId'] as int, + japanese: (json['japanese'] as List) + .map((e) => WordSearchRuby.fromJson(e)) + .toList(), + kanjiInfo: (json['kanjiInfo'] as Map).map( + (key, value) => MapEntry(key, JMdictKanjiInfo.fromJson(value)), + ), + readingInfo: (json['readingInfo'] as Map).map( + (key, value) => MapEntry(key, JMdictReadingInfo.fromJson(value)), + ), + senses: (json['senses'] as List) + .map((e) => WordSearchSense.fromJson(e)) + .toList(), + sources: WordSearchSources.fromJson(json['sources']), + ); +} diff --git a/lib/models/word_search/word_search_ruby.dart b/lib/models/word_search/word_search_ruby.dart new file mode 100644 index 0000000..ba594f6 --- /dev/null +++ b/lib/models/word_search/word_search_ruby.dart @@ -0,0 +1,23 @@ +/// A pair of base and optional furigana. +class WordSearchRuby { + /// Base part. Could be a kanji or a reading. + String base; + + /// Furigana, if applicable. + String? 
furigana; + + WordSearchRuby({ + required this.base, + this.furigana, + }); + + Map toJson() => { + 'base': base, + 'furigana': furigana, + }; + + factory WordSearchRuby.fromJson(Map json) => WordSearchRuby( + base: json['base'] as String, + furigana: json['furigana'] as String?, + ); +} diff --git a/lib/models/word_search/word_search_sense.dart b/lib/models/word_search/word_search_sense.dart new file mode 100644 index 0000000..275d3da --- /dev/null +++ b/lib/models/word_search/word_search_sense.dart @@ -0,0 +1,111 @@ +import 'package:jadb/models/jmdict/jmdict_dialect.dart'; +import 'package:jadb/models/jmdict/jmdict_field.dart'; +import 'package:jadb/models/jmdict/jmdict_misc.dart'; +import 'package:jadb/models/jmdict/jmdict_pos.dart'; +import 'package:jadb/models/word_search/word_search_xref_entry.dart'; + +class WordSearchSense { + /// The meaning(s) of the word. + final List englishDefinitions; + + /// Type of word (Noun, Verb, etc.). + final List partsOfSpeech; + + /// Relevant words (might include synonyms). + final List seeAlso; + + /// Words with opposite meaning. + final List antonyms; + + /// Restrictions on which of the readings of the parent entry this sense applies to. + final List restrictedToReading; + + /// Restrictions on which of the kanji of the parent entry this sense applies to. + final List restrictedToKanji; + + /// Tags for which domains or fields of expertise this sense is relevant to. + final List fields; + + /// Tags for which dialects this sense is used in. + final List dialects; + + /// Tags for miscellaneous information. + final List misc; + + /// Extra information about the sense. + final List info; + + // TODO: there is a lot more info to collect in the languageSource data + + /// Information about the origin of the word, if loaned from another language.
+ final List languageSource; + + // TODO: add example sentences + + const WordSearchSense({ + required this.englishDefinitions, + required this.partsOfSpeech, + required this.seeAlso, + required this.antonyms, + required this.restrictedToReading, + required this.restrictedToKanji, + required this.fields, + required this.dialects, + required this.misc, + required this.info, + required this.languageSource, + }); + + bool get isEmpty => + englishDefinitions.isEmpty && + partsOfSpeech.isEmpty && + seeAlso.isEmpty && + antonyms.isEmpty && + restrictedToReading.isEmpty && + restrictedToKanji.isEmpty && + fields.isEmpty && + dialects.isEmpty && + misc.isEmpty && + info.isEmpty && + languageSource.isEmpty; + + Map toJson() => { + 'englishDefinitions': englishDefinitions, + 'partsOfSpeech': partsOfSpeech.map((e) => e.toJson()).toList(), + 'seeAlso': seeAlso.map((e) => e.toJson()).toList(), + 'antonyms': antonyms.map((e) => e.toJson()).toList(), + 'restrictedToReading': restrictedToReading, + 'restrictedToKanji': restrictedToKanji, + 'fields': fields.map((e) => e.toJson()).toList(), + 'dialects': dialects.map((e) => e.toJson()).toList(), + 'misc': misc.map((e) => e.toJson()).toList(), + 'info': info, + 'languageSource': languageSource, + }; + + factory WordSearchSense.fromJson(Map json) => + WordSearchSense( + englishDefinitions: List.from(json['englishDefinitions']), + partsOfSpeech: (json['partsOfSpeech'] as List) + .map((e) => JMdictPOS.fromJson(e)) + .toList(), + seeAlso: (json['seeAlso'] as List) + .map((e) => WordSearchXrefEntry.fromJson(e)) + .toList(), + antonyms: (json['antonyms'] as List) + .map((e) => WordSearchXrefEntry.fromJson(e)) + .toList(), + restrictedToReading: List.from(json['restrictedToReading']), + restrictedToKanji: List.from(json['restrictedToKanji']), + fields: (json['fields'] as List) + .map((e) => JMdictField.fromJson(e)) + .toList(), + dialects: (json['dialects'] as List) + .map((e) => JMdictDialect.fromJson(e)) + .toList(), + misc: + 
(json['misc'] as List).map((e) => JMdictMisc.fromJson(e)).toList(), + info: List.from(json['info']), + languageSource: List.from(json['languageSource']), + ); +} diff --git a/lib/models/word_search/word_search_sources.dart b/lib/models/word_search/word_search_sources.dart new file mode 100644 index 0000000..74bb648 --- /dev/null +++ b/lib/models/word_search/word_search_sources.dart @@ -0,0 +1,30 @@ +/// A class representing which sources were used to make up the data for +/// a word search result. +class WordSearchSources { + /// Whether JMdict was used. + final bool jmdict; + + /// Whether JMnedict was used. + final bool jmnedict; + + const WordSearchSources({ + this.jmdict = true, + this.jmnedict = false, + }); + + Map get sqlValue => { + 'jmdict': jmdict, + 'jmnedict': jmnedict, + }; + + Map toJson() => { + 'jmdict': jmdict, + 'jmnedict': jmnedict, + }; + + factory WordSearchSources.fromJson(Map json) => + WordSearchSources( + jmdict: json['jmdict'] as bool? ?? true, + jmnedict: json['jmnedict'] as bool? ?? false, + ); +} diff --git a/lib/models/word_search/word_search_xref_entry.dart b/lib/models/word_search/word_search_xref_entry.dart new file mode 100644 index 0000000..669a83a --- /dev/null +++ b/lib/models/word_search/word_search_xref_entry.dart @@ -0,0 +1,25 @@ +/// A cross-reference entry from one word-result to another entry. +class WordSearchXrefEntry { + /// The ID of the entry that this entry cross-references to. + final int entryId; + + /// Whether the entryId was ambiguous during the creation of the + /// database (and hence might be incorrect). 
+ final bool ambiguous; + + const WordSearchXrefEntry({ + required this.entryId, + required this.ambiguous, + }); + + Map toJson() => { + 'entryId': entryId, + 'ambiguous': ambiguous, + }; + + factory WordSearchXrefEntry.fromJson(Map json) => + WordSearchXrefEntry( + entryId: json['entryId'] as int, + ambiguous: json['ambiguous'] as bool, + ); +} diff --git a/lib/search.dart b/lib/search.dart index 8f88f56..9ab1d85 100644 --- a/lib/search.dart +++ b/lib/search.dart @@ -1,9 +1,9 @@ -import 'package:jadb/models/jmdict/word_search_result.dart'; -import 'package:jadb/models/kanjidic/kanji_search_result.dart'; +import 'package:jadb/models/word_search/word_search_result.dart'; +import 'package:jadb/models/kanji_search/kanji_search_result.dart'; import 'package:jadb/models/radkfile/radicals_search_result.dart'; -import 'package:jadb/search/jmdict.dart'; +import 'package:jadb/search/word_search.dart'; -import 'package:jadb/search/kanji.dart'; +import 'package:jadb/search/kanji_search.dart'; import 'package:sqflite_common/sqlite_api.dart'; diff --git a/lib/search/jmdict.dart b/lib/search/jmdict.dart deleted file mode 100644 index 135ef14..0000000 --- a/lib/search/jmdict.dart +++ /dev/null @@ -1,215 +0,0 @@ -import 'package:jadb/models/jmdict/word_search_result.dart'; -import 'package:sqflite_common/sqlite_api.dart'; - -// TODO: Support globs - -// TODO: Support tags - -// TODO: Prefer original kana type when sorting results - -// TODO: Support mixing kana and romaji - -Future?> searchWordWithDbConnection( - DatabaseExecutor connection, - String word, { - bool isKana = true, -}) async { - if (word.isEmpty) { - return null; - } - - late final List matches; - if (isKana) { - matches = (await connection.query( - 'JMdict_EntryByKana', - where: 'kana LIKE ?', - whereArgs: ['%$word%'], - )) - .map((row) => row['entryId'] as int) - .toList(); - } else { - matches = (await connection.query( - 'JMdict_EntryByEnglish', - where: 'english LIKE ?', - whereArgs: ['%$word%'], - )) - 
.map((row) => row['entryId'] as int) - .toList(); - } - - if (matches.isEmpty) { - return []; - } - - late final List senseIds; - final Future> senseIds_query = connection - .query( - 'JMdict_Sense', - where: 'entryId IN (${matches.join(',')})', - ) - .then((rows) => rows.map((row) => row['id'] as int).toList()); - - late final List> readingElements; - final Future>> readingElements_query = - connection.query( - 'JMdict_ReadingElement', - where: 'entryId IN (${matches.join(',')})', - ); - - late final List> kanjiElements; - final Future>> kanjiElements_query = - connection.query( - 'JMdict_KanjiElement', - where: 'entryId IN (${matches.join(',')})', - ); - - await Future.wait([ - senseIds_query.then((value) => senseIds = value), - readingElements_query.then((value) => readingElements = value), - kanjiElements_query.then((value) => kanjiElements = value), - ]); - - print(senseIds); - print(readingElements); - print(kanjiElements); - - // Sense queries - - late final List> senseAntonyms; - final Future>> senseAntonyms_query = - connection.query( - 'JMdict_SenseAntonym', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseDialects; - final Future>> senseDialects_query = - connection.query( - 'JMdict_SenseDialect', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseFields; - final Future>> senseFields_query = connection.query( - 'JMdict_SenseField', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseGlossaries; - final Future>> senseGlossaries_query = - connection.query( - 'JMdict_SenseGlossary', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseInfos; - final Future>> senseInfos_query = connection.query( - 'JMdict_SenseInfo', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseLanguageSources; - final Future>> senseLanguageSources_query = - connection.query( - 'JMdict_SenseLanguageSource', - where: 'entryId IN (${senseIds.join(',')})', 
- ); - - late final List> senseMiscs; - final Future>> senseMiscs_query = connection.query( - 'JMdict_SenseMisc', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> sensePOSs; - final Future>> sensePOSs_query = connection.query( - 'JMdict_SensePOS', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseRestrictedToKanjis; - final Future>> senseRestrictedToKanjis_query = - connection.query( - 'JMdict_SenseRestrictedToKanji', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseRestrictedToReadings; - final Future>> senseRestrictedToReadings_query = - connection.query( - 'JMdict_SenseRestrictedToReading', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> senseSeeAlsos; - final Future>> senseSeeAlsos_query = - connection.query( - 'JMdict_SenseSeeAlso', - where: 'entryId IN (${senseIds.join(',')})', - ); - - late final List> exampleSentences; - final Future>> exampleSentences_query = - connection.query( - 'JMdict_ExampleSentence', - where: 'entryId IN (${senseIds.join(',')})', - ); - - // Reading queries - - final readingIds = readingElements - .map((element) => - (element['entryId'] as int, element['reading'] as String)) - .toList(); - - late final List> readingElementInfos; - final Future>> readingElementInfos_query = - connection.query( - 'JMdict_ReadingElementInfo', - where: 'entryId IN (${readingIds.join(',')})', - ); - - late final List> readingElementRestrictions; - final Future>> readingElementRestrictions_query = - connection.query( - 'JMdict_ReadingElementRestriction', - where: 'entryId IN (${readingIds.join(',')})', - ); - - // Kanji queries - - final kanjiIds = kanjiElements - .map((element) => - (element['entryId'] as int, element['reading'] as String)) - .toList(); - - late final List> kanjiElementInfos; - final Future>> kanjiElementInfos_query = - connection.query( - 'JMdict_KanjiElementInfo', - where: 'entryId IN (${kanjiIds.join(',')})', - ); - - await 
Future.wait([ - senseAntonyms_query.then((value) => senseAntonyms = value), - senseDialects_query.then((value) => senseDialects = value), - senseFields_query.then((value) => senseFields = value), - senseGlossaries_query.then((value) => senseGlossaries = value), - senseInfos_query.then((value) => senseInfos = value), - senseLanguageSources_query.then((value) => senseLanguageSources = value), - senseMiscs_query.then((value) => senseMiscs = value), - sensePOSs_query.then((value) => sensePOSs = value), - senseRestrictedToKanjis_query - .then((value) => senseRestrictedToKanjis = value), - senseRestrictedToReadings_query - .then((value) => senseRestrictedToReadings = value), - senseSeeAlsos_query.then((value) => senseSeeAlsos = value), - exampleSentences_query.then((value) => exampleSentences = value), - readingElementInfos_query.then((value) => readingElementInfos = value), - readingElementRestrictions_query - .then((value) => readingElementRestrictions = value), - kanjiElementInfos_query.then((value) => kanjiElementInfos = value), - ]); - - throw UnimplementedError(); -} diff --git a/lib/search/kanji.dart b/lib/search/kanji_search.dart similarity index 98% rename from lib/search/kanji.dart rename to lib/search/kanji_search.dart index 7466b69..7ae8804 100644 --- a/lib/search/kanji.dart +++ b/lib/search/kanji_search.dart @@ -1,4 +1,4 @@ -import 'package:jadb/models/kanjidic/kanji_search_result.dart'; +import 'package:jadb/models/kanji_search/kanji_search_result.dart'; import 'package:sqflite_common/sqflite.dart'; Future searchKanjiWithDbConnection( diff --git a/lib/search/word_search.dart b/lib/search/word_search.dart new file mode 100644 index 0000000..65e9854 --- /dev/null +++ b/lib/search/word_search.dart @@ -0,0 +1,506 @@ +import 'package:collection/collection.dart'; +import 'package:jadb/models/jmdict/jmdict_dialect.dart'; +import 'package:jadb/models/jmdict/jmdict_field.dart'; +import 'package:jadb/models/jmdict/jmdict_kanji_info.dart'; +import 
'package:jadb/models/jmdict/jmdict_misc.dart'; +import 'package:jadb/models/jmdict/jmdict_pos.dart'; +import 'package:jadb/models/jmdict/jmdict_reading_info.dart'; +import 'package:jadb/models/word_search/word_search_result.dart'; +import 'package:jadb/models/word_search/word_search_ruby.dart'; +import 'package:jadb/models/word_search/word_search_sense.dart'; +import 'package:jadb/models/word_search/word_search_sources.dart'; +import 'package:jadb/models/word_search/word_search_xref_entry.dart'; +import 'package:sqflite_common/sqlite_api.dart'; + +// TODO: Support globs + +// TODO: Support tags + +// TODO: Prefer original kana type when sorting results + +// TODO: Support mixing kana and romaji + +String _escapeStringValue(String value) { + return "'" + value.replaceAll("'", "''") + "'"; +} + +Future?> searchWordWithDbConnection( + DatabaseExecutor connection, + String word, { + bool isKana = true, +}) async { + if (word.isEmpty) { + return null; + } + + late final List entryIds; + if (isKana) { + entryIds = (await connection.query( + 'JMdict_EntryByKana', + where: 'kana LIKE ?', + whereArgs: ['$word%'], + )) + .map((row) => row['entryId'] as int) + .toList(); + } else { + entryIds = (await connection.query( + 'JMdict_EntryByEnglish', + where: 'english LIKE ?', + whereArgs: ['$word%'], + )) + .map((row) => row['entryId'] as int) + .toList(); + } + + if (entryIds.isEmpty) { + return []; + } + + late final List> senses; + final Future>> senses_query = connection.query( + 'JMdict_Sense', + where: 'entryId IN (${entryIds.join(',')})', + ); + + late final List> readingElements; + final Future>> readingElements_query = + connection.query( + 'JMdict_ReadingElement', + where: 'entryId IN (${entryIds.join(',')})', + ); + + late final List> kanjiElements; + final Future>> kanjiElements_query = + connection.query( + 'JMdict_KanjiElement', + where: 'entryId IN (${entryIds.join(',')})', + ); + + await Future.wait([ + senses_query.then((value) => senses = value), + 
readingElements_query.then((value) => readingElements = value), + kanjiElements_query.then((value) => kanjiElements = value), + ]); + + // Sense queries + + final senseIds = senses.map((element) => element['id'] as int).toList(); + + late final List> senseAntonyms; + final Future>> senseAntonyms_query = + connection.query( + 'JMdict_SenseAntonym', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseDialects; + final Future>> senseDialects_query = + connection.query( + 'JMdict_SenseDialect', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseFields; + final Future>> senseFields_query = connection.query( + 'JMdict_SenseField', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseGlossaries; + final Future>> senseGlossaries_query = + connection.query( + 'JMdict_SenseGlossary', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseInfos; + final Future>> senseInfos_query = connection.query( + 'JMdict_SenseInfo', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseLanguageSources; + final Future>> senseLanguageSources_query = + connection.query( + 'JMdict_SenseLanguageSource', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseMiscs; + final Future>> senseMiscs_query = connection.query( + 'JMdict_SenseMisc', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> sensePOSs; + final Future>> sensePOSs_query = connection.query( + 'JMdict_SensePOS', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseRestrictedToKanjis; + final Future>> senseRestrictedToKanjis_query = + connection.query( + 'JMdict_SenseRestrictedToKanji', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> senseRestrictedToReadings; + final Future>> senseRestrictedToReadings_query = + connection.query( + 'JMdict_SenseRestrictedToReading', + where: 'senseId IN (${senseIds.join(',')})', + ); + 
+ late final List> senseSeeAlsos; + final Future>> senseSeeAlsos_query = + connection.query( + 'JMdict_SenseSeeAlso', + where: 'senseId IN (${senseIds.join(',')})', + ); + + late final List> exampleSentences; + final Future>> exampleSentences_query = + connection.query( + 'JMdict_ExampleSentence', + where: 'senseId IN (${senseIds.join(',')})', + ); + + // Reading queries + + final readingIds = readingElements + .map((element) => ( + element['entryId'] as int, + _escapeStringValue(element['reading'] as String) + )) + .toList(); + + late final List> readingElementInfos; + final Future>> readingElementInfos_query = + connection.query( + 'JMdict_ReadingElementInfo', + where: '(entryId, reading) IN (${readingIds.join(',')})', + ); + + late final List> readingElementRestrictions; + final Future>> readingElementRestrictions_query = + connection.query( + 'JMdict_ReadingElementRestriction', + where: '(entryId, reading) IN (${readingIds.join(',')})', + ); + + // Kanji queries + + final kanjiIds = kanjiElements + .map((element) => ( + element['entryId'] as int, + _escapeStringValue(element['reading'] as String) + )) + .toList(); + + late final List> kanjiElementInfos; + final Future>> kanjiElementInfos_query = + connection.query( + 'JMdict_KanjiElementInfo', + where: '(entryId, reading) IN (${kanjiIds.join(',')})', + ); + + await Future.wait([ + senseAntonyms_query.then((value) => senseAntonyms = value), + senseDialects_query.then((value) => senseDialects = value), + senseFields_query.then((value) => senseFields = value), + senseGlossaries_query.then((value) => senseGlossaries = value), + senseInfos_query.then((value) => senseInfos = value), + senseLanguageSources_query.then((value) => senseLanguageSources = value), + senseMiscs_query.then((value) => senseMiscs = value), + sensePOSs_query.then((value) => sensePOSs = value), + senseRestrictedToKanjis_query + .then((value) => senseRestrictedToKanjis = value), + senseRestrictedToReadings_query + .then((value) => 
senseRestrictedToReadings = value), + senseSeeAlsos_query.then((value) => senseSeeAlsos = value), + exampleSentences_query.then((value) => exampleSentences = value), + readingElementInfos_query.then((value) => readingElementInfos = value), + readingElementRestrictions_query + .then((value) => readingElementRestrictions = value), + kanjiElementInfos_query.then((value) => kanjiElementInfos = value), + ]); + + return _regroupWordSearchResults( + entryIds: entryIds, + readingElements: readingElements, + kanjiElements: kanjiElements, + senses: senses, + senseAntonyms: senseAntonyms, + senseDialects: senseDialects, + senseFields: senseFields, + senseGlossaries: senseGlossaries, + senseInfos: senseInfos, + senseLanguageSources: senseLanguageSources, + senseMiscs: senseMiscs, + sensePOSs: sensePOSs, + senseRestrictedToKanjis: senseRestrictedToKanjis, + senseRestrictedToReadings: senseRestrictedToReadings, + senseSeeAlsos: senseSeeAlsos, + exampleSentences: exampleSentences, + readingElementInfos: readingElementInfos, + readingElementRestrictions: readingElementRestrictions, + kanjiElementInfos: kanjiElementInfos, + ); +} + +List _regroupWordSearchResults({ + required List entryIds, + required List> readingElements, + required List> kanjiElements, + required List> senses, + required List> senseAntonyms, + required List> senseDialects, + required List> senseFields, + required List> senseGlossaries, + required List> senseInfos, + required List> senseLanguageSources, + required List> senseMiscs, + required List> sensePOSs, + required List> senseRestrictedToKanjis, + required List> senseRestrictedToReadings, + required List> senseSeeAlsos, + required List> exampleSentences, + required List> readingElementInfos, + required List> readingElementRestrictions, + required List> kanjiElementInfos, +}) { + final List results = []; + + for (final entryId in entryIds) { + final List> entryReadingElements = readingElements + .where((element) => element['entryId'] == entryId) + .toList(); 
+ + final List> entryKanjiElements = kanjiElements + .where((element) => element['entryId'] == entryId) + .toList(); + + final List> entrySenses = + senses.where((element) => element['entryId'] == entryId).toList(); + + final GroupedWordResult entryReadingElementsGrouped = _regroup_words( + entryId: entryId, + readingElements: entryReadingElements, + kanjiElements: entryKanjiElements, + readingElementInfos: readingElementInfos, + readingElementRestrictions: readingElementRestrictions, + kanjiElementInfos: kanjiElementInfos, + ); + + final List entrySensesGrouped = _regroup_senses( + senses: entrySenses, + senseAntonyms: senseAntonyms, + senseDialects: senseDialects, + senseFields: senseFields, + senseGlossaries: senseGlossaries, + senseInfos: senseInfos, + senseLanguageSources: senseLanguageSources, + senseMiscs: senseMiscs, + sensePOSs: sensePOSs, + senseRestrictedToKanjis: senseRestrictedToKanjis, + senseRestrictedToReadings: senseRestrictedToReadings, + senseSeeAlsos: senseSeeAlsos, + exampleSentences: exampleSentences, + ); + + results.add( + WordSearchResult( + entryId: entryId, + japanese: entryReadingElementsGrouped.rubys, + kanjiInfo: entryReadingElementsGrouped.kanjiInfos, + readingInfo: entryReadingElementsGrouped.readingInfos, + senses: entrySensesGrouped, + sources: const WordSearchSources( + jmdict: true, + jmnedict: false, + ), + ), + ); + } + + return results; +} + +class GroupedWordResult { + final List rubys; + final Map readingInfos; + final Map kanjiInfos; + + const GroupedWordResult({ + required this.rubys, + required this.readingInfos, + required this.kanjiInfos, + }); +} + +GroupedWordResult _regroup_words({ + required int entryId, + required List> kanjiElements, + required List> kanjiElementInfos, + required List> readingElements, + required List> readingElementInfos, + required List> readingElementRestrictions, +}) { + final List result = []; + + final kanjiElements_ = + kanjiElements.where((element) => element['entryId'] == 
entryId).toList(); + + final readingElements_ = readingElements + .where((element) => element['entryId'] == entryId) + .toList(); + + final readingElementRestrictions_ = readingElementRestrictions + .where((element) => element['entryId'] == entryId) + .toList(); + + for (final readingElement in readingElements_) { + for (final kanjiElement in kanjiElements_) { + final kanji = kanjiElement['reading'] as String; + final reading = readingElement['reading'] as String; + + final doesNotMatchKanji = readingElement['doesNotMatchKanji'] == 1; + if (doesNotMatchKanji) { + continue; + } + + final restrictions = readingElementRestrictions_ + .where((element) => element['reading'] == reading) + .toList(); + + if (restrictions.isNotEmpty && + !restrictions.any((element) => element['restriction'] == kanji)) { + continue; + } + + final ruby = WordSearchRuby( + base: kanji, + furigana: reading, + ); + result.add(ruby); + } + } + + for (final readingElement + in readingElements_.where((e) => e['doesNotMatchKanji'] == 1)) { + final reading = readingElement['reading'] as String; + final ruby = WordSearchRuby( + base: reading, + ); + result.add(ruby); + } + + return GroupedWordResult( + rubys: result, + readingInfos: Map.fromEntries( + readingElementInfos.map((e) => MapEntry( + e['reading'] as String, + JMdictReadingInfo.fromId(e['info'] as String), + )), + ), + kanjiInfos: Map.fromEntries( + kanjiElementInfos.map((e) => MapEntry( + e['reading'] as String, + JMdictKanjiInfo.fromId(e['info'] as String), + )), + ), + ); +} + +List _regroup_senses({ + required List> senses, + required List> senseAntonyms, + required List> senseDialects, + required List> senseFields, + required List> senseGlossaries, + required List> senseInfos, + required List> senseLanguageSources, + required List> senseMiscs, + required List> sensePOSs, + required List> senseRestrictedToKanjis, + required List> senseRestrictedToReadings, + required List> senseSeeAlsos, + required List> exampleSentences, +}) { + final 
groupedSenseAntonyms = + senseAntonyms.groupListsBy((element) => element['senseId'] as int); + final groupedSenseDialects = + senseDialects.groupListsBy((element) => element['senseId'] as int); + final groupedSenseFields = + senseFields.groupListsBy((element) => element['senseId'] as int); + final groupedSenseGlossaries = + senseGlossaries.groupListsBy((element) => element['senseId'] as int); + final groupedSenseInfos = + senseInfos.groupListsBy((element) => element['senseId'] as int); + final groupedSenseLanguageSources = + senseLanguageSources.groupListsBy((element) => element['senseId'] as int); + final groupedSenseMiscs = + senseMiscs.groupListsBy((element) => element['senseId'] as int); + final groupedSensePOSs = + sensePOSs.groupListsBy((element) => element['senseId'] as int); + final groupedSenseRestrictedToKanjis = senseRestrictedToKanjis + .groupListsBy((element) => element['senseId'] as int); + final groupedSenseRestrictedToReadings = senseRestrictedToReadings + .groupListsBy((element) => element['senseId'] as int); + final groupedSenseSeeAlsos = + senseSeeAlsos.groupListsBy((element) => element['senseId'] as int); + + final List result = []; + for (final sense in senses) { + final int senseId = sense['id'] as int; + + final antonyms = groupedSenseAntonyms[senseId] ?? []; + final dialects = groupedSenseDialects[senseId] ?? []; + final fields = groupedSenseFields[senseId] ?? []; + final glossaries = groupedSenseGlossaries[senseId] ?? []; + final infos = groupedSenseInfos[senseId] ?? []; + final languageSources = groupedSenseLanguageSources[senseId] ?? []; + final miscs = groupedSenseMiscs[senseId] ?? []; + final pos = groupedSensePOSs[senseId] ?? []; + final restrictedToKanjis = groupedSenseRestrictedToKanjis[senseId] ?? []; + final restrictedToReadings = + groupedSenseRestrictedToReadings[senseId] ?? []; + final seeAlsos = groupedSenseSeeAlsos[senseId] ?? 
[]; + + final resultSense = WordSearchSense( + englishDefinitions: glossaries.map((e) => e['phrase'] as String).toList(), + partsOfSpeech: + pos.map((e) => JMdictPOS.fromId(e['pos'] as String)).toList(), + seeAlso: seeAlsos + .map((e) => WordSearchXrefEntry( + entryId: e['xrefEntryId'] as int, + ambiguous: e['ambiguous'] == 1, + )) + .toList(), + antonyms: antonyms + .map((e) => WordSearchXrefEntry( + entryId: e['xrefEntryId'] as int, + ambiguous: e['ambiguous'] == 1, + )) + .toList(), + restrictedToReading: + restrictedToReadings.map((e) => e['reading'] as String).toList(), + restrictedToKanji: + restrictedToKanjis.map((e) => e['kanji'] as String).toList(), + fields: + fields.map((e) => JMdictField.fromId(e['field'] as String)).toList(), + dialects: dialects + .map((e) => JMdictDialect.fromId(e['dialect'] as String)) + .toList(), + misc: miscs.map((e) => JMdictMisc.fromId(e['misc'] as String)).toList(), + info: infos.map((e) => e['info'] as String).toList(), + languageSource: + languageSources.map((e) => e['language'] as String).toList(), + ); + + result.add(resultSense); + } + + return result; +} diff --git a/migrations/0002_JMDict_insert_constants.sql b/migrations/0002_JMDict_insert_constants.sql index db0f3ac..76fda80 100644 --- a/migrations/0002_JMDict_insert_constants.sql +++ b/migrations/0002_JMDict_insert_constants.sql @@ -274,18 +274,22 @@ INSERT INTO "JMdict_InfoPOS"("id", "description") VALUES ('vt', 'transitive verb'), ('vz', 'Ichidan verb - zuru verb (alternative form of -jiru verbs)'); +-- NOTE: several of these are not used directly in the XML export, but are implicitly +-- expressed by whether the reading is marked as kun/on, by nanori tags, etc.
+ INSERT INTO "JMdict_InfoReading"("id", "description") VALUES ('gikun', 'gikun (meaning as reading) or jukujikun (special kanji reading)'), - ('go', 'on-yomi, go'), + -- ('go', 'on-yomi, go'), ('ik', 'word containing irregular kana usage'), - ('jouyou', 'approved reading for jouyou kanji'), - ('kan', 'on-yomi, kan'), - ('kanyou', 'on-yomi, kan''you'), - ('kun', 'kun-yomi'), - ('name', 'reading used only in names (nanori)'), + -- ('jouyou', 'approved reading for jouyou kanji'), + -- ('kan', 'on-yomi, kan'), + -- ('kanyou', 'on-yomi, kan''you'), + -- ('kun', 'kun-yomi'), + -- ('name', 'reading used only in names (nanori)'), ('ok', 'out-dated or obsolete kana usage'), - ('on', 'on-yomi'), - ('rad', 'reading used as name of radical'), + -- ('on', 'on-yomi'), + -- ('rad', 'reading used as name of radical'), ('rk', 'rarely used kana form'), - ('sk', 'search-only kana form'), - ('tou', 'on-yomi, tou'); + ('sk', 'search-only kana form') + -- ('tou', 'on-yomi, tou') + ;