From ec14016ab581d59ef93ed9680952b5c6057003b5 Mon Sep 17 00:00:00 2001
From: h7x4
Date: Mon, 26 May 2025 17:23:49 +0200
Subject: [PATCH] lib/util/lemmatizer: init

---
 bin/jadb.dart                                |   2 +
 lib/cli/commands/lemmatize.dart              |  54 ++
 lib/util/lemmatizer/lemmatizer.dart          | 255 ++++++++++
 lib/util/lemmatizer/rules.dart               |  11 +
 lib/util/lemmatizer/rules/godan-verbs.dart   | 461 +++++++++++++++++++
 lib/util/lemmatizer/rules/i-adjectives.dart  |  62 +++
 lib/util/lemmatizer/rules/ichidan-verbs.dart | 244 ++++++++++
 7 files changed, 1089 insertions(+)
 create mode 100644 lib/cli/commands/lemmatize.dart
 create mode 100644 lib/util/lemmatizer/lemmatizer.dart
 create mode 100644 lib/util/lemmatizer/rules.dart
 create mode 100644 lib/util/lemmatizer/rules/godan-verbs.dart
 create mode 100644 lib/util/lemmatizer/rules/i-adjectives.dart
 create mode 100644 lib/util/lemmatizer/rules/ichidan-verbs.dart

diff --git a/bin/jadb.dart b/bin/jadb.dart
index 2d6dcc0..8c2d8e5 100644
--- a/bin/jadb.dart
+++ b/bin/jadb.dart
@@ -2,6 +2,7 @@ import 'package:args/command_runner.dart';
 
 import 'package:jadb/cli/commands/create_db.dart';
 import 'package:jadb/cli/commands/create_tanos_jlpt_mappings.dart';
+import 'package:jadb/cli/commands/lemmatize.dart';
 import 'package:jadb/cli/commands/query_kanji.dart';
 import 'package:jadb/cli/commands/query_word.dart';
 
@@ -14,6 +15,7 @@ Future<void> main(List<String> args) async {
   runner.addCommand(CreateDb());
   runner.addCommand(QueryKanji());
   runner.addCommand(QueryWord());
+  runner.addCommand(Lemmatize());
   runner.addCommand(CreateTanosJlptMappings());
 
   runner.run(args);
diff --git a/lib/cli/commands/lemmatize.dart b/lib/cli/commands/lemmatize.dart
new file mode 100644
index 0000000..8586e4e
--- /dev/null
+++ b/lib/cli/commands/lemmatize.dart
@@ -0,0 +1,54 @@
+// import 'dart:io';
+
+// import 'package:jadb/_data_ingestion/open_local_db.dart';
+import 'package:jadb/cli/args.dart';
+
+import 'package:args/command_runner.dart';
+import 'package:jadb/util/lemmatizer/lemmatizer.dart';
+
+/// CLI command that prints the lemmatization tree for a single word.
+class Lemmatize extends Command {
+  @override
+  final name = "lemmatize";
+
+  @override
+  final description = "Lemmatize a word using the Jadb lemmatizer";
+
+  Lemmatize() {
+    addLibsqliteArg(argParser);
+    addJadbArg(argParser);
+    argParser.addOption(
+      'word',
+      abbr: 'w',
+      help: 'The word to search for.',
+      valueHelp: 'WORD',
+    );
+  }
+
+  /// Lemmatizes the `--word` option (a built-in sample word when omitted),
+  /// then prints the resulting tree and the elapsed time.
+  @override
+  Future<void> run() async {
+    // Database access is currently disabled; only the rule engine runs.
+    // if (argResults!.option('libsqlite') == null ||
+    //     argResults!.option('jadb') == null) {
+    //   print(argParser.usage);
+    //   exit(64);
+    // }
+
+    // final db = await openLocalDb(
+    //   jadbPath: argResults!.option('jadb')!,
+    //   libsqlitePath: argResults!.option('libsqlite')!,
+    // );
+
+    final String searchWord = argResults!.option('word') ?? '食べたくない';
+
+    final time = Stopwatch()..start();
+    final result = lemmatize(searchWord);
+    time.stop();
+
+    print(result.toString());
+
+    print("Lemmatization took ${time.elapsedMilliseconds}ms");
+  }
+}
diff --git a/lib/util/lemmatizer/lemmatizer.dart b/lib/util/lemmatizer/lemmatizer.dart
new file mode 100644
index 0000000..52c7629
--- /dev/null
+++ b/lib/util/lemmatizer/lemmatizer.dart
@@ -0,0 +1,255 @@
+import 'package:jadb/util/lemmatizer/rules.dart';
+
+/// Grammatical category that a [LemmatizationRule] belongs to.
+///
+/// [input] is only used by the synthetic root rule created in [lemmatize].
+enum WordClass {
+  noun,
+  ichidanVerb,
+  godanVerb,
+  irregularVerb,
+  iAdjective,
+  nAdjective,
+  adverb,
+  particle,
+  input,
+}
+
+/// Which end of a word an [AllomorphPattern] is anchored to.
+enum LemmatizationRuleType {
+  prefix,
+  suffix,
+}
+
+/// A named deconjugation step: an affix [pattern], the [wordClass] it
+/// belongs to, and the classes of rules that may follow it.
+class LemmatizationRule {
+  final String name;
+  final AllomorphPattern pattern;
+  final WordClass wordClass;
+
+  /// Word classes whose rules may be applied to this rule's output, or
+  /// `null` to allow every rule.
+  final List<WordClass>? validChildClasses;
+
+  /// Whether the output of this rule is not lemmatized any further.
+  final bool terminal;
+
+  const LemmatizationRule({
+    required this.name,
+    required this.pattern,
+    required this.wordClass,
+    this.validChildClasses,
+    this.terminal = false,
+  });
+
+  /// Creates a rule with a single affix [pattern] and at most one
+  /// [replacement]; a `null` replacement just strips the affix.
+  LemmatizationRule.simple({
+    required String name,
+    required String pattern,
+    required String? replacement,
+    required WordClass wordClass,
+    List<WordClass>? validChildClasses,
+    bool terminal = false,
+    List<Pattern> lookAheadBehind = const [''],
+    LemmatizationRuleType type = LemmatizationRuleType.suffix,
+  }) : this(
+          name: name,
+          pattern: AllomorphPattern(
+            patterns: {
+              pattern: replacement != null ? [replacement] : null,
+            },
+            type: type,
+            lookAheadBehind: lookAheadBehind,
+          ),
+          validChildClasses: validChildClasses,
+          terminal: terminal,
+          wordClass: wordClass,
+        );
+
+  /// Whether [word] carries an affix described by [pattern].
+  bool matches(String word) => pattern.matches(word);
+
+  /// Returns the candidate deconjugations of [word], or `null` when this
+  /// rule does not match.
+  List<String>? apply(String word) => pattern.apply(word);
+}
+
+/// Represents a set of patterns for matching allomorphs in a word.
+/// The patterns can be either a prefix or a suffix, and they can include
+/// replacement characters for deconjugating into base forms.
+class AllomorphPattern {
+  /// Contexts required right before a suffix or right after a prefix.
+  ///
+  /// Entries are literal strings or [RegExp]s; `''` accepts any context.
+  final List<Pattern> lookAheadBehind;
+
+  /// Maps each affix to its possible replacements; a `null` value means the
+  /// affix is removed without a replacement.
+  final Map<String, List<String>?> patterns;
+
+  final LemmatizationRuleType type;
+
+  const AllomorphPattern({
+    required this.patterns,
+    required this.type,
+    this.lookAheadBehind = const [''],
+  });
+
+  bool get _isPrefix => type == LemmatizationRuleType.prefix;
+
+  /// Every affix paired with the full pattern (affix plus one context) that
+  /// a word has to match for the affix to apply.
+  List<(String, Pattern)> get allPatternCombinations {
+    final combinations = <(String, Pattern)>[];
+    for (final context in lookAheadBehind) {
+      for (final affix in patterns.keys) {
+        final Pattern full;
+        if (context is RegExp) {
+          // Affixes are literal text, so escape them before embedding.
+          final literal = RegExp.escape(affix);
+          full = _isPrefix
+              ? RegExp('^($literal)(${context.pattern})')
+              : RegExp('(${context.pattern})($literal)\$');
+        } else {
+          full = _isPrefix ? '$affix$context' : '$context$affix';
+        }
+        combinations.add((affix, full));
+      }
+    }
+    return combinations;
+  }
+
+  /// Whether [word] satisfies [full], one of the combined patterns.
+  bool _hits(String word, Pattern full) {
+    if (full is RegExp) {
+      return full.hasMatch(word);
+    }
+    if (full is String) {
+      return _isPrefix ? word.startsWith(full) : word.endsWith(full);
+    }
+    return false;
+  }
+
+  /// Whether any affix/context combination matches [word].
+  bool matches(String word) =>
+      allPatternCombinations.any((c) => _hits(word, c.$2));
+
+  /// Strips the first matching affix from [word] and returns one candidate
+  /// per replacement, or `null` when nothing matches.
+  List<String>? apply(String word) {
+    for (final (affix, full) in allPatternCombinations) {
+      if (!_hits(word, full)) {
+        continue;
+      }
+      // The affix is literal text, so the stem can be cut by length instead
+      // of relying on regex group numbers, which shift when a context
+      // pattern contains capturing groups of its own.
+      final stem = _isPrefix
+          ? word.substring(affix.length)
+          : word.substring(0, word.length - affix.length);
+      final replacements = patterns[affix];
+      if (replacements == null) {
+        return [stem];
+      }
+      return [
+        for (final r in replacements) _isPrefix ? r + stem : stem + r,
+      ];
+    }
+    return null;
+  }
+}
+
+/// A node of the lemmatization tree: [rule] applied to [original], with
+/// [children] holding the rules that matched the resulting word.
+class Lemmatized {
+  final String original;
+  final LemmatizationRule rule;
+
+  /// Index into the candidate list that [rule] yields for [original].
+  final int variant;
+
+  final List<Lemmatized> children;
+
+  const Lemmatized({
+    required this.original,
+    required this.rule,
+    this.variant = 0,
+    this.children = const [],
+  });
+
+  /// The word produced by applying [rule] to [original], or `null` when the
+  /// rule does not match or [variant] is out of range.
+  String? get applied {
+    final candidates = rule.apply(original);
+    if (candidates == null ||
+        variant < 0 ||
+        variant >= candidates.length) {
+      return null;
+    }
+    return candidates[variant];
+  }
+
+  @override
+  String toString() {
+    final header = '$original (${rule.name}) -> ${applied ?? ''}';
+    if (children.isEmpty) {
+      return header;
+    }
+    final childLines = children
+        .map((c) => ' - ${c.toString().split('\n').join('\n ')}')
+        .join('\n');
+    return '$header\n$childLines';
+  }
+}
+
+/// Recursively applies every rule allowed after [parentRule] to [word].
+List<Lemmatized> _lemmatize(LemmatizationRule parentRule, String word) {
+  final children = <Lemmatized>[];
+  if (parentRule.terminal) {
+    return children;
+  }
+
+  final allowed = parentRule.validChildClasses;
+  final candidateRules = allowed == null
+      ? lemmatizationRules
+      : lemmatizationRules.where((r) => allowed.contains(r.wordClass));
+
+  for (final rule in candidateRules) {
+    // `apply` already returns null for rules that do not match.
+    final applied = rule.apply(word);
+    if (applied == null) {
+      continue;
+    }
+    for (final (i, a) in applied.indexed) {
+      children.add(
+        Lemmatized(
+          original: word,
+          rule: rule,
+          variant: i,
+          children: _lemmatize(rule, a),
+        ),
+      );
+    }
+  }
+  return children;
+}
+
+/// Builds the lemmatization tree for [word].
+///
+/// The root node wraps the input unchanged; its children are all rules that
+/// match [word], each expanded recursively.
+Lemmatized lemmatize(String word) {
+  final inputRule = LemmatizationRule.simple(
+    name: 'Input',
+    pattern: '',
+    replacement: null,
+    wordClass: WordClass.input,
+  );
+  return Lemmatized(
+    original: word,
+    rule: inputRule,
+    children: _lemmatize(inputRule, word),
+  );
+}
diff --git a/lib/util/lemmatizer/rules.dart b/lib/util/lemmatizer/rules.dart
new file mode 100644
index 0000000..94019f1
--- /dev/null
+++ b/lib/util/lemmatizer/rules.dart
@@ -0,0 +1,11 @@
+import 'package:jadb/util/lemmatizer/lemmatizer.dart';
+import 'package:jadb/util/lemmatizer/rules/godan-verbs.dart';
+import 'package:jadb/util/lemmatizer/rules/i-adjectives.dart';
+import 'package:jadb/util/lemmatizer/rules/ichidan-verbs.dart';
+
+/// Every known rule, in the order the lemmatizer tries them.
+List<LemmatizationRule> lemmatizationRules = [
+  ...ichidanVerbLemmatizationRules,
+  ...godanVerbLemmatizationRules,
+  ...iAdjectiveLemmatizationRules,
+];
diff --git a/lib/util/lemmatizer/rules/godan-verbs.dart b/lib/util/lemmatizer/rules/godan-verbs.dart
new file mode 100644
index 0000000..aecb760
--- /dev/null
+++ b/lib/util/lemmatizer/rules/godan-verbs.dart
@@ -0,0 +1,461 @@
+import 'package:jadb/util/lemmatizer/lemmatizer.dart';
+
+/// Deconjugation rules for godan verbs.
+///
+/// Each map key is a conjugated ending; its value lists the dictionary-form
+/// endings it can be derived from.
+List<LemmatizationRule> godanVerbLemmatizationRules = [
+  LemmatizationRule(
+    name: 'Godan verb - base form',
+    terminal: true,
+    pattern: AllomorphPattern(
+      patterns: {
+        'う': ['う'],
+        'く': ['く'],
+        'ぐ': ['ぐ'],
+        'す': ['す'],
+        'つ': ['つ'],
+        'ぬ': ['ぬ'],
+        'ぶ': ['ぶ'],
+        'む': ['む'],
+        'る': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わない': ['う'],
+        'かない': ['く'],
+        'がない': ['ぐ'],
+        'さない': ['す'],
+        'たない': ['つ'],
+        'なない': ['ぬ'],
+        'ばない': ['ぶ'],
+        'まない': ['む'],
+        'らない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - past form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'した': ['す'],
+        'った': ['る', 'つ', 'う'],
+        'んだ': ['む', 'ぬ', 'ぶ'],
+        'いだ': ['ぐ'],
+        'いた': ['く'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - te-form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いて': ['く'],
+        'いで': ['ぐ'],
+        'して': ['す'],
+        'って': ['る', 'つ', 'う'],
+        'んで': ['む', 'ぬ', 'ぶ'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - te-form with いる',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いている': ['く'],
+        'いでいる': ['ぐ'],
+        'している': ['す'],
+        'っている': ['る', 'つ', 'う'],
+        'んでいる': ['む', 'ぬ', 'ぶ'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - te-form with いた',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いていた': ['く'],
+        'いでいた': ['ぐ'],
+        'していた': ['す'],
+        'っていた': ['る', 'つ', 'う'],
+        'んでいた': ['む', 'ぬ', 'ぶ'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - conditional form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'えば': ['う'],
+        'けば': ['く'],
+        'げば': ['ぐ'],
+        'せば': ['す'],
+        'てば': ['つ'],
+        'ねば': ['ぬ'],
+        'べば': ['ぶ'],
+        'めば': ['む'],
+        'れば': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - volitional form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'おう': ['う'],
+        'こう': ['く'],
+        'ごう': ['ぐ'],
+        'そう': ['す'],
+        'とう': ['つ'],
+        'のう': ['ぬ'],
+        'ぼう': ['ぶ'],
+        'もう': ['む'],
+        'ろう': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - potential form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'える': ['う'],
+        'ける': ['く'],
+        'げる': ['ぐ'],
+        'せる': ['す'],
+        'てる': ['つ'],
+        'ねる': ['ぬ'],
+        'べる': ['ぶ'],
+        'める': ['む'],
+        'れる': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - passive form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'われる': ['う'],
+        'かれる': ['く'],
+        'がれる': ['ぐ'],
+        'される': ['す'],
+        'たれる': ['つ'],
+        'なれる': ['ぬ'],
+        'ばれる': ['ぶ'],
+        'まれる': ['む'],
+        'られる': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - causative form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わせる': ['う'],
+        'かせる': ['く'],
+        'がせる': ['ぐ'],
+        'させる': ['す'],
+        'たせる': ['つ'],
+        'なせる': ['ぬ'],
+        'ばせる': ['ぶ'],
+        'ませる': ['む'],
+        'らせる': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - causative-passive form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わされる': ['う'],
+        'かされる': ['く'],
+        'がされる': ['ぐ'],
+        'させられる': ['す'],
+        'たされる': ['つ'],
+        'なされる': ['ぬ'],
+        'ばされる': ['ぶ'],
+        'まされる': ['む'],
+        'らされる': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - imperative form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'え': ['う'],
+        'け': ['く'],
+        'げ': ['ぐ'],
+        'せ': ['す'],
+        'て': ['つ'],
+        'ね': ['ぬ'],
+        'べ': ['ぶ'],
+        'め': ['む'],
+        'れ': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative past form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わなかった': ['う'],
+        'かなかった': ['く'],
+        'がなかった': ['ぐ'],
+        'さなかった': ['す'],
+        'たなかった': ['つ'],
+        'ななかった': ['ぬ'],
+        'ばなかった': ['ぶ'],
+        'まなかった': ['む'],
+        'らなかった': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative te-form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わなくて': ['う'],
+        'かなくて': ['く'],
+        'がなくて': ['ぐ'],
+        'さなくて': ['す'],
+        'たなくて': ['つ'],
+        'ななくて': ['ぬ'],
+        'ばなくて': ['ぶ'],
+        'まなくて': ['む'],
+        'らなくて': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative conditional form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わなければ': ['う'],
+        'かなければ': ['く'],
+        'がなければ': ['ぐ'],
+        'さなければ': ['す'],
+        'たなければ': ['つ'],
+        'ななければ': ['ぬ'],
+        'ばなければ': ['ぶ'],
+        'まなければ': ['む'],
+        'らなければ': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative volitional form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'うまい': ['う'],
+        'くまい': ['く'],
+        'ぐまい': ['ぐ'],
+        'すまい': ['す'],
+        'つまい': ['つ'],
+        'ぬまい': ['ぬ'],
+        'ぶまい': ['ぶ'],
+        'むまい': ['む'],
+        'るまい': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative potential form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'えない': ['う'],
+        'けない': ['く'],
+        'げない': ['ぐ'],
+        'せない': ['す'],
+        'てない': ['つ'],
+        'ねない': ['ぬ'],
+        'べない': ['ぶ'],
+        'めない': ['む'],
+        'れない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative passive form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'われない': ['う'],
+        'かれない': ['く'],
+        'がれない': ['ぐ'],
+        'されない': ['す'],
+        'たれない': ['つ'],
+        'なれない': ['ぬ'],
+        'ばれない': ['ぶ'],
+        'まれない': ['む'],
+        'られない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative causative form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わせない': ['う'],
+        'かせない': ['く'],
+        'がせない': ['ぐ'],
+        'させない': ['す'],
+        'たせない': ['つ'],
+        'なせない': ['ぬ'],
+        'ばせない': ['ぶ'],
+        'ませない': ['む'],
+        'らせない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative causative-passive form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'わされない': ['う'],
+        'かされない': ['く'],
+        'がされない': ['ぐ'],
+        'させられない': ['す'],
+        'たされない': ['つ'],
+        'なされない': ['ぬ'],
+        'ばされない': ['ぶ'],
+        'まされない': ['む'],
+        'らされない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative imperative form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'うな': ['う'],
+        'くな': ['く'],
+        'ぐな': ['ぐ'],
+        'すな': ['す'],
+        'つな': ['つ'],
+        'ぬな': ['ぬ'],
+        'ぶな': ['ぶ'],
+        'むな': ['む'],
+        'るな': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - desire form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いたい': ['う'],
+        'きたい': ['く'],
+        'ぎたい': ['ぐ'],
+        'したい': ['す'],
+        'ちたい': ['つ'],
+        'にたい': ['ぬ'],
+        'びたい': ['ぶ'],
+        'みたい': ['む'],
+        'りたい': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative desire form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いたくない': ['う'],
+        'きたくない': ['く'],
+        'ぎたくない': ['ぐ'],
+        'したくない': ['す'],
+        'ちたくない': ['つ'],
+        'にたくない': ['ぬ'],
+        'びたくない': ['ぶ'],
+        'みたくない': ['む'],
+        'りたくない': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - past desire form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いたかった': ['う'],
+        'きたかった': ['く'],
+        'ぎたかった': ['ぐ'],
+        'したかった': ['す'],
+        'ちたかった': ['つ'],
+        'にたかった': ['ぬ'],
+        'びたかった': ['ぶ'],
+        'みたかった': ['む'],
+        'りたかった': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+  LemmatizationRule(
+    name: 'Godan verb - negative past desire form',
+    pattern: AllomorphPattern(
+      patterns: {
+        'いたくなかった': ['う'],
+        'きたくなかった': ['く'],
+        'ぎたくなかった': ['ぐ'],
+        'したくなかった': ['す'],
+        'ちたくなかった': ['つ'],
+        'にたくなかった': ['ぬ'],
+        'びたくなかった': ['ぶ'],
+        'みたくなかった': ['む'],
+        'りたくなかった': ['る'],
+      },
+      type: LemmatizationRuleType.suffix,
+    ),
+    validChildClasses: [WordClass.godanVerb],
+    wordClass: WordClass.godanVerb,
+  ),
+];
diff --git a/lib/util/lemmatizer/rules/i-adjectives.dart b/lib/util/lemmatizer/rules/i-adjectives.dart
new file mode 100644
index 0000000..e58fcbb
--- /dev/null
+++ b/lib/util/lemmatizer/rules/i-adjectives.dart
@@ -0,0 +1,62 @@
+import 'package:jadb/util/lemmatizer/lemmatizer.dart';
+
+/// Deconjugation rules for i-adjectives; every ending maps back to い.
+List<LemmatizationRule> iAdjectiveLemmatizationRules = [
+  LemmatizationRule.simple(
+    name: 'I adjective - base form',
+    terminal: true,
+    pattern: 'い',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - negative form',
+    pattern: 'くない',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - past form',
+    pattern: 'かった',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - negative past form',
+    pattern: 'くなかった',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - te-form',
+    pattern: 'くて',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - conditional form',
+    pattern: 'ければ',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - volitional form',
+    pattern: 'かろう',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+  LemmatizationRule.simple(
+    name: 'I adjective - continuative form',
+    pattern: 'く',
+    replacement: 'い',
+    validChildClasses: [WordClass.iAdjective],
+    wordClass: WordClass.iAdjective,
+  ),
+];
diff --git a/lib/util/lemmatizer/rules/ichidan-verbs.dart b/lib/util/lemmatizer/rules/ichidan-verbs.dart
new file mode 100644
index 0000000..7c0f0a0
--- /dev/null
+++ b/lib/util/lemmatizer/rules/ichidan-verbs.dart
@@ -0,0 +1,244 @@
+import 'package:jadb/util/lemmatizer/lemmatizer.dart';
+import 'package:jadb/util/text_filtering.dart';
+
+/// Contexts that may directly precede an ichidan ending: anything matched by
+/// [kanjiRegex], or a kana from the い or え row.
+List<Pattern> lookBehinds = [
+  kanjiRegex,
+  'い',
+  'き',
+  'ぎ',
+  'し',
+  'じ',
+  'ち',
+  'ぢ',
+  'に',
+  'ひ',
+  'び',
+  'み',
+  'り',
+  'え',
+  'け',
+  'げ',
+  'せ',
+  'ぜ',
+  'て',
+  'で',
+  'ね',
+  'へ',
+  'べ',
+  'め',
+  'れ',
+];
+
+List<LemmatizationRule> ichidanVerbLemmatizationRules = [
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - base form',
+    terminal: true,
+    pattern: 'る',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative form',
+    pattern: 'ない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - past form',
+    pattern: 'た',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - te-form',
+    pattern: 'て',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - te-form with いる',
+    pattern: 'ている',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - te-form with いた',
+    pattern: 'ていた',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - conditional form',
+    pattern: 'れば',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - volitional form',
+    pattern: 'よう',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - potential form',
+    pattern: 'られる',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - passive form',
+    pattern: 'られる',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - causative form',
+    pattern: 'させる',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - causative passive form',
+    pattern: 'させられる',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - imperative form',
+    pattern: 'ろ',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative past form',
+    pattern: 'なかった',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative te-form',
+    pattern: 'なくて',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative conditional form',
+    pattern: 'なければ',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative volitional form',
+    pattern: 'なかろう',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative potential form',
+    pattern: 'られない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative passive form',
+    pattern: 'られない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative causative form',
+    pattern: 'させない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative causative passive form',
+    pattern: 'させられない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative imperative form',
+    pattern: 'るな',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - desire form',
+    pattern: 'たい',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative desire form',
+    pattern: 'たくない',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - past desire form',
+    pattern: 'たかった',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+  LemmatizationRule.simple(
+    name: 'Ichidan verb - negative past desire form',
+    pattern: 'たくなかった',
+    replacement: 'る',
+    lookAheadBehind: lookBehinds,
+    validChildClasses: [WordClass.ichidanVerb],
+    wordClass: WordClass.ichidanVerb,
+  ),
+];