lib/util/lemmatizer: init
This commit is contained in:
@@ -2,6 +2,7 @@ import 'package:args/command_runner.dart';
|
||||
|
||||
import 'package:jadb/cli/commands/create_db.dart';
|
||||
import 'package:jadb/cli/commands/create_tanos_jlpt_mappings.dart';
|
||||
import 'package:jadb/cli/commands/lemmatize.dart';
|
||||
import 'package:jadb/cli/commands/query_kanji.dart';
|
||||
import 'package:jadb/cli/commands/query_word.dart';
|
||||
|
||||
@@ -14,6 +15,7 @@ Future<void> main(List<String> args) async {
|
||||
runner.addCommand(CreateDb());
|
||||
runner.addCommand(QueryKanji());
|
||||
runner.addCommand(QueryWord());
|
||||
runner.addCommand(Lemmatize());
|
||||
runner.addCommand(CreateTanosJlptMappings());
|
||||
|
||||
runner.run(args);
|
||||
|
||||
46
lib/cli/commands/lemmatize.dart
Normal file
46
lib/cli/commands/lemmatize.dart
Normal file
@@ -0,0 +1,46 @@
|
||||
// import 'dart:io';
|
||||
|
||||
// import 'package:jadb/_data_ingestion/open_local_db.dart';
|
||||
import 'package:jadb/cli/args.dart';
|
||||
|
||||
import 'package:args/command_runner.dart';
|
||||
import 'package:jadb/util/lemmatizer/lemmatizer.dart';
|
||||
|
||||
/// CLI command that runs the Jadb lemmatizer on one word and prints the
/// resulting tree together with the time the lemmatization took.
class Lemmatize extends Command {
  @override
  final name = 'lemmatize';

  @override
  final description = 'Lemmatize a word using the Jadb lemmatizer';

  /// Registers the shared libsqlite/jadb options and the `--word` option.
  Lemmatize() {
    // The libsqlite/jadb options are registered but not read by [run]:
    // the database lookup that would consume them is currently disabled.
    addLibsqliteArg(argParser);
    addJadbArg(argParser);
    argParser.addOption(
      'word',
      abbr: 'w',
      help: 'The word to search for.',
      valueHelp: 'WORD',
    );
  }

  /// Lemmatizes the `--word` argument (or a built-in sample word when the
  /// option is omitted) and prints the result and the elapsed milliseconds.
  @override
  Future<void> run() async {
    final searchWord = argResults?.option('word') ?? '食べたくない';

    // Only the lemmatization itself is timed, not the printing.
    final time = Stopwatch()..start();
    final result = lemmatize(searchWord);
    time.stop();

    print(result);
    print('Lemmatization took ${time.elapsedMilliseconds}ms');
  }
}
|
||||
247
lib/util/lemmatizer/lemmatizer.dart
Normal file
247
lib/util/lemmatizer/lemmatizer.dart
Normal file
@@ -0,0 +1,247 @@
|
||||
import 'package:jadb/util/lemmatizer/rules.dart';
|
||||
|
||||
/// Grammatical category that a lemmatization rule is filed under.
///
/// Rules use these values both to describe themselves and to restrict which
/// rules may be tried on their output.
enum WordClass {
  noun,
  ichidanVerb,
  godanVerb,
  irregularVerb,
  iAdjective,
  nAdjective,
  adverb,
  particle,

  /// Class of the synthetic root rule that wraps the raw input word.
  input,
}
|
||||
|
||||
/// Which end of the word an affix pattern is anchored to.
enum LemmatizationRuleType {
  /// The affix is matched at the start of the word.
  prefix,

  /// The affix is matched at the end of the word.
  suffix,
}
|
||||
|
||||
/// A single named deconjugation step.
///
/// A rule pairs an [AllomorphPattern] with the [WordClass] it belongs to and
/// with the constraints that decide which rules may be tried on its output.
class LemmatizationRule {
  /// Human-readable label for the rule.
  final String name;

  /// The affix pattern that decides whether the rule matches and how the
  /// word is rewritten.
  final AllomorphPattern pattern;

  /// The word class this rule is filed under.
  final WordClass wordClass;

  /// Word classes whose rules may be applied to this rule's output.
  ///
  /// `null` means every rule may be tried.
  final List<WordClass>? validChildClasses;

  /// Whether lemmatization stops after this rule, i.e. no further rules are
  /// applied to its output.
  final bool terminal;

  const LemmatizationRule({
    required this.name,
    required this.pattern,
    required this.wordClass,
    this.validChildClasses,
    this.terminal = false,
  });

  /// Whether [pattern] matches [word].
  bool matches(String word) => pattern.matches(word);

  /// Returns the rewritten forms of [word], or `null` when [pattern] does
  /// not match.
  List<String>? apply(String word) => pattern.apply(word);

  /// Creates a rule with a single affix [pattern].
  ///
  /// When [replacement] is `null` the affix is simply removed; otherwise it
  /// is replaced by [replacement]. [lookAheadBehind] and [type] are passed
  /// through to the underlying [AllomorphPattern].
  LemmatizationRule.simple({
    required String name,
    required String pattern,
    required String? replacement,
    required WordClass wordClass,
    List<WordClass>? validChildClasses,
    bool terminal = false,
    List<Pattern> lookAheadBehind = const [''],
    LemmatizationRuleType type = LemmatizationRuleType.suffix,
  }) : this(
          name: name,
          pattern: AllomorphPattern(
            patterns: {
              pattern: replacement == null ? null : [replacement],
            },
            type: type,
            lookAheadBehind: lookAheadBehind,
          ),
          validChildClasses: validChildClasses,
          terminal: terminal,
          wordClass: wordClass,
        );
}
|
||||
|
||||
/// Represents a set of patterns for matching allomorphs in a word.
///
/// The patterns can be either a prefix or a suffix, and they can include
/// replacement strings for deconjugating into base forms. Every affix is
/// additionally combined with each entry of [lookAheadBehind], which
/// constrains the text directly next to the affix.
class AllomorphPattern {
  /// Contexts that must appear next to the affix: directly after it for a
  /// prefix, directly before it for a suffix.
  ///
  /// Entries may be plain strings or [RegExp]s. The default `['']` imposes
  /// no constraint.
  final List<Pattern> lookAheadBehind;

  /// Maps each affix to the strings that replace it.
  ///
  /// A `null` value means the affix is removed without a replacement.
  final Map<String, List<String>?> patterns;

  /// Whether the affixes are matched at the start or the end of the word.
  final LemmatizationRuleType type;

  const AllomorphPattern({
    required this.patterns,
    required this.type,
    this.lookAheadBehind = const [''],
  });

  /// Every `(affix, matcher)` pair, one per context/affix combination.
  ///
  /// The matcher is a plain string when the context is not a [RegExp], and
  /// an anchored [RegExp] otherwise. Contexts are iterated in the outer
  /// loop, affixes in the inner loop.
  List<(String, Pattern)> get allPatternCombinations {
    final isPrefix = type == LemmatizationRuleType.prefix;
    return [
      for (final context in lookAheadBehind)
        for (final affix in patterns.keys)
          (affix, _combine(affix, context, isPrefix: isPrefix)),
    ];
  }

  /// Joins [affix] and [context] into a single matcher.
  static Pattern _combine(
    String affix,
    Pattern context, {
    required bool isPrefix,
  }) {
    if (context is! RegExp) {
      return isPrefix ? '$affix$context' : '$context$affix';
    }

    // The affix is literal text, so escape it; the context keeps its own
    // regex syntax. Non-capturing groups keep any groups inside the context
    // from interfering.
    final literal = RegExp.escape(affix);
    final source = isPrefix
        ? '^(?:$literal)(?:${context.pattern})'
        : '(?:${context.pattern})(?:$literal)\$';

    // Carry over the flags of the source expression so that, for example,
    // a unicode-mode context keeps working after being recombined.
    return RegExp(
      source,
      multiLine: context.isMultiLine,
      caseSensitive: context.isCaseSensitive,
      unicode: context.isUnicode,
      dotAll: context.isDotAll,
    );
  }

  /// Whether [matcher] hits [word] at the end selected by [type].
  bool _hits(Pattern matcher, String word) => switch (matcher) {
        RegExp re => re.hasMatch(word),
        String s => type == LemmatizationRuleType.prefix
            ? word.startsWith(s)
            : word.endsWith(s),
        _ => false,
      };

  /// Whether any context/affix combination matches [word].
  bool matches(String word) =>
      allPatternCombinations.any((entry) => _hits(entry.$2, word));

  /// Rewrites [word] using the first combination that matches.
  ///
  /// The affix is stripped from the word (the context is kept) and each
  /// replacement for that affix is attached in its place. Returns a list
  /// with just the stripped word when the affix has no replacements, and
  /// `null` when nothing matches.
  List<String>? apply(String word) {
    final isPrefix = type == LemmatizationRuleType.prefix;

    for (final (affix, matcher) in allPatternCombinations) {
      if (!_hits(matcher, word)) continue;

      final stem = isPrefix
          ? word.substring(affix.length)
          : word.substring(0, word.length - affix.length);

      final replacements = patterns[affix];
      if (replacements == null) return [stem];

      return [
        for (final replacement in replacements)
          isPrefix ? replacement + stem : stem + replacement,
      ];
    }
    return null;
  }
}
|
||||
|
||||
/// One node of a lemmatization tree: a word, the rule that was applied to
/// it, and the nodes obtained by lemmatizing the result further.
class Lemmatized {
  /// The word as it was before [rule] was applied.
  final String original;

  /// The rule applied to [original].
  final LemmatizationRule rule;

  /// Index into the list of results produced by [rule] for [original].
  final int variant;

  /// Nodes produced by lemmatizing the result of this node.
  final List<Lemmatized> children;

  const Lemmatized({
    required this.original,
    required this.rule,
    this.variant = 0,
    this.children = const [],
  });

  /// The result of applying [rule] to [original] selected by [variant], or
  /// `null` when the rule yields nothing.
  String? get applied {
    final results = rule.apply(original);
    return (results == null || results.isEmpty) ? null : results[variant];
  }

  /// Renders this node on one line followed by one bulleted entry per child,
  /// with the continuation lines of each child indented.
  @override
  String toString() {
    final header = '$original (${rule.name}) -> ${applied ?? '<null>'}';
    if (children.isEmpty) return header;

    final entries = [
      for (final child in children)
        ' - ${child.toString().split('\n').join('\n ')}',
    ];
    return '$header\n${entries.join('\n')}';
  }
}
|
||||
|
||||
/// Recursively applies every permitted rule to [word].
///
/// Returns one [Lemmatized] node per result of every matching rule; each
/// node's children come from lemmatizing that result with the matching rule
/// as the new parent. Returns an empty list when [parentRule] is terminal.
List<Lemmatized> _lemmatize(LemmatizationRule parentRule, String word) {
  if (parentRule.terminal) return <Lemmatized>[];

  // A null whitelist means every rule may follow the parent.
  final allowedClasses = parentRule.validChildClasses;
  final candidates = allowedClasses == null
      ? lemmatizationRules
      : lemmatizationRules.where((r) => allowedClasses.contains(r.wordClass));

  final children = <Lemmatized>[];
  for (final rule in candidates) {
    // `apply` returns null when the rule does not match, so a separate
    // `matches` call would only repeat the same matching work.
    final results = rule.apply(word);
    if (results == null) continue;

    for (final (variant, result) in results.indexed) {
      children.add(
        Lemmatized(
          original: word,
          rule: rule,
          variant: variant,
          children: _lemmatize(rule, result),
        ),
      );
    }
  }
  return children;
}
|
||||
|
||||
/// Builds the lemmatization tree for [word].
///
/// The root node carries a synthetic 'Input' rule with an empty pattern and
/// no replacement; its children are the results of running the rule set on
/// [word].
Lemmatized lemmatize(String word) {
  final root = LemmatizationRule.simple(
    name: 'Input',
    pattern: '',
    replacement: null,
    wordClass: WordClass.input,
  );
  final derivations = _lemmatize(root, word);

  return Lemmatized(original: word, rule: root, children: derivations);
}
|
||||
10
lib/util/lemmatizer/rules.dart
Normal file
10
lib/util/lemmatizer/rules.dart
Normal file
@@ -0,0 +1,10 @@
|
||||
import 'package:jadb/util/lemmatizer/lemmatizer.dart';
|
||||
import 'package:jadb/util/lemmatizer/rules/godan-verbs.dart';
|
||||
import 'package:jadb/util/lemmatizer/rules/i-adjectives.dart';
|
||||
import 'package:jadb/util/lemmatizer/rules/ichidan-verbs.dart';
|
||||
|
||||
/// All lemmatization rules, concatenated in this order: ichidan verbs,
/// godan verbs, i-adjectives.
List<LemmatizationRule> lemmatizationRules = [
  for (final group in [
    ichidanVerbLemmatizationRules,
    godanVerbLemmatizationRules,
    iAdjectiveLemmatizationRules,
  ])
    ...group,
];
|
||||
457
lib/util/lemmatizer/rules/godan-verbs.dart
Normal file
457
lib/util/lemmatizer/rules/godan-verbs.dart
Normal file
@@ -0,0 +1,457 @@
|
||||
import 'package:jadb/util/lemmatizer/lemmatizer.dart';
|
||||
|
||||
/// Builds one godan-verb suffix rule.
///
/// [form] is appended to the common 'Godan verb - ' name prefix and
/// [patterns] maps each conjugated ending to its dictionary-form endings.
LemmatizationRule _godanRule(
  String form,
  Map<String, List<String>?> patterns, {
  bool terminal = false,
}) =>
    LemmatizationRule(
      name: 'Godan verb - $form',
      terminal: terminal,
      pattern: AllomorphPattern(
        patterns: patterns,
        type: LemmatizationRuleType.suffix,
      ),
      validChildClasses: [WordClass.godanVerb],
      wordClass: WordClass.godanVerb,
    );

/// Suffix rules that map conjugated godan-verb endings back to their
/// dictionary-form endings.
///
/// An ending that is shared by several verb rows (such as った) lists every
/// possible dictionary ending; each one yields its own candidate.
List<LemmatizationRule> godanVerbLemmatizationRules = [
  _godanRule(
    'base form',
    {
      'う': ['う'],
      'く': ['く'],
      'ぐ': ['ぐ'],
      'す': ['す'],
      'つ': ['つ'],
      'ぬ': ['ぬ'],
      'ぶ': ['ぶ'],
      'む': ['む'],
      'る': ['る'],
    },
    terminal: true,
  ),
  _godanRule('negative form', {
    'わない': ['う'],
    'かない': ['く'],
    'がない': ['ぐ'],
    'さない': ['す'],
    'たない': ['つ'],
    'なない': ['ぬ'],
    'ばない': ['ぶ'],
    'まない': ['む'],
    'らない': ['る'],
  }),
  _godanRule('past form', {
    'した': ['す'],
    'った': ['る', 'つ', 'う'],
    'んだ': ['む', 'ぬ', 'ぶ'],
    'いだ': ['ぐ'],
    'いた': ['く'],
  }),
  // く-verbs take いて while ぐ-verbs take いで.
  _godanRule('te-form', {
    'いて': ['く'],
    'いで': ['ぐ'],
    'して': ['す'],
    'って': ['る', 'つ', 'う'],
    'んで': ['む', 'ぬ', 'ぶ'],
  }),
  _godanRule('te-form with いる', {
    'いている': ['く'],
    'いでいる': ['ぐ'],
    'している': ['す'],
    'っている': ['る', 'つ', 'う'],
    'んでいる': ['む', 'ぬ', 'ぶ'],
  }),
  _godanRule('te-form with いた', {
    'いていた': ['く'],
    'いでいた': ['ぐ'],
    'していた': ['す'],
    'っていた': ['る', 'つ', 'う'],
    'んでいた': ['む', 'ぬ', 'ぶ'],
  }),
  _godanRule('conditional form', {
    'えば': ['う'],
    'けば': ['く'],
    'げば': ['ぐ'],
    'せば': ['す'],
    'てば': ['つ'],
    'ねば': ['ぬ'],
    'べば': ['ぶ'],
    'めば': ['む'],
    'れば': ['る'],
  }),
  _godanRule('volitional form', {
    'おう': ['う'],
    'こう': ['く'],
    'ごう': ['ぐ'],
    'そう': ['す'],
    'とう': ['つ'],
    'のう': ['ぬ'],
    'ぼう': ['ぶ'],
    'もう': ['む'],
    'ろう': ['る'],
  }),
  _godanRule('potential form', {
    'える': ['う'],
    'ける': ['く'],
    'げる': ['ぐ'],
    'せる': ['す'],
    'てる': ['つ'],
    'ねる': ['ぬ'],
    'べる': ['ぶ'],
    'める': ['む'],
    'れる': ['る'],
  }),
  _godanRule('passive form', {
    'われる': ['う'],
    'かれる': ['く'],
    'がれる': ['ぐ'],
    'される': ['す'],
    'たれる': ['つ'],
    'なれる': ['ぬ'],
    'ばれる': ['ぶ'],
    'まれる': ['む'],
    'られる': ['る'],
  }),
  _godanRule('causative form', {
    'わせる': ['う'],
    'かせる': ['く'],
    'がせる': ['ぐ'],
    'させる': ['す'],
    'たせる': ['つ'],
    'なせる': ['ぬ'],
    'ばせる': ['ぶ'],
    'ませる': ['む'],
    'らせる': ['る'],
  }),
  // す-verbs use the long form させられる. A bare される entry would be the
  // plain passive and, being a suffix of the other endings, would shadow
  // every entry listed after it.
  _godanRule('causative-passive form', {
    'わされる': ['う'],
    'かされる': ['く'],
    'がされる': ['ぐ'],
    'させられる': ['す'],
    'たされる': ['つ'],
    'なされる': ['ぬ'],
    'ばされる': ['ぶ'],
    'まされる': ['む'],
    'らされる': ['る'],
  }),
  _godanRule('imperative form', {
    'え': ['う'],
    'け': ['く'],
    'げ': ['ぐ'],
    'せ': ['す'],
    'て': ['つ'],
    'ね': ['ぬ'],
    'べ': ['ぶ'],
    'め': ['む'],
    'れ': ['る'],
  }),
  _godanRule('negative past form', {
    'わなかった': ['う'],
    'かなかった': ['く'],
    'がなかった': ['ぐ'],
    'さなかった': ['す'],
    'たなかった': ['つ'],
    'ななかった': ['ぬ'],
    'ばなかった': ['ぶ'],
    'まなかった': ['む'],
    'らなかった': ['る'],
  }),
  _godanRule('negative te-form', {
    'わなくて': ['う'],
    'かなくて': ['く'],
    'がなくて': ['ぐ'],
    'さなくて': ['す'],
    'たなくて': ['つ'],
    'ななくて': ['ぬ'],
    'ばなくて': ['ぶ'],
    'まなくて': ['む'],
    'らなくて': ['る'],
  }),
  _godanRule('negative conditional form', {
    'わなければ': ['う'],
    'かなければ': ['く'],
    'がなければ': ['ぐ'],
    'さなければ': ['す'],
    'たなければ': ['つ'],
    'ななければ': ['ぬ'],
    'ばなければ': ['ぶ'],
    'まなければ': ['む'],
    'らなければ': ['る'],
  }),
  _godanRule('negative volitional form', {
    'うまい': ['う'],
    'くまい': ['く'],
    'ぐまい': ['ぐ'],
    'すまい': ['す'],
    'つまい': ['つ'],
    'ぬまい': ['ぬ'],
    'ぶまい': ['ぶ'],
    'むまい': ['む'],
    'るまい': ['る'],
  }),
  _godanRule('negative potential form', {
    'えない': ['う'],
    'けない': ['く'],
    'げない': ['ぐ'],
    'せない': ['す'],
    'てない': ['つ'],
    'ねない': ['ぬ'],
    'べない': ['ぶ'],
    'めない': ['む'],
    'れない': ['る'],
  }),
  _godanRule('negative passive form', {
    'われない': ['う'],
    'かれない': ['く'],
    'がれない': ['ぐ'],
    'されない': ['す'],
    'たれない': ['つ'],
    'なれない': ['ぬ'],
    'ばれない': ['ぶ'],
    'まれない': ['む'],
    'られない': ['る'],
  }),
  _godanRule('negative causative form', {
    'わせない': ['う'],
    'かせない': ['く'],
    'がせない': ['ぐ'],
    'させない': ['す'],
    'たせない': ['つ'],
    'なせない': ['ぬ'],
    'ばせない': ['ぶ'],
    'ませない': ['む'],
    'らせない': ['る'],
  }),
  // Same reasoning as the affirmative causative-passive rule: す-verbs use
  // させられない so that no entry is a suffix of another.
  _godanRule('negative causative-passive form', {
    'わされない': ['う'],
    'かされない': ['く'],
    'がされない': ['ぐ'],
    'させられない': ['す'],
    'たされない': ['つ'],
    'なされない': ['ぬ'],
    'ばされない': ['ぶ'],
    'まされない': ['む'],
    'らされない': ['る'],
  }),
  _godanRule('negative imperative form', {
    'うな': ['う'],
    'くな': ['く'],
    'ぐな': ['ぐ'],
    'すな': ['す'],
    'つな': ['つ'],
    'ぬな': ['ぬ'],
    'ぶな': ['ぶ'],
    'むな': ['む'],
    'るな': ['る'],
  }),
  _godanRule('desire form', {
    'いたい': ['う'],
    'きたい': ['く'],
    'ぎたい': ['ぐ'],
    'したい': ['す'],
    'ちたい': ['つ'],
    'にたい': ['ぬ'],
    'びたい': ['ぶ'],
    'みたい': ['む'],
    'りたい': ['る'],
  }),
  _godanRule('negative desire form', {
    'いたくない': ['う'],
    'きたくない': ['く'],
    'ぎたくない': ['ぐ'],
    'したくない': ['す'],
    'ちたくない': ['つ'],
    'にたくない': ['ぬ'],
    'びたくない': ['ぶ'],
    'みたくない': ['む'],
    'りたくない': ['る'],
  }),
  _godanRule('past desire form', {
    'いたかった': ['う'],
    'きたかった': ['く'],
    'ぎたかった': ['ぐ'],
    'したかった': ['す'],
    'ちたかった': ['つ'],
    'にたかった': ['ぬ'],
    'びたかった': ['ぶ'],
    'みたかった': ['む'],
    'りたかった': ['る'],
  }),
  _godanRule('negative past desire form', {
    'いたくなかった': ['う'],
    'きたくなかった': ['く'],
    'ぎたくなかった': ['ぐ'],
    'したくなかった': ['す'],
    'ちたくなかった': ['つ'],
    'にたくなかった': ['ぬ'],
    'びたくなかった': ['ぶ'],
    'みたくなかった': ['む'],
    'りたくなかった': ['る'],
  }),
];
|
||||
62
lib/util/lemmatizer/rules/i-adjectives.dart
Normal file
62
lib/util/lemmatizer/rules/i-adjectives.dart
Normal file
@@ -0,0 +1,62 @@
|
||||
|
||||
import 'package:jadb/util/lemmatizer/lemmatizer.dart';
|
||||
|
||||
/// Builds one i-adjective suffix rule that rewrites [suffix] to い.
///
/// [form] is appended to the common 'I adjective - ' name prefix.
LemmatizationRule _iAdjectiveRule(
  String form,
  String suffix, {
  bool terminal = false,
}) =>
    LemmatizationRule.simple(
      name: 'I adjective - $form',
      terminal: terminal,
      pattern: suffix,
      replacement: 'い',
      validChildClasses: [WordClass.iAdjective],
      wordClass: WordClass.iAdjective,
    );

/// Suffix rules that map conjugated i-adjective endings back to the
/// dictionary ending い.
List<LemmatizationRule> iAdjectiveLemmatizationRules = [
  _iAdjectiveRule('base form', 'い', terminal: true),
  _iAdjectiveRule('negative form', 'くない'),
  _iAdjectiveRule('past form', 'かった'),
  _iAdjectiveRule('negative past form', 'くなかった'),
  _iAdjectiveRule('te-form', 'くて'),
  _iAdjectiveRule('conditional form', 'ければ'),
  _iAdjectiveRule('volitional form', 'かろう'),
  _iAdjectiveRule('continuative form', 'く'),
];
|
||||
241
lib/util/lemmatizer/rules/ichidan-verbs.dart
Normal file
241
lib/util/lemmatizer/rules/ichidan-verbs.dart
Normal file
@@ -0,0 +1,241 @@
|
||||
import 'package:jadb/util/lemmatizer/lemmatizer.dart';
|
||||
import 'package:jadb/util/text_filtering.dart';
|
||||
|
||||
/// Contexts that must directly precede an ichidan-verb ending.
///
/// Ichidan stems end in an i-row or e-row kana, so every kana of those two
/// rows is listed once.
List<Pattern> lookBehinds = [
  // NOTE(review): presumably matches a single kanji, for stems whose final
  // kana is written inside the kanji — confirm against text_filtering.dart.
  kanjiRegex,
  // i-row kana.
  'い',
  'き',
  'ぎ',
  'し',
  'じ',
  'ち',
  'ぢ',
  'に',
  'ひ',
  'び',
  'ぴ',
  'み',
  'り',
  // e-row kana.
  'え',
  'け',
  'げ',
  'せ',
  'ぜ',
  'て',
  'で',
  'ね',
  'へ',
  'べ',
  'ぺ',
  'め',
  'れ',
];
|
||||
|
||||
/// Builds one ichidan-verb suffix rule that rewrites [suffix] to る.
///
/// [form] is appended to the common 'Ichidan verb - ' name prefix. The
/// suffix only matches when it is preceded by one of [lookBehinds].
LemmatizationRule _ichidanRule(
  String form,
  String suffix, {
  bool terminal = false,
}) =>
    LemmatizationRule.simple(
      name: 'Ichidan verb - $form',
      terminal: terminal,
      pattern: suffix,
      replacement: 'る',
      lookAheadBehind: lookBehinds,
      validChildClasses: [WordClass.ichidanVerb],
      wordClass: WordClass.ichidanVerb,
    );

/// Suffix rules that map conjugated ichidan-verb endings back to the
/// dictionary ending る.
///
/// The potential and passive forms share the same ending, so both rules
/// match the same words and differ only in name.
List<LemmatizationRule> ichidanVerbLemmatizationRules = [
  _ichidanRule('base form', 'る', terminal: true),
  _ichidanRule('negative form', 'ない'),
  _ichidanRule('past form', 'た'),
  _ichidanRule('te-form', 'て'),
  _ichidanRule('te-form with いる', 'ている'),
  _ichidanRule('te-form with いた', 'ていた'),
  _ichidanRule('conditional form', 'れば'),
  _ichidanRule('volitional form', 'よう'),
  _ichidanRule('potential form', 'られる'),
  _ichidanRule('passive form', 'られる'),
  _ichidanRule('causative form', 'させる'),
  _ichidanRule('causative passive form', 'させられる'),
  // The ichidan imperative ends in ろ (食べろ); れ is the godan る-verb
  // imperative.
  _ichidanRule('imperative form', 'ろ'),
  _ichidanRule('negative past form', 'なかった'),
  _ichidanRule('negative te-form', 'なくて'),
  _ichidanRule('negative conditional form', 'なければ'),
  _ichidanRule('negative volitional form', 'なかろう'),
  _ichidanRule('negative potential form', 'られない'),
  _ichidanRule('negative passive form', 'られない'),
  _ichidanRule('negative causative form', 'させない'),
  _ichidanRule('negative causative passive form', 'させられない'),
  _ichidanRule('negative imperative form', 'るな'),
  _ichidanRule('desire form', 'たい'),
  _ichidanRule('negative desire form', 'たくない'),
  _ichidanRule('past desire form', 'たかった'),
  _ichidanRule('negative past desire form', 'たくなかった'),
];
|
||||
Reference in New Issue
Block a user