288 lines
7.9 KiB
Dart
288 lines
7.9 KiB
Dart
import 'package:collection/collection.dart';
|
|
import 'package:jadb/util/lemmatizer/rules.dart';
|
|
|
|
/// Grammatical classes that lemmatization rules can produce or consume.
enum WordClass {
  noun,
  ichidanVerb,
  godanVerb,
  iAdjective,
  irregularVerb,
  nAdjective,
  adverb,
  particle,

  /// Pseudo-class used for the raw input word at the root of a
  /// lemmatization tree (see `lemmatize`).
  input;

  /// Parses a [WordClass] from its member name (e.g. `'godanVerb'`),
  /// the inverse of the built-in [name] getter — usable by the CLI.
  ///
  /// Throws an [ArgumentError] if [value] names no member.
  static WordClass fromString(String value) => values.byName(value);
}
|
|
|
|
/// Whether a rule's affixes attach at the start of a word (prefix)
/// or at its end (suffix).
enum LemmatizationRuleType { prefix, suffix }
|
|
|
|
/// A single deconjugation rule: a named [AllomorphPattern] together with the
/// [WordClass] it produces and constraints on which rules may follow it.
class LemmatizationRule {
  /// Human-readable rule name, used in debug output.
  final String name;

  /// The affix pattern this rule matches and rewrites.
  final AllomorphPattern pattern;

  /// The word class of the deconjugated form this rule yields.
  final WordClass wordClass;

  /// Word classes that child rules are allowed to produce;
  /// `null` means any class may follow.
  final Set<WordClass>? validChildClasses;

  /// If `true`, no further rules are applied after this one.
  final bool terminal;

  const LemmatizationRule({
    required this.name,
    required this.pattern,
    required this.wordClass,
    this.validChildClasses,
    this.terminal = false,
  });

  /// Whether [pattern] matches [word].
  bool matches(String word) => pattern.matches(word);

  /// Applies [pattern] to [word]; `null` if nothing matched.
  List<String>? apply(String word) => pattern.apply(word);

  /// Convenience constructor for a rule with a single affix/replacement
  /// pair. A `null` [replacement] strips the affix without substituting.
  LemmatizationRule.simple({
    required String name,
    required String pattern,
    required String? replacement,
    required WordClass wordClass,
    Set<WordClass>? validChildClasses,
    bool terminal = false,
    List<Pattern> lookAheadBehind = const [''],
    LemmatizationRuleType type = LemmatizationRuleType.suffix,
  }) : this(
          name: name,
          pattern: AllomorphPattern(
            patterns: {
              pattern: replacement != null ? [replacement] : null,
            },
            type: type,
            lookAheadBehind: lookAheadBehind,
          ),
          validChildClasses: validChildClasses,
          terminal: terminal,
          wordClass: wordClass,
        );

  @override
  int get hashCode => Object.hash(
        name,
        pattern,
        wordClass,
        terminal,
        // Hash the set by content only. Previously the raw set was ALSO
        // passed to Object.hash (identity hash), so two rules equal under
        // the SetEquality-based == below could hash differently, breaking
        // the hashCode/== contract.
        const SetEquality<WordClass>().hash(validChildClasses),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is LemmatizationRule &&
        other.name == name &&
        other.pattern == pattern &&
        other.wordClass == wordClass &&
        other.terminal == terminal &&
        const SetEquality<WordClass>()
            .equals(validChildClasses, other.validChildClasses);
  }
}
|
|
|
|
/// Represents a set of patterns for matching allomorphs in a word.
/// The patterns can be either a prefix or a suffix, and they can include
/// replacement characters for deconjugating into base forms.
class AllomorphPattern {
  /// Context required around each affix: following it for prefixes,
  /// preceding it for suffixes. String entries are matched literally,
  /// [RegExp] entries as regexes. The default — a single empty string —
  /// imposes no context requirement.
  final List<Pattern> lookAheadBehind;

  /// Maps each affix to its replacement forms. A `null` value means the
  /// affix is stripped without substituting anything.
  final Map<String, List<String>?> patterns;

  /// Whether the affixes attach at the start or the end of a word.
  final LemmatizationRuleType type;

  const AllomorphPattern({
    required this.patterns,
    required this.type,
    this.lookAheadBehind = const [''],
  });

  /// Convert the [patterns] into regexes.
  ///
  /// Produces every (affix, matcher) pair from the cross product of
  /// [patterns] keys and [lookAheadBehind] contexts. Recomputed on each
  /// access: the class is `const`-constructible, so the result cannot be
  /// cached in a field.
  List<(String, Pattern)> get allPatternCombinations {
    final combinations = <(String, Pattern)>[];
    for (final l in lookAheadBehind) {
      for (final p in patterns.keys) {
        switch ((type, l is RegExp)) {
          case (LemmatizationRuleType.prefix, true):
            // Escape the affix: keys are treated as literal strings in
            // every other branch (startsWith/endsWith), so regex
            // metacharacters in them must not alter the match here.
            combinations.add(
              (p, RegExp('^(${RegExp.escape(p)})(${(l as RegExp).pattern})')),
            );
            break;
          case (LemmatizationRuleType.prefix, false):
            combinations.add((p, '$p$l'));
            break;
          case (LemmatizationRuleType.suffix, true):
            combinations.add(
              (p, RegExp('(${(l as RegExp).pattern})(${RegExp.escape(p)})\$')),
            );
            break;
          case (LemmatizationRuleType.suffix, false):
            combinations.add((p, '$l$p'));
            break;
        }
      }
    }
    return combinations;
  }

  /// Check whether an input string matches any of the [patterns].
  bool matches(String word) {
    for (final (_, p) in allPatternCombinations) {
      if (p is String) {
        // Literal matcher: the combined affix+context string must sit at
        // the relevant end of the word.
        if (type == LemmatizationRuleType.prefix
            ? word.startsWith(p)
            : word.endsWith(p)) {
          return true;
        }
      } else if (p is RegExp) {
        if (p.hasMatch(word)) {
          return true;
        }
      }
    }
    return false;
  }

  /// Apply the replacement for this pattern.
  ///
  /// If none of the [patterns] apply, this function returns `null`.
  List<String>? apply(String word) {
    for (final (affix, p) in allPatternCombinations) {
      switch ((type, p is RegExp)) {
        case (LemmatizationRuleType.prefix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            // Group 1 is the affix itself; the look-ahead context stays
            // part of the remaining word.
            final prefix = match.group(1)!;
            assert(prefix == affix);
            final suffix = word.substring(prefix.length);
            return patterns[prefix] != null
                ? patterns[prefix]!.map((s) => s + suffix).toList()
                : [suffix];
          }
          break;
        case (LemmatizationRuleType.prefix, false):
          if (word.startsWith(p as String)) {
            // p includes the literal context; strip only the affix part.
            return patterns[affix] != null
                ? patterns[affix]!
                    .map((s) => s + word.substring(affix.length))
                    .toList()
                : [word.substring(affix.length)];
          }
          break;
        case (LemmatizationRuleType.suffix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            // Group 2 is the affix; the look-behind context remains part
            // of the kept stem.
            final suffix = match.group(2)!;
            assert(suffix == affix);
            final prefix = word.substring(0, word.length - suffix.length);
            return patterns[suffix] != null
                ? patterns[suffix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
          break;
        case (LemmatizationRuleType.suffix, false):
          if (word.endsWith(p as String)) {
            final prefix = word.substring(0, word.length - affix.length);
            return patterns[affix] != null
                ? patterns[affix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
          break;
      }
    }
    return null;
  }

  @override
  int get hashCode => Object.hash(
        type,
        const ListEquality<Pattern>().hash(lookAheadBehind),
        // Deep hash so replacement lists are hashed by content,
        // consistent with == below.
        const DeepCollectionEquality().hash(patterns),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is AllomorphPattern &&
        other.type == type &&
        const ListEquality<Pattern>()
            .equals(other.lookAheadBehind, lookAheadBehind) &&
        // MapEquality's default value equality compared the List<String>?
        // replacement lists by identity, so structurally equal patterns
        // were unequal; DeepCollectionEquality compares them by content.
        const DeepCollectionEquality().equals(other.patterns, patterns);
  }
}
|
|
|
|
/// One node in a lemmatization tree: a surface form, the rule applied to
/// it, which replacement variant was chosen, and deeper deconjugations.
class Lemmatized {
  /// The surface form this node deconjugates.
  final String original;

  /// The rule applied to [original].
  final LemmatizationRule rule;

  /// Index into the rule's replacement list selecting which variant this
  /// node represents.
  final int variant;

  /// Further deconjugations of this node's [applied] form.
  final List<Lemmatized> children;

  const Lemmatized({
    required this.original,
    required this.rule,
    this.variant = 0,
    this.children = const [],
  });

  /// The result of applying [rule] to [original] for this [variant], or
  /// `null` when the rule does not match or [variant] is out of range.
  String? get applied {
    final applied = rule.apply(original);
    // Also guard the variant index: previously an out-of-range variant
    // threw a RangeError instead of reporting "no result".
    if (applied == null || variant < 0 || variant >= applied.length) {
      return null;
    }
    return applied[variant];
  }

  @override
  String toString() {
    // Render each child subtree as a '-' bullet, indenting its own lines
    // so nesting depth stays visible.
    final childrenString = children
        .map((c) => ' - ${c.toString().split('\n').join('\n   ')}')
        .join('\n');

    if (children.isEmpty) {
      return '$original (${rule.name}) -> ${applied ?? '<null>'}';
    } else {
      return '$original (${rule.name}) -> ${applied ?? '<null>'}\n$childrenString';
    }
  }
}
|
|
|
|
/// Recursively expands [word] under [parentRule], returning one
/// [Lemmatized] node per matching rule/variant, each carrying its own
/// deeper deconjugations.
List<Lemmatized> _lemmatize(LemmatizationRule parentRule, String word) {
  // Terminal rules end the chain: no children at all.
  if (parentRule.terminal) {
    return [];
  }

  // Restrict candidate rules to the parent's allowed child classes, if any.
  final candidates = parentRule.validChildClasses == null
      ? lemmatizationRules
      : [
          for (final wordClass in parentRule.validChildClasses!)
            ...lemmatizationRulesByWordClass[wordClass]!,
        ];

  return [
    for (final rule in candidates)
      if (rule.matches(word))
        for (final (i, a) in (rule.apply(word) ?? []).indexed)
          Lemmatized(
            original: word,
            rule: rule,
            variant: i,
            children: _lemmatize(rule, a),
          ),
  ];
}
|
|
|
|
/// Lemmatizes [word], returning the root of a tree of all candidate
/// deconjugation paths.
///
/// The root node wraps the raw input in a synthetic pass-through rule of
/// class [WordClass.input].
Lemmatized lemmatize(String word) {
  // Empty-suffix rule with no replacement: matches everything, changes
  // nothing, and serves only to anchor the tree.
  final root = LemmatizationRule.simple(
    name: 'Input',
    pattern: '',
    replacement: null,
    wordClass: WordClass.input,
  );
  return Lemmatized(
    original: word,
    rule: root,
    children: _lemmatize(root, word),
  );
}
|