// jadb/lib/util/lemmatizer/lemmatizer.dart
//
// Rule-based lemmatizer (deconjugator): recursively strips affixes from a
// word to recover candidate base forms.
import 'package:collection/collection.dart';
import 'package:jadb/util/lemmatizer/rules.dart';
/// Grammatical word classes used to constrain which lemmatization rules may
/// chain onto each other.
///
/// The built-in [name] getter provides the canonical string form; use
/// [fromString] to parse that form back (e.g. from CLI arguments).
enum WordClass {
  noun,
  ichidanVerb,
  godanVerb,
  irregularVerb,
  iAdjective,
  nAdjective,
  adverb,
  particle,

  /// Pseudo-class assigned to the raw input word (see [lemmatize]).
  input;

  /// Parses [s] (an enum [name] such as `'godanVerb'`) into a [WordClass].
  ///
  /// Throws an [ArgumentError] for unrecognized names.
  static WordClass fromString(String s) => values.byName(s);
}
/// Whether a rule's affix attaches at the start (prefix) or the end (suffix)
/// of a word.
enum LemmatizationRuleType { prefix, suffix }
/// A single lemmatization (deconjugation) rule.
///
/// A rule matches an affix via [pattern] and, when applied, rewrites a word
/// into one or more candidate base forms. [validChildClasses] restricts which
/// rules may be chained onto the result; [terminal] marks output that should
/// not be deconjugated further.
class LemmatizationRule {
  /// Human-readable rule name (shown in [Lemmatized.toString]).
  final String name;
  final AllomorphPattern pattern;
  final WordClass wordClass;

  /// Word classes a rule applied to this rule's output may have;
  /// `null` means any class is allowed.
  final Set<WordClass>? validChildClasses;

  /// Whether this rule's output is a final base form.
  final bool terminal;
  const LemmatizationRule({
    required this.name,
    required this.pattern,
    required this.wordClass,
    this.validChildClasses,
    this.terminal = false,
  });

  /// Whether [word] matches this rule's [pattern].
  bool matches(String word) => pattern.matches(word);

  /// Applies the rule, returning candidate base forms, or `null` on no match.
  List<String>? apply(String word) => pattern.apply(word);

  /// Convenience constructor for a rule with a single affix and at most one
  /// replacement (`null` [replacement] strips the affix).
  LemmatizationRule.simple({
    required String name,
    required String pattern,
    required String? replacement,
    required WordClass wordClass,
    Set<WordClass>? validChildClasses,
    bool terminal = false,
    List<Pattern> lookAheadBehind = const [''],
    LemmatizationRuleType type = LemmatizationRuleType.suffix,
  }) : this(
          name: name,
          pattern: AllomorphPattern(
            patterns: {
              pattern: replacement != null ? [replacement] : null,
            },
            type: type,
            lookAheadBehind: lookAheadBehind,
          ),
          validChildClasses: validChildClasses,
          terminal: terminal,
          wordClass: wordClass,
        );

  @override
  int get hashCode => Object.hash(
        name,
        pattern,
        wordClass,
        terminal,
        // Hash the set by its contents so that two rules equal under == get
        // equal hash codes. (Previously the raw Set was also mixed in, which
        // hashes by identity and broke the ==/hashCode contract.)
        const SetEquality<WordClass>().hash(validChildClasses),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is LemmatizationRule &&
        other.name == name &&
        other.pattern == pattern &&
        other.wordClass == wordClass &&
        other.terminal == terminal &&
        const SetEquality<WordClass>()
            .equals(validChildClasses, other.validChildClasses);
  }
}
/// Represents a set of patterns for matching allomorphs in a word.
/// The patterns can be either a prefix or a suffix, and they can include
/// replacement characters for deconjugating into base forms.
class AllomorphPattern {
  /// Context patterns combined with each affix: for
  /// [LemmatizationRuleType.prefix] the context follows the affix, for
  /// [LemmatizationRuleType.suffix] it precedes it. The default `['']`
  /// imposes no context requirement.
  final List<Pattern> lookAheadBehind;

  /// Maps each affix to its replacement strings; a `null` value means the
  /// affix is simply stripped.
  final Map<String, List<String>?> patterns;
  final LemmatizationRuleType type;
  const AllomorphPattern({
    required this.patterns,
    required this.type,
    this.lookAheadBehind = const [''],
  });

  /// Convert the [patterns] into regexes
  ///
  /// Each entry pairs the bare affix with the combined affix+context pattern;
  /// string contexts yield plain-string patterns, RegExp contexts yield
  /// anchored regexes with the affix in its own capture group.
  List<(String, Pattern)> get allPatternCombinations {
    final combinations = <(String, Pattern)>[];
    for (final l in lookAheadBehind) {
      for (final p in patterns.keys) {
        switch ((type, l is RegExp)) {
          case (LemmatizationRuleType.prefix, true):
            // Group 1 = affix, group 2 = look-ahead context.
            combinations.add((p, RegExp('^($p)(${(l as RegExp).pattern})')));
          case (LemmatizationRuleType.prefix, false):
            combinations.add((p, '$p$l'));
          case (LemmatizationRuleType.suffix, true):
            // Group 1 = look-behind context, group 2 = affix.
            combinations.add((p, RegExp('(${(l as RegExp).pattern})($p)\$')));
          case (LemmatizationRuleType.suffix, false):
            combinations.add((p, '$l$p'));
        }
      }
    }
    return combinations;
  }

  /// Check whether an input string matches any of the [patterns]
  bool matches(String word) {
    for (final (_, p) in allPatternCombinations) {
      if (p is String) {
        if (type == LemmatizationRuleType.prefix
            ? word.startsWith(p)
            : word.endsWith(p)) {
          return true;
        }
      } else if (p is RegExp) {
        if (p.hasMatch(word)) {
          return true;
        }
      }
    }
    return false;
  }

  /// Apply the replacement for this pattern.
  ///
  /// If none of the [patterns] apply, this function returns `null`.
  List<String>? apply(String word) {
    for (final (affix, p) in allPatternCombinations) {
      switch ((type, p is RegExp)) {
        case (LemmatizationRuleType.prefix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            final prefix = match.group(1)!;
            assert(prefix == affix);
            final suffix = word.substring(prefix.length);
            return patterns[prefix] != null
                ? patterns[prefix]!.map((s) => s + suffix).toList()
                : [suffix];
          }
        case (LemmatizationRuleType.prefix, false):
          if (word.startsWith(p as String)) {
            return patterns[affix] != null
                ? patterns[affix]!
                    .map((s) => s + word.substring(affix.length))
                    .toList()
                : [word.substring(affix.length)];
          }
        case (LemmatizationRuleType.suffix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            final suffix = match.group(2)!;
            assert(suffix == affix);
            final prefix = word.substring(0, word.length - suffix.length);
            return patterns[suffix] != null
                ? patterns[suffix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
        case (LemmatizationRuleType.suffix, false):
          if (word.endsWith(p as String)) {
            final prefix = word.substring(0, word.length - affix.length);
            return patterns[affix] != null
                ? patterns[affix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
      }
    }
    return null;
  }

  @override
  int get hashCode => Object.hash(
        type,
        const ListEquality<Pattern>().hash(lookAheadBehind),
        // Deep hash so that the List values inside [patterns] are hashed by
        // content, matching the deep comparison in ==.
        const DeepCollectionEquality().hash(patterns),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    // MapEquality's default value equality compares the List<String>? values
    // by identity, so equal-but-distinct replacement lists compared unequal;
    // DeepCollectionEquality compares their contents (and handles nulls).
    return other is AllomorphPattern &&
        other.type == type &&
        const ListEquality<Pattern>()
            .equals(other.lookAheadBehind, lookAheadBehind) &&
        const DeepCollectionEquality().equals(other.patterns, patterns);
  }
}
/// A node in the lemmatization tree: [rule] applied to [original], with
/// further deconjugations of the selected result as [children].
class Lemmatized {
  final String original;
  final LemmatizationRule rule;

  /// Index into the list of candidate forms produced by [rule].
  final int variant;
  final List<Lemmatized> children;
  const Lemmatized({
    required this.original,
    required this.rule,
    this.variant = 0,
    this.children = const [],
  });

  /// The deconjugated form selected by [variant], or `null` when the rule
  /// does not apply or [variant] is out of range.
  String? get applied {
    final applied = rule.apply(original);
    // Guard the index: an out-of-range [variant] previously threw RangeError.
    if (applied == null || variant >= applied.length) {
      return null;
    }
    return applied[variant];
  }

  @override
  String toString() {
    final header = '$original (${rule.name}) -> ${applied ?? '<null>'}';
    if (children.isEmpty) {
      return header;
    }
    // Indent each child's (possibly multi-line) rendering by one space and
    // prefix it with a list marker.
    final childrenString = children
        .map((c) => ' - ${c.toString().split('\n').join('\n ')}')
        .join('\n');
    return '$header\n$childrenString';
  }
}
/// Recursively applies every applicable rule to [word], restricted to the
/// word classes allowed by [parentRule].
///
/// Returns one [Lemmatized] child per (rule, variant) combination that
/// matched, each carrying its own sub-tree of further deconjugations.
List<Lemmatized> _lemmatize(LemmatizationRule parentRule, String word) {
  final children = <Lemmatized>[];
  // Terminal rules mark final base forms; do not deconjugate further.
  if (parentRule.terminal) {
    return children;
  }
  // Restrict candidate rules to the word classes the parent rule allows.
  final filteredLemmatizationRules = parentRule.validChildClasses == null
      ? lemmatizationRules
      : [
          for (final wordClass in parentRule.validChildClasses!)
            ...lemmatizationRulesByWordClass[wordClass]!,
        ];
  for (final rule in filteredLemmatizationRules) {
    // apply() returns null on no match, so a separate matches() pre-check
    // would evaluate every pattern combination twice per rule.
    final applied = rule.apply(word);
    if (applied == null) continue;
    for (final (i, a) in applied.indexed) {
      children.add(
        Lemmatized(
          original: word,
          rule: rule,
          variant: i,
          children: _lemmatize(rule, a),
        ),
      );
    }
  }
  return children;
}
/// Root pseudo-rule wrapping the raw input word: matches everything
/// (empty suffix) and performs no replacement. Built once and reused,
/// instead of being reconstructed on every call.
final _inputRule = LemmatizationRule.simple(
  name: 'Input',
  pattern: '',
  replacement: null,
  wordClass: WordClass.input,
);

/// Deconjugates [word], returning the root of its lemmatization tree.
Lemmatized lemmatize(String word) => Lemmatized(
      original: word,
      rule: _inputRule,
      children: _lemmatize(_inputRule, word),
    );