288 lines
7.9 KiB
Dart
288 lines
7.9 KiB
Dart
import 'package:collection/collection.dart';
|
|
import 'package:jadb/util/lemmatizer/rules.dart';
|
|
|
|
/// Grammatical classes that lemmatization rules can produce or consume.
enum WordClass {
  noun,
  ichidanVerb,
  godanVerb,
  iAdjective,
  irregularVerb,
  nAdjective,
  adverb,
  particle,

  /// Pseudo-class used for the raw input word at the root of a
  /// lemmatization tree (see `lemmatize`).
  input;

  /// Parses a [WordClass] from its member name (e.g. `'godanVerb'`),
  /// the inverse of the built-in [name] getter — usable by the CLI.
  ///
  /// Throws an [ArgumentError] if [value] names no member.
  static WordClass fromString(String value) => values.byName(value);
}
|
|
|
|
/// Whether a rule's affixes attach at the start of a word (prefix)
/// or at its end (suffix).
enum LemmatizationRuleType { prefix, suffix }
|
|
|
|
/// A single deconjugation rule: a named [AllomorphPattern] together with the
/// [WordClass] it produces and constraints on which rules may follow it.
class LemmatizationRule {
  /// Human-readable rule name, used in debug output.
  final String name;

  /// The affix pattern this rule matches and rewrites.
  final AllomorphPattern pattern;

  /// The word class of the deconjugated form this rule yields.
  final WordClass wordClass;

  /// Word classes that child rules are allowed to produce;
  /// `null` means any class may follow.
  final Set<WordClass>? validChildClasses;

  /// If `true`, no further rules are applied after this one.
  final bool terminal;

  const LemmatizationRule({
    required this.name,
    required this.pattern,
    required this.wordClass,
    this.validChildClasses,
    this.terminal = false,
  });

  /// Whether [pattern] matches [word].
  bool matches(String word) => pattern.matches(word);

  /// Applies [pattern] to [word]; `null` if nothing matched.
  List<String>? apply(String word) => pattern.apply(word);

  /// Convenience constructor for a rule with a single affix/replacement
  /// pair. A `null` [replacement] strips the affix without substituting.
  LemmatizationRule.simple({
    required String name,
    required String pattern,
    required String? replacement,
    required WordClass wordClass,
    Set<WordClass>? validChildClasses,
    bool terminal = false,
    List<Pattern> lookAheadBehind = const [''],
    LemmatizationRuleType type = LemmatizationRuleType.suffix,
  }) : this(
          name: name,
          pattern: AllomorphPattern(
            patterns: {
              pattern: replacement != null ? [replacement] : null,
            },
            type: type,
            lookAheadBehind: lookAheadBehind,
          ),
          validChildClasses: validChildClasses,
          terminal: terminal,
          wordClass: wordClass,
        );

  @override
  int get hashCode => Object.hash(
        name,
        pattern,
        wordClass,
        terminal,
        // Hash the set by content only. Previously the raw set was ALSO
        // passed to Object.hash (identity hash), so two rules equal under
        // the SetEquality-based == below could hash differently, breaking
        // the hashCode/== contract.
        const SetEquality<WordClass>().hash(validChildClasses),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is LemmatizationRule &&
        other.name == name &&
        other.pattern == pattern &&
        other.wordClass == wordClass &&
        other.terminal == terminal &&
        const SetEquality<WordClass>()
            .equals(validChildClasses, other.validChildClasses);
  }
}
|
|
|
|
/// Represents a set of patterns for matching allomorphs in a word.
/// The patterns can be either a prefix or a suffix, and they can include
/// replacement characters for deconjugating into base forms.
class AllomorphPattern {
  /// Context required around each affix: following it for prefixes,
  /// preceding it for suffixes. String entries are matched literally,
  /// [RegExp] entries as regexes. The default — a single empty string —
  /// imposes no context requirement.
  final List<Pattern> lookAheadBehind;

  /// Maps each affix to its replacement forms. A `null` value means the
  /// affix is stripped without substituting anything.
  final Map<String, List<String>?> patterns;

  /// Whether the affixes attach at the start or the end of a word.
  final LemmatizationRuleType type;

  const AllomorphPattern({
    required this.patterns,
    required this.type,
    this.lookAheadBehind = const [''],
  });

  /// Convert the [patterns] into regexes.
  ///
  /// Produces every (affix, matcher) pair from the cross product of
  /// [patterns] keys and [lookAheadBehind] contexts. Recomputed on each
  /// access: the class is `const`-constructible, so the result cannot be
  /// cached in a field.
  List<(String, Pattern)> get allPatternCombinations {
    final combinations = <(String, Pattern)>[];
    for (final l in lookAheadBehind) {
      for (final p in patterns.keys) {
        switch ((type, l is RegExp)) {
          case (LemmatizationRuleType.prefix, true):
            // Escape the affix: keys are treated as literal strings in
            // every other branch (startsWith/endsWith), so regex
            // metacharacters in them must not alter the match here.
            combinations.add(
              (p, RegExp('^(${RegExp.escape(p)})(${(l as RegExp).pattern})')),
            );
            break;
          case (LemmatizationRuleType.prefix, false):
            combinations.add((p, '$p$l'));
            break;
          case (LemmatizationRuleType.suffix, true):
            combinations.add(
              (p, RegExp('(${(l as RegExp).pattern})(${RegExp.escape(p)})\$')),
            );
            break;
          case (LemmatizationRuleType.suffix, false):
            combinations.add((p, '$l$p'));
            break;
        }
      }
    }
    return combinations;
  }

  /// Check whether an input string matches any of the [patterns].
  bool matches(String word) {
    for (final (_, p) in allPatternCombinations) {
      if (p is String) {
        // Literal matcher: the combined affix+context string must sit at
        // the relevant end of the word.
        if (type == LemmatizationRuleType.prefix
            ? word.startsWith(p)
            : word.endsWith(p)) {
          return true;
        }
      } else if (p is RegExp) {
        if (p.hasMatch(word)) {
          return true;
        }
      }
    }
    return false;
  }

  /// Apply the replacement for this pattern.
  ///
  /// If none of the [patterns] apply, this function returns `null`.
  List<String>? apply(String word) {
    for (final (affix, p) in allPatternCombinations) {
      switch ((type, p is RegExp)) {
        case (LemmatizationRuleType.prefix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            // Group 1 is the affix itself; the look-ahead context stays
            // part of the remaining word.
            final prefix = match.group(1)!;
            assert(prefix == affix);
            final suffix = word.substring(prefix.length);
            return patterns[prefix] != null
                ? patterns[prefix]!.map((s) => s + suffix).toList()
                : [suffix];
          }
          break;
        case (LemmatizationRuleType.prefix, false):
          if (word.startsWith(p as String)) {
            // p includes the literal context; strip only the affix part.
            return patterns[affix] != null
                ? patterns[affix]!
                    .map((s) => s + word.substring(affix.length))
                    .toList()
                : [word.substring(affix.length)];
          }
          break;
        case (LemmatizationRuleType.suffix, true):
          final match = (p as RegExp).firstMatch(word);
          if (match != null) {
            // Group 2 is the affix; the look-behind context remains part
            // of the kept stem.
            final suffix = match.group(2)!;
            assert(suffix == affix);
            final prefix = word.substring(0, word.length - suffix.length);
            return patterns[suffix] != null
                ? patterns[suffix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
          break;
        case (LemmatizationRuleType.suffix, false):
          if (word.endsWith(p as String)) {
            final prefix = word.substring(0, word.length - affix.length);
            return patterns[affix] != null
                ? patterns[affix]!.map((s) => prefix + s).toList()
                : [prefix];
          }
          break;
      }
    }
    return null;
  }

  @override
  int get hashCode => Object.hash(
        type,
        const ListEquality<Pattern>().hash(lookAheadBehind),
        // Deep hash so replacement lists are hashed by content,
        // consistent with == below.
        const DeepCollectionEquality().hash(patterns),
      );

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is AllomorphPattern &&
        other.type == type &&
        const ListEquality<Pattern>()
            .equals(other.lookAheadBehind, lookAheadBehind) &&
        // MapEquality's default value equality compared the List<String>?
        // replacement lists by identity, so structurally equal patterns
        // were unequal; DeepCollectionEquality compares them by content.
        const DeepCollectionEquality().equals(other.patterns, patterns);
  }
}
|
|
|
|
/// One node in a lemmatization tree: a surface form, the rule applied to
/// it, which replacement variant was chosen, and deeper deconjugations.
class Lemmatized {
  /// The surface form this node deconjugates.
  final String original;

  /// The rule applied to [original].
  final LemmatizationRule rule;

  /// Index into the rule's replacement list selecting which variant this
  /// node represents.
  final int variant;

  /// Further deconjugations of this node's [applied] form.
  final List<Lemmatized> children;

  const Lemmatized({
    required this.original,
    required this.rule,
    this.variant = 0,
    this.children = const [],
  });

  /// The result of applying [rule] to [original] for this [variant], or
  /// `null` when the rule does not match or [variant] is out of range.
  String? get applied {
    final applied = rule.apply(original);
    // Also guard the variant index: previously an out-of-range variant
    // threw a RangeError instead of reporting "no result".
    if (applied == null || variant < 0 || variant >= applied.length) {
      return null;
    }
    return applied[variant];
  }

  @override
  String toString() {
    // Render each child subtree as a '-' bullet, indenting its own lines
    // so nesting depth stays visible.
    final childrenString = children
        .map((c) => ' - ${c.toString().split('\n').join('\n   ')}')
        .join('\n');

    if (children.isEmpty) {
      return '$original (${rule.name}) -> ${applied ?? '<null>'}';
    } else {
      return '$original (${rule.name}) -> ${applied ?? '<null>'}\n$childrenString';
    }
  }
}
|
|
|
|
/// Recursively expands [word] under [parentRule], returning one
/// [Lemmatized] node per matching rule/variant, each carrying its own
/// deeper deconjugations.
List<Lemmatized> _lemmatize(LemmatizationRule parentRule, String word) {
  // Terminal rules end the chain: no children at all.
  if (parentRule.terminal) {
    return [];
  }

  // Restrict candidate rules to the parent's allowed child classes, if any.
  final candidates = parentRule.validChildClasses == null
      ? lemmatizationRules
      : [
          for (final wordClass in parentRule.validChildClasses!)
            ...lemmatizationRulesByWordClass[wordClass]!,
        ];

  return [
    for (final rule in candidates)
      if (rule.matches(word))
        for (final (i, a) in (rule.apply(word) ?? []).indexed)
          Lemmatized(
            original: word,
            rule: rule,
            variant: i,
            children: _lemmatize(rule, a),
          ),
  ];
}
|
|
|
|
/// Lemmatizes [word], returning the root of a tree of all candidate
/// deconjugation paths.
///
/// The root node wraps the raw input in a synthetic pass-through rule of
/// class [WordClass.input].
Lemmatized lemmatize(String word) {
  // Empty-suffix rule with no replacement: matches everything, changes
  // nothing, and serves only to anchor the tree.
  final root = LemmatizationRule.simple(
    name: 'Input',
    pattern: '',
    replacement: null,
    wordClass: WordClass.input,
  );
  return Lemmatized(
    original: word,
    rule: root,
    children: _lemmatize(root, word),
  );
}
|