lib/search/word_search: score and order results by several metrics

2025-05-16 18:46:39 +02:00
parent 90d5717928
commit 33cf5028f4
4 changed files with 198 additions and 46 deletions
--- a/lib/models/word_search/word_search_result.dart
+++ b/lib/models/word_search/word_search_result.dart
@@ -7,6 +7,9 @@ import 'package:jadb/models/word_search/word_search_sources.dart';

 /// A class representing a single dictionary entry from a word search.
 class WordSearchResult {
+  /// The score of the entry, used for sorting results.
+  final int score;
+
  /// The ID of the entry in the database.
  final int entryId;

@@ -32,6 +35,7 @@ class WordSearchResult {
  final WordSearchSources sources;

  const WordSearchResult({
+    required this.score,
    required this.entryId,
    required this.isCommon,
    required this.japanese,
@@ -43,6 +47,7 @@ class WordSearchResult {
  });

  Map<String, dynamic> toJson() => {
+        '_score': score,
        'entryId': entryId,
        'isCommon': isCommon,
        'japanese': japanese.map((e) => e.toJson()).toList(),
@@ -57,6 +62,7 @@ class WordSearchResult {

  factory WordSearchResult.fromJson(Map<String, dynamic> json) =>
      WordSearchResult(
+        score: json['_score'] as int,
        entryId: json['entryId'] as int,
        isCommon: json['isCommon'] as bool,
        japanese: (json['japanese'] as List<dynamic>)
--- a/lib/search/word_search/entry_id_query.dart
+++ b/lib/search/word_search/entry_id_query.dart
@@ -1,7 +1,16 @@
+import 'package:jadb/_data_ingestion/jmdict/table_names.dart';
+import 'package:jadb/_data_ingestion/tanos-jlpt/table_names.dart';
 import 'package:jadb/search/word_search/word_search.dart';
 import 'package:jadb/util/text_filtering.dart';
 import 'package:sqflite_common/sqlite_api.dart';

+/// Pairs the ID of a dictionary entry with the score attached to it.
+class ScoredEntryId {
+  /// Creates a pairing of [entryId] and [score].
+  const ScoredEntryId(this.entryId, this.score);
+
+  /// The ID of the entry.
+  final int entryId;
+
+  /// The score associated with [entryId].
+  final int score;
+}
+
 SearchMode _determineSearchMode(String word) {
  final bool containsKanji = kanjiRegex.hasMatch(word);
  final bool containsAscii = RegExp(r'[A-Za-z]').hasMatch(word);
@@ -19,7 +28,123 @@ SearchMode _determineSearchMode(String word) {
  }
 }

-Future<List<int>> fetchEntryIds(
+/// Builds the scored lookup query for [word] against the `reading` column of
+/// [tableName] and of its companion table named `${tableName}FTS`.
+///
+/// Returns a record of the SQL text and the positional arguments to bind to
+/// its `?` placeholders, in order.
+///
+/// The query is made of two parts combined with `UNION ALL`:
+///
+/// 1. The `fts_results` CTE selects rows of the FTS table whose `reading`
+///    satisfies `MATCH word || '*'`, joined to [tableName] on
+///    (`entryId`, `reading`). Base score is 150 when the FTS `reading` equals
+///    [word], otherwise 100. The CTE keeps the [pageSize] highest-scoring
+///    rows.
+/// 2. Rows of [tableName] whose `reading` contains [word]
+///    (`LIKE '%word%'`) and whose `entryId` is not already in `fts_results`.
+///    Base score is 50.
+///
+/// Both parts add the same bonuses on top of the base score:
+///
+/// * JLPT: the text after the first character of `jlptLevel` (defaulting to
+///   `'N0'` when the left-joined JLPT tag is missing) is multiplied by -5 and
+///   subtracted, i.e. 5 points are added per unit of that number. Presumably
+///   levels are stored like `'N5'` — TODO confirm against the JLPT table.
+/// * +50 when any of `news`, `ichi`, `spec` or `gai` equals 1.
+/// * For each of `news`, `ichi`, `spec` and `gai`: +10 when the column is 1,
+///   +5 when it is 2.
+/// * +20 when `orderNum` equals 1.
+///
+/// The combined rows are ordered by `score` descending, then `entryId`
+/// ascending, and limited to [pageSize]. The query contains no `OFFSET`
+/// clause, so it always yields the first [pageSize] rows.
+///
+/// [word] and [pageSize] are bound as parameters. [tableName] is interpolated
+/// directly into the SQL text, so it must be a trusted identifier.
+///
+/// NOTE(review): [word] reaches `MATCH` unmodified; confirm that FTS query
+/// syntax characters in user input are handled before this is called.
+(String, List<Object?>) _kanjiReadingTemplate(
+  String tableName,
+  String word,
+  int pageSize,
+) =>
+    (
+      '''
+        WITH
+          fts_results AS (
+            SELECT
+              "${tableName}FTS"."entryId",
+              CASE
+                WHEN "${tableName}FTS"."reading" = ? THEN 150
+                ELSE 100
+              END
+              - (substr(COALESCE("${TanosJLPTTableNames.jlptTag}"."jlptLevel", 'N0'), 2) * -5)
+              + CASE
+                WHEN "${tableName}"."news" = 1
+                  OR "${tableName}"."ichi" = 1
+                  OR "${tableName}"."spec" = 1
+                  OR "${tableName}"."gai" = 1
+                  THEN 50
+                ELSE 0
+              END
+              + CASE
+                WHEN "${tableName}"."news" = 1 THEN 10
+                WHEN "${tableName}"."news" = 2 THEN 5
+                ELSE 0
+              END
+              + CASE
+                WHEN "${tableName}"."ichi" = 1 THEN 10
+                WHEN "${tableName}"."ichi" = 2 THEN 5
+                ELSE 0
+              END
+              + CASE
+                WHEN "${tableName}"."spec" = 1 THEN 10
+                WHEN "${tableName}"."spec" = 2 THEN 5
+                ELSE 0
+              END
+              + CASE
+                WHEN "${tableName}"."gai" = 1 THEN 10
+                WHEN "${tableName}"."gai" = 2 THEN 5
+                ELSE 0
+              END
+              + CASE
+                WHEN "${tableName}"."orderNum" = 1 THEN 20
+                ELSE 0
+              END
+              AS "score"
+            FROM "${tableName}FTS"
+            LEFT JOIN "${TanosJLPTTableNames.jlptTag}" USING ("entryId")
+            JOIN "${tableName}" USING ("entryId", "reading")
+            WHERE "${tableName}FTS"."reading" MATCH ? || '*'
+            ORDER BY "score" DESC
+            LIMIT ?
+          )
+
+        SELECT *
+        FROM "fts_results"
+        UNION ALL
+        SELECT
+          "entryId",
+          50
+          - (substr(COALESCE("${TanosJLPTTableNames.jlptTag}"."jlptLevel", 'N0'), 2) * -5)
+          + CASE
+            WHEN "${tableName}"."news" = 1
+              OR "${tableName}"."ichi" = 1
+              OR "${tableName}"."spec" = 1
+              OR "${tableName}"."gai" = 1
+              THEN 50
+            ELSE 0
+          END
+          + CASE
+            WHEN "${tableName}"."news" = 1 THEN 10
+            WHEN "${tableName}"."news" = 2 THEN 5
+            ELSE 0
+          END
+          + CASE
+            WHEN "${tableName}"."ichi" = 1 THEN 10
+            WHEN "${tableName}"."ichi" = 2 THEN 5
+            ELSE 0
+          END
+          + CASE
+            WHEN "${tableName}"."spec" = 1 THEN 10
+            WHEN "${tableName}"."spec" = 2 THEN 5
+            ELSE 0
+          END
+          + CASE
+            WHEN "${tableName}"."gai" = 1 THEN 10
+            WHEN "${tableName}"."gai" = 2 THEN 5
+            ELSE 0
+          END
+          + CASE
+            WHEN "orderNum" = 1 THEN 20
+            ELSE 0
+          END
+          AS "score"
+        FROM "${tableName}"
+        LEFT JOIN "${TanosJLPTTableNames.jlptTag}" USING ("entryId")
+        WHERE "reading" LIKE '%' || ? || '%'
+          AND "entryId" NOT IN (SELECT "entryId" FROM "fts_results")
+        ORDER BY
+          "score" DESC,
+          "entryId" ASC
+        LIMIT ?
+      '''
+          .trim(),
+      // Positional arguments, listed in the order the `?` placeholders occur.
+      [
+        word, // exact-match comparison in the fts_results score
+        word, // term of the FTS MATCH, suffixed with '*' in SQL
+        pageSize, // LIMIT inside the fts_results CTE
+        word, // substring for the LIKE pattern
+        pageSize, // LIMIT on the combined result
+      ]
+    );
+
+Future<List<ScoredEntryId>> fetchEntryIds(
  DatabaseExecutor connection,
  String word,
  SearchMode searchMode,
@@ -35,41 +160,60 @@ Future<List<int>> fetchEntryIds(
    'Word should not be empty when fetching entry IDs',
  );

-  late final List<int> entryIds;
-  if (searchMode == SearchMode.Kanji) {
-    entryIds = (await connection.query(
-      'JMdict_EntryByKanji',
-      columns: ['entryId'],
-      where: 'kanji LIKE ?',
-      whereArgs: ['%$word%'],
-      limit: pageSize,
-      offset: offset,
-    ))
-        .map((row) => row['entryId'] as int)
-        .toList();
-  } else if (searchMode == SearchMode.Kana) {
-    entryIds = (await connection.query(
-      'JMdict_EntryByKana',
-      columns: ['entryId'],
-      where: 'kana LIKE ?',
-      whereArgs: ['%$word%'],
-      limit: pageSize,
-      offset: offset,
-    ))
-        .map((row) => row['entryId'] as int)
-        .toList();
-  } else {
-    entryIds = (await connection.query(
-      'JMdict_EntryByEnglish',
-      columns: ['entryId'],
-      where: 'english LIKE ?',
-      whereArgs: ['%$word%'],
-      limit: pageSize,
-      offset: offset,
-    ))
-        .map((row) => row['entryId'] as int)
-        .toList();
+  late final List<ScoredEntryId> entryIds;
+  switch (searchMode) {
+    case SearchMode.Kanji:
+      final (query, args) = _kanjiReadingTemplate(
+        JMdictTableNames.kanjiElement,
+        word,
+        pageSize,
+      );
+      entryIds = (await connection.rawQuery(query, args))
+          .map((row) => ScoredEntryId(
+                row['entryId'] as int,
+                row['score'] as int,
+              ))
+          .toList();
+      break;
+
+    case SearchMode.Kana:
+      final (query, args) = _kanjiReadingTemplate(
+        JMdictTableNames.readingElement,
+        word,
+        pageSize,
+      );
+      entryIds = (await connection.rawQuery(query, args))
+          .map((row) => ScoredEntryId(
+                row['entryId'] as int,
+                row['score'] as int,
+              ))
+          .toList();
+      break;
+
+    case SearchMode.English:
+      entryIds = (await connection.query(
+        JMdictTableNames.senseGlossary,
+        columns: ['entryId'],
+        where: 'english LIKE ?',
+        whereArgs: ['%$word%'],
+        limit: pageSize,
+        offset: offset,
+      ))
+          .map((row) => ScoredEntryId(
+                row['entryId'] as int,
+                0,
+              ))
+          .toList();
+      break;
+
+    case SearchMode.MixedKana:
+    case SearchMode.MixedKanji:
+    default:
+      throw UnimplementedError(
+        'Search mode $searchMode is not implemented',
+      );
  }
+  ;

  return entryIds;
 }
--- a/lib/search/word_search/regrouping.dart
+++ b/lib/search/word_search/regrouping.dart
@@ -12,9 +12,10 @@ import 'package:jadb/models/word_search/word_search_sense.dart';
 import 'package:jadb/models/word_search/word_search_sense_language_source.dart';
 import 'package:jadb/models/word_search/word_search_sources.dart';
 import 'package:jadb/models/word_search/word_search_xref_entry.dart';
+import 'package:jadb/search/word_search/entry_id_query.dart';

 List<WordSearchResult> regroupWordSearchResults({
-  required List<int> entryIds,
+  required List<ScoredEntryId> entryIds,
  required List<Map<String, Object?>> readingElements,
  required List<Map<String, Object?>> kanjiElements,
  required List<Map<String, Object?>> jlptTags,
@@ -41,17 +42,17 @@ List<WordSearchResult> regroupWordSearchResults({
  final commonEntryIds =
      commonEntries.map((entry) => entry['entryId'] as int).toSet();

-  for (final entryId in entryIds) {
+  for (final scoredEntryId in entryIds) {
    final List<Map<String, Object?>> entryReadingElements = readingElements
-        .where((element) => element['entryId'] == entryId)
+        .where((element) => element['entryId'] == scoredEntryId.entryId)
        .toList();

    final List<Map<String, Object?>> entryKanjiElements = kanjiElements
-        .where((element) => element['entryId'] == entryId)
+        .where((element) => element['entryId'] == scoredEntryId.entryId)
        .toList();

    final List<Map<String, Object?>> entryJlptTags =
-        jlptTags.where((element) => element['entryId'] == entryId).toList();
+        jlptTags.where((element) => element['entryId'] == scoredEntryId.entryId).toList();

    final jlptLevel = entryJlptTags
            .map((e) => JlptLevel.fromString(e['jlptLevel'] as String?))
@@ -59,13 +60,13 @@ List<WordSearchResult> regroupWordSearchResults({
            .firstOrNull ??
        JlptLevel.none;

-    final isCommon = commonEntryIds.contains(entryId);
+    final isCommon = commonEntryIds.contains(scoredEntryId.entryId);

    final List<Map<String, Object?>> entrySenses =
-        senses.where((element) => element['entryId'] == entryId).toList();
+        senses.where((element) => element['entryId'] == scoredEntryId.entryId).toList();

    final GroupedWordResult entryReadingElementsGrouped = _regroup_words(
-      entryId: entryId,
+      entryId: scoredEntryId.entryId,
      readingElements: entryReadingElements,
      kanjiElements: entryKanjiElements,
      readingElementInfos: readingElementInfos,
@@ -91,7 +92,8 @@ List<WordSearchResult> regroupWordSearchResults({

    results.add(
      WordSearchResult(
-        entryId: entryId,
+        score: scoredEntryId.score,
+        entryId: scoredEntryId.entryId,
        isCommon: isCommon,
        japanese: entryReadingElementsGrouped.rubys,
        kanjiInfo: entryReadingElementsGrouped.kanjiInfos,
--- a/lib/search/word_search/word_search.dart
+++ b/lib/search/word_search/word_search.dart
@@ -33,7 +33,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(
  }

  final offset = page * pageSize;
-  final List<int> entryIds = await fetchEntryIds(
+  final List<ScoredEntryId> entryIds = await fetchEntryIds(
    connection,
    word,
    searchMode,
@@ -47,7 +47,7 @@ Future<List<WordSearchResult>?> searchWordWithDbConnection(

  final LinearWordQueryData linearWordQueryData = await fetchLinearWordQueryData(
    connection,
-    entryIds,
+    entryIds.map((e) => e.entryId).toList(),
  );

  final result = regroupWordSearchResults(