services/metadata_fetchers: init

Co-authored-by: Oystein Kristoffer Tveit <oysteikt@pvv.ntnu.no>
2024-07-27 22:18:59 +02:00
parent ec448c9f57
commit b2432e782e
8 changed files with 402 additions and 5 deletions
--- a/worblehat/services/bookcase_item.py
+++ b/worblehat/services/bookcase_item.py
@@ -3,6 +3,7 @@ import isbnlib
 from sqlalchemy import select
 from sqlalchemy.orm import Session

+from .metadata_fetchers import fetch_metadata_from_multiple_sources
 from ..models import (
    Author,
    BookcaseItem,
@@ -25,20 +26,32 @@ def is_valid_isbn(isbn: str) -> bool:


 def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None:
-    metadata = isbnlib.meta(isbn, 'openl')
-    if len(metadata.keys()) == 0:
+    """
+    This function fetches metadata for the given ISBN and creates a BookcaseItem from it.
+    It does so using a database connection to connect it to the correct authors and language
+    through the sql ORM.
+
+    If no metadata is found, None is returned.
+
+    Please note that the returned BookcaseItem will likely not be fully populated with the required
+    data, such as the book's location in the library, and the owner of the book, etc.
+    """
+    metadata = fetch_metadata_from_multiple_sources(isbn)
+    if len(metadata) == 0:
        return None

+    metadata = metadata[0]
+
    bookcase_item = BookcaseItem(
-        name = metadata.get('Title'),
+        name = metadata.title,
        isbn = int(isbn.replace('-', '')),
    )

-    if len(authors := metadata.get('Authors')) > 0:
+    if len(authors := metadata.authors) > 0:
        for author in authors:
            bookcase_item.authors.add(Author(author))

-    if (language := metadata.get('Language')):
+    if (language := metadata.language):
        bookcase_item.language = sql_session.scalars(
            select(Language)
            .where(Language.iso639_1_code == language)
--- a/worblehat/services/metadata_fetchers/BookMetadata.py
+++ b/worblehat/services/metadata_fetchers/BookMetadata.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+from typing import Set
+
+
+# Language codes accepted by BookMetadata.validate(). TODO: Add more languages
+LANGUAGES: set[str] = {
+    "no",
+    "en",
+    "de",
+    "fr",
+    "es",
+    "it",
+    "sv",
+    "da",
+    "fi",
+    "ru",
+    "zh",
+    "ja",
+    "ko",
+}
+
+
+@dataclass
+class BookMetadata:
+    """A class representing metadata for a book, as reported by a single metadata source."""
+    isbn: str
+    title: str
+    # The id of the metadata provider this record came from
+    source: str
+    authors: Set[str]
+    language: str | None
+    publish_date: str | None
+    num_pages: int | None
+    subjects: Set[str]
+
+    def to_dict(self) -> dict[str, object]:
+        """Returns the fields as a dict, with missing (None) authors/subjects as empty sets."""
+        return {
+            'isbn': self.isbn,
+            'title': self.title,
+            'source': self.source,
+            'authors': set() if self.authors is None else self.authors,
+            'language': self.language,
+            'publish_date': self.publish_date,
+            'num_pages': self.num_pages,
+            'subjects': set() if self.subjects is None else self.subjects
+        }
+
+    def validate(self) -> None:
+        """Raises ValueError if a required field is empty, or if the language or page count is invalid."""
+        if not self.isbn:
+            raise ValueError('Missing ISBN')
+        if not self.title:
+            raise ValueError('Missing title')
+        if not self.source:
+            raise ValueError('Missing source')
+        if not self.authors:
+            raise ValueError('Missing authors')
+        if self.language is not None and self.language not in LANGUAGES:
+            raise ValueError(f'Invalid language: {self.language}. Consider adding it to the LANGUAGES set if you think this is a mistake.')
+        if self.num_pages is not None and self.num_pages < 0:
+            raise ValueError(f'Invalid number of pages: {self.num_pages}')
--- a/worblehat/services/metadata_fetchers/BookMetadataFetcher.py
+++ b/worblehat/services/metadata_fetchers/BookMetadataFetcher.py
@@ -0,0 +1,20 @@
+#base fetcher.
+from abc import ABC, abstractmethod
+from .BookMetadata import BookMetadata
+
+class BookMetadataFetcher(ABC):
+    """
+    An abstract base class for metadata fetchers. Both methods are classmethods, so no instance is needed.
+    """
+
+    @classmethod
+    @abstractmethod
+    def metadata_source_id(cls) -> str:
+        """Returns a unique identifier for the metadata source, to identify where the metadata came from."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
+        """Tries to fetch metadata for the given ISBN. The return type allows None when nothing was fetched."""
+        pass
--- a/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py
+++ b/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py
@@ -0,0 +1,51 @@
+"""
+A BookMetadataFetcher for the Google Books API.
+"""
+
+import requests
+
+from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
+from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
+
+
+class GoogleBooksFetcher(BookMetadataFetcher):
+    @classmethod
+    def metadata_source_id(cls) -> str:
+        return "google_books"
+
+    @classmethod
+    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
+        """Returns metadata for the first volume matching the ISBN, or None if the lookup fails."""
+        try:
+            response = requests.get(
+                "https://www.googleapis.com/books/v1/volumes",
+                params = {"q": f"isbn:{isbn}"},
+                timeout = 10,  # seconds; without it a stalled request would block forever
+            ).json()
+            data = response.get("items")[0].get("volumeInfo")  # raises (-> None) when nothing matched
+            authors = set(data.get("authors") or [])
+            title = data.get("title")
+            publish_date = data.get("publishedDate")  # volumeInfo keys are camelCase
+            num_pages = data.get("pageCount")
+            num_pages = int(num_pages) if num_pages else num_pages
+            subjects = set(data.get("categories") or [])
+            language = data.get("language")
+        except Exception:
+            return None
+
+        return BookMetadata(
+            isbn = isbn,
+            title = title,
+            source = cls.metadata_source_id(),
+            authors = authors,
+            language = language,
+            publish_date = publish_date,
+            num_pages = num_pages,
+            subjects = subjects,
+        )
+
+
+if __name__ == '__main__':  # manual smoke test: performs a real request to the Google Books API
+    book_data = GoogleBooksFetcher.fetch_metadata('0132624788')
+    book_data.validate()  # NOTE: fetch_metadata may return None, which raises AttributeError here
+    print(book_data)
--- a/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py
+++ b/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py
@@ -0,0 +1,61 @@
+"""
+A BookMetadataFetcher for the Open Library API.
+"""
+
+import requests
+
+from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
+from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
+
+LANGUAGE_MAP = {  # NOTE(review): not referenced anywhere else in this file
+    "Norwegian": "no",
+}
+
+
+class OpenLibraryFetcher(BookMetadataFetcher):
+    @classmethod
+    def metadata_source_id(cls) -> str:
+        return "open_library"
+
+    @classmethod
+    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
+        """Returns metadata for the edition with the given ISBN, or None if the lookup fails."""
+        try:
+            edition = requests.get(f"https://openlibrary.org/isbn/{isbn}.json", timeout = 10).json()
+            author_names = set()
+            for author_ref in edition.get("authors") or []:
+                key = author_ref.get('key').lstrip('/')  # avoid a double slash in the URL below
+                author_name = requests.get(f"https://openlibrary.org/{key}.json", timeout = 10).json().get("name")
+                if author_name:
+                    author_names.add(author_name)
+
+            title = edition.get("title")
+            publish_date = edition.get("publish_date")
+            num_pages = edition.get("number_of_pages")
+            if num_pages:
+                num_pages = int(num_pages)
+
+            language = None  # optional: an edition without a language must not discard the record
+            if languages := edition.get("languages"):
+                language_key = languages[0].get("key").lstrip('/')
+                language = requests.get(f"https://openlibrary.org/{language_key}.json", timeout = 10).json().get("identifiers").get("iso_639_1")[0]
+            subjects = set(edition.get("subjects") or [])
+        except Exception:
+            return None
+
+        return BookMetadata(
+            isbn = isbn,
+            title = title,
+            source = cls.metadata_source_id(),
+            authors = author_names,
+            language = language,
+            publish_date = publish_date,
+            num_pages = num_pages,
+            subjects = subjects,
+        )
+
+
+if __name__ == '__main__':  # manual smoke test: performs real requests to openlibrary.org
+    book_data = OpenLibraryFetcher.fetch_metadata('9788205530751')
+    book_data.validate()  # NOTE: fetch_metadata may return None, which raises AttributeError here
+    print(book_data)
--- a/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py
+++ b/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py
@@ -0,0 +1,109 @@
+"""
+A BookMetadataFetcher that webscrapes https://outland.no/
+"""
+
+from bs4 import BeautifulSoup
+
+import requests
+
+from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
+from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
+
+
+LANGUAGE_MAP = {  # Norwegian language names -> the two-letter codes passed on as BookMetadata.language
+    "Norsk": "no",
+    "Engelsk": "en",
+    "Tysk": "de",
+    "Fransk": "fr",
+    "Spansk": "es",
+    "Italiensk": "it",
+    "Svensk": "sv",
+    "Dansk": "da",
+    "Finsk": "fi",
+    "Russisk": "ru",
+    "Kinesisk": "zh",
+    "Japansk": "ja",
+    "Koreansk": "ko",
+}
+
+
+class OutlandScraperFetcher(BookMetadataFetcher):
+    @classmethod
+    def metadata_source_id(cls) -> str:
+        return "outland_scraper"
+
+    @classmethod
+    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
+        """Scrapes the outland.no product page found for the ISBN; returns None if scraping fails."""
+        try:
+            # Find the link to the product page
+            response = requests.get(f"https://outland.no/{isbn}", timeout = 10)
+            search_page = BeautifulSoup(response.content, "html.parser")
+            product_links = search_page.find_all("a", class_="product-item-link")
+            href = product_links[0].get("href")
+
+            # Find the metadata on the product page
+            response = requests.get(href, timeout = 10)
+            soup = BeautifulSoup(response.content, "html.parser")
+            data = soup.find_all("td", class_="col data")
+
+            # Collect the metadata
+            title = soup.find_all("span", class_="base")[0].text
+            release_date = soup.find_all("span", class_="release-date")[0].text.strip()
+            release_date = release_date[-4:] # only keep year
+
+            bookData = {
+                "Title": title,
+                "PublishDate": release_date,
+                "Authors": None,
+                "NumberOfPages": None,
+                "Genre": None,
+                "Language": None,
+                "Subjects": None,
+            }
+
+            dataKeyMap = {
+                "Authors": "Forfattere",
+                "NumberOfPages": "Antall Sider",
+                "Genre": "Sjanger",
+                "Language": "Språk",
+                "Subjects": "Serie"
+            }
+
+            for value in data:
+                for key, label in dataKeyMap.items():
+                    if label.lower() in str(value).lower():  # match the label in the cell's markup
+                        bookData[key] = value.text
+                        break
+
+            if bookData["Language"] is not None:
+                bookData["Language"] = LANGUAGE_MAP.get(bookData["Language"])
+
+            if bookData["Authors"] is not None:
+                bookData["Authors"] = set(bookData["Authors"].split(", "))
+
+            if bookData["Subjects"] is not None:
+                bookData["Subjects"] = set(bookData["Subjects"].split(", "))
+
+            if bookData["NumberOfPages"] is not None:
+                bookData["NumberOfPages"] = int(bookData["NumberOfPages"])
+
+        except Exception:
+            return None
+
+        return BookMetadata(
+            isbn = isbn,
+            title = bookData.get('Title'),
+            source = cls.metadata_source_id(),
+            authors = bookData.get('Authors') or set(),  # a row that was not found leaves None
+            language = bookData.get('Language'),
+            publish_date = bookData.get('PublishDate'),
+            num_pages = bookData.get('NumberOfPages'),
+            subjects = bookData.get('Subjects') or set(),  # BookMetadata declares a set here
+        )
+
+
+if __name__ == '__main__':  # manual smoke test: performs real requests to outland.no
+    book_data = OutlandScraperFetcher.fetch_metadata('9781947808225')
+    book_data.validate()  # NOTE: fetch_metadata may return None, which raises AttributeError here
+    print(book_data)
--- a/worblehat/services/metadata_fetchers/__init__.py
+++ b/worblehat/services/metadata_fetchers/__init__.py
@@ -0,0 +1 @@
+from .book_metadata_fetcher import fetch_metadata_from_multiple_sources
--- a/worblehat/services/metadata_fetchers/book_metadata_fetcher.py
+++ b/worblehat/services/metadata_fetchers/book_metadata_fetcher.py
@@ -0,0 +1,80 @@
+"""
+This module contains the fetch_metadata_from_multiple_sources() function, which fetches book metadata from multiple sources in threads and returns all valid results, ordered by source priority.
+
+"""
+
+from concurrent.futures import ThreadPoolExecutor
+
+from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
+from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
+
+from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher
+from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher
+from worblehat.services.metadata_fetchers.OutlandScraperFetcher import OutlandScraperFetcher
+
+
+# The order of these fetchers determines the priority of the sources.
+# The first fetcher in the list has the highest priority.
+FETCHERS: list[type[BookMetadataFetcher]] = [
+    OpenLibraryFetcher,
+    GoogleBooksFetcher,
+    OutlandScraperFetcher,
+]
+
+
+FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]
+
+
+def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
+    """
+    Returns a new list with the given metadata sorted by the priority of its source, i.e. by the
+    position of the source in the FETCHERS list. The input list is left unmodified.
+    Raises ValueError if a source is not the id of one of the fetchers in FETCHERS.
+    """
+
+    # Note that this function is O(n^2) but the number of fetchers is small so it's fine.
+    return sorted(metadata, key=lambda m: FETCHER_SOURCE_IDS.index(m.source))
+
+
+def fetch_metadata_from_multiple_sources(isbn: str, strict=False) -> list[BookMetadata]:
+    """
+    Returns a list of metadata fetched from multiple sources.
+
+    Sources that do not have metadata for the given ISBN are ignored, so the list may be empty.
+    The results are always ordered in the same way as the fetchers are listed in the FETCHERS list.
+
+    Metadata that fails validation is reported and dropped, or re-raised as a ValueError if `strict` is set.
+    A ValueError is also raised if the ISBN is not 10 or 13 digits long (an ISBN-10 may end in X).
+    """
+    isbn = isbn.replace('-', '').replace('_', '').strip().lower()
+    # An ISBN-10 check digit may be 'X' (lowercased above); everything else must be a digit.
+    digits = isbn[:-1] if len(isbn) == 10 and isbn.endswith('x') else isbn
+    if len(isbn) not in (10, 13) or not digits.isdecimal():
+        raise ValueError('Invalid ISBN')
+
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]
+
+    # Leaving the `with` block waits for every fetcher, so all futures are done here.
+    fetched = [result for future in futures if (result := future.result()) is not None]
+
+    results: list[BookMetadata] = []
+    for result in fetched:
+        try:
+            result.validate()
+        except ValueError as e:
+            if strict:
+                raise
+            # Build a new list instead of removing from the one being iterated.
+            print(f'Invalid metadata: {e}')
+        else:
+            results.append(result)
+
+    return sort_metadata_by_priority(results)
+
+
+if __name__ == '__main__':  # manual smoke test: runs every fetcher for a sample ISBN
+    from pprint import pprint
+    isbn = '0132624788'
+    metadata = fetch_metadata_from_multiple_sources(isbn)
+    pprint(metadata)
				`@@ -0,0 +1 @@`
				`from .book_metadata_fetcher import fetch_metadata_from_multiple_sources`