services/metadata_fetchers: init

Co-authored-by: Oystein Kristoffer Tveit <oysteikt@pvv.ntnu.no>
2024-07-27 22:18:59 +02:00 · 2024-07-27 22:18:59 +02:00 · b2432e782e
commit b2432e782e
parent ec448c9f57
8 changed files with 402 additions and 5 deletions
--- a/worblehat/services/bookcase_item.py
+++ b/worblehat/services/bookcase_item.py
@ -3,6 +3,7 @@ import isbnlib
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 from .metadata_fetchers import fetch_metadata_from_multiple_sources
 from ..models import (
    Author,
    BookcaseItem,
@ -25,20 +26,32 @@ def is_valid_isbn(isbn: str) -> bool:
 def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None:
-    metadata = isbnlib.meta(isbn, 'openl')
+    """
-    if len(metadata.keys()) == 0:
+    This function fetches metadata for the given ISBN and creates a BookcaseItem from it.
    It does so using a database connection to connect it to the correct authors and language
    through the sql ORM.
    If no metadata is found, None is returned.
    Please note that the returned BookcaseItem will likely not be fully populated with the required
    data, such as the book's location in the library, and the owner of the book, etc.
    """
    metadata = fetch_metadata_from_multiple_sources(isbn)
    if len(metadata) == 0:
        return None
    metadata = metadata[0]
    bookcase_item = BookcaseItem(
-        name = metadata.get('Title'),
+        name = metadata.title,
        isbn = int(isbn.replace('-', '')),
    )
-    if len(authors := metadata.get('Authors')) > 0:
+    if len(authors := metadata.authors) > 0:
        for author in authors:
            bookcase_item.authors.add(Author(author))
-    if (language := metadata.get('Language')):
+    if (language := metadata.language):
        bookcase_item.language = sql_session.scalars(
            select(Language)
            .where(Language.iso639_1_code == language)
--- a/worblehat/services/metadata_fetchers/BookMetadata.py
+++ b/worblehat/services/metadata_fetchers/BookMetadata.py
@ -0,0 +1,62 @@
 from dataclasses import dataclass
 from typing import Set
 # TODO: Add more languages
 LANGUAGES: set[str] = {
    # Two-letter language codes accepted by BookMetadata.validate()
    "no", "en", "de", "fr", "es", "it", "sv",
    "da", "fi", "ru", "zh", "ja", "ko",
 }
@dataclass
 class BookMetadata:
    """A class representing metadata for a book."""
    isbn: str
    title: str
    # The source of the metadata provider
    source: str
    authors: Set[str]
    language: str | None
    publish_date: str | None
    num_pages: int | None
    subjects: Set[str]
    def to_dict(self) -> dict[str, any]:
        return {
            'isbn': self.isbn,
            'title': self.title,
            'source': self.metadata_source_id(),
            'authors': set() if self.authors is None else self.authors,
            'language': self.language,
            'publish_date': self.publish_date,
            'num_pages': self.num_pages,
            'subjects': set() if self.subjects is None else self.subjects
        }
    def validate(self) -> None:
        if not self.isbn:
            raise ValueError('Missing ISBN')
        if not self.title:
            raise ValueError('Missing title')
        if not self.source:
            raise ValueError('Missing source')
        if not self.authors:
            raise ValueError('Missing authors')
        if self.language is not None and self.language not in LANGUAGES:
            raise ValueError(f'Invalid language: {self.language}. Consider adding it to the LANGUAGES set if you think this is a mistake.')
        if self.num_pages is not None and self.num_pages < 0:
            raise ValueError(f'Invalid number of pages: {self.num_pages}')
--- a/worblehat/services/metadata_fetchers/BookMetadataFetcher.py
+++ b/worblehat/services/metadata_fetchers/BookMetadataFetcher.py
@ -0,0 +1,20 @@
 #base fetcher.
 from abc import ABC, abstractmethod
 from .BookMetadata import BookMetadata
 class BookMetadataFetcher(ABC):
    """
    Common interface for all metadata fetchers.

    Both methods are abstract classmethods, so concrete fetchers are used
    through the class itself.
    """

    @classmethod
    @abstractmethod
    def metadata_source_id(cls) -> str:
        """Returns a unique identifier for the metadata source, used to tell where a piece of metadata came from."""
        ...

    @classmethod
    @abstractmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Tries to fetch metadata for the given ISBN from this source."""
        ...
--- a/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py
+++ b/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py
@ -0,0 +1,51 @@
 """
 A BookMetadataFetcher for the Google Books API.
 """
 import requests
 from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
 from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
 class GoogleBooksFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Google Books volumes API."""

    # Seconds to wait for the API, so that a stalled connection cannot block the caller forever.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        return "google_books"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Looks up the given ISBN and builds a BookMetadata from the first matching volume.

        Returns None if the request fails, if no volume matches,
        or if the response can not be interpreted.
        """
        try:
            response = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params = {"q": f"isbn:{isbn}"},
                timeout = cls.REQUEST_TIMEOUT_SECONDS,
            )
            volume_info = response.json()["items"][0]["volumeInfo"]

            title = volume_info.get("title")
            authors = set(volume_info.get("authors") or [])
            subjects = set(volume_info.get("categories") or [])

            # The volumes API names these fields publishedDate, pageCount and language.
            publish_date = volume_info.get("publishedDate")
            num_pages = volume_info.get("pageCount")
            if num_pages:
                num_pages = int(num_pages)
            language = volume_info.get("language")
        except Exception:
            # Best effort: any network or parsing problem means that this source has no metadata.
            return None

        return BookMetadata(
            isbn = isbn,
            title = title,
            source = cls.metadata_source_id(),
            authors = authors,
            language = language,
            publish_date = publish_date,
            num_pages = num_pages,
            subjects = subjects,
        )
 if __name__ == '__main__':
    # Manual smoke test against the live API.
    book_data = GoogleBooksFetcher.fetch_metadata('0132624788')
    # fetch_metadata returns None on any failure, so check before calling validate().
    if book_data is None:
        raise SystemExit('No metadata found for the test ISBN')
    book_data.validate()
    print(book_data)
--- a/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py
+++ b/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py
@ -0,0 +1,61 @@
 """
 A BookMetadataFetcher for the Open Library API.
 """
 import requests
 from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
 from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
 # Maps language names to two-letter language codes.
 # NOTE(review): not referenced by the OpenLibraryFetcher code visible here; confirm whether it is still needed.
 LANGUAGE_MAP: dict[str, str] = {
    "Norwegian": "no",
 }
 class OpenLibraryFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Open Library JSON endpoints."""

    # Seconds to wait for each request, so that a stalled connection cannot block the caller forever.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        return "open_library"

    @classmethod
    def _fetch_json(cls, url: str):
        """Performs a GET request against the given URL and returns the decoded JSON body."""
        return requests.get(url, timeout = cls.REQUEST_TIMEOUT_SECONDS).json()

    @classmethod
    def _fetch_language(cls, edition: dict) -> str | None:
        """Returns the first iso_639_1 identifier of the edition's first language, or None if it can not be determined."""
        try:
            language_key = edition["languages"][0]["key"]
            language = cls._fetch_json(f"https://openlibrary.org/{language_key}.json")
            return language["identifiers"]["iso_639_1"][0]
        except Exception:
            # The language is optional metadata, so a failed lookup must not discard the whole record.
            return None

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Looks up the edition for the given ISBN, then resolves its authors and language
        through additional requests.

        Returns None if the edition or any of its authors can not be fetched or interpreted.
        A missing or unresolvable language results in language being None.
        """
        try:
            edition = cls._fetch_json(f"https://openlibrary.org/isbn/{isbn}.json")

            author_names = set()
            for author_ref in edition.get("authors") or []:
                key = author_ref.get('key')
                author_name = cls._fetch_json(f"https://openlibrary.org/{key}.json").get("name")
                # Skip author records without a name, rather than putting None into the set.
                if author_name:
                    author_names.add(author_name)

            title = edition.get("title")
            publish_date = edition.get("publish_date")
            num_pages = edition.get("number_of_pages")
            if num_pages:
                num_pages = int(num_pages)
            subjects = set(edition.get("subjects") or [])
        except Exception:
            # Best effort: any network or parsing problem means that this source has no metadata.
            return None

        return BookMetadata(
            isbn = isbn,
            title = title,
            source = cls.metadata_source_id(),
            authors = author_names,
            language = cls._fetch_language(edition),
            publish_date = publish_date,
            num_pages = num_pages,
            subjects = subjects,
        )
 if __name__ == '__main__':
    # Manual smoke test against the live API.
    book_data = OpenLibraryFetcher.fetch_metadata('9788205530751')
    # fetch_metadata returns None on any failure, so check before calling validate().
    if book_data is None:
        raise SystemExit('No metadata found for the test ISBN')
    book_data.validate()
    print(book_data)
--- a/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py
+++ b/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py
@ -0,0 +1,109 @@
 """
 A BookMetadataFetcher that webscrapes https://outland.no/
 """
 from bs4 import BeautifulSoup
 import requests
 from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
 from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
 # Maps the Norwegian language names found in the scraped "Språk" field
 # to the two-letter language codes used in BookMetadata.language.
 LANGUAGE_MAP: dict[str, str] = {
    "Norsk": "no",
    "Engelsk": "en",
    "Tysk": "de",
    "Fransk": "fr",
    "Spansk": "es",
    "Italiensk": "it",
    "Svensk": "sv",
    "Dansk": "da",
    "Finsk": "fi",
    "Russisk": "ru",
    "Kinesisk": "zh",
    "Japansk": "ja",
    "Koreansk": "ko",
 }
 class OutlandScraperFetcher(BookMetadataFetcher):
    """Fetches book metadata by scraping product pages on https://outland.no/"""

    # Seconds to wait for each request, so that a stalled connection cannot block the caller forever.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        return "outland_scraper"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Looks up the given ISBN on outland.no, follows the first product link
        and collects the metadata from the product page.

        Returns None if a request fails or if the pages do not have the expected structure.
        Authors and subjects that are not found are returned as empty sets.
        """
        try:
            # Find the link to the product page
            response = requests.get(f"https://outland.no/{isbn}", timeout = cls.REQUEST_TIMEOUT_SECONDS)
            soup = BeautifulSoup(response.content, "html.parser")
            product_links = soup.find_all("a", class_="product-item-link")
            href = product_links[0].get("href")

            # Find the metadata on the product page
            response = requests.get(href, timeout = cls.REQUEST_TIMEOUT_SECONDS)
            soup = BeautifulSoup(response.content, "html.parser")
            data = soup.find_all("td", class_="col data")

            # Collect the metadata
            title = soup.find_all("span", class_="base")[0].text
            releaseDate = soup.find_all("span", class_="release-date")[0].text.strip()
            releaseDate = releaseDate[-4:] # only keep year

            bookData = {
                "Title": title,
                "PublishDate": releaseDate,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            # Maps our field names to the labels that are searched for in each table cell
            dataKeyMap = {
                "Authors": "Forfattere",
                "NumberOfPages": "Antall Sider",
                "Genre": "Sjanger",
                "Language": "Språk",
                "Subjects": "Serie"
            }

            for value in data:
                # The label is searched for in the cell's markup, not just in its text
                cell_markup = str(value).lower()
                for key, label in dataKeyMap.items():
                    if label.lower() in cell_markup:
                        # Strip surrounding whitespace so that the lookups and splits below see clean values
                        bookData[key] = value.text.strip()
                        break

            if bookData["Language"] is not None:
                bookData["Language"] = LANGUAGE_MAP.get(bookData["Language"])

            if bookData["Authors"] is not None:
                bookData["Authors"] = set(bookData["Authors"].split(", "))

            if bookData["Subjects"] is not None:
                bookData["Subjects"] = set(bookData["Subjects"].split(", "))

            if bookData["NumberOfPages"] is not None:
                bookData["NumberOfPages"] = int(bookData["NumberOfPages"])
        except Exception:
            # Best effort: any network or parsing problem means that this source has no metadata.
            return None

        return BookMetadata(
            isbn = isbn,
            title = bookData.get('Title'),
            source = cls.metadata_source_id(),
            # Always hand over sets, like the other fetchers do, even when nothing was found
            authors = bookData.get('Authors') or set(),
            language = bookData.get('Language'),
            publish_date = bookData.get('PublishDate'),
            num_pages = bookData.get('NumberOfPages'),
            subjects = bookData.get('Subjects') or set(),
        )
 if __name__ == '__main__':
    # Manual smoke test against the live website.
    book_data = OutlandScraperFetcher.fetch_metadata('9781947808225')
    # fetch_metadata returns None on any failure, so check before calling validate().
    if book_data is None:
        raise SystemExit('No metadata found for the test ISBN')
    book_data.validate()
    print(book_data)
--- a/worblehat/services/metadata_fetchers/__init__.py
+++ b/worblehat/services/metadata_fetchers/__init__.py
@ -0,0 +1 @@
 from .book_metadata_fetcher import fetch_metadata_from_multiple_sources
--- a/worblehat/services/metadata_fetchers/book_metadata_fetcher.py
+++ b/worblehat/services/metadata_fetchers/book_metadata_fetcher.py
@ -0,0 +1,80 @@
 """
 This module contains the fetch_metadata_from_multiple_sources() function, which fetches book metadata from multiple sources in threads and returns the valid results ordered by source priority.
 """
 from concurrent.futures import ThreadPoolExecutor
 from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
 from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
 from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher
 from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher
 from worblehat.services.metadata_fetchers.OutlandScraperFetcher import OutlandScraperFetcher
 # The order of these fetchers determines the priority of the sources.
 # The first fetcher in the list has the highest priority.
 # Note that the list holds the fetcher classes themselves, not instances.
 FETCHERS: list[type[BookMetadataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
 ]
 # The source ids in priority order, used by sort_metadata_by_priority() to rank the results.
 FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]
 def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
    """
    Returns a new list with the given metadata ordered by the priority of its source.
    The priority of a source is its position in the FETCHERS list, so the
    result follows the same order as that list.
    """
    def priority(entry) -> int:
        # A linear lookup per entry is fine here, since the number of fetchers is small.
        return FETCHER_SOURCE_IDS.index(entry.source)

    ordered = list(metadata)
    ordered.sort(key=priority)
    return ordered
 def fetch_metadata_from_multiple_sources(isbn: str, strict=False) -> list[BookMetadata]:
    """
    Returns a list of metadata fetched from multiple sources.

    Sources that do not have metadata for the given ISBN will be ignored.
    There is no guarantee that there will be any metadata.
    The results are always ordered in the same way as the fetchers are listed in the FETCHERS list.

    Dashes, underscores and surrounding whitespace are removed from the ISBN before it is used.
    A ValueError is raised if what remains is not a 13 digit ISBN, or a 10 character ISBN
    consisting of digits with an optional 'X' as the last character.

    If strict is set, the ValueError of the first result that fails validation is re-raised.
    Otherwise, invalid results are reported with print() and left out of the returned list.
    """
    isbn = isbn.replace('-', '').replace('_', '').strip().lower()

    body, check_char = isbn[:-1], isbn[-1:]
    is_well_formed = (
        len(isbn) in (10, 13)
        and isbn.isascii()
        and body.isdigit()
        # Only ISBN-10 may use 'X' as its check character, which has been lowercased above.
        and (check_char.isdigit() or (len(isbn) == 10 and check_char == 'x'))
    )
    if not is_well_formed:
        raise ValueError('Invalid ISBN')

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]

    # Leaving the with-block waits for all submitted fetchers, so every future is done here.
    fetched = [future.result() for future in futures]

    # Collect the valid results in a new list, instead of removing entries from
    # the list that is being iterated over, which would skip elements.
    valid_results: list[BookMetadata] = []
    for result in fetched:
        if result is None:
            continue
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f'Invalid metadata: {e}')
        else:
            valid_results.append(result)

    return sort_metadata_by_priority(valid_results)
 if __name__ == '__main__':
    # Manual smoke test: fetch the metadata for a sample ISBN from all sources and pretty-print it.
    from pprint import pprint
    pprint(fetch_metadata_from_multiple_sources('0132624788'))
		`@ -0,0 +1 @@`
							`from .book_metadata_fetcher import fetch_metadata_from_multiple_sources`