diff --git a/worblehat/services/bookcase_item.py b/worblehat/services/bookcase_item.py index 769c8c0..a6f5841 100644 --- a/worblehat/services/bookcase_item.py +++ b/worblehat/services/bookcase_item.py @@ -3,6 +3,7 @@ import isbnlib from sqlalchemy import select from sqlalchemy.orm import Session +from .metadata_fetchers import fetch_metadata_from_multiple_sources from ..models import ( Author, BookcaseItem, @@ -25,20 +26,32 @@ def is_valid_isbn(isbn: str) -> bool: def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None: - metadata = isbnlib.meta(isbn, 'openl') - if len(metadata.keys()) == 0: + """ + This function fetches metadata for the given ISBN and creates a BookcaseItem from it. + It does so using a database connection to connect it to the correct authors and language + through the sql ORM. + + If no metadata is found, None is returned. + + Please not that the returned BookcaseItem will likely not be fully populated with the required + data, such as the book's location in the library, and the owner of the book, etc. 
+ """ + metadata = fetch_metadata_from_multiple_sources(isbn) + if len(metadata) == 0: return None + metadata = metadata[0] + bookcase_item = BookcaseItem( - name = metadata.get('Title'), + name = metadata.title, isbn = int(isbn.replace('-', '')), ) - if len(authors := metadata.get('Authors')) > 0: + if len(authors := metadata.authors) > 0: for author in authors: bookcase_item.authors.add(Author(author)) - if (language := metadata.get('Language')): + if (language := metadata.language): bookcase_item.language = sql_session.scalars( select(Language) .where(Language.iso639_1_code == language) diff --git a/worblehat/services/metadata_fetchers/BookMetadata.py b/worblehat/services/metadata_fetchers/BookMetadata.py new file mode 100644 index 0000000..5616948 --- /dev/null +++ b/worblehat/services/metadata_fetchers/BookMetadata.py @@ -0,0 +1,62 @@ +from dataclasses import dataclass +from typing import Set + + +# TODO: Add more languages +LANGUAGES: set[str] = set([ + "no", + "en", + "de", + "fr", + "es", + "it", + "sv", + "da", + "fi", + "ru", + "zh", + "ja", + "ko", +]) + + +@dataclass +class BookMetadata: + """A class representing metadata for a book.""" + isbn: str + title: str + # The source of the metadata provider + source: str + authors: Set[str] + language: str | None + publish_date: str | None + num_pages: int | None + subjects: Set[str] + + def to_dict(self) -> dict[str, any]: + return { + 'isbn': self.isbn, + 'title': self.title, + 'source': self.metadata_source_id(), + 'authors': set() if self.authors is None else self.authors, + 'language': self.language, + 'publish_date': self.publish_date, + 'num_pages': self.num_pages, + 'subjects': set() if self.subjects is None else self.subjects + } + + def validate(self) -> None: + if not self.isbn: + raise ValueError('Missing ISBN') + if not self.title: + raise ValueError('Missing title') + if not self.source: + raise ValueError('Missing source') + if not self.authors: + raise ValueError('Missing authors') + + if 
self.language is not None and self.language not in LANGUAGES: + raise ValueError(f'Invalid language: {self.language}. Consider adding it to the LANGUAGES set if you think this is a mistake.') + + if self.num_pages is not None and self.num_pages < 0: + raise ValueError(f'Invalid number of pages: {self.num_pages}') diff --git a/worblehat/services/metadata_fetchers/BookMetadataFetcher.py b/worblehat/services/metadata_fetchers/BookMetadataFetcher.py new file mode 100644 index 0000000..fe289e5 --- /dev/null +++ b/worblehat/services/metadata_fetchers/BookMetadataFetcher.py @@ -0,0 +1,20 @@ +#base fetcher. +from abc import ABC, abstractmethod +from .BookMetadata import BookMetadata + +class BookMetadataFetcher(ABC): + """ + A base class for metadata fetchers. + """ + + @classmethod + @abstractmethod + def metadata_source_id(cls) -> str: + """Returns a unique identifier for the metadata source, to identify where the metadata came from.""" + pass + + @classmethod + @abstractmethod + def fetch_metadata(cls, isbn: str) -> BookMetadata | None: + """Tries to fetch metadata for the given ISBN.""" + pass \ No newline at end of file diff --git a/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py b/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py new file mode 100644 index 0000000..31182da --- /dev/null +++ b/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py @@ -0,0 +1,51 @@ +""" +A BookMetadataFetcher for the Google Books API. 
class GoogleBooksFetcher(BookMetadataFetcher):
    """BookMetadataFetcher backed by the Google Books volumes API."""

    @classmethod
    def metadata_source_id(cls) -> str:
        """Return the identifier stored in ``BookMetadata.source`` for this fetcher."""
        return "google_books"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Look up ``isbn`` in Google Books and return its metadata.

        Only the first search hit is used. Returns ``None`` when the request
        fails, the response contains no items, or the hit cannot be parsed.
        """
        try:
            payload = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": f"isbn:{isbn}"},
                # Without a timeout a stalled connection blocks indefinitely.
                timeout=10,
            ).json()
            data = payload["items"][0]["volumeInfo"]

            authors = set(data.get("authors") or [])
            title = data.get("title")
            # volumeInfo names these fields publishedDate / pageCount /
            # language; the Open Library style names (publish_date,
            # number_of_pages, languages) never match and always gave None.
            publish_date = data.get("publishedDate")
            num_pages = data.get("pageCount")
            if num_pages:
                num_pages = int(num_pages)
            subjects = set(data.get("categories") or [])
            language = data.get("language")
        except Exception:
            # Deliberate best effort: any network or parsing failure means
            # "this source has nothing for this ISBN".
            return None

        return BookMetadata(
            isbn=isbn,
            title=title,
            source=cls.metadata_source_id(),
            authors=authors,
            language=language,
            publish_date=publish_date,
            num_pages=num_pages,
            subjects=subjects,
        )


if __name__ == '__main__':
    book_data = GoogleBooksFetcher.fetch_metadata('0132624788')
    # fetch_metadata returns None on failure; guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
LANGUAGE_MAP = {
    "Norwegian": "no",
}


class OpenLibraryFetcher(BookMetadataFetcher):
    """BookMetadataFetcher backed by the Open Library JSON endpoints."""

    @classmethod
    def metadata_source_id(cls) -> str:
        """Return the identifier stored in ``BookMetadata.source`` for this fetcher."""
        return "open_library"

    @staticmethod
    def _fetch_record(key: str) -> dict:
        """GET ``https://openlibrary.org/<key>.json`` and decode the JSON body."""
        # Presumably record keys already start with "/" (e.g. "/authors/...");
        # stripping it avoids a double slash in the URL - TODO confirm.
        url = f"https://openlibrary.org/{key.lstrip('/')}.json"
        # Without a timeout a stalled connection blocks indefinitely.
        return requests.get(url, timeout=10).json()

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Look up ``isbn`` on Open Library and return its metadata.

        Author names and the language code are resolved with one extra
        request per referenced record. The language is optional: when the
        edition lists no language, ``language`` is ``None``. Returns
        ``None`` when any request or parsing step fails.
        """
        try:
            record = cls._fetch_record(f"isbn/{isbn}")

            author_names = set()
            for author_ref in record.get("authors") or []:
                author_name = cls._fetch_record(author_ref.get('key')).get("name")
                # Skip author records without a name instead of storing None.
                if author_name:
                    author_names.add(author_name)

            title = record.get("title")
            publish_date = record.get("publish_date")

            num_pages = record.get("number_of_pages")
            if num_pages:
                num_pages = int(num_pages)

            # A missing language must not discard the whole record.
            language = None
            language_refs = record.get("languages") or []
            if language_refs:
                language_record = cls._fetch_record(language_refs[0].get("key"))
                iso_codes = (language_record.get("identifiers") or {}).get("iso_639_1") or []
                if iso_codes:
                    language = iso_codes[0]

            subjects = set(record.get("subjects") or [])

        except Exception:
            # Deliberate best effort: any network or parsing failure means
            # "this source has nothing for this ISBN".
            return None

        return BookMetadata(
            isbn=isbn,
            title=title,
            source=cls.metadata_source_id(),
            authors=author_names,
            language=language,
            publish_date=publish_date,
            num_pages=num_pages,
            subjects=subjects,
        )


if __name__ == '__main__':
    book_data = OpenLibraryFetcher.fetch_metadata('9788205530751')
    # fetch_metadata returns None on failure; guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
# Maps the language label shown on the product page to a two-letter code.
LANGUAGE_MAP = {
    "Norsk": "no",
    "Engelsk": "en",
    "Tysk": "de",
    "Fransk": "fr",
    "Spansk": "es",
    "Italiensk": "it",
    "Svensk": "sv",
    "Dansk": "da",
    "Finsk": "fi",
    "Russisk": "ru",
    "Kinesisk": "zh",
    "Japansk": "ja",
    "Koreansk": "ko",
}


class OutlandScraperFetcher(BookMetadataFetcher):
    """BookMetadataFetcher that scrapes product pages on https://outland.no/."""

    @classmethod
    def metadata_source_id(cls) -> str:
        """Return the identifier stored in ``BookMetadata.source`` for this fetcher."""
        return "outland_scraper"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Scrape outland.no for ``isbn`` and return the metadata found.

        The ISBN page is fetched first and the first ``product-item-link``
        on it is followed to the product page, whose table cells are then
        matched against known labels. Returns ``None`` when any request or
        parsing step fails.
        """
        try:
            # Find the link to the product page
            # (a timeout keeps a stalled connection from blocking forever)
            response = requests.get(f"https://outland.no/{isbn}", timeout=10)
            search_page = BeautifulSoup(response.content, "html.parser")
            product_links = search_page.find_all("a", class_="product-item-link")
            href = product_links[0].get("href")

            # Find the metadata on the product page
            response = requests.get(href, timeout=10)
            product_page = BeautifulSoup(response.content, "html.parser")
            cells = product_page.find_all("td", class_="col data")

            # Collect the metadata
            title = product_page.find_all("span", class_="base")[0].text

            release_date = product_page.find_all("span", class_="release-date")[0].text.strip()
            release_date = release_date[-4:]  # only keep year

            book_data = {
                "Title": title,
                "PublishDate": release_date,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            # Internal field name -> label looked for in the cell's markup.
            data_key_map = {
                "Authors": "Forfattere",
                "NumberOfPages": "Antall Sider",
                "Genre": "Sjanger",
                "Language": "Språk",
                "Subjects": "Serie",
            }

            for cell in cells:
                # Match against the full markup of the cell, not just its
                # text; the first label that matches claims the cell.
                cell_markup = str(cell).lower()
                for key, label in data_key_map.items():
                    if label.lower() in cell_markup:
                        book_data[key] = cell.text
                        break

            if book_data["Language"] is not None:
                # Strip surrounding whitespace so the label lookup can match.
                book_data["Language"] = LANGUAGE_MAP.get(book_data["Language"].strip())

            if book_data["Authors"] is not None:
                book_data["Authors"] = set(book_data["Authors"].split(", "))

            if book_data["Subjects"] is not None:
                book_data["Subjects"] = set(book_data["Subjects"].split(", "))

            if book_data["NumberOfPages"] is not None:
                book_data["NumberOfPages"] = int(book_data["NumberOfPages"])

        except Exception:
            # Deliberate best effort: any network or parsing failure means
            # "this source has nothing for this ISBN".
            return None

        return BookMetadata(
            isbn=isbn,
            title=book_data.get('Title'),
            source=cls.metadata_source_id(),
            # BookMetadata declares these as sets; use an empty set, not
            # None, when the page lists no value.
            authors=book_data.get('Authors') or set(),
            language=book_data.get('Language'),
            publish_date=book_data.get('PublishDate'),
            num_pages=book_data.get('NumberOfPages'),
            subjects=book_data.get('Subjects') or set(),
        )


if __name__ == '__main__':
    book_data = OutlandScraperFetcher.fetch_metadata('9781947808225')
    # fetch_metadata returns None on failure; guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
# The order of these fetchers determines the priority of the sources.
# The first fetcher in the list has the highest priority.
# The list holds the fetcher classes themselves, not instances.
FETCHERS: list[type[BookMetadataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
]


FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]


def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
    """
    Sorts the given metadata by the priority of the sources.

    The order of the metadata is the same as the order of the sources in the FETCHERS list.
    """

    # Note that this function is O(n^2) but the number of fetchers is small so it's fine.
    return sorted(metadata, key=lambda m: FETCHER_SOURCE_IDS.index(m.source))


def fetch_metadata_from_multiple_sources(isbn: str, strict=False) -> list[BookMetadata]:
    """
    Returns a list of metadata fetched from multiple sources.

    Sources that do not have metadata for the given ISBN are ignored.

    There is no guarantee that there will be any metadata.

    The results are always ordered in the same way as the fetchers are listed in the FETCHERS list.

    Hyphens, underscores and surrounding whitespace are removed from the
    ISBN and it is lower-cased before it is passed to the fetchers.

    Raises:
        ValueError: if the normalized ISBN is neither 10 nor 13 characters
            of digits (an ISBN-10 may end in 'x'), or, when ``strict`` is
            true, if any fetched metadata fails validation.
    """
    isbn = isbn.replace('-', '').replace('_', '').strip().lower()
    # The checks must be combined so that a wrong length OR a non-digit
    # character is rejected; chaining them with `and` let input such as
    # '123' through.
    is_isbn13 = len(isbn) == 13 and isbn.isascii() and isbn.isdigit()
    is_isbn10 = (
        len(isbn) == 10
        and isbn.isascii()
        and isbn[:9].isdigit()
        and (isbn[9].isdigit() or isbn[9] == 'x')
    )
    if not (is_isbn10 or is_isbn13):
        raise ValueError('Invalid ISBN')

    # Query every source concurrently; leaving the with-block waits for all.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]

    fetched = [result for result in (future.result() for future in futures) if result is not None]

    # Build a new list instead of removing from the one being iterated;
    # removing in place skipped the element after each removed one.
    valid: list[BookMetadata] = []
    for result in fetched:
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f'Invalid metadata: {e}')
        else:
            valid.append(result)

    return sort_metadata_by_priority(valid)


if __name__ == '__main__':
    from pprint import pprint
    isbn = '0132624788'
    metadata = fetch_metadata_from_multiple_sources(isbn)
    pprint(metadata)