From 931bd2d63e285e767c8c81c103dda3c0a63f2965 Mon Sep 17 00:00:00 2001 From: h7x4 Date: Sun, 7 Jun 2026 01:17:52 +0900 Subject: [PATCH] services/metadatafetchers: rename to book_data_fetchers --- .../BookData.py} | 13 +-- .../book_data_fetchers/BookDataFetcher.py | 22 +++++ src/worblehat/book_data_fetchers/__init__.py | 3 + .../book_data_fetchers/book_data_fetcher.py | 72 +++++++++++++++++ .../fetchers}/GoogleBooksFetcher.py | 20 ++--- .../fetchers}/OpenLibraryFetcher.py | 20 ++--- .../fetchers}/OutlandScraperFetcher.py | 20 ++--- .../book_data_fetchers/fetchers/__init__.py | 0 src/worblehat/services/bookcase_item.py | 5 +- .../metadata_fetchers/BookMetadataFetcher.py | 22 ----- .../services/metadata_fetchers/__init__.py | 3 - .../book_metadata_fetcher.py | 80 ------------------- 12 files changed, 129 insertions(+), 151 deletions(-) rename src/worblehat/{services/metadata_fetchers/BookMetadata.py => book_data_fetchers/BookData.py} (83%) create mode 100644 src/worblehat/book_data_fetchers/BookDataFetcher.py create mode 100644 src/worblehat/book_data_fetchers/__init__.py create mode 100644 src/worblehat/book_data_fetchers/book_data_fetcher.py rename src/worblehat/{services/metadata_fetchers => book_data_fetchers/fetchers}/GoogleBooksFetcher.py (66%) rename src/worblehat/{services/metadata_fetchers => book_data_fetchers/fetchers}/OpenLibraryFetcher.py (73%) rename src/worblehat/{services/metadata_fetchers => book_data_fetchers/fetchers}/OutlandScraperFetcher.py (84%) create mode 100644 src/worblehat/book_data_fetchers/fetchers/__init__.py delete mode 100644 src/worblehat/services/metadata_fetchers/BookMetadataFetcher.py delete mode 100644 src/worblehat/services/metadata_fetchers/__init__.py delete mode 100644 src/worblehat/services/metadata_fetchers/book_metadata_fetcher.py diff --git a/src/worblehat/services/metadata_fetchers/BookMetadata.py b/src/worblehat/book_data_fetchers/BookData.py similarity index 83% rename from src/worblehat/services/metadata_fetchers/BookMetadata.py rename to src/worblehat/book_data_fetchers/BookData.py index c87567c..b46700f 100644 --- a/src/worblehat/services/metadata_fetchers/BookMetadata.py +++ b/src/worblehat/book_data_fetchers/BookData.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from typing import Any # TODO: Add more languages LANGUAGES: set[str] = { @@ -19,12 +20,14 @@ LANGUAGES: set[str] = { @dataclass -class BookMetadata: - """A class representing metadata for a book.""" +class BookData: + """ + A class representing metadata for a book that we might want to fetch from external sources + """ isbn: str title: str - # The source of the metadata provider + # ID of the data fetcher used to fetch this instance source: str authors: set[str] language: str | None @@ -32,11 +35,11 @@ class BookMetadata: num_pages: int | None subjects: set[str] - def to_dict(self) -> dict[str, any]: + def to_dict(self) -> dict[str, Any]: return { "isbn": self.isbn, "title": self.title, - "source": self.metadata_source_id(), + "source": self.source, "authors": set() if self.authors is None else self.authors, "language": self.language, "publish_date": self.publish_date, diff --git a/src/worblehat/book_data_fetchers/BookDataFetcher.py b/src/worblehat/book_data_fetchers/BookDataFetcher.py new file mode 100644 index 0000000..9358cd5 --- /dev/null +++ b/src/worblehat/book_data_fetchers/BookDataFetcher.py @@ -0,0 +1,22 @@ +# base fetcher. +from abc import ABC, abstractmethod + +from .BookData import BookData + + +class BookDataFetcher(ABC): + """ + A base class for adapters that fetch book data from external sources. + """ + + @classmethod + @abstractmethod + def fetcher_id(cls) -> str: + """Returns a unique identifier for this specific fetcher, to identify where the data came from.""" + pass + + @classmethod + @abstractmethod + def try_fetch_data(cls, isbn: str) -> BookData | None: + """Tries to fetch data for the given ISBN.""" + pass diff --git a/src/worblehat/book_data_fetchers/__init__.py b/src/worblehat/book_data_fetchers/__init__.py new file mode 100644 index 0000000..fad526f --- /dev/null +++ b/src/worblehat/book_data_fetchers/__init__.py @@ -0,0 +1,3 @@ +from .book_data_fetcher import fetch_book_data_from_multiple_sources + +__all__ = ["fetch_book_data_from_multiple_sources"] diff --git a/src/worblehat/book_data_fetchers/book_data_fetcher.py b/src/worblehat/book_data_fetchers/book_data_fetcher.py new file mode 100644 index 0000000..b69fdac --- /dev/null +++ b/src/worblehat/book_data_fetchers/book_data_fetcher.py @@ -0,0 +1,72 @@ +""" +this module contains the fetch_book_data_from_multiple_sources() function which combines all fetchers and returns ranked results (if any) + +""" + +from concurrent.futures import ThreadPoolExecutor + +from worblehat.book_data_fetchers.BookData import BookData +from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher +from worblehat.book_data_fetchers.fetchers.GoogleBooksFetcher import GoogleBooksFetcher +from worblehat.book_data_fetchers.fetchers.OpenLibraryFetcher import OpenLibraryFetcher +from worblehat.book_data_fetchers.fetchers.OutlandScraperFetcher import ( + OutlandScraperFetcher, +) + +# The order of these fetchers determines the priority of the sources. +# The first fetcher in the list has the highest priority. +FETCHERS: list[BookDataFetcher] = [ + OpenLibraryFetcher, + GoogleBooksFetcher, + OutlandScraperFetcher, +] + + +FETCHER_SOURCE_IDS: list[str] = [fetcher.fetcher_id() for fetcher in FETCHERS] + + +def sort_data_by_priority(data: list[BookData]) -> list[BookData]: + """ + Sorts the given data by the priority of the sources. + + The order of the data is the same as the order of the sources in the FETCHERS list. + """ + + # Note that this function is O(n^2) but the number of fetchers is small so it's fine. + return sorted(data, key=lambda m: FETCHER_SOURCE_IDS.index(m.source)) + + +def fetch_book_data_from_multiple_sources(isbn: str, strict: bool=False) -> list[BookData]: + """ + Returns a list of data fetched from multiple fetchers. + + Fetchers that are not able to retrieve any data for the given ISBN will be ignored. + + There is no guarantee that there will be any book data. + + The results are always ordered in the same way as the fetchers are listed in the FETCHERS list. + """ + isbn = isbn.replace("-", "").replace("_", "").strip().lower() + if len(isbn) != 10 and len(isbn) != 13 and not isbn.isnumeric(): + raise ValueError("Invalid ISBN") + + results: list[BookData] = [] + + with ThreadPoolExecutor() as executor: + futures = [executor.submit(fetcher.try_fetch_data, isbn) for fetcher in FETCHERS] + + for future in futures: + result = future.result() + if result is not None: + results.append(result) + + for result in results: + try: + result.validate() + except ValueError as e: + if strict: + raise e + print(f"Invalid data: {e}") + results.remove(result) + + return sort_data_by_priority(results) diff --git a/src/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py b/src/worblehat/book_data_fetchers/fetchers/GoogleBooksFetcher.py similarity index 66% rename from src/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py rename to src/worblehat/book_data_fetchers/fetchers/GoogleBooksFetcher.py index 498f745..89d09f4 100644 --- a/src/worblehat/services/metadata_fetchers/GoogleBooksFetcher.py +++ b/src/worblehat/book_data_fetchers/fetchers/GoogleBooksFetcher.py @@ -4,17 +4,17 @@ A BookMetadataFetcher for the Google Books API. import requests -from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata -from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher +from worblehat.book_data_fetchers.BookData import BookData +from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher -class GoogleBooksFetcher(BookMetadataFetcher): +class GoogleBooksFetcher(BookDataFetcher): @classmethod - def metadata_source_id(_cls) -> str: + def fetcher_id(_cls) -> str: return "google_books" @classmethod - def fetch_metadata(cls, isbn: str) -> BookMetadata | None: + def try_fetch_data(cls, isbn: str) -> BookData | None: try: jsonInput = requests.get( "https://www.googleapis.com/books/v1/volumes", @@ -33,19 +33,13 @@ class GoogleBooksFetcher(BookMetadataFetcher): except Exception: return None - return BookMetadata( + return BookData( isbn=isbn, title=title, - source=cls.metadata_source_id(), + source=cls.fetcher_id(), authors=authors, language=languages, publish_date=publishDate, num_pages=numberOfPages, subjects=subjects, ) - - -if __name__ == "__main__": - book_data = GoogleBooksFetcher.fetch_metadata("0132624788") - book_data.validate() - print(book_data) diff --git a/src/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py b/src/worblehat/book_data_fetchers/fetchers/OpenLibraryFetcher.py similarity index 73% rename from src/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py rename to src/worblehat/book_data_fetchers/fetchers/OpenLibraryFetcher.py index a8e7d27..870f9a2 100644 --- a/src/worblehat/services/metadata_fetchers/OpenLibraryFetcher.py +++ b/src/worblehat/book_data_fetchers/fetchers/OpenLibraryFetcher.py @@ -4,21 +4,21 @@ A BookMetadataFetcher for the Open Library API. import requests -from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata -from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher +from worblehat.book_data_fetchers.BookData import BookData +from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher LANGUAGE_MAP = { "Norwegian": "no", } -class OpenLibraryFetcher(BookMetadataFetcher): +class OpenLibraryFetcher(BookDataFetcher): @classmethod - def metadata_source_id(_cls) -> str: + def fetcher_id(_cls) -> str: return "open_library" @classmethod - def fetch_metadata(cls, isbn: str) -> BookMetadata | None: + def try_fetch_data(cls, isbn: str) -> BookData | None: try: jsonInput = requests.get(f"https://openlibrary.org/isbn/{isbn}.json").json() @@ -48,19 +48,13 @@ class OpenLibraryFetcher(BookMetadataFetcher): except Exception: return None - return BookMetadata( + return BookData( isbn=isbn, title=title, - source=cls.metadata_source_id(), + source=cls.fetcher_id(), authors=author_names, language=language, publish_date=publishDate, num_pages=numberOfPages, subjects=subjects, ) - - -if __name__ == "__main__": - book_data = OpenLibraryFetcher.fetch_metadata("9788205530751") - book_data.validate() - print(book_data) diff --git a/src/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py b/src/worblehat/book_data_fetchers/fetchers/OutlandScraperFetcher.py similarity index 84% rename from src/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py rename to src/worblehat/book_data_fetchers/fetchers/OutlandScraperFetcher.py index e406903..2f311bb 100644 --- a/src/worblehat/services/metadata_fetchers/OutlandScraperFetcher.py +++ b/src/worblehat/book_data_fetchers/fetchers/OutlandScraperFetcher.py @@ -5,8 +5,8 @@ A BookMetadataFetcher that webscrapes https://outland.no/ import requests from bs4 import BeautifulSoup -from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata -from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher +from worblehat.book_data_fetchers.BookData import BookData +from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher LANGUAGE_MAP = { "Norsk": "no", @@ -25,13 +25,13 @@ LANGUAGE_MAP = { } -class OutlandScraperFetcher(BookMetadataFetcher): +class OutlandScraperFetcher(BookDataFetcher): @classmethod - def metadata_source_id(_cls) -> str: + def fetcher_id(_cls) -> str: return "outland_scraper" @classmethod - def fetch_metadata(cls, isbn: str) -> BookMetadata | None: + def try_fetch_data(cls, isbn: str) -> BookData | None: try: # Find the link to the product page response = requests.get(f"https://outland.no/{isbn}") @@ -89,19 +89,13 @@ class OutlandScraperFetcher(BookMetadataFetcher): except Exception: return None - return BookMetadata( + return BookData( isbn=isbn, title=bookData.get("Title"), - source=cls.metadata_source_id(), + source=cls.fetcher_id(), authors=bookData.get("Authors"), language=bookData.get("Language"), publish_date=bookData.get("PublishDate"), num_pages=bookData.get("NumberOfPages"), subjects=bookData.get("Subjects"), ) - - -if __name__ == "__main__": - book_data = OutlandScraperFetcher.fetch_metadata("9781947808225") - book_data.validate() - print(book_data) diff --git a/src/worblehat/book_data_fetchers/fetchers/__init__.py b/src/worblehat/book_data_fetchers/fetchers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/worblehat/services/bookcase_item.py b/src/worblehat/services/bookcase_item.py index b081dd5..bf8091b 100644 --- a/src/worblehat/services/bookcase_item.py +++ b/src/worblehat/services/bookcase_item.py @@ -2,12 +2,13 @@ import isbnlib from sqlalchemy import select from sqlalchemy.orm import Session +from worblehat.book_data_fetchers import fetch_book_data_from_multiple_sources + from ..models import ( Author, BookcaseItem, Language, ) -from .metadata_fetchers import fetch_metadata_from_multiple_sources def is_valid_pvv_isbn(isbn: str) -> bool: @@ -41,7 +42,7 @@ def create_bookcase_item_from_isbn( Please not that the returned BookcaseItem will likely not be fully populated with the required data, such as the book's location in the library, and the owner of the book, etc. """ - metadata = fetch_metadata_from_multiple_sources(isbn) + metadata = fetch_book_data_from_multiple_sources(isbn) if len(metadata) == 0: return None diff --git a/src/worblehat/services/metadata_fetchers/BookMetadataFetcher.py b/src/worblehat/services/metadata_fetchers/BookMetadataFetcher.py deleted file mode 100644 index cc0251c..0000000 --- a/src/worblehat/services/metadata_fetchers/BookMetadataFetcher.py +++ /dev/null @@ -1,22 +0,0 @@ -# base fetcher. -from abc import ABC, abstractmethod - -from .BookMetadata import BookMetadata - - -class BookMetadataFetcher(ABC): - """ - A base class for metadata fetchers. - """ - - @classmethod - @abstractmethod - def metadata_source_id(cls) -> str: - """Returns a unique identifier for the metadata source, to identify where the metadata came from.""" - pass - - @classmethod - @abstractmethod - def fetch_metadata(cls, isbn: str) -> BookMetadata | None: - """Tries to fetch metadata for the given ISBN.""" - pass diff --git a/src/worblehat/services/metadata_fetchers/__init__.py b/src/worblehat/services/metadata_fetchers/__init__.py deleted file mode 100644 index f7ecf33..0000000 --- a/src/worblehat/services/metadata_fetchers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .book_metadata_fetcher import fetch_metadata_from_multiple_sources - -__all__ = ["fetch_metadata_from_multiple_sources"] diff --git a/src/worblehat/services/metadata_fetchers/book_metadata_fetcher.py b/src/worblehat/services/metadata_fetchers/book_metadata_fetcher.py deleted file mode 100644 index f5d4b30..0000000 --- a/src/worblehat/services/metadata_fetchers/book_metadata_fetcher.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -this module contains the fetch_book_metadata() function which fetches book metadata from multiple sources in threads and returns the higest ranked non-None result. - -""" - -from concurrent.futures import ThreadPoolExecutor - -from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata -from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher -from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher -from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher -from worblehat.services.metadata_fetchers.OutlandScraperFetcher import ( - OutlandScraperFetcher, -) - -# The order of these fetchers determines the priority of the sources. -# The first fetcher in the list has the highest priority. -FETCHERS: list[BookMetadataFetcher] = [ - OpenLibraryFetcher, - GoogleBooksFetcher, - OutlandScraperFetcher, -] - - -FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS] - - -def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]: - """ - Sorts the given metadata by the priority of the sources. - - The order of the metadata is the same as the order of the sources in the FETCHERS list. - """ - - # Note that this function is O(n^2) but the number of fetchers is small so it's fine. - return sorted(metadata, key=lambda m: FETCHER_SOURCE_IDS.index(m.source)) - - -def fetch_metadata_from_multiple_sources(isbn: str, strict: bool=False) -> list[BookMetadata]: - """ - Returns a list of metadata fetched from multiple sources. - - Sources that does not have metadata for the given ISBN will be ignored. - - There is no guarantee that there will be any metadata. - - The results are always ordered in the same way as the fetchers are listed in the FETCHERS list. - """ - isbn = isbn.replace("-", "").replace("_", "").strip().lower() - if len(isbn) != 10 and len(isbn) != 13 and not isbn.isnumeric(): - raise ValueError("Invalid ISBN") - - results: list[BookMetadata] = [] - - with ThreadPoolExecutor() as executor: - futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS] - - for future in futures: - result = future.result() - if result is not None: - results.append(result) - - for result in results: - try: - result.validate() - except ValueError as e: - if strict: - raise e - print(f"Invalid metadata: {e}") - results.remove(result) - - return sort_metadata_by_priority(results) - - -if __name__ == "__main__": - from pprint import pprint - - isbn = "0132624788" - metadata = fetch_metadata_from_multiple_sources(isbn) - pprint(metadata)