Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
10bffd53e8
|
|||
|
a91aa3c617
|
|||
|
779312cd9f
|
@@ -1,22 +0,0 @@
|
||||
# base fetcher.
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .BookData import BookData
|
||||
|
||||
|
||||
class BookDataFetcher(ABC):
    """
    Abstract interface for adapters that fetch book data from external sources.

    Both hooks are abstract classmethods, so they can be called on a concrete
    fetcher class without creating an instance.
    """

    @classmethod
    @abstractmethod
    def fetcher_id(cls) -> str:
        """Return the unique identifier of this fetcher, used to tell where data came from."""
        ...

    @classmethod
    @abstractmethod
    def try_fetch_data(cls, isbn: str) -> BookData | None:
        """Try to fetch data for the given ISBN."""
        ...
|
||||
@@ -1,3 +0,0 @@
|
||||
from .book_data_fetcher import fetch_book_data_from_multiple_sources
|
||||
|
||||
__all__ = ["fetch_book_data_from_multiple_sources"]
|
||||
@@ -1,72 +0,0 @@
|
||||
"""
|
||||
This module contains the fetch_book_data_from_multiple_sources() function, which queries all fetchers and returns their results ranked by source priority (if any).
|
||||
|
||||
"""
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from worblehat.book_data_fetchers.BookData import BookData
|
||||
from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher
|
||||
from worblehat.book_data_fetchers.fetchers.GoogleBooksFetcher import GoogleBooksFetcher
|
||||
from worblehat.book_data_fetchers.fetchers.OpenLibraryFetcher import OpenLibraryFetcher
|
||||
from worblehat.book_data_fetchers.fetchers.OutlandScraperFetcher import (
|
||||
OutlandScraperFetcher,
|
||||
)
|
||||
|
||||
# Source priority is given by position: the first fetcher in the list has the
# highest priority and the last one the lowest.
# The entries are fetcher classes, not instances, hence type[...].
FETCHERS: list[type[BookDataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
]

# Fetcher ids in the same priority order as FETCHERS.
FETCHER_SOURCE_IDS: list[str] = [fetcher.fetcher_id() for fetcher in FETCHERS]
|
||||
|
||||
|
||||
def sort_data_by_priority(data: list[BookData]) -> list[BookData]:
    """
    Return a new list with the given data ordered by source priority.

    An entry's priority is the position of its ``source`` in
    FETCHER_SOURCE_IDS, so the output follows the order of the FETCHERS list.
    The input list is left untouched.
    """

    def priority(entry):
        # list.index is a linear scan per entry; acceptable because the
        # number of fetchers is small.
        return FETCHER_SOURCE_IDS.index(entry.source)

    ranked = list(data)
    ranked.sort(key=priority)
    return ranked
|
||||
|
||||
|
||||
def fetch_book_data_from_multiple_sources(isbn: str, strict: bool = False) -> list[BookData]:
    """
    Fetch data for ``isbn`` from every fetcher in FETCHERS and return the valid results.

    The ISBN is normalized first: hyphens and underscores are removed,
    surrounding whitespace is stripped and the text is lower-cased. All
    fetchers are then queried concurrently in a thread pool. Fetchers that
    return None are ignored, so the returned list may be empty.

    Each result is checked with its ``validate()`` method. When ``strict`` is
    false, a result that fails validation is reported with ``print`` and
    dropped; when ``strict`` is true, the ValueError is re-raised instead.

    The results are always ordered in the same way as the fetchers are listed
    in the FETCHERS list.

    Raises:
        ValueError: If the normalized ISBN is not 13 digits, or 10 characters
            made of nine digits followed by a digit or the check character
            "x"; or if ``strict`` is true and a fetched result fails validation.
    """
    isbn = isbn.replace("-", "").replace("_", "").strip().lower()

    # The previous check chained its three tests with "and", so it only
    # rejected input that had the wrong length *and* was non-numeric.
    if len(isbn) == 13:
        well_formed = isbn.isascii() and isbn.isdigit()
    elif len(isbn) == 10:
        # An ISBN-10 may end in the check character "X" (lower-cased above).
        well_formed = (
            isbn.isascii()
            and isbn[:9].isdigit()
            and (isbn[9].isdigit() or isbn[9] == "x")
        )
    else:
        well_formed = False
    if not well_formed:
        raise ValueError("Invalid ISBN")

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.try_fetch_data, isbn) for fetcher in FETCHERS]
        fetched = [future.result() for future in futures]

    # Build the output list instead of removing entries from a list that is
    # being iterated, which skipped the element after every removal.
    results: list[BookData] = []
    for result in fetched:
        if result is None:
            continue
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f"Invalid data: {e}")
            continue
        results.append(result)

    return sort_data_by_priority(results)
|
||||
+26
-25
@@ -121,35 +121,36 @@ class WorblehatCli(NumberedCmd):
|
||||
"""),
|
||||
)
|
||||
|
||||
print("Please select the bookcase where the item is placed:")
|
||||
bookcase_selector = InteractiveItemSelector(
|
||||
cls=Bookcase,
|
||||
sql_session=self.sql_session,
|
||||
)
|
||||
bookcase_selector.cmdloop()
|
||||
bookcase = bookcase_selector.result
|
||||
if bookcase == None:
|
||||
return
|
||||
with self.sql_session.no_autoflush:
|
||||
print("Please select the bookcase where the item is placed:")
|
||||
bookcase_selector = InteractiveItemSelector(
|
||||
cls=Bookcase,
|
||||
sql_session=self.sql_session,
|
||||
)
|
||||
bookcase_selector.cmdloop()
|
||||
bookcase = bookcase_selector.result
|
||||
if bookcase == None:
|
||||
return
|
||||
|
||||
bookcase_item.shelf = select_bookcase_shelf(bookcase, self.sql_session)
|
||||
bookcase_item.shelf = select_bookcase_shelf(bookcase, self.sql_session)
|
||||
|
||||
print("Please select the items media type:")
|
||||
media_type_selector = InteractiveItemSelector(
|
||||
cls=MediaType,
|
||||
sql_session=self.sql_session,
|
||||
default=self.sql_session.scalars(
|
||||
select(MediaType).where(MediaType.name.ilike("book")),
|
||||
).one(),
|
||||
)
|
||||
print("Please select the items media type:")
|
||||
media_type_selector = InteractiveItemSelector(
|
||||
cls=MediaType,
|
||||
sql_session=self.sql_session,
|
||||
default=self.sql_session.scalars(
|
||||
select(MediaType).where(MediaType.name.ilike("book")),
|
||||
).one(),
|
||||
)
|
||||
|
||||
media_type_selector.cmdloop()
|
||||
bookcase_item.media_type = media_type_selector.result
|
||||
if bookcase_item.media_type == None:
|
||||
return
|
||||
media_type_selector.cmdloop()
|
||||
bookcase_item.media_type = media_type_selector.result
|
||||
if bookcase_item.media_type == None:
|
||||
return
|
||||
|
||||
username = input("Who owns this book? [PVV]> ")
|
||||
if username != "":
|
||||
bookcase_item.owner = username
|
||||
username = input("Who owns this book? [PVV]> ")
|
||||
if username != "":
|
||||
bookcase_item.owner = username
|
||||
|
||||
self.sql_session.add(bookcase_item)
|
||||
self.sql_session.flush()
|
||||
|
||||
@@ -2,13 +2,12 @@ import isbnlib
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from worblehat.book_data_fetchers import fetch_book_data_from_multiple_sources
|
||||
|
||||
from ..models import (
|
||||
Author,
|
||||
BookcaseItem,
|
||||
Language,
|
||||
)
|
||||
from .metadata_fetchers import fetch_metadata_from_multiple_sources
|
||||
|
||||
|
||||
def is_valid_pvv_isbn(isbn: str) -> bool:
|
||||
@@ -42,7 +41,7 @@ def create_bookcase_item_from_isbn(
|
||||
Please note that the returned BookcaseItem will likely not be fully populated with the required
|
||||
data, such as the book's location in the library, and the owner of the book, etc.
|
||||
"""
|
||||
metadata = fetch_book_data_from_multiple_sources(isbn)
|
||||
metadata = fetch_metadata_from_multiple_sources(isbn)
|
||||
if len(metadata) == 0:
|
||||
return None
|
||||
|
||||
|
||||
+5
-8
@@ -1,5 +1,4 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
# TODO: Add more languages
|
||||
LANGUAGES: set[str] = {
|
||||
@@ -20,14 +19,12 @@ LANGUAGES: set[str] = {
|
||||
|
||||
|
||||
@dataclass
|
||||
class BookData:
|
||||
"""
|
||||
A class representing metadata for a book that we might want to fetch from external sources
|
||||
"""
|
||||
class BookMetadata:
|
||||
"""A class representing metadata for a book."""
|
||||
|
||||
isbn: str
|
||||
title: str
|
||||
# ID of the data fetcher used to fetch this instance
|
||||
# The source of the metadata provider
|
||||
source: str
|
||||
authors: set[str]
|
||||
language: str | None
|
||||
@@ -35,11 +32,11 @@ class BookData:
|
||||
num_pages: int | None
|
||||
subjects: set[str]
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
def to_dict(self) -> dict[str, any]:
|
||||
return {
|
||||
"isbn": self.isbn,
|
||||
"title": self.title,
|
||||
"source": self.source,
|
||||
"source": self.metadata_source_id(),
|
||||
"authors": set() if self.authors is None else self.authors,
|
||||
"language": self.language,
|
||||
"publish_date": self.publish_date,
|
||||
@@ -0,0 +1,22 @@
|
||||
# base fetcher.
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .BookMetadata import BookMetadata
|
||||
|
||||
|
||||
class BookMetadataFetcher(ABC):
    """
    Abstract interface for metadata fetchers.

    Both hooks are abstract classmethods, so they can be called on a concrete
    fetcher class without creating an instance.
    """

    @classmethod
    @abstractmethod
    def metadata_source_id(cls) -> str:
        """Return the unique identifier of the metadata source, used to tell where metadata came from."""
        ...

    @classmethod
    @abstractmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Try to fetch metadata for the given ISBN."""
        ...
|
||||
+13
-7
@@ -4,17 +4,17 @@ A BookMetadataFetcher for the Google Books API.
|
||||
|
||||
import requests
|
||||
|
||||
from worblehat.book_data_fetchers.BookData import BookData
|
||||
from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher
|
||||
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||
|
||||
|
||||
class GoogleBooksFetcher(BookDataFetcher):
|
||||
class GoogleBooksFetcher(BookMetadataFetcher):
|
||||
@classmethod
|
||||
def fetcher_id(_cls) -> str:
|
||||
def metadata_source_id(_cls) -> str:
|
||||
return "google_books"
|
||||
|
||||
@classmethod
|
||||
def try_fetch_data(cls, isbn: str) -> BookData | None:
|
||||
def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
|
||||
try:
|
||||
jsonInput = requests.get(
|
||||
"https://www.googleapis.com/books/v1/volumes",
|
||||
@@ -33,13 +33,19 @@ class GoogleBooksFetcher(BookDataFetcher):
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return BookData(
|
||||
return BookMetadata(
|
||||
isbn=isbn,
|
||||
title=title,
|
||||
source=cls.fetcher_id(),
|
||||
source=cls.metadata_source_id(),
|
||||
authors=authors,
|
||||
language=languages,
|
||||
publish_date=publishDate,
|
||||
num_pages=numberOfPages,
|
||||
subjects=subjects,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
book_data = GoogleBooksFetcher.fetch_metadata("0132624788")
|
||||
book_data.validate()
|
||||
print(book_data)
|
||||
+13
-7
@@ -4,21 +4,21 @@ A BookMetadataFetcher for the Open Library API.
|
||||
|
||||
import requests
|
||||
|
||||
from worblehat.book_data_fetchers.BookData import BookData
|
||||
from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher
|
||||
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||
|
||||
LANGUAGE_MAP = {
|
||||
"Norwegian": "no",
|
||||
}
|
||||
|
||||
|
||||
class OpenLibraryFetcher(BookDataFetcher):
|
||||
class OpenLibraryFetcher(BookMetadataFetcher):
|
||||
@classmethod
|
||||
def fetcher_id(_cls) -> str:
|
||||
def metadata_source_id(_cls) -> str:
|
||||
return "open_library"
|
||||
|
||||
@classmethod
|
||||
def try_fetch_data(cls, isbn: str) -> BookData | None:
|
||||
def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
|
||||
try:
|
||||
jsonInput = requests.get(f"https://openlibrary.org/isbn/{isbn}.json").json()
|
||||
|
||||
@@ -48,13 +48,19 @@ class OpenLibraryFetcher(BookDataFetcher):
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return BookData(
|
||||
return BookMetadata(
|
||||
isbn=isbn,
|
||||
title=title,
|
||||
source=cls.fetcher_id(),
|
||||
source=cls.metadata_source_id(),
|
||||
authors=author_names,
|
||||
language=language,
|
||||
publish_date=publishDate,
|
||||
num_pages=numberOfPages,
|
||||
subjects=subjects,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
book_data = OpenLibraryFetcher.fetch_metadata("9788205530751")
|
||||
book_data.validate()
|
||||
print(book_data)
|
||||
+13
-7
@@ -5,8 +5,8 @@ A BookMetadataFetcher that webscrapes https://outland.no/
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from worblehat.book_data_fetchers.BookData import BookData
|
||||
from worblehat.book_data_fetchers.BookDataFetcher import BookDataFetcher
|
||||
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||
|
||||
LANGUAGE_MAP = {
|
||||
"Norsk": "no",
|
||||
@@ -25,13 +25,13 @@ LANGUAGE_MAP = {
|
||||
}
|
||||
|
||||
|
||||
class OutlandScraperFetcher(BookDataFetcher):
|
||||
class OutlandScraperFetcher(BookMetadataFetcher):
|
||||
@classmethod
|
||||
def fetcher_id(_cls) -> str:
|
||||
def metadata_source_id(_cls) -> str:
|
||||
return "outland_scraper"
|
||||
|
||||
@classmethod
|
||||
def try_fetch_data(cls, isbn: str) -> BookData | None:
|
||||
def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
|
||||
try:
|
||||
# Find the link to the product page
|
||||
response = requests.get(f"https://outland.no/{isbn}")
|
||||
@@ -89,13 +89,19 @@ class OutlandScraperFetcher(BookDataFetcher):
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return BookData(
|
||||
return BookMetadata(
|
||||
isbn=isbn,
|
||||
title=bookData.get("Title"),
|
||||
source=cls.fetcher_id(),
|
||||
source=cls.metadata_source_id(),
|
||||
authors=bookData.get("Authors"),
|
||||
language=bookData.get("Language"),
|
||||
publish_date=bookData.get("PublishDate"),
|
||||
num_pages=bookData.get("NumberOfPages"),
|
||||
subjects=bookData.get("Subjects"),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
book_data = OutlandScraperFetcher.fetch_metadata("9781947808225")
|
||||
book_data.validate()
|
||||
print(book_data)
|
||||
@@ -0,0 +1,3 @@
|
||||
from .book_metadata_fetcher import fetch_metadata_from_multiple_sources
|
||||
|
||||
__all__ = ["fetch_metadata_from_multiple_sources"]
|
||||
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
This module contains the fetch_metadata_from_multiple_sources() function, which fetches book metadata from multiple sources in threads and returns the valid results ranked by source priority.
|
||||
|
||||
"""
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||
from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher
|
||||
from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher
|
||||
from worblehat.services.metadata_fetchers.OutlandScraperFetcher import (
|
||||
OutlandScraperFetcher,
|
||||
)
|
||||
|
||||
# Source priority is given by position: the first fetcher in the list has the
# highest priority and the last one the lowest.
# The entries are fetcher classes, not instances, hence type[...].
FETCHERS: list[type[BookMetadataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
]

# Metadata source ids in the same priority order as FETCHERS.
FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]
|
||||
|
||||
|
||||
def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
    """
    Return a new list with the given metadata ordered by source priority.

    An entry's priority is the position of its ``source`` in
    FETCHER_SOURCE_IDS, so the output follows the order of the FETCHERS list.
    The input list is left untouched.
    """

    def priority(entry):
        # list.index is a linear scan per entry; acceptable because the
        # number of fetchers is small.
        return FETCHER_SOURCE_IDS.index(entry.source)

    ranked = list(metadata)
    ranked.sort(key=priority)
    return ranked
|
||||
|
||||
|
||||
def fetch_metadata_from_multiple_sources(isbn: str, strict: bool = False) -> list[BookMetadata]:
    """
    Fetch metadata for ``isbn`` from every fetcher in FETCHERS and return the valid results.

    The ISBN is normalized first: hyphens and underscores are removed,
    surrounding whitespace is stripped and the text is lower-cased. All
    fetchers are then queried concurrently in a thread pool. Sources that do
    not have metadata for the given ISBN (their fetcher returns None) are
    ignored, so there is no guarantee that there will be any metadata.

    Each result is checked with its ``validate()`` method. When ``strict`` is
    false, a result that fails validation is reported with ``print`` and
    dropped; when ``strict`` is true, the ValueError is re-raised instead.

    The results are always ordered in the same way as the fetchers are listed
    in the FETCHERS list.

    Raises:
        ValueError: If the normalized ISBN is not 13 digits, or 10 characters
            made of nine digits followed by a digit or the check character
            "x"; or if ``strict`` is true and a fetched result fails validation.
    """
    isbn = isbn.replace("-", "").replace("_", "").strip().lower()

    # The previous check chained its three tests with "and", so it only
    # rejected input that had the wrong length *and* was non-numeric.
    if len(isbn) == 13:
        well_formed = isbn.isascii() and isbn.isdigit()
    elif len(isbn) == 10:
        # An ISBN-10 may end in the check character "X" (lower-cased above).
        well_formed = (
            isbn.isascii()
            and isbn[:9].isdigit()
            and (isbn[9].isdigit() or isbn[9] == "x")
        )
    else:
        well_formed = False
    if not well_formed:
        raise ValueError("Invalid ISBN")

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]
        fetched = [future.result() for future in futures]

    # Build the output list instead of removing entries from a list that is
    # being iterated, which skipped the element after every removal.
    results: list[BookMetadata] = []
    for result in fetched:
        if result is None:
            continue
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f"Invalid metadata: {e}")
            continue
        results.append(result)

    return sort_metadata_by_priority(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: look up one sample ISBN and pretty-print whatever comes back.
    import pprint

    isbn = "0132624788"
    metadata = fetch_metadata_from_multiple_sources(isbn)
    pprint.pprint(metadata)
|
||||
Reference in New Issue
Block a user