services/metadata_fetchers: init

Co-authored-by: Oystein Kristoffer Tveit <oysteikt@pvv.ntnu.no>
This commit is contained in:
Adrian Gunnar Lauterer 2024-07-27 22:18:59 +02:00 committed by h7x4
parent ec448c9f57
commit b2432e782e
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146
8 changed files with 402 additions and 5 deletions

View File

@ -3,6 +3,7 @@ import isbnlib
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from .metadata_fetchers import fetch_metadata_from_multiple_sources
from ..models import ( from ..models import (
Author, Author,
BookcaseItem, BookcaseItem,
@ -25,20 +26,32 @@ def is_valid_isbn(isbn: str) -> bool:
def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None: def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None:
metadata = isbnlib.meta(isbn, 'openl') """
if len(metadata.keys()) == 0: This function fetches metadata for the given ISBN and creates a BookcaseItem from it.
It does so using a database connection to connect it to the correct authors and language
through the sql ORM.
If no metadata is found, None is returned.
Please note that the returned BookcaseItem will likely not be fully populated with the required
data, such as the book's location in the library, and the owner of the book, etc.
"""
metadata = fetch_metadata_from_multiple_sources(isbn)
if len(metadata) == 0:
return None return None
metadata = metadata[0]
bookcase_item = BookcaseItem( bookcase_item = BookcaseItem(
name = metadata.get('Title'), name = metadata.title,
isbn = int(isbn.replace('-', '')), isbn = int(isbn.replace('-', '')),
) )
if len(authors := metadata.get('Authors')) > 0: if len(authors := metadata.authors) > 0:
for author in authors: for author in authors:
bookcase_item.authors.add(Author(author)) bookcase_item.authors.add(Author(author))
if (language := metadata.get('Language')): if (language := metadata.language):
bookcase_item.language = sql_session.scalars( bookcase_item.language = sql_session.scalars(
select(Language) select(Language)
.where(Language.iso639_1_code == language) .where(Language.iso639_1_code == language)

View File

@ -0,0 +1,62 @@
from dataclasses import dataclass
from typing import Set
# Language codes that BookMetadata.validate() accepts for the `language` field.
# TODO: Add more languages
LANGUAGES: set[str] = {
    "no",
    "en",
    "de",
    "fr",
    "es",
    "it",
    "sv",
    "da",
    "fi",
    "ru",
    "zh",
    "ja",
    "ko",
}
@dataclass
class BookMetadata:
"""A class representing metadata for a book."""
isbn: str
title: str
# The source of the metadata provider
source: str
authors: Set[str]
language: str | None
publish_date: str | None
num_pages: int | None
subjects: Set[str]
def to_dict(self) -> dict[str, any]:
return {
'isbn': self.isbn,
'title': self.title,
'source': self.metadata_source_id(),
'authors': set() if self.authors is None else self.authors,
'language': self.language,
'publish_date': self.publish_date,
'num_pages': self.num_pages,
'subjects': set() if self.subjects is None else self.subjects
}
def validate(self) -> None:
if not self.isbn:
raise ValueError('Missing ISBN')
if not self.title:
raise ValueError('Missing title')
if not self.source:
raise ValueError('Missing source')
if not self.authors:
raise ValueError('Missing authors')
if self.language is not None and self.language not in LANGUAGES:
raise ValueError(f'Invalid language: {self.language}. Consider adding it to the LANGUAGES set if you think this is a mistake.')
if self.num_pages is not None and self.num_pages < 0:
raise ValueError(f'Invalid number of pages: {self.num_pages}')

View File

@ -0,0 +1,20 @@
#base fetcher.
from abc import ABC, abstractmethod
from .BookMetadata import BookMetadata
class BookMetadataFetcher(ABC):
    """
    Abstract interface implemented by every metadata fetcher.

    Both operations are classmethods, so a fetcher is used through its class
    and does not need to be instantiated.
    """

    @classmethod
    @abstractmethod
    def metadata_source_id(cls) -> str:
        """Returns a unique identifier for the metadata source, to identify where the metadata came from."""

    @classmethod
    @abstractmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Tries to fetch metadata for the given ISBN."""

View File

@ -0,0 +1,51 @@
"""
A BookMetadataFetcher for the Google Books API.
"""
import requests
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
class GoogleBooksFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Google Books `volumes` endpoint."""

    # Upper bound (seconds) on the HTTP request, so an unresponsive API
    # cannot block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(cls) -> str:
        """Returns the identifier stored in `BookMetadata.source` for this fetcher."""
        return "google_books"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Queries Google Books for `isbn` and builds a BookMetadata from the first hit.

        Returns None if the request fails, the response cannot be parsed, or
        the search has no results. No exception escapes this method.
        """
        try:
            response = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": f"isbn:{isbn}"},
                timeout=cls.REQUEST_TIMEOUT_SECONDS,
            )
            # A search without hits has no "items" key; the resulting
            # TypeError is handled below and reported as "no metadata".
            volume_info = response.json().get("items")[0].get("volumeInfo")

            authors = set(volume_info.get("authors") or [])
            title = volume_info.get("title")

            # The volumeInfo object names these fields publishedDate,
            # pageCount and language (a single code, not a list).
            publish_date = volume_info.get("publishedDate")
            num_pages = volume_info.get("pageCount")
            if num_pages:
                num_pages = int(num_pages)
            subjects = set(volume_info.get("categories") or [])
            language = volume_info.get("language")
        except Exception:
            # Deliberately best effort: any failure means this source has
            # nothing to contribute for the ISBN.
            return None

        return BookMetadata(
            isbn=isbn,
            title=title,
            source=cls.metadata_source_id(),
            authors=authors,
            language=language,
            publish_date=publish_date,
            num_pages=num_pages,
            subjects=subjects,
        )
if __name__ == '__main__':
    # Manual smoke test: fetch one known ISBN and print the result.
    book_data = GoogleBooksFetcher.fetch_metadata('0132624788')
    if book_data is None:
        # fetch_metadata() returns None on any failure; calling validate()
        # on None would only produce an unhelpful AttributeError.
        raise SystemExit('No metadata found for ISBN 0132624788')
    book_data.validate()
    print(book_data)

View File

@ -0,0 +1,61 @@
"""
A BookMetadataFetcher for the Open Library API.
"""
import requests
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
# Maps a language name to its two-letter language code.
# NOTE(review): nothing visible in this file reads this mapping - confirm whether it is still needed.
LANGUAGE_MAP: dict[str, str] = {"Norwegian": "no"}
class OpenLibraryFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Open Library JSON endpoints."""

    BASE_URL = "https://openlibrary.org"

    # Upper bound (seconds) on each HTTP request, so an unresponsive API
    # cannot block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(cls) -> str:
        """Returns the identifier stored in `BookMetadata.source` for this fetcher."""
        return "open_library"

    @classmethod
    def _get_json(cls, path: str):
        """Fetches `<BASE_URL>/<path>.json` and returns the decoded JSON body."""
        # Record keys in the responses already start with "/", so strip it
        # to avoid building URLs with a double slash.
        url = f"{cls.BASE_URL}/{path.lstrip('/')}.json"
        return requests.get(url, timeout=cls.REQUEST_TIMEOUT_SECONDS).json()

    @classmethod
    def _fetch_language(cls, edition: dict) -> str | None:
        """
        Resolves the first language of `edition` to its `iso_639_1` identifier.

        Returns None when the edition lists no language or the lookup fails;
        the language is optional metadata and must not discard the whole record.
        """
        try:
            language_key = edition["languages"][0]["key"]
            return cls._get_json(language_key)["identifiers"]["iso_639_1"][0]
        except (KeyError, IndexError, TypeError, AttributeError, ValueError, requests.RequestException):
            return None

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Looks up `isbn` and builds a BookMetadata from the edition record.

        Author names require one extra request per author, and the language
        one more. Returns None if the edition cannot be fetched or parsed.
        No exception escapes this method.
        """
        try:
            edition = cls._get_json(f"isbn/{isbn}")

            author_names = set()
            for author_ref in edition.get("authors") or []:
                author_name = cls._get_json(author_ref.get("key")).get("name")
                # Skip author records without a name instead of storing None.
                if author_name:
                    author_names.add(author_name)

            title = edition.get("title")
            publish_date = edition.get("publish_date")
            num_pages = edition.get("number_of_pages")
            if num_pages:
                num_pages = int(num_pages)
            subjects = set(edition.get("subjects") or [])
            language = cls._fetch_language(edition)
        except Exception:
            # Deliberately best effort: any failure means this source has
            # nothing to contribute for the ISBN.
            return None

        return BookMetadata(
            isbn=isbn,
            title=title,
            source=cls.metadata_source_id(),
            authors=author_names,
            language=language,
            publish_date=publish_date,
            num_pages=num_pages,
            subjects=subjects,
        )
if __name__ == '__main__':
    # Manual smoke test: fetch one known ISBN and print the result.
    book_data = OpenLibraryFetcher.fetch_metadata('9788205530751')
    if book_data is None:
        # fetch_metadata() returns None on any failure; calling validate()
        # on None would only produce an unhelpful AttributeError.
        raise SystemExit('No metadata found for ISBN 9788205530751')
    book_data.validate()
    print(book_data)

View File

@ -0,0 +1,109 @@
"""
A BookMetadataFetcher that webscrapes https://outland.no/
"""
from bs4 import BeautifulSoup
import requests
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
# Maps the Norwegian language names found on the scraped pages to
# two-letter language codes.
LANGUAGE_MAP: dict[str, str] = {
    "Norsk": "no",
    "Engelsk": "en",
    "Tysk": "de",
    "Fransk": "fr",
    "Spansk": "es",
    "Italiensk": "it",
    "Svensk": "sv",
    "Dansk": "da",
    "Finsk": "fi",
    "Russisk": "ru",
    "Kinesisk": "zh",
    "Japansk": "ja",
    "Koreansk": "ko",
}
class OutlandScraperFetcher(BookMetadataFetcher):
    """Fetches book metadata by scraping product pages on https://outland.no/."""

    # Upper bound (seconds) on each HTTP request, so an unresponsive site
    # cannot block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(cls) -> str:
        """Returns the identifier stored in `BookMetadata.source` for this fetcher."""
        return "outland_scraper"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """
        Looks up `isbn` on outland.no and scrapes the first matching product page.

        Returns None if a request fails, no product link is found, or the page
        cannot be parsed. No exception escapes this method. Authors and subjects
        default to an empty set when the page does not list them.
        """
        try:
            # Find the link to the product page
            search_response = requests.get(
                f"https://outland.no/{isbn}",
                timeout=cls.REQUEST_TIMEOUT_SECONDS,
            )
            search_page = BeautifulSoup(search_response.content, "html.parser")
            href = search_page.find_all("a", class_="product-item-link")[0].get("href")

            # Find the metadata on the product page
            product_response = requests.get(href, timeout=cls.REQUEST_TIMEOUT_SECONDS)
            product_page = BeautifulSoup(product_response.content, "html.parser")
            cells = product_page.find_all("td", class_="col data")

            # Collect the metadata
            title = product_page.find_all("span", class_="base")[0].text
            release_date = product_page.find_all("span", class_="release-date")[0].text.strip()
            release_date = release_date[-4:]  # only keep year

            book_data = {
                "Title": title,
                "PublishDate": release_date,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            # Label to look for in each cell's markup, per field. "Genre" is
            # collected but not returned; it stays in the map so a genre cell
            # is still claimed by that key before later keys are tried.
            data_key_map = {
                "Authors": "Forfattere",
                "NumberOfPages": "Antall Sider",
                "Genre": "Sjanger",
                "Language": "Språk",
                "Subjects": "Serie",
            }

            for cell in cells:
                cell_markup = str(cell).lower()
                for key, label in data_key_map.items():
                    if label.lower() in cell_markup:
                        # Strip surrounding whitespace so the map lookup and
                        # the splitting below work on the bare value.
                        book_data[key] = cell.text.strip()
                        break

            if book_data["Language"] is not None:
                book_data["Language"] = LANGUAGE_MAP.get(book_data["Language"])

            if book_data["Authors"] is not None:
                book_data["Authors"] = {
                    name.strip() for name in book_data["Authors"].split(",") if name.strip()
                }

            if book_data["Subjects"] is not None:
                book_data["Subjects"] = {
                    subject.strip() for subject in book_data["Subjects"].split(",") if subject.strip()
                }

            if book_data["NumberOfPages"] is not None:
                book_data["NumberOfPages"] = int(book_data["NumberOfPages"])
        except Exception:
            # Deliberately best effort: any failure means this source has
            # nothing to contribute for the ISBN.
            return None

        return BookMetadata(
            isbn=isbn,
            title=book_data.get('Title'),
            source=cls.metadata_source_id(),
            # BookMetadata declares these two as sets, so never pass None.
            authors=book_data.get('Authors') or set(),
            language=book_data.get('Language'),
            publish_date=book_data.get('PublishDate'),
            num_pages=book_data.get('NumberOfPages'),
            subjects=book_data.get('Subjects') or set(),
        )
if __name__ == '__main__':
    # Manual smoke test: fetch one known ISBN and print the result.
    book_data = OutlandScraperFetcher.fetch_metadata('9781947808225')
    if book_data is None:
        # fetch_metadata() returns None on any failure; calling validate()
        # on None would only produce an unhelpful AttributeError.
        raise SystemExit('No metadata found for ISBN 9781947808225')
    book_data.validate()
    print(book_data)

View File

@ -0,0 +1 @@
from .book_metadata_fetcher import fetch_metadata_from_multiple_sources

View File

@ -0,0 +1,80 @@
"""
This module contains the fetch_metadata_from_multiple_sources() function, which fetches book metadata from multiple sources in parallel threads and returns every valid result, ordered by source priority.
"""
from concurrent.futures import ThreadPoolExecutor
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher
from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher
from worblehat.services.metadata_fetchers.OutlandScraperFetcher import OutlandScraperFetcher
# The order of these fetchers determines the priority of the sources.
# The first fetcher in the list has the highest priority.
# The list holds the fetcher classes themselves (only their classmethods are
# used), hence `type[...]` in the annotation.
FETCHERS: list[type[BookMetadataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
]

# Source identifiers, in the same priority order as FETCHERS.
FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]
def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
    """
    Returns a new list with the given metadata ordered by source priority.

    An entry's priority is the position of its `source` in FETCHER_SOURCE_IDS,
    so the result follows the order of the FETCHERS list.
    """
    def priority(entry: BookMetadata) -> int:
        # Linear scan per entry; fine because the list of sources is short.
        return FETCHER_SOURCE_IDS.index(entry.source)

    ordered = list(metadata)
    ordered.sort(key=priority)
    return ordered
def _normalize_isbn(isbn: str) -> str:
    """
    Strips separators from the ISBN, lower-cases it and checks its shape.

    Accepts 13 digits, or 9 digits followed by a digit or 'x' (the ISBN-10
    check character). The check digit itself is not verified.

    Raises:
        ValueError: if the normalized value has neither shape.
    """
    normalized = isbn.replace('-', '').replace('_', '').strip().lower()

    if len(normalized) == 13:
        is_valid = normalized.isascii() and normalized.isdigit()
    elif len(normalized) == 10:
        is_valid = (
            normalized.isascii()
            and normalized[:9].isdigit()
            and (normalized[9].isdigit() or normalized[9] == 'x')
        )
    else:
        is_valid = False

    if not is_valid:
        raise ValueError('Invalid ISBN')
    return normalized


def fetch_metadata_from_multiple_sources(isbn: str, strict: bool = False) -> list[BookMetadata]:
    """
    Returns a list of metadata fetched from multiple sources.

    Sources that do not have metadata for the given ISBN will be ignored.
    There is no guarantee that there will be any metadata.

    The results are always ordered in the same way as the fetchers are listed in the FETCHERS list.

    If `strict` is true, the first result that fails validation raises its
    ValueError; otherwise invalid results are reported on stdout and dropped.

    Raises:
        ValueError: if the ISBN is malformed, or (with `strict`) if a result is invalid.
    """
    isbn = _normalize_isbn(isbn)

    results: list[BookMetadata] = []

    # All fetchers run concurrently; results are collected in FETCHERS order.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]

        for future in futures:
            result = future.result()
            if result is not None:
                results.append(result)

    # Build a new list rather than removing from `results` while iterating
    # over it, which would skip the element following each removal.
    valid_results: list[BookMetadata] = []
    for result in results:
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f'Invalid metadata: {e}')
        else:
            valid_results.append(result)

    return sort_metadata_by_priority(valid_results)
if __name__ == '__main__':
    # Manual smoke test: query every source for one known ISBN and
    # pretty-print whatever comes back.
    from pprint import pprint

    pprint(fetch_metadata_from_multiple_sources('0132624788'))