services/metadata_fetchers: init
Co-authored-by: Oystein Kristoffer Tveit <oysteikt@pvv.ntnu.no>
This commit is contained in:
parent
ec448c9f57
commit
b2432e782e
|
@ -3,6 +3,7 @@ import isbnlib
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from .metadata_fetchers import fetch_metadata_from_multiple_sources
|
||||||
from ..models import (
|
from ..models import (
|
||||||
Author,
|
Author,
|
||||||
BookcaseItem,
|
BookcaseItem,
|
||||||
|
@ -25,20 +26,32 @@ def is_valid_isbn(isbn: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None:
|
def create_bookcase_item_from_isbn(isbn: str, sql_session: Session) -> BookcaseItem | None:
|
||||||
metadata = isbnlib.meta(isbn, 'openl')
|
"""
|
||||||
if len(metadata.keys()) == 0:
|
This function fetches metadata for the given ISBN and creates a BookcaseItem from it.
|
||||||
|
It does so using a database connection to connect it to the correct authors and language
|
||||||
|
through the sql ORM.
|
||||||
|
|
||||||
|
If no metadata is found, None is returned.
|
||||||
|
|
||||||
|
Please note that the returned BookcaseItem will likely not be fully populated with the required
|
||||||
|
data, such as the book's location in the library, and the owner of the book, etc.
|
||||||
|
"""
|
||||||
|
metadata = fetch_metadata_from_multiple_sources(isbn)
|
||||||
|
if len(metadata) == 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
metadata = metadata[0]
|
||||||
|
|
||||||
bookcase_item = BookcaseItem(
|
bookcase_item = BookcaseItem(
|
||||||
name = metadata.get('Title'),
|
name = metadata.title,
|
||||||
isbn = int(isbn.replace('-', '')),
|
isbn = int(isbn.replace('-', '')),
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(authors := metadata.get('Authors')) > 0:
|
if len(authors := metadata.authors) > 0:
|
||||||
for author in authors:
|
for author in authors:
|
||||||
bookcase_item.authors.add(Author(author))
|
bookcase_item.authors.add(Author(author))
|
||||||
|
|
||||||
if (language := metadata.get('Language')):
|
if (language := metadata.language):
|
||||||
bookcase_item.language = sql_session.scalars(
|
bookcase_item.language = sql_session.scalars(
|
||||||
select(Language)
|
select(Language)
|
||||||
.where(Language.iso639_1_code == language)
|
.where(Language.iso639_1_code == language)
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Set
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Add more languages
# Two-letter language codes that BookMetadata.validate() accepts.
LANGUAGES: set[str] = {
    "no",
    "en",
    "de",
    "fr",
    "es",
    "it",
    "sv",
    "da",
    "fi",
    "ru",
    "zh",
    "ja",
    "ko",
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BookMetadata:
|
||||||
|
"""A class representing metadata for a book."""
|
||||||
|
isbn: str
|
||||||
|
title: str
|
||||||
|
# The source of the metadata provider
|
||||||
|
source: str
|
||||||
|
authors: Set[str]
|
||||||
|
language: str | None
|
||||||
|
publish_date: str | None
|
||||||
|
num_pages: int | None
|
||||||
|
subjects: Set[str]
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, any]:
|
||||||
|
return {
|
||||||
|
'isbn': self.isbn,
|
||||||
|
'title': self.title,
|
||||||
|
'source': self.metadata_source_id(),
|
||||||
|
'authors': set() if self.authors is None else self.authors,
|
||||||
|
'language': self.language,
|
||||||
|
'publish_date': self.publish_date,
|
||||||
|
'num_pages': self.num_pages,
|
||||||
|
'subjects': set() if self.subjects is None else self.subjects
|
||||||
|
}
|
||||||
|
|
||||||
|
def validate(self) -> None:
|
||||||
|
if not self.isbn:
|
||||||
|
raise ValueError('Missing ISBN')
|
||||||
|
if not self.title:
|
||||||
|
raise ValueError('Missing title')
|
||||||
|
if not self.source:
|
||||||
|
raise ValueError('Missing source')
|
||||||
|
if not self.authors:
|
||||||
|
raise ValueError('Missing authors')
|
||||||
|
|
||||||
|
if self.language is not None and self.language not in LANGUAGES:
|
||||||
|
raise ValueError(f'Invalid language: {self.language}. Consider adding it to the LANGUAGES set if you think this is a mistake.')
|
||||||
|
|
||||||
|
if self.num_pages is not None and self.num_pages < 0:
|
||||||
|
raise ValueError(f'Invalid number of pages: {self.num_pages}')
|
|
@ -0,0 +1,20 @@
|
||||||
|
#base fetcher.
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from .BookMetadata import BookMetadata
|
||||||
|
|
||||||
|
class BookMetadataFetcher(ABC):
    """
    Abstract interface implemented by every metadata fetcher.

    Both operations are abstract classmethods, so concrete fetchers are used
    through the class itself rather than through an instance.
    """

    @classmethod
    @abstractmethod
    def metadata_source_id(cls) -> str:
        """Return a unique identifier for the metadata source, so results can be traced back to it."""
        ...

    @classmethod
    @abstractmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Try to fetch metadata for the given ISBN; the annotation allows None as a result."""
        ...
|
|
@ -0,0 +1,51 @@
|
||||||
|
"""
|
||||||
|
A BookMetadataFetcher for the Google Books API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleBooksFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Google Books volumes API."""

    # Seconds to wait for the HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        """Return the identifier attached to metadata fetched from Google Books."""
        return "google_books"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Look up ``isbn`` in Google Books and return the first matching volume.

        Returns None when the request fails, the response cannot be parsed,
        or no volume matches. The lookup is best-effort, so any error is
        treated as "no metadata from this source".
        """
        try:
            response = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params = {"q": f"isbn:{isbn}"},
                timeout = cls.REQUEST_TIMEOUT_SECONDS,
            )
            # Indexing raises when there are no items; handled by the except below.
            volume_info = response.json().get("items")[0].get("volumeInfo")

            authors = set(volume_info.get("authors") or [])
            title = volume_info.get("title")
            # Google Books names these volumeInfo fields publishedDate,
            # pageCount and language. The Open Library style keys used
            # before (publish_date, number_of_pages, languages) are not part
            # of the documented volumeInfo resource, so they always yielded None.
            publish_date = volume_info.get("publishedDate")
            num_pages = volume_info.get("pageCount")
            if num_pages:
                num_pages = int(num_pages)
            subjects = set(volume_info.get("categories") or [])
            language = volume_info.get("language")
        except Exception:
            return None

        return BookMetadata(
            isbn = isbn,
            title = title,
            source = cls.metadata_source_id(),
            authors = authors,
            language = language,
            publish_date = publish_date,
            num_pages = num_pages,
            subjects = subjects,
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test against the live API.
    book_data = GoogleBooksFetcher.fetch_metadata('0132624788')
    # fetch_metadata returns None on any failure, so guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
|
|
@ -0,0 +1,61 @@
|
||||||
|
"""
|
||||||
|
A BookMetadataFetcher for the Open Library API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||||
|
|
||||||
|
# Maps a language name to its two-letter code.
# NOTE(review): nothing in the visible code of this module reads this
# mapping; confirm whether it is still needed.
LANGUAGE_MAP = {"Norwegian": "no"}
|
||||||
|
|
||||||
|
|
||||||
|
class OpenLibraryFetcher(BookMetadataFetcher):
    """Fetches book metadata from the Open Library JSON API."""

    # Seconds to wait for each HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        """Return the identifier attached to metadata fetched from Open Library."""
        return "open_library"

    @classmethod
    def _get_json(cls, path: str) -> dict:
        """GET ``https://openlibrary.org/<path>.json`` and return the decoded body.

        Keys found in Open Library records (e.g. ``/authors/OL1A``) start
        with a slash; it is stripped so the URL does not contain ``//``.
        """
        return requests.get(
            f"https://openlibrary.org/{path.lstrip('/')}.json",
            timeout = cls.REQUEST_TIMEOUT_SECONDS,
        ).json()

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Look up ``isbn`` in Open Library, resolving author names and language.

        Returns None when a request fails or a response cannot be parsed.
        The lookup is best-effort, so any error is treated as "no metadata
        from this source". A record without language information is still
        returned, with ``language`` set to None.
        """
        try:
            edition = cls._get_json(f"isbn/{isbn}")

            # The edition only lists author keys; each name needs its own request.
            author_names = set()
            for author_ref in edition.get("authors") or []:
                author_name = cls._get_json(author_ref.get("key")).get("name")
                if author_name:
                    author_names.add(author_name)

            title = edition.get("title")
            publish_date = edition.get("publish_date")

            num_pages = edition.get("number_of_pages")
            if num_pages:
                num_pages = int(num_pages)

            # Language is optional: a record that lacks it must not make the
            # whole lookup fail, so every step falls back to None.
            language = None
            language_refs = edition.get("languages") or []
            if language_refs:
                language_record = cls._get_json(language_refs[0].get("key"))
                iso_codes = (language_record.get("identifiers") or {}).get("iso_639_1") or []
                if iso_codes:
                    language = iso_codes[0]

            subjects = set(edition.get("subjects") or [])

        except Exception:
            return None

        return BookMetadata(
            isbn = isbn,
            title = title,
            source = cls.metadata_source_id(),
            authors = author_names,
            language = language,
            publish_date = publish_date,
            num_pages = num_pages,
            subjects = subjects,
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test against the live API.
    book_data = OpenLibraryFetcher.fetch_metadata('9788205530751')
    # fetch_metadata returns None on any failure, so guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
|
|
@ -0,0 +1,109 @@
|
||||||
|
"""
|
||||||
|
A BookMetadataFetcher that webscrapes https://outland.no/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||||
|
|
||||||
|
|
||||||
|
# Maps the language names shown on outland.no product pages to two-letter
# language codes. Names missing from this table resolve to None on lookup.
LANGUAGE_MAP = {
    "Norsk": "no",
    "Engelsk": "en",
    "Tysk": "de",
    "Fransk": "fr",
    "Spansk": "es",
    "Italiensk": "it",
    "Svensk": "sv",
    "Dansk": "da",
    "Finsk": "fi",
    "Russisk": "ru",
    "Kinesisk": "zh",
    "Japansk": "ja",
    "Koreansk": "ko",
}
|
||||||
|
|
||||||
|
|
||||||
|
class OutlandScraperFetcher(BookMetadataFetcher):
    """Fetches book metadata by scraping product pages on https://outland.no/."""

    # Seconds to wait for each HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    @classmethod
    def metadata_source_id(_cls) -> str:
        """Return the identifier attached to metadata scraped from Outland."""
        return "outland_scraper"

    @classmethod
    def fetch_metadata(cls, isbn: str) -> BookMetadata | None:
        """Scrape the Outland product page for ``isbn``.

        Returns None when a request fails or the expected page elements are
        missing. The scrape is best-effort, so any error is treated as "no
        metadata from this source".
        """
        try:
            # Find the link to the product page
            response = requests.get(
                f"https://outland.no/{isbn}",
                timeout = cls.REQUEST_TIMEOUT_SECONDS,
            )
            search_page = BeautifulSoup(response.content, "html.parser")
            product_links = search_page.find_all("a", class_="product-item-link")
            href = product_links[0].get("href")

            # Find the metadata on the product page
            response = requests.get(href, timeout = cls.REQUEST_TIMEOUT_SECONDS)
            product_page = BeautifulSoup(response.content, "html.parser")
            cells = product_page.find_all("td", class_="col data")

            # Collect the metadata
            title = product_page.find_all("span", class_="base")[0].text

            release_date = product_page.find_all("span", class_="release-date")[0].text.strip()
            release_date = release_date[-4:]  # only keep year

            book_data = {
                "Title": title,
                "PublishDate": release_date,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            # Field name -> label searched for in the markup of each table cell.
            label_for_field = {
                "Authors": "Forfattere",
                "NumberOfPages": "Antall Sider",
                "Genre": "Sjanger",
                "Language": "Språk",
                "Subjects": "Serie",
            }

            for cell in cells:
                # Match against the cell's full markup, not just its text.
                cell_markup = str(cell).lower()
                for field, label in label_for_field.items():
                    if label.lower() in cell_markup:
                        # Strip surrounding whitespace so the language lookup
                        # and the author/subject splits see clean values.
                        book_data[field] = cell.text.strip()
                        break

            if book_data["Language"] is not None:
                book_data["Language"] = LANGUAGE_MAP.get(book_data["Language"])

            if book_data["Authors"] is not None:
                book_data["Authors"] = {
                    name.strip() for name in book_data["Authors"].split(",") if name.strip()
                }

            if book_data["Subjects"] is not None:
                book_data["Subjects"] = {
                    subject.strip() for subject in book_data["Subjects"].split(",") if subject.strip()
                }

            if book_data["NumberOfPages"] is not None:
                book_data["NumberOfPages"] = int(book_data["NumberOfPages"])

        except Exception:
            return None

        return BookMetadata(
            isbn = isbn,
            title = book_data.get('Title'),
            source = cls.metadata_source_id(),
            # BookMetadata declares these as sets; use an empty set rather
            # than None when the page had no matching cell.
            authors = book_data.get('Authors') or set(),
            language = book_data.get('Language'),
            publish_date = book_data.get('PublishDate'),
            num_pages = book_data.get('NumberOfPages'),
            subjects = book_data.get('Subjects') or set(),
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test against the live site.
    book_data = OutlandScraperFetcher.fetch_metadata('9781947808225')
    # fetch_metadata returns None on any failure, so guard before validating.
    if book_data is None:
        print('No metadata found')
    else:
        book_data.validate()
        print(book_data)
|
|
@ -0,0 +1 @@
|
||||||
|
from .book_metadata_fetcher import fetch_metadata_from_multiple_sources
|
|
@ -0,0 +1,80 @@
|
||||||
|
"""
|
||||||
|
This module contains the fetch_metadata_from_multiple_sources() function, which fetches book metadata from multiple sources in threads and returns every valid non-None result, ordered by source priority (highest first).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadata import BookMetadata
|
||||||
|
from worblehat.services.metadata_fetchers.BookMetadataFetcher import BookMetadataFetcher
|
||||||
|
|
||||||
|
from worblehat.services.metadata_fetchers.GoogleBooksFetcher import GoogleBooksFetcher
|
||||||
|
from worblehat.services.metadata_fetchers.OpenLibraryFetcher import OpenLibraryFetcher
|
||||||
|
from worblehat.services.metadata_fetchers.OutlandScraperFetcher import OutlandScraperFetcher
|
||||||
|
|
||||||
|
|
||||||
|
# The order of these fetchers determines the priority of the sources.
# The first fetcher in the list has the highest priority.
# The list holds the fetcher classes themselves (their operations are
# classmethods), hence type[...] rather than instances.
FETCHERS: list[type[BookMetadataFetcher]] = [
    OpenLibraryFetcher,
    GoogleBooksFetcher,
    OutlandScraperFetcher,
]

# Source identifiers in the same (priority) order as FETCHERS.
FETCHER_SOURCE_IDS: list[str] = [fetcher.metadata_source_id() for fetcher in FETCHERS]
|
||||||
|
|
||||||
|
|
||||||
|
def sort_metadata_by_priority(metadata: list[BookMetadata]) -> list[BookMetadata]:
    """
    Return a new list with the given metadata ordered by source priority.

    Entries appear in the same order as their sources do in the FETCHERS list.
    A source that is not in FETCHER_SOURCE_IDS makes the lookup raise ValueError.
    """

    def priority(entry: BookMetadata) -> int:
        # Each lookup is a linear scan of FETCHER_SOURCE_IDS, which is fine
        # because the number of fetchers is small.
        return FETCHER_SOURCE_IDS.index(entry.source)

    ordered = list(metadata)
    ordered.sort(key=priority)
    return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_metadata_from_multiple_sources(isbn: str, strict: bool = False) -> list[BookMetadata]:
    """
    Returns a list of metadata fetched from multiple sources.

    Sources that do not have metadata for the given ISBN are ignored.

    There is no guarantee that there will be any metadata.

    The results are always ordered in the same way as the fetchers are listed in the FETCHERS list.

    Args:
        isbn: The ISBN to look up. Hyphens, underscores and surrounding
            whitespace are removed and the value is lower-cased before use.
        strict: When True, metadata that fails validation raises instead of
            being dropped.

    Raises:
        ValueError: if the ISBN is not a 13-digit ISBN-13 or a 10-character
            ISBN-10, or, when ``strict`` is True, if any fetched metadata
            fails validation.
    """
    isbn = isbn.replace('-', '').replace('_', '').strip().lower()

    # An ISBN-13 is thirteen digits. An ISBN-10 is nine digits followed by a
    # check character that is a digit or 'x' (already lower-cased above).
    # Both the length AND the character checks must hold; chaining them with
    # `and` in a single negative test let malformed input through.
    is_isbn13 = len(isbn) == 13 and isbn.isascii() and isbn.isdigit()
    is_isbn10 = (
        len(isbn) == 10
        and isbn.isascii()
        and isbn[:9].isdigit()
        and (isbn[9].isdigit() or isbn[9] == 'x')
    )
    if not (is_isbn10 or is_isbn13):
        raise ValueError('Invalid ISBN')

    fetched: list[BookMetadata] = []

    # The fetchers are network-bound, so run them concurrently. Results are
    # collected in submission order, i.e. the order of FETCHERS.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetcher.fetch_metadata, isbn) for fetcher in FETCHERS]

        for future in futures:
            result = future.result()
            if result is not None:
                fetched.append(result)

    # Build a new list instead of removing from the one being iterated:
    # removing during iteration skips the element after each removed one.
    valid_results: list[BookMetadata] = []
    for result in fetched:
        try:
            result.validate()
        except ValueError as e:
            if strict:
                raise
            print(f'Invalid metadata: {e}')
        else:
            valid_results.append(result)

    return sort_metadata_by_priority(valid_results)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test: fetch from every source and pretty-print the result.
    from pprint import pprint

    pprint(fetch_metadata_from_multiple_sources('0132624788'))
|
Loading…
Reference in New Issue