added outland scraper fetcher

This commit is contained in:
= 2023-09-01 15:32:17 +02:00
parent 7b5446a824
commit 0ce79cda52
3 changed files with 134 additions and 3 deletions

48
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "alembic"
@ -19,6 +19,24 @@ typing-extensions = ">=4"
[package.extras]
tz = ["python-dateutil"]
[[package]]
name = "beautifulsoup4"
version = "4.12.2"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.6.0"
files = [
{file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
{file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
]
[package.dependencies]
soupsieve = ">1.2"
[package.extras]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
name = "blinker"
version = "1.6.2"
@ -30,6 +48,19 @@ files = [
{file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"},
]
[[package]]
name = "bs4"
version = "0.0.1"
description = "Dummy package for Beautiful Soup"
optional = false
python-versions = "*"
files = [
{file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
]
[package.dependencies]
beautifulsoup4 = "*"
[[package]]
name = "certifi"
version = "2023.7.22"
@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "soupsieve"
version = "2.4.1"
description = "A modern CSS selector implementation for Beautiful Soup."
optional = false
python-versions = ">=3.7"
files = [
{file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
{file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
]
[[package]]
name = "sqlalchemy"
version = "2.0.12"
@ -579,7 +621,7 @@ files = [
]
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
typing-extensions = ">=4.2.0"
[package.extras]
@ -681,4 +723,4 @@ email = ["email-validator"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454"
content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00"

View File

@ -17,6 +17,7 @@ python = "^3.11"
sqlalchemy = "^2.0.8"
psycopg2-binary = "^2.9.6"
requests = "^2.31.0"
bs4 = "^0.0.1"
[tool.poetry.group.dev.dependencies]
werkzeug = "^2.3.3"

View File

@ -0,0 +1,88 @@
# import isbnlib
#used these instead of isbnlib as i have already written the code for them
import json
import logging
from typing import Dict, Optional

import requests

from worblehat.models.BookMetadata import BookMetadata
from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
class OutlandScraperFetcher(BookMetadataFetcher):
    """Fetch book metadata by scraping the outland.no web shop.

    The ISBN is appended to the site's base URL, the first product link on
    the resulting page is followed, and the title, release year and the
    product detail table of that product page are read.
    """

    _BASE_URL = "https://outland.no/"

    # Seconds to wait for each HTTP request. ``requests`` has no default
    # timeout, so without this a stalled server would block forever.
    _REQUEST_TIMEOUT = 10

    # Result key -> label text searched for (case-insensitively) in the
    # markup of each product detail cell.
    _FIELD_LABELS = {
        "Authors": "Forfattere",
        "NumberOfPages": "Antall Sider",
        "Genre": "Sjanger",
        "Language": "Språk",
        "Subjects": "Serie",
    }

    _log = logging.getLogger(__name__)

    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
        """Return the metadata scraped for *isbn*.

        Args:
            isbn: The ISBN to look up, as a string.

        Returns:
            A ``BookMetadata`` built from the scraped values, or ``None``
            when scraping failed or produced neither a title, authors nor
            a language.
        """
        metadata = self.__outland(isbn)
        if not metadata:
            return None
        if not any(metadata.get(key) for key in ("Title", "Authors", "Language")):
            return None

        # NOTE(review): every scraped value is a plain string (or None);
        # confirm that BookMetadata accepts strings for these fields.
        # The result is also kept on the instance, as the original code did.
        self.__metadata = BookMetadata(
            isbn=isbn,
            title=metadata.get("Title"),
            authors=metadata.get("Authors"),
            language=metadata.get("Language"),
            publish_date=metadata.get("PublishDate"),
            num_pages=metadata.get("NumberOfPages"),
            subjects=metadata.get("Subjects"),
        )
        return self.__metadata

    def __outland(self, isbn: str) -> Optional[Dict[str, Optional[str]]]:
        """Scrape outland.no for *isbn* and return the raw values.

        Args:
            isbn: The ISBN to look up, as a string.

        Returns:
            A dict with the keys ``Title``, ``PublishDate``, ``Authors``,
            ``NumberOfPages``, ``Genre``, ``Language`` and ``Subjects``
            (values that were not found are ``None``), or ``None`` when no
            product link was found or any step of the scrape failed.
        """
        try:
            from bs4 import BeautifulSoup

            listing = requests.get(
                f"{self._BASE_URL}{isbn}", timeout=self._REQUEST_TIMEOUT
            )
            listing_page = BeautifulSoup(listing.content, "html.parser")

            # Only the first product link on the page is followed.
            product_link = listing_page.find("a", class_="product-item-link")
            href = product_link.get("href") if product_link is not None else None
            if not href:
                self._log.info("No outland.no product link found for ISBN %s", isbn)
                return None

            product = requests.get(href, timeout=self._REQUEST_TIMEOUT)
            page = BeautifulSoup(product.content, "html.parser")

            title_tag = page.find("span", class_="base")
            release_tag = page.find("span", class_="release-date")

            # Only the last four characters of the release date are kept,
            # and only when they actually form a year.
            publish_year = None
            if release_tag is not None:
                tail = release_tag.text.strip()[-4:]
                if len(tail) == 4 and tail.isdigit():
                    publish_year = tail

            book_data: Dict[str, Optional[str]] = {
                "Title": title_tag.text.strip() if title_tag is not None else None,
                "PublishDate": publish_year,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            # Lower-case the labels once instead of once per cell.
            labels = {key: label.lower() for key, label in self._FIELD_LABELS.items()}

            for cell in page.find_all("td", class_="col data"):
                # The label is searched for in the cell's full markup (tag,
                # attributes and text), not only in its visible text; the
                # stored value is the cell's own text.
                markup = str(cell).lower()
                for key, label in labels.items():
                    if label in markup:
                        book_data[key] = cell.text.strip()
                        break  # a cell fills at most one field
            return book_data
        except Exception:
            # Preserve the original best-effort contract: any failure
            # (network error, unexpected markup, missing parser) yields
            # "no metadata" instead of propagating to the caller.
            self._log.exception("Failed to scrape outland.no for ISBN %s", isbn)
            return None
if __name__ == '__main__':
    # Manual smoke test: look up one sample ISBN and print whatever the
    # fetcher returns for it.
    sample_isbn = '9781947808225'
    print(OutlandScraperFetcher().fetch_metadata(sample_isbn))