added outland scraper fetcher
parent 7b5446a824
commit 0ce79cda52
poetry.lock (generated, 48 lines changed)
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "alembic"
@@ -19,6 +19,24 @@ typing-extensions = ">=4"
 [package.extras]
 tz = ["python-dateutil"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "blinker"
 version = "1.6.2"
@@ -30,6 +48,19 @@ files = [
     {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"},
 ]
 
+[[package]]
+name = "bs4"
+version = "0.0.1"
+description = "Dummy package for Beautiful Soup"
+optional = false
+python-versions = "*"
+files = [
+    {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+
 [[package]]
 name = "certifi"
 version = "2023.7.22"
@@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+    {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.12"
@@ -579,7 +621,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"
 
 [package.extras]
@@ -681,4 +723,4 @@ email = ["email-validator"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454"
+content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00"
pyproject.toml

@@ -17,6 +17,7 @@ python = "^3.11"
 sqlalchemy = "^2.0.8"
 psycopg2-binary = "^2.9.6"
 requests = "^2.31.0"
+bs4 = "^0.0.1"
 
 [tool.poetry.group.dev.dependencies]
 werkzeug = "^2.3.3"
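As the lock entries above record, `bs4` on PyPI is only a dummy distribution ("Dummy package for Beautiful Soup") whose sole dependency is `beautifulsoup4`; the importable `bs4` module itself ships with `beautifulsoup4`, so pinning `beautifulsoup4` directly would work just as well. A minimal sanity check, assuming the environment described by this lockfile:

# `beautifulsoup4` installs the real `bs4` module; the `bs4` distribution
# pinned above is an empty shim that merely depends on it.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello</p>", "html.parser")
print(soup.p.text)  # prints: hello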
new file: OutlandScraperFetcher

@@ -0,0 +1,88 @@
from typing import Optional

# import isbnlib
# requests + BeautifulSoup are used instead of isbnlib, since the scraping
# code for them was already written.
import requests
from bs4 import BeautifulSoup

from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
from worblehat.models.BookMetadata import BookMetadata

class OutlandScraperFetcher(BookMetadataFetcher):

    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
        metadata = self.__outland(isbn)
        # Give up unless the scrape produced at least one of the key fields.
        if not metadata or not (
            metadata.get('Title') or metadata.get('Authors') or metadata.get('Language')
        ):
            return None

        # Parse the scraped values into a BookMetadata object.
        return BookMetadata(
            isbn=isbn,
            title=metadata.get('Title'),
            authors=metadata.get('Authors'),
            language=metadata.get('Language'),
            publish_date=metadata.get('PublishDate'),
            num_pages=metadata.get('NumberOfPages'),
            subjects=metadata.get('Subjects'),
        )

    # Scrape data for the given ISBN from outland.no and return it as a dict.
    def __outland(self, isbn):
        try:
            # Search outland.no for the ISBN.
            url = "https://outland.no/" + isbn
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            # Collect the search hits (links with class "product-item-link")
            # and keep the first hit's href.
            links = soup.find_all("a", class_="product-item-link")
            href = links[0].get("href")

            # Fetch the product page behind that first hit.
            response = requests.get(href)
            soup = BeautifulSoup(response.content, "html.parser")
            # Grab the spec-table cells, the title and the release date.
            data = soup.find_all("td", class_="col data")
            base = soup.find_all("span", class_="base")[0].text
            release_date = soup.find_all("span", class_="release-date")[0].text.strip()
            # Only keep the year of the release date.
            release_date = release_date[-4:]

            # Map result keys to the Norwegian labels used in Outland's spec
            # table, then pull the text of whichever cell contains each label.
            interesting_data = {
                "Authors": "Forfattere",
                "NumberOfPages": "Antall Sider",
                "Genre": "Sjanger",
                "Language": "Språk",
                "Subjects": "Serie",
            }
            book_data = {
                "Title": base,
                "PublishDate": release_date,
                "Authors": None,
                "NumberOfPages": None,
                "Genre": None,
                "Language": None,
                "Subjects": None,
            }

            for value in data:
                for key in interesting_data:
                    if interesting_data[key].lower() in str(value).lower():
                        # Store this cell's text under the matching key.
                        book_data[key] = value.text
                        break

            return book_data
        except Exception as e:
            print(str(e))
            return None


if __name__ == '__main__':
    fetcher = OutlandScraperFetcher()
    book_data = fetcher.fetch_metadata('9781947808225')
    print(book_data)
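Because the `__main__` block above performs a live request against outland.no, an offline exercise of the fetcher can help. A minimal sketch that stubs out `requests.get`: the HTML below is invented and only imitates the class names the scraper looks for ("product-item-link", "base", "release-date", "col data"), not Outland's real markup, and the import path of the new module is a guess.

# Offline sketch: stub requests.get so no network traffic happens.
# The fake HTML mimics only the class names the scraper depends on.
from unittest.mock import Mock, patch

# Hypothetical import path for the new fetcher module.
from worblehat.services.metadata_fetchers.outland_scraper_fetcher import OutlandScraperFetcher

SEARCH_HTML = '<a class="product-item-link" href="https://outland.no/fake-book">hit</a>'
PRODUCT_HTML = '''
<span class="base">Fake Book</span>
<span class="release-date">01.01.2020</span>
<table>
  <tr><td class="col data">Forfattere: Jane Doe</td></tr>
  <tr><td class="col data">Språk: Engelsk</td></tr>
</table>
'''

def fake_get(url, *args, **kwargs):
    # Serve the product page for the followed link, the search page otherwise.
    response = Mock()
    response.content = PRODUCT_HTML if "fake-book" in url else SEARCH_HTML
    return response

with patch("requests.get", side_effect=fake_get):
    metadata = OutlandScraperFetcher().fetch_metadata("9781947808225")
    print(metadata)  # a BookMetadata built from the fake pages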