added outland scraper fetcher

parent 7b5446a824
commit 0ce79cda52

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "alembic"
@@ -19,6 +19,24 @@ typing-extensions = ">=4"
 [package.extras]
 tz = ["python-dateutil"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "blinker"
 version = "1.6.2"
@@ -30,6 +48,19 @@ files = [
     {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"},
 ]
 
+[[package]]
+name = "bs4"
+version = "0.0.1"
+description = "Dummy package for Beautiful Soup"
+optional = false
+python-versions = "*"
+files = [
+    {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+
 [[package]]
 name = "certifi"
 version = "2023.7.22"
@@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+    {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.12"
@@ -579,7 +621,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"
 
 [package.extras]
@@ -681,4 +723,4 @@ email = ["email-validator"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454"
+content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00"

@@ -17,6 +17,7 @@ python = "^3.11"
 sqlalchemy = "^2.0.8"
 psycopg2-binary = "^2.9.6"
 requests = "^2.31.0"
+bs4 = "^0.0.1"
 
 [tool.poetry.group.dev.dependencies]
 werkzeug = "^2.3.3"

@@ -0,0 +1,88 @@
+from typing import Optional
+
+# import isbnlib
+# Uses requests and BeautifulSoup instead of isbnlib, since that code was already written.
+import requests
+from bs4 import BeautifulSoup
+
+from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
+from worblehat.models.BookMetadata import BookMetadata
+
+
+class OutlandScraperFetcher(BookMetadataFetcher):
+
+    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
+        metadata = self.__outland(isbn)
+        if not metadata or (not metadata.get('Title') and not metadata.get('Authors') and not metadata.get('Language')):
+            return None
+
+        # Parse the scraped data into a BookMetadata object.
+        self.__metadata = BookMetadata(
+            isbn = isbn,
+            title = metadata.get('Title'),
+            authors = metadata.get('Authors'),
+            language = metadata.get('Language'),
+            publish_date = metadata.get('PublishDate'),
+            num_pages = metadata.get('NumberOfPages'),
+            subjects = metadata.get('Subjects'),
+        )
+        return self.__metadata
+
+    # Scrape outland.no for the given ISBN and return the raw book data as a dictionary.
+    def __outland(self, isbn):
+        try:
+            # Search outland.no for the ISBN.
+            url = "https://outland.no/" + isbn
+            response = requests.get(url)
+            soup = BeautifulSoup(response.content, "html.parser")
+            # Get all hrefs from elements with class "product-item-link" and keep the first one.
+            links = soup.find_all("a", class_="product-item-link")
+            href = links[0].get("href")
+
+            # Fetch the product page behind the first href.
+            response = requests.get(href)
+            soup = BeautifulSoup(response.content, "html.parser")
+            # Get all table cells with class "col data", plus the title and release date.
+            data = soup.find_all("td", class_="col data")
+            base = soup.find_all("span", class_="base")[0].text
+            releaseDate = soup.find_all("span", class_="release-date")[0].text.strip()
+            # Only keep the year of the release date.
+            releaseDate = releaseDate[-4:]
+
+            # Map metadata keys to the Norwegian labels used on the product page.
+            interestingData = {
+                "Authors": "Forfattere",
+                "NumberOfPages": "Antall Sider",
+                "Genre": "Sjanger",
+                "Language": "Språk",
+                "Subjects": "Serie"
+            }
+            bookData = {
+                "Title": base,
+                "PublishDate": releaseDate,
+                "Authors": None,
+                "NumberOfPages": None,
+                "Genre": None,
+                "Language": None,
+                "Subjects": None
+            }
+
+            # For each label, store the text of the first cell whose markup contains that label.
+            for value in data:
+                for key in interestingData:
+                    if interestingData[key].lower() in str(value).lower():
+                        bookData[key] = value.text
+                        break
+
+            return bookData
+        except Exception as e:
+            print(str(e))
+            return None
+
+
+if __name__ == '__main__':
+    fetcher = OutlandScraperFetcher()
+    book_data = fetcher.fetch_metadata('9781947808225')
+    print(book_data)