From 0ce79cda5207f8dcd6b7b28a671797bf562cffe2 Mon Sep 17 00:00:00 2001 From: = <=> Date: Fri, 1 Sep 2023 15:32:17 +0200 Subject: [PATCH] added outland scraper fetcher --- poetry.lock | 48 +++++++++- pyproject.toml | 1 + .../outland_scraper_fetcher.py | 88 +++++++++++++++++++ 3 files changed, 134 insertions(+), 3 deletions(-) create mode 100644 worblehat/services/metadata_fetchers/outland_scraper_fetcher.py diff --git a/poetry.lock b/poetry.lock index c27cc98..9bc0dc0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "alembic" @@ -19,6 +19,24 @@ typing-extensions = ">=4" [package.extras] tz = ["python-dateutil"] +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "blinker" version = "1.6.2" @@ -30,6 +48,19 @@ files = [ {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"}, ] +[[package]] +name = "bs4" +version = "0.0.1" +description = "Dummy package for Beautiful Soup" +optional = false +python-versions = "*" +files = [ + {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"}, +] + +[package.dependencies] +beautifulsoup4 = "*" + [[package]] name = "certifi" version = "2023.7.22" @@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks 
(>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "soupsieve" +version = "2.4.1" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.7" +files = [ + {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"}, + {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, +] + [[package]] name = "sqlalchemy" version = "2.0.12" @@ -579,7 +621,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.2.0" [package.extras] @@ -681,4 +723,4 @@ email = ["email-validator"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454" +content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00" diff --git a/pyproject.toml b/pyproject.toml index 092d0b5..5bc2175 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ python = "^3.11" sqlalchemy = "^2.0.8" psycopg2-binary = "^2.9.6" requests = "^2.31.0" +bs4 = "^0.0.1" [tool.poetry.group.dev.dependencies] werkzeug = "^2.3.3" diff --git a/worblehat/services/metadata_fetchers/outland_scraper_fetcher.py b/worblehat/services/metadata_fetchers/outland_scraper_fetcher.py new file mode 100644 index 0000000..5817166 --- /dev/null +++ 
from typing import Optional

import requests
from bs4 import BeautifulSoup

from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
from worblehat.models.BookMetadata import BookMetadata


class OutlandScraperFetcher(BookMetadataFetcher):
    """Fetch book metadata by scraping the outland.no web shop.

    Used instead of isbnlib because the scraping code already existed;
    all lookups are best-effort network calls.
    """

    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
        """Look up *isbn* on outland.no and return a BookMetadata.

        Returns None when scraping fails outright or when none of the
        core fields (title, authors, language) could be found — a result
        with none of those is considered useless.
        """
        raw = self.__outland(isbn)
        if not raw:
            return None
        if not any(raw.get(key) for key in ('Title', 'Authors', 'Language')):
            return None

        # Parse the raw scraped fields into a BookMetadata object.
        return BookMetadata(
            isbn=isbn,
            title=raw.get('Title'),
            authors=raw.get('Authors'),
            language=raw.get('Language'),
            publish_date=raw.get('PublishDate'),
            num_pages=raw.get('NumberOfPages'),
            subjects=raw.get('Subjects'),
        )

    def __outland(self, isbn: str) -> Optional[dict]:
        """Scrape outland.no for *isbn* and return a dict of raw fields.

        Returns None on any network or parsing error (best-effort: the
        error is printed, never raised to the caller).
        """
        try:
            # Search the shop for the ISBN and take the first product hit.
            response = requests.get('https://outland.no/' + isbn)
            soup = BeautifulSoup(response.content, 'html.parser')
            product_links = soup.find_all('a', class_='product-item-link')
            product_url = product_links[0].get('href')

            # Fetch the product detail page for that hit.
            response = requests.get(product_url)
            soup = BeautifulSoup(response.content, 'html.parser')

            detail_cells = soup.find_all('td', class_='col data')
            title = soup.find_all('span', class_='base')[0].text
            release_date = soup.find_all('span', class_='release-date')[0].text.strip()
            # Only keep the year of the release date.
            release_year = release_date[-4:]

            # Map our field names to the (Norwegian) labels used on the page.
            labels = {
                'Authors': 'Forfattere',
                'NumberOfPages': 'Antall Sider',
                'Genre': 'Sjanger',
                'Language': 'Språk',
                'Subjects': 'Serie',
            }
            book_data = {
                'Title': title,
                'PublishDate': release_year,
                'Authors': None,
                'NumberOfPages': None,
                'Genre': None,
                'Language': None,
                'Subjects': None,
            }

            # A cell's markup contains its label somewhere; match it
            # case-insensitively and use the cell's text as the value.
            for cell in detail_cells:
                cell_markup = str(cell).lower()
                for field, label in labels.items():
                    if label.lower() in cell_markup:
                        book_data[field] = cell.text
                        break

            return book_data
        except Exception as e:
            # Best-effort scraper: report the problem and signal failure
            # with None (the original returned False, an inconsistent type).
            print(str(e))
            return None


if __name__ == '__main__':
    fetcher = OutlandScraperFetcher()
    book_data = fetcher.fetch_metadata('9781947808225')
    print(book_data)