added outland scraper fetcher

parent 7b5446a824
commit 0ce79cda52

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "alembic"
@@ -19,6 +19,24 @@ typing-extensions = ">=4"
 [package.extras]
 tz = ["python-dateutil"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "blinker"
 version = "1.6.2"
@@ -30,6 +48,19 @@ files = [
     {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"},
 ]
 
+[[package]]
+name = "bs4"
+version = "0.0.1"
+description = "Dummy package for Beautiful Soup"
+optional = false
+python-versions = "*"
+files = [
+    {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+
 [[package]]
 name = "certifi"
 version = "2023.7.22"
@@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+    {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.12"
@@ -579,7 +621,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"
 
 [package.extras]
@@ -681,4 +723,4 @@ email = ["email-validator"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454"
+content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00"

@@ -17,6 +17,7 @@ python = "^3.11"
 sqlalchemy = "^2.0.8"
 psycopg2-binary = "^2.9.6"
 requests = "^2.31.0"
+bs4 = "^0.0.1"
 
 [tool.poetry.group.dev.dependencies]
 werkzeug = "^2.3.3"

@@ -0,0 +1,88 @@
+from typing import Optional
+
+# import isbnlib
+# Uses requests and BeautifulSoup instead of isbnlib, since that code was already written.
+import requests
+from bs4 import BeautifulSoup
+
+from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
+from worblehat.models.BookMetadata import BookMetadata
+
+
+class OutlandScraperFetcher(BookMetadataFetcher):
+
+    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
+        metadata = self.__outland(isbn)
+        if not metadata or (not metadata.get('Title') and not metadata.get('Authors') and not metadata.get('Language')):
+            return None
+
+        # Parse the scraped data into a BookMetadata object.
+        self.__metadata = BookMetadata(
+            isbn = isbn,
+            title = metadata.get('Title'),
+            authors = metadata.get('Authors'),
+            language = metadata.get('Language'),
+            publish_date = metadata.get('PublishDate'),
+            num_pages = metadata.get('NumberOfPages'),
+            subjects = metadata.get('Subjects'),
+        )
+        return self.__metadata
+
+    # Scrape outland.no for the given ISBN and return the raw book data as a dictionary.
+    def __outland(self, isbn):
+        try:
+            # Search outland.no for the ISBN.
+            url = "https://outland.no/" + isbn
+            response = requests.get(url)
+            soup = BeautifulSoup(response.content, "html.parser")
+            # Get all hrefs from elements with class "product-item-link" and keep the first one.
+            links = soup.find_all("a", class_="product-item-link")
+            href = links[0].get("href")
+
+            # Fetch the product page behind the first href.
+            response = requests.get(href)
+            soup = BeautifulSoup(response.content, "html.parser")
+            # Get all table cells with class "col data", plus the title and release date.
+            data = soup.find_all("td", class_="col data")
+            base = soup.find_all("span", class_="base")[0].text
+            releaseDate = soup.find_all("span", class_="release-date")[0].text.strip()
+            # Only keep the year of the release date.
+            releaseDate = releaseDate[-4:]
+
+            # Map metadata keys to the Norwegian labels used on the product page.
+            interestingData = {
+                "Authors": "Forfattere",
+                "NumberOfPages": "Antall Sider",
+                "Genre": "Sjanger",
+                "Language": "Språk",
+                "Subjects": "Serie"
+            }
+            bookData = {
+                "Title": base,
+                "PublishDate": releaseDate,
+                "Authors": None,
+                "NumberOfPages": None,
+                "Genre": None,
+                "Language": None,
+                "Subjects": None
+            }
+
+            # For each label, store the text of the first cell whose markup contains that label.
+            for value in data:
+                for key in interestingData:
+                    if interestingData[key].lower() in str(value).lower():
+                        bookData[key] = value.text
+                        break
+
+            return bookData
+        except Exception as e:
+            print(str(e))
+            return None
+
+
+if __name__ == '__main__':
+    fetcher = OutlandScraperFetcher()
+    book_data = fetcher.fetch_metadata('9781947808225')
+    print(book_data)