added outland scraper fetcher

2023-09-01 15:32:17 +02:00
parent 7b5446a824
commit 0ce79cda52
3 changed files with 134 additions and 3 deletions
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.

 [[package]]
 name = "alembic"
@ -19,6 +19,24 @@ typing-extensions = ">=4"
 [package.extras]
 tz = ["python-dateutil"]

+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "blinker"
 version = "1.6.2"
@ -30,6 +48,19 @@ files = [
    {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"},
 ]

+[[package]]
+name = "bs4"
+version = "0.0.1"
+description = "Dummy package for Beautiful Soup"
+optional = false
+python-versions = "*"
+files = [
+    {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+
 [[package]]
 name = "certifi"
 version = "2023.7.22"
@ -528,6 +559,17 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+    {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.12"
@ -579,7 +621,7 @@ files = [
 ]

 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"

 [package.extras]
@ -681,4 +723,4 @@ email = ["email-validator"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "cfc36ccfe7ef6b7ea7ad473141163856a7bbcd9fb85a0416a385c481f0699454"
+content-hash = "3a08189ad749f780ddbbed3af8cf9503147cc55a729ef667d50567acdb484b00"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -17,6 +17,7 @@ python = "^3.11"
 sqlalchemy = "^2.0.8"
 psycopg2-binary = "^2.9.6"
 requests = "^2.31.0"
+bs4 = "^0.0.1"

 [tool.poetry.group.dev.dependencies]
 werkzeug = "^2.3.3"
--- a/worblehat/services/metadata_fetchers/outland_scraper_fetcher.py
+++ b/worblehat/services/metadata_fetchers/outland_scraper_fetcher.py
@ -0,0 +1,88 @@
+from typing import Dict, Optional
+
+# import isbnlib
+# json and requests are used instead of isbnlib because the code for them was already written
+import json
+import requests
+
+from worblehat.services.metadata_fetchers.base_fetcher import BookMetadataFetcher
+from worblehat.models.BookMetadata import BookMetadata
+
+class OutlandScraperFetcher(BookMetadataFetcher):
+    """Metadata fetcher that scrapes book data from the outland.no website."""
+
+    # Seconds to wait for outland.no before giving up. Without a timeout
+    # requests.get() may block indefinitely on an unresponsive server.
+    REQUEST_TIMEOUT = 10
+
+    def fetch_metadata(self, isbn: str) -> Optional[BookMetadata]:
+        """Return the metadata scraped for ``isbn``, or ``None``.
+
+        ``None`` is returned when scraping fails, or when the scraped data
+        contains neither a title, authors nor a language.
+        """
+        metadata = self.__outland(isbn)
+        if not metadata:
+            return None
+        if not (metadata.get('Title') or metadata.get('Authors') or metadata.get('Language')):
+            return None
+
+        # parse the metadata into a BookMetadata object
+        # ('Genre' is scraped as well but is not passed on here)
+        self.__metadata = BookMetadata(
+            isbn = isbn,
+            title = metadata.get('Title'),
+            authors = metadata.get('Authors'),
+            language = metadata.get('Language'),
+            publish_date = metadata.get('PublishDate'),
+            num_pages = metadata.get('NumberOfPages'),
+            subjects = metadata.get('Subjects'),
+        )
+        return self.__metadata
+
+    def __outland(self, isbn):
+        """Scrape outland.no for ``isbn`` and return the raw fields as a dict.
+
+        Requests ``https://outland.no/<isbn>``, follows the first
+        ``product-item-link`` anchor found on that page, and reads the title,
+        the release year and the labelled data cells from the linked page.
+
+        Returns a dict with the keys ``Title``, ``PublishDate``, ``Authors``,
+        ``NumberOfPages``, ``Genre``, ``Language`` and ``Subjects`` (values
+        that were not found stay ``None``), or ``None`` if any request or
+        parsing step raises; the error is printed in that case.
+        """
+        # Result key -> Norwegian label looked for in the page's data cells.
+        # Labels are lower-cased once here because the match is case-insensitive.
+        wanted_labels = {
+            "Authors": "Forfattere".lower(),
+            "NumberOfPages": "Antall Sider".lower(),
+            "Genre": "Sjanger".lower(),
+            "Language": "Språk".lower(),
+            "Subjects": "Serie".lower(),
+        }
+
+        try:
+            from bs4 import BeautifulSoup
+
+            # Look the ISBN up on outland.no
+            # NOTE(review): presumably this returns a listing/search page for
+            # the ISBN - confirm against the live site.
+            lookup = requests.get("https://outland.no/" + isbn, timeout=self.REQUEST_TIMEOUT)
+            lookup_soup = BeautifulSoup(lookup.content, "html.parser")
+            # The first anchor with class "product-item-link" is taken as the match;
+            # an empty result raises IndexError and is handled below.
+            product_links = lookup_soup.find_all("a", class_="product-item-link")
+            href = product_links[0].get("href")
+
+            # Fetch the linked product page and pull the fields out of it
+            product = requests.get(href, timeout=self.REQUEST_TIMEOUT)
+            product_soup = BeautifulSoup(product.content, "html.parser")
+            cells = product_soup.find_all("td", class_="col data")
+            title = product_soup.find_all("span", class_="base")[0].text
+            release_date = product_soup.find_all("span", class_="release-date")[0].text.strip()
+
+            book_data = {
+                "Title": title,
+                # only keep the year of the release date (its last four characters)
+                "PublishDate": release_date[-4:],
+                "Authors": None,
+                "NumberOfPages": None,
+                "Genre": None,
+                "Language": None,
+                "Subjects": None,
+            }
+
+            for cell in cells:
+                # Match against the cell's full HTML (tag attributes included),
+                # not only its visible text.
+                cell_html = str(cell).lower()
+                for key, label in wanted_labels.items():
+                    if label in cell_html:
+                        # store the text of the matching cell itself
+                        book_data[key] = cell.text
+                        break
+
+            return book_data
+        except Exception as e:
+            # Best-effort scraper: any network or parsing failure (including a
+            # timeout) means "no metadata" rather than a crash in the caller.
+            print(str(e))
+            return None
+        
+if __name__ == '__main__':
+    # Manual check when run as a script: fetch one sample ISBN and print
+    # whatever fetch_metadata returns.
+    print(OutlandScraperFetcher().fetch_metadata('9781947808225'))