0
0
This commit is contained in:
Oystein Kristoffer Tveit 2025-03-18 00:32:17 +01:00
commit 53d1ed3594
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146

179
scrape.py Normal file

@ -0,0 +1,179 @@
import csv
from datetime import datetime
from dataclasses import dataclass
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString
import requests
# Root URL of Yahoo! Auctions; all search requests are made relative to it.
BASE_URL = "https://auctions.yahoo.co.jp/"
@dataclass
class Advertisement:
    """One auction listing scraped from a Yahoo! Auctions search-result page."""

    # Listing title text (newlines removed, whitespace stripped).
    title: str
    # href of the listing's anchor inside the title element.
    url: str
    # Current bid, parsed from the「現在」price field.
    current_price: int
    # Buy-it-now price from the「即決」field, or None when absent.
    fixed_price: int | None
    # Bid count parsed from the Product__bid element.
    bids: int
    # Raw text of the Product__time element (e.g. remaining time string).
    time_since_posted: str
    # Numeric fee, one of the descriptive strings ("free", "undecided",
    # "refer to product page", or raw text), or None when no postage
    # element exists on the listing.
    shipping_fee: int | str | None
def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    """Scrape every search-result page for *keyword* and collect its listings.

    Issues one request to read the total hit count, then walks the paginated
    results (100 items per page), extracting an ``Advertisement`` from each
    product element.

    Returns:
        All parsed listings, or an empty list when the hit count cannot be
        parsed (the page HTML is dumped to stdout for debugging in that case).
    """
    response = requests.get(f"{BASE_URL}/search/search", params={"p": keyword})
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        # Tab__subText holds the total hit count with thousands separators.
        # NOTE(review): removesuffix('') is a no-op — the suffix literal
        # (presumably "件") was lost somewhere; confirm against the live page.
        number_of_results = soup.find(class_='Tab__subText').text.removesuffix('').replace(',', '')
        number_of_pages_to_scrape = (int(number_of_results) // 100) + 1
    except (AttributeError, ValueError):
        # Layout changed or the request was blocked: dump the HTML for
        # diagnosis and return an empty list instead of None so callers
        # (e.g. save_advertisements_to_csv) do not crash.
        print('Failed to scrape page\n')
        print('Content of page:')
        print(soup.prettify())
        return []
    result = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        # "b" is the 1-based index of the first item on the requested page.
        response = requests.get(f"{BASE_URL}/search/search", params={
            "p": keyword,
            "b": page * 100 + 1,
        })
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find(class_='Products__items'):
            # Skip bare text nodes (whitespace) between product tags.
            if isinstance(item, NavigableString):
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)
    return result
def _price_to_int(text: str) -> int:
    """Parse an integer from a price string by keeping only its digits.

    Tolerates thousands separators, currency/unit characters and surrounding
    whitespace (e.g. "1,234円" -> 1234). Raises ValueError when no digits
    are present.
    """
    return int("".join(ch for ch in text if ch.isdigit()))


def _extract_shipping_fee(ad: Tag) -> int | str | None:
    """Determine the shipping fee for one product element.

    Returns the numeric fee when stated as "+送料<amount>", a descriptive
    string for the other known cases, or None when no Product__postage
    element exists at all.
    """
    postage = ad.find(class_='Product__postage')
    if postage is None:
        return None
    shipping_text = postage.text.strip()
    if shipping_text.startswith('+送料'):
        # "+送料1,234円" style: parse the numeric part after the prefix.
        return _price_to_int(shipping_text.removeprefix('+送料'))
    if shipping_text == '送料未定':
        return 'undecided'
    if shipping_text == '送料は商品ページ参照':
        return 'refer to product page'
    # An empty postage text plus the free-shipping icon over the photo
    # means shipping is free.
    if shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
        return 'free'
    return shipping_text


# Extract an Advertisement's data from a single Advertisement HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    """Extract an Advertisement from one product element of a search page.

    Returns None when the element does not have the expected structure
    (e.g. it is not a product listing, or the page layout changed).
    """
    try:
        title = (
            ad
            .find(class_='Product__title')
            .text
            .replace('\n', '')
            .strip()
        )
        url = ad.find(class_='Product__title').find('a')['href']
        # The first Product__price block holds the current bid ("現在");
        # the amount is in the sibling following the "現在" label.
        current_price = _price_to_int(
            ad
            .find(class_='Product__price')
            .find(string="現在")
            .parent
            .find_next_sibling()
            .text
        )
        # A second Product__price block, when present, carries the
        # buy-it-now ("即決") price.
        all_prices = ad.find_all(class_='Product__price')
        if len(all_prices) > 1 and (label := all_prices[1].find(string="即決")) is not None:
            fixed_price = _price_to_int(label.parent.find_next_sibling().text)
        else:
            fixed_price = None
        bids = int(ad.find(class_='Product__bid').text.strip())
        time_since_posted = ad.find(class_='Product__time').text.strip()
        shipping_fee = _extract_shipping_fee(ad)
    except (AttributeError, TypeError, KeyError, ValueError):
        # Malformed or non-listing element: honor the declared
        # `Advertisement | None` contract instead of crashing the crawl.
        return None
    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )
def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str):
    """Write *advertisements* to *filename* as CSV, one row per listing.

    The header row is taken from the Advertisement dataclass fields, so
    column order always matches the field declaration order.
    """
    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows); explicit utf-8 keeps the Japanese strings intact
    # regardless of the platform's default locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=Advertisement.__dataclass_fields__.keys())
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(advertisement.__dict__)
def main():
    """Scrape listings for every keyword in keywords.txt into per-keyword CSVs.

    Each output file is named "<ISO timestamp>-<keyword>.csv", with the
    timestamp taken just before that keyword's crawl starts.
    """
    # Keywords are expected to be UTF-8 (they may contain Japanese text).
    with open('keywords.txt', 'r', encoding='utf-8') as file:
        keywords = file.read().splitlines()
    progress_bar = tqdm(keywords)
    progress_bar.set_description("Scraping websites...")
    for keyword in progress_bar:
        timestamp = datetime.now()
        # The crawl may yield None when the result page could not be parsed;
        # fall back to an empty list so an (empty) CSV is still written
        # instead of crashing the whole run.
        advertisements = crawl_search_index_for_keyword(keyword) or []
        # NOTE(review): isoformat timestamps contain ':', which is invalid
        # in Windows filenames — confirm the target platform.
        formatted_timestamp = timestamp.astimezone().replace(microsecond=0).isoformat()
        save_advertisements_to_csv(advertisements, f'{formatted_timestamp}-{keyword}.csv')


if __name__ == "__main__":
    main()