# Pastebin capture metadata (not code): 2025-03-18 00:33:07 +01:00, 183 lines, 4.5 KiB, Python
import csv
from datetime import datetime
from dataclasses import dataclass
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString
import requests
# NOTE(review): BASE_URL ends with '/' while both call sites also prepend
# '/search/search', so requests go to '.../jp//search/search' — servers
# tolerate the double slash, but confirm before relying on exact URLs.
BASE_URL = "https://auctions.yahoo.co.jp/"
# Search terms to scrape; placeholders — replace with the real keywords.
KEYWORDS = [
"keyword1",
"keyword2",
"keyword3",
]
@dataclass
class Advertisement:
title: str
url: str
current_price: int
fixed_price: int | None
bids: int
time_since_posted: str
shipping_fee: int | str | None
def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    """Scrape every search-result page for *keyword* and return parsed listings.

    The search endpoint serves 100 items per page; pagination is driven by
    the ``b`` parameter (1-based offset of the first item on the page).

    Exits the process with status 1 when the result count cannot be read
    from the first page (layout change, bot wall, etc.), after dumping the
    received HTML for inspection.
    """
    response = requests.get(f"{BASE_URL}/search/search", {"p": keyword})
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        # Counter reads like "1,234件": strip the item-counter suffix and
        # thousands separators before converting.
        # NOTE(review): the suffix char was lost from the original paste's
        # removesuffix(''); '件' is Yahoo's counter suffix — confirm on live markup.
        raw_count = soup.find(class_='Tab__subText').text
        number_of_results = raw_count.removesuffix('件').replace(',', '').strip()
        number_of_pages_to_scrape = (int(number_of_results) // 100) + 1
    except (AttributeError, ValueError):
        # AttributeError: the counter element is missing (find() returned None);
        # ValueError: its text was not a number. Anything else should propagate.
        print(soup.prettify())
        print('Failed to scrape page')
        print('See website contents above')
        raise SystemExit(1)

    result: list[Advertisement] = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        response = requests.get(f"{BASE_URL}/search/search", {
            "p": keyword,
            "b": page * 100 + 1,  # offset of the first item on this page
        })
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find(class_='Products__items'):
            # Skip the whitespace/text nodes interleaved between product tags.
            if isinstance(item, NavigableString):
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)
    return result
# Parse the data of one Advertisement from its HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    """Parse one product tile (child of 'Products__items') into an Advertisement.

    Prices render like "1,234円": the yen suffix and thousands separators are
    stripped before int() conversion.
    NOTE(review): the suffix characters in the original's removesuffix('')
    calls were lost in transit; '円' (and '〜' for open-ended shipping) are
    restored here — confirm against the live markup.
    """
    def _parse_yen(text: str) -> int:
        # "1,234円" -> 1234 (removesuffix is a no-op when the suffix is absent)
        return int(text.removesuffix('円').replace(',', '').strip())

    # The title element is needed twice (text + link); look it up once.
    title_tag = ad.find(class_='Product__title')
    title = title_tag.text.replace('\n', '').strip()
    url = title_tag.find('a')['href']

    current_price = _parse_yen(
        ad
        .find(class_='Product__price')
        .find(string="現在")   # "current price" label
        .parent
        .find_next_sibling()
        .text
    )

    # A second 'Product__price' element carrying "即決" is the buy-it-now price.
    all_prices = ad.find_all(class_='Product__price')
    if len(all_prices) > 1 and (fixed_label := all_prices[1].find(string="即決")) is not None:
        fixed_price = _parse_yen(fixed_label.parent.find_next_sibling().text)
    else:
        fixed_price = None

    bids = int(ad.find(class_='Product__bid').text.strip())
    time_since_posted = ad.find(class_='Product__time').text.strip()

    # If a 'Product__postage' element exists, read the shipping fee from it;
    # otherwise shipping_fee stays None.
    shipping_fee: int | str | None = None
    if (postage := ad.find(class_='Product__postage')) is not None:
        shipping_text = postage.text.strip()
        if shipping_text.startswith('+送料'):
            # "+送料1,234円" (possibly open-ended "…〜") -> 1234
            shipping_fee = int(
                shipping_text
                .removeprefix('+送料')
                .removesuffix('〜')
                .removesuffix('円')
                .replace(',', '')
            )
        elif shipping_text == '送料未定':
            shipping_fee = 'undecided'
        elif shipping_text == '送料は商品ページ参照':
            shipping_fee = 'refer to product page'
        # "free shipping" is shown as a badge over the photo, with empty postage text.
        elif shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
            shipping_fee = "free"
        else:
            # Unrecognized wording: keep the raw text rather than guessing.
            shipping_fee = shipping_text

    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )
def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str):
    """Write *advertisements* to *filename* as CSV, one row per listing.

    Columns follow the Advertisement field declaration order; the header row
    is always written, even for an empty list.
    """
    fieldnames = Advertisement.__dataclass_fields__.keys()
    # newline='' is required by the csv module (avoids blank rows on Windows);
    # explicit utf-8 keeps the Japanese listing text portable across platforms.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(vars(advertisement))
def main():
    """Scrape each configured keyword and dump its results to a timestamped CSV."""
    keyword_bar = tqdm(KEYWORDS)
    keyword_bar.set_description("Scraping websites...")
    for keyword in keyword_bar:
        started_at = datetime.now()
        advertisements = crawl_search_index_for_keyword(keyword)
        # Local-time ISO timestamp with ':' swapped for '-' so it is a
        # legal filename on every platform.
        stamp = started_at.astimezone().replace(microsecond=0).isoformat().replace(':', '-')
        save_advertisements_to_csv(advertisements, f'{stamp}-{keyword}.csv')


if __name__ == "__main__":
    main()