# Yahoo Auctions keyword scraper: crawls search results and saves them to CSV.
import csv
|
|
|
|
from datetime import datetime
|
|
from dataclasses import dataclass
|
|
|
|
from tqdm import tqdm
|
|
from bs4 import BeautifulSoup
|
|
from bs4.element import Tag, NavigableString
|
|
import requests
|
|
|
|
# Root of the Yahoo Auctions site; all search requests are made relative to it.
BASE_URL = "https://auctions.yahoo.co.jp/"

# Search terms to crawl — one CSV file is produced per keyword on each run.
KEYWORDS = [
    "keyword1",
    "keyword2",
    "keyword3",
]
|
|
|
|
|
|
@dataclass
class Advertisement:
    """One auction listing scraped from a Yahoo Auctions search-results page."""

    # Listing title with newlines removed and surrounding whitespace stripped.
    title: str
    # Link to the listing page, as taken from the title anchor's href.
    url: str
    # Current bid price in yen (the "現在" figure).
    current_price: int
    # Buy-it-now ("即決") price in yen, or None when the listing has none.
    fixed_price: int | None
    # Number of bids placed so far.
    bids: int
    # Raw text of the listing's time element — presumably remaining auction
    # time as displayed on the page; TODO confirm against the live site.
    time_since_posted: str
    # Shipping fee in yen, a status string ('free', 'undecided',
    # 'refer to product page', or the raw page text), or None when the
    # postage element is absent.
    shipping_fee: int | str | None
|
|
|
|
|
|
def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    """Scrape every search-result page for *keyword* and return the parsed listings.

    Fetches the first results page to discover the total hit count, then walks
    the index 100 items at a time (the site's page size), parsing each listing
    with extract_advertisement_data().

    Raises SystemExit(1) when the result count cannot be parsed — typically a
    page-layout change or a bot-detection page; the received HTML is printed
    for debugging before exiting.
    """
    # BASE_URL ends with '/', so strip it to avoid a '//' in the request URL.
    search_url = f"{BASE_URL.rstrip('/')}/search/search"
    # timeout so a stalled connection cannot hang the crawl forever
    response = requests.get(search_url, {"p": keyword}, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        number_of_results = soup.find(class_='Tab__subText').text.removesuffix('件').replace(',', '')
        number_of_pages_to_scrape = (int(number_of_results) // 100) + 1
    except (AttributeError, ValueError):
        # AttributeError: the result-count element is missing;
        # ValueError: its text is not a number. Either way the page is not
        # what we expect, so dump it and stop.
        print(soup.prettify())
        print('Failed to scrape page')
        print('See website contents above')
        raise SystemExit(1)

    result: list[Advertisement] = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        # 'b' is the 1-based index of the first item on the requested page.
        response = requests.get(search_url, {"p": keyword, "b": page * 100 + 1}, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        items = soup.find(class_='Products__items')
        if items is None:
            # No product list on this page (layout hiccup or past the end).
            continue
        for item in items:
            if isinstance(item, NavigableString):
                # Skip whitespace text nodes between the product tags.
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)

    return result
|
|
|
|
|
|
# Extract an Advertisement's data from a single listing's HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    """Parse one search-result item element into an Advertisement.

    *ad* is the BeautifulSoup Tag for a single listing inside the
    'Products__items' container. Prices are parsed from Japanese-formatted
    text ("1,234円"). NOTE(review): despite the `| None` return annotation,
    every path below returns an Advertisement; missing sub-elements would
    raise AttributeError instead — confirm whether callers rely on None.
    """
    title = (
        ad
        .find(class_='Product__title')
        .text
        .replace('\n', '')
        .strip()
    )

    url = (
        ad
        .find(class_='Product__title')
        .find('a')
        ['href']
    )

    # Current bid: the element labelled "現在" ("current"), whose sibling
    # holds the price text, e.g. "1,234円".
    current_price = int(
        ad
        .find(class_='Product__price')
        .find(string="現在")
        .parent
        .find_next_sibling()
        .text
        .removesuffix('円')
        .replace(',', '')
        .strip()
    )

    # A second price element labelled "即決" ("buy it now") is optional.
    all_prices = ad.find_all(class_='Product__price')
    if len(all_prices) > 1 and (fixed_price := all_prices[1].find(string="即決")) is not None:
        fixed_price = int(
            fixed_price
            .parent
            .find_next_sibling()
            .text
            .removesuffix('円')
            .replace(',', '')
            .strip()
        )
    else:
        fixed_price = None

    bids = int(
        ad
        .find(class_='Product__bid')
        .text
        .strip()
    )

    time_since_posted = (
        ad
        .find(class_='Product__time')
        .text
        .strip()
    )

    # If a 'Product__postage' element exists, extract the shipping fee;
    # otherwise shipping_fee stays None (the walrus result below).
    if (shipping_fee := ad.find(class_='Product__postage')) is not None:
        shipping_text = shipping_fee.text.strip()

        # "+送料…円" / "+送料…円〜" — a concrete (possibly minimum) fee in yen.
        if shipping_text.startswith('+送料'):
            shipping_fee = int(
                shipping_fee
                .text
                .strip()
                .removeprefix('+送料')
                .removesuffix('〜')
                .removesuffix('円')
                .replace(',', '')
            )

        # "送料未定" — shipping fee not yet decided.
        elif shipping_text == '送料未定':
            shipping_fee = 'undecided'

        # "送料は商品ページ参照" — see the product page for the fee.
        elif shipping_text == '送料は商品ページ参照':
            shipping_fee = 'refer to product page'

        # Empty text plus the "送料無料" (free shipping) badge over the photo.
        elif shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
            shipping_fee = "free"

        # Anything else: keep the raw page text verbatim.
        else:
            shipping_fee = shipping_text

    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )
|
|
|
|
|
|
def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str) -> None:
    """Write *advertisements* to *filename* as CSV, one row per listing.

    The header row and column order follow the Advertisement field order.
    """
    # newline='' is required by the csv module (otherwise blank rows appear on
    # Windows); utf-8 keeps Japanese titles/status text portable regardless of
    # the platform's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=Advertisement.__dataclass_fields__.keys())
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(advertisement.__dict__)
|
|
|
|
|
|
def main():
    """Crawl each configured keyword and dump its listings to a timestamped CSV."""
    keyword_progress = tqdm(KEYWORDS)
    keyword_progress.set_description("Scraping websites...")

    for keyword in keyword_progress:
        started_at = datetime.now()
        listings = crawl_search_index_for_keyword(keyword)

        # ISO timestamp with ':' swapped for '-' so the name is a valid
        # filename on every platform.
        stamp = started_at.astimezone().replace(microsecond=0).isoformat().replace(':', '-')
        save_advertisements_to_csv(listings, f'{stamp}-{keyword}.csv')
|
|
|
|
|
|
# Run the scraper only when executed directly as a script.
if __name__ == "__main__":
    main()
|