import csv
import sys
from dataclasses import dataclass
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from tqdm import tqdm

BASE_URL = "https://auctions.yahoo.co.jp"

KEYWORDS = [
    "keyword1",
    "keyword2",
    "keyword3",
]


@dataclass
class Advertisement:
    title: str
    url: str
    current_price: int
    fixed_price: int | None
    bids: int
    time_since_posted: str
    shipping_fee: int | str | None


def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    response = requests.get(f"{BASE_URL}/search/search", params={"p": keyword})
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        number_of_results = (
            soup
            .find(class_='Tab__subText')
            .text
            .removesuffix('件')
            .replace(',', '')
        )
        # Results are paginated 100 per page; round up to cover the last partial page.
        number_of_pages_to_scrape = (int(number_of_results) + 99) // 100
    except (AttributeError, ValueError):
        print(soup.prettify())
        print('Failed to scrape page')
        print('See website contents above')
        sys.exit(1)

    result = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        # The "b" parameter is the 1-based index of the first item on the page.
        response = requests.get(f"{BASE_URL}/search/search", params={
            "p": keyword,
            "b": page * 100 + 1,
        })
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find(class_='Products__items'):
            # Skip whitespace-only text nodes between product tags.
            if isinstance(item, NavigableString):
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)

    return result


# Extract the Advertisement data from a single advertisement HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    title_element = ad.find(class_='Product__title')
    if title_element is None:
        # Not a regular product card (e.g. an injected ad block); skip it.
        return None

    title = title_element.text.replace('\n', '').strip()
    url = title_element.find('a')['href']

    current_price = int(
        ad
        .find(class_='Product__price')
        .find(string="現在")
        .parent
        .find_next_sibling()
        .text
        .removesuffix('円')
        .replace(',', '')
        .strip()
    )

    all_prices = ad.find_all(class_='Product__price')
    if len(all_prices) > 1 and (fixed_price := all_prices[1].find(string="即決")) is not None:
        fixed_price = int(
            fixed_price
            .parent
            .find_next_sibling()
            .text
            .removesuffix('円')
            .replace(',', '')
            .strip()
        )
    else:
        fixed_price = None

    bids = int(
        ad
        .find(class_='Product__bid')
        .text
        .strip()
    )

    time_since_posted = (
        ad
        .find(class_='Product__time')
        .text
        .strip()
    )

    # If there is a 'Product__postage' HTML element, extract the shipping fee.
    if (shipping_fee := ad.find(class_='Product__postage')) is not None:
        shipping_text = shipping_fee.text.strip()
        if shipping_text.startswith('+送料'):
            shipping_fee = int(
                shipping_text
                .removeprefix('+送料')
                .removesuffix('〜')
                .removesuffix('円')
                .replace(',', '')
            )
        elif shipping_text == '送料未定':
            shipping_fee = 'undecided'
        elif shipping_text == '送料は商品ページ参照':
            shipping_fee = 'refer to product page'
        # Fall back to the '送料無料' (free shipping) badge shown over the photo.
        elif shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
            shipping_fee = "free"
        else:
            shipping_fee = shipping_text

    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )


def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=Advertisement.__dataclass_fields__.keys())
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(advertisement.__dict__)


def main():
    progress_bar = tqdm(KEYWORDS)
    progress_bar.set_description("Scraping websites...")
    for keyword in progress_bar:
        timestamp = datetime.now()
        advertisements = crawl_search_index_for_keyword(keyword)
        # ':' is not allowed in filenames on some platforms, so swap it out.
        formatted_timestamp = (
            timestamp
            .astimezone()
            .replace(microsecond=0)
            .isoformat()
            .replace(':', '-')
        )
        save_advertisements_to_csv(advertisements, f'{formatted_timestamp}-{keyword}.csv')


if __name__ == "__main__":
    main()