0
0
This commit is contained in:
Oystein Kristoffer Tveit 2025-03-18 00:32:17 +01:00
commit 53d1ed3594
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146

179
scrape.py Normal file

@ -0,0 +1,179 @@
import csv
from datetime import datetime
from dataclasses import dataclass
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString
import requests
# Root URL of Yahoo! Auctions; all search requests are made relative to it.
BASE_URL = "https://auctions.yahoo.co.jp/"
@dataclass
class Advertisement:
    """One auction listing scraped from a Yahoo! Auctions search-result page."""

    # Listing title text (newlines removed, whitespace stripped).
    title: str
    # href of the listing's anchor inside the title element.
    url: str
    # Current bid, parsed from the「現在」price field.
    current_price: int
    # Buy-it-now price from the「即決」field, or None when absent.
    fixed_price: int | None
    # Bid count parsed from the Product__bid element.
    bids: int
    # Raw text of the Product__time element (e.g. remaining time string).
    time_since_posted: str
    # Numeric fee, one of the descriptive strings ("free", "undecided",
    # "refer to product page", or raw text), or None when no postage
    # element exists on the listing.
    shipping_fee: int | str | None
def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    """Scrape every search-result page for *keyword* and collect its listings.

    Issues one request to read the total hit count, then walks the paginated
    results (100 items per page), extracting an ``Advertisement`` from each
    product element.

    Returns:
        All parsed listings, or an empty list when the hit count cannot be
        parsed (the page HTML is dumped to stdout for debugging in that case).
    """
    response = requests.get(f"{BASE_URL}/search/search", params={"p": keyword})
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        # Tab__subText holds the total hit count with thousands separators.
        # NOTE(review): removesuffix('') is a no-op — the suffix literal
        # (presumably "件") was lost somewhere; confirm against the live page.
        number_of_results = soup.find(class_='Tab__subText').text.removesuffix('').replace(',', '')
        number_of_pages_to_scrape = (int(number_of_results) // 100) + 1
    except (AttributeError, ValueError):
        # Layout changed or the request was blocked: dump the HTML for
        # diagnosis and return an empty list instead of None so callers
        # (e.g. save_advertisements_to_csv) do not crash.
        print('Failed to scrape page\n')
        print('Content of page:')
        print(soup.prettify())
        return []
    result = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        # "b" is the 1-based index of the first item on the requested page.
        response = requests.get(f"{BASE_URL}/search/search", params={
            "p": keyword,
            "b": page * 100 + 1,
        })
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find(class_='Products__items'):
            # Skip bare text nodes (whitespace) between product tags.
            if isinstance(item, NavigableString):
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)
    return result
def _price_to_int(text: str) -> int:
    """Parse an integer from a price string by keeping only its digits.

    Tolerates thousands separators, currency/unit characters and surrounding
    whitespace (e.g. "1,234円" -> 1234). Raises ValueError when no digits
    are present.
    """
    return int("".join(ch for ch in text if ch.isdigit()))


def _extract_shipping_fee(ad: Tag) -> int | str | None:
    """Determine the shipping fee for one product element.

    Returns the numeric fee when stated as "+送料<amount>", a descriptive
    string for the other known cases, or None when no Product__postage
    element exists at all.
    """
    postage = ad.find(class_='Product__postage')
    if postage is None:
        return None
    shipping_text = postage.text.strip()
    if shipping_text.startswith('+送料'):
        # "+送料1,234円" style: parse the numeric part after the prefix.
        return _price_to_int(shipping_text.removeprefix('+送料'))
    if shipping_text == '送料未定':
        return 'undecided'
    if shipping_text == '送料は商品ページ参照':
        return 'refer to product page'
    # An empty postage text plus the free-shipping icon over the photo
    # means shipping is free.
    if shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
        return 'free'
    return shipping_text


# Extract an Advertisement's data from a single Advertisement HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    """Extract an Advertisement from one product element of a search page.

    Returns None when the element does not have the expected structure
    (e.g. it is not a product listing, or the page layout changed).
    """
    try:
        title = (
            ad
            .find(class_='Product__title')
            .text
            .replace('\n', '')
            .strip()
        )
        url = ad.find(class_='Product__title').find('a')['href']
        # The first Product__price block holds the current bid ("現在");
        # the amount is in the sibling following the "現在" label.
        current_price = _price_to_int(
            ad
            .find(class_='Product__price')
            .find(string="現在")
            .parent
            .find_next_sibling()
            .text
        )
        # A second Product__price block, when present, carries the
        # buy-it-now ("即決") price.
        all_prices = ad.find_all(class_='Product__price')
        if len(all_prices) > 1 and (label := all_prices[1].find(string="即決")) is not None:
            fixed_price = _price_to_int(label.parent.find_next_sibling().text)
        else:
            fixed_price = None
        bids = int(ad.find(class_='Product__bid').text.strip())
        time_since_posted = ad.find(class_='Product__time').text.strip()
        shipping_fee = _extract_shipping_fee(ad)
    except (AttributeError, TypeError, KeyError, ValueError):
        # Malformed or non-listing element: honor the declared
        # `Advertisement | None` contract instead of crashing the crawl.
        return None
    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )
def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str):
    """Write *advertisements* to *filename* as CSV, one row per listing.

    The header row is taken from the Advertisement dataclass fields, so
    column order always matches the field declaration order.
    """
    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows); explicit utf-8 keeps the Japanese strings intact
    # regardless of the platform's default locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=Advertisement.__dataclass_fields__.keys())
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(advertisement.__dict__)
def main():
    """Scrape listings for every keyword in keywords.txt into per-keyword CSVs.

    Each output file is named "<ISO timestamp>-<keyword>.csv", with the
    timestamp taken just before that keyword's crawl starts.
    """
    # Keywords are expected to be UTF-8 (they may contain Japanese text).
    with open('keywords.txt', 'r', encoding='utf-8') as file:
        keywords = file.read().splitlines()
    progress_bar = tqdm(keywords)
    progress_bar.set_description("Scraping websites...")
    for keyword in progress_bar:
        timestamp = datetime.now()
        # The crawl may yield None when the result page could not be parsed;
        # fall back to an empty list so an (empty) CSV is still written
        # instead of crashing the whole run.
        advertisements = crawl_search_index_for_keyword(keyword) or []
        # NOTE(review): isoformat timestamps contain ':', which is invalid
        # in Windows filenames — confirm the target platform.
        formatted_timestamp = timestamp.astimezone().replace(microsecond=0).isoformat()
        save_advertisements_to_csv(advertisements, f'{formatted_timestamp}-{keyword}.csv')


if __name__ == "__main__":
    main()