commit 53d1ed3594

scrape.py | 179 lines | Normal file
@@ -0,0 +1,179 @@
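"""Scrape Yahoo! Auctions (auctions.yahoo.co.jp) search results for each keyword in
keywords.txt and save each keyword's listings to a timestamped CSV file."""
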
import csv
from dataclasses import dataclass
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from tqdm import tqdm

BASE_URL = "https://auctions.yahoo.co.jp"


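# One listing ("advertisement") from the search results page. shipping_fee is a yen
# amount, a status string such as "free" or "undecided", or None if the listing has
# no postage element.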
@dataclass
class Advertisement:
    title: str
    url: str
    current_price: int
    fixed_price: int | None
    bids: int
    time_since_posted: str
    shipping_fee: int | str | None


def crawl_search_index_for_keyword(keyword: str) -> list[Advertisement]:
    response = requests.get(f"{BASE_URL}/search/search", params={
        "p": keyword,
    })
    soup = BeautifulSoup(response.text, "html.parser")

    # The result counter (e.g. "1,234件") tells us how many 100-item pages to fetch.
    try:
        number_of_results = soup.find(class_='Tab__subText').text.removesuffix('件').replace(',', '')
        number_of_pages_to_scrape = (int(number_of_results) // 100) + 1
    except (AttributeError, ValueError):
        print('Failed to scrape page\n')
        print('Content of page:')
        print(soup.prettify())
        return []

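    # Walk every results page and collect its listings. The "b" query parameter is the
    # 1-based index of the first result on a page; results come 100 per page.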
    result = []
    progress_bar = tqdm(range(number_of_pages_to_scrape), leave=False)
    progress_bar.set_description(f"Scraping {number_of_results} items for '{keyword}'...")
    for page in progress_bar:
        response = requests.get(f"{BASE_URL}/search/search", params={
            "p": keyword,
            "b": page * 100 + 1,
        })
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find(class_='Products__items'):
            # Skip the bare text nodes between listing tags.
            if isinstance(item, NavigableString):
                continue
            advertisement_data = extract_advertisement_data(item)
            if advertisement_data is not None:
                result.append(advertisement_data)

    return result


# Extract an Advertisement's fields from a single listing's HTML element.
def extract_advertisement_data(ad: Tag) -> Advertisement | None:
    # Elements without a Product__title are not real listings; return None so the
    # caller can skip them.
    if ad.find(class_='Product__title') is None:
        return None

    title = (
        ad
        .find(class_='Product__title')
        .text
        .replace('\n', '')
        .strip()
    )

    url = (
        ad
        .find(class_='Product__title')
        .find('a')
        ['href']
    )

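    # Current price: the text next to the "現在" ("current") label, e.g. "1,200円".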
    current_price = int(
        ad
        .find(class_='Product__price')
        .find(string="現在")
        .parent
        .find_next_sibling()
        .text
        .removesuffix('円')
        .replace(',', '')
        .strip()
    )

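    # A second Product__price block labelled "即決" (buy-it-now) holds the optional fixed price.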
    all_prices = ad.find_all(class_='Product__price')
    if len(all_prices) > 1 and (fixed_price := all_prices[1].find(string="即決")) is not None:
        fixed_price = int(
            fixed_price
            .parent
            .find_next_sibling()
            .text
            .removesuffix('円')
            .replace(',', '')
            .strip()
        )
    else:
        fixed_price = None

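    # Bid count and the time label shown on the listing (the text of Product__time).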
    bids = int(
        ad
        .find(class_='Product__bid')
        .text
        .strip()
    )

    time_since_posted = (
        ad
        .find(class_='Product__time')
        .text
        .strip()
    )

    # If a 'Product__postage' element exists, extract the shipping fee from it.
    if (shipping_fee := ad.find(class_='Product__postage')) is not None:
        shipping_text = shipping_fee.text.strip()

        # e.g. "+送料700円" or "+送料700円〜" -> the numeric shipping fee in yen.
        if shipping_text.startswith('+送料'):
            shipping_fee = int(
                shipping_fee
                .text
                .strip()
                .removeprefix('+送料')
                .removesuffix('〜')
                .removesuffix('円')
                .replace(',', '')
            )

        # "送料未定" -> shipping fee not yet decided.
        elif shipping_text == '送料未定':
            shipping_fee = 'undecided'

        # "送料は商品ページ参照" -> see the product page for the shipping fee.
        elif shipping_text == '送料は商品ページ参照':
            shipping_fee = 'refer to product page'

        # An empty postage text plus a '送料無料' (free shipping) icon on the photo.
        elif shipping_text == '' and ad.find(class_='Product__icon--freeShipping') is not None:
            shipping_fee = "free"

        else:
            shipping_fee = shipping_text

    return Advertisement(
        title,
        url,
        current_price,
        fixed_price,
        bids,
        time_since_posted,
        shipping_fee,
    )


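# Write one CSV row per Advertisement, with the dataclass field names as column headers.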
def save_advertisements_to_csv(advertisements: list[Advertisement], filename: str):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=Advertisement.__dataclass_fields__.keys())
        writer.writeheader()
        for advertisement in advertisements:
            writer.writerow(advertisement.__dict__)


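# keywords.txt holds one search keyword per line; each keyword gets its own CSV file.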
def main():
    with open('keywords.txt', 'r') as file:
        keywords = file.read().splitlines()

    progress_bar = tqdm(keywords)
    progress_bar.set_description("Scraping websites...")

    for keyword in progress_bar:
        timestamp = datetime.now()
        advertisements = crawl_search_index_for_keyword(keyword)

        formatted_timestamp = timestamp.astimezone().replace(microsecond=0).isoformat()
        save_advertisements_to_csv(advertisements, f'{formatted_timestamp}-{keyword}.csv')


if __name__ == "__main__":
    main()