# multithread_scrape.py
#
# Looks up book titles and authors on Goodreads for a list of ASINs using a
# thread pool, appending the results to ./asin.csv.
import concurrent.futures
import csv
import os
import random
import threading
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Maximum number of worker threads in the scraping thread pool (see main()).
MAX_THREADS = 10
# Serializes appends to the shared output CSV. extract_info_scrape is run from
# several worker threads at once, and holding this lock while writing ensures
# rows from different threads cannot interleave.
_CSV_WRITE_LOCK = threading.Lock()


def extract_info_scrape(one_asin, timeout=10):
    """Search Goodreads for one ASIN and append the result to ./asin.csv.

    The first search hit's title and author are written as a CSV row
    ``[one_asin, title, author]``. If the results page has no title or no
    author element, both fields are written as empty strings.

    Args:
        one_asin: The identifier to search for; used as the ``q`` query value.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        None. If the HTTP request itself fails, the error is printed and no
        row is written for this ASIN.
    """
    # Small random delay so the worker threads do not all hit the site at once.
    time.sleep(random.uniform(0, 0.2))

    try:
        # Passing the query via ``params`` lets requests URL-encode the values;
        # "\u2713" is the check mark that encodes to utf8=%E2%9C%93.
        page = requests.get(
            "https://www.goodreads.com/search",
            params={"utf8": "\u2713", "q": one_asin, "search_type": "books"},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Report the failure instead of letting it vanish inside the pool.
        print("Request failed for {}: {!r}".format(one_asin, err))
        return

    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        # soup.find() returns None when nothing matches, so .getText() raises
        # AttributeError for a page without search results.
        title = soup.find(class_="bookTitle").getText().strip()
        print(title)
        author = soup.find(class_="authorName").getText().strip()
    except AttributeError:
        title = ""
        author = ""

    with _CSV_WRITE_LOCK:
        # newline='' is what the csv module expects for files it writes to;
        # an explicit encoding keeps non-ASCII titles from failing on
        # platforms whose default encoding is not UTF-8.
        with open('./asin.csv', mode='a', newline='', encoding='utf-8') as f:
            csv_writer = csv.writer(
                f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL
            )
            csv_writer.writerow([one_asin, title, author])
def main():
    """Scrape every ASIN listed in ./metadata_raw.csv and print the elapsed time.

    Reads the "asin" column of ./metadata_raw.csv, runs extract_info_scrape
    for each value on a pool of MAX_THREADS worker threads, prints a message
    for every task that raised an exception, and finally prints the total
    wall-clock time taken.
    """
    start_time = time.time()
    path = "./metadata_raw.csv"
    data = pd.read_csv(path)
    all_asin_list = data["asin"].tolist()
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # Keep each future paired with its ASIN so failures can be reported.
        # Exceptions raised in a worker are stored on the future and would be
        # lost silently if the results were never inspected.
        futures = {
            executor.submit(extract_info_scrape, asin): asin
            for asin in all_asin_list
        }
        for future in concurrent.futures.as_completed(futures):
            err = future.exception()
            if err is not None:
                # Best-effort scrape: report the failure and keep going.
                print("Failed to scrape {}: {!r}".format(futures[future], err))
    end_time = time.time()
    print("Total time taken: ", end_time-start_time)
# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()