-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScrapyMcScrapeFace.py
67 lines (49 loc) · 1.79 KB
/
ScrapyMcScrapeFace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Bing Web Search scraper: pages through search results for `search_term`,
# fetches each result page, and dumps the text of its <p> tags into numbered
# .txt files under `output_dir`.
import requests
from bs4 import BeautifulSoup
import codecs

# --- Configuration ---------------------------------------------------------
subscription_key = ""  # Azure Cognitive Services key — must be filled in
assert subscription_key
search_term = "haiti earthquake 2018"
output_dir = '/dbfs/he/'   # presumably a Databricks FUSE mount — confirm
count = 10                 # results requested per API call
offset = 0                 # paging offset into the result set
ii = 0                     # running index used to name the output files

search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search"
# Headers are loop-invariant, so build them once.
headers = {"Ocp-Apim-Subscription-Key": subscription_key, 'Accept-Encoding': 'deflate'}

for k in range(1000):
    # All query parameters in one place (the original split them between a
    # hand-concatenated URL string and the params dict).
    params = {
        "q": search_term,
        "textDecorations": False,
        "textFormat": "HTML",
        "count": count,
        "offset": offset,
        "mkt": "en-us",
        "answerCount": 2,
        "responseFilter": "webpages,news",
    }
    response = requests.get(search_url, headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    #print(search_results["webPages"]['totalEstimatedMatches'])
    # Advance by the page size actually requested (was a hard-coded 10,
    # which desynchronizes paging if `count` is ever changed).
    offset += count
    # Past the last page of results the response may omit the webPages
    # answer entirely; skip instead of raising KeyError.
    if 'webPages' not in search_results:
        continue
    for v in search_results["webPages"]["value"]:
        # Fetch the result page; skip anything unreachable or too slow.
        # The timeout keeps one hung server from stalling the whole run.
        try:
            r = requests.get(v['url'], timeout=30)
            html_content = r.text
        except Exception:
            print("Skipping ", v["name"])
            continue
        # Parse the page and write every <p> tag's text, one per line.
        soup = BeautifulSoup(html_content, 'lxml')
        tags = soup.find_all('p')
        filename = output_dir + str(ii) + '.txt'
        with codecs.open(filename, "w", "utf-8-sig") as transcript:
            for tag in tags:
                transcript.write(tag.text + "\n")
        ii += 1
        print("Processed", v["name"], str(ii))