-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapeindivpage.py
249 lines (211 loc) · 11.7 KB
/
scrapeindivpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import json
# This web-scraping script uses the requests and Beautiful Soup Python libraries.
# Collect the listing URLs: the first column of every non-empty CSV row.
# newline='' is what the csv module expects from the file object, and
# 'utf-8-sig' transparently drops a leading byte-order mark if the CSV has
# one, so the first URL is not corrupted by it.
with open('Listing Links (2024-07-24).csv', 'r', newline='', encoding='utf-8-sig') as file:
    links = [
        row[0].strip()
        for row in csv.reader(file)
        # Blank lines come back as empty rows; indexing them would raise IndexError.
        if row and row[0].strip()
    ]
# Placeholder stored whenever a field cannot be found on a listing page.
_NOT_AVAILABLE = "NA"
# Seconds to wait for a listing page before giving up on it.
_REQUEST_TIMEOUT = 30
# Errors raised when an expected element is missing from a page's markup
# (e.g. calling .find() on None, or indexing an empty result list).
_SCRAPE_ERRORS = (AttributeError, IndexError, KeyError, TypeError)
# Class string of the "property" paragraphs that hold labelled values such as date / location.
_PROPERTY_PARAGRAPH_CLASS = "paragraph paragraph--type--property paragraph--view-mode--default"


def _clean_spaces(text):
    """Return *text* with non-breaking spaces (and their mojibake form) turned into plain spaces."""
    # '\u00c2\u00a0' / '\u00c2 ' is what a UTF-8 non-breaking space looks like
    # after being decoded as Latin-1.
    for junk in ('\u00c2\u00a0', '\u00c2 ', '\u00a0'):
        text = text.replace(junk, ' ')
    return text


def _extract_date(soup):
    """Return the event date text of a listing page, or "NA" when none is found.

    Per the original analysis of the site, dates come in three shapes:
    (1) a free-text explanation when there is no exact date, e.g. "New date TBC";
    (2) a single day, e.g. "25 July 2024";
    (3) a range, rendered here as "5 July 2024 - 7 July 2024".
    """
    try:
        # Outer container holding the date label/value pair.
        date_div = (
            soup.find('div', {"class": "field field-name-field-event-date paragraph--type--property"})
            or soup.find('div', {"class": _PROPERTY_PARAGRAPH_CLASS})
        )
        if date_div:
            label = date_div.find('div', 'property-label').get_text()
            if "Date" not in label:
                return _NOT_AVAILABLE
            date_content = date_div.findAll('div', class_='property property-value')[0]
            dates = date_content.findAll('time')
            if len(dates) == 1:
                return dates[0].get_text()
            if len(dates) == 2:
                # Two <time> tags are the start and end of a range.
                return dates[0].get_text() + " - " + dates[1].get_text()
            # No <time> tags (date already given as text) or an unexpected
            # number of them: fall back to the container's full text.
            return date_content.get_text()

        # Dates usually sit in the top-left column, but are sometimes placed in
        # the main body of the page, in its first <p>.
        body_div = soup.findAll('div', class_='control-width clearfix')[0]
        first_para_text = body_div.findAll('p')[0].get_text()
        if "Date" not in first_para_text:
            return _NOT_AVAILABLE
        # The slice starts at 6, presumably to skip a leading "Date: " label.
        end_index = first_para_text.find("Location:")
        if end_index == -1:
            # No "Location:" marker: keep everything after the label instead
            # of slicing with -1, which would drop the last character.
            return first_para_text[6:]
        return first_para_text[6:end_index]
    except _SCRAPE_ERRORS:
        return _NOT_AVAILABLE


def _extract_location(soup):
    """Return the event location text of a listing page, or "NA" when none is found."""
    try:
        property_divs = soup.findAll('div', {"class": _PROPERTY_PARAGRAPH_CLASS})
        if property_divs:
            location = ""
            for para in property_divs:
                label = para.find('div', class_='property property-label').get_text()
                if "Location" in label or "Where" in label:
                    location = para.findAll('div', class_='property property-value')[0].get_text()
                    break
                if "Start" in label:
                    # Start/Finish pairs are combined as "<start> to <finish>".
                    location += para.findAll('div', class_='property property-value')[0].get_text() + " to "
                elif "Finish" in label:
                    location += para.findAll('div', class_='property property-value')[0].get_text()
            return location or _NOT_AVAILABLE

        # Location usually sits in the top-left column, but is sometimes placed
        # in the first <p> of the main body, introduced by a <strong> label.
        body_div = soup.findAll('div', class_='control-width clearfix')[0]
        first_para = body_div.findAll('p')[0]
        first_para_text = first_para.get_text()
        for strong in first_para.findAll('strong'):
            if "Location" not in strong.get_text():
                continue
            label_at = first_para_text.find("Location:")
            if label_at == -1:
                # Label worded differently; there is no reliable place to slice from.
                continue
            start = label_at + len("Location:") + 1  # skip the label and the space after it
            end = first_para_text.find("Open to:")
            if end == -1:
                return first_para_text[start:]
            return first_para_text[start:end]
        return _NOT_AVAILABLE
    except _SCRAPE_ERRORS:
        return _NOT_AVAILABLE


def _extract_description(soup):
    """Return the activity description of a listing page, or "NA" when none is found."""
    try:
        description = ""
        # Case 1: the description is added as extra text under the synopsis.
        synopsis_div = soup.findAll('div', 'event__synopsis column--middle')[0]
        sub_info_div = synopsis_div.findAll('div', 'field field-name-body control-width__inner--small')
        if sub_info_div:
            description = "".join(p.get_text() + " " for p in sub_info_div[0].findAll('p'))
        else:
            # Case 2: the description lives in the event content container.
            main_content = soup.find('div', 'event__content')
            sections = main_content.find('div', 'field field-name-field-wa-page-sections')
            text_areas = sections.findAll(
                'div',
                'paragraph paragraph--type--page-section paragraph--view-mode--default bundle--general-text-area control-width',
            )
            # NOTE(review): only the first two text areas are inspected; the
            # original comment suggests the description may sit within the
            # first three - confirm whether [:3] was intended.
            for para_div in text_areas[:2]:
                paras = para_div.findAll('p')
                if not paras:
                    # Nothing to inspect here; try the next text area.
                    continue
                if paras[0].findAll('strong'):
                    # A leading <strong> marks a non-description block.
                    continue
                description = "".join(p.get_text() for p in paras)
                # Teaching-resource pages sometimes carry part of the description in a <ul>.
                bullet_list = para_div.find('ul')
                if bullet_list:
                    description += bullet_list.get_text().strip()
                break
        description = _clean_spaces(description)
        return description or _NOT_AVAILABLE
    except _SCRAPE_ERRORS:
        return _NOT_AVAILABLE


def _extract_register_link(soup):
    """Return the href of the first "Register" button in document order, or "NA"."""
    # Iterating in document order (rather than over a set) keeps the choice
    # deterministic when a page has several Register buttons.
    for button in soup.findAll('a', "button"):
        if "Register" in button.get_text():
            href = button.get('href')
            if href:
                return href
    return _NOT_AVAILABLE


def _category_from_url(url):
    """Return the level-1 activity category taken from a listing URL.

    The category is the first path segment after "get-involved/", with dashes
    turned into spaces, e.g. ".../get-involved/events/some-run" -> "events".
    It is "" when nothing follows that segment, and "NA" when the URL has no
    "get-involved/" part at all.
    """
    _, marker, tail = url.partition('get-involved/')
    if not marker:
        return _NOT_AVAILABLE
    segment, slash, _ = tail.partition('/')
    # Without a further slash there is no category segment before the page slug.
    return segment.replace('-', ' ') if slash else ""


list_of_activity_details = []
for URL in links:
    print(URL)
    try:
        page = requests.get(URL, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable listing should not abort the whole run.
        print("skipping", URL, "-", err)
        continue

    # Data Cleaning part 2: ignore URLs that do not point to an HTML page.
    # Done here because it needs the response headers from requests.get().
    content_type = page.headers.get('content-type', '')
    if 'text/html' not in content_type:
        continue

    soup = BeautifulSoup(page.content, "html.parser")

    # [1] Event name
    heading = soup.find('h1')
    event_name = heading.text.strip() if heading is not None else _NOT_AVAILABLE

    # [2] Date and [3] location (if available)
    event_date = _extract_date(soup)
    event_loc = _extract_location(soup)

    # [4] Activity/event synopsis
    synopsis_div = soup.find('div', 'field field-name-field-synopsis')
    if synopsis_div is not None:
        event_synopsis = _clean_spaces(synopsis_div.get_text().strip())
    else:
        event_synopsis = _NOT_AVAILABLE

    # [5] Activity description and [6] registration link
    event_description = _extract_description(soup)
    event_link_to_register = _extract_register_link(soup)

    # [7] Level-1 activity category, taken from the page URL
    cat1_activity_type = _category_from_url(URL)

    # One record per listing; reused for both the CSV and the JSON output.
    list_of_activity_details.append({
        'Listing URL': URL,
        'Name of Activity': event_name,
        'Date': event_date,
        'Location': event_loc,
        'Event Synopsis': event_synopsis,
        'Event Description': event_description,
        'Registration Link': event_link_to_register,
        'Activity Category': cat1_activity_type,
    })

# Build the table once instead of concatenating inside the loop. Rows are
# reversed so the most recently scraped listing comes first, matching the
# row order this script has always written to the CSV.
activity_df = pd.DataFrame(list_of_activity_details[::-1])
# Persist the scraped listings as a CSV file (UTF-8 with a byte-order mark,
# no index column), then echo the per-listing records to stdout.
csv_output_path = 'Listings_Details.csv'
activity_df.to_csv(
    csv_output_path,
    index=False,
    encoding='utf-8-sig',
)
print(list_of_activity_details)
# JSON export, currently disabled:
# with open('Listing Details JSON.json', 'w') as file:
#     file.write(json.dumps(list_of_activity_details))