scrape.py
import csv

from bs4 import BeautifulSoup
from requests import get


def scrape_content():
    url = "https://en.wikipedia.org/wiki/List_of_fabrics"
    # Fetch the list page and collect the links to individual fabric articles
    response = get(url)
    page_content = BeautifulSoup(response.text, 'html.parser')
    fabrics_content = page_content.find('div', class_='mw-parser-output')
    all_links = fabrics_content.find_all('a')
    with open('fabrics.csv', 'w', newline='', encoding='utf-8') as data_file:
        data_file_writer = csv.writer(data_file)
        for link in all_links:
            try:
                if link['href'].startswith('/wiki/'):
                    fabric_name = link['title']
                    # Skip the generic lead links that are not fabrics
                    if fabric_name in ['Textile', 'Fibre']:
                        continue
                    # href already begins with '/wiki/', so join without a trailing slash
                    fabric_link = "https://en.wikipedia.org" + link['href']
                    dedicated_fabric_content = BeautifulSoup(get(fabric_link).text, 'html.parser')
                    fabric_description = dedicated_fabric_content.find('div', class_="mw-parser-output").find('p').get_text()
                    data_file_writer.writerow([fabric_name, fabric_description, fabric_link])
            except KeyError:
                # Links missing an href or title attribute appear only after
                # the fabric list, so stop at the first one
                break
    return True


print(scrape_content())
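
For reference, a minimal sketch of consuming the output, assuming the three-column rows (name, description, link) written by scrape_content above; it uses only the standard-library csv module:

import csv

# Read the scraped rows back in and print a short summary line per fabric.
with open('fabrics.csv', newline='', encoding='utf-8') as data_file:
    for fabric_name, fabric_description, fabric_link in csv.reader(data_file):
        print(f"{fabric_name}: {fabric_link}")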