getxkcd.py
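"""Download every xkcd comic image and its explainxkcd explanation.

Images are saved to original/<n>.png, explanations to
original/<n>_transcript.txt, and the numbers of successfully
processed comics are recorded in original/index.txt.
"""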
from sys import exit
from bs4 import BeautifulSoup
import os
import re
import requests


# remove all html tags from a string
def removetags(html):
    cleanmatch = re.compile('<.*?>')
    cleantext = re.sub(cleanmatch, '', html)
    return cleantext
# download images and transcripts
def scrape():
    # make sure the output directory exists before writing into it
    os.makedirs('original', exist_ok=True)
    # open indexing file
    index_filename = 'original/index.txt'
    with open(index_filename, 'w') as index_file:
        # find the number of comics from the homepage's "prev" link
        try:
            homepage = requests.get('https://www.xkcd.com/').text
            homepage_soup = BeautifulSoup(homepage, 'lxml')
            prev_comic = homepage_soup.find('a', {'rel':'prev'}).get('href')[1:-1]
            num_comics = 1 + int(prev_comic)
        except (requests.RequestException, AttributeError, ValueError):
            print('Error: could not retrieve number of comics!')
            exit(1)
        # loop through all comics
        for comic_index in range(num_comics):
            if (comic_index + 1) % 10 == 0:
                print(comic_index + 1)
            # get the explainxkcd explanation page for this comic
            try:
                trans_url = 'https://www.explainxkcd.com/wiki/index.php/' + str(comic_index + 1)
                trans_raw = requests.get(trans_url)
                trans_html = trans_raw.text
                # slice out everything between the "Explanation" heading and the next section
                start = trans_html.find('>Explanation')
                end = trans_html.find('<span', start + 15)
                trans_html = trans_html[start + len('>Explanation'):end]
                trans_html = removetags(trans_html)
                #print('---\n')
                #print(trans_html)
                #print('\n---')
                filename = 'original/' + str(comic_index + 1) + '_transcript.txt'
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(trans_html)
            except (requests.RequestException, IOError):
                print('Error: transcripting XKCD ' + str(comic_index + 1) + '!')
                continue
            #print('Attempting to download XKCD ' + str(comic_index + 1) + '...')
            # get html file of comic and beautify
            try:
                comic_url = 'https://www.xkcd.com/' + str(comic_index + 1)
                comic_raw = requests.get(comic_url)
                comic_html = comic_raw.text
                comic_soup = BeautifulSoup(comic_html, 'lxml')
            except requests.RequestException:
                print('Error: could not load XKCD ' + str(comic_index + 1) + '!')
                continue
            # find image url
            try:
                image_url = comic_soup.find('div', {'id':'comic'}).img.get('src')
                image_url = 'https:' + image_url
            except AttributeError:
                print('Error: no image in XKCD ' + str(comic_index + 1) + '!')
                continue
            '''
            # find comic transcript (replaced with explainxkcd)
            try:
                transcript = comic_soup.find('div', {'id':'transcript'}).contents
            except AttributeError:
                print('Error: no transcript in XKCD ' + str(comic_index + 1) + '!')
            '''
            # download and write image to file
            try:
                filename = 'original/' + str(comic_index + 1) + '.png'
                with open(filename, 'wb') as f:
                    f.write(requests.get(image_url).content)
            except (requests.RequestException, IOError):
                print('Error: no picture in XKCD ' + str(comic_index + 1) + '!')
                continue
            '''
            # write transcript to file (replaced with explainxkcd)
            try:
                filename = 'original/' + str(comic_index + 1) + '_transcript.txt'
                with open(filename, 'w') as f:
                    for line in transcript:
                        f.write(str(line.encode('utf-8')))
            except IOError:
                print('Error: transcripting XKCD ' + str(comic_index + 1) + '!')
                continue
            '''
            # record comic number in index file
            index_file.write(str(comic_index + 1) + '\n')


# run the main function
if __name__ == '__main__':
    scrape()
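
For reference, a minimal sketch of how the files written by scrape() could be read back. It only assumes the output layout produced above (original/index.txt, original/<n>.png, original/<n>_transcript.txt); load_dataset is a hypothetical helper, not part of the script.

def load_dataset(root='original'):
    # return a list of (image path, transcript text) pairs for downloaded comics
    pairs = []
    with open(root + '/index.txt') as index_file:
        for line in index_file:
            number = line.strip()
            if not number:
                continue
            with open(root + '/' + number + '_transcript.txt', encoding='utf-8') as f:
                transcript = f.read()
            pairs.append((root + '/' + number + '.png', transcript))
    return pairs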