-
Notifications
You must be signed in to change notification settings - Fork 0
/
mklist-horrortheque-com
executable file
·67 lines (61 loc) · 1.84 KB
/
mklist-horrortheque-com
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
"""
Fetch the list of movies made by Horrortheque. All their movies
are supposed to be in the public domain. It is unclear how
the people behind the site determine the public domain
status.
"""
import json
import lxml.html
import movielib
import re
import urllib2
def fetch_movie_info(entryurl):
    """
    Fetch one movie page and extract its metadata.

    entryurl is the URL of the movie page to download.

    Returns a dict with the keys 'title' and 'year' (an int), plus
    'imdb' (the linked IMDb title URL, with a trailing slash) when the
    page links to one.  Returns None when the page can not be fetched
    (urllib2.HTTPError) or when no title or year can be found on it.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(entryurl))
    except urllib2.HTTPError:
        return None
    retval = {}
    for a in root.cssselect("div.two_third a[href]"):
        link = a.attrib['href']
        if 'imdb.com/title/' in link:
            # Add trailing slash if missing, to normalize URL
            if not link.endswith('/'):
                link += '/'
            # If several IMDb links are present, the last one wins.
            retval['imdb'] = link
    titles = root.cssselect("span.current")
    details = root.cssselect("div.two_third")
    if not titles or not details:
        # Page does not have the expected layout; report it the same
        # way as a failed download instead of raising IndexError.
        return None
    # re.search() returns None when the page has no "Year: NNNN" text.
    m = re.search(r"Year: (\d+)", details[0].text_content())
    if m is None:
        return None
    retval['title'] = titles[0].text_content()
    retval['year'] = int(m.group(1))
    # Progress output; the parenthesised form prints the same text
    # under the Python 2 print statement.
    print(retval)
    return retval
def fetch_movie_list(url):
    """
    Fetch the movie index page and collect info on every listed movie.

    url is the URL of the index page.  Every link matched by
    "div.azindex ul li a" is treated as a movie page and passed to
    fetch_movie_info().

    Returns a dict keyed by the IMDb URL of the movie when one is
    known, otherwise by the URL of the movie page.  Each value is a
    dict with the keys 'status', 'freenessurl', 'year' and 'title'.
    Returns None when the index page itself can not be fetched
    (urllib2.HTTPError).  Movie pages that yield no info are skipped
    with a warning on stderr.
    """
    import sys

    try:
        root = lxml.html.fromstring(movielib.http_get_read(url))
    except urllib2.HTTPError:
        return None
    movies = {}
    for a in root.cssselect("div.azindex ul li a"):
        entryurl = a.get('href')
        if not entryurl:
            # Anchor without a usable target; nothing to fetch.
            continue
        # Add trailing slash if missing, to avoid HTTP redirect
        if not entryurl.endswith('/'):
            entryurl += '/'
        info = fetch_movie_info(entryurl)
        if info is None:
            # fetch_movie_info() signals failure with None; skip this
            # entry rather than aborting the whole run.  %r keeps the
            # message ASCII-safe for any URL.
            sys.stderr.write("warning: no movie info for %r, skipping\n"
                             % (entryurl,))
            continue
        # Prefer the IMDb URL as key, fall back to the movie page URL.
        ref = info.get('imdb', entryurl)
        movies[ref] = {
            'status' : 'free',
            'freenessurl' : entryurl,
            'year' : info['year'],
            'title' : info['title'],
        }
    return movies
def main():
    """
    Fetch the movie list from the Horrortheque movie index and hand
    it to movielib.savelist() under the name
    'free-movies-horrortheque-com.json'.
    """
    index_url = "http://www.horrortheque.com/movie-index/"
    movies = fetch_movie_list(index_url)
    # NOTE(review): fetch_movie_list() returns None when the index page
    # can not be fetched; how savelist() treats None is not visible
    # here - confirm.
    movielib.savelist(movies, name='free-movies-horrortheque-com.json')


# Only run the scraper when executed as a script, not when imported.
if __name__ == '__main__':
    main()