-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-thehillproductions
executable file
·65 lines (58 loc) · 1.84 KB
/
mklist-thehillproductions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
"""
Fetch the list of movies made by The Hill Productions. All of their
movies are Creative Commons licensed.
"""
import json
import lxml.html
import movielib
import re
import urllib2
def fetch_movie_info(entryurl):
    """Scrape title, year and (if present) IMDb link from one movie page.

    The page at *entryurl* is downloaded with ``movielib.http_get_read``
    and parsed with lxml.

    Returns a dict with the keys ``'title'`` (the heading text before the
    trailing parenthesised number) and ``'year'`` (that number as an int),
    plus ``'imdb'`` when the page contains a link to ``imdb.com/title/``.

    Returns ``None`` when the download raises ``urllib2.HTTPError``, when
    the page has no ``header h1`` element, or when the heading is not of
    the form ``"Title (1234)"``.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(entryurl))
    except urllib2.HTTPError:
        return None

    retval = {}
    for a in root.cssselect("a[href]"):
        link = a.attrib['href']
        # If the page carries several IMDb links, the last one wins.
        if 'imdb.com/title/' in link:
            retval['imdb'] = link

    headings = root.cssselect("header h1")
    if not headings:
        # No heading to parse; report "no info" instead of raising
        # IndexError.
        return None

    # Strip surrounding whitespace so that it neither defeats the anchored
    # pattern nor ends up inside the title.
    titleyear = headings[0].text_content().strip()
    m = re.search(r"^(.+) \((\d+)\)$", titleyear)
    if m is None:
        # Heading does not look like "Title (year)".
        return None

    retval['title'] = m.group(1)
    retval['year'] = int(m.group(2))
    return retval
def fetch_movie_list(url):
    """Collect the movies linked from the page at *url*.

    Every ``a[onclick]`` element inside a ``div.symple-column`` is treated
    as a link to a movie page, which is then scraped with
    ``fetch_movie_info``.

    Returns a dict keyed by the movie's IMDb URL when one was found on its
    page, otherwise by the movie page URL.  Each value is a dict with the
    keys ``'status'`` (always ``'free'``), ``'freenessurl'`` (the movie
    page URL), ``'title'`` and ``'year'``.

    Returns ``None`` when fetching *url* raises ``urllib2.HTTPError``.
    Links without a usable ``href`` and movie pages that yield no info are
    skipped instead of aborting the whole run.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(url))
    except urllib2.HTTPError:
        return None

    movies = {}
    for a in root.cssselect("div.symple-column a[onclick]"):
        # The selector only guarantees an onclick attribute, not an href.
        entryurl = a.get('href')
        if not entryurl:
            continue
        # Skip 'coming soon' entries
        if 'http://www.thehillproductions.com/#' == entryurl:
            continue
        # Add trailing slash if missing, to avoid HTTP redirect
        if not entryurl.endswith('/'):
            entryurl = entryurl + '/'

        info = fetch_movie_info(entryurl)
        if info is None:
            # The movie page could not be fetched; leave it out of the list.
            continue

        # Prefer the IMDb URL as the key; fall back to the movie page URL.
        ref = info.get('imdb', entryurl)
        movies[ref] = {
            'status': 'free',
            'freenessurl': entryurl,
            'title': info['title'],
            'year': info['year'],
        }
    return movies
def main():
    """Scrape The Hill Productions front page and save the movie list.

    The result of ``fetch_movie_list`` is passed unchanged to
    ``movielib.savelist`` under the name
    ``free-movies-thehillproductions.json``.
    """
    movies = fetch_movie_list("http://www.thehillproductions.com/")
    movielib.savelist(movies, name='free-movies-thehillproductions.json')
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()