runner.py
#!/usr/bin/env python3
# coding: utf-8
"""Crawl s.weibo.com for every morph of each glossary term and export the results."""
from responsehandler import ResponseHandler
from termparser import TermParser
from webdriver import Driver

BATCH_SIZE = 10


def harvest_html(driver, term):
    """Search for a term and return the rendered page HTML."""
    driver.search(term)
    return driver.page_source()


def export_batch(batch_no, terms_d, items):
    """Export the results collected so far to a tab-separated file for this batch."""
    with open(f'./output/morphs-res-{batch_no}.csv', 'w', encoding='utf-8') as f:
        for term in items:
            payload = items.get(term)
            # The crawled payload may not carry the gloss; fall back to the
            # parsed term data (terms_d) before defaulting to "n/a".
            meaning = payload.get('meaning') or terms_d.get(term, {}).get('meaning') or "n/a"
            for morph_data in payload.get('morphs'):
                morph = morph_data.get('morph') or "n/a"
                morph_type = morph_data.get('type') or "n/a"  # avoid shadowing builtin type()
                res = morph_data.get('res') or "n/a"
                f.write('\t'.join([term, meaning, morph, morph_type, res]) + '\n')
    print('Done writing: batch', batch_no)


# Run Driver
driver = Driver("http://s.weibo.com", "searchInp_form", "searchBtn")

# Populate terms
termparser = TermParser("gmh-lex-glossary.html")
terms = termparser.get_terms()

# Crawl for term results
morphs = {}
i = 0
for term in terms:
    i += 1
    print(i)
    morph_list = []
    for morph_data in terms.get(term).get("morphs"):
        html = harvest_html(driver, morph_data.get("morph"))
        handler = ResponseHandler(html)
        morph_data.update({"res": handler.response_type()})
        morph_list.append(morph_data)
    morphs.update({term: {"term": term, "morphs": morph_list}})
    # Export everything collected so far; each term's results land in batch i
    export_batch(i, terms, morphs)

driver.quit()