get_url.py
# coding: utf-8
# Crawl a beijing.mingluji.com category listing: collect every entry URL on
# each page, write the URLs to a text file, and follow the "下一页" ("next
# page") link until the last page. Python 2 script (urllib2, print statement).
import urllib2
from lxml import etree
from urllib import unquote


def getHtml(url):
    # Download the raw HTML of a page.
    html = urllib2.urlopen(url).read()
    return html


def getUrls(content):
    # Collect the relative links of all entries in the category listing
    # and turn them into absolute URLs.
    urlList = content.xpath('//div[@class="mw-category"]//ul//li/a/@href')
    for index in range(len(urlList)):
        urlList[index] = 'https://beijing.mingluji.com' + urlList[index]
    return urlList


def getNextPage(content):
    # The second <a> inside div#mw-pages is the pagination link; return its
    # text and absolute URL (both lists may be empty on the last page).
    tagAText = content.xpath('//div[@id="mw-pages"]/a[2]/text()')
    tagAUrl = content.xpath('//div[@id="mw-pages"]/a[2]/@href')
    if tagAUrl:
        tagAUrl[0] = 'https://beijing.mingluji.com' + tagAUrl[0]
    return tagAText, tagAUrl


def parseHtml(html):
    # Parse one listing page: return the entry URLs plus the pagination link.
    content = etree.HTML(html)
    urlList = getUrls(content)
    tagAText, tagAUrl = getNextPage(content)
    return urlList, tagAText, tagAUrl


def crawl(f, url):
    # Fetch one listing page, append its entry URLs to the open file f,
    # and return the pagination link of that page.
    html = getHtml(url)
    url_list, tag_a_text, tag_a_url = parseHtml(html)
    for entry_url in url_list:
        f.write(entry_url + '\n')
    return tag_a_text, tag_a_url


if __name__ == '__main__':
    # Start from a category page; the part after the second ':' is the
    # percent-encoded category name (延庆县, "Yanqing County") and becomes
    # the output file name.
    url = 'https://beijing.mingluji.com/%E5%88%86%E7%B1%BB:%E5%BB%B6%E5%BA%86%E5%8E%BF'
    file_name = url.split(':')[2]
    count = 1
    f = open(unquote(file_name) + '.txt', 'w')
    tagA_text, tagA_url = crawl(f, url)
    if tagA_text:
        print tagA_text[0]
    # Keep following the link while its text is u'下一页' ("next page").
    while tagA_text and tagA_text[0] == u'下一页':
        new_url = tagA_url[0]
        tagA_text, tagA_url = crawl(f, new_url)
        print count
        count += 1
    f.close()
    print "over!!!!!!"