forked from MrCroxx/CroxxProxyPool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler-sample.py
86 lines (54 loc) · 1.69 KB
/
crawler-sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#-*- coding:utf-8 -*-
'''
一个爬取链家网北京2号线二手房信息的爬虫
'''
from CroxxProxyPool import ProxyPool,Proxy
import requests
from lxml import etree
import threading
from datetime import datetime
import time
from winsound import Beep
pp = ProxyPool()
pp.start(source = 'xicidaili',delay = 30,ssl = True,debug = False)
mutex = threading.Lock()
count = 0
pages = 49
def crawl(page):
try:
global pp,count,mutex,pages
url = 'https://bj.lianjia.com/ditiefang/li656/pg' + str(page)
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
}
proxy = pp.pop()
print '< Page %s Crawling ... %s of %s >' % (page,count,pages),datetime.now()
proxies = {
'http':proxy.toURL(),
'https':proxy.toURL(),
}
print 'Proxy : ',proxy.toURL()
res = requests.get(url,headers = headers,proxies=proxies)
html = etree.HTML(res.text)
alist = html.xpath("//ul[@class='sellListContent']//div[@class='title']/a")
if len(alist) == 0:
print 0/0
pp.push(proxy)
mutex.acquire()
with open('url.txt','a') as f:
for a in alist:
f.write(a.attrib['href'] + '\n')
count += 1
print '< Page %s Finished ! %s of %s >' % (page,count,pages),datetime.now()
Beep(800,300)
mutex.release()
except:
print '< Page %s Failed ! Restarting ... %s of %s >' % (page,count,pages),datetime.now()
threading.Thread(target=crawl,args=(page,)).start()
for i in range(1,pages+1):
threading.Thread(target=crawl,args=(i,)).start()
while count < pages:
pass
time.sleep(1)
Beep(800,2000)
pp.stop()