full_text_Crawler.py
from TreadCrawler import ThreadUrlCrawler
import requests
from typing import Union
from test.test_proxy_pool import get_proxy, delete_proxy
from bs4 import BeautifulSoup
from Utils.MongoClient import MongoClient
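
# Note on the proxy helpers: get_proxy() and delete_proxy(), imported from
# test.test_proxy_pool above, are expected to return a dict containing a
# "proxy" entry ("host:port") and to remove a dead proxy from the pool,
# respectively. A minimal sketch of such helpers, assuming a proxy-pool HTTP
# service at a hypothetical local address (not part of this file), could be:
#
#     def get_proxy() -> dict:
#         return requests.get("http://127.0.0.1:5010/get/").json()
#
#     def delete_proxy(proxy: str) -> None:
#         requests.get(f"http://127.0.0.1:5010/delete/?proxy={proxy}")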


class FullTextCrawler(ThreadUrlCrawler):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
    }
    mongo_client = MongoClient("guba", "东方精工")
    # Count failures per proxy; a proxy is deleted from the pool once it has
    # failed this many times.
    failed_proxies = {}
    proxy_fail_times_threshold = 16

    def crawl(self, url):
        """
        The href of each item has a different parent path:
        1. https://caifuhao
        2. http://guba.eastmoney.com
        :param url: the href of an item whose full text is missing
        :return: True if the full text was crawled and stored, False otherwise
        """
        url_map = {
            "caifuhao": "https:",
            "/new": "http://guba.eastmoney.com",
        }
        full_text = None
        time = ""
        for k, v in url_map.items():
            if k in url:
                soup = self.get_soup_form_url(v + url)
                if soup is None:
                    return False
                # Posting time lives in a <div class="time"> element.
                try:
                    time = soup.find("div", {"class": "time"}).text
                except AttributeError:
                    time = ""
                # The body is either <div id="post_content"> (guba posts)
                # or <div class="newstext"> (news pages).
                try:
                    if soup.find("div", {"id": "post_content"}):
                        full_text = soup.find("div", {"id": "post_content"}).text
                    else:
                        full_text = soup.find("div", {"class": "newstext"}).text
                except AttributeError:
                    full_text = ""
                break
        if full_text:
            print(f"Successfully crawled: {url}, full text: {full_text}")
            self.mongo_client.update_one(
                {"href": url}, {"$set": {"full_text": full_text, "time": time}}
            )
            return True
        print(f"Failed to crawl: {url}")
        return False
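
    # Illustration of the href -> full URL mapping performed in crawl()
    # (the hrefs below are hypothetical examples, not taken from real data):
    #   "//caifuhao.eastmoney.com/news/..."  ->  "https://caifuhao.eastmoney.com/news/..."
    #   "/news,XXXXXX,XXXXXXXXXX.html"       ->  "http://guba.eastmoney.com/news,XXXXXX,XXXXXXXXXX.html"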

    def get_soup_form_url(self, url) -> Union[BeautifulSoup, None]:
        proxy = get_proxy().get("proxy")
        if proxy is None:
            print("proxy is None")
            return None
        proxies = {
            "http": f"http://{proxy}",
            "https": f"http://{proxy}",
        }
        try:
            # Fetch the page through the proxy.
            response = requests.get(
                url, headers=self.header, timeout=10, proxies=proxies
            )
            if response.status_code != 200:
                self._record_proxy_failure(proxy)
                return None
            # Decode the page source and build the soup object with the lxml parser.
            html = response.content.decode("utf-8", "ignore")
            return BeautifulSoup(html, features="lxml")
        except Exception:
            self._record_proxy_failure(proxy)
            return None

    def _record_proxy_failure(self, proxy):
        """Count a failure for this proxy and drop it from the pool once it
        reaches the failure threshold."""
        self.failed_proxies[proxy] = self.failed_proxies.get(proxy, 0) + 1
        if self.failed_proxies[proxy] >= self.proxy_fail_times_threshold:
            delete_proxy(proxy)
            del self.failed_proxies[proxy]


if __name__ == "__main__":
    full_text_crawler = FullTextCrawler()
    full_text_crawler.start()