Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
Boris committed Apr 8, 2021
2 parents 980abe0 + 2fdc965 commit dc3e242
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 12 deletions.
9 changes: 8 additions & 1 deletion feapder/core/spiders/batch_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from feapder.network.item import UpdateItem
from feapder.network.request import Request
from feapder.utils.log import log
from feapder.utils.perfect_dict import PerfectDict
from feapder.utils.redis_lock import RedisLock

CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
Expand Down Expand Up @@ -111,7 +112,7 @@ def __init__(
auto_start_requests=False,
send_run_time=send_run_time,
batch_interval=batch_interval,
task_table=task_table
task_table=task_table,
)

self._redisdb = RedisDB()
Expand Down Expand Up @@ -325,6 +326,9 @@ def distribute_task(self, tasks):
for task in tasks:
for parser in self._parsers: # 寻找task对应的parser
if parser.name in task:
task = PerfectDict(
_dict=dict(zip(self._task_keys, task)), _values=list(task)
)
requests = parser.start_requests(task)
if requests and not isinstance(requests, Iterable):
raise Exception(
Expand Down Expand Up @@ -372,6 +376,9 @@ def distribute_task(self, tasks):
else: # task没对应的parser 则将task下发到所有的parser
for task in tasks:
for parser in self._parsers:
task = PerfectDict(
_dict=dict(zip(self._task_keys, task)), _values=list(task)
)
requests = parser.start_requests(task)
if requests and not isinstance(requests, Iterable):
raise Exception(
Expand Down
10 changes: 4 additions & 6 deletions feapder/network/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"""

import requests
from feapder.utils.webdriver import WebDriverPool
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.exceptions import InsecureRequestWarning

Expand All @@ -21,6 +20,8 @@
from feapder.network.proxy_pool import proxy_pool
from feapder.network.response import Response
from feapder.utils.log import log
from feapder.utils.perfect_dict import PerfectDict
from feapder.utils.webdriver import WebDriverPool

# 屏蔽warning信息
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
Expand Down Expand Up @@ -215,7 +216,7 @@ def to_dict(self):
):
continue

if callable(value) or isinstance(value, Item): # 序列化 如item
if callable(value) or isinstance(value, (Item, PerfectDict)):
value = tools.dumps_obj(value)

request_dict[key] = value
Expand Down Expand Up @@ -254,10 +255,7 @@ def get_response(self, save_cached=False):
self.requests_kwargs.update(headers=headers)
else:
self.requests_kwargs.setdefault(
"headers",
{
"User-Agent": setting.DEFAULT_USERAGENT
},
"headers", {"User-Agent": setting.DEFAULT_USERAGENT}
)

# 代理
Expand Down
5 changes: 3 additions & 2 deletions feapder/network/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,9 @@ def text(self):
else:
self._cached_text = self._get_unicode_html(self.content)

self._cached_text = self._absolute_links(self._cached_text)
self._cached_text = self._del_special_character(self._cached_text)
if self._cached_text:
self._cached_text = self._absolute_links(self._cached_text)
self._cached_text = self._del_special_character(self._cached_text)

return self._cached_text

Expand Down
51 changes: 51 additions & 0 deletions feapder/utils/perfect_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
Created on 2021/4/8 11:32 上午
---------
@summary:
---------
@author: Boris
@email: [email protected]
"""


class PerfectDict(dict):
    """
    A dict whose items can also be read as attributes, by position and by
    sequence unpacking.

    Note that iterating (and therefore unpacking) an instance yields its
    *values* in positional order, not its keys, and that an ``int`` subscript
    is interpreted as a position into those values.

    >>> from feapder.utils.perfect_dict import PerfectDict
    >>> data = PerfectDict(id=1, url="xxx")
    >>> data
    {'id': 1, 'url': 'xxx'}
    >>> data.id
    1
    >>> data.get("id")
    1
    >>> data["id"]
    1
    >>> id, url = data
    >>> id
    1
    >>> url
    'xxx'
    >>> data[0]
    1
    >>> data[1]
    'xxx'
    >>> data = PerfectDict({"id":1, "url":"xxx"})
    >>> data
    {'id': 1, 'url': 'xxx'}
    """

    def __init__(self, _dict: dict = None, _values: list = None, **kwargs):
        """
        Args:
            _dict: initial key/value pairs. The mapping is copied and is
                never modified by this object.
            _values: values served by positional access and by iteration.
                Defaults to the values of the initial items, in order.
            **kwargs: additional key/value pairs.
        """
        items = dict(_dict or {}, **kwargs)
        super().__init__(items)
        # The instance namespace doubles as the attribute view of the items.
        # It has to be a private copy: ``__values__`` below is stored in that
        # namespace, so aliasing the caller's mapping would inject a
        # "__values__" key into it.
        self.__dict__ = items
        self.__values__ = _values or list(items.values())

    def __getitem__(self, key):
        """
        Return the value at position ``key`` when ``key`` is an int,
        otherwise the value stored under ``key``.

        Raises:
            IndexError: an int ``key`` is out of range.
            KeyError: a non-int ``key`` is unknown.
        """
        if isinstance(key, int):
            return self.__values__[key]
        try:
            return self.__dict__[key]
        except KeyError:
            # Keys added after construction through item assignment live only
            # in the underlying dict storage.
            return super().__getitem__(key)

    def __iter__(self, *args, **kwargs):
        """Yield the positional values (not the keys) so instances unpack."""
        yield from self.__values__
4 changes: 1 addition & 3 deletions tests/batch-spider/spiders/test_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,4 @@ def failed_request(self, request, response):
"""

yield request
yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1


yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1
27 changes: 27 additions & 0 deletions tests/test_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""
Created on 2021/4/8 1:06 下午
---------
@summary:
---------
@author: Boris
@email: [email protected]
"""

from feapder.utils.perfect_dict import PerfectDict


# Smoke script: build a PerfectDict in each supported way, then read the last
# one back through every access style it offers.
task_key = ["id", "url"]
task_values = [1, "http://www.badu.com"]

# From a key/value mapping plus the explicit positional values.
task = PerfectDict(_dict=dict(zip(task_key, task_values)), _values=task_values)

# From keyword arguments.
task = PerfectDict(id=1, url="http://www.badu.com")

# From a plain dict.
task = PerfectDict({"id": "1", "url": "http://www.badu.com"})

print(task)
task_id, url = task  # unpacking yields the values, not the keys
print(task_id, url)
print(task[0], task[1])
print(task.id, task.url)
print(task["id"], task["url"])
print(task.get("id"), task.get("url"))

0 comments on commit dc3e242

Please sign in to comment.