From e04d3df05a5725cd8ed80b1ae4358f16213b5d27 Mon Sep 17 00:00:00 2001 From: Boris Date: Thu, 8 Apr 2021 13:26:03 +0800 Subject: [PATCH 1/2] add perfect_dict --- feapder/core/spiders/batch_spider.py | 9 +++- feapder/network/request.py | 10 ++--- feapder/utils/perfect_dict.py | 51 +++++++++++++++++++++++ tests/batch-spider/spiders/test_spider.py | 4 +- tests/test_task.py | 27 ++++++++++++ 5 files changed, 91 insertions(+), 10 deletions(-) create mode 100644 feapder/utils/perfect_dict.py create mode 100644 tests/test_task.py diff --git a/feapder/core/spiders/batch_spider.py b/feapder/core/spiders/batch_spider.py index 27057504..d2dbcc88 100644 --- a/feapder/core/spiders/batch_spider.py +++ b/feapder/core/spiders/batch_spider.py @@ -26,6 +26,7 @@ from feapder.network.item import UpdateItem from feapder.network.request import Request from feapder.utils.log import log +from feapder.utils.perfect_dict import PerfectDict from feapder.utils.redis_lock import RedisLock CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline" @@ -111,7 +112,7 @@ def __init__( auto_start_requests=False, send_run_time=send_run_time, batch_interval=batch_interval, - task_table=task_table + task_table=task_table, ) self._redisdb = RedisDB() @@ -325,6 +326,9 @@ def distribute_task(self, tasks): for task in tasks: for parser in self._parsers: # 寻找task对应的parser if parser.name in task: + task = PerfectDict( + _dict=dict(zip(self._task_keys, task)), _values=list(task) + ) requests = parser.start_requests(task) if requests and not isinstance(requests, Iterable): raise Exception( @@ -372,6 +376,9 @@ def distribute_task(self, tasks): else: # task没对应的parser 则将task下发到所有的parser for task in tasks: for parser in self._parsers: + task = PerfectDict( + _dict=dict(zip(self._task_keys, task)), _values=list(task) + ) requests = parser.start_requests(task) if requests and not isinstance(requests, Iterable): raise Exception( diff --git a/feapder/network/request.py b/feapder/network/request.py index d805aaec..655cdc8a 100644 --- a/feapder/network/request.py +++ b/feapder/network/request.py @@ -9,7 +9,6 @@ """ import requests -from feapder.utils.webdriver import WebDriverPool from requests.adapters import HTTPAdapter from requests.packages.urllib3.exceptions import InsecureRequestWarning @@ -21,6 +20,8 @@ from feapder.network.proxy_pool import proxy_pool from feapder.network.response import Response from feapder.utils.log import log +from feapder.utils.perfect_dict import PerfectDict +from feapder.utils.webdriver import WebDriverPool # 屏蔽warning信息 requests.packages.urllib3.disable_warnings(InsecureRequestWarning) @@ -215,7 +216,7 @@ def to_dict(self): ): continue - if callable(value) or isinstance(value, Item): # 序列化 如item + if callable(value) or isinstance(value, (Item, PerfectDict)): value = tools.dumps_obj(value) request_dict[key] = value @@ -254,10 +255,7 @@ def get_response(self, save_cached=False): self.requests_kwargs.update(headers=headers) else: self.requests_kwargs.setdefault( - "headers", - { - "User-Agent": setting.DEFAULT_USERAGENT - }, + "headers", {"User-Agent": setting.DEFAULT_USERAGENT} ) # 代理 diff --git a/feapder/utils/perfect_dict.py b/feapder/utils/perfect_dict.py new file mode 100644 index 00000000..97943982 --- /dev/null +++ b/feapder/utils/perfect_dict.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +""" +Created on 2021/4/8 11:32 上午 +--------- +@summary: +--------- +@author: Boris +@email: boris_liu@foxmail.com +""" + + +class PerfectDict(dict): + """ + >>> from feapder.utils.perfect_dict import PerfectDict + >>> data = PerfectDict(id=1, url="xxx") + >>> data + {'id': 1, 'url': 'xxx'} + >>> data.id + 1 + >>> data.get("id") + 1 + >>> data["id"] + 1 + >>> id, url = data + >>> id + 1 + >>> url + 'xxx' + >>> data[0] + 1 + >>> data[1] + 'xxx' + >>> data = PerfectDict({"id":1, "url":"xxx"}) + >>> data + {'id': 1, 'url': 'xxx'} + """ + + def __init__(self, _dict: dict = None, _values: list = None, **kwargs): + self.__dict__ = _dict or kwargs or {} + super().__init__(self.__dict__, **kwargs) + self.__values__ = _values or list(self.__dict__.values()) + + def __getitem__(self, key): + if isinstance(key, int): + return self.__values__[key] + else: + return self.__dict__[key] + + def __iter__(self, *args, **kwargs): + for value in self.__values__: + yield value diff --git a/tests/batch-spider/spiders/test_spider.py b/tests/batch-spider/spiders/test_spider.py index e99a8d46..bc213e78 100644 --- a/tests/batch-spider/spiders/test_spider.py +++ b/tests/batch-spider/spiders/test_spider.py @@ -49,6 +49,4 @@ def failed_request(self, request, response): """ yield request - yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1 - - + yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1 diff --git a/tests/test_task.py b/tests/test_task.py new file mode 100644 index 00000000..00399ea0 --- /dev/null +++ b/tests/test_task.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +""" +Created on 2021/4/8 1:06 下午 +--------- +@summary: +--------- +@author: Boris +@email: boris_liu@foxmail.com +""" + +from feapder.utils.perfect_dict import PerfectDict + + +task_key = ["id", "url"] +task = [1, "http://www.badu.com"] +task = Task(_dict=dict(zip(task_key, task)), _values=task) + +task = Task(id=1, url="http://www.badu.com") +task = Task({"id":"1", "url":"http://www.badu.com"}) + +print(task) +id, url = task +print(id, url) +print(task[0], task[1]) +print(task.id, task.url) +print(task["id"], task["url"]) +print(task.get("id"), task.get("url")) From 2fdc9654e87c2bf67651cf47101d40b757cd90dc Mon Sep 17 00:00:00 2001 From: Boris Date: Thu, 8 Apr 2021 13:33:29 +0800 Subject: [PATCH 2/2] =?UTF-8?q?response=20text=E5=86=85=E5=AE=B9=E4=B8=BA?= =?UTF-8?q?=E7=A9=BA=E6=97=B6=E4=B8=8D=E5=8F=96=E7=BB=9D=E5=AF=B9=E8=BF=9E?= =?UTF-8?q?=E6=8E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- feapder/network/response.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/feapder/network/response.py b/feapder/network/response.py index 44cd15dc..ca1aaba7 100644 --- a/feapder/network/response.py +++ b/feapder/network/response.py @@ -243,8 +243,9 @@ def text(self): else: self._cached_text = self._get_unicode_html(self.content) - self._cached_text = self._absolute_links(self._cached_text) - self._cached_text = self._del_special_character(self._cached_text) + if self._cached_text: + self._cached_text = self._absolute_links(self._cached_text) + self._cached_text = self._del_special_character(self._cached_text) return self._cached_text