From ec98dabfab60283303a9208ccd8177d9f995ba72 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta <1731933+elacuesta@users.noreply.github.com> Date: Sun, 14 Jun 2020 06:45:27 -0300 Subject: [PATCH] Support for dataclass and attrs items (#3881) --- docs/conf.py | 1 + docs/faq.rst | 8 +- docs/topics/architecture.rst | 2 +- docs/topics/coroutines.rst | 13 +- docs/topics/exporters.rst | 34 ++- docs/topics/feed-exports.rst | 4 +- docs/topics/item-pipeline.rst | 54 ++-- docs/topics/items.rst | 245 +++++++++++++----- docs/topics/leaks.rst | 15 +- docs/topics/loaders.rst | 53 ++-- docs/topics/media-pipeline.rst | 33 ++- docs/topics/settings.rst | 4 +- docs/topics/signals.rst | 10 +- docs/topics/spider-middleware.rst | 13 +- docs/topics/spiders.rst | 18 +- scrapy/commands/parse.py | 6 +- scrapy/contracts/default.py | 28 +- scrapy/core/scraper.py | 27 +- scrapy/exporters.py | 38 +-- scrapy/item.py | 2 +- scrapy/loader/__init__.py | 18 +- scrapy/pipelines/files.py | 23 +- scrapy/pipelines/images.py | 19 +- scrapy/shell.py | 11 +- scrapy/spiders/feed.py | 2 +- .../project/module/middlewares.py.tmpl | 8 +- .../project/module/pipelines.py.tmpl | 4 + scrapy/utils/serialize.py | 6 +- setup.py | 1 + tests/requirements-py3.txt | 2 + tests/test_engine.py | 40 ++- tests/test_loader.py | 47 +++- tests/test_pipeline_files.py | 146 ++++++++--- tests/test_pipeline_images.py | 131 +++++++--- tests/test_utils_serialize.py | 43 ++- tox.ini | 3 +- 36 files changed, 757 insertions(+), 355 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 29b2fc40689..86734fae7ad 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -281,6 +281,7 @@ # ------------------------------------- intersphinx_mapping = { + 'attrs': ('https://www.attrs.org/en/stable/', None), 'coverage': ('https://coverage.readthedocs.io/en/stable', None), 'cssselect': ('https://cssselect.readthedocs.io/en/latest', None), 'pytest': ('https://docs.pytest.org/en/latest', None), diff --git a/docs/faq.rst b/docs/faq.rst index 9cdb7d09d9c..d5ea3cb87b1 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -342,15 +342,15 @@ method for this purpose. For example:: from copy import deepcopy - from scrapy.item import Item - + from itemadapter import is_item, ItemAdapter class MultiplyItemsMiddleware: def process_spider_output(self, response, result, spider): for item in result: - if isinstance(item, (Item, dict)): - for _ in range(item['multiply_by']): + if is_item(item): + adapter = ItemAdapter(item) + for _ in range(adapter['multiply_by']): yield deepcopy(item) Does Scrapy support IPv6 addresses? diff --git a/docs/topics/architecture.rst b/docs/topics/architecture.rst index ae25dfa2f6a..074c5924199 100644 --- a/docs/topics/architecture.rst +++ b/docs/topics/architecture.rst @@ -104,7 +104,7 @@ Spiders ------- Spiders are custom classes written by Scrapy users to parse responses and -extract items (aka scraped items) from them or additional requests to +extract :ref:`items ` from them or additional requests to follow. For more information see :ref:`topics-spiders`. .. _component-pipelines: diff --git a/docs/topics/coroutines.rst b/docs/topics/coroutines.rst index 7a9ecd4d5ae..a0952d323f9 100644 --- a/docs/topics/coroutines.rst +++ b/docs/topics/coroutines.rst @@ -53,21 +53,28 @@ There are several use cases for coroutines in Scrapy. 
Code that would return Deferreds when written for previous Scrapy versions, such as downloader middlewares and signal handlers, can be rewritten to be shorter and cleaner:: + from itemadapter import ItemAdapter + class DbPipeline: def _update_item(self, data, item): - item['field'] = data + adapter = ItemAdapter(item) + adapter['field'] = data return item def process_item(self, item, spider): - dfd = db.get_some_data(item['id']) + adapter = ItemAdapter(item) + dfd = db.get_some_data(adapter['id']) dfd.addCallback(self._update_item, item) return dfd becomes:: + from itemadapter import ItemAdapter + class DbPipeline: async def process_item(self, item, spider): - item['field'] = await db.get_some_data(item['id']) + adapter = ItemAdapter(item) + adapter['field'] = await db.get_some_data(adapter['id']) return item Coroutines may be used to call asynchronous code. This includes other diff --git a/docs/topics/exporters.rst b/docs/topics/exporters.rst index 7daf25ab32f..e5c99e5b1f5 100644 --- a/docs/topics/exporters.rst +++ b/docs/topics/exporters.rst @@ -40,6 +40,7 @@ Here you can see an :doc:`Item Pipeline ` which uses multiple Item Exporters to group scraped items to different files according to the value of one of their fields:: + from itemadapter import ItemAdapter from scrapy.exporters import XmlItemExporter class PerYearXmlExportPipeline: @@ -53,7 +54,8 @@ value of one of their fields:: exporter.finish_exporting() def _exporter_for_item(self, item): - year = item['year'] + adapter = ItemAdapter(item) + year = adapter['year'] if year not in self.year_to_exporter: f = open('{}.xml'.format(year), 'wb') exporter = XmlItemExporter(f) @@ -167,9 +169,10 @@ BaseItemExporter value unchanged except for ``unicode`` values which are encoded to ``str`` using the encoding declared in the :attr:`encoding` attribute. - :param field: the field being serialized. If a raw dict is being - exported (not :class:`~.Item`) *field* value is an empty dict. - :type field: :class:`~scrapy.item.Field` object or an empty dict + :param field: the field being serialized. If the source :ref:`item object + ` does not define field metadata, *field* is an empty + :class:`dict`. + :type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance :param name: the name of the field being serialized :type name: str @@ -192,14 +195,17 @@ BaseItemExporter .. attribute:: fields_to_export - A list with the name of the fields that will be exported, or None if you - want to export all fields. Defaults to None. + A list with the name of the fields that will be exported, or ``None`` if + you want to export all fields. Defaults to ``None``. Some exporters (like :class:`CsvItemExporter`) respect the order of the fields defined in this attribute. - Some exporters may require fields_to_export list in order to export the - data properly when spiders return dicts (not :class:`~Item` instances). + When using :ref:`item objects ` that do not expose all their + possible fields, exporters that do not support exporting a different + subset of fields per item will only export the fields found in the first + item exported. Use ``fields_to_export`` to define all the fields to be + exported. .. attribute:: export_empty_fields @@ -238,7 +244,7 @@ XmlItemExporter .. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs) - Exports Items in XML format to the specified file object. + Exports items in XML format to the specified file object. :param file: the file-like object to use for exporting the data. 
Its ``write`` method should accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc) @@ -292,7 +298,7 @@ CsvItemExporter .. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs) - Exports Items in CSV format to the given file-like object. If the + Exports items in CSV format to the given file-like object. If the :attr:`fields_to_export` attribute is set, it will be used to define the CSV columns and their order. The :attr:`export_empty_fields` attribute has no effect on this exporter. @@ -325,7 +331,7 @@ PickleItemExporter .. class:: PickleItemExporter(file, protocol=0, **kwargs) - Exports Items in pickle format to the given file-like object. + Exports items in pickle format to the given file-like object. :param file: the file-like object to use for exporting the data. Its ``write`` method should accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc) @@ -345,7 +351,7 @@ PprintItemExporter .. class:: PprintItemExporter(file, **kwargs) - Exports Items in pretty print format to the specified file object. + Exports items in pretty print format to the specified file object. :param file: the file-like object to use for exporting the data. Its ``write`` method should accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc) @@ -365,7 +371,7 @@ JsonItemExporter .. class:: JsonItemExporter(file, **kwargs) - Exports Items in JSON format to the specified file-like object, writing all + Exports items in JSON format to the specified file-like object, writing all objects as a list of objects. The additional ``__init__`` method arguments are passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any @@ -394,7 +400,7 @@ JsonLinesItemExporter .. class:: JsonLinesItemExporter(file, **kwargs) - Exports Items in JSON format to the specified file-like object, writing one + Exports items in JSON format to the specified file-like object, writing one JSON-encoded item per line. The additional ``__init__`` method arguments are passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 9e5968a295d..24d69040c32 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -298,8 +298,8 @@ Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``. Use FEED_EXPORT_FIELDS option to define fields to export and their order. -When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields -defined in dicts or :class:`~.Item` subclasses a spider is yielding. +When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields +defined in :ref:`item objects ` yielded by your spider. If an exporter requires a fixed set of fields (this is the case for :ref:`CSV ` export format) and FEED_EXPORT_FIELDS diff --git a/docs/topics/item-pipeline.rst b/docs/topics/item-pipeline.rst index c9194caa163..cd6a6d47e75 100644 --- a/docs/topics/item-pipeline.rst +++ b/docs/topics/item-pipeline.rst @@ -27,15 +27,19 @@ Each item pipeline component is a Python class that must implement the following .. method:: process_item(self, item, spider) - This method is called for every item pipeline component. 
:meth:`process_item`
-   must either: return a dict with data, return an :class:`~scrapy.item.Item`
-   (or any descendant class) object, return a
-   :class:`~twisted.internet.defer.Deferred` or raise
-   :exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
-   processed by further pipeline components.
+   This method is called for every item pipeline component.

-   :param item: the item scraped
-   :type item: :class:`~scrapy.item.Item` object or a dict
+   `item` is an :ref:`item object <item-types>`, see
+   :ref:`supporting-item-types`.
+
+   :meth:`process_item` must either: return an :ref:`item object <item-types>`,
+   return a :class:`~twisted.internet.defer.Deferred` or raise a
+   :exc:`~scrapy.exceptions.DropItem` exception.
+
+   Dropped items are no longer processed by further pipeline components.
+
+   :param item: the scraped item
+   :type item: :ref:`item object <item-types>`

    :param spider: the spider which scraped the item
    :type spider: :class:`~scrapy.spiders.Spider` object
@@ -79,16 +83,17 @@ Let's take a look at the following hypothetical pipeline that adjusts the
 (``price_excludes_vat`` attribute), and drops those items which don't
 contain a price::

+    from itemadapter import ItemAdapter
     from scrapy.exceptions import DropItem

-
     class PricePipeline:

         vat_factor = 1.15

         def process_item(self, item, spider):
-            if item.get('price'):
-                if item.get('price_excludes_vat'):
-                    item['price'] = item['price'] * self.vat_factor
+            adapter = ItemAdapter(item)
+            if adapter.get('price'):
+                if adapter.get('price_excludes_vat'):
+                    adapter['price'] = adapter['price'] * self.vat_factor
                 return item
             else:
                 raise DropItem("Missing price in %s" % item)
@@ -103,6 +108,8 @@ format::
     import json

+    from itemadapter import ItemAdapter
+

     class JsonWriterPipeline:

         def open_spider(self, spider):
@@ -112,7 +119,7 @@ format::
             self.file.close()

         def process_item(self, item, spider):
-            line = json.dumps(dict(item)) + "\n"
+            line = json.dumps(ItemAdapter(item).asdict()) + "\n"
             self.file.write(line)
             return item

@@ -131,6 +138,7 @@ The main point of this example is to show how to use :meth:`from_crawler`
 method and how to clean up the resources properly.::

     import pymongo
+    from itemadapter import ItemAdapter


     class MongoPipeline:

@@ -155,7 +163,7 @@ method and how to clean up the resources properly.::
             self.client.close()

         def process_item(self, item, spider):
-            self.db[self.collection_name].insert_one(dict(item))
+            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
             return item

 .. _MongoDB: https://www.mongodb.com/
@@ -177,10 +185,11 @@ item.

 ::

-    import scrapy
     import hashlib
     from urllib.parse import quote

+    import scrapy
+    from itemadapter import ItemAdapter

     class ScreenshotPipeline:
         """Pipeline that uses Splash to render screenshot of
@@ -189,7 +198,8 @@ item.
         SPLASH_URL = "http://localhost:8050/render.png?url={}"

         async def process_item(self, item, spider):
-            encoded_item_url = quote(item["url"])
+            adapter = ItemAdapter(item)
+            encoded_item_url = quote(adapter["url"])
             screenshot_url = self.SPLASH_URL.format(encoded_item_url)
             request = scrapy.Request(screenshot_url)
             response = await spider.crawler.engine.download(request, spider)
@@ -199,14 +209,14 @@ item.
                 return item

             # Save screenshot to file, filename will be hash of url.
-            url = item["url"]
+            url = adapter["url"]
             url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
             filename = "{}.png".format(url_hash)
             with open(filename, "wb") as f:
                 f.write(response.body)

             # Store filename in item.
-            item["screenshot_filename"] = filename
+            adapter["screenshot_filename"] = filename
             return item

 .. _Splash: https://splash.readthedocs.io/en/stable/
@@ -219,6 +229,7 @@ already processed. Let's say that our items have a unique id, but our spider
 returns multiples items with the same id::


+    from itemadapter import ItemAdapter
     from scrapy.exceptions import DropItem

     class DuplicatesPipeline:
@@ -227,10 +238,11 @@ returns multiples items with the same id::
             self.ids_seen = set()

         def process_item(self, item, spider):
-            if item['id'] in self.ids_seen:
-                raise DropItem("Duplicate item found: %s" % item)
+            adapter = ItemAdapter(item)
+            if adapter['id'] in self.ids_seen:
+                raise DropItem("Duplicate item found: %r" % item)
             else:
-                self.ids_seen.add(item['id'])
+                self.ids_seen.add(adapter['id'])
                 return item


diff --git a/docs/topics/items.rst b/docs/topics/items.rst
index 0941a8a1b72..65bf156ac22 100644
--- a/docs/topics/items.rst
+++ b/docs/topics/items.rst
@@ -8,29 +8,155 @@ Items
    :synopsis: Item and Field classes

 The main goal in scraping is to extract structured data from unstructured
-sources, typically, web pages. Scrapy spiders can return the extracted data
-as Python dicts. While convenient and familiar, Python dicts lack structure:
-it is easy to make a typo in a field name or return inconsistent data,
-especially in a larger project with many spiders.
-
-To define common output data format Scrapy provides the :class:`Item` class.
-:class:`Item` objects are simple containers used to collect the scraped data.
-They provide an API similar to :class:`dict` API with a convenient syntax
-for declaring their available fields.
-
-Various Scrapy components use extra information provided by Items:
-exporters look at declared fields to figure out columns to export,
-serialization can be customized using Item fields metadata, :mod:`trackref`
-tracks Item instances to help find memory leaks
-(see :ref:`topics-leaks-trackrefs`), etc.
+sources, typically, web pages. :ref:`Spiders <topics-spiders>` may return the
+extracted data as `items`, Python objects that define key-value pairs.
+
+Scrapy supports :ref:`multiple types of items <item-types>`. When you create an
+item, you may use whichever type of item you want. When you write code that
+receives an item, your code should :ref:`work for any item type
+<supporting-item-types>`.
+
+.. _item-types:
+
+Item Types
+==========
+
+Scrapy supports the following types of items, via the `itemadapter`_ library:
+:ref:`dictionaries <dict-items>`, :ref:`Item objects <item-objects>`,
+:ref:`dataclass objects <dataclass-items>`, and :ref:`attrs objects <attrs-items>`.
+
+.. _itemadapter: https://github.com/scrapy/itemadapter
+
+.. _dict-items:
+
+Dictionaries
+------------
+
+As an item type, :class:`dict` is convenient and familiar.
+
+.. _item-objects:
+
+Item objects
+------------
+
+:class:`Item` provides a :class:`dict`-like API plus additional features that
+make it the most feature-complete item type:
+
+.. class:: Item([arg])
+
+    :class:`Item` objects replicate the standard :class:`dict` API, including
+    its ``__init__`` method.
+
+    :class:`Item` allows defining field names, so that:
+
+    - :class:`KeyError` is raised when using undefined field names (i.e.
+      prevents typos going unnoticed)
+
+    - :ref:`Item exporters <topics-exporters>` can export all fields by
+      default even if the first scraped object does not have values for all
+      of them
+
+    :class:`Item` also allows defining field metadata, which can be used to
+    :ref:`customize serialization <topics-exporters-field-serialization>`.
+
+    :mod:`trackref` tracks :class:`Item` objects to help find memory leaks
+    (see :ref:`topics-leaks-trackrefs`).
+
+    :class:`Item` objects also provide the following additional API members:
+
+    .. automethod:: copy
+
+    .. automethod:: deepcopy
+
+    .. attribute:: fields
+
+        A dictionary containing *all declared fields* for this Item, not only
+        those populated. The keys are the field names and the values are the
+        :class:`Field` objects used in the :ref:`Item declaration
+        <topics-items-declaring>`.
+
+Example::
+
+    from scrapy.item import Item, Field
+
+    class CustomItem(Item):
+        one_field = Field()
+        another_field = Field()
+
+.. _dataclass-items:
+
+Dataclass objects
+-----------------
+
+.. versionadded:: 2.2
+
+:func:`~dataclasses.dataclass` allows defining item classes with field names,
+so that :ref:`item exporters <topics-exporters>` can export all fields by
+default even if the first scraped object does not have values for all of them.
+
+Additionally, ``dataclass`` items also allow to:
+
+* define the type and default value of each defined field.
+
+* define custom field metadata through :func:`dataclasses.field`, which can be used to
+  :ref:`customize serialization <topics-exporters-field-serialization>`.
+
+They work natively in Python 3.7 or later, or using the `dataclasses
+backport`_ in Python 3.6.
+
+.. _dataclasses backport: https://pypi.org/project/dataclasses/
+
+Example::
+
+    from dataclasses import dataclass
+
+    @dataclass
+    class CustomItem:
+        one_field: str
+        another_field: int
+
+.. note:: Field types are not enforced at run time.
+
+.. _attrs-items:
+
+attr.s objects
+--------------
+
+.. versionadded:: 2.2
+
+:func:`attr.s` allows defining item classes with field names,
+so that :ref:`item exporters <topics-exporters>` can export all fields by
+default even if the first scraped object does not have values for all of them.
+
+Additionally, ``attr.s`` items also allow to:
+
+* define the type and default value of each defined field.
+
+* define custom field :ref:`metadata <attrs:metadata>`, which can be used to
+  :ref:`customize serialization <topics-exporters-field-serialization>`.
+
+In order to use this type, the :doc:`attrs package <attrs:index>` needs to be installed.
+
+Example::
+
+    import attr
+
+    @attr.s
+    class CustomItem:
+        one_field = attr.ib()
+        another_field = attr.ib()
+
+
+Working with Item objects
+=========================

 .. _topics-items-declaring:

-Declaring Items
-===============
+Declaring Item subclasses
+-------------------------

-Items are declared using a simple class definition syntax and :class:`Field`
-objects. Here is an example::
+Item subclasses are declared using a simple class definition syntax and
+:class:`Field` objects. Here is an example::

     import scrapy

@@ -48,10 +174,11 @@ objects. Here is an example::

 .. _Django: https://www.djangoproject.com/
 .. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/

+
 .. _topics-items-fields:

-Item Fields
-===========
+Declaring fields
+----------------

 :class:`Field` objects are used to specify metadata for each field. For
 example, the serializer function for the ``last_updated`` field illustrated in
@@ -72,15 +199,31 @@ It's important to note that the :class:`Field` objects used to declare the
 item do not stay assigned as class attributes. Instead, they can be accessed
 through the :attr:`Item.fields` attribute.

-Working with Items
-==================
+.. class:: Field([arg])
+
+    The :class:`Field` class is just an alias to the built-in :class:`dict` class and
+    doesn't provide any extra functionality or attributes. In other words,
+    :class:`Field` objects are plain-old Python dicts. A separate class is used
+    to support the :ref:`item declaration syntax <topics-items-declaring>`
+    based on class attributes.
+
+.. note:: Field metadata can also be declared for ``dataclass`` and ``attrs``
+    items. Please refer to the documentation for `dataclasses.field`_ and
+    `attr.ib`_ for additional information.
+
+    .. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
+    .. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
+
+
+Working with Item objects
+-------------------------

 Here are some examples of common tasks performed with items, using the
 ``Product`` item :ref:`declared above <topics-items-declaring>`. You will
 notice the API is very similar to the :class:`dict` API.

 Creating items
---------------
+''''''''''''''

 >>> product = Product(name='Desktop PC', price=1000)
 >>> print(product)
@@ -88,7 +231,7 @@ Product(name='Desktop PC', price=1000)


 Getting field values
---------------------
+''''''''''''''''''''

 >>> product['name']
 Desktop PC
@@ -128,7 +271,7 @@ False


 Setting field values
---------------------
+''''''''''''''''''''

 >>> product['last_updated'] = 'today'
 >>> product['last_updated']
@@ -141,7 +284,7 @@ KeyError: 'Product does not support field: lala'


 Accessing all populated values
-------------------------------
+''''''''''''''''''''''''''''''

 To access all populated values, just use the typical :class:`dict` API:

@@ -155,7 +298,7 @@ To access all populated values, just use the typical :class:`dict` API:
 .. _copying-items:

 Copying items
--------------
+'''''''''''''

 To copy an item, you must first decide whether you want a shallow copy or a
 deep copy.
@@ -183,7 +326,7 @@ To create a deep copy, call :meth:`~scrapy.item.Item.deepcopy` instead


 Other common tasks
-------------------
+''''''''''''''''''

 Creating dicts from items:

@@ -201,8 +344,8 @@ Traceback (most recent call last):
 KeyError: 'Product does not support field: lala'


-Extending Items
-===============
+Extending Item subclasses
+-------------------------

 You can extend Items (to add more fields or to change some metadata for some
 fields) by declaring a subclass of your original Item.
@@ -222,39 +365,25 @@ appending more values, or changing existing values, like this::
 That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
 keeping all the previously existing metadata values.

-Item objects
-============
-
-.. class:: Item([arg])

-    Return a new Item optionally initialized from the given argument.
+.. _supporting-item-types:

-    Items replicate the standard :class:`dict` API, including its ``__init__``
-    method, and also provide the following additional API members:
+Supporting All Item Types
+=========================

-    .. automethod:: copy
+In code that receives an item, such as methods of :ref:`item pipelines
+<topics-item-pipeline>` or :ref:`spider middlewares
+<topics-spider-middleware>`, it is a good practice to use the
+:class:`~itemadapter.ItemAdapter` class and the
+:func:`~itemadapter.is_item` function to write code that works for
+any :ref:`supported item type <item-types>`:

-    .. automethod:: deepcopy
+.. autoclass:: itemadapter.ItemAdapter

-    .. attribute:: fields
+.. autofunction:: itemadapter.is_item

-        A dictionary containing *all declared fields* for this Item, not only
-        those populated. The keys are the field names and the values are the
-        :class:`Field` objects used in the :ref:`Item declaration
-        <topics-items-declaring>`.
-
-Field objects
-=============
-
-.. class:: Field([arg])
-
-    The :class:`Field` class is just an alias to the built-in :class:`dict` class and
-    doesn't provide any extra functionality or attributes. In other words,
-    :class:`Field` objects are plain-old Python dicts. A separate class is used
-    to support the :ref:`item declaration syntax <topics-items-declaring>`
-    based on class attributes.
-Other classes related to Item -============================= +Other classes related to items +============================== .. autoclass:: ItemMeta diff --git a/docs/topics/leaks.rst b/docs/topics/leaks.rst index ceb708c7ecd..3224241fc05 100644 --- a/docs/topics/leaks.rst +++ b/docs/topics/leaks.rst @@ -4,7 +4,7 @@ Debugging memory leaks ====================== -In Scrapy, objects such as Requests, Responses and Items have a finite +In Scrapy, objects such as requests, responses and items have a finite lifetime: they are created, used for a while, and finally destroyed. From all those objects, the Request is probably the one with the longest @@ -61,8 +61,8 @@ Debugging memory leaks with ``trackref`` ======================================== :mod:`trackref` is a module provided by Scrapy to debug the most common cases of -memory leaks. It basically tracks the references to all live Requests, -Responses, Item and Selector objects. +memory leaks. It basically tracks the references to all live Request, +Response, Item, Spider and Selector objects. You can enter the telnet console and inspect how many objects (of the classes mentioned above) are currently alive using the ``prefs()`` function which is an @@ -200,11 +200,10 @@ Debugging memory leaks with muppy ``trackref`` provides a very convenient mechanism for tracking down memory leaks, but it only keeps track of the objects that are more likely to cause -memory leaks (Requests, Responses, Items, and Selectors). However, there are -other cases where the memory leaks could come from other (more or less obscure) -objects. If this is your case, and you can't find your leaks using ``trackref``, -you still have another resource: the muppy library. - +memory leaks. However, there are other cases where the memory leaks could come +from other (more or less obscure) objects. If this is your case, and you can't +find your leaks using ``trackref``, you still have another resource: the muppy +library. You can use muppy from `Pympler`_. diff --git a/docs/topics/loaders.rst b/docs/topics/loaders.rst index eb804f1dbbc..6645bf12396 100644 --- a/docs/topics/loaders.rst +++ b/docs/topics/loaders.rst @@ -7,13 +7,12 @@ Item Loaders .. module:: scrapy.loader :synopsis: Item Loader class -Item Loaders provide a convenient mechanism for populating scraped :ref:`Items -`. Even though Items can be populated using their own -dictionary-like API, Item Loaders provide a much more convenient API for -populating them from a scraping process, by automating some common tasks like -parsing the raw extracted data before assigning it. +Item Loaders provide a convenient mechanism for populating scraped :ref:`items +`. Even though items can be populated directly, Item Loaders provide a +much more convenient API for populating them from a scraping process, by automating +some common tasks like parsing the raw extracted data before assigning it. -In other words, :ref:`Items ` provide the *container* of +In other words, :ref:`items ` provide the *container* of scraped data, while Item Loaders provide the mechanism for *populating* that container. @@ -25,10 +24,10 @@ Using Item Loaders to populate items ==================================== To use an Item Loader, you must first instantiate it. You can either -instantiate it with a dict-like object (e.g. Item or dict) or without one, in -which case an Item is automatically instantiated in the Item Loader ``__init__`` method -using the Item class specified in the :attr:`ItemLoader.default_item_class` -attribute. 
+instantiate it with an :ref:`item object ` or without one, in which +case an instance of :class:`~scrapy.item.Item` is automatically created in the +Item Loader ``__init__`` method using the :class:`~scrapy.item.Item` subclass +specified in the :attr:`ItemLoader.default_item_class` attribute. Then, you start collecting values into the Item Loader, typically using :ref:`Selectors `. You can add more than one value to @@ -88,7 +87,7 @@ received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` :meth:`~ItemLoader.add_value` methods) and the result of the input processor is collected and kept inside the ItemLoader. After collecting all data, the :meth:`ItemLoader.load_item` method is called to populate and get the populated -:class:`~scrapy.item.Item` object. That's when the output processor is +:ref:`item object `. That's when the output processor is called with the data previously collected (and processed using the input processor). The result of the output processor is the final value that gets assigned to the item. @@ -153,12 +152,10 @@ Last, but not least, Scrapy comes with some :ref:`commonly used processors ` built-in for convenience. - Declaring Item Loaders ====================== -Item Loaders are declared like Items, by using a class definition syntax. Here -is an example:: +Item Loaders are declared using a class definition syntax. Here is an example:: from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, MapCompose, Join @@ -275,9 +272,9 @@ ItemLoader objects .. class:: ItemLoader([item, selector, response], **kwargs) - Return a new Item Loader for populating the given Item. If no item is - given, one is instantiated automatically using the class in - :attr:`default_item_class`. + Return a new Item Loader for populating the given :ref:`item object + `. If no item object is given, one is instantiated + automatically using the class in :attr:`default_item_class`. When instantiated with a ``selector`` or a ``response`` parameters the :class:`ItemLoader` class provides convenient mechanisms for extracting @@ -286,7 +283,7 @@ ItemLoader objects :param item: The item instance to populate using subsequent calls to :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`, or :meth:`~ItemLoader.add_value`. - :type item: :class:`~scrapy.item.Item` object + :type item: :ref:`item object ` :param selector: The selector to extract data from, when using the :meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath` @@ -444,17 +441,19 @@ ItemLoader objects Create a nested loader with an xpath selector. The supplied selector is applied relative to selector associated - with this :class:`ItemLoader`. The nested loader shares the :class:`Item` - with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, - :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. + with this :class:`ItemLoader`. The nested loader shares the :ref:`item + object ` with the parent :class:`ItemLoader` so calls to + :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will + behave as expected. .. method:: nested_css(css) Create a nested loader with a css selector. The supplied selector is applied relative to selector associated - with this :class:`ItemLoader`. The nested loader shares the :class:`Item` - with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, - :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. + with this :class:`ItemLoader`. 
The nested loader shares the :ref:`item + object ` with the parent :class:`ItemLoader` so calls to + :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will + behave as expected. .. method:: get_collected_values(field_name) @@ -477,7 +476,7 @@ ItemLoader objects .. attribute:: item - The :class:`~scrapy.item.Item` object being parsed by this Item Loader. + The :ref:`item object ` being parsed by this Item Loader. This is mostly used as a property so when attempting to override this value, you may want to check out :attr:`default_item_class` first. @@ -488,8 +487,8 @@ ItemLoader objects .. attribute:: default_item_class - An Item class (or factory), used to instantiate items when not given in - the ``__init__`` method. + An :ref:`item object ` class or factory, used to + instantiate items when not given in the ``__init__`` method. .. attribute:: default_input_processor diff --git a/docs/topics/media-pipeline.rst b/docs/topics/media-pipeline.rst index 86550d7a43d..01de3dedb97 100644 --- a/docs/topics/media-pipeline.rst +++ b/docs/topics/media-pipeline.rst @@ -156,7 +156,7 @@ following forms:: ftp://username:password@address:port/path ftp://address:port/path - + If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and :setting:`FTP_PASSWORD` settings respectively. @@ -243,20 +243,22 @@ Usage example .. setting:: IMAGES_URLS_FIELD .. setting:: IMAGES_RESULT_FIELD -In order to use a media pipeline first, :ref:`enable it +In order to use a media pipeline, first :ref:`enable it `. -Then, if a spider returns a dict with the URLs key (``file_urls`` or -``image_urls``, for the Files or Images Pipeline respectively), the pipeline will -put the results under respective key (``files`` or ``images``). +Then, if a spider returns an :ref:`item object ` with the URLs +field (``file_urls`` or ``image_urls``, for the Files or Images Pipeline +respectively), the pipeline will put the results under the respective field +(``files`` or ``images``). -If you prefer to use :class:`~.Item`, then define a custom item with the -necessary fields, like in this example for Images Pipeline:: +When using :ref:`item types ` for which fields are defined beforehand, +you must define both the URLs field and the results field. For example, when +using the images pipeline, items must define both the ``image_urls`` and the +``images`` field. For instance, using the :class:`~scrapy.item.Item` class:: import scrapy class MyItem(scrapy.Item): - # ... other item fields ... 
image_urls = scrapy.Field() images = scrapy.Field() @@ -445,8 +447,11 @@ See here the methods that you can override in your custom Files Pipeline: :meth:`~get_media_requests` method and return a Request for each file URL:: + from itemadapter import ItemAdapter + def get_media_requests(self, item, info): - for file_url in item['file_urls']: + adapter = ItemAdapter(item) + for file_url in adapter['file_urls']: yield scrapy.Request(file_url) Those requests will be processed by the pipeline and, when they have finished @@ -509,13 +514,15 @@ See here the methods that you can override in your custom Files Pipeline: store the downloaded file paths (passed in results) in the ``file_paths`` item field, and we drop the item if it doesn't contain any files:: + from itemadapter import ItemAdapter from scrapy.exceptions import DropItem def item_completed(self, results, item, info): file_paths = [x['path'] for ok, x in results if ok] if not file_paths: raise DropItem("Item contains no files") - item['file_paths'] = file_paths + adapter = ItemAdapter(item) + adapter['file_paths'] = file_paths return item By default, the :meth:`item_completed` method returns the item. @@ -589,8 +596,9 @@ Here is a full example of the Images Pipeline whose methods are exemplified above:: import scrapy - from scrapy.pipelines.images import ImagesPipeline + from itemadapter import ItemAdapter from scrapy.exceptions import DropItem + from scrapy.pipelines.images import ImagesPipeline class MyImagesPipeline(ImagesPipeline): @@ -602,7 +610,8 @@ above:: image_paths = [x['path'] for ok, x in results if ok] if not image_paths: raise DropItem("Item contains no images") - item['image_paths'] = image_paths + adapter = ItemAdapter(item) + adapter['image_paths'] = image_paths return item diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index f06d9db3c95..5178f272f07 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -236,8 +236,8 @@ CONCURRENT_ITEMS Default: ``100`` -Maximum number of concurrent items (per response) to process in parallel in the -Item Processor (also known as the :ref:`Item Pipeline `). +Maximum number of concurrent items (per response) to process in parallel in +:ref:`item pipelines `. .. setting:: CONCURRENT_REQUESTS diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index fe4fb0834d0..255ba9d3fa2 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -151,8 +151,8 @@ item_scraped This signal supports returning deferreds from its handlers. - :param item: the item scraped - :type item: dict or :class:`~scrapy.item.Item` object + :param item: the scraped item + :type item: :ref:`item object ` :param spider: the spider which scraped the item :type spider: :class:`~scrapy.spiders.Spider` object @@ -172,7 +172,7 @@ item_dropped This signal supports returning deferreds from its handlers. :param item: the item dropped from the :ref:`topics-item-pipeline` - :type item: dict or :class:`~scrapy.item.Item` object + :type item: :ref:`item object ` :param spider: the spider which scraped the item :type spider: :class:`~scrapy.spiders.Spider` object @@ -196,8 +196,8 @@ item_error This signal supports returning deferreds from its handlers. 
- :param item: the item dropped from the :ref:`topics-item-pipeline` - :type item: dict or :class:`~scrapy.item.Item` object + :param item: the item that caused the error in the :ref:`topics-item-pipeline` + :type item: :ref:`item object ` :param response: the response being processed when the exception was raised :type response: :class:`~scrapy.http.Response` object diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst index d49a2209d75..c6cbdba763a 100644 --- a/docs/topics/spider-middleware.rst +++ b/docs/topics/spider-middleware.rst @@ -102,29 +102,28 @@ object gives you access, for example, to the :ref:`settings `. it has processed the response. :meth:`process_spider_output` must return an iterable of - :class:`~scrapy.http.Request`, dict or :class:`~scrapy.item.Item` - objects. + :class:`~scrapy.http.Request` objects and :ref:`item object + `. :param response: the response which generated this output from the spider :type response: :class:`~scrapy.http.Response` object :param result: the result returned by the spider - :type result: an iterable of :class:`~scrapy.http.Request`, dict - or :class:`~scrapy.item.Item` objects + :type result: an iterable of :class:`~scrapy.http.Request` objects and + :ref:`item object ` :param spider: the spider whose result is being processed :type spider: :class:`~scrapy.spiders.Spider` object - .. method:: process_spider_exception(response, exception, spider) This method is called when a spider or :meth:`process_spider_output` method (from a previous spider middleware) raises an exception. :meth:`process_spider_exception` should return either ``None`` or an - iterable of :class:`~scrapy.http.Request`, dict or - :class:`~scrapy.item.Item` objects. + iterable of :class:`~scrapy.http.Request` objects and :ref:`item object + `. If it returns ``None``, Scrapy will continue processing this exception, executing any other :meth:`process_spider_exception` in the following diff --git a/docs/topics/spiders.rst b/docs/topics/spiders.rst index 8ff5e72920a..d4d6e2ea0a5 100644 --- a/docs/topics/spiders.rst +++ b/docs/topics/spiders.rst @@ -23,8 +23,8 @@ For spiders, the scraping cycle goes through something like this: :attr:`~scrapy.spiders.Spider.parse` method as callback function for the Requests. -2. In the callback function, you parse the response (web page) and return either - dicts with extracted data, :class:`~scrapy.item.Item` objects, +2. In the callback function, you parse the response (web page) and return + :ref:`item objects `, :class:`~scrapy.http.Request` objects, or an iterable of these objects. Those Requests will also contain a callback (maybe the same) and will then be downloaded by Scrapy and then their @@ -179,8 +179,8 @@ scrapy.Spider the same requirements as the :class:`Spider` class. This method, as well as any other Request callback, must return an - iterable of :class:`~scrapy.http.Request` and/or - dicts or :class:`~scrapy.item.Item` objects. + iterable of :class:`~scrapy.http.Request` and/or :ref:`item objects + `. 
:param response: the response to parse :type response: :class:`~scrapy.http.Response` @@ -234,7 +234,7 @@ Return multiple Requests and items from a single callback:: yield scrapy.Request(response.urljoin(href), self.parse) Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly; -to give data more structure you can use :ref:`topics-items`:: +to give data more structure you can use :class:`~scrapy.item.Item` objects:: import scrapy from myproject.items import MyItem @@ -364,7 +364,7 @@ CrawlSpider This method is called for the start_urls responses. It allows to parse the initial responses and must return either an - :class:`~scrapy.item.Item` object, a :class:`~scrapy.http.Request` + :ref:`item object `, a :class:`~scrapy.http.Request` object, or an iterable containing any of them. Crawling rules @@ -383,7 +383,7 @@ Crawling rules object with that name will be used) to be called for each link extracted with the specified link extractor. This callback receives a :class:`~scrapy.http.Response` as its first argument and must return either a single instance or an iterable of - :class:`~scrapy.item.Item`, ``dict`` and/or :class:`~scrapy.http.Request` objects + :ref:`item objects ` and/or :class:`~scrapy.http.Request` objects (or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response` object will contain the text of the link that produced the :class:`~scrapy.http.Request` in its ``meta`` dictionary (under the ``link_text`` key) @@ -531,7 +531,7 @@ XMLFeedSpider (``itertag``). Receives the response and an :class:`~scrapy.selector.Selector` for each node. Overriding this method is mandatory. Otherwise, you spider won't work. This method - must return either a :class:`~scrapy.item.Item` object, a + must return an :ref:`item object `, a :class:`~scrapy.http.Request` object, or an iterable containing any of them. @@ -541,7 +541,7 @@ XMLFeedSpider spider, and it's intended to perform any last time processing required before returning the results to the framework core, for example setting the item IDs. It receives a list of results and the response which originated - those results. It must return a list of results (Items or Requests). + those results. It must return a list of results (items or requests). 
XMLFeedSpider example diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py index 580fd282806..8b7fa8b580a 100644 --- a/scrapy/commands/parse.py +++ b/scrapy/commands/parse.py @@ -1,11 +1,11 @@ import json import logging +from itemadapter import is_item, ItemAdapter from w3lib.url import is_url from scrapy.commands import ScrapyCommand from scrapy.http import Request -from scrapy.item import _BaseItem from scrapy.utils import display from scrapy.utils.conf import arglist_to_dict from scrapy.utils.spider import iterate_spider_output, spidercls_for_request @@ -81,7 +81,7 @@ def print_items(self, lvl=None, colour=True): items = self.items.get(lvl, []) print("# Scraped Items ", "-" * 60) - display.pprint([dict(x) for x in items], colorize=colour) + display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour) def print_requests(self, lvl=None, colour=True): if lvl is None: @@ -117,7 +117,7 @@ def run_callback(self, response, callback, cb_kwargs=None): items, requests = [], [] for x in iterate_spider_output(callback(response, **cb_kwargs)): - if isinstance(x, (_BaseItem, dict)): + if is_item(x): items.append(x) elif isinstance(x, Request): requests.append(x) diff --git a/scrapy/contracts/default.py b/scrapy/contracts/default.py index cdc2bac1556..34f0d36d459 100644 --- a/scrapy/contracts/default.py +++ b/scrapy/contracts/default.py @@ -1,10 +1,10 @@ import json -from scrapy.item import _BaseItem -from scrapy.http import Request -from scrapy.exceptions import ContractFail +from itemadapter import is_item, ItemAdapter from scrapy.contracts import Contract +from scrapy.exceptions import ContractFail +from scrapy.http import Request # contracts @@ -48,11 +48,11 @@ class ReturnsContract(Contract): """ name = 'returns' - objects = { - 'request': Request, - 'requests': Request, - 'item': (_BaseItem, dict), - 'items': (_BaseItem, dict), + object_type_verifiers = { + 'request': lambda x: isinstance(x, Request), + 'requests': lambda x: isinstance(x, Request), + 'item': is_item, + 'items': is_item, } def __init__(self, *args, **kwargs): @@ -64,7 +64,7 @@ def __init__(self, *args, **kwargs): % len(self.args) ) self.obj_name = self.args[0] or None - self.obj_type = self.objects[self.obj_name] + self.obj_type_verifier = self.object_type_verifiers[self.obj_name] try: self.min_bound = int(self.args[1]) @@ -79,7 +79,7 @@ def __init__(self, *args, **kwargs): def post_process(self, output): occurrences = 0 for x in output: - if isinstance(x, self.obj_type): + if self.obj_type_verifier(x): occurrences += 1 assertion = (self.min_bound <= occurrences <= self.max_bound) @@ -103,8 +103,8 @@ class ScrapesContract(Contract): def post_process(self, output): for x in output: - if isinstance(x, (_BaseItem, dict)): - missing = [arg for arg in self.args if arg not in x] + if is_item(x): + missing = [arg for arg in self.args if arg not in ItemAdapter(x)] if missing: - raise ContractFail( - "Missing fields: %s" % ", ".join(missing)) + missing_str = ", ".join(missing) + raise ContractFail("Missing fields: %s" % missing_str) diff --git a/scrapy/core/scraper.py b/scrapy/core/scraper.py index 6785e103db3..d07c7aa62ae 100644 --- a/scrapy/core/scraper.py +++ b/scrapy/core/scraper.py @@ -4,18 +4,18 @@ import logging from collections import deque -from twisted.python.failure import Failure +from itemadapter import is_item from twisted.internet import defer +from twisted.python.failure import Failure -from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback -from scrapy.utils.spider 
import iterate_spider_output -from scrapy.utils.misc import load_object, warn_on_generator_with_return_value -from scrapy.utils.log import logformatter_adapter, failure_to_exc_info -from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest from scrapy import signals -from scrapy.http import Request, Response -from scrapy.item import _BaseItem from scrapy.core.spidermw import SpiderMiddlewareManager +from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest +from scrapy.http import Request, Response +from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel +from scrapy.utils.log import failure_to_exc_info, logformatter_adapter +from scrapy.utils.misc import load_object, warn_on_generator_with_return_value +from scrapy.utils.spider import iterate_spider_output logger = logging.getLogger(__name__) @@ -191,7 +191,7 @@ def _process_spidermw_output(self, output, request, response, spider): """ if isinstance(output, Request): self.crawler.engine.crawl(request=output, spider=spider) - elif isinstance(output, (_BaseItem, dict)): + elif is_item(output): self.slot.itemproc_size += 1 dfd = self.itemproc.process_item(output, spider) dfd.addBoth(self._itemproc_finished, output, response, spider) @@ -200,10 +200,11 @@ def _process_spidermw_output(self, output, request, response, spider): pass else: typename = type(output).__name__ - logger.error('Spider must return Request, BaseItem, dict or None, ' - 'got %(typename)r in %(request)s', - {'request': request, 'typename': typename}, - extra={'spider': spider}) + logger.error( + 'Spider must return request, item, or None, got %(typename)r in %(request)s', + {'request': request, 'typename': typename}, + extra={'spider': spider}, + ) def _log_download_errors(self, spider_failure, download_failure, request, spider): """Log and silence errors that come from the engine (typically download diff --git a/scrapy/exporters.py b/scrapy/exporters.py index de009082a99..71257267337 100644 --- a/scrapy/exporters.py +++ b/scrapy/exporters.py @@ -4,16 +4,18 @@ import csv import io -import pprint import marshal -import warnings import pickle +import pprint +import warnings from xml.sax.saxutils import XMLGenerator -from scrapy.utils.serialize import ScrapyJSONEncoder -from scrapy.utils.python import to_bytes, to_unicode, is_listlike -from scrapy.item import _BaseItem +from itemadapter import is_item, ItemAdapter + from scrapy.exceptions import ScrapyDeprecationWarning +from scrapy.item import _BaseItem +from scrapy.utils.python import is_listlike, to_bytes, to_unicode +from scrapy.utils.serialize import ScrapyJSONEncoder __all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter', @@ -56,11 +58,14 @@ def _get_serialized_fields(self, item, default_value=None, include_empty=None): """Return the fields to export as an iterable of tuples (name, serialized_value) """ + item = ItemAdapter(item) + if include_empty is None: include_empty = self.export_empty_fields + if self.fields_to_export is None: - if include_empty and not isinstance(item, dict): - field_iter = item.fields.keys() + if include_empty: + field_iter = item.field_names() else: field_iter = item.keys() else: @@ -71,8 +76,8 @@ def _get_serialized_fields(self, item, default_value=None, include_empty=None): for field_name in field_iter: if field_name in item: - field = {} if isinstance(item, dict) else item.fields[field_name] - value = self.serialize_field(field, field_name, item[field_name]) + field_meta = item.get_field_meta(field_name) + value = 
self.serialize_field(field_meta, field_name, item[field_name]) else: value = default_value @@ -297,6 +302,7 @@ class PythonItemExporter(BaseItemExporter): .. _msgpack: https://pypi.org/project/msgpack/ """ + def _configure(self, options, dont_fail=False): self.binary = options.pop('binary', True) super(PythonItemExporter, self)._configure(options, dont_fail) @@ -314,22 +320,22 @@ def serialize_field(self, field, name, value): def _serialize_value(self, value): if isinstance(value, _BaseItem): return self.export_item(value) - if isinstance(value, dict): - return dict(self._serialize_dict(value)) - if is_listlike(value): + elif is_item(value): + return dict(self._serialize_item(value)) + elif is_listlike(value): return [self._serialize_value(v) for v in value] encode_func = to_bytes if self.binary else to_unicode if isinstance(value, (str, bytes)): return encode_func(value, encoding=self.encoding) return value - def _serialize_dict(self, value): - for key, val in value.items(): + def _serialize_item(self, item): + for key, value in ItemAdapter(item).items(): key = to_bytes(key) if self.binary else key - yield key, self._serialize_value(val) + yield key, self._serialize_value(value) def export_item(self, item): result = dict(self._get_serialized_fields(item)) if self.binary: - result = dict(self._serialize_dict(result)) + result = dict(self._serialize_item(result)) return result diff --git a/scrapy/item.py b/scrapy/item.py index 97dfed9766e..4ab83d1a07b 100644 --- a/scrapy/item.py +++ b/scrapy/item.py @@ -36,7 +36,7 @@ class BaseItem(_BaseItem, metaclass=_BaseItemMeta): """ def __new__(cls, *args, **kwargs): - if issubclass(cls, BaseItem) and not (issubclass(cls, Item) or issubclass(cls, DictItem)): + if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)): warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead', ScrapyDeprecationWarning, stacklevel=2) return super(BaseItem, cls).__new__(cls, *args, **kwargs) diff --git a/scrapy/loader/__init__.py b/scrapy/loader/__init__.py index 21c4fb376f3..18f57945f9d 100644 --- a/scrapy/loader/__init__.py +++ b/scrapy/loader/__init__.py @@ -6,6 +6,8 @@ from collections import defaultdict from contextlib import suppress +from itemadapter import ItemAdapter + from scrapy.item import Item from scrapy.loader.common import wrap_loader_context from scrapy.loader.processors import Identity @@ -44,7 +46,7 @@ def __init__(self, item=None, selector=None, response=None, parent=None, **conte self._local_item = context['item'] = item self._local_values = defaultdict(list) # values from initial item - for field_name, value in item.items(): + for field_name, value in ItemAdapter(item).items(): self._values[field_name] += arg_to_iter(value) @property @@ -127,13 +129,12 @@ def get_value(self, value, *processors, **kw): return value def load_item(self): - item = self.item + adapter = ItemAdapter(self.item) for field_name in tuple(self._values): value = self.get_output_value(field_name) if value is not None: - item[field_name] = value - - return item + adapter[field_name] = value + return adapter.item def get_output_value(self, field_name): proc = self.get_output_processor(field_name) @@ -174,11 +175,8 @@ def _process_input_value(self, field_name, value): value, type(e).__name__, str(e))) def _get_item_field_attr(self, field_name, key, default=None): - if isinstance(self.item, Item): - value = self.item.fields[field_name].get(key, default) - else: - value = default - return value + field_meta = 
ItemAdapter(self.item).get_field_meta(field_name) + return field_meta.get(key, default) def _check_selector_method(self): if self.selector is None: diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py index 7d86d0d568d..487382a38d7 100644 --- a/scrapy/pipelines/files.py +++ b/scrapy/pipelines/files.py @@ -10,24 +10,26 @@ import os import time from collections import defaultdict -from email.utils import parsedate_tz, mktime_tz +from contextlib import suppress +from email.utils import mktime_tz, parsedate_tz from ftplib import FTP from io import BytesIO from urllib.parse import urlparse +from itemadapter import ItemAdapter from twisted.internet import defer, threads +from scrapy.exceptions import IgnoreRequest, NotConfigured +from scrapy.http import Request from scrapy.pipelines.media import MediaPipeline from scrapy.settings import Settings -from scrapy.exceptions import NotConfigured, IgnoreRequest -from scrapy.http import Request -from scrapy.utils.misc import md5sum -from scrapy.utils.log import failure_to_exc_info -from scrapy.utils.python import to_bytes -from scrapy.utils.request import referer_str from scrapy.utils.boto import is_botocore from scrapy.utils.datatypes import CaselessDict from scrapy.utils.ftp import ftp_store_file +from scrapy.utils.log import failure_to_exc_info +from scrapy.utils.misc import md5sum +from scrapy.utils.python import to_bytes +from scrapy.utils.request import referer_str logger = logging.getLogger(__name__) @@ -517,7 +519,8 @@ def inc_stats(self, spider, status): # Overridable Interface def get_media_requests(self, item, info): - return [Request(x) for x in item.get(self.files_urls_field, [])] + urls = ItemAdapter(item).get(self.files_urls_field, []) + return [Request(u) for u in urls] def file_downloaded(self, response, request, info): path = self.file_path(request, response=response, info=info) @@ -528,8 +531,8 @@ def file_downloaded(self, response, request, info): return checksum def item_completed(self, results, item, info): - if isinstance(item, dict) or self.files_result_field in item.fields: - item[self.files_result_field] = [x for ok, x in results if ok] + with suppress(KeyError): + ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok] return item def file_path(self, request, response=None, info=None): diff --git a/scrapy/pipelines/images.py b/scrapy/pipelines/images.py index aeb520442c6..46f2bfb5881 100644 --- a/scrapy/pipelines/images.py +++ b/scrapy/pipelines/images.py @@ -5,17 +5,19 @@ """ import functools import hashlib +from contextlib import suppress from io import BytesIO +from itemadapter import ItemAdapter from PIL import Image -from scrapy.utils.misc import md5sum -from scrapy.utils.python import to_bytes -from scrapy.http import Request -from scrapy.settings import Settings from scrapy.exceptions import DropItem -# TODO: from scrapy.pipelines.media import MediaPipeline +from scrapy.http import Request from scrapy.pipelines.files import FileException, FilesPipeline +# TODO: from scrapy.pipelines.media import MediaPipeline +from scrapy.settings import Settings +from scrapy.utils.misc import md5sum +from scrapy.utils.python import to_bytes class NoimagesDrop(DropItem): @@ -157,11 +159,12 @@ def convert_image(self, image, size=None): return image, buf def get_media_requests(self, item, info): - return [Request(x) for x in item.get(self.images_urls_field, [])] + urls = ItemAdapter(item).get(self.images_urls_field, []) + return [Request(u) for u in urls] def item_completed(self, results, item, info): - 
if isinstance(item, dict) or self.images_result_field in item.fields: - item[self.images_result_field] = [x for ok, x in results if ok] + with suppress(KeyError): + ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok] return item def file_path(self, request, response=None, info=None): diff --git a/scrapy/shell.py b/scrapy/shell.py index 3ff5a8ad8e9..10de119ce1a 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -6,6 +6,7 @@ import os import signal +from itemadapter import is_item from twisted.internet import threads, defer from twisted.python import threadable from w3lib.url import any_to_uri @@ -13,20 +14,18 @@ from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest from scrapy.http import Request, Response -from scrapy.item import _BaseItem from scrapy.settings import Settings from scrapy.spiders import Spider -from scrapy.utils.console import start_python_console +from scrapy.utils.conf import get_config +from scrapy.utils.console import DEFAULT_PYTHON_SHELLS, start_python_console from scrapy.utils.datatypes import SequenceExclude from scrapy.utils.misc import load_object from scrapy.utils.response import open_in_browser -from scrapy.utils.conf import get_config -from scrapy.utils.console import DEFAULT_PYTHON_SHELLS class Shell: - relevant_classes = (Crawler, Spider, Request, Response, _BaseItem, Settings) + relevant_classes = (Crawler, Spider, Request, Response, Settings) def __init__(self, crawler, update_vars=None, code=None): self.crawler = crawler @@ -154,7 +153,7 @@ def get_help(self): return "\n".join("[s] %s" % line for line in b) def _is_relevant(self, value): - return isinstance(value, self.relevant_classes) + return isinstance(value, self.relevant_classes) or is_item(value) def inspect_response(response, spider): diff --git a/scrapy/spiders/feed.py b/scrapy/spiders/feed.py index a4ff8010d35..5aad7398ae6 100644 --- a/scrapy/spiders/feed.py +++ b/scrapy/spiders/feed.py @@ -31,7 +31,7 @@ def process_results(self, response, results): processing required before returning the results to the framework core, for example setting the item GUIDs. It receives a list of results and the response which originated that results. It must return a list of - results (Items or Requests). + results (items or requests). """ return results diff --git a/scrapy/templates/project/module/middlewares.py.tmpl b/scrapy/templates/project/module/middlewares.py.tmpl index 6490f52a7f2..bd09890fe4b 100644 --- a/scrapy/templates/project/module/middlewares.py.tmpl +++ b/scrapy/templates/project/module/middlewares.py.tmpl @@ -5,6 +5,9 @@ from scrapy import signals +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + class ${ProjectName}SpiderMiddleware: # Not all methods need to be defined. If a method is not defined, @@ -29,7 +32,7 @@ class ${ProjectName}SpiderMiddleware: # Called with the results returned from the Spider, after # it has processed the response. - # Must return an iterable of Request, dict or Item objects. + # Must return an iterable of Request, or item objects. for i in result: yield i @@ -37,8 +40,7 @@ class ${ProjectName}SpiderMiddleware: # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. - # Should return either None or an iterable of Request, dict - # or Item objects. + # Should return either None or an iterable of Request or item objects. 
pass def process_start_requests(self, start_requests, spider): diff --git a/scrapy/templates/project/module/pipelines.py.tmpl b/scrapy/templates/project/module/pipelines.py.tmpl index ce0edd3359f..e845f43e909 100644 --- a/scrapy/templates/project/module/pipelines.py.tmpl +++ b/scrapy/templates/project/module/pipelines.py.tmpl @@ -4,6 +4,10 @@ # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + class ${ProjectName}Pipeline: def process_item(self, item, spider): return item diff --git a/scrapy/utils/serialize.py b/scrapy/utils/serialize.py index bf73dfa18a0..dc96045783c 100644 --- a/scrapy/utils/serialize.py +++ b/scrapy/utils/serialize.py @@ -2,10 +2,10 @@ import datetime import decimal +from itemadapter import is_item, ItemAdapter from twisted.internet import defer from scrapy.http import Request, Response -from scrapy.item import _BaseItem class ScrapyJSONEncoder(json.JSONEncoder): @@ -26,8 +26,8 @@ def default(self, o): return str(o) elif isinstance(o, defer.Deferred): return str(o) - elif isinstance(o, _BaseItem): - return dict(o) + elif is_item(o): + return ItemAdapter(o).asdict() elif isinstance(o, Request): return "<%s %s %s>" % (type(o).__name__, o.method, o.url) elif isinstance(o, Response): diff --git a/setup.py b/setup.py index 71dc3232ddd..5a99fd1bfcc 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ def has_environment_marker_platform_impl_support(): 'w3lib>=1.17.0', 'zope.interface>=4.1.3', 'protego>=0.1.15', + 'itemadapter>=0.1.0', ], extras_require=extras_require, ) diff --git a/tests/requirements-py3.txt b/tests/requirements-py3.txt index 91fa1c5b5b1..dacb86e560c 100644 --- a/tests/requirements-py3.txt +++ b/tests/requirements-py3.txt @@ -1,4 +1,6 @@ # Tests requirements +attrs +dataclasses; python_version == '3.6' jmespath mitmproxy; python_version >= '3.6' mitmproxy<4.0.0; python_version < '3.6' diff --git a/tests/test_engine.py b/tests/test_engine.py index 6696ee52e25..1b848ac7298 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -16,9 +16,11 @@ from collections import defaultdict from urllib.parse import urlparse +import attr +from itemadapter import ItemAdapter from pydispatch import dispatcher from testfixtures import LogCapture -from twisted.internet import reactor, defer +from twisted.internet import defer, reactor from twisted.trial import unittest from twisted.web import server, static, util @@ -32,7 +34,7 @@ from scrapy.utils.signal import disconnect_all from scrapy.utils.test import get_crawler -from tests import tests_datadir, get_testdata +from tests import get_testdata, tests_datadir class TestItem(Item): @@ -41,6 +43,13 @@ class TestItem(Item): price = Field() +@attr.s +class AttrsItem: + name = attr.ib(default="") + url = attr.ib(default="") + price = attr.ib(default=0) + + class TestSpider(Spider): name = "scrapytest.org" allowed_domains = ["scrapytest.org", "localhost"] @@ -79,6 +88,27 @@ class DictItemsSpider(TestSpider): item_cls = dict +class AttrsItemsSpider(TestSpider): + item_class = AttrsItem + + +try: + from dataclasses import make_dataclass +except ImportError: + DataClassItemsSpider = None +else: + TestDataClass = make_dataclass("TestDataClass", [("name", str), ("url", str), ("price", int)]) + + class DataClassItemsSpider(DictItemsSpider): + def parse_item(self, response): + item = super().parse_item(response) + return TestDataClass( + name=item.get('name'), + url=item.get('url'), + 
price=item.get('price'), + ) + + class ItemZeroDivisionErrorSpider(TestSpider): custom_settings = { "ITEM_PIPELINES": { @@ -204,7 +234,10 @@ class EngineTest(unittest.TestCase): @defer.inlineCallbacks def test_crawler(self): - for spider in TestSpider, DictItemsSpider: + + for spider in (TestSpider, DictItemsSpider, AttrsItemsSpider, DataClassItemsSpider): + if spider is None: + continue self.run = CrawlerRun(spider) yield self.run.run() self._assert_visited_urls() @@ -281,6 +314,7 @@ def _assert_items_error(self): def _assert_scraped_items(self): self.assertEqual(2, len(self.run.itemresp)) for item, response in self.run.itemresp: + item = ItemAdapter(item) self.assertEqual(item['url'], response.url) if 'item1.html' in item['url']: self.assertEqual('Item 1 name', item['name']) diff --git a/tests/test_loader.py b/tests/test_loader.py index f14714c756d..8a9c6fca99c 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,6 +1,9 @@ from functools import partial import unittest +import attr +from itemadapter import ItemAdapter + from scrapy.http import HtmlResponse from scrapy.item import Item, Field from scrapy.loader import ItemLoader @@ -9,6 +12,13 @@ from scrapy.selector import Selector +try: + from dataclasses import make_dataclass, field as dataclass_field +except ImportError: + make_dataclass = None + dataclass_field = None + + # test items class NameItem(Item): name = Field() @@ -28,6 +38,11 @@ class TestNestedItem(Item): image = Field() +@attr.s +class AttrsNameItem: + name = attr.ib(default="") + + # test item loaders class NameItemLoader(ItemLoader): default_item_class = TestItem @@ -466,7 +481,7 @@ def test_keep_single_value(self): il = ItemLoader(item=input_item) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo']}) + self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo']}) def test_keep_list(self): """Loaded item should contain values from the initial item""" @@ -474,7 +489,7 @@ def test_keep_list(self): il = ItemLoader(item=input_item) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']}) + self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo', 'bar']}) def test_add_value_singlevalue_singlevalue(self): """Values added after initialization should be appended""" @@ -483,7 +498,7 @@ def test_add_value_singlevalue_singlevalue(self): il.add_value('name', 'bar') loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']}) + self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo', 'bar']}) def test_add_value_singlevalue_list(self): """Values added after initialization should be appended""" @@ -492,7 +507,7 @@ def test_add_value_singlevalue_list(self): il.add_value('name', ['item', 'loader']) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo', 'item', 'loader']}) + self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo', 'item', 'loader']}) def test_add_value_list_singlevalue(self): """Values added after initialization should be appended""" @@ -501,7 +516,7 @@ def test_add_value_list_singlevalue(self): il.add_value('name', 'qwerty') loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'qwerty']}) + 
self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo', 'bar', 'qwerty']}) def test_add_value_list_list(self): """Values added after initialization should be appended""" @@ -510,7 +525,7 @@ def test_add_value_list_list(self): il.add_value('name', ['item', 'loader']) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'item', 'loader']}) + self.assertEqual(ItemAdapter(loaded_item).asdict(), {'name': ['foo', 'bar', 'item', 'loader']}) def test_get_output_value_singlevalue(self): """Getting output value must not remove value from item""" @@ -519,7 +534,7 @@ def test_get_output_value_singlevalue(self): self.assertEqual(il.get_output_value('name'), ['foo']) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(loaded_item, dict({'name': ['foo']})) + self.assertEqual(ItemAdapter(loaded_item).asdict(), dict({'name': ['foo']})) def test_get_output_value_list(self): """Getting output value must not remove value from item""" @@ -528,7 +543,7 @@ def test_get_output_value_list(self): self.assertEqual(il.get_output_value('name'), ['foo', 'bar']) loaded_item = il.load_item() self.assertIsInstance(loaded_item, self.item_class) - self.assertEqual(loaded_item, dict({'name': ['foo', 'bar']})) + self.assertEqual(ItemAdapter(loaded_item).asdict(), dict({'name': ['foo', 'bar']})) def test_values_single(self): """Values from initial item must be added to loader._values""" @@ -551,6 +566,22 @@ class InitializationFromItemTest(InitializationTestMixin, unittest.TestCase): item_class = NameItem +class InitializationFromAttrsItemTest(InitializationTestMixin, unittest.TestCase): + item_class = AttrsNameItem + + +@unittest.skipIf(not make_dataclass, "dataclasses module is not available") +class InitializationFromDataClassTest(InitializationTestMixin, unittest.TestCase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if make_dataclass: + self.item_class = make_dataclass( + "TestDataClass", + [("name", list, dataclass_field(default_factory=list))], + ) + + class BaseNoInputReprocessingLoader(ItemLoader): title_in = MapCompose(str.upper) title_out = TakeFirst() diff --git a/tests/test_pipeline_files.py b/tests/test_pipeline_files.py index 6bbcbc2e9b0..a023dfcc8ff 100644 --- a/tests/test_pipeline_files.py +++ b/tests/test_pipeline_files.py @@ -2,22 +2,41 @@ import random import time from io import BytesIO -from tempfile import mkdtemp from shutil import rmtree -from unittest import mock +from tempfile import mkdtemp +from unittest import mock, skipIf from urllib.parse import urlparse -from twisted.trial import unittest +import attr +from itemadapter import ItemAdapter from twisted.internet import defer +from twisted.trial import unittest -from scrapy.pipelines.files import FilesPipeline, FSFilesStore, S3FilesStore, GCSFilesStore, FTPFilesStore -from scrapy.item import Item, Field from scrapy.http import Request, Response +from scrapy.item import Field, Item +from scrapy.pipelines.files import ( + FilesPipeline, + FSFilesStore, + FTPFilesStore, + GCSFilesStore, + S3FilesStore, +) from scrapy.settings import Settings -from scrapy.utils.test import assert_aws_environ, get_s3_content_and_delete -from scrapy.utils.test import assert_gcs_environ, get_gcs_content_and_delete -from scrapy.utils.test import get_ftp_content_and_delete from scrapy.utils.boto import is_botocore +from scrapy.utils.test import ( + assert_aws_environ, + assert_gcs_environ, + 
get_ftp_content_and_delete, + get_gcs_content_and_delete, + get_s3_content_and_delete, +) + + +try: + from dataclasses import make_dataclass, field as dataclass_field +except ImportError: + make_dataclass = None + dataclass_field = None def _mocked_download_func(request, info): @@ -143,43 +162,88 @@ def test_file_cached(self): p.stop() -class FilesPipelineTestCaseFields(unittest.TestCase): +class FilesPipelineTestCaseFieldsMixin: def test_item_fields_default(self): - class TestItem(Item): - name = Field() - file_urls = Field() - files = Field() - - for cls in TestItem, dict: - url = 'http://www.example.com/files/1.txt' - item = cls({'name': 'item1', 'file_urls': [url]}) - pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'})) - requests = list(pipeline.get_media_requests(item, None)) - self.assertEqual(requests[0].url, url) - results = [(True, {'url': url})] - pipeline.item_completed(results, item, None) - self.assertEqual(item['files'], [results[0][1]]) + url = 'http://www.example.com/files/1.txt' + item = self.item_class(name='item1', file_urls=[url]) + pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'})) + requests = list(pipeline.get_media_requests(item, None)) + self.assertEqual(requests[0].url, url) + results = [(True, {'url': url})] + item = pipeline.item_completed(results, item, None) + files = ItemAdapter(item).get("files") + self.assertEqual(files, [results[0][1]]) + self.assertIsInstance(item, self.item_class) def test_item_fields_override_settings(self): - class TestItem(Item): - name = Field() - files = Field() - stored_file = Field() - - for cls in TestItem, dict: - url = 'http://www.example.com/files/1.txt' - item = cls({'name': 'item1', 'files': [url]}) - pipeline = FilesPipeline.from_settings(Settings({ - 'FILES_STORE': 's3://example/files/', - 'FILES_URLS_FIELD': 'files', - 'FILES_RESULT_FIELD': 'stored_file' - })) - requests = list(pipeline.get_media_requests(item, None)) - self.assertEqual(requests[0].url, url) - results = [(True, {'url': url})] - pipeline.item_completed(results, item, None) - self.assertEqual(item['stored_file'], [results[0][1]]) + url = 'http://www.example.com/files/1.txt' + item = self.item_class(name='item1', custom_file_urls=[url]) + pipeline = FilesPipeline.from_settings(Settings({ + 'FILES_STORE': 's3://example/files/', + 'FILES_URLS_FIELD': 'custom_file_urls', + 'FILES_RESULT_FIELD': 'custom_files' + })) + requests = list(pipeline.get_media_requests(item, None)) + self.assertEqual(requests[0].url, url) + results = [(True, {'url': url})] + item = pipeline.item_completed(results, item, None) + custom_files = ItemAdapter(item).get("custom_files") + self.assertEqual(custom_files, [results[0][1]]) + self.assertIsInstance(item, self.item_class) + + +class FilesPipelineTestCaseFieldsDict(FilesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = dict + + +class FilesPipelineTestItem(Item): + name = Field() + # default fields + file_urls = Field() + files = Field() + # overridden fields + custom_file_urls = Field() + custom_files = Field() + + +class FilesPipelineTestCaseFieldsItem(FilesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = FilesPipelineTestItem + + +@skipIf(not make_dataclass, "dataclasses module is not available") +class FilesPipelineTestCaseFieldsDataClass(FilesPipelineTestCaseFieldsMixin, unittest.TestCase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if make_dataclass: + self.item_class = make_dataclass( + 
"FilesPipelineTestDataClass", + [ + ("name", str), + # default fields + ("file_urls", list, dataclass_field(default_factory=list)), + ("files", list, dataclass_field(default_factory=list)), + # overridden fields + ("custom_file_urls", list, dataclass_field(default_factory=list)), + ("custom_files", list, dataclass_field(default_factory=list)), + ], + ) + + +@attr.s +class FilesPipelineTestAttrsItem: + name = attr.ib(default="") + # default fields + file_urls = attr.ib(default=lambda: []) + files = attr.ib(default=lambda: []) + # overridden fields + custom_file_urls = attr.ib(default=lambda: []) + custom_files = attr.ib(default=lambda: []) + + +class FilesPipelineTestCaseFieldsAttrsItem(FilesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = FilesPipelineTestAttrsItem class FilesPipelineTestCaseCustomSettings(unittest.TestCase): diff --git a/tests/test_pipeline_images.py b/tests/test_pipeline_images.py index 8ef27fce77e..082e9ee2159 100644 --- a/tests/test_pipeline_images.py +++ b/tests/test_pipeline_images.py @@ -1,17 +1,28 @@ -import io import hashlib +import io import random -from tempfile import mkdtemp from shutil import rmtree +from tempfile import mkdtemp +from unittest import skipIf +import attr +from itemadapter import ItemAdapter from twisted.trial import unittest -from scrapy.item import Item, Field from scrapy.http import Request, Response -from scrapy.settings import Settings +from scrapy.item import Field, Item from scrapy.pipelines.images import ImagesPipeline +from scrapy.settings import Settings from scrapy.utils.python import to_bytes + +try: + from dataclasses import make_dataclass, field as dataclass_field +except ImportError: + make_dataclass = None + dataclass_field = None + + skip = False try: from PIL import Image @@ -124,43 +135,89 @@ def thumb_key(self, url, thumb_id): return 'thumbsup/%s/%s.jpg' % (thumb_id, thumb_guid) -class ImagesPipelineTestCaseFields(unittest.TestCase): +class ImagesPipelineTestCaseFieldsMixin: def test_item_fields_default(self): - class TestItem(Item): - name = Field() - image_urls = Field() - images = Field() - - for cls in TestItem, dict: - url = 'http://www.example.com/images/1.jpg' - item = cls({'name': 'item1', 'image_urls': [url]}) - pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'})) - requests = list(pipeline.get_media_requests(item, None)) - self.assertEqual(requests[0].url, url) - results = [(True, {'url': url})] - pipeline.item_completed(results, item, None) - self.assertEqual(item['images'], [results[0][1]]) + url = 'http://www.example.com/images/1.jpg' + item = self.item_class(name='item1', image_urls=[url]) + pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'})) + requests = list(pipeline.get_media_requests(item, None)) + self.assertEqual(requests[0].url, url) + results = [(True, {'url': url})] + item = pipeline.item_completed(results, item, None) + images = ItemAdapter(item).get("images") + self.assertEqual(images, [results[0][1]]) + self.assertIsInstance(item, self.item_class) def test_item_fields_override_settings(self): - class TestItem(Item): - name = Field() - image = Field() - stored_image = Field() - - for cls in TestItem, dict: - url = 'http://www.example.com/images/1.jpg' - item = cls({'name': 'item1', 'image': [url]}) - pipeline = ImagesPipeline.from_settings(Settings({ - 'IMAGES_STORE': 's3://example/images/', - 'IMAGES_URLS_FIELD': 'image', - 'IMAGES_RESULT_FIELD': 'stored_image' - })) - requests = 
list(pipeline.get_media_requests(item, None)) - self.assertEqual(requests[0].url, url) - results = [(True, {'url': url})] - pipeline.item_completed(results, item, None) - self.assertEqual(item['stored_image'], [results[0][1]]) + url = 'http://www.example.com/images/1.jpg' + item = self.item_class(name='item1', custom_image_urls=[url]) + pipeline = ImagesPipeline.from_settings(Settings({ + 'IMAGES_STORE': 's3://example/images/', + 'IMAGES_URLS_FIELD': 'custom_image_urls', + 'IMAGES_RESULT_FIELD': 'custom_images' + })) + requests = list(pipeline.get_media_requests(item, None)) + self.assertEqual(requests[0].url, url) + results = [(True, {'url': url})] + item = pipeline.item_completed(results, item, None) + custom_images = ItemAdapter(item).get("custom_images") + self.assertEqual(custom_images, [results[0][1]]) + self.assertIsInstance(item, self.item_class) + + +class ImagesPipelineTestCaseFieldsDict(ImagesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = dict + + +class ImagesPipelineTestItem(Item): + name = Field() + # default fields + image_urls = Field() + images = Field() + # overridden fields + custom_image_urls = Field() + custom_images = Field() + + +class ImagesPipelineTestCaseFieldsItem(ImagesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = ImagesPipelineTestItem + + +@skipIf(not make_dataclass, "dataclasses module is not available") +class ImagesPipelineTestCaseFieldsDataClass(ImagesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if make_dataclass: + self.item_class = make_dataclass( + "FilesPipelineTestDataClass", + [ + ("name", str), + # default fields + ("image_urls", list, dataclass_field(default_factory=list)), + ("images", list, dataclass_field(default_factory=list)), + # overridden fields + ("custom_image_urls", list, dataclass_field(default_factory=list)), + ("custom_images", list, dataclass_field(default_factory=list)), + ], + ) + + +@attr.s +class ImagesPipelineTestAttrsItem: + name = attr.ib(default="") + # default fields + image_urls = attr.ib(default=lambda: []) + images = attr.ib(default=lambda: []) + # overridden fields + custom_image_urls = attr.ib(default=lambda: []) + custom_images = attr.ib(default=lambda: []) + + +class ImagesPipelineTestCaseFieldsAttrsItem(ImagesPipelineTestCaseFieldsMixin, unittest.TestCase): + item_class = ImagesPipelineTestAttrsItem class ImagesPipelineTestCaseCustomSettings(unittest.TestCase): diff --git a/tests/test_utils_serialize.py b/tests/test_utils_serialize.py index 6dc1177792b..daf022aeed0 100644 --- a/tests/test_utils_serialize.py +++ b/tests/test_utils_serialize.py @@ -1,18 +1,25 @@ +import datetime import json import unittest -import datetime from decimal import Decimal +import attr from twisted.internet import defer -from scrapy.utils.serialize import ScrapyJSONEncoder from scrapy.http import Request, Response +from scrapy.utils.serialize import ScrapyJSONEncoder + + +try: + from dataclasses import make_dataclass +except ImportError: + make_dataclass = None class JsonEncoderTestCase(unittest.TestCase): def setUp(self): - self.encoder = ScrapyJSONEncoder() + self.encoder = ScrapyJSONEncoder(sort_keys=True) def test_encode_decode(self): dt = datetime.datetime(2010, 1, 2, 10, 11, 12) @@ -31,7 +38,8 @@ def test_encode_decode(self): for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts), (dec, decs), (['foo', d], ['foo', ds]), (s, ss), (dt_set, dt_sets)]: - 
self.assertEqual(self.encoder.encode(input), json.dumps(output)) + self.assertEqual(self.encoder.encode(input), + json.dumps(output, sort_keys=True)) def test_encode_deferred(self): self.assertIn('Deferred', self.encoder.encode(defer.Deferred())) @@ -47,3 +55,30 @@ def test_encode_response(self): rs = self.encoder.encode(r) self.assertIn(r.url, rs) self.assertIn(str(r.status), rs) + + @unittest.skipIf(not make_dataclass, "No dataclass support") + def test_encode_dataclass_item(self): + TestDataClass = make_dataclass( + "TestDataClass", + [("name", str), ("url", str), ("price", int)], + ) + item = TestDataClass(name="Product", url="http://product.org", price=1) + encoded = self.encoder.encode(item) + self.assertEqual( + encoded, + '{"name": "Product", "price": 1, "url": "http://product.org"}' + ) + + def test_encode_attrs_item(self): + @attr.s + class AttrsItem: + name = attr.ib(type=str) + url = attr.ib(type=str) + price = attr.ib(type=int) + + item = AttrsItem(name="Product", url="http://product.org", price=1) + encoded = self.encoder.encode(item) + self.assertEqual( + encoded, + '{"name": "Product", "price": 1, "url": "http://product.org"}' + ) diff --git a/tox.ini b/tox.ini index 69b1bdfdde5..4c790158dc6 100644 --- a/tox.ini +++ b/tox.ini @@ -37,7 +37,7 @@ deps = pytest-flake8 commands = py.test --flake8 {posargs:docs scrapy tests} - + [testenv:pylint] basepython = python3 deps = @@ -62,6 +62,7 @@ deps = -ctests/constraints.txt cryptography==2.0 cssselect==0.9.1 + itemadapter==0.1.0 lxml==3.5.0 parsel==1.5.0 Protego==0.1.15
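For reference, the ScrapyJSONEncoder change exercised by the new serialization tests reduces to converting any supported item type to a plain dict via ItemAdapter.asdict() before JSON encoding; sort_keys=True in the tests only makes the resulting key order deterministic. A minimal standalone sketch, using an illustrative AttrsProduct class that is not part of this patch::

    import json

    import attr
    from itemadapter import is_item, ItemAdapter


    @attr.s
    class AttrsProduct:
        name = attr.ib(type=str)
        price = attr.ib(type=int)


    def encode_item(obj):
        # Mirror the new default() branch: items of any supported type are
        # converted to a plain dict before being handed to the JSON encoder.
        if is_item(obj):
            return json.dumps(ItemAdapter(obj).asdict(), sort_keys=True)
        raise TypeError(f"{obj!r} is not an item")


    print(encode_item(AttrsProduct(name="Product", price=1)))
    # prints: {"name": "Product", "price": 1}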