Support for dataclass and attrs items (scrapy#3881)
elacuesta authored Jun 14, 2020
1 parent 8b54939 commit ec98dab
Showing 36 changed files with 757 additions and 355 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -281,6 +281,7 @@
# -------------------------------------

intersphinx_mapping = {
+    'attrs': ('https://www.attrs.org/en/stable/', None),
    'coverage': ('https://coverage.readthedocs.io/en/stable', None),
    'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
    'pytest': ('https://docs.pytest.org/en/latest', None),
8 changes: 4 additions & 4 deletions docs/faq.rst
@@ -342,15 +342,15 @@ method for this purpose. For example::

    from copy import deepcopy

-    from scrapy.item import Item
+    from itemadapter import is_item, ItemAdapter

    class MultiplyItemsMiddleware:

        def process_spider_output(self, response, result, spider):
            for item in result:
-                if isinstance(item, (Item, dict)):
-                    for _ in range(item['multiply_by']):
+                if is_item(item):
+                    adapter = ItemAdapter(item)
+                    for _ in range(adapter['multiply_by']):
                        yield deepcopy(item)

Does Scrapy support IPv6 addresses?
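The pattern above is the one this commit applies throughout the documentation: ``is_item`` and ``ItemAdapter`` come from the ``itemadapter`` package and give a single interface over dicts, ``scrapy.Item`` subclasses, dataclass objects and attrs objects. A minimal sketch of that interface (the ``InventoryItem`` and ``LegacyItem`` classes are made up for illustration)::

    from dataclasses import dataclass

    import scrapy
    from itemadapter import ItemAdapter, is_item


    @dataclass
    class InventoryItem:
        # hypothetical dataclass item
        name: str = ""
        price: float = 0.0


    class LegacyItem(scrapy.Item):
        # hypothetical scrapy.Item subclass
        name = scrapy.Field()
        price = scrapy.Field()


    for item in (InventoryItem(), LegacyItem(), {}):
        assert is_item(item)            # True for every supported item type
        adapter = ItemAdapter(item)
        adapter["name"] = "Widget"      # same read/write API regardless of item type
        adapter["price"] = 9.99
        print(adapter.asdict())         # {'name': 'Widget', 'price': 9.99}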
2 changes: 1 addition & 1 deletion docs/topics/architecture.rst
@@ -104,7 +104,7 @@ Spiders
-------

Spiders are custom classes written by Scrapy users to parse responses and
-extract items (aka scraped items) from them or additional requests to
+extract :ref:`items <topics-items>` from them or additional requests to
follow. For more information see :ref:`topics-spiders`.

.. _component-pipelines:
13 changes: 10 additions & 3 deletions docs/topics/coroutines.rst
@@ -53,21 +53,28 @@ There are several use cases for coroutines in Scrapy. Code that would
return Deferreds when written for previous Scrapy versions, such as downloader
middlewares and signal handlers, can be rewritten to be shorter and cleaner::

+    from itemadapter import ItemAdapter
+
    class DbPipeline:
        def _update_item(self, data, item):
-            item['field'] = data
+            adapter = ItemAdapter(item)
+            adapter['field'] = data
            return item

        def process_item(self, item, spider):
-            dfd = db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            dfd = db.get_some_data(adapter['id'])
            dfd.addCallback(self._update_item, item)
            return dfd

becomes::

+    from itemadapter import ItemAdapter
+
    class DbPipeline:
        async def process_item(self, item, spider):
-            item['field'] = await db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            adapter['field'] = await db.get_some_data(adapter['id'])
            return item

Coroutines may be used to call asynchronous code. This includes other
34 changes: 20 additions & 14 deletions docs/topics/exporters.rst
@@ -40,6 +40,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses multiple
Item Exporters to group scraped items to different files according to the
value of one of their fields::

+    from itemadapter import ItemAdapter
    from scrapy.exporters import XmlItemExporter

class PerYearXmlExportPipeline:
@@ -53,7 +54,8 @@ value of one of their fields::
exporter.finish_exporting()

        def _exporter_for_item(self, item):
-            year = item['year']
+            adapter = ItemAdapter(item)
+            year = adapter['year']
            if year not in self.year_to_exporter:
                f = open('{}.xml'.format(year), 'wb')
                exporter = XmlItemExporter(f)
@@ -167,9 +169,10 @@ BaseItemExporter
value unchanged except for ``unicode`` values which are encoded to
``str`` using the encoding declared in the :attr:`encoding` attribute.

-:param field: the field being serialized. If a raw dict is being
-   exported (not :class:`~.Item`) *field* value is an empty dict.
-:type field: :class:`~scrapy.item.Field` object or an empty dict
+:param field: the field being serialized. If the source :ref:`item object
+   <item-types>` does not define field metadata, *field* is an empty
+   :class:`dict`.
+:type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance

:param name: the name of the field being serialized
:type name: str
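For orientation, overriding :meth:`serialize_field` in a custom exporter usually looks like the following sketch; the price formatting is an arbitrary illustration, not part of this commit::

    from scrapy.exporters import XmlItemExporter


    class ProductXmlExporter(XmlItemExporter):
        """Hypothetical exporter that prefixes every 'price' value with a currency sign."""

        def serialize_field(self, field, name, value):
            if name == 'price':
                return '$ %s' % str(value)
            return super().serialize_field(field, name, value)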
@@ -192,14 +195,17 @@

.. attribute:: fields_to_export

-A list with the name of the fields that will be exported, or None if you
-want to export all fields. Defaults to None.
+A list with the name of the fields that will be exported, or ``None`` if
+you want to export all fields. Defaults to ``None``.

Some exporters (like :class:`CsvItemExporter`) respect the order of the
fields defined in this attribute.

-Some exporters may require fields_to_export list in order to export the
-data properly when spiders return dicts (not :class:`~Item` instances).
+When using :ref:`item objects <item-types>` that do not expose all their
+possible fields, exporters that do not support exporting a different
+subset of fields per item will only export the fields found in the first
+item exported. Use ``fields_to_export`` to define all the fields to be
+exported.

.. attribute:: export_empty_fields

@@ -238,7 +244,7 @@ XmlItemExporter

.. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs)

-Exports Items in XML format to the specified file object.
+Exports items in XML format to the specified file object.

:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -292,7 +298,7 @@ CsvItemExporter

.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs)

-Exports Items in CSV format to the given file-like object. If the
+Exports items in CSV format to the given file-like object. If the
:attr:`fields_to_export` attribute is set, it will be used to define the
CSV columns and their order. The :attr:`export_empty_fields` attribute has
no effect on this exporter.
@@ -325,7 +331,7 @@ PickleItemExporter

.. class:: PickleItemExporter(file, protocol=0, **kwargs)

-Exports Items in pickle format to the given file-like object.
+Exports items in pickle format to the given file-like object.

:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -345,7 +351,7 @@ PprintItemExporter

.. class:: PprintItemExporter(file, **kwargs)

-Exports Items in pretty print format to the specified file object.
+Exports items in pretty print format to the specified file object.

:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -365,7 +371,7 @@ JsonItemExporter

.. class:: JsonItemExporter(file, **kwargs)

-Exports Items in JSON format to the specified file-like object, writing all
+Exports items in JSON format to the specified file-like object, writing all
objects as a list of objects. The additional ``__init__`` method arguments are
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
@@ -394,7 +400,7 @@ JsonLinesItemExporter

.. class:: JsonLinesItemExporter(file, **kwargs)

-Exports Items in JSON format to the specified file-like object, writing one
+Exports items in JSON format to the specified file-like object, writing one
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
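As a concrete illustration of ``fields_to_export`` with items that may not populate every field, a minimal sketch (the file name and field names are made up)::

    from scrapy.exporters import CsvItemExporter

    with open('products.csv', 'wb') as f:
        exporter = CsvItemExporter(f, fields_to_export=['name', 'price'])
        exporter.start_exporting()
        exporter.export_item({'name': 'Widget'})                  # missing 'price' -> empty column
        exporter.export_item({'price': 9.99, 'name': 'Gadget'})   # column order fixed by fields_to_export
        exporter.finish_exporting()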
4 changes: 2 additions & 2 deletions docs/topics/feed-exports.rst
@@ -298,8 +298,8 @@ Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``.

Use FEED_EXPORT_FIELDS option to define fields to export and their order.

-When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields
-defined in dicts or :class:`~.Item` subclasses a spider is yielding.
+When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields
+defined in :ref:`item objects <topics-items>` yielded by your spider.

If an exporter requires a fixed set of fields (this is the case for
:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS
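For reference, a minimal way to apply the setting in a hypothetical project (the field names and spider name are made up); with it in place, running ``scrapy crawl myspider -o items.csv`` writes the CSV columns in exactly this order::

    # settings.py -- hypothetical project configuration
    FEED_EXPORT_FIELDS = ['name', 'price', 'url']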
54 changes: 33 additions & 21 deletions docs/topics/item-pipeline.rst
@@ -27,15 +27,19 @@ Each item pipeline component is a Python class that must implement the following

.. method:: process_item(self, item, spider)

-This method is called for every item pipeline component. :meth:`process_item`
-must either: return a dict with data, return an :class:`~scrapy.item.Item`
-(or any descendant class) object, return a
-:class:`~twisted.internet.defer.Deferred` or raise
-:exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
-processed by further pipeline components.
-
-:param item: the item scraped
-:type item: :class:`~scrapy.item.Item` object or a dict
+This method is called for every item pipeline component.
+
+`item` is an :ref:`item object <item-types>`, see
+:ref:`supporting-item-types`.
+
+:meth:`process_item` must either: return an :ref:`item object <item-types>`,
+return a :class:`~twisted.internet.defer.Deferred` or raise a
+:exc:`~scrapy.exceptions.DropItem` exception.
+
+Dropped items are no longer processed by further pipeline components.
+
+:param item: the scraped item
+:type item: :ref:`item object <item-types>`

:param spider: the spider which scraped the item
:type spider: :class:`~scrapy.spiders.Spider` object
@@ -79,16 +83,17 @@ Let's take a look at the following hypothetical pipeline that adjusts the
(``price_excludes_vat`` attribute), and drops those items which don't
contain a price::

+    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem

    class PricePipeline:

        vat_factor = 1.15

        def process_item(self, item, spider):
-            if item.get('price'):
-                if item.get('price_excludes_vat'):
-                    item['price'] = item['price'] * self.vat_factor
+            adapter = ItemAdapter(item)
+            if adapter.get('price'):
+                if adapter.get('price_excludes_vat'):
+                    adapter['price'] = adapter['price'] * self.vat_factor
                return item
            else:
                raise DropItem("Missing price in %s" % item)
@@ -103,6 +108,8 @@ format::

    import json

+    from itemadapter import ItemAdapter
+
    class JsonWriterPipeline:

def open_spider(self, spider):
@@ -112,7 +119,7 @@ format::
self.file.close()

        def process_item(self, item, spider):
-            line = json.dumps(dict(item)) + "\n"
+            line = json.dumps(ItemAdapter(item).asdict()) + "\n"
            self.file.write(line)
            return item

@@ -131,6 +138,7 @@ The main point of this example is to show how to use :meth:`from_crawler`
method and how to clean up the resources properly.::

    import pymongo
+    from itemadapter import ItemAdapter

    class MongoPipeline:

@@ -155,7 +163,7 @@ method and how to clean up the resources properly.::
self.client.close()

        def process_item(self, item, spider):
-            self.db[self.collection_name].insert_one(dict(item))
+            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
            return item

.. _MongoDB: https://www.mongodb.com/
@@ -177,10 +185,11 @@ item.

::

-    import scrapy
    import hashlib
    from urllib.parse import quote

+    import scrapy
+    from itemadapter import ItemAdapter

    class ScreenshotPipeline:
"""Pipeline that uses Splash to render screenshot of
@@ -189,7 +198,8 @@ item.
SPLASH_URL = "http://localhost:8050/render.png?url={}"

        async def process_item(self, item, spider):
-            encoded_item_url = quote(item["url"])
+            adapter = ItemAdapter(item)
+            encoded_item_url = quote(adapter["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url)
response = await spider.crawler.engine.download(request, spider)
@@ -199,14 +209,14 @@ item.
return item

            # Save screenshot to file, filename will be hash of url.
-            url = item["url"]
+            url = adapter["url"]
            url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
filename = "{}.png".format(url_hash)
with open(filename, "wb") as f:
f.write(response.body)

            # Store filename in item.
-            item["screenshot_filename"] = filename
+            adapter["screenshot_filename"] = filename
            return item

.. _Splash: https://splash.readthedocs.io/en/stable/
@@ -219,6 +229,7 @@ already processed. Let's say that our items have a unique id, but our spider
returns multiples items with the same id::


+    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem

class DuplicatesPipeline:
Expand All @@ -227,10 +238,11 @@ returns multiples items with the same id::
self.ids_seen = set()

        def process_item(self, item, spider):
-            if item['id'] in self.ids_seen:
-                raise DropItem("Duplicate item found: %s" % item)
+            adapter = ItemAdapter(item)
+            if adapter['id'] in self.ids_seen:
+                raise DropItem("Duplicate item found: %r" % item)
            else:
-                self.ids_seen.add(item['id'])
+                self.ids_seen.add(adapter['id'])
                return item


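Since every example above now goes through ``ItemAdapter``, these pipelines also accept dataclass and attrs items without modification. A minimal sketch of a spider yielding a dataclass item (the ``BookItem`` class, the spider and its selectors are made up for illustration)::

    from dataclasses import dataclass

    import scrapy


    @dataclass
    class BookItem:
        # hypothetical dataclass item; pipelines receive it wrapped in ItemAdapter
        title: str = ""
        price: float = 0.0


    class BookSpider(scrapy.Spider):
        name = "books"
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            for product in response.css("article.product_pod"):
                yield BookItem(
                    title=product.css("h3 a::attr(title)").get(),
                    price=float(product.css("p.price_color::text").re_first(r"[\d.]+") or 0),
                )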
