From e5b23f4b00962df76d8302ebf869ef4a4319e142 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 18 Mar 2020 11:26:59 +0300 Subject: [PATCH 01/35] fix #4250: add batch deliveries --- scrapy/extensions/feedexport.py | 54 +++++++++++++++++++++-------- scrapy/settings/default_settings.py | 1 + 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 998d2a5d17a..906f99fee93 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -241,6 +241,7 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') + self.storage_batch = self.settings.getint('FEED_STORAGE_BATCH') for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured @@ -250,19 +251,7 @@ def __init__(self, crawler): def open_spider(self, spider): for uri, feed in self.feeds.items(): uri = uri % self._get_uri_params(spider, feed['uri_params']) - storage = self._get_storage(uri) - file = storage.open(spider) - exporter = self._get_exporter( - file=file, - format=feed['format'], - fields_to_export=feed['fields'], - encoding=feed['encoding'], - indent=feed['indent'], - ) - slot = _FeedSlot(file, exporter, storage, uri, feed['format'], feed['store_empty']) - self.slots.append(slot) - if slot.store_empty: - slot.start_exporting() + self.slots.append(self._start_new_batch(None, uri, feed, spider)) def close_spider(self, spider): deferred_list = [] @@ -285,11 +274,48 @@ def close_spider(self, spider): deferred_list.append(d) return defer.DeferredList(deferred_list) if deferred_list else None + def _start_new_batch(self, previous_batch_slot, uri, feed, spider): + """ + Redirect the output data stream to a new file. + Execute multiple times if 'FEED_STORAGE_BATCH' setting is greater than zero. + """ + if previous_batch_slot is not None: + previous_batch_slot.exporter.finish_exporting() + previous_batch_slot.storage.store(previous_batch_slot.file) + storage = self._get_storage(uri) + file = storage.open(spider) + exporter = self._get_exporter( + file=file, + format=feed['format'], + fields_to_export=feed['fields'], + encoding=feed['encoding'], + indent=feed['indent'] + ) + slot = _FeedSlot(file, exporter, storage, uri, feed['format'], feed['store_empty']) + if slot.store_empty: + slot.start_exporting() + return slot + + def _get_uri_of_partial(self, slot, feed, spider): + """Get uri for each partial using datetime.now().isoformat()""" + uri = (slot.uri % self._get_uri_params(spider, feed['uri_params'])).split('.')[0] + '.' + uri = uri + datetime.now().isoformat() + '.' 
+ feed['format'] + return uri + def item_scraped(self, item, spider): - for slot in self.slots: + slots = [] + for idx, slot in enumerate(self.slots): slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 + if self.storage_batch and slot.itemcount % self.storage_batch == 0: + uri = self._get_uri_of_partial(slot, self.feeds[slot.uri], spider) + slots.append(self._start_new_batch(slot, uri, self.feeds[slot.uri], spider)) + self.feeds[uri] = self.feeds[slot.uri] + self.feeds.pop(slot.uri) + self.slots[idx] = None + self.slots = [slot for slot in self.slots if slot is not None] + self.slots.extend(slots) def _load_components(self, setting_prefix): conf = without_none_values(self.settings.getwithbase(setting_prefix)) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 077317c815b..690e044c580 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -146,6 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } +FEED_STORAGE_BATCH = 0 FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', From 8b4566ff93843cdf17ada069dc09261a99971d26 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 18 Mar 2020 14:21:21 +0300 Subject: [PATCH 02/35] fix wrong name of first file in partial deliveries --- scrapy/extensions/feedexport.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 906f99fee93..4f7c6bf07f0 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -249,6 +249,8 @@ def __init__(self, crawler): raise NotConfigured def open_spider(self, spider): + if self.storage_batch: + self.feeds = {self._get_uri_of_partial(uri, feed, spider): feed for uri, feed in self.feeds.items()} for uri, feed in self.feeds.items(): uri = uri % self._get_uri_params(spider, feed['uri_params']) self.slots.append(self._start_new_batch(None, uri, feed, spider)) @@ -296,11 +298,11 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider): slot.start_exporting() return slot - def _get_uri_of_partial(self, slot, feed, spider): + def _get_uri_of_partial(self, template_uri, feed, spider): """Get uri for each partial using datetime.now().isoformat()""" - uri = (slot.uri % self._get_uri_params(spider, feed['uri_params'])).split('.')[0] + '.' - uri = uri + datetime.now().isoformat() + '.' 
+ feed['format'] - return uri + template_uri = (template_uri % self._get_uri_params(spider, feed['uri_params'])) + uri_name = template_uri.split('.')[0] + return '{}.{}.{}'.format(uri_name, datetime.now().isoformat(), feed["format"]) def item_scraped(self, item, spider): slots = [] @@ -309,11 +311,12 @@ def item_scraped(self, item, spider): slot.exporter.export_item(item) slot.itemcount += 1 if self.storage_batch and slot.itemcount % self.storage_batch == 0: - uri = self._get_uri_of_partial(slot, self.feeds[slot.uri], spider) + uri = self._get_uri_of_partial(slot.uri, self.feeds[slot.uri], spider) slots.append(self._start_new_batch(slot, uri, self.feeds[slot.uri], spider)) self.feeds[uri] = self.feeds[slot.uri] self.feeds.pop(slot.uri) self.slots[idx] = None + self.slots = [slot for slot in self.slots if slot is not None] self.slots.extend(slots) From 0723e3f4f9777a87d0df3b2e2fddfeac9099dd3b Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 19 Mar 2020 21:17:02 +0300 Subject: [PATCH 03/35] add batch_id, add error if uri is specified incorrectly --- scrapy/extensions/feedexport.py | 73 ++++++++++++++++++++--------- scrapy/settings/default_settings.py | 2 +- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 4f7c6bf07f0..38b25bf4a04 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -180,14 +180,16 @@ def _store_in_thread(self, file): class _FeedSlot: - def __init__(self, file, exporter, storage, uri, format, store_empty): + def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, template_uri): self.file = file self.exporter = exporter self.storage = storage # feed params - self.uri = uri + self.batch_id = batch_id self.format = format self.store_empty = store_empty + self.template_uri = template_uri + self.uri = uri # flags self.itemcount = 0 self._exporting = False @@ -241,19 +243,28 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch = self.settings.getint('FEED_STORAGE_BATCH') + self.storage_batch_size = self.settings.getint('FEED_STORAGE_BATCH_SIZE') for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured + if not self._batch_deliveries_supported(uri): + raise NotConfigured if not self._exporter_supported(feed['format']): raise NotConfigured def open_spider(self, spider): - if self.storage_batch: - self.feeds = {self._get_uri_of_partial(uri, feed, spider): feed for uri, feed in self.feeds.items()} for uri, feed in self.feeds.items(): - uri = uri % self._get_uri_params(spider, feed['uri_params']) - self.slots.append(self._start_new_batch(None, uri, feed, spider)) + batch_id = 1 + uri_params = self._get_uri_params(spider, feed['uri_params']) + uri_params['batch_id'] = batch_id + self.slots.append(self._start_new_batch( + previous_batch_slot=None, + uri=uri % uri_params, + feed=feed, + spider=spider, + batch_id=batch_id, + template_uri=uri + )) def close_spider(self, spider): deferred_list = [] @@ -276,10 +287,17 @@ def close_spider(self, spider): deferred_list.append(d) return defer.DeferredList(deferred_list) if deferred_list else None - def _start_new_batch(self, previous_batch_slot, uri, feed, spider): + def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, template_uri): """ Redirect the output data stream to a new file. 
Execute multiple times if 'FEED_STORAGE_BATCH' setting is greater than zero. + :param previous_batch_slot: slot of previous batch. We need to call slot.storage.store + to get the file properly closed. + :param uri: uri of the new batch to start + :param feed: dict with parameters of feed + :param spider: user spider + :param batch_id: sequential batch id starting at 1 + :param template_uri: template uri which contains %(time)s or %(batch_id)s to create new uri """ if previous_batch_slot is not None: previous_batch_slot.exporter.finish_exporting() @@ -293,30 +311,30 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider): encoding=feed['encoding'], indent=feed['indent'] ) - slot = _FeedSlot(file, exporter, storage, uri, feed['format'], feed['store_empty']) + slot = _FeedSlot(file, exporter, storage, uri, feed['format'], feed['store_empty'], batch_id, template_uri) if slot.store_empty: slot.start_exporting() return slot - def _get_uri_of_partial(self, template_uri, feed, spider): - """Get uri for each partial using datetime.now().isoformat()""" - template_uri = (template_uri % self._get_uri_params(spider, feed['uri_params'])) - uri_name = template_uri.split('.')[0] - return '{}.{}.{}'.format(uri_name, datetime.now().isoformat(), feed["format"]) - def item_scraped(self, item, spider): slots = [] for idx, slot in enumerate(self.slots): slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 - if self.storage_batch and slot.itemcount % self.storage_batch == 0: - uri = self._get_uri_of_partial(slot.uri, self.feeds[slot.uri], spider) - slots.append(self._start_new_batch(slot, uri, self.feeds[slot.uri], spider)) - self.feeds[uri] = self.feeds[slot.uri] - self.feeds.pop(slot.uri) + if self.storage_batch_size and slot.itemcount % self.storage_batch_size == 0: + batch_id = slot.batch_id + 1 + uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params']) + uri_params['batch_id'] = batch_id + self.slots.append(self._start_new_batch( + previous_batch_slot=slot, + uri=slot.template_uri % uri_params, + feed=self.feeds[slot.template_uri], + spider=spider, + batch_id=batch_id, + template_uri=slot.template_uri + )) self.slots[idx] = None - self.slots = [slot for slot in self.slots if slot is not None] self.slots.extend(slots) @@ -335,6 +353,17 @@ def _exporter_supported(self, format): return True logger.error("Unknown feed format: %(format)s", {'format': format}) + def _batch_deliveries_supported(self, uri): + """ + If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(time)s or %(batch_id)s + to distinguish different files of partial output + """ + if not self.storage_batch_size: + return True + if '%(time)s' in uri or '%(batch_id)s' in uri: + return True + logger.error('%(time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + def _storage_supported(self, uri): scheme = urlparse(uri).scheme if scheme in self.storages: @@ -364,7 +393,7 @@ def _get_uri_params(self, spider, uri_params): params = {} for k in dir(spider): params[k] = getattr(spider, k) - ts = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-') + ts = datetime.utcnow().isoformat().replace(':', '-') params['time'] = ts uripar_function = load_object(uri_params) if uri_params else lambda x, y: None uripar_function(params, spider) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 690e044c580..7f90a2280de 100644 --- a/scrapy/settings/default_settings.py +++ 
b/scrapy/settings/default_settings.py @@ -146,7 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } -FEED_STORAGE_BATCH = 0 +FEED_STORAGE_BATCH_SIZE = 0 FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', From d11411b402ae68874c6ccc2883836be0b9cf8326 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sat, 21 Mar 2020 10:48:13 +0300 Subject: [PATCH 04/35] fix comments --- scrapy/extensions/feedexport.py | 31 ++++++++++++++++++----------- scrapy/settings/default_settings.py | 2 +- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 38b25bf4a04..ab0a0de37e7 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -25,7 +25,6 @@ from scrapy.utils.misc import create_instance, load_object from scrapy.utils.python import without_none_values - logger = logging.getLogger(__name__) @@ -243,7 +242,7 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch_size = self.settings.getint('FEED_STORAGE_BATCH_SIZE') + self.storage_batch_size = self.settings.get('FEED_STORAGE_BATCH_SIZE', None) for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured @@ -263,7 +262,7 @@ def open_spider(self, spider): feed=feed, spider=spider, batch_id=batch_id, - template_uri=uri + template_uri=uri, )) def close_spider(self, spider): @@ -290,7 +289,7 @@ def close_spider(self, spider): def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, template_uri): """ Redirect the output data stream to a new file. - Execute multiple times if 'FEED_STORAGE_BATCH' setting is greater than zero. + Execute multiple times if 'FEED_STORAGE_BATCH' setting is specified. :param previous_batch_slot: slot of previous batch. We need to call slot.storage.store to get the file properly closed. 
:param uri: uri of the new batch to start @@ -309,9 +308,18 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, tem format=feed['format'], fields_to_export=feed['fields'], encoding=feed['encoding'], - indent=feed['indent'] + indent=feed['indent'], + ) + slot = _FeedSlot( + file=file, + exporter=exporter, + storage=storage, + uri=uri, + format=feed['format'], + store_empty=feed['store_empty'], + batch_id=batch_id, + template_uri=template_uri, ) - slot = _FeedSlot(file, exporter, storage, uri, feed['format'], feed['store_empty'], batch_id, template_uri) if slot.store_empty: slot.start_exporting() return slot @@ -326,13 +334,13 @@ def item_scraped(self, item, spider): batch_id = slot.batch_id + 1 uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params']) uri_params['batch_id'] = batch_id - self.slots.append(self._start_new_batch( + slots.append(self._start_new_batch( previous_batch_slot=slot, uri=slot.template_uri % uri_params, feed=self.feeds[slot.template_uri], spider=spider, batch_id=batch_id, - template_uri=slot.template_uri + template_uri=slot.template_uri, )) self.slots[idx] = None self.slots = [slot for slot in self.slots if slot is not None] @@ -358,11 +366,10 @@ def _batch_deliveries_supported(self, uri): If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(time)s or %(batch_id)s to distinguish different files of partial output """ - if not self.storage_batch_size: - return True - if '%(time)s' in uri or '%(batch_id)s' in uri: + if self.storage_batch_size is None or '%(time)s' in uri or '%(batch_id)s' in uri: return True - logger.error('%(time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + logger.warning('%(time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + return False def _storage_supported(self, uri): scheme = urlparse(uri).scheme diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 7f90a2280de..c3463a505dd 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -146,7 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } -FEED_STORAGE_BATCH_SIZE = 0 +FEED_STORAGE_BATCH_SIZE = None FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', From 39d0d13d3f7bd671d5b29646b209c62e23373fab Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 26 Mar 2020 14:18:35 +0300 Subject: [PATCH 05/35] Add partial deliveries tests --- tests/test_feedexport.py | 191 ++++++++++++++++++++++++++++++++------- 1 file changed, 157 insertions(+), 34 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index c5589e52f2b..1ebe44e1277 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -6,6 +6,7 @@ import string import tempfile import warnings +from abc import ABC, abstractmethod from io import BytesIO from pathlib import Path from string import ascii_letters, digits @@ -21,8 +22,9 @@ import scrapy from scrapy.crawler import CrawlerRunner +from scrapy.exceptions import NotConfigured from scrapy.exporters import CsvItemExporter -from scrapy.extensions.feedexport import (BlockingFeedStorage, FileFeedStorage, FTPFeedStorage, +from scrapy.extensions.feedexport import (BlockingFeedStorage, FeedExporter, FileFeedStorage, FTPFeedStorage, IFeedStorage, S3FeedStorage, StdoutFeedStorage) from scrapy.settings import Settings from scrapy.utils.python 
import to_unicode @@ -76,6 +78,7 @@ class FTPFeedStorageTest(unittest.TestCase): def get_test_spider(self, settings=None): class TestSpider(scrapy.Spider): name = 'test_spider' + crawler = get_crawler(settings_dict=settings) spider = TestSpider.from_crawler(crawler) return spider @@ -129,6 +132,7 @@ class BlockingFeedStorageTest(unittest.TestCase): def get_test_spider(self, settings=None): class TestSpider(scrapy.Spider): name = 'test_spider' + crawler = get_crawler(settings_dict=settings) spider = TestSpider.from_crawler(crawler) return spider @@ -390,56 +394,31 @@ class FromCrawlerFileFeedStorage(FileFeedStorage, FromCrawlerMixin): pass -class FeedExportTest(unittest.TestCase): +class FeedExportTestBase(ABC, unittest.TestCase): + __test__ = False class MyItem(scrapy.Item): foo = scrapy.Field() egg = scrapy.Field() baz = scrapy.Field() + def _random_temp_filename(self, inter_dir=''): + chars = [random.choice(ascii_letters + digits) for _ in range(15)] + filename = ''.join(chars) + return os.path.join(self.temp_dir, inter_dir, filename) + def setUp(self): self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) - def _random_temp_filename(self): - chars = [random.choice(ascii_letters + digits) for _ in range(15)] - filename = ''.join(chars) - return os.path.join(self.temp_dir, filename) - - @defer.inlineCallbacks - def run_and_export(self, spider_cls, settings): - """ Run spider with specified settings; return exported data. """ - - FEEDS = settings.get('FEEDS') or {} - settings['FEEDS'] = { - urljoin('file:', pathname2url(str(file_path))): feed - for file_path, feed in FEEDS.items() - } - - content = {} - try: - with MockServer() as s: - runner = CrawlerRunner(Settings(settings)) - spider_cls.start_urls = [s.url('/')] - yield runner.crawl(spider_cls) - - for file_path, feed in FEEDS.items(): - with open(str(file_path), 'rb') as f: - content[feed['format']] = f.read() - - finally: - for file_path in FEEDS.keys(): - os.remove(str(file_path)) - - defer.returnValue(content) - @defer.inlineCallbacks def exported_data(self, items, settings): """ Return exported data which a spider yielding ``items`` would return. """ + class TestSpider(scrapy.Spider): name = 'testspider' @@ -455,6 +434,7 @@ def exported_no_data(self, settings): """ Return exported data which a spider yielding no ``items`` would return. """ + class TestSpider(scrapy.Spider): name = 'testspider' @@ -464,6 +444,40 @@ def parse(self, response): data = yield self.run_and_export(TestSpider, settings) defer.returnValue(data) + @abstractmethod + def run_and_export(self, spider_cls, settings): + pass + + +class FeedExportTest(FeedExportTestBase): + __test__ = True + + @defer.inlineCallbacks + def run_and_export(self, spider_cls, settings): + """ Run spider with specified settings; return exported data. 
""" + + FEEDS = settings.get('FEEDS') or {} + settings['FEEDS'] = { + urljoin('file:', pathname2url(str(file_path))): feed + for file_path, feed in FEEDS.items() + } + content = {} + try: + with MockServer() as s: + runner = CrawlerRunner(Settings(settings)) + spider_cls.start_urls = [s.url('/')] + yield runner.crawl(spider_cls) + + for file_path, feed in FEEDS.items(): + with open(str(file_path), 'rb') as f: + content[feed['format']] = f.read() + + finally: + for file_path in FEEDS.keys(): + os.remove(str(file_path)) + + defer.returnValue(content) + @defer.inlineCallbacks def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): settings = settings or {} @@ -970,3 +984,112 @@ def test_pathlib_uri(self): } data = yield self.exported_no_data(settings) self.assertEqual(data['csv'], b'') + + +class PartialDeliveriesTest(FeedExportTestBase): + __test__ = True + _file_mark = '_%(time)s_#%(batch_id)s' + + @defer.inlineCallbacks + def run_and_export(self, spider_cls, settings): + """ Run spider with specified settings; return exported data. """ + + FEEDS = settings.get('FEEDS') or {} + settings['FEEDS'] = { + urljoin('file:', file_path): feed + for file_path, feed in FEEDS.items() + } + from collections import defaultdict + content = defaultdict(list) + try: + with MockServer() as s: + runner = CrawlerRunner(Settings(settings)) + spider_cls.start_urls = [s.url('/')] + yield runner.crawl(spider_cls) + + for path, feed in FEEDS.items(): + dir_name = os.path.dirname(path) + for file in sorted(os.listdir(dir_name)): + with open(os.path.join(dir_name, file), 'rb') as f: + data = f.read() + content[feed['format']].append(data) + finally: + pass + defer.returnValue(content) + + @defer.inlineCallbacks + def assertPartialExported(self, items, rows, settings=None): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'jl', self._file_mark): {'format': 'jl'}, + }, + }) + data = yield self.exported_data(items, settings) + data['jl'] = b''.join(data['jl']) + parsed = [json.loads(to_unicode(line)) for line in data['jl'].splitlines()] + + rows = [{k: v for k, v in row.items() if v} for row in rows] + self.assertEqual(rows, parsed) + + @defer.inlineCallbacks + def test_partial_deliveries(self): + items = [ + self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), + self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), + self.MyItem({'foo': 'bar3', 'baz': 'quux3'}), + ] + rows = [ + {'egg': 'spam1', 'foo': 'bar1', 'baz': ''}, + {'egg': 'spam2', 'foo': 'bar2', 'baz': 'quux2'}, + {'foo': 'bar3', 'baz': 'quux3'} + ] + settings = { + 'FEED_STORAGE_BATCH_SIZE': 1 + } + yield self.assertPartialExported(items, rows, settings=settings) + + def test_wrong_path(self): + settings = { + 'FEEDS': { + self._random_temp_filename(): {'format': 'xml'}, + }, + 'FEED_STORAGE_BATCH_SIZE': 1 + } + crawler = get_crawler(settings_dict=settings) + self.assertRaises(NotConfigured, FeedExporter, crawler) + + @defer.inlineCallbacks + def test_export_no_items_not_store_empty(self): + for fmt in ('json', 'jsonlines', 'xml', 'csv'): + settings = { + 'FEEDS': { + os.path.join(self._random_temp_filename(), fmt, self._file_mark): {'format': fmt}, + }, + 'FEED_STORAGE_BATCH_SIZE': 1 + } + data = yield self.exported_no_data(settings) + data[fmt] = b''.join(data[fmt]) + self.assertEqual(data[fmt], b'') + + @defer.inlineCallbacks + def test_export_no_items_store_empty(self): + formats = ( + ('json', b'[]'), + ('jsonlines', b''), + ('xml', b'\n'), + ('csv', b''), + ) + + for fmt, 
expctd in formats: + settings = { + 'FEEDS': { + os.path.join(self._random_temp_filename(), fmt, self._file_mark): {'format': fmt}, + }, + 'FEED_STORE_EMPTY': True, + 'FEED_EXPORT_INDENT': None, + 'FEED_STORAGE_BATCH_SIZE': 1 + } + data = yield self.exported_no_data(settings) + data[fmt] = b''.join(data[fmt]) + self.assertEqual(data[fmt], expctd) From ffa8a533e74478a5c81fbf453f2c65601bb1d244 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sat, 28 Mar 2020 11:40:16 +0300 Subject: [PATCH 06/35] Set batch_id in _get_uri_params --- scrapy/extensions/feedexport.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index ab0a0de37e7..06ea6c5b2eb 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -253,15 +253,12 @@ def __init__(self, crawler): def open_spider(self, spider): for uri, feed in self.feeds.items(): - batch_id = 1 - uri_params = self._get_uri_params(spider, feed['uri_params']) - uri_params['batch_id'] = batch_id + uri_params = self._get_uri_params(spider, feed['uri_params'], None) self.slots.append(self._start_new_batch( previous_batch_slot=None, uri=uri % uri_params, feed=feed, spider=spider, - batch_id=batch_id, template_uri=uri, )) @@ -286,7 +283,7 @@ def close_spider(self, spider): deferred_list.append(d) return defer.DeferredList(deferred_list) if deferred_list else None - def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, template_uri): + def _start_new_batch(self, previous_batch_slot, uri, feed, spider, template_uri): """ Redirect the output data stream to a new file. Execute multiple times if 'FEED_STORAGE_BATCH' setting is specified. @@ -295,12 +292,15 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, tem :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider - :param batch_id: sequential batch id starting at 1 :param template_uri: template uri which contains %(time)s or %(batch_id)s to create new uri """ if previous_batch_slot is not None: + previous_batch_id = previous_batch_slot.batch_id previous_batch_slot.exporter.finish_exporting() previous_batch_slot.storage.store(previous_batch_slot.file) + else: + previous_batch_id = 0 + storage = self._get_storage(uri) file = storage.open(spider) exporter = self._get_exporter( @@ -317,7 +317,7 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider, batch_id, tem uri=uri, format=feed['format'], store_empty=feed['store_empty'], - batch_id=batch_id, + batch_id=previous_batch_id + 1, template_uri=template_uri, ) if slot.store_empty: @@ -331,15 +331,12 @@ def item_scraped(self, item, spider): slot.exporter.export_item(item) slot.itemcount += 1 if self.storage_batch_size and slot.itemcount % self.storage_batch_size == 0: - batch_id = slot.batch_id + 1 - uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params']) - uri_params['batch_id'] = batch_id + uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params'], slot) slots.append(self._start_new_batch( previous_batch_slot=slot, uri=slot.template_uri % uri_params, feed=self.feeds[slot.template_uri], spider=spider, - batch_id=batch_id, template_uri=slot.template_uri, )) self.slots[idx] = None @@ -396,12 +393,12 @@ def _get_exporter(self, file, format, *args, **kwargs): def _get_storage(self, uri): return self._get_instance(self.storages[urlparse(uri).scheme], uri) - def 
_get_uri_params(self, spider, uri_params): + def _get_uri_params(self, spider, uri_params, slot): params = {} for k in dir(spider): params[k] = getattr(spider, k) - ts = datetime.utcnow().isoformat().replace(':', '-') - params['time'] = ts + params['batch_id'] = slot.batch_id + 1 if slot is not None else 1 + params['time'] = datetime.utcnow().isoformat().replace(':', '-') uripar_function = load_object(uri_params) if uri_params else lambda x, y: None uripar_function(params, spider) return params From 963580463b96315eb58319e6d35b4cd52672371a Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 15 Apr 2020 20:14:33 +0300 Subject: [PATCH 07/35] Update tests --- tests/test_feedexport.py | 199 ++++++++++++++++++++++++++++++--------- 1 file changed, 157 insertions(+), 42 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 1ebe44e1277..c6cd867b16f 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -7,6 +7,7 @@ import tempfile import warnings from abc import ABC, abstractmethod +from collections import defaultdict from io import BytesIO from pathlib import Path from string import ascii_letters, digits @@ -444,10 +445,31 @@ def parse(self, response): data = yield self.run_and_export(TestSpider, settings) defer.returnValue(data) + @defer.inlineCallbacks + def assertExported(self, items, header, rows, settings=None, ordered=True): + yield self.assertExportedCsv(items, header, rows, settings, ordered) + yield self.assertExportedJsonLines(items, rows, settings) + yield self.assertExportedXml(items, rows, settings) + yield self.assertExportedPickle(items, rows, settings) + yield self.assertExportedMarshal(items, rows, settings) + yield self.assertExportedMultiple(items, rows, settings) + @abstractmethod def run_and_export(self, spider_cls, settings): pass + def _load_until_eof(self, data, load_func): + result = [] + with tempfile.TemporaryFile() as temp: + temp.write(data) + temp.seek(0) + while True: + try: + result.append(load_func(temp)) + except EOFError: + break + return result + class FeedExportTest(FeedExportTestBase): __test__ = True @@ -478,6 +500,22 @@ def run_and_export(self, spider_cls, settings): defer.returnValue(content) + @defer.inlineCallbacks + def exported_data(self, items, settings): + """ + Return exported data which a spider yielding ``items`` would return. 
+ """ + + class TestSpider(scrapy.Spider): + name = 'testspider' + + def parse(self, response): + for item in items: + yield item + + data = yield self.run_and_export(TestSpider, settings) + defer.returnValue(data) + @defer.inlineCallbacks def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): settings = settings or {} @@ -543,18 +581,6 @@ def assertExportedMultiple(self, items, rows, settings=None): json_rows = json.loads(to_unicode(data['json'])) self.assertEqual(rows, json_rows) - def _load_until_eof(self, data, load_func): - result = [] - with tempfile.TemporaryFile() as temp: - temp.write(data) - temp.seek(0) - while True: - try: - result.append(load_func(temp)) - except EOFError: - break - return result - @defer.inlineCallbacks def assertExportedPickle(self, items, rows, settings=None): settings = settings or {} @@ -583,15 +609,6 @@ def assertExportedMarshal(self, items, rows, settings=None): result = self._load_until_eof(data['marshal'], load_func=marshal.load) self.assertEqual(expected, result) - @defer.inlineCallbacks - def assertExported(self, items, header, rows, settings=None, ordered=True): - yield self.assertExportedCsv(items, header, rows, settings, ordered) - yield self.assertExportedJsonLines(items, rows, settings) - yield self.assertExportedXml(items, rows, settings) - yield self.assertExportedPickle(items, rows, settings) - yield self.assertExportedMarshal(items, rows, settings) - yield self.assertExportedMultiple(items, rows, settings) - @defer.inlineCallbacks def test_export_items(self): # feed exporters use field names from Item @@ -615,7 +632,7 @@ def test_export_no_items_not_store_empty(self): }, } data = yield self.exported_no_data(settings) - self.assertEqual(data[fmt], b'') + self.assertEqual(b'', data[fmt]) @defer.inlineCallbacks def test_export_no_items_store_empty(self): @@ -635,7 +652,7 @@ def test_export_no_items_store_empty(self): 'FEED_EXPORT_INDENT': None, } data = yield self.exported_no_data(settings) - self.assertEqual(data[fmt], expctd) + self.assertEqual(expctd, data[fmt]) @defer.inlineCallbacks def test_export_multiple_item_classes(self): @@ -734,7 +751,8 @@ def test_export_encoding(self): formats = { 'json': u'[{"foo": "Test\\u00d6"}]'.encode('utf-8'), 'jsonlines': u'{"foo": "Test\\u00d6"}\n'.encode('utf-8'), - 'xml': u'\nTest\xd6'.encode('utf-8'), + 'xml': u'\nTest\xd6'.encode( + 'utf-8'), 'csv': u'foo\r\nTest\xd6\r\n'.encode('utf-8'), } @@ -751,7 +769,8 @@ def test_export_encoding(self): formats = { 'json': u'[{"foo": "Test\xd6"}]'.encode('latin-1'), 'jsonlines': u'{"foo": "Test\xd6"}\n'.encode('latin-1'), - 'xml': u'\nTest\xd6'.encode('latin-1'), + 'xml': u'\nTest\xd6'.encode( + 'latin-1'), 'csv': u'foo\r\nTest\xd6\r\n'.encode('latin-1'), } @@ -772,7 +791,8 @@ def test_export_multiple_configs(self): formats = { 'json': u'[\n{"bar": "BAR"}\n]'.encode('utf-8'), - 'xml': u'\n\n \n FOO\n \n'.encode('latin-1'), + 'xml': u'\n\n \n FOO\n \n'.encode( + 'latin-1'), 'csv': u'bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), } @@ -988,7 +1008,7 @@ def test_pathlib_uri(self): class PartialDeliveriesTest(FeedExportTestBase): __test__ = True - _file_mark = '_%(time)s_#%(batch_id)s' + _file_mark = '_%(time)s_#%(batch_id)s_' @defer.inlineCallbacks def run_and_export(self, spider_cls, settings): @@ -999,7 +1019,6 @@ def run_and_export(self, spider_cls, settings): urljoin('file:', file_path): feed for file_path, feed in FEEDS.items() } - from collections import defaultdict content = defaultdict(list) try: with MockServer() as s: @@ -1014,26 
+1033,120 @@ def run_and_export(self, spider_cls, settings): data = f.read() content[feed['format']].append(data) finally: - pass + self.tearDown() defer.returnValue(content) @defer.inlineCallbacks - def assertPartialExported(self, items, rows, settings=None): + def assertExportedJsonLines(self, items, rows, settings=None): settings = settings or {} settings.update({ 'FEEDS': { os.path.join(self._random_temp_filename(), 'jl', self._file_mark): {'format': 'jl'}, }, }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) - data['jl'] = b''.join(data['jl']) - parsed = [json.loads(to_unicode(line)) for line in data['jl'].splitlines()] + for batch in data['jl']: + got_batch = [json.loads(to_unicode(batch_item)) for batch_item in batch.splitlines()] + expected_batch, rows = rows[:batch_size], rows[batch_size:] + self.assertEqual(expected_batch, got_batch) + @defer.inlineCallbacks + def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'csv', self._file_mark): {'format': 'csv'}, + }, + }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + data = yield self.exported_data(items, settings) + for batch in data['csv']: + got_batch = csv.DictReader(to_unicode(batch).splitlines()) + self.assertEqual(list(header), got_batch.fieldnames) + expected_batch, rows = rows[:batch_size], rows[batch_size:] + self.assertEqual(expected_batch, list(got_batch)) + + @defer.inlineCallbacks + def assertExportedXml(self, items, rows, settings=None): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'xml', self._file_mark): {'format': 'xml'}, + }, + }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] rows = [{k: v for k, v in row.items() if v} for row in rows] - self.assertEqual(rows, parsed) + data = yield self.exported_data(items, settings) + for batch in data['xml']: + root = lxml.etree.fromstring(batch) + got_batch = [{e.tag: e.text for e in it} for it in root.findall('item')] + expected_batch, rows = rows[:batch_size], rows[batch_size:] + self.assertEqual(expected_batch, got_batch) + + @defer.inlineCallbacks + def assertExportedMultiple(self, items, rows, settings=None): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'xml', self._file_mark): {'format': 'xml'}, + os.path.join(self._random_temp_filename(), 'json', self._file_mark): {'format': 'json'}, + }, + }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + rows = [{k: v for k, v in row.items() if v} for row in rows] + data = yield self.exported_data(items, settings) + # XML + xml_rows = rows.copy() + for batch in data['xml']: + root = lxml.etree.fromstring(batch) + got_batch = [{e.tag: e.text for e in it} for it in root.findall('item')] + expected_batch, xml_rows = xml_rows[:batch_size], xml_rows[batch_size:] + self.assertEqual(expected_batch, got_batch) + # JSON + json_rows = rows.copy() + for batch in data['json']: + got_batch = json.loads(batch) + expected_batch, json_rows = json_rows[:batch_size], json_rows[batch_size:] + self.assertEqual(expected_batch, got_batch) @defer.inlineCallbacks - def test_partial_deliveries(self): + def assertExportedPickle(self, items, rows, settings=None): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'pickle', 
self._file_mark): {'format': 'pickle'}, + }, + }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + rows = [{k: v for k, v in row.items() if v} for row in rows] + data = yield self.exported_data(items, settings) + import pickle + for batch in data['pickle']: + got_batch = self._load_until_eof(batch, load_func=pickle.load) + expected_batch, rows = rows[:batch_size], rows[batch_size:] + self.assertEqual(expected_batch, got_batch) + + @defer.inlineCallbacks + def assertExportedMarshal(self, items, rows, settings=None): + settings = settings or {} + settings.update({ + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'marshal', self._file_mark): {'format': 'marshal'}, + }, + }) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + rows = [{k: v for k, v in row.items() if v} for row in rows] + data = yield self.exported_data(items, settings) + import marshal + for batch in data['marshal']: + got_batch = self._load_until_eof(batch, load_func=marshal.load) + expected_batch, rows = rows[:batch_size], rows[batch_size:] + self.assertEqual(expected_batch, got_batch) + + @defer.inlineCallbacks + def test_export_items(self): items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), @@ -1042,14 +1155,16 @@ def test_partial_deliveries(self): rows = [ {'egg': 'spam1', 'foo': 'bar1', 'baz': ''}, {'egg': 'spam2', 'foo': 'bar2', 'baz': 'quux2'}, - {'foo': 'bar3', 'baz': 'quux3'} + {'foo': 'bar3', 'baz': 'quux3', 'egg': ''} ] settings = { - 'FEED_STORAGE_BATCH_SIZE': 1 + 'FEED_STORAGE_BATCH_SIZE': 2 } - yield self.assertPartialExported(items, rows, settings=settings) + header = self.MyItem.fields.keys() + yield self.assertExported(items, header, rows, settings=settings) def test_wrong_path(self): + """If path without %(time)s or %(batch_id)s an exception must be raised""" settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, @@ -1069,8 +1184,8 @@ def test_export_no_items_not_store_empty(self): 'FEED_STORAGE_BATCH_SIZE': 1 } data = yield self.exported_no_data(settings) - data[fmt] = b''.join(data[fmt]) - self.assertEqual(data[fmt], b'') + data = dict(data) + self.assertEqual(b'', data[fmt][0]) @defer.inlineCallbacks def test_export_no_items_store_empty(self): @@ -1088,8 +1203,8 @@ def test_export_no_items_store_empty(self): }, 'FEED_STORE_EMPTY': True, 'FEED_EXPORT_INDENT': None, - 'FEED_STORAGE_BATCH_SIZE': 1 + 'FEED_STORAGE_BATCH_SIZE': 1, } data = yield self.exported_no_data(settings) - data[fmt] = b''.join(data[fmt]) - self.assertEqual(data[fmt], expctd) + data = dict(data) + self.assertEqual(expctd, data[fmt][0]) From cac1f3a6adedc32977e0fb1830917a5e7d758bef Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 16 Apr 2020 10:06:56 +0300 Subject: [PATCH 08/35] Update documentation --- docs/topics/feed-exports.rst | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 9e5968a295d..0bba03a7cea 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -220,6 +220,7 @@ These are the settings used for configuring the feed exports: * :setting:`FEED_STORAGE_FTP_ACTIVE` * :setting:`FEED_STORAGE_S3_ACL` * :setting:`FEED_EXPORTERS` + * :setting:`FEED_EXPORT_BATCH_SIZE` .. currentmodule:: scrapy.extensions.feedexport @@ -429,3 +430,37 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter .. _Amazon S3: https://aws.amazon.com/s3/ .. _botocore: https://github.com/boto/botocore .. 
_Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl + +.. setting:: FEED_EXPORT_BATCH_SIZE + +FEED_EXPORT_BATCH_SIZE +---------------------- +Default: ``None`` + +An integer number which represent number of scraped items stored in each output +file. Whenever the number of items exceeds this setting, a new file +creates and output redirects to it. The name of the new file will be selected +based on timestamp when the feed is being created and/or batch sequence number. +Therefore you must specify %(time)s or %(batch_id)s or both in the file path. + +* ``%(time)s`` - gets replaced by a timestamp when the feed is being created +* ``%(batch_id)s`` - gets replaced by sequence number of batch + +For instance:: + + FEED_EXPORT_BATCH_SIZE=100 + +Your request can be like:: + + scrapy crawl spidername -o dirname/%(batch_id)s-filename%(time)s.json + +The result directory tree of above can be like:: + +->projectname +-->dirname +--->1-filename2020-03-28T14-45-08.237134.json +--->2-filename2020-03-28T14-45-09.148903.json +--->3-filename2020-03-28T14-45-10.046092.json + +Where first and second files contain exactly 100 items. The last one contains +<= 100 items. \ No newline at end of file From 5980ae72c6cb177f47fbb41d17837e8d98d50025 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 16 Apr 2020 10:13:39 +0300 Subject: [PATCH 09/35] Some minor fixes and refactoring --- tests/test_feedexport.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 60e19d1df08..e97e50e8e6e 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -428,7 +428,7 @@ def parse(self, response): yield item data = yield self.run_and_export(TestSpider, settings) - defer.returnValue(data) + return data @defer.inlineCallbacks def exported_no_data(self, settings): @@ -443,7 +443,7 @@ def parse(self, response): pass data = yield self.run_and_export(TestSpider, settings) - defer.returnValue(data) + return data @defer.inlineCallbacks def assertExported(self, items, header, rows, settings=None, ordered=True): @@ -735,8 +735,7 @@ def test_export_encoding(self): formats = { 'json': u'[{"foo": "Test\\u00d6"}]'.encode('utf-8'), 'jsonlines': u'{"foo": "Test\\u00d6"}\n'.encode('utf-8'), - 'xml': u'\nTest\xd6'.encode( - 'utf-8'), + 'xml': u'\nTest\xd6'.encode('utf-8'), 'csv': u'foo\r\nTest\xd6\r\n'.encode('utf-8'), } @@ -753,8 +752,7 @@ def test_export_encoding(self): formats = { 'json': u'[{"foo": "Test\xd6"}]'.encode('latin-1'), 'jsonlines': u'{"foo": "Test\xd6"}\n'.encode('latin-1'), - 'xml': u'\nTest\xd6'.encode( - 'latin-1'), + 'xml': u'\nTest\xd6'.encode('latin-1'), 'csv': u'foo\r\nTest\xd6\r\n'.encode('latin-1'), } @@ -775,8 +773,7 @@ def test_export_multiple_configs(self): formats = { 'json': u'[\n{"bar": "BAR"}\n]'.encode('utf-8'), - 'xml': u'\n\n \n FOO\n \n'.encode( - 'latin-1'), + 'xml': u'\n\n \n FOO\n \n'.encode('latin-1'), 'csv': u'bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), } @@ -1148,7 +1145,7 @@ def test_export_items(self): yield self.assertExported(items, header, rows, settings=settings) def test_wrong_path(self): - """If path without %(time)s or %(batch_id)s an exception must be raised""" + """If path is without %(time)s or %(batch_id)s an exception must be raised""" settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, From ec76445dd6753074c1531571f66467eecf22b498 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sat, 18 Apr 2020 09:29:23 +0300 Subject: [PATCH 
10/35] Update tests --- tests/test_feedexport.py | 66 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index e97e50e8e6e..8e03a91c828 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1128,6 +1128,7 @@ def assertExportedMarshal(self, items, rows, settings=None): @defer.inlineCallbacks def test_export_items(self): + """ Test partial deliveries in all supported formats """ items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), @@ -1145,7 +1146,7 @@ def test_export_items(self): yield self.assertExported(items, header, rows, settings=settings) def test_wrong_path(self): - """If path is without %(time)s or %(batch_id)s an exception must be raised""" + """ If path is without %(time)s or %(batch_id)s an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, @@ -1189,3 +1190,66 @@ def test_export_no_items_store_empty(self): data = yield self.exported_no_data(settings) data = dict(data) self.assertEqual(expctd, data[fmt][0]) + + @defer.inlineCallbacks + def test_export_multiple_configs(self): + items = [dict({'foo': u'FOO', 'bar': u'BAR'}), dict({'foo': u'FOO1', 'bar': u'BAR1'})] + + formats = { + 'json': [u'[\n{"bar": "BAR"}\n]'.encode('utf-8'), + u'[\n{"bar": "BAR1"}\n]'.encode('utf-8')], + 'xml': [u'\n\n \n FOO\n \n'.encode('latin-1'), + u'\n\n \n FOO1\n \n'.encode('latin-1')], + 'csv': [u'bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), + u'bar,foo\r\nBAR1,FOO1\r\n'.encode('utf-8')], + } + + settings = { + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'json', self._file_mark): { + 'format': 'json', + 'indent': 0, + 'fields': ['bar'], + 'encoding': 'utf-8', + }, + os.path.join(self._random_temp_filename(), 'xml', self._file_mark): { + 'format': 'xml', + 'indent': 2, + 'fields': ['foo'], + 'encoding': 'latin-1', + }, + os.path.join(self._random_temp_filename(), 'csv', self._file_mark): { + 'format': 'csv', + 'indent': None, + 'fields': ['bar', 'foo'], + 'encoding': 'utf-8', + }, + }, + 'FEED_STORAGE_BATCH_SIZE': 1, + } + data = yield self.exported_data(items, settings) + for fmt, expected in formats.items(): + for expected_batch, got_batch in zip(expected, data[fmt]): + self.assertEqual(expected_batch, got_batch) + + @defer.inlineCallbacks + def test_batch_path_differ(self): + """ + Test that the name of all batch files differ from each other. + So %(time)s replaced with the current date. 
+ """ + items = [ + self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), + self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), + self.MyItem({'foo': 'bar3', 'baz': 'quux3'}), + ] + settings = { + 'FEEDS': { + os.path.join(self._random_temp_filename(), '%(time)s'): { + 'format': 'json', + }, + }, + 'FEED_STORAGE_BATCH_SIZE': 1, + } + data = yield self.exported_data(items, settings) + self.assertEqual(len(items) + 1, len(data['json'])) From f0f1be76d1e6cef65ac9a01d13c5d5060a03f648 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Mon, 27 Apr 2020 09:56:57 +0300 Subject: [PATCH 11/35] Using time_id instead of time as a timestamp --- docs/topics/feed-exports.rst | 6 +++--- scrapy/extensions/feedexport.py | 11 ++++++----- tests/test_feedexport.py | 8 ++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 0bba03a7cea..2017be78fad 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -441,9 +441,9 @@ An integer number which represent number of scraped items stored in each output file. Whenever the number of items exceeds this setting, a new file creates and output redirects to it. The name of the new file will be selected based on timestamp when the feed is being created and/or batch sequence number. -Therefore you must specify %(time)s or %(batch_id)s or both in the file path. +Therefore you must specify %(time_id)s or %(batch_id)s or both in the file path. -* ``%(time)s`` - gets replaced by a timestamp when the feed is being created +* ``%(time_id)s`` - gets replaced by a timestamp when the feed is being created * ``%(batch_id)s`` - gets replaced by sequence number of batch For instance:: @@ -452,7 +452,7 @@ For instance:: Your request can be like:: - scrapy crawl spidername -o dirname/%(batch_id)s-filename%(time)s.json + scrapy crawl spidername -o dirname/%(batch_id)s-filename%(time_id)s.json The result directory tree of above can be like:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 06ea6c5b2eb..72baa62697e 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -292,7 +292,7 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider, template_uri) :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider - :param template_uri: template uri which contains %(time)s or %(batch_id)s to create new uri + :param template_uri: template uri which contains %(time_id)s or %(batch_id)s to create new uri """ if previous_batch_slot is not None: previous_batch_id = previous_batch_slot.batch_id @@ -360,12 +360,12 @@ def _exporter_supported(self, format): def _batch_deliveries_supported(self, uri): """ - If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(time)s or %(batch_id)s + If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(time_id)s or %(batch_id)s to distinguish different files of partial output """ - if self.storage_batch_size is None or '%(time)s' in uri or '%(batch_id)s' in uri: + if self.storage_batch_size is None or '%(time_id)s' in uri or '%(batch_id)s' in uri: return True - logger.warning('%(time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + logger.warning('%(time_id)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') return False def _storage_supported(self, uri): @@ -397,8 +397,9 @@ def _get_uri_params(self, spider, uri_params, slot): params 
= {} for k in dir(spider): params[k] = getattr(spider, k) + params['time'] = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-') + params['time_id'] = datetime.utcnow().isoformat().replace(':', '-') params['batch_id'] = slot.batch_id + 1 if slot is not None else 1 - params['time'] = datetime.utcnow().isoformat().replace(':', '-') uripar_function = load_object(uri_params) if uri_params else lambda x, y: None uripar_function(params, spider) return params diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 8e03a91c828..da759917ad9 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -989,7 +989,7 @@ def test_pathlib_uri(self): class PartialDeliveriesTest(FeedExportTestBase): __test__ = True - _file_mark = '_%(time)s_#%(batch_id)s_' + _file_mark = '_%(time_id)s_#%(batch_id)s_' @defer.inlineCallbacks def run_and_export(self, spider_cls, settings): @@ -1146,7 +1146,7 @@ def test_export_items(self): yield self.assertExported(items, header, rows, settings=settings) def test_wrong_path(self): - """ If path is without %(time)s or %(batch_id)s an exception must be raised """ + """ If path is without %(time_id)s or %(batch_id)s an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, @@ -1236,7 +1236,7 @@ def test_export_multiple_configs(self): def test_batch_path_differ(self): """ Test that the name of all batch files differ from each other. - So %(time)s replaced with the current date. + So %(time_id)s replaced with the current date. """ items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), @@ -1245,7 +1245,7 @@ def test_batch_path_differ(self): ] settings = { 'FEEDS': { - os.path.join(self._random_temp_filename(), '%(time)s'): { + os.path.join(self._random_temp_filename(), '%(time_id)s'): { 'format': 'json', }, }, From 2eee6c81017e08bb492da560bc73c03f4f375fcc Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Mon, 27 Apr 2020 09:58:14 +0300 Subject: [PATCH 12/35] Documentation spelling fix --- docs/topics/feed-exports.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 2017be78fad..6c463fc2768 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -438,10 +438,10 @@ FEED_EXPORT_BATCH_SIZE Default: ``None`` An integer number which represent number of scraped items stored in each output -file. Whenever the number of items exceeds this setting, a new file -creates and output redirects to it. The name of the new file will be selected +file. Whenever the number of items exceeds this setting, a new file is +created and output redirects to it. The name of the new file will be selected based on timestamp when the feed is being created and/or batch sequence number. -Therefore you must specify %(time_id)s or %(batch_id)s or both in the file path. +Therefore you must specify %(time_id)s or %(batch_id)s or both in FEED_URI. 
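A minimal project-level sketch of the configuration described above, assuming the setting name actually read by the code in these patches (``FEED_STORAGE_BATCH_SIZE``) and an illustrative output path::

    # settings.py -- store the current file and open a new one every
    # 100 exported items; %(batch_id)s and %(time_id)s are interpolated
    # for each batch by _get_uri_params().
    FEED_STORAGE_BATCH_SIZE = 100
    FEEDS = {
        'output/%(batch_id)s-items-%(time_id)s.json': {'format': 'json'},
    }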
* ``%(time_id)s`` - gets replaced by a timestamp when the feed is being created * ``%(batch_id)s`` - gets replaced by sequence number of batch From 204737042ac6672eee73c975d0bd6735893d684c Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Mon, 27 Apr 2020 12:52:18 +0300 Subject: [PATCH 13/35] Extract the slot closing functionality to the function; minor changes --- scrapy/extensions/feedexport.py | 56 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 72baa62697e..fe6061c339a 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -255,7 +255,7 @@ def open_spider(self, spider): for uri, feed in self.feeds.items(): uri_params = self._get_uri_params(spider, feed['uri_params'], None) self.slots.append(self._start_new_batch( - previous_batch_slot=None, + batch_id=1, uri=uri % uri_params, feed=feed, spider=spider, @@ -265,42 +265,38 @@ def open_spider(self, spider): def close_spider(self, spider): deferred_list = [] for slot in self.slots: - if not slot.itemcount and not slot.store_empty: - # We need to call slot.storage.store nonetheless to get the file - # properly closed. - return defer.maybeDeferred(slot.storage.store, slot.file) - slot.finish_exporting() - logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s" - log_args = {'format': slot.format, - 'itemcount': slot.itemcount, - 'uri': slot.uri} - d = defer.maybeDeferred(slot.storage.store, slot.file) - d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args, - extra={'spider': spider})) - d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args, - exc_info=failure_to_exc_info(f), - extra={'spider': spider})) + d = self._close_slot(slot, spider) deferred_list.append(d) return defer.DeferredList(deferred_list) if deferred_list else None - def _start_new_batch(self, previous_batch_slot, uri, feed, spider, template_uri): + def _close_slot(self, slot, spider): + if not slot.itemcount and not slot.store_empty: + # We need to call slot.storage.store nonetheless to get the file + # properly closed. + return defer.maybeDeferred(slot.storage.store, slot.file) + slot.finish_exporting() + logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s" + log_args = {'format': slot.format, + 'itemcount': slot.itemcount, + 'uri': slot.uri} + d = defer.maybeDeferred(slot.storage.store, slot.file) + d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args, + extra={'spider': spider})) + d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args, + exc_info=failure_to_exc_info(f), + extra={'spider': spider})) + return d + + def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): """ Redirect the output data stream to a new file. Execute multiple times if 'FEED_STORAGE_BATCH' setting is specified. - :param previous_batch_slot: slot of previous batch. We need to call slot.storage.store - to get the file properly closed. 
+ :param batch_id: sequence number of current batch :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider :param template_uri: template uri which contains %(time_id)s or %(batch_id)s to create new uri """ - if previous_batch_slot is not None: - previous_batch_id = previous_batch_slot.batch_id - previous_batch_slot.exporter.finish_exporting() - previous_batch_slot.storage.store(previous_batch_slot.file) - else: - previous_batch_id = 0 - storage = self._get_storage(uri) file = storage.open(spider) exporter = self._get_exporter( @@ -317,7 +313,7 @@ def _start_new_batch(self, previous_batch_slot, uri, feed, spider, template_uri) uri=uri, format=feed['format'], store_empty=feed['store_empty'], - batch_id=previous_batch_id + 1, + batch_id=batch_id, template_uri=template_uri, ) if slot.store_empty: @@ -330,10 +326,12 @@ def item_scraped(self, item, spider): slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 - if self.storage_batch_size and slot.itemcount % self.storage_batch_size == 0: + # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_SIZE and close the old one + if self.storage_batch_size and slot.itemcount == self.storage_batch_size: uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params'], slot) + self._close_slot(slot, spider) slots.append(self._start_new_batch( - previous_batch_slot=slot, + batch_id=slot.batch_id + 1, uri=slot.template_uri % uri_params, feed=self.feeds[slot.template_uri], spider=spider, From 3f9874fac9f93c0956afa5975d7b2bbb21816894 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 1 May 2020 11:52:16 +0300 Subject: [PATCH 14/35] Add test s3 export --- tests/test_feedexport.py | 70 ++++++++++++++++++++++++++++++++++++++++ tox.ini | 1 + 2 files changed, 71 insertions(+) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index da759917ad9..9fc39c3a6eb 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1253,3 +1253,73 @@ def test_batch_path_differ(self): } data = yield self.exported_data(items, settings) self.assertEqual(len(items) + 1, len(data['json'])) + + @defer.inlineCallbacks + def test_s3_export(self): + """ + Test export of items into s3 bucket. 
+ S3_TEST_BUCKET_NAME, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY must be specified in tox.ini + to perform this test: + [testenv] + setenv = + AWS_SECRET_ACCESS_KEY = ABCD + AWS_ACCESS_KEY_ID = ABCD + S3_TEST_BUCKET_NAME = ABCD + """ + try: + import boto3 + except ImportError: + raise unittest.SkipTest("S3FeedStorage requires boto3") + + assert_aws_environ() + s3_test_bucket_name = os.environ.get('S3_TEST_BUCKET_NAME') + access_key = os.environ.get('AWS_ACCESS_KEY_ID') + secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY') + if not s3_test_bucket_name: + raise unittest.SkipTest("No S3 BUCKET available for testing") + + chars = [random.choice(ascii_letters + digits) for _ in range(15)] + filename = ''.join(chars) + prefix = 'tmp/{filename}'.format(filename=filename) + s3_test_file_uri = 's3://{bucket_name}/{prefix}/%(time_id)s.json'.format( + bucket_name=s3_test_bucket_name, prefix=prefix + ) + storage = S3FeedStorage(s3_test_bucket_name, access_key, secret_key) + settings = { + 'FEEDS': { + s3_test_file_uri: { + 'format': 'json', + }, + }, + 'FEED_STORAGE_BATCH_SIZE': 1, + } + items = [ + self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), + self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), + self.MyItem({'foo': 'bar3', 'baz': 'quux3'}), + ] + verifyObject(IFeedStorage, storage) + + class TestSpider(scrapy.Spider): + name = 'testspider' + + def parse(self, response): + for item in items: + yield item + + s3 = boto3.resource('s3') + my_bucket = s3.Bucket(s3_test_bucket_name) + batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + + with MockServer() as s: + runner = CrawlerRunner(Settings(settings)) + TestSpider.start_urls = [s.url('/')] + yield runner.crawl(TestSpider) + + for file_uri in my_bucket.objects.filter(Prefix=prefix): + content = get_s3_content_and_delete(s3_test_bucket_name, file_uri.key) + if not content and not items: + break + content = json.loads(content.decode('utf-8')) + expected_batch, items = items[:batch_size], items[batch_size:] + self.assertEqual(expected_batch, content) diff --git a/tox.ini b/tox.ini index cd118c921d0..c77fae1f018 100644 --- a/tox.ini +++ b/tox.ini @@ -14,6 +14,7 @@ deps = # Extras botocore>=1.3.23 Pillow>=3.4.2 + boto3>=1.13.0 passenv = S3_TEST_FILE_URI AWS_ACCESS_KEY_ID From dad2ea75222d6240c569440d3221f5fc00925682 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sat, 2 May 2020 01:21:03 +0300 Subject: [PATCH 15/35] Change time_id to batch_time --- docs/topics/feed-exports.rst | 6 +++--- scrapy/extensions/feedexport.py | 10 +++++----- tests/test_feedexport.py | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 6c463fc2768..2106b41f582 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -441,9 +441,9 @@ An integer number which represent number of scraped items stored in each output file. Whenever the number of items exceeds this setting, a new file is created and output redirects to it. The name of the new file will be selected based on timestamp when the feed is being created and/or batch sequence number. -Therefore you must specify %(time_id)s or %(batch_id)s or both in FEED_URI. +Therefore you must specify %(batch_time)s or %(batch_id)s or both in FEED_URI. 
-* ``%(time_id)s`` - gets replaced by a timestamp when the feed is being created +* ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created * ``%(batch_id)s`` - gets replaced by sequence number of batch For instance:: @@ -452,7 +452,7 @@ For instance:: Your request can be like:: - scrapy crawl spidername -o dirname/%(batch_id)s-filename%(time_id)s.json + scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json The result directory tree of above can be like:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index fe6061c339a..a262f5d1825 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -295,7 +295,7 @@ def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider - :param template_uri: template uri which contains %(time_id)s or %(batch_id)s to create new uri + :param template_uri: template uri which contains %(batch_time)s or %(batch_id)s to create new uri """ storage = self._get_storage(uri) file = storage.open(spider) @@ -358,12 +358,12 @@ def _exporter_supported(self, format): def _batch_deliveries_supported(self, uri): """ - If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(time_id)s or %(batch_id)s + If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ - if self.storage_batch_size is None or '%(time_id)s' in uri or '%(batch_id)s' in uri: + if self.storage_batch_size is None or '%(batch_time)s' in uri or '%(batch_id)s' in uri: return True - logger.warning('%(time_id)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + logger.warning('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') return False def _storage_supported(self, uri): @@ -396,7 +396,7 @@ def _get_uri_params(self, spider, uri_params, slot): for k in dir(spider): params[k] = getattr(spider, k) params['time'] = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-') - params['time_id'] = datetime.utcnow().isoformat().replace(':', '-') + params['batch_time'] = datetime.utcnow().isoformat().replace(':', '-') params['batch_id'] = slot.batch_id + 1 if slot is not None else 1 uripar_function = load_object(uri_params) if uri_params else lambda x, y: None uripar_function(params, spider) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 9fc39c3a6eb..2217bb4edeb 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -989,7 +989,7 @@ def test_pathlib_uri(self): class PartialDeliveriesTest(FeedExportTestBase): __test__ = True - _file_mark = '_%(time_id)s_#%(batch_id)s_' + _file_mark = '_%(batch_time)s_#%(batch_id)s_' @defer.inlineCallbacks def run_and_export(self, spider_cls, settings): @@ -1146,7 +1146,7 @@ def test_export_items(self): yield self.assertExported(items, header, rows, settings=settings) def test_wrong_path(self): - """ If path is without %(time_id)s or %(batch_id)s an exception must be raised """ + """ If path is without %(batch_time)s or %(batch_id)s an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, @@ -1236,7 +1236,7 @@ def test_export_multiple_configs(self): def test_batch_path_differ(self): """ Test that the name of all batch files differ from each other. 
- So %(time_id)s replaced with the current date. + So %(batch_time)s replaced with the current date. """ items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), @@ -1245,7 +1245,7 @@ def test_batch_path_differ(self): ] settings = { 'FEEDS': { - os.path.join(self._random_temp_filename(), '%(time_id)s'): { + os.path.join(self._random_temp_filename(), '%(batch_time)s'): { 'format': 'json', }, }, @@ -1281,7 +1281,7 @@ def test_s3_export(self): chars = [random.choice(ascii_letters + digits) for _ in range(15)] filename = ''.join(chars) prefix = 'tmp/{filename}'.format(filename=filename) - s3_test_file_uri = 's3://{bucket_name}/{prefix}/%(time_id)s.json'.format( + s3_test_file_uri = 's3://{bucket_name}/{prefix}/%(batch_time)s.json'.format( bucket_name=s3_test_bucket_name, prefix=prefix ) storage = S3FeedStorage(s3_test_bucket_name, access_key, secret_key) From 2327ecead085a41d1a71a70a12eb988bbf982268 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 13 May 2020 22:50:04 +0300 Subject: [PATCH 16/35] Rename FEED_STORAGE_BATCH_SIZE to FEED_STORAGE_BATCH_ITEM_COUNT --- docs/topics/feed-exports.rst | 8 ++++---- scrapy/extensions/feedexport.py | 10 +++++----- scrapy/settings/default_settings.py | 2 +- tests/test_feedexport.py | 28 ++++++++++++++-------------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 2106b41f582..917240d4d37 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -220,7 +220,7 @@ These are the settings used for configuring the feed exports: * :setting:`FEED_STORAGE_FTP_ACTIVE` * :setting:`FEED_STORAGE_S3_ACL` * :setting:`FEED_EXPORTERS` - * :setting:`FEED_EXPORT_BATCH_SIZE` + * :setting:`FEED_STORAGE_BATCH_ITEM_COUNT` .. currentmodule:: scrapy.extensions.feedexport @@ -431,9 +431,9 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter .. _botocore: https://github.com/boto/botocore .. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl -.. setting:: FEED_EXPORT_BATCH_SIZE +.. setting:: FEED_STORAGE_BATCH_ITEM_COUNT -FEED_EXPORT_BATCH_SIZE +FEED_STORAGE_BATCH_ITEM_COUNT ---------------------- Default: ``None`` @@ -448,7 +448,7 @@ Therefore you must specify %(batch_time)s or %(batch_id)s or both in FEED_URI. For instance:: - FEED_EXPORT_BATCH_SIZE=100 + FEED_STORAGE_BATCH_ITEM_COUNT=100 Your request can be like:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index a262f5d1825..5bc946634d1 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -242,7 +242,7 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch_size = self.settings.get('FEED_STORAGE_BATCH_SIZE', None) + self.storage_batch_size = self.settings.get('FEED_STORAGE_BATCH_ITEM_COUNT', None) for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured @@ -290,7 +290,7 @@ def _close_slot(self, slot, spider): def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): """ Redirect the output data stream to a new file. - Execute multiple times if 'FEED_STORAGE_BATCH' setting is specified. + Execute multiple times if 'FEED_STORAGE_BATCH_ITEM_COUNT' setting is specified. 
:param batch_id: sequence number of current batch :param uri: uri of the new batch to start :param feed: dict with parameters of feed @@ -326,7 +326,7 @@ def item_scraped(self, item, spider): slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 - # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_SIZE and close the old one + # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one if self.storage_batch_size and slot.itemcount == self.storage_batch_size: uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params'], slot) self._close_slot(slot, spider) @@ -358,12 +358,12 @@ def _exporter_supported(self, format): def _batch_deliveries_supported(self, uri): """ - If FEED_STORAGE_BATCH_SIZE setting is specified uri has to contain %(batch_time)s or %(batch_id)s + If FEED_STORAGE_BATCH_ITEM_COUNT setting is specified uri has to contain %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ if self.storage_batch_size is None or '%(batch_time)s' in uri or '%(batch_id)s' in uri: return True - logger.warning('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_SIZE setting is specified') + logger.warning('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified') return False def _storage_supported(self, uri): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index c3463a505dd..5a7dc533e50 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -146,7 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } -FEED_STORAGE_BATCH_SIZE = None +FEED_STORAGE_BATCH_ITEM_COUNT = None FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 2217bb4edeb..1a21eeba9be 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1025,7 +1025,7 @@ def assertExportedJsonLines(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'jl', self._file_mark): {'format': 'jl'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['jl']: @@ -1041,7 +1041,7 @@ def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): os.path.join(self._random_temp_filename(), 'csv', self._file_mark): {'format': 'csv'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] data = yield self.exported_data(items, settings) for batch in data['csv']: got_batch = csv.DictReader(to_unicode(batch).splitlines()) @@ -1057,7 +1057,7 @@ def assertExportedXml(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'xml', self._file_mark): {'format': 'xml'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['xml']: @@ -1075,7 +1075,7 @@ def assertExportedMultiple(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'json', self._file_mark): {'format': 'json'}, }, }) - batch_size = 
settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) # XML @@ -1100,7 +1100,7 @@ def assertExportedPickle(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'pickle', self._file_mark): {'format': 'pickle'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import pickle @@ -1117,7 +1117,7 @@ def assertExportedMarshal(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'marshal', self._file_mark): {'format': 'marshal'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import marshal @@ -1140,7 +1140,7 @@ def test_export_items(self): {'foo': 'bar3', 'baz': 'quux3', 'egg': ''} ] settings = { - 'FEED_STORAGE_BATCH_SIZE': 2 + 'FEED_STORAGE_BATCH_ITEM_COUNT': 2 } header = self.MyItem.fields.keys() yield self.assertExported(items, header, rows, settings=settings) @@ -1151,7 +1151,7 @@ def test_wrong_path(self): 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, }, - 'FEED_STORAGE_BATCH_SIZE': 1 + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1 } crawler = get_crawler(settings_dict=settings) self.assertRaises(NotConfigured, FeedExporter, crawler) @@ -1163,7 +1163,7 @@ def test_export_no_items_not_store_empty(self): 'FEEDS': { os.path.join(self._random_temp_filename(), fmt, self._file_mark): {'format': fmt}, }, - 'FEED_STORAGE_BATCH_SIZE': 1 + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1 } data = yield self.exported_no_data(settings) data = dict(data) @@ -1185,7 +1185,7 @@ def test_export_no_items_store_empty(self): }, 'FEED_STORE_EMPTY': True, 'FEED_EXPORT_INDENT': None, - 'FEED_STORAGE_BATCH_SIZE': 1, + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, } data = yield self.exported_no_data(settings) data = dict(data) @@ -1225,7 +1225,7 @@ def test_export_multiple_configs(self): 'encoding': 'utf-8', }, }, - 'FEED_STORAGE_BATCH_SIZE': 1, + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, } data = yield self.exported_data(items, settings) for fmt, expected in formats.items(): @@ -1249,7 +1249,7 @@ def test_batch_path_differ(self): 'format': 'json', }, }, - 'FEED_STORAGE_BATCH_SIZE': 1, + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, } data = yield self.exported_data(items, settings) self.assertEqual(len(items) + 1, len(data['json'])) @@ -1291,7 +1291,7 @@ def test_s3_export(self): 'format': 'json', }, }, - 'FEED_STORAGE_BATCH_SIZE': 1, + 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, } items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), @@ -1309,7 +1309,7 @@ def parse(self, response): s3 = boto3.resource('s3') my_bucket = s3.Bucket(s3_test_bucket_name) - batch_size = settings['FEED_STORAGE_BATCH_SIZE'] + batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] with MockServer() as s: runner = CrawlerRunner(Settings(settings)) From 8662d3587df74841d4ea640c0432446569e59262 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 13 May 2020 23:41:01 +0300 Subject: [PATCH 17/35] Documentation and code refactoring --- docs/topics/feed-exports.rst | 21 ++++++++++++--------- scrapy/extensions/feedexport.py | 7 ++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/topics/feed-exports.rst 
b/docs/topics/feed-exports.rst index 917240d4d37..0f15044b32b 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -437,24 +437,27 @@ FEED_STORAGE_BATCH_ITEM_COUNT ---------------------- Default: ``None`` -An integer number which represent number of scraped items stored in each output +An integer number that represents the number of scraped items stored in each output file. Whenever the number of items exceeds this setting, a new file is -created and output redirects to it. The name of the new file will be selected -based on timestamp when the feed is being created and/or batch sequence number. -Therefore you must specify %(batch_time)s or %(batch_id)s or both in FEED_URI. +created and the output is redirected to it. The name of the new file will be selected +based on the timestamp when the feed is being created and/or on the batch sequence number. +Therefore you must specify %(batch_time)s or %(batch_id)s or both in :setting:`FEED_URI`. * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created -* ``%(batch_id)s`` - gets replaced by sequence number of batch +(e.g. `2020-03-28T14-45-08.237134`) -For instance:: +* ``%(batch_id)s`` - gets replaced by the batch sequence number of batch +(e.g. `2` for the second file) + +For instance, if your settings include:: FEED_STORAGE_BATCH_ITEM_COUNT=100 -Your request can be like:: +And your :command:`crawl` command line is:: scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json -The result directory tree of above can be like:: +The resulting directory tree of above can be like:: ->projectname -->dirname @@ -462,5 +465,5 @@ The result directory tree of above can be like:: --->2-filename2020-03-28T14-45-09.148903.json --->3-filename2020-03-28T14-45-10.046092.json -Where first and second files contain exactly 100 items. The last one contains +Where the first and second files contain exactly 100 items. The last one contains <= 100 items. \ No newline at end of file diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 5bc946634d1..4c9362f3a95 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -25,6 +25,7 @@ from scrapy.utils.misc import create_instance, load_object from scrapy.utils.python import without_none_values + logger = logging.getLogger(__name__) @@ -337,9 +338,9 @@ def item_scraped(self, item, spider): spider=spider, template_uri=slot.template_uri, )) - self.slots[idx] = None - self.slots = [slot for slot in self.slots if slot is not None] - self.slots.extend(slots) + else: + slots.append(slot) + self.slots = slots def _load_components(self, setting_prefix): conf = without_none_values(self.settings.getwithbase(setting_prefix)) From 69c005f013eb0dc000611853e66371fad17dea9d Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 14 May 2020 10:35:56 +0300 Subject: [PATCH 18/35] Documentation indent fix --- docs/topics/feed-exports.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 42c4e2267dd..dfeea5b7f73 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -444,10 +444,10 @@ based on the timestamp when the feed is being created and/or on the batch sequen Therefore you must specify %(batch_time)s or %(batch_id)s or both in :setting:`FEED_URI`. * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created -(e.g. `2020-03-28T14-45-08.237134`) + (e.g. 
`2020-03-28T14-45-08.237134`) * ``%(batch_id)s`` - gets replaced by the batch sequence number of batch -(e.g. `2` for the second file) + (e.g. `2` for the second file) For instance, if your settings include:: From 1cdcf8b08b8f1e68c5b107b6ae39b2da1aedd245 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 15 May 2020 19:46:36 +0300 Subject: [PATCH 19/35] Minor fixes --- docs/topics/feed-exports.rst | 19 ++++++++++--------- scrapy/extensions/feedexport.py | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index dfeea5b7f73..638733b6af6 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -437,17 +437,18 @@ FEED_STORAGE_BATCH_ITEM_COUNT ----------------------------- Default: ``None`` -An integer number that represents the number of scraped items stored in each output -file. Whenever the number of items exceeds this setting, a new file is -created and the output is redirected to it. The name of the new file will be selected -based on the timestamp when the feed is being created and/or on the batch sequence number. -Therefore you must specify %(batch_time)s or %(batch_id)s or both in :setting:`FEED_URI`. +If assigned an integer number higher than ``0``, Scrapy generates multiple output files +storing up to the specified number of items in each output file. + +When generating multiple output files, you must use at least one of the following +placeholders in :setting:`FEED_URI` to indicate how the different output file names are +generated: * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created - (e.g. `2020-03-28T14-45-08.237134`) + (e.g. ``2020-03-28T14-45-08.237134``) * ``%(batch_id)s`` - gets replaced by the batch sequence number of batch - (e.g. `2` for the second file) + (e.g. ``2`` for the second file) For instance, if your settings include:: @@ -457,7 +458,7 @@ And your :command:`crawl` command line is:: scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json -The resulting directory tree of above can be like:: +The command line above can generate a directory tree like:: ->projectname -->dirname @@ -466,4 +467,4 @@ The resulting directory tree of above can be like:: --->3-filename2020-03-28T14-45-10.046092.json Where the first and second files contain exactly 100 items. The last one contains -<= 100 items. \ No newline at end of file +100 items or fever. 
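As a standalone aside to the documentation changes above (not part of the patch itself), the placeholder expansion boils down to ordinary ``%``-formatting of the feed URI. The ``batch_time``/``batch_id`` keys and the ``':'`` to ``'-'`` replacement mirror what ``_get_uri_params()`` produces in these diffs; the template string is only an example::

    from datetime import datetime

    # Example URI template using the placeholders documented above.
    uri_template = 'dirname/%(batch_id)s-filename%(batch_time)s.json'

    params = {
        # built from datetime.utcnow().isoformat() with ':' replaced by '-'
        # so the timestamp is safe to use in file names
        'batch_time': datetime.utcnow().isoformat().replace(':', '-'),
        # starts at 1 and grows by one for every new batch file
        'batch_id': 1,
    }

    print(uri_template % params)
    # e.g. dirname/1-filename2020-03-28T14-45-08.237134.json
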
diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 4c9362f3a95..3d691c5801f 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -180,7 +180,7 @@ def _store_in_thread(self, file): class _FeedSlot: - def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, template_uri): + def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template): self.file = file self.exporter = exporter self.storage = storage @@ -188,7 +188,7 @@ def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, self.batch_id = batch_id self.format = format self.store_empty = store_empty - self.template_uri = template_uri + self.uri_template = uri_template self.uri = uri # flags self.itemcount = 0 @@ -260,7 +260,7 @@ def open_spider(self, spider): uri=uri % uri_params, feed=feed, spider=spider, - template_uri=uri, + uri_template=uri, )) def close_spider(self, spider): @@ -288,7 +288,7 @@ def _close_slot(self, slot, spider): extra={'spider': spider})) return d - def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): + def _start_new_batch(self, batch_id, uri, feed, spider, uri_template): """ Redirect the output data stream to a new file. Execute multiple times if 'FEED_STORAGE_BATCH_ITEM_COUNT' setting is specified. @@ -296,7 +296,7 @@ def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider - :param template_uri: template uri which contains %(batch_time)s or %(batch_id)s to create new uri + :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)s to create new uri """ storage = self._get_storage(uri) file = storage.open(spider) @@ -315,7 +315,7 @@ def _start_new_batch(self, batch_id, uri, feed, spider, template_uri): format=feed['format'], store_empty=feed['store_empty'], batch_id=batch_id, - template_uri=template_uri, + uri_template=uri_template, ) if slot.store_empty: slot.start_exporting() @@ -329,14 +329,14 @@ def item_scraped(self, item, spider): slot.itemcount += 1 # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one if self.storage_batch_size and slot.itemcount == self.storage_batch_size: - uri_params = self._get_uri_params(spider, self.feeds[slot.template_uri]['uri_params'], slot) + uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot) self._close_slot(slot, spider) slots.append(self._start_new_batch( batch_id=slot.batch_id + 1, - uri=slot.template_uri % uri_params, - feed=self.feeds[slot.template_uri], + uri=slot.uri_template % uri_params, + feed=self.feeds[slot.uri_template], spider=spider, - template_uri=slot.template_uri, + uri_template=slot.uri_template, )) else: slots.append(slot) From 10ae1a284f759b541d086e3d1a13cda96b6e2040 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 15 May 2020 22:50:54 +0300 Subject: [PATCH 20/35] Minor fixes --- docs/topics/feed-exports.rst | 2 +- scrapy/extensions/feedexport.py | 10 +++++----- tests/test_feedexport.py | 2 +- tox.ini | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 638733b6af6..6f7db20c420 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -467,4 +467,4 @@ The command line above can generate a directory tree like:: 
--->3-filename2020-03-28T14-45-10.046092.json Where the first and second files contain exactly 100 items. The last one contains -100 items or fever. +100 items or fewer. diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 3d691c5801f..cc26ae173ec 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -243,11 +243,11 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch_size = self.settings.get('FEED_STORAGE_BATCH_ITEM_COUNT', None) + self.storage_batch_item_count = self.settings.get('FEED_STORAGE_BATCH_ITEM_COUNT', None) for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured - if not self._batch_deliveries_supported(uri): + if not self._settings_are_valid(uri): raise NotConfigured if not self._exporter_supported(feed['format']): raise NotConfigured @@ -328,7 +328,7 @@ def item_scraped(self, item, spider): slot.exporter.export_item(item) slot.itemcount += 1 # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one - if self.storage_batch_size and slot.itemcount == self.storage_batch_size: + if self.storage_batch_item_count and slot.itemcount == self.storage_batch_item_count: uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot) self._close_slot(slot, spider) slots.append(self._start_new_batch( @@ -357,12 +357,12 @@ def _exporter_supported(self, format): return True logger.error("Unknown feed format: %(format)s", {'format': format}) - def _batch_deliveries_supported(self, uri): + def _settings_are_valid(self, uri): """ If FEED_STORAGE_BATCH_ITEM_COUNT setting is specified uri has to contain %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ - if self.storage_batch_size is None or '%(batch_time)s' in uri or '%(batch_id)s' in uri: + if not self.storage_batch_item_count or '%(batch_time)s' in uri or '%(batch_id)s' in uri: return True logger.warning('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified') return False diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index d1374f291e7..88f9a59333c 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -986,7 +986,7 @@ def test_pathlib_uri(self): self.assertEqual(data['csv'], b'') -class PartialDeliveriesTest(FeedExportTestBase): +class BatchDeliveriesTest(FeedExportTestBase): __test__ = True _file_mark = '_%(batch_time)s_#%(batch_id)s_' diff --git a/tox.ini b/tox.ini index 6dd944dff1f..7507a14a6ac 100644 --- a/tox.ini +++ b/tox.ini @@ -12,9 +12,9 @@ deps = -ctests/constraints.txt -rtests/requirements-py3.txt # Extras + boto3>=1.13.0 botocore>=1.3.23 Pillow>=3.4.2 - boto3>=1.13.0 passenv = S3_TEST_FILE_URI AWS_ACCESS_KEY_ID From a7d070f3bb350cbe1f7b580350d5f491f59d47d8 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Mon, 18 May 2020 22:25:29 +0300 Subject: [PATCH 21/35] Change log level to error --- scrapy/extensions/feedexport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index cc26ae173ec..ce7fc372d0b 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -364,7 +364,7 @@ def _settings_are_valid(self, uri): """ if not self.storage_batch_item_count or '%(batch_time)s' in uri or '%(batch_id)s' in uri: return True - 
logger.warning('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified') + logger.error('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified') return False def _storage_supported(self, uri): From 677e619d3761e6669c247786bb95822ce38c8080 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 21 May 2020 14:57:03 +0300 Subject: [PATCH 22/35] Fix too long lines --- scrapy/extensions/feedexport.py | 4 +++- tests/test_feedexport.py | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index ce7fc372d0b..1f745be98ee 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -364,7 +364,9 @@ def _settings_are_valid(self, uri): """ if not self.storage_batch_item_count or '%(batch_time)s' in uri or '%(batch_id)s' in uri: return True - logger.error('%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified') + logger.error( + '%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified' + ) return False def _storage_supported(self, uri): diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 08ee24768c3..fecb17e29fd 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1204,12 +1204,20 @@ def test_export_multiple_configs(self): items = [dict({'foo': u'FOO', 'bar': u'BAR'}), dict({'foo': u'FOO1', 'bar': u'BAR1'})] formats = { - 'json': [u'[\n{"bar": "BAR"}\n]'.encode('utf-8'), - u'[\n{"bar": "BAR1"}\n]'.encode('utf-8')], - 'xml': [u'\n\n \n FOO\n \n'.encode('latin-1'), - u'\n\n \n FOO1\n \n'.encode('latin-1')], - 'csv': [u'bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), - u'bar,foo\r\nBAR1,FOO1\r\n'.encode('utf-8')], + 'json': ['[\n{"bar": "BAR"}\n]'.encode('utf-8'), + '[\n{"bar": "BAR1"}\n]'.encode('utf-8')], + 'xml': [ + ( + '\n' + '\n \n FOO\n \n' + ).encode('latin-1'), + ( + '\n' + '\n \n FOO1\n \n' + ).encode('latin-1') + ], + 'csv': ['bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), + 'bar,foo\r\nBAR1,FOO1\r\n'.encode('utf-8')], } settings = { From dd96f94e8cc1517b7021e35e46cbdc92580c6333 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 22 May 2020 23:30:33 +0300 Subject: [PATCH 23/35] Push datetime.utcnow() to its own variable --- scrapy/extensions/feedexport.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 1f745be98ee..45c2971a616 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -398,8 +398,9 @@ def _get_uri_params(self, spider, uri_params, slot): params = {} for k in dir(spider): params[k] = getattr(spider, k) - params['time'] = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-') - params['batch_time'] = datetime.utcnow().isoformat().replace(':', '-') + utc_now = datetime.utcnow() + params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-') + params['batch_time'] = utc_now.isoformat().replace(':', '-') params['batch_id'] = slot.batch_id + 1 if slot is not None else 1 uripar_function = load_object(uri_params) if uri_params else lambda x, y: None uripar_function(params, spider) From c3cee74fd401e6a6307b5eb1786e532bb2cd5aa8 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 26 Jun 2020 18:45:21 +0300 Subject: [PATCH 24/35] Change default value of FEED_STORAGE_BATCH_ITEM_COUNT to 0 --- 
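This patch switches the default batch size to ``0`` and reads it with ``settings.getint()``, so any falsy value keeps the previous single-file behaviour. As a rough, standalone sketch (not code from the extension), the batching rule enforced in ``item_scraped()`` amounts to::

    # Illustration only: the real extension also rotates storages, exporters
    # and URIs for every new batch; here we just split a list of items.
    def split_into_batches(items, batch_item_count):
        if not batch_item_count:      # 0 (the new default) disables batching
            return [list(items)]
        batches, current = [], []
        for item in items:
            current.append(item)
            if len(current) == batch_item_count:
                batches.append(current)
                current = []
        if current:
            batches.append(current)
        return batches

    print(split_into_batches(range(5), 2))   # [[0, 1], [2, 3], [4]]
    print(split_into_batches(range(5), 0))   # [[0, 1, 2, 3, 4]]
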
docs/topics/feed-exports.rst | 2 +- scrapy/extensions/feedexport.py | 2 +- scrapy/settings/default_settings.py | 2 +- tests/test_feedexport.py | 20 ++++++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 866ce78eb70..0b37e9a7dfb 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -435,7 +435,7 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter FEED_STORAGE_BATCH_ITEM_COUNT ----------------------------- -Default: ``None`` +Default: ``0`` If assigned an integer number higher than ``0``, Scrapy generates multiple output files storing up to the specified number of items in each output file. diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 1331782e3b9..e06116acda2 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -243,7 +243,7 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch_item_count = self.settings.get('FEED_STORAGE_BATCH_ITEM_COUNT', None) + self.storage_batch_item_count = self.settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 5a7dc533e50..810acd5a39d 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -146,7 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } -FEED_STORAGE_BATCH_ITEM_COUNT = None +FEED_STORAGE_BATCH_ITEM_COUNT = 0 FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 1a6a5624b75..578cd396bba 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1144,7 +1144,7 @@ def assertExportedJsonLines(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'jl', self._file_mark): {'format': 'jl'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['jl']: @@ -1160,7 +1160,7 @@ def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): os.path.join(self._random_temp_filename(), 'csv', self._file_mark): {'format': 'csv'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') data = yield self.exported_data(items, settings) for batch in data['csv']: got_batch = csv.DictReader(to_unicode(batch).splitlines()) @@ -1176,7 +1176,7 @@ def assertExportedXml(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'xml', self._file_mark): {'format': 'xml'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['xml']: @@ -1194,7 +1194,7 @@ def assertExportedMultiple(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'json', self._file_mark): {'format': 'json'}, }, }) - batch_size = 
settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) # XML @@ -1219,7 +1219,7 @@ def assertExportedPickle(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'pickle', self._file_mark): {'format': 'pickle'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import pickle @@ -1236,7 +1236,7 @@ def assertExportedMarshal(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'marshal', self._file_mark): {'format': 'marshal'}, }, }) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import marshal @@ -1262,7 +1262,7 @@ def test_export_items(self): 'FEED_STORAGE_BATCH_ITEM_COUNT': 2 } header = self.MyItem.fields.keys() - yield self.assertExported(items, header, rows, settings=settings) + yield self.assertExported(items, header, rows, settings=Settings(settings)) def test_wrong_path(self): """ If path is without %(batch_time)s or %(batch_id)s an exception must be raised """ @@ -1412,14 +1412,14 @@ def test_s3_export(self): bucket_name=s3_test_bucket_name, prefix=prefix ) storage = S3FeedStorage(s3_test_bucket_name, access_key, secret_key) - settings = { + settings = Settings({ 'FEEDS': { s3_test_file_uri: { 'format': 'json', }, }, 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, - } + }) items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), self.MyItem({'foo': 'bar2', 'egg': 'spam2', 'baz': 'quux2'}), @@ -1436,7 +1436,7 @@ def parse(self, response): s3 = boto3.resource('s3') my_bucket = s3.Bucket(s3_test_bucket_name) - batch_size = settings['FEED_STORAGE_BATCH_ITEM_COUNT'] + batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') with MockServer() as s: runner = CrawlerRunner(Settings(settings)) From 88a52198b90faa0129c8e05072197cdffbb9653b Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sat, 27 Jun 2020 11:50:26 +0300 Subject: [PATCH 25/35] Add batch_item_count support in FEEDS setting --- scrapy/extensions/feedexport.py | 5 +++-- tests/test_feedexport.py | 39 ++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index e06116acda2..2312c994ec4 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -25,7 +25,6 @@ from scrapy.utils.misc import create_instance, load_object from scrapy.utils.python import without_none_values - logger = logging.getLogger(__name__) @@ -337,7 +336,9 @@ def item_scraped(self, item, spider): slot.exporter.export_item(item) slot.itemcount += 1 # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one - if self.storage_batch_item_count and slot.itemcount == self.storage_batch_item_count: + if self.feeds[slot.uri_template].get('batch_item_count', self.storage_batch_item_count) \ + and slot.itemcount == self.feeds[slot.uri_template].get('batch_item_count', + self.storage_batch_item_count): uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot) self._close_slot(slot, spider) 
slots.append(self._start_new_batch( diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 578cd396bba..3bc0c083c52 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1327,8 +1327,9 @@ def test_export_multiple_configs(self): '\n \n FOO1\n \n' ).encode('latin-1') ], - 'csv': ['bar,foo\r\nBAR,FOO\r\n'.encode('utf-8'), - 'bar,foo\r\nBAR1,FOO1\r\n'.encode('utf-8')], + 'csv': ['foo,bar\r\nFOO,BAR\r\n'.encode('utf-8'), + 'foo,bar\r\nFOO1,BAR1\r\n'.encode('utf-8')], + 'jsonlines': ['{"foo": "FOO", "bar": "BAR"}\n{"foo": "FOO1", "bar": "BAR1"}\n'.encode('utf-8')], } settings = { @@ -1348,8 +1349,15 @@ def test_export_multiple_configs(self): os.path.join(self._random_temp_filename(), 'csv', self._file_mark): { 'format': 'csv', 'indent': None, - 'fields': ['bar', 'foo'], + 'fields': ['foo', 'bar'], + 'encoding': 'utf-8', + }, + os.path.join(self._random_temp_filename(), 'csv', self._file_mark): { + 'format': 'jsonlines', + 'indent': None, + 'fields': ['foo', 'bar'], 'encoding': 'utf-8', + 'batch_item_count': 0, }, }, 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, @@ -1359,6 +1367,31 @@ def test_export_multiple_configs(self): for expected_batch, got_batch in zip(expected, data[fmt]): self.assertEqual(expected_batch, got_batch) + @defer.inlineCallbacks + def test_batch_item_count_feeds_setting(self): + items = [dict({'foo': u'FOO', 'bar': u'BAR'}), dict({'foo': u'FOO1', 'bar': u'BAR1'})] + + formats = { + 'jsonlines': ['{"foo": "FOO", "bar": "BAR"}\n'.encode('utf-8'), + '{"foo": "FOO1", "bar": "BAR1"}\n'.encode('utf-8')], + } + + settings = { + 'FEEDS': { + os.path.join(self._random_temp_filename(), 'jsonlines', self._file_mark): { + 'format': 'jsonlines', + 'indent': None, + 'fields': ['foo', 'bar'], + 'encoding': 'utf-8', + 'batch_item_count': 1, + }, + }, + } + data = yield self.exported_data(items, settings) + for fmt, expected in formats.items(): + for expected_batch, got_batch in zip(expected, data[fmt]): + self.assertEqual(expected_batch, got_batch) + @defer.inlineCallbacks def test_batch_path_differ(self): """ From 05c2587c6a32b84a94463f2b1187e49f94957aa2 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sun, 28 Jun 2020 09:45:45 +0300 Subject: [PATCH 26/35] Docs update and tiny fixes --- docs/topics/feed-exports.rst | 1 + tests/test_feedexport.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 0b37e9a7dfb..3da56821e29 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -272,6 +272,7 @@ as a fallback value if that key is not provided for a specific feed definition. * ``fields``: falls back to :setting:`FEED_EXPORT_FIELDS` * ``indent``: falls back to :setting:`FEED_EXPORT_INDENT` * ``store_empty``: falls back to :setting:`FEED_STORE_EMPTY` +* ``batch_item_count``: falls back to :setting:`FEED_STORAGE_BATCH_ITEM_COUNT` .. 
setting:: FEED_EXPORT_ENCODING diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 3bc0c083c52..542cce70fc5 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1352,7 +1352,7 @@ def test_export_multiple_configs(self): 'fields': ['foo', 'bar'], 'encoding': 'utf-8', }, - os.path.join(self._random_temp_filename(), 'csv', self._file_mark): { + os.path.join(self._random_temp_filename(), 'jsonlines', self._file_mark): { 'format': 'jsonlines', 'indent': None, 'fields': ['foo', 'bar'], @@ -1423,8 +1423,8 @@ def test_s3_export(self): [testenv] setenv = AWS_SECRET_ACCESS_KEY = ABCD - AWS_ACCESS_KEY_ID = ABCD - S3_TEST_BUCKET_NAME = ABCD + AWS_ACCESS_KEY_ID = EFGH + S3_TEST_BUCKET_NAME = IJKL """ try: import boto3 From 7b1d3c35ea3bfde2ac7fc69a2a26bbcb94aec1bf Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Wed, 1 Jul 2020 11:54:39 +0300 Subject: [PATCH 27/35] Minor updates --- docs/topics/feed-exports.rst | 4 ++-- scrapy/extensions/feedexport.py | 34 +++++++++++++++++---------------- scrapy/utils/conf.py | 4 ++++ tests/test_feedexport.py | 23 ++++++---------------- tests/test_utils_conf.py | 4 ++++ 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 3da56821e29..0b659f30e3c 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -442,7 +442,7 @@ If assigned an integer number higher than ``0``, Scrapy generates multiple outpu storing up to the specified number of items in each output file. When generating multiple output files, you must use at least one of the following -placeholders in :setting:`FEED_URI` to indicate how the different output file names are +placeholders in the feed URI to indicate how the different output file names are generated: * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created @@ -457,7 +457,7 @@ For instance, if your settings include:: And your :command:`crawl` command line is:: - scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json + scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json The command line above can generate a directory tree like:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 2312c994ec4..5908987a33b 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -242,7 +242,6 @@ def __init__(self, crawler): self.storages = self._load_components('FEED_STORAGES') self.exporters = self._load_components('FEED_EXPORTERS') - self.storage_batch_item_count = self.settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured @@ -253,7 +252,7 @@ def __init__(self, crawler): def open_spider(self, spider): for uri, feed in self.feeds.items(): - uri_params = self._get_uri_params(spider, feed['uri_params'], None) + uri_params = self._get_uri_params(spider, feed['uri_params']) self.slots.append(self._start_new_batch( batch_id=1, uri=uri % uri_params, @@ -299,7 +298,7 @@ def _close_slot(self, slot, spider): def _start_new_batch(self, batch_id, uri, feed, spider, uri_template): """ Redirect the output data stream to a new file. - Execute multiple times if 'FEED_STORAGE_BATCH_ITEM_COUNT' setting is specified. 
+ Execute multiple times if FEED_STORAGE_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified :param batch_id: sequence number of current batch :param uri: uri of the new batch to start :param feed: dict with parameters of feed @@ -331,14 +330,15 @@ def _start_new_batch(self, batch_id, uri, feed, spider, uri_template): def item_scraped(self, item, spider): slots = [] - for idx, slot in enumerate(self.slots): + for slot in self.slots: slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one - if self.feeds[slot.uri_template].get('batch_item_count', self.storage_batch_item_count) \ - and slot.itemcount == self.feeds[slot.uri_template].get('batch_item_count', - self.storage_batch_item_count): + if ( + self.feeds[slot.uri_template]['batch_item_count'] + and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count'] + ): uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot) self._close_slot(slot, spider) slots.append(self._start_new_batch( @@ -369,15 +369,17 @@ def _exporter_supported(self, format): def _settings_are_valid(self, uri): """ - If FEED_STORAGE_BATCH_ITEM_COUNT setting is specified uri has to contain %(batch_time)s or %(batch_id)s - to distinguish different files of partial output + If FEED_STORAGE_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain + %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ - if not self.storage_batch_item_count or '%(batch_time)s' in uri or '%(batch_id)s' in uri: - return True - logger.error( - '%(batch_time)s or %(batch_id)s must be in uri if FEED_STORAGE_BATCH_ITEM_COUNT setting is specified' - ) - return False + for uri_template, values in self.feeds.items(): + if values['batch_item_count'] and not any(s in uri_template for s in ['%(batch_time)s', '%(batch_id)s']): + logger.error( + '%(batch_time)s or %(batch_id)s must be in uri({}) if FEED_STORAGE_BATCH_ITEM_COUNT setting ' + 'or FEEDS.batch_item_count is specified and greater than 0.'.format(uri_template) + ) + return False + return True def _storage_supported(self, uri): scheme = urlparse(uri).scheme @@ -404,7 +406,7 @@ def _get_exporter(self, file, format, *args, **kwargs): def _get_storage(self, uri): return self._get_instance(self.storages[urlparse(uri).scheme], uri) - def _get_uri_params(self, spider, uri_params, slot): + def _get_uri_params(self, spider, uri_params, slot=None): params = {} for k in dir(spider): params[k] = getattr(spider, k) diff --git a/scrapy/utils/conf.py b/scrapy/utils/conf.py index 5921f82bf8b..0e02f0f28c1 100644 --- a/scrapy/utils/conf.py +++ b/scrapy/utils/conf.py @@ -115,6 +115,10 @@ def feed_complete_default_values_from_settings(feed, settings): out = feed.copy() out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"]) out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None) + out.setdefault( + "batch_item_count", + out.get('batch_item_count', settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT')) + ) out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY")) out.setdefault("uri_params", settings["FEED_URI_PARAMS"]) if settings["FEED_EXPORT_INDENT"] is None: diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 542cce70fc5..db14b20b90d 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1265,7 +1265,7 @@ def test_export_items(self): yield self.assertExported(items, 
header, rows, settings=Settings(settings)) def test_wrong_path(self): - """ If path is without %(batch_time)s or %(batch_id)s an exception must be raised """ + """ If path is without %(batch_time)s and %(batch_id)s an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, @@ -1329,7 +1329,6 @@ def test_export_multiple_configs(self): ], 'csv': ['foo,bar\r\nFOO,BAR\r\n'.encode('utf-8'), 'foo,bar\r\nFOO1,BAR1\r\n'.encode('utf-8')], - 'jsonlines': ['{"foo": "FOO", "bar": "BAR"}\n{"foo": "FOO1", "bar": "BAR1"}\n'.encode('utf-8')], } settings = { @@ -1352,13 +1351,6 @@ def test_export_multiple_configs(self): 'fields': ['foo', 'bar'], 'encoding': 'utf-8', }, - os.path.join(self._random_temp_filename(), 'jsonlines', self._file_mark): { - 'format': 'jsonlines', - 'indent': None, - 'fields': ['foo', 'bar'], - 'encoding': 'utf-8', - 'batch_item_count': 0, - }, }, 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, } @@ -1369,19 +1361,16 @@ def test_export_multiple_configs(self): @defer.inlineCallbacks def test_batch_item_count_feeds_setting(self): - items = [dict({'foo': u'FOO', 'bar': u'BAR'}), dict({'foo': u'FOO1', 'bar': u'BAR1'})] - + items = [dict({'foo': u'FOO'}), dict({'foo': u'FOO1'})] formats = { - 'jsonlines': ['{"foo": "FOO", "bar": "BAR"}\n'.encode('utf-8'), - '{"foo": "FOO1", "bar": "BAR1"}\n'.encode('utf-8')], + 'json': ['[{"foo": "FOO"}]'.encode('utf-8'), + '[{"foo": "FOO1"}]'.encode('utf-8')], } - settings = { 'FEEDS': { - os.path.join(self._random_temp_filename(), 'jsonlines', self._file_mark): { - 'format': 'jsonlines', + os.path.join(self._random_temp_filename(), 'json', self._file_mark): { + 'format': 'json', 'indent': None, - 'fields': ['foo', 'bar'], 'encoding': 'utf-8', 'batch_item_count': 1, }, diff --git a/tests/test_utils_conf.py b/tests/test_utils_conf.py index e5d3ef582c2..95ec2b64a85 100644 --- a/tests/test_utils_conf.py +++ b/tests/test_utils_conf.py @@ -149,6 +149,7 @@ def test_feed_complete_default_values_from_settings_empty(self): "FEED_EXPORT_INDENT": 42, "FEED_STORE_EMPTY": True, "FEED_URI_PARAMS": (1, 2, 3, 4), + "FEED_STORAGE_BATCH_ITEM_COUNT": 2, }) new_feed = feed_complete_default_values_from_settings(feed, settings) self.assertEqual(new_feed, { @@ -157,6 +158,7 @@ def test_feed_complete_default_values_from_settings_empty(self): "indent": 42, "store_empty": True, "uri_params": (1, 2, 3, 4), + "batch_item_count": 2, }) def test_feed_complete_default_values_from_settings_non_empty(self): @@ -169,6 +171,7 @@ def test_feed_complete_default_values_from_settings_non_empty(self): "FEED_EXPORT_FIELDS": ["f1", "f2", "f3"], "FEED_EXPORT_INDENT": 42, "FEED_STORE_EMPTY": True, + "FEED_STORAGE_BATCH_ITEM_COUNT": 2, }) new_feed = feed_complete_default_values_from_settings(feed, settings) self.assertEqual(new_feed, { @@ -177,6 +180,7 @@ def test_feed_complete_default_values_from_settings_non_empty(self): "indent": 42, "store_empty": True, "uri_params": None, + "batch_item_count": 2, }) From 1e245046ed8ac3d9f89860501c2da95b69aaabf6 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 2 Jul 2020 12:38:08 +0300 Subject: [PATCH 28/35] Change setting name. Add leading zeroes to batch_id. Minor fixes. 
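This patch renames the setting to ``FEED_EXPORT_BATCH_ITEM_COUNT`` and documents the zero-padded ``%(batch_id)0xd`` form of the placeholder. A small, hypothetical loop (not taken from the patch) shows why the padding keeps batch files sorting in numeric order instead of as ``['1', '10', '2']``::

    # '%(batch_id)05d' pads the batch number with leading zeroes, as described
    # in the documentation change below.
    for batch_id in (1, 2, 10):
        print('%(batch_id)05d-items.json' % {'batch_id': batch_id})
    # 00001-items.json
    # 00002-items.json
    # 00010-items.json
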
--- docs/topics/feed-exports.rst | 19 +++++++++-------- scrapy/extensions/feedexport.py | 24 +++++++++++++--------- scrapy/settings/default_settings.py | 2 +- scrapy/utils/conf.py | 5 +---- tests/test_feedexport.py | 32 ++++++++++++++--------------- tests/test_utils_conf.py | 4 ++-- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 0b659f30e3c..56efa80a75e 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -220,7 +220,7 @@ These are the settings used for configuring the feed exports: * :setting:`FEED_STORAGE_FTP_ACTIVE` * :setting:`FEED_STORAGE_S3_ACL` * :setting:`FEED_EXPORTERS` - * :setting:`FEED_STORAGE_BATCH_ITEM_COUNT` + * :setting:`FEED_EXPORT_BATCH_ITEM_COUNT` .. currentmodule:: scrapy.extensions.feedexport @@ -272,7 +272,7 @@ as a fallback value if that key is not provided for a specific feed definition. * ``fields``: falls back to :setting:`FEED_EXPORT_FIELDS` * ``indent``: falls back to :setting:`FEED_EXPORT_INDENT` * ``store_empty``: falls back to :setting:`FEED_STORE_EMPTY` -* ``batch_item_count``: falls back to :setting:`FEED_STORAGE_BATCH_ITEM_COUNT` +* ``batch_item_count``: falls back to :setting:`FEED_EXPORT_BATCH_ITEM_COUNT` .. setting:: FEED_EXPORT_ENCODING @@ -432,9 +432,9 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter .. _botocore: https://github.com/boto/botocore .. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl -.. setting:: FEED_STORAGE_BATCH_ITEM_COUNT +.. setting:: FEED_EXPORT_BATCH_ITEM_COUNT -FEED_STORAGE_BATCH_ITEM_COUNT +FEED_EXPORT_BATCH_ITEM_COUNT ----------------------------- Default: ``0`` @@ -448,16 +448,19 @@ generated: * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created (e.g. ``2020-03-28T14-45-08.237134``) -* ``%(batch_id)s`` - gets replaced by the batch sequence number of batch - (e.g. ``2`` for the second file) +* ``%(batch_id)0xd`` - gets replaced by the sequence number of the batch. +By replacing ``x`` with an integer you set the number of leading zeroes to prevent +inappropriate sorting like this: [``'1'``, ``'10'``, ``'2'``]. 
Here are some examples: + ``%(batch_id)01d`` for the second batch gets replaced by ``2`` + ``%(batch_id)05d`` for the third batch gets replaced by ``00003`` For instance, if your settings include:: - FEED_STORAGE_BATCH_ITEM_COUNT=100 + FEED_EXPORT_BATCH_ITEM_COUNT=100 And your :command:`crawl` command line is:: - scrapy crawl spidername -o dirname/%(batch_id)s-filename%(batch_time)s.json + scrapy crawl spidername -o dirname/%(batch_id)d-filename%(batch_time)s.json The command line above can generate a directory tree like:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 5908987a33b..adb6ea2e462 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -6,6 +6,7 @@ import logging import os +import re import sys import warnings from datetime import datetime @@ -25,6 +26,7 @@ from scrapy.utils.misc import create_instance, load_object from scrapy.utils.python import without_none_values + logger = logging.getLogger(__name__) @@ -245,7 +247,7 @@ def __init__(self, crawler): for uri, feed in self.feeds.items(): if not self._storage_supported(uri): raise NotConfigured - if not self._settings_are_valid(uri): + if not self._settings_are_valid(): raise NotConfigured if not self._exporter_supported(feed['format']): raise NotConfigured @@ -298,7 +300,7 @@ def _close_slot(self, slot, spider): def _start_new_batch(self, batch_id, uri, feed, spider, uri_template): """ Redirect the output data stream to a new file. - Execute multiple times if FEED_STORAGE_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified + Execute multiple times if FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified :param batch_id: sequence number of current batch :param uri: uri of the new batch to start :param feed: dict with parameters of feed @@ -334,10 +336,10 @@ def item_scraped(self, item, spider): slot.start_exporting() slot.exporter.export_item(item) slot.itemcount += 1 - # create new slot for each slot with itemcount == FEED_STORAGE_BATCH_ITEM_COUNT and close the old one + # create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one if ( - self.feeds[slot.uri_template]['batch_item_count'] - and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count'] + self.feeds[slot.uri_template]['batch_item_count'] + and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count'] ): uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot) self._close_slot(slot, spider) @@ -367,16 +369,18 @@ def _exporter_supported(self, format): return True logger.error("Unknown feed format: %(format)s", {'format': format}) - def _settings_are_valid(self, uri): + def _settings_are_valid(self): """ - If FEED_STORAGE_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain + If FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ for uri_template, values in self.feeds.items(): - if values['batch_item_count'] and not any(s in uri_template for s in ['%(batch_time)s', '%(batch_id)s']): + if values['batch_item_count'] and not re.findall(r'(%\(batch_time\)s|(%\(batch_id\)0\d*d))', uri_template): logger.error( - '%(batch_time)s or %(batch_id)s must be in uri({}) if FEED_STORAGE_BATCH_ITEM_COUNT setting ' - 'or FEEDS.batch_item_count is specified and greater than 0.'.format(uri_template) + '%(batch_time)s or %(batch_id)0xd 
must be in uri({}) if FEED_EXPORT_BATCH_ITEM_COUNT setting ' + 'or FEEDS.batch_item_count is specified and greater than 0. For more info see:' + 'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count' + ''.format(uri_template) ) return False return True diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 810acd5a39d..0016bbe1b06 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -146,7 +146,7 @@ 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } -FEED_STORAGE_BATCH_ITEM_COUNT = 0 +FEED_EXPORT_BATCH_ITEM_COUNT = 0 FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', diff --git a/scrapy/utils/conf.py b/scrapy/utils/conf.py index 0e02f0f28c1..64f9c824b50 100644 --- a/scrapy/utils/conf.py +++ b/scrapy/utils/conf.py @@ -113,12 +113,9 @@ def get_sources(use_closest=True): def feed_complete_default_values_from_settings(feed, settings): out = feed.copy() + out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT')) out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"]) out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None) - out.setdefault( - "batch_item_count", - out.get('batch_item_count', settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT')) - ) out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY")) out.setdefault("uri_params", settings["FEED_URI_PARAMS"]) if settings["FEED_EXPORT_INDENT"] is None: diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index db14b20b90d..d20b40e2f7a 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1108,7 +1108,7 @@ def test_multiple_feeds_failing_logs_blocking_feed_storage(self): class BatchDeliveriesTest(FeedExportTestBase): __test__ = True - _file_mark = '_%(batch_time)s_#%(batch_id)s_' + _file_mark = '_%(batch_time)s_#%(batch_id)02d_' @defer.inlineCallbacks def run_and_export(self, spider_cls, settings): @@ -1144,7 +1144,7 @@ def assertExportedJsonLines(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'jl', self._file_mark): {'format': 'jl'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['jl']: @@ -1160,7 +1160,7 @@ def assertExportedCsv(self, items, header, rows, settings=None, ordered=True): os.path.join(self._random_temp_filename(), 'csv', self._file_mark): {'format': 'csv'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') data = yield self.exported_data(items, settings) for batch in data['csv']: got_batch = csv.DictReader(to_unicode(batch).splitlines()) @@ -1176,7 +1176,7 @@ def assertExportedXml(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'xml', self._file_mark): {'format': 'xml'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) for batch in data['xml']: @@ -1194,7 +1194,7 @@ def assertExportedMultiple(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'json', self._file_mark): 
{'format': 'json'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) # XML @@ -1219,7 +1219,7 @@ def assertExportedPickle(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'pickle', self._file_mark): {'format': 'pickle'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import pickle @@ -1236,7 +1236,7 @@ def assertExportedMarshal(self, items, rows, settings=None): os.path.join(self._random_temp_filename(), 'marshal', self._file_mark): {'format': 'marshal'}, }, }) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') rows = [{k: v for k, v in row.items() if v} for row in rows] data = yield self.exported_data(items, settings) import marshal @@ -1259,18 +1259,18 @@ def test_export_items(self): {'foo': 'bar3', 'baz': 'quux3', 'egg': ''} ] settings = { - 'FEED_STORAGE_BATCH_ITEM_COUNT': 2 + 'FEED_EXPORT_BATCH_ITEM_COUNT': 2 } header = self.MyItem.fields.keys() yield self.assertExported(items, header, rows, settings=Settings(settings)) def test_wrong_path(self): - """ If path is without %(batch_time)s and %(batch_id)s an exception must be raised """ + """ If path is without %(batch_time)s and %(batch_id)0xd an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, }, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1 + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1 } crawler = get_crawler(settings_dict=settings) self.assertRaises(NotConfigured, FeedExporter, crawler) @@ -1282,7 +1282,7 @@ def test_export_no_items_not_store_empty(self): 'FEEDS': { os.path.join(self._random_temp_filename(), fmt, self._file_mark): {'format': fmt}, }, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1 + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1 } data = yield self.exported_no_data(settings) data = dict(data) @@ -1304,7 +1304,7 @@ def test_export_no_items_store_empty(self): }, 'FEED_STORE_EMPTY': True, 'FEED_EXPORT_INDENT': None, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1, } data = yield self.exported_no_data(settings) data = dict(data) @@ -1352,7 +1352,7 @@ def test_export_multiple_configs(self): 'encoding': 'utf-8', }, }, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1, } data = yield self.exported_data(items, settings) for fmt, expected in formats.items(): @@ -1398,7 +1398,7 @@ def test_batch_path_differ(self): 'format': 'json', }, }, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1, } data = yield self.exported_data(items, settings) self.assertEqual(len(items) + 1, len(data['json'])) @@ -1440,7 +1440,7 @@ def test_s3_export(self): 'format': 'json', }, }, - 'FEED_STORAGE_BATCH_ITEM_COUNT': 1, + 'FEED_EXPORT_BATCH_ITEM_COUNT': 1, }) items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), @@ -1458,7 +1458,7 @@ def parse(self, response): s3 = boto3.resource('s3') my_bucket = s3.Bucket(s3_test_bucket_name) - batch_size = settings.getint('FEED_STORAGE_BATCH_ITEM_COUNT') + batch_size = settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT') with MockServer() as s: runner = CrawlerRunner(Settings(settings)) diff --git a/tests/test_utils_conf.py 
b/tests/test_utils_conf.py index 95ec2b64a85..f3ef3612741 100644 --- a/tests/test_utils_conf.py +++ b/tests/test_utils_conf.py @@ -149,7 +149,7 @@ def test_feed_complete_default_values_from_settings_empty(self): "FEED_EXPORT_INDENT": 42, "FEED_STORE_EMPTY": True, "FEED_URI_PARAMS": (1, 2, 3, 4), - "FEED_STORAGE_BATCH_ITEM_COUNT": 2, + "FEED_EXPORT_BATCH_ITEM_COUNT": 2, }) new_feed = feed_complete_default_values_from_settings(feed, settings) self.assertEqual(new_feed, { @@ -171,7 +171,7 @@ def test_feed_complete_default_values_from_settings_non_empty(self): "FEED_EXPORT_FIELDS": ["f1", "f2", "f3"], "FEED_EXPORT_INDENT": 42, "FEED_STORE_EMPTY": True, - "FEED_STORAGE_BATCH_ITEM_COUNT": 2, + "FEED_EXPORT_BATCH_ITEM_COUNT": 2, }) new_feed = feed_complete_default_values_from_settings(feed, settings) self.assertEqual(new_feed, { From 6454d456d2bcab0828aba6d81d98f7393ab7e04d Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 3 Jul 2020 08:29:54 +0300 Subject: [PATCH 29/35] Make check of placeholder less strict --- docs/topics/feed-exports.rst | 11 ++++++----- scrapy/extensions/feedexport.py | 4 ++-- tests/test_feedexport.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 56efa80a75e..0bb5f173348 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -448,11 +448,12 @@ generated: * ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created (e.g. ``2020-03-28T14-45-08.237134``) -* ``%(batch_id)0xd`` - gets replaced by the sequence number of the batch. -By replacing ``x`` with an integer you set the number of leading zeroes to prevent -inappropriate sorting like this: [``'1'``, ``'10'``, ``'2'``]. Here are some examples: - ``%(batch_id)01d`` for the second batch gets replaced by ``2`` - ``%(batch_id)05d`` for the third batch gets replaced by ``00003`` +* ``%(batch_id)d`` - gets replaced by the sequence number of the batch. + + Use :ref:`printf-style string formatting ` to + alter the number format. For example, to make the batch ID a 5-digit + number by introducing leading zeroes as needed, use ``%(batch_id)05d`` + (e.g. ``3`` becomes ``00003``, ``123`` becomes ``00123``). For instance, if your settings include:: diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index adb6ea2e462..e15c1a09c47 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -375,9 +375,9 @@ def _settings_are_valid(self): %(batch_time)s or %(batch_id)s to distinguish different files of partial output """ for uri_template, values in self.feeds.items(): - if values['batch_item_count'] and not re.findall(r'(%\(batch_time\)s|(%\(batch_id\)0\d*d))', uri_template): + if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template): logger.error( - '%(batch_time)s or %(batch_id)0xd must be in uri({}) if FEED_EXPORT_BATCH_ITEM_COUNT setting ' + '%(batch_time)s or %(batch_id) must be in uri({}) if FEED_EXPORT_BATCH_ITEM_COUNT setting ' 'or FEEDS.batch_item_count is specified and greater than 0. 
For more info see:' 'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count' ''.format(uri_template) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index d20b40e2f7a..4e0b867a417 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1265,7 +1265,7 @@ def test_export_items(self): yield self.assertExported(items, header, rows, settings=Settings(settings)) def test_wrong_path(self): - """ If path is without %(batch_time)s and %(batch_id)0xd an exception must be raised """ + """ If path is without %(batch_time)s and %(batch_id) an exception must be raised """ settings = { 'FEEDS': { self._random_temp_filename(): {'format': 'xml'}, From f1020e0e6af064ab31b812c25bda6b0f08827222 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Mon, 6 Jul 2020 15:40:53 +0300 Subject: [PATCH 30/35] Tiny changes --- scrapy/extensions/feedexport.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index e15c1a09c47..21177b1b0c6 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -305,7 +305,7 @@ def _start_new_batch(self, batch_id, uri, feed, spider, uri_template): :param uri: uri of the new batch to start :param feed: dict with parameters of feed :param spider: user spider - :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)s to create new uri + :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri """ storage = self._get_storage(uri) file = storage.open(spider) @@ -372,13 +372,13 @@ def _exporter_supported(self, format): def _settings_are_valid(self): """ If FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain - %(batch_time)s or %(batch_id)s to distinguish different files of partial output + %(batch_time)s or %(batch_id)d to distinguish different files of partial output """ for uri_template, values in self.feeds.items(): if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template): logger.error( - '%(batch_time)s or %(batch_id) must be in uri({}) if FEED_EXPORT_BATCH_ITEM_COUNT setting ' - 'or FEEDS.batch_item_count is specified and greater than 0. For more info see:' + '%(batch_time)s or %(batch_id)d must be in the feed URI ({}) if FEED_EXPORT_BATCH_ITEM_COUNT ' + 'setting or FEEDS.batch_item_count is specified and greater than 0. 
For more info see: ' 'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count' ''.format(uri_template) ) From 8bdcdb0a76e3780681dcfbbd4a0eee62e2bb05b1 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 16 Jul 2020 09:13:54 +0300 Subject: [PATCH 31/35] Add quotes to example in docs --- docs/topics/feed-exports.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 0bb5f173348..7e91b365dfc 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -461,7 +461,7 @@ For instance, if your settings include:: And your :command:`crawl` command line is:: - scrapy crawl spidername -o dirname/%(batch_id)d-filename%(batch_time)s.json + scrapy crawl spidername -o 'dirname/%(batch_id)d-filename%(batch_time)s.json' The command line above can generate a directory tree like:: From 41263f61c6de8048023ba4c80e062f56b21e5a19 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Thu, 16 Jul 2020 18:41:45 +0300 Subject: [PATCH 32/35] Change single quotes to double in example in docs --- docs/topics/feed-exports.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 7e91b365dfc..fdc6e7cba5c 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -461,7 +461,7 @@ For instance, if your settings include:: And your :command:`crawl` command line is:: - scrapy crawl spidername -o 'dirname/%(batch_id)d-filename%(batch_time)s.json' + scrapy crawl spidername -o "dirname/%(batch_id)d-filename%(batch_time)s.json" The command line above can generate a directory tree like:: From 86f7ac2f2b5d58e0b2588fa2aa4c777a8decf299 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Fri, 17 Jul 2020 17:48:25 +0300 Subject: [PATCH 33/35] Try to fix error at Windows --- tests/test_feedexport.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 129b7fc0b27..cc124624daa 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1204,6 +1204,8 @@ def run_and_export(self, spider_cls, settings): for path, feed in FEEDS.items(): dir_name = os.path.dirname(path) + if not os.path.exists(str(dir_name)): + continue for file in sorted(os.listdir(dir_name)): with open(os.path.join(dir_name, file), 'rb') as f: data = f.read() From 3e0492741d93b05c464457b3b128a2b0d24c994b Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Sun, 19 Jul 2020 00:10:29 +0300 Subject: [PATCH 34/35] Another try to fix test errors on Windows --- tests/test_feedexport.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index cc124624daa..c49b2e92fb0 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1190,9 +1190,14 @@ class BatchDeliveriesTest(FeedExportTestBase): def run_and_export(self, spider_cls, settings): """ Run spider with specified settings; return exported data. 
""" + def build_url(path): + if path[0] != '/': + path = '/' + path + return urljoin('file:', path) + FEEDS = settings.get('FEEDS') or {} settings['FEEDS'] = { - urljoin('file:', file_path): feed + build_url(file_path): feed for file_path, feed in FEEDS.items() } content = defaultdict(list) @@ -1204,8 +1209,6 @@ def run_and_export(self, spider_cls, settings): for path, feed in FEEDS.items(): dir_name = os.path.dirname(path) - if not os.path.exists(str(dir_name)): - continue for file in sorted(os.listdir(dir_name)): with open(os.path.join(dir_name, file), 'rb') as f: data = f.read() From a6c1d79b7cc3bc2c408eab356bbbf99a0536f110 Mon Sep 17 00:00:00 2001 From: BroodingKangaroo Date: Tue, 28 Jul 2020 11:53:05 +0300 Subject: [PATCH 35/35] pep8 tiny changes --- docs/topics/feed-exports.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 2c9774b5553..dd4eb3c614e 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -453,6 +453,7 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter FEED_EXPORT_BATCH_ITEM_COUNT ----------------------------- + Default: ``0`` If assigned an integer number higher than ``0``, Scrapy generates multiple output files @@ -474,7 +475,7 @@ generated: For instance, if your settings include:: - FEED_EXPORT_BATCH_ITEM_COUNT=100 + FEED_EXPORT_BATCH_ITEM_COUNT = 100 And your :command:`crawl` command line is::