diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py index 104a569..2734c32 100644 --- a/scrapy_webarchive/extensions.py +++ b/scrapy_webarchive/extensions.py @@ -36,7 +36,7 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None: self.store = self._get_store() self.writer = WarcFileWriter(collection_name=crawler.spider.name) - self.writer.write_warcinfo() + self.writer.write_warcinfo(robotstxt_obey=self.settings.getbool("ROBOTSTXT_OBEY")) @classmethod def from_crawler(cls, crawler: Crawler) -> Self: diff --git a/scrapy_webarchive/warc.py b/scrapy_webarchive/warc.py index bcdcf8c..b28aeed 100644 --- a/scrapy_webarchive/warc.py +++ b/scrapy_webarchive/warc.py @@ -133,7 +133,7 @@ def write_request(self, request: Request, concurrent_to: ArcWarcRecord): ) return record - def write_warcinfo(self) -> None: + def write_warcinfo(self, robotstxt_obey: bool) -> None: """Write WARC-Type: warcinfo record""" content = { @@ -141,7 +141,7 @@ def write_warcinfo(self) -> None: "format": "WARC file version 1.0", "conformsTo": "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/", "isPartOf": self.collection_name, - "robots": "obey" if True else "ignore", + "robots": "obey" if robotstxt_obey else "ignore", } with open(self.warc_fname, "ab") as fh: diff --git a/tests/test_warc.py b/tests/test_warc.py index fd47376..a73d484 100644 --- a/tests/test_warc.py +++ b/tests/test_warc.py @@ -71,7 +71,7 @@ def test_write_warcinfo_record(self, fs): assert warcinfo == '' # Write warcinfo record and check output - wfw.write_warcinfo() + wfw.write_warcinfo(robotstxt_obey=True) warcinfo = gzip.open('/tmp/test.warc.gz', 'rb').read().decode() assert 'WARC/1.0' in warcinfo @@ -80,3 +80,4 @@ def test_write_warcinfo_record(self, fs): assert 'WARC-Filename: /tmp/test.warc.gz' in warcinfo assert 'Content-Type: application/warc-fields' in warcinfo assert f'isPartOf: {collection_name}' in warcinfo + assert 'robots: obey' in warcinfo