Skip to content

Commit

Permalink
Include robots behavior from the spider in the WARC warcinfo record
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 9, 2024
1 parent 3a173f9 commit 0f2ec31
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:

self.store = self._get_store()
self.writer = WarcFileWriter(collection_name=crawler.spider.name)
- self.writer.write_warcinfo()
+ self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
Expand Down
4 changes: 2 additions & 2 deletions scrapy_webarchive/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ def write_request(self, request: Request, concurrent_to: ArcWarcRecord):
)
return record

- def write_warcinfo(self) -> None:
+ def write_warcinfo(self, robotstxt_obey: bool) -> None:
"""Write WARC-Type: warcinfo record"""

content = {
"software": f"Scrapy/{scrapy_version} (+https://scrapy.org)",
"format": "WARC file version 1.0",
"conformsTo": "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/",
"isPartOf": self.collection_name,
- "robots": "obey" if True else "ignore",
+ "robots": "obey" if robotstxt_obey else "ignore",
}

with open(self.warc_fname, "ab") as fh:
Expand Down
3 changes: 2 additions & 1 deletion tests/test_warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_write_warcinfo_record(self, fs):
assert warcinfo == ''

# Write warcinfo record and check output
- wfw.write_warcinfo()
+ wfw.write_warcinfo(robotstxt_obey=True)
warcinfo = gzip.open('/tmp/test.warc.gz', 'rb').read().decode()

assert 'WARC/1.0' in warcinfo
Expand All @@ -80,3 +80,4 @@ def test_write_warcinfo_record(self, fs):
assert 'WARC-Filename: /tmp/test.warc.gz' in warcinfo
assert 'Content-Type: application/warc-fields' in warcinfo
assert f'isPartOf: {collection_name}' in warcinfo
+ assert 'robots: obey' in warcinfo

0 comments on commit 0f2ec31

Please sign in to comment.