Skip to content

Commit

Permalink
Fix WARC headers parsing when record has Content-Length: 0 and record after it.
Browse files Browse the repository at this point in the history

Validated against Python implementation: https://github.com/webrecorder/warcio

Also fixed flaky test `record::verify_display` by sorting header names.
  • Loading branch information
bocharov committed Nov 7, 2024
1 parent 61a8387 commit 3e4bbd1
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 7 deletions.
10 changes: 5 additions & 5 deletions src/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,12 @@ impl std::convert::TryFrom<RawRecordHeader> for Record<EmptyBody> {
// Display for a raw WARC record header: writes a `WARC/<version>` line, then one
// `name: value` line per header field, then a terminating blank line.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the pre-commit and
// post-commit lines are interleaved. The comments below mark which is which.
impl std::fmt::Display for RawRecordHeader {
fn fmt(&self, w: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
writeln!(w, "WARC/{}", self.version)?;
// Pre-commit line (removed): iterated the header entries directly, in whatever order the
// underlying collection yields them (presumably unstable, given the flaky test the commit
// message mentions).
for (key, value) in self.as_ref().iter() {
// Post-commit lines (added): collect the entries and sort them by header name so the
// rendered output has a deterministic order.
let mut sorted: Vec<_> = self.as_ref().iter().collect();
sorted.sort_by_key(|(key, _)| key.to_string());
for (key, value) in sorted {
// Values are rendered with lossy UTF-8 conversion, so invalid bytes are replaced instead
// of producing an error.
writeln!(w, "{}: {}", key.to_string(), String::from_utf8_lossy(value))?;
}
// Pre-commit lines (removed): blank terminator line followed by an explicit `Ok(())`.
writeln!(w)?;

Ok(())
// Post-commit line (added): the result of the terminating `writeln!` is returned directly.
writeln!(w)
}
}

Expand Down Expand Up @@ -1058,8 +1058,8 @@ mod raw_tests {

let expected = "\
WARC/1.0\n\
warc-type: dunno\n\
warc-date: 2024-01-01T00:00:00Z\n\
warc-type: dunno\n\
\n\
";

Expand Down
74 changes: 72 additions & 2 deletions src/warc_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ impl<R: BufRead> Iterator for RawRecordIter<R> {
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 122 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -203,7 +203,7 @@ impl<R: BufRead> Iterator for RecordIter<R> {
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 205 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -683,4 +683,74 @@ mod next_item_tests {
assert_eq!(record.body(), b"12345678");
}
}

/// A single record declaring `Content-Length: 0` must be readable through the
/// streaming API and must buffer into a record with an empty body.
#[test]
fn empty_content_length() {
    // Header block, the blank line ending the headers, then the record trailer.
    let input = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:empty-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
        ";

    let mut warc = WarcReader::new(create_reader!(input));
    let mut records = warc.stream_records();

    // Pull the first streamed record, then read its body into memory.
    let streamed = records.next_item().unwrap().unwrap();
    let buffered = streamed.into_buffered().unwrap();

    assert_eq!(buffered.warc_version(), "1.0");
    assert_eq!(buffered.content_length(), 0);
    assert_eq!(buffered.warc_id(), "<urn:test:empty-content-length>");
    assert_eq!(buffered.body(), b"");
}

/// A record declaring `Content-Length: 0` that is immediately followed by a
/// record with a 7-byte body: both must be yielded, in order, by `iter_records`,
/// each with its own headers and body.
#[test]
fn zero_and_nonzero_content_length() {
    let input = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:zero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
        \r\n\
        WARC/1.0\r\n\
        Warc-Type: non-empty-record\r\n\
        Content-Length: 7\r\n\
        WARC-Record-Id: <urn:test:nonzero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:58Z\r\n\
        \r\n\
        1234567\r\n\
        \r\n\
        ";

    let warc = WarcReader::new(create_reader!(input));
    let mut records = warc.iter_records();

    // First record: zero-length body.
    let empty = records.next().unwrap().unwrap();
    assert_eq!(empty.warc_version(), "1.0");
    assert_eq!(empty.content_length(), 0);
    assert_eq!(empty.warc_id(), "<urn:test:zero-content-length>");
    assert_eq!(empty.body(), b"");

    // Second record: must start right after the empty one and carry its 7 bytes.
    let filled = records.next().unwrap().unwrap();
    assert_eq!(filled.warc_version(), "1.0");
    assert_eq!(filled.content_length(), 7);
    assert_eq!(filled.warc_id(), "<urn:test:nonzero-content-length>");
    assert_eq!(filled.body(), b"1234567");
}
}

0 comments on commit 3e4bbd1

Please sign in to comment.