Skip to content

Commit

Permalink
Fix WARC headers parsing when a record has Content-Length: 0 and another record follows it.
Browse files Browse the repository at this point in the history

Validated against Python implementation: https://github.com/webrecorder/warcio

Also fixed flaky test `record::verify_display` by sorting header names in the test.
  • Loading branch information
bocharov committed Nov 12, 2024
1 parent 61a8387 commit 7d7a2d4
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 15 deletions.
38 changes: 25 additions & 13 deletions src/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1046,24 +1046,36 @@ mod raw_tests {

#[test]
fn verify_display() {
let header_entries = vec![
(WarcHeader::WarcType, b"dunno".to_vec()),
(WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()),
];

let headers = RawRecordHeader {
version: "1.0".to_owned(),
headers: vec![
(WarcHeader::WarcType, b"dunno".to_vec()),
(WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()),
]
.into_iter()
.collect(),
headers: header_entries.into_iter().collect(),
};

let expected = "\
WARC/1.0\n\
warc-type: dunno\n\
warc-date: 2024-01-01T00:00:00Z\n\
\n\
";
let output = headers.to_string();

let expected_lines = vec![
"WARC/1.0",
"warc-type: dunno",
"warc-date: 2024-01-01T00:00:00Z",
"",
];
let actual_lines: Vec<_> = output.lines().collect();

let mut expected_headers: Vec<_> = expected_lines[1..expected_lines.len() - 1].to_vec();
expected_headers.sort();

let mut actual_headers: Vec<_> = actual_lines[1..actual_lines.len() - 1].to_vec();
actual_headers.sort();

assert_eq!(headers.to_string(), expected);
// verify parts
assert_eq!(actual_lines[0], expected_lines[0]); // WARC version
assert_eq!(actual_headers, expected_headers); // headers (sorted)
assert_eq!(actual_lines.last(), expected_lines.last()); // empty line
}
}

Expand Down
74 changes: 72 additions & 2 deletions src/warc_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ impl<R: BufRead> Iterator for RawRecordIter<R> {
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 122 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -203,7 +203,7 @@ impl<R: BufRead> Iterator for RecordIter<R> {
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 205 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -683,4 +683,74 @@ mod next_item_tests {
assert_eq!(record.body(), b"12345678");
}
}

/// A single record that declares `Content-Length: 0` must be readable through
/// the streaming API and must buffer to an empty body.
#[test]
fn empty_content_length() {
    // Five header lines, the blank line that closes the header block, then one
    // more CRLF and no body bytes at all.
    let payload = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:empty-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
    ";

    let mut warc = WarcReader::new(create_reader!(payload));
    let mut records = warc.stream_records();

    // First unwrap: an item was produced; second unwrap: it parsed successfully.
    let streamed = records.next_item().unwrap().unwrap();
    // Pull the (empty) body into memory so it can be inspected below.
    let buffered = streamed.into_buffered().unwrap();

    assert_eq!(buffered.warc_version(), "1.0");
    assert_eq!(buffered.content_length(), 0);
    assert_eq!(buffered.warc_id(), "<urn:test:empty-content-length>");
    assert_eq!(buffered.body(), b"");
}

/// A record with `Content-Length: 0` followed directly by a record with a
/// non-empty body: both must come out of `iter_records` intact and in order.
#[test]
fn zero_and_nonzero_content_length() {
    // Record 1 carries no body; record 2 carries the seven bytes `1234567`.
    let payload = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:zero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
        \r\n\
        WARC/1.0\r\n\
        Warc-Type: non-empty-record\r\n\
        Content-Length: 7\r\n\
        WARC-Record-Id: <urn:test:nonzero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:58Z\r\n\
        \r\n\
        1234567\r\n\
        \r\n\
    ";

    let warc = WarcReader::new(create_reader!(payload));
    let mut records = warc.iter_records();

    // The zero-length record comes first and must expose an empty body.
    let empty = records.next().unwrap().unwrap();
    assert_eq!(empty.warc_version(), "1.0");
    assert_eq!(empty.content_length(), 0);
    assert_eq!(empty.warc_id(), "<urn:test:zero-content-length>");
    assert_eq!(empty.body(), b"");

    // The following record must still be found, with its body untouched.
    let filled = records.next().unwrap().unwrap();
    assert_eq!(filled.warc_version(), "1.0");
    assert_eq!(filled.content_length(), 7);
    assert_eq!(filled.warc_id(), "<urn:test:nonzero-content-length>");
    assert_eq!(filled.body(), b"1234567");
}
}

0 comments on commit 7d7a2d4

Please sign in to comment.