From 04224d38439466f74b865434f56fac43eb1b6cd0 Mon Sep 17 00:00:00 2001
From: bcapuano
Date: Sat, 4 Jan 2025 21:07:20 -0700
Subject: [PATCH] Initial implementation of fuzzing harness

Added pipeline

Fix path in build script

Formatting

Add
---
 .github/workflows/cifuzz.yml | 40 ++++++++++++++++
 CHANGELOG.md                 |  1 +
 README.md                    |  1 +
 fuzz/build.sh                | 13 ++++++
 fuzz/fuzz_helpers.py         | 91 ++++++++++++++++++++++++++++++++++++
 fuzz/pdf_load_fuzzer.py      | 59 +++++++++++++++++++++++
 6 files changed, 205 insertions(+)
 create mode 100644 .github/workflows/cifuzz.yml
 create mode 100755 fuzz/build.sh
 create mode 100644 fuzz/fuzz_helpers.py
 create mode 100644 fuzz/pdf_load_fuzzer.py

diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 00000000..115bd7f1
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,40 @@
+name: CIFuzz
+on:
+  push:
+    branches:
+      - stable
+      - develop
+  pull_request:
+permissions: {}  # no token permissions by default; the job opts in below
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    permissions:
+      security-events: write  # needed by the SARIF upload step
+    steps:
+      - name: Build Fuzzers
+        id: build
+        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+        with:
+          oss-fuzz-project-name: 'pdfplumber'
+          language: python
+      - name: Run Fuzzers
+        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+        with:
+          oss-fuzz-project-name: 'pdfplumber'
+          language: python
+          fuzz-seconds: 800
+          output-sarif: true
+      - name: Upload Crash
+        uses: actions/upload-artifact@v4  # v3 is deprecated
+        if: failure() && steps.build.outcome == 'success'
+        with:
+          name: artifacts
+          path: ./out/artifacts
+      - name: Upload Sarif
+        if: always() && steps.build.outcome == 'success'
+        uses: github/codeql-action/upload-sarif@v3  # v2 is deprecated
+        with:
+          # Path to SARIF file relative to the root of the repository
+          sarif_file: cifuzz-sarif/results.sarif
+          checkout_path: cifuzz-sarif
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f02c888..1e427494 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
 - Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
 - Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
 - Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
+- Add necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1]())
 
 ### Fixed
 
diff --git a/README.md b/README.md
index 8370475b..e7e1598c 100644
--- a/README.md
+++ b/README.md
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [@wodny](https://github.com/wodny)
 - [Michal Stolarczyk](https://github.com/stolarczyk)
 - [Brandon Roberts](https://github.com/brandonrobertz)
+- [@ennamarie19](https://github.com/ennamarie19/)
 
 ## Contributing
 
diff --git a/fuzz/build.sh b/fuzz/build.sh
new file mode 100755
index 00000000..72734874
--- /dev/null
+++ b/fuzz/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -eu
+
+cd "$SRC"/pdfplumber
+pip3 install .
+
+# Build fuzzers in $OUT
+for fuzzer in $(find fuzz -name '*_fuzzer.py'); do
+  compile_python_fuzzer "$fuzzer"
+done
+
+mkdir -p fuzz/corpus
+find . -name "*.pdf" -not -path "./fuzz/corpus/*" -exec cp "{}" fuzz/corpus \;
+zip -q "$OUT"/pdf_load_fuzzer_seed_corpus.zip fuzz/corpus/*
diff --git a/fuzz/fuzz_helpers.py b/fuzz/fuzz_helpers.py
new file mode 100644
index 00000000..f5ea91b0
--- /dev/null
+++ b/fuzz/fuzz_helpers.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+import contextlib
+import io
+import tempfile
+from enum import IntEnum
+from typing import Iterator, Protocol, Type, TypeVar, Union
+
+import atheris
+
+
+class HasMax(Protocol):
+    """Structural type for enums that expose an integer ``MAX`` member."""
+
+    MAX: int
+
+
+T = TypeVar("T", bound=IntEnum)
+
+
+class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
+    """FuzzedDataProvider with helpers for sized, file-like, and enum inputs."""
+
+    def ConsumeRandomBytes(self) -> bytes:
+        """Consume a fuzzer-chosen number of the remaining bytes."""
+        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRandomString(self) -> str:
+        """Consume a fuzzer-chosen amount of input as surrogate-free text."""
+        return self.ConsumeUnicodeNoSurrogates(
+            self.ConsumeIntInRange(0, self.remaining_bytes())
+        )
+
+    def ConsumeRemainingString(self) -> str:
+        """Consume all remaining input as surrogate-free text."""
+        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        """Consume all remaining input as bytes."""
+        return self.ConsumeBytes(self.remaining_bytes())
+
+    def _ConsumeFileData(self, all_data: bool, as_bytes: bool) -> Union[bytes, str]:
+        """Consume file content: everything left, or a fuzzer-chosen amount."""
+        if as_bytes:
+            if all_data:
+                return self.ConsumeRemainingBytes()
+            return self.ConsumeRandomBytes()
+        if all_data:
+            return self.ConsumeRemainingString()
+        return self.ConsumeRandomString()
+
+    @contextlib.contextmanager
+    def ConsumeMemoryFile(
+        self, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[Union[io.BytesIO, io.StringIO]]:
+        """Yield an in-memory file of consumed data; closed on exit or error."""
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)
+        with file:  # closed even when the caller's block raises
+            yield file
+
+    @contextlib.contextmanager
+    def ConsumeTemporaryFile(
+        self, suffix: str, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[str]:
+        """Yield the path of a temp file holding consumed data; deleted on exit."""
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        mode = "w+b" if as_bytes else "w+"
+        # `with` closes (and thereby deletes) the file even if the caller raises.
+        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
+            tfile.write(file_data)
+            tfile.flush()  # push buffered data out before handing out the path
+            yield tfile.name
+
+    def ConsumeEnum(self, enum_type: Type[T]) -> T:
+        """Return the member of `enum_type` for an int consumed from [0, MAX]."""
+        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))
diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py
new file mode 100644
index 00000000..d40800dc
--- /dev/null
+++ b/fuzz/pdf_load_fuzzer.py
@@ -0,0 +1,59 @@
+import sys
+from enum import IntEnum
+
+import atheris
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports(include=["pdfplumber"]):
+    from pdfminer.pdftypes import PDFException
+    from pdfminer.psparser import PSException
+
+    import pdfplumber
+
+
+class CastType(IntEnum):
+    """Export path to exercise; MAX is the upper bound and exports nothing."""
+
+    CSV = 0
+    IMAGE = 1
+    JSON = 2
+    DICT = 3
+    MAX = 4
+
+
+def TestOneInput(data: bytes):
+    """Open fuzzed bytes as a PDF and run one export; return -1 on expected errors."""
+    fdp = EnhancedFuzzedDataProvider(data)
+    try:
+        # Using the PDF as a context manager closes it even when an export raises.
+        with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f, pdfplumber.open(f) as pdf:
+            # Test casting
+            cast_ty = fdp.ConsumeEnum(CastType)
+            if cast_ty is CastType.CSV:
+                pdf.to_csv()
+            elif cast_ty is CastType.IMAGE and pdf.pages:
+                pdf.pages[0].to_image()
+            elif cast_ty is CastType.JSON:
+                pdf.to_json()
+            elif cast_ty is CastType.DICT:
+                pdf.to_dict()
+    except (PDFException, PSException, AssertionError):
+        return -1
+    except ValueError as e:
+        if "invalid literal for int" in str(e):
+            return -1
+        raise
+    except TypeError as e:
+        if "argument must be a string" in str(e):
+            return -1
+        raise
+
+
+def main():
+    """Register the fuzz target with Atheris and start fuzzing."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()