Skip to content

Commit

Permalink
Initial implementation of fuzzing harness
Browse files Browse the repository at this point in the history
Added pipeline

Fix path in build script

Formatting

Add
  • Loading branch information
capuanob committed Jan 5, 2025
1 parent c562774 commit 04224d3
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 0 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/cifuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Runs the project's OSS-Fuzz fuzzers on pushes to the main branches and on
# every pull request, uploading crashes and SARIF results.
name: CIFuzz
on:
  push:
    branches:
      - stable
      - develop
  pull_request:
# Drop all default token permissions; the job below opts back in to what it needs.
permissions: {}
jobs:
  Fuzzing:
    runs-on: ubuntu-latest
    permissions:
      # Required by the SARIF upload step.
      security-events: write
    steps:
      - name: Build Fuzzers
        id: build
        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfplumber'
          language: python
      - name: Run Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfplumber'
          language: python
          fuzz-seconds: 800
          output-sarif: true
      - name: Upload Crash
        # v3 of upload-artifact is deprecated; v4 is the supported major version.
        uses: actions/upload-artifact@v4
        if: failure() && steps.build.outcome == 'success'
        with:
          name: artifacts
          path: ./out/artifacts
      - name: Upload Sarif
        if: always() && steps.build.outcome == 'success'
        # v2 of the CodeQL action is deprecated; v3 is the supported major version.
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
          sarif_file: cifuzz-sarif/results.sarif
          checkout_path: cifuzz-sarif
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
- Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
- Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
- Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
- Add necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1]())

### Fixed

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
- [@wodny](https://github.com/wodny)
- [Michal Stolarczyk](https://github.com/stolarczyk)
- [Brandon Roberts](https://github.com/brandonrobertz)
- [@ennamarie19](https://github.com/ennamarie19/)

## Contributing

Expand Down
13 changes: 13 additions & 0 deletions fuzz/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash -eu
# OSS-Fuzz build script: installs pdfplumber, compiles every *_fuzzer.py
# harness under fuzz/, and packages the repository's PDFs as a seed corpus.

cd "$SRC"/pdfplumber
pip3 install .

# Build fuzzers in $OUT
# Collect NUL-delimited paths into an array so that file names containing
# whitespace or glob characters are passed through intact.
mapfile -d '' -t fuzzers < <(find fuzz -name '*_fuzzer.py' -print0)
for fuzzer in "${fuzzers[@]}"; do
  compile_python_fuzzer "$fuzzer"
done

mkdir -p fuzz/corpus
# Prune the destination directory so a re-run does not try to copy corpus
# files onto themselves.
find . -path ./fuzz/corpus -prune -o -type f -name '*.pdf' -exec cp {} fuzz/corpus/ \;
zip -q "$OUT/pdf_load_fuzzer_seed_corpus.zip" fuzz/corpus/*
91 changes: 91 additions & 0 deletions fuzz/fuzz_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/python3
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
import contextlib
import io
import tempfile
from enum import IntEnum
from typing import Iterator, Protocol, Type, TypeVar, Union

import atheris


class HasMax(Protocol):
    """Structural type for objects that expose an integer ``MAX`` attribute."""

    # Annotation only: no value is assigned here.
    MAX: int


# Type variable constrained to IntEnum subclasses.
T = TypeVar("T", bound=IntEnum)


class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
    """``atheris.FuzzedDataProvider`` with helpers for files and enums.

    Method names keep the CamelCase convention of the base class.
    """

    def ConsumeRandomBytes(self) -> bytes:
        """Consume a fuzzer-chosen number of bytes (0..remaining)."""
        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))

    def ConsumeRandomString(self) -> str:
        """Consume a surrogate-free string of fuzzer-chosen length."""
        return self.ConsumeUnicodeNoSurrogates(
            self.ConsumeIntInRange(0, self.remaining_bytes())
        )

    def ConsumeRemainingString(self) -> str:
        """Consume all remaining input as a surrogate-free string."""
        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())

    def ConsumeRemainingBytes(self) -> bytes:
        """Consume all remaining input as bytes."""
        return self.ConsumeBytes(self.remaining_bytes())

    def _consume_file_data(self, all_data: bool, as_bytes: bool) -> Union[bytes, str]:
        """Consume the payload for a generated file.

        Args:
            all_data: Use everything that is left instead of a random amount.
            as_bytes: Return ``bytes`` when true, ``str`` otherwise.
        """
        if all_data:
            if as_bytes:
                return self.ConsumeRemainingBytes()
            return self.ConsumeRemainingString()
        if as_bytes:
            return self.ConsumeRandomBytes()
        return self.ConsumeRandomString()

    @contextlib.contextmanager
    def ConsumeMemoryFile(
        self, all_data: bool = False, as_bytes: bool = True
    ) -> Iterator[Union[io.BytesIO, io.StringIO]]:
        """Yield an in-memory file filled with fuzzer data.

        Args:
            all_data: Use all remaining input instead of a random amount.
            as_bytes: Yield an ``io.BytesIO`` when true, else an ``io.StringIO``.

        Yields:
            The open in-memory file; it is closed when the ``with`` block
            exits, including when the block raises.
        """
        file_data = self._consume_file_data(all_data, as_bytes)
        file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)
        try:
            yield file
        finally:
            # Without the finally, an exception raised in the caller's
            # `with` body would skip the close.
            file.close()

    @contextlib.contextmanager
    def ConsumeTemporaryFile(
        self, suffix: str, all_data: bool = False, as_bytes: bool = True
    ) -> Iterator[str]:
        """Yield the path of a temporary file filled with fuzzer data.

        Args:
            suffix: File-name suffix passed to ``NamedTemporaryFile``.
            all_data: Use all remaining input instead of a random amount.
            as_bytes: Write in binary mode when true, text mode otherwise.

        Yields:
            The temporary file's path. The file is closed (and therefore
            removed) when the ``with`` block exits, including when the
            block raises.
        """
        file_data = self._consume_file_data(all_data, as_bytes)
        mode = "w+b" if as_bytes else "w+"
        # The inner `with` guarantees cleanup even if the caller's body raises.
        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
            tfile.write(file_data)
            tfile.seek(0)
            # Push buffered data to disk so readers opening the path see it.
            tfile.flush()
            yield tfile.name

    def ConsumeEnum(self, enum_type: Type[T]) -> T:
        """Return the ``enum_type`` member for an integer in ``0..enum_type.MAX``.

        NOTE(review): assumes ``enum_type`` defines ``MAX`` and that every
        integer in the range maps to a member; otherwise the enum lookup
        raises ``ValueError``.
        """
        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))
59 changes: 59 additions & 0 deletions fuzz/pdf_load_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sys
from enum import IntEnum

import atheris
from fuzz_helpers import EnhancedFuzzedDataProvider

with atheris.instrument_imports(include=["pdfplumber"]):
from pdfminer.pdftypes import PDFException
from pdfminer.psparser import PSException

import pdfplumber


class CastType(IntEnum):
    """Conversions the fuzzer can exercise on a loaded PDF.

    ``MAX`` is the highest value in the enum (4), listed last.
    """

    # Members are numbered consecutively from zero in declaration order.
    CSV, IMAGE, JSON, DICT, MAX = range(5)


def TestOneInput(data: bytes):
    """Fuzz entry point: open fuzzer bytes as a PDF and run one conversion.

    Args:
        data: Raw input supplied by the fuzzing engine.

    Returns:
        ``-1`` when the input is rejected with an expected parsing error
        (libFuzzer treats this as "do not add to the corpus"); otherwise
        ``None``.

    Raises:
        ValueError, TypeError: Re-raised unless the message matches one of
            the known, expected parsing failures.
    """
    fdp = EnhancedFuzzedDataProvider(data)

    try:
        with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f:
            # Open the PDF as a context manager so it is closed even when a
            # conversion below raises.
            with pdfplumber.open(f) as pdf:
                # Test casting
                cast_ty = fdp.ConsumeEnum(CastType)

                if cast_ty is CastType.CSV:
                    pdf.to_csv()
                elif cast_ty is CastType.IMAGE and pdf.pages:
                    pdf.pages[0].to_image()
                elif cast_ty is CastType.JSON:
                    pdf.to_json()
                elif cast_ty is CastType.DICT:
                    pdf.to_dict()

    except (PDFException, PSException, AssertionError):
        return -1
    except ValueError as e:
        if "invalid literal for int" in str(e):
            return -1
        # Bare raise preserves the original traceback.
        raise
    except TypeError as e:
        if "argument must be a string" in str(e):
            return -1
        raise


def main() -> None:
    """Register ``TestOneInput`` with Atheris and start fuzzing."""
    # Setup is given the process arguments and the per-input callback.
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


# Only start fuzzing when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 comments on commit 04224d3

Please sign in to comment.