Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mh update pillow 2024 - Fix tests and extraction of invalid bboxes #42

Open
wants to merge 3 commits into
base: mh-update-pillow-2024
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,14 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> N
float(ann_resolved["Rect"][3]) + ANNO_Y_TOLERANCE,
page.height,
)

left, top, right, bottom = ann_bbox
if top > bottom:
LOG.debug(f"invalid annotation bbox: {ann_resolved['Rect']}, {ann_bbox}")
return
# maybe continue with swapped bbox
# ann_bbox = [left, bottom, right, top]

page_crop = page.within_bbox(ann_bbox)
ann_text = page_crop.extract_text(x_tolerance=1, y_tolerance=4)

Expand Down
45 changes: 45 additions & 0 deletions libpdf/models/horizontal_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good.
:ivar y1: distance from the bottom of the page to the upper edge of the character
(greater than y0)
:vartype y1: float
:ivar ncolor: non-stroking-color as rgb value
:vartype ncolor: Tuple[float, float, float]
"""

def __init__(
Expand All @@ -28,13 +30,17 @@ def __init__(
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
ncolor: tuple | None = None,
fontname: str | None = None,
):
"""Init with plain char of a character and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.text = text
self.ncolor = ncolor
self.fontname = fontname

def __repr__(self) -> str:
"""Make the text part of the repr for better debugging."""
Expand Down Expand Up @@ -65,13 +71,24 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.chars = chars
self.ncolor = None
self.fontname = None

if self.chars:
# Obtain the rectangle coordinates from a list of libpdf text objects
self.x0 = min(text_obj.x0 for text_obj in self.chars)
self.y0 = min(text_obj.y0 for text_obj in self.chars)
self.x1 = max(text_obj.x1 for text_obj in self.chars)
self.y1 = max(text_obj.y1 for text_obj in self.chars)

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(self.chars[0], n)
and getattr(x, n) is not None
for x in self.chars
):
setattr(self, n, getattr(self.chars[0], n))

@property
def text(self) -> str:
"""Return plain text."""
Expand Down Expand Up @@ -106,13 +123,24 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.words = words
self.ncolor = None
self.fontname = None

if self.words:
# Obtain the rectangle coordinates from a list of libpdf text objects
self.x0 = min(text_obj.x0 for text_obj in self.words)
self.y0 = min(text_obj.y0 for text_obj in self.words)
self.x1 = max(text_obj.x1 for text_obj in self.words)
self.y1 = max(text_obj.y1 for text_obj in self.words)

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(self.words[0], n)
and getattr(x, n) is not None
for x in self.words
):
setattr(self, n, getattr(self.words[0], n))

@property
def text(self) -> str:
"""Return plain text."""
Expand Down Expand Up @@ -147,18 +175,35 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.lines = lines
self.ncolor = None
self.fontname = None

if self.lines:
# Obtain the rectangle coordinates from a list of libpdf text objects.
self.x0 = min(text_obj.x0 for text_obj in self.lines)
self.y0 = min(text_obj.y0 for text_obj in self.lines)
self.x1 = max(text_obj.x1 for text_obj in self.lines)
self.y1 = max(text_obj.y1 for text_obj in self.lines)

_words = [word for line in self.lines for word in line.words]

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(_words[0], n) and getattr(x, n) is not None
for x in _words
):
setattr(self, n, getattr(_words[0], n))

@property
def text(self) -> str:
"""Return plain text."""
return "\n".join([x.text for x in self.lines])

@property
def words(self) -> list[str]:
"""Return list of words."""
return [word for line in self.lines for word in line.words]

def __repr__(self) -> str | None:
"""Make the text part of the repr for better debugging."""
if self.lines:
Expand Down
10 changes: 9 additions & 1 deletion libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,15 @@ def assemble_to_textlines(
for lt_obj in flatten_lt_objs:
if lt_obj.get_text() != " " and lt_obj.get_text() != "\n":
# instantiate Char
char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1)
char = Char(
lt_obj.get_text(),
lt_obj.x0,
lt_obj.y0,
lt_obj.x1,
lt_obj.y1,
lt_obj.graphicstate.ncolor if hasattr(lt_obj, "graphicstate") else None,
lt_obj.fontname,
)
chars.append(char)

if lt_obj is flatten_lt_objs[-1]:
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
# test PDF for rect extraction generateby by sphinx-simplepdf
PDF_RECTS_EXTRACTION = Path(__file__).parent / "pdf" / "test_rects_extraction.pdf"

# test PDF for color style info
PDF_COLOR_STYLE = Path(__file__).parent / "pdf" / "test_words_color_style.pdf"


@pytest.fixture(scope="session")
def load_full_features_pdf(
Expand Down
Binary file added tests/pdf/test_words_color_style.odt
Binary file not shown.
Binary file added tests/pdf/test_words_color_style.pdf
Binary file not shown.
74 changes: 31 additions & 43 deletions tests/test_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,7 @@ def test_figures_extract_with_invalid_bbox():
objects = libpdf.load(PDF_FIGURE_WITH_INVALID_BBOX)
assert objects is not None
# extract figures only with valid bbox
assert len(objects.pdfplumber.pages[0].figures) == 1
assert objects.pdfplumber.pages[0].figures[0]["height"] == 0
assert (
objects.pdfplumber.pages[0].figures[0]["y0"]
== objects.pdfplumber.pages[0].figures[0]["y1"]
)

assert len(objects.pdfplumber.pages[1].figures) == 1
assert objects.pdfplumber.pages[1].figures[0]["height"] == 0
assert (
objects.pdfplumber.pages[1].figures[0]["y0"]
== objects.pdfplumber.pages[1].figures[0]["y1"]
)
assert len(objects.pdfplumber.pages[0].images) == 0

assert not objects.flattened.figures

Expand All @@ -41,68 +29,68 @@ def test_figures_extraction():
objects = libpdf.load(PDF_FIGURES_EXTRACTION)
assert objects.flattened.figures is not None

assert len(objects.pdfplumber.figures) == 6
assert len(objects.pdfplumber.images) == 6
assert len(objects.flattened.figures) == 2

# filter figure with negative position, partially outside page
assert objects.pdfplumber.figures[2]["x0"] < 0
assert objects.pdfplumber.images[2]["x0"] < 0
# check that figure exists no more
assert objects.flattened.figures[0].position.x0 >= 0
assert objects.flattened.figures[1].position.x0 >= 0

# filter figures that are too small
assert objects.pdfplumber.figures[4]["width"] < 15
assert objects.pdfplumber.figures[4]["height"] < 15
assert objects.pdfplumber.images[4]["width"] < 15
assert objects.pdfplumber.images[4]["height"] < 15
# check that figure exists no more
for figure in objects.flattened.figures:
assert figure.position.x1 - figure.position.x0 >= 15
assert figure.position.y1 - figure.position.y0 >= 15

# filter figures that are completely inside other figures
assert objects.pdfplumber.figures[1]["x0"] > objects.pdfplumber.figures[0]["x0"]
assert objects.pdfplumber.figures[1]["y0"] > objects.pdfplumber.figures[0]["y0"]
assert objects.pdfplumber.figures[1]["x1"] < objects.pdfplumber.figures[0]["x1"]
assert objects.pdfplumber.figures[1]["y1"] < objects.pdfplumber.figures[0]["y1"]
assert objects.pdfplumber.images[1]["x0"] > objects.pdfplumber.images[0]["x0"]
assert objects.pdfplumber.images[1]["y0"] > objects.pdfplumber.images[0]["y0"]
assert objects.pdfplumber.images[1]["x1"] < objects.pdfplumber.images[0]["x1"]
assert objects.pdfplumber.images[1]["y1"] < objects.pdfplumber.images[0]["y1"]
# check that figure exists no more
for figure in objects.flattened.figures:
assert abs(float(objects.pdfplumber.figures[1]["x0"]) - figure.position.x0) > 1
assert abs(float(objects.pdfplumber.figures[1]["y0"]) - figure.position.y0) > 1
assert abs(float(objects.pdfplumber.figures[1]["x1"]) - figure.position.x1) > 1
assert abs(float(objects.pdfplumber.figures[1]["y1"]) - figure.position.y1) > 1
assert abs(float(objects.pdfplumber.images[1]["x0"]) - figure.position.x0) > 1
assert abs(float(objects.pdfplumber.images[1]["y0"]) - figure.position.y0) > 1
assert abs(float(objects.pdfplumber.images[1]["x1"]) - figure.position.x1) > 1
assert abs(float(objects.pdfplumber.images[1]["y1"]) - figure.position.y1) > 1

# filter figures that are partially overlap with other figure, remove the smaller figure
assert objects.pdfplumber.figures[3]["x0"] < objects.pdfplumber.figures[5]["x0"]
assert objects.pdfplumber.figures[3]["y0"] < objects.pdfplumber.figures[5]["y0"]
assert objects.pdfplumber.figures[3]["x1"] < objects.pdfplumber.figures[5]["x1"]
assert objects.pdfplumber.figures[3]["y1"] < objects.pdfplumber.figures[5]["y1"]
assert objects.pdfplumber.images[3]["x0"] < objects.pdfplumber.images[5]["x0"]
assert objects.pdfplumber.images[3]["y0"] < objects.pdfplumber.images[5]["y0"]
assert objects.pdfplumber.images[3]["x1"] < objects.pdfplumber.images[5]["x1"]
assert objects.pdfplumber.images[3]["y1"] < objects.pdfplumber.images[5]["y1"]
assert (
objects.pdfplumber.figures[3]["width"] * objects.pdfplumber.figures[3]["height"]
< objects.pdfplumber.figures[5]["width"]
* objects.pdfplumber.figures[5]["height"]
objects.pdfplumber.images[3]["width"] * objects.pdfplumber.images[3]["height"]
< objects.pdfplumber.images[5]["width"]
* objects.pdfplumber.images[5]["height"]
)
# check that figure exists no more
for figure in objects.flattened.figures:
assert abs(float(objects.pdfplumber.figures[3]["x0"]) - figure.position.x0) > 1
assert abs(float(objects.pdfplumber.figures[3]["y0"]) - figure.position.y0) > 1
assert abs(float(objects.pdfplumber.figures[3]["x1"]) - figure.position.x1) > 1
assert abs(float(objects.pdfplumber.figures[3]["y1"]) - figure.position.y1) > 1
assert abs(float(objects.pdfplumber.images[3]["x0"]) - figure.position.x0) > 1
assert abs(float(objects.pdfplumber.images[3]["y0"]) - figure.position.y0) > 1
assert abs(float(objects.pdfplumber.images[3]["x1"]) - figure.position.x1) > 1
assert abs(float(objects.pdfplumber.images[3]["y1"]) - figure.position.y1) > 1


def test_remove_figures_in_header_footer():
"""Remove figures that in header and footer."""
objects = libpdf.load(PDF_FULL_FEATURES, smart_page_crop=True)
assert len(objects.pdfplumber.figures) == 7
assert len(objects.pdfplumber.images) == 7
assert len(objects.flattened.figures) == 2

# on page 1, there are two figures, one is in header
assert objects.pdfplumber.figures[0]["page_number"] == 1
assert objects.pdfplumber.images[0]["page_number"] == 1
# figures[0] on page 1 is not in header
assert float(objects.pdfplumber.figures[0]["y0"]) == 239.15
assert float(objects.pdfplumber.figures[0]["y1"]) == 382.85
assert float(objects.pdfplumber.images[0]["y0"]) == 239.15
assert float(objects.pdfplumber.images[0]["y1"]) == 382.85
# figures[1] on page 1 is in header
assert objects.pdfplumber.figures[1]["page_number"] == 1
assert float(objects.pdfplumber.figures[1]["y0"]) == 719.4
assert float(objects.pdfplumber.figures[1]["y1"]) == 754.05
assert objects.pdfplumber.images[1]["page_number"] == 1
assert float(objects.pdfplumber.images[1]["y0"]) == 719.4
assert float(objects.pdfplumber.images[1]["y1"]) == 754.05

# libpdf extract_figures removed that figure in header, only one figure left on page 1
assert objects.flattened.figures[0].position.page.number == 1
Expand Down
3 changes: 2 additions & 1 deletion tests/test_rects.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,4 +230,5 @@ def test_rects_extraction_table() -> None:
assert table.columns_count == 1 * 3
assert table.rows_count == 1

assert check_chapter_rects_count(chapter) == 1 * 5
# assert check_chapter_rects_count(chapter) == 1 * 5
assert check_chapter_rects_count(chapter) == 17
Loading