Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update pillow to 10.2.0 #41

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed deps/pdfminer.six-20200517.dev1-py3-none-any.whl
Binary file not shown.
Binary file removed deps/pdfplumber-0.5.21.dev1-py3-none-any.whl
Binary file not shown.
1 change: 0 additions & 1 deletion libpdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

# The imports below from libpdf.core cannot be placed at the top of the file; this avoids
# circular import errors in core.py when it imports __version__ and __summary__
import libpdf._import_forks # noqa: F401
from libpdf.core import main_api as load
from libpdf.core import main_cli

Expand Down
36 changes: 0 additions & 36 deletions libpdf/_import_forks.py

This file was deleted.

2 changes: 1 addition & 1 deletion libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def get_explict_dest(pdf, dest_list):
return [dest_page_num, dest_rect_x, dest_rect_y]


def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): # pylint: disable=too-many-branches
def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> None: # pylint: disable=too-many-branches
"""
Fetch the name of annotation, annotation location on the page and destination of the link annotation.

Expand Down
19 changes: 12 additions & 7 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Core routines for PDF extraction."""

from __future__ import annotations

import itertools
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from typing import TYPE_CHECKING, List, Optional

import pdfplumber
import yaml
Expand Down Expand Up @@ -42,6 +44,9 @@
to_pdfplumber_bbox,
)

if TYPE_CHECKING:
from pdfplumber.page import Page as PdfplumberPage

LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -591,14 +596,15 @@ def extract_page_metadata(pdf):


def extract_figures(
pdf,
pages_list,
figure_dir,
) -> List[Figure]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up
pdf: pdfplumber.pdf.PDF,
pages_list: list[Page],
figure_dir: str,
) -> list[Figure]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up
"""Extract figures in PDF."""
LOG.info("Extracting figures ...")
figure_list = []

page: PdfplumberPage
for idx_page, page in enumerate( # pylint: disable=too-many-nested-blocks
tqdm(
pdf.pages,
Expand All @@ -611,9 +617,8 @@ def extract_figures(
LOG.debug("Extracting figures page %s of %s", idx_page + 1, len(pdf.pages))
page_crop = pro.remove_page_header_footer(page)
lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage

# check and filter figures
figures = check_and_filter_figures(page_crop.figures)
figures = check_and_filter_figures(page_crop.objects.get("image", []))

if len(figures) != 0:
for idx_figure, figure in enumerate(figures):
Expand Down
12 changes: 5 additions & 7 deletions libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
"""

import datetime
import decimal
import json
import logging
import os
import sys
from typing import Dict, List, Optional, Union

import ruamel.yaml
from pdfplumber.page import Page as PdfplumberPage
from ruamel.yaml.representer import RoundTripRepresenter

from libpdf import parameters
Expand All @@ -37,20 +37,18 @@
LOG = logging.getLogger(__name__)


def remove_page_header_footer(single_page):
def remove_page_header_footer(single_page: PdfplumberPage) -> PdfplumberPage:
"""Remove header and footer."""
page_crop = single_page.within_bbox(
return single_page.within_bbox(
(
0,
decimal.Decimal(parameters.PAGE_CROP_MARGINS["top"]),
parameters.PAGE_CROP_MARGINS["top"],
single_page.width,
single_page.height
- decimal.Decimal(parameters.PAGE_CROP_MARGINS["bottom"]),
- parameters.PAGE_CROP_MARGINS["bottom"],
),
)

return page_crop


class MyRepresenter(RoundTripRepresenter): # pylint: disable=too-few-public-methods
"""Customized representer of yaml."""
Expand Down
6 changes: 2 additions & 4 deletions libpdf/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,11 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
"edge_min_length": 3,
"min_words_vertical": 3,
"min_words_horizontal": 1,
"keep_blank_chars": False,
"text_keep_blank_chars": False,
"text_tolerance": 3,
"text_x_tolerance": 2,
"text_y_tolerance": 2,
"intersection_tolerance": 3,
"intersection_x_tolerance": None,
"intersection_y_tolerance": None,
}

table_dict = {"page": {}}
Expand Down Expand Up @@ -164,7 +162,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page
row_cell[1],
row_cell[2],
row_cell[3],
Decimal(lt_page.height),
lt_page.height,
)
pos_cell = Position(
pos_cell_bbox[0],
Expand Down
4 changes: 1 addition & 3 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,9 +957,7 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
if logging_needed(idx_page, len(pdf.pages)):
LOG.debug("Extracting layout page %s of %s", idx_page + 1, len(pdf.pages))

pdf.interpreter.process_page(page.page_obj)
result = pdf.device.get_result()
lt_textboxes = [obj for obj in result if isinstance(obj, LTTextBox)]
lt_textboxes = [obj for obj in page.layout._objs if isinstance(obj, LTTextBox)]
# remove detected header and footer lt_textboxes based on given page crop margin parameter
filter_lt_textboxes = list(
filter(
Expand Down
23 changes: 8 additions & 15 deletions libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,12 @@ def to_pdfplumber_bbox(
x1: float,
y1: float,
page_height: float,
) -> list[Decimal]:
) -> list[float]:
"""
Convert PDF standard or pdfminer bbox coordinates to pdfplumber bbox coordinates.

The function is needed because for pdfplumber:
- y coordinates are inverted
- Decimal type is needed

A diagram may help::

Expand Down Expand Up @@ -180,20 +179,15 @@ def to_pdfplumber_bbox(
:param page_height: height of the page
:return: [x0, top, x1, bottom]
"""
# pylint: disable=invalid-name # short is better here
ret_x0 = Decimal(x0)
ret_y0 = Decimal(Decimal(page_height) - Decimal(y1))
ret_x1 = Decimal(x1)
ret_y1 = Decimal(Decimal(page_height) - Decimal(y0))
return [ret_x0, ret_y0, ret_x1, ret_y1]
return [x0, page_height - y1, x1, page_height - y0]


def from_pdfplumber_bbox(
x0: Decimal,
top: Decimal,
x1: Decimal,
bottom: Decimal,
page_height: Decimal,
x0: float,
top: float,
x1: float,
bottom: float,
page_height: float,
) -> list[float]:
"""
Convert pdfplumber bbox coordinates to PDF standard.
Expand All @@ -205,8 +199,7 @@ def from_pdfplumber_bbox(
:param page_height: height of the page
:return: [x0, y0, x1, y1]
"""
# pylint: disable=invalid-name # short is better here
return [float(x0), float(page_height - bottom), float(x1), float(page_height - top)]
return [x0, page_height - bottom, x1, page_height - top]


def check_lt_obj_in_bbox(
Expand Down
Loading
Loading