Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raised pillow dependency #23

Closed
wants to merge 2 commits into [base branch not captured] from [head branch not captured]
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""PDF catalog extraction."""
from decimal import Decimal
import logging
import re
from typing import Any, Dict, List, Union
Expand All @@ -7,7 +8,6 @@
from libpdf.parameters import ANNO_X_TOLERANCE, ANNO_Y_TOLERANCE
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.utils import decode_title, to_pdfplumber_bbox

from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral

Expand Down Expand Up @@ -405,7 +405,7 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): #
page.height,
)
page_crop = page.within_bbox(ann_bbox)
- ann_text = page_crop.extract_text(x_tolerance=1, y_tolerance=4)
+ ann_text = page_crop.extract_text(x_tolerance=float(1), y_tolerance=float(4))

if 'A' in ann_resolved:
# make sure ann_resolved['A'] is resolved
Expand Down
2 changes: 1 addition & 1 deletion libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ def extract_figures(
lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage

# check and filter figures
- figures = check_and_filter_figures(page_crop.figures)
+ figures = check_and_filter_figures(page_crop.objects['figure'])

if len(figures) != 0:
for idx_figure, figure in enumerate(figures):
Expand Down
20 changes: 11 additions & 9 deletions libpdf/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,23 @@

pdfminer's layout is used.
"""
import logging
from decimal import Decimal
import logging
from typing import List, Union

from libpdf import textbox
from libpdf import utils
from libpdf import textbox, utils
from libpdf.catalog import catalog
from libpdf.log import logging_needed
from libpdf.models.figure import Figure
from libpdf.models.page import Page
from libpdf.models.position import Position
from libpdf.models.table import Cell
from libpdf.models.table import Table
from libpdf.models.table import Cell, Table
from libpdf.parameters import LA_PARAMS
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.utils import from_pdfplumber_bbox, lt_to_libpdf_hbox_converter

from pdfminer.layout import LTPage, LTTextBoxHorizontal


LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -65,7 +63,11 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
'explicit_vertical_lines': [],
'explicit_horizontal_lines': [],
'snap_tolerance': 3,
"snap_x_tolerance": 3,
"snap_y_tolerance": 3,
'join_tolerance': 3,
"join_x_tolerance": 3,
"join_y_tolerance": 3,
'edge_min_length': 3,
'min_words_vertical': 3,
'min_words_horizontal': 1,
Expand All @@ -74,8 +76,8 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
'text_x_tolerance': 2,
'text_y_tolerance': 2,
'intersection_tolerance': 3,
- 'intersection_x_tolerance': None,
- 'intersection_y_tolerance': None,
+ 'intersection_x_tolerance': 3,
+ 'intersection_y_tolerance': 3,
}

table_dict = {'page': {}}
Expand Down Expand Up @@ -157,7 +159,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page
row_cell[1],
row_cell[2],
row_cell[3],
- Decimal(lt_page.height),
+ lt_page.height,
)
pos_cell = Position(pos_cell_bbox[0], pos_cell_bbox[1], pos_cell_bbox[2], pos_cell_bbox[3], page)
# extract cell text
Expand Down
17 changes: 5 additions & 12 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
pdfminer sees y0 and y1 from the bottom of the page, so y0 is smaller than y1.
All coordinates are given in points where 72 points are 1 inch.
"""
from difflib import SequenceMatcher
import logging
import re
from difflib import SequenceMatcher
from typing import Dict, List, Tuple, Union

from libpdf import parameters
Expand All @@ -51,14 +51,7 @@
)
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, textbox_crop

from pdfminer.layout import (
LTAnno,
LTChar,
LTText,
LTTextBox,
LTTextLineHorizontal,
)
from pdfminer.layout import LTAnno, LTChar, LTText, LTTextBox, LTTextLineHorizontal


LOG = logging.getLogger(__name__)
Expand Down Expand Up @@ -877,9 +870,9 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
if logging_needed(idx_page, len(pdf.pages)):
LOG.debug('Extracting layout page %s of %s', idx_page + 1, len(pdf.pages))

- pdf.interpreter.process_page(page.page_obj)
- result = pdf.device.get_result()
- lt_textboxes = [obj for obj in result if isinstance(obj, LTTextBox)]
+ # pdf.interpreter.process_page(page.page_obj)
+ layout_objects = page.layout._objs
+ lt_textboxes = [obj for obj in layout_objects if isinstance(obj, LTTextBox)]
# remove detected header and footer lt_textboxes based on given page crop margin parameter
filter_lt_textboxes = list(
filter(
Expand Down
8 changes: 4 additions & 4 deletions libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,10 @@ def to_pdfplumber_bbox(x0, y0, x1, y1, page_height):
:return: [x0, top, x1, bottom]
"""
# pylint: disable=invalid-name # short is better here
- ret_x0 = Decimal(x0)
- ret_y0 = Decimal(Decimal(page_height) - Decimal(y1))
- ret_x1 = Decimal(x1)
- ret_y1 = Decimal(Decimal(page_height) - Decimal(y0))
+ ret_x0 = x0
+ ret_y0 = page_height - y1
+ ret_x1 = x1
+ ret_y1 = page_height - y0
return [ret_x0, ret_y0, ret_x1, ret_y1]


Expand Down
Loading