diff --git a/CHANGELOG.md b/CHANGELOG.md index 687a325a..55fea4db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). +## [0.10.3] - 2023-10-26 + +### Added + +- Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pulls/961)) +- Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953)) + +### Fixed + +- Respect `use_text_flow` in `extract_text` (h/t @dhdaines). ([#983](https://github.com/jsvine/pdfplumber/pulls/983)) + ## [0.10.2] - 2023-07-29 ### Added diff --git a/CITATION.cff b/CITATION.cff index 9e5bb153..87516ee9 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 title: pdfplumber type: software -version: 0.10.2 -date-released: "2023-07-29" +version: 0.10.3 +date-released: "2023-10-26" authors: - family-names: "Singer-Vine" given-names: "Jeremy" diff --git a/README.md b/README.md index 54a5b9c3..335b1fff 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,8 @@ Each object is represented as a simple Python `dict`, with the following propert |`bottom`| Distance of bottom of the character from top of page.| |`doctop`| Distance of top of character from top of document.| |`matrix`| The "current transformation matrix" for this character. (See below for details.)| +|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this character if any (otherwise `None`). *Experimental attribute.*| +|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this character if any (otherwise `None`). 
*Experimental attribute.*| |`ncs`|TKTK| |`stroking_pattern`|TKTK| |`non_stroking_pattern`|TKTK| @@ -191,6 +193,8 @@ my_char_rotation = my_char_ctm.skew_x |`linewidth`| Thickness of line.| |`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.| |`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.| +|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this line if any (otherwise `None`). *Experimental attribute.*| +|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this line if any (otherwise `None`). *Experimental attribute.*| |`object_type`| "line"| #### `rect` properties @@ -210,6 +214,8 @@ my_char_rotation = my_char_ctm.skew_x |`linewidth`| Thickness of line.| |`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.| |`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.| +|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this rect if any (otherwise `None`). *Experimental attribute.*| +|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this rect if any (otherwise `None`). *Experimental attribute.*| |`object_type`| "rect"| #### `curve` properties @@ -231,6 +237,8 @@ my_char_rotation = my_char_ctm.skew_x |`fill`| Whether the shape defined by the curve's path is filled.| |`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.| |`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.| +|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this curve if any (otherwise `None`). 
*Experimental attribute.*| +|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this curve if any (otherwise `None`). *Experimental attribute.*| |`object_type`| "curve"| #### Derived properties @@ -531,6 +539,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes - [Shannon Shen](https://github.com/lolipopshock) - [Matsumoto Toshi](https://github.com/toshi1127) - [John West](https://github.com/jwestwsj) +- [David Huggins-Daines](https://github.com/dhdaines) - [Jeremy B. Merrill](https://github.com/jeremybmerrill) ## Contributing diff --git a/docs/repairing.md b/docs/repairing.md index 6efd8e7b..34d9b18f 100644 --- a/docs/repairing.md +++ b/docs/repairing.md @@ -9,3 +9,7 @@ Malformed PDFs can often be [fixed via Ghostscript](https://superuser.com/questi - `pdfplumber.open(..., repair=True)` will repair your PDF on the fly (but not save the repaired version to disk). - `pdfplumber.repair(path_to_pdf)` will return a `BytesIO` object holding the bytes of a repaired version of the original file. - `pdfplumber.repair(path_to_pdf, outfile="path/to/repaired.pdf")` will write a repaired version of the original file to the indicated `outfile` path. + +## Custom parameters + +- `gs_path=...`: You can pass a custom path for the Ghostscript executable, helpful in case `pdfplumber` is unable to auto-detect your copy of Ghostscript. 
class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Extract layout from a specific page, adding marked-content IDs to
    objects where found.

    Every layout object created by the `render_char`, `render_image` and
    `paint_path` hooks receives two extra attributes:

    - `mcid`: the `MCID` property of the marked-content section that was
      open when the object was created (`None` if there is none).
    - `tag`: the decoded name of that section's tag (`None` if there is none).
    """

    # Tag / MCID of the innermost marked-content section currently open.
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def _enclosing_sections(self) -> "List[tuple]":
        """Return the per-instance stack of enclosing `(tag, mcid)` pairs.

        The stack is created lazily on the instance (never on the class, where
        a mutable default would be shared by every device), so `__init__` of
        the parent class does not need to be overridden.
        """
        stack = self.__dict__.get("_enclosing_mc_sections")
        if stack is None:
            stack = self.__dict__["_enclosing_mc_sections"] = []
        return stack

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle beginning of tag, setting current MCID if any."""
        # Marked-content sections may nest; remember the enclosing section so
        # that `end_tag` can restore it rather than dropping it.
        self._enclosing_sections().append((self.cur_tag, self.cur_mcid))
        self.cur_tag = decode_text(tag.name)
        if isinstance(props, dict) and "MCID" in props:
            self.cur_mcid = props["MCID"]
        else:
            self.cur_mcid = None

    def end_tag(self) -> None:
        """Handle end of tag, restoring the enclosing section's tag and MCID
        (or clearing both when no section remains open)."""
        stack = self._enclosing_sections()
        if stack:
            self.cur_tag, self.cur_mcid = stack.pop()
        else:
            # Unbalanced end-of-section operator: nothing left to restore.
            self.cur_tag = None
            self.cur_mcid = None

    def _tag_items_from(self, start: int) -> None:
        """Tag every object appended to the current container at index
        `start` or later with the current MCID and tag.

        Tagging only the objects that were actually added means a hook that
        creates nothing neither raises nor overwrites the attributes of an
        object that belongs to an earlier marked-content section.
        """
        for obj in self.cur_item._objs[start:]:
            obj.mcid = self.cur_mcid  # type: ignore
            obj.tag = self.cur_tag  # type: ignore

    def tag_cur_item(self) -> None:
        """Add current MCID to what we hope to be the most recent object created
        by pdfminer.six. Does nothing if no object has been created yet."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs. In reading the code it's
        # clear that the `render_*` methods will only ever
        # create one object, but that is far from being guaranteed.
        # Even if pdfminer.six's API would just return the objects it
        # creates, we wouldn't have to do this.
        objs = self.cur_item._objs
        if not objs:
            return
        cur_obj = objs[-1]
        cur_obj.mcid = self.cur_mcid  # type: ignore
        cur_obj.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Hook for rendering characters, adding the `mcid` attribute."""
        start = len(self.cur_item._objs)
        adv = super().render_char(*args, **kwargs)
        self._tag_items_from(start)
        return adv

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering images, adding the `mcid` attribute."""
        start = len(self.cur_item._objs)
        super().render_image(*args, **kwargs)
        self._tag_items_from(start)

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering lines and curves, adding the `mcid` attribute."""
        # The parent implementation may create no object at all for a path,
        # so only the objects added during this call are tagged.
        start = len(self.cur_item._objs)
        super().paint_path(*args, **kwargs)
        self._tag_items_from(start)
Optional[Union[str, pathlib.Path]] = None, ) -> BytesIO: - executable = shutil.which("gs") or shutil.which("gswin32c") + executable = gs_path or shutil.which("gs") or shutil.which("gswin32c") if executable is None: # pragma: nocover raise Exception( "Cannot find Ghostscript, which is required for repairs.\n" @@ -52,8 +53,9 @@ def repair( path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO], outfile: Optional[Union[str, pathlib.Path]] = None, password: Optional[str] = None, + gs_path: Optional[Union[str, pathlib.Path]] = None, ) -> Optional[BytesIO]: - repaired = _repair(path_or_fp, password) + repaired = _repair(path_or_fp, password, gs_path=gs_path) if outfile: with open(outfile, "wb") as f: f.write(repaired.read()) diff --git a/pdfplumber/utils/clustering.py b/pdfplumber/utils/clustering.py index 34fd876f..961ab31c 100644 --- a/pdfplumber/utils/clustering.py +++ b/pdfplumber/utils/clustering.py @@ -40,7 +40,10 @@ def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num, def cluster_objects( - xs: List[R], key_fn: Union[Hashable, Callable[[R], T_num]], tolerance: T_num + xs: List[R], + key_fn: Union[Hashable, Callable[[R], T_num]], + tolerance: T_num, + preserve_order: bool = False, ) -> List[List[R]]: if not callable(key_fn): @@ -51,7 +54,12 @@ def cluster_objects( get_0, get_1 = itemgetter(0), itemgetter(1) - cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1) + if preserve_order: + cluster_tuples = [(x, cluster_dict.get(key_fn(x))) for x in xs] + else: + cluster_tuples = sorted( + ((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1 + ) grouped = itertools.groupby(cluster_tuples, key=get_1) diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py index 1196f7e7..4d946caa 100644 --- a/pdfplumber/utils/text.py +++ b/pdfplumber/utils/text.py @@ -225,7 +225,10 @@ def to_textmap( for i, ws in enumerate( cluster_objects( - words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance + 
words_sorted_doctop, + lambda x: float(x[0]["doctop"]), + y_tolerance, + preserve_order=presorted or use_text_flow, ) ): y_dist = ( diff --git a/tests/pdfs/issue-982-example.pdf b/tests/pdfs/issue-982-example.pdf new file mode 100644 index 00000000..9ad27ae7 Binary files /dev/null and b/tests/pdfs/issue-982-example.pdf differ diff --git a/tests/pdfs/mcid_example.pdf b/tests/pdfs/mcid_example.pdf new file mode 100644 index 00000000..2a90e125 Binary files /dev/null and b/tests/pdfs/mcid_example.pdf differ diff --git a/tests/test_convert.py b/tests/test_convert.py index 0fd03e14..400bdaa5 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -70,7 +70,7 @@ def test_csv(self): assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,' + ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,' ) io = StringIO() @@ -125,7 +125,7 @@ def test_cli_csv(self): assert res.decode("utf-8").split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,' + ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -141,6 +141,7 @@ def test_cli_csv_exclude(self): "3", "--exclude-attrs", "matrix", + "mcid", "ncs", "non_stroking_pattern", "stroking_pattern", @@ -150,7 +151,7 @@ def test_cli_csv_exclude(self): assert res.decode("utf-8").split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,TimesNewRomanPSMT," - ',,"(0, 0, 0)",,18.0,,,,,Y,,1,' + ',,"(0, 0, 0)",,18.0,,,,,,Y,,1,' ) def test_cli_csv_include(self): diff --git a/tests/test_issues.py b/tests/test_issues.py index cfbaedda..614b4c22 100644 --- a/tests/test_issues.py +++ b/tests/test_issues.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import logging import os +import re import 
class TestMCIDs(unittest.TestCase):
    """Test MCID extraction."""

    def test_mcids(self):
        """Check the `mcid` / `tag` attributes of chars, lines and curves.

        The text of each marked-content section is rebuilt by concatenating
        the characters that carry its MCID, prefixed with the section's tag.
        """
        path = os.path.join(HERE, "pdfs/mcid_example.pdf")

        # Open with a context manager so the file handle is always released,
        # as the other tests do.
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            # Check text of MCIDS
            mcids = []
            for c in page.chars:
                mcid = c.get("mcid")
                if mcid is None:
                    # Character outside any marked-content section with an MCID
                    continue
                # Grow the list so that index `mcid` exists
                while len(mcids) <= mcid:
                    mcids.append("")
                if not mcids[mcid]:
                    mcids[mcid] = c["tag"] + ": "
                mcids[mcid] += c["text"]
            assert mcids == [
                "Standard: Test of figures",
                "",
                "P: 1 ligne",
                "P: 2 ligne",
                "P: 3 ligne",
                "P: 4 ligne",
                "P: 0",
                "P: 2",
                "P: 4",
                "P: 6",
                "P: 8",
                "P: 10",
                "P: 12",
                "P: Figure 1: Chart",
                "",
                "P: 1 colonne",
                "P: 2 colonne",
                "P: 3 colonne",
            ]
            # Check line and curve MCIDs
            line_mcids = {x["mcid"] for x in page.lines}
            curve_mcids = {x["mcid"] for x in page.curves}
            assert all(x["tag"] == "Figure" for x in page.lines)
            assert all(x["tag"] == "Figure" for x in page.curves)
            assert line_mcids & {1, 14}
            assert curve_mcids & {1, 14}
            # No rects to test unfortunately!