From 340faf8864235e376f5dcfda0bb4d46dff1a1294 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 22 Jan 2025 13:43:30 -0800 Subject: [PATCH 01/11] added post_install in setup.py file --- CHANGELOG.md | 9 +++++++++ setup.py | 5 +++++ unstructured/__version__.py | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39740ec23c..25c24a5312 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.16.15-dev + +### Enhancements + +### Features + +### Fixes +- **Add auto-download for NLTK for Python Enviroment** When user install python library without image. It will automatic download nltk data. Added `entry_points` in `setup` in `setup.py` + ## 0.16.14 ### Enhancements diff --git a/setup.py b/setup.py index 3b698e12ec..0e8315072c 100644 --- a/setup.py +++ b/setup.py @@ -128,4 +128,9 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt", "py.typed"]}, + entry_points={ + "console_scripts": [ + "post_install_unstructured=unstructured.nlp.tokenize:download_nltk_packages" + ], + }, ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4b8c503c08..5913f9e648 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.14" # pragma: no cover +__version__ = "0.16.15-dev" # pragma: no cover From eccddc5dd8b670a4472d41546fa72ad80f1dd6a8 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 22 Jan 2025 15:27:28 -0800 Subject: [PATCH 02/11] upgraded requirements --- requirements/deps/constraints.txt | 4 +-- requirements/extra-paddleocr.txt | 20 +++++------ requirements/extra-pdf-image.txt | 60 ++++++++++++------------------- 3 files changed, 33 insertions(+), 51 deletions(-) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 5700719383..296dd366b5 100644 --- 
a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -3,8 +3,8 @@ # extras. Putting a dependency here will only affect dependency sets that contain them -- in other # words, if something does not require a constraint, it will not be installed. #################################################################################################### -# (jennings): Versions greater than 5.0 create dependency conflicts with other packages -protobuf<5.0 +# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3 +weaviate-client>=3.26.7,<4.0.0 # TODO: Constriant due to multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index d7c0fe7226..de5e3538e5 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.7.0 +anyio==4.8.0 # via # -c ./base.txt # httpx @@ -32,7 +32,7 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # anyio -fonttools==4.55.3 +fonttools==4.55.4 # via matplotlib h11==0.14.0 # via @@ -52,13 +52,13 @@ idna==3.10 # anyio # httpx # requests -imageio==2.36.1 +imageio==2.37.0 # via # imgaug # scikit-image imgaug==0.4.0 # via unstructured-paddleocr -importlib-resources==6.5.1 +importlib-resources==6.5.2 # via matplotlib kiwisolver==1.4.7 # via matplotlib @@ -86,9 +86,9 @@ numpy==1.26.4 # shapely # tifffile # unstructured-paddleocr -opencv-contrib-python==4.10.0.84 +opencv-contrib-python==4.11.0.86 # via unstructured-paddleocr -opencv-python==4.10.0.84 +opencv-python==4.11.0.86 # via # imgaug # unstructured-paddleocr @@ -113,10 +113,8 @@ pillow==11.1.0 # pdf2image # scikit-image # unstructured-paddleocr -protobuf==4.25.5 - # via - # -c ././deps/constraints.txt - # paddlepaddle 
+protobuf==5.29.3 + # via paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr pyparsing==3.2.1 @@ -175,4 +173,4 @@ urllib3==1.26.20 # -c ./base.txt # requests zipp==3.21.0 - # via importlib-resources + # via importlib-resources \ No newline at end of file diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 81b61276ef..beac3f39d6 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -6,7 +6,7 @@ # antlr4-python3-runtime==4.9.3 # via omegaconf -cachetools==5.5.0 +cachetools==5.5.1 # via google-auth certifi==2024.12.14 # via @@ -35,14 +35,14 @@ deprecated==1.2.15 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in -filelock==3.16.1 +filelock==3.17.0 # via # huggingface-hub # torch # transformers -flatbuffers==24.12.23 +flatbuffers==25.1.21 # via onnxruntime -fonttools==4.55.3 +fonttools==4.55.4 # via matplotlib fsspec==2024.12.0 # via @@ -60,14 +60,14 @@ googleapis-common-protos==1.66.0 # via # google-api-core # grpcio-status -grpcio==1.68.1 +grpcio==1.69.0 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.62.3 +grpcio-status==1.69.0 # via google-api-core -huggingface-hub==0.27.0 +huggingface-hub==0.27.1 # via # timm # tokenizers @@ -79,16 +79,12 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.1 +importlib-resources==6.5.2 # via matplotlib -iopath==0.1.10 - # via layoutparser jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -layoutparser==0.3.4 - # via unstructured-inference lxml==5.3.0 # via # -c ./base.txt @@ -107,7 +103,6 @@ numpy==1.26.4 # via # -c ./base.txt # contourpy - # layoutparser # matplotlib # onnx # onnxruntime @@ -126,10 +121,8 @@ onnx==1.17.0 # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference -opencv-python==4.10.0.84 - # via - # layoutparser - # unstructured-inference +opencv-python==4.11.0.86 + # via unstructured-inference packaging==24.2 # via # -c ./base.txt @@ 
-140,24 +133,21 @@ packaging==24.2 # transformers # unstructured-pytesseract pandas==2.2.3 - # via layoutparser + # via unstructured-inference pdf2image==1.17.0 - # via - # -r ./extra-pdf-image.in - # layoutparser + # via -r ./extra-pdf-image.in pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber pdfplumber==0.11.5 - # via layoutparser + # via unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in -pikepdf==9.5.0 +pikepdf==9.5.1 # via -r ./extra-pdf-image.in pillow==11.1.0 # via - # layoutparser # matplotlib # pdf2image # pdfplumber @@ -165,15 +155,12 @@ pillow==11.1.0 # pikepdf # torchvision # unstructured-pytesseract -portalocker==3.1.1 - # via iopath proto-plus==1.25.0 # via # google-api-core # google-cloud-vision -protobuf==4.25.5 +protobuf==5.29.3 # via - # -c ././deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -213,7 +200,6 @@ pytz==2024.2 pyyaml==6.0.2 # via # huggingface-hub - # layoutparser # omegaconf # timm # transformers @@ -233,12 +219,12 @@ requests==2.32.3 # transformers rsa==4.9 # via google-auth -safetensors==0.5.0 +safetensors==0.5.2 # via # timm # transformers scipy==1.13.1 - # via layoutparser + # via unstructured-inference six==1.17.0 # via # -c ./base.txt @@ -247,7 +233,7 @@ sympy==1.13.1 # via # onnxruntime # torch -timm==1.0.12 +timm==1.0.14 # via # effdet # unstructured-inference @@ -269,7 +255,6 @@ tqdm==4.67.1 # via # -c ./base.txt # huggingface-hub - # iopath # transformers transformers==4.44.2 # via unstructured-inference @@ -277,12 +262,11 @@ typing-extensions==4.12.2 # via # -c ./base.txt # huggingface-hub - # iopath # pypdf # torch -tzdata==2024.2 +tzdata==2025.1 # via pandas -unstructured-inference==0.8.1 +unstructured-inference==0.8.4 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in @@ -291,9 +275,9 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -wrapt==1.17.0 +wrapt==1.17.2 # via # -c 
./base.txt # deprecated zipp==3.21.0 - # via importlib-resources + # via importlib-resources \ No newline at end of file From 9d40c40f4165c111f27098ca7fdd5e7f27cfbb2b Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 22 Jan 2025 15:40:44 -0800 Subject: [PATCH 03/11] upgrade requirement versions --- requirements/base.txt | 13 ++++----- requirements/dev.txt | 21 ++++---------- requirements/extra-csv.txt | 4 +-- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 4 +-- requirements/extra-markdown.txt | 8 +----- requirements/extra-odt.txt | 4 +-- requirements/extra-paddleocr.txt | 24 ++++++---------- requirements/extra-pandoc.txt | 4 +-- requirements/extra-pdf-image.txt | 49 ++++++++++++++++++++------------ requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 6 ++-- requirements/huggingface.txt | 13 +++++---- requirements/test.txt | 37 +++++++----------------- 14 files changed, 82 insertions(+), 109 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 6fecb30c04..1df5595ddb 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./base.in # -anyio==4.7.0 +anyio==4.8.0 # via httpx backoff==2.2.1 # via -r ./base.in @@ -36,10 +36,8 @@ dataclasses-json==0.6.7 # unstructured-client deepdiff==8.1.1 # via unstructured-client -emoji==2.14.0 +emoji==2.14.1 # via -r ./base.in -exceptiongroup==1.2.2 - # via anyio filetype==1.2.0 # via -r ./base.in h11==0.14.0 @@ -64,7 +62,7 @@ langdetect==1.0.9 # via -r ./base.in lxml==5.3.0 # via -r ./base.in -marshmallow==3.23.2 +marshmallow==3.25.1 # via # dataclasses-json # unstructured-client @@ -131,7 +129,6 @@ typing-extensions==4.12.2 # via # -r ./base.in # anyio - # pypdf # python-oxmsg # typing-inspect # unstructured-client @@ -150,5 +147,5 @@ urllib3==1.26.20 # unstructured-client 
webencodings==0.5.1 # via html5lib -wrapt==1.17.0 +wrapt==1.17.2 # via -r ./base.in diff --git a/requirements/dev.txt b/requirements/dev.txt index 30e42eb0ac..f31650cc8e 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./dev.in @@ -15,14 +15,10 @@ click==8.1.8 # pip-tools distlib==0.3.9 # via virtualenv -filelock==3.16.1 +filelock==3.17.0 # via virtualenv -identify==2.6.4 +identify==2.6.6 # via pre-commit -importlib-metadata==8.5.0 - # via - # -c ././deps/constraints.txt - # build nodeenv==1.9.1 # via pre-commit packaging==24.2 @@ -36,7 +32,7 @@ platformdirs==4.3.6 # via # -c ./test.txt # virtualenv -pre-commit==4.0.1 +pre-commit==4.1.0 # via -r ./dev.in pyproject-hooks==1.2.0 # via @@ -46,17 +42,10 @@ pyyaml==6.0.2 # via # -c ./test.txt # pre-commit -tomli==2.2.1 - # via - # -c ./test.txt - # build - # pip-tools -virtualenv==20.28.1 +virtualenv==20.29.1 # via pre-commit wheel==0.45.1 # via pip-tools -zipp==3.21.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 496cd42fc1..f26d8ca9a1 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-csv.in @@ -20,5 +20,5 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.2 +tzdata==2025.1 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 01e7e2e24b..bae3499afa 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is 
autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index a9533059da..860b019c1d 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-epub.in # -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 243fd0b0da..3292b7b7c6 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,14 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-markdown.in # -importlib-metadata==8.5.0 - # via - # -c ././deps/constraints.txt - # markdown markdown==3.7 # via -r ./extra-markdown.in -zipp==3.21.0 - # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 28ebf301a6..2730ec6db3 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-odt.in @@ -8,7 +8,7 @@ lxml==5.3.0 # via # -c ./base.txt # python-docx -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index de5e3538e5..ddf0753a66 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following 
command: # # pip-compile ./extra-paddleocr.in @@ -20,7 +20,7 @@ charset-normalizer==3.4.1 # via # -c ./base.txt # requests -contourpy==1.3.0 +contourpy==1.3.1 # via matplotlib cycler==0.12.1 # via matplotlib @@ -28,10 +28,6 @@ cython==3.0.11 # via unstructured-paddleocr decorator==5.1.1 # via paddlepaddle -exceptiongroup==1.2.2 - # via - # -c ./base.txt - # anyio fonttools==4.55.4 # via matplotlib h11==0.14.0 @@ -58,15 +54,13 @@ imageio==2.37.0 # scikit-image imgaug==0.4.0 # via unstructured-paddleocr -importlib-resources==6.5.2 - # via matplotlib -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib lazy-loader==0.4 # via scikit-image -matplotlib==3.9.4 +matplotlib==3.10.0 # via imgaug -networkx==3.2.1 +networkx==3.4.2 # via # paddlepaddle # scikit-image @@ -133,11 +127,11 @@ requests==2.32.3 # via # -c ./base.txt # unstructured-paddleocr -scikit-image==0.24.0 +scikit-image==0.25.0 # via # imgaug # unstructured-paddleocr -scipy==1.13.1 +scipy==1.15.1 # via # imgaug # scikit-image @@ -154,7 +148,7 @@ sniffio==1.3.1 # via # -c ./base.txt # anyio -tifffile==2024.8.30 +tifffile==2025.1.10 # via scikit-image tqdm==4.67.1 # via @@ -172,5 +166,3 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -zipp==3.21.0 - # via importlib-resources \ No newline at end of file diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index 4125059733..fb62b5156d 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-pandoc.in # -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index beac3f39d6..412e34c971 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by 
pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-pdf-image.in @@ -23,7 +23,7 @@ charset-normalizer==3.4.1 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.0 +contourpy==1.3.1 # via matplotlib cryptography==44.0.0 # via @@ -79,30 +79,33 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.2 - # via matplotlib +iopath==0.1.10 + # via layoutparser jinja2==3.1.5 # via torch -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib +layoutparser==0.3.4 + # via unstructured-inference lxml==5.3.0 # via # -c ./base.txt # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.9.4 +matplotlib==3.10.0 # via # pycocotools # unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch numpy==1.26.4 # via # -c ./base.txt # contourpy + # layoutparser # matplotlib # onnx # onnxruntime @@ -119,10 +122,12 @@ onnx==1.17.0 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.20.1 # via unstructured-inference opencv-python==4.11.0.86 - # via unstructured-inference + # via + # layoutparser + # unstructured-inference packaging==24.2 # via # -c ./base.txt @@ -133,21 +138,24 @@ packaging==24.2 # transformers # unstructured-pytesseract pandas==2.2.3 - # via unstructured-inference + # via layoutparser pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via + # -r ./extra-pdf-image.in + # layoutparser pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber pdfplumber==0.11.5 - # via unstructured-inference + # via layoutparser pi-heif==0.21.0 # via -r ./extra-pdf-image.in pikepdf==9.5.1 # via -r ./extra-pdf-image.in pillow==11.1.0 # via + # layoutparser # matplotlib # pdf2image # pdfplumber @@ -155,6 +163,8 @@ pillow==11.1.0 # pikepdf # torchvision # unstructured-pytesseract +portalocker==3.1.1 + # via iopath proto-plus==1.25.0 # via # google-api-core @@ -200,6 +210,7 @@ pytz==2024.2 
pyyaml==6.0.2 # via # huggingface-hub + # layoutparser # omegaconf # timm # transformers @@ -223,8 +234,8 @@ safetensors==0.5.2 # via # timm # transformers -scipy==1.13.1 - # via unstructured-inference +scipy==1.15.1 + # via layoutparser six==1.17.0 # via # -c ./base.txt @@ -255,6 +266,7 @@ tqdm==4.67.1 # via # -c ./base.txt # huggingface-hub + # iopath # transformers transformers==4.44.2 # via unstructured-inference @@ -262,11 +274,11 @@ typing-extensions==4.12.2 # via # -c ./base.txt # huggingface-hub - # pypdf + # iopath # torch tzdata==2025.1 # via pandas -unstructured-inference==0.8.4 +unstructured-inference==0.8.1 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in @@ -279,5 +291,6 @@ wrapt==1.17.2 # via # -c ./base.txt # deprecated -zipp==3.21.0 - # via importlib-resources \ No newline at end of file + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 5740ad2f5a..f2989910d0 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-pptx.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 7f00c057a2..a9494b01cd 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,12 +1,12 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl -networkx==3.2.1 +networkx==3.4.2 # via -r ./extra-xlsx.in numpy==1.26.4 # via @@ -26,7 +26,7 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.2 +tzdata==2025.1 # via pandas xlrd==2.0.1 # via -r ./extra-xlsx.in diff --git 
a/requirements/huggingface.txt b/requirements/huggingface.txt index 7051a2233b..7ffe086e2d 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./huggingface.in @@ -16,7 +16,7 @@ click==8.1.8 # via # -c ./base.txt # sacremoses -filelock==3.16.1 +filelock==3.17.0 # via # huggingface-hub # torch @@ -25,7 +25,7 @@ fsspec==2024.12.0 # via # huggingface-hub # torch -huggingface-hub==0.27.0 +huggingface-hub==0.27.1 # via # tokenizers # transformers @@ -47,7 +47,7 @@ markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch numpy==1.26.4 # via @@ -74,7 +74,7 @@ requests==2.32.3 # transformers sacremoses==0.1.1 # via -r ./huggingface.in -safetensors==0.5.0 +safetensors==0.5.2 # via transformers sentencepiece==0.2.0 # via -r ./huggingface.in @@ -108,3 +108,6 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/test.txt b/requirements/test.txt index a7e1d2cfa2..6ccc9814a7 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,12 +1,12 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile ./test.in # annotated-types==0.7.0 # via pydantic -anyio==4.7.0 +anyio==4.8.0 # via # -c ./base.txt # httpx @@ -49,12 +49,7 @@ dnspython==2.7.0 # via email-validator email-validator==2.2.0 # via pydantic -exceptiongroup==1.2.2 - # via - # -c ./base.txt - # anyio - # pytest -faker==33.1.0 +faker==34.0.2 # via jsf flake8==7.1.1 # via @@ -66,7 +61,7 @@ freezegun==1.5.1 # via -r ./test.in genson==1.3.0 # via datamodel-code-generator -grpcio==1.68.1 +grpcio==1.69.0 # via # -c 
././deps/constraints.txt # -r ./test.in @@ -164,7 +159,7 @@ pycodestyle==2.12.1 # via # flake8 # flake8-print -pydantic[email]==2.10.4 +pydantic[email]==2.10.5 # via # -r ./test.in # datamodel-code-generator @@ -196,7 +191,7 @@ pyyaml==6.0.2 # via # datamodel-code-generator # vcrpy -referencing==0.35.1 +referencing==0.36.1 # via # jsonschema # jsonschema-specifications @@ -218,7 +213,7 @@ rpds-py==0.22.3 # referencing rstr==3.2.2 # via jsf -ruff==0.8.5 +ruff==0.9.2 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -233,16 +228,7 @@ sniffio==1.3.1 # -c ./base.txt # anyio toml==0.10.2 - # via - # datamodel-code-generator - # liccheck -tomli==2.2.1 - # via - # autoflake - # black - # coverage - # mypy - # pytest + # via liccheck tqdm==4.67.1 # via # -c ./base.txt @@ -261,15 +247,14 @@ typing-extensions==4.12.2 # via # -c ./base.txt # anyio - # black # faker # jsf # label-studio-sdk - # multidict # mypy # pydantic # pydantic-core -tzdata==2024.2 + # referencing +tzdata==2025.1 # via pandas ujson==5.10.0 # via label-studio-sdk @@ -281,7 +266,7 @@ urllib3==1.26.20 # vcrpy vcrpy==7.0.0 # via -r ./test.in -wrapt==1.17.0 +wrapt==1.17.2 # via # -c ./base.txt # smart-open From d73c0bbf3a22f28c0179e5f4bf148aa0ac8d462e Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 23 Jan 2025 06:03:12 -0800 Subject: [PATCH 04/11] changed contourpy version --- requirements/extra-paddleocr.txt | 2 +- requirements/extra-pdf-image.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 3858bcc703..55f83e6863 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -20,7 +20,7 @@ charset-normalizer==3.4.1 # via # -c ./base.txt # requests -contourpy==1.3.1 +contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 8ebc485c69..002378a340 100644 --- 
a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -23,7 +23,7 @@ charset-normalizer==3.4.1 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.1 +contourpy==1.3.0 # via matplotlib cryptography==44.0.0 # via From 0eebd86e3d699b85c93c78b2885d4afd88ab0111 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 23 Jan 2025 06:17:55 -0800 Subject: [PATCH 05/11] Resolve lib version issue --- requirements/extra-paddleocr.txt | 14 ++++++++------ requirements/extra-pdf-image.txt | 8 ++++---- requirements/extra-xlsx.txt | 2 +- requirements/huggingface.txt | 2 +- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 55f83e6863..de5e3538e5 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-paddleocr.in @@ -64,9 +64,9 @@ kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 # via scikit-image -matplotlib==3.10.0 +matplotlib==3.9.4 # via imgaug -networkx==3.4.2 +networkx==3.2.1 # via # paddlepaddle # scikit-image @@ -133,11 +133,11 @@ requests==2.32.3 # via # -c ./base.txt # unstructured-paddleocr -scikit-image==0.25.0 +scikit-image==0.24.0 # via # imgaug # unstructured-paddleocr -scipy==1.15.1 +scipy==1.13.1 # via # imgaug # scikit-image @@ -154,7 +154,7 @@ sniffio==1.3.1 # via # -c ./base.txt # anyio -tifffile==2025.1.10 +tifffile==2024.8.30 # via scikit-image tqdm==4.67.1 # via @@ -172,3 +172,5 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests +zipp==3.21.0 + # via importlib-resources \ No newline at end of file diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 002378a340..b0d3dffe67 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt 
@@ -83,7 +83,7 @@ importlib-resources==6.5.2 # via matplotlib jinja2==3.1.5 # via torch -kiwisolver==1.4.8 +kiwisolver==1.4.7 # via matplotlib lxml==5.3.0 # via @@ -91,13 +91,13 @@ lxml==5.3.0 # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.10.0 +matplotlib==3.9.4 # via # pycocotools # unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.4.2 +networkx==3.2.1 # via torch numpy==1.26.4 # via @@ -119,7 +119,7 @@ onnx==1.17.0 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.20.1 +onnxruntime==1.19.2 # via unstructured-inference opencv-python==4.11.0.86 # via unstructured-inference diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index a9494b01cd..9cfa9cf299 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -6,7 +6,7 @@ # et-xmlfile==2.0.0 # via openpyxl -networkx==3.4.2 +networkx==3.2.1 # via -r ./extra-xlsx.in numpy==1.26.4 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 7ffe086e2d..cc7a693954 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -47,7 +47,7 @@ markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy -networkx==3.4.2 +networkx==3.2.1 # via torch numpy==1.26.4 # via From 22de886045f012a4f02e1cefe1b4cf27104e1fd7 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 23 Jan 2025 06:30:30 -0800 Subject: [PATCH 06/11] Fix lint issue --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c0612c3bbb..1b1be7552a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.16-dev0" # pragma: no cover \ No newline at end of file +__version__ = "0.16.16-dev0" # pragma: no cover From de9a5d28b51c762e41496941338aab99393f905a Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 23 Jan 2025 09:02:43 -0800 Subject: [PATCH 07/11] compile requirements using 
pythn 3.9 --- requirements/base.txt | 7 +++++-- requirements/dev.txt | 5 +++-- requirements/extra-csv.txt | 2 +- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 2 +- requirements/extra-markdown.txt | 4 +++- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 4 ++-- requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.txt | 11 +++++------ requirements/extra-pptx.txt | 4 ++-- requirements/extra-xlsx.txt | 2 +- requirements/huggingface.txt | 5 +---- requirements/test.txt | 17 ++++++++++++++--- 14 files changed, 41 insertions(+), 28 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 1df5595ddb..091672d880 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./base.in @@ -38,6 +38,8 @@ deepdiff==8.1.1 # via unstructured-client emoji==2.14.1 # via -r ./base.in +exceptiongroup==1.2.2 + # via anyio filetype==1.2.0 # via -r ./base.in h11==0.14.0 @@ -62,7 +64,7 @@ langdetect==1.0.9 # via -r ./base.in lxml==5.3.0 # via -r ./base.in -marshmallow==3.25.1 +marshmallow==3.26.0 # via # dataclasses-json # unstructured-client @@ -129,6 +131,7 @@ typing-extensions==4.12.2 # via # -r ./base.in # anyio + # pypdf # python-oxmsg # typing-inspect # unstructured-client diff --git a/requirements/dev.txt b/requirements/dev.txt index 4b5843797c..a5ebd99214 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./dev.in @@ -19,7 +19,6 @@ filelock==3.17.0 # via virtualenv identify==2.6.6 # via pre-commit - importlib-metadata==8.6.1 # via # -c ././deps/constraints.txt @@ -56,6 +55,8 @@ virtualenv==20.29.1 # via pre-commit wheel==0.45.1 # via 
pip-tools +zipp==3.21.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index f26d8ca9a1..d4d50645e8 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-csv.in diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index bae3499afa..01e7e2e24b 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 860b019c1d..460408c418 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 4235fb6bf3..9d0a14da55 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-markdown.in @@ -10,3 +10,5 @@ importlib-metadata==8.6.1 # markdown markdown==3.7 # via -r ./extra-markdown.in +zipp==3.21.0 + # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 2730ec6db3..362c53ed74 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is 
autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index de5e3538e5..f340b17ef6 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -32,7 +32,7 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # anyio -fonttools==4.55.4 +fonttools==4.55.5 # via matplotlib h11==0.14.0 # via @@ -173,4 +173,4 @@ urllib3==1.26.20 # -c ./base.txt # requests zipp==3.21.0 - # via importlib-resources \ No newline at end of file + # via importlib-resources diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index fb62b5156d..dd397c3845 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index b0d3dffe67..df6369f22b 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-pdf-image.in @@ -42,7 +42,7 @@ filelock==3.17.0 # transformers flatbuffers==25.1.21 # via onnxruntime -fonttools==4.55.4 +fonttools==4.55.5 # via matplotlib fsspec==2024.12.0 # via @@ -50,7 +50,7 @@ fsspec==2024.12.0 # torch google-api-core[grpc]==2.24.0 # via google-cloud-vision -google-auth==2.37.0 +google-auth==2.38.0 # via # google-api-core # google-cloud-vision @@ -276,6 +276,5 @@ wrapt==1.17.2 # via # -c ./base.txt # deprecated - -# The following packages are considered to be unsafe in a requirements file: -# setuptools +zipp==3.21.0 + # via 
importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index f2989910d0..389ef87cc6 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-pptx.in @@ -12,5 +12,5 @@ python-pptx==1.0.2 # via -r ./extra-pptx.in typing-extensions==4.12.2 # via python-pptx -xlsxwriter==3.2.0 +xlsxwriter==3.2.1 # via python-pptx diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 9cfa9cf299..b0c6cadbf7 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index cc7a693954..e614f90a3b 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./huggingface.in @@ -108,6 +108,3 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/test.txt b/requirements/test.txt index e7b1601ea0..e9e974a16f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile ./test.in @@ -54,7 +54,7 @@ exceptiongroup==1.2.2 # -c ./base.txt # anyio # pytest -faker==34.0.0 +faker==35.0.0 # via jsf flake8==7.1.1 # via @@ -233,7 +233,16 
@@ sniffio==1.3.1 # -c ./base.txt # anyio toml==0.10.2 - # via liccheck + # via + # datamodel-code-generator + # liccheck +tomli==2.2.1 + # via + # autoflake + # black + # coverage + # mypy + # pytest tqdm==4.67.1 # via # -c ./base.txt @@ -252,9 +261,11 @@ typing-extensions==4.12.2 # via # -c ./base.txt # anyio + # black # faker # jsf # label-studio-sdk + # multidict # mypy # pydantic # pydantic-core From f1f953dfc7c620588ea89567d7567fc54ad3f01d Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 24 Jan 2025 11:32:40 -0800 Subject: [PATCH 08/11] Added download function in tokenize.py --- setup.py | 5 ----- unstructured/nlp/tokenize.py | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 0e8315072c..3b698e12ec 100644 --- a/setup.py +++ b/setup.py @@ -128,9 +128,4 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt", "py.typed"]}, - entry_points={ - "console_scripts": [ - "post_install_unstructured=unstructured.nlp.tokenize:download_nltk_packages" - ], - }, ) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 1bababb32d..adc24fe75d 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -12,9 +12,27 @@ CACHE_MAX_SIZE: Final[int] = 128 +# We cache this because we do not want to attempt +# downloading the packages multiple times +@lru_cache() def download_nltk_packages(): - nltk.download("averaged_perceptron_tagger_eng", quiet=True) - nltk.download("punkt_tab", quiet=True) + """If required NLTK packages are not available, download them.""" + + tagger_available = check_for_nltk_package( + package_category="taggers", + package_name="averaged_perceptron_tagger_eng", + ) + tokenizer_available = check_for_nltk_package( + package_category="tokenizers", package_name="punkt_tab" + ) + + if (not tokenizer_available) or (not 
tagger_available): + download_nltk_packages() + + +# auto download nltk packages if the environment variable is set +if os.getenv("AUTO_DOWNLOAD_NLTK", True): + download_nltk_packages() def check_for_nltk_package(package_name: str, package_category: str) -> bool: From 9f4df12a9f89f3be520e150cf33538c5fc8456d9 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 24 Jan 2025 12:06:24 -0800 Subject: [PATCH 09/11] Fixed import issue --- unstructured/nlp/tokenize.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index adc24fe75d..b781bc781f 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -12,6 +12,21 @@ CACHE_MAX_SIZE: Final[int] = 128 +def check_for_nltk_package(package_name: str, package_category: str) -> bool: + """Checks to see if the specified NLTK package exists on the image.""" + paths: list[str] = [] + for path in nltk.data.path: + if not path.endswith("nltk_data"): + path = os.path.join(path, "nltk_data") + paths.append(path) + + try: + nltk.find(f"{package_category}/{package_name}", paths=paths) + return True + except (LookupError, OSError): + return False + + # We cache this because we do not want to attempt # downloading the packages multiple times @lru_cache() @@ -31,25 +46,10 @@ def download_nltk_packages(): # auto download nltk packages if the environment variable is set -if os.getenv("AUTO_DOWNLOAD_NLTK", True): +if os.getenv("AUTO_DOWNLOAD_NLTK", "True").lower() == "true": download_nltk_packages() -def check_for_nltk_package(package_name: str, package_category: str) -> bool: - """Checks to see if the specified NLTK package exists on the image.""" - paths: list[str] = [] - for path in nltk.data.path: - if not path.endswith("nltk_data"): - path = os.path.join(path, "nltk_data") - paths.append(path) - - try: - nltk.find(f"{package_category}/{package_name}", paths=paths) - return True - except (LookupError, 
OSError): - return False - - @lru_cache(maxsize=CACHE_MAX_SIZE) def sent_tokenize(text: str) -> List[str]: """A wrapper around the NLTK sentence tokenizer with LRU caching enabled.""" From df239430855a3351bd35f3995553030735acf329 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 24 Jan 2025 12:28:12 -0800 Subject: [PATCH 10/11] Fixed recurssion issue --- unstructured/nlp/tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index b781bc781f..99ce2a076d 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -42,7 +42,8 @@ def download_nltk_packages(): ) if (not tokenizer_available) or (not tagger_available): - download_nltk_packages() + nltk.download("averaged_perceptron_tagger_eng", quiet=True) + nltk.download("punkt_tab", quiet=True) # auto download nltk packages if the environment variable is set From d40220b717d9d507c6b6e3a7857a5b3d360a9c88 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 24 Jan 2025 13:18:32 -0800 Subject: [PATCH 11/11] modfied change log --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 858a62c364..1f12fbf4a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ - **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication. ### Fixes -- **Add auto-download for NLTK for Python Enviroment** When user install python library without image. It will automatic download nltk data. Added `entry_points` in `setup` in `setup.py` +- **Add auto-download for NLTK for Python Environment** When a user installs the Python library without the image, the NLTK data is now downloaded automatically by the `tokenize.py` module. - **Correctly patch pdfminer to avoid PDF repair**.
The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.