diff --git a/CHANGELOG.md b/CHANGELOG.md
index 669b8d338c..1f12fbf4a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,12 +6,13 @@
 - **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
 
 ### Fixes
 
+- **Add auto-download of NLTK data for Python environments** When the library is installed as a plain Python package rather than via the Docker image, the required NLTK data is now downloaded automatically by `tokenize.py`.
 - **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.
+- **Drop usage of the ndjson dependency**
 
 ## 0.16.15
 
-
 ### Enhancements
 
 ### Features
diff --git a/requirements/base.txt b/requirements/base.txt
index 608da24066..e28e6ec038 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -64,7 +64,7 @@ langdetect==1.0.9
     # via -r ./base.in
 lxml==5.3.0
     # via -r ./base.in
-marshmallow==3.25.1
+marshmallow==3.26.0
     # via
     #   dataclasses-json
     #   unstructured-client
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
index f4f22d18fd..f340b17ef6 100644
--- a/requirements/extra-paddleocr.txt
+++ b/requirements/extra-paddleocr.txt
@@ -32,7 +32,7 @@ exceptiongroup==1.2.2
     # via
     #   -c ./base.txt
     #   anyio
-fonttools==4.55.4
+fonttools==4.55.5
     # via matplotlib
 h11==0.14.0
     # via
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index 59fcfb8326..df6369f22b 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -42,7 +42,7 @@ filelock==3.17.0
     #   transformers
 flatbuffers==25.1.21
     # via onnxruntime
-fonttools==4.55.4
+fonttools==4.55.5
     # via matplotlib
 fsspec==2024.12.0
     # via
@@ -50,7 +50,7 @@ fsspec==2024.12.0
     #   torch
 google-api-core[grpc]==2.24.0
     # via google-cloud-vision
-google-auth==2.37.0
+google-auth==2.38.0
     # via
     #   google-api-core
     #   google-cloud-vision
diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt
index 5740ad2f5a..389ef87cc6 100644
--- a/requirements/extra-pptx.txt
+++ b/requirements/extra-pptx.txt
@@ -12,5 +12,5 @@ python-pptx==1.0.2
     # via -r ./extra-pptx.in
 typing-extensions==4.12.2
     # via python-pptx
-xlsxwriter==3.2.0
+xlsxwriter==3.2.1
     # via python-pptx
diff --git a/requirements/test.txt b/requirements/test.txt
index 87b9d7cc52..e9e974a16f 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -54,7 +54,7 @@ exceptiongroup==1.2.2
     # via
     #   -c ./base.txt
     #   anyio
     #   pytest
-faker==34.0.0
+faker==35.0.0
     # via jsf
 flake8==7.1.1
     # via
diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
index 1bababb32d..99ce2a076d 100644
--- a/unstructured/nlp/tokenize.py
+++ b/unstructured/nlp/tokenize.py
@@ -12,11 +12,6 @@
 CACHE_MAX_SIZE: Final[int] = 128
 
 
-def download_nltk_packages():
-    nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-    nltk.download("punkt_tab", quiet=True)
-
-
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     """Checks to see if the specified NLTK package exists on the image."""
     paths: list[str] = []
@@ -32,6 +27,30 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     return False
 
 
+# We cache this because we do not want to attempt
+# downloading the packages multiple times
+@lru_cache()
+def download_nltk_packages():
+    """If the required NLTK packages are not available, download them."""
+
+    tagger_available = check_for_nltk_package(
+        package_category="taggers",
+        package_name="averaged_perceptron_tagger_eng",
+    )
+    tokenizer_available = check_for_nltk_package(
+        package_category="tokenizers", package_name="punkt_tab"
+    )
+
+    if (not tokenizer_available) or (not tagger_available):
+        nltk.download("averaged_perceptron_tagger_eng", quiet=True)
+        nltk.download("punkt_tab", quiet=True)
+
+
+# Auto-download NLTK packages at import time unless the environment variable disables it
+if os.getenv("AUTO_DOWNLOAD_NLTK", "True").lower() == "true":
+    download_nltk_packages()
+
+
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
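Note that the new module-level gate runs when `unstructured.nlp.tokenize` is first imported, so the download happens as an import side effect. Below is a minimal sketch of how a consumer of this change could opt out of the import-time download and trigger it explicitly instead; only `AUTO_DOWNLOAD_NLTK` and `download_nltk_packages` come from this diff, the surrounding script is illustrative:

```python
import os

# Must be set before unstructured.nlp.tokenize is first imported; any value
# other than "true" (case-insensitive) skips the import-time download.
os.environ["AUTO_DOWNLOAD_NLTK"] = "false"

from unstructured.nlp.tokenize import download_nltk_packages

# Trigger the download explicitly at a convenient time instead. Because the
# function is wrapped in @lru_cache(), repeated calls within the same process
# perform the availability check at most once.
download_nltk_packages()
```

Since `download_nltk_packages` now checks `check_for_nltk_package` before calling `nltk.download`, environments that already ship the tagger and tokenizer data (e.g. the Docker image) pay only the cost of the local lookup, not a network round trip.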