Skip to content

Commit

Permalink
fix(filetype): handle missing libmagic library
Browse files Browse the repository at this point in the history
  • Loading branch information
metadaddy committed Nov 22, 2024
1 parent 626f73a commit 02e9176
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
### Features

### Fixes

- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output
- **Fixed ImportError when `libmagic` library is not installed** File type detection now correctly falls back to using `filetype` when the `magic` module is installed but cannot be imported because the underlying `libmagic` library is missing.

## 0.16.5

Expand Down
9 changes: 8 additions & 1 deletion test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ZipFileDifferentiator,
detect_filetype,
is_json_processable,
LIBMAGIC_AVAILABLE
)
from unstructured.file_utils.model import FileType

Expand Down Expand Up @@ -298,6 +299,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
assert file_type is expected_value


@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
Expand Down Expand Up @@ -466,7 +468,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
detect_filetype(file=f)

assert "WARNING" in caplog.text
assert "libmagic is unavailable but assists in filetype detection. Please cons" in caplog.text
assert "magic module is installed but libmagic is unavailable. Please cons" in caplog.text


# ================================================================================================
Expand Down Expand Up @@ -632,10 +634,12 @@ def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified(
detect_filetype()


@pytest.mark.skipif(
    not LIBMAGIC_AVAILABLE,
    reason="Skipping this test since libmagic is not available",
)
def test_it_detects_EMPTY_from_file_path_to_empty_file():
    """Detecting the `empty.txt` example document by path yields `FileType.EMPTY`.

    Skipped when `LIBMAGIC_AVAILABLE` is false.
    """
    empty_file_path = example_doc_path("empty.txt")

    detected_file_type = detect_filetype(empty_file_path)

    assert detected_file_type == FileType.EMPTY


@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def test_it_detects_EMPTY_from_empty_file_like_object():
with open(example_doc_path("empty.txt"), "rb") as f:
assert detect_filetype(file=f) == FileType.EMPTY
Expand Down Expand Up @@ -859,6 +863,7 @@ def it_knows_whether_it_is_a_zipfile(self, file_name: str, expected_value: bool)

# -- .mime_type ---------------------------------------------

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_path(self):
ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt"))
assert ctx.mime_type == "text/plain"
Expand All @@ -878,6 +883,7 @@ def but_it_warns_to_install_libmagic_when_the_filetype_lib_cannot_detect_the_MIM
assert "libmagic is unavailable" in caplog.text
assert "consider installing libmagic" in caplog.text

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_like_object(self):
with open(example_doc_path("norwich-city.txt"), "rb") as f:
ctx = _FileTypeDetectionContext(file=f)
Expand Down Expand Up @@ -1094,6 +1100,7 @@ class Describe_TextFileDifferentiator:

# -- .applies() ---------------------------------------------

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
"""The constructor determines whether this differentiator is applicable.
Expand Down
28 changes: 21 additions & 7 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,17 @@
from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty

LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
# Two separate conditions are tracked because the `magic` Python module can be installed while
# importing it still raises `ImportError` (the case this module must survive when the libmagic
# library is not installed).

# -- Is the magic *module* available (i.e. can an import spec be found for it)? --
MAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))

# -- Is the libmagic *library* also available? Only a successful import proves it. --
LIBMAGIC_AVAILABLE = False
if MAGIC_AVAILABLE:
    try:
        import magic
    except ImportError:
        # -- module is present but unusable; leave LIBMAGIC_AVAILABLE False so that callers
        # -- checking the flag take their non-libmagic path instead of crashing at import time
        pass
    else:
        LIBMAGIC_AVAILABLE = True


def detect_filetype(
Expand Down Expand Up @@ -359,8 +369,6 @@ def mime_type(self) -> str | None:
file_path = self.file_path

if LIBMAGIC_AVAILABLE:
import magic

mime_type = (
magic.from_file(file_path, mime=True)
if file_path
Expand All @@ -371,10 +379,16 @@ def mime_type(self) -> str | None:
mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)

if mime_type is None:
logger.warning(
"libmagic is unavailable but assists in filetype detection. Please consider"
" installing libmagic for better results."
)
if MAGIC_AVAILABLE:
logger.warning(
"The magic module is installed but libmagic is unavailable. Please consider"
" installing libmagic for better filetype detection results."
)
else:
logger.warning(
"The magic module is unavailable but assists in filetype detection. Please consider"
" installing magic for better results."
)
return None

return mime_type.lower()
Expand Down

0 comments on commit 02e9176

Please sign in to comment.