Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(filetype): handle missing libmagic library #3790

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
### Features

### Fixes

- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output
- **Fixed ImportError when the `libmagic` library is not installed** File type detection now correctly falls back to using `filetype` when the `magic` module is installed but cannot be imported because the underlying `libmagic` library is missing.

## 0.16.5

Expand Down
9 changes: 8 additions & 1 deletion test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ZipFileDifferentiator,
detect_filetype,
is_json_processable,
LIBMAGIC_AVAILABLE
)
from unstructured.file_utils.model import FileType

Expand Down Expand Up @@ -298,6 +299,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
assert file_type is expected_value


@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
Expand Down Expand Up @@ -466,7 +468,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
detect_filetype(file=f)

assert "WARNING" in caplog.text
assert "libmagic is unavailable but assists in filetype detection. Please cons" in caplog.text
assert "magic module is installed but libmagic is unavailable. Please cons" in caplog.text


# ================================================================================================
Expand Down Expand Up @@ -632,10 +634,12 @@ def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified(
detect_filetype()


@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def test_it_detects_EMPTY_from_file_path_to_empty_file():
assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY


@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def test_it_detects_EMPTY_from_empty_file_like_object():
with open(example_doc_path("empty.txt"), "rb") as f:
assert detect_filetype(file=f) == FileType.EMPTY
Expand Down Expand Up @@ -859,6 +863,7 @@ def it_knows_whether_it_is_a_zipfile(self, file_name: str, expected_value: bool)

# -- .mime_type ---------------------------------------------

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_path(self):
ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt"))
assert ctx.mime_type == "text/plain"
Expand All @@ -878,6 +883,7 @@ def but_it_warns_to_install_libmagic_when_the_filetype_lib_cannot_detect_the_MIM
assert "libmagic is unavailable" in caplog.text
assert "consider installing libmagic" in caplog.text

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_like_object(self):
with open(example_doc_path("norwich-city.txt"), "rb") as f:
ctx = _FileTypeDetectionContext(file=f)
Expand Down Expand Up @@ -1094,6 +1100,7 @@ class Describe_TextFileDifferentiator:

# -- .applies() ---------------------------------------------

@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available")
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
"""The constructor determines whether this differentiator is applicable.

Expand Down
28 changes: 21 additions & 7 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,17 @@
from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty

LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
# Is the magic *module* available?
MAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))

# Is the libmagic *library* also available?
LIBMAGIC_AVAILABLE = False
if MAGIC_AVAILABLE:
try:
import magic
LIBMAGIC_AVAILABLE = True
except ImportError:
pass


def detect_filetype(
Expand Down Expand Up @@ -359,8 +369,6 @@ def mime_type(self) -> str | None:
file_path = self.file_path

if LIBMAGIC_AVAILABLE:
import magic

mime_type = (
magic.from_file(file_path, mime=True)
if file_path
Expand All @@ -371,10 +379,16 @@ def mime_type(self) -> str | None:
mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)

if mime_type is None:
logger.warning(
"libmagic is unavailable but assists in filetype detection. Please consider"
" installing libmagic for better results."
)
if MAGIC_AVAILABLE:
logger.warning(
"The magic module is installed but libmagic is unavailable. Please consider"
" installing libmagic for better filetype detection results."
)
else:
logger.warning(
"The magic module is unavailable but assists in filetype detection. Please consider"
" installing magic for better results."
)
return None

return mime_type.lower()
Expand Down