Skip to content

Commit

Permalink
Add importers grouping by file extensions (#1268)
Browse files Browse the repository at this point in the history
<!-- Contributing guide:
https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md
-->

### Summary
CVS-126682
This PR groups importers by the file extensions they require, so that format
detection only tries the importers whose extensions are actually present in the
dataset directory tree. This speeds up the detection process.
| File Extension | # of importers |
| --- | --- |
| `.txt` | 24 |
| `.json` | 23 |
| `.png` | 17 |
| `.jpg` | 9 |
| `.csv` | 8 |
| `.xml` | 4 |
| `.sr`, `.hdr`, `.dib`, `.tiff`, `.pgm`, `.bmp`, `.tga`, `.tif`, `.jpe`, `.pnm`, `.webp`, `.pxm`, `.pbm`, `.jp2`, `.jpeg`, `.ppm`, `.ras`, `.exr`, `.pfm`, `.pic` | 3 |
| `.gz`, `.tfrecord`, `.mpeg`, `.f4v`, `.m2p`, `.ps`, `.mp4`, `.ogx`, `.wmv`, `.vob`, `.m2ts`, `.3g2`, `.qt`, `.webm`, `.mk3d`, `.divx`, `.mov`, `.mkv`, `.3gp`, `.ogg`, `.evo`, `.ogv`, `.mpg`, `.ts`, `.asf`, `.mxf`, `.avi`, `.rmvb`, `.flv` | 2 |
| `.p`, ` ` (empty, i.e. no file extension), `.meta`, `.datum`, `.arrow`, `.zip`, `.mat`, `.h5`, `data` | 1 |

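Total number of importers: 74. The groups listed above can overlap: an importer is counted in every group whose extension it requires, which is why the per-group counts add up to more than 74.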


### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Ilya Trushkin <[email protected]>
  • Loading branch information
itrushkin authored Feb 20, 2024
1 parent a311227 commit c070455
Show file tree
Hide file tree
Showing 63 changed files with 1,225 additions and 713 deletions.
3 changes: 2 additions & 1 deletion src/datumaro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
SubsetBase,
)
from .components.dataset_item_storage import ItemStatus
from .components.environment import Environment, PluginRegistry
from .components.environment import Environment
from .components.exporter import Exporter, ExportErrorPolicy, FailingExportErrorPolicy
from .components.hl_ops import HLOps
from .components.importer import FailingImportErrorPolicy, Importer, ImportErrorPolicy
Expand All @@ -63,6 +63,7 @@
SimpleProgressReporter,
TQDMProgressReporter,
)
from .components.registry import PluginRegistry
from .components.transformer import ItemTransform, ModelTransform, Transform
from .components.validator import Validator
from .components.visualizer import Visualizer
Expand Down
154 changes: 35 additions & 119 deletions src/datumaro/components/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,127 +8,39 @@
import os.path as osp
from functools import partial
from inspect import getmodule, isclass
from typing import (
Callable,
Dict,
Generator,
Generic,
Iterable,
Iterator,
List,
Optional,
Sequence,
Set,
Tuple,
Type,
TypeVar,
)
from typing import Callable, List, Optional, Sequence, Set

from datumaro.components.cli_plugin import CliPlugin, plugin_types
from datumaro.components.cli_plugin import plugin_types
from datumaro.components.format_detection import (
DetectedFormat,
FormatDetectionConfidence,
RejectionReason,
detect_dataset_format,
)
from datumaro.components.lazy_plugin import PLUGIN_TYPES, LazyPlugin
from datumaro.util.os_util import import_foreign_module, split_path

T = TypeVar("T")


class Registry(Generic[T]):
def __init__(self):
self._items: Dict[str, T] = {}

def register(self, name: str, value: T) -> T:
self._items[name] = value
return value

def unregister(self, name: str) -> Optional[T]:
return self._items.pop(name, None)

def get(self, key: str):
"""Returns a class or a factory function"""
return self._items[key]

def __getitem__(self, key: str) -> T:
return self.get(key)

def __contains__(self, key) -> bool:
return key in self._items

def __iter__(self) -> Iterator[str]:
return iter(self._items)

def items(self) -> Generator[Tuple[str, T], None, None]:
for key in self:
yield key, self.get(key)


class PluginRegistry(Registry[Type[CliPlugin]]):
def __init__(
self, filter: Callable[[Type[CliPlugin]], bool] = None
): # pylint: disable=redefined-builtin
super().__init__()
self._filter = filter

def get(self, key: str) -> PLUGIN_TYPES:
"""Returns a class or a factory function"""
item = self._items[key]
if issubclass(item, LazyPlugin):
return item.get_plugin_cls()
return item

def batch_register(self, values: Iterable[CliPlugin]):
for v in values:
if self._filter and not self._filter(v):
continue

self.register(v.NAME, v)
from datumaro.components.registry import (
DatasetBaseRegistry,
ExporterRegistry,
GeneratorRegistry,
ImporterRegistry,
LauncherRegistry,
PluginRegistry,
TransformRegistry,
ValidatorRegistry,
)
from datumaro.util.os_util import get_all_file_extensions, import_foreign_module, split_path


class Environment:
_builtin_plugins = None

@classmethod
def _make_filter(cls, accept, decline=None, skip=None):
accept = (accept,) if isclass(accept) else tuple(accept)
skip = {skip} if isclass(skip) else set(skip or [])
skip = tuple(skip | set(accept))
return partial(cls._check_type, accept=accept, decline=decline, skip=skip)

@staticmethod
def _check_type(t, *, accept, decline, skip):
if not issubclass(t, accept) or t in skip or (decline and issubclass(t, decline)):
return False
if getattr(t, "__not_plugin__", None):
return False
return True

def __init__(self, use_lazy_import: bool = True):
from datumaro.components.dataset_base import DatasetBase, SubsetBase
from datumaro.components.exporter import Exporter
from datumaro.components.generator import DatasetGenerator
from datumaro.components.importer import Importer
from datumaro.components.launcher import Launcher
from datumaro.components.transformer import ItemTransform, Transform
from datumaro.components.validator import Validator

_filter = self._make_filter
self._extractors = PluginRegistry(
_filter(
DatasetBase,
decline=Transform,
skip=(SubsetBase, Transform, ItemTransform),
)
)
self._importers = PluginRegistry(_filter(Importer))
self._launchers = PluginRegistry(_filter(Launcher))
self._exporters = PluginRegistry(_filter(Exporter))
self._generators = PluginRegistry(_filter(DatasetGenerator))
self._transforms = PluginRegistry(_filter(Transform, skip=ItemTransform))
self._validators = PluginRegistry(_filter(Validator))
self._extractors = DatasetBaseRegistry()
self._importers = ImporterRegistry()
self._launchers = LauncherRegistry()
self._exporters = ExporterRegistry()
self._generators = GeneratorRegistry()
self._transforms = TransformRegistry()
self._validators = ValidatorRegistry()
self._builtins_initialized = False
self._use_lazy_import = use_lazy_import

Expand All @@ -139,31 +51,31 @@ def _get_plugin_registry(self, name):
return getattr(self, name)

@property
def extractors(self) -> PluginRegistry:
def extractors(self) -> DatasetBaseRegistry:
return self._get_plugin_registry("_extractors")

@property
def importers(self) -> PluginRegistry:
def importers(self) -> ImporterRegistry:
return self._get_plugin_registry("_importers")

@property
def launchers(self) -> PluginRegistry:
def launchers(self) -> LauncherRegistry:
return self._get_plugin_registry("_launchers")

@property
def exporters(self) -> PluginRegistry:
def exporters(self) -> ExporterRegistry:
return self._get_plugin_registry("_exporters")

@property
def generators(self) -> PluginRegistry:
def generators(self) -> GeneratorRegistry:
return self._get_plugin_registry("_generators")

@property
def transforms(self) -> PluginRegistry:
def transforms(self) -> TransformRegistry:
return self._get_plugin_registry("_transforms")

@property
def validators(self) -> PluginRegistry:
def validators(self) -> ValidatorRegistry:
return self._get_plugin_registry("_validators")

@staticmethod
Expand Down Expand Up @@ -320,12 +232,16 @@ def detect_dataset(
ignore_dirs = {"__MSOSX", "__MACOSX"}
all_matched_formats: Set[DetectedFormat] = set()

extensions = get_all_file_extensions(path, ignore_dirs) or [""]

importers = {
(name, importer.get_plugin_cls() if self._use_lazy_import else importer)
for extension in extensions
for name, importer in self.importers.extension_groups.get(extension, [])
}
for _ in range(depth + 1):
detected_formats = detect_dataset_format(
(
(format_name, importer.detect)
for format_name, importer in self.importers.items()
),
((format_name, importer.detect) for format_name, importer in importers),
path,
rejection_callback=rejection_callback,
)
Expand Down
4 changes: 4 additions & 0 deletions src/datumaro/components/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def detect(

return cls.DETECT_CONFIDENCE

@classmethod
def get_file_extensions(cls) -> List[str]:
raise NotImplementedError()

@classmethod
def find_sources(cls, path: str) -> List[Dict]:
raise NotImplementedError()
Expand Down
4 changes: 3 additions & 1 deletion src/datumaro/components/lazy_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from abc import ABC, abstractclassmethod
from importlib import import_module
from importlib.util import find_spec
from typing import List, Optional, Sequence, Type, Union
from typing import Dict, List, Optional, Sequence, Type, Union

from datumaro.components.dataset_base import DatasetBase
from datumaro.components.errors import DatumaroError
Expand Down Expand Up @@ -57,6 +57,7 @@ def get_lazy_plugin(
plugin_name: str,
plugin_type: str,
extra_deps: List[str] = [],
metadata: Dict = {},
) -> Optional[LazyPlugin]:
for extra_dep in extra_deps:
spec = find_spec(extra_dep)
Expand All @@ -68,6 +69,7 @@ def get_lazy_plugin(

class LazyPluginImpl(LazyPlugin, plugin_type_cls):
NAME = plugin_name
METADATA = metadata

@classmethod
def get_plugin_cls(cls) -> PLUGIN_TYPES:
Expand Down
Loading

0 comments on commit c070455

Please sign in to comment.