From 27df73cfca73164664eebd9003a4312f37bf35dd Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 31 Jul 2024 11:10:26 +0900 Subject: [PATCH 01/42] draft implementation --- .../classification/backbones/efficientnet.py | 20 +++++----- src/otx/algo/classification/dino_v2.py | 6 ++- src/otx/algo/classification/efficientnet.py | 22 ++++++---- .../algo/classification/efficientnet_v2.py | 14 +++++-- .../algo/classification/huggingface_model.py | 6 ++- src/otx/algo/classification/mobilenet_v3.py | 22 ++++++---- .../algo/classification/torchvision_model.py | 6 ++- src/otx/algo/classification/vit.py | 22 ++++++---- src/otx/algo/detection/atss.py | 18 ++++++--- src/otx/algo/detection/huggingface_model.py | 6 +-- src/otx/algo/detection/rtdetr.py | 31 +++++++++----- src/otx/algo/detection/rtmdet.py | 22 +++++++--- src/otx/algo/detection/ssd.py | 16 ++++++-- src/otx/algo/detection/yolox.py | 40 +++++++++++++------ src/otx/core/model/base.py | 4 +- src/otx/core/model/classification.py | 23 +++++++---- src/otx/core/model/detection.py | 4 +- src/otx/core/model/segmentation.py | 3 ++ src/otx/core/types/label.py | 2 +- src/otx/recipe/detection/rtdetr_101.yaml | 15 +++---- src/otx/recipe/detection/rtdetr_18.yaml | 15 +++---- src/otx/recipe/detection/rtdetr_50.yaml | 15 +++---- 22 files changed, 212 insertions(+), 120 deletions(-) diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index 9682dda3ce4..e69d0b2320b 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -569,43 +569,43 @@ class OTXEfficientNet(EfficientNet): in_size : tuple of two ints. Spatial size of the expected input image. """ - def __init__(self, version: EFFICIENTNET_VERSION, **kwargs): + def __init__(self, version: EFFICIENTNET_VERSION, in_size: tuple[int, int] | None = None, **kwargs): self.model_name = "efficientnet_" + version if version == "b0": - in_size = (224, 224) + in_size = in_size or (224, 224) depth_factor = 1.0 width_factor = 1.0 elif version == "b1": - in_size = (240, 240) + in_size = in_size or (240, 240) depth_factor = 1.1 width_factor = 1.0 elif version == "b2": - in_size = (260, 260) + in_size = in_size or (260, 260) depth_factor = 1.2 width_factor = 1.1 elif version == "b3": - in_size = (300, 300) + in_size = in_size or (300, 300) depth_factor = 1.4 width_factor = 1.2 elif version == "b4": - in_size = (380, 380) + in_size = in_size or (380, 380) depth_factor = 1.8 width_factor = 1.4 elif version == "b5": - in_size = (456, 456) + in_size = in_size or (456, 456) depth_factor = 2.2 width_factor = 1.6 elif version == "b6": - in_size = (528, 528) + in_size = in_size or (528, 528) depth_factor = 2.6 width_factor = 1.8 elif version == "b7": - in_size = (600, 600) + in_size = in_size or (600, 600) depth_factor = 3.1 width_factor = 2.0 elif version == "b8": - in_size = (672, 672) + in_size = in_size or (672, 672) depth_factor = 3.6 width_factor = 2.2 else: diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py index a24adf76d2a..5afe02e1869 100644 --- a/src/otx/algo/classification/dino_v2.py +++ b/src/otx/algo/classification/dino_v2.py @@ -8,7 +8,7 @@ import logging import os from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, Sequence import torch from torch import Tensor, nn @@ -119,6 +119,7 @@ def __init__( metric: MetricCallable = 
MultiClassClsMetricCallable, torch_compile: bool = False, freeze_backbone: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone @@ -129,6 +130,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -195,7 +197,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index c939e1b1421..e118de1a61d 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -7,7 +7,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import Tensor, nn @@ -60,6 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -69,6 +70,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -87,7 +89,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, @@ -145,7 +147,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -193,7 +195,7 @@ class EfficientNetForMulticlassClsSemiSL(EfficientNetForMulticlassCls): def _build_model(self, num_classes: int) -> nn.Module: return SemiSLClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.image_size[-2:]), neck=GlobalAveragePooling(dim=2), head=OTXSemiSLLinearClsHead( num_classes=num_classes, @@ -276,6 +278,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -285,6 +288,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -303,7 +307,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), 
neck=GlobalAveragePooling(dim=2), head=MultiLabelLinearClsHead( num_classes=num_classes, @@ -358,7 +362,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -404,6 +408,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -413,6 +418,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -437,7 +443,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=HierarchicalLinearClsHead( in_channels=1280, @@ -515,7 +521,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py index 3d6ed09369b..24aaab18dfa 100644 --- a/src/otx/algo/classification/efficientnet_v2.py +++ b/src/otx/algo/classification/efficientnet_v2.py @@ -5,7 +5,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import Tensor, nn @@ -60,6 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -67,6 +68,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -140,7 +142,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -267,6 +269,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -274,6 +277,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -347,7 +351,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( 
task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.image_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -392,6 +396,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -399,6 +404,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -498,7 +504,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index 8f088b668c4..5671160aee0 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import Tensor, nn @@ -61,6 +61,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.model_name = model_name_or_path @@ -70,6 +71,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -110,7 +112,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index b5808a29a00..eb3192c44ac 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -7,7 +7,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, Sequence import torch from torch import Tensor, nn @@ -71,6 +71,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -79,6 +80,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -97,7 +99,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, @@ -152,7 
+154,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -200,7 +202,7 @@ class MobileNetV3ForMulticlassClsSemiSL(MobileNetV3ForMulticlassCls): def _build_model(self, num_classes: int) -> nn.Module: return SemiSLClassifier( - backbone=OTXMobileNetV3(mode=self.mode), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=OTXSemiSLLinearClsHead( num_classes=num_classes, @@ -283,6 +285,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -291,6 +294,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -309,7 +313,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=MultiLabelNonLinearClsHead( num_classes=num_classes, @@ -366,7 +370,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -412,6 +416,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -420,6 +425,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -444,7 +450,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), neck=GlobalAveragePooling(dim=2), head=HierarchicalNonLinearClsHead( in_channels=960, @@ -522,7 +528,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 89aa3e11812..222f24a4f3e 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, Sequence import torch from torch import Tensor, nn @@ -422,6 +422,7 @@ def __init__( 
OTXTaskType.H_LABEL_CLS, ] = OTXTaskType.MULTI_CLASS_CLS, train_type: Literal["supervised", "semi_supervised"] = "supervised", + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone @@ -442,6 +443,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -552,7 +554,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.image_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 86d05b71218..43e39f0af89 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -7,7 +7,7 @@ import types from copy import deepcopy from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Generic +from typing import TYPE_CHECKING, Any, Callable, Generic, Sequence from urllib.parse import urlparse import numpy as np @@ -226,6 +226,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -236,6 +237,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -281,7 +283,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -346,7 +348,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -373,7 +375,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=224) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:]) return SemiSLClassifier( backbone=vit_backbone, neck=None, @@ -463,6 +465,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -474,6 +477,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -518,7 +522,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, 
"layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -582,7 +586,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.image_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -610,6 +614,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -621,6 +626,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -670,7 +676,7 @@ def _build_model(self, head_config: dict) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -757,7 +763,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, 224, 224), + input_size=self.input_shape, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index b6febc9f981..3e1186dc046 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from otx.algo.common.backbones import ResNeXt, build_model_including_pytorchcv from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss @@ -28,6 +28,18 @@ class ATSS(ExplainableOTXDetModel): """OTX Detection model class for ATSS.""" + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 800, 992), + tile_image_size: Sequence[int] = (1, 3, 800, 992), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + self.tile_image_size = tile_image_size + @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" @@ -68,8 +80,6 @@ class MobileNetV2ATSS(ATSS): "https://storage.openvinotoolkit.org/repositories/" "openvino_training_extensions/models/object_detection/v2/mobilenet_v2-atss.pth" ) - image_size = (1, 3, 800, 992) - tile_image_size = (1, 3, 800, 992) mean = (0.0, 0.0, 0.0) std = (255.0, 255.0, 255.0) @@ -140,8 +150,6 @@ class ResNeXt101ATSS(ATSS): "https://storage.openvinotoolkit.org/repositories/" "openvino_training_extensions/models/object_detection/v2/resnext101_atss_070623.pth" ) - image_size = (1, 3, 800, 992) - tile_image_size = (1, 3, 800, 992) mean = (0.0, 0.0, 0.0) std = 
(255.0, 255.0, 255.0) diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index 7fe94226b05..c14537ddb00 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -67,6 +67,7 @@ def __init__( ) -> None: self.model_name = model_name_or_path self.load_from = None + self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) super().__init__( label_info=label_info, @@ -74,8 +75,8 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=(1, 3, *self.image_processor.size.values()), ) - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) def _build_model(self, num_classes: int) -> nn.Module: return AutoModelForObjectDetection.from_pretrained( @@ -148,13 +149,12 @@ def _customize_outputs( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - image_size = (1, 3, *self.image_processor.size.values()) image_mean = (0.0, 0.0, 0.0) image_std = (255.0, 255.0, 255.0) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=image_size, + input_size=self.input_shape, mean=image_mean, # type: ignore[arg-type] std=image_std, # type: ignore[arg-type] resize_mode="standard", diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 1a7fdb6eba9..43fadf4e347 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -7,7 +7,7 @@ import copy import re -from typing import Any +from typing import Any, Sequence import torch from torch import Tensor, nn @@ -28,11 +28,20 @@ class RTDETR(ExplainableOTXDetModel): """RTDETR model.""" - image_size = (1, 3, 640, 640) mean: tuple[float, float, float] = (0.0, 0.0, 0.0) std: tuple[float, float, float] = (255.0, 255.0, 255.0) load_from: str | None = None + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 640, 640), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + def _customize_inputs( self, entity: DetBatchDataEntity, @@ -163,12 +172,12 @@ def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> lis @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_shape is None: + raise ValueError(self.input_shape) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=self.mean, std=self.std, resize_mode="standard", @@ -211,13 +220,13 @@ def _build_model(self, num_classes: int) -> nn.Module: encoder = HybridEncoder( in_channels=[128, 256, 512], expansion=0.5, - eval_spatial_size=self.image_size[2:], + eval_spatial_size=self.input_shape[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, num_decoder_layers=3, feat_channels=[256, 256, 256], - eval_spatial_size=self.image_size[2:], + eval_spatial_size=self.input_shape[2:], ) optimizer_configuration = [ @@ -254,12 +263,12 @@ def _build_model(self, num_classes: int) -> nn.Module: norm_cfg={"type": "FBN", "name": "norm"}, ) encoder = HybridEncoder( - eval_spatial_size=self.image_size[2:], + eval_spatial_size=self.input_shape[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[256, 256, 256], - eval_spatial_size=self.image_size[2:], + 
eval_spatial_size=self.input_shape[2:], num_decoder_layers=6, ) @@ -301,13 +310,13 @@ def _build_model(self, num_classes: int) -> nn.Module: hidden_dim=384, dim_feedforward=2048, in_channels=[512, 1024, 2048], - eval_spatial_size=self.image_size[2:], + eval_spatial_size=self.input_shape[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[384, 384, 384], - eval_spatial_size=self.image_size[2:], + eval_spatial_size=self.input_shape[2:], ) # no bias decay and learning rate correction for the backbone. diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index 75e2e956f55..cc92cddf6f9 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,6 +5,8 @@ from __future__ import annotations +from typing import Sequence + from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss @@ -24,15 +26,27 @@ class RTMDet(ExplainableOTXDetModel): """OTX Detection model class for RTMDet.""" + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 640, 640), + tile_image_size: Sequence[int] = (1, 3, 640, 640), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + self.tile_image_size = tile_image_size + @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_shape is None: + raise ValueError(self.input_shape) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -62,8 +76,6 @@ class RTMDetTiny(RTMDet): """RTMDet Tiny Model.""" load_from = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/rtmdet_tiny.pth" - image_size = (1, 3, 640, 640) - tile_image_size = (1, 3, 640, 640) mean = (103.53, 116.28, 123.675) std = (57.375, 57.12, 58.395) diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index 3b23ded94f0..95a6432c618 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -10,7 +10,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import numpy as np from datumaro.components.annotation import Bbox @@ -42,11 +42,21 @@ class SSD(ExplainableOTXDetModel): "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions" "/models/object_detection/v2/mobilenet_v2-2s_ssd-992x736.pth" ) - image_size = (1, 3, 864, 864) - tile_image_size = (1, 3, 864, 864) mean = (0.0, 0.0, 0.0) std = (255.0, 255.0, 255.0) + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 864, 864), + tile_image_size: Sequence[int] = (1, 3, 864, 864), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + self.tile_image_size = tile_image_size + def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg = { "assigner": MaxIoUAssigner( diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index 38acf96438f..e9b656bb648 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, 
Any, Sequence from otx.algo.common.losses import CrossEntropyLoss, L1Loss from otx.algo.detection.backbones import CSPDarknet @@ -29,6 +29,18 @@ class YOLOX(ExplainableOTXDetModel): """OTX Detection model class for YOLOX.""" + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 640, 640), + tile_image_size: Sequence[int] = (1, 3, 640, 640), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + self.tile_image_size = tile_image_size + def _customize_inputs( self, entity: DetBatchDataEntity, @@ -40,14 +52,14 @@ def _customize_inputs( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_shape is None: + raise ValueError(self.input_shape) swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -112,11 +124,21 @@ class YOLOXTINY(YOLOX): "https://storage.openvinotoolkit.org/repositories/" "openvino_training_extensions/models/object_detection/v2/yolox_tiny_8x8.pth" ) - image_size = (1, 3, 416, 416) - tile_image_size = (1, 3, 416, 416) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + input_shape: Sequence[int] = (1, 3, 416, 416), + tile_image_size: Sequence[int] = (1, 3, 416, 416), + **kwargs + ) -> None: + super().__init__( + input_shape=input_shape, + **kwargs + ) + self.tile_image_size = tile_image_size + def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} test_cfg = { @@ -151,8 +173,6 @@ class YOLOXS(YOLOX): "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/" "yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth" ) - image_size = (1, 3, 640, 640) - tile_image_size = (1, 3, 640, 640) mean = (0.0, 0.0, 0.0) std = (1.0, 1.0, 1.0) @@ -190,8 +210,6 @@ class YOLOXL(YOLOX): "https://download.openmmlab.com/mmdetection/v2.0/yolox/" "yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth" ) - image_size = (1, 3, 640, 640) - tile_image_size = (1, 3, 640, 640) mean = (0.0, 0.0, 0.0) std = (1.0, 1.0, 1.0) @@ -224,8 +242,6 @@ class YOLOXX(YOLOX): "https://download.openmmlab.com/mmdetection/v2.0/yolox/" "yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth" ) - image_size = (1, 3, 640, 640) - tile_image_size = (1, 3, 640, 640) mean = (0.0, 0.0, 0.0) std = (1.0, 1.0, 1.0) diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 8e2a26acf6d..3a6efb87663 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -14,7 +14,7 @@ import warnings from abc import abstractmethod from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple +from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, Sequence import numpy as np import openvino @@ -108,6 +108,7 @@ def __init__( metric: MetricCallable = NullMetricCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), + input_shape: Sequence[int] | None = None, ) -> None: super().__init__() @@ -118,6 +119,7 @@ def __init__( self.optimizer_callable = ensure_callable(optimizer) self.scheduler_callable = 
ensure_callable(scheduler) self.metric_callable = ensure_callable(metric) + self.input_shape = input_shape self.torch_compile = torch_compile self._explain_mode = False diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index c189b9b9e32..5613b657ee0 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import numpy as np import torch @@ -55,6 +55,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, @@ -62,6 +63,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) @property @@ -103,17 +105,18 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) self.config = config self.load_from = config.pop("load_from", None) - self.image_size = (1, 3, 224, 224) super().__init__( label_info=label_info, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -217,7 +220,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=mean, std=std, resize_mode="standard", @@ -247,6 +250,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, @@ -254,6 +258,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) @property @@ -298,17 +303,18 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = lambda num_labels: Accuracy(task="multilabel", num_labels=num_labels), torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) self.config = config self.load_from = config.pop("load_from", None) - self.image_size = (1, 3, 224, 224) super().__init__( label_info=label_info, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -414,7 +420,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=mean, std=std, resize_mode="standard", @@ -436,6 +442,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = 
HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, @@ -443,6 +450,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) @property @@ -498,6 +506,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) @@ -509,13 +518,13 @@ def __init__( self.config = config self.load_from = config.pop("load_from", None) - self.image_size = (1, 3, 224, 224) super().__init__( label_info=label_info, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -621,7 +630,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_shape, mean=mean, std=std, resize_mode="standard", diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 167157324f3..26fa27ece63 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -9,7 +9,7 @@ import types from abc import abstractmethod from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal +from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence import torch from model_api.tilers import DetectionTiler @@ -376,6 +376,7 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), + input_shape: Sequence[int] | None = None, ) -> None: from otx.algo.explain.explain_algo import feature_vector_fn @@ -386,6 +387,7 @@ def __init__( metric=metric, torch_compile=torch_compile, tile_config=tile_config, + input_shape=input_shape, ) self.model.feature_vector_fn = feature_vector_fn self.model.explain_fn = self.get_explain_fn() diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 0e8cbf85e06..a5777960279 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -44,6 +44,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] torch_compile: bool = False, + input_shape: Sequence[int] | None = None, ): """Base semantic segmentation model. @@ -57,6 +58,7 @@ def __init__( Defaults to SegmCallable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. + input_shape (Sequence[int] | None, optional): The input shape of the model. Defaults to None. 
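Taken together, the hunks in this first commit follow a single pattern: every task model gains an optional input shape in its constructor, the shared OTXModel base stores it, and each _exporter property reads self.input_shape where it previously hard-coded a literal such as (1, 3, 224, 224); backbones such as OTXEfficientNet get the matching "in_size = in_size or (default)" fallback. A minimal sketch of that flow follows; the *Sketch names are illustrative stand-ins, not the real OTX hierarchy.

from __future__ import annotations

from collections.abc import Sequence


class BaseModelSketch:
    """Stand-in for OTXModel: only stores the shape."""

    def __init__(self, input_shape: Sequence[int] | None = None) -> None:
        # None means "keep the task-specific default of the subclass"
        self.input_shape = input_shape


class ClsModelSketch(BaseModelSketch):
    """Stand-in for a classification model with a 224x224 default."""

    def __init__(self, input_shape: Sequence[int] = (1, 3, 224, 224)) -> None:
        super().__init__(input_shape=input_shape)

    @property
    def export_input_size(self) -> Sequence[int]:
        # stands in for the input_size argument passed to OTXNativeModelExporter
        return self.input_shape


# overriding the shape at construction time propagates to export
model = ClsModelSketch(input_shape=(1, 3, 448, 448))
assert model.export_input_size == (1, 3, 448, 448)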
""" super().__init__( label_info=label_info, @@ -64,6 +66,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, + input_shape=input_shape, ) @property diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py index 6b4ff83218f..21df8d94555 100644 --- a/src/otx/core/types/label.py +++ b/src/otx/core/types/label.py @@ -7,7 +7,7 @@ import json from dataclasses import asdict, dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence if TYPE_CHECKING: from datumaro import Label, LabelCategories diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml index 8f071d98e89..1ae36dbc26b 100644 --- a/src/otx/recipe/detection/rtdetr_101.yaml +++ b/src/otx/recipe/detection/rtdetr_101.yaml @@ -47,6 +47,9 @@ overrides: warmup_epochs: 7 data: + input_size: + - 640 + - 640 task: DETECTION stack_images: true data_format: coco_instances @@ -65,9 +68,7 @@ overrides: prob: 0.5 - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -85,9 +86,7 @@ overrides: transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -102,9 +101,7 @@ overrides: transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/detection/rtdetr_18.yaml b/src/otx/recipe/detection/rtdetr_18.yaml index 1eca525f793..4e11fa20499 100644 --- a/src/otx/recipe/detection/rtdetr_18.yaml +++ b/src/otx/recipe/detection/rtdetr_18.yaml @@ -46,6 +46,9 @@ overrides: warmup_epochs: 7 data: + input_size: + - 640 + - 640 task: DETECTION stack_images: true data_format: coco_instances @@ -64,9 +67,7 @@ overrides: prob: 0.5 - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -84,9 +85,7 @@ overrides: transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -101,9 +100,7 @@ overrides: transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml index 7254550faaa..9adb14819a7 100644 --- a/src/otx/recipe/detection/rtdetr_50.yaml +++ b/src/otx/recipe/detection/rtdetr_50.yaml @@ -47,6 +47,9 @@ overrides: warmup_epochs: 7 data: + input_size: + - 640 + - 640 task: DETECTION stack_images: true data_format: coco_instances @@ -65,9 +68,7 @@ overrides: prob: 0.5 - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -85,9 +86,7 @@ overrides: transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true @@ -102,9 +101,7 @@ overrides: 
transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: - - 640 - - 640 + scale: $(input_size) keep_ratio: false transform_bbox: true is_numpy_to_tvtensor: true From bb9b66e0848e41a5b304662c10f5d11ee7a912fc Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 31 Jul 2024 14:39:35 +0900 Subject: [PATCH 02/42] draft implementation2 --- src/otx/algo/action_classification/movinet.py | 4 +- src/otx/algo/action_classification/x3d.py | 4 +- src/otx/algo/anomaly/padim.py | 8 +-- src/otx/algo/anomaly/stfpm.py | 6 ++- src/otx/algo/classification/dino_v2.py | 6 +-- src/otx/algo/classification/efficientnet.py | 24 ++++----- .../algo/classification/efficientnet_v2.py | 16 +++--- .../algo/classification/huggingface_model.py | 6 +-- src/otx/algo/classification/mobilenet_v3.py | 26 +++++----- .../algo/classification/torchvision_model.py | 6 +-- src/otx/algo/classification/vit.py | 24 ++++----- src/otx/algo/detection/atss.py | 4 +- src/otx/algo/detection/huggingface_model.py | 6 ++- src/otx/algo/detection/rtdetr.py | 22 ++++---- src/otx/algo/detection/rtmdet.py | 10 ++-- src/otx/algo/detection/ssd.py | 4 +- src/otx/algo/detection/yolox.py | 14 ++--- .../algo/instance_segmentation/maskrcnn.py | 51 +++++++++++++++---- .../algo/instance_segmentation/maskrcnn_tv.py | 21 +++++--- .../algo/instance_segmentation/rtmdet_inst.py | 22 +++++--- src/otx/algo/segmentation/dino_v2_seg.py | 11 +++- .../algo/segmentation/huggingface_model.py | 12 ++--- src/otx/algo/segmentation/litehrnet.py | 13 ++++- src/otx/algo/segmentation/segnext.py | 11 +++- .../algo/visual_prompting/segment_anything.py | 19 ++++--- .../zero_shot_segment_anything.py | 17 ++++--- src/otx/core/model/action_classification.py | 6 +-- src/otx/core/model/anomaly.py | 18 +++---- src/otx/core/model/base.py | 4 +- src/otx/core/model/classification.py | 30 +++++------ src/otx/core/model/detection.py | 4 +- src/otx/core/model/instance_segmentation.py | 6 ++- src/otx/core/model/segmentation.py | 12 +++-- src/otx/core/model/visual_prompting.py | 8 ++- src/otx/core/types/label.py | 2 +- .../recipe/semantic_segmentation/dino_v2.yaml | 7 --- 36 files changed, 278 insertions(+), 186 deletions(-) diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py index 9e6863f90aa..7c5861d2af6 100644 --- a/src/otx/algo/action_classification/movinet.py +++ b/src/otx/algo/action_classification/movinet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from torch import nn @@ -32,6 +32,7 @@ class MoViNet(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int] = (1, 1, 3, 8, 224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -40,6 +41,7 @@ def __init__( self.load_from = "https://github.com/Atze00/MoViNet-pytorch/blob/main/weights/modelA0_statedict_v3?raw=true" super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index dbb6cb0f490..6c26f2deb2f 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -4,7 +4,7 @@ """X3D model implementation.""" from __future__ import annotations 
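The action-classification hunks give MoViNet and X3D a six-element input_size defaulting to (1, 1, 3, 8, 224, 224). The patch does not name the axes; reading them as (batch, clips, channels, frames, height, width) is an assumption that matches 8-frame 224x224 clips, and the *Sketch classes below are illustrative stand-ins rather than the OTX classes.

from __future__ import annotations

from collections.abc import Sequence


class ActionClsModelSketch:
    """Stand-in for OTXActionClsModel."""

    def __init__(self, input_size: Sequence[int]) -> None:
        self.input_size = input_size


class X3DSketch(ActionClsModelSketch):
    def __init__(self, input_size: Sequence[int] = (1, 1, 3, 8, 224, 224)) -> None:
        # assumed layout: (batch, clips, channels, frames, height, width)
        super().__init__(input_size=input_size)
        # an exporter would then read the spatial size as input_size[-2:] == (224, 224)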
-from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from torch import nn @@ -31,6 +31,7 @@ class X3D(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int] = (1, 1, 3, 8, 224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -39,6 +40,7 @@ def __init__( self.load_from = "https://download.openmmlab.com/mmaction/recognition/x3d/facebook/x3d_m_facebook_16x5x1_kinetics400_rgb_20201027-3f42382a.pth" super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py index 201b0230a02..4f5fb0be6a9 100644 --- a/src/otx/algo/anomaly/padim.py +++ b/src/otx/algo/anomaly/padim.py @@ -7,7 +7,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, Sequence from anomalib.models.image import Padim as AnomalibPadim @@ -34,6 +34,7 @@ class Padim(OTXAnomaly, OTXModel, AnomalibPadim): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. + input_size (Sequence[int]): The input shape of the model. """ def __init__( @@ -47,9 +48,10 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, + input_size: Sequence[int] = (256, 256), ) -> None: - OTXAnomaly.__init__(self) - OTXModel.__init__(self, label_info=AnomalyLabelInfo()) + OTXAnomaly.__init__(self, input_size) + OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size) AnomalibPadim.__init__( self, backbone=backbone, diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index 72dd30e8aa3..67963c25444 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ b/src/otx/algo/anomaly/stfpm.py @@ -32,6 +32,7 @@ class Stfpm(OTXAnomaly, OTXModel, AnomalibStfpm): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. + input_size (Sequence[int]): The input shape of the model. 
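Unlike the other tasks, Padim and Stfpm take a two-element input_size such as (256, 256) (height and width only) and initialize their bases explicitly instead of through a single super() chain. A reduced sketch of that initialization order, with stand-ins for the OTX and Anomalib bases:

from __future__ import annotations

from collections.abc import Sequence


class AnomalyMixinSketch:
    """Stand-in for OTXAnomaly."""

    def __init__(self, input_size: Sequence[int]) -> None:
        self.input_size = input_size  # (H, W) only; no batch or channel axes


class ModelBaseSketch:
    """Stand-in for OTXModel."""

    def __init__(self, label_info: str, input_size: Sequence[int]) -> None:
        self.label_info = label_info
        self.input_size = input_size


class PadimSketch(AnomalyMixinSketch, ModelBaseSketch):
    def __init__(self, input_size: Sequence[int] = (256, 256)) -> None:
        # mirrors the patch: each base is initialized by name, not cooperatively
        AnomalyMixinSketch.__init__(self, input_size)
        ModelBaseSketch.__init__(self, label_info="anomaly", input_size=input_size)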
""" def __init__( @@ -43,10 +44,11 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, + input_size: Sequence[int] = (256, 256), **kwargs, ) -> None: - OTXAnomaly.__init__(self) - OTXModel.__init__(self, label_info=AnomalyLabelInfo()) + OTXAnomaly.__init__(self, input_size) + OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size) AnomalibStfpm.__init__( self, backbone=backbone, diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py index 5afe02e1869..f6430e63f8a 100644 --- a/src/otx/algo/classification/dino_v2.py +++ b/src/otx/algo/classification/dino_v2.py @@ -119,7 +119,7 @@ def __init__( metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, freeze_backbone: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone @@ -130,7 +130,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -197,7 +197,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index e118de1a61d..6f41a844673 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -60,7 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -70,7 +70,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, @@ -147,7 +147,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -278,7 +278,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -288,7 +288,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + 
input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -307,7 +307,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=MultiLabelLinearClsHead( num_classes=num_classes, @@ -362,7 +362,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -408,7 +408,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version @@ -418,7 +418,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -443,7 +443,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]), + backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=HierarchicalLinearClsHead( in_channels=1280, @@ -521,7 +521,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py index 24aaab18dfa..acb17d0ecc8 100644 --- a/src/otx/algo/classification/efficientnet_v2.py +++ b/src/otx/algo/classification/efficientnet_v2.py @@ -60,7 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -68,7 +68,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -142,7 +142,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -269,7 +269,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: 
Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -277,7 +277,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -396,7 +396,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -404,7 +404,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -504,7 +504,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index 5671160aee0..56432533dcc 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -61,7 +61,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.model_name = model_name_or_path @@ -71,7 +71,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -112,7 +112,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index eb3192c44ac..570952efb01 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -71,7 +71,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -80,7 +80,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -99,7 +99,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, @@ -154,7 +154,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter 
object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -202,7 +202,7 @@ class MobileNetV3ForMulticlassClsSemiSL(MobileNetV3ForMulticlassCls): def _build_model(self, num_classes: int) -> nn.Module: return SemiSLClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=OTXSemiSLLinearClsHead( num_classes=num_classes, @@ -285,7 +285,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -294,7 +294,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -313,7 +313,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=MultiLabelNonLinearClsHead( num_classes=num_classes, @@ -370,7 +370,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -416,7 +416,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -425,7 +425,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -450,7 +450,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), neck=GlobalAveragePooling(dim=2), head=HierarchicalNonLinearClsHead( in_channels=960, @@ -528,7 +528,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 222f24a4f3e..9b30a6db22a 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -422,7 +422,7 @@ def __init__( OTXTaskType.H_LABEL_CLS, ] = 
OTXTaskType.MULTI_CLASS_CLS, train_type: Literal["supervised", "semi_supervised"] = "supervised", - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone @@ -443,7 +443,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def _create_model(self) -> nn.Module: @@ -554,7 +554,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 43e39f0af89..4de1c8ac984 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -226,7 +226,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -237,7 +237,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -283,7 +283,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -348,7 +348,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -375,7 +375,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:]) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:]) return SemiSLClassifier( backbone=vit_backbone, neck=None, @@ -465,7 +465,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -477,7 +477,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -522,7 +522,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": 
"TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -614,7 +614,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -626,7 +626,7 @@ def __init__( scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, + input_size=input_size, ) def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: @@ -676,7 +676,7 @@ def _build_model(self, head_config: dict) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -763,7 +763,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index 3e1186dc046..76884ab383f 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -30,12 +30,12 @@ class ATSS(ExplainableOTXDetModel): def __init__( self, - input_shape: Sequence[int] = (1, 3, 800, 992), + input_size: Sequence[int] = (1, 3, 800, 992), tile_image_size: Sequence[int] = (1, 3, 800, 992), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index c14537ddb00..393e3d5a96f 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -68,14 +68,16 @@ def __init__( self.model_name = model_name_or_path self.load_from = None self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) + if len(input_size := self.image_processor.size.values()) == 1: + input_size = (*input_size, *input_size) super().__init__( label_info=label_info, + input_size=(1, 3, *input_size), optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=(1, 3, *self.image_processor.size.values()), ) def _build_model(self, num_classes: int) -> nn.Module: @@ -154,7 +156,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=image_mean, # type: ignore[arg-type] std=image_std, # type: ignore[arg-type] resize_mode="standard", diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 43fadf4e347..623d81c611c 100644 --- a/src/otx/algo/detection/rtdetr.py 
+++ b/src/otx/algo/detection/rtdetr.py @@ -34,11 +34,11 @@ class RTDETR(ExplainableOTXDetModel): def __init__( self, - input_shape: Sequence[int] = (1, 3, 640, 640), + input_size: Sequence[int] = (1, 3, 640, 640), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) @@ -172,12 +172,12 @@ def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> lis @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.input_shape is None: - raise ValueError(self.input_shape) + if self.input_size is None: + raise ValueError(self.input_size) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="standard", @@ -220,13 +220,13 @@ def _build_model(self, num_classes: int) -> nn.Module: encoder = HybridEncoder( in_channels=[128, 256, 512], expansion=0.5, - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, num_decoder_layers=3, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], ) optimizer_configuration = [ @@ -263,12 +263,12 @@ def _build_model(self, num_classes: int) -> nn.Module: norm_cfg={"type": "FBN", "name": "norm"}, ) encoder = HybridEncoder( - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], num_decoder_layers=6, ) @@ -310,13 +310,13 @@ def _build_model(self, num_classes: int) -> nn.Module: hidden_dim=384, dim_feedforward=2048, in_channels=[512, 1024, 2048], - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[384, 384, 384], - eval_spatial_size=self.input_shape[2:], + eval_spatial_size=self.input_size[2:], ) # no bias decay and learning rate correction for the backbone. 
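Note on the RT-DETR hunks above: the exporter input and the encoder/decoder eval_spatial_size are now both derived from a single NCHW input_size tuple instead of hard-coded constants. A minimal illustrative sketch of that relationship (plain Python, not part of the patch; the standalone variable names are assumptions for illustration only):

    # input_size follows the (N, C, H, W) layout used throughout this series.
    input_size = (1, 3, 640, 640)       # the RTDETR.__init__ default shown above
    eval_spatial_size = input_size[2:]  # (H, W) slice handed to HybridEncoder and
                                        # RTDETRTransformer as eval_spatial_size
    assert eval_spatial_size == (640, 640)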
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index cc92cddf6f9..b1a87088d80 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -28,12 +28,12 @@ class RTMDet(ExplainableOTXDetModel): def __init__( self, - input_shape: Sequence[int] = (1, 3, 640, 640), + input_size: Sequence[int] = (1, 3, 640, 640), tile_image_size: Sequence[int] = (1, 3, 640, 640), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) self.tile_image_size = tile_image_size @@ -41,12 +41,12 @@ def __init__( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.input_shape is None: - raise ValueError(self.input_shape) + if self.input_size is None: + raise ValueError(self.input_size) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index 95a6432c618..43abeee8c7a 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -47,12 +47,12 @@ class SSD(ExplainableOTXDetModel): def __init__( self, - input_shape: Sequence[int] = (1, 3, 864, 864), + input_size: Sequence[int] = (1, 3, 864, 864), tile_image_size: Sequence[int] = (1, 3, 864, 864), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index e9b656bb648..1796a74899e 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -31,12 +31,12 @@ class YOLOX(ExplainableOTXDetModel): def __init__( self, - input_shape: Sequence[int] = (1, 3, 640, 640), + input_size: Sequence[int] = (1, 3, 640, 640), tile_image_size: Sequence[int] = (1, 3, 640, 640), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) self.tile_image_size = tile_image_size @@ -52,14 +52,14 @@ def _customize_inputs( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.input_shape is None: - raise ValueError(self.input_shape) + if self.input_size is None: + raise ValueError(self.input_size) swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -129,12 +129,12 @@ class YOLOXTINY(YOLOX): def __init__( self, - input_shape: Sequence[int] = (1, 3, 416, 416), + input_size: Sequence[int] = (1, 3, 416, 416), tile_image_size: Sequence[int] = (1, 3, 416, 416), **kwargs ) -> None: super().__init__( - input_shape=input_shape, + input_size=input_size, **kwargs ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index a8910e91bb3..599367f4ef6 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import Any +from typing import Any, Sequence from torchvision.ops import RoIAlign @@ -32,10 +32,10 @@ class 
MaskRCNN(ExplainableOTXInstanceSegModel): @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_size is None: + raise ValueError(self.input_size) - input_size = self.tile_image_size if self.tile_config.enable_tiler else self.image_size + input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, @@ -73,11 +73,21 @@ class MaskRCNNResNet50(MaskRCNN): "https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/" "mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth" ) - image_size = (1, 3, 1024, 1024) - tile_image_size = (1, 3, 512, 512) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + input_size: Sequence[int] = (1, 3, 1024, 1024), + tile_image_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) + self.tile_image_size = tile_image_size + def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { "rpn": { @@ -245,11 +255,21 @@ class MaskRCNNEfficientNet(MaskRCNN): "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/" "models/instance_segmentation/v2/efficientnet_b2b-mask_rcnn-576x576.pth" ) - image_size = (1, 3, 1024, 1024) - tile_image_size = (1, 3, 512, 512) mean = (123.675, 116.28, 103.53) std = (1.0, 1.0, 1.0) + def __init__( + self, + input_size: Sequence[int] = (1, 3, 1024, 1024), + tile_image_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) + self.tile_image_size = tile_image_size + def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { "rpn": { @@ -434,11 +454,22 @@ class MaskRCNNSwinT(MaskRCNN): "mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/" "mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth" ) - image_size = (1, 3, 1344, 1344) - tile_image_size = (1, 3, 512, 512) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + input_size: Sequence[int] = (1, 3, 1344, 1344), + tile_image_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) + self.tile_image_size = tile_image_size + + def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { "rpn": { diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py index d6f5bea1bda..e5afb877998 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py +++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections import OrderedDict -from typing import Any +from typing import Any, Sequence import torch from torch import Tensor, nn @@ -218,10 +218,10 @@ def _customize_outputs( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_size is None: + raise ValueError(self.input_size) - input_size = self.tile_image_size if self.tile_config.enable_tiler else self.image_size + input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size return OTXNativeModelExporter( 
task_level_export_parameters=self._export_parameters, @@ -260,10 +260,19 @@ def forward_for_tracing(self, inputs: Tensor) -> tuple[Tensor, ...]: class TVMaskRCNNR50(TVMaskRCNN): """Torchvision MaskRCNN model with ResNet50 backbone.""" - image_size = (1, 3, 1024, 1024) - tile_image_size = (1, 3, 512, 512) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + input_size: Sequence[int] = (1, 3, 1024, 1024), + tile_image_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) + self.tile_image_size = tile_image_size def _create_model(self) -> nn.Module: """From torchvision tutorial.""" diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 60c6bea25ca..01fe2c10847 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss @@ -31,12 +31,12 @@ class RTMDetInst(ExplainableOTXInstanceSegModel): @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_size is None: + raise ValueError(self.input_size) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -81,11 +81,21 @@ class RTMDetInstTiny(RTMDetInst): "https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/" "rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727-ec670f7e.pth" ) - image_size = (1, 3, 640, 640) - tile_image_size = (1, 3, 640, 640) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + input_size: Sequence[int] = (1, 3, 640, 640), + tile_image_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) + self.tile_image_size = tile_image_size + def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg = { "assigner": DynamicSoftLabelAssigner(topk=13), diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py index d38001ada88..63baccfacc8 100644 --- a/src/otx/algo/segmentation/dino_v2_seg.py +++ b/src/otx/algo/segmentation/dino_v2_seg.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, Sequence from otx.algo.segmentation.backbones import DinoVisionTransformer from otx.algo.segmentation.heads import FCNHead @@ -43,6 +43,15 @@ class DinoV2Seg(BaseSegmModel): class OTXDinoV2Seg(TorchVisionCompatibleModel): """DinoV2Seg Model.""" + def __init__( + self, + input_size: Sequence[int] = (1, 3, 560, 560), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) def _create_model(self) -> nn.Module: # merge configurations with defaults overriding them diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index 14e0111f44c..693c2219c05 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ 
b/src/otx/algo/segmentation/huggingface_model.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import nn @@ -68,15 +68,18 @@ def __init__( ) -> None: self.model_name = model_name_or_path self.load_from = None + self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) + if len(input_size := self.image_processor.size.values()) == 1: + input_size = (*input_size, *input_size) super().__init__( label_info=label_info, + input_size=(1, 3, *input_size), optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, ) - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) def _create_model(self) -> nn.Module: return AutoModelForSemanticSegmentation.from_pretrained( @@ -121,15 +124,12 @@ def _customize_outputs( @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - size = self.image_processor.size.values() - size = (*size, *size) if len(size) == 1 else size - image_size = (1, 3, *size) image_mean = (123.675, 116.28, 103.53) image_std = (58.395, 57.12, 57.375) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=image_size, + input_size=self.input_size, mean=image_mean, std=image_std, resize_mode="standard", diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index b24ea9bd77d..458a0f44ea6 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, Sequence from torch.onnx import OperatorExportTypes @@ -517,6 +517,15 @@ def ignore_scope(self) -> dict[str, str | dict[str, list[str]]]: class OTXLiteHRNet(TorchVisionCompatibleModel): """LiteHRNet Model.""" + def __init__( + self, + input_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) def _create_model(self) -> nn.Module: litehrnet_model_class = LITEHRNET_VARIANTS[self.name_base_model] @@ -560,7 +569,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.scale, resize_mode="standard", diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index c3d2ca86fb3..72287dda2a3 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -4,7 +4,7 @@ """SegNext model implementations.""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, Sequence from otx.algo.segmentation.backbones import MSCAN from otx.algo.segmentation.heads import LightHamHead @@ -107,6 +107,15 @@ class SegNextT(BaseSegmModel): class OTXSegNext(TorchVisionCompatibleModel): """SegNext Model.""" + def __init__( + self, + input_size: Sequence[int] = (1, 3, 512, 512), + **kwargs + ) -> None: + super().__init__( + input_size=input_size, + **kwargs + ) def _create_model(self) -> nn.Module: segnext_model_class = SEGNEXT_VARIANTS[self.name_base_model] diff --git a/src/otx/algo/visual_prompting/segment_anything.py 
b/src/otx/algo/visual_prompting/segment_anything.py index be318a1d2ea..93caca5f109 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging as log -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, Sequence import torch from torch import Tensor, nn @@ -494,6 +494,7 @@ def __init__( self, backbone: Literal["tiny_vit", "vit_b"], label_info: LabelInfoTypes = NullLabelInfo(), + input_size: Sequence[int] = (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -506,8 +507,17 @@ def __init__( return_extra_metrics: bool = False, stability_score_offset: float = 1.0, ) -> None: + super().__init__( + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) self.config = { "backbone": backbone, + "image_size": self.input_size[-1], "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, @@ -517,13 +527,6 @@ def __init__( "stability_score_offset": stability_score_offset, **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone], } - super().__init__( - label_info=label_info, - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - ) def _create_model(self) -> nn.Module: """Create a PyTorch model for this class.""" diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py index dd650486e30..f4f531be978 100644 --- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py +++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py @@ -648,8 +648,18 @@ def __init__( # noqa: PLR0913 return_extra_metrics: bool = False, stability_score_offset: float = 1.0, ) -> None: + super().__init__( + label_info=label_info, + input_size=(1, 3, 1024, 1024), # zero-shot visual prompting model uses fixed 1024x1024 input size + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) + self.config = { "backbone": backbone, + "image_size": self.input_size[-1], "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, @@ -661,13 +671,6 @@ def __init__( # noqa: PLR0913 "stability_score_offset": stability_score_offset, **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone], } - super().__init__( - label_info=label_info, - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - ) self.save_outputs = save_outputs self.reference_info_dir: Path = Path(reference_info_dir) diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py index 08e2553a895..009affae692 100644 --- a/src/otx/core/model/action_classification.py +++ b/src/otx/core/model/action_classification.py @@ -46,7 +46,6 @@ def __init__( metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, ) -> None: - self.image_size = (1, 1, 3, 8, 224, 224) self.mean = (0.0, 0.0, 0.0) self.std = (255.0, 255.0, 255.0) super().__init__( @@ -135,7 +134,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" 
return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="standard", @@ -186,7 +185,6 @@ def __init__( config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) self.config = config self.load_from = config.pop("load_from", None) - self.image_size = (1, 1, 3, 8, 224, 224) super().__init__( label_info=label_info, optimizer=optimizer, @@ -266,7 +264,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=mean, std=std, resize_mode="standard", diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index 44edf869b0b..301fb03191c 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging as log -from typing import TYPE_CHECKING, Any, TypeAlias +from typing import TYPE_CHECKING, Any, Sequence, TypeAlias import torch from anomalib import TaskType as AnomalibTaskType @@ -51,10 +51,10 @@ class OTXAnomaly: """Methods used to make OTX model compatible with the Anomalib model.""" - def __init__(self) -> None: + def __init__(self, input_size: Sequence[int] = (256, 256)) -> None: self.optimizer: list[OptimizerCallable] | OptimizerCallable = None self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None - self._input_size: tuple[int, int] = (256, 256) + self._input_size: tuple[int, int] = input_size self.trainer: Trainer self.model: nn.Module self.image_threshold: BaseThreshold @@ -116,15 +116,13 @@ def _get_values_from_transforms( self, - ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]: - """Get the value requested value from default transforms.""" - image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) + ) -> tuple[tuple[float, float, float], tuple[float, float, float]]: + """Get the requested values from the default transforms.""" + mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) for transform in self.configure_transforms().transforms: # type: ignore[attr-defined] name = transform.__class__.__name__ - if "Resize" in name: - image_size = tuple(transform.size) # type: ignore[assignment] - elif "Normalize" in name: + if "Normalize" in name: mean_value = tuple(value * 255 for value in transform.mean) # type: ignore[assignment] std_value = tuple(value * 255 for value in transform.std) # type: ignore[assignment] - return image_size, mean_value, std_value + return mean_value, std_value @property def trainable_model(self) -> str | None: @@ -300,7 +298,7 @@ def _exporter(self) -> OTXAnomalyModelExporter: """Creates OTXAnomalyModelExporter object that can export anomaly models.""" min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist() max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist() - image_shape, mean_values, scale_values = self._get_values_from_transforms() + mean_values, scale_values = self._get_values_from_transforms() onnx_export_configuration = { "opset_version": 14, "dynamic_axes": {"input": {0: "batch_size"}, "output": {0: "batch_size"}}, "output_names": ["output"], } return OTXAnomalyModelExporter( - image_shape=image_shape, + image_shape=self.input_size, image_threshold=self.image_threshold.value.cpu().numpy().tolist(),
pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(), task=self.task, diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 3a6efb87663..24511ff0585 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -103,23 +103,23 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int] | None = None, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = NullMetricCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - input_shape: Sequence[int] | None = None, ) -> None: super().__init__() self._label_info = self._dispatch_label_info(label_info) + self.input_size = input_size self.classification_layers: dict[str, dict[str, Any]] = {} self.model = self._create_model() self._explain_mode = False self.optimizer_callable = ensure_callable(optimizer) self.scheduler_callable = ensure_callable(scheduler) self.metric_callable = ensure_callable(metric) - self.input_shape = input_shape self.torch_compile = torch_compile self._explain_mode = False diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 5613b657ee0..9e1a150c4e8 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -51,19 +51,19 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) @property @@ -101,22 +101,22 @@ def __init__( self, label_info: LabelInfoTypes, config: DictConfig, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) self.config = config self.load_from = config.pop("load_from", None) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -220,7 +220,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=mean, std=std, resize_mode="standard", @@ -246,19 +246,19 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: 
MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) @property @@ -299,22 +299,22 @@ def __init__( self, label_info: LabelInfoTypes, config: DictConfig, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = lambda num_labels: Accuracy(task="multilabel", num_labels=num_labels), torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) self.config = config self.load_from = config.pop("load_from", None) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -420,7 +420,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=mean, std=std, resize_mode="standard", @@ -438,19 +438,19 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt def __init__( self, label_info: HLabelInfo, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) @property @@ -502,11 +502,11 @@ def __init__( self, label_info: HLabelInfo, config: DictConfig, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ) -> None: config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes) @@ -520,11 +520,11 @@ def __init__( self.load_from = config.pop("load_from", None) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) def _create_model(self) -> nn.Module: @@ -630,7 +630,7 @@ def _exporter(self) -> OTXModelExporter: mean, std = get_mean_std_from_data_processing(self.config) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_shape, + input_size=self.input_size, mean=mean, std=std, resize_mode="standard", diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 26fa27ece63..eca90192a60 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -371,23 +371,23 @@ class ExplainableOTXDetModel(OTXDetectionModel): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], 
optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - input_shape: Sequence[int] | None = None, ) -> None: from otx.algo.explain.explain_algo import feature_vector_fn super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, tile_config=tile_config, - input_shape=input_shape, ) self.model.feature_vector_fn = feature_vector_fn self.model.explain_fn = self.get_explain_fn() diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py index 9a838ceae0b..07e744c3ddb 100644 --- a/src/otx/core/model/instance_segmentation.py +++ b/src/otx/core/model/instance_segmentation.py @@ -8,7 +8,7 @@ import logging as log import types from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal +from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence import numpy as np import torch @@ -53,6 +53,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -61,6 +62,7 @@ def __init__( ) -> None: super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -368,6 +370,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -376,6 +379,7 @@ def __init__( ) -> None: super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index a5777960279..89cdae1f215 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -40,16 +40,17 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] torch_compile: bool = False, - input_shape: Sequence[int] | None = None, ): """Base semantic segmentation model. Args: label_info (LabelInfoTypes): The label information for the segmentation model. + input_size (Sequence[int]): The input shape of the model. optimizer (OptimizerCallable, optional): The optimizer to use for training. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -58,15 +59,14 @@ def __init__( Defaults to SegmCallable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. - input_shape (Sequence[int] | None, optional): The input shape of the model. 
Defaults to None. """ super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_shape=input_shape, ) @property @@ -115,6 +115,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel): def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -129,6 +130,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. + input_size (Sequence[int]): The input shape of the model. optimizer (OptimizerCallable, optional): The optimizer callable for the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -151,13 +153,13 @@ def __init__( self.decode_head_configuration = decode_head_configuration if decode_head_configuration is not None else {} export_image_configuration = export_image_configuration if export_image_configuration is not None else {} self.criterion_configuration = criterion_configuration - self.image_size = tuple(export_image_configuration.get("image_size", (1, 3, 512, 512))) self.mean = export_image_configuration.get("mean", [123.675, 116.28, 103.53]) self.scale = export_image_configuration.get("std", [58.395, 57.12, 57.375]) self.name_base_model = name_base_model super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -198,7 +200,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.scale, resize_mode="standard", diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index 7a4fa917993..749ec5ce0db 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -10,7 +10,7 @@ from collections import defaultdict from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, Sequence import numpy as np import torch @@ -155,6 +155,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro def __init__( self, + input_size: Sequence[int], label_info: LabelInfoTypes = NullLabelInfo(), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -165,6 +166,7 @@ def __init__( log.debug(msg) super().__init__( label_info=NullLabelInfo(), + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -262,6 +264,7 @@ class OTXZeroShotVisualPromptingModel( def __init__( self, + input_size: Sequence[int], label_info: LabelInfoTypes = NullLabelInfo(), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -272,6 +275,7 @@ def __init__( log.debug(msg) super().__init__( label_info=NullLabelInfo(), + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -283,7 +287,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the 
model.""" return OTXVisualPromptingModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, self.model.image_size, self.model.image_size), + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="fit_to_window", diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py index 21df8d94555..6b4ff83218f 100644 --- a/src/otx/core/types/label.py +++ b/src/otx/core/types/label.py @@ -7,7 +7,7 @@ import json from dataclasses import asdict, dataclass -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from datumaro import Label, LabelCategories diff --git a/src/otx/recipe/semantic_segmentation/dino_v2.yaml b/src/otx/recipe/semantic_segmentation/dino_v2.yaml index b62e173e74c..984e858860d 100644 --- a/src/otx/recipe/semantic_segmentation/dino_v2.yaml +++ b/src/otx/recipe/semantic_segmentation/dino_v2.yaml @@ -17,13 +17,6 @@ model: - 0.999 weight_decay: 0.0001 - export_image_configuration: - image_size: - - 1 - - 3 - - 560 - - 560 - scheduler: class_path: torch.optim.lr_scheduler.PolynomialLR init_args: From b6a76854b29a4a3075f2d1a69eef929ba5764386 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 31 Jul 2024 16:38:06 +0900 Subject: [PATCH 03/42] check input size constant value --- .../classification/backbones/efficientnet.py | 20 +++++++++---------- src/otx/algo/classification/efficientnet.py | 6 +++--- .../encoders/sam_image_encoder.py | 4 ++-- .../algo/visual_prompting/segment_anything.py | 2 +- src/otx/core/exporter/visual_prompting.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index e69d0b2320b..9682dda3ce4 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -569,43 +569,43 @@ class OTXEfficientNet(EfficientNet): in_size : tuple of two ints. Spatial size of the expected input image. 
""" - def __init__(self, version: EFFICIENTNET_VERSION, in_size: tuple[int, int] | None = None, **kwargs): + def __init__(self, version: EFFICIENTNET_VERSION, **kwargs): self.model_name = "efficientnet_" + version if version == "b0": - in_size = in_size or (224, 224) + in_size = (224, 224) depth_factor = 1.0 width_factor = 1.0 elif version == "b1": - in_size = in_size or (240, 240) + in_size = (240, 240) depth_factor = 1.1 width_factor = 1.0 elif version == "b2": - in_size = in_size or (260, 260) + in_size = (260, 260) depth_factor = 1.2 width_factor = 1.1 elif version == "b3": - in_size = in_size or (300, 300) + in_size = (300, 300) depth_factor = 1.4 width_factor = 1.2 elif version == "b4": - in_size = in_size or (380, 380) + in_size = (380, 380) depth_factor = 1.8 width_factor = 1.4 elif version == "b5": - in_size = in_size or (456, 456) + in_size = (456, 456) depth_factor = 2.2 width_factor = 1.6 elif version == "b6": - in_size = in_size or (528, 528) + in_size = (528, 528) depth_factor = 2.6 width_factor = 1.8 elif version == "b7": - in_size = in_size or (600, 600) + in_size = (600, 600) depth_factor = 3.1 width_factor = 2.0 elif version == "b8": - in_size = in_size or (672, 672) + in_size = (672, 672) depth_factor = 3.6 width_factor = 2.2 else: diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index 6f41a844673..2371fddd2d1 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -60,18 +60,18 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version super().__init__( label_info=label_info, + input_size=(1, 3, 224, 224), optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, - input_size=input_size, ) + self.input_size = (1, 3, *self.model.backbone.in_size) def _create_model(self) -> nn.Module: # Get classification_layers for class-incr learning @@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), + backbone=OTXEfficientNet(version=self.version, pretrained=True), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, diff --git a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py index 6143fd139c0..3feef21aba5 100644 --- a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py +++ b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py @@ -70,11 +70,11 @@ def __new__(cls, backbone: str, *args, **kwargs): # noqa: ARG003 if backbone.lower() == "tiny_vit": from otx.algo.visual_prompting.backbones.tiny_vit import TinyViT - return TinyViT(**cls.backbone_configs.get(backbone.lower())) # type: ignore[arg-type] + return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[arg-type] elif backbone.lower() in ["vit_b", "vit_l", "vit_h"]: # noqa: RET505 from otx.algo.visual_prompting.backbones.vit import ViT - return ViT(**cls.backbone_configs.get(backbone.lower())) # type: ignore[arg-type] + return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[arg-type] else: error_log = f"{backbone} is not supported for 
SAMImageEncoder. Set among tiny_vit and vit_b." diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index 93caca5f109..e8095410bb8 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -82,7 +82,7 @@ def __init__( self.return_extra_metrics = return_extra_metrics self.stability_score_offset = stability_score_offset - self.image_encoder = SAMImageEncoder(backbone=backbone) + self.image_encoder = SAMImageEncoder(backbone=backbone, img_size=image_size) self.prompt_encoder = SAMPromptEncoder( embed_dim=embed_dim, image_embedding_size=(image_embedding_size, image_embedding_size), diff --git a/src/otx/core/exporter/visual_prompting.py b/src/otx/core/exporter/visual_prompting.py index 38cdf3fcd25..6b3d3970120 100644 --- a/src/otx/core/exporter/visual_prompting.py +++ b/src/otx/core/exporter/visual_prompting.py @@ -175,7 +175,7 @@ def get_onnx_dummy_inputs( model.image_embedding_size, dtype=torch.float32, ), - "point_coords": torch.randint(low=0, high=1024, size=(1, 2, 2), dtype=torch.float32), + "point_coords": torch.randint(low=0, high=self.input_size[-1], size=(1, 2, 2), dtype=torch.float32), "point_labels": torch.randint(low=0, high=4, size=(1, 2), dtype=torch.float32), "mask_input": torch.randn( 1, From 4c0781f580f0a1a6c4f9b68a8a7707ca0266ecd0 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 5 Aug 2024 14:02:26 +0900 Subject: [PATCH 04/42] update model part --- src/otx/algo/anomaly/padim.py | 8 +-- src/otx/algo/anomaly/stfpm.py | 6 +- .../classification/backbones/efficientnet.py | 5 +- src/otx/algo/classification/efficientnet.py | 11 ++-- .../algo/classification/efficientnet_v2.py | 2 +- .../algo/classification/huggingface_model.py | 16 +++++- src/otx/algo/classification/vit.py | 2 +- src/otx/algo/detection/atss.py | 28 ++++++++-- .../base_models/detection_transformer.py | 11 ++-- src/otx/algo/detection/huggingface_model.py | 11 ++-- src/otx/algo/detection/rtdetr.py | 35 +++++++++++- src/otx/algo/detection/rtmdet.py | 30 +++++++++- src/otx/algo/detection/ssd.py | 29 ++++++++-- src/otx/algo/detection/yolox.py | 46 ++++++++++++++-- .../heads/custom_roi_head.py | 1 + .../algo/instance_segmentation/maskrcnn.py | 55 ++++++++++++++++--- .../algo/instance_segmentation/maskrcnn_tv.py | 27 ++++++++- .../algo/instance_segmentation/rtmdet_inst.py | 22 +++++++- src/otx/algo/segmentation/dino_v2_seg.py | 35 +++++++++++- .../algo/segmentation/huggingface_model.py | 27 +++++++-- src/otx/algo/segmentation/litehrnet.py | 31 ++++++++++- src/otx/algo/segmentation/segnext.py | 31 ++++++++++- .../visual_prompting/backbones/tiny_vit.py | 6 +- .../algo/visual_prompting/segment_anything.py | 28 ++++++---- src/otx/cli/cli.py | 7 +++ src/otx/core/data/module.py | 3 +- src/otx/core/model/action_classification.py | 4 +- src/otx/core/model/anomaly.py | 18 +++--- src/otx/core/model/segmentation.py | 2 +- src/otx/core/model/visual_prompting.py | 4 +- src/otx/engine/engine.py | 12 +++- src/otx/engine/utils/auto_configurator.py | 9 ++- tests/unit/algo/detection/test_rtmdet.py | 2 +- tests/unit/algo/detection/test_yolox.py | 4 +- .../algo/segmentation/test_dino_v2_seg.py | 2 +- .../test_zero_shot_segment_anything.py | 2 +- 36 files changed, 463 insertions(+), 109 deletions(-) diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py index 4f5fb0be6a9..201b0230a02 100644 --- a/src/otx/algo/anomaly/padim.py +++ b/src/otx/algo/anomaly/padim.py @@ -7,7 
+7,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Sequence +from typing import TYPE_CHECKING, Literal from anomalib.models.image import Padim as AnomalibPadim @@ -34,7 +34,6 @@ class Padim(OTXAnomaly, OTXModel, AnomalibPadim): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. - input_size (Sequence[int]): The input shape of the model. """ def __init__( @@ -48,10 +47,9 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, - input_size: Sequence[int] = (256, 256), ) -> None: - OTXAnomaly.__init__(self, input_size) - OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size) + OTXAnomaly.__init__(self) + OTXModel.__init__(self, label_info=AnomalyLabelInfo()) AnomalibPadim.__init__( self, backbone=backbone, diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index 67963c25444..72dd30e8aa3 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ b/src/otx/algo/anomaly/stfpm.py @@ -32,7 +32,6 @@ class Stfpm(OTXAnomaly, OTXModel, AnomalibStfpm): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. - input_size (Sequence[int]): The input shape of the model. """ def __init__( @@ -44,11 +43,10 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, - input_size: Sequence[int] = (256, 256), **kwargs, ) -> None: - OTXAnomaly.__init__(self, input_size) - OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size) + OTXAnomaly.__init__(self) + OTXModel.__init__(self, label_info=AnomalyLabelInfo()) AnomalibStfpm.__init__( self, backbone=backbone, diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index 9682dda3ce4..9931e2dca95 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -569,7 +569,7 @@ class OTXEfficientNet(EfficientNet): in_size : tuple of two ints. Spatial size of the expected input image. 
""" - def __init__(self, version: EFFICIENTNET_VERSION, **kwargs): + def __init__(self, version: EFFICIENTNET_VERSION, input_size: tuple[int, int] | None = None, **kwargs): self.model_name = "efficientnet_" + version if version == "b0": @@ -612,6 +612,9 @@ def __init__(self, version: EFFICIENTNET_VERSION, **kwargs): msg = f"Unsupported EfficientNet version {version}" raise ValueError(msg) + if input_size is not None: + in_size = input_size + init_block_channels = 32 layers = [1, 2, 2, 3, 3, 4, 1] downsample = [1, 1, 1, 1, 0, 1, 0] diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index 2371fddd2d1..6e34ee22dab 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -60,12 +60,13 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, + input_size: Sequence[int] = (1, 3, 224, 224), ) -> None: self.version = version super().__init__( label_info=label_info, - input_size=(1, 3, 224, 224), + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -89,7 +90,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True), + backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True), neck=GlobalAveragePooling(dim=2), head=LinearClsHead( num_classes=num_classes, @@ -195,7 +196,7 @@ class EfficientNetForMulticlassClsSemiSL(EfficientNetForMulticlassCls): def _build_model(self, num_classes: int) -> nn.Module: return SemiSLClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.image_size[-2:]), + backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True), neck=GlobalAveragePooling(dim=2), head=OTXSemiSLLinearClsHead( num_classes=num_classes, @@ -307,7 +308,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), + backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True), neck=GlobalAveragePooling(dim=2), head=MultiLabelLinearClsHead( num_classes=num_classes, @@ -443,7 +444,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]), + backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True), neck=GlobalAveragePooling(dim=2), head=HierarchicalLinearClsHead( in_channels=1280, diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py index acb17d0ecc8..17397d964bc 100644 --- a/src/otx/algo/classification/efficientnet_v2.py +++ b/src/otx/algo/classification/efficientnet_v2.py @@ -351,7 +351,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git 
a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index 56432533dcc..d47bceffea9 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -5,11 +5,13 @@ from __future__ import annotations +import logging from typing import TYPE_CHECKING, Any, Sequence import torch from torch import Tensor, nn from transformers import AutoModelForImageClassification +from transformers.configuration_utils import PretrainedConfig from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.classification import ( @@ -31,6 +33,9 @@ from otx.core.metrics import MetricCallable +DEFAULT_INPUT_SIZE = (1, 3, 224, 224) +logger = logging.getLogger(__name__) + class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel): """HuggingFaceModelForMulticlassCls is a class that represents a Hugging Face model for multiclass classification. @@ -61,7 +66,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: Sequence[int] = DEFAULT_INPUT_SIZE, ) -> None: self.model_name = model_name_or_path @@ -75,10 +80,19 @@ def __init__( ) def _create_model(self) -> nn.Module: + model_config, _ = PretrainedConfig.get_config_dict(self.model_name) + kwargs = {} + if "image_size" in model_config: + kwargs["image_size"] = self.input_size[-1] + elif self.input_size != DEFAULT_INPUT_SIZE: + msg = "There is no 'image_size' argument in the model configuration. There may be unexpected results." + logger.warning(msg) + return AutoModelForImageClassification.from_pretrained( pretrained_model_name_or_path=self.model_name, num_labels=self.label_info.num_classes, ignore_mismatched_sizes=True, + **kwargs, ) def _customize_inputs(self, inputs: MulticlassClsBatchDataEntity) -> dict[str, Any]: diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 4de1c8ac984..4a93e169ffc 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -586,7 +586,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index 76884ab383f..65caaab8b0b 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -20,9 +20,17 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.detection import ExplainableOTXDetModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.config.data import TileConfig if TYPE_CHECKING: from typing_extensions import Self + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class ATSS(ExplainableOTXDetModel): """OTX Detection model class for ATSS.""" def __init__( self, + label_info: 
LabelInfoTypes, input_size: Sequence[int] = (1, 3, 800, 992), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 800, 992), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_size is None: + raise ValueError(self.input_size) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="standard", diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py index a361b278561..0f35e94f3b2 100644 --- a/src/otx/algo/detection/base_models/detection_transformer.py +++ b/src/otx/algo/detection/base_models/detection_transformer.py @@ -45,17 +45,18 @@ def __init__( optimizer_configuration: list[dict] | None = None, multi_scale: list[int] | None = None, num_top_queries: int = 300, + input_size: int = 640, ) -> None: """DETR model implementation.""" super().__init__() self.backbone = backbone self.decoder = decoder self.encoder = encoder - self.multi_scale = ( - multi_scale - if multi_scale is not None - else [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] - ) + if multi_scale is not None: + self.multi_scale = multi_scale + else: + self.multi_scale = [input_size + i * 32 for i in range(-5, 6)] + [input_size] * 2 + self.num_classes = num_classes self.num_top_queries = num_top_queries self.criterion = ( diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index 393e3d5a96f..db4bfda1980 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -5,12 +5,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import nn from torchvision import tv_tensors from transformers import AutoImageProcessor, AutoModelForObjectDetection +from transformers.configuration_utils import PretrainedConfig +# from transformers.image_processing_base import ImageProcessingMixin from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity @@ -60,6 +62,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=object-detection label_info: LabelInfoTypes, + input_size: Sequence[int] = (1, 3, 800, 992), # detection default input size optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -67,18 +70,16 @@ def __init__( ) -> None: self.model_name = model_name_or_path self.load_from = None - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) - if 
len(input_size := self.image_processor.size.values()) == 1: - input_size = (*input_size, *input_size) super().__init__( label_info=label_info, - input_size=(1, 3, *input_size), + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, ) + self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) def _build_model(self, num_classes: int) -> nn.Module: return AutoModelForObjectDetection.from_pretrained( diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 623d81c611c..d9c486fe7dc 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -7,7 +7,7 @@ import copy import re -from typing import Any, Sequence +from typing import TYPE_CHECKING, Any, Sequence import torch from torch import Tensor, nn @@ -23,6 +23,16 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.detection import ExplainableOTXDetModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.config.data import TileConfig + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class RTDETR(ExplainableOTXDetModel): @@ -34,13 +44,29 @@ class RTDETR(ExplainableOTXDetModel): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 640, 640), - **kwargs + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), + tile_image_size: Sequence[int] = (1, 3, 640, 640), ) -> None: + if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: + msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
+ raise ValueError(msg) + super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) + self.tile_image_size = tile_image_size def _customize_inputs( self, @@ -244,6 +270,7 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, + input_size=self.input_size[-1], ) @@ -287,6 +314,7 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, + input_size=self.input_size[-1], ) @@ -336,4 +364,5 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, + input_size=self.input_size[-1], ) diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index b1a87088d80..74e1de044b3 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import Sequence +from typing import TYPE_CHECKING, Sequence from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss @@ -21,6 +21,16 @@ from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.detection import ExplainableOTXDetModel from otx.core.types.export import TaskLevelExportParameters +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.config.data import TileConfig + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class RTMDet(ExplainableOTXDetModel): @@ -28,13 +38,27 @@ class RTMDet(ExplainableOTXDetModel): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 640, 640), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 640, 640), - **kwargs ) -> None: + if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: + msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
+ raise ValueError(msg) + super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index 43abeee8c7a..10a7ed5a23f 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -25,11 +25,18 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.detection import ExplainableOTXDetModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.config.data import TileConfig if TYPE_CHECKING: import torch + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from otx.core.data.dataset.base import OTXDataset + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable logger = logging.getLogger() @@ -47,13 +54,23 @@ class SSD(ExplainableOTXDetModel): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 864, 864), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 864, 864), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size @@ -153,7 +170,7 @@ def _get_new_anchors(self, dataset: OTXDataset, anchor_generator: SSDAnchorGener if isinstance(transform, Resize): target_wh = transform.scale if target_wh is None: - target_wh = (864, 864) + target_wh = self.input_size[-2:] msg = f"Cannot get target_wh from the dataset. 
Assign it with the default value: {target_wh}" logger.warning(msg) group_as = [len(width) for width in anchor_generator.widths] @@ -276,11 +293,11 @@ def load_state_dict_pre_hook(self, state_dict: dict[str, torch.Tensor], prefix: @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - raise ValueError(self.image_size) + if self.input_size is None: + raise ValueError(self.input_size) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="standard", diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index 1796a74899e..b0ff4248cbd 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -21,9 +21,17 @@ from otx.core.model.detection import ExplainableOTXDetModel from otx.core.types.export import OTXExportFormatType from otx.core.types.precision import OTXPrecisionType +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.config.data import TileConfig if TYPE_CHECKING: from pathlib import Path + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class YOLOX(ExplainableOTXDetModel): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 640, 640), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 640, 640), - **kwargs ) -> None: + if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: + msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." + raise ValueError(msg) + super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size @@ -129,13 +151,27 @@ class YOLOXTINY(YOLOX): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 416, 416), - tile_image_size: Sequence[int] = (1, 3, 416, 416), - **kwargs + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), + tile_image_size: Sequence[int] = (1, 3, 416, 416), ) -> None: + if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: + msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
+ raise ValueError(msg) + super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py index 360027b1376..4536956b873 100644 --- a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py +++ b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py @@ -548,6 +548,7 @@ def bbox_loss(self, x: tuple[Tensor], sampling_results: list[SamplingResult], ba class CustomConvFCBBoxHead(Shared2FCBBoxHead, ClassIncrementalMixin): """CustomConvFCBBoxHead class for OTX.""" + # checked def loss_and_target( self, diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 599367f4ef6..6cdbed940c8 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import TYPE_CHECKING, Any, Sequence from torchvision.ops import RoIAlign @@ -24,6 +24,16 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.config.data import TileConfig +from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class MaskRCNN(ExplainableOTXInstanceSegModel): @@ -78,13 +88,23 @@ class MaskRCNNResNet50(MaskRCNN): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 1024, 1024), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 512, 512), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size @@ -260,13 +280,23 @@ class MaskRCNNEfficientNet(MaskRCNN): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 1024, 1024), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 512, 512), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size @@ -459,17 +489,26 @@ class MaskRCNNSwinT(MaskRCNN): def __init__( self, + 
label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 1344, 1344), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 512, 512), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size - def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { "rpn": { diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py index e5afb877998..dc24fc9933b 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py +++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections import OrderedDict -from typing import Any, Sequence +from typing import Any, Sequence, TYPE_CHECKING import torch from torch import Tensor, nn @@ -30,6 +30,16 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.config.data import TileConfig +from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class _TVMaskRCNN(MaskRCNN): @@ -262,15 +272,26 @@ class TVMaskRCNNR50(TVMaskRCNN): mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 1024, 1024), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 512, 512), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 01fe2c10847..682ff46e23b 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -20,9 +20,17 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.config.data import TileConfig +from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable if TYPE_CHECKING: from torch import Tensor + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + 
+ from otx.core.types.label import LabelInfoTypes + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.metrics import MetricCallable class RTMDetInst(ExplainableOTXInstanceSegModel): @@ -86,13 +94,23 @@ class RTMDetInstTiny(RTMDetInst): def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 640, 640), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), tile_image_size: Sequence[int] = (1, 3, 512, 512), - **kwargs ) -> None: super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, ) self.tile_image_size = tile_image_size diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py index 63baccfacc8..c4be747d4e0 100644 --- a/src/otx/algo/segmentation/dino_v2_seg.py +++ b/src/otx/algo/segmentation/dino_v2_seg.py @@ -10,12 +10,19 @@ from otx.algo.segmentation.backbones import DinoVisionTransformer from otx.algo.segmentation.heads import FCNHead from otx.core.model.segmentation import TorchVisionCompatibleModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.dice import SegmCallable from .base_model import BaseSegmModel if TYPE_CHECKING: from torch import nn from typing_extensions import Self + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + from otx.core.metrics import MetricCallable class DinoV2Seg(BaseSegmModel): @@ -45,12 +52,34 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel): """DinoV2Seg Model.""" def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 560, 560), - **kwargs - ) -> None: + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = SegmCallable, # type: ignore[assignment] + torch_compile: bool = False, + backbone_configuration: dict[str, Any] | None = None, + decode_head_configuration: dict[str, Any] | None = None, + criterion_configuration: list[dict[str, Any]] | None = None, + export_image_configuration: dict[str, Any] | None = None, + name_base_model: str = "semantic_segmentation_model", + ): + if input_size[-1] % 14 != 0 or input_size[-2] % 14 != 0: + msg = f"Input size should be a multiple of 14, but got {input_size[-2:]} instead." 
+ raise ValueError(msg) + super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + backbone_configuration=backbone_configuration, + decode_head_configuration=decode_head_configuration, + criterion_configuration=criterion_configuration, + export_image_configuration=export_image_configuration, + name_base_model=name_base_model, ) def _create_model(self) -> nn.Module: diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index 693c2219c05..feee127b0f8 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -5,6 +5,7 @@ from __future__ import annotations +import logging from typing import TYPE_CHECKING, Any, Sequence import torch @@ -13,6 +14,7 @@ AutoImageProcessor, AutoModelForSemanticSegmentation, ) +from transformers.configuration_utils import PretrainedConfig from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.segmentation import SegBatchDataEntity, SegBatchPredEntity @@ -30,6 +32,8 @@ from otx.core.metrics import MetricCallable +logger = logging.getLogger(__name__) + class HuggingFaceModelForSegmentation(OTXSegmentationModel): """A class representing a Hugging Face model for segmentation. @@ -61,6 +65,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=image-segmentation label_info: LabelInfoTypes, + input_size: Sequence[int] = (1, 3, 512, 512), # semantic segmentation default input size optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -68,24 +73,38 @@ def __init__( ) -> None: self.model_name = model_name_or_path self.load_from = None - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) - if len(input_size := self.image_processor.size.values()) == 1: - input_size = (*input_size, *input_size) super().__init__( label_info=label_info, - input_size=(1, 3, *input_size), + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, ) + self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) def _create_model(self) -> nn.Module: + model_config, _ = PretrainedConfig.get_config_dict(self.model_name) + kwargs = {} + if "image_size" in model_config: + kwargs["image_size"] = self.input_size[-1] + + if (patch_size := model_config.get("patch_sizes")) is not None: + if isinstance(patch_size, (list, tuple)): + patch_size = patch_size[0] + if self.input_size[-2] % patch_size != 0 or self.input_size[-1] % patch_size != 0: + msg = ( + f"It's recommended to set the input size to a multiple of the patch size ({patch_size}). " + "Otherwise, the score can drop or the model may not work." 
+ ) + logger.warning(msg) + return AutoModelForSemanticSegmentation.from_pretrained( pretrained_model_name_or_path=self.model_name, num_labels=self.label_info.num_classes, ignore_mismatched_sizes=True, + **kwargs, ) def _customize_inputs(self, entity: SegBatchDataEntity) -> dict[str, Any]: diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 458a0f44ea6..31bf7ae33a2 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -15,11 +15,18 @@ from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.segmentation import TorchVisionCompatibleModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.dice import SegmCallable from .base_model import BaseSegmModel if TYPE_CHECKING: from torch import nn + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + from otx.core.metrics import MetricCallable class LiteHRNetS(BaseSegmModel): @@ -519,12 +526,30 @@ class OTXLiteHRNet(TorchVisionCompatibleModel): """LiteHRNet Model.""" def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 512, 512), - **kwargs - ) -> None: + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = SegmCallable, # type: ignore[assignment] + torch_compile: bool = False, + backbone_configuration: dict[str, Any] | None = None, + decode_head_configuration: dict[str, Any] | None = None, + criterion_configuration: list[dict[str, Any]] | None = None, + export_image_configuration: dict[str, Any] | None = None, + name_base_model: str = "semantic_segmentation_model", + ): super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + backbone_configuration=backbone_configuration, + decode_head_configuration=decode_head_configuration, + criterion_configuration=criterion_configuration, + export_image_configuration=export_image_configuration, + name_base_model=name_base_model, ) def _create_model(self) -> nn.Module: diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index 72287dda2a3..c18b1cc10c1 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -10,11 +10,18 @@ from otx.algo.segmentation.heads import LightHamHead from otx.algo.utils.support_otx_v1 import OTXv1Helper from otx.core.model.segmentation import TorchVisionCompatibleModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.metrics.dice import SegmCallable from .base_model import BaseSegmModel if TYPE_CHECKING: from torch import nn + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + from otx.core.metrics import MetricCallable class SegNextB(BaseSegmModel): @@ -109,12 +116,30 @@ class OTXSegNext(TorchVisionCompatibleModel): """SegNext Model.""" def __init__( self, + label_info: LabelInfoTypes, input_size: Sequence[int] = (1, 3, 512, 512), - **kwargs - ) -> None: + optimizer: OptimizerCallable = DefaultOptimizerCallable, + 
scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = SegmCallable, # type: ignore[assignment] + torch_compile: bool = False, + backbone_configuration: dict[str, Any] | None = None, + decode_head_configuration: dict[str, Any] | None = None, + criterion_configuration: list[dict[str, Any]] | None = None, + export_image_configuration: dict[str, Any] | None = None, + name_base_model: str = "semantic_segmentation_model", + ): super().__init__( + label_info=label_info, input_size=input_size, - **kwargs + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + backbone_configuration=backbone_configuration, + decode_head_configuration=decode_head_configuration, + criterion_configuration=criterion_configuration, + export_image_configuration=export_image_configuration, + name_base_model=name_base_model, ) def _create_model(self) -> nn.Module: diff --git a/src/otx/algo/visual_prompting/backbones/tiny_vit.py b/src/otx/algo/visual_prompting/backbones/tiny_vit.py index e3b7714683c..ac7f5f824eb 100644 --- a/src/otx/algo/visual_prompting/backbones/tiny_vit.py +++ b/src/otx/algo/visual_prompting/backbones/tiny_vit.py @@ -362,7 +362,9 @@ def forward(self, x: Tensor) -> Tensor: """Forward.""" h, w = self.input_resolution b, l, c = x.shape # noqa: E741 - assert h * w == l, "input feature has wrong size" # noqa: S101 + if h * w != l: + msg = f"Input feature has wrong size. Expected that h({h}) * w({w}) == l({l})." + raise ValueError(msg) res_x = x if self.window_size == h and self.window_size == w: x = self.attn(x) @@ -634,6 +636,6 @@ def forward(self, x: Tensor) -> Tensor: layer = self.layers[i] x = layer(x) batch, _, channel = x.size() - x = x.view(batch, 64, 64, channel) + x = x.view(batch, self.img_size // 16, self.img_size // 16, channel) x = x.permute(0, 3, 1, 2) return self.neck(x) diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index e8095410bb8..fc4579de5d4 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -139,7 +139,7 @@ def load_checkpoint( if key in state_dict: state_dict.pop(key) self.load_state_dict(state_dict) - except ValueError as e: + except (ValueError, RuntimeError) as e: log.info( f"{e}: {load_from} is not desirable format for torch.hub.load_state_dict_from_url. " f"To manually load {load_from}, try to set it to trainer.checkpoint.", ) @@ -507,17 +507,17 @@ def __init__( return_extra_metrics: bool = False, stability_score_offset: float = 1.0, ) -> None: - super().__init__( - label_info=label_info, - input_size=input_size, - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - ) + if input_size[-1] != input_size[-2]: + msg = f"SAM should use a square image, but got {input_size}" + raise ValueError(msg) + if input_size[-1] % 16 != 0 or input_size[-2] % 16 != 0: + msg = f"Input size should be a multiple of 16, but got {input_size[-2:]} instead." 
+ raise ValueError(msg) + self.config = { "backbone": backbone, - "image_size": self.input_size[-1], + "image_size": input_size[-1], + "image_embedding_size": input_size[-1] // 16, "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, @@ -527,6 +527,14 @@ "stability_score_offset": stability_score_offset, **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone], } + super().__init__( + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) def _create_model(self) -> nn.Module: """Create a PyTorch model for this class.""" diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 983c2fe927b..e0e2813bdab 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -330,6 +330,13 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # For num_classes update, Model and Metric are instantiated separately. model_config = self.config[self.subcommand].pop("model") + input_size = self.config["train"]["engine"].get("input_size") + if input_size is not None: + if isinstance(input_size, int): + input_size = (input_size, input_size) + self.config["train"]["data"]["input_size"] = input_size + model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size + # Instantiate the things that don't need to special handling self.config_init = self.parser.instantiate_classes(self.config) self.workspace = self.get_config_value(self.config_init, "workspace") diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index 7d1d56ba658..a1a5cdced8a 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -96,7 +96,7 @@ def __init__( self.device = device self.subsets: dict[str, OTXDataset] = {} - self.save_hyperparameters() + self.save_hyperparameters(ignore=["input_size"]) # TODO (Jaeguk): This is workaround for a bug in Datumaro. # These lines should be removed after next datumaro release. 
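[Editor's note] The cli.py hunk above splices a user-supplied spatial size into the model's 4-D (N, C, H, W) input shape by keeping the leading (N, C) dims and replacing the trailing (H, W). Below is a minimal, self-contained sketch of that normalization; the helper name normalize_input_size is hypothetical and only illustrates the tuple-splicing pattern used in the hunk.

    from typing import Sequence, Union

    def normalize_input_size(
        model_input_size: Sequence[int],
        input_size: Union[int, Sequence[int]],
    ) -> tuple:
        """Replace the trailing (H, W) of an (N, C, H, W) shape with a user override."""
        if isinstance(input_size, int):  # a bare int means a square H x W
            input_size = (input_size, input_size)
        return tuple(model_input_size[:-2]) + tuple(input_size)

    assert normalize_input_size((1, 3, 800, 992), 640) == (1, 3, 640, 640)
    assert normalize_input_size((1, 3, 640, 640), (736, 992)) == (1, 3, 736, 992)

The save_hyperparameters(ignore=["input_size"]) change follows the same theme: input_size is threaded through the data module explicitly (see the __reduce__ hunk that follows) instead of being captured by Lightning's hyperparameter logging.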
@@ -454,5 +454,6 @@ def __reduce__(self): self.unannotated_items_ratio, self.auto_num_workers, self.device, + self.input_size, ), ) diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py index 009affae692..583a8f2c99e 100644 --- a/src/otx/core/model/action_classification.py +++ b/src/otx/core/model/action_classification.py @@ -7,7 +7,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence import numpy as np import torch @@ -41,6 +41,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt def __init__( self, label_info: LabelInfoTypes, + input_size: Sequence[int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -50,6 +51,7 @@ def __init__( self.std = (255.0, 255.0, 255.0) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index 301fb03191c..44edf869b0b 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging as log -from typing import TYPE_CHECKING, Any, TypeAlias, Sequence +from typing import TYPE_CHECKING, Any, TypeAlias import torch from anomalib import TaskType as AnomalibTaskType @@ -51,10 +51,10 @@ class OTXAnomaly: """Methods used to make OTX model compatible with the Anomalib model.""" - def __init__(self, input_size: Sequence[int] = (256, 256)) -> None: + def __init__(self) -> None: self.optimizer: list[OptimizerCallable] | OptimizerCallable = None self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None - self._input_size: tuple[int, int] = input_size + self._input_size: tuple[int, int] = (256, 256) self.trainer: Trainer self.model: nn.Module self.image_threshold: BaseThreshold @@ -116,13 +116,15 @@ def _get_values_from_transforms( self, ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]: """Get the value requested value from default transforms.""" - mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) + image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) for transform in self.configure_transforms().transforms: # type: ignore[attr-defined] name = transform.__class__.__name__ - if "Normalize" in name: + if "Resize" in name: + image_size = tuple(transform.size) # type: ignore[assignment] + elif "Normalize" in name: mean_value = tuple(value * 255 for value in transform.mean) # type: ignore[assignment] std_value = tuple(value * 255 for value in transform.std) # type: ignore[assignment] - return mean_value, std_value + return image_size, mean_value, std_value @property def trainable_model(self) -> str | None: @@ -298,7 +300,7 @@ def _exporter(self) -> OTXAnomalyModelExporter: """Creates OTXAnomalyModelExporter object that can export anomaly models.""" min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist() max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist() - mean_values, scale_values = self._get_values_from_transforms() + image_shape, mean_values, scale_values = self._get_values_from_transforms() onnx_export_configuration = { "opset_version": 14, "dynamic_axes": {"input": {0: "batch_size"}, 
"output": {0: "batch_size"}}, @@ -306,7 +308,7 @@ def _exporter(self) -> OTXAnomalyModelExporter: "output_names": ["output"], } return OTXAnomalyModelExporter( - image_shape=self.input_size, + image_shape=image_shape, image_threshold=self.image_threshold.value.cpu().numpy().tolist(), pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(), task=self.task, diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 89cdae1f215..60b467b82dd 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -145,7 +145,7 @@ def __init__( criterion_configuration (list[dict[str, Any]] | None, optional): The configuration for the criterion of the model. Defaults to None. export_image_configuration (dict[str, Any] | None, optional): - The configuration for the export of the model like mean, scale and image_size. Defaults to None. + The configuration for the export of the model like mean and scale. Defaults to None. name_base_model (str, optional): The name of the base model used for trainig. Defaults to "semantic_segmentation_model". """ diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index 749ec5ce0db..789e2a679ab 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -155,8 +155,8 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro def __init__( self, - input_size: Sequence[int], label_info: LabelInfoTypes = NullLabelInfo(), + input_size: Sequence[int] = (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -178,7 +178,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXVisualPromptingModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, self.model.image_size, self.model.image_size), + input_size=self.input_size, mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="fit_to_window", diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index f9e233359ed..b92b4df69b4 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -10,7 +10,7 @@ import tempfile from contextlib import contextmanager from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal, Sequence from warnings import warn import torch @@ -119,6 +119,7 @@ def __init__( model: OTXModel | str | None = None, checkpoint: PathLike | None = None, device: DeviceType = DeviceType.auto, + input_size: Sequence[int] | int | None = None, **kwargs, ): """Initializes the OTX Engine. @@ -141,8 +142,17 @@ def __init__( data_root=data_root, task=datamodule.task if datamodule is not None else task, model_name=None if isinstance(model, OTXModel) else model, + input_size=input_size, ) + if input_size is not None: + if isinstance(datamodule, OTXDataModule) and datamodule.input_size != input_size: + msg = "Data module is already initialized. Input size will be ignored to data module." + logging.warning(msg) + if isinstance(model, OTXModel) and model.input_size != input_size: + msg = "Model is already initialized. Input size will be ignored to model." 
+ logging.warning(msg) + self._datamodule: OTXDataModule | None = ( datamodule if datamodule is not None else self._auto_configurator.get_datamodule() ) diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index d544609b1b0..26992720134 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -8,7 +8,7 @@ import logging from copy import deepcopy from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from warnings import warn import datumaro @@ -65,7 +65,7 @@ ], "common_semantic_segmentation_with_subset_dirs": [OTXTaskType.SEMANTIC_SEGMENTATION], "kinetics": [OTXTaskType.ACTION_CLASSIFICATION], - "mvtec": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION], + "mvtec_classification": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION], } OVMODEL_PER_TASK = { @@ -144,11 +144,13 @@ def __init__( data_root: PathLike | None = None, task: OTXTaskType | None = None, model_name: str | None = None, + input_size: Sequence[int] | None = None, ) -> None: self.data_root = data_root self._task = task self._config: dict | None = None self.model_name: str | None = model_name + self.input_size = input_size @property def task(self) -> OTXTaskType: @@ -227,6 +229,9 @@ def get_datamodule(self) -> OTXDataModule | None: _ = data_config.pop("__path__", {}) # Remove __path__ key that for CLI _ = data_config.pop("config", {}) # Remove config key that for CLI + if data_config.get("input_size") is not None and self.input_size is not None: + data_config["input_size"] = self.input_size + return OTXDataModule( train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config), val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config), diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py index e5ec628be4a..aec4a299737 100644 --- a/tests/unit/algo/detection/test_rtmdet.py +++ b/tests/unit/algo/detection/test_rtmdet.py @@ -18,7 +18,7 @@ def test_init(self) -> None: assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXt) assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN) assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead) - assert otx_rtmdet_tiny.image_size == (1, 3, 640, 640) + assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640) assert otx_rtmdet_tiny.tile_image_size == (1, 3, 640, 640) def test_exporter(self) -> None: diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py index b83d0cae71b..3b8cb90e686 100644 --- a/tests/unit/algo/detection/test_yolox.py +++ b/tests/unit/algo/detection/test_yolox.py @@ -18,11 +18,11 @@ def test_init(self) -> None: assert isinstance(otx_yolox_l.model.backbone, CSPDarknet) assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN) assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead) - assert otx_yolox_l.image_size == (1, 3, 640, 640) + assert otx_yolox_l.input_size == (1, 3, 640, 640) assert otx_yolox_l.tile_image_size == (1, 3, 640, 640) otx_yolox_tiny = YOLOXTINY(label_info=3) - assert otx_yolox_tiny.image_size == (1, 3, 416, 416) + assert otx_yolox_tiny.input_size == (1, 3, 416, 416) assert otx_yolox_tiny.tile_image_size == (1, 3, 416, 416) def test_exporter(self) -> None: diff --git a/tests/unit/algo/segmentation/test_dino_v2_seg.py 
b/tests/unit/algo/segmentation/test_dino_v2_seg.py index 7e1a8a9224d..259b6f4816b 100644 --- a/tests/unit/algo/segmentation/test_dino_v2_seg.py +++ b/tests/unit/algo/segmentation/test_dino_v2_seg.py @@ -10,7 +10,7 @@ class TestDinoV2Seg: @pytest.fixture(scope="class") def fxt_dino_v2_seg(self) -> OTXDinoV2Seg: - return OTXDinoV2Seg(label_info=10, export_image_configuration={"image_size": (1, 3, 560, 560)}) + return OTXDinoV2Seg(label_info=10) def test_dino_v2_seg_init(self, fxt_dino_v2_seg): assert isinstance(fxt_dino_v2_seg, OTXDinoV2Seg) diff --git a/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py b/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py index b9fe3a5e58e..fb6fe80c5eb 100644 --- a/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py +++ b/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py @@ -422,7 +422,7 @@ def test_predict_masks(self, mocker, build_zero_shot_segment_anything) -> None: ) zero_shot_segment_anything = build_zero_shot_segment_anything() - zero_shot_segment_anything.image_size = 6 + zero_shot_segment_anything.input_size = 6 mask = zero_shot_segment_anything._predict_masks( mode="infer", From a40db406d96c54137054fd3482765c8782a4fc04 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 5 Aug 2024 17:09:02 +0900 Subject: [PATCH 05/42] update interface --- .../instance_segmentation/heads/custom_roi_head.py | 1 - src/otx/cli/cli.py | 12 +++++------- src/otx/core/data/module.py | 5 +++++ src/otx/engine/engine.py | 11 +---------- src/otx/engine/utils/auto_configurator.py | 9 +++++++-- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py index 4536956b873..360027b1376 100644 --- a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py +++ b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py @@ -548,7 +548,6 @@ def bbox_loss(self, x: tuple[Tensor], sampling_results: list[SamplingResult], ba class CustomConvFCBBoxHead(Shared2FCBBoxHead, ClassIncrementalMixin): """CustomConvFCBBoxHead class for OTX.""" - # checked def loss_and_target( self, diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 6759d97adbe..fcec67ec2d7 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -331,18 +331,16 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # For num_classes update, Model and Metric are instantiated separately. 
model_config = self.config[self.subcommand].pop("model")
 
-        input_size = self.config["train"]["engine"].get("input_size")
-        if input_size is not None:
-            if isinstance(input_size, int):
-                input_size = (input_size, input_size)
-            self.config["train"]["data"]["input_size"] = input_size
-            model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
-
         # Instantiate the things that don't need to special handling
         self.config_init = self.parser.instantiate_classes(self.config)
         self.workspace = self.get_config_value(self.config_init, "workspace")
         self.datamodule = self.get_config_value(self.config_init, "data")
 
+        if (input_size := self.datamodule.input_size) is not None:
+            if isinstance(input_size, int):
+                input_size = (input_size, input_size)
+            model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+
         # Instantiate the model and needed components
         self.model = self.instantiate_model(model_config=model_config)
 
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index a1a5cdced8a..d371bb5320d 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -63,6 +63,7 @@ def __init__(
         auto_num_workers: bool = False,
         device: DeviceType = DeviceType.auto,
         input_size: int | tuple[int, int] | None = None,
+        adaptive_input_size: bool = False,
     ) -> None:
         """Constructor."""
         super().__init__()
@@ -70,10 +71,14 @@ def __init__(
         self.data_format = data_format
         self.data_root = data_root
 
+        if adaptive_input_size:
+            print("adaptive_input_size works")
+
         if input_size is not None:
             for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
                 if subset_cfg.input_size is None:
                     subset_cfg.input_size = input_size
+        self.input_size = input_size
 
         self.train_subset = train_subset
         self.val_subset = val_subset
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index edd0d6c063c..ee5ff4dce35 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -122,7 +122,6 @@ def __init__(
         checkpoint: PathLike | None = None,
         device: DeviceType = DeviceType.auto,
         num_devices: int = 1,
-        input_size: Sequence[int] | int | None = None,
         **kwargs,
     ):
         """Initializes the OTX Engine.
@@ -147,17 +146,8 @@ def __init__(
             data_root=data_root,
             task=datamodule.task if datamodule is not None else task,
             model_name=None if isinstance(model, OTXModel) else model,
-            input_size=input_size,
         )
 
-        if input_size is not None:
-            if isinstance(datamodule, OTXDataModule) and datamodule.input_size != input_size:
-                msg = "Data module is already initialized, so the given input size will be ignored."
-                logging.warning(msg)
-            if isinstance(model, OTXModel) and model.input_size != input_size:
-                msg = "Model is already initialized, so the given input size will be ignored."
- logging.warning(msg) - self._datamodule: OTXDataModule | None = ( datamodule if datamodule is not None else self._auto_configurator.get_datamodule() ) @@ -169,6 +159,7 @@ def __init__( if isinstance(model, OTXModel) else self._auto_configurator.get_model( label_info=self._datamodule.label_info if self._datamodule is not None else None, + input_size=self._datamodule.input_size, ) ) diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 26992720134..65e8f8bf2e7 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -65,7 +65,7 @@ ], "common_semantic_segmentation_with_subset_dirs": [OTXTaskType.SEMANTIC_SEGMENTATION], "kinetics": [OTXTaskType.ACTION_CLASSIFICATION], - "mvtec_classification": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION], + "mvtec": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION], } OVMODEL_PER_TASK = { @@ -245,7 +245,7 @@ def get_datamodule(self) -> OTXDataModule | None: **data_config, ) - def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | None = None) -> OTXModel: + def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | None = None, input_size: Sequence[int] | None = None) -> OTXModel: """Retrieves the OTXModel instance based on the provided model name and meta information. Args: @@ -278,6 +278,11 @@ def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | model_config = deepcopy(self.config["model"]) + if input_size is not None: + if isinstance(input_size, int): + input_size = (input_size, input_size) + model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size + model_cls = get_model_cls_from_config(Namespace(model_config)) if should_pass_label_info(model_cls): From ac506ef97ea61b2a9f201905f79781bf40926aa2 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 7 Aug 2024 13:35:47 +0900 Subject: [PATCH 06/42] implement adaptive input size draft version --- src/otx/algo/detection/atss.py | 2 - .../base_models/detection_transformer.py | 2 +- src/otx/algo/detection/rtdetr.py | 2 - src/otx/algo/detection/rtmdet.py | 2 - src/otx/algo/detection/ssd.py | 2 - src/otx/algo/detection/yolox.py | 7 +- .../algo/instance_segmentation/maskrcnn.py | 10 +- .../algo/instance_segmentation/maskrcnn_tv.py | 6 +- .../algo/instance_segmentation/rtmdet_inst.py | 2 - src/otx/cli/cli.py | 6 - src/otx/core/data/module.py | 22 +- src/otx/core/data/tile_adaptor.py | 183 ------------- src/otx/core/data/utils/__init__.py | 8 + src/otx/core/data/utils/utils.py | 253 ++++++++++++++++++ src/otx/engine/engine.py | 2 +- src/otx/engine/utils/auto_configurator.py | 5 - src/otx/recipe/detection/yolox_tiny.yaml | 6 +- tests/unit/algo/detection/test_rtmdet.py | 1 - tests/unit/algo/detection/test_yolox.py | 2 - 19 files changed, 279 insertions(+), 244 deletions(-) delete mode 100644 src/otx/core/data/tile_adaptor.py create mode 100644 src/otx/core/data/utils/__init__.py create mode 100644 src/otx/core/data/utils/utils.py diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index e1d5c5842eb..dd5412ac072 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -45,7 +45,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = 
TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 800, 992), ) -> None: super().__init__( label_info=label_info, @@ -56,7 +55,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size @property def _exporter(self) -> OTXModelExporter: diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py index 0f35e94f3b2..18ca34c33de 100644 --- a/src/otx/algo/detection/base_models/detection_transformer.py +++ b/src/otx/algo/detection/base_models/detection_transformer.py @@ -55,7 +55,7 @@ def __init__( if multi_scale is not None: self.multi_scale = multi_scale else: - self.multi_scale = [input_size -i * 64 for i in range(-5, 6)] + [input_size] * 2 + self.multi_scale = [input_size -i * 32 for i in range(-5, 6)] + [input_size] * 2 self.num_classes = num_classes self.num_top_queries = num_top_queries diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 381d77ca83a..4aeb7bed3b2 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -51,7 +51,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 640, 640), ) -> None: if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." @@ -66,7 +65,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _customize_inputs( self, diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index 74c3686cf77..3f8e7b5c34c 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -45,7 +45,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 640, 640), ) -> None: if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
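For context on the multi-scale change in detection_transformer.py above: with the default 640-pixel input, stepping by 32 instead of 64 keeps the candidate training scales within roughly 25% of the base size, and the base size itself appears three times. A quick standalone check of the expression (an illustrative sketch only; the variable names are local to this snippet):

    # Reproduce the default multi-scale list for a 640-pixel input
    input_size = 640
    multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
    print(sorted(multi_scale))
    # [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]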
@@ -60,7 +59,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size @property def _exporter(self) -> OTXModelExporter: diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index 224f69b59ef..910baae419d 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -61,7 +61,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 864, 864), ) -> None: super().__init__( label_info=label_info, @@ -72,7 +71,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg = { diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index 520eaac645a..8341c6419f6 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -46,7 +46,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 640, 640), ) -> None: if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." @@ -61,7 +60,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _customize_inputs( self, @@ -79,11 +77,10 @@ def _exporter(self) -> OTXModelExporter: raise ValueError(msg) swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB - input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=input_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -160,7 +157,6 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 640, 640), ) -> None: if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
@@ -175,7 +171,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 6b8c7b1eaf8..b1af171124a 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -46,11 +46,9 @@ def _exporter(self) -> OTXModelExporter: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size - return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=input_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window", @@ -96,7 +94,6 @@ def __init__( metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 512, 512), ) -> None: super().__init__( label_info=label_info, @@ -107,7 +104,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { @@ -288,7 +284,6 @@ def __init__( metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 512, 512), ) -> None: super().__init__( label_info=label_info, @@ -299,7 +294,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { @@ -497,7 +491,6 @@ def __init__( metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 512, 512), ) -> None: super().__init__( label_info=label_info, @@ -508,7 +501,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> TwoStageDetector: train_cfg = { diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py index 54bb5d1fbc0..dcfe9c3c3c8 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py +++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py @@ -232,11 +232,9 @@ def _exporter(self) -> OTXModelExporter: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size - return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=input_size, + input_size=self.input_size, mean=self.mean, std=self.std, resize_mode="fit_to_window", @@ -283,7 +281,6 @@ def __init__( metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 512, 512), ) -> None: super().__init__( label_info=label_info, @@ -294,7 +291,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def 
_create_model(self) -> nn.Module: """From torchvision tutorial.""" diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index bd39dc05210..96801665710 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -102,7 +102,6 @@ def __init__( metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), - tile_image_size: Sequence[int] = (1, 3, 512, 512), ) -> None: super().__init__( label_info=label_info, @@ -113,7 +112,6 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.tile_image_size = tile_image_size def _build_model(self, num_classes: int) -> SingleStageDetector: train_cfg = { diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index fcec67ec2d7..c0a66c184dd 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -413,12 +413,6 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel: model: OTXModel = model_parser.instantiate_classes(Namespace(model=model_config)).get("model") self.config_init[self.subcommand]["model"] = model - # Update tile config due to adaptive tiling - if model.tile_config.enable_tiler: - # TODO(Eugene): Ticket no. 139000: Need to find a better way to configure image size for OV Models - # https://github.com/openvinotoolkit/training_extensions/pull/2925 - model.input_size = model.tile_image_size - # Update self.config with model self.config[self.subcommand].update(Namespace(model=model_config)) diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index d371bb5320d..8268a18dd4f 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -7,7 +7,7 @@ import logging as log from copy import deepcopy -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING, Iterable, Literal import torch from datumaro import Dataset as DmDataset @@ -24,13 +24,13 @@ parse_mem_cache_size_to_int, ) from otx.core.data.pre_filtering import pre_filtering -from otx.core.data.tile_adaptor import adapt_tile_config from otx.core.types.device import DeviceType from otx.core.types.image import ImageColorChannel from otx.core.types.label import LabelInfo from otx.core.types.task import OTXTaskType from otx.core.utils.instantiators import instantiate_sampler from otx.core.utils.utils import get_adaptive_num_workers +from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config if TYPE_CHECKING: from lightning.pytorch.utilities.parsing import AttributeDict @@ -63,7 +63,7 @@ def __init__( auto_num_workers: bool = False, device: DeviceType = DeviceType.auto, input_size: int | tuple[int, int] | None = None, - adaptive_input_size: bool = False, + adaptive_input_size: Literal["auto", "downscale", "none"] = "none", ) -> None: """Constructor.""" super().__init__() @@ -71,15 +71,6 @@ def __init__( self.data_format = data_format self.data_root = data_root - if adaptive_input_size: - print("adaptive_input_size works") - - if input_size is not None: - for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]: - if subset_cfg.input_size is None: - subset_cfg.input_size = input_size - self.input_size = input_size - self.train_subset = train_subset self.val_subset = val_subset self.test_subset = test_subset @@ -143,6 +134,13 @@ def __init__( subset=self.unlabeled_subset.subset_name, ) + if adaptive_input_size != "none": + input_size = adapt_input_size_to_dataset(dataset, 
input_size, adaptive_input_size=="downscale") + if input_size is not None: + for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]: + subset_cfg.input_size = input_size + self.input_size = input_size + if self.tile_config.enable_tiler and self.tile_config.enable_adaptive_tiling: adapt_tile_config(self.tile_config, dataset=dataset) diff --git a/src/otx/core/data/tile_adaptor.py b/src/otx/core/data/tile_adaptor.py deleted file mode 100644 index 34755dd55eb..00000000000 --- a/src/otx/core/data/tile_adaptor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -"""Tile Adaptor for OTX.""" -from __future__ import annotations - -import logging as log -from typing import Any - -import numpy as np -from datumaro import Bbox, Dataset, DatasetSubset, Polygon - -from otx.core.config.data import TileConfig - - -def compute_robust_statistics(values: np.array) -> dict[str, float]: - """Computes robust statistics of given samples. - - Args: - values (np.array): Array of samples - - Returns: - dict[str, float]: Robust avg, min, max values - """ - stat: dict = {} - if values.size == 0: - return stat - - avg_value = np.mean(values) - std_value = np.std(values) - avg_3std_min_value = avg_value - 3 * std_value - avg_3std_max_value = avg_value + 3 * std_value - min_value = np.min(values) - max_value = np.max(values) - - # Refine min/max to reduce outlier effect - robust_min_value = max(min_value, avg_3std_min_value) - robust_max_value = min(max_value, avg_3std_max_value) - - stat["avg"] = float(avg_value) - stat["std"] = float(std_value) - stat["min"] = float(min_value) - stat["max"] = float(max_value) - stat["robust_min"] = float(robust_min_value) - stat["robust_max"] = float(robust_max_value) - return stat - - -def compute_robust_scale_statistics(values: np.array) -> dict[str, float]: - """Computes robust statistics of scale values. - - Average of 0.5x scale and 2x scale should be 1x - - Args: - values (np.array): Array of positive scale values - - Returns: - dict[str, float]: Robust avg, min, max values - """ - # Compute stat in log scale & convert back to original scale - if values.size == 0: - return {} - - stat = compute_robust_statistics(np.log(values)) - stat = {k: float(np.exp(v)) for k, v in stat.items()} - # Normal scale std is easier to understand - stat["std"] = float(np.std(values)) - return stat - - -def compute_robust_dataset_statistics( - dataset: DatasetSubset, - ann_stat: bool = False, - max_samples: int = 1000, -) -> dict[str, Any]: - """Computes robust statistics of image & annotation sizes. - - Args: - dataset (DatasetSubset): Input dataset. - ann_stat (bool, optional): Whether to compute annotation size statistics. Defaults to False. - max_samples (int, optional): Maximum number of dataset subsamples to analyze. Defaults to 1000. - - Returns: - Dict[str, Any]: Robust avg, min, max values for images, and annotations optionally. 
- ex) stat = { - "image": {"avg": ...}, - "annotation": { - "num_per_image": {"avg": ...}, - "size_of_shape": {"avg": ...}, - } - } - """ - stat: dict = {} - if len(dataset) == 0 or max_samples <= 0: - return stat - - data_ids = [item.id for item in dataset] - max_image_samples = min(max_samples, len(dataset)) - # NOTE: current OTX does not set seed globally - rng = np.random.default_rng(42) - data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples] - - image_sizes = [] - for idx in data_ids: - data = dataset.get(id=idx, subset=dataset.name) - height, width = data.media.size - image_sizes.append(np.sqrt(width * height)) - stat["image"] = compute_robust_scale_statistics(np.array(image_sizes)) - - if ann_stat: - stat["annotation"] = {} - num_per_images: list[int] = [] - size_of_box_shapes: list[float] = [] - size_of_polygon_shapes: list[float] = [] - for idx in data_ids: - data = dataset.get(id=idx, subset=dataset.name) - annotations: dict[str, list] = {"boxes": [], "polygons": []} - for ann in data.annotations: - if isinstance(ann, Bbox): - annotations["boxes"].append(ann) - elif isinstance(ann, Polygon): - annotations["polygons"].append(ann) - - num_per_images.append(max(len(annotations["boxes"]), len(annotations["polygons"]))) - - if len(size_of_box_shapes) >= max_samples or len(size_of_polygon_shapes) >= max_samples: - continue - - size_of_box_shapes.extend( - filter(lambda x: x >= 1, [np.sqrt(anno.get_area()) for anno in annotations["boxes"]]), - ) - size_of_polygon_shapes.extend( - filter(lambda x: x >= 1, [np.sqrt(anno.get_area()) for anno in annotations["polygons"]]), - ) - - stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images)) - stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics( - np.array(size_of_polygon_shapes) if len(size_of_polygon_shapes) else np.array(size_of_box_shapes), - ) - - return stat - - -def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None: - """Config tile parameters. - - Adapt based on annotation statistics. - i.e. tile size, tile overlap, ratio and max objects per sample - - Args: - tile_config (TileConfig): tiling parameters of the model - dataset (Dataset): Datumaro dataset including all subsets - """ - if (train_dataset := dataset.subsets().get("train")) is not None: - stat = compute_robust_dataset_statistics(train_dataset, ann_stat=True) - max_num_objects = round(stat["annotation"]["num_per_image"]["max"]) - avg_size = stat["annotation"]["size_of_shape"]["avg"] - min_size = stat["annotation"]["size_of_shape"]["robust_min"] - max_size = stat["annotation"]["size_of_shape"]["robust_max"] - log.info(f"----> [stat] scale avg: {avg_size}") - log.info(f"----> [stat] scale min: {min_size}") - log.info(f"----> [stat] scale max: {max_size}") - - log.info("[Adaptive tiling pararms]") - object_tile_ratio = tile_config.object_tile_ratio - tile_size = int(avg_size / object_tile_ratio) - tile_overlap = max_size / tile_size - log.info(f"----> avg_object_size: {avg_size}") - log.info(f"----> max_object_size: {max_size}") - log.info(f"----> object_tile_ratio: {object_tile_ratio}") - log.info(f"----> tile_size: {avg_size} / {object_tile_ratio} = {tile_size}") - log.info(f"----> tile_overlap: {max_size} / {tile_size} = {tile_overlap}") - - if tile_overlap >= 0.9: - # Use the average object area if the tile overlap is too large to prevent 0 stride. 
- tile_overlap = min(avg_size / tile_size, 0.9) - log.info(f"----> (too big) tile_overlap: {avg_size} / {tile_size} = min[{tile_overlap}, 0.9]") - - # TODO(Eugene): how to validate lower/upper_bound? dataclass? pydantic? - # https://github.com/openvinotoolkit/training_extensions/pull/2903 - tile_config.tile_size = (tile_size, tile_size) - tile_config.max_num_instances = max_num_objects - tile_config.overlap = tile_overlap diff --git a/src/otx/core/data/utils/__init__.py b/src/otx/core/data/utils/__init__.py new file mode 100644 index 00000000000..adc0400284e --- /dev/null +++ b/src/otx/core/data/utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Utility modules for core data modules.""" + +from .utils import adapt_tile_config, adapt_input_size_to_dataset + +__all__ = ["adapt_tile_config", "adapt_input_size_to_dataset"] diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py new file mode 100644 index 00000000000..0d7f3fbfdbd --- /dev/null +++ b/src/otx/core/data/utils/utils.py @@ -0,0 +1,253 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Functions for adaptive input size.""" +from __future__ import annotations + +import logging +from collections import defaultdict +from typing import TYPE_CHECKING, Any + +import numpy as np +from datumaro.components.annotation import _Shape + +if TYPE_CHECKING: + from datumaro import Dataset, DatasetSubset + + from otx.core.config.data import TileConfig + + +logger = logging.getLogger(__name__) + + +def compute_robust_statistics(values: np.array) -> dict[str, float]: + """Computes robust statistics of given samples. + + Args: + values (np.array): Array of samples + + Returns: + dict[str, float]: Robust avg, min, max values + """ + stat: dict = {} + if values.size == 0: + return stat + + avg_value = np.mean(values) + std_value = np.std(values) + avg_3std_min_value = avg_value - 3 * std_value + avg_3std_max_value = avg_value + 3 * std_value + min_value = np.min(values) + max_value = np.max(values) + + # Refine min/max to reduce outlier effect + robust_min_value = max(min_value, avg_3std_min_value) + robust_max_value = min(max_value, avg_3std_max_value) + + stat["avg"] = float(avg_value) + stat["std"] = float(std_value) + stat["min"] = float(min_value) + stat["max"] = float(max_value) + stat["robust_min"] = float(robust_min_value) + stat["robust_max"] = float(robust_max_value) + return stat + + +def compute_robust_scale_statistics(values: np.array) -> dict[str, float]: + """Computes robust statistics of scale values. + + Average of 0.5x scale and 2x scale should be 1x + + Args: + values (np.array): Array of positive scale values + + Returns: + dict[str, float]: Robust avg, min, max values + """ + # Compute stat in log scale & convert back to original scale + if values.size == 0: + return {} + + stat = compute_robust_statistics(np.log(values)) + stat = {k: float(np.exp(v)) for k, v in stat.items()} + # Normal scale std is easier to understand + stat["std"] = float(np.std(values)) + return stat + + +def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = 1000) -> dict[str, Any]: + """Computes robust statistics of image & annotation sizes. + + Args: + dataset (DatasetSubset): Input dataset. + max_samples (int, optional): Maximum number of dataset subsamples to analyze. Defaults to 1000. + + Returns: + Dict[str, Any]: Robust avg, min, max values for images, and annotations optionally. 
+        ex) stat = {
+            "image": {"avg": ...},
+            "annotation": {
+                "num_per_image": {"avg": ...},
+                "size_of_shape": {"avg": ...},
+            }
+        }
+    """
+    stat: dict = {}
+    if len(dataset) == 0 or max_samples <= 0:
+        return stat
+
+    data_ids = [item.id for item in dataset]
+    max_image_samples = min(max_samples, len(dataset))
+    rng = np.random.default_rng(42)
+    data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples]
+
+    image_sizes = []
+    for idx in data_ids:
+        data = dataset.get(id=idx, subset=dataset.name)
+        height, width = data.media.size
+        image_sizes.append(np.sqrt(width * height))
+    stat["image"] = compute_robust_scale_statistics(np.array(image_sizes))
+
+    stat["annotation"] = {}
+    num_per_images: list[int] = []
+    size_of_shapes: dict[str, list] = defaultdict(list)
+    for idx in data_ids:
+        data = dataset.get(id=idx, subset=dataset.name)
+        annotations: dict[str, list] = defaultdict(list)
+        for ann in data.annotations:
+            annotations[ann.__class__.__name__].append(ann)
+
+        num_per_images.append(max(len(val) for val in annotations.values()))
+
+        if max(len(val) for val in size_of_shapes.values()) >= max_samples:
+            continue
+
+        for ann_type, anns in annotations.items():
+            size_of_shapes[ann_type].extend(
+                np.sqrt(area) for val in anns if isinstance(val, _Shape) and (area := val.get_area()) >= 1
+            )
+
+    stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images))
+    if "Polygon" in size_of_shapes:
+        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes["Polygon"]))
+    else:
+        max_ann_type = None
+        max_num_ann = 0
+        for ann_type, anns in size_of_shapes.items():
+            if max_num_ann < len(anns):
+                max_ann_type = ann_type
+                max_num_ann = len(anns)
+        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes[max_ann_type]))
+
+    return stat
+
+
+def adapt_input_size_to_dataset(dataset: Dataset, base_input_size = None, downscale_only: bool = True) -> tuple[int, int]:
+    """Compute appropriate model input size w.r.t. dataset statistics.
+
+    Args:
+        dataset (Dataset): Datumaro dataset including all subsets.
+        base_input_size (int | None, optional): Base input size of the model.
+            None to consider only dataset statistics. Defaults to None.
+        downscale_only (bool): Whether to allow only sizes smaller than the base input size. Defaults to True.
+
+    Returns:
+        Tuple[int, int]: (width, height)
+    """
+    MIN_RECOGNIZABLE_OBJECT_SIZE = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
+    # meaning NxN input pixels being downscaled to 1x1 on feature map
+    MIN_DETECTION_INPUT_SIZE = 256  # Minimum input size for object detection
+
+    train_dataset = dataset.subsets().get("train")
+    if train_dataset is None:
+        return
+
+    logger.info("Adapting model input size based on dataset stat")
+    stat = compute_robust_dataset_statistics(train_dataset)
+    max_image_size = stat["image"]["robust_max"]
+    min_object_size = None
+    if stat["annotation"]:
+        # Refine using annotation shape size stat
+        # Fit to typical small object size (conservative)
+        # -> "avg" size might be preferable for efficiency
+        min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
+
+    base_input_size = base_input_size
+    if isinstance(base_input_size, dict):
+        base_input_size = base_input_size.get("train", base_input_size.get("test", None))
+    logger.info(f"-> Current base input size: {base_input_size}")
+
+    if max_image_size <= 0:
+        return base_input_size
+
+    image_size = max_image_size
+    logger.info(f"-> Based on typical large image size: {image_size}")
+
+    # Refine using annotation shape size stat
+    if min_object_size is not None and min_object_size > 0:
+        image_size = round(image_size * MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
+        logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}")
+        if image_size > max_image_size:
+            image_size = max_image_size
+            logger.info(f"-> Restrict to max image size: {image_size}")
+        if image_size < MIN_DETECTION_INPUT_SIZE:
+            image_size = MIN_DETECTION_INPUT_SIZE
+            logger.info(f"-> Based on minimum object detection input size: {image_size}")
+
+    input_size = (round(image_size), round(image_size))
+
+    if downscale_only:
+
+        def area(x):
+            return x[0] * x[1]
+
+        if base_input_size and area(input_size) >= area(base_input_size):
+            logger.info(f"-> Downscale only: {input_size} -> {base_input_size}")
+            return base_input_size
+
+    # Closest preset
+    logger.info(f"-> Closest preset: {input_size}")
+    return input_size
+
+
+def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
+    """Config tile parameters.
+
+    Adapt based on annotation statistics.
+    i.e. tile size, tile overlap, ratio and max objects per sample
+
+    Args:
+        tile_config (TileConfig): tiling parameters of the model
+        dataset (Dataset): Datumaro dataset including all subsets
+    """
+    if (train_dataset := dataset.subsets().get("train")) is not None:
+        stat = compute_robust_dataset_statistics(train_dataset)
+        max_num_objects = round(stat["annotation"]["num_per_image"]["max"])
+        avg_size = stat["annotation"]["size_of_shape"]["avg"]
+        min_size = stat["annotation"]["size_of_shape"]["robust_min"]
+        max_size = stat["annotation"]["size_of_shape"]["robust_max"]
+        logger.info(f"----> [stat] scale avg: {avg_size}")
+        logger.info(f"----> [stat] scale min: {min_size}")
+        logger.info(f"----> [stat] scale max: {max_size}")
+
+        logger.info("[Adaptive tiling params]")
+        object_tile_ratio = tile_config.object_tile_ratio
+        tile_size = int(avg_size / object_tile_ratio)
+        tile_overlap = max_size / tile_size
+        logger.info(f"----> avg_object_size: {avg_size}")
+        logger.info(f"----> max_object_size: {max_size}")
+        logger.info(f"----> object_tile_ratio: {object_tile_ratio}")
+        logger.info(f"----> tile_size: {avg_size} / {object_tile_ratio} = {tile_size}")
+        logger.info(f"----> tile_overlap: {max_size} / {tile_size} = {tile_overlap}")
+
+        if tile_overlap >= 0.9:
+            # Use the average object area if the tile overlap is too large to prevent 0 stride.
+            tile_overlap = min(avg_size / tile_size, 0.9)
+            logger.info(f"----> (too big) tile_overlap: {avg_size} / {tile_size} = min[{tile_overlap}, 0.9]")
+
+        # TODO(Eugene): how to validate lower/upper_bound? dataclass? pydantic?
+        # https://github.com/openvinotoolkit/training_extensions/pull/2903
+        tile_config.tile_size = (tile_size, tile_size)
+        tile_config.max_num_instances = max_num_objects
+        tile_config.overlap = tile_overlap
\ No newline at end of file
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index ee5ff4dce35..c6a66d455b1 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -12,7 +12,7 @@
 import time
 from contextlib import contextmanager
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal
 from warnings import warn
 
 import torch
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 65e8f8bf2e7..3a5235e1f9f 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -144,13 +144,11 @@ def __init__(
         data_root: PathLike | None = None,
         task: OTXTaskType | None = None,
         model_name: str | None = None,
-        input_size: Sequence[int] | None = None
     ) -> None:
         self.data_root = data_root
         self._task = task
         self._config: dict | None = None
         self.model_name: str | None = model_name
-        self.input_size = input_size
 
     @property
     def task(self) -> OTXTaskType:
@@ -229,9 +227,6 @@ def get_datamodule(self) -> OTXDataModule | None:
         _ = data_config.pop("__path__", {})  # Remove __path__ key that for CLI
         _ = data_config.pop("config", {})  # Remove config key that for CLI
 
-        if getattr(data_config, "input_size", None) is not None and self.input_size is not None:
-            data_config["input_size"] = self.input_size
-
         return OTXDataModule(
             train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config),
             val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config),
diff --git a/src/otx/recipe/detection/yolox_tiny.yaml b/src/otx/recipe/detection/yolox_tiny.yaml
index 9950a427274..bdeee86606c 100644 --- a/src/otx/recipe/detection/yolox_tiny.yaml +++ b/src/otx/recipe/detection/yolox_tiny.yaml @@ -37,10 +37,10 @@ overrides: gradient_clip_val: 35.0 data: - input_size: - - 640 - - 640 train_subset: + input_size: + - 640 + - 640 batch_size: 8 transforms: - class_path: otx.core.data.transform_libs.torchvision.CachedMosaic diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py index aec4a299737..9344687894c 100644 --- a/tests/unit/algo/detection/test_rtmdet.py +++ b/tests/unit/algo/detection/test_rtmdet.py @@ -19,7 +19,6 @@ def test_init(self) -> None: assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN) assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead) assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640) - assert otx_rtmdet_tiny.tile_image_size == (1, 3, 640, 640) def test_exporter(self) -> None: otx_rtmdet_tiny = RTMDetTiny(label_info=3) diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py index 3b8cb90e686..29ffdd8172e 100644 --- a/tests/unit/algo/detection/test_yolox.py +++ b/tests/unit/algo/detection/test_yolox.py @@ -19,11 +19,9 @@ def test_init(self) -> None: assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN) assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead) assert otx_yolox_l.input_size == (1, 3, 640, 640) - assert otx_yolox_l.tile_image_size == (1, 3, 640, 640) otx_yolox_tiny = YOLOXTINY(label_info=3) assert otx_yolox_tiny.input_size == (1, 3, 416, 416) - assert otx_yolox_tiny.tile_image_size == (1, 3, 416, 416) def test_exporter(self) -> None: otx_yolox_l = YOLOXL(label_info=3) From b1121f06687d057d776c479361c493c8346da115 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 7 Aug 2024 13:51:07 +0900 Subject: [PATCH 07/42] handle edge case --- src/otx/core/data/utils/utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 0d7f3fbfdbd..05178ffd159 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -"""Functions for adaptive input size.""" +"""Utility functions for the data module.""" from __future__ import annotations import logging @@ -118,9 +118,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = for ann in data.annotations: annotations[ann.__class__.__name__].append(ann) - num_per_images.append(max(len(val) for val in annotations.values())) + num_per_images.append(max(len(val) for val in annotations.values()) if annotations else 0) - if max(len(val) for val in size_of_shapes.values()) >= max_samples: + if size_of_shapes and max(len(val) for val in size_of_shapes.values()) >= max_samples: continue for ann_type, anns in annotations.items(): @@ -143,7 +143,11 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = return stat -def adapt_input_size_to_dataset(dataset: Dataset, base_input_size = None, downscale_only: bool = True) -> tuple[int, int]: +def adapt_input_size_to_dataset( + dataset: Dataset, + base_input_size: int | None = None, + downscale_only: bool = True +) -> tuple[int, int]: """Compute appropriate model input size w.r.t. dataset statistics. 
Args: @@ -250,4 +254,4 @@ def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None: # https://github.com/openvinotoolkit/training_extensions/pull/2903 tile_config.tile_size = (tile_size, tile_size) tile_config.max_num_instances = max_num_objects - tile_config.overlap = tile_overlap \ No newline at end of file + tile_config.overlap = tile_overlap From 659a9cf61651fc4d111e7ee26c2d365762484ce8 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Wed, 7 Aug 2024 21:40:17 +0900 Subject: [PATCH 08/42] add input_size_multiplier and pass it to datamodule in cli --- src/otx/algo/detection/rtdetr.py | 5 +---- src/otx/algo/detection/rtmdet.py | 6 ++---- src/otx/algo/detection/yolox.py | 10 ++-------- src/otx/algo/segmentation/dino_v2_seg.py | 7 +++---- .../algo/visual_prompting/segment_anything.py | 5 ++--- src/otx/cli/cli.py | 10 +++++++++- src/otx/core/data/module.py | 8 +++++++- src/otx/core/data/utils/utils.py | 18 +++++++++++++----- src/otx/core/model/base.py | 9 +++++++++ src/otx/engine/utils/auto_configurator.py | 3 +++ 10 files changed, 51 insertions(+), 30 deletions(-) diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 4aeb7bed3b2..13e5fdcae4e 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -38,6 +38,7 @@ class RTDETR(ExplainableOTXDetModel): """RTDETR model.""" + input_size_multiplier = 32 mean: tuple[float, float, float] = (0.0, 0.0, 0.0) std: tuple[float, float, float] = (255.0, 255.0, 255.0) load_from: str | None = None @@ -52,10 +53,6 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: - if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: - msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." - raise ValueError(msg) - super().__init__( label_info=label_info, input_size=input_size, diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index 3f8e7b5c34c..e5757944759 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -36,6 +36,8 @@ class RTMDet(ExplainableOTXDetModel): """OTX Detection model class for RTMDet.""" + input_size_multiplier = 32 + def __init__( self, label_info: LabelInfoTypes, @@ -46,10 +48,6 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: - if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: - msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." - raise ValueError(msg) - super().__init__( label_info=label_info, input_size=input_size, diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index 8341c6419f6..800c0e17f5e 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -37,6 +37,8 @@ class YOLOX(ExplainableOTXDetModel): """OTX Detection model class for YOLOX.""" + input_size_multiplier = 32 + def __init__( self, label_info: LabelInfoTypes, @@ -47,10 +49,6 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: - if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: - msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." 
- raise ValueError(msg) - super().__init__( label_info=label_info, input_size=input_size, @@ -158,10 +156,6 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: - if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0: - msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead." - raise ValueError(msg) - super().__init__( label_info=label_info, input_size=input_size, diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py index c4be747d4e0..c3e455a2910 100644 --- a/src/otx/algo/segmentation/dino_v2_seg.py +++ b/src/otx/algo/segmentation/dino_v2_seg.py @@ -50,6 +50,9 @@ class DinoV2Seg(BaseSegmModel): class OTXDinoV2Seg(TorchVisionCompatibleModel): """DinoV2Seg Model.""" + + input_size_multiplier = 14 + def __init__( self, label_info: LabelInfoTypes, @@ -64,10 +67,6 @@ def __init__( export_image_configuration: dict[str, Any] | None = None, name_base_model: str = "semantic_segmentation_model", ): - if input_size[-1] % 14 != 0 or input_size[-2] % 14 != 0: - msg = f"Input size should be a multiple of 14, but got {input_size[-2:]} instead." - raise ValueError(msg) - super().__init__( label_info=label_info, input_size=input_size, diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index 7e8bf94d65a..a948f9ae648 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -490,6 +490,8 @@ def select_masks(self, masks: Tensor, iou_preds: Tensor, num_points: int) -> tup class OTXSegmentAnything(OTXVisualPromptingModel): """Visual Prompting model.""" + input_size_multiplier = 16 + def __init__( self, backbone: Literal["tiny_vit", "vit_b"], @@ -510,9 +512,6 @@ def __init__( if input_size[-1] != input_size[-2]: msg = f"SAM should use square image, but got {input_size}" raise ValueError(msg) - if input_size[-1] % 16 != 0 and input_size[-2] % 16 != 0: - msg = f"Input size should be a multiple of 16, but got {input_size[-2:]} instead." - raise ValueError(msg) self.config = { "backbone": backbone, diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index c0a66c184dd..178673c5ee2 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -23,6 +23,7 @@ from otx.cli.utils.workspace import Workspace from otx.core.types.task import OTXTaskType from otx.core.utils.imports import get_otx_root_path +from otx.utils.utils import get_model_cls_from_config if TYPE_CHECKING: from jsonargparse._actions import _ActionSubCommands @@ -331,6 +332,11 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # For num_classes update, Model and Metric are instantiated separately. 
model_config = self.config[self.subcommand].pop("model") + if self.config[self.subcommand].data.adaptive_input_size != "none": + model_cls = get_model_cls_from_config(model_config) + if hasattr(model_cls, "input_size_multiplier"): + self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier + # Instantiate the things that don't need to special handling self.config_init = self.parser.instantiate_classes(self.config) self.workspace = self.get_config_value(self.config_init, "workspace") @@ -339,6 +345,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: if (input_size := self.datamodule.input_size) is not None: if isinstance(input_size, int): input_size = (input_size, input_size) + else: + input_size = tuple(input_size) model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size # Instantiate the model and needed components @@ -374,7 +382,7 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel: tuple: The model and optimizer and scheduler. """ from otx.core.model.base import OTXModel - from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info + from otx.utils.utils import can_pass_tile_config, should_pass_label_info skip = set() diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index 8268a18dd4f..3e1b6f5f133 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -64,6 +64,7 @@ def __init__( device: DeviceType = DeviceType.auto, input_size: int | tuple[int, int] | None = None, adaptive_input_size: Literal["auto", "downscale", "none"] = "none", + input_size_multiplier: int = 1, ) -> None: """Constructor.""" super().__init__() @@ -135,7 +136,12 @@ def __init__( ) if adaptive_input_size != "none": - input_size = adapt_input_size_to_dataset(dataset, input_size, adaptive_input_size=="downscale") + input_size = adapt_input_size_to_dataset( + dataset, + input_size, + adaptive_input_size=="downscale", + input_size_multiplier, + ) if input_size is not None: for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]: subset_cfg.input_size = input_size diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 05178ffd159..665994fe2ef 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -145,8 +145,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = def adapt_input_size_to_dataset( dataset: Dataset, - base_input_size: int | None = None, - downscale_only: bool = True + base_input_size: int | tuple[int, int] | None = None, + downscale_only: bool = True, + input_size_multiplier: int | None = None, ) -> tuple[int, int]: """Compute appropriate model input size w.r.t. dataset statistics. @@ -163,6 +164,13 @@ def adapt_input_size_to_dataset( # meaning NxN input pixels being downscaled to 1x1 on feature map MIN_DETECTION_INPUT_SIZE = 256 # Minimum input size for object detection + if downscale_only and base_input_size is None: + msg = "If downscale_only is set to True, base_input_size should be set but got None." 
+        raise ValueError(msg)
+
+    if isinstance(base_input_size, int):
+        base_input_size = (base_input_size, base_input_size)
+
     train_dataset = dataset.subsets().get("train")
     if train_dataset is None:
         return
@@ -177,9 +185,6 @@ def adapt_input_size_to_dataset(
         # -> "avg" size might be preferable for efficiency
         min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
 
-    base_input_size = base_input_size
-    if isinstance(base_input_size, dict):
-        base_input_size = base_input_size.get("train", base_input_size.get("test", None))
     logger.info(f"-> Current base input size: {base_input_size}")
 
     if max_image_size <= 0:
         return base_input_size
@@ -199,6 +204,9 @@ def adapt_input_size_to_dataset(
         image_size = MIN_DETECTION_INPUT_SIZE
         logger.info(f"-> Based on minimum object detection input size: {image_size}")
 
+    if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
+        image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier
+
     input_size = (round(image_size), round(image_size))
 
     if downscale_only:
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 10d7475962b..405116b762e 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -114,6 +114,7 @@ def __init__(
         super().__init__()
 
         self._label_info = self._dispatch_label_info(label_info)
+        self._check_input_size(input_size)
         self.input_size = input_size
         self.classification_layers: dict[str, dict[str, Any]] = {}
         self.model = self._create_model()
@@ -809,6 +810,14 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
         raise TypeError(label_info)
 
+    def _check_input_size(self, input_size: Sequence[int] | None = None) -> None:
+        if (
+            input_size is not None
+            and hasattr(self, "input_size_multiplier")
+            and (input_size[-1] % self.input_size_multiplier != 0 or input_size[-2] % self.input_size_multiplier != 0)
+        ):
+            msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
 
 class OVModel(OTXModel, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEntity]):
     """Base class for the OpenVINO model.
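Taken together, the pieces added in this patch keep the adapted size aligned with the model stride: adapt_input_size_to_dataset rounds an adapted image size up to the next multiple of input_size_multiplier, and OTXModel._check_input_size rejects unaligned sizes. A minimal standalone sketch of both behaviors (the helper names below are illustrative, not part of the patch):

    # Round a size up to the next multiple, as in adapt_input_size_to_dataset
    def round_up_to_multiple(image_size: int, multiplier: int) -> int:
        if image_size % multiplier != 0:
            image_size = (image_size // multiplier + 1) * multiplier
        return image_size

    # Reject unaligned sizes, as in OTXModel._check_input_size
    def check_input_size(input_size: tuple[int, ...], multiplier: int) -> None:
        if input_size[-1] % multiplier != 0 or input_size[-2] % multiplier != 0:
            msg = f"Input size should be a multiple of {multiplier}, but got {input_size[-2:]} instead."
            raise ValueError(msg)

    assert round_up_to_multiple(1000, 32) == 1024
    check_input_size((1, 3, 1024, 1024), 32)  # passes
    check_input_size((1, 3, 560, 560), 14)    # passes: 560 == 14 * 40
    # check_input_size((1, 3, 416, 416), 14)  # would raise: 416 % 14 == 10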
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 3a5235e1f9f..90790d59521 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -22,6 +22,7 @@ from otx.core.types.task import OTXTaskType from otx.core.utils.imports import get_otx_root_path from otx.core.utils.instantiators import partial_instantiate_class +from otx.core.utils.utils import import_object_from_module from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info if TYPE_CHECKING: @@ -276,6 +277,8 @@ def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | if input_size is not None: if isinstance(input_size, int): input_size = (input_size, input_size) + else: + input_size = tuple(input_size) model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size model_cls = get_model_cls_from_config(Namespace(model_config)) From e32902949f490eac266d8b2a7c8f092f6290511e Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Thu, 8 Aug 2024 15:22:01 +0900 Subject: [PATCH 09/42] change typehint from sequence to tuple --- src/otx/algo/action_classification/movinet.py | 4 ++-- src/otx/algo/action_classification/x3d.py | 4 ++-- src/otx/algo/classification/dino_v2.py | 4 ++-- src/otx/algo/classification/efficientnet.py | 8 ++++---- src/otx/algo/classification/efficientnet_v2.py | 8 ++++---- .../algo/classification/huggingface_model.py | 5 +++-- src/otx/algo/classification/mobilenet_v3.py | 8 ++++---- .../algo/classification/torchvision_model.py | 4 ++-- src/otx/algo/classification/vit.py | 8 ++++---- src/otx/algo/detection/atss.py | 16 ++++++++-------- .../base_models/detection_transformer.py | 2 +- src/otx/algo/detection/huggingface_model.py | 7 +++---- src/otx/algo/detection/rtdetr.py | 14 +++++++------- src/otx/algo/detection/rtmdet.py | 14 +++++++------- src/otx/algo/detection/ssd.py | 14 +++++++------- src/otx/algo/detection/yolox.py | 17 +++++++++-------- src/otx/algo/instance_segmentation/maskrcnn.py | 18 +++++++++--------- .../algo/instance_segmentation/maskrcnn_tv.py | 14 +++++++------- .../algo/instance_segmentation/rtmdet_inst.py | 16 ++++++++-------- src/otx/algo/segmentation/dino_v2_seg.py | 12 ++++++------ src/otx/algo/segmentation/huggingface_model.py | 6 +++--- src/otx/algo/segmentation/litehrnet.py | 13 +++++++------ src/otx/algo/segmentation/segnext.py | 13 +++++++------ .../algo/visual_prompting/segment_anything.py | 6 +++--- src/otx/cli/cli.py | 6 ++++-- src/otx/core/data/module.py | 4 ++-- src/otx/core/data/utils/__init__.py | 2 +- src/otx/core/data/utils/utils.py | 3 +-- src/otx/core/model/action_classification.py | 4 ++-- src/otx/core/model/base.py | 5 +++-- src/otx/core/model/classification.py | 8 ++++---- src/otx/core/model/detection.py | 4 ++-- src/otx/core/model/instance_segmentation.py | 6 +++--- src/otx/core/model/segmentation.py | 8 ++++---- src/otx/core/model/visual_prompting.py | 4 ++-- src/otx/engine/engine.py | 11 +++++------ src/otx/engine/utils/auto_configurator.py | 11 ++++++++--- 37 files changed, 160 insertions(+), 151 deletions(-) diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py index 7c5861d2af6..4803aba3d9e 100644 --- a/src/otx/algo/action_classification/movinet.py +++ b/src/otx/algo/action_classification/movinet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from 
typing import TYPE_CHECKING from torch import nn @@ -32,7 +32,7 @@ class MoViNet(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 1, 3, 8, 224, 224), + input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index 6c26f2deb2f..60b58b6521d 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -4,7 +4,7 @@ """X3D model implementation.""" from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from torch import nn @@ -31,7 +31,7 @@ class X3D(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 1, 3, 8, 224, 224), + input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py index 592d2616a2e..7c2afda0d79 100644 --- a/src/otx/algo/classification/dino_v2.py +++ b/src/otx/algo/classification/dino_v2.py @@ -8,7 +8,7 @@ import logging import os from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Sequence +from typing import TYPE_CHECKING, Any, Literal import torch from torch import Tensor, nn @@ -119,7 +119,7 @@ def __init__( metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, freeze_backbone: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index d818551c5d2..6ea3aa9a257 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -7,7 +7,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import Tensor, nn @@ -60,7 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.version = version @@ -278,7 +278,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.version = version @@ -408,7 +408,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] 
= (1, 3, 224, 224), ) -> None: self.version = version diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py index 4d6aa086748..ea4617e40a2 100644 --- a/src/otx/algo/classification/efficientnet_v2.py +++ b/src/otx/algo/classification/efficientnet_v2.py @@ -5,7 +5,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import Tensor, nn @@ -60,7 +60,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -269,7 +269,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, @@ -396,7 +396,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: super().__init__( label_info=label_info, diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index d47bceffea9..6de912cdf5d 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import Tensor, nn @@ -36,6 +36,7 @@ DEFAULT_INPUT_SIZE = (1, 2, 224, 224) logger = logging.getLogger(__name__) + class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel): """HuggingFaceModelForMulticlassCls is a class that represents a Hugging Face model for multiclass classification. @@ -66,7 +67,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = DEFAULT_INPUT_SIZE, + input_size: tuple[int, ...] = DEFAULT_INPUT_SIZE, ) -> None: self.model_name = model_name_or_path diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index 6a114aa62bb..b14017de3e8 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -7,7 +7,7 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any, Literal, Sequence +from typing import TYPE_CHECKING, Any, Literal import torch from torch import Tensor, nn @@ -71,7 +71,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] 
= (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -285,7 +285,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( @@ -416,7 +416,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.mode = mode super().__init__( diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 10e726aacc6..77643513b61 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Literal, Sequence +from typing import TYPE_CHECKING, Any, Literal import torch from torch import Tensor, nn @@ -422,7 +422,7 @@ def __init__( OTXTaskType.H_LABEL_CLS, ] = OTXTaskType.MULTI_CLASS_CLS, train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 446a18c0645..0797b5bd240 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -7,7 +7,7 @@ import types from copy import deepcopy from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Generic, Sequence +from typing import TYPE_CHECKING, Any, Callable, Generic from urllib.parse import urlparse import numpy as np @@ -229,7 +229,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -468,7 +468,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] = (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora @@ -617,7 +617,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: Sequence[int] = (1, 3, 224, 224), + input_size: tuple[int, ...] 
= (1, 3, 224, 224), ) -> None: self.arch = arch self.lora = lora diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index dd5412ac072..a873055354b 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from otx.algo.common.backbones import ResNeXt, build_model_including_pytorchcv from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss @@ -17,20 +17,20 @@ from otx.algo.detection.necks import FPN from otx.algo.detection.utils.assigners import ATSSAssigner from otx.algo.utils.support_otx_v1 import OTXv1Helper +from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.detection import ExplainableOTXDetModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable -from otx.core.config.data import TileConfig +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.detection import ExplainableOTXDetModel if TYPE_CHECKING: - from typing_extensions import Self from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from typing_extensions import Self - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class ATSS(ExplainableOTXDetModel): @@ -39,7 +39,7 @@ class ATSS(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 800, 992), + input_size: tuple[int, ...] 
= (1, 3, 800, 992), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py index 18ca34c33de..a3158e19845 100644 --- a/src/otx/algo/detection/base_models/detection_transformer.py +++ b/src/otx/algo/detection/base_models/detection_transformer.py @@ -55,7 +55,7 @@ def __init__( if multi_scale is not None: self.multi_scale = multi_scale else: - self.multi_scale = [input_size -i * 32 for i in range(-5, 6)] + [input_size] * 2 + self.multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2 self.num_classes = num_classes self.num_top_queries = num_top_queries diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index db4bfda1980..140c644ad7f 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -5,15 +5,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import nn from torchvision import tv_tensors from transformers import AutoImageProcessor, AutoModelForObjectDetection -from transformers.configuration_utils import PretrainedConfig -# from transformers.image_processing_base import ImageProcessingMixin +# from transformers.image_processing_base import ImageProcessingMixin from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity from otx.core.data.entity.utils import stack_batch @@ -62,7 +61,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=object-detection label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 800, 992), # detection default input size + input_size: tuple[int, ...] 
= (1, 3, 800, 992), # detection default input size optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 13e5fdcae4e..43a6267a769 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -7,7 +7,7 @@ import copy import re -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import Tensor, nn @@ -18,21 +18,21 @@ from otx.algo.detection.base_models.detection_transformer import DETR from otx.algo.detection.heads import RTDETRTransformer from otx.algo.detection.necks import HybridEncoder +from otx.core.config.data import TileConfig from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.detection import ExplainableOTXDetModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable -from otx.core.config.data import TileConfig +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.detection import ExplainableOTXDetModel if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class RTDETR(ExplainableOTXDetModel): @@ -46,7 +46,7 @@ class RTDETR(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 640, 640), + input_size: tuple[int, ...] 
= (1, 3, 640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index e5757944759..b382ff65225 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss @@ -17,20 +17,20 @@ from otx.algo.detection.base_models import SingleStageDetector from otx.algo.detection.heads import RTMDetSepBNHead from otx.algo.detection.necks import CSPNeXtPAFPN +from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.model.detection import ExplainableOTXDetModel from otx.core.types.export import TaskLevelExportParameters -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable -from otx.core.config.data import TileConfig if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class RTMDet(ExplainableOTXDetModel): @@ -41,7 +41,7 @@ class RTMDet(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 640, 640), + input_size: tuple[int, ...] 
= (1, 3, 640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index 910baae419d..e4c095dffa2 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -10,7 +10,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import numpy as np from datumaro.components.annotation import Bbox @@ -22,21 +22,21 @@ from otx.algo.detection.heads import SSDHead from otx.algo.detection.utils.prior_generators import SSDAnchorGeneratorClustered from otx.algo.utils.support_otx_v1 import OTXv1Helper +from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.detection import ExplainableOTXDetModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable -from otx.core.config.data import TileConfig +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.detection import ExplainableOTXDetModel if TYPE_CHECKING: import torch from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from otx.core.data.dataset.base import OTXDataset - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes logger = logging.getLogger() @@ -55,7 +55,7 @@ class SSD(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 864, 864), + input_size: tuple[int, ...] 
= (1, 3, 864, 864), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index 800c0e17f5e..ac319e26532 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any from otx.algo.common.losses import CrossEntropyLoss, L1Loss from otx.algo.detection.backbones import CSPDarknet @@ -15,23 +15,24 @@ from otx.algo.detection.necks import YOLOXPAFPN from otx.algo.detection.utils.assigners import SimOTAAssigner from otx.algo.utils.support_otx_v1 import OTXv1Helper +from otx.core.config.data import TileConfig from otx.core.data.entity.detection import DetBatchDataEntity from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.model.detection import ExplainableOTXDetModel from otx.core.types.export import OTXExportFormatType from otx.core.types.precision import OTXPrecisionType -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable -from otx.core.config.data import TileConfig if TYPE_CHECKING: from pathlib import Path + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class YOLOX(ExplainableOTXDetModel): @@ -42,7 +43,7 @@ class YOLOX(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 640, 640), + input_size: tuple[int, ...] = (1, 3, 640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -149,7 +150,7 @@ class YOLOXTINY(YOLOX): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 416, 416), + input_size: tuple[int, ...] 
= (1, 3, 416, 416), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index b1af171124a..1ab96d01bb1 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any from torchvision.ops import RoIAlign @@ -21,19 +21,19 @@ from otx.algo.instance_segmentation.two_stage import TwoStageDetector from otx.algo.instance_segmentation.utils.roi_extractors import SingleRoIExtractor from otx.algo.utils.support_otx_v1 import OTXv1Helper +from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.config.data import TileConfig from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class MaskRCNN(ExplainableOTXInstanceSegModel): @@ -88,7 +88,7 @@ class MaskRCNNResNet50(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 1024, 1024), + input_size: tuple[int, ...] = (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -278,7 +278,7 @@ class MaskRCNNEfficientNet(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 1024, 1024), + input_size: tuple[int, ...] = (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -485,7 +485,7 @@ class MaskRCNNSwinT(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 1344, 1344), + input_size: tuple[int, ...] 
= (1, 3, 1344, 1344), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py index dcfe9c3c3c8..075e4bcf811 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py +++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections import OrderedDict -from typing import Any, Sequence, TYPE_CHECKING +from typing import TYPE_CHECKING, Any import torch from torch import Tensor, nn @@ -24,22 +24,22 @@ from torchvision.models.resnet import resnet50 from otx.algo.instance_segmentation.heads import TVRoIHeads +from otx.core.config.data import TileConfig from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.instance_segmentation import InstanceSegBatchDataEntity, InstanceSegBatchPredEntity from otx.core.data.entity.utils import stack_batch from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.config.data import TileConfig from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class _TVMaskRCNN(MaskRCNN): @@ -275,7 +275,7 @@ class TVMaskRCNNR50(TVMaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 1024, 1024), + input_size: tuple[int, ...] 
= (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 96801665710..92a5627e5cb 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss @@ -17,20 +17,20 @@ from otx.algo.detection.necks import CSPNeXtPAFPN from otx.algo.instance_segmentation.heads import RTMDetInsSepBNHead from otx.algo.instance_segmentation.losses import DiceLoss +from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.config.data import TileConfig from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel if TYPE_CHECKING: - from torch import Tensor from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import Tensor - from otx.core.types.label import LabelInfoTypes - from otx.core.schedulers import LRSchedulerListCallable from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes class RTMDetInst(ExplainableOTXInstanceSegModel): @@ -96,7 +96,7 @@ class RTMDetInstTiny(RTMDetInst): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 640, 640), + input_size: tuple[int, ...] 
= (1, 3, 640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py index c3e455a2910..16101a6fcad 100644 --- a/src/otx/algo/segmentation/dino_v2_seg.py +++ b/src/otx/algo/segmentation/dino_v2_seg.py @@ -5,24 +5,24 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar, Sequence +from typing import TYPE_CHECKING, Any, ClassVar from otx.algo.segmentation.backbones import DinoVisionTransformer from otx.algo.segmentation.heads import FCNHead -from otx.core.model.segmentation import TorchVisionCompatibleModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.dice import SegmCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.segmentation import TorchVisionCompatibleModel from .base_model import BaseSegmModel if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from torch import nn from typing_extensions import Self - from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from otx.core.metrics import MetricCallable from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.label import LabelInfoTypes - from otx.core.metrics import MetricCallable class DinoV2Seg(BaseSegmModel): @@ -56,7 +56,7 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 560, 560), + input_size: tuple[int, ...] = (1, 3, 560, 560), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index feee127b0f8..10cad420b70 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import torch from torch import nn @@ -65,7 +65,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=image-segmentation label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 512, 512), # sementic segmentation default input size + input_size: tuple[int, ...] 
= (1, 3, 512, 512), # semantic segmentation default input size optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module: kwargs = {} if "image_size" in model_config: kwargs["image_size"] = self.input_size[-1] - + if (patch_size := model_config.get("patch_sizes")) != None: if isinstance(patch_size, (list, tuple)): patch_size = patch_size diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 42ffc26e613..81d9e99f57c 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar, Sequence +from typing import TYPE_CHECKING, Any, ClassVar from torch.onnx import OperatorExportTypes @@ -14,19 +14,19 @@ from otx.algo.utils.support_otx_v1 import OTXv1Helper from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.segmentation import TorchVisionCompatibleModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.dice import SegmCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.segmentation import TorchVisionCompatibleModel from .base_model import BaseSegmModel if TYPE_CHECKING: - from torch import nn from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import nn + from otx.core.metrics import MetricCallable from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.label import LabelInfoTypes - from otx.core.metrics import MetricCallable class LiteHRNetS(BaseSegmModel): @@ -524,10 +524,11 @@ def ignore_scope(self) -> dict[str, str | dict[str, list[str]]]: class OTXLiteHRNet(TorchVisionCompatibleModel): """LiteHRNet Model.""" + def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 512, 512), + input_size: tuple[int, ...] 
= (1, 3, 512, 512), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index c18b1cc10c1..703f5b1dfbe 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -4,24 +4,24 @@ """SegNext model implementations.""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar, Sequence +from typing import TYPE_CHECKING, Any, ClassVar from otx.algo.segmentation.backbones import MSCAN from otx.algo.segmentation.heads import LightHamHead from otx.algo.utils.support_otx_v1 import OTXv1Helper -from otx.core.model.segmentation import TorchVisionCompatibleModel -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.metrics.dice import SegmCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.segmentation import TorchVisionCompatibleModel from .base_model import BaseSegmModel if TYPE_CHECKING: - from torch import nn from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import nn + from otx.core.metrics import MetricCallable from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.label import LabelInfoTypes - from otx.core.metrics import MetricCallable class SegNextB(BaseSegmModel): @@ -114,10 +114,11 @@ class SegNextT(BaseSegmModel): class OTXSegNext(TorchVisionCompatibleModel): """SegNext Model.""" + def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] = (1, 3, 512, 512), + input_size: tuple[int, ...] = (1, 3, 512, 512), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index a948f9ae648..c1eef3d18d4 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -6,7 +6,7 @@ from __future__ import annotations import logging as log -from typing import TYPE_CHECKING, Any, Literal, Sequence +from typing import TYPE_CHECKING, Any, Literal import torch from torch import Tensor, nn @@ -496,7 +496,7 @@ def __init__( self, backbone: Literal["tiny_vit", "vit_b"], label_info: LabelInfoTypes = NullLabelInfo(), - input_size: Sequence[int] = (1, 3, 1024, 1024), + input_size: tuple[int, ...] 
= (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -516,7 +516,7 @@ def __init__( self.config = { "backbone": backbone, "image_size": input_size[-1], - "image_embedding_size" : input_size[-1] // 16, + "image_embedding_size": input_size[-1] // 16, "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 178673c5ee2..8c9a1787828 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -346,8 +346,10 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: if isinstance(input_size, int): input_size = (input_size, input_size) else: - input_size = tuple(input_size) - model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size + input_size = tuple(input_size) # type: ignore[assignment] + model_config["init_args"]["input_size"] = ( + tuple(model_config["init_args"]["input_size"][:-2]) + input_size + ) # Instantiate the model and needed components self.model = self.instantiate_model(model_config=model_config) diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index 3e1b6f5f133..cb6f8760940 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -24,13 +24,13 @@ parse_mem_cache_size_to_int, ) from otx.core.data.pre_filtering import pre_filtering +from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config from otx.core.types.device import DeviceType from otx.core.types.image import ImageColorChannel from otx.core.types.label import LabelInfo from otx.core.types.task import OTXTaskType from otx.core.utils.instantiators import instantiate_sampler from otx.core.utils.utils import get_adaptive_num_workers -from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config if TYPE_CHECKING: from lightning.pytorch.utilities.parsing import AttributeDict @@ -139,7 +139,7 @@ def __init__( input_size = adapt_input_size_to_dataset( dataset, input_size, - adaptive_input_size=="downscale", + adaptive_input_size == "downscale", input_size_multiplier, ) if input_size is not None: diff --git a/src/otx/core/data/utils/__init__.py b/src/otx/core/data/utils/__init__.py index adc0400284e..302e6cb75dd 100644 --- a/src/otx/core/data/utils/__init__.py +++ b/src/otx/core/data/utils/__init__.py @@ -3,6 +3,6 @@ # """Utility modules for core data modules.""" -from .utils import adapt_tile_config, adapt_input_size_to_dataset +from .utils import adapt_input_size_to_dataset, adapt_tile_config __all__ = ["adapt_tile_config", "adapt_input_size_to_dataset"] diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 665994fe2ef..040258c031a 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -173,7 +172,7 @@ def adapt_input_size_to_dataset( train_dataset = dataset.subsets().get("train") if train_dataset is None: - return + return None logger.info("Adapting model input size based on dataset stat") stat = compute_robust_dataset_statistics(train_dataset) diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py index 6829dca5f38..52e1a55cf6a 100644 --- 
a/src/otx/core/model/action_classification.py +++ b/src/otx/core/model/action_classification.py @@ -6,7 +6,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import numpy as np import torch @@ -37,7 +37,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 405116b762e..fc562f7b087 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -104,7 +104,7 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int] | None = None, + input_size: tuple[int, ...] | None = None, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = NullMetricCallable, @@ -810,7 +810,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo: raise TypeError(label_info) - def _check_input_size(self, input_size: Sequence[int] | None = None) -> None: + def _check_input_size(self, input_size: tuple[int, ...] | None = None) -> None: if ( input_size is not None and hasattr(self, "input_size_multiplier") @@ -819,6 +819,7 @@ def _check_input_size(self, input_size: Sequence[int] | None = None) -> None: msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead." raise ValueError(msg) + class OVModel(OTXModel, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEntity]): """Base class for the OpenVINO model. 
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 74963720b9e..52d6aea8215 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any import numpy as np import torch @@ -42,7 +42,7 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -96,7 +96,7 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, @@ -149,7 +149,7 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt def __init__( self, label_info: HLabelInfo, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 60ce1d164b7..d478ab91803 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -8,7 +8,7 @@ import types from abc import abstractmethod from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal import torch from model_api.tilers import DetectionTiler @@ -385,7 +385,7 @@ class ExplainableOTXDetModel(OTXDetectionModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py index cfcee76e48f..166e14ca656 100644 --- a/src/otx/core/model/instance_segmentation.py +++ b/src/otx/core/model/instance_segmentation.py @@ -7,7 +7,7 @@ import logging as log import types from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal import numpy as np import torch @@ -49,7 +49,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -384,7 +384,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: 
tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 15976e25158..3d2a672e72a 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -37,7 +37,7 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -47,7 +47,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (Sequence[int]): The input shape of the model. + input_size (tuple[int, ...]): The input shape of the model. optimizer (OptimizerCallable, optional): The optimizer to use for training. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -130,7 +130,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel): def __init__( self, label_info: LabelInfoTypes, - input_size: Sequence[int], + input_size: tuple[int, ...], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -145,7 +145,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (Sequence[int]): The input shape of the model. + input_size (tuple[int, ...]): The input shape of the model. optimizer (OptimizerCallable, optional): The optimizer callable for the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index c8f3de1cfa9..fab3ffb52dd 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -162,7 +162,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro def __init__( self, label_info: LabelInfoTypes = NullLabelInfo(), - input_size: Sequence[int] = (1, 3, 1024, 1024), + input_size: tuple[int, ...] 
= (1, 3, 1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -286,7 +286,7 @@ class OTXZeroShotVisualPromptingModel( def __init__( self, - input_size: Sequence[int], + input_size: tuple[int, ...], label_info: LabelInfoTypes = NullLabelInfo(), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index c6a66d455b1..0d9f3750889 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -154,13 +154,12 @@ def __init__( self.task = task if task is not None else self._auto_configurator.task self._trainer: Trainer | None = None + get_model_args: dict[str, Any] = {} + if self._datamodule is not None: + get_model_args["label_info"] = self._datamodule.label_info + get_model_args["input_size"] = self._datamodule.input_size self._model: OTXModel = ( - model - if isinstance(model, OTXModel) - else self._auto_configurator.get_model( - label_info=self._datamodule.label_info if self._datamodule is not None else None, - input_size=self._datamodule.input_size, - ) + model if isinstance(model, OTXModel) else self._auto_configurator.get_model(**get_model_args) ) # ------------------------------------------------------------------------ # diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 90790d59521..2483adb9990 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -8,7 +8,7 @@ import logging from copy import deepcopy from pathlib import Path -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from warnings import warn import datumaro @@ -22,7 +22,6 @@ from otx.core.types.task import OTXTaskType from otx.core.utils.imports import get_otx_root_path from otx.core.utils.instantiators import partial_instantiate_class -from otx.core.utils.utils import import_object_from_module from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info if TYPE_CHECKING: @@ -241,13 +240,19 @@ def get_datamodule(self) -> OTXDataModule | None: **data_config, ) - def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | None = None, input_size: Sequence[int] | None = None) -> OTXModel: + def get_model( + self, + model_name: str | None = None, + label_info: LabelInfoTypes | None = None, + input_size: tuple[int, ...] | int | None = None, + ) -> OTXModel: """Retrieves the OTXModel instance based on the provided model name and meta information. Args: model_name (str | None): The name of the model to retrieve. If None, the default model will be used. label_info (LabelInfoTypes | None): The meta information about the labels. If provided, the number of classes will be updated in the model's configuration. + input_size (tuple[int, ...] | int | None): Input size the model will use. Returns: OTXModel: The instantiated OTXModel instance. 
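Patch 09 above settles on `tuple[int, ...]` as the type of every model's `input_size`, while the CLI (`cli.py`) and `AutoConfigurator.get_model()` keep accepting a bare `int` or a sequence and fold it into the model's default shape. The snippet below is a minimal, self-contained sketch of that normalization under the conventions visible in the patch; `normalize_input_size` is a hypothetical helper written only for illustration, not a function introduced by this series.

from __future__ import annotations


def normalize_input_size(
    default: tuple[int, ...],
    input_size: int | tuple[int, int] | list[int] | None,
) -> tuple[int, ...]:
    """Fold a user-provided spatial size into a default (N, C, H, W) shape."""
    if input_size is None:
        # No override: keep the model's default shape untouched.
        return default
    if isinstance(input_size, int):
        # A bare int means a square input, as in OTXCLI.instantiate_classes().
        input_size = (input_size, input_size)
    # Keep the leading batch/channel dims and replace only the spatial dims,
    # mirroring tuple(init_args["input_size"][:-2]) + tuple(input_size).
    return tuple(default[:-2]) + tuple(input_size)


# Hypothetical usage with default shapes taken from the patch:
assert normalize_input_size((1, 3, 224, 224), 512) == (1, 3, 512, 512)
assert normalize_input_size((1, 3, 800, 992), [640, 640]) == (1, 3, 640, 640)
assert normalize_input_size((1, 3, 640, 640), None) == (1, 3, 640, 640)

Keeping the normalization in the callers rather than in each model is what lets patch 09 leave every model constructor with a plain, immutable `tuple[int, ...]` default.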
From 8474b072d635614b56869947ef00e0e1e6ab7008 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Thu, 8 Aug 2024 16:07:49 +0900 Subject: [PATCH 10/42] align with pre-commit --- src/otx/algo/anomaly/stfpm.py | 2 +- .../algo/classification/torchvision_model.py | 1 + .../algo/segmentation/huggingface_model.py | 4 ++-- .../encoders/sam_image_encoder.py | 4 ++-- src/otx/cli/cli.py | 11 ++++++----- src/otx/core/data/module.py | 2 +- src/otx/core/data/utils/utils.py | 19 +++++++++++-------- src/otx/core/model/action_classification.py | 1 + src/otx/core/model/base.py | 1 - src/otx/core/model/classification.py | 3 +++ src/otx/core/model/detection.py | 2 ++ src/otx/core/model/instance_segmentation.py | 1 + src/otx/core/model/segmentation.py | 1 + src/otx/core/model/visual_prompting.py | 2 ++ src/otx/engine/utils/auto_configurator.py | 5 +---- 15 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index 72dd30e8aa3..112ab465590 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ b/src/otx/algo/anomaly/stfpm.py @@ -46,7 +46,7 @@ def __init__( **kwargs, ) -> None: OTXAnomaly.__init__(self) - OTXModel.__init__(self, label_info=AnomalyLabelInfo()) + OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=(224, 224)) AnomalibStfpm.__init__( self, backbone=backbone, diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 77643513b61..9caa54d6bec 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -445,6 +445,7 @@ def __init__( torch_compile=torch_compile, input_size=input_size, ) + self.input_size: tuple[int, int, int, int] def _create_model(self) -> nn.Module: if self.task == OTXTaskType.MULTI_CLASS_CLS: diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index 10cad420b70..ed53ba8fa20 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -90,9 +90,9 @@ def _create_model(self) -> nn.Module: if "image_size" in model_config: kwargs["image_size"] = self.input_size[-1] - if (patch_size := model_config.get("patch_sizes")) != None: + if (patch_size := model_config.get("patch_sizes")) is not None: if isinstance(patch_size, (list, tuple)): - patch_size = patch_size + patch_size = patch_size[0] if self.input_size[0] % patch_size != 0 or self.input_size[1] % patch_size != 0: msg = ( f"It's recommended to set the input size to multiple of patch size({patch_size}). 
" diff --git a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py index 3feef21aba5..a1f792e8c92 100644 --- a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py +++ b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py @@ -70,11 +70,11 @@ def __new__(cls, backbone: str, *args, **kwargs): # noqa: ARG003 if backbone.lower() == "tiny_vit": from otx.algo.visual_prompting.backbones.tiny_vit import TinyViT - return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[arg-type] + return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[dict-item] elif backbone.lower() in ["vit_b", "vit_l", "vit_h"]: # noqa: RET505 from otx.algo.visual_prompting.backbones.vit import ViT - return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[arg-type] + return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs}) # type: ignore[dict-item] else: error_log = f"{backbone} is not supported for SAMImageEncoder. Set among tiny_vit and vit_b." diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 8c9a1787828..d70adc56535 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -343,12 +343,13 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: self.datamodule = self.get_config_value(self.config_init, "data") if (input_size := self.datamodule.input_size) is not None: - if isinstance(input_size, int): - input_size = (input_size, input_size) - else: - input_size = tuple(input_size) # type: ignore[assignment] + input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) # type: ignore[assignment] + # if isinstance(input_size, int): + # input_size = (input_size, input_size) + # else: + # input_size = tuple(input_size) # type: ignore[assignment] model_config["init_args"]["input_size"] = ( - tuple(model_config["init_args"]["input_size"][:-2]) + input_size + tuple(model_config["init_args"]["input_size"][:-2]) + input_size # type: ignore[operator] ) # Instantiate the model and needed components diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index cb6f8760940..cdc04c18156 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -42,7 +42,7 @@ class OTXDataModule(LightningDataModule): """LightningDataModule extension for OTX pipeline.""" - def __init__( + def __init__( # noqa: PLR0913 self, task: OTXTaskType, data_format: str, diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 040258c031a..cf0aaa8e1af 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -137,7 +137,10 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = if max_num_ann < len(anns): max_ann_type = ann_type max_num_ann = len(anns) - stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes[max_ann_type])) + if max_ann_type is not None: + stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics( + np.array(size_of_shapes[max_ann_type]), + ) return stat @@ -147,7 +150,7 @@ def adapt_input_size_to_dataset( base_input_size: int | tuple[int, int] | None = None, downscale_only: bool = True, input_size_multiplier: int | None = None, -) -> tuple[int, int]: +) -> tuple[int, int] | None: """Compute appropriate model input size w.r.t. dataset statistics. 
Args: @@ -159,9 +162,9 @@ def adapt_input_size_to_dataset( Returns: Tuple[int, int]: (width, height) """ - MIN_RECOGNIZABLE_OBJECT_SIZE = 32 # Minimum object size recognizable by NNs: typically 16 ~ 32 + min_recognizable_object_size = 32 # Minimum object size recognizable by NNs: typically 16 ~ 32 # meaning NxN input pixels being downscaled to 1x1 on feature map - MIN_DETECTION_INPUT_SIZE = 256 # Minimum input size for object detection + min_detection_input_size = 256 # Minimum input size for object detection if downscale_only and base_input_size is None: msg = "If downscale_only is set to True, base_input_size should be set but got None." @@ -194,13 +197,13 @@ def adapt_input_size_to_dataset( # Refine using annotation shape size stat if min_object_size is not None and min_object_size > 0: - image_size = round(image_size * MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size) + image_size = round(image_size * min_recognizable_object_size / min_object_size) logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}") if image_size > max_image_size: image_size = max_image_size logger.info(f"-> Restrict to max image size: {image_size}") - if image_size < MIN_DETECTION_INPUT_SIZE: - image_size = MIN_DETECTION_INPUT_SIZE + if image_size < min_detection_input_size: + image_size = min_detection_input_size logger.info(f"-> Based on minimum object detection input size: {image_size}") if input_size_multiplier is not None and image_size % input_size_multiplier != 0: @@ -210,7 +213,7 @@ def adapt_input_size_to_dataset( if downscale_only: - def area(x): + def area(x: tuple[int, int]) -> int: return x[0] * x[1] if base_input_size and area(input_size) >= area(base_input_size): diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py index 52e1a55cf6a..b31c4e1f63e 100644 --- a/src/otx/core/model/action_classification.py +++ b/src/otx/core/model/action_classification.py @@ -53,6 +53,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int, int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index fc562f7b087..1b4dc9b6acc 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -12,7 +12,6 @@ import logging import warnings from abc import abstractmethod -from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, Sequence import numpy as np diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 52d6aea8215..dad775d2bd7 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -56,6 +56,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: @@ -110,6 +111,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: @@ -163,6 +165,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index d478ab91803..938715396b6 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -41,6 +41,8 @@ 
class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]): """Base class for the detection models used in OTX.""" + input_size: tuple[int, int, int, int] + def test_step(self, batch: DetBatchDataEntity, batch_idx: int) -> None: """Perform a single test step on a batch of data from the test set. diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py index 166e14ca656..756a5fd7641 100644 --- a/src/otx/core/model/instance_segmentation.py +++ b/src/otx/core/model/instance_segmentation.py @@ -65,6 +65,7 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) + self.input_size: tuple[int, int, int, int] def _build_model(self, num_classes: int) -> nn.Module: raise NotImplementedError diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 3d2a672e72a..617d0041067 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -65,6 +65,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index fab3ffb52dd..89c668aa7ca 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -178,6 +178,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _exporter(self) -> OTXModelExporter: @@ -303,6 +304,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int, int, int] @property def _exporter(self) -> OTXModelExporter: diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 2483adb9990..c1269fc4031 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -280,10 +280,7 @@ def get_model( model_config = deepcopy(self.config["model"]) if input_size is not None: - if isinstance(input_size, int): - input_size = (input_size, input_size) - else: - input_size = tuple(input_size) + input_size = (input_size, input_size) if isinstance(input_size, int) else input_size model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size model_cls = get_model_cls_from_config(Namespace(model_config)) From 28dcd6363e876414c5aafbcfdd6030c33a4820f8 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Thu, 8 Aug 2024 16:45:25 +0900 Subject: [PATCH 11/42] write doc string --- src/otx/algo/anomaly/stfpm.py | 2 +- src/otx/algo/classification/huggingface_model.py | 3 ++- src/otx/algo/classification/mobilenet_v3.py | 1 + src/otx/algo/classification/torchvision_model.py | 1 + .../detection/base_models/detection_transformer.py | 1 + src/otx/algo/detection/huggingface_model.py | 4 ++-- src/otx/algo/detection/rtdetr.py | 8 ++++---- src/otx/algo/segmentation/huggingface_model.py | 2 +- src/otx/algo/visual_prompting/segment_anything.py | 2 +- src/otx/cli/cli.py | 6 ++---- src/otx/core/data/utils/utils.py | 12 +++++++----- src/otx/core/model/segmentation.py | 4 ++-- src/otx/core/model/visual_prompting.py | 2 +- src/otx/engine/utils/auto_configurator.py | 2 +- 14 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index 112ab465590..72dd30e8aa3 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ 
b/src/otx/algo/anomaly/stfpm.py @@ -46,7 +46,7 @@ def __init__( **kwargs, ) -> None: OTXAnomaly.__init__(self) - OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=(224, 224)) + OTXModel.__init__(self, label_info=AnomalyLabelInfo()) AnomalibStfpm.__init__( self, backbone=backbone, diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index 6de912cdf5d..60bfe51225a 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -33,7 +33,7 @@ from otx.core.metrics import MetricCallable -DEFAULT_INPUT_SIZE = (1, 2, 224, 224) +DEFAULT_INPUT_SIZE = (1, 3, 224, 224) logger = logging.getLogger(__name__) @@ -46,6 +46,7 @@ class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel): optimizer (OptimizerCallable, optional): The optimizer callable for training the model. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): The learning rate scheduler callable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. + input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) Example: 1. API diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index b14017de3e8..5c681470f9a 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -61,6 +61,7 @@ class MobileNetV3ForMulticlassCls(OTXMulticlassClsModel): metric (MetricCallable, optional): The metric callable. Defaults to MultiClassClsMetricCallable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. freeze_backbone (bool, optional): Whether to freeze the backbone layers during training. Defaults to False. + input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) """ def __init__( diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 9caa54d6bec..002855f82ae 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -404,6 +404,7 @@ class OTXTVModel(OTXModel): task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional): The type of classification task. train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training. + input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) """ model: TVClassificationModel diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py index a3158e19845..ed19a38dbd0 100644 --- a/src/otx/algo/detection/base_models/detection_transformer.py +++ b/src/otx/algo/detection/base_models/detection_transformer.py @@ -33,6 +33,7 @@ class DETR(BaseModule): Defaults to None. num_top_queries (int, optional): Number of top queries to return. Defaults to 300. + input_size (int, optional): The input size of the model. Default to 640. 
""" def __init__( diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index 140c644ad7f..859825a143f 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -12,7 +12,6 @@ from torchvision import tv_tensors from transformers import AutoImageProcessor, AutoModelForObjectDetection -# from transformers.image_processing_base import ImageProcessingMixin from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity from otx.core.data.entity.utils import stack_batch @@ -37,6 +36,7 @@ class HuggingFaceModelForDetection(OTXDetectionModel): Args: model_name_or_path (str): The name or path of the pre-trained model. label_info (LabelInfoTypes): The label information for the model. + input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 800, 992). optimizer (OptimizerCallable, optional): The optimizer for training the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -61,7 +61,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=object-detection label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 800, 992), # detection default input size + input_size: tuple[int, ...] = (1, 3, 800, 992), # input size of default detection data recipe optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 43a6267a769..cf8c3d820a8 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -242,13 +242,13 @@ def _build_model(self, num_classes: int) -> nn.Module: encoder = HybridEncoder( in_channels=[128, 256, 512], expansion=0.5, - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size[-2:], ) decoder = RTDETRTransformer( num_classes=num_classes, num_decoder_layers=3, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size[-2:], ) optimizer_configuration = [ @@ -286,12 +286,12 @@ def _build_model(self, num_classes: int) -> nn.Module: norm_cfg={"type": "FBN", "name": "norm"}, ) encoder = HybridEncoder( - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size[-2:], ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size[-2:], num_decoder_layers=6, ) diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index ed53ba8fa20..a64798b22f4 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -65,7 +65,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=image-segmentation label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 512, 512), # sementic segmentation default input size + input_size: tuple[int, ...] 
= (1, 3, 512, 512), # input size of default semantic segmentation data recipe optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index c1eef3d18d4..feb02857375 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -510,7 +510,7 @@ def __init__( stability_score_offset: float = 1.0, ) -> None: if input_size[-1] != input_size[-2]: - msg = f"SAM should use square image, but got {input_size}" + msg = f"SAM should use square image size, but got {input_size}" raise ValueError(msg) self.config = { diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index d70adc56535..8ccf07bb02a 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -332,6 +332,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # For num_classes update, Model and Metric are instantiated separately. model_config = self.config[self.subcommand].pop("model") + # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to data module if self.config[self.subcommand].data.adaptive_input_size != "none": model_cls = get_model_cls_from_config(model_config) if hasattr(model_cls, "input_size_multiplier"): @@ -342,12 +343,9 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: self.workspace = self.get_config_value(self.config_init, "workspace") self.datamodule = self.get_config_value(self.config_init, "data") + # pass the data module input size to the model if (input_size := self.datamodule.input_size) is not None: input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) # type: ignore[assignment] - # if isinstance(input_size, int): - # input_size = (input_size, input_size) - # else: - # input_size = tuple(input_size) # type: ignore[assignment] model_config["init_args"]["input_size"] = ( tuple(model_config["init_args"]["input_size"][:-2]) + input_size # type: ignore[operator] ) diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index cf0aaa8e1af..ee1b6b44f44 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -154,13 +154,15 @@ def adapt_input_size_to_dataset( """Compute appropriate model input size w.r.t. dataset statistics. Args: - max_image_size (int): Typical large image size of dataset in pixels. - min_object_size (int, optional): Typical small object size of dataset in pixels. - None to consider only image size. Defaults to None. - downscale_only (bool) : Whether to allow only smaller size than default setting. Defaults to True. + dataset (Dataset): Datumaro dataset including all subsets. + base_input_size (int | tuple[int, int] | None, optional): Base input size of the model. Defaults to None. + downscale_only (bool, optional) : Whether to allow only smaller size than default setting. Defaults to True. + input_size_multiplier (int | None, optional): + Multiplier for input size. If it's set, return the input size which can be divisible by the value. + Defaults to None. Returns: - Tuple[int, int]: (width, height) + tuple[int, int] | None: Recommended input size based on dataset statistics. 
""" min_recognizable_object_size = 32 # Minimum object size recognizable by NNs: typically 16 ~ 32 # meaning NxN input pixels being downscaled to 1x1 on feature map diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 617d0041067..935e2a2215b 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -47,7 +47,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (tuple[int, ...]): The input shape of the model. + input_size (tuple[int, ...]): The input size of the model. optimizer (OptimizerCallable, optional): The optimizer to use for training. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -146,7 +146,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (tuple[int, ...]): The input shape of the model. + input_size (tuple[int, ...]): The input size of the model. optimizer (OptimizerCallable, optional): The optimizer callable for the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index 89c668aa7ca..c6d7d2010c2 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -9,7 +9,7 @@ from collections import defaultdict from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Sequence +from typing import TYPE_CHECKING, Any, Literal import numpy as np import torch diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index c1269fc4031..98b433b3291 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -252,7 +252,7 @@ def get_model( model_name (str | None): The name of the model to retrieve. If None, the default model will be used. label_info (LabelInfoTypes | None): The meta information about the labels. If provided, the number of classes will be updated in the model's configuration. - input_size (tuple[int, ...] | int | None): Input size the model will use. + input_size (tuple[int, ...] | int | None, optional): Input size of the model. Defaults to None. Returns: OTXModel: The instantiated OTXModel instance. 
From 01cb6291b251f890701f87a7425fe1e23cdd4321 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 13:47:55 +0900 Subject: [PATCH 12/42] implement unit test --- .../zero_shot_segment_anything.py | 19 ++++---- src/otx/cli/cli.py | 4 +- src/otx/engine/utils/auto_configurator.py | 7 ++- .../backbones/test_otx_efficientnet.py | 5 ++ .../algo/classification/test_efficientnet.py | 15 ++++++ .../classification/test_huggingface_model.py | 23 +++++++++ .../algo/classification/test_mobilenet_v3.py | 15 ++++++ .../algo/detection/base_models/test_detr.py | 16 +++++++ .../segmentation/test_huggingface_model.py | 23 +++++++++ .../visual_prompting/test_segment_anything.py | 6 +++ tests/unit/cli/test_cli.py | 48 +++++++++++++++++-- tests/unit/core/data/test_module.py | 38 ++++++++++++++- tests/unit/core/model/test_base.py | 5 ++ tests/unit/engine/test_engine.py | 21 ++++++++ .../engine/utils/test_auto_configurator.py | 24 ++++++++++ 15 files changed, 250 insertions(+), 19 deletions(-) diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py index 2938730cdb6..5edfa3aabd1 100644 --- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py +++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py @@ -645,18 +645,9 @@ def __init__( # noqa: PLR0913 return_extra_metrics: bool = False, stability_score_offset: float = 1.0, ) -> None: - super().__init__( - label_info=label_info, - input_size=(1, 3, 1024, 1024), # zero-shot visual prompting model uses fixed 1024x1024 input size - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - ) - self.config = { "backbone": backbone, - "image_size": self.input_size[-1], + "image_size": 1024, "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, @@ -668,6 +659,14 @@ def __init__( # noqa: PLR0913 "stability_score_offset": stability_score_offset, **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone], } + super().__init__( + label_info=label_info, + input_size=(1, 3, 1024, 1024), # zero-shot visual prompting model uses fixed 1024x1024 input size + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) self.save_outputs = save_outputs self.reference_info_dir: Path = Path(reference_info_dir) diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 8ccf07bb02a..31717fb487d 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -332,7 +332,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # For num_classes update, Model and Metric are instantiated separately. 
model_config = self.config[self.subcommand].pop("model") - # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to data module + # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to OTXDataModule if self.config[self.subcommand].data.adaptive_input_size != "none": model_cls = get_model_cls_from_config(model_config) if hasattr(model_cls, "input_size_multiplier"): @@ -343,7 +343,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: self.workspace = self.get_config_value(self.config_init, "workspace") self.datamodule = self.get_config_value(self.config_init, "data") - # pass the data module input size to the model + # pass OTXDataModule input size to the model if (input_size := self.datamodule.input_size) is not None: input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) # type: ignore[assignment] model_config["init_args"]["input_size"] = ( diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 98b433b3291..170d3db76a0 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -216,7 +216,7 @@ def get_datamodule(self) -> OTXDataModule | None: if self.data_root is None: return None self.config["data"]["data_root"] = self.data_root - data_config = deepcopy(self.config["data"]) + data_config: dict = deepcopy(self.config["data"]) train_config = data_config.pop("train_subset") val_config = data_config.pop("val_subset") test_config = data_config.pop("test_subset") @@ -227,6 +227,11 @@ def get_datamodule(self) -> OTXDataModule | None: _ = data_config.pop("__path__", {}) # Remove __path__ key that for CLI _ = data_config.pop("config", {}) # Remove config key that for CLI + if data_config.get("adaptive_input_size", "none") != "none": + model_cls = get_model_cls_from_config(Namespace(self.config["model"])) + if hasattr(model_cls, "input_size_multiplier"): + data_config["input_size_multiplier"] = model_cls.input_size_multiplier + return OTXDataModule( train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config), val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config), diff --git a/tests/unit/algo/classification/backbones/test_otx_efficientnet.py b/tests/unit/algo/classification/backbones/test_otx_efficientnet.py index b2a85e4088f..3d7fb9017fd 100644 --- a/tests/unit/algo/classification/backbones/test_otx_efficientnet.py +++ b/tests/unit/algo/classification/backbones/test_otx_efficientnet.py @@ -13,3 +13,8 @@ def test_forward(self, version): model = OTXEfficientNet(version, pretrained=None) assert model(torch.randn(1, 3, 244, 244))[0].shape[-1] == 8 assert model(torch.randn(1, 3, 244, 244))[0].shape[-2] == 8 + + def test_set_input_size(self): + input_size = (300, 300) + model = OTXEfficientNet("b0", input_size=input_size, pretrained=None) + assert model.in_size == input_size diff --git a/tests/unit/algo/classification/test_efficientnet.py b/tests/unit/algo/classification/test_efficientnet.py index 49d16527f7a..fd501ff48ed 100644 --- a/tests/unit/algo/classification/test_efficientnet.py +++ b/tests/unit/algo/classification/test_efficientnet.py @@ -54,6 +54,11 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_ assert isinstance(outputs, MulticlassClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + def test_set_input_size(self): + input_size = (1, 3, 
300, 300) + model = EfficientNetForMulticlassCls(version="b0", label_info=10, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] + @pytest.fixture() def fxt_multi_label_cls_model(): @@ -92,6 +97,11 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_ assert isinstance(outputs, MultilabelClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + def test_set_input_size(self): + input_size = (1, 3, 300, 300) + model = EfficientNetForMultilabelCls(version="b0", label_info=10, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] + @pytest.fixture() def fxt_h_label_cls_model(fxt_hlabel_data): @@ -129,3 +139,8 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent assert isinstance(outputs, HlabelClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + + def test_set_input_size(self, fxt_hlabel_data): + input_size = (1, 3, 300, 300) + model = EfficientNetForHLabelCls(version="b0", label_info=fxt_hlabel_data, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] diff --git a/tests/unit/algo/classification/test_huggingface_model.py b/tests/unit/algo/classification/test_huggingface_model.py index 98007cc4249..c25be896b5d 100644 --- a/tests/unit/algo/classification/test_huggingface_model.py +++ b/tests/unit/algo/classification/test_huggingface_model.py @@ -1,6 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock + import pytest import torch from otx.core.data.entity.base import OTXBatchLossEntity @@ -9,6 +11,7 @@ SKIP_TRANSFORMERS_TEST = False try: + from otx.algo.classification import huggingface_model as target_file from otx.algo.classification.huggingface_model import HuggingFaceModelForMulticlassCls from transformers.modeling_outputs import ImageClassifierOutput except ImportError: @@ -67,3 +70,23 @@ def test_forward_for_tracing(self, fxt_multi_class_cls_model, tmp_path): output = fxt_multi_class_cls_model.forward_for_tracing( image=torch.randn(1, 3, 224, 224), ) + + @pytest.fixture() + def mock_pretrainedconfig(self, mocker) -> MagicMock: + mock_obj = mocker.patch.object(target_file, "PretrainedConfig") + mock_obj.get_config_dict.return_value = ({"image_size": 224}, None) + return mock_obj + + @pytest.fixture() + def mock_automodel(self, mocker) -> MagicMock: + return mocker.patch.object(target_file, "AutoModelForImageClassification") + + def test_set_input_size(self, mock_pretrainedconfig, mock_automodel): + input_size = (1, 3, 300, 300) + HuggingFaceModelForMulticlassCls( + model_name_or_path="facebook/deit-tiny-patch16-224", + label_info=10, + input_size=input_size, + ) + + assert mock_automodel.from_pretrained.call_args.kwargs["image_size"] == input_size[-1] diff --git a/tests/unit/algo/classification/test_mobilenet_v3.py b/tests/unit/algo/classification/test_mobilenet_v3.py index 60981098e1c..cecfd1d919a 100644 --- a/tests/unit/algo/classification/test_mobilenet_v3.py +++ b/tests/unit/algo/classification/test_mobilenet_v3.py @@ -54,6 +54,11 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_ assert isinstance(outputs, MulticlassClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + def test_set_input_size(self): + input_size = (1, 3, 300, 300) + model = MobileNetV3ForMulticlassCls(mode="large", label_info=10, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] + @pytest.fixture() def 
fxt_multi_label_cls_model(): @@ -92,6 +97,11 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_ assert isinstance(outputs, MultilabelClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + def test_set_input_size(self): + input_size = (1, 3, 300, 300) + model = MobileNetV3ForMultilabelCls(mode="large", label_info=10, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] + @pytest.fixture() def fxt_h_label_cls_model(fxt_hlabel_data): @@ -129,3 +139,8 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent assert isinstance(outputs, HlabelClsBatchPredEntity) assert outputs.has_xai_outputs == explain_mode + + def test_set_input_size(self, fxt_hlabel_data): + input_size = (1, 3, 300, 300) + model = MobileNetV3ForHLabelCls(mode="large", label_info=fxt_hlabel_data, input_size=input_size) + assert model.model.backbone.in_size == input_size[-2:] diff --git a/tests/unit/algo/detection/base_models/test_detr.py b/tests/unit/algo/detection/base_models/test_detr.py index 55194874b31..71ce30cc1fb 100644 --- a/tests/unit/algo/detection/base_models/test_detr.py +++ b/tests/unit/algo/detection/base_models/test_detr.py @@ -3,6 +3,8 @@ # """Test of DETR.""" +from unittest.mock import MagicMock + import pytest import torch import torchvision @@ -103,3 +105,17 @@ def test_rt_detr_export(self, rt_detr_model, images): assert result["bboxes"].shape == (2, 10, 4) # ensure no scaling assert torch.all(result["bboxes"] < 2) + + def test_set_input_size(self): + input_size = 1280 + model = DETR( + backbone=MagicMock(), + encoder=MagicMock(), + decoder=MagicMock(), + num_classes=10, + input_size=input_size, + ) + + expected_multi_scale = sorted([input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2) + + assert sorted(model.multi_scale) == expected_multi_scale diff --git a/tests/unit/algo/segmentation/test_huggingface_model.py b/tests/unit/algo/segmentation/test_huggingface_model.py index ca3a3a823ae..36693561692 100644 --- a/tests/unit/algo/segmentation/test_huggingface_model.py +++ b/tests/unit/algo/segmentation/test_huggingface_model.py @@ -1,6 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock + import pytest import torch from otx.core.data.entity.base import ImageInfo, OTXBatchLossEntity @@ -8,6 +10,7 @@ SKIP_TRANSFORMERS_TEST = False try: + from otx.algo.segmentation import huggingface_model as target_file from otx.algo.segmentation.huggingface_model import HuggingFaceModelForSegmentation from transformers.modeling_outputs import SemanticSegmenterOutput from transformers.models.segformer.image_processing_segformer import SegformerImageProcessor @@ -69,3 +72,23 @@ def test_customize_outputs(self, fxt_seg_model, fxt_seg_batch_data_entity): fxt_seg_model.explain_mode = True with pytest.raises(NotImplementedError): fxt_seg_model._customize_outputs(outputs, fxt_seg_batch_data_entity) + + @pytest.fixture() + def mock_pretrainedconfig(self, mocker) -> MagicMock: + mock_obj = mocker.patch.object(target_file, "PretrainedConfig") + mock_obj.get_config_dict.return_value = ({"image_size": 512}, None) + return mock_obj + + @pytest.fixture() + def mock_automodel(self, mocker) -> MagicMock: + return mocker.patch.object(target_file, "AutoModelForSemanticSegmentation") + + def test_set_input_size(self, mock_pretrainedconfig, mock_automodel): + input_size = (1, 3, 1024, 1024) + HuggingFaceModelForSegmentation( + 
model_name_or_path="facebook/deit-tiny-patch16-224", + label_info=10, + input_size=input_size, + ) + + assert mock_automodel.from_pretrained.call_args.kwargs["image_size"] == input_size[-1] diff --git a/tests/unit/algo/visual_prompting/test_segment_anything.py b/tests/unit/algo/visual_prompting/test_segment_anything.py index 09447019573..9ad548bb72f 100644 --- a/tests/unit/algo/visual_prompting/test_segment_anything.py +++ b/tests/unit/algo/visual_prompting/test_segment_anything.py @@ -33,6 +33,7 @@ def test_init( ) segment_anything = SegmentAnything( backbone=backbone, + image_size=2048, freeze_image_encoder=freeze_image_encoder, freeze_prompt_encoder=freeze_prompt_encoder, freeze_mask_decoder=freeze_mask_decoder, @@ -40,6 +41,7 @@ def test_init( # check import modules assert hasattr(segment_anything, "image_encoder") + assert segment_anything.image_encoder.img_size == 2048 assert segment_anything.image_encoder.__class__.__name__ == expected_backbone assert hasattr(segment_anything, "prompt_encoder") assert hasattr(segment_anything, "mask_decoder") @@ -296,6 +298,10 @@ class TestOTXSegmentAnything: def model(self) -> OTXSegmentAnything: return OTXSegmentAnything(backbone="tiny_vit") + def test_set_input_size(self): + with pytest.raises(ValueError, match="SAM should use square image size"): + OTXSegmentAnything(backbone="tiny_vit", input_size=(1, 3, 1024, 2048)) + def test_create_model(self, model) -> None: """Test _create_model.""" segment_anything = model._create_model() diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 9211e574db5..e0b54c86210 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -3,11 +3,13 @@ from __future__ import annotations import sys +from unittest.mock import MagicMock import pytest import torch import yaml from otx.cli import OTXCLI, main +from otx.cli import cli as target_file from rich.console import Console @@ -78,8 +80,8 @@ def test_add_subcommands(self, mocker) -> None: assert cli._subcommand_method_arguments.keys() == cli.engine_subcommands().keys() @pytest.fixture() - def fxt_train_command(self, monkeypatch, tmpdir) -> list[str]: - argv = [ + def fxt_train_argv(self, tmpdir) -> list[str]: + return [ "otx", "train", "--config", @@ -91,8 +93,11 @@ def fxt_train_command(self, monkeypatch, tmpdir) -> list[str]: "--work_dir", str(tmpdir), ] - monkeypatch.setattr("sys.argv", argv) - return argv + + @pytest.fixture() + def fxt_train_command(self, monkeypatch, fxt_train_argv) -> list[str]: + monkeypatch.setattr("sys.argv", fxt_train_argv) + return fxt_train_argv def test_instantiate_classes(self, fxt_train_command, mocker) -> None: mock_run = mocker.patch("otx.cli.OTXCLI.run") @@ -115,6 +120,41 @@ def test_instantiate_classes(self, fxt_train_command, mocker) -> None: assert cli.datamodule == cli.engine.datamodule assert cli.model == cli.engine.model + @pytest.mark.parametrize("input_size", [512, 1024]) + def test_instantiate_classes_set_input_size(self, input_size, fxt_train_argv, monkeypatch, mocker) -> None: + mocker.patch("otx.cli.OTXCLI.run") + fxt_train_argv.extend(["--data.input_size", str(input_size)]) + monkeypatch.setattr("sys.argv", fxt_train_argv) + + cli = OTXCLI() + cli.instantiate_classes() + + assert cli.model.input_size == (1, 3, input_size, input_size) + + @pytest.fixture() + def mock_model_cls(self) -> MagicMock: + model_cls = MagicMock() + model_cls.input_size_multiplier = 12345 + return model_cls + + def test_instantiate_classes_set_adaptive_input_size( + self, + fxt_train_argv, + monkeypatch, + 
mocker, + mock_model_cls, + ) -> None: + mocker.patch("otx.cli.OTXCLI.run") + mocker.patch.object(target_file, "get_model_cls_from_config", return_value=mock_model_cls) + fxt_train_argv.extend(["--data.adaptive_input_size", "auto"]) + monkeypatch.setattr("sys.argv", fxt_train_argv) + mock_data_module = mocker.patch("otx.core.data.module.adapt_input_size_to_dataset", return_value=1024) + + cli = OTXCLI() + cli.instantiate_classes() + + assert mock_data_module.call_args.args[-1] == 12345 + def test_raise_error_correctly(self, fxt_train_command, mocker) -> None: mock_engine = mocker.patch("otx.cli.OTXCLI.instantiate_engine") mock_engine.return_value.train.side_effect = RuntimeError("my_error") diff --git a/tests/unit/core/data/test_module.py b/tests/unit/core/data/test_module.py index ac24753cafd..78d1145ae1d 100644 --- a/tests/unit/core/data/test_module.py +++ b/tests/unit/core/data/test_module.py @@ -16,6 +16,7 @@ TileConfig, UnlabeledDataConfig, ) +from otx.core.data import module as target_file from otx.core.data.module import ( OTXDataModule, OTXTaskType, @@ -158,9 +159,42 @@ def test_init_input_size( input_size=(1200, 1200), ) - assert fxt_config.train_subset.input_size == (1000, 1000) + assert fxt_config.train_subset.input_size == (1200, 1200) assert fxt_config.val_subset.input_size == (1200, 1200) - assert fxt_config.test_subset.input_size == (800, 800) + assert fxt_config.test_subset.input_size == (1200, 1200) + + @pytest.fixture() + def mock_adapt_input_size_to_dataset(self, mocker) -> MagicMock: + return mocker.patch.object(target_file, "adapt_input_size_to_dataset", return_value=(1234, 1234)) + + def test_init_adaptive_input_size( + self, + mock_dm_dataset, + mock_otx_dataset_factory, + mock_data_filtering, + fxt_config, + mock_adapt_input_size_to_dataset, + ) -> None: + # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets + mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)} + mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets + fxt_config.train_subset.input_size = (1000, 1000) + fxt_config.val_subset.input_size = None + fxt_config.test_subset.input_size = (800, 800) + + OTXDataModule( + task=OTXTaskType.MULTI_CLASS_CLS, + data_format=fxt_config.data_format, + data_root=fxt_config.data_root, + train_subset=fxt_config.train_subset, + val_subset=fxt_config.val_subset, + test_subset=fxt_config.test_subset, + adaptive_input_size="auto", + ) + + assert fxt_config.train_subset.input_size == (1234, 1234) + assert fxt_config.val_subset.input_size == (1234, 1234) + assert fxt_config.test_subset.input_size == (1234, 1234) def test_data_format_check( self, diff --git a/tests/unit/core/model/test_base.py b/tests/unit/core/model/test_base.py index dc164a577be..d72891cf538 100644 --- a/tests/unit/core/model/test_base.py +++ b/tests/unit/core/model/test_base.py @@ -20,6 +20,11 @@ def __init__(self, num_classes): class TestOTXModel: + def test_init(self, monkeypatch): + monkeypatch.setattr(OTXModel, "input_size_multiplier", 10, raising=False) + with pytest.raises(ValueError, match="Input size should be a multiple"): + OTXModel(label_info=2, input_size=(1, 3, 1024, 1024)) + def test_smart_weight_loading(self, mocker) -> None: with mocker.patch.object(OTXModel, "_create_model", return_value=MockNNModule(2)): prev_model = OTXModel(label_info=2) diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py index b4b34fb4b2c..1bd9c655cf8 100644 --- a/tests/unit/engine/test_engine.py +++ 
b/tests/unit/engine/test_engine.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path +from unittest.mock import MagicMock import pytest from otx.algo.classification.efficientnet import EfficientNetForMulticlassCls @@ -51,6 +52,26 @@ def test_constructor(self, tmp_path) -> None: with pytest.raises(ValueError, match="Given model class (.*) requires a valid label_info to instantiate."): _ = Engine(work_dir=tmp_path, task="MULTI_CLASS_CLS") + @pytest.fixture() + def mock_datamodule(self, mocker): + input_size = (1234, 1234) + label_info = 4321 + mock_datamodule = MagicMock() + mock_datamodule.label_info = label_info + mock_datamodule.input_size = input_size + + return mocker.patch( + "otx.engine.utils.auto_configurator.AutoConfigurator.get_datamodule", + return_value=mock_datamodule, + ) + + def test_model_init(self, tmp_path, mock_datamodule): + data_root = "tests/assets/classification_dataset" + engine = Engine(work_dir=tmp_path, data_root=data_root) + + assert engine._model.input_size == (1, 3, 1234, 1234) + assert engine._model.label_info.num_classes == 4321 + def test_model_setter(self, fxt_engine, mocker) -> None: assert isinstance(fxt_engine.model, OTXTVModel) fxt_engine.model = "efficientnet_b0" diff --git a/tests/unit/engine/utils/test_auto_configurator.py b/tests/unit/engine/utils/test_auto_configurator.py index 078c81fc84c..7bf247020c9 100644 --- a/tests/unit/engine/utils/test_auto_configurator.py +++ b/tests/unit/engine/utils/test_auto_configurator.py @@ -11,6 +11,7 @@ from otx.core.types.label import LabelInfo, SegLabelInfo from otx.core.types.task import OTXTaskType from otx.core.types.transformer_libs import TransformLibType +from otx.engine.utils import auto_configurator as target_file from otx.engine.utils.auto_configurator import ( DEFAULT_CONFIG_PER_TASK, AutoConfigurator, @@ -108,6 +109,19 @@ def test_get_datamodule(self) -> None: assert isinstance(datamodule, OTXDataModule) assert datamodule.task == task + def test_get_datamodule_set_input_size_multiplier(self, mocker) -> None: + mock_otxdatamodule = mocker.patch.object(target_file, "OTXDataModule") + auto_configurator = AutoConfigurator( + data_root="tests/assets/car_tree_bug", + task=OTXTaskType.DETECTION, + model_name="yolox_tiny", + ) + auto_configurator.config["data"]["adaptive_input_size"] = "auto" + + auto_configurator.get_datamodule() + + assert mock_otxdatamodule.call_args.kwargs["input_size_multiplier"] == 32 + def test_get_model(self, fxt_task: OTXTaskType) -> None: if fxt_task is OTXTaskType.H_LABEL_CLS: pytest.xfail(reason="Not working") @@ -130,6 +144,16 @@ def test_get_model(self, fxt_task: OTXTaskType) -> None: with pytest.raises(ValueError, match="Given model class (.*) requires a valid label_info to instantiate."): _ = auto_configurator.get_model(label_info=None) + def test_get_model_set_input_size(self) -> None: + auto_configurator = AutoConfigurator(task=OTXTaskType.MULTI_CLASS_CLS) + label_names = ["class1", "class2", "class3"] + label_info = LabelInfo(label_names=label_names, label_groups=[label_names]) + input_size = 300 + + model = auto_configurator.get_model(label_info=label_info, input_size=input_size) + + assert model.input_size == (1, 3, input_size, input_size) + def test_get_optimizer(self, fxt_task: OTXTaskType) -> None: if fxt_task in { OTXTaskType.ANOMALY_SEGMENTATION, From 30172ddad8ecd8fe45f42bcb3ed72365c74f87e6 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 13:58:16 +0900 Subject: [PATCH 13/42] update unit test --- 
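
[Reviewer note on the adaptive input size heuristic] PATCH 10 touched
adapt_input_size_to_dataset() in src/otx/core/data/utils/utils.py, and PATCH 14
below refactors it and adds unit tests. For reference, a condensed,
self-contained restatement of the heuristic. The flat signature is illustrative
only: the shipped function takes a Datumaro dataset and derives
max_image_size / min_object_size from robust dataset statistics, and the
empty-statistics fallback here mirrors what the new unit tests expect.

    from __future__ import annotations

    def adapt_square_input_size(
        max_image_size: int,
        min_object_size: int | None = None,
        base_input_size: int | None = None,
        downscale_only: bool = True,
        input_size_multiplier: int | None = None,
    ) -> tuple[int, int] | None:
        """Condensed restatement of the heuristic; not the shipped implementation."""
        min_recognizable_object_size = 32  # objects below ~32 px collapse to <1 px on the feature map
        min_detection_input_size = 256  # practical lower bound for detection inputs

        if max_image_size <= 0:  # no image statistics available
            return (base_input_size, base_input_size) if base_input_size else None
        image_size = max_image_size  # start from the typical large image size
        if min_object_size is not None and min_object_size > 0:
            # Rescale so the typical small object stays at or above ~32 px after
            # resize, then clamp back into [min_detection_input_size, max_image_size].
            image_size = round(image_size * min_recognizable_object_size / min_object_size)
            image_size = min(image_size, max_image_size)
            image_size = max(image_size, min_detection_input_size)
        if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
            # Round up so strided backbones receive an evenly divisible input.
            image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier
        if downscale_only and base_input_size is not None and image_size >= base_input_size:
            # The shipped code compares areas; for square sizes that reduces to the sides.
            return (base_input_size, base_input_size)
        return (image_size, image_size)

    # Consistent with the expectations in tests/unit/core/data/utils/test_utils.py:
    assert adapt_square_input_size(150, base_input_size=512) == (150, 150)
    assert adapt_square_input_size(150, base_input_size=512, input_size_multiplier=32) == (160, 160)
    assert adapt_square_input_size(1024, min_object_size=64, base_input_size=512) == (512, 512)
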
tests/unit/core/model/test_classification.py | 10 +++++++++- tests/unit/core/model/test_segmentation.py | 4 ++-- tests/unit/core/model/test_visual_prompting.py | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/unit/core/model/test_classification.py b/tests/unit/core/model/test_classification.py index 73027bf00f3..835ed854e20 100644 --- a/tests/unit/core/model/test_classification.py +++ b/tests/unit/core/model/test_classification.py @@ -37,6 +37,7 @@ def test_export_parameters( ) -> None: model = OTXMulticlassClsModel( label_info=1, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -50,6 +51,7 @@ def test_export_parameters( model = OTXMultilabelClsModel( label_info=1, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -60,6 +62,7 @@ def test_export_parameters( model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -76,6 +79,7 @@ def test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXMulticlassClsModel( label_info=1, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -106,6 +110,7 @@ def test_export_parameters( ) -> None: model = OTXMultilabelClsModel( label_info=1, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -125,6 +130,7 @@ def test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXMultilabelClsModel( label_info=1, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -156,6 +162,7 @@ def test_export_parameters( ) -> None: model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -176,6 +183,7 @@ def test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, + input_size=(1, 3, 224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -199,7 +207,7 @@ def test_convert_pred_entity_to_compute_metric( assert "target" in metric_input def test_set_label_info(self, fxt_hlabel_multilabel_info): - model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info) + model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(1, 3, 224, 224)) assert model.label_info.num_multilabel_classes == fxt_hlabel_multilabel_info.num_multilabel_classes fxt_hlabel_multilabel_info.num_multilabel_classes = 0 diff --git a/tests/unit/core/model/test_segmentation.py b/tests/unit/core/model/test_segmentation.py index 573e11d773d..32da4815475 100644 --- a/tests/unit/core/model/test_segmentation.py +++ b/tests/unit/core/model/test_segmentation.py @@ -46,7 +46,7 @@ def torch_compile(): class TestOTXSegmentationModel: @pytest.fixture() def model(self, label_info, optimizer, scheduler, metric, torch_compile): - return OTXSegmentationModel(label_info, optimizer, scheduler, metric, torch_compile) + return OTXSegmentationModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile) def test_export_parameters(self, model): params = model._export_parameters @@ -74,7 +74,7 @@ def test_dispatch_label_info(self, model, label_info, expected_label_info): class TestTorchVisionCompatibleModel: @pytest.fixture() def model(self, label_info, optimizer, 
scheduler, metric, torch_compile) -> TorchVisionCompatibleModel: - return TorchVisionCompatibleModel(label_info, optimizer, scheduler, metric, torch_compile) + return TorchVisionCompatibleModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile) @pytest.fixture() def batch_data_entity(self): diff --git a/tests/unit/core/model/test_visual_prompting.py b/tests/unit/core/model/test_visual_prompting.py index e8cc1e2fe93..9a3a8709529 100644 --- a/tests/unit/core/model/test_visual_prompting.py +++ b/tests/unit/core/model/test_visual_prompting.py @@ -36,7 +36,7 @@ @pytest.fixture() def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel: mocker.patch.object(OTXVisualPromptingModel, "_create_model") - model = OTXVisualPromptingModel(label_info=1) + model = OTXVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024)) model.model.image_size = 1024 return model @@ -44,7 +44,7 @@ def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel: @pytest.fixture() def otx_zero_shot_visual_prompting_model(mocker) -> OTXZeroShotVisualPromptingModel: mocker.patch.object(OTXZeroShotVisualPromptingModel, "_create_model") - model = OTXZeroShotVisualPromptingModel(label_info=1) + model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024)) model.model.image_size = 1024 return model From 73598abacf5a9c59ca110306ffbf0e3af133d060 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 15:46:08 +0900 Subject: [PATCH 14/42] implement left unit test --- src/otx/core/data/utils/utils.py | 28 ++-- tests/unit/core/data/utils/__init__.py | 2 + tests/unit/core/data/utils/test_utils.py | 156 +++++++++++++++++++++++ 3 files changed, 171 insertions(+), 15 deletions(-) create mode 100644 tests/unit/core/data/utils/__init__.py create mode 100644 tests/unit/core/data/utils/test_utils.py diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index ee1b6b44f44..60d6a532c86 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -145,6 +145,11 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = return stat +_MIN_RECOGNIZABLE_OBJECT_SIZE = 32 # Minimum object size recognizable by NNs: typically 16 ~ 32 +# meaning NxN input pixels being downscaled to 1x1 on feature map +_MIN_DETECTION_INPUT_SIZE = 256 # Minimum input size for object detection + + def adapt_input_size_to_dataset( dataset: Dataset, base_input_size: int | tuple[int, int] | None = None, @@ -164,10 +169,6 @@ def adapt_input_size_to_dataset( Returns: tuple[int, int] | None: Recommended input size based on dataset statistics. """ - min_recognizable_object_size = 32 # Minimum object size recognizable by NNs: typically 16 ~ 32 - # meaning NxN input pixels being downscaled to 1x1 on feature map - min_detection_input_size = 256 # Minimum input size for object detection - if downscale_only and base_input_size is None: msg = "If downscale_only is set to True, base_input_size should be set but got None." 
raise ValueError(msg) @@ -175,19 +176,13 @@ def adapt_input_size_to_dataset( if isinstance(base_input_size, int): base_input_size = (base_input_size, base_input_size) - train_dataset = dataset.subsets().get("train") - if train_dataset is None: + if (train_dataset := dataset.subsets().get("train")) is None: return None logger.info("Adapting model input size based on dataset stat") stat = compute_robust_dataset_statistics(train_dataset) - max_image_size = stat["image"]["robust_max"] + max_image_size = stat["image"].get("robust_max", 0) min_object_size = None - if stat["annotation"]: - # Refine using annotation shape size stat - # Fit to typical small object size (conservative) - # -> "avg" size might be preferrable for efficiency - min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None) logger.info(f"-> Current base input size: {base_input_size}") @@ -198,14 +193,17 @@ def adapt_input_size_to_dataset( logger.info(f"-> Based on typical large image size: {image_size}") # Refine using annotation shape size stat + # Fit to typical small object size (conservative) + # -> "avg" size might be preferrable for efficiency + min_object_size = stat.get("annotation", {}).get("size_of_shape", {}).get("robust_min", None) if min_object_size is not None and min_object_size > 0: - image_size = round(image_size * min_recognizable_object_size / min_object_size) + image_size = round(image_size * _MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size) logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}") if image_size > max_image_size: image_size = max_image_size logger.info(f"-> Restrict to max image size: {image_size}") - if image_size < min_detection_input_size: - image_size = min_detection_input_size + if image_size < _MIN_DETECTION_INPUT_SIZE: + image_size = _MIN_DETECTION_INPUT_SIZE logger.info(f"-> Based on minimum object detection input size: {image_size}") if input_size_multiplier is not None and image_size % input_size_multiplier != 0: diff --git a/tests/unit/core/data/utils/__init__.py b/tests/unit/core/data/utils/__init__.py new file mode 100644 index 00000000000..916f3a44b27 --- /dev/null +++ b/tests/unit/core/data/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/unit/core/data/utils/test_utils.py b/tests/unit/core/data/utils/test_utils.py new file mode 100644 index 00000000000..ace8d23250a --- /dev/null +++ b/tests/unit/core/data/utils/test_utils.py @@ -0,0 +1,156 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Tests for utils for OTX data module.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import cv2 +import numpy as np +import pytest +from datumaro.components.annotation import Bbox +from datumaro.components.dataset import Dataset as DmDataset +from datumaro.components.dataset_base import DatasetItem +from datumaro.components.media import Image +from otx.core.data.utils import utils as target_file +from otx.core.data.utils.utils import ( + adapt_input_size_to_dataset, + compute_robust_dataset_statistics, + compute_robust_scale_statistics, + compute_robust_statistics, +) + + +def test_compute_robust_statistics(): + values = np.array([]) + stat = compute_robust_statistics(values) + assert len(stat) == 0 + + values = np.array([0.5, 1, 1.5]) + stat = compute_robust_statistics(values) + assert np.isclose(stat["avg"], 1.0) + assert np.isclose(stat["min"], 0.5) + assert 
np.isclose(stat["max"], 1.5) + + values = np.random.rand(10) + stat = compute_robust_statistics(values) + assert np.isclose(stat["min"], np.min(values)) + assert np.isclose(stat["max"], np.max(values)) + assert stat["min"] <= stat["robust_min"] + assert stat["max"] <= stat["robust_max"] + + +def test_compute_robust_scale_statistics(): + scales = np.array([]) + stat = compute_robust_scale_statistics(scales) + assert len(stat) == 0 + + scales = np.array([0.5, 1, 2]) + stat = compute_robust_scale_statistics(scales) + assert np.isclose(stat["avg"], 1.0) + assert np.isclose(stat["min"], 0.5) + assert np.isclose(stat["max"], 2.0) + + scales = np.random.rand(10) + stat = compute_robust_scale_statistics(scales) + assert np.isclose(stat["min"], np.min(scales)) + assert np.isclose(stat["max"], np.max(scales)) + assert stat["min"] <= stat["robust_min"] + assert stat["max"] <= stat["robust_max"] + + +def make_media(shape: tuple[int, int, int]): + np_img = np.zeros(shape=shape, dtype=np.uint8) + np_img[:, :, 0] = 0 # Set 0 for B channel + np_img[:, :, 1] = 1 # Set 1 for G channel + np_img[:, :, 2] = 2 # Set 2 for R channel + + _, np_bytes = cv2.imencode(".png", np_img) + media = Image.from_bytes(np_bytes.tobytes()) + media.path = "" + + return media + + +@pytest.fixture() +def mock_dataset() -> DmDataset: + return DmDataset.from_iterable( + [ + DatasetItem( + id="1", + subset="train", + media=make_media((50, 50, 3)), + annotations=[ + Bbox(x=0, y=0, w=5, h=5, label=0), + ], + ), + DatasetItem( + id="2", + subset="train", + media=make_media((100, 100, 3)), + annotations=[ + Bbox(x=0, y=0, w=10, h=10, label=0), + Bbox(x=10, y=10, w=20, h=20, label=0), + ], + ), + DatasetItem( + id="3", + subset="train", + media=make_media((200, 200, 3)), + annotations=[], + ), + ], + ) + + +def test_compute_robuste_dataset_statistics(mock_dataset): + subset = mock_dataset.get_subset("train") + + stat = compute_robust_dataset_statistics(subset, max_samples=0) + assert len(stat) == 0 + stat = compute_robust_dataset_statistics(subset, max_samples=-1) + assert len(stat) == 0 + + stat = compute_robust_dataset_statistics(subset) + assert np.isclose(stat["image"]["avg"], 100) + assert np.isclose(stat["annotation"]["num_per_image"]["avg"], 1.0) + assert np.isclose(stat["annotation"]["size_of_shape"]["avg"], 10.0) + + +def test_adapt_input_size_to_dataset(mocker): + mock_stat = mocker.patch.object(target_file, "compute_robust_dataset_statistics") + + with pytest.raises(ValueError, match="base_input_size should be set"): + input_size = adapt_input_size_to_dataset(dataset=MagicMock()) + + mock_stat.return_value = {"image": {}, "annotation": {}} + mock_dataset = MagicMock() + mock_dataset.subsets.return_value = {} + input_size = adapt_input_size_to_dataset(dataset=mock_dataset, base_input_size=512) + assert input_size is None + + mock_stat.return_value = {"image": {}, "annotation": {}} + input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) + assert input_size == (512, 512) + + mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}} + input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) + assert input_size == (150, 150) + + mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}} + input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512, input_size_multiplier=32) + assert input_size == (160, 160) + + mock_stat.return_value = {"image": {"robust_max": 256}, "annotation": {"size_of_shape": {"robust_min": 64}}} + 
input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) + assert input_size == (256, 256) + + mock_stat.return_value = {"image": {"robust_max": 1024}, "annotation": {"size_of_shape": {"robust_min": 64}}} + input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) + assert input_size == (512, 512) + + mock_stat.return_value = {"image": {"robust_max": 2045}, "annotation": {"size_of_shape": {"robust_min": 64}}} + input_size = adapt_input_size_to_dataset(dataset=MagicMock(), downscale_only=False, base_input_size=512) + assert input_size == (1022, 1022) From 4df567476c038fc3ae7c8656f5b1192b3d31c40b Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 16:04:42 +0900 Subject: [PATCH 15/42] align with develop branch --- src/otx/algo/classification/timm_model.py | 6 ++++++ src/otx/cli/cli.py | 3 ++- src/otx/core/model/anomaly.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/otx/algo/classification/timm_model.py b/src/otx/algo/classification/timm_model.py index 04eaf5ff396..7c540b3e1ef 100644 --- a/src/otx/algo/classification/timm_model.py +++ b/src/otx/algo/classification/timm_model.py @@ -54,6 +54,7 @@ def __init__( self, label_info: LabelInfoTypes, backbone: TimmModelType, + input_size: tuple[int, ...] = (1, 3, 224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -66,6 +67,7 @@ def __init__( super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -146,6 +148,7 @@ def __init__( self, label_info: LabelInfoTypes, backbone: TimmModelType, + input_size: tuple[int, ...] = (1, 3, 224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -157,6 +160,7 @@ def __init__( super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, @@ -226,6 +230,7 @@ def __init__( self, label_info: HLabelInfo, backbone: TimmModelType, + input_size: tuple[int, ...] 
= (1, 3, 224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -237,6 +242,7 @@ def __init__( super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 31717fb487d..04ea994148f 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -344,7 +344,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: self.datamodule = self.get_config_value(self.config_init, "data") # pass OTXDataModule input size to the model - if (input_size := self.datamodule.input_size) is not None: + if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]: + # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) # type: ignore[assignment] model_config["init_args"]["input_size"] = ( tuple(model_config["init_args"]["input_size"][:-2]) + input_size # type: ignore[operator] diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index 1e3e9e0dd1f..a4f57c4fe0e 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -81,7 +81,7 @@ def on_load_checkpoint(self, checkpoint: dict[str, Any]) -> None: for key, value in anomaly_attrs.items(): setattr(self, key, value) - @property + @property # type: ignore[override] def input_size(self) -> tuple[int, int]: """Returns the input size of the model. From 4fcc62718b34d92040eb03357a40d07aa87b343f Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 16:44:14 +0900 Subject: [PATCH 16/42] fix typo --- src/otx/algo/detection/yolox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index ac319e26532..b21a0420d67 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -72,7 +72,7 @@ def _customize_inputs( def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" if self.input_size is None: - msg = f"Inputhsize attribute is not set for {self.__class__}" + msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB From 39b0650946566af8ec13e2a01481b10e78c892ea Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 18:10:51 +0900 Subject: [PATCH 17/42] exclude batch and num channel from input size --- src/otx/algo/action_classification/movinet.py | 2 +- src/otx/algo/action_classification/x3d.py | 2 +- src/otx/algo/classification/efficientnet.py | 12 +++++----- .../algo/classification/huggingface_model.py | 9 +++---- src/otx/algo/classification/mobilenet_v3.py | 19 ++++++++------- src/otx/algo/classification/timm_model.py | 6 ++--- .../algo/classification/torchvision_model.py | 11 +++++---- src/otx/algo/classification/vit.py | 14 +++++------ src/otx/algo/detection/atss.py | 4 ++-- src/otx/algo/detection/huggingface_model.py | 7 +++--- src/otx/algo/detection/rtdetr.py | 22 ++++++++--------- src/otx/algo/detection/rtmdet.py | 4 ++-- src/otx/algo/detection/ssd.py | 8 +++---- src/otx/algo/detection/yolox.py | 6 ++--- 
.../algo/instance_segmentation/maskrcnn.py | 8 +++---- .../algo/instance_segmentation/maskrcnn_tv.py | 4 ++-- .../algo/instance_segmentation/rtmdet_inst.py | 4 ++-- src/otx/algo/segmentation/dino_v2_seg.py | 2 +- .../algo/segmentation/huggingface_model.py | 2 +- src/otx/algo/segmentation/litehrnet.py | 4 ++-- src/otx/algo/segmentation/segnext.py | 2 +- .../algo/visual_prompting/segment_anything.py | 8 +++---- .../zero_shot_segment_anything.py | 2 +- src/otx/cli/cli.py | 6 ++--- src/otx/core/data/utils/utils.py | 3 +-- src/otx/core/model/action_classification.py | 8 +++---- src/otx/core/model/base.py | 13 +++++----- src/otx/core/model/classification.py | 24 +++++++++---------- src/otx/core/model/detection.py | 6 ++--- src/otx/core/model/instance_segmentation.py | 8 +++---- src/otx/core/model/segmentation.py | 14 +++++------ src/otx/core/model/visual_prompting.py | 16 ++++++------- src/otx/engine/utils/auto_configurator.py | 12 +++++----- 33 files changed, 136 insertions(+), 136 deletions(-) diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py index 4803aba3d9e..9bd85b0fed0 100644 --- a/src/otx/algo/action_classification/movinet.py +++ b/src/otx/algo/action_classification/movinet.py @@ -32,7 +32,7 @@ class MoViNet(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224), + input_size: tuple[int, int] = (224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index 5d49b19661f..98cda3fe3bf 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -32,7 +32,7 @@ class X3D(OTXActionClsModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224), + input_size: tuple[int, int] = (224, 224), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index 4bf4220aaec..52eb86d7f75 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -57,7 +57,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] 
= (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED, ) -> None: self.version = version @@ -88,7 +88,7 @@ def _create_model(self) -> nn.Module: return model def _build_model(self, num_classes: int) -> nn.Module: - backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained) + backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained) neck = GlobalAveragePooling(dim=2) loss = nn.CrossEntropyLoss(reduction="none") if self.train_type == OTXTrainType.SEMI_SUPERVISED: @@ -151,7 +151,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.version = version self.pretrained = pretrained @@ -180,7 +180,7 @@ def _create_model(self) -> nn.Module: return model def _build_model(self, num_classes: int) -> nn.Module: - backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained) + backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained) return ImageClassifier( backbone=backbone, neck=GlobalAveragePooling(dim=2), @@ -233,7 +233,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.version = version self.pretrained = pretrained @@ -268,7 +268,7 @@ def _build_model(self, head_config: dict) -> nn.Module: if not isinstance(self.label_info, HLabelInfo): raise TypeError(self.label_info) - backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained) + backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained) return ImageClassifier( backbone=backbone, neck=GlobalAveragePooling(dim=2), diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py index d48120de1d2..c9bdf5b02cb 100644 --- a/src/otx/algo/classification/huggingface_model.py +++ b/src/otx/algo/classification/huggingface_model.py @@ -31,7 +31,7 @@ from otx.core.metrics import MetricCallable -DEFAULT_INPUT_SIZE = (1, 3, 224, 224) +DEFAULT_INPUT_SIZE = (224, 224) logger = logging.getLogger(__name__) @@ -44,7 +44,8 @@ class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel): optimizer (OptimizerCallable, optional): The optimizer callable for training the model. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): The learning rate scheduler callable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. - input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) + input_size (tuple[int, int], optional): + Model input size in the order of height and width. Defaults to (224, 224) Example: 1. API @@ -66,7 +67,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] 
= DEFAULT_INPUT_SIZE, + input_size: tuple[int, int] = DEFAULT_INPUT_SIZE, ) -> None: self.model_name = model_name_or_path @@ -83,7 +84,7 @@ def _create_model(self) -> nn.Module: model_config, _ = PretrainedConfig.get_config_dict(self.model_name) kwargs = {} if "image_size" in model_config: - kwargs["image_size"] = self.input_size[-1] + kwargs["image_size"] = self.input_size[0] elif self.input_size != DEFAULT_INPUT_SIZE: msg = "There is no 'image_size' argument in the model configuration. There may be unexpected results." logger.warning(msg) diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index 62727037737..fb391e37888 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -62,7 +62,8 @@ class MobileNetV3ForMulticlassCls(OTXMulticlassClsModel): metric (MetricCallable, optional): The metric callable. Defaults to MultiClassClsMetricCallable. torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False. freeze_backbone (bool, optional): Whether to freeze the backbone layers during training. Defaults to False. - input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) + input_size (tuple[int, int], optional): + Model input size in the order of height and width. Defaults to (224, 224) """ def __init__( @@ -73,7 +74,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED, ) -> None: self.mode = mode @@ -103,7 +104,7 @@ def _create_model(self) -> nn.Module: return model def _build_model(self, num_classes: int) -> nn.Module: - backbone = OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]) + backbone = OTXMobileNetV3(mode=self.mode, input_size=self.input_size) neck = GlobalAveragePooling(dim=2) loss = nn.CrossEntropyLoss(reduction="none") in_channels = 960 if self.mode == "large" else 576 @@ -166,7 +167,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.mode = mode super().__init__( @@ -194,7 +195,7 @@ def _create_model(self) -> nn.Module: def _build_model(self, num_classes: int) -> nn.Module: return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size), neck=GlobalAveragePooling(dim=2), head=MultiLabelNonLinearClsHead( num_classes=num_classes, @@ -251,7 +252,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -297,7 +298,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: tuple[int, ...] 
= (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.mode = mode super().__init__( @@ -331,7 +332,7 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) return ImageClassifier( - backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]), + backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size), neck=GlobalAveragePooling(dim=2), head=HierarchicalNonLinearClsHead( in_channels=960, @@ -409,7 +410,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", diff --git a/src/otx/algo/classification/timm_model.py b/src/otx/algo/classification/timm_model.py index 7c540b3e1ef..411da97a165 100644 --- a/src/otx/algo/classification/timm_model.py +++ b/src/otx/algo/classification/timm_model.py @@ -54,7 +54,7 @@ def __init__( self, label_info: LabelInfoTypes, backbone: TimmModelType, - input_size: tuple[int, ...] = (1, 3, 224, 224), # input size of default classification data recipe + input_size: tuple[int, int] = (224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -148,7 +148,7 @@ def __init__( self, label_info: LabelInfoTypes, backbone: TimmModelType, - input_size: tuple[int, ...] = (1, 3, 224, 224), # input size of default classification data recipe + input_size: tuple[int, int] = (224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -230,7 +230,7 @@ def __init__( self, label_info: HLabelInfo, backbone: TimmModelType, - input_size: tuple[int, ...] = (1, 3, 224, 224), # input size of default classification data recipe + input_size: tuple[int, int] = (224, 224), # input size of default classification data recipe pretrained: bool = True, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index 002855f82ae..e90c4776e1e 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -404,7 +404,8 @@ class OTXTVModel(OTXModel): task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional): The type of classification task. train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training. - input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224) + input_size (tuple[int, int], optional): + Model input size in the order of height and width. Defaults to (224, 224) """ model: TVClassificationModel @@ -423,7 +424,7 @@ def __init__( self, OTXTaskType.H_LABEL_CLS, ] = OTXTaskType.MULTI_CLASS_CLS, train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED, - input_size: tuple[int, ...] 
= (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.backbone = backbone self.freeze_backbone = freeze_backbone @@ -446,7 +447,7 @@ def __init__( torch_compile=torch_compile, input_size=input_size, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] def _create_model(self) -> nn.Module: if self.task == OTXTaskType.MULTI_CLASS_CLS: @@ -556,7 +557,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -654,7 +655,7 @@ def _convert_pred_entity_to_compute_metric( def get_dummy_input(self, batch_size: int = 1) -> CLASSIFICATION_BATCH_DATA_ENTITY: """Returns a dummy input for classification model.""" - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [torch.LongTensor([0])] * batch_size if self.task == OTXTaskType.MULTI_CLASS_CLS: diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 6ccf64607a8..9a629b94537 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -219,7 +219,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED, ) -> None: self.arch = arch @@ -279,7 +279,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora) if self.train_type == OTXTrainType.SEMI_SUPERVISED: return SemiSLClassifier( backbone=vit_backbone, @@ -320,7 +320,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:]) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size) return SemiSLClassifier( backbone=vit_backbone, neck=None, @@ -348,7 +348,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, torch_compile: bool = False, - input_size: tuple[int, ...] 
= (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.arch = arch self.lora = lora @@ -405,7 +405,7 @@ def _build_model(self, num_classes: int) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, @@ -434,7 +434,7 @@ def __init__( scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, torch_compile: bool = False, - input_size: tuple[int, ...] = (1, 3, 224, 224), + input_size: tuple[int, int] = (224, 224), ) -> None: self.arch = arch self.lora = lora @@ -496,7 +496,7 @@ def _build_model(self, head_config: dict) -> nn.Module: {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, ] - vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora) + vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora) return ImageClassifier( backbone=vit_backbone, neck=None, diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index a873055354b..019c7e5cc82 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -39,7 +39,7 @@ class ATSS(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 800, 992), + input_size: tuple[int, int] = (800, 992), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -65,7 +65,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="standard", diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index 859825a143f..eb0cd9111ca 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ b/src/otx/algo/detection/huggingface_model.py @@ -36,7 +36,8 @@ class HuggingFaceModelForDetection(OTXDetectionModel): Args: model_name_or_path (str): The name or path of the pre-trained model. label_info (LabelInfoTypes): The label information for the model. - input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 800, 992). + input_size (tuple[int, int], optional): + Model input size in the order of height and width. Defaults to (800, 992). optimizer (OptimizerCallable, optional): The optimizer for training the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -61,7 +62,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=object-detection label_info: LabelInfoTypes, - input_size: tuple[int, ...] 
= (1, 3, 800, 992), # input size of default detection data recipe + input_size: tuple[int, int] = (800, 992), # input size of default detection data recipe optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -156,7 +157,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=image_mean, # type: ignore[arg-type] std=image_std, # type: ignore[arg-type] resize_mode="standard", diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index cf8c3d820a8..8af1d28a659 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -46,7 +46,7 @@ class RTDETR(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 640, 640), + input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -199,7 +199,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="standard", @@ -242,13 +242,13 @@ def _build_model(self, num_classes: int) -> nn.Module: encoder = HybridEncoder( in_channels=[128, 256, 512], expansion=0.5, - eval_spatial_size=self.input_size[-2:], + eval_spatial_size=self.input_size, ) decoder = RTDETRTransformer( num_classes=num_classes, num_decoder_layers=3, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size[-2:], + eval_spatial_size=self.input_size, ) optimizer_configuration = [ @@ -266,7 +266,7 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, - input_size=self.input_size[-1], + input_size=self.input_size[0], ) @@ -286,12 +286,12 @@ def _build_model(self, num_classes: int) -> nn.Module: norm_cfg={"type": "FBN", "name": "norm"}, ) encoder = HybridEncoder( - eval_spatial_size=self.input_size[-2:], + eval_spatial_size=self.input_size, ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size[-2:], + eval_spatial_size=self.input_size, num_decoder_layers=6, ) @@ -310,7 +310,7 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, - input_size=self.input_size[-1], + input_size=self.input_size[0], ) @@ -334,13 +334,13 @@ def _build_model(self, num_classes: int) -> nn.Module: hidden_dim=384, dim_feedforward=2048, in_channels=[512, 1024, 2048], - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size, ) decoder = RTDETRTransformer( num_classes=num_classes, feat_channels=[384, 384, 384], - eval_spatial_size=self.input_size[2:], + eval_spatial_size=self.input_size, ) # no bias decay and learning rate correction for the backbone. 
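Note: the convention carried through this patch is that a model's `input_size` holds only the spatial dimensions in (height, width) order; the batch and channel dimensions are prepended wherever a full tensor shape is needed, as in the exporter calls above. A minimal sketch of the arithmetic (values illustrative, not tied to any one model):

    input_size = (640, 640)              # stored on the model as (height, width)
    export_shape = (1, 3, *input_size)   # full shape handed to the exporter
    assert export_shape == (1, 3, 640, 640)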
@@ -360,5 +360,5 @@ def _build_model(self, num_classes: int) -> nn.Module: decoder=decoder, num_classes=num_classes, optimizer_configuration=optimizer_configuration, - input_size=self.input_size[-1], + input_size=self.input_size[0], ) diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index b382ff65225..fb43031838b 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -41,7 +41,7 @@ class RTMDet(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 640, 640), + input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -67,7 +67,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index e4c095dffa2..f6aa62b6cea 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -55,7 +55,7 @@ class SSD(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 864, 864), + input_size: tuple[int, int] = (864, 864), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -168,11 +168,11 @@ def _get_new_anchors(self, dataset: OTXDataset, anchor_generator: SSDAnchorGener if isinstance(transform, Resize): target_wh = transform.scale if target_wh is None: - target_wh = self.input_size[-2:] + target_wh = list(reversed(self.input_size)) # type: ignore[assignment] msg = f"Cannot get target_wh from the dataset. Assign it with the default value: {target_wh}" logger.warning(msg) group_as = [len(width) for width in anchor_generator.widths] - wh_stats = self._get_sizes_from_dataset_entity(dataset, list(target_wh)) + wh_stats = self._get_sizes_from_dataset_entity(dataset, list(target_wh)) # type: ignore[arg-type] if len(wh_stats) < sum(group_as): logger.warning( @@ -297,7 +297,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="standard", diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index b21a0420d67..fd1a8765cad 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -43,7 +43,7 @@ class YOLOX(ExplainableOTXDetModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] 
= (1, 3, 640, 640), + input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, @@ -79,7 +79,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -150,7 +150,7 @@ class YOLOXTINY(YOLOX): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 416, 416), + input_size: tuple[int, int] = (416, 416), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 1ab96d01bb1..67b8a772d3a 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -48,7 +48,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window", @@ -88,7 +88,7 @@ class MaskRCNNResNet50(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 1024, 1024), + input_size: tuple[int, int] = (1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -278,7 +278,7 @@ class MaskRCNNEfficientNet(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 1024, 1024), + input_size: tuple[int, int] = (1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -485,7 +485,7 @@ class MaskRCNNSwinT(MaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 1344, 1344), + input_size: tuple[int, int] = (1344, 1344), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py index 075e4bcf811..7d530cbd926 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py +++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py @@ -234,7 +234,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window", @@ -275,7 +275,7 @@ class TVMaskRCNNR50(TVMaskRCNN): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] 
= (1, 3, 1024, 1024), + input_size: tuple[int, int] = (1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 92a5627e5cb..02bedb89aa9 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -45,7 +45,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -96,7 +96,7 @@ class RTMDetInstTiny(RTMDetInst): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 640, 640), + input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py index 16101a6fcad..3aef09e73af 100644 --- a/src/otx/algo/segmentation/dino_v2_seg.py +++ b/src/otx/algo/segmentation/dino_v2_seg.py @@ -56,7 +56,7 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 560, 560), + input_size: tuple[int, int] = (560, 560), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index a64798b22f4..c60964c27c8 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -65,7 +65,7 @@ def __init__( self, model_name_or_path: str, # https://huggingface.co/models?pipeline_tag=image-segmentation label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 512, 512), # input size of default semantic segmentation data recipe + input_size: tuple[int, int] = (512, 512), # input size of default semantic segmentation data recipe optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 81d9e99f57c..a056be9dda6 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -528,7 +528,7 @@ class OTXLiteHRNet(TorchVisionCompatibleModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] 
= (1, 3, 512, 512), + input_size: tuple[int, int] = (512, 512), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -599,7 +599,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.scale, resize_mode="standard", diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index 703f5b1dfbe..4a01417f158 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -118,7 +118,7 @@ class OTXSegNext(TorchVisionCompatibleModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] = (1, 3, 512, 512), + input_size: tuple[int, int] = (512, 512), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py index feb02857375..7cc9673aa4f 100644 --- a/src/otx/algo/visual_prompting/segment_anything.py +++ b/src/otx/algo/visual_prompting/segment_anything.py @@ -496,7 +496,7 @@ def __init__( self, backbone: Literal["tiny_vit", "vit_b"], label_info: LabelInfoTypes = NullLabelInfo(), - input_size: tuple[int, ...] = (1, 3, 1024, 1024), + input_size: tuple[int, int] = (1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -509,14 +509,14 @@ def __init__( return_extra_metrics: bool = False, stability_score_offset: float = 1.0, ) -> None: - if input_size[-1] != input_size[-2]: + if input_size[0] != input_size[1]: msg = f"SAM should use square image size, but got {input_size}" raise ValueError(msg) self.config = { "backbone": backbone, - "image_size": input_size[-1], - "image_embedding_size": input_size[-1] // 16, + "image_size": input_size[0], + "image_embedding_size": input_size[0] // 16, "freeze_image_encoder": freeze_image_encoder, "freeze_prompt_encoder": freeze_prompt_encoder, "freeze_mask_decoder": freeze_mask_decoder, diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py index 5edfa3aabd1..71fca59b925 100644 --- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py +++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py @@ -661,7 +661,7 @@ def __init__( # noqa: PLR0913 } super().__init__( label_info=label_info, - input_size=(1, 3, 1024, 1024), # zero-shot visual prompting model uses fixed 1024x1024 input size + input_size=(1024, 1024), # zero-shot visual prompting model uses fixed 1024x1024 input size optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 04ea994148f..d20e30a700e 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -335,8 +335,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to OTXDataModule if self.config[self.subcommand].data.adaptive_input_size != "none": model_cls = 
get_model_cls_from_config(model_config) - if hasattr(model_cls, "input_size_multiplier"): - self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier + self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier # Instantiate the things that don't need to special handling self.config_init = self.parser.instantiate_classes(self.config) @@ -346,9 +345,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # pass OTXDataModule input size to the model if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]: # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check - input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) # type: ignore[assignment] model_config["init_args"]["input_size"] = ( - tuple(model_config["init_args"]["input_size"][:-2]) + input_size # type: ignore[operator] + (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) ) # Instantiate the model and needed components diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 60d6a532c86..07777bda224 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -220,8 +220,7 @@ def area(x: tuple[int, int]) -> int: logger.info(f"-> Downscale only: {input_size} -> {base_input_size}") return base_input_size - # Closest preset - logger.info(f"-> Closest preset: {input_size}") + logger.info(f"-> Adapted input size: {input_size}") return input_size diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py index b31c4e1f63e..98b26ad1bd5 100644 --- a/src/otx/core/model/action_classification.py +++ b/src/otx/core/model/action_classification.py @@ -37,7 +37,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -53,7 +53,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int, int, int] + self.input_size: tuple[int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: @@ -133,7 +133,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 1, 3, 8, *self.input_size), mean=self.mean, std=self.std, resize_mode="standard", @@ -165,7 +165,7 @@ def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[st def get_dummy_input(self, batch_size: int = 1) -> ActionClsBatchDataEntity: """Returns a dummy input for action classification model.""" - images = torch.rand(batch_size, *self.input_size[1:]) + images = torch.rand(batch_size, 1, 3, 8, *self.input_size) labels = [torch.LongTensor([0])] * batch_size infos = [] for i, img in enumerate(images): diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 1b4dc9b6acc..ac945d26b71 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -99,11 +99,12 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti """ 
_OPTIMIZED_MODEL_BASE_NAME: str = "optimized_model" + input_size_multiplier = 1 def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...] | None = None, + input_size: tuple[int, int] | None = None, optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = NullMetricCallable, @@ -809,13 +810,11 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo: raise TypeError(label_info) - def _check_input_size(self, input_size: tuple[int, ...] | None = None) -> None: - if ( - input_size is not None - and hasattr(self, "input_size_multiplier") - and (input_size[-1] % self.input_size_multiplier != 0 or input_size[-2] % self.input_size_multiplier != 0) + def _check_input_size(self, input_size: tuple[int, int] | None = None) -> None: + if input_size is not None and ( + input_size[0] % self.input_size_multiplier != 0 or input_size[1] % self.input_size_multiplier != 0 ): - msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead." + msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size} instead." raise ValueError(msg) diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 26a7a21843c..678203447dd 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -46,7 +46,7 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiClassClsMetricCallable, @@ -63,7 +63,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] def _customize_inputs(self, inputs: MulticlassClsBatchDataEntity) -> dict[str, Any]: if self.training: @@ -162,7 +162,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -190,7 +190,7 @@ def _reset_prediction_layer(self, num_classes: int) -> None: def get_dummy_input(self, batch_size: int = 1) -> MulticlassClsBatchDataEntity: """Returns a dummy input for classification model.""" - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [torch.LongTensor([0])] * batch_size return MulticlassClsBatchDataEntity(batch_size, images, [], labels=labels) @@ -209,7 +209,7 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MultiLabelClsMetricCallable, @@ -223,7 +223,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] def 
_customize_inputs(self, inputs: MultilabelClsBatchDataEntity) -> dict[str, Any]: if self.training: @@ -287,7 +287,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -314,7 +314,7 @@ def forward_for_tracing(self, image: Tensor) -> Tensor | dict[str, Tensor]: def get_dummy_input(self, batch_size: int = 1) -> MultilabelClsBatchDataEntity: """Returns a dummy input for classification OV model.""" - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [torch.LongTensor([0])] * batch_size return MultilabelClsBatchDataEntity(batch_size, images, [], labels=labels) @@ -327,7 +327,7 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt def __init__( self, label_info: HLabelInfo, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = HLabelClsMetricCallble, @@ -341,7 +341,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] def _customize_inputs(self, inputs: HlabelClsBatchDataEntity) -> dict[str, Any]: if self.training: @@ -409,7 +409,7 @@ def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="standard", @@ -449,7 +449,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo: def get_dummy_input(self, batch_size: int = 1) -> HlabelClsBatchDataEntity: """Returns a dummy input for classification OV model.""" - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [torch.LongTensor([0])] * batch_size return HlabelClsBatchDataEntity(batch_size, images, [], labels=labels) diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 938715396b6..76b7c2d1538 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -41,7 +41,7 @@ class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]): """Base class for the detection models used in OTX.""" - input_size: tuple[int, int, int, int] + input_size: tuple[int, int] def test_step(self, batch: DetBatchDataEntity, batch_idx: int) -> None: """Perform a single test step on a batch of data from the test set. 
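For reference, the `_check_input_size` guard added to `base.py` earlier in this patch boils down to a divisibility check against `input_size_multiplier`; a self-contained sketch (the multiplier value here is hypothetical, real models override the class attribute):

    def check_input_size(input_size: tuple[int, int] | None, input_size_multiplier: int = 1) -> None:
        # both spatial dimensions must divide evenly by the model's multiplier
        if input_size is not None and (
            input_size[0] % input_size_multiplier != 0 or input_size[1] % input_size_multiplier != 0
        ):
            msg = f"Input size should be a multiple of {input_size_multiplier}, but got {input_size} instead."
            raise ValueError(msg)

    check_input_size((416, 416), input_size_multiplier=32)  # fine: both sides divisible by 32
    # check_input_size((417, 416), input_size_multiplier=32) would raise ValueError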
@@ -368,7 +368,7 @@ def get_dummy_input(self, batch_size: int = 1) -> DetBatchDataEntity: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] infos = [] for i, img in enumerate(images): infos.append( @@ -387,7 +387,7 @@ class ExplainableOTXDetModel(OTXDetectionModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py index 756a5fd7641..2a26b688920 100644 --- a/src/otx/core/model/instance_segmentation.py +++ b/src/otx/core/model/instance_segmentation.py @@ -49,7 +49,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, @@ -65,7 +65,7 @@ def __init__( torch_compile=torch_compile, tile_config=tile_config, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] def _build_model(self, num_classes: int) -> nn.Module: raise NotImplementedError @@ -366,7 +366,7 @@ def get_dummy_input(self, batch_size: int = 1) -> InstanceSegBatchDataEntity: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] infos = [] for i, img in enumerate(images): infos.append( @@ -385,7 +385,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = MaskRLEMeanAPFMeasureCallable, diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 935e2a2215b..ba2f5963448 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -37,7 +37,7 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -47,7 +47,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (tuple[int, ...]): The input size of the model. + input_size (tuple[int, int]): Model input size in the order of height and width. optimizer (OptimizerCallable, optional): The optimizer to use for training. Defaults to DefaultOptimizerCallable. 
scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -65,7 +65,7 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] @property def _export_parameters(self) -> TaskLevelExportParameters: @@ -112,7 +112,7 @@ def get_dummy_input(self, batch_size: int = 1) -> SegBatchDataEntity: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - images = torch.rand(batch_size, *self.input_size[1:]) + images = torch.rand(batch_size, 3, *self.input_size) infos = [] for i, img in enumerate(images): infos.append( @@ -131,7 +131,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel): def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, ...], + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = SegmCallable, # type: ignore[assignment] @@ -146,7 +146,7 @@ def __init__( Args: label_info (LabelInfoTypes): The label information for the segmentation model. - input_size (tuple[int, ...]): The input size of the model. + input_size (tuple[int, int]): Model input size in the order of height and width. optimizer (OptimizerCallable, optional): The optimizer callable for the model. Defaults to DefaultOptimizerCallable. scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): @@ -220,7 +220,7 @@ def _exporter(self) -> OTXModelExporter: return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.scale, resize_mode="standard", diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py index c6d7d2010c2..6c50c2ee62b 100644 --- a/src/otx/core/model/visual_prompting.py +++ b/src/otx/core/model/visual_prompting.py @@ -162,7 +162,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro def __init__( self, label_info: LabelInfoTypes = NullLabelInfo(), - input_size: tuple[int, ...] 
= (1, 3, 1024, 1024), + input_size: tuple[int, int] = (1024, 1024), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = VisualPromptingMetricCallable, @@ -178,14 +178,14 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXVisualPromptingModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="fit_to_window", @@ -265,7 +265,7 @@ def _set_label_info(self, _: LabelInfoTypes) -> None: def get_dummy_input(self, batch_size: int = 1) -> VisualPromptingBatchDataEntity: """Returns a dummy input for VPT model.""" - images = [torch.rand(self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [{"points": torch.LongTensor([0] * batch_size)}] * batch_size prompts = [torch.zeros((1, 2))] * batch_size return VisualPromptingBatchDataEntity( @@ -287,7 +287,7 @@ class OTXZeroShotVisualPromptingModel( def __init__( self, - input_size: tuple[int, ...], + input_size: tuple[int, int], label_info: LabelInfoTypes = NullLabelInfo(), optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, @@ -304,14 +304,14 @@ def __init__( metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int, int, int] + self.input_size: tuple[int, int] @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" return OTXVisualPromptingModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.input_size, + input_size=(1, 3, *self.input_size), mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375), resize_mode="fit_to_window", @@ -450,7 +450,7 @@ def _set_label_info(self, _: LabelInfoTypes) -> None: def get_dummy_input(self, batch_size: int = 1) -> ZeroShotVisualPromptingBatchDataEntity: """Returns a dummy input for ZSL VPT model.""" - images = [torch.rand(self.input_size[1:]) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [ZeroShotVisualPromptingLabel(prompts=torch.LongTensor([0]))] * batch_size prompts = [torch.zeros((1, 2))] * batch_size infos = [] diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index b8f1280099a..b55792f8c77 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -230,8 +230,7 @@ def get_datamodule(self) -> OTXDataModule | None: if data_config.get("adaptive_input_size", "none") != "none": model_cls = get_model_cls_from_config(Namespace(self.config["model"])) - if hasattr(model_cls, "input_size_multiplier"): - data_config["input_size_multiplier"] = model_cls.input_size_multiplier + data_config["input_size_multiplier"] = model_cls.input_size_multiplier return OTXDataModule( train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config), @@ -250,7 +249,7 @@ def get_model( self, model_name: str | None = None, label_info: LabelInfoTypes | None = None, - input_size: tuple[int, ...] 
| int | None = None, + input_size: tuple[int, int] | int | None = None, ) -> OTXModel: """Retrieves the OTXModel instance based on the provided model name and meta information. @@ -258,7 +257,7 @@ def get_model( model_name (str | None): The name of the model to retrieve. If None, the default model will be used. label_info (LabelInfoTypes | None): The meta information about the labels. If provided, the number of classes will be updated in the model's configuration. - input_size (tuple[int, ...] | int | None, optional): Input size of the model. Defaults to None. + input_size (tuple[int, int] | int | None, optional): Input size of the model. Defaults to None. Returns: OTXModel: The instantiated OTXModel instance. @@ -286,8 +285,9 @@ def get_model( model_config = deepcopy(self.config["model"]) if input_size is not None: - input_size = (input_size, input_size) if isinstance(input_size, int) else input_size - model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size + model_config["init_args"]["input_size"] = ( + (input_size, input_size) if isinstance(input_size, int) else input_size + ) model_cls = get_model_cls_from_config(Namespace(model_config)) From aee7600c09647a9838577e6d7adff41ae51673d4 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 18:15:05 +0900 Subject: [PATCH 18/42] update docstring --- src/otx/engine/utils/auto_configurator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index b55792f8c77..5574f1fc3d9 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -257,7 +257,9 @@ def get_model( model_name (str | None): The name of the model to retrieve. If None, the default model will be used. label_info (LabelInfoTypes | None): The meta information about the labels. If provided, the number of classes will be updated in the model's configuration. - input_size (tuple[int, int] | int | None, optional): Input size of the model. Defaults to None. + input_size (tuple[int, int] | int | None, optional): + Model input size in the order of height and width or a single integer for a side of a square. + Defaults to None. Returns: OTXModel: The instantiated OTXModel instance. 
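For reference, the int-to-pair normalization that get_model now performs is easy to miss when reading the diff, so here is a minimal standalone sketch of the behavior; normalize_input_size and the plain dict standing in for the parsed model config are illustrative names, not part of the codebase:

    import copy

    def normalize_input_size(input_size):  # tuple[int, int] | int | None
        # A bare integer is interpreted as the side length of a square input.
        if isinstance(input_size, int):
            return (input_size, input_size)
        return input_size

    model_config = {"init_args": {"input_size": (800, 992)}}  # illustrative recipe fragment
    resolved = normalize_input_size(640)
    if resolved is not None:
        updated = copy.deepcopy(model_config)
        # The whole (height, width) pair is replaced; no (1, 3, ...) prefix is kept anymore.
        updated["init_args"]["input_size"] = resolved
        assert updated["init_args"]["input_size"] == (640, 640)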
From 5d6c4815897b11f6e98b9eb29afa83329030ebc4 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 18:47:48 +0900 Subject: [PATCH 19/42] update unit test --- tests/unit/algo/classification/test_efficientnet.py | 6 +++--- tests/unit/algo/classification/test_mobilenet_v3.py | 6 +++--- tests/unit/algo/detection/test_rtmdet.py | 2 +- tests/unit/algo/detection/test_yolox.py | 4 ++-- tests/unit/cli/test_cli.py | 2 +- tests/unit/core/model/test_visual_prompting.py | 4 ++-- tests/unit/engine/test_engine.py | 2 +- tests/unit/engine/utils/test_auto_configurator.py | 2 +- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/algo/classification/test_efficientnet.py b/tests/unit/algo/classification/test_efficientnet.py index fd501ff48ed..45c0681444d 100644 --- a/tests/unit/algo/classification/test_efficientnet.py +++ b/tests/unit/algo/classification/test_efficientnet.py @@ -55,7 +55,7 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_ assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = EfficientNetForMulticlassCls(version="b0", label_info=10, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] @@ -98,7 +98,7 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_ assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = EfficientNetForMultilabelCls(version="b0", label_info=10, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] @@ -141,6 +141,6 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self, fxt_hlabel_data): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = EfficientNetForHLabelCls(version="b0", label_info=fxt_hlabel_data, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] diff --git a/tests/unit/algo/classification/test_mobilenet_v3.py b/tests/unit/algo/classification/test_mobilenet_v3.py index cecfd1d919a..39d4b282b0a 100644 --- a/tests/unit/algo/classification/test_mobilenet_v3.py +++ b/tests/unit/algo/classification/test_mobilenet_v3.py @@ -55,7 +55,7 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_ assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = MobileNetV3ForMulticlassCls(mode="large", label_info=10, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] @@ -98,7 +98,7 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_ assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = MobileNetV3ForMultilabelCls(mode="large", label_info=10, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] @@ -141,6 +141,6 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent assert outputs.has_xai_outputs == explain_mode def test_set_input_size(self, fxt_hlabel_data): - input_size = (1, 3, 300, 300) + input_size = (300, 300) model = MobileNetV3ForHLabelCls(mode="large", label_info=fxt_hlabel_data, input_size=input_size) assert model.model.backbone.in_size == input_size[-2:] diff --git 
a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py index 9344687894c..17f4b7ecc35 100644 --- a/tests/unit/algo/detection/test_rtmdet.py +++ b/tests/unit/algo/detection/test_rtmdet.py @@ -18,7 +18,7 @@ def test_init(self) -> None: assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXt) assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN) assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead) - assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640) + assert otx_rtmdet_tiny.input_size == (640, 640) def test_exporter(self) -> None: otx_rtmdet_tiny = RTMDetTiny(label_info=3) diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py index 29ffdd8172e..fdb8e835ee7 100644 --- a/tests/unit/algo/detection/test_yolox.py +++ b/tests/unit/algo/detection/test_yolox.py @@ -18,10 +18,10 @@ def test_init(self) -> None: assert isinstance(otx_yolox_l.model.backbone, CSPDarknet) assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN) assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead) - assert otx_yolox_l.input_size == (1, 3, 640, 640) + assert otx_yolox_l.input_size == (640, 640) otx_yolox_tiny = YOLOXTINY(label_info=3) - assert otx_yolox_tiny.input_size == (1, 3, 416, 416) + assert otx_yolox_tiny.input_size == (416, 416) def test_exporter(self) -> None: otx_yolox_l = YOLOXL(label_info=3) diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index e0b69b77a54..8fa1581f18a 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -129,7 +129,7 @@ def test_instantiate_classes_set_input_size(self, input_size, fxt_train_argv, mo cli = OTXCLI() cli.instantiate_classes() - assert cli.model.input_size == (1, 3, input_size, input_size) + assert cli.model.input_size == (input_size, input_size) @pytest.fixture() def mock_model_cls(self) -> MagicMock: diff --git a/tests/unit/core/model/test_visual_prompting.py b/tests/unit/core/model/test_visual_prompting.py index 9a3a8709529..01d245a46b9 100644 --- a/tests/unit/core/model/test_visual_prompting.py +++ b/tests/unit/core/model/test_visual_prompting.py @@ -36,7 +36,7 @@ @pytest.fixture() def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel: mocker.patch.object(OTXVisualPromptingModel, "_create_model") - model = OTXVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024)) + model = OTXVisualPromptingModel(label_info=1, input_size=(1024, 1024)) model.model.image_size = 1024 return model @@ -44,7 +44,7 @@ def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel: @pytest.fixture() def otx_zero_shot_visual_prompting_model(mocker) -> OTXZeroShotVisualPromptingModel: mocker.patch.object(OTXZeroShotVisualPromptingModel, "_create_model") - model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024)) + model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1024, 1024)) model.model.image_size = 1024 return model diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py index 1bd9c655cf8..e1695e2d9a2 100644 --- a/tests/unit/engine/test_engine.py +++ b/tests/unit/engine/test_engine.py @@ -69,7 +69,7 @@ def test_model_init(self, tmp_path, mock_datamodule): data_root = "tests/assets/classification_dataset" engine = Engine(work_dir=tmp_path, data_root=data_root) - assert engine._model.input_size == (1, 3, 1234, 1234) + assert engine._model.input_size == (1234, 1234) assert engine._model.label_info.num_classes == 4321 def test_model_setter(self, fxt_engine, mocker) -> None: 
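These tests pin down the convention the series converges on: input_size stores only (height, width), while the batch and channel dimensions are prepended wherever a 4-D NCHW shape is needed. A minimal sketch of the two derived forms, assuming a batch of one RGB image as the patches above do:

    import torch

    input_size = (416, 416)                    # e.g. YOLOX-tiny after this change
    export_shape = (1, 3, *input_size)         # NCHW shape handed to the exporter
    dummy_image = torch.rand(3, *input_size)   # per-image tensor, as in get_dummy_input

    assert export_shape == (1, 3, 416, 416)
    assert dummy_image.shape == torch.Size([3, 416, 416])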
diff --git a/tests/unit/engine/utils/test_auto_configurator.py b/tests/unit/engine/utils/test_auto_configurator.py
index 7bf247020c9..6627e4131ab 100644
--- a/tests/unit/engine/utils/test_auto_configurator.py
+++ b/tests/unit/engine/utils/test_auto_configurator.py
@@ -152,7 +152,7 @@ def test_get_model_set_input_size(self) -> None:

         model = auto_configurator.get_model(label_info=label_info, input_size=input_size)

-        assert model.input_size == (1, 3, input_size, input_size)
+        assert model.input_size == (input_size, input_size)

     def test_get_optimizer(self, fxt_task: OTXTaskType) -> None:
         if fxt_task in {

From ff8ecf9f76b619cd2e314a2ab7d304c2f91b402c Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 19:34:29 +0900
Subject: [PATCH 20/42] adaptive input size supports non-square sizes

---
 src/otx/core/data/utils/utils.py         | 53 ++++++++++++--------
 tests/unit/core/data/utils/test_utils.py | 61 ++++++++++++++++++------
 2 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 07777bda224..9e2ee03837d 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -85,14 +85,17 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     Returns:
         Dict[str, Any]: Robust avg, min, max values for images, and annotations optionally.
             ex) stat = {
-                "image": {"avg": ...},
+                "image": {
+                    "height": {"avg": ...},
+                    "width": {"avg": ...},
+                },
                 "annotation": {
                     "num_per_image": {"avg": ...},
                     "size_of_shape": {"avg": ...},
                 }
             }
     """
-    stat: dict = {}
+    stat: dict = {"image": {}, "annotation": {}}
     if len(dataset) == 0 or max_samples <= 0:
         return stat

@@ -101,14 +104,16 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     rng = np.random.default_rng(42)
     data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples]

-    image_sizes = []
+    height_arr = []
+    width_arr = []
     for idx in data_ids:
         data = dataset.get(id=idx, subset=dataset.name)
         height, width = data.media.size
-        image_sizes.append(np.sqrt(width * height))
-    stat["image"] = compute_robust_scale_statistics(np.array(image_sizes))
+        height_arr.append(height)
+        width_arr.append(width)
+    stat["image"]["height"] = compute_robust_scale_statistics(np.array(height_arr))
+    stat["image"]["width"] = compute_robust_scale_statistics(np.array(width_arr))

-    stat["annotation"] = {}
     num_per_images: list[int] = []
     size_of_shapes: dict[str, list] = defaultdict(list)
     for idx in data_ids:
@@ -181,12 +186,15 @@ def adapt_input_size_to_dataset(
     logger.info("Adapting model input size based on dataset stat")

     stat = compute_robust_dataset_statistics(train_dataset)
-    max_image_size = stat["image"].get("robust_max", 0)
+    max_image_size: list[int] = [
+        stat["image"].get("height", {}).get("robust_max", 0),
+        stat["image"].get("width", {}).get("robust_max", 0),
+    ]
     min_object_size = None

     logger.info(f"-> Current base input size: {base_input_size}")

-    if max_image_size <= 0:
+    if max_image_size[0] <= 0 or max_image_size[1] <= 0:
         return base_input_size

     image_size = max_image_size
@@ -197,31 +205,34 @@
     # -> "avg" size might be preferable for efficiency
     min_object_size = stat.get("annotation", {}).get("size_of_shape", {}).get("robust_min", None)
     if min_object_size is not None and min_object_size > 0:
-        image_size = round(image_size * _MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
+        image_size = [round(val * _MIN_RECOGNIZABLE_OBJECT_SIZE /
min_object_size) for val in image_size] logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}") - if image_size > max_image_size: + if image_size[0] > max_image_size[0]: image_size = max_image_size logger.info(f"-> Restrict to max image size: {image_size}") - if image_size < _MIN_DETECTION_INPUT_SIZE: - image_size = _MIN_DETECTION_INPUT_SIZE + if image_size[0] < _MIN_DETECTION_INPUT_SIZE or image_size[1] < _MIN_DETECTION_INPUT_SIZE: + big_val_idx = 0 if image_size[0] > image_size[1] else 1 + small_val_idx = 1 - big_val_idx + image_size[big_val_idx] = image_size[big_val_idx] * _MIN_DETECTION_INPUT_SIZE // image_size[small_val_idx] + image_size[small_val_idx] = _MIN_DETECTION_INPUT_SIZE logger.info(f"-> Based on minimum object detection input size: {image_size}") - if input_size_multiplier is not None and image_size % input_size_multiplier != 0: - image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier - - input_size = (round(image_size), round(image_size)) + if input_size_multiplier is not None: + for i, val in enumerate(image_size): + if val % input_size_multiplier != 0: + image_size[i] = (val // input_size_multiplier + 1) * input_size_multiplier if downscale_only: - def area(x: tuple[int, int]) -> int: + def area(x: list[int] | tuple[int, int]) -> int: return x[0] * x[1] - if base_input_size and area(input_size) >= area(base_input_size): - logger.info(f"-> Downscale only: {input_size} -> {base_input_size}") + if base_input_size and area(image_size) >= area(base_input_size): + logger.info(f"-> Downscale only: {image_size} -> {base_input_size}") return base_input_size - logger.info(f"-> Adapted input size: {input_size}") - return input_size + logger.info(f"-> Adapted input size: {image_size}") + return tuple(image_size) # type: ignore[return-value] def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None: diff --git a/tests/unit/core/data/utils/test_utils.py b/tests/unit/core/data/utils/test_utils.py index ace8d23250a..606b08f5ef7 100644 --- a/tests/unit/core/data/utils/test_utils.py +++ b/tests/unit/core/data/utils/test_utils.py @@ -109,12 +109,15 @@ def test_compute_robuste_dataset_statistics(mock_dataset): subset = mock_dataset.get_subset("train") stat = compute_robust_dataset_statistics(subset, max_samples=0) - assert len(stat) == 0 + assert stat["image"] == {} + assert stat["annotation"] == {} stat = compute_robust_dataset_statistics(subset, max_samples=-1) - assert len(stat) == 0 + assert stat["image"] == {} + assert stat["annotation"] == {} stat = compute_robust_dataset_statistics(subset) - assert np.isclose(stat["image"]["avg"], 100) + assert np.isclose(stat["image"]["height"]["avg"], 100) + assert np.isclose(stat["image"]["width"]["avg"], 100) assert np.isclose(stat["annotation"]["num_per_image"]["avg"], 1.0) assert np.isclose(stat["annotation"]["size_of_shape"]["avg"], 10.0) @@ -135,22 +138,52 @@ def test_adapt_input_size_to_dataset(mocker): input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) assert input_size == (512, 512) - mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}} + mock_stat.return_value = { + "image": { + "height": {"robust_max": 150}, + "width": {"robust_max": 200}, + }, + "annotation": {}, + } input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) - assert input_size == (150, 150) - - mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}} + assert input_size == (150, 200) + + mock_stat.return_value 
= { + "image": { + "height": {"robust_max": 150}, + "width": {"robust_max": 200}, + }, + "annotation": {}, + } input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512, input_size_multiplier=32) - assert input_size == (160, 160) - - mock_stat.return_value = {"image": {"robust_max": 256}, "annotation": {"size_of_shape": {"robust_min": 64}}} + assert input_size == (160, 224) + + mock_stat.return_value = { + "image": { + "height": {"robust_max": 224}, + "width": {"robust_max": 240}, + }, + "annotation": {"size_of_shape": {"robust_min": 64}}, + } input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) - assert input_size == (256, 256) - - mock_stat.return_value = {"image": {"robust_max": 1024}, "annotation": {"size_of_shape": {"robust_min": 64}}} + assert input_size == (256, 274) + + mock_stat.return_value = { + "image": { + "height": {"robust_max": 1024}, + "width": {"robust_max": 1200}, + }, + "annotation": {"size_of_shape": {"robust_min": 64}}, + } input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512) assert input_size == (512, 512) - mock_stat.return_value = {"image": {"robust_max": 2045}, "annotation": {"size_of_shape": {"robust_min": 64}}} + mock_stat.return_value = { + "image": { + "height": {"robust_max": 2045}, + "width": {"robust_max": 2045}, + }, + "annotation": {"size_of_shape": {"robust_min": 64}}, + } input_size = adapt_input_size_to_dataset(dataset=MagicMock(), downscale_only=False, base_input_size=512) assert input_size == (1022, 1022) From 82e41e0537d792135befb0333a25354e6fd5379b Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 19:35:44 +0900 Subject: [PATCH 21/42] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c87e31181..806f44d8ed0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ All notable changes to this project will be documented in this file. (https://github.com/openvinotoolkit/training_extensions/pull/3762) - Add RTMPose for Keypoint Detection Task (https://github.com/openvinotoolkit/training_extensions/pull/3781) +- Support configurable input size + (https://github.com/openvinotoolkit/training_extensions/pull/3788) ### Enhancements From 4e8ce701347e281990b19562d52c3b900546114c Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Fri, 9 Aug 2024 20:21:09 +0900 Subject: [PATCH 22/42] fix typo --- .../algo/classification/torchvision_model.py | 2 +- src/otx/algo/segmentation/huggingface_model.py | 2 +- src/otx/core/exporter/visual_prompting.py | 2 +- tests/unit/core/model/test_base.py | 2 +- tests/unit/core/model/test_classification.py | 18 +++++++++--------- tests/unit/core/model/test_segmentation.py | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py index e90c4776e1e..fec454381ed 100644 --- a/src/otx/algo/classification/torchvision_model.py +++ b/src/otx/algo/classification/torchvision_model.py @@ -404,7 +404,7 @@ class OTXTVModel(OTXModel): task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional): The type of classification task. train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training. 
- input_size (tuple[int, ...], optional): + input_size (tuple[int, int], optional): Model input size in the order of height and width. Defaults to (224, 224) """ diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index c60964c27c8..f00a7faceb6 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -96,7 +96,7 @@ def _create_model(self) -> nn.Module: if self.input_size[0] % patch_size != 0 or self.input_size[1] % patch_size != 0: msg = ( f"It's recommended to set the input size to multiple of patch size({patch_size}). " - "If not, score can decrease or model can't work." + "If not, score can decrease or model may not work." ) logger.warning(msg) diff --git a/src/otx/core/exporter/visual_prompting.py b/src/otx/core/exporter/visual_prompting.py index 6b3d3970120..ea40073686b 100644 --- a/src/otx/core/exporter/visual_prompting.py +++ b/src/otx/core/exporter/visual_prompting.py @@ -175,7 +175,7 @@ def get_onnx_dummy_inputs( model.image_embedding_size, dtype=torch.float32, ), - "point_coords": torch.randint(low=0, high=self.input_size[-1], size=(1, 2, 2), dtype=torch.float32), + "point_coords": torch.randint(low=0, high=self.input_size[0], size=(1, 2, 2), dtype=torch.float32), "point_labels": torch.randint(low=0, high=4, size=(1, 2), dtype=torch.float32), "mask_input": torch.randn( 1, diff --git a/tests/unit/core/model/test_base.py b/tests/unit/core/model/test_base.py index d72891cf538..3a24908e99f 100644 --- a/tests/unit/core/model/test_base.py +++ b/tests/unit/core/model/test_base.py @@ -23,7 +23,7 @@ class TestOTXModel: def test_init(self, monkeypatch): monkeypatch.setattr(OTXModel, "input_size_multiplier", 10, raising=False) with pytest.raises(ValueError, match="Input size should be a multiple"): - OTXModel(label_info=2, input_size=(1, 3, 1024, 1024)) + OTXModel(label_info=2, input_size=(1024, 1024)) def test_smart_weight_loading(self, mocker) -> None: with mocker.patch.object(OTXModel, "_create_model", return_value=MockNNModule(2)): diff --git a/tests/unit/core/model/test_classification.py b/tests/unit/core/model/test_classification.py index 835ed854e20..352bf9a331d 100644 --- a/tests/unit/core/model/test_classification.py +++ b/tests/unit/core/model/test_classification.py @@ -37,7 +37,7 @@ def test_export_parameters( ) -> None: model = OTXMulticlassClsModel( label_info=1, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -51,7 +51,7 @@ def test_export_parameters( model = OTXMultilabelClsModel( label_info=1, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -62,7 +62,7 @@ def test_export_parameters( model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -79,7 +79,7 @@ def test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXMulticlassClsModel( label_info=1, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -110,7 +110,7 @@ def test_export_parameters( ) -> None: model = OTXMultilabelClsModel( label_info=1, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -130,7 +130,7 @@ def 
test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXMultilabelClsModel( label_info=1, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -162,7 +162,7 @@ def test_export_parameters( ) -> None: model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -183,7 +183,7 @@ def test_convert_pred_entity_to_compute_metric( ) -> None: model = OTXHlabelClsModel( label_info=fxt_hlabel_multilabel_info, - input_size=(1, 3, 224, 224), + input_size=(224, 224), torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -207,7 +207,7 @@ def test_convert_pred_entity_to_compute_metric( assert "target" in metric_input def test_set_label_info(self, fxt_hlabel_multilabel_info): - model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(1, 3, 224, 224)) + model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(224, 224)) assert model.label_info.num_multilabel_classes == fxt_hlabel_multilabel_info.num_multilabel_classes fxt_hlabel_multilabel_info.num_multilabel_classes = 0 diff --git a/tests/unit/core/model/test_segmentation.py b/tests/unit/core/model/test_segmentation.py index 32da4815475..130aa3a96dd 100644 --- a/tests/unit/core/model/test_segmentation.py +++ b/tests/unit/core/model/test_segmentation.py @@ -46,7 +46,7 @@ def torch_compile(): class TestOTXSegmentationModel: @pytest.fixture() def model(self, label_info, optimizer, scheduler, metric, torch_compile): - return OTXSegmentationModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile) + return OTXSegmentationModel(label_info, (512, 512), optimizer, scheduler, metric, torch_compile) def test_export_parameters(self, model): params = model._export_parameters @@ -74,7 +74,7 @@ def test_dispatch_label_info(self, model, label_info, expected_label_info): class TestTorchVisionCompatibleModel: @pytest.fixture() def model(self, label_info, optimizer, scheduler, metric, torch_compile) -> TorchVisionCompatibleModel: - return TorchVisionCompatibleModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile) + return TorchVisionCompatibleModel(label_info, (512, 512), optimizer, scheduler, metric, torch_compile) @pytest.fixture() def batch_data_entity(self): From 9260a8c5e04014f7589045d6e72a8d5882652993 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 09:15:10 +0900 Subject: [PATCH 23/42] fix typo --- src/otx/core/model/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 678203447dd..f48f026b585 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -449,7 +449,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo: def get_dummy_input(self, batch_size: int = 1) -> HlabelClsBatchDataEntity: """Returns a dummy input for classification OV model.""" - images = [torch.rand(3, self.input_size) for _ in range(batch_size)] + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] labels = [torch.LongTensor([0])] * batch_size return HlabelClsBatchDataEntity(batch_size, images, [], labels=labels) From d40a9f06cafaeae6fe8c29a9228d50c9040b7af1 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 
Aug 2024 10:13:35 +0900 Subject: [PATCH 24/42] update base data pipeline --- src/otx/recipe/_base_/data/anomaly.yaml | 4 +++- .../recipe/_base_/data/classification.yaml | 4 +++- src/otx/recipe/_base_/data/detection.yaml | 12 +++++++--- .../_base_/data/instance_segmentation.yaml | 12 +++++++--- .../_base_/data/keypoint_detection.yaml | 24 +++++++++---------- .../recipe/_base_/data/rotated_detection.yaml | 12 +++++++--- .../_base_/data/semantic_segmentation.yaml | 12 +++++++--- .../_base_/data/torchvision_semisl.yaml | 4 +++- .../recipe/_base_/data/visual_prompting.yaml | 12 +++++++--- 9 files changed, 66 insertions(+), 30 deletions(-) diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml index 2f74b987915..29d4471d9a6 100644 --- a/src/otx/recipe/_base_/data/anomaly.yaml +++ b/src/otx/recipe/_base_/data/anomaly.yaml @@ -1,5 +1,4 @@ task: ANOMALY_CLASSIFICATION -input_size: 256 data_format: mvtec mem_cache_size: 1GB mem_cache_img_max_size: null @@ -7,6 +6,7 @@ image_color_channel: RGB stack_images: false unannotated_items_ratio: 0.0 train_subset: + input_size: 256 subset_name: train transform_lib_type: TORCHVISION to_tv_image: true @@ -30,6 +30,7 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: 256 subset_name: test transform_lib_type: TORCHVISION to_tv_image: true @@ -53,6 +54,7 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: 256 subset_name: test transform_lib_type: TORCHVISION to_tv_image: true diff --git a/src/otx/recipe/_base_/data/classification.yaml b/src/otx/recipe/_base_/data/classification.yaml index e8ee41bf15e..04b675d8774 100644 --- a/src/otx/recipe/_base_/data/classification.yaml +++ b/src/otx/recipe/_base_/data/classification.yaml @@ -1,5 +1,4 @@ task: MULTI_CLASS_CLS -input_size: 224 mem_cache_size: 1GB mem_cache_img_max_size: - 500 @@ -9,6 +8,7 @@ stack_images: true data_format: imagenet_with_subset_dirs unannotated_items_ratio: 0.0 train_subset: + input_size: 224 subset_name: train transform_lib_type: TORCHVISION batch_size: 64 @@ -34,6 +34,7 @@ train_subset: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler val_subset: + input_size: 224 subset_name: val transform_lib_type: TORCHVISION batch_size: 64 @@ -56,6 +57,7 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: 224 subset_name: test transform_lib_type: TORCHVISION batch_size: 64 diff --git a/src/otx/recipe/_base_/data/detection.yaml b/src/otx/recipe/_base_/data/detection.yaml index c08a5fea022..fa88d02b3fd 100644 --- a/src/otx/recipe/_base_/data/detection.yaml +++ b/src/otx/recipe/_base_/data/detection.yaml @@ -1,7 +1,4 @@ task: DETECTION -input_size: - - 800 - - 992 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -9,6 +6,9 @@ stack_images: true data_format: coco_instances unannotated_items_ratio: 0.0 train_subset: + input_size: + - 800 + - 992 subset_name: train transform_lib_type: TORCHVISION batch_size: 1 @@ -35,6 +35,9 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: + - 800 + - 992 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -56,6 +59,9 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: + - 800 + - 992 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 diff --git a/src/otx/recipe/_base_/data/instance_segmentation.yaml b/src/otx/recipe/_base_/data/instance_segmentation.yaml index 3520f3930a7..299bf488d4f 100644 --- 
a/src/otx/recipe/_base_/data/instance_segmentation.yaml +++ b/src/otx/recipe/_base_/data/instance_segmentation.yaml @@ -1,7 +1,4 @@ task: INSTANCE_SEGMENTATION -input_size: - - 1024 - - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -10,6 +7,9 @@ data_format: coco_instances include_polygons: true unannotated_items_ratio: 0.0 train_subset: + input_size: + - 1024 + - 1024 subset_name: train transform_lib_type: TORCHVISION batch_size: 1 @@ -41,6 +41,9 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: + - 1024 + - 1024 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -66,6 +69,9 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: + - 1024 + - 1024 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index c466aa657bb..b42d24775b4 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -6,15 +6,15 @@ data_format: coco_person_keypoints unannotated_items_ratio: 0.0 image_color_channel: RGB train_subset: + input_size: + - 192 + - 256 subset_name: train batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: - - 192 - - 256 + init_args: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: @@ -27,15 +27,15 @@ train_subset: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] val_subset: + input_size: + - 192 + - 256 subset_name: val batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: - - 192 - - 256 + init_args: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true @@ -47,15 +47,15 @@ val_subset: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] test_subset: + input_size: + - 192 + - 256 subset_name: test batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: - - 192 - - 256 + init_args: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/_base_/data/rotated_detection.yaml b/src/otx/recipe/_base_/data/rotated_detection.yaml index 8ac4759ffc5..1d41b9b3c82 100644 --- a/src/otx/recipe/_base_/data/rotated_detection.yaml +++ b/src/otx/recipe/_base_/data/rotated_detection.yaml @@ -1,7 +1,4 @@ task: ROTATED_DETECTION -input_size: - - 1024 - - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -10,6 +7,9 @@ data_format: coco_instances include_polygons: true unannotated_items_ratio: 0.0 train_subset: + input_size: + - 1024 + - 1024 subset_name: train transform_lib_type: TORCHVISION to_tv_image: false @@ -41,6 +41,9 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: + - 1024 + - 1024 subset_name: val transform_lib_type: TORCHVISION to_tv_image: false @@ -66,6 +69,9 @@ val_subset: class_path: 
torch.utils.data.RandomSampler test_subset: + input_size: + - 1024 + - 1024 subset_name: test transform_lib_type: TORCHVISION to_tv_image: false diff --git a/src/otx/recipe/_base_/data/semantic_segmentation.yaml b/src/otx/recipe/_base_/data/semantic_segmentation.yaml index 52b3dec6f63..2a9ec2d8779 100644 --- a/src/otx/recipe/_base_/data/semantic_segmentation.yaml +++ b/src/otx/recipe/_base_/data/semantic_segmentation.yaml @@ -1,7 +1,4 @@ task: SEMANTIC_SEGMENTATION -input_size: - - 512 - - 512 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -10,6 +7,9 @@ include_polygons: true unannotated_items_ratio: 0.0 ignore_index: 255 train_subset: + input_size: + - 512 + - 512 subset_name: train batch_size: 8 num_workers: 4 @@ -42,6 +42,9 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: + - 512 + - 512 subset_name: val batch_size: 8 num_workers: 4 @@ -64,6 +67,9 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: + - 512 + - 512 subset_name: test num_workers: 4 batch_size: 8 diff --git a/src/otx/recipe/_base_/data/torchvision_semisl.yaml b/src/otx/recipe/_base_/data/torchvision_semisl.yaml index 1b5d630a1ec..25ce95252f6 100644 --- a/src/otx/recipe/_base_/data/torchvision_semisl.yaml +++ b/src/otx/recipe/_base_/data/torchvision_semisl.yaml @@ -1,5 +1,4 @@ task: MULTI_CLASS_CLS -input_size: 224 mem_cache_size: 1GB mem_cache_img_max_size: - 500 @@ -9,6 +8,7 @@ stack_images: True data_format: imagenet_with_subset_dirs unannotated_items_ratio: 0.0 train_subset: + input_size: 224 subset_name: train transform_lib_type: TORCHVISION batch_size: 16 @@ -34,6 +34,7 @@ train_subset: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler val_subset: + input_size: 224 subset_name: val transform_lib_type: TORCHVISION batch_size: 64 @@ -56,6 +57,7 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: 224 subset_name: test transform_lib_type: TORCHVISION batch_size: 64 diff --git a/src/otx/recipe/_base_/data/visual_prompting.yaml b/src/otx/recipe/_base_/data/visual_prompting.yaml index f51287efdec..5fa9188f64f 100644 --- a/src/otx/recipe/_base_/data/visual_prompting.yaml +++ b/src/otx/recipe/_base_/data/visual_prompting.yaml @@ -1,7 +1,4 @@ task: VISUAL_PROMPTING -input_size: - - 1024 - - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -12,6 +9,9 @@ vpm_config: use_bbox: true use_point: false train_subset: + input_size: + - 1024 + - 1024 subset_name: train transform_lib_type: TORCHVISION batch_size: 2 @@ -39,6 +39,9 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: + input_size: + - 1024 + - 1024 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -66,6 +69,9 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: + input_size: + - 1024 + - 1024 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 From c049044f6cbd61122cddb32884c0f9edeabd75fd Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 10:35:46 +0900 Subject: [PATCH 25/42] update keypoint detection --- src/otx/algo/keypoint_detection/rtmpose.py | 41 +++++++++++++++---- src/otx/core/model/keypoint_detection.py | 3 +- .../_base_/data/keypoint_detection.yaml | 9 ++-- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index c552580b557..8388e6d8a52 100644 --- 
a/src/otx/algo/keypoint_detection/rtmpose.py
+++ b/src/otx/algo/keypoint_detection/rtmpose.py
@@ -13,10 +13,17 @@
 from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
+from otx.core.metrics import MetricCallable
+from otx.core.metrics.pck import PCKMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.schedulers import LRSchedulerListCallable
+from otx.core.types.export import TaskLevelExportParameters
+from otx.core.types.label import LabelInfoTypes

 if TYPE_CHECKING:
     from otx.core.exporter.base import OTXModelExporter
     from otx.core.types.export import TaskLevelExportParameters
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable


 class RTMPose(OTXKeypointDetectionModel):
@@ -25,13 +32,13 @@ class RTMPose(OTXKeypointDetectionModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            msg = f"Exporter should have a image_size but it is given by {self.image_size}"
+        if self.input_size is None:
+            msg = f"Exporter should have an input_size but it is given by {self.input_size}"
             raise ValueError(msg)

         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -60,12 +67,30 @@ class RTMPoseTiny(RTMPose):
     """RTMPose Tiny Model."""

     load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth"
-    image_size = (1, 3, 192, 256)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)

+    def __init__(
+        self,
+        label_info: LabelInfoTypes,
+        input_size: tuple[int, int] = (192, 256),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = PCKMeasureCallable,
+        torch_compile: bool = False,
+    ) -> None:
+        self.mean = (0.0, 0.0, 0.0)
+        self.std = (255.0, 255.0, 255.0)
+        super().__init__(
+            label_info=label_info,
+            input_size=input_size,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
+
     def _build_model(self, num_classes: int) -> RTMPose:
-        input_size = (192, 256)
         simcc_split_ratio = 2.0
         sigma = (4.9, 5.66)

@@ -82,13 +107,13 @@ def _build_model(self, num_classes: int) -> RTMPose:
         head = RTMCCHead(
             out_channels=num_classes,
             in_channels=384,
-            input_size=input_size,
-            in_featuremap_size=(input_size[0] // 32, input_size[1] // 32),
+            input_size=self.input_size,
+            in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32),
             simcc_split_ratio=simcc_split_ratio,
             final_layer_kernel_size=7,
             loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True),
             decoder_cfg={
-                "input_size": input_size,
+                "input_size": self.input_size,
                 "simcc_split_ratio": simcc_split_ratio,
                 "sigma": sigma,
                 "normalize": False,
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 69c05ed148e..06f22371deb 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -32,16 +32,17 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet
     def __init__(
         self,
         label_info: LabelInfoTypes,
+
input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = PCKMeasureCallable, torch_compile: bool = False, ) -> None: - self.image_size = (1, 3, 192, 256) self.mean = (0.0, 0.0, 0.0) self.std = (255.0, 255.0, 255.0) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index b42d24775b4..e982b3ac467 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -14,7 +14,8 @@ train_subset: transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: $(input_size) + init_args: + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: @@ -35,7 +36,8 @@ val_subset: transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: $(input_size) + init_args: + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true @@ -55,7 +57,8 @@ test_subset: transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: $(input_size) + init_args: + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true From 9097b3d0a7ca98877bb323a45cd9d414e2e23ba2 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 10:44:33 +0900 Subject: [PATCH 26/42] align with pre-commit --- src/otx/algo/keypoint_detection/rtmpose.py | 11 +++++------ src/otx/core/model/keypoint_detection.py | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index 8388e6d8a52..2e932806a27 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -12,18 +12,17 @@ from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.model.keypoint_detection import OTXKeypointDetectionModel -from otx.core.metrics import MetricCallable from otx.core.metrics.pck import PCKMeasureCallable from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable -from otx.core.schedulers import LRSchedulerListCallable -from otx.core.types.export import TaskLevelExportParameters -from otx.core.types.label import LabelInfoTypes +from otx.core.model.keypoint_detection import OTXKeypointDetectionModel if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from otx.core.exporter.base import OTXModelExporter + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.export import TaskLevelExportParameters - from lightning.pytorch.cli import 
LRSchedulerCallable, OptimizerCallable
+    from otx.core.types.label import LabelInfoTypes


 class RTMPose(OTXKeypointDetectionModel):
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 06f22371deb..02cbb652333 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -48,6 +48,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int]

     @abstractmethod
     def _build_model(self, num_classes: int) -> nn.Module:

From 0adb7ea54bb57b761f75b423167b45772d08d416 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 11:59:37 +0900
Subject: [PATCH 27/42] update docstring

---
 src/otx/cli/cli.py               |  7 ++++---
 src/otx/core/data/module.py      | 19 ++++++++++++++++---
 src/otx/core/data/utils/utils.py |  4 +++-
 src/otx/core/model/base.py       |  5 ++++-
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index d20e30a700e..31ced921c18 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -23,7 +23,6 @@
 from otx.cli.utils.workspace import Workspace
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.imports import get_otx_root_path
-from otx.utils.utils import get_model_cls_from_config

 if TYPE_CHECKING:
     from jsonargparse._actions import _ActionSubCommands
@@ -333,7 +332,9 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             model_config = self.config[self.subcommand].pop("model")

         # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to OTXDataModule
-        if self.config[self.subcommand].data.adaptive_input_size != "none":
+        if self.config[self.subcommand].data.get("adaptive_input_size") is not None:
+            from otx.utils.utils import get_model_cls_from_config
+
             model_cls = get_model_cls_from_config(model_config)
             self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier

@@ -382,7 +383,7 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel:
             tuple: The model and optimizer and scheduler.
         """
         from otx.core.model.base import OTXModel
-        from otx.utils.utils import can_pass_tile_config, should_pass_label_info
+        from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info

         skip = set()

diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index f60ef4cae24..cd1209a7d64 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -39,7 +39,20 @@


 class OTXDataModule(LightningDataModule):
-    """LightningDataModule extension for OTX pipeline."""
+    """LightningDataModule extension for OTX pipeline.
+
+    Args:
+        input_size (int | tuple[int, int] | None, optional):
+            Final image or video shape after data transformation. If it's not None, it is applied
+            to all subset configs. Defaults to None.
+        adaptive_input_size (Literal["auto", "downscale"] | None, optional):
+            The adaptive input size mode. If set, an appropriate input size is found by analyzing the dataset.
+            "auto" can pick an input size either bigger or smaller than the current one, while "downscale"
+            only picks a size smaller than the default setting. Defaults to None.
+        input_size_multiplier (int, optional):
+            If set, adaptive_input_size will find a multiple of input_size_multiplier. It's useful when
+            a model requires its input size to be a multiple of a specific value. Defaults to 1.
+ """ def __init__( # noqa: PLR0913 self, @@ -62,7 +75,7 @@ def __init__( # noqa: PLR0913 auto_num_workers: bool = False, device: DeviceType = DeviceType.auto, input_size: int | tuple[int, int] | None = None, - adaptive_input_size: Literal["auto", "downscale", "none"] = "none", + adaptive_input_size: Literal["auto", "downscale"] | None = None, input_size_multiplier: int = 1, ) -> None: """Constructor.""" @@ -122,7 +135,7 @@ def __init__( # noqa: PLR0913 subset=self.unlabeled_subset.subset_name, ) - if adaptive_input_size != "none": + if adaptive_input_size is not None: input_size = adapt_input_size_to_dataset( dataset, input_size, diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 9e2ee03837d..d4651ab5a9e 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -231,8 +231,10 @@ def area(x: list[int] | tuple[int, int]) -> int: logger.info(f"-> Downscale only: {image_size} -> {base_input_size}") return base_input_size + image_size = tuple(int(val) for val in image_size) # type: ignore[assignment] + logger.info(f"-> Adapted input size: {image_size}") - return tuple(image_size) # type: ignore[return-value] + return image_size # type: ignore[return-value] def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None: diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index ac945d26b71..bd42c668a52 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -96,10 +96,13 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti Attributes: explain_mode: If true, `self.predict_step()` will produce a XAI output as well + input_size_multiplier (int): + multiplier value for input size a model requires. If input_size isn't multiple of this value, + error is raised. 
""" _OPTIMIZED_MODEL_BASE_NAME: str = "optimized_model" - input_size_multiplier = 1 + input_size_multiplier: int = 1 def __init__( self, From 0896ee300fc0f0e3e2513b6d088569f9f04818eb Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 13:13:31 +0900 Subject: [PATCH 28/42] update unit test --- tests/unit/cli/test_cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 8fa1581f18a..3b2501066ce 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -9,7 +9,6 @@ import torch import yaml from otx.cli import OTXCLI, main -from otx.cli import cli as target_file from rich.console import Console @@ -145,7 +144,7 @@ def test_instantiate_classes_set_adaptive_input_size( mock_model_cls, ) -> None: mocker.patch("otx.cli.OTXCLI.run") - mocker.patch.object(target_file, "get_model_cls_from_config", return_value=mock_model_cls) + mocker.patch("otx.utils.utils.get_model_cls_from_config", return_value=mock_model_cls) fxt_train_argv.extend(["--data.adaptive_input_size", "auto"]) monkeypatch.setattr("sys.argv", fxt_train_argv) mock_data_module = mocker.patch("otx.core.data.module.adapt_input_size_to_dataset", return_value=1024) From 05483ced884675279a047d8cab12d08860c71ad7 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 13:55:14 +0900 Subject: [PATCH 29/42] update auto_configurator to use None intead of none --- src/otx/engine/utils/auto_configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 5574f1fc3d9..d95826e5c1c 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -228,7 +228,7 @@ def get_datamodule(self) -> OTXDataModule | None: _ = data_config.pop("__path__", {}) # Remove __path__ key that for CLI _ = data_config.pop("config", {}) # Remove config key that for CLI - if data_config.get("adaptive_input_size", "none") != "none": + if data_config.get("adaptive_input_size") is not None: model_cls = get_model_cls_from_config(Namespace(self.config["model"])) data_config["input_size_multiplier"] = model_cls.input_size_multiplier From 8d34d2c3c0cf5891865238881f92686df1075b8e Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 15:08:19 +0900 Subject: [PATCH 30/42] revert data module policy to apply input_size to subset cfg --- src/otx/core/data/module.py | 3 ++- src/otx/recipe/_base_/data/anomaly.yaml | 4 +--- .../recipe/_base_/data/classification.yaml | 4 +--- src/otx/recipe/_base_/data/detection.yaml | 12 +++-------- .../_base_/data/instance_segmentation.yaml | 12 +++-------- .../_base_/data/keypoint_detection.yaml | 21 ++++++++----------- .../recipe/_base_/data/rotated_detection.yaml | 12 +++-------- .../_base_/data/semantic_segmentation.yaml | 12 +++-------- .../_base_/data/torchvision_semisl.yaml | 4 +--- .../recipe/_base_/data/visual_prompting.yaml | 12 +++-------- src/otx/recipe/detection/yolox_tiny.yaml | 18 ++++++---------- tests/unit/core/data/test_module.py | 12 +++++------ 12 files changed, 41 insertions(+), 85 deletions(-) diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index cd1209a7d64..06f62f1c614 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -144,7 +144,8 @@ def __init__( # noqa: PLR0913 ) if input_size is not None: for subset_cfg in [train_subset, 
val_subset, test_subset, unlabeled_subset]: - subset_cfg.input_size = input_size + if subset_cfg.input_size is None: + subset_cfg.input_size = input_size self.input_size = input_size if self.tile_config.enable_tiler and self.tile_config.enable_adaptive_tiling: diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml index 29d4471d9a6..2f74b987915 100644 --- a/src/otx/recipe/_base_/data/anomaly.yaml +++ b/src/otx/recipe/_base_/data/anomaly.yaml @@ -1,4 +1,5 @@ task: ANOMALY_CLASSIFICATION +input_size: 256 data_format: mvtec mem_cache_size: 1GB mem_cache_img_max_size: null @@ -6,7 +7,6 @@ image_color_channel: RGB stack_images: false unannotated_items_ratio: 0.0 train_subset: - input_size: 256 subset_name: train transform_lib_type: TORCHVISION to_tv_image: true @@ -30,7 +30,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: 256 subset_name: test transform_lib_type: TORCHVISION to_tv_image: true @@ -54,7 +53,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: 256 subset_name: test transform_lib_type: TORCHVISION to_tv_image: true diff --git a/src/otx/recipe/_base_/data/classification.yaml b/src/otx/recipe/_base_/data/classification.yaml index 04b675d8774..e8ee41bf15e 100644 --- a/src/otx/recipe/_base_/data/classification.yaml +++ b/src/otx/recipe/_base_/data/classification.yaml @@ -1,4 +1,5 @@ task: MULTI_CLASS_CLS +input_size: 224 mem_cache_size: 1GB mem_cache_img_max_size: - 500 @@ -8,7 +9,6 @@ stack_images: true data_format: imagenet_with_subset_dirs unannotated_items_ratio: 0.0 train_subset: - input_size: 224 subset_name: train transform_lib_type: TORCHVISION batch_size: 64 @@ -34,7 +34,6 @@ train_subset: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler val_subset: - input_size: 224 subset_name: val transform_lib_type: TORCHVISION batch_size: 64 @@ -57,7 +56,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: 224 subset_name: test transform_lib_type: TORCHVISION batch_size: 64 diff --git a/src/otx/recipe/_base_/data/detection.yaml b/src/otx/recipe/_base_/data/detection.yaml index fa88d02b3fd..c08a5fea022 100644 --- a/src/otx/recipe/_base_/data/detection.yaml +++ b/src/otx/recipe/_base_/data/detection.yaml @@ -1,4 +1,7 @@ task: DETECTION +input_size: + - 800 + - 992 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -6,9 +9,6 @@ stack_images: true data_format: coco_instances unannotated_items_ratio: 0.0 train_subset: - input_size: - - 800 - - 992 subset_name: train transform_lib_type: TORCHVISION batch_size: 1 @@ -35,9 +35,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: - - 800 - - 992 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -59,9 +56,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: - - 800 - - 992 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 diff --git a/src/otx/recipe/_base_/data/instance_segmentation.yaml b/src/otx/recipe/_base_/data/instance_segmentation.yaml index 299bf488d4f..3520f3930a7 100644 --- a/src/otx/recipe/_base_/data/instance_segmentation.yaml +++ b/src/otx/recipe/_base_/data/instance_segmentation.yaml @@ -1,4 +1,7 @@ task: INSTANCE_SEGMENTATION +input_size: + - 1024 + - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -7,9 +10,6 @@ data_format: coco_instances include_polygons: true unannotated_items_ratio: 0.0 train_subset: - input_size: - 
- 1024 - - 1024 subset_name: train transform_lib_type: TORCHVISION batch_size: 1 @@ -41,9 +41,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: - - 1024 - - 1024 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -69,9 +66,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: - - 1024 - - 1024 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index e982b3ac467..c466aa657bb 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -6,16 +6,15 @@ data_format: coco_person_keypoints unannotated_items_ratio: 0.0 image_color_channel: RGB train_subset: - input_size: - - 192 - - 256 subset_name: train batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: $(input_size) + input_size: + - 192 + - 256 - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: @@ -28,16 +27,15 @@ train_subset: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] val_subset: - input_size: - - 192 - - 256 subset_name: val batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: $(input_size) + input_size: + - 192 + - 256 - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true @@ -49,16 +47,15 @@ val_subset: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] test_subset: - input_size: - - 192 - - 256 subset_name: test batch_size: 32 transforms: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: $(input_size) + input_size: + - 192 + - 256 - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/_base_/data/rotated_detection.yaml b/src/otx/recipe/_base_/data/rotated_detection.yaml index 1d41b9b3c82..8ac4759ffc5 100644 --- a/src/otx/recipe/_base_/data/rotated_detection.yaml +++ b/src/otx/recipe/_base_/data/rotated_detection.yaml @@ -1,4 +1,7 @@ task: ROTATED_DETECTION +input_size: + - 1024 + - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -7,9 +10,6 @@ data_format: coco_instances include_polygons: true unannotated_items_ratio: 0.0 train_subset: - input_size: - - 1024 - - 1024 subset_name: train transform_lib_type: TORCHVISION to_tv_image: false @@ -41,9 +41,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: - - 1024 - - 1024 subset_name: val transform_lib_type: TORCHVISION to_tv_image: false @@ -69,9 +66,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: - - 1024 - - 1024 subset_name: test transform_lib_type: TORCHVISION to_tv_image: false diff --git a/src/otx/recipe/_base_/data/semantic_segmentation.yaml b/src/otx/recipe/_base_/data/semantic_segmentation.yaml index 2a9ec2d8779..52b3dec6f63 100644 --- a/src/otx/recipe/_base_/data/semantic_segmentation.yaml +++ 
b/src/otx/recipe/_base_/data/semantic_segmentation.yaml @@ -1,4 +1,7 @@ task: SEMANTIC_SEGMENTATION +input_size: + - 512 + - 512 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -7,9 +10,6 @@ include_polygons: true unannotated_items_ratio: 0.0 ignore_index: 255 train_subset: - input_size: - - 512 - - 512 subset_name: train batch_size: 8 num_workers: 4 @@ -42,9 +42,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: - - 512 - - 512 subset_name: val batch_size: 8 num_workers: 4 @@ -67,9 +64,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: - - 512 - - 512 subset_name: test num_workers: 4 batch_size: 8 diff --git a/src/otx/recipe/_base_/data/torchvision_semisl.yaml b/src/otx/recipe/_base_/data/torchvision_semisl.yaml index 25ce95252f6..1b5d630a1ec 100644 --- a/src/otx/recipe/_base_/data/torchvision_semisl.yaml +++ b/src/otx/recipe/_base_/data/torchvision_semisl.yaml @@ -1,4 +1,5 @@ task: MULTI_CLASS_CLS +input_size: 224 mem_cache_size: 1GB mem_cache_img_max_size: - 500 @@ -8,7 +9,6 @@ stack_images: True data_format: imagenet_with_subset_dirs unannotated_items_ratio: 0.0 train_subset: - input_size: 224 subset_name: train transform_lib_type: TORCHVISION batch_size: 16 @@ -34,7 +34,6 @@ train_subset: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler val_subset: - input_size: 224 subset_name: val transform_lib_type: TORCHVISION batch_size: 64 @@ -57,7 +56,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: 224 subset_name: test transform_lib_type: TORCHVISION batch_size: 64 diff --git a/src/otx/recipe/_base_/data/visual_prompting.yaml b/src/otx/recipe/_base_/data/visual_prompting.yaml index 5fa9188f64f..f51287efdec 100644 --- a/src/otx/recipe/_base_/data/visual_prompting.yaml +++ b/src/otx/recipe/_base_/data/visual_prompting.yaml @@ -1,4 +1,7 @@ task: VISUAL_PROMPTING +input_size: + - 1024 + - 1024 mem_cache_size: 1GB mem_cache_img_max_size: null image_color_channel: RGB @@ -9,9 +12,6 @@ vpm_config: use_bbox: true use_point: false train_subset: - input_size: - - 1024 - - 1024 subset_name: train transform_lib_type: TORCHVISION batch_size: 2 @@ -39,9 +39,6 @@ train_subset: class_path: torch.utils.data.RandomSampler val_subset: - input_size: - - 1024 - - 1024 subset_name: val transform_lib_type: TORCHVISION batch_size: 1 @@ -69,9 +66,6 @@ val_subset: class_path: torch.utils.data.RandomSampler test_subset: - input_size: - - 1024 - - 1024 subset_name: test transform_lib_type: TORCHVISION batch_size: 1 diff --git a/src/otx/recipe/detection/yolox_tiny.yaml b/src/otx/recipe/detection/yolox_tiny.yaml index bdeee86606c..744dc3e72a7 100644 --- a/src/otx/recipe/detection/yolox_tiny.yaml +++ b/src/otx/recipe/detection/yolox_tiny.yaml @@ -37,24 +37,24 @@ overrides: gradient_clip_val: 35.0 data: + input_size: + - 416 + - 416 train_subset: - input_size: - - 640 - - 640 batch_size: 8 transforms: - class_path: otx.core.data.transform_libs.torchvision.CachedMosaic init_args: random_pop: false max_cached_images: 20 - img_scale: $(input_size) # (H, W) + img_scale: $(input_size) * 1.538 # 640x640 - class_path: otx.core.data.transform_libs.torchvision.RandomAffine init_args: - border: $(input_size) * -0.5 + border: $(input_size) * 1.538 * -0.5 # 640x640 * -0.5 - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: - scale: $(input_size) + scale: $(input_size) * 1.538 # 640x640 
keep_ratio: true transform_bbox: true - class_path: otx.core.data.transform_libs.torchvision.RandomFlip @@ -76,9 +76,6 @@ overrides: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler val_subset: - input_size: - - 416 - - 416 batch_size: 8 transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize @@ -99,9 +96,6 @@ overrides: std: [58.395, 57.12, 57.375] test_subset: - input_size: - - 416 - - 416 batch_size: 8 transforms: - class_path: otx.core.data.transform_libs.torchvision.Resize diff --git a/tests/unit/core/data/test_module.py b/tests/unit/core/data/test_module.py index 6cb04105b0d..e5365406ddc 100644 --- a/tests/unit/core/data/test_module.py +++ b/tests/unit/core/data/test_module.py @@ -144,7 +144,7 @@ def test_init_input_size( # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)} mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets - fxt_config.train_subset.input_size = (1000, 1000) + fxt_config.train_subset.input_size = None fxt_config.val_subset.input_size = None fxt_config.test_subset.input_size = (800, 800) @@ -160,7 +160,7 @@ def test_init_input_size( assert fxt_config.train_subset.input_size == (1200, 1200) assert fxt_config.val_subset.input_size == (1200, 1200) - assert fxt_config.test_subset.input_size == (1200, 1200) + assert fxt_config.test_subset.input_size == (800, 800) @pytest.fixture() def mock_adapt_input_size_to_dataset(self, mocker) -> MagicMock: @@ -177,9 +177,9 @@ def test_init_adaptive_input_size( # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)} mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets - fxt_config.train_subset.input_size = (1000, 1000) - fxt_config.val_subset.input_size = None - fxt_config.test_subset.input_size = (800, 800) + fxt_config.train_subset.input_size = None + fxt_config.val_subset.input_size = (1000, 1000) + fxt_config.test_subset.input_size = None OTXDataModule( task=OTXTaskType.MULTI_CLASS_CLS, @@ -192,7 +192,7 @@ def test_init_adaptive_input_size( ) assert fxt_config.train_subset.input_size == (1234, 1234) - assert fxt_config.val_subset.input_size == (1234, 1234) + assert fxt_config.val_subset.input_size == (1000, 1000) assert fxt_config.test_subset.input_size == (1234, 1234) @pytest.fixture() From 982c9854114292dce2e3039ed27fba545facb80f Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 17:26:46 +0900 Subject: [PATCH 31/42] revert keypoint detection --- src/otx/algo/keypoint_detection/rtmpose.py | 40 ++++--------------- .../core/data/transform_libs/torchvision.py | 10 ++--- src/otx/core/model/keypoint_detection.py | 4 +- .../_base_/data/keypoint_detection.yaml | 18 ++++----- 4 files changed, 22 insertions(+), 50 deletions(-) diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index 8f06378adf1..0086acd80dd 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -13,18 +13,12 @@ from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator from otx.core.exporter.native import OTXNativeModelExporter -from otx.core.metrics.pck import PCKMeasureCallable -from otx.core.model.base import DefaultOptimizerCallable, 
DefaultSchedulerCallable from otx.core.model.keypoint_detection import OTXKeypointDetectionModel from torch import nn if TYPE_CHECKING: - from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from otx.core.exporter.base import OTXModelExporter - from otx.core.metrics import MetricCallable - from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.export import TaskLevelExportParameters - from otx.core.types.label import LabelInfoTypes class RTMPose(OTXKeypointDetectionModel): @@ -33,13 +27,13 @@ class RTMPose(OTXKeypointDetectionModel): @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.input_size is None: - msg = f"Exporter should have a input_size but it is given by {self.input_size}" + if self.image_size is None: + msg = f"Exporter should have a image_size but it is given by {self.image_size}" raise ValueError(msg) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=(1, 3, *self.input_size), + input_size=self.image_size, mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -68,30 +62,12 @@ class RTMPoseTiny(RTMPose): """RTMPose Tiny Model.""" load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth" + image_size = (1, 3, 192, 256) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) - def __init__( - self, - label_info: LabelInfoTypes, - input_size: tuple[int, int] = (192, 256), - optimizer: OptimizerCallable = DefaultOptimizerCallable, - scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, - metric: MetricCallable = PCKMeasureCallable, - torch_compile: bool = False, - ) -> None: - self.mean = (0.0, 0.0, 0.0) - self.std = (255.0, 255.0, 255.0) - super().__init__( - label_info=label_info, - input_size=input_size, - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - ) - def _build_model(self, num_classes: int) -> RTMPose: + input_size = (192, 256) simcc_split_ratio = 2.0 sigma = (4.9, 5.66) @@ -108,13 +84,13 @@ def _build_model(self, num_classes: int) -> RTMPose: head = RTMCCHead( out_channels=num_classes, in_channels=384, - input_size=self.input_size, - in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32), + input_size=input_size, + in_featuremap_size=(input_size[0] // 32, input_size[1] // 32), simcc_split_ratio=simcc_split_ratio, final_layer_kernel_size=7, loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True), decoder_cfg={ - "input_size": self.input_size, + "input_size": input_size, "simcc_split_ratio": simcc_split_ratio, "sigma": sigma, "normalize": False, diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 8d2ecb259a7..e29ee2989df 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3508,22 +3508,20 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin): the specific codec for more details. Args: - encoder (dict | list[dict]): The codec config for keypoint encoding. - Both single encoder and multiple encoders (given as a list) are - supported - target_type (str, deprecated): This argument is deprecated and has no - effect. Defaults to ``None`` + input_size (tuple[int, int]): Input image size in [w, h] + is_numpy_to_tvtensor (bool): Whether convert outputs to tensor. 
Defaults to False. """ def __init__( self, + input_size: tuple[int, int], is_numpy_to_tvtensor: bool = False, ) -> None: super().__init__() from otx.algo.keypoint_detection.utils.simcc_label import SimCCLabel self.encoder = SimCCLabel( - input_size=(192, 256), + input_size=input_size, sigma=(4.9, 5.66), simcc_split_ratio=2.0, normalize=False, diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py index 02cbb652333..69c05ed148e 100644 --- a/src/otx/core/model/keypoint_detection.py +++ b/src/otx/core/model/keypoint_detection.py @@ -32,23 +32,21 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet def __init__( self, label_info: LabelInfoTypes, - input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = PCKMeasureCallable, torch_compile: bool = False, ) -> None: + self.image_size = (1, 3, 192, 256) self.mean = (0.0, 0.0, 0.0) self.std = (255.0, 255.0, 255.0) super().__init__( label_info=label_info, - input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, ) - self.input_size: tuple[int, int] @abstractmethod def _build_model(self, num_classes: int) -> nn.Module: diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index c466aa657bb..b3ffed5b915 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -5,6 +5,9 @@ stack_images: true data_format: coco_person_keypoints unannotated_items_ratio: 0.0 image_color_channel: RGB +input_size: + - 192 + - 256 train_subset: subset_name: train batch_size: 32 @@ -12,12 +15,11 @@ train_subset: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: - - 192 - - 256 + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: + input_size: $(input_size) is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -33,11 +35,10 @@ val_subset: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: - - 192 - - 256 + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: + input_size: $(input_size) is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -53,11 +54,10 @@ test_subset: - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: - input_size: - - 192 - - 256 + input_size: $(input_size) - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget init_args: + input_size: $(input_size) is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: From f8f9e2867c1ce39677bdb910955063b4f1cb1684 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 18:09:51 +0900 Subject: [PATCH 32/42] add comments to explain a reason of priority in compute_robust_dataset_statistics --- src/otx/core/data/utils/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index d4651ab5a9e..fc0c84dbf58 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -133,6 +133,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = ) stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images)) + # Polygons take priority over other shapes on the assumption that they are more accurate than the others. + # In particular, polygons can be used when both polygons and bboxes exist, as in the instance segmentation task. + # This algorithm needs to be refined to consider not only instance segmentation but also other tasks. if "Polygon" in size_of_shapes: stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes["Polygon"])) else: From 09772d817f5afbecd45fcf1d5ade46e548ae3d49 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Mon, 12 Aug 2024 18:18:36 +0900 Subject: [PATCH 33/42] add integration test --- tests/integration/cli/test_cli.py | 64 +++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 73908782e98..3c126e9d97b 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -7,6 +7,7 @@ import cv2 import pytest +import torch import yaml from otx.core.types.task import OTXTaskType from otx.engine.utils.auto_configurator import DEFAULT_CONFIG_PER_TASK @@ -555,3 +556,66 @@ def test_otx_adaptive_bs_e2e( ] run_main(command_cfg=command_cfg, open_subprocess=fxt_open_subprocess) + + +@pytest.mark.parametrize("task", pytest.TASK_LIST) +def test_otx_configurable_input_size_e2e( + task: OTXTaskType, + tmp_path: Path, + fxt_accelerator: str, + fxt_target_dataset_per_task: dict, + fxt_cli_override_command_per_task: dict, + fxt_open_subprocess: bool, +) -> None: + """ + Test configurable input size e2e commands with the default template of each task. + + Args: + task (OTXTaskType): The task to run with a configured input size. + tmp_path (Path): The temporary path for storing the training outputs.
+ + Returns: + None + """ + if task not in DEFAULT_CONFIG_PER_TASK: + pytest.skip(f"Task {task} is not supported in the auto-configuration.") + if task in [ + OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING, + OTXTaskType.ANOMALY_CLASSIFICATION, + OTXTaskType.ANOMALY_DETECTION, + OTXTaskType.ANOMALY_SEGMENTATION, + OTXTaskType.KEYPOINT_DETECTION, + ]: + pytest.skip(f"{task} doesn't support configurable input size.") + + task = task.lower() + tmp_path_cfg_ipt_size = tmp_path / f"otx_configurable_input_size_{task}" + tmp_path_cfg_ipt_size.mkdir(parents=True) + + command_cfg = [ + "otx", + "train", + "--task", + task.upper(), + "--data_root", + fxt_target_dataset_per_task[task], + "--work_dir", + str(tmp_path_cfg_ipt_size), + "--engine.device", + fxt_accelerator, + "--data.input_size", + str(448), + "--max_epoch", + "1", + *fxt_cli_override_command_per_task[task], + ] + + run_main(command_cfg=command_cfg, open_subprocess=fxt_open_subprocess) + + best_ckpt_files = list(tmp_path_cfg_ipt_size.rglob("best_checkpoint.ckpt")) + assert len(best_ckpt_files) != 0 + best_ckpt = torch.load(best_ckpt_files[0]) + assert best_ckpt["hyper_parameters"]["input_size"] == (448, 448) + for param_name in best_ckpt["datamodule_hyper_parameters"]: + if "subset" in param_name: + assert best_ckpt["datamodule_hyper_parameters"][param_name].input_size == 448 From 5b3f198b40b4c4d22b692a7432ff05006d599e60 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 09:20:01 +0900 Subject: [PATCH 34/42] update unit test --- tests/unit/core/data/transform_libs/test_torchvision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py index 66837a2e609..202a91afb15 100644 --- a/tests/unit/core/data/transform_libs/test_torchvision.py +++ b/tests/unit/core/data/transform_libs/test_torchvision.py @@ -946,7 +946,7 @@ def keypoint_det_entity(self) -> KeypointDetDataEntity: ) def test_forward(self, keypoint_det_entity) -> None: - transform = GenerateTarget() + transform = GenerateTarget(input_size=(192, 256)) results = transform(deepcopy(keypoint_det_entity)) assert hasattr(results, "keypoint_x_labels") From 5fe677768cbd2eec91ce7068992877d0cf9047e1 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 11:02:48 +0900 Subject: [PATCH 35/42] apply input_size to anomaly task --- src/otx/algo/anomaly/padim.py | 6 +++++- src/otx/algo/anomaly/stfpm.py | 6 +++++- src/otx/core/model/anomaly.py | 29 +++++++++++++++-------------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py index ab9a6ddb1a3..f667efa897d 100644 --- a/src/otx/algo/anomaly/padim.py +++ b/src/otx/algo/anomaly/padim.py @@ -34,6 +34,8 @@ class Padim(OTXAnomaly, AnomalibPadim): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. + input_size (tuple[int, int], optional): + Model input size in the order of height and width. 
Defaults to (256, 256). """ def __init__( @@ -47,8 +49,9 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, + input_size: tuple[int, int] = (256, 256), ) -> None: - OTXAnomaly.__init__(self) + OTXAnomaly.__init__(self, input_size) AnomalibPadim.__init__( self, backbone=backbone, @@ -57,6 +60,7 @@ def __init__( n_features=n_features, ) self.task = task + self.input_size = input_size def configure_optimizers(self) -> tuple[list[Optimizer], list[Optimizer]] | None: """PADIM doesn't require optimization, therefore returns no optimizers.""" diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index c9ddb4cd93c..614d3ad52f9 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ b/src/otx/algo/anomaly/stfpm.py @@ -32,6 +32,8 @@ class Stfpm(OTXAnomaly, AnomalibStfpm): task (Literal[ OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION. + input_size (tuple[int, int], optional): + Model input size in the order of height and width. Defaults to (256, 256). """ def __init__( self, @@ -43,15 +45,17 @@ def __init__( OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION, ] = OTXTaskType.ANOMALY_CLASSIFICATION, + input_size: tuple[int, int] = (256, 256), **kwargs, ) -> None: - OTXAnomaly.__init__(self) + OTXAnomaly.__init__(self, input_size=input_size) AnomalibStfpm.__init__( self, backbone=backbone, layers=layers, ) self.task = task + self.input_size = input_size @property def trainable_model(self) -> str: diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index a4f57c4fe0e..68abff41a59 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -50,13 +50,17 @@ class OTXAnomaly(OTXModel): - """Methods used to make OTX model compatible with the Anomalib model.""" + """Methods used to make OTX model compatible with the Anomalib model. Args: + input_size (tuple[int, int]): + Model input size in the order of height and width.
+ """ + + def __init__(self, input_size: tuple[int, int]) -> None: + super().__init__(label_info=AnomalyLabelInfo(), input_size=input_size) self.optimizer: list[OptimizerCallable] | OptimizerCallable = None self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None - self._input_size: tuple[int, int] = (256, 256) self.trainer: Trainer self.model: nn.Module self.image_threshold: BaseThreshold @@ -116,17 +120,15 @@ def task(self, value: OTXTaskType) -> None: def _get_values_from_transforms( self, - ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]: + ) -> tuple[tuple[float, float, float], tuple[float, float, float]]: """Get the value requested value from default transforms.""" - image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) + mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375) for transform in self.configure_transforms().transforms: # type: ignore[attr-defined] name = transform.__class__.__name__ - if "Resize" in name: - image_size = tuple(transform.size) # type: ignore[assignment] - elif "Normalize" in name: + if "Normalize" in name: mean_value = tuple(value * 255 for value in transform.mean) # type: ignore[assignment] std_value = tuple(value * 255 for value in transform.std) # type: ignore[assignment] - return image_size, mean_value, std_value + return mean_value, std_value @property def trainable_model(self) -> str | None: @@ -243,7 +245,7 @@ def _exporter(self) -> OTXAnomalyModelExporter: """Creates OTXAnomalyModelExporter object that can export anomaly models.""" min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist() max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist() - image_shape, mean_values, scale_values = self._get_values_from_transforms() + mean_values, scale_values = self._get_values_from_transforms() onnx_export_configuration = { "opset_version": 14, "dynamic_axes": {"input": {0: "batch_size"}, "output": {0: "batch_size"}}, @@ -251,7 +253,7 @@ def _exporter(self) -> OTXAnomalyModelExporter: "output_names": ["output"], } return OTXAnomalyModelExporter( - image_shape=image_shape, + image_shape=self.input_size, image_threshold=self.image_threshold.value.cpu().numpy().tolist(), pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(), task=self.task, @@ -299,8 +301,7 @@ def export( def get_dummy_input(self, batch_size: int = 1) -> AnomalyModelInputs: """Returns a dummy input for anomaly model.""" - image_size, _, _ = self._get_values_from_transforms() - images = torch.rand(batch_size, 3, *image_size) + images = torch.rand(batch_size, 3, *self.input_size) infos = [] for i, img in enumerate(images): infos.append( From 9dccf2d0550c5bff84e61d09dd080a4017ff2c0a Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 11:05:22 +0900 Subject: [PATCH 36/42] update docstring --- src/otx/core/data/transform_libs/torchvision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index e29ee2989df..3ea0b063e00 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3508,7 +3508,7 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin): the specific codec for more details. 
Args: - input_size (tuple[int, int]): Input image size in [w, h] + input_size (tuple[int, int]): Input image size in [w, h] TODO[wonjulee]: need to change order of shape is_numpy_to_tvtensor (bool): Whether convert outputs to tensor. Defaults to False. """ From 081c94bc6a290edd5c133aebee3d75114852e31e Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 11:12:09 +0900 Subject: [PATCH 37/42] remove unused comment --- src/otx/cli/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py index 31ced921c18..81b006d4d74 100644 --- a/src/otx/cli/cli.py +++ b/src/otx/cli/cli.py @@ -345,7 +345,6 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None: # pass OTXDataModule input size to the model if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]: - # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check model_config["init_args"]["input_size"] = ( (input_size, input_size) if isinstance(input_size, int) else tuple(input_size) ) From 4ee155161770a5d1c53b855c0321f63e092831b3 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 13:45:49 +0900 Subject: [PATCH 38/42] re-enable anomaly integration test --- tests/integration/cli/test_cli.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 3c126e9d97b..603f648dd4b 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -579,14 +579,10 @@ def test_otx_configurable_input_size_e2e( """ if task not in DEFAULT_CONFIG_PER_TASK: pytest.skip(f"Task {task} is not supported in the auto-configuration.") - if task in [ - OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING, - OTXTaskType.ANOMALY_CLASSIFICATION, - OTXTaskType.ANOMALY_DETECTION, - OTXTaskType.ANOMALY_SEGMENTATION, - OTXTaskType.KEYPOINT_DETECTION, - ]: + if task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: pytest.skip(f"{task} doesn't support configurable input size.") + if task == OTXTaskType.KEYPOINT_DETECTION: + pytest.skip(f"{task} is not ready for the integration test yet.") From 9127e5d152ead7cef2fc98a9b5dde77002dbdce0 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 13:47:00 +0900 Subject: [PATCH 39/42] apply configurable input size to keypoint detection --- .../keypoint_detection/heads/rtmcc_head.py | 6 +-- src/otx/algo/keypoint_detection/rtmpose.py | 40 +++++++++++++++---- .../keypoint_detection/utils/simcc_label.py | 8 ++-- .../core/data/transform_libs/torchvision.py | 4 +- src/otx/core/model/keypoint_detection.py | 4 +- src/otx/engine/utils/auto_configurator.py | 2 +- .../_base_/data/keypoint_detection.yaml | 7 +-- 7 files changed, 46 insertions(+), 20 deletions(-) diff --git a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py index 5b38b9f0661..1fdf8252c2c 100644 --- a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py +++ b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py @@ -31,7 +31,7 @@ class RTMCCHead(BaseModule): in_channels (int | sequence[int]): Number of channels in the input feature map. out_channels (int): Number of channels in the output heatmap. - input_size (tuple): Size of input image in shape [w, h].
+ input_size (tuple): Size of input image in shape [h, w]. in_featuremap_size (int | sequence[int]): Size of input feature map. loss (nn.module): keypoint loss. decoder_cfg (dict): Config dict for the keypoint decoder. @@ -87,8 +87,8 @@ def __init__( ) self.mlp = nn.Sequential(ScaleNorm(flatten_dims), nn.Linear(flatten_dims, gau_cfg["in_token_dims"], bias=False)) self.gau = RTMCCBlock(**gau_cfg) - self.cls_x = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[0] * self.simcc_split_ratio), bias=False) - self.cls_y = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[1] * self.simcc_split_ratio), bias=False) + self.cls_x = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[1] * self.simcc_split_ratio), bias=False) + self.cls_y = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[0] * self.simcc_split_ratio), bias=False) def forward(self, feats: tuple[Tensor]) -> tuple[Tensor, Tensor]: """Forward the network. diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index 0086acd80dd..23c8ed8b22e 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -13,12 +13,18 @@ from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.metrics.pck import PCKMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.model.keypoint_detection import OTXKeypointDetectionModel from torch import nn if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from otx.core.exporter.base import OTXModelExporter + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable from otx.core.types.export import TaskLevelExportParameters + from otx.core.types.label import LabelInfoTypes class RTMPose(OTXKeypointDetectionModel): @@ -27,13 +33,13 @@ class RTMPose(OTXKeypointDetectionModel): @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" - if self.image_size is None: - msg = f"Exporter should have a image_size but it is given by {self.image_size}" + if self.input_size is None: + msg = f"Exporter should have an input_size but it is given by {self.input_size}" raise ValueError(msg) return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, - input_size=self.image_size, + input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, resize_mode="fit_to_window_letterbox", @@ -62,12 +68,30 @@ class RTMPoseTiny(RTMPose): """RTMPose Tiny Model.""" load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth" - image_size = (1, 3, 192, 256) mean = (123.675, 116.28, 103.53) std = (58.395, 57.12, 57.375) + def __init__( + self, + label_info: LabelInfoTypes, + input_size: tuple[int, int] = (256, 192), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = PCKMeasureCallable, + torch_compile: bool = False, + ) -> None: + self.mean = (0.0, 0.0, 0.0) + self.std = (255.0, 255.0, 255.0) + super().__init__( + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile,
+ ) + def _build_model(self, num_classes: int) -> RTMPose: - input_size = (192, 256) simcc_split_ratio = 2.0 sigma = (4.9, 5.66) @@ -84,13 +108,13 @@ def _build_model(self, num_classes: int) -> RTMPose: head = RTMCCHead( out_channels=num_classes, in_channels=384, - input_size=input_size, - in_featuremap_size=(input_size[0] // 32, input_size[1] // 32), + input_size=self.input_size, + in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32), simcc_split_ratio=simcc_split_ratio, final_layer_kernel_size=7, loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True), decoder_cfg={ - "input_size": input_size, + "input_size": self.input_size, "simcc_split_ratio": simcc_split_ratio, "sigma": sigma, "normalize": False, diff --git a/src/otx/algo/keypoint_detection/utils/simcc_label.py b/src/otx/algo/keypoint_detection/utils/simcc_label.py index 4f03997bb7d..429c2f427a4 100644 --- a/src/otx/algo/keypoint_detection/utils/simcc_label.py +++ b/src/otx/algo/keypoint_detection/utils/simcc_label.py @@ -21,7 +21,7 @@ class SimCCLabel: - instance number: N - keypoint number: K - keypoint dimension: D - - image size: [w, h] + - image size: [h, w] Encoded: @@ -36,7 +36,7 @@ class SimCCLabel: - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: - input_size (tuple): Input image size in [w, h] + input_size (tuple): Input image size in [h, w] smoothing_type (str): The SimCC label smoothing strategy. Options are ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'`` sigma (float | int | tuple): The sigma value in the Gaussian SimCC @@ -201,7 +201,7 @@ def _generate_standard( Labels will be one-hot vectors if self.label_smooth_weight==0.0 """ batch_size, num_keypoints, _ = keypoints.shape - w, h = self.input_size + h, w = self.input_size x_dim = np.around(w * self.simcc_split_ratio).astype(int) y_dim = np.around(h * self.simcc_split_ratio).astype(int) @@ -239,7 +239,7 @@ def _generate_gaussian( ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Encoding keypoints into SimCC labels with Gaussian Label Smoothing strategy.""" batch_size, num_keypoints, _ = keypoints.shape - w, h = self.input_size + h, w = self.input_size x_dim = np.around(w * self.simcc_split_ratio).astype(int) y_dim = np.around(h * self.simcc_split_ratio).astype(int) diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 3ea0b063e00..00471d85be2 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3447,7 +3447,7 @@ def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: assert len(_inputs) == 1, "[tmp] Multiple entity is not supported yet." # noqa: S101 inputs = _inputs[0] - w, h = self.input_size + h, w = self.input_size warp_size = (int(w), int(h)) # reshape bbox to fixed aspect ratio @@ -3508,7 +3508,7 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin): the specific codec for more details. Args: - input_size (tuple[int, int]): Input image size in [w, h] TODO[wonjulee]: need to change order of shape + input_size (tuple[int, int]): Input image size in [h, w] is_numpy_to_tvtensor (bool): Whether convert outputs to tensor. Defaults to False. 
""" diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py index 69c05ed148e..02cbb652333 100644 --- a/src/otx/core/model/keypoint_detection.py +++ b/src/otx/core/model/keypoint_detection.py @@ -32,21 +32,23 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet def __init__( self, label_info: LabelInfoTypes, + input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, metric: MetricCallable = PCKMeasureCallable, torch_compile: bool = False, ) -> None: - self.image_size = (1, 3, 192, 256) self.mean = (0.0, 0.0, 0.0) self.std = (255.0, 255.0, 255.0) super().__init__( label_info=label_info, + input_size=input_size, optimizer=optimizer, scheduler=scheduler, metric=metric, torch_compile=torch_compile, ) + self.input_size: tuple[int, int] @abstractmethod def _build_model(self, num_classes: int) -> nn.Module: diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index d95826e5c1c..5b2b50f33bd 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -382,7 +382,7 @@ def get_ov_model(self, model_name: str, label_info: LabelInfo) -> OVModel: """ class_path = OVMODEL_PER_TASK.get(self.task, None) if class_path is None: - msg = f"{self.task} is not support OVModel." + msg = f"{self.task} doesn't support OVModel." raise NotImplementedError(msg) class_module, class_name = class_path.rsplit(".", 1) module = __import__(class_module, fromlist=[class_name]) diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index b3ffed5b915..ddd8eaf92ea 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -6,8 +6,8 @@ data_format: coco_person_keypoints unannotated_items_ratio: 0.0 image_color_channel: RGB input_size: - - 192 - 256 + - 192 train_subset: subset_name: train batch_size: 32 From a6b922dec7ffa567d121c16f09d0652552b6fab2 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 13:57:28 +0900 Subject: [PATCH 40/42] update unit test --- .../keypoint_detection/heads/test_rtmcc_head.py | 14 +++++++------- .../keypoint_detection/utils/test_simcc_label.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py b/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py index 0f3898c3f49..f2f620671d5 100644 --- a/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py +++ b/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py @@ -17,19 +17,19 @@ class TestRTMCCHead: def fxt_features(self): batch_size = 2 in_channels = 384 # Match the in_channels of the rtmdet_sep_bn_head fixture - input_size = (192, 256) + input_size = (256, 192) return [torch.rand(batch_size, in_channels, input_size[0] // 32, input_size[1] // 32)] @pytest.fixture() def fxt_gt_entity(self): batch_size = 2 - img_infos = [ImageInfo(img_idx=i, img_shape=(192, 256), ori_shape=(192, 256)) for i in range(batch_size)] + img_infos = [ImageInfo(img_idx=i, img_shape=(256, 192), ori_shape=(256, 192)) for i in range(batch_size)] keypoint_x_labels = [torch.randn((1, 17, 384)) for _ in range(batch_size)] keypoint_y_labels = [torch.randn((1, 17, 512)) for _ in range(batch_size)] keypoint_weights = [torch.randn((1, 17)) for _ in 
range(batch_size)] return KeypointDetBatchDataEntity( batch_size=batch_size, - images=tv_tensors.Image(data=torch.randn((batch_size, 3, 192, 256))), + images=tv_tensors.Image(data=torch.randn((batch_size, 3, 256, 192))), imgs_info=img_infos, keypoint_x_labels=keypoint_x_labels, keypoint_y_labels=keypoint_y_labels, @@ -45,13 +45,13 @@ def fxt_rtmcc_head(self) -> RTMCCHead: return RTMCCHead( out_channels=17, in_channels=384, - input_size=(192, 256), + input_size=(256, 192), in_featuremap_size=(6, 8), simcc_split_ratio=2.0, final_layer_kernel_size=7, loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True), decoder_cfg={ - "input_size": (192, 256), + "input_size": (256, 192), "simcc_split_ratio": 2.0, "sigma": (4.9, 5.66), "normalize": False, @@ -72,9 +72,9 @@ def fxt_rtmcc_head(self) -> RTMCCHead: def test_forward(self, fxt_rtmcc_head, fxt_features) -> None: pred_x, pred_y = fxt_rtmcc_head(fxt_features) assert pred_x.shape[1] == fxt_rtmcc_head.out_channels - assert pred_x.shape[2] == fxt_rtmcc_head.decoder.input_size[0] * fxt_rtmcc_head.decoder.simcc_split_ratio + assert pred_x.shape[2] == fxt_rtmcc_head.decoder.input_size[1] * fxt_rtmcc_head.decoder.simcc_split_ratio assert pred_y.shape[1] == fxt_rtmcc_head.out_channels - assert pred_y.shape[2] == fxt_rtmcc_head.decoder.input_size[1] * fxt_rtmcc_head.decoder.simcc_split_ratio + assert pred_y.shape[2] == fxt_rtmcc_head.decoder.input_size[0] * fxt_rtmcc_head.decoder.simcc_split_ratio def test_loss(self, fxt_rtmcc_head, fxt_features, fxt_gt_entity) -> None: losses = fxt_rtmcc_head.loss( diff --git a/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py b/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py index 69d432ae7a9..7eeaa18ed5d 100644 --- a/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py +++ b/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py @@ -18,12 +18,12 @@ def fxt_keypoints_visible(self): @pytest.fixture() def fxt_codec_gaussian(self): - return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=6.0, simcc_split_ratio=2.0) + return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=6.0, simcc_split_ratio=2.0) @pytest.fixture() def fxt_codec_smoothing(self): return SimCCLabel( - input_size=(192, 256), + input_size=(256, 192), smoothing_type="standard", sigma=5.0, simcc_split_ratio=3.0, @@ -32,11 +32,11 @@ def fxt_codec_smoothing(self): @pytest.fixture() def fxt_codec_dark(self): - return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0) + return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0) @pytest.fixture() def fxt_codec_separated_sigma(self): - return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0) + return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0) @pytest.mark.parametrize( "fxt_codec", From 255a4e08430deaa70188147c1ee434a21d2b2166 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 14:20:07 +0900 Subject: [PATCH 41/42] update unit test --- tests/unit/core/data/transform_libs/test_torchvision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py index 202a91afb15..6ccf71b3dcb 100644 --- a/tests/unit/core/data/transform_libs/test_torchvision.py +++ 
b/tests/unit/core/data/transform_libs/test_torchvision.py @@ -946,7 +946,7 @@ def keypoint_det_entity(self) -> KeypointDetDataEntity: ) def test_forward(self, keypoint_det_entity) -> None: - transform = GenerateTarget(input_size=(192, 256)) + transform = GenerateTarget(input_size=(256, 192)) results = transform(deepcopy(keypoint_det_entity)) assert hasattr(results, "keypoint_x_labels") From 8e6f8f8e915cbf04f9ac713c4121031683198d8c Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" <eunwoo.shin@intel.com> Date: Tue, 13 Aug 2024 23:43:57 +0900 Subject: [PATCH 42/42] update h-label head --- src/otx/algo/classification/efficientnet.py | 9 ++++++-- .../classification/heads/hlabel_cls_head.py | 23 +++++++++++-------- src/otx/algo/classification/mobilenet_v3.py | 8 +++++-- .../heads/test_hlabel_cls_head.py | 19 +++++++++++++++ 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py index c2fa5ae7784..d6ab980a3a4 100644 --- a/src/otx/algo/classification/efficientnet.py +++ b/src/otx/algo/classification/efficientnet.py @@ -6,7 +6,8 @@ from __future__ import annotations -from copy import deepcopy +from copy import copy, deepcopy +from math import ceil from typing import TYPE_CHECKING, Literal from torch import Tensor, nn @@ -269,6 +270,10 @@ def _build_model(self, head_config: dict) -> nn.Module: raise TypeError(self.label_info) backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained) + + copied_head_config = copy(head_config) + copied_head_config["step_size"] = (ceil(self.input_size[0] / 32), ceil(self.input_size[1] / 32)) + return ImageClassifier( backbone=backbone, neck=nn.Identity(), @@ -276,7 +281,7 @@ def _build_model(self, head_config: dict) -> nn.Module: in_channels=backbone.num_features, multiclass_loss=nn.CrossEntropyLoss(), multilabel_loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"), - **head_config, + **copied_head_config, ), optimize_gap=False, ) diff --git a/src/otx/algo/classification/heads/hlabel_cls_head.py b/src/otx/algo/classification/heads/hlabel_cls_head.py index 1b5767c4ace..b0f6cfb9711 100644 --- a/src/otx/algo/classification/heads/hlabel_cls_head.py +++ b/src/otx/algo/classification/heads/hlabel_cls_head.py @@ -419,7 +419,7 @@ class HierarchicalCBAMClsHead(HierarchicalClsHead): thr (float, optional): Predictions with scores under the thresholds are considered as negative. Defaults to 0.5. init_cfg (dict | None, optional): Initialize configuration key-values, Defaults to None. - step_size (int, optional): Step size value for HierarchicalCBAMClsHead, Defaults to 7. + step_size (int | tuple[int, int], optional): Step size value for HierarchicalCBAMClsHead, Defaults to 7. 
""" def __init__( @@ -435,7 +435,7 @@ def __init__( multilabel_loss: nn.Module | None = None, thr: float = 0.5, init_cfg: dict | None = None, - step_size: int = 7, + step_size: int | tuple[int, int] = 7, **kwargs, ): super().__init__( @@ -452,11 +452,11 @@ def __init__( init_cfg=init_cfg, **kwargs, ) - self.step_size = step_size - self.fc_superclass = nn.Linear(in_channels * step_size * step_size, num_multiclass_heads) - self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * step_size * step_size) + self.step_size = (step_size, step_size) if isinstance(step_size, int) else tuple(step_size) + self.fc_superclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_multiclass_heads) + self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * self.step_size[0] * self.step_size[1]) self.cbam = CBAM(in_channels) - self.fc_subclass = nn.Linear(in_channels * step_size * step_size, num_single_label_classes) + self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_single_label_classes) self._init_layers() @@ -464,7 +464,7 @@ def pre_logits(self, feats: tuple[torch.Tensor] | torch.Tensor) -> torch.Tensor: """The process before the final classification head.""" if isinstance(feats, Sequence): feats = feats[-1] - return feats.view(feats.size(0), self.in_channels * self.step_size * self.step_size) + return feats.view(feats.size(0), self.in_channels * self.step_size[0] * self.step_size[1]) def _init_layers(self) -> None: """Iniitialize weights of classification head.""" @@ -479,10 +479,15 @@ def forward(self, feats: tuple[torch.Tensor] | torch.Tensor) -> torch.Tensor: attention_weights = torch.sigmoid(self.attention_fc(out_superclass)) attended_features = pre_logits * attention_weights - attended_features = attended_features.view(pre_logits.size(0), self.in_channels, self.step_size, self.step_size) + attended_features = attended_features.view( + pre_logits.size(0), + self.in_channels, + self.step_size[0], + self.step_size[1], + ) attended_features = self.cbam(attended_features) attended_features = attended_features.view( pre_logits.size(0), - self.in_channels * self.step_size * self.step_size, + self.in_channels * self.step_size[0] * self.step_size[1], ) return self.fc_subclass(attended_features) diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py index 8470d398aee..40f92c594a4 100644 --- a/src/otx/algo/classification/mobilenet_v3.py +++ b/src/otx/algo/classification/mobilenet_v3.py @@ -6,7 +6,8 @@ from __future__ import annotations -from copy import deepcopy +from copy import copy, deepcopy +from math import ceil from typing import TYPE_CHECKING, Any, Literal import torch @@ -331,6 +332,9 @@ def _build_model(self, head_config: dict) -> nn.Module: if not isinstance(self.label_info, HLabelInfo): raise TypeError(self.label_info) + copied_head_config = copy(head_config) + copied_head_config["step_size"] = (ceil(self.input_size[0] / 32), ceil(self.input_size[1] / 32)) + return ImageClassifier( backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size), neck=nn.Identity(), @@ -338,7 +342,7 @@ def _build_model(self, head_config: dict) -> nn.Module: in_channels=960, multiclass_loss=nn.CrossEntropyLoss(), multilabel_loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"), - **head_config, + **copied_head_config, ), optimize_gap=False, ) diff --git a/tests/unit/algo/classification/heads/test_hlabel_cls_head.py 
b/tests/unit/algo/classification/heads/test_hlabel_cls_head.py index 11e7191dc49..a32f9bb14d4 100644 --- a/tests/unit/algo/classification/heads/test_hlabel_cls_head.py +++ b/tests/unit/algo/classification/heads/test_hlabel_cls_head.py @@ -169,3 +169,22 @@ def test_pre_logits(self, fxt_hierarchical_cbam_cls_head) -> None: input_tensor = torch.rand((8, 64, 7, 7)) pre_logits = fxt_hierarchical_cbam_cls_head.pre_logits(input_tensor) assert pre_logits.shape == (8, 64 * 7 * 7) + + def test_pre_logits_tuple_step_size(self) -> None: + head_idx_to_logits_range = {"0": (0, 5), "1": (5, 10), "2": (10, 12)} + head = HierarchicalCBAMClsHead( + num_multiclass_heads=3, + num_multilabel_classes=0, + head_idx_to_logits_range=head_idx_to_logits_range, + num_single_label_classes=12, + empty_multiclass_head_indices=[], + in_channels=64, + num_classes=12, + multiclass_loss=CrossEntropyLoss(), + multilabel_loss=None, + step_size=(14, 7), + ) + + input_tensor = torch.rand((8, 64, 14, 7)) + pre_logits = head.pre_logits(input_tensor) + assert pre_logits.shape == (8, 64 * 14 * 7)
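
For readers following the series, here is a minimal, self-contained sketch of the input-size policy the patches above converge on: the datamodule-level input_size acts only as a fallback for subset configs (PATCH 30), while a model's input_size_multiplier constrains which sizes are valid (the OTXModel attribute documented in an earlier patch). The helper names below (check_multiple, resolve_subset_input_size) are illustrative stand-ins, not actual OTX functions.

from __future__ import annotations


def check_multiple(input_size: tuple[int, int], multiplier: int) -> None:
    # Mirrors the input_size_multiplier contract documented on OTXModel:
    # every side of input_size must be a multiple of the model's multiplier,
    # otherwise an error is raised.
    if any(side % multiplier != 0 for side in input_size):
        msg = f"input_size {input_size} should be a multiple of {multiplier}"
        raise ValueError(msg)


def resolve_subset_input_size(
    datamodule_input_size: tuple[int, int] | None,
    subset_input_size: tuple[int, int] | None,
) -> tuple[int, int] | None:
    # PATCH 30 policy: an explicitly set subset-level input_size wins;
    # the datamodule-level value fills in only where the subset leaves it unset.
    return subset_input_size if subset_input_size is not None else datamodule_input_size


if __name__ == "__main__":
    check_multiple((448, 448), multiplier=32)  # OK: 448 == 32 * 14
    assert resolve_subset_input_size((448, 448), None) == (448, 448)
    assert resolve_subset_input_size((448, 448), (800, 800)) == (800, 800)

This is also what the integration test added in PATCH 33 asserts end to end: training with --data.input_size 448 records input_size == (448, 448) in the checkpoint hyperparameters, and every subset config inherits that value unless it sets its own.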