From 27df73cfca73164664eebd9003a4312f37bf35dd Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 31 Jul 2024 11:10:26 +0900
Subject: [PATCH 01/42] Draft: configurable input shape for classification and detection models

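Add an optional input_shape argument to the base OTXModel and thread it
through the classification and detection model constructors so that the
model exporters use the configured shape instead of hard-coded
(1, 3, H, W) input sizes. Detection models that previously pinned
image_size / tile_image_size as class attributes now take them as
constructor arguments, the EfficientNet backbone accepts an in_size
override, and the RT-DETR recipes read their Resize scale from the new
data.input_size field via $(input_size).

A minimal usage sketch (the label_info value and the overridden shape are
illustrative assumptions, not part of this patch):

    from otx.algo.detection.atss import MobileNetV2ATSS

    # The default stays (1, 3, 800, 992); pass input_shape to export
    # the model at a different resolution.
    model = MobileNetV2ATSS(label_info=3, input_shape=(1, 3, 736, 992))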
---
 .../classification/backbones/efficientnet.py  | 20 +++++-----
 src/otx/algo/classification/dino_v2.py        |  6 ++-
 src/otx/algo/classification/efficientnet.py   | 22 ++++++----
 .../algo/classification/efficientnet_v2.py    | 14 +++++--
 .../algo/classification/huggingface_model.py  |  6 ++-
 src/otx/algo/classification/mobilenet_v3.py   | 22 ++++++----
 .../algo/classification/torchvision_model.py  |  6 ++-
 src/otx/algo/classification/vit.py            | 22 ++++++----
 src/otx/algo/detection/atss.py                | 18 ++++++---
 src/otx/algo/detection/huggingface_model.py   |  6 +--
 src/otx/algo/detection/rtdetr.py              | 31 +++++++++-----
 src/otx/algo/detection/rtmdet.py              | 22 +++++++---
 src/otx/algo/detection/ssd.py                 | 16 ++++++--
 src/otx/algo/detection/yolox.py               | 40 +++++++++++++------
 src/otx/core/model/base.py                    |  4 +-
 src/otx/core/model/classification.py          | 23 +++++++----
 src/otx/core/model/detection.py               |  4 +-
 src/otx/core/model/segmentation.py            |  3 ++
 src/otx/core/types/label.py                   |  2 +-
 src/otx/recipe/detection/rtdetr_101.yaml      | 15 +++----
 src/otx/recipe/detection/rtdetr_18.yaml       | 15 +++----
 src/otx/recipe/detection/rtdetr_50.yaml       | 15 +++----
 22 files changed, 212 insertions(+), 120 deletions(-)

diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py
index 9682dda3ce4..e69d0b2320b 100644
--- a/src/otx/algo/classification/backbones/efficientnet.py
+++ b/src/otx/algo/classification/backbones/efficientnet.py
@@ -569,43 +569,43 @@ class OTXEfficientNet(EfficientNet):
         in_size : tuple of two ints. Spatial size of the expected input image.
     """
 
-    def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
+    def __init__(self, version: EFFICIENTNET_VERSION, in_size: tuple[int, int] | None = None, **kwargs):
         self.model_name = "efficientnet_" + version
 
         if version == "b0":
-            in_size = (224, 224)
+            in_size = in_size or (224, 224)
             depth_factor = 1.0
             width_factor = 1.0
         elif version == "b1":
-            in_size = (240, 240)
+            in_size = in_size or (240, 240)
             depth_factor = 1.1
             width_factor = 1.0
         elif version == "b2":
-            in_size = (260, 260)
+            in_size = in_size or (260, 260)
             depth_factor = 1.2
             width_factor = 1.1
         elif version == "b3":
-            in_size = (300, 300)
+            in_size = in_size or (300, 300)
             depth_factor = 1.4
             width_factor = 1.2
         elif version == "b4":
-            in_size = (380, 380)
+            in_size = in_size or (380, 380)
             depth_factor = 1.8
             width_factor = 1.4
         elif version == "b5":
-            in_size = (456, 456)
+            in_size = in_size or (456, 456)
             depth_factor = 2.2
             width_factor = 1.6
         elif version == "b6":
-            in_size = (528, 528)
+            in_size = in_size or (528, 528)
             depth_factor = 2.6
             width_factor = 1.8
         elif version == "b7":
-            in_size = (600, 600)
+            in_size = in_size or (600, 600)
             depth_factor = 3.1
             width_factor = 2.0
         elif version == "b8":
-            in_size = (672, 672)
+            in_size = in_size or (672, 672)
             depth_factor = 3.6
             width_factor = 2.2
         else:
diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py
index a24adf76d2a..5afe02e1869 100644
--- a/src/otx/algo/classification/dino_v2.py
+++ b/src/otx/algo/classification/dino_v2.py
@@ -8,7 +8,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -119,6 +119,7 @@ def __init__(
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
         freeze_backbone: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
@@ -129,6 +130,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -195,7 +197,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index c939e1b1421..e118de1a61d 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -60,6 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -69,6 +70,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -87,7 +89,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
@@ -145,7 +147,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -193,7 +195,7 @@ class EfficientNetForMulticlassClsSemiSL(EfficientNetForMulticlassCls):
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return SemiSLClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=OTXSemiSLLinearClsHead(
                 num_classes=num_classes,
@@ -276,6 +278,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -285,6 +288,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -303,7 +307,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelLinearClsHead(
                 num_classes=num_classes,
@@ -358,7 +362,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -404,6 +408,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -413,6 +418,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -437,7 +443,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalLinearClsHead(
                 in_channels=1280,
@@ -515,7 +521,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py
index 3d6ed09369b..24aaab18dfa 100644
--- a/src/otx/algo/classification/efficientnet_v2.py
+++ b/src/otx/algo/classification/efficientnet_v2.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -60,6 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -67,6 +68,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -140,7 +142,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -267,6 +269,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -274,6 +277,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -347,7 +351,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -392,6 +396,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -399,6 +404,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -498,7 +504,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index 8f088b668c4..5671160aee0 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -61,6 +61,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.model_name = model_name_or_path
 
@@ -70,6 +71,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -110,7 +112,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index b5808a29a00..eb3192c44ac 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -71,6 +71,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -79,6 +80,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -97,7 +99,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
@@ -152,7 +154,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -200,7 +202,7 @@ class MobileNetV3ForMulticlassClsSemiSL(MobileNetV3ForMulticlassCls):
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return SemiSLClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=OTXSemiSLLinearClsHead(
                 num_classes=num_classes,
@@ -283,6 +285,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -291,6 +294,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -309,7 +313,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelNonLinearClsHead(
                 num_classes=num_classes,
@@ -366,7 +370,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -412,6 +416,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -420,6 +425,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -444,7 +450,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalNonLinearClsHead(
                 in_channels=960,
@@ -522,7 +528,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 89aa3e11812..222f24a4f3e 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -422,6 +422,7 @@ def __init__(
             OTXTaskType.H_LABEL_CLS,
         ] = OTXTaskType.MULTI_CLASS_CLS,
         train_type: Literal["supervised", "semi_supervised"] = "supervised",
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
@@ -442,6 +443,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -552,7 +554,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py
index 86d05b71218..43e39f0af89 100644
--- a/src/otx/algo/classification/vit.py
+++ b/src/otx/algo/classification/vit.py
@@ -7,7 +7,7 @@
 import types
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Generic
+from typing import TYPE_CHECKING, Any, Callable, Generic, Sequence
 from urllib.parse import urlparse
 
 import numpy as np
@@ -226,6 +226,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -236,6 +237,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -281,7 +283,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -346,7 +348,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -373,7 +375,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=224)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:])
         return SemiSLClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -463,6 +465,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -474,6 +477,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -518,7 +522,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -582,7 +586,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -610,6 +614,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -621,6 +626,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -670,7 +676,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=224, lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -757,7 +763,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=self.input_shape,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index b6febc9f981..3e1186dc046 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Sequence
 
 from otx.algo.common.backbones import ResNeXt, build_model_including_pytorchcv
 from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss
@@ -28,6 +28,18 @@
 class ATSS(ExplainableOTXDetModel):
     """OTX Detection model class for ATSS."""
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 800, 992),
+        tile_image_size: Sequence[int] = (1, 3, 800, 992),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+        self.tile_image_size = tile_image_size
+
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
@@ -68,8 +80,6 @@ class MobileNetV2ATSS(ATSS):
         "https://storage.openvinotoolkit.org/repositories/"
         "openvino_training_extensions/models/object_detection/v2/mobilenet_v2-atss.pth"
     )
-    image_size = (1, 3, 800, 992)
-    tile_image_size = (1, 3, 800, 992)
     mean = (0.0, 0.0, 0.0)
     std = (255.0, 255.0, 255.0)
 
@@ -140,8 +150,6 @@ class ResNeXt101ATSS(ATSS):
         "https://storage.openvinotoolkit.org/repositories/"
         "openvino_training_extensions/models/object_detection/v2/resnext101_atss_070623.pth"
     )
-    image_size = (1, 3, 800, 992)
-    tile_image_size = (1, 3, 800, 992)
     mean = (0.0, 0.0, 0.0)
     std = (255.0, 255.0, 255.0)
 
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index 7fe94226b05..c14537ddb00 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -67,6 +67,7 @@ def __init__(
     ) -> None:
         self.model_name = model_name_or_path
         self.load_from = None
+        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
 
         super().__init__(
             label_info=label_info,
@@ -74,8 +75,8 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=(1, 3, *self.image_processor.size.values()),
         )
-        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return AutoModelForObjectDetection.from_pretrained(
@@ -148,13 +149,12 @@ def _customize_outputs(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        image_size = (1, 3, *self.image_processor.size.values())
         image_mean = (0.0, 0.0, 0.0)
         image_std = (255.0, 255.0, 255.0)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=image_size,
+            input_size=self.input_shape,
             mean=image_mean,  # type: ignore[arg-type]
             std=image_std,  # type: ignore[arg-type]
             resize_mode="standard",
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 1a7fdb6eba9..43fadf4e347 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -7,7 +7,7 @@
 
 import copy
 import re
-from typing import Any
+from typing import Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -28,11 +28,20 @@
 class RTDETR(ExplainableOTXDetModel):
     """RTDETR model."""
 
-    image_size = (1, 3, 640, 640)
     mean: tuple[float, float, float] = (0.0, 0.0, 0.0)
     std: tuple[float, float, float] = (255.0, 255.0, 255.0)
     load_from: str | None = None
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 640, 640),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+
     def _customize_inputs(
         self,
         entity: DetBatchDataEntity,
@@ -163,12 +172,12 @@ def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> lis
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_shape is None:
+            raise ValueError(self.input_shape)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
@@ -211,13 +220,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
         encoder = HybridEncoder(
             in_channels=[128, 256, 512],
             expansion=0.5,
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             num_decoder_layers=3,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
         )
 
         optimizer_configuration = [
@@ -254,12 +263,12 @@ def _build_model(self, num_classes: int) -> nn.Module:
             norm_cfg={"type": "FBN", "name": "norm"},
         )
         encoder = HybridEncoder(
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
             num_decoder_layers=6,
         )
 
@@ -301,13 +310,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
             hidden_dim=384,
             dim_feedforward=2048,
             in_channels=[512, 1024, 2048],
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
         )
 
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[384, 384, 384],
-            eval_spatial_size=self.image_size[2:],
+            eval_spatial_size=self.input_shape[2:],
         )
 
         # no bias decay and learning rate correction for the backbone.
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index 75e2e956f55..cc92cddf6f9 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -5,6 +5,8 @@
 
 from __future__ import annotations
 
+from typing import Sequence
+
 from otx.algo.common.backbones import CSPNeXt
 from otx.algo.common.losses import GIoULoss, QualityFocalLoss
 from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss
@@ -24,15 +26,27 @@
 class RTMDet(ExplainableOTXDetModel):
     """OTX Detection model class for RTMDet."""
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 640, 640),
+        tile_image_size: Sequence[int] = (1, 3, 640, 640),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+        self.tile_image_size = tile_image_size
+
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_shape is None:
+            raise ValueError(self.input_shape)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -62,8 +76,6 @@ class RTMDetTiny(RTMDet):
     """RTMDet Tiny Model."""
 
     load_from = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/rtmdet_tiny.pth"
-    image_size = (1, 3, 640, 640)
-    tile_image_size = (1, 3, 640, 640)
     mean = (103.53, 116.28, 123.675)
     std = (57.375, 57.12, 58.395)
 
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index 3b23ded94f0..95a6432c618 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -10,7 +10,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import numpy as np
 from datumaro.components.annotation import Bbox
@@ -42,11 +42,21 @@ class SSD(ExplainableOTXDetModel):
         "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions"
         "/models/object_detection/v2/mobilenet_v2-2s_ssd-992x736.pth"
     )
-    image_size = (1, 3, 864, 864)
-    tile_image_size = (1, 3, 864, 864)
     mean = (0.0, 0.0, 0.0)
     std = (255.0, 255.0, 255.0)
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 864, 864),
+        tile_image_size: Sequence[int] = (1, 3, 864, 864),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg = {
             "assigner": MaxIoUAssigner(
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index 38acf96438f..e9b656bb648 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 from otx.algo.common.losses import CrossEntropyLoss, L1Loss
 from otx.algo.detection.backbones import CSPDarknet
@@ -29,6 +29,18 @@
 class YOLOX(ExplainableOTXDetModel):
     """OTX Detection model class for YOLOX."""
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 640, 640),
+        tile_image_size: Sequence[int] = (1, 3, 640, 640),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+        self.tile_image_size = tile_image_size
+
     def _customize_inputs(
         self,
         entity: DetBatchDataEntity,
@@ -40,14 +52,14 @@ def _customize_inputs(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_shape is None:
+            raise ValueError(self.input_shape)
 
         swap_rgb = not isinstance(self, YOLOXTINY)  # only YOLOX-TINY uses RGB
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -112,11 +124,21 @@ class YOLOXTINY(YOLOX):
         "https://storage.openvinotoolkit.org/repositories/"
         "openvino_training_extensions/models/object_detection/v2/yolox_tiny_8x8.pth"
     )
-    image_size = (1, 3, 416, 416)
-    tile_image_size = (1, 3, 416, 416)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        input_shape: Sequence[int] = (1, 3, 416, 416),
+        tile_image_size: Sequence[int] = (1, 3, 416, 416),
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            input_shape=input_shape,
+            **kwargs,
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)}
         test_cfg = {
@@ -151,8 +173,6 @@ class YOLOXS(YOLOX):
         "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/"
         "yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth"
     )
-    image_size = (1, 3, 640, 640)
-    tile_image_size = (1, 3, 640, 640)
     mean = (0.0, 0.0, 0.0)
     std = (1.0, 1.0, 1.0)
 
@@ -190,8 +210,6 @@ class YOLOXL(YOLOX):
         "https://download.openmmlab.com/mmdetection/v2.0/yolox/"
         "yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth"
     )
-    image_size = (1, 3, 640, 640)
-    tile_image_size = (1, 3, 640, 640)
     mean = (0.0, 0.0, 0.0)
     std = (1.0, 1.0, 1.0)
 
@@ -224,8 +242,6 @@ class YOLOXX(YOLOX):
         "https://download.openmmlab.com/mmdetection/v2.0/yolox/"
         "yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth"
     )
-    image_size = (1, 3, 640, 640)
-    tile_image_size = (1, 3, 640, 640)
     mean = (0.0, 0.0, 0.0)
     std = (1.0, 1.0, 1.0)
 
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 8e2a26acf6d..3a6efb87663 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -14,7 +14,7 @@
 import warnings
 from abc import abstractmethod
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple
+from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, Sequence
 
 import numpy as np
 import openvino
@@ -108,6 +108,7 @@ def __init__(
         metric: MetricCallable = NullMetricCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__()
 
@@ -118,6 +119,7 @@ def __init__(
         self.optimizer_callable = ensure_callable(optimizer)
         self.scheduler_callable = ensure_callable(scheduler)
         self.metric_callable = ensure_callable(metric)
+        self.input_shape = input_shape
 
         self.torch_compile = torch_compile
         self._explain_mode = False
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index c189b9b9e32..5613b657ee0 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import numpy as np
 import torch
@@ -55,6 +55,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -62,6 +63,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     @property
@@ -103,17 +105,18 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
         self.config = config
         self.load_from = config.pop("load_from", None)
-        self.image_size = (1, 3, 224, 224)
         super().__init__(
             label_info=label_info,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -217,7 +220,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=mean,
             std=std,
             resize_mode="standard",
@@ -247,6 +250,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -254,6 +258,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     @property
@@ -298,17 +303,18 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = lambda num_labels: Accuracy(task="multilabel", num_labels=num_labels),
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
         self.config = config
         self.load_from = config.pop("load_from", None)
-        self.image_size = (1, 3, 224, 224)
         super().__init__(
             label_info=label_info,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -414,7 +420,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=mean,
             std=std,
             resize_mode="standard",
@@ -436,6 +442,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -443,6 +450,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     @property
@@ -498,6 +506,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
 
@@ -509,13 +518,13 @@ def __init__(
 
         self.config = config
         self.load_from = config.pop("load_from", None)
-        self.image_size = (1, 3, 224, 224)
         super().__init__(
             label_info=label_info,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -621,7 +630,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_shape,
             mean=mean,
             std=std,
             resize_mode="standard",
diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py
index 167157324f3..26fa27ece63 100644
--- a/src/otx/core/model/detection.py
+++ b/src/otx/core/model/detection.py
@@ -9,7 +9,7 @@
 import types
 from abc import abstractmethod
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence
 
 import torch
 from model_api.tilers import DetectionTiler
@@ -376,6 +376,7 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
+        input_shape: Sequence[int] | None = None,
     ) -> None:
         from otx.algo.explain.explain_algo import feature_vector_fn
 
@@ -386,6 +387,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
             tile_config=tile_config,
+            input_shape=input_shape,
         )
         self.model.feature_vector_fn = feature_vector_fn
         self.model.explain_fn = self.get_explain_fn()
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 0e8cbf85e06..a5777960279 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -44,6 +44,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
         torch_compile: bool = False,
+        input_shape: Sequence[int] | None = None,
     ):
         """Base semantic segmentation model.
 
@@ -57,6 +58,7 @@ def __init__(
                 Defaults to SegmCallable.
             torch_compile (bool, optional): Whether to compile the model using TorchScript.
                 Defaults to False.
+            input_shape (Sequence[int] | None, optional): The input shape of the model. Defaults to None.
         """
         super().__init__(
             label_info=label_info,
@@ -64,6 +66,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_shape=input_shape,
         )
 
     @property
diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py
index 6b4ff83218f..21df8d94555 100644
--- a/src/otx/core/types/label.py
+++ b/src/otx/core/types/label.py
@@ -7,7 +7,7 @@
 
 import json
 from dataclasses import asdict, dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 if TYPE_CHECKING:
     from datumaro import Label, LabelCategories
diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml
index 8f071d98e89..1ae36dbc26b 100644
--- a/src/otx/recipe/detection/rtdetr_101.yaml
+++ b/src/otx/recipe/detection/rtdetr_101.yaml
@@ -47,6 +47,9 @@ overrides:
         warmup_epochs: 7
 
   data:
+    input_size:
+      - 640
+      - 640
     task: DETECTION
     stack_images: true
     data_format: coco_instances
@@ -65,9 +68,7 @@ overrides:
             prob: 0.5
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -85,9 +86,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -102,9 +101,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
diff --git a/src/otx/recipe/detection/rtdetr_18.yaml b/src/otx/recipe/detection/rtdetr_18.yaml
index 1eca525f793..4e11fa20499 100644
--- a/src/otx/recipe/detection/rtdetr_18.yaml
+++ b/src/otx/recipe/detection/rtdetr_18.yaml
@@ -46,6 +46,9 @@ overrides:
         warmup_epochs: 7
 
   data:
+    input_size:
+      - 640
+      - 640
     task: DETECTION
     stack_images: true
     data_format: coco_instances
@@ -64,9 +67,7 @@ overrides:
             prob: 0.5
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -84,9 +85,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -101,9 +100,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml
index 7254550faaa..9adb14819a7 100644
--- a/src/otx/recipe/detection/rtdetr_50.yaml
+++ b/src/otx/recipe/detection/rtdetr_50.yaml
@@ -47,6 +47,9 @@ overrides:
         warmup_epochs: 7
 
   data:
+    input_size:
+      - 640
+      - 640
     task: DETECTION
     stack_images: true
     data_format: coco_instances
@@ -65,9 +68,7 @@ overrides:
             prob: 0.5
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -85,9 +86,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true
@@ -102,9 +101,7 @@ overrides:
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale:
-              - 640
-              - 640
+            scale: $(input_size)
             keep_ratio: false
             transform_bbox: true
             is_numpy_to_tvtensor: true

From bb9b66e0848e41a5b304662c10f5d11ee7a912fc Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 31 Jul 2024 14:39:35 +0900
Subject: [PATCH 02/42] Draft: extend configurable input size to remaining task models

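Extend the configurable input size to the remaining task models listed in
the diffstat: action classification (MoViNet, X3D), anomaly (Padim, Stfpm),
instance segmentation (MaskRCNN, torchvision MaskRCNN, RTMDet-Inst),
semantic segmentation (DINOv2, LiteHRNet, SegNext, HuggingFace), visual
prompting (SegmentAnything and its zero-shot variant), and the
corresponding core model classes; the dino_v2 segmentation recipe drops
overrides that are no longer needed. The constructors added here take the
size as input_size; for action classification the default covers the full
clip shape (1, 1, 3, 8, 224, 224).

A minimal usage sketch (the label_info value is an illustrative
assumption, not part of this patch):

    from otx.algo.action_classification.x3d import X3D

    # input_size defaults to the clip shape (1, 1, 3, 8, 224, 224).
    model = X3D(label_info=5, input_size=(1, 1, 3, 8, 224, 224))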
---
 src/otx/algo/action_classification/movinet.py |  4 +-
 src/otx/algo/action_classification/x3d.py     |  4 +-
 src/otx/algo/anomaly/padim.py                 |  8 +--
 src/otx/algo/anomaly/stfpm.py                 |  6 ++-
 src/otx/algo/classification/dino_v2.py        |  6 +--
 src/otx/algo/classification/efficientnet.py   | 24 ++++-----
 .../algo/classification/efficientnet_v2.py    | 16 +++---
 .../algo/classification/huggingface_model.py  |  6 +--
 src/otx/algo/classification/mobilenet_v3.py   | 26 +++++-----
 .../algo/classification/torchvision_model.py  |  6 +--
 src/otx/algo/classification/vit.py            | 24 ++++-----
 src/otx/algo/detection/atss.py                |  4 +-
 src/otx/algo/detection/huggingface_model.py   |  6 ++-
 src/otx/algo/detection/rtdetr.py              | 22 ++++----
 src/otx/algo/detection/rtmdet.py              | 10 ++--
 src/otx/algo/detection/ssd.py                 |  4 +-
 src/otx/algo/detection/yolox.py               | 14 ++---
 .../algo/instance_segmentation/maskrcnn.py    | 51 +++++++++++++++----
 .../algo/instance_segmentation/maskrcnn_tv.py | 21 +++++---
 .../algo/instance_segmentation/rtmdet_inst.py | 22 +++++---
 src/otx/algo/segmentation/dino_v2_seg.py      | 11 +++-
 .../algo/segmentation/huggingface_model.py    | 12 ++---
 src/otx/algo/segmentation/litehrnet.py        | 13 ++++-
 src/otx/algo/segmentation/segnext.py          | 11 +++-
 .../algo/visual_prompting/segment_anything.py | 19 ++++---
 .../zero_shot_segment_anything.py             | 17 ++++---
 src/otx/core/model/action_classification.py   |  6 +--
 src/otx/core/model/anomaly.py                 | 18 +++----
 src/otx/core/model/base.py                    |  4 +-
 src/otx/core/model/classification.py          | 30 +++++------
 src/otx/core/model/detection.py               |  4 +-
 src/otx/core/model/instance_segmentation.py   |  6 ++-
 src/otx/core/model/segmentation.py            | 12 +++--
 src/otx/core/model/visual_prompting.py        |  8 ++-
 src/otx/core/types/label.py                   |  2 +-
 .../recipe/semantic_segmentation/dino_v2.yaml |  7 ---
 36 files changed, 278 insertions(+), 186 deletions(-)

diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py
index 9e6863f90aa..7c5861d2af6 100644
--- a/src/otx/algo/action_classification/movinet.py
+++ b/src/otx/algo/action_classification/movinet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Sequence
 
 from torch import nn
 
@@ -32,6 +32,7 @@ class MoViNet(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int] = (1, 1, 3, 8, 224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -40,6 +41,7 @@ def __init__(
         self.load_from = "https://github.com/Atze00/MoViNet-pytorch/blob/main/weights/modelA0_statedict_v3?raw=true"
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py
index dbb6cb0f490..6c26f2deb2f 100644
--- a/src/otx/algo/action_classification/x3d.py
+++ b/src/otx/algo/action_classification/x3d.py
@@ -4,7 +4,7 @@
 """X3D model implementation."""
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Sequence
 
 from torch import nn
 
@@ -31,6 +31,7 @@ class X3D(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int] = (1, 1, 3, 8, 224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -39,6 +40,7 @@ def __init__(
         self.load_from = "https://download.openmmlab.com/mmaction/recognition/x3d/facebook/x3d_m_facebook_16x5x1_kinetics400_rgb_20201027-3f42382a.pth"
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
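
Both video classifiers above take a six-element input_size rather than the four-element (N, C, H, W) used by the image models. Reading the (1, 1, 3, 8, 224, 224) default as (batch, clips, channels, frames, height, width) is an inference from the default value, not something this patch states; a quick shape check under that assumption:

    import torch

    # Dummy clip batch matching the default above (assumed axis order:
    # batch, clips, channels, frames, height, width).
    input_size = (1, 1, 3, 8, 224, 224)
    dummy_batch = torch.zeros(input_size)
    print(dummy_batch.shape)  # torch.Size([1, 1, 3, 8, 224, 224])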
diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py
index 201b0230a02..4f5fb0be6a9 100644
--- a/src/otx/algo/anomaly/padim.py
+++ b/src/otx/algo/anomaly/padim.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, Sequence
 
 from anomalib.models.image import Padim as AnomalibPadim
 
@@ -34,6 +34,7 @@ class Padim(OTXAnomaly, OTXModel, AnomalibPadim):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (Sequence[int], optional): Input size (height, width) of the model. Defaults to (256, 256).
     """
 
     def __init__(
@@ -47,9 +48,10 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: Sequence[int] = (256, 256),
     ) -> None:
-        OTXAnomaly.__init__(self)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
+        OTXAnomaly.__init__(self, input_size)
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size)
         AnomalibPadim.__init__(
             self,
             backbone=backbone,
diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py
index 72dd30e8aa3..67963c25444 100644
--- a/src/otx/algo/anomaly/stfpm.py
+++ b/src/otx/algo/anomaly/stfpm.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, Sequence
 
 from anomalib.models.image import Stfpm as AnomalibStfpm
 
@@ -32,6 +32,7 @@ class Stfpm(OTXAnomaly, OTXModel, AnomalibStfpm):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (Sequence[int], optional): Input size (height, width) of the model. Defaults to (256, 256).
     """
 
     def __init__(
@@ -43,10 +44,11 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: Sequence[int] = (256, 256),
         **kwargs,
     ) -> None:
-        OTXAnomaly.__init__(self)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
+        OTXAnomaly.__init__(self, input_size)
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size)
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
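
With Padim and Stfpm updated as above, the anomaly input size becomes a constructor argument instead of the 256x256 value previously hard-coded in OTXAnomaly.__init__. A usage sketch, assuming the remaining constructor arguments keep their defaults:

    from otx.algo.anomaly.padim import Padim
    from otx.algo.anomaly.stfpm import Stfpm

    # Both OTX base classes receive the same input_size, so the export
    # shape follows whatever is passed here.
    padim = Padim(input_size=(512, 512))
    stfpm = Stfpm(input_size=(512, 512))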
diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py
index 5afe02e1869..f6430e63f8a 100644
--- a/src/otx/algo/classification/dino_v2.py
+++ b/src/otx/algo/classification/dino_v2.py
@@ -119,7 +119,7 @@ def __init__(
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
         freeze_backbone: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
@@ -130,7 +130,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -197,7 +197,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index e118de1a61d..6f41a844673 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -60,7 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -70,7 +70,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
@@ -147,7 +147,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -278,7 +278,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -288,7 +288,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -307,7 +307,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelLinearClsHead(
                 num_classes=num_classes,
@@ -362,7 +362,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -408,7 +408,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -418,7 +418,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -443,7 +443,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_shape[-2:]),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalLinearClsHead(
                 in_channels=1280,
@@ -521,7 +521,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py
index 24aaab18dfa..acb17d0ecc8 100644
--- a/src/otx/algo/classification/efficientnet_v2.py
+++ b/src/otx/algo/classification/efficientnet_v2.py
@@ -60,7 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -68,7 +68,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -142,7 +142,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -269,7 +269,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -277,7 +277,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -396,7 +396,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -404,7 +404,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -504,7 +504,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index 5671160aee0..56432533dcc 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -61,7 +61,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.model_name = model_name_or_path
 
@@ -71,7 +71,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -112,7 +112,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index eb3192c44ac..570952efb01 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -71,7 +71,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -80,7 +80,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -99,7 +99,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
@@ -154,7 +154,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -202,7 +202,7 @@ class MobileNetV3ForMulticlassClsSemiSL(MobileNetV3ForMulticlassCls):
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return SemiSLClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=OTXSemiSLLinearClsHead(
                 num_classes=num_classes,
@@ -285,7 +285,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -294,7 +294,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -313,7 +313,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelNonLinearClsHead(
                 num_classes=num_classes,
@@ -370,7 +370,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -416,7 +416,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -425,7 +425,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -450,7 +450,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_shape[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalNonLinearClsHead(
                 in_channels=960,
@@ -528,7 +528,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 222f24a4f3e..9b30a6db22a 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -422,7 +422,7 @@ def __init__(
             OTXTaskType.H_LABEL_CLS,
         ] = OTXTaskType.MULTI_CLASS_CLS,
         train_type: Literal["supervised", "semi_supervised"] = "supervised",
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
@@ -443,7 +443,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -554,7 +554,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py
index 43e39f0af89..4de1c8ac984 100644
--- a/src/otx/algo/classification/vit.py
+++ b/src/otx/algo/classification/vit.py
@@ -226,7 +226,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -237,7 +237,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -283,7 +283,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -348,7 +348,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -375,7 +375,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:])
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:])
         return SemiSLClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -465,7 +465,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -477,7 +477,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -522,7 +522,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -614,7 +614,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -626,7 +626,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
+            input_size=input_size,
         )
 
     def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict:
@@ -676,7 +676,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_shape[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -763,7 +763,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index 3e1186dc046..76884ab383f 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -30,12 +30,12 @@ class ATSS(ExplainableOTXDetModel):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 800, 992),
+        input_size: Sequence[int] = (1, 3, 800, 992),
         tile_image_size: Sequence[int] = (1, 3, 800, 992),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
         self.tile_image_size = tile_image_size
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index c14537ddb00..393e3d5a96f 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -68,14 +68,16 @@ def __init__(
         self.model_name = model_name_or_path
         self.load_from = None
         self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
+        if len(input_size := self.image_processor.size.values()) == 1:
+            input_size = (*input_size, *input_size)
 
         super().__init__(
             label_info=label_info,
+            input_size=(1, 3, *input_size),
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=(1, 3, *self.image_processor.size.values()),
         )
 
     def _build_model(self, num_classes: int) -> nn.Module:
@@ -154,7 +156,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=image_mean,  # type: ignore[arg-type]
             std=image_std,  # type: ignore[arg-type]
             resize_mode="standard",
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 43fadf4e347..623d81c611c 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -34,11 +34,11 @@ class RTDETR(ExplainableOTXDetModel):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 640, 640),
+        input_size: Sequence[int] = (1, 3, 640, 640),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
 
@@ -172,12 +172,12 @@ def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> lis
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.input_shape is None:
-            raise ValueError(self.input_shape)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
@@ -220,13 +220,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
         encoder = HybridEncoder(
             in_channels=[128, 256, 512],
             expansion=0.5,
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             num_decoder_layers=3,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
         )
 
         optimizer_configuration = [
@@ -263,12 +263,12 @@ def _build_model(self, num_classes: int) -> nn.Module:
             norm_cfg={"type": "FBN", "name": "norm"},
         )
         encoder = HybridEncoder(
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
             num_decoder_layers=6,
         )
 
@@ -310,13 +310,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
             hidden_dim=384,
             dim_feedforward=2048,
             in_channels=[512, 1024, 2048],
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
         )
 
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[384, 384, 384],
-            eval_spatial_size=self.input_shape[2:],
+            eval_spatial_size=self.input_size[2:],
         )
 
         # no bias decay and learning rate correction for the backbone.
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index cc92cddf6f9..b1a87088d80 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -28,12 +28,12 @@ class RTMDet(ExplainableOTXDetModel):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 640, 640),
+        input_size: Sequence[int] = (1, 3, 640, 640),
         tile_image_size: Sequence[int] = (1, 3, 640, 640),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
         self.tile_image_size = tile_image_size
@@ -41,12 +41,12 @@ def __init__(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.input_shape is None:
-            raise ValueError(self.input_shape)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index 95a6432c618..43abeee8c7a 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -47,12 +47,12 @@ class SSD(ExplainableOTXDetModel):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 864, 864),
+        input_size: Sequence[int] = (1, 3, 864, 864),
         tile_image_size: Sequence[int] = (1, 3, 864, 864),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
         self.tile_image_size = tile_image_size
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index e9b656bb648..1796a74899e 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -31,12 +31,12 @@ class YOLOX(ExplainableOTXDetModel):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 640, 640),
+        input_size: Sequence[int] = (1, 3, 640, 640),
         tile_image_size: Sequence[int] = (1, 3, 640, 640),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
         self.tile_image_size = tile_image_size
@@ -52,14 +52,14 @@ def _customize_inputs(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.input_shape is None:
-            raise ValueError(self.input_shape)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
         swap_rgb = not isinstance(self, YOLOXTINY)  # only YOLOX-TINY uses RGB
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -129,12 +129,12 @@ class YOLOXTINY(YOLOX):
 
     def __init__(
         self,
-        input_shape: Sequence[int] = (1, 3, 416, 416),
+        input_size: Sequence[int] = (1, 3, 416, 416),
         tile_image_size: Sequence[int] = (1, 3, 416, 416),
         **kwargs
     ) -> None:
         super().__init__(
-            input_shape=input_shape,
+            input_size=input_size,
             **kwargs
         )
         self.tile_image_size = tile_image_size
diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py
index a8910e91bb3..599367f4ef6 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Sequence
 
 from torchvision.ops import RoIAlign
 
@@ -32,10 +32,10 @@ class MaskRCNN(ExplainableOTXInstanceSegModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
-        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.image_size
+        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
@@ -73,11 +73,21 @@ class MaskRCNNResNet50(MaskRCNN):
         "https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/"
         "mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth"
     )
-    image_size = (1, 3, 1024, 1024)
-    tile_image_size = (1, 3, 512, 512)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        tile_image_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
             "rpn": {
@@ -245,11 +255,21 @@ class MaskRCNNEfficientNet(MaskRCNN):
         "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/"
         "models/instance_segmentation/v2/efficientnet_b2b-mask_rcnn-576x576.pth"
     )
-    image_size = (1, 3, 1024, 1024)
-    tile_image_size = (1, 3, 512, 512)
     mean = (123.675, 116.28, 103.53)
     std = (1.0, 1.0, 1.0)
 
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        tile_image_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
             "rpn": {
@@ -434,11 +454,21 @@ class MaskRCNNSwinT(MaskRCNN):
         "mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/"
         "mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth"
     )
-    image_size = (1, 3, 1344, 1344)
-    tile_image_size = (1, 3, 512, 512)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 1344, 1344),
+        tile_image_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
             "rpn": {
diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
index d6f5bea1bda..e5afb877998 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections import OrderedDict
-from typing import Any
+from typing import Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -218,10 +218,10 @@ def _customize_outputs(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
-        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.image_size
+        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
@@ -260,10 +260,20 @@ def forward_for_tracing(self, inputs: Tensor) -> tuple[Tensor, ...]:
 class TVMaskRCNNR50(TVMaskRCNN):
     """Torchvision MaskRCNN model with ResNet50 backbone."""
 
-    image_size = (1, 3, 1024, 1024)
-    tile_image_size = (1, 3, 512, 512)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
+
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        tile_image_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
+        self.tile_image_size = tile_image_size
 
     def _create_model(self) -> nn.Module:
         """From torchvision tutorial."""
diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py
index 60c6bea25ca..01fe2c10847 100644
--- a/src/otx/algo/instance_segmentation/rtmdet_inst.py
+++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Sequence
 
 from otx.algo.common.backbones import CSPNeXt
 from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss
@@ -31,12 +31,12 @@ class RTMDetInst(ExplainableOTXInstanceSegModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -81,11 +81,21 @@ class RTMDetInstTiny(RTMDetInst):
         "https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/"
         "rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727-ec670f7e.pth"
     )
-    image_size = (1, 3, 640, 640)
-    tile_image_size = (1, 3, 640, 640)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 640, 640),
+        tile_image_size: Sequence[int] = (1, 3, 640, 640),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
+        self.tile_image_size = tile_image_size
+
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg = {
             "assigner": DynamicSoftLabelAssigner(topk=13),
diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py
index d38001ada88..63baccfacc8 100644
--- a/src/otx/algo/segmentation/dino_v2_seg.py
+++ b/src/otx/algo/segmentation/dino_v2_seg.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Sequence
 
 from otx.algo.segmentation.backbones import DinoVisionTransformer
 from otx.algo.segmentation.heads import FCNHead
@@ -43,6 +43,16 @@ class DinoV2Seg(BaseSegmModel):
 
 class OTXDinoV2Seg(TorchVisionCompatibleModel):
     """DinoV2Seg Model."""
+
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 560, 560),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
 
     def _create_model(self) -> nn.Module:
         # merge configurations with defaults overriding them
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index 14e0111f44c..693c2219c05 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import nn
@@ -68,15 +68,18 @@ def __init__(
     ) -> None:
         self.model_name = model_name_or_path
         self.load_from = None
+        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
+        if len(input_size := self.image_processor.size.values()) == 1:
+            input_size = (*input_size, *input_size)
 
         super().__init__(
             label_info=label_info,
+            input_size=(1, 3, *input_size),
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
 
     def _create_model(self) -> nn.Module:
         return AutoModelForSemanticSegmentation.from_pretrained(
@@ -121,15 +124,12 @@ def _customize_outputs(
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        size = self.image_processor.size.values()
-        size = (*size, *size) if len(size) == 1 else size
-        image_size = (1, 3, *size)
         image_mean = (123.675, 116.28, 103.53)
         image_std = (58.395, 57.12, 57.375)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=image_size,
+            input_size=self.input_size,
             mean=image_mean,
             std=image_std,
             resize_mode="standard",
diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py
index b24ea9bd77d..458a0f44ea6 100644
--- a/src/otx/algo/segmentation/litehrnet.py
+++ b/src/otx/algo/segmentation/litehrnet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Sequence
 
 from torch.onnx import OperatorExportTypes
 
@@ -517,6 +517,16 @@ def ignore_scope(self) -> dict[str, str | dict[str, list[str]]]:
 
 class OTXLiteHRNet(TorchVisionCompatibleModel):
     """LiteHRNet Model."""
+
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
 
     def _create_model(self) -> nn.Module:
         litehrnet_model_class = LITEHRNET_VARIANTS[self.name_base_model]
@@ -560,7 +570,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.scale,
             resize_mode="standard",
diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py
index c3d2ca86fb3..72287dda2a3 100644
--- a/src/otx/algo/segmentation/segnext.py
+++ b/src/otx/algo/segmentation/segnext.py
@@ -4,7 +4,7 @@
 """SegNext model implementations."""
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Sequence
 
 from otx.algo.segmentation.backbones import MSCAN
 from otx.algo.segmentation.heads import LightHamHead
@@ -107,6 +107,16 @@ class SegNextT(BaseSegmModel):
 
 class OTXSegNext(TorchVisionCompatibleModel):
     """SegNext Model."""
+
+    def __init__(
+        self,
+        input_size: Sequence[int] = (1, 3, 512, 512),
+        **kwargs
+    ) -> None:
+        super().__init__(
+            input_size=input_size,
+            **kwargs
+        )
 
     def _create_model(self) -> nn.Module:
         segnext_model_class = SEGNEXT_VARIANTS[self.name_base_model]
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index be318a1d2ea..93caca5f109 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging as log
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -494,6 +494,7 @@ def __init__(
         self,
         backbone: Literal["tiny_vit", "vit_b"],
         label_info: LabelInfoTypes = NullLabelInfo(),
+        input_size: Sequence[int] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -506,8 +507,17 @@ def __init__(
         return_extra_metrics: bool = False,
         stability_score_offset: float = 1.0,
     ) -> None:
+        super().__init__(
+            label_info=label_info,
+            input_size=input_size,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
         self.config = {
             "backbone": backbone,
+            "image_size": self.input_size[-1],
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
@@ -517,13 +527,6 @@ def __init__(
             "stability_score_offset": stability_score_offset,
             **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone],
         }
-        super().__init__(
-            label_info=label_info,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            metric=metric,
-            torch_compile=torch_compile,
-        )
 
     def _create_model(self) -> nn.Module:
         """Create a PyTorch model for this class."""
diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
index dd650486e30..f4f531be978 100644
--- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
+++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
@@ -648,8 +648,18 @@ def __init__(  # noqa: PLR0913
         return_extra_metrics: bool = False,
         stability_score_offset: float = 1.0,
     ) -> None:
+        super().__init__(
+            label_info=label_info,
+            input_size=(1, 3, 1024, 1024),  # zero-shot visual prompting model uses fixed 1024x1024 input size
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
+
         self.config = {
             "backbone": backbone,
+            "image_size": self.input_size[-1],
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
@@ -661,13 +671,6 @@ def __init__(  # noqa: PLR0913
             "stability_score_offset": stability_score_offset,
             **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone],
         }
-        super().__init__(
-            label_info=label_info,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            metric=metric,
-            torch_compile=torch_compile,
-        )
 
         self.save_outputs = save_outputs
         self.reference_info_dir: Path = Path(reference_info_dir)
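
Both SAM variants store only the spatial extent in their config ("image_size": self.input_size[-1]), which assumes a square input; the zero-shot model additionally pins input_size to 1024x1024. A minimal illustration:

    input_size = (1, 3, 1024, 1024)  # (N, C, H, W) with H == W for SAM-style models
    config = {"image_size": input_size[-1]}
    print(config)  # {'image_size': 1024}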
diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py
index 08e2553a895..009affae692 100644
--- a/src/otx/core/model/action_classification.py
+++ b/src/otx/core/model/action_classification.py
@@ -46,7 +46,6 @@ def __init__(
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
     ) -> None:
-        self.image_size = (1, 1, 3, 8, 224, 224)
         self.mean = (0.0, 0.0, 0.0)
         self.std = (255.0, 255.0, 255.0)
         super().__init__(
@@ -135,7 +134,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
@@ -186,7 +185,6 @@ def __init__(
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
         self.config = config
         self.load_from = config.pop("load_from", None)
-        self.image_size = (1, 1, 3, 8, 224, 224)
         super().__init__(
             label_info=label_info,
             optimizer=optimizer,
@@ -266,7 +264,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=mean,
             std=std,
             resize_mode="standard",
diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py
index 44edf869b0b..301fb03191c 100644
--- a/src/otx/core/model/anomaly.py
+++ b/src/otx/core/model/anomaly.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging as log
-from typing import TYPE_CHECKING, Any, TypeAlias
+from typing import TYPE_CHECKING, Any, Sequence, TypeAlias
 
 import torch
 from anomalib import TaskType as AnomalibTaskType
@@ -51,10 +51,10 @@
 class OTXAnomaly:
     """Methods used to make OTX model compatible with the Anomalib model."""
 
-    def __init__(self) -> None:
+    def __init__(self, input_size: Sequence[int] = (256, 256)) -> None:
         self.optimizer: list[OptimizerCallable] | OptimizerCallable = None
         self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None
-        self._input_size: tuple[int, int] = (256, 256)
+        self._input_size: tuple[int, int] = (input_size[0], input_size[1])
         self.trainer: Trainer
         self.model: nn.Module
         self.image_threshold: BaseThreshold
@@ -116,15 +116,13 @@ def _get_values_from_transforms(
         self,
-    ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]:
-        """Get the value requested value from default transforms."""
+    ) -> tuple[tuple[float, float, float], tuple[float, float, float]]:
+        """Get the mean and std values from the default transforms."""
-        image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
+        mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
         for transform in self.configure_transforms().transforms:  # type: ignore[attr-defined]
             name = transform.__class__.__name__
-            if "Resize" in name:
-                image_size = tuple(transform.size)  # type: ignore[assignment]
-            elif "Normalize" in name:
+            if "Normalize" in name:
                 mean_value = tuple(value * 255 for value in transform.mean)  # type: ignore[assignment]
                 std_value = tuple(value * 255 for value in transform.std)  # type: ignore[assignment]
-        return image_size, mean_value, std_value
+        return mean_value, std_value
 
     @property
     def trainable_model(self) -> str | None:
@@ -300,7 +298,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
         """Creates OTXAnomalyModelExporter object that can export anomaly models."""
         min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist()
         max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist()
-        image_shape, mean_values, scale_values = self._get_values_from_transforms()
+        mean_values, scale_values = self._get_values_from_transforms()
         onnx_export_configuration = {
             "opset_version": 14,
             "dynamic_axes": {"input": {0: "batch_size"}, "output": {0: "batch_size"}},
@@ -308,7 +306,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
             "output_names": ["output"],
         }
         return OTXAnomalyModelExporter(
-            image_shape=image_shape,
+            image_shape=self.input_size,
             image_threshold=self.image_threshold.value.cpu().numpy().tolist(),
             pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(),
             task=self.task,
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 3a6efb87663..24511ff0585 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -103,23 +103,23 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int] | None = None,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = NullMetricCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__()
 
         self._label_info = self._dispatch_label_info(label_info)
+        self.input_size = input_size
         self.classification_layers: dict[str, dict[str, Any]] = {}
         self.model = self._create_model()
         self._explain_mode = False
         self.optimizer_callable = ensure_callable(optimizer)
         self.scheduler_callable = ensure_callable(scheduler)
         self.metric_callable = ensure_callable(metric)
-        self.input_shape = input_shape
 
         self.torch_compile = torch_compile
         self._explain_mode = False
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index 5613b657ee0..9e1a150c4e8 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -51,19 +51,19 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     @property
@@ -101,22 +101,22 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         config: DictConfig,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
         self.config = config
         self.load_from = config.pop("load_from", None)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -220,7 +220,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=mean,
             std=std,
             resize_mode="standard",
@@ -246,19 +246,19 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     @property
@@ -299,22 +299,22 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         config: DictConfig,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = lambda num_labels: Accuracy(task="multilabel", num_labels=num_labels),
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
         self.config = config
         self.load_from = config.pop("load_from", None)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -420,7 +420,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=mean,
             std=std,
             resize_mode="standard",
@@ -438,19 +438,19 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt
     def __init__(
         self,
         label_info: HLabelInfo,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     @property
@@ -502,11 +502,11 @@ def __init__(
         self,
         label_info: HLabelInfo,
         config: DictConfig,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         config = inplace_num_classes(cfg=config, num_classes=self._dispatch_label_info(label_info).num_classes)
 
@@ -520,11 +520,11 @@ def __init__(
         self.load_from = config.pop("load_from", None)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     def _create_model(self) -> nn.Module:
@@ -630,7 +630,7 @@ def _exporter(self) -> OTXModelExporter:
         mean, std = get_mean_std_from_data_processing(self.config)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_shape,
+            input_size=self.input_size,
             mean=mean,
             std=std,
             resize_mode="standard",
diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py
index 26fa27ece63..eca90192a60 100644
--- a/src/otx/core/model/detection.py
+++ b/src/otx/core/model/detection.py
@@ -371,23 +371,23 @@ class ExplainableOTXDetModel(OTXDetectionModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        input_shape: Sequence[int] | None = None,
     ) -> None:
         from otx.algo.explain.explain_algo import feature_vector_fn
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
             tile_config=tile_config,
-            input_shape=input_shape,
         )
         self.model.feature_vector_fn = feature_vector_fn
         self.model.explain_fn = self.get_explain_fn()
diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py
index 9a838ceae0b..07e744c3ddb 100644
--- a/src/otx/core/model/instance_segmentation.py
+++ b/src/otx/core/model/instance_segmentation.py
@@ -8,7 +8,7 @@
 import logging as log
 import types
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence
 
 import numpy as np
 import torch
@@ -53,6 +53,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -61,6 +62,7 @@ def __init__(
     ) -> None:
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -368,6 +370,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -376,6 +379,7 @@ def __init__(
     ) -> None:
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index a5777960279..89cdae1f215 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -40,16 +40,17 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
         torch_compile: bool = False,
-        input_shape: Sequence[int] | None = None,
     ):
         """Base semantic segmentation model.
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
+            input_size (Sequence[int]): The input shape of the model.
             optimizer (OptimizerCallable, optional): The optimizer to use for training.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -58,15 +59,14 @@ def __init__(
                 Defaults to SegmCallable.
             torch_compile (bool, optional): Whether to compile the model using TorchScript.
                 Defaults to False.
-            input_shape (Sequence[int] | None, optional): The input shape of the model. Defaults to None.
         """
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_shape=input_shape,
         )
 
     @property
@@ -115,6 +115,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -129,6 +130,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
+            input_size (Sequence[int]): The input shape of the model.
             optimizer (OptimizerCallable, optional): The optimizer callable for the model.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -151,13 +153,13 @@ def __init__(
         self.decode_head_configuration = decode_head_configuration if decode_head_configuration is not None else {}
         export_image_configuration = export_image_configuration if export_image_configuration is not None else {}
         self.criterion_configuration = criterion_configuration
-        self.image_size = tuple(export_image_configuration.get("image_size", (1, 3, 512, 512)))
         self.mean = export_image_configuration.get("mean", [123.675, 116.28, 103.53])
         self.scale = export_image_configuration.get("std", [58.395, 57.12, 57.375])
         self.name_base_model = name_base_model
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -198,7 +200,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.scale,
             resize_mode="standard",
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index 7a4fa917993..749ec5ce0db 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -10,7 +10,7 @@
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 import numpy as np
 import torch
@@ -155,6 +155,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro
 
     def __init__(
         self,
+        input_size: Sequence[int],
         label_info: LabelInfoTypes = NullLabelInfo(),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -165,6 +166,7 @@ def __init__(
         log.debug(msg)
         super().__init__(
             label_info=NullLabelInfo(),
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -262,6 +264,7 @@ class OTXZeroShotVisualPromptingModel(
 
     def __init__(
         self,
+        input_size: Sequence[int],
         label_info: LabelInfoTypes = NullLabelInfo(),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -272,6 +275,7 @@ def __init__(
         log.debug(msg)
         super().__init__(
             label_info=NullLabelInfo(),
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -283,7 +287,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXVisualPromptingModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, self.model.image_size, self.model.image_size),
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="fit_to_window",
diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py
index 21df8d94555..6b4ff83218f 100644
--- a/src/otx/core/types/label.py
+++ b/src/otx/core/types/label.py
@@ -7,7 +7,7 @@
 
 import json
 from dataclasses import asdict, dataclass
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from datumaro import Label, LabelCategories
diff --git a/src/otx/recipe/semantic_segmentation/dino_v2.yaml b/src/otx/recipe/semantic_segmentation/dino_v2.yaml
index b62e173e74c..984e858860d 100644
--- a/src/otx/recipe/semantic_segmentation/dino_v2.yaml
+++ b/src/otx/recipe/semantic_segmentation/dino_v2.yaml
@@ -17,13 +17,6 @@ model:
           - 0.999
         weight_decay: 0.0001
 
-    export_image_configuration:
-      image_size:
-        - 1
-        - 3
-        - 560
-        - 560
-
     scheduler:
       class_path: torch.optim.lr_scheduler.PolynomialLR
       init_args:

From b6a76854b29a4a3075f2d1a69eef929ba5764386 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 31 Jul 2024 16:38:06 +0900
Subject: [PATCH 03/42] check input size constant value

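Restore the fixed per-version in_size table in the EfficientNet backbone
and instead pass the configured input size only where it is consumed:
SAMImageEncoder now forwards extra kwargs (e.g. img_size) to its
TinyViT/ViT backbones, and the visual prompting exporter derives its
dummy prompt coordinates from the model's input size rather than a
hardcoded 1024.

A minimal sketch of the exporter behavior (the (N, C, H, W) tuple below
is an assumed example value, not a new API):

    import torch

    input_size = (1, 3, 1024, 1024)
    # dummy prompt coordinates must stay inside the encoder input plane
    point_coords = torch.randint(
        low=0,
        high=input_size[-1],
        size=(1, 2, 2),
        dtype=torch.float32,
    )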
---
 .../classification/backbones/efficientnet.py  | 20 +++++++++----------
 src/otx/algo/classification/efficientnet.py   |  6 +++---
 .../encoders/sam_image_encoder.py             |  4 ++--
 .../algo/visual_prompting/segment_anything.py |  2 +-
 src/otx/core/exporter/visual_prompting.py     |  2 +-
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py
index e69d0b2320b..9682dda3ce4 100644
--- a/src/otx/algo/classification/backbones/efficientnet.py
+++ b/src/otx/algo/classification/backbones/efficientnet.py
@@ -569,43 +569,43 @@ class OTXEfficientNet(EfficientNet):
         in_size : tuple of two ints. Spatial size of the expected input image.
     """
 
-    def __init__(self, version: EFFICIENTNET_VERSION, in_size: tuple[int, int] | None = None, **kwargs):
+    def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
         self.model_name = "efficientnet_" + version
 
         if version == "b0":
-            in_size = in_size or (224, 224)
+            in_size = (224, 224)
             depth_factor = 1.0
             width_factor = 1.0
         elif version == "b1":
-            in_size = in_size or (240, 240)
+            in_size = (240, 240)
             depth_factor = 1.1
             width_factor = 1.0
         elif version == "b2":
-            in_size = in_size or (260, 260)
+            in_size = (260, 260)
             depth_factor = 1.2
             width_factor = 1.1
         elif version == "b3":
-            in_size = in_size or (300, 300)
+            in_size = (300, 300)
             depth_factor = 1.4
             width_factor = 1.2
         elif version == "b4":
-            in_size = in_size or (380, 380)
+            in_size = (380, 380)
             depth_factor = 1.8
             width_factor = 1.4
         elif version == "b5":
-            in_size = in_size or (456, 456)
+            in_size = (456, 456)
             depth_factor = 2.2
             width_factor = 1.6
         elif version == "b6":
-            in_size = in_size or (528, 528)
+            in_size = (528, 528)
             depth_factor = 2.6
             width_factor = 1.8
         elif version == "b7":
-            in_size = in_size or (600, 600)
+            in_size = (600, 600)
             depth_factor = 3.1
             width_factor = 2.0
         elif version == "b8":
-            in_size = in_size or (672, 672)
+            in_size = (672, 672)
             depth_factor = 3.6
             width_factor = 2.2
         else:
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index 6f41a844673..2371fddd2d1 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -60,18 +60,18 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
         super().__init__(
             label_info=label_info,
+            input_size=(1, 3, 224, 224),
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
-            input_size=input_size,
         )
+        self.input_size = (1, 3, *self.model.backbone.in_size)
 
     def _create_model(self) -> nn.Module:
         # Get classification_layers for class-incr learning
@@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
+            backbone=OTXEfficientNet(version=self.version, pretrained=True),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
diff --git a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
index 6143fd139c0..3feef21aba5 100644
--- a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
+++ b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
@@ -70,11 +70,11 @@ def __new__(cls, backbone: str, *args, **kwargs):  # noqa: ARG003
         if backbone.lower() == "tiny_vit":
             from otx.algo.visual_prompting.backbones.tiny_vit import TinyViT
 
-            return TinyViT(**cls.backbone_configs.get(backbone.lower()))  # type: ignore[arg-type]
+            return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[arg-type]
         elif backbone.lower() in ["vit_b", "vit_l", "vit_h"]:  # noqa: RET505
             from otx.algo.visual_prompting.backbones.vit import ViT
 
-            return ViT(**cls.backbone_configs.get(backbone.lower()))  # type: ignore[arg-type]
+            return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[arg-type]
 
         else:
             error_log = f"{backbone} is not supported for SAMImageEncoder. Set among tiny_vit and vit_b."
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index 93caca5f109..e8095410bb8 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -82,7 +82,7 @@ def __init__(
         self.return_extra_metrics = return_extra_metrics
         self.stability_score_offset = stability_score_offset
 
-        self.image_encoder = SAMImageEncoder(backbone=backbone)
+        self.image_encoder = SAMImageEncoder(backbone=backbone, img_size=image_size)
         self.prompt_encoder = SAMPromptEncoder(
             embed_dim=embed_dim,
             image_embedding_size=(image_embedding_size, image_embedding_size),
diff --git a/src/otx/core/exporter/visual_prompting.py b/src/otx/core/exporter/visual_prompting.py
index 38cdf3fcd25..6b3d3970120 100644
--- a/src/otx/core/exporter/visual_prompting.py
+++ b/src/otx/core/exporter/visual_prompting.py
@@ -175,7 +175,7 @@ def get_onnx_dummy_inputs(
                     model.image_embedding_size,
                     dtype=torch.float32,
                 ),
-                "point_coords": torch.randint(low=0, high=1024, size=(1, 2, 2), dtype=torch.float32),
+                "point_coords": torch.randint(low=0, high=self.input_size[-1], size=(1, 2, 2), dtype=torch.float32),
                 "point_labels": torch.randint(low=0, high=4, size=(1, 2), dtype=torch.float32),
                 "mask_input": torch.randn(
                     1,

From 4c0781f580f0a1a6c4f9b68a8a7707ca0266ecd0 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 5 Aug 2024 14:02:26 +0900
Subject: [PATCH 04/42] update model part

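Thread input_size through the model constructors (replacing the scattered
image_size/input_shape attributes), validate sizes that must be multiples
of 32 for the detection backbones, and derive size-dependent defaults
from it, e.g. the DETR multi-scale training sizes.

A minimal sketch of the multi-scale derivation (values shown for the
default 640x640 input):

    input_size = 640
    # +/-5 steps of 32 px around the base size; the base size is appended
    # twice more so it is sampled more often during training
    multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
    # -> [800, 768, 736, 704, 672, 640, 608, 576, 544, 512, 480, 640, 640]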
---
 src/otx/algo/anomaly/padim.py                 |  8 +--
 src/otx/algo/anomaly/stfpm.py                 |  6 +-
 .../classification/backbones/efficientnet.py  |  5 +-
 src/otx/algo/classification/efficientnet.py   | 11 ++--
 .../algo/classification/efficientnet_v2.py    |  2 +-
 .../algo/classification/huggingface_model.py  | 16 +++++-
 src/otx/algo/classification/vit.py            |  2 +-
 src/otx/algo/detection/atss.py                | 28 ++++++++--
 .../base_models/detection_transformer.py      | 11 ++--
 src/otx/algo/detection/huggingface_model.py   | 10 ++--
 src/otx/algo/detection/rtdetr.py              | 35 +++++++++++-
 src/otx/algo/detection/rtmdet.py              | 30 +++++++++-
 src/otx/algo/detection/ssd.py                 | 29 ++++++++--
 src/otx/algo/detection/yolox.py               | 46 ++++++++++++++--
 .../algo/instance_segmentation/maskrcnn.py    | 55 ++++++++++++++++---
 .../algo/instance_segmentation/maskrcnn_tv.py | 27 ++++++++-
 .../algo/instance_segmentation/rtmdet_inst.py | 22 +++++++-
 src/otx/algo/segmentation/dino_v2_seg.py      | 35 +++++++++++-
 .../algo/segmentation/huggingface_model.py    | 27 +++++++--
 src/otx/algo/segmentation/litehrnet.py        | 31 ++++++++++-
 src/otx/algo/segmentation/segnext.py          | 31 ++++++++++-
 .../visual_prompting/backbones/tiny_vit.py    |  6 +-
 .../algo/visual_prompting/segment_anything.py | 28 ++++++----
 src/otx/cli/cli.py                            |  7 +++
 src/otx/core/data/module.py                   |  3 +-
 src/otx/core/model/action_classification.py   |  4 +-
 src/otx/core/model/anomaly.py                 | 18 +++---
 src/otx/core/model/segmentation.py            |  2 +-
 src/otx/core/model/visual_prompting.py        |  4 +-
 src/otx/engine/engine.py                      | 12 +++-
 src/otx/engine/utils/auto_configurator.py     |  9 ++-
 tests/unit/algo/detection/test_rtmdet.py      |  2 +-
 tests/unit/algo/detection/test_yolox.py       |  4 +-
 .../algo/segmentation/test_dino_v2_seg.py     |  2 +-
 .../test_zero_shot_segment_anything.py        |  2 +-
 35 files changed, 461 insertions(+), 109 deletions(-)

diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py
index 4f5fb0be6a9..201b0230a02 100644
--- a/src/otx/algo/anomaly/padim.py
+++ b/src/otx/algo/anomaly/padim.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Literal, Sequence
+from typing import TYPE_CHECKING, Literal
 
 from anomalib.models.image import Padim as AnomalibPadim
 
@@ -34,7 +34,6 @@ class Padim(OTXAnomaly, OTXModel, AnomalibPadim):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
-        input_size (Sequence[int]): The input shape of the model.
     """
 
     def __init__(
@@ -48,10 +47,9 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
-        input_size: Sequence[int] = (256, 256),
     ) -> None:
-        OTXAnomaly.__init__(self, input_size)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size)
+        OTXAnomaly.__init__(self)
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
         AnomalibPadim.__init__(
             self,
             backbone=backbone,
diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py
index 67963c25444..72dd30e8aa3 100644
--- a/src/otx/algo/anomaly/stfpm.py
+++ b/src/otx/algo/anomaly/stfpm.py
@@ -32,7 +32,6 @@ class Stfpm(OTXAnomaly, OTXModel, AnomalibStfpm):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
-        input_size (Sequence[int]): The input shape of the model.
     """
 
     def __init__(
@@ -44,11 +43,10 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
-        input_size: Sequence[int] = (256, 256),
         **kwargs,
     ) -> None:
-        OTXAnomaly.__init__(self, input_size)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=input_size)
+        OTXAnomaly.__init__(self)
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py
index 9682dda3ce4..9931e2dca95 100644
--- a/src/otx/algo/classification/backbones/efficientnet.py
+++ b/src/otx/algo/classification/backbones/efficientnet.py
@@ -569,7 +569,7 @@ class OTXEfficientNet(EfficientNet):
         in_size : tuple of two ints. Spatial size of the expected input image.
     """
 
-    def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
+    def __init__(self, version: EFFICIENTNET_VERSION, input_size: tuple[int, int] | None = None, **kwargs):
         self.model_name = "efficientnet_" + version
 
         if version == "b0":
@@ -612,6 +612,9 @@ def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
             msg = f"Unsupported EfficientNet version {version}"
             raise ValueError(msg)
 
+        if input_size is not None:
+            in_size = input_size
+
         init_block_channels = 32
         layers = [1, 2, 2, 3, 3, 4, 1]
         downsample = [1, 1, 1, 1, 0, 1, 0]
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index 2371fddd2d1..6e34ee22dab 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -60,12 +60,13 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_size: Sequence[int] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
         super().__init__(
             label_info=label_info,
-            input_size=(1, 3, 224, 224),
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -89,7 +90,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True),
+            backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True),
             neck=GlobalAveragePooling(dim=2),
             head=LinearClsHead(
                 num_classes=num_classes,
@@ -195,7 +196,7 @@ class EfficientNetForMulticlassClsSemiSL(EfficientNetForMulticlassCls):
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return SemiSLClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.image_size[-2:]),
+            backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True),
             neck=GlobalAveragePooling(dim=2),
             head=OTXSemiSLLinearClsHead(
                 num_classes=num_classes,
@@ -307,7 +308,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
+            backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelLinearClsHead(
                 num_classes=num_classes,
@@ -443,7 +444,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXEfficientNet(version=self.version, pretrained=True, in_size=self.input_size[-2:]),
+            backbone=OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=True),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalLinearClsHead(
                 in_channels=1280,
diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py
index acb17d0ecc8..17397d964bc 100644
--- a/src/otx/algo/classification/efficientnet_v2.py
+++ b/src/otx/algo/classification/efficientnet_v2.py
@@ -351,7 +351,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index 56432533dcc..d47bceffea9 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -5,11 +5,13 @@
 
 from __future__ import annotations
 
+import logging
 from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import Tensor, nn
 from transformers import AutoModelForImageClassification
+from transformers.configuration_utils import PretrainedConfig
 
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.classification import (
@@ -31,6 +33,9 @@
     from otx.core.metrics import MetricCallable
 
 
+DEFAULT_INPUT_SIZE = (1, 3, 224, 224)
+logger = logging.getLogger(__name__)
+
 class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
     """HuggingFaceModelForMulticlassCls is a class that represents a Hugging Face model for multiclass classification.
 
@@ -61,7 +66,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: Sequence[int] = DEFAULT_INPUT_SIZE,
     ) -> None:
         self.model_name = model_name_or_path
 
@@ -75,10 +80,19 @@ def __init__(
         )
 
     def _create_model(self) -> nn.Module:
+        model_config, _ = PretrainedConfig.get_config_dict(self.model_name)
+        kwargs = {}
+        if "image_size" in model_config:
+            kwargs["image_size"] = self.input_size[-1]
+        elif self.input_size != DEFAULT_INPUT_SIZE:
+            msg = "There is no 'image_size' argument in the model configuration. There may be unexpected results."
+            logger.warning(msg)
+
         return AutoModelForImageClassification.from_pretrained(
             pretrained_model_name_or_path=self.model_name,
             num_labels=self.label_info.num_classes,
             ignore_mismatched_sizes=True,
+            **kwargs,
         )
 
     def _customize_inputs(self, inputs: MulticlassClsBatchDataEntity) -> dict[str, Any]:
diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py
index 4de1c8ac984..4a93e169ffc 100644
--- a/src/otx/algo/classification/vit.py
+++ b/src/otx/algo/classification/vit.py
@@ -586,7 +586,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index 76884ab383f..65caaab8b0b 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -20,9 +20,17 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.detection import ExplainableOTXDetModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.config.data import TileConfig
 
 if TYPE_CHECKING:
     from typing_extensions import Self
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class ATSS(ExplainableOTXDetModel):
@@ -30,25 +38,35 @@ class ATSS(ExplainableOTXDetModel):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 800, 992),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 800, 992),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py
index a361b278561..0f35e94f3b2 100644
--- a/src/otx/algo/detection/base_models/detection_transformer.py
+++ b/src/otx/algo/detection/base_models/detection_transformer.py
@@ -45,17 +45,18 @@ def __init__(
         optimizer_configuration: list[dict] | None = None,
         multi_scale: list[int] | None = None,
         num_top_queries: int = 300,
+        input_size: int = 640,
     ) -> None:
         """DETR model implementation."""
         super().__init__()
         self.backbone = backbone
         self.decoder = decoder
         self.encoder = encoder
-        self.multi_scale = (
-            multi_scale
-            if multi_scale is not None
-            else [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
-        )
+        if multi_scale is not None:
+            self.multi_scale = multi_scale
+        else:
+            self.multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2  # e.g. 480..800 for 640
+
         self.num_classes = num_classes
         self.num_top_queries = num_top_queries
         self.criterion = (
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index 393e3d5a96f..db4bfda1980 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -5,12 +5,13 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import nn
 from torchvision import tv_tensors
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
+from transformers.configuration_utils import PretrainedConfig
 
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity
@@ -60,6 +62,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=object-detection
         label_info: LabelInfoTypes,
+        input_size: Sequence[int] = (1, 3, 800, 992),  # detection default input size
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -67,18 +70,16 @@ def __init__(
     ) -> None:
         self.model_name = model_name_or_path
         self.load_from = None
-        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
-        if len(input_size := self.image_processor.size.values()) == 1:
-            input_size = (*input_size, *input_size)
 
         super().__init__(
             label_info=label_info,
-            input_size=(1, 3, *input_size),
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return AutoModelForObjectDetection.from_pretrained(
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 623d81c611c..d9c486fe7dc 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -7,7 +7,7 @@
 
 import copy
 import re
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
 from torch import Tensor, nn
@@ -23,6 +23,16 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.detection import ExplainableOTXDetModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.config.data import TileConfig
+
+if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTDETR(ExplainableOTXDetModel):
@@ -34,13 +44,29 @@ class RTDETR(ExplainableOTXDetModel):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 640, 640),
-        **kwargs
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
+        tile_image_size: Sequence[int] = (1, 3, 640, 640),
     ) -> None:
+        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
+            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
+        self.tile_image_size = tile_image_size
 
     def _customize_inputs(
         self,
@@ -244,6 +270,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
+            input_size=self.input_size[-1],
         )
 
 
@@ -287,6 +314,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
+            input_size=self.input_size[-1],
         )
 
 
@@ -336,4 +364,5 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
+            input_size=self.input_size[-1],
         )
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index b1a87088d80..74e1de044b3 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import Sequence
+from typing import TYPE_CHECKING, Sequence
 
 from otx.algo.common.backbones import CSPNeXt
 from otx.algo.common.losses import GIoULoss, QualityFocalLoss
@@ -21,6 +21,16 @@
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.detection import ExplainableOTXDetModel
 from otx.core.types.export import TaskLevelExportParameters
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.config.data import TileConfig
+
+if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMDet(ExplainableOTXDetModel):
@@ -28,13 +38,27 @@ class RTMDet(ExplainableOTXDetModel):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 640, 640),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 640, 640),
-        **kwargs
     ) -> None:
+        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
+            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index 43abeee8c7a..10a7ed5a23f 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -25,11 +25,18 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.detection import ExplainableOTXDetModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.config.data import TileConfig
 
 if TYPE_CHECKING:
     import torch
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
     from otx.core.data.dataset.base import OTXDataset
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 logger = logging.getLogger()
@@ -47,13 +54,23 @@ class SSD(ExplainableOTXDetModel):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 864, 864),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 864, 864),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
@@ -153,7 +170,7 @@ def _get_new_anchors(self, dataset: OTXDataset, anchor_generator: SSDAnchorGener
                 if isinstance(transform, Resize):
                     target_wh = transform.scale
         if target_wh is None:
-            target_wh = (864, 864)
+            target_wh = self.input_size[-2:]
             msg = f"Cannot get target_wh from the dataset. Assign it with the default value: {target_wh}"
             logger.warning(msg)
         group_as = [len(width) for width in anchor_generator.widths]
@@ -276,11 +293,11 @@ def load_state_dict_pre_hook(self, state_dict: dict[str, torch.Tensor], prefix:
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            raise ValueError(self.image_size)
+        if self.input_size is None:
+            raise ValueError(self.input_size)
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index 1796a74899e..b0ff4248cbd 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -21,9 +21,17 @@
 from otx.core.model.detection import ExplainableOTXDetModel
 from otx.core.types.export import OTXExportFormatType
 from otx.core.types.precision import OTXPrecisionType
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.config.data import TileConfig
 
 if TYPE_CHECKING:
     from pathlib import Path
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class YOLOX(ExplainableOTXDetModel):
@@ -31,13 +39,27 @@ class YOLOX(ExplainableOTXDetModel):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 640, 640),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 640, 640),
-        **kwargs
     ) -> None:
+        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
+            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
@@ -129,13 +151,27 @@ class YOLOXTINY(YOLOX):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 416, 416),
-        tile_image_size: Sequence[int] = (1, 3, 416, 416),
-        **kwargs
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
+        tile_image_size: Sequence[int] = (1, 3, 416, 416),
     ) -> None:
+        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
+            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
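Note on the new stride guard in YOLOX/YOLOXTINY: the backbone downsamples by up to 32x, so both spatial dimensions must be divisible by 32. A minimal usage sketch (the label_info value and the failing size are illustrative):

    from otx.algo.detection.yolox import YOLOXTINY

    YOLOXTINY(label_info=3, input_size=(1, 3, 416, 416))  # ok: 416 == 13 * 32
    try:
        YOLOXTINY(label_info=3, input_size=(1, 3, 416, 400))  # 400 % 32 == 16
    except ValueError as err:
        print(err)  # Input size should be a multiple of 32, but got (416, 400) instead.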
diff --git a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
index 360027b1376..4536956b873 100644
--- a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
+++ b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
@@ -548,6 +548,7 @@ def bbox_loss(self, x: tuple[Tensor], sampling_results: list[SamplingResult], ba
 
 class CustomConvFCBBoxHead(Shared2FCBBoxHead, ClassIncrementalMixin):
     """CustomConvFCBBoxHead class for OTX."""
+    # checked
 
     def loss_and_target(
         self,
diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py
index 599367f4ef6..6cdbed940c8 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 
 from torchvision.ops import RoIAlign
 
@@ -24,6 +24,16 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.config.data import TileConfig
+from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
+
+if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.metrics import MetricCallable
 
 
 class MaskRCNN(ExplainableOTXInstanceSegModel):
@@ -78,13 +88,23 @@ class MaskRCNNResNet50(MaskRCNN):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 1024, 1024),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
@@ -260,13 +280,23 @@ class MaskRCNNEfficientNet(MaskRCNN):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 1024, 1024),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
@@ -459,17 +489,26 @@ class MaskRCNNSwinT(MaskRCNN):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 1344, 1344),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
-
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
             "rpn": {
diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
index e5afb877998..dc24fc9933b 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections import OrderedDict
-from typing import Any, Sequence
+from typing import Any, Sequence, TYPE_CHECKING
 
 import torch
 from torch import Tensor, nn
@@ -30,6 +30,16 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.config.data import TileConfig
+from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
+
+if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.metrics import MetricCallable
 
 
 class _TVMaskRCNN(MaskRCNN):
@@ -262,15 +272,26 @@ class TVMaskRCNNR50(TVMaskRCNN):
 
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
+
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 1024, 1024),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py
index 01fe2c10847..682ff46e23b 100644
--- a/src/otx/algo/instance_segmentation/rtmdet_inst.py
+++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py
@@ -20,9 +20,17 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.config.data import TileConfig
+from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
 
 if TYPE_CHECKING:
     from torch import Tensor
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.metrics import MetricCallable
 
 
 class RTMDetInst(ExplainableOTXInstanceSegModel):
@@ -86,13 +94,23 @@ class RTMDetInstTiny(RTMDetInst):
 
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 640, 640),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
+        torch_compile: bool = False,
+        tile_config: TileConfig = TileConfig(enable_tiler=False),
         tile_image_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
     ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            tile_config=tile_config,
         )
         self.tile_image_size = tile_image_size
 
diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py
index 63baccfacc8..c4be747d4e0 100644
--- a/src/otx/algo/segmentation/dino_v2_seg.py
+++ b/src/otx/algo/segmentation/dino_v2_seg.py
@@ -10,12 +10,19 @@
 from otx.algo.segmentation.backbones import DinoVisionTransformer
 from otx.algo.segmentation.heads import FCNHead
 from otx.core.model.segmentation import TorchVisionCompatibleModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.dice import SegmCallable
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
     from torch import nn
     from typing_extensions import Self
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.metrics import MetricCallable
 
 
 class DinoV2Seg(BaseSegmModel):
@@ -45,12 +52,34 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel):
     """DinoV2Seg Model."""
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 560, 560),
-        **kwargs
-    ) -> None:
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
+        torch_compile: bool = False,
+        backbone_configuration: dict[str, Any] | None = None,
+        decode_head_configuration: dict[str, Any] | None = None,
+        criterion_configuration: list[dict[str, Any]] | None = None,
+        export_image_configuration: dict[str, Any] | None = None,
+        name_base_model: str = "semantic_segmentation_model",
+    ) -> None:
+        if input_size[-1] % 14 != 0 or input_size[-2] % 14 != 0:
+            msg = f"Input size should be a multiple of 14, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            backbone_configuration=backbone_configuration,
+            decode_head_configuration=decode_head_configuration,
+            criterion_configuration=criterion_configuration,
+            export_image_configuration=export_image_configuration,
+            name_base_model=name_base_model,
         )
 
     def _create_model(self) -> nn.Module:
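Note: the multiple-of-14 check above reflects DINOv2's ViT patch size; the default 560 input gives an exact 40 x 40 patch grid. A sketch of the arithmetic:

    input_size = (1, 3, 560, 560)
    patch = 14  # DINOv2 ViT patch size
    assert input_size[-2] % patch == 0 and input_size[-1] % patch == 0
    print(input_size[-1] // patch)  # 40 tokens per side, 1600 per image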
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index 693c2219c05..feee127b0f8 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import logging
 from typing import TYPE_CHECKING, Any, Sequence
 
 import torch
@@ -13,6 +14,7 @@
     AutoImageProcessor,
     AutoModelForSemanticSegmentation,
 )
+from transformers.configuration_utils import PretrainedConfig
 
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.segmentation import SegBatchDataEntity, SegBatchPredEntity
@@ -30,6 +32,8 @@
 
     from otx.core.metrics import MetricCallable
 
+logger = logging.getLogger(__name__)
+
 
 class HuggingFaceModelForSegmentation(OTXSegmentationModel):
     """A class representing a Hugging Face model for segmentation.
@@ -61,6 +65,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=image-segmentation
         label_info: LabelInfoTypes,
+        input_size: Sequence[int] = (1, 3, 512, 512),  # semantic segmentation default input size
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -68,24 +73,38 @@ def __init__(
     ) -> None:
         self.model_name = model_name_or_path
         self.load_from = None
-        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
-        if len(input_size := self.image_processor.size.values()) == 1:
-            input_size = (*input_size, *input_size)
 
         super().__init__(
             label_info=label_info,
-            input_size=(1, 3, *input_size),
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
 
     def _create_model(self) -> nn.Module:
+        model_config, _ = PretrainedConfig.get_config_dict(self.model_name)
+        kwargs = {}
+        if "image_size" in model_config:
+            kwargs["image_size"] = self.input_size[-1]
+
+        if (patch_size := model_config.get("patch_sizes")) is not None:
+            if isinstance(patch_size, (list, tuple)):
+                patch_size = patch_size[0]
+            if self.input_size[-2] % patch_size != 0 or self.input_size[-1] % patch_size != 0:
+                msg = (
+                    f"It is recommended to set the input size to a multiple of the patch size ({patch_size}). "
+                    "Otherwise, accuracy may drop or the model may fail to run."
+                )
+                )
+                logger.warning(msg)
+
         return AutoModelForSemanticSegmentation.from_pretrained(
             pretrained_model_name_or_path=self.model_name,
             num_labels=self.label_info.num_classes,
             ignore_mismatched_sizes=True,
+            **kwargs,
         )
 
     def _customize_inputs(self, entity: SegBatchDataEntity) -> dict[str, Any]:
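Note: PretrainedConfig.get_config_dict loads a checkpoint's raw config dict without instantiating the model, which is what the image-size/patch-size handling above consumes. An illustrative sketch (the model name is an example; which keys exist depends on the architecture):

    from transformers.configuration_utils import PretrainedConfig

    config, _ = PretrainedConfig.get_config_dict("nvidia/segformer-b0-finetuned-ade-512-512")
    print(config.get("image_size"))   # None for architectures without a fixed size
    print(config.get("patch_sizes"))  # e.g. [7, 3, 3, 3] for SegFormer-style backbones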
diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py
index 458a0f44ea6..31bf7ae33a2 100644
--- a/src/otx/algo/segmentation/litehrnet.py
+++ b/src/otx/algo/segmentation/litehrnet.py
@@ -15,11 +15,18 @@
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.segmentation import TorchVisionCompatibleModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.dice import SegmCallable
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
     from torch import nn
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.metrics import MetricCallable
 
 
 class LiteHRNetS(BaseSegmModel):
@@ -519,12 +526,30 @@ class OTXLiteHRNet(TorchVisionCompatibleModel):
     """LiteHRNet Model."""
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
-    ) -> None:
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
+        torch_compile: bool = False,
+        backbone_configuration: dict[str, Any] | None = None,
+        decode_head_configuration: dict[str, Any] | None = None,
+        criterion_configuration: list[dict[str, Any]] | None = None,
+        export_image_configuration: dict[str, Any] | None = None,
+        name_base_model: str = "semantic_segmentation_model",
+    ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            backbone_configuration=backbone_configuration,
+            decode_head_configuration=decode_head_configuration,
+            criterion_configuration=criterion_configuration,
+            export_image_configuration=export_image_configuration,
+            name_base_model=name_base_model,
         )
 
     def _create_model(self) -> nn.Module:
diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py
index 72287dda2a3..c18b1cc10c1 100644
--- a/src/otx/algo/segmentation/segnext.py
+++ b/src/otx/algo/segmentation/segnext.py
@@ -10,11 +10,18 @@
 from otx.algo.segmentation.heads import LightHamHead
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
 from otx.core.model.segmentation import TorchVisionCompatibleModel
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.metrics.dice import SegmCallable
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
     from torch import nn
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
+    from otx.core.metrics import MetricCallable
 
 
 class SegNextB(BaseSegmModel):
@@ -109,12 +116,30 @@ class OTXSegNext(TorchVisionCompatibleModel):
     """SegNext Model."""
     def __init__(
         self,
+        label_info: LabelInfoTypes,
         input_size: Sequence[int] = (1, 3, 512, 512),
-        **kwargs
-    ) -> None:
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
+        torch_compile: bool = False,
+        backbone_configuration: dict[str, Any] | None = None,
+        decode_head_configuration: dict[str, Any] | None = None,
+        criterion_configuration: list[dict[str, Any]] | None = None,
+        export_image_configuration: dict[str, Any] | None = None,
+        name_base_model: str = "semantic_segmentation_model",
+    ) -> None:
         super().__init__(
+            label_info=label_info,
             input_size=input_size,
-            **kwargs
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+            backbone_configuration=backbone_configuration,
+            decode_head_configuration=decode_head_configuration,
+            criterion_configuration=criterion_configuration,
+            export_image_configuration=export_image_configuration,
+            name_base_model=name_base_model,
         )
 
     def _create_model(self) -> nn.Module:
diff --git a/src/otx/algo/visual_prompting/backbones/tiny_vit.py b/src/otx/algo/visual_prompting/backbones/tiny_vit.py
index e3b7714683c..ac7f5f824eb 100644
--- a/src/otx/algo/visual_prompting/backbones/tiny_vit.py
+++ b/src/otx/algo/visual_prompting/backbones/tiny_vit.py
@@ -362,7 +362,9 @@ def forward(self, x: Tensor) -> Tensor:
         """Forward."""
         h, w = self.input_resolution
         b, l, c = x.shape  # noqa: E741
-        assert h * w == l, "input feature has wrong size"  # noqa: S101
+        if h * w != l:
+            msg = f"Input feature has wrong size. Expected that h({h}) * w({w}) == l({l})."
+            raise ValueError(msg)
         res_x = x
         if self.window_size == h and self.window_size == w:
             x = self.attn(x)
@@ -634,6 +636,6 @@ def forward(self, x: Tensor) -> Tensor:
             layer = self.layers[i]
             x = layer(x)
         batch, _, channel = x.size()
-        x = x.view(batch, 64, 64, channel)
+        x = x.view(batch, self.img_size // 16, self.img_size // 16, channel)
         x = x.permute(0, 3, 1, 2)
         return self.neck(x)
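Note: the reshape above replaces a hard-coded 64 x 64 token grid with one derived from the image size. TinyViT downsamples by 16x overall, so the original 1024-pixel input still yields the old constant:

    for img_size in (512, 1024):
        print(img_size, "->", img_size // 16)  # 512 -> 32, 1024 -> 64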
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index e8095410bb8..fc4579de5d4 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -139,7 +139,7 @@ def load_checkpoint(
                 if key in state_dict:
                     state_dict.pop(key)
             self.load_state_dict(state_dict)
-        except ValueError as e:
+        except (ValueError, RuntimeError) as e:
             log.info(
                 f"{e}: {load_from} is not desirable format for torch.hub.load_state_dict_from_url. "
                 f"To manually load {load_from}, try to set it to trainer.checkpoint.",
@@ -507,17 +507,17 @@ def __init__(
         return_extra_metrics: bool = False,
         stability_score_offset: float = 1.0,
     ) -> None:
-        super().__init__(
-            label_info=label_info,
-            input_size=input_size,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            metric=metric,
-            torch_compile=torch_compile,
-        )
+        if input_size[-1] != input_size[-2]:
+            msg = f"SAM should use square image, but got {input_size}"
+            raise ValueError(msg)
+        if input_size[-1] % 16 != 0 and input_size[-2] % 16 != 0:
+            msg = f"Input size should be a multiple of 16, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
+
         self.config = {
             "backbone": backbone,
-            "image_size": self.input_size[-1],
+            "image_size": input_size[-1],
+            "image_embedding_size" : input_size[-1] // 16,
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
@@ -527,6 +527,14 @@ def __init__(
             "stability_score_offset": stability_score_offset,
             **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone],
         }
+        super().__init__(
+            label_info=label_info,
+            input_size=input_size,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
 
     def _create_model(self) -> nn.Module:
         """Create a PyTorch model for this class."""
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 983c2fe927b..e0e2813bdab 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -330,6 +330,13 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # For num_classes update, Model and Metric are instantiated separately.
             model_config = self.config[self.subcommand].pop("model")
 
+            input_size = self.config["train"]["engine"].get("input_size")
+            if input_size is not None:
+                if isinstance(input_size, int):
+                    input_size = (input_size, input_size)
+                self.config["train"]["data"]["input_size"] = input_size
+                model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+
             # Instantiate the things that don't need special handling
             self.config_init = self.parser.instantiate_classes(self.config)
             self.workspace = self.get_config_value(self.config_init, "workspace")
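Note: the engine-level input_size carries only the spatial dimensions, so it is spliced into the model's 4-element (N, C, H, W) default. A worked example of the expression above (values illustrative):

    init_size = (1, 3, 640, 640)  # model's init_args["input_size"]
    input_size = 512              # user-provided; an int means a square size
    if isinstance(input_size, int):
        input_size = (input_size, input_size)
    print(tuple(init_size[:-2]) + input_size)  # (1, 3, 512, 512)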
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index 7d1d56ba658..a1a5cdced8a 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -96,7 +96,7 @@ def __init__(
         self.device = device
 
         self.subsets: dict[str, OTXDataset] = {}
-        self.save_hyperparameters()
+        self.save_hyperparameters(ignore=["input_size"])
 
         # TODO (Jaeguk): This is workaround for a bug in Datumaro.
         # These lines should be removed after next datumaro release.
@@ -454,5 +454,6 @@ def __reduce__(self):
                 self.unannotated_items_ratio,
                 self.auto_num_workers,
                 self.device,
+                self.input_size,
             ),
         )
diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py
index 009affae692..583a8f2c99e 100644
--- a/src/otx/core/model/action_classification.py
+++ b/src/otx/core/model/action_classification.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Sequence
 
 import numpy as np
 import torch
@@ -41,6 +41,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: Sequence[int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -50,6 +51,7 @@ def __init__(
         self.std = (255.0, 255.0, 255.0)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py
index 301fb03191c..44edf869b0b 100644
--- a/src/otx/core/model/anomaly.py
+++ b/src/otx/core/model/anomaly.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging as log
-from typing import TYPE_CHECKING, Any, TypeAlias, Sequence
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import torch
 from anomalib import TaskType as AnomalibTaskType
@@ -51,10 +51,10 @@
 class OTXAnomaly:
     """Methods used to make OTX model compatible with the Anomalib model."""
 
-    def __init__(self, input_size: Sequence[int] = (256, 256)) -> None:
+    def __init__(self) -> None:
         self.optimizer: list[OptimizerCallable] | OptimizerCallable = None
         self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None
-        self._input_size: tuple[int, int] = input_size
+        self._input_size: tuple[int, int] = (256, 256)
         self.trainer: Trainer
         self.model: nn.Module
         self.image_threshold: BaseThreshold
@@ -116,13 +116,15 @@ def _get_values_from_transforms(
         self,
     ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]:
         """Get the value requested value from default transforms."""
-        mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
+        image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
         for transform in self.configure_transforms().transforms:  # type: ignore[attr-defined]
             name = transform.__class__.__name__
-            if "Normalize" in name:
+            if "Resize" in name:
+                image_size = tuple(transform.size)  # type: ignore[assignment]
+            elif "Normalize" in name:
                 mean_value = tuple(value * 255 for value in transform.mean)  # type: ignore[assignment]
                 std_value = tuple(value * 255 for value in transform.std)  # type: ignore[assignment]
-        return mean_value, std_value
+        return image_size, mean_value, std_value
 
     @property
     def trainable_model(self) -> str | None:
@@ -298,7 +300,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
         """Creates OTXAnomalyModelExporter object that can export anomaly models."""
         min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist()
         max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist()
-        mean_values, scale_values = self._get_values_from_transforms()
+        image_shape, mean_values, scale_values = self._get_values_from_transforms()
         onnx_export_configuration = {
             "opset_version": 14,
             "dynamic_axes": {"input": {0: "batch_size"}, "output": {0: "batch_size"}},
@@ -306,7 +308,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
             "output_names": ["output"],
         }
         return OTXAnomalyModelExporter(
-            image_shape=self.input_size,
+            image_shape=image_shape,
             image_threshold=self.image_threshold.value.cpu().numpy().tolist(),
             pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(),
             task=self.task,
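Note: the anomaly exporter now derives the export image shape from the configured Resize transform, falling back to (256, 256). A sketch of the parsing behavior with torchvision v2 transforms (values illustrative):

    from torchvision.transforms.v2 import Normalize, Resize

    transforms = [Resize((320, 320)), Normalize(mean=[0.5] * 3, std=[0.5] * 3)]
    image_size = (256, 256)
    for t in transforms:
        if "Resize" in t.__class__.__name__:
            image_size = tuple(t.size)
    print(image_size)  # (320, 320)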
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 89cdae1f215..60b467b82dd 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -145,7 +145,7 @@ def __init__(
             criterion_configuration (list[dict[str, Any]] | None, optional):
                 The configuration for the criterion of the model. Defaults to None.
             export_image_configuration (dict[str, Any] | None, optional):
-                The configuration for the export of the model like mean, scale and image_size. Defaults to None.
+                The configuration for the export of the model like mean and scale. Defaults to None.
             name_base_model (str, optional): The name of the base model used for training.
                 Defaults to "semantic_segmentation_model".
         """
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index 749ec5ce0db..789e2a679ab 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -155,8 +155,8 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro
 
     def __init__(
         self,
-        input_size: Sequence[int],
         label_info: LabelInfoTypes = NullLabelInfo(),
+        input_size: Sequence[int] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -178,7 +178,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXVisualPromptingModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, self.model.image_size, self.model.image_size),
+            input_size=self.input_size,
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="fit_to_window",
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index f9e233359ed..b92b4df69b4 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -10,7 +10,7 @@
 import tempfile
 from contextlib import contextmanager
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal, Sequence
 from warnings import warn
 
 import torch
@@ -119,6 +119,7 @@ def __init__(
         model: OTXModel | str | None = None,
         checkpoint: PathLike | None = None,
         device: DeviceType = DeviceType.auto,
+        input_size: Sequence[int] | int | None = None,
         **kwargs,
     ):
         """Initializes the OTX Engine.
@@ -141,8 +142,17 @@ def __init__(
             data_root=data_root,
             task=datamodule.task if datamodule is not None else task,
             model_name=None if isinstance(model, OTXModel) else model,
+            input_size=input_size,
         )
 
+        if input_size is not None:
+            if isinstance(datamodule, OTXDataModule) and datamodule.input_size != input_size:
+                msg = "Data module is already initialized. Input size will be ignored to data module."
+                logging.warning(msg)
+            if isinstance(model, OTXModel) and model.input_size != input_size:
+                msg = "Model is already initialized. Input size will be ignored to model."
+                logging.warning(msg)
+
         self._datamodule: OTXDataModule | None = (
             datamodule if datamodule is not None else self._auto_configurator.get_datamodule()
         )
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index d544609b1b0..26992720134 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -8,7 +8,7 @@
 import logging
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Sequence
 from warnings import warn
 
 import datumaro
@@ -65,7 +65,7 @@
     ],
     "common_semantic_segmentation_with_subset_dirs": [OTXTaskType.SEMANTIC_SEGMENTATION],
     "kinetics": [OTXTaskType.ACTION_CLASSIFICATION],
-    "mvtec": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION],
+    "mvtec_classification": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION],
 }
 
 OVMODEL_PER_TASK = {
@@ -144,11 +144,13 @@ def __init__(
         data_root: PathLike | None = None,
         task: OTXTaskType | None = None,
         model_name: str | None = None,
+        input_size: Sequence[int] | None = None,
     ) -> None:
         self.data_root = data_root
         self._task = task
         self._config: dict | None = None
         self.model_name: str | None = model_name
+        self.input_size = input_size
 
     @property
     def task(self) -> OTXTaskType:
@@ -227,6 +229,9 @@ def get_datamodule(self) -> OTXDataModule | None:
         _ = data_config.pop("__path__", {})  # Remove __path__ key that for CLI
         _ = data_config.pop("config", {})  # Remove config key that for CLI
 
+        if data_config.get("input_size") is not None and self.input_size is not None:
+            data_config["input_size"] = self.input_size
+
         return OTXDataModule(
             train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config),
             val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config),
diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py
index e5ec628be4a..aec4a299737 100644
--- a/tests/unit/algo/detection/test_rtmdet.py
+++ b/tests/unit/algo/detection/test_rtmdet.py
@@ -18,7 +18,7 @@ def test_init(self) -> None:
         assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXt)
         assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN)
         assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead)
-        assert otx_rtmdet_tiny.image_size == (1, 3, 640, 640)
+        assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640)
         assert otx_rtmdet_tiny.tile_image_size == (1, 3, 640, 640)
 
     def test_exporter(self) -> None:
diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py
index b83d0cae71b..3b8cb90e686 100644
--- a/tests/unit/algo/detection/test_yolox.py
+++ b/tests/unit/algo/detection/test_yolox.py
@@ -18,11 +18,11 @@ def test_init(self) -> None:
         assert isinstance(otx_yolox_l.model.backbone, CSPDarknet)
         assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN)
         assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead)
-        assert otx_yolox_l.image_size == (1, 3, 640, 640)
+        assert otx_yolox_l.input_size == (1, 3, 640, 640)
         assert otx_yolox_l.tile_image_size == (1, 3, 640, 640)
 
         otx_yolox_tiny = YOLOXTINY(label_info=3)
-        assert otx_yolox_tiny.image_size == (1, 3, 416, 416)
+        assert otx_yolox_tiny.input_size == (1, 3, 416, 416)
         assert otx_yolox_tiny.tile_image_size == (1, 3, 416, 416)
 
     def test_exporter(self) -> None:
diff --git a/tests/unit/algo/segmentation/test_dino_v2_seg.py b/tests/unit/algo/segmentation/test_dino_v2_seg.py
index 7e1a8a9224d..259b6f4816b 100644
--- a/tests/unit/algo/segmentation/test_dino_v2_seg.py
+++ b/tests/unit/algo/segmentation/test_dino_v2_seg.py
@@ -10,7 +10,7 @@
 class TestDinoV2Seg:
     @pytest.fixture(scope="class")
     def fxt_dino_v2_seg(self) -> OTXDinoV2Seg:
-        return OTXDinoV2Seg(label_info=10, export_image_configuration={"image_size": (1, 3, 560, 560)})
+        return OTXDinoV2Seg(label_info=10)
 
     def test_dino_v2_seg_init(self, fxt_dino_v2_seg):
         assert isinstance(fxt_dino_v2_seg, OTXDinoV2Seg)
diff --git a/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py b/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py
index b9fe3a5e58e..fb6fe80c5eb 100644
--- a/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py
+++ b/tests/unit/algo/visual_prompting/test_zero_shot_segment_anything.py
@@ -422,7 +422,7 @@ def test_predict_masks(self, mocker, build_zero_shot_segment_anything) -> None:
         )
 
         zero_shot_segment_anything = build_zero_shot_segment_anything()
-        zero_shot_segment_anything.image_size = 6
+        zero_shot_segment_anything.input_size = 6
 
         mask = zero_shot_segment_anything._predict_masks(
             mode="infer",

From a40db406d96c54137054fd3482765c8782a4fc04 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 5 Aug 2024 17:09:02 +0900
Subject: [PATCH 05/42] update interface

---
 .../instance_segmentation/heads/custom_roi_head.py   |  1 -
 src/otx/cli/cli.py                                   | 12 +++++-------
 src/otx/core/data/module.py                          |  5 +++++
 src/otx/engine/engine.py                             | 11 +----------
 src/otx/engine/utils/auto_configurator.py            |  9 +++++++--
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
index 4536956b873..360027b1376 100644
--- a/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
+++ b/src/otx/algo/instance_segmentation/heads/custom_roi_head.py
@@ -548,7 +548,6 @@ def bbox_loss(self, x: tuple[Tensor], sampling_results: list[SamplingResult], ba
 
 class CustomConvFCBBoxHead(Shared2FCBBoxHead, ClassIncrementalMixin):
     """CustomConvFCBBoxHead class for OTX."""
-    # checked
 
     def loss_and_target(
         self,
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 6759d97adbe..fcec67ec2d7 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -331,18 +331,16 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # For num_classes update, Model and Metric are instantiated separately.
             model_config = self.config[self.subcommand].pop("model")
 
-            input_size = self.config["train"]["engine"].get("input_size")
-            if input_size is not None:
-                if isinstance(input_size, int):
-                    input_size = (input_size, input_size)
-                self.config["train"]["data"]["input_size"] = input_size
-                model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
-
             # Instantiate the things that don't need special handling
             self.config_init = self.parser.instantiate_classes(self.config)
             self.workspace = self.get_config_value(self.config_init, "workspace")
             self.datamodule = self.get_config_value(self.config_init, "data")
 
+            if (input_size := self.datamodule.input_size) is not None:
+                if isinstance(input_size, int):
+                    input_size = (input_size, input_size)
+                model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+
             # Instantiate the model and needed components
             self.model = self.instantiate_model(model_config=model_config)
 
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index a1a5cdced8a..d371bb5320d 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -63,6 +63,7 @@ def __init__(
         auto_num_workers: bool = False,
         device: DeviceType = DeviceType.auto,
         input_size: int | tuple[int, int] | None = None,
+        adaptive_input_size: bool = False,
     ) -> None:
         """Constructor."""
         super().__init__()
@@ -70,10 +71,14 @@ def __init__(
         self.data_format = data_format
         self.data_root = data_root
 
+        if adaptive_input_size:
+            print("adaptive_input_size works")
+
         if input_size is not None:
             for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
                 if subset_cfg.input_size is None:
                     subset_cfg.input_size = input_size
+        self.input_size = input_size
 
         self.train_subset = train_subset
         self.val_subset = val_subset
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index edd0d6c063c..ee5ff4dce35 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -122,7 +122,6 @@ def __init__(
         checkpoint: PathLike | None = None,
         device: DeviceType = DeviceType.auto,
         num_devices: int = 1,
-        input_size: Sequence[int] | int | None = None,
         **kwargs,
     ):
         """Initializes the OTX Engine.
@@ -147,17 +146,8 @@ def __init__(
             data_root=data_root,
             task=datamodule.task if datamodule is not None else task,
             model_name=None if isinstance(model, OTXModel) else model,
-            input_size=input_size,
         )
 
-        if input_size is not None:
-            if isinstance(datamodule, OTXDataModule) and datamodule.input_size != input_size:
-                msg = "Data module is already initialized. Input size will be ignored to data module."
-                logging.warning(msg)
-            if isinstance(model, OTXModel) and model.input_size != input_size:
-                msg = "Model is already initialized. Input size will be ignored to model."
-                logging.warning(msg)
-
         self._datamodule: OTXDataModule | None = (
             datamodule if datamodule is not None else self._auto_configurator.get_datamodule()
         )
@@ -169,6 +159,7 @@ def __init__(
             if isinstance(model, OTXModel)
             else self._auto_configurator.get_model(
                 label_info=self._datamodule.label_info if self._datamodule is not None else None,
+                input_size=self._datamodule.input_size,
             )
         )
 
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 26992720134..65e8f8bf2e7 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -65,7 +65,7 @@
     ],
     "common_semantic_segmentation_with_subset_dirs": [OTXTaskType.SEMANTIC_SEGMENTATION],
     "kinetics": [OTXTaskType.ACTION_CLASSIFICATION],
-    "mvtec_classification": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION],
+    "mvtec": [OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION],
 }
 
 OVMODEL_PER_TASK = {
@@ -245,7 +245,12 @@ def get_datamodule(self) -> OTXDataModule | None:
             **data_config,
         )
 
-    def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | None = None) -> OTXModel:
+    def get_model(
+        self,
+        model_name: str | None = None,
+        label_info: LabelInfoTypes | None = None,
+        input_size: Sequence[int] | None = None,
+    ) -> OTXModel:
         """Retrieves the OTXModel instance based on the provided model name and meta information.
 
         Args:
@@ -278,6 +278,11 @@ def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes |
 
         model_config = deepcopy(self.config["model"])
 
+        if input_size is not None:
+            if isinstance(input_size, int):
+                input_size = (input_size, input_size)
+            model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+
         model_cls = get_model_cls_from_config(Namespace(model_config))
 
         if should_pass_label_info(model_cls):

From ac506ef97ea61b2a9f201905f79781bf40926aa2 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 7 Aug 2024 13:35:47 +0900
Subject: [PATCH 06/42] implement adaptive input size draft version

---
 src/otx/algo/detection/atss.py                |   2 -
 .../base_models/detection_transformer.py      |   2 +-
 src/otx/algo/detection/rtdetr.py              |   2 -
 src/otx/algo/detection/rtmdet.py              |   2 -
 src/otx/algo/detection/ssd.py                 |   2 -
 src/otx/algo/detection/yolox.py               |   7 +-
 .../algo/instance_segmentation/maskrcnn.py    |  10 +-
 .../algo/instance_segmentation/maskrcnn_tv.py |   6 +-
 .../algo/instance_segmentation/rtmdet_inst.py |   2 -
 src/otx/cli/cli.py                            |   6 -
 src/otx/core/data/module.py                   |  22 +-
 src/otx/core/data/tile_adaptor.py             | 183 -------------
 src/otx/core/data/utils/__init__.py           |   8 +
 src/otx/core/data/utils/utils.py              | 253 ++++++++++++++++++
 src/otx/engine/engine.py                      |   2 +-
 src/otx/engine/utils/auto_configurator.py     |   5 -
 src/otx/recipe/detection/yolox_tiny.yaml      |   6 +-
 tests/unit/algo/detection/test_rtmdet.py      |   1 -
 tests/unit/algo/detection/test_yolox.py       |   2 -
 19 files changed, 279 insertions(+), 244 deletions(-)
 delete mode 100644 src/otx/core/data/tile_adaptor.py
 create mode 100644 src/otx/core/data/utils/__init__.py
 create mode 100644 src/otx/core/data/utils/utils.py

diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index e1d5c5842eb..dd5412ac072 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -45,7 +45,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 800, 992),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -56,7 +55,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     @property
     def _exporter(self) -> OTXModelExporter:
diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py
index 0f35e94f3b2..18ca34c33de 100644
--- a/src/otx/algo/detection/base_models/detection_transformer.py
+++ b/src/otx/algo/detection/base_models/detection_transformer.py
@@ -55,7 +55,7 @@ def __init__(
         if multi_scale is not None:
             self.multi_scale = multi_scale
         else:
-            self.multi_scale = [input_size -i * 64 for i in range(-5, 6)] + [input_size] * 2
+            self.multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
 
         self.num_classes = num_classes
         self.num_top_queries = num_top_queries
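Note: with the default 640 base input, changing the step from 64 to 32 tightens the sampled multi-scale range from 320-960 to 480-800 while keeping 32-pixel alignment. Worked out:

    input_size = 640
    multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
    print(sorted(set(multi_scale)))  # [480, 512, ..., 800] in 32-pixel steps
    # 640 appears three times, biasing random scale picks toward the native size.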
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 381d77ca83a..4aeb7bed3b2 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -51,7 +51,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 640, 640),
     ) -> None:
         if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
             msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
@@ -66,7 +65,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _customize_inputs(
         self,
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index 74c3686cf77..3f8e7b5c34c 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -45,7 +45,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 640, 640),
     ) -> None:
         if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
             msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
@@ -60,7 +59,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     @property
     def _exporter(self) -> OTXModelExporter:
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index 224f69b59ef..910baae419d 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -61,7 +61,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 864, 864),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -72,7 +71,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg = {
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index 520eaac645a..8341c6419f6 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -46,7 +46,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 640, 640),
     ) -> None:
         if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
             msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
@@ -61,7 +60,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _customize_inputs(
         self,
@@ -79,11 +77,10 @@ def _exporter(self) -> OTXModelExporter:
             raise ValueError(msg)
 
         swap_rgb = not isinstance(self, YOLOXTINY)  # only YOLOX-TINY uses RGB
-        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=input_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -160,7 +157,6 @@ def __init__(
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 416, 416),
     ) -> None:
         if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
             msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
@@ -175,7 +171,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)}
diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py
index 6b8c7b1eaf8..b1af171124a 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn.py
@@ -46,11 +46,9 @@ def _exporter(self) -> OTXModelExporter:
             msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
-        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size
-
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=input_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window",
@@ -96,7 +94,6 @@ def __init__(
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 512, 512),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -107,7 +104,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
@@ -288,7 +284,6 @@ def __init__(
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 512, 512),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -299,7 +294,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
@@ -497,7 +491,6 @@ def __init__(
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 512, 512),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -508,7 +501,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> TwoStageDetector:
         train_cfg = {
diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
index 54bb5d1fbc0..dcfe9c3c3c8 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
@@ -232,11 +232,9 @@ def _exporter(self) -> OTXModelExporter:
             msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
-        input_size = self.tile_image_size if self.tile_config.enable_tiler else self.input_size
-
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=input_size,
+            input_size=self.input_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window",
@@ -283,7 +281,6 @@ def __init__(
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 512, 512),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -294,7 +291,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _create_model(self) -> nn.Module:
         """From torchvision tutorial."""
diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py
index bd39dc05210..96801665710 100644
--- a/src/otx/algo/instance_segmentation/rtmdet_inst.py
+++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py
@@ -102,7 +102,6 @@ def __init__(
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
-        tile_image_size: Sequence[int] = (1, 3, 512, 512),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -113,7 +112,6 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.tile_image_size = tile_image_size
 
     def _build_model(self, num_classes: int) -> SingleStageDetector:
         train_cfg = {
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index fcec67ec2d7..c0a66c184dd 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -413,12 +413,6 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel:
         model: OTXModel = model_parser.instantiate_classes(Namespace(model=model_config)).get("model")
         self.config_init[self.subcommand]["model"] = model
 
-        # Update tile config due to adaptive tiling
-        if model.tile_config.enable_tiler:
-            # TODO(Eugene): Ticket no. 139000: Need to find a better way to configure image size for OV Models
-            # https://github.com/openvinotoolkit/training_extensions/pull/2925
-            model.input_size = model.tile_image_size
-
         # Update self.config with model
         self.config[self.subcommand].update(Namespace(model=model_config))
 
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index d371bb5320d..8268a18dd4f 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -7,7 +7,7 @@
 
 import logging as log
 from copy import deepcopy
-from typing import TYPE_CHECKING, Iterable
+from typing import TYPE_CHECKING, Iterable, Literal
 
 import torch
 from datumaro import Dataset as DmDataset
@@ -24,13 +24,13 @@
     parse_mem_cache_size_to_int,
 )
 from otx.core.data.pre_filtering import pre_filtering
-from otx.core.data.tile_adaptor import adapt_tile_config
 from otx.core.types.device import DeviceType
 from otx.core.types.image import ImageColorChannel
 from otx.core.types.label import LabelInfo
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.instantiators import instantiate_sampler
 from otx.core.utils.utils import get_adaptive_num_workers
+from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config
 
 if TYPE_CHECKING:
     from lightning.pytorch.utilities.parsing import AttributeDict
@@ -63,7 +63,7 @@ def __init__(
         auto_num_workers: bool = False,
         device: DeviceType = DeviceType.auto,
         input_size: int | tuple[int, int] | None = None,
-        adaptive_input_size: bool = False,
+        adaptive_input_size: Literal["auto", "downscale", "none"] = "none",
     ) -> None:
         """Constructor."""
         super().__init__()
@@ -71,15 +71,6 @@ def __init__(
         self.data_format = data_format
         self.data_root = data_root
 
-        if adaptive_input_size:
-            print("adaptive_input_size works")
-
-        if input_size is not None:
-            for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
-                if subset_cfg.input_size is None:
-                    subset_cfg.input_size = input_size
-        self.input_size = input_size
-
         self.train_subset = train_subset
         self.val_subset = val_subset
         self.test_subset = test_subset
@@ -143,6 +134,13 @@ def __init__(
                 subset=self.unlabeled_subset.subset_name,
             )
 
+        if adaptive_input_size != "none":
+            input_size = adapt_input_size_to_dataset(dataset, input_size, adaptive_input_size == "downscale")
+        if input_size is not None:
+            for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
+                subset_cfg.input_size = input_size
+        self.input_size = input_size
+
         if self.tile_config.enable_tiler and self.tile_config.enable_adaptive_tiling:
             adapt_tile_config(self.tile_config, dataset=dataset)
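
A minimal runnable sketch of the resolution order implemented above, for reference: adaptive sizing (when requested) overrides the explicit input_size, and the result is then propagated to every subset config. `_SubsetCfg` and the hard-coded `(512, 512)` result are hypothetical stand-ins for SubsetConfig and the adapt_input_size_to_dataset call, not the real API.

    from dataclasses import dataclass

    @dataclass
    class _SubsetCfg:  # hypothetical stand-in for otx.core.config.data.SubsetConfig
        input_size: tuple[int, int] | None = None

    def resolve_input_size(adaptive_input_size, input_size, subsets):
        if adaptive_input_size != "none":
            input_size = (512, 512)  # placeholder for adapt_input_size_to_dataset(...)
        if input_size is not None:
            for cfg in subsets:  # propagate one resolved size to all subsets
                cfg.input_size = input_size
        return input_size

    subsets = [_SubsetCfg(), _SubsetCfg()]
    print(resolve_input_size("downscale", (640, 640), subsets))  # (512, 512), copied into each subset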
 
diff --git a/src/otx/core/data/tile_adaptor.py b/src/otx/core/data/tile_adaptor.py
deleted file mode 100644
index 34755dd55eb..00000000000
--- a/src/otx/core/data/tile_adaptor.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-"""Tile Adaptor for OTX."""
-from __future__ import annotations
-
-import logging as log
-from typing import Any
-
-import numpy as np
-from datumaro import Bbox, Dataset, DatasetSubset, Polygon
-
-from otx.core.config.data import TileConfig
-
-
-def compute_robust_statistics(values: np.array) -> dict[str, float]:
-    """Computes robust statistics of given samples.
-
-    Args:
-        values (np.array): Array of samples
-
-    Returns:
-        dict[str, float]: Robust avg, min, max values
-    """
-    stat: dict = {}
-    if values.size == 0:
-        return stat
-
-    avg_value = np.mean(values)
-    std_value = np.std(values)
-    avg_3std_min_value = avg_value - 3 * std_value
-    avg_3std_max_value = avg_value + 3 * std_value
-    min_value = np.min(values)
-    max_value = np.max(values)
-
-    # Refine min/max to reduce outlier effect
-    robust_min_value = max(min_value, avg_3std_min_value)
-    robust_max_value = min(max_value, avg_3std_max_value)
-
-    stat["avg"] = float(avg_value)
-    stat["std"] = float(std_value)
-    stat["min"] = float(min_value)
-    stat["max"] = float(max_value)
-    stat["robust_min"] = float(robust_min_value)
-    stat["robust_max"] = float(robust_max_value)
-    return stat
-
-
-def compute_robust_scale_statistics(values: np.array) -> dict[str, float]:
-    """Computes robust statistics of scale values.
-
-    Average of 0.5x scale and 2x scale should be 1x
-
-    Args:
-        values (np.array): Array of positive scale values
-
-    Returns:
-        dict[str, float]: Robust avg, min, max values
-    """
-    # Compute stat in log scale & convert back to original scale
-    if values.size == 0:
-        return {}
-
-    stat = compute_robust_statistics(np.log(values))
-    stat = {k: float(np.exp(v)) for k, v in stat.items()}
-    # Normal scale std is easier to understand
-    stat["std"] = float(np.std(values))
-    return stat
-
-
-def compute_robust_dataset_statistics(
-    dataset: DatasetSubset,
-    ann_stat: bool = False,
-    max_samples: int = 1000,
-) -> dict[str, Any]:
-    """Computes robust statistics of image & annotation sizes.
-
-    Args:
-        dataset (DatasetSubset): Input dataset.
-        ann_stat (bool, optional): Whether to compute annotation size statistics. Defaults to False.
-        max_samples (int, optional): Maximum number of dataset subsamples to analyze. Defaults to 1000.
-
-    Returns:
-        Dict[str, Any]: Robust avg, min, max values for images, and annotations optionally.
-            ex) stat = {
-                    "image": {"avg": ...},
-                    "annotation": {
-                       "num_per_image": {"avg": ...},
-                       "size_of_shape": {"avg": ...},
-                    }
-                }
-    """
-    stat: dict = {}
-    if len(dataset) == 0 or max_samples <= 0:
-        return stat
-
-    data_ids = [item.id for item in dataset]
-    max_image_samples = min(max_samples, len(dataset))
-    # NOTE: current OTX does not set seed globally
-    rng = np.random.default_rng(42)
-    data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples]
-
-    image_sizes = []
-    for idx in data_ids:
-        data = dataset.get(id=idx, subset=dataset.name)
-        height, width = data.media.size
-        image_sizes.append(np.sqrt(width * height))
-    stat["image"] = compute_robust_scale_statistics(np.array(image_sizes))
-
-    if ann_stat:
-        stat["annotation"] = {}
-        num_per_images: list[int] = []
-        size_of_box_shapes: list[float] = []
-        size_of_polygon_shapes: list[float] = []
-        for idx in data_ids:
-            data = dataset.get(id=idx, subset=dataset.name)
-            annotations: dict[str, list] = {"boxes": [], "polygons": []}
-            for ann in data.annotations:
-                if isinstance(ann, Bbox):
-                    annotations["boxes"].append(ann)
-                elif isinstance(ann, Polygon):
-                    annotations["polygons"].append(ann)
-
-            num_per_images.append(max(len(annotations["boxes"]), len(annotations["polygons"])))
-
-            if len(size_of_box_shapes) >= max_samples or len(size_of_polygon_shapes) >= max_samples:
-                continue
-
-            size_of_box_shapes.extend(
-                filter(lambda x: x >= 1, [np.sqrt(anno.get_area()) for anno in annotations["boxes"]]),
-            )
-            size_of_polygon_shapes.extend(
-                filter(lambda x: x >= 1, [np.sqrt(anno.get_area()) for anno in annotations["polygons"]]),
-            )
-
-        stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images))
-        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(
-            np.array(size_of_polygon_shapes) if len(size_of_polygon_shapes) else np.array(size_of_box_shapes),
-        )
-
-    return stat
-
-
-def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
-    """Config tile parameters.
-
-    Adapt based on annotation statistics.
-    i.e. tile size, tile overlap, ratio and max objects per sample
-
-    Args:
-        tile_config (TileConfig): tiling parameters of the model
-        dataset (Dataset): Datumaro dataset including all subsets
-    """
-    if (train_dataset := dataset.subsets().get("train")) is not None:
-        stat = compute_robust_dataset_statistics(train_dataset, ann_stat=True)
-        max_num_objects = round(stat["annotation"]["num_per_image"]["max"])
-        avg_size = stat["annotation"]["size_of_shape"]["avg"]
-        min_size = stat["annotation"]["size_of_shape"]["robust_min"]
-        max_size = stat["annotation"]["size_of_shape"]["robust_max"]
-        log.info(f"----> [stat] scale avg: {avg_size}")
-        log.info(f"----> [stat] scale min: {min_size}")
-        log.info(f"----> [stat] scale max: {max_size}")
-
-        log.info("[Adaptive tiling pararms]")
-        object_tile_ratio = tile_config.object_tile_ratio
-        tile_size = int(avg_size / object_tile_ratio)
-        tile_overlap = max_size / tile_size
-        log.info(f"----> avg_object_size: {avg_size}")
-        log.info(f"----> max_object_size: {max_size}")
-        log.info(f"----> object_tile_ratio: {object_tile_ratio}")
-        log.info(f"----> tile_size: {avg_size} / {object_tile_ratio} = {tile_size}")
-        log.info(f"----> tile_overlap: {max_size} / {tile_size} = {tile_overlap}")
-
-        if tile_overlap >= 0.9:
-            # Use the average object area if the tile overlap is too large to prevent 0 stride.
-            tile_overlap = min(avg_size / tile_size, 0.9)
-            log.info(f"----> (too big) tile_overlap: {avg_size} / {tile_size} = min[{tile_overlap}, 0.9]")
-
-        # TODO(Eugene): how to validate lower/upper_bound? dataclass? pydantic?
-        # https://github.com/openvinotoolkit/training_extensions/pull/2903
-        tile_config.tile_size = (tile_size, tile_size)
-        tile_config.max_num_instances = max_num_objects
-        tile_config.overlap = tile_overlap
diff --git a/src/otx/core/data/utils/__init__.py b/src/otx/core/data/utils/__init__.py
new file mode 100644
index 00000000000..adc0400284e
--- /dev/null
+++ b/src/otx/core/data/utils/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+"""Utility modules for core data modules."""
+
+from .utils import adapt_tile_config, adapt_input_size_to_dataset
+
+__all__ = ["adapt_tile_config", "adapt_input_size_to_dataset"]
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
new file mode 100644
index 00000000000..0d7f3fbfdbd
--- /dev/null
+++ b/src/otx/core/data/utils/utils.py
@@ -0,0 +1,253 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+"""Functions for adaptive input size."""
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+from datumaro.components.annotation import _Shape
+
+if TYPE_CHECKING:
+    from datumaro import Dataset, DatasetSubset
+
+    from otx.core.config.data import TileConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+def compute_robust_statistics(values: np.array) -> dict[str, float]:
+    """Computes robust statistics of given samples.
+
+    Args:
+        values (np.array): Array of samples
+
+    Returns:
+        dict[str, float]: Robust avg, min, max values
+    """
+    stat: dict = {}
+    if values.size == 0:
+        return stat
+
+    avg_value = np.mean(values)
+    std_value = np.std(values)
+    avg_3std_min_value = avg_value - 3 * std_value
+    avg_3std_max_value = avg_value + 3 * std_value
+    min_value = np.min(values)
+    max_value = np.max(values)
+
+    # Refine min/max to reduce outlier effect
+    robust_min_value = max(min_value, avg_3std_min_value)
+    robust_max_value = min(max_value, avg_3std_max_value)
+
+    stat["avg"] = float(avg_value)
+    stat["std"] = float(std_value)
+    stat["min"] = float(min_value)
+    stat["max"] = float(max_value)
+    stat["robust_min"] = float(robust_min_value)
+    stat["robust_max"] = float(robust_max_value)
+    return stat
+
+
+def compute_robust_scale_statistics(values: np.array) -> dict[str, float]:
+    """Computes robust statistics of scale values.
+
+    Average of 0.5x scale and 2x scale should be 1x
+
+    Args:
+        values (np.array): Array of positive scale values
+
+    Returns:
+        dict[str, float]: Robust avg, min, max values
+    """
+    # Compute stat in log scale & convert back to original scale
+    if values.size == 0:
+        return {}
+
+    stat = compute_robust_statistics(np.log(values))
+    stat = {k: float(np.exp(v)) for k, v in stat.items()}
+    # Normal scale std is easier to understand
+    stat["std"] = float(np.std(values))
+    return stat
+
+
+def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int = 1000) -> dict[str, Any]:
+    """Computes robust statistics of image & annotation sizes.
+
+    Args:
+        dataset (DatasetSubset): Input dataset.
+        max_samples (int, optional): Maximum number of dataset subsamples to analyze. Defaults to 1000.
+
+    Returns:
+            dict[str, Any]: Robust avg, min, max values for image and annotation sizes.
+            ex) stat = {
+                    "image": {"avg": ...},
+                    "annotation": {
+                       "num_per_image": {"avg": ...},
+                       "size_of_shape": {"avg": ...},
+                    }
+                }
+    """
+    stat: dict = {}
+    if len(dataset) == 0 or max_samples <= 0:
+        return stat
+
+    data_ids = [item.id for item in dataset]
+    max_image_samples = min(max_samples, len(dataset))
+    rng = np.random.default_rng(42)
+    data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples]
+
+    image_sizes = []
+    for idx in data_ids:
+        data = dataset.get(id=idx, subset=dataset.name)
+        height, width = data.media.size
+        image_sizes.append(np.sqrt(width * height))
+    stat["image"] = compute_robust_scale_statistics(np.array(image_sizes))
+
+    stat["annotation"] = {}
+    num_per_images: list[int] = []
+    size_of_shapes: dict[str, list] = defaultdict(list)
+    for idx in data_ids:
+        data = dataset.get(id=idx, subset=dataset.name)
+        annotations: dict[str, list] = defaultdict(list)
+        for ann in data.annotations:
+            annotations[ann.__class__.__name__].append(ann)
+
+        num_per_images.append(max(len(val) for val in annotations.values()))
+
+        if max(len(val) for val in size_of_shapes.values()) >= max_samples:
+            continue
+
+        for ann_type, anns in annotations.items():
+            size_of_shapes[ann_type].extend(
+                np.sqrt(area) for val in anns if isinstance(val, _Shape) and (area := val.get_area()) >= 1
+            )
+
+    stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images))
+    if "Polygon" in size_of_shapes:
+        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes["Polygon"]))
+    else:
+        max_ann_type = None
+        max_num_ann = 0
+        for ann_type, anns in size_of_shapes.items():
+            if max_num_ann < len(anns):
+                max_ann_type = ann_type
+                max_num_ann = len(anns)
+        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes[max_ann_type]))
+
+    return stat
+
+
+def adapt_input_size_to_dataset(dataset: Dataset, base_input_size = None, downscale_only: bool = True) -> tuple[int, int] | None:
+    """Compute appropriate model input size w.r.t. dataset statistics.
+
+    Args:
+        dataset (Dataset): Datumaro dataset including all subsets.
+        base_input_size (int | tuple[int, int] | None, optional): Model's default input size.
+            Used as the upper bound when downscale_only is True. Defaults to None.
+        downscale_only (bool): Whether to allow only sizes smaller than the default setting. Defaults to True.
+
+    Returns:
+        tuple[int, int] | None: (width, height), or None if the dataset has no train subset.
+    """
+    MIN_RECOGNIZABLE_OBJECT_SIZE = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
+    # meaning NxN input pixels being downscaled to 1x1 on feature map
+    MIN_DETECTION_INPUT_SIZE = 256  # Minimum input size for object detection
+
+    train_dataset = dataset.subsets().get("train")
+    if train_dataset is None:
+        return
+
+    logger.info("Adapting model input size based on dataset stat")
+    stat = compute_robust_dataset_statistics(train_dataset)
+    max_image_size = stat["image"]["robust_max"]
+    min_object_size = None
+    if stat["annotation"]:
+        # Refine using annotation shape size stat
+        # Fit to typical small object size (conservative)
+        # -> "avg" size might be preferrable for efficiency
+        min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
+
+    base_input_size = base_input_size
+    if isinstance(base_input_size, dict):
+        base_input_size = base_input_size.get("train", base_input_size.get("test", None))
+    logger.info(f"-> Current base input size: {base_input_size}")
+
+    if max_image_size <= 0:
+        return base_input_size
+
+    image_size = max_image_size
+    logger.info(f"-> Based on typical large image size: {image_size}")
+
+    # Refine using annotation shape size stat
+    if min_object_size is not None and min_object_size > 0:
+        image_size = round(image_size * MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
+        logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}")
+        if image_size > max_image_size:
+            image_size = max_image_size
+            logger.info(f"-> Restrict to max image size: {image_size}")
+        if image_size < MIN_DETECTION_INPUT_SIZE:
+            image_size = MIN_DETECTION_INPUT_SIZE
+            logger.info(f"-> Based on minimum object detection input size: {image_size}")
+
+    input_size = (round(image_size), round(image_size))
+
+    if downscale_only:
+
+        def area(x):
+            return x[0] * x[1]
+
+        if base_input_size and area(input_size) >= area(base_input_size):
+            logger.info(f"-> Downscale only: {input_size} -> {base_input_size}")
+            return base_input_size
+
+    # Final adapted size
+    logger.info(f"-> Adapted input size: {input_size}")
+    return input_size
+
+
+def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
+    """Config tile parameters.
+
+    Adapt based on annotation statistics.
+    i.e. tile size, tile overlap, ratio and max objects per sample
+
+    Args:
+        tile_config (TileConfig): tiling parameters of the model
+        dataset (Dataset): Datumaro dataset including all subsets
+    """
+    if (train_dataset := dataset.subsets().get("train")) is not None:
+        stat = compute_robust_dataset_statistics(train_dataset)
+        max_num_objects = round(stat["annotation"]["num_per_image"]["max"])
+        avg_size = stat["annotation"]["size_of_shape"]["avg"]
+        min_size = stat["annotation"]["size_of_shape"]["robust_min"]
+        max_size = stat["annotation"]["size_of_shape"]["robust_max"]
+        logger.info(f"----> [stat] scale avg: {avg_size}")
+        logger.info(f"----> [stat] scale min: {min_size}")
+        logger.info(f"----> [stat] scale max: {max_size}")
+
+        logger.info("[Adaptive tiling pararms]")
+        object_tile_ratio = tile_config.object_tile_ratio
+        tile_size = int(avg_size / object_tile_ratio)
+        tile_overlap = max_size / tile_size
+        logger.info(f"----> avg_object_size: {avg_size}")
+        logger.info(f"----> max_object_size: {max_size}")
+        logger.info(f"----> object_tile_ratio: {object_tile_ratio}")
+        logger.info(f"----> tile_size: {avg_size} / {object_tile_ratio} = {tile_size}")
+        logger.info(f"----> tile_overlap: {max_size} / {tile_size} = {tile_overlap}")
+
+        if tile_overlap >= 0.9:
+            # Use the average object size if the tile overlap is too large, to prevent a zero stride.
+            tile_overlap = min(avg_size / tile_size, 0.9)
+            logger.info(f"----> (too big) tile_overlap: min({avg_size} / {tile_size}, 0.9) = {tile_overlap}")
+
+        # TODO(Eugene): how to validate lower/upper_bound? dataclass? pydantic?
+        # https://github.com/openvinotoolkit/training_extensions/pull/2903
+        tile_config.tile_size = (tile_size, tile_size)
+        tile_config.max_num_instances = max_num_objects
+        tile_config.overlap = tile_overlap
\ No newline at end of file
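
A worked example of the two numeric behaviours above, under assumed statistics (the 64 px / 160 px object sizes and the 0.05 ratio are illustrative, not from a real dataset): compute_robust_scale_statistics averages in log space, so 0.5x and 2x scales average to 1x, and adapt_tile_config then derives tile size and overlap from those statistics.

    import numpy as np

    # Log-space averaging: the mean of 0.5x and 2x scales is 1x, not 1.25x.
    values = np.array([0.5, 2.0])
    print(float(np.exp(np.mean(np.log(values)))))  # 1.0

    # Tile arithmetic with assumed stats: avg/max object size and the configured ratio.
    avg_size, max_size, object_tile_ratio = 64.0, 160.0, 0.05
    tile_size = int(avg_size / object_tile_ratio)  # 64 / 0.05 = 1280
    tile_overlap = max_size / tile_size            # 160 / 1280 = 0.125 (< 0.9, so no clamping)
    print(tile_size, tile_overlap)
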
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index ee5ff4dce35..c6a66d455b1 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -12,7 +12,7 @@
 import time
 from contextlib import contextmanager
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator, Literal
 from warnings import warn
 
 import torch
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 65e8f8bf2e7..3a5235e1f9f 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -144,13 +144,11 @@ def __init__(
         data_root: PathLike | None = None,
         task: OTXTaskType | None = None,
         model_name: str | None = None,
-        input_size: Sequence[int] | None = None
     ) -> None:
         self.data_root = data_root
         self._task = task
         self._config: dict | None = None
         self.model_name: str | None = model_name
-        self.input_size = input_size
 
     @property
     def task(self) -> OTXTaskType:
@@ -229,9 +227,6 @@ def get_datamodule(self) -> OTXDataModule | None:
         _ = data_config.pop("__path__", {})  # Remove __path__ key that for CLI
         _ = data_config.pop("config", {})  # Remove config key that for CLI
 
-        if getattr(data_config, "input_size", None) is not None and self.input_size is not None:
-            data_config["input_size"] = self.input_size
-
         return OTXDataModule(
             train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config),
             val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config),
diff --git a/src/otx/recipe/detection/yolox_tiny.yaml b/src/otx/recipe/detection/yolox_tiny.yaml
index 9950a427274..bdeee86606c 100644
--- a/src/otx/recipe/detection/yolox_tiny.yaml
+++ b/src/otx/recipe/detection/yolox_tiny.yaml
@@ -37,10 +37,10 @@ overrides:
 
   gradient_clip_val: 35.0
   data:
-    input_size:
-      - 640
-      - 640
     train_subset:
+      input_size:
+        - 640
+        - 640
       batch_size: 8
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.CachedMosaic
diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py
index aec4a299737..9344687894c 100644
--- a/tests/unit/algo/detection/test_rtmdet.py
+++ b/tests/unit/algo/detection/test_rtmdet.py
@@ -19,7 +19,6 @@ def test_init(self) -> None:
         assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN)
         assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead)
         assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640)
-        assert otx_rtmdet_tiny.tile_image_size == (1, 3, 640, 640)
 
     def test_exporter(self) -> None:
         otx_rtmdet_tiny = RTMDetTiny(label_info=3)
diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py
index 3b8cb90e686..29ffdd8172e 100644
--- a/tests/unit/algo/detection/test_yolox.py
+++ b/tests/unit/algo/detection/test_yolox.py
@@ -19,11 +19,9 @@ def test_init(self) -> None:
         assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN)
         assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead)
         assert otx_yolox_l.input_size == (1, 3, 640, 640)
-        assert otx_yolox_l.tile_image_size == (1, 3, 640, 640)
 
         otx_yolox_tiny = YOLOXTINY(label_info=3)
         assert otx_yolox_tiny.input_size == (1, 3, 416, 416)
-        assert otx_yolox_tiny.tile_image_size == (1, 3, 416, 416)
 
     def test_exporter(self) -> None:
         otx_yolox_l = YOLOXL(label_info=3)

From b1121f06687d057d776c479361c493c8346da115 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 7 Aug 2024 13:51:07 +0900
Subject: [PATCH 07/42] handle edge case

---
 src/otx/core/data/utils/utils.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 0d7f3fbfdbd..05178ffd159 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
-"""Functions for adaptive input size."""
+"""Utility functions for the data module."""
 from __future__ import annotations
 
 import logging
@@ -118,9 +118,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
         for ann in data.annotations:
             annotations[ann.__class__.__name__].append(ann)
 
-        num_per_images.append(max(len(val) for val in annotations.values()))
+        num_per_images.append(max(len(val) for val in annotations.values()) if annotations else 0)
 
-        if max(len(val) for val in size_of_shapes.values()) >= max_samples:
+        if size_of_shapes and max(len(val) for val in size_of_shapes.values()) >= max_samples:
             continue
 
         for ann_type, anns in annotations.items():
@@ -143,7 +143,11 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     return stat
 
 
-def adapt_input_size_to_dataset(dataset: Dataset, base_input_size = None, downscale_only: bool = True) -> tuple[int, int] | None:
+def adapt_input_size_to_dataset(
+    dataset: Dataset,
+    base_input_size: int | None = None,
+    downscale_only: bool = True
+) -> tuple[int, int] | None:
     """Compute appropriate model input size w.r.t. dataset statistics.
 
     Args:
@@ -250,4 +254,4 @@ def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
         # https://github.com/openvinotoolkit/training_extensions/pull/2903
         tile_config.tile_size = (tile_size, tile_size)
         tile_config.max_num_instances = max_num_objects
-        tile_config.overlap = tile_overlap
\ No newline at end of file
+        tile_config.overlap = tile_overlap
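
The edge case fixed above, reduced to a snippet: for an image without annotations, `annotations` is an empty dict, so the bare `max()` over its values raised ValueError; the guards now fall back to 0 and skip the sample-count check. A minimal reproduction:

    annotations: dict[str, list] = {}  # image with no annotations

    # Before: max(len(v) for v in annotations.values())  -> ValueError: max() arg is an empty sequence
    num = max(len(v) for v in annotations.values()) if annotations else 0
    print(num)  # 0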

From 659a9cf61651fc4d111e7ee26c2d365762484ce8 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Wed, 7 Aug 2024 21:40:17 +0900
Subject: [PATCH 08/42] add input_size_multiplier and pass it to datamodule in
 cli

---
 src/otx/algo/detection/rtdetr.py               |  5 +----
 src/otx/algo/detection/rtmdet.py               |  6 ++----
 src/otx/algo/detection/yolox.py                | 10 ++--------
 src/otx/algo/segmentation/dino_v2_seg.py       |  7 +++----
 .../algo/visual_prompting/segment_anything.py  |  5 ++---
 src/otx/cli/cli.py                             | 10 +++++++++-
 src/otx/core/data/module.py                    |  8 +++++++-
 src/otx/core/data/utils/utils.py               | 18 +++++++++++++-----
 src/otx/core/model/base.py                     |  9 +++++++++
 src/otx/engine/utils/auto_configurator.py      |  3 +++
 10 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 4aeb7bed3b2..13e5fdcae4e 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -38,6 +38,7 @@
 class RTDETR(ExplainableOTXDetModel):
     """RTDETR model."""
 
+    input_size_multiplier = 32
     mean: tuple[float, float, float] = (0.0, 0.0, 0.0)
     std: tuple[float, float, float] = (255.0, 255.0, 255.0)
     load_from: str | None = None
@@ -52,10 +53,6 @@ def __init__(
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
     ) -> None:
-        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
-            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
-
         super().__init__(
             label_info=label_info,
             input_size=input_size,
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index 3f8e7b5c34c..e5757944759 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -36,6 +36,8 @@
 class RTMDet(ExplainableOTXDetModel):
     """OTX Detection model class for RTMDet."""
 
+    input_size_multiplier = 32
+
     def __init__(
         self,
         label_info: LabelInfoTypes,
@@ -46,10 +48,6 @@ def __init__(
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
     ) -> None:
-        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
-            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
-
         super().__init__(
             label_info=label_info,
             input_size=input_size,
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index 8341c6419f6..800c0e17f5e 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -37,6 +37,8 @@
 class YOLOX(ExplainableOTXDetModel):
     """OTX Detection model class for YOLOX."""
 
+    input_size_multiplier = 32
+
     def __init__(
         self,
         label_info: LabelInfoTypes,
@@ -47,10 +49,6 @@ def __init__(
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
     ) -> None:
-        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
-            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
-
         super().__init__(
             label_info=label_info,
             input_size=input_size,
@@ -158,10 +156,6 @@ def __init__(
         torch_compile: bool = False,
         tile_config: TileConfig = TileConfig(enable_tiler=False),
     ) -> None:
-        if input_size[-1] % 32 != 0 or input_size[-2] % 32 != 0:
-            msg = f"Input size should be a multiple of 32, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
-
         super().__init__(
             label_info=label_info,
             input_size=input_size,
diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py
index c4be747d4e0..c3e455a2910 100644
--- a/src/otx/algo/segmentation/dino_v2_seg.py
+++ b/src/otx/algo/segmentation/dino_v2_seg.py
@@ -50,6 +50,9 @@ class DinoV2Seg(BaseSegmModel):
 
 class OTXDinoV2Seg(TorchVisionCompatibleModel):
     """DinoV2Seg Model."""
+
+    input_size_multiplier = 14
+
     def __init__(
         self,
         label_info: LabelInfoTypes,
@@ -64,10 +67,6 @@ def __init__(
         export_image_configuration: dict[str, Any] | None = None,
         name_base_model: str = "semantic_segmentation_model",
     ):
-        if input_size[-1] % 14 != 0 or input_size[-2] % 14 != 0:
-            msg = f"Input size should be a multiple of 14, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
-
         super().__init__(
             label_info=label_info,
             input_size=input_size,
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index 7e8bf94d65a..a948f9ae648 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -490,6 +490,8 @@ def select_masks(self, masks: Tensor, iou_preds: Tensor, num_points: int) -> tup
 class OTXSegmentAnything(OTXVisualPromptingModel):
     """Visual Prompting model."""
 
+    input_size_multiplier = 16
+
     def __init__(
         self,
         backbone: Literal["tiny_vit", "vit_b"],
@@ -510,9 +512,6 @@ def __init__(
         if input_size[-1] != input_size[-2]:
             msg = f"SAM should use square image, but got {input_size}"
             raise ValueError(msg)
-        if input_size[-1] % 16 != 0 and input_size[-2] % 16 != 0:
-            msg = f"Input size should be a multiple of 16, but got {input_size[-2:]} instead."
-            raise ValueError(msg)
 
         self.config = {
             "backbone": backbone,
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index c0a66c184dd..178673c5ee2 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -23,6 +23,7 @@
 from otx.cli.utils.workspace import Workspace
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.imports import get_otx_root_path
+from otx.utils.utils import get_model_cls_from_config
 
 if TYPE_CHECKING:
     from jsonargparse._actions import _ActionSubCommands
@@ -331,6 +332,11 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # For num_classes update, Model and Metric are instantiated separately.
             model_config = self.config[self.subcommand].pop("model")
 
+            if self.config[self.subcommand].data.adaptive_input_size != "none":
+                model_cls = get_model_cls_from_config(model_config)
+                if hasattr(model_cls, "input_size_multiplier"):
+                    self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier
+
             # Instantiate the things that don't need to special handling
             self.config_init = self.parser.instantiate_classes(self.config)
             self.workspace = self.get_config_value(self.config_init, "workspace")
@@ -339,6 +345,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             if (input_size := self.datamodule.input_size) is not None:
                 if isinstance(input_size, int):
                     input_size = (input_size, input_size)
+                else:
+                    input_size = tuple(input_size)
                 model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
 
             # Instantiate the model and needed components
@@ -374,7 +382,7 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel:
             tuple: The model and optimizer and scheduler.
         """
         from otx.core.model.base import OTXModel
-        from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info
+        from otx.utils.utils import can_pass_tile_config, should_pass_label_info
 
         skip = set()
 
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index 8268a18dd4f..3e1b6f5f133 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -64,6 +64,7 @@ def __init__(
         device: DeviceType = DeviceType.auto,
         input_size: int | tuple[int, int] | None = None,
         adaptive_input_size: Literal["auto", "downscale", "none"] = "none",
+        input_size_multiplier: int = 1,
     ) -> None:
         """Constructor."""
         super().__init__()
@@ -135,7 +136,12 @@ def __init__(
             )
 
         if adaptive_input_size != "none":
-            input_size = adapt_input_size_to_dataset(dataset, input_size, adaptive_input_size == "downscale")
+            input_size = adapt_input_size_to_dataset(
+                dataset,
+                input_size,
+                adaptive_input_size == "downscale",
+                input_size_multiplier,
+            )
         if input_size is not None:
             for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
                 subset_cfg.input_size = input_size
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 05178ffd159..665994fe2ef 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -145,8 +145,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
 
 def adapt_input_size_to_dataset(
     dataset: Dataset,
-    base_input_size: int | None = None,
-    downscale_only: bool = True
+    base_input_size: int | tuple[int, int] | None = None,
+    downscale_only: bool = True,
+    input_size_multiplier: int | None = None,
 ) -> tuple[int, int] | None:
     """Compute appropriate model input size w.r.t. dataset statistics.
 
@@ -163,6 +164,13 @@ def adapt_input_size_to_dataset(
     # meaning NxN input pixels being downscaled to 1x1 on feature map
     MIN_DETECTION_INPUT_SIZE = 256  # Minimum input size for object detection
 
+    if downscale_only and base_input_size is None:
+        msg = "If downscale_only is set to True, base_input_size should be set but got None."
+        raise ValueError(msg)
+
+    if isinstance(base_input_size, int):
+        base_input_size = (base_input_size, base_input_size)
+
     train_dataset = dataset.subsets().get("train")
     if train_dataset is None:
         return
@@ -177,9 +185,6 @@ def adapt_input_size_to_dataset(
         # -> "avg" size might be preferrable for efficiency
         min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
 
-    base_input_size = base_input_size
-    if isinstance(base_input_size, dict):
-        base_input_size = base_input_size.get("train", base_input_size.get("test", None))
     logger.info(f"-> Current base input size: {base_input_size}")
 
     if max_image_size <= 0:
@@ -199,6 +204,9 @@ def adapt_input_size_to_dataset(
             image_size = MIN_DETECTION_INPUT_SIZE
             logger.info(f"-> Based on minimum object detection input size: {image_size}")
 
+    if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
+        image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier
+
     input_size = (round(image_size), round(image_size))
 
     if downscale_only:
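
A worked example of the rounding added above, with illustrative numbers: an adapted size of 700 px is not divisible by a stride multiplier of 32, so it is rounded up to the next multiple.

    image_size, input_size_multiplier = 700, 32
    if image_size % input_size_multiplier != 0:
        image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier
    print(image_size)  # 704 = 22 * 32
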
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 10d7475962b..405116b762e 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -114,6 +114,7 @@ def __init__(
         super().__init__()
 
         self._label_info = self._dispatch_label_info(label_info)
+        self._check_input_size(input_size)
         self.input_size = input_size
         self.classification_layers: dict[str, dict[str, Any]] = {}
         self.model = self._create_model()
@@ -809,6 +810,14 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
         raise TypeError(label_info)
 
+    def _check_input_size(self, input_size: Sequence[int] | None = None) -> None:
+        if (
+            input_size is not None
+            and hasattr(self, "input_size_multiplier")
+            and (input_size[-1] % self.input_size_multiplier != 0 or input_size[-2] % self.input_size_multiplier != 0)
+        ):
+            msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead."
+            raise ValueError(msg)
 
 class OVModel(OTXModel, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEntity]):
     """Base class for the OpenVINO model.
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 3a5235e1f9f..90790d59521 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -22,6 +22,7 @@
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.imports import get_otx_root_path
 from otx.core.utils.instantiators import partial_instantiate_class
+from otx.core.utils.utils import import_object_from_module
 from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info
 
 if TYPE_CHECKING:
@@ -276,6 +277,8 @@ def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes |
         if input_size is not None:
             if isinstance(input_size, int):
                 input_size = (input_size, input_size)
+            else:
+                input_size = tuple(input_size)
             model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
 
         model_cls = get_model_cls_from_config(Namespace(model_config))
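
Why the `tuple(input_size)` conversion above is needed, in a short reproduction: values parsed from YAML or the CLI typically arrive as lists, and concatenating a tuple with a list raises TypeError.

    default = (1, 3, 640, 640)
    parsed = [512, 512]                   # e.g. a list parsed from YAML/CLI
    # default[:-2] + parsed               # TypeError: can only concatenate tuple (not "list") to tuple
    print(default[:-2] + tuple(parsed))   # (1, 3, 512, 512)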

From e32902949f490eac266d8b2a7c8f092f6290511e Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Thu, 8 Aug 2024 15:22:01 +0900
Subject: [PATCH 09/42] change typehint from sequence to tuple

---
 src/otx/algo/action_classification/movinet.py  |  4 ++--
 src/otx/algo/action_classification/x3d.py      |  4 ++--
 src/otx/algo/classification/dino_v2.py         |  4 ++--
 src/otx/algo/classification/efficientnet.py    |  8 ++++----
 src/otx/algo/classification/efficientnet_v2.py |  8 ++++----
 .../algo/classification/huggingface_model.py   |  5 +++--
 src/otx/algo/classification/mobilenet_v3.py    |  8 ++++----
 .../algo/classification/torchvision_model.py   |  4 ++--
 src/otx/algo/classification/vit.py             |  8 ++++----
 src/otx/algo/detection/atss.py                 | 16 ++++++++--------
 .../base_models/detection_transformer.py       |  2 +-
 src/otx/algo/detection/huggingface_model.py    |  7 +++----
 src/otx/algo/detection/rtdetr.py               | 14 +++++++-------
 src/otx/algo/detection/rtmdet.py               | 14 +++++++-------
 src/otx/algo/detection/ssd.py                  | 14 +++++++-------
 src/otx/algo/detection/yolox.py                | 17 +++++++++--------
 src/otx/algo/instance_segmentation/maskrcnn.py | 18 +++++++++---------
 .../algo/instance_segmentation/maskrcnn_tv.py  | 14 +++++++-------
 .../algo/instance_segmentation/rtmdet_inst.py  | 16 ++++++++--------
 src/otx/algo/segmentation/dino_v2_seg.py       | 12 ++++++------
 src/otx/algo/segmentation/huggingface_model.py |  6 +++---
 src/otx/algo/segmentation/litehrnet.py         | 13 +++++++------
 src/otx/algo/segmentation/segnext.py           | 13 +++++++------
 .../algo/visual_prompting/segment_anything.py  |  6 +++---
 src/otx/cli/cli.py                             |  6 ++++--
 src/otx/core/data/module.py                    |  4 ++--
 src/otx/core/data/utils/__init__.py            |  2 +-
 src/otx/core/data/utils/utils.py               |  3 +--
 src/otx/core/model/action_classification.py    |  4 ++--
 src/otx/core/model/base.py                     |  5 +++--
 src/otx/core/model/classification.py           |  8 ++++----
 src/otx/core/model/detection.py                |  4 ++--
 src/otx/core/model/instance_segmentation.py    |  6 +++---
 src/otx/core/model/segmentation.py             |  8 ++++----
 src/otx/core/model/visual_prompting.py         |  4 ++--
 src/otx/engine/engine.py                       | 11 +++++------
 src/otx/engine/utils/auto_configurator.py      | 11 ++++++++---
 37 files changed, 160 insertions(+), 151 deletions(-)

diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py
index 7c5861d2af6..4803aba3d9e 100644
--- a/src/otx/algo/action_classification/movinet.py
+++ b/src/otx/algo/action_classification/movinet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from torch import nn
 
@@ -32,7 +32,7 @@ class MoViNet(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 1, 3, 8, 224, 224),
+        input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py
index 6c26f2deb2f..60b58b6521d 100644
--- a/src/otx/algo/action_classification/x3d.py
+++ b/src/otx/algo/action_classification/x3d.py
@@ -4,7 +4,7 @@
 """X3D model implementation."""
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from torch import nn
 
@@ -31,7 +31,7 @@ class X3D(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 1, 3, 8, 224, 224),
+        input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
diff --git a/src/otx/algo/classification/dino_v2.py b/src/otx/algo/classification/dino_v2.py
index 592d2616a2e..7c2afda0d79 100644
--- a/src/otx/algo/classification/dino_v2.py
+++ b/src/otx/algo/classification/dino_v2.py
@@ -8,7 +8,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 from torch import Tensor, nn
@@ -119,7 +119,7 @@ def __init__(
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
         freeze_backbone: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index d818551c5d2..6ea3aa9a257 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
@@ -60,7 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -278,7 +278,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
@@ -408,7 +408,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.version = version
 
diff --git a/src/otx/algo/classification/efficientnet_v2.py b/src/otx/algo/classification/efficientnet_v2.py
index 4d6aa086748..ea4617e40a2 100644
--- a/src/otx/algo/classification/efficientnet_v2.py
+++ b/src/otx/algo/classification/efficientnet_v2.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
@@ -60,7 +60,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -269,7 +269,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -396,7 +396,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         super().__init__(
             label_info=label_info,
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index d47bceffea9..6de912cdf5d 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
@@ -36,6 +36,7 @@
 DEFAULT_INPUT_SIZE = (1, 2, 224, 224)
 logger = logging.getLogger(__name__)
 
+
 class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
     """HuggingFaceModelForMulticlassCls is a class that represents a Hugging Face model for multiclass classification.
 
@@ -66,7 +67,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = DEFAULT_INPUT_SIZE,
+        input_size: tuple[int, ...] = DEFAULT_INPUT_SIZE,
     ) -> None:
         self.model_name = model_name_or_path
 
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index 6a114aa62bb..b14017de3e8 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 from torch import Tensor, nn
@@ -71,7 +71,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -285,7 +285,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -416,7 +416,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 10e726aacc6..77643513b61 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 from torch import Tensor, nn
@@ -422,7 +422,7 @@ def __init__(
             OTXTaskType.H_LABEL_CLS,
         ] = OTXTaskType.MULTI_CLASS_CLS,
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py
index 446a18c0645..0797b5bd240 100644
--- a/src/otx/algo/classification/vit.py
+++ b/src/otx/algo/classification/vit.py
@@ -7,7 +7,7 @@
 import types
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Generic, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Generic
 from urllib.parse import urlparse
 
 import numpy as np
@@ -229,7 +229,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -468,7 +468,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -617,7 +617,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: Sequence[int] = (1, 3, 224, 224),
+        input_size: tuple[int, ...] = (1, 3, 224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
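
For context, a minimal sketch (not part of the patch; names are illustrative) of what the stricter tuple[int, ...] annotation buys over Sequence[int] across all of these signatures: a Sequence also admits mutable lists, while a tuple default pins down an immutable, hashable shape that round-trips cleanly through config files.

    from typing import Sequence

    def loose(input_size: Sequence[int] = (1, 3, 224, 224)) -> int:
        return input_size[-1]

    def strict(input_size: tuple[int, ...] = (1, 3, 224, 224)) -> int:
        return input_size[-1]

    loose([1, 3, 224, 224])    # a mutable list also satisfies Sequence[int]
    strict((1, 3, 224, 224))   # only a tuple satisfies tuple[int, ...]
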
diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index dd5412ac072..a873055354b 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from otx.algo.common.backbones import ResNeXt, build_model_including_pytorchcv
 from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss
@@ -17,20 +17,20 @@
 from otx.algo.detection.necks import FPN
 from otx.algo.detection.utils.assigners import ATSSAssigner
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
+from otx.core.config.data import TileConfig
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.detection import ExplainableOTXDetModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
-from otx.core.config.data import TileConfig
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.detection import ExplainableOTXDetModel
 
 if TYPE_CHECKING:
-    from typing_extensions import Self
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+    from typing_extensions import Self
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class ATSS(ExplainableOTXDetModel):
@@ -39,7 +39,7 @@ class ATSS(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 800, 992),
+        input_size: tuple[int, ...] = (1, 3, 800, 992),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py
index 18ca34c33de..a3158e19845 100644
--- a/src/otx/algo/detection/base_models/detection_transformer.py
+++ b/src/otx/algo/detection/base_models/detection_transformer.py
@@ -55,7 +55,7 @@ def __init__(
         if multi_scale is not None:
             self.multi_scale = multi_scale
         else:
-            self.multi_scale = [input_size -i * 32 for i in range(-5, 6)] + [input_size] * 2
+            self.multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
 
         self.num_classes = num_classes
         self.num_top_queries = num_top_queries
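
The original spacing parsed the same way (binary minus binds as input_size - (i * 32)), so this is a readability fix. For reference, a worked example (assuming RT-DETR's square default of 640) of the values the expression produces:

    input_size = 640
    multi_scale = [input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2
    # [800, 768, 736, 704, 672, 640, 608, 576, 544, 512, 480, 640, 640]
    # eleven sizes in steps of 32 around the base size, with the base size
    # listed twice more so it is sampled more often during training
    assert all(s % 32 == 0 for s in multi_scale)
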
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index db4bfda1980..140c644ad7f 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -5,15 +5,14 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import nn
 from torchvision import tv_tensors
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
-from transformers.configuration_utils import PretrainedConfig
-# from transformers.image_processing_base import ImageProcessingMixin
 
+# from transformers.image_processing_base import ImageProcessingMixin
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity
 from otx.core.data.entity.utils import stack_batch
@@ -62,7 +61,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=object-detection
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 800, 992),  # detection default input size
+        input_size: tuple[int, ...] = (1, 3, 800, 992),  # detection default input size
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 13e5fdcae4e..43a6267a769 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -7,7 +7,7 @@
 
 import copy
 import re
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
@@ -18,21 +18,21 @@
 from otx.algo.detection.base_models.detection_transformer import DETR
 from otx.algo.detection.heads import RTDETRTransformer
 from otx.algo.detection.necks import HybridEncoder
+from otx.core.config.data import TileConfig
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.detection import ExplainableOTXDetModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
-from otx.core.config.data import TileConfig
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.detection import ExplainableOTXDetModel
 
 if TYPE_CHECKING:
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTDETR(ExplainableOTXDetModel):
@@ -46,7 +46,7 @@ class RTDETR(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 640, 640),
+        input_size: tuple[int, ...] = (1, 3, 640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index e5757944759..b382ff65225 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from otx.algo.common.backbones import CSPNeXt
 from otx.algo.common.losses import GIoULoss, QualityFocalLoss
@@ -17,20 +17,20 @@
 from otx.algo.detection.base_models import SingleStageDetector
 from otx.algo.detection.heads import RTMDetSepBNHead
 from otx.algo.detection.necks import CSPNeXtPAFPN
+from otx.core.config.data import TileConfig
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.model.detection import ExplainableOTXDetModel
 from otx.core.types.export import TaskLevelExportParameters
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
-from otx.core.config.data import TileConfig
 
 if TYPE_CHECKING:
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMDet(ExplainableOTXDetModel):
@@ -41,7 +41,7 @@ class RTMDet(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 640, 640),
+        input_size: tuple[int, ...] = (1, 3, 640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index 910baae419d..e4c095dffa2 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -10,7 +10,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 from datumaro.components.annotation import Bbox
@@ -22,21 +22,21 @@
 from otx.algo.detection.heads import SSDHead
 from otx.algo.detection.utils.prior_generators import SSDAnchorGeneratorClustered
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
+from otx.core.config.data import TileConfig
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.detection import ExplainableOTXDetModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
-from otx.core.config.data import TileConfig
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.detection import ExplainableOTXDetModel
 
 if TYPE_CHECKING:
     import torch
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
     from otx.core.data.dataset.base import OTXDataset
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 logger = logging.getLogger()
@@ -55,7 +55,7 @@ class SSD(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 864, 864),
+        input_size: tuple[int, ...] = (1, 3, 864, 864),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index 800c0e17f5e..ac319e26532 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 from otx.algo.common.losses import CrossEntropyLoss, L1Loss
 from otx.algo.detection.backbones import CSPDarknet
@@ -15,23 +15,24 @@
 from otx.algo.detection.necks import YOLOXPAFPN
 from otx.algo.detection.utils.assigners import SimOTAAssigner
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
+from otx.core.config.data import TileConfig
 from otx.core.data.entity.detection import DetBatchDataEntity
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
+from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.model.detection import ExplainableOTXDetModel
 from otx.core.types.export import OTXExportFormatType
 from otx.core.types.precision import OTXPrecisionType
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable
-from otx.core.config.data import TileConfig
 
 if TYPE_CHECKING:
     from pathlib import Path
+
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class YOLOX(ExplainableOTXDetModel):
@@ -42,7 +43,7 @@ class YOLOX(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 640, 640),
+        input_size: tuple[int, ...] = (1, 3, 640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -149,7 +150,7 @@ class YOLOXTINY(YOLOX):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 416, 416),
+        input_size: tuple[int, ...] = (1, 3, 416, 416),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py
index b1af171124a..1ab96d01bb1 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 from torchvision.ops import RoIAlign
 
@@ -21,19 +21,19 @@
 from otx.algo.instance_segmentation.two_stage import TwoStageDetector
 from otx.algo.instance_segmentation.utils.roi_extractors import SingleRoIExtractor
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
+from otx.core.config.data import TileConfig
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.config.data import TileConfig
 from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
 
 if TYPE_CHECKING:
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class MaskRCNN(ExplainableOTXInstanceSegModel):
@@ -88,7 +88,7 @@ class MaskRCNNResNet50(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -278,7 +278,7 @@ class MaskRCNNEfficientNet(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -485,7 +485,7 @@ class MaskRCNNSwinT(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 1344, 1344),
+        input_size: tuple[int, ...] = (1, 3, 1344, 1344),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
index dcfe9c3c3c8..075e4bcf811 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections import OrderedDict
-from typing import Any, Sequence, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
@@ -24,22 +24,22 @@
 from torchvision.models.resnet import resnet50
 
 from otx.algo.instance_segmentation.heads import TVRoIHeads
+from otx.core.config.data import TileConfig
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.instance_segmentation import InstanceSegBatchDataEntity, InstanceSegBatchPredEntity
 from otx.core.data.entity.utils import stack_batch
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.config.data import TileConfig
 from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
 
 if TYPE_CHECKING:
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class _TVMaskRCNN(MaskRCNN):
@@ -275,7 +275,7 @@ class TVMaskRCNNR50(TVMaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py
index 96801665710..92a5627e5cb 100644
--- a/src/otx/algo/instance_segmentation/rtmdet_inst.py
+++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from otx.algo.common.backbones import CSPNeXt
 from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss
@@ -17,20 +17,20 @@
 from otx.algo.detection.necks import CSPNeXtPAFPN
 from otx.algo.instance_segmentation.heads import RTMDetInsSepBNHead
 from otx.algo.instance_segmentation.losses import DiceLoss
+from otx.core.config.data import TileConfig
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.config.data import TileConfig
 from otx.core.metrics.mean_ap import MaskRLEMeanAPFMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.instance_segmentation import ExplainableOTXInstanceSegModel
 
 if TYPE_CHECKING:
-    from torch import Tensor
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+    from torch import Tensor
 
-    from otx.core.types.label import LabelInfoTypes
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMDetInst(ExplainableOTXInstanceSegModel):
@@ -96,7 +96,7 @@ class RTMDetInstTiny(RTMDetInst):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 640, 640),
+        input_size: tuple[int, ...] = (1, 3, 640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py
index c3e455a2910..16101a6fcad 100644
--- a/src/otx/algo/segmentation/dino_v2_seg.py
+++ b/src/otx/algo/segmentation/dino_v2_seg.py
@@ -5,24 +5,24 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from otx.algo.segmentation.backbones import DinoVisionTransformer
 from otx.algo.segmentation.heads import FCNHead
-from otx.core.model.segmentation import TorchVisionCompatibleModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.dice import SegmCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.segmentation import TorchVisionCompatibleModel
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
     from torch import nn
     from typing_extensions import Self
-    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
+    from otx.core.metrics import MetricCallable
     from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.label import LabelInfoTypes
-    from otx.core.metrics import MetricCallable
 
 
 class DinoV2Seg(BaseSegmModel):
@@ -56,7 +56,7 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 560, 560),
+        input_size: tuple[int, ...] = (1, 3, 560, 560),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index feee127b0f8..10cad420b70 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import nn
@@ -65,7 +65,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=image-segmentation
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 512, 512),  # sementic segmentation default input size
+        input_size: tuple[int, ...] = (1, 3, 512, 512),  # semantic segmentation default input size

         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -89,7 +89,7 @@ def _create_model(self) -> nn.Module:
         kwargs = {}
         if "image_size" in model_config:
             kwargs["image_size"] = self.input_size[-1]
-        
+
         if (patch_size := model_config.get("patch_sizes")) != None:
             if isinstance(patch_size, (list, tuple)):
                 patch_size = patch_size
diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py
index 42ffc26e613..81d9e99f57c 100644
--- a/src/otx/algo/segmentation/litehrnet.py
+++ b/src/otx/algo/segmentation/litehrnet.py
@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from torch.onnx import OperatorExportTypes
 
@@ -14,19 +14,19 @@
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.segmentation import TorchVisionCompatibleModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.dice import SegmCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.segmentation import TorchVisionCompatibleModel
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
-    from torch import nn
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+    from torch import nn
 
+    from otx.core.metrics import MetricCallable
     from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.label import LabelInfoTypes
-    from otx.core.metrics import MetricCallable
 
 
 class LiteHRNetS(BaseSegmModel):
@@ -524,10 +524,11 @@ def ignore_scope(self) -> dict[str, str | dict[str, list[str]]]:
 
 class OTXLiteHRNet(TorchVisionCompatibleModel):
     """LiteHRNet Model."""
+
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 512, 512),
+        input_size: tuple[int, ...] = (1, 3, 512, 512),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py
index c18b1cc10c1..703f5b1dfbe 100644
--- a/src/otx/algo/segmentation/segnext.py
+++ b/src/otx/algo/segmentation/segnext.py
@@ -4,24 +4,24 @@
 """SegNext model implementations."""
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from otx.algo.segmentation.backbones import MSCAN
 from otx.algo.segmentation.heads import LightHamHead
 from otx.algo.utils.support_otx_v1 import OTXv1Helper
-from otx.core.model.segmentation import TorchVisionCompatibleModel
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.metrics.dice import SegmCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.model.segmentation import TorchVisionCompatibleModel
 
 from .base_model import BaseSegmModel
 
 if TYPE_CHECKING:
-    from torch import nn
     from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+    from torch import nn
 
+    from otx.core.metrics import MetricCallable
     from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.label import LabelInfoTypes
-    from otx.core.metrics import MetricCallable
 
 
 class SegNextB(BaseSegmModel):
@@ -114,10 +114,11 @@ class SegNextT(BaseSegmModel):
 
 class OTXSegNext(TorchVisionCompatibleModel):
     """SegNext Model."""
+
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] = (1, 3, 512, 512),
+        input_size: tuple[int, ...] = (1, 3, 512, 512),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index a948f9ae648..c1eef3d18d4 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import logging as log
-from typing import TYPE_CHECKING, Any, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 from torch import Tensor, nn
@@ -496,7 +496,7 @@ def __init__(
         self,
         backbone: Literal["tiny_vit", "vit_b"],
         label_info: LabelInfoTypes = NullLabelInfo(),
-        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -516,7 +516,7 @@ def __init__(
         self.config = {
             "backbone": backbone,
             "image_size": input_size[-1],
-            "image_embedding_size" : input_size[-1] // 16,
+            "image_embedding_size": input_size[-1] // 16,
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 178673c5ee2..8c9a1787828 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -346,8 +346,10 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
                 if isinstance(input_size, int):
                     input_size = (input_size, input_size)
                 else:
-                    input_size = tuple(input_size)
-                model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+                    input_size = tuple(input_size)  # type: ignore[assignment]
+                model_config["init_args"]["input_size"] = (
+                    tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+                )
 
             # Instantiate the model and needed components
             self.model = self.instantiate_model(model_config=model_config)
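
A standalone sketch of the merge performed above (variable names are illustrative, not the CLI's API): the recipe's leading batch/channel dims are kept and only the trailing spatial dims are overridden.

    recipe_input_size = (1, 3, 1024, 1024)  # from the model recipe
    user_input_size = 512                   # e.g. --data.input_size 512
    spatial = (user_input_size, user_input_size) if isinstance(user_input_size, int) else tuple(user_input_size)
    merged = tuple(recipe_input_size[:-2]) + spatial
    assert merged == (1, 3, 512, 512)
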
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index 3e1b6f5f133..cb6f8760940 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -24,13 +24,13 @@
     parse_mem_cache_size_to_int,
 )
 from otx.core.data.pre_filtering import pre_filtering
+from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config
 from otx.core.types.device import DeviceType
 from otx.core.types.image import ImageColorChannel
 from otx.core.types.label import LabelInfo
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.instantiators import instantiate_sampler
 from otx.core.utils.utils import get_adaptive_num_workers
-from otx.core.data.utils import adapt_input_size_to_dataset, adapt_tile_config
 
 if TYPE_CHECKING:
     from lightning.pytorch.utilities.parsing import AttributeDict
@@ -139,7 +139,7 @@ def __init__(
             input_size = adapt_input_size_to_dataset(
                 dataset,
                 input_size,
-                adaptive_input_size=="downscale",
+                adaptive_input_size == "downscale",
                 input_size_multiplier,
             )
         if input_size is not None:
diff --git a/src/otx/core/data/utils/__init__.py b/src/otx/core/data/utils/__init__.py
index adc0400284e..302e6cb75dd 100644
--- a/src/otx/core/data/utils/__init__.py
+++ b/src/otx/core/data/utils/__init__.py
@@ -3,6 +3,6 @@
 #
 """Utility modules for core data modules."""
 
-from .utils import adapt_tile_config, adapt_input_size_to_dataset
+from .utils import adapt_input_size_to_dataset, adapt_tile_config
 
 __all__ = ["adapt_tile_config", "adapt_input_size_to_dataset"]
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 665994fe2ef..040258c031a 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -1,4 +1,3 @@
-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -173,7 +172,7 @@ def adapt_input_size_to_dataset(
 
     train_dataset = dataset.subsets().get("train")
     if train_dataset is None:
-        return
+        return None
 
     logger.info("Adapting model input size based on dataset stat")
     stat = compute_robust_dataset_statistics(train_dataset)
diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py
index 6829dca5f38..52e1a55cf6a 100644
--- a/src/otx/core/model/action_classification.py
+++ b/src/otx/core/model/action_classification.py
@@ -6,7 +6,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
@@ -37,7 +37,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 405116b762e..fc562f7b087 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -104,7 +104,7 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int] | None = None,
+        input_size: tuple[int, ...] | None = None,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = NullMetricCallable,
@@ -810,7 +810,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
         raise TypeError(label_info)
 
-    def _check_input_size(self, input_size: Sequence[int] | None = None) -> None:
+    def _check_input_size(self, input_size: tuple[int, ...] | None = None) -> None:
         if (
             input_size is not None
             and hasattr(self, "input_size_multiplier")
@@ -819,6 +819,7 @@ def _check_input_size(self, input_size: Sequence[int] | None = None) -> None:
             msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead."
             raise ValueError(msg)
 
+
 class OVModel(OTXModel, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEntity]):
     """Base class for the OpenVINO model.
 
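The validation in _check_input_size, reduced to a standalone sketch (assuming an input_size_multiplier of 32 for illustration):

    def check_input_size(input_size: tuple[int, ...], multiplier: int = 32) -> None:
        if any(dim % multiplier != 0 for dim in input_size[-2:]):
            msg = f"Input size should be a multiple of {multiplier}, but got {input_size[-2:]} instead."
            raise ValueError(msg)

    check_input_size((1, 3, 800, 992))    # fine: 800 and 992 are multiples of 32
    # check_input_size((1, 3, 800, 990))  # would raise: 990 is not a multiple of 32
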
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index 74963720b9e..52d6aea8215 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
@@ -42,7 +42,7 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -96,7 +96,7 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
@@ -149,7 +149,7 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt
     def __init__(
         self,
         label_info: HLabelInfo,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py
index 60ce1d164b7..d478ab91803 100644
--- a/src/otx/core/model/detection.py
+++ b/src/otx/core/model/detection.py
@@ -8,7 +8,7 @@
 import types
 from abc import abstractmethod
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal
 
 import torch
 from model_api.tilers import DetectionTiler
@@ -385,7 +385,7 @@ class ExplainableOTXDetModel(OTXDetectionModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py
index cfcee76e48f..166e14ca656 100644
--- a/src/otx/core/model/instance_segmentation.py
+++ b/src/otx/core/model/instance_segmentation.py
@@ -7,7 +7,7 @@
 import logging as log
 import types
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal
 
 import numpy as np
 import torch
@@ -49,7 +49,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -384,7 +384,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 15976e25158..3d2a672e72a 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -37,7 +37,7 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -47,7 +47,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (Sequence[int]): The input shape of the model.
+            input_size (tuple[int, ...]): The input shape of the model.
             optimizer (OptimizerCallable, optional): The optimizer to use for training.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -130,7 +130,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -145,7 +145,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (Sequence[int]): The input shape of the model.
+            input_size (tuple[int, ...]): The input shape of the model.
             optimizer (OptimizerCallable, optional): The optimizer callable for the model.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index c8f3de1cfa9..fab3ffb52dd 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -162,7 +162,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro
     def __init__(
         self,
         label_info: LabelInfoTypes = NullLabelInfo(),
-        input_size: Sequence[int] = (1, 3, 1024, 1024),
+        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -286,7 +286,7 @@ class OTXZeroShotVisualPromptingModel(
 
     def __init__(
         self,
-        input_size: Sequence[int],
+        input_size: tuple[int, ...],
         label_info: LabelInfoTypes = NullLabelInfo(),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index c6a66d455b1..0d9f3750889 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -154,13 +154,12 @@ def __init__(
         self.task = task if task is not None else self._auto_configurator.task
 
         self._trainer: Trainer | None = None
+        get_model_args: dict[str, Any] = {}
+        if self._datamodule is not None:
+            get_model_args["label_info"] = self._datamodule.label_info
+            get_model_args["input_size"] = self._datamodule.input_size
         self._model: OTXModel = (
-            model
-            if isinstance(model, OTXModel)
-            else self._auto_configurator.get_model(
-                label_info=self._datamodule.label_info if self._datamodule is not None else None,
-                input_size=self._datamodule.input_size,
-            )
+            model if isinstance(model, OTXModel) else self._auto_configurator.get_model(**get_model_args)
         )
 
     # ------------------------------------------------------------------------ #
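
Besides readability, the rewrite above closes a latent AttributeError: the old expression read self._datamodule.input_size without the None guard that label_info had. The conditional-kwargs pattern in isolation (stand-in names, not the engine's API):

    datamodule = None  # or an OTXDataModule instance

    def get_model(label_info=None, input_size=None):  # stand-in for AutoConfigurator.get_model
        return (label_info, input_size)

    get_model_args = {}
    if datamodule is not None:
        get_model_args["label_info"] = datamodule.label_info
        get_model_args["input_size"] = datamodule.input_size
    model = get_model(**get_model_args)  # falls back to the defaults when datamodule is None
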
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 90790d59521..2483adb9990 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -8,7 +8,7 @@
 import logging
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 from warnings import warn
 
 import datumaro
@@ -22,7 +22,6 @@
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.imports import get_otx_root_path
 from otx.core.utils.instantiators import partial_instantiate_class
-from otx.core.utils.utils import import_object_from_module
 from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info
 
 if TYPE_CHECKING:
@@ -241,13 +240,19 @@ def get_datamodule(self) -> OTXDataModule | None:
             **data_config,
         )
 
-    def get_model(self, model_name: str | None = None, label_info: LabelInfoTypes | None = None, input_size: Sequence[int] | None = None) -> OTXModel:
+    def get_model(
+        self,
+        model_name: str | None = None,
+        label_info: LabelInfoTypes | None = None,
+        input_size: tuple[int, ...] | int | None = None,
+    ) -> OTXModel:
         """Retrieves the OTXModel instance based on the provided model name and meta information.
 
         Args:
             model_name (str | None): The name of the model to retrieve. If None, the default model will be used.
             label_info (LabelInfoTypes | None): The meta information about the labels.
                 If provided, the number of classes will be updated in the model's configuration.
+            input_size (tuple[int, ...] | int | None): Input size the model will use.
 
         Returns:
             OTXModel: The instantiated OTXModel instance.

From 8474b072d635614b56869947ef00e0e1e6ab7008 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Thu, 8 Aug 2024 16:07:49 +0900
Subject: [PATCH 10/42] align with pre-commit

---
 src/otx/algo/anomaly/stfpm.py                 |  2 +-
 .../algo/classification/torchvision_model.py  |  1 +
 .../algo/segmentation/huggingface_model.py    |  4 ++--
 .../encoders/sam_image_encoder.py             |  4 ++--
 src/otx/cli/cli.py                            | 11 ++++++-----
 src/otx/core/data/module.py                   |  2 +-
 src/otx/core/data/utils/utils.py              | 19 +++++++++++--------
 src/otx/core/model/action_classification.py   |  1 +
 src/otx/core/model/base.py                    |  1 -
 src/otx/core/model/classification.py          |  3 +++
 src/otx/core/model/detection.py               |  2 ++
 src/otx/core/model/instance_segmentation.py   |  1 +
 src/otx/core/model/segmentation.py            |  1 +
 src/otx/core/model/visual_prompting.py        |  2 ++
 src/otx/engine/utils/auto_configurator.py     |  5 +----
 15 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py
index 72dd30e8aa3..112ab465590 100644
--- a/src/otx/algo/anomaly/stfpm.py
+++ b/src/otx/algo/anomaly/stfpm.py
@@ -46,7 +46,7 @@ def __init__(
         **kwargs,
     ) -> None:
         OTXAnomaly.__init__(self)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=(224, 224))
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 77643513b61..9caa54d6bec 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -445,6 +445,7 @@ def __init__(
             torch_compile=torch_compile,
             input_size=input_size,
         )
+        self.input_size: tuple[int, int, int, int]
 
     def _create_model(self) -> nn.Module:
         if self.task == OTXTaskType.MULTI_CLASS_CLS:
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index 10cad420b70..ed53ba8fa20 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -90,9 +90,9 @@ def _create_model(self) -> nn.Module:
         if "image_size" in model_config:
             kwargs["image_size"] = self.input_size[-1]
 
-        if (patch_size := model_config.get("patch_sizes")) != None:
+        if (patch_size := model_config.get("patch_sizes")) is not None:
             if isinstance(patch_size, (list, tuple)):
-                patch_size = patch_size
+                patch_size = patch_size[0]
             if self.input_size[0] % patch_size != 0 or self.input_size[1] % patch_size != 0:
                 msg = (
                     f"It's recommended to set the input size to multiple of patch size({patch_size}). "
diff --git a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
index 3feef21aba5..a1f792e8c92 100644
--- a/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
+++ b/src/otx/algo/visual_prompting/encoders/sam_image_encoder.py
@@ -70,11 +70,11 @@ def __new__(cls, backbone: str, *args, **kwargs):  # noqa: ARG003
         if backbone.lower() == "tiny_vit":
             from otx.algo.visual_prompting.backbones.tiny_vit import TinyViT
 
-            return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[arg-type]
+            return TinyViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[dict-item]
         elif backbone.lower() in ["vit_b", "vit_l", "vit_h"]:  # noqa: RET505
             from otx.algo.visual_prompting.backbones.vit import ViT
 
-            return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[arg-type]
+            return ViT(**{**cls.backbone_configs.get(backbone.lower()), **kwargs})  # type: ignore[dict-item]
 
         else:
             error_log = f"{backbone} is not supported for SAMImageEncoder. Set among tiny_vit and vit_b."
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 8c9a1787828..d70adc56535 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -343,12 +343,13 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             self.datamodule = self.get_config_value(self.config_init, "data")
 
             if (input_size := self.datamodule.input_size) is not None:
-                if isinstance(input_size, int):
-                    input_size = (input_size, input_size)
-                else:
-                    input_size = tuple(input_size)  # type: ignore[assignment]
+                input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)  # type: ignore[assignment]
                 model_config["init_args"]["input_size"] = (
-                    tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+                    tuple(model_config["init_args"]["input_size"][:-2]) + input_size  # type: ignore[operator]
                 )
 
             # Instantiate the model and needed components
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index cb6f8760940..cdc04c18156 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -42,7 +42,7 @@
 class OTXDataModule(LightningDataModule):
     """LightningDataModule extension for OTX pipeline."""
 
-    def __init__(
+    def __init__(  # noqa: PLR0913
         self,
         task: OTXTaskType,
         data_format: str,
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 040258c031a..cf0aaa8e1af 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -137,7 +137,10 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
             if max_num_ann < len(anns):
                 max_ann_type = ann_type
                 max_num_ann = len(anns)
-        stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes[max_ann_type]))
+        if max_ann_type is not None:
+            stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(
+                np.array(size_of_shapes[max_ann_type]),
+            )
 
     return stat
 
@@ -147,7 +150,7 @@ def adapt_input_size_to_dataset(
     base_input_size: int | tuple[int, int] | None = None,
     downscale_only: bool = True,
     input_size_multiplier: int | None = None,
-) -> tuple[int, int]:
+) -> tuple[int, int] | None:
     """Compute appropriate model input size w.r.t. dataset statistics.
 
     Args:
@@ -159,9 +162,9 @@ def adapt_input_size_to_dataset(
     Returns:
-        Tuple[int, int]: (width, height)
+        tuple[int, int] | None: (width, height), or None if the dataset has no train subset
     """
-    MIN_RECOGNIZABLE_OBJECT_SIZE = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
+    min_recognizable_object_size = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
     # meaning NxN input pixels being downscaled to 1x1 on feature map
-    MIN_DETECTION_INPUT_SIZE = 256  # Minimum input size for object detection
+    min_detection_input_size = 256  # Minimum input size for object detection
 
     if downscale_only and base_input_size is None:
         msg = "If downscale_only is set to True, base_input_size should be set but got None."
@@ -194,13 +197,13 @@ def adapt_input_size_to_dataset(
 
     # Refine using annotation shape size stat
     if min_object_size is not None and min_object_size > 0:
-        image_size = round(image_size * MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
+        image_size = round(image_size * min_recognizable_object_size / min_object_size)
         logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}")
         if image_size > max_image_size:
             image_size = max_image_size
             logger.info(f"-> Restrict to max image size: {image_size}")
-        if image_size < MIN_DETECTION_INPUT_SIZE:
-            image_size = MIN_DETECTION_INPUT_SIZE
+        if image_size < min_detection_input_size:
+            image_size = min_detection_input_size
             logger.info(f"-> Based on minimum object detection input size: {image_size}")
 
     if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
@@ -210,7 +213,7 @@ def adapt_input_size_to_dataset(
 
     if downscale_only:
 
-        def area(x):
+        def area(x: tuple[int, int]) -> int:
             return x[0] * x[1]
 
         if base_input_size and area(input_size) >= area(base_input_size):
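
A worked pass through the refinement above (illustrative numbers; the round-up at the input_size_multiplier step is assumed, since the hunk only shows its condition):

    image_size, min_object_size, max_image_size = 512, 8, 2048
    image_size = round(image_size * 32 / min_object_size)  # 512 * 32 / 8 = 2048
    image_size = min(image_size, max_image_size)           # clamp to the dataset's max image size
    image_size = max(image_size, 256)                      # object-detection input floor
    multiplier = 32
    if image_size % multiplier != 0:
        image_size += multiplier - image_size % multiplier  # assumed round-up to the multiplier
    assert image_size == 2048
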
diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py
index 52e1a55cf6a..b31c4e1f63e 100644
--- a/src/otx/core/model/action_classification.py
+++ b/src/otx/core/model/action_classification.py
@@ -53,6 +53,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int, int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index fc562f7b087..1b4dc9b6acc 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -12,7 +12,6 @@
 import logging
 import warnings
 from abc import abstractmethod
-from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, Sequence
 
 import numpy as np
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index 52d6aea8215..dad775d2bd7 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -56,6 +56,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
@@ -110,6 +111,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
@@ -163,6 +165,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py
index d478ab91803..938715396b6 100644
--- a/src/otx/core/model/detection.py
+++ b/src/otx/core/model/detection.py
@@ -41,6 +41,8 @@
 class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]):
     """Base class for the detection models used in OTX."""
 
+    input_size: tuple[int, int, int, int]
+
     def test_step(self, batch: DetBatchDataEntity, batch_idx: int) -> None:
         """Perform a single test step on a batch of data from the test set.
 
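The bare input_size annotation added here declares the attribute's type on the class without assigning a value; concrete detectors assign it in __init__. A minimal illustration of the pattern (class names are hypothetical):

from __future__ import annotations


class Detector:
    input_size: tuple[int, int, int, int]  # declared on the class, assigned by subclasses


class ATSSLike(Detector):
    def __init__(self) -> None:
        self.input_size = (1, 3, 800, 992)


assert ATSSLike().input_size == (1, 3, 800, 992)
assert "input_size" not in vars(Detector)  # annotation only; no class-level value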
diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py
index 166e14ca656..756a5fd7641 100644
--- a/src/otx/core/model/instance_segmentation.py
+++ b/src/otx/core/model/instance_segmentation.py
@@ -65,6 +65,7 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
+        self.input_size: tuple[int, int, int, int]
 
     def _build_model(self, num_classes: int) -> nn.Module:
         raise NotImplementedError
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 3d2a672e72a..617d0041067 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -65,6 +65,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index fab3ffb52dd..89c668aa7ca 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -178,6 +178,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _exporter(self) -> OTXModelExporter:
@@ -303,6 +304,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int, int, int]
 
     @property
     def _exporter(self) -> OTXModelExporter:
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 2483adb9990..c1269fc4031 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -280,10 +280,7 @@ def get_model(
         model_config = deepcopy(self.config["model"])
 
         if input_size is not None:
-            if isinstance(input_size, int):
-                input_size = (input_size, input_size)
-            else:
-                input_size = tuple(input_size)
+            input_size = (input_size, input_size) if isinstance(input_size, int) else input_size
             model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
 
         model_cls = get_model_cls_from_config(Namespace(model_config))
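The simplified branch above normalizes an int to a square (H, W) pair before splicing it into the trailing two dimensions of the default NCHW shape; a self-contained sketch of that merge (the helper name is illustrative):

from __future__ import annotations


def merge_input_size(default: tuple[int, ...], input_size: int | tuple[int, int]) -> tuple[int, ...]:
    # Keep the leading (N, C) dimensions, replace the trailing (H, W) pair.
    if isinstance(input_size, int):
        input_size = (input_size, input_size)
    return tuple(default[:-2]) + tuple(input_size)


assert merge_input_size((1, 3, 224, 224), 300) == (1, 3, 300, 300)
assert merge_input_size((1, 3, 224, 224), (640, 480)) == (1, 3, 640, 480)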

From 28dcd6363e876414c5aafbcfdd6030c33a4820f8 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Thu, 8 Aug 2024 16:45:25 +0900
Subject: [PATCH 11/42] write doc string

---
 src/otx/algo/anomaly/stfpm.py                        |  2 +-
 src/otx/algo/classification/huggingface_model.py     |  3 ++-
 src/otx/algo/classification/mobilenet_v3.py          |  1 +
 src/otx/algo/classification/torchvision_model.py     |  1 +
 .../detection/base_models/detection_transformer.py   |  1 +
 src/otx/algo/detection/huggingface_model.py          |  4 ++--
 src/otx/algo/detection/rtdetr.py                     |  8 ++++----
 src/otx/algo/segmentation/huggingface_model.py       |  2 +-
 src/otx/algo/visual_prompting/segment_anything.py    |  2 +-
 src/otx/cli/cli.py                                   |  6 ++----
 src/otx/core/data/utils/utils.py                     | 12 +++++++-----
 src/otx/core/model/segmentation.py                   |  4 ++--
 src/otx/core/model/visual_prompting.py               |  2 +-
 src/otx/engine/utils/auto_configurator.py            |  2 +-
 14 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py
index 112ab465590..72dd30e8aa3 100644
--- a/src/otx/algo/anomaly/stfpm.py
+++ b/src/otx/algo/anomaly/stfpm.py
@@ -46,7 +46,7 @@ def __init__(
         **kwargs,
     ) -> None:
         OTXAnomaly.__init__(self)
-        OTXModel.__init__(self, label_info=AnomalyLabelInfo(), input_size=(224, 224))
+        OTXModel.__init__(self, label_info=AnomalyLabelInfo())
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index 6de912cdf5d..60bfe51225a 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -33,7 +33,7 @@
     from otx.core.metrics import MetricCallable
 
 
-DEFAULT_INPUT_SIZE = (1, 2, 224, 224)
+DEFAULT_INPUT_SIZE = (1, 3, 224, 224)
 logger = logging.getLogger(__name__)
 
 
@@ -46,6 +46,7 @@ class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
         optimizer (OptimizerCallable, optional): The optimizer callable for training the model.
         scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): The learning rate scheduler callable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
+        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224).
 
     Example:
         1. API
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index b14017de3e8..5c681470f9a 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -61,6 +61,7 @@ class MobileNetV3ForMulticlassCls(OTXMulticlassClsModel):
         metric (MetricCallable, optional): The metric callable. Defaults to MultiClassClsMetricCallable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
         freeze_backbone (bool, optional): Whether to freeze the backbone layers during training. Defaults to False.
+        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224).
     """
 
     def __init__(
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 9caa54d6bec..002855f82ae 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -404,6 +404,7 @@ class OTXTVModel(OTXModel):
         task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional):
             The type of classification task.
         train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training.
+        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224).
     """
 
     model: TVClassificationModel
diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py
index a3158e19845..ed19a38dbd0 100644
--- a/src/otx/algo/detection/base_models/detection_transformer.py
+++ b/src/otx/algo/detection/base_models/detection_transformer.py
@@ -33,6 +33,7 @@ class DETR(BaseModule):
             Defaults to None.
         num_top_queries (int, optional): Number of top queries to return.
             Defaults to 300.
+        input_size (int, optional): The input size of the model. Defaults to 640.
     """
 
     def __init__(
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index 140c644ad7f..859825a143f 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -12,7 +12,6 @@
 from torchvision import tv_tensors
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
 
-# from transformers.image_processing_base import ImageProcessingMixin
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity
 from otx.core.data.entity.utils import stack_batch
@@ -37,6 +36,7 @@ class HuggingFaceModelForDetection(OTXDetectionModel):
     Args:
         model_name_or_path (str): The name or path of the pre-trained model.
         label_info (LabelInfoTypes): The label information for the model.
+        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 800, 992).
         optimizer (OptimizerCallable, optional): The optimizer for training the model.
             Defaults to DefaultOptimizerCallable.
         scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -61,7 +61,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=object-detection
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 800, 992),  # detection default input size
+        input_size: tuple[int, ...] = (1, 3, 800, 992),  # input size of default detection data recipe
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index 43a6267a769..cf8c3d820a8 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -242,13 +242,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
         encoder = HybridEncoder(
             in_channels=[128, 256, 512],
             expansion=0.5,
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size[-2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             num_decoder_layers=3,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size[-2:],
         )
 
         optimizer_configuration = [
@@ -286,12 +286,12 @@ def _build_model(self, num_classes: int) -> nn.Module:
             norm_cfg={"type": "FBN", "name": "norm"},
         )
         encoder = HybridEncoder(
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size[-2:],
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size[-2:],
             num_decoder_layers=6,
         )
 
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index ed53ba8fa20..a64798b22f4 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -65,7 +65,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=image-segmentation
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 512, 512),  # sementic segmentation default input size
+        input_size: tuple[int, ...] = (1, 3, 512, 512),  # input size of default semantic segmentation data recipe
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index c1eef3d18d4..feb02857375 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -510,7 +510,7 @@ def __init__(
         stability_score_offset: float = 1.0,
     ) -> None:
         if input_size[-1] != input_size[-2]:
-            msg = f"SAM should use square image, but got {input_size}"
+            msg = f"SAM should use square image size, but got {input_size}"
             raise ValueError(msg)
 
         self.config = {
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index d70adc56535..8ccf07bb02a 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -332,6 +332,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # For num_classes update, Model and Metric are instantiated separately.
             model_config = self.config[self.subcommand].pop("model")
 
+            # if adaptive_input_size is enabled and the model has input_size_multiplier, pass it to the data module
             if self.config[self.subcommand].data.adaptive_input_size != "none":
                 model_cls = get_model_cls_from_config(model_config)
                 if hasattr(model_cls, "input_size_multiplier"):
@@ -342,12 +343,9 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             self.workspace = self.get_config_value(self.config_init, "workspace")
             self.datamodule = self.get_config_value(self.config_init, "data")
 
+            # pass the data module input size to the model
             if (input_size := self.datamodule.input_size) is not None:
                 input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)  # type: ignore[assignment]
-                # if isinstance(input_size, int):
-                #     input_size = (input_size, input_size)
-                # else:
-                #     input_size = tuple(input_size)  # type: ignore[assignment]
                 model_config["init_args"]["input_size"] = (
                     tuple(model_config["init_args"]["input_size"][:-2]) + input_size  # type: ignore[operator]
                 )
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index cf0aaa8e1af..ee1b6b44f44 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -154,13 +154,15 @@ def adapt_input_size_to_dataset(
     """Compute appropriate model input size w.r.t. dataset statistics.
 
     Args:
-        max_image_size (int): Typical large image size of dataset in pixels.
-        min_object_size (int, optional): Typical small object size of dataset in pixels.
-            None to consider only image size. Defaults to None.
-        downscale_only (bool) : Whether to allow only smaller size than default setting. Defaults to True.
+        dataset (Dataset): Datumaro dataset including all subsets.
+        base_input_size (int | tuple[int, int] | None, optional): Base input size of the model. Defaults to None.
+        downscale_only (bool, optional): Whether to allow only a smaller size than the default setting. Defaults to True.
+        input_size_multiplier (int | None, optional):
+            Multiplier for the input size. If set, the returned input size is divisible by this value.
+            Defaults to None.
 
     Returns:
-        Tuple[int, int]: (width, height)
+        tuple[int, int] | None: Recommended input size based on dataset statistics, or None if the dataset has no "train" subset.
     """
     min_recognizable_object_size = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
     # meaning NxN input pixels being downscaled to 1x1 on feature map
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 617d0041067..935e2a2215b 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -47,7 +47,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (tuple[int, ...]): The input shape of the model.
+            input_size (tuple[int, ...]): The input size of the model.
             optimizer (OptimizerCallable, optional): The optimizer to use for training.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -146,7 +146,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (tuple[int, ...]): The input shape of the model.
+            input_size (tuple[int, ...]): The input size of the model.
             optimizer (OptimizerCallable, optional): The optimizer callable for the model.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index 89c668aa7ca..c6d7d2010c2 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -9,7 +9,7 @@
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, Sequence
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy as np
 import torch
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index c1269fc4031..98b433b3291 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -252,7 +252,7 @@ def get_model(
             model_name (str | None): The name of the model to retrieve. If None, the default model will be used.
             label_info (LabelInfoTypes | None): The meta information about the labels.
                 If provided, the number of classes will be updated in the model's configuration.
-            input_size (tuple[int, ...] | int | None): Input size the model will use.
+            input_size (tuple[int, ...] | int | None, optional): Input size of the model. Defaults to None.
 
         Returns:
             OTXModel: The instantiated OTXModel instance.

From 01cb6291b251f890701f87a7425fe1e23cdd4321 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 13:47:55 +0900
Subject: [PATCH 12/42] implement unit test

---
 .../zero_shot_segment_anything.py             | 19 ++++----
 src/otx/cli/cli.py                            |  4 +-
 src/otx/engine/utils/auto_configurator.py     |  7 ++-
 .../backbones/test_otx_efficientnet.py        |  5 ++
 .../algo/classification/test_efficientnet.py  | 15 ++++++
 .../classification/test_huggingface_model.py  | 23 +++++++++
 .../algo/classification/test_mobilenet_v3.py  | 15 ++++++
 .../algo/detection/base_models/test_detr.py   | 16 +++++++
 .../segmentation/test_huggingface_model.py    | 23 +++++++++
 .../visual_prompting/test_segment_anything.py |  6 +++
 tests/unit/cli/test_cli.py                    | 48 +++++++++++++++++--
 tests/unit/core/data/test_module.py           | 38 ++++++++++++++-
 tests/unit/core/model/test_base.py            |  5 ++
 tests/unit/engine/test_engine.py              | 21 ++++++++
 .../engine/utils/test_auto_configurator.py    | 24 ++++++++++
 15 files changed, 250 insertions(+), 19 deletions(-)

diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
index 2938730cdb6..5edfa3aabd1 100644
--- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
+++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
@@ -645,18 +645,9 @@ def __init__(  # noqa: PLR0913
         return_extra_metrics: bool = False,
         stability_score_offset: float = 1.0,
     ) -> None:
-        super().__init__(
-            label_info=label_info,
-            input_size=(1, 3, 1024, 1024),  # zero-shot visual prompting model uses fixed 1024x1024 input size
-            optimizer=optimizer,
-            scheduler=scheduler,
-            metric=metric,
-            torch_compile=torch_compile,
-        )
-
         self.config = {
             "backbone": backbone,
-            "image_size": self.input_size[-1],
+            "image_size": 1024,
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
@@ -668,6 +659,14 @@ def __init__(  # noqa: PLR0913
             "stability_score_offset": stability_score_offset,
             **DEFAULT_CONFIG_SEGMENT_ANYTHING[backbone],
         }
+        super().__init__(
+            label_info=label_info,
+            input_size=(1, 3, 1024, 1024),  # zero-shot visual prompting model uses fixed 1024x1024 input size
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
 
         self.save_outputs = save_outputs
         self.reference_info_dir: Path = Path(reference_info_dir)
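The reorder above is load-bearing if, as in the usual OTX pattern, the base initializer calls _create_model(), which in turn reads self.config. A minimal reproduction under that assumption (the classes here are stand-ins, not the OTX ones):

class Base:
    def __init__(self) -> None:
        self.model = self._create_model()

    def _create_model(self) -> dict:
        raise NotImplementedError


class ZeroShot(Base):
    def __init__(self) -> None:
        self.config = {"image_size": 1024}  # must exist before super().__init__()
        super().__init__()

    def _create_model(self) -> dict:
        return {"image_size": self.config["image_size"]}


assert ZeroShot().model["image_size"] == 1024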
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 8ccf07bb02a..31717fb487d 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -332,7 +332,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # For num_classes update, Model and Metric are instantiated separately.
             model_config = self.config[self.subcommand].pop("model")
 
-            # if adaptive_input_size is enabled and the model has input_size_multiplier, pass it to the data module
+            # if adaptive_input_size is enabled and the model has input_size_multiplier, pass it to OTXDataModule
             if self.config[self.subcommand].data.adaptive_input_size != "none":
                 model_cls = get_model_cls_from_config(model_config)
                 if hasattr(model_cls, "input_size_multiplier"):
@@ -343,7 +343,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             self.workspace = self.get_config_value(self.config_init, "workspace")
             self.datamodule = self.get_config_value(self.config_init, "data")
 
-            # pass the data module input size to the model
+            # pass OTXDataModule input size to the model
             if (input_size := self.datamodule.input_size) is not None:
                 input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)  # type: ignore[assignment]
                 model_config["init_args"]["input_size"] = (
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 98b433b3291..170d3db76a0 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -216,7 +216,7 @@ def get_datamodule(self) -> OTXDataModule | None:
         if self.data_root is None:
             return None
         self.config["data"]["data_root"] = self.data_root
-        data_config = deepcopy(self.config["data"])
+        data_config: dict = deepcopy(self.config["data"])
         train_config = data_config.pop("train_subset")
         val_config = data_config.pop("val_subset")
         test_config = data_config.pop("test_subset")
@@ -227,6 +227,11 @@ def get_datamodule(self) -> OTXDataModule | None:
         _ = data_config.pop("__path__", {})  # Remove __path__ key that for CLI
         _ = data_config.pop("config", {})  # Remove config key that for CLI
 
+        if data_config.get("adaptive_input_size", "none") != "none":
+            model_cls = get_model_cls_from_config(Namespace(self.config["model"]))
+            if hasattr(model_cls, "input_size_multiplier"):
+                data_config["input_size_multiplier"] = model_cls.input_size_multiplier
+
         return OTXDataModule(
             train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config),
             val_subset=SubsetConfig(sampler=SamplerConfig(**val_config.pop("sampler", {})), **val_config),
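The new block forwards a model class's stride constraint to the data module before instantiation, so adaptive sizing can respect it. A reduced sketch (YOLOXLike is a stand-in, though 32 matches the multiplier asserted in the unit test added below):

class YOLOXLike:
    input_size_multiplier = 32  # backbone stride constraint


data_config = {"adaptive_input_size": "auto"}
if data_config.get("adaptive_input_size", "none") != "none":
    model_cls = YOLOXLike
    if hasattr(model_cls, "input_size_multiplier"):
        data_config["input_size_multiplier"] = model_cls.input_size_multiplier

assert data_config["input_size_multiplier"] == 32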
diff --git a/tests/unit/algo/classification/backbones/test_otx_efficientnet.py b/tests/unit/algo/classification/backbones/test_otx_efficientnet.py
index b2a85e4088f..3d7fb9017fd 100644
--- a/tests/unit/algo/classification/backbones/test_otx_efficientnet.py
+++ b/tests/unit/algo/classification/backbones/test_otx_efficientnet.py
@@ -13,3 +13,8 @@ def test_forward(self, version):
         model = OTXEfficientNet(version, pretrained=None)
         assert model(torch.randn(1, 3, 244, 244))[0].shape[-1] == 8
         assert model(torch.randn(1, 3, 244, 244))[0].shape[-2] == 8
+
+    def test_set_input_size(self):
+        input_size = (300, 300)
+        model = OTXEfficientNet("b0", input_size=input_size, pretrained=None)
+        assert model.in_size == input_size
diff --git a/tests/unit/algo/classification/test_efficientnet.py b/tests/unit/algo/classification/test_efficientnet.py
index 49d16527f7a..fd501ff48ed 100644
--- a/tests/unit/algo/classification/test_efficientnet.py
+++ b/tests/unit/algo/classification/test_efficientnet.py
@@ -54,6 +54,11 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_
         assert isinstance(outputs, MulticlassClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
 
+    def test_set_input_size(self):
+        input_size = (1, 3, 300, 300)
+        model = EfficientNetForMulticlassCls(version="b0", label_info=10, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
+
 
 @pytest.fixture()
 def fxt_multi_label_cls_model():
@@ -92,6 +97,11 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_
         assert isinstance(outputs, MultilabelClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
 
+    def test_set_input_size(self):
+        input_size = (1, 3, 300, 300)
+        model = EfficientNetForMultilabelCls(version="b0", label_info=10, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
+
 
 @pytest.fixture()
 def fxt_h_label_cls_model(fxt_hlabel_data):
@@ -129,3 +139,8 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent
 
         assert isinstance(outputs, HlabelClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
+
+    def test_set_input_size(self, fxt_hlabel_data):
+        input_size = (1, 3, 300, 300)
+        model = EfficientNetForHLabelCls(version="b0", label_info=fxt_hlabel_data, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
diff --git a/tests/unit/algo/classification/test_huggingface_model.py b/tests/unit/algo/classification/test_huggingface_model.py
index 98007cc4249..c25be896b5d 100644
--- a/tests/unit/algo/classification/test_huggingface_model.py
+++ b/tests/unit/algo/classification/test_huggingface_model.py
@@ -1,6 +1,8 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+from unittest.mock import MagicMock
+
 import pytest
 import torch
 from otx.core.data.entity.base import OTXBatchLossEntity
@@ -9,6 +11,7 @@
 
 SKIP_TRANSFORMERS_TEST = False
 try:
+    from otx.algo.classification import huggingface_model as target_file
     from otx.algo.classification.huggingface_model import HuggingFaceModelForMulticlassCls
     from transformers.modeling_outputs import ImageClassifierOutput
 except ImportError:
@@ -67,3 +70,23 @@ def test_forward_for_tracing(self, fxt_multi_class_cls_model, tmp_path):
             output = fxt_multi_class_cls_model.forward_for_tracing(
                 image=torch.randn(1, 3, 224, 224),
             )
+
+    @pytest.fixture()
+    def mock_pretrainedconfig(self, mocker) -> MagicMock:
+        mock_obj = mocker.patch.object(target_file, "PretrainedConfig")
+        mock_obj.get_config_dict.return_value = ({"image_size": 224}, None)
+        return mock_obj
+
+    @pytest.fixture()
+    def mock_automodel(self, mocker) -> MagicMock:
+        return mocker.patch.object(target_file, "AutoModelForImageClassification")
+
+    def test_set_input_size(self, mock_pretrainedconfig, mock_automodel):
+        input_size = (1, 3, 300, 300)
+        HuggingFaceModelForMulticlassCls(
+            model_name_or_path="facebook/deit-tiny-patch16-224",
+            label_info=10,
+            input_size=input_size,
+        )
+
+        assert mock_automodel.from_pretrained.call_args.kwargs["image_size"] == input_size[-1]
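The fixture pair above stubs out the Hugging Face factory so the test can assert on the image_size forwarded to it without downloading weights; the same pattern reduced to a standalone sketch (the module and factory names here are stand-ins):

from types import SimpleNamespace
from unittest.mock import patch

# Stand-in for the module under test; the real tests patch target_file instead.
target_file = SimpleNamespace(AutoModel=None)


def create_model(image_size: int):
    return target_file.AutoModel.from_pretrained("some/model", image_size=image_size)


with patch.object(target_file, "AutoModel") as mock_automodel:
    create_model(image_size=300)

assert mock_automodel.from_pretrained.call_args.kwargs["image_size"] == 300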
diff --git a/tests/unit/algo/classification/test_mobilenet_v3.py b/tests/unit/algo/classification/test_mobilenet_v3.py
index 60981098e1c..cecfd1d919a 100644
--- a/tests/unit/algo/classification/test_mobilenet_v3.py
+++ b/tests/unit/algo/classification/test_mobilenet_v3.py
@@ -54,6 +54,11 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_
         assert isinstance(outputs, MulticlassClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
 
+    def test_set_input_size(self):
+        input_size = (1, 3, 300, 300)
+        model = MobileNetV3ForMulticlassCls(mode="large", label_info=10, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
+
 
 @pytest.fixture()
 def fxt_multi_label_cls_model():
@@ -92,6 +97,11 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_
         assert isinstance(outputs, MultilabelClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
 
+    def test_set_input_size(self):
+        input_size = (1, 3, 300, 300)
+        model = MobileNetV3ForMultilabelCls(mode="large", label_info=10, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
+
 
 @pytest.fixture()
 def fxt_h_label_cls_model(fxt_hlabel_data):
@@ -129,3 +139,8 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent
 
         assert isinstance(outputs, HlabelClsBatchPredEntity)
         assert outputs.has_xai_outputs == explain_mode
+
+    def test_set_input_size(self, fxt_hlabel_data):
+        input_size = (1, 3, 300, 300)
+        model = MobileNetV3ForHLabelCls(mode="large", label_info=fxt_hlabel_data, input_size=input_size)
+        assert model.model.backbone.in_size == input_size[-2:]
diff --git a/tests/unit/algo/detection/base_models/test_detr.py b/tests/unit/algo/detection/base_models/test_detr.py
index 55194874b31..71ce30cc1fb 100644
--- a/tests/unit/algo/detection/base_models/test_detr.py
+++ b/tests/unit/algo/detection/base_models/test_detr.py
@@ -3,6 +3,8 @@
 #
 """Test of DETR."""
 
+from unittest.mock import MagicMock
+
 import pytest
 import torch
 import torchvision
@@ -103,3 +105,17 @@ def test_rt_detr_export(self, rt_detr_model, images):
         assert result["bboxes"].shape == (2, 10, 4)
         # ensure no scaling
         assert torch.all(result["bboxes"] < 2)
+
+    def test_set_input_size(self):
+        input_size = 1280
+        model = DETR(
+            backbone=MagicMock(),
+            encoder=MagicMock(),
+            decoder=MagicMock(),
+            num_classes=10,
+            input_size=input_size,
+        )
+
+        expected_multi_scale = sorted([input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2)
+
+        assert sorted(model.multi_scale) == expected_multi_scale
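The expected list in test_set_input_size spans eleven sizes in 32-pixel steps centered on the base size, plus the base size twice more so it is drawn more often during random scale sampling (the weighting rationale is an assumption). For the default input_size=640:

input_size = 640
multi_scale = sorted([input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2)
assert multi_scale == [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]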
diff --git a/tests/unit/algo/segmentation/test_huggingface_model.py b/tests/unit/algo/segmentation/test_huggingface_model.py
index ca3a3a823ae..36693561692 100644
--- a/tests/unit/algo/segmentation/test_huggingface_model.py
+++ b/tests/unit/algo/segmentation/test_huggingface_model.py
@@ -1,6 +1,8 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+from unittest.mock import MagicMock
+
 import pytest
 import torch
 from otx.core.data.entity.base import ImageInfo, OTXBatchLossEntity
@@ -8,6 +10,7 @@
 
 SKIP_TRANSFORMERS_TEST = False
 try:
+    from otx.algo.segmentation import huggingface_model as target_file
     from otx.algo.segmentation.huggingface_model import HuggingFaceModelForSegmentation
     from transformers.modeling_outputs import SemanticSegmenterOutput
     from transformers.models.segformer.image_processing_segformer import SegformerImageProcessor
@@ -69,3 +72,23 @@ def test_customize_outputs(self, fxt_seg_model, fxt_seg_batch_data_entity):
         fxt_seg_model.explain_mode = True
         with pytest.raises(NotImplementedError):
             fxt_seg_model._customize_outputs(outputs, fxt_seg_batch_data_entity)
+
+    @pytest.fixture()
+    def mock_pretrainedconfig(self, mocker) -> MagicMock:
+        mock_obj = mocker.patch.object(target_file, "PretrainedConfig")
+        mock_obj.get_config_dict.return_value = ({"image_size": 512}, None)
+        return mock_obj
+
+    @pytest.fixture()
+    def mock_automodel(self, mocker) -> MagicMock:
+        return mocker.patch.object(target_file, "AutoModelForSemanticSegmentation")
+
+    def test_set_input_size(self, mock_pretrainedconfig, mock_automodel):
+        input_size = (1, 3, 1024, 1024)
+        HuggingFaceModelForSegmentation(
+            model_name_or_path="facebook/deit-tiny-patch16-224",
+            label_info=10,
+            input_size=input_size,
+        )
+
+        assert mock_automodel.from_pretrained.call_args.kwargs["image_size"] == input_size[-1]
diff --git a/tests/unit/algo/visual_prompting/test_segment_anything.py b/tests/unit/algo/visual_prompting/test_segment_anything.py
index 09447019573..9ad548bb72f 100644
--- a/tests/unit/algo/visual_prompting/test_segment_anything.py
+++ b/tests/unit/algo/visual_prompting/test_segment_anything.py
@@ -33,6 +33,7 @@ def test_init(
         )
         segment_anything = SegmentAnything(
             backbone=backbone,
+            image_size=2048,
             freeze_image_encoder=freeze_image_encoder,
             freeze_prompt_encoder=freeze_prompt_encoder,
             freeze_mask_decoder=freeze_mask_decoder,
@@ -40,6 +41,7 @@ def test_init(
 
         # check import modules
         assert hasattr(segment_anything, "image_encoder")
+        assert segment_anything.image_encoder.img_size == 2048
         assert segment_anything.image_encoder.__class__.__name__ == expected_backbone
         assert hasattr(segment_anything, "prompt_encoder")
         assert hasattr(segment_anything, "mask_decoder")
@@ -296,6 +298,10 @@ class TestOTXSegmentAnything:
     def model(self) -> OTXSegmentAnything:
         return OTXSegmentAnything(backbone="tiny_vit")
 
+    def test_set_input_size(self):
+        with pytest.raises(ValueError, match="SAM should use square image size"):
+            OTXSegmentAnything(backbone="tiny_vit", input_size=(1, 3, 1024, 2048))
+
     def test_create_model(self, model) -> None:
         """Test _create_model."""
         segment_anything = model._create_model()
diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py
index 9211e574db5..e0b54c86210 100644
--- a/tests/unit/cli/test_cli.py
+++ b/tests/unit/cli/test_cli.py
@@ -3,11 +3,13 @@
 from __future__ import annotations
 
 import sys
+from unittest.mock import MagicMock
 
 import pytest
 import torch
 import yaml
 from otx.cli import OTXCLI, main
+from otx.cli import cli as target_file
 from rich.console import Console
 
 
@@ -78,8 +80,8 @@ def test_add_subcommands(self, mocker) -> None:
         assert cli._subcommand_method_arguments.keys() == cli.engine_subcommands().keys()
 
     @pytest.fixture()
-    def fxt_train_command(self, monkeypatch, tmpdir) -> list[str]:
-        argv = [
+    def fxt_train_argv(self, tmpdir) -> list[str]:
+        return [
             "otx",
             "train",
             "--config",
@@ -91,8 +93,11 @@ def fxt_train_command(self, monkeypatch, tmpdir) -> list[str]:
             "--work_dir",
             str(tmpdir),
         ]
-        monkeypatch.setattr("sys.argv", argv)
-        return argv
+
+    @pytest.fixture()
+    def fxt_train_command(self, monkeypatch, fxt_train_argv) -> list[str]:
+        monkeypatch.setattr("sys.argv", fxt_train_argv)
+        return fxt_train_argv
 
     def test_instantiate_classes(self, fxt_train_command, mocker) -> None:
         mock_run = mocker.patch("otx.cli.OTXCLI.run")
@@ -115,6 +120,41 @@ def test_instantiate_classes(self, fxt_train_command, mocker) -> None:
         assert cli.datamodule == cli.engine.datamodule
         assert cli.model == cli.engine.model
 
+    @pytest.mark.parametrize("input_size", [512, 1024])
+    def test_instantiate_classes_set_input_size(self, input_size, fxt_train_argv, monkeypatch, mocker) -> None:
+        mocker.patch("otx.cli.OTXCLI.run")
+        fxt_train_argv.extend(["--data.input_size", str(input_size)])
+        monkeypatch.setattr("sys.argv", fxt_train_argv)
+
+        cli = OTXCLI()
+        cli.instantiate_classes()
+
+        assert cli.model.input_size == (1, 3, input_size, input_size)
+
+    @pytest.fixture()
+    def mock_model_cls(self) -> MagicMock:
+        model_cls = MagicMock()
+        model_cls.input_size_multiplier = 12345
+        return model_cls
+
+    def test_instantiate_classes_set_adaptive_input_size(
+        self,
+        fxt_train_argv,
+        monkeypatch,
+        mocker,
+        mock_model_cls,
+    ) -> None:
+        mocker.patch("otx.cli.OTXCLI.run")
+        mocker.patch.object(target_file, "get_model_cls_from_config", return_value=mock_model_cls)
+        fxt_train_argv.extend(["--data.adaptive_input_size", "auto"])
+        monkeypatch.setattr("sys.argv", fxt_train_argv)
+        mock_data_module = mocker.patch("otx.core.data.module.adapt_input_size_to_dataset", return_value=1024)
+
+        cli = OTXCLI()
+        cli.instantiate_classes()
+
+        assert mock_data_module.call_args.args[-1] == 12345
+
     def test_raise_error_correctly(self, fxt_train_command, mocker) -> None:
         mock_engine = mocker.patch("otx.cli.OTXCLI.instantiate_engine")
         mock_engine.return_value.train.side_effect = RuntimeError("my_error")
diff --git a/tests/unit/core/data/test_module.py b/tests/unit/core/data/test_module.py
index ac24753cafd..78d1145ae1d 100644
--- a/tests/unit/core/data/test_module.py
+++ b/tests/unit/core/data/test_module.py
@@ -16,6 +16,7 @@
     TileConfig,
     UnlabeledDataConfig,
 )
+from otx.core.data import module as target_file
 from otx.core.data.module import (
     OTXDataModule,
     OTXTaskType,
@@ -158,9 +159,42 @@ def test_init_input_size(
             input_size=(1200, 1200),
         )
 
-        assert fxt_config.train_subset.input_size == (1000, 1000)
+        assert fxt_config.train_subset.input_size == (1200, 1200)
         assert fxt_config.val_subset.input_size == (1200, 1200)
-        assert fxt_config.test_subset.input_size == (800, 800)
+        assert fxt_config.test_subset.input_size == (1200, 1200)
+
+    @pytest.fixture()
+    def mock_adapt_input_size_to_dataset(self, mocker) -> MagicMock:
+        return mocker.patch.object(target_file, "adapt_input_size_to_dataset", return_value=(1234, 1234))
+
+    def test_init_adaptive_input_size(
+        self,
+        mock_dm_dataset,
+        mock_otx_dataset_factory,
+        mock_data_filtering,
+        fxt_config,
+        mock_adapt_input_size_to_dataset,
+    ) -> None:
+        # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets
+        mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)}
+        mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets
+        fxt_config.train_subset.input_size = (1000, 1000)
+        fxt_config.val_subset.input_size = None
+        fxt_config.test_subset.input_size = (800, 800)
+
+        OTXDataModule(
+            task=OTXTaskType.MULTI_CLASS_CLS,
+            data_format=fxt_config.data_format,
+            data_root=fxt_config.data_root,
+            train_subset=fxt_config.train_subset,
+            val_subset=fxt_config.val_subset,
+            test_subset=fxt_config.test_subset,
+            adaptive_input_size="auto",
+        )
+
+        assert fxt_config.train_subset.input_size == (1234, 1234)
+        assert fxt_config.val_subset.input_size == (1234, 1234)
+        assert fxt_config.test_subset.input_size == (1234, 1234)
 
     def test_data_format_check(
         self,
diff --git a/tests/unit/core/model/test_base.py b/tests/unit/core/model/test_base.py
index dc164a577be..d72891cf538 100644
--- a/tests/unit/core/model/test_base.py
+++ b/tests/unit/core/model/test_base.py
@@ -20,6 +20,11 @@ def __init__(self, num_classes):
 
 
 class TestOTXModel:
+    def test_init(self, monkeypatch):
+        monkeypatch.setattr(OTXModel, "input_size_multiplier", 10, raising=False)
+        with pytest.raises(ValueError, match="Input size should be a multiple"):
+            OTXModel(label_info=2, input_size=(1, 3, 1024, 1024))
+
     def test_smart_weight_loading(self, mocker) -> None:
         with mocker.patch.object(OTXModel, "_create_model", return_value=MockNNModule(2)):
             prev_model = OTXModel(label_info=2)
diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py
index b4b34fb4b2c..1bd9c655cf8 100644
--- a/tests/unit/engine/test_engine.py
+++ b/tests/unit/engine/test_engine.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from pathlib import Path
+from unittest.mock import MagicMock
 
 import pytest
 from otx.algo.classification.efficientnet import EfficientNetForMulticlassCls
@@ -51,6 +52,26 @@ def test_constructor(self, tmp_path) -> None:
         with pytest.raises(ValueError, match="Given model class (.*) requires a valid label_info to instantiate."):
             _ = Engine(work_dir=tmp_path, task="MULTI_CLASS_CLS")
 
+    @pytest.fixture()
+    def mock_datamodule(self, mocker):
+        input_size = (1234, 1234)
+        label_info = 4321
+        mock_datamodule = MagicMock()
+        mock_datamodule.label_info = label_info
+        mock_datamodule.input_size = input_size
+
+        return mocker.patch(
+            "otx.engine.utils.auto_configurator.AutoConfigurator.get_datamodule",
+            return_value=mock_datamodule,
+        )
+
+    def test_model_init(self, tmp_path, mock_datamodule):
+        data_root = "tests/assets/classification_dataset"
+        engine = Engine(work_dir=tmp_path, data_root=data_root)
+
+        assert engine._model.input_size == (1, 3, 1234, 1234)
+        assert engine._model.label_info.num_classes == 4321
+
     def test_model_setter(self, fxt_engine, mocker) -> None:
         assert isinstance(fxt_engine.model, OTXTVModel)
         fxt_engine.model = "efficientnet_b0"
diff --git a/tests/unit/engine/utils/test_auto_configurator.py b/tests/unit/engine/utils/test_auto_configurator.py
index 078c81fc84c..7bf247020c9 100644
--- a/tests/unit/engine/utils/test_auto_configurator.py
+++ b/tests/unit/engine/utils/test_auto_configurator.py
@@ -11,6 +11,7 @@
 from otx.core.types.label import LabelInfo, SegLabelInfo
 from otx.core.types.task import OTXTaskType
 from otx.core.types.transformer_libs import TransformLibType
+from otx.engine.utils import auto_configurator as target_file
 from otx.engine.utils.auto_configurator import (
     DEFAULT_CONFIG_PER_TASK,
     AutoConfigurator,
@@ -108,6 +109,19 @@ def test_get_datamodule(self) -> None:
         assert isinstance(datamodule, OTXDataModule)
         assert datamodule.task == task
 
+    def test_get_datamodule_set_input_size_multiplier(self, mocker) -> None:
+        mock_otxdatamodule = mocker.patch.object(target_file, "OTXDataModule")
+        auto_configurator = AutoConfigurator(
+            data_root="tests/assets/car_tree_bug",
+            task=OTXTaskType.DETECTION,
+            model_name="yolox_tiny",
+        )
+        auto_configurator.config["data"]["adaptive_input_size"] = "auto"
+
+        auto_configurator.get_datamodule()
+
+        assert mock_otxdatamodule.call_args.kwargs["input_size_multiplier"] == 32
+
     def test_get_model(self, fxt_task: OTXTaskType) -> None:
         if fxt_task is OTXTaskType.H_LABEL_CLS:
             pytest.xfail(reason="Not working")
@@ -130,6 +144,16 @@ def test_get_model(self, fxt_task: OTXTaskType) -> None:
             with pytest.raises(ValueError, match="Given model class (.*) requires a valid label_info to instantiate."):
                 _ = auto_configurator.get_model(label_info=None)
 
+    def test_get_model_set_input_size(self) -> None:
+        auto_configurator = AutoConfigurator(task=OTXTaskType.MULTI_CLASS_CLS)
+        label_names = ["class1", "class2", "class3"]
+        label_info = LabelInfo(label_names=label_names, label_groups=[label_names])
+        input_size = 300
+
+        model = auto_configurator.get_model(label_info=label_info, input_size=input_size)
+
+        assert model.input_size == (1, 3, input_size, input_size)
+
     def test_get_optimizer(self, fxt_task: OTXTaskType) -> None:
         if fxt_task in {
             OTXTaskType.ANOMALY_SEGMENTATION,

From 30172ddad8ecd8fe45f42bcb3ed72365c74f87e6 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 13:58:16 +0900
Subject: [PATCH 13/42] update unit test

---
 tests/unit/core/model/test_classification.py   | 10 +++++++++-
 tests/unit/core/model/test_segmentation.py     |  4 ++--
 tests/unit/core/model/test_visual_prompting.py |  4 ++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/unit/core/model/test_classification.py b/tests/unit/core/model/test_classification.py
index 73027bf00f3..835ed854e20 100644
--- a/tests/unit/core/model/test_classification.py
+++ b/tests/unit/core/model/test_classification.py
@@ -37,6 +37,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXMulticlassClsModel(
             label_info=1,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -50,6 +51,7 @@ def test_export_parameters(
 
         model = OTXMultilabelClsModel(
             label_info=1,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -60,6 +62,7 @@ def test_export_parameters(
 
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -76,6 +79,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXMulticlassClsModel(
             label_info=1,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -106,6 +110,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXMultilabelClsModel(
             label_info=1,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -125,6 +130,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXMultilabelClsModel(
             label_info=1,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -156,6 +162,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -176,6 +183,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
+            input_size=(1, 3, 224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -199,7 +207,7 @@ def test_convert_pred_entity_to_compute_metric(
         assert "target" in metric_input
 
     def test_set_label_info(self, fxt_hlabel_multilabel_info):
-        model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info)
+        model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(1, 3, 224, 224))
         assert model.label_info.num_multilabel_classes == fxt_hlabel_multilabel_info.num_multilabel_classes
 
         fxt_hlabel_multilabel_info.num_multilabel_classes = 0
diff --git a/tests/unit/core/model/test_segmentation.py b/tests/unit/core/model/test_segmentation.py
index 573e11d773d..32da4815475 100644
--- a/tests/unit/core/model/test_segmentation.py
+++ b/tests/unit/core/model/test_segmentation.py
@@ -46,7 +46,7 @@ def torch_compile():
 class TestOTXSegmentationModel:
     @pytest.fixture()
     def model(self, label_info, optimizer, scheduler, metric, torch_compile):
-        return OTXSegmentationModel(label_info, optimizer, scheduler, metric, torch_compile)
+        return OTXSegmentationModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile)
 
     def test_export_parameters(self, model):
         params = model._export_parameters
@@ -74,7 +74,7 @@ def test_dispatch_label_info(self, model, label_info, expected_label_info):
 class TestTorchVisionCompatibleModel:
     @pytest.fixture()
     def model(self, label_info, optimizer, scheduler, metric, torch_compile) -> TorchVisionCompatibleModel:
-        return TorchVisionCompatibleModel(label_info, optimizer, scheduler, metric, torch_compile)
+        return TorchVisionCompatibleModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile)
 
     @pytest.fixture()
     def batch_data_entity(self):
diff --git a/tests/unit/core/model/test_visual_prompting.py b/tests/unit/core/model/test_visual_prompting.py
index e8cc1e2fe93..9a3a8709529 100644
--- a/tests/unit/core/model/test_visual_prompting.py
+++ b/tests/unit/core/model/test_visual_prompting.py
@@ -36,7 +36,7 @@
 @pytest.fixture()
 def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel:
     mocker.patch.object(OTXVisualPromptingModel, "_create_model")
-    model = OTXVisualPromptingModel(label_info=1)
+    model = OTXVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024))
     model.model.image_size = 1024
     return model
 
@@ -44,7 +44,7 @@ def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel:
 @pytest.fixture()
 def otx_zero_shot_visual_prompting_model(mocker) -> OTXZeroShotVisualPromptingModel:
     mocker.patch.object(OTXZeroShotVisualPromptingModel, "_create_model")
-    model = OTXZeroShotVisualPromptingModel(label_info=1)
+    model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024))
     model.model.image_size = 1024
     return model
 

From 73598abacf5a9c59ca110306ffbf0e3af133d060 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 15:46:08 +0900
Subject: [PATCH 14/42] implement left unit test

---
 src/otx/core/data/utils/utils.py         |  28 ++--
 tests/unit/core/data/utils/__init__.py   |   2 +
 tests/unit/core/data/utils/test_utils.py | 156 +++++++++++++++++++++++
 3 files changed, 171 insertions(+), 15 deletions(-)
 create mode 100644 tests/unit/core/data/utils/__init__.py
 create mode 100644 tests/unit/core/data/utils/test_utils.py

diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index ee1b6b44f44..60d6a532c86 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -145,6 +145,11 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     return stat
 
 
+_MIN_RECOGNIZABLE_OBJECT_SIZE = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
+# meaning NxN input pixels being downscaled to 1x1 on feature map
+_MIN_DETECTION_INPUT_SIZE = 256  # Minimum input size for object detection
+
+
 def adapt_input_size_to_dataset(
     dataset: Dataset,
     base_input_size: int | tuple[int, int] | None = None,
@@ -164,10 +169,6 @@ def adapt_input_size_to_dataset(
     Returns:
         tuple[int, int] | None: Recommended input size based on dataset statistics.
     """
-    min_recognizable_object_size = 32  # Minimum object size recognizable by NNs: typically 16 ~ 32
-    # meaning NxN input pixels being downscaled to 1x1 on feature map
-    min_detection_input_size = 256  # Minimum input size for object detection
-
     if downscale_only and base_input_size is None:
         msg = "If downscale_only is set to True, base_input_size should be set but got None."
         raise ValueError(msg)
@@ -175,19 +176,13 @@ def adapt_input_size_to_dataset(
     if isinstance(base_input_size, int):
         base_input_size = (base_input_size, base_input_size)
 
-    train_dataset = dataset.subsets().get("train")
-    if train_dataset is None:
+    if (train_dataset := dataset.subsets().get("train")) is None:
         return None
 
     logger.info("Adapting model input size based on dataset stat")
     stat = compute_robust_dataset_statistics(train_dataset)
-    max_image_size = stat["image"]["robust_max"]
+    max_image_size = stat["image"].get("robust_max", 0)
     min_object_size = None
-    if stat["annotation"]:
-        # Refine using annotation shape size stat
-        # Fit to typical small object size (conservative)
-        # -> "avg" size might be preferrable for efficiency
-        min_object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
 
     logger.info(f"-> Current base input size: {base_input_size}")
 
@@ -198,14 +193,17 @@ def adapt_input_size_to_dataset(
     logger.info(f"-> Based on typical large image size: {image_size}")
 
     # Refine using annotation shape size stat
+    # Fit to typical small object size (conservative)
+    # -> "avg" size might be preferrable for efficiency
+    min_object_size = stat.get("annotation", {}).get("size_of_shape", {}).get("robust_min", None)
     if min_object_size is not None and min_object_size > 0:
-        image_size = round(image_size * min_recognizable_object_size / min_object_size)
+        image_size = round(image_size * _MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
         logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}")
         if image_size > max_image_size:
             image_size = max_image_size
             logger.info(f"-> Restrict to max image size: {image_size}")
-        if image_size < min_detection_input_size:
-            image_size = min_detection_input_size
+        if image_size < _MIN_DETECTION_INPUT_SIZE:
+            image_size = _MIN_DETECTION_INPUT_SIZE
             logger.info(f"-> Based on minimum object detection input size: {image_size}")
 
     if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
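The .get(...) chains introduced above make the heuristic degrade gracefully when a statistics key is absent; a quick standalone check of the same access pattern:

stat: dict = {"image": {}}
max_image_size = stat["image"].get("robust_max", 0)
min_object_size = stat.get("annotation", {}).get("size_of_shape", {}).get("robust_min", None)
assert max_image_size == 0
assert min_object_size is None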
diff --git a/tests/unit/core/data/utils/__init__.py b/tests/unit/core/data/utils/__init__.py
new file mode 100644
index 00000000000..916f3a44b27
--- /dev/null
+++ b/tests/unit/core/data/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/unit/core/data/utils/test_utils.py b/tests/unit/core/data/utils/test_utils.py
new file mode 100644
index 00000000000..ace8d23250a
--- /dev/null
+++ b/tests/unit/core/data/utils/test_utils.py
@@ -0,0 +1,156 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+"""Tests for utils for OTX data module."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import cv2
+import numpy as np
+import pytest
+from datumaro.components.annotation import Bbox
+from datumaro.components.dataset import Dataset as DmDataset
+from datumaro.components.dataset_base import DatasetItem
+from datumaro.components.media import Image
+from otx.core.data.utils import utils as target_file
+from otx.core.data.utils.utils import (
+    adapt_input_size_to_dataset,
+    compute_robust_dataset_statistics,
+    compute_robust_scale_statistics,
+    compute_robust_statistics,
+)
+
+
+def test_compute_robust_statistics():
+    values = np.array([])
+    stat = compute_robust_statistics(values)
+    assert len(stat) == 0
+
+    values = np.array([0.5, 1, 1.5])
+    stat = compute_robust_statistics(values)
+    assert np.isclose(stat["avg"], 1.0)
+    assert np.isclose(stat["min"], 0.5)
+    assert np.isclose(stat["max"], 1.5)
+
+    values = np.random.rand(10)
+    stat = compute_robust_statistics(values)
+    assert np.isclose(stat["min"], np.min(values))
+    assert np.isclose(stat["max"], np.max(values))
+    assert stat["min"] <= stat["robust_min"]
+    assert stat["max"] <= stat["robust_max"]
+
+
+def test_compute_robust_scale_statistics():
+    scales = np.array([])
+    stat = compute_robust_scale_statistics(scales)
+    assert len(stat) == 0
+
+    scales = np.array([0.5, 1, 2])
+    stat = compute_robust_scale_statistics(scales)
+    assert np.isclose(stat["avg"], 1.0)
+    assert np.isclose(stat["min"], 0.5)
+    assert np.isclose(stat["max"], 2.0)
+
+    scales = np.random.rand(10)
+    stat = compute_robust_scale_statistics(scales)
+    assert np.isclose(stat["min"], np.min(scales))
+    assert np.isclose(stat["max"], np.max(scales))
+    assert stat["min"] <= stat["robust_min"]
+    assert stat["max"] <= stat["robust_max"]
+
+
+def make_media(shape: tuple[int, int, int]):
+    np_img = np.zeros(shape=shape, dtype=np.uint8)
+    np_img[:, :, 0] = 0  # Set 0 for B channel
+    np_img[:, :, 1] = 1  # Set 1 for G channel
+    np_img[:, :, 2] = 2  # Set 2 for R channel
+
+    _, np_bytes = cv2.imencode(".png", np_img)
+    media = Image.from_bytes(np_bytes.tobytes())
+    media.path = ""
+
+    return media
+
+
+@pytest.fixture()
+def mock_dataset() -> DmDataset:
+    return DmDataset.from_iterable(
+        [
+            DatasetItem(
+                id="1",
+                subset="train",
+                media=make_media((50, 50, 3)),
+                annotations=[
+                    Bbox(x=0, y=0, w=5, h=5, label=0),
+                ],
+            ),
+            DatasetItem(
+                id="2",
+                subset="train",
+                media=make_media((100, 100, 3)),
+                annotations=[
+                    Bbox(x=0, y=0, w=10, h=10, label=0),
+                    Bbox(x=10, y=10, w=20, h=20, label=0),
+                ],
+            ),
+            DatasetItem(
+                id="3",
+                subset="train",
+                media=make_media((200, 200, 3)),
+                annotations=[],
+            ),
+        ],
+    )
+
+
+def test_compute_robust_dataset_statistics(mock_dataset):
+    subset = mock_dataset.get_subset("train")
+
+    stat = compute_robust_dataset_statistics(subset, max_samples=0)
+    assert len(stat) == 0
+    stat = compute_robust_dataset_statistics(subset, max_samples=-1)
+    assert len(stat) == 0
+
+    stat = compute_robust_dataset_statistics(subset)
+    assert np.isclose(stat["image"]["avg"], 100)
+    assert np.isclose(stat["annotation"]["num_per_image"]["avg"], 1.0)
+    assert np.isclose(stat["annotation"]["size_of_shape"]["avg"], 10.0)
+
+
+def test_adapt_input_size_to_dataset(mocker):
+    mock_stat = mocker.patch.object(target_file, "compute_robust_dataset_statistics")
+
+    with pytest.raises(ValueError, match="base_input_size should be set"):
+        adapt_input_size_to_dataset(dataset=MagicMock())
+
+    mock_stat.return_value = {"image": {}, "annotation": {}}
+    mock_dataset = MagicMock()
+    mock_dataset.subsets.return_value = {}
+    input_size = adapt_input_size_to_dataset(dataset=mock_dataset, base_input_size=512)
+    assert input_size is None
+
+    mock_stat.return_value = {"image": {}, "annotation": {}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
+    assert input_size == (512, 512)
+
+    mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
+    assert input_size == (150, 150)
+
+    mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512, input_size_multiplier=32)
+    assert input_size == (160, 160)
+
+    mock_stat.return_value = {"image": {"robust_max": 256}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
+    assert input_size == (256, 256)
+
+    mock_stat.return_value = {"image": {"robust_max": 1024}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
+    assert input_size == (512, 512)
+
+    mock_stat.return_value = {"image": {"robust_max": 2045}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    input_size = adapt_input_size_to_dataset(dataset=MagicMock(), downscale_only=False, base_input_size=512)
+    assert input_size == (1022, 1022)
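
[Editor's note] To make the last case concrete: 2045 * 32 / 64 = 1022.5, and Python's round() uses banker's rounding, so the half lands on the even 1022; that value is below the robust max (2045) and above the 256 floor, hence (1022, 1022).

assert round(2045 * 32 / 64) == 1022  # 1022.5 rounds to the even neighbour
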

From 4df567476c038fc3ae7c8656f5b1192b3d31c40b Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 16:04:42 +0900
Subject: [PATCH 15/42] align with develop branch

---
 src/otx/algo/classification/timm_model.py | 6 ++++++
 src/otx/cli/cli.py                        | 3 ++-
 src/otx/core/model/anomaly.py             | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/otx/algo/classification/timm_model.py b/src/otx/algo/classification/timm_model.py
index 04eaf5ff396..7c540b3e1ef 100644
--- a/src/otx/algo/classification/timm_model.py
+++ b/src/otx/algo/classification/timm_model.py
@@ -54,6 +54,7 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         backbone: TimmModelType,
+        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -66,6 +67,7 @@ def __init__(
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -146,6 +148,7 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         backbone: TimmModelType,
+        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -157,6 +160,7 @@ def __init__(
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -226,6 +230,7 @@ def __init__(
         self,
         label_info: HLabelInfo,
         backbone: TimmModelType,
+        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -237,6 +242,7 @@ def __init__(
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 31717fb487d..04ea994148f 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -344,7 +344,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             self.datamodule = self.get_config_value(self.config_init, "data")
 
             # pass OTXDataModule input size to the model
-            if (input_size := self.datamodule.input_size) is not None:
+            if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]:
+                # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check
                 input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)  # type: ignore[assignment]
                 model_config["init_args"]["input_size"] = (
                     tuple(model_config["init_args"]["input_size"][:-2]) + input_size  # type: ignore[operator]
diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py
index 1e3e9e0dd1f..a4f57c4fe0e 100644
--- a/src/otx/core/model/anomaly.py
+++ b/src/otx/core/model/anomaly.py
@@ -81,7 +81,7 @@ def on_load_checkpoint(self, checkpoint: dict[str, Any]) -> None:
             for key, value in anomaly_attrs.items():
                 setattr(self, key, value)
 
-    @property
+    @property  # type: ignore[override]
     def input_size(self) -> tuple[int, int]:
         """Returns the input size of the model.
 

From 4fcc62718b34d92040eb03357a40d07aa87b343f Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 16:44:14 +0900
Subject: [PATCH 16/42] fix typo

---
 src/otx/algo/detection/yolox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index ac319e26532..b21a0420d67 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -72,7 +72,7 @@ def _customize_inputs(
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         if self.input_size is None:
-            msg = f"Inputhsize attribute is not set for {self.__class__}"
+            msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
         swap_rgb = not isinstance(self, YOLOXTINY)  # only YOLOX-TINY uses RGB

From 39b0650946566af8ec13e2a01481b10e78c892ea Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 18:10:51 +0900
Subject: [PATCH 17/42] exclude batch size and number of channels from input size

---
 src/otx/algo/action_classification/movinet.py |  2 +-
 src/otx/algo/action_classification/x3d.py     |  2 +-
 src/otx/algo/classification/efficientnet.py   | 12 +++++-----
 .../algo/classification/huggingface_model.py  |  9 +++----
 src/otx/algo/classification/mobilenet_v3.py   | 19 ++++++++-------
 src/otx/algo/classification/timm_model.py     |  6 ++---
 .../algo/classification/torchvision_model.py  | 11 +++++----
 src/otx/algo/classification/vit.py            | 14 +++++------
 src/otx/algo/detection/atss.py                |  4 ++--
 src/otx/algo/detection/huggingface_model.py   |  7 +++---
 src/otx/algo/detection/rtdetr.py              | 22 ++++++++---------
 src/otx/algo/detection/rtmdet.py              |  4 ++--
 src/otx/algo/detection/ssd.py                 |  8 +++----
 src/otx/algo/detection/yolox.py               |  6 ++---
 .../algo/instance_segmentation/maskrcnn.py    |  8 +++----
 .../algo/instance_segmentation/maskrcnn_tv.py |  4 ++--
 .../algo/instance_segmentation/rtmdet_inst.py |  4 ++--
 src/otx/algo/segmentation/dino_v2_seg.py      |  2 +-
 .../algo/segmentation/huggingface_model.py    |  2 +-
 src/otx/algo/segmentation/litehrnet.py        |  4 ++--
 src/otx/algo/segmentation/segnext.py          |  2 +-
 .../algo/visual_prompting/segment_anything.py |  8 +++----
 .../zero_shot_segment_anything.py             |  2 +-
 src/otx/cli/cli.py                            |  6 ++---
 src/otx/core/data/utils/utils.py              |  3 +--
 src/otx/core/model/action_classification.py   |  8 +++----
 src/otx/core/model/base.py                    | 13 +++++-----
 src/otx/core/model/classification.py          | 24 +++++++++----------
 src/otx/core/model/detection.py               |  6 ++---
 src/otx/core/model/instance_segmentation.py   |  8 +++----
 src/otx/core/model/segmentation.py            | 14 +++++------
 src/otx/core/model/visual_prompting.py        | 16 ++++++-------
 src/otx/engine/utils/auto_configurator.py     | 12 +++++-----
 33 files changed, 136 insertions(+), 136 deletions(-)

diff --git a/src/otx/algo/action_classification/movinet.py b/src/otx/algo/action_classification/movinet.py
index 4803aba3d9e..9bd85b0fed0 100644
--- a/src/otx/algo/action_classification/movinet.py
+++ b/src/otx/algo/action_classification/movinet.py
@@ -32,7 +32,7 @@ class MoViNet(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py
index 5d49b19661f..98cda3fe3bf 100644
--- a/src/otx/algo/action_classification/x3d.py
+++ b/src/otx/algo/action_classification/x3d.py
@@ -32,7 +32,7 @@ class X3D(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 1, 3, 8, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index 4bf4220aaec..52eb86d7f75 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -57,7 +57,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
     ) -> None:
         self.version = version
@@ -88,7 +88,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         neck = GlobalAveragePooling(dim=2)
         loss = nn.CrossEntropyLoss(reduction="none")
         if self.train_type == OTXTrainType.SEMI_SUPERVISED:
@@ -151,7 +151,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.version = version
         self.pretrained = pretrained
@@ -180,7 +180,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         return ImageClassifier(
             backbone=backbone,
             neck=GlobalAveragePooling(dim=2),
@@ -233,7 +233,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.version = version
         self.pretrained = pretrained
@@ -268,7 +268,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
         if not isinstance(self.label_info, HLabelInfo):
             raise TypeError(self.label_info)
 
-        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size[-2:], pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         return ImageClassifier(
             backbone=backbone,
             neck=GlobalAveragePooling(dim=2),
diff --git a/src/otx/algo/classification/huggingface_model.py b/src/otx/algo/classification/huggingface_model.py
index d48120de1d2..c9bdf5b02cb 100644
--- a/src/otx/algo/classification/huggingface_model.py
+++ b/src/otx/algo/classification/huggingface_model.py
@@ -31,7 +31,7 @@
     from otx.core.metrics import MetricCallable
 
 
-DEFAULT_INPUT_SIZE = (1, 3, 224, 224)
+DEFAULT_INPUT_SIZE = (224, 224)
 logger = logging.getLogger(__name__)
 
 
@@ -44,7 +44,8 @@ class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
         optimizer (OptimizerCallable, optional): The optimizer callable for training the model.
         scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): The learning rate scheduler callable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
-        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224)
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (224, 224)
 
     Example:
         1. API
@@ -66,7 +67,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = DEFAULT_INPUT_SIZE,
+        input_size: tuple[int, int] = DEFAULT_INPUT_SIZE,
     ) -> None:
         self.model_name = model_name_or_path
 
@@ -83,7 +84,7 @@ def _create_model(self) -> nn.Module:
         model_config, _ = PretrainedConfig.get_config_dict(self.model_name)
         kwargs = {}
         if "image_size" in model_config:
-            kwargs["image_size"] = self.input_size[-1]
+            kwargs["image_size"] = self.input_size[0]
         elif self.input_size != DEFAULT_INPUT_SIZE:
             msg = "There is no 'image_size' argument in the model configuration. There may be unexpected results."
             logger.warning(msg)
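
[Editor's note] Hugging Face image-classification configs that carry this field, such as ViT's, expose image_size as a single int for square inputs, which is why only the height (index 0) is passed. A hypothetical illustration:

input_size = (224, 224)
kwargs = {"image_size": input_size[0]}  # one int; a square input is assumed
assert kwargs["image_size"] == 224
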
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index 62727037737..fb391e37888 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -62,7 +62,8 @@ class MobileNetV3ForMulticlassCls(OTXMulticlassClsModel):
         metric (MetricCallable, optional): The metric callable. Defaults to MultiClassClsMetricCallable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
         freeze_backbone (bool, optional): Whether to freeze the backbone layers during training. Defaults to False.
-        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224)
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (224, 224)
     """
 
     def __init__(
@@ -73,7 +74,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
     ) -> None:
         self.mode = mode
@@ -103,7 +104,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:])
+        backbone = OTXMobileNetV3(mode=self.mode, input_size=self.input_size)
         neck = GlobalAveragePooling(dim=2)
         loss = nn.CrossEntropyLoss(reduction="none")
         in_channels = 960 if self.mode == "large" else 576
@@ -166,7 +167,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -194,7 +195,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelNonLinearClsHead(
                 num_classes=num_classes,
@@ -251,7 +252,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -297,7 +298,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -331,7 +332,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size[-2:]),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size),
             neck=GlobalAveragePooling(dim=2),
             head=HierarchicalNonLinearClsHead(
                 in_channels=960,
@@ -409,7 +410,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
diff --git a/src/otx/algo/classification/timm_model.py b/src/otx/algo/classification/timm_model.py
index 7c540b3e1ef..411da97a165 100644
--- a/src/otx/algo/classification/timm_model.py
+++ b/src/otx/algo/classification/timm_model.py
@@ -54,7 +54,7 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         backbone: TimmModelType,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
+        input_size: tuple[int, int] = (224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -148,7 +148,7 @@ def __init__(
         self,
         label_info: LabelInfoTypes,
         backbone: TimmModelType,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
+        input_size: tuple[int, int] = (224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -230,7 +230,7 @@ def __init__(
         self,
         label_info: HLabelInfo,
         backbone: TimmModelType,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),  # input size of default classification data recipe
+        input_size: tuple[int, int] = (224, 224),  # input size of default classification data recipe
         pretrained: bool = True,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index 002855f82ae..e90c4776e1e 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -404,7 +404,8 @@ class OTXTVModel(OTXModel):
         task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional):
             The type of classification task.
         train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training.
-        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 224, 224)
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (224, 224)
     """
 
     model: TVClassificationModel
@@ -423,7 +424,7 @@ def __init__(
             OTXTaskType.H_LABEL_CLS,
         ] = OTXTaskType.MULTI_CLASS_CLS,
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.backbone = backbone
         self.freeze_backbone = freeze_backbone
@@ -446,7 +447,7 @@ def __init__(
             torch_compile=torch_compile,
             input_size=input_size,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     def _create_model(self) -> nn.Module:
         if self.task == OTXTaskType.MULTI_CLASS_CLS:
@@ -556,7 +557,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -654,7 +655,7 @@ def _convert_pred_entity_to_compute_metric(
 
     def get_dummy_input(self, batch_size: int = 1) -> CLASSIFICATION_BATCH_DATA_ENTITY:
         """Returns a dummy input for classification model."""
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [torch.LongTensor([0])] * batch_size
 
         if self.task == OTXTaskType.MULTI_CLASS_CLS:
diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py
index 6ccf64607a8..9a629b94537 100644
--- a/src/otx/algo/classification/vit.py
+++ b/src/otx/algo/classification/vit.py
@@ -219,7 +219,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
     ) -> None:
         self.arch = arch
@@ -279,7 +279,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora)
         if self.train_type == OTXTrainType.SEMI_SUPERVISED:
             return SemiSLClassifier(
                 backbone=vit_backbone,
@@ -320,7 +320,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:])
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size)
         return SemiSLClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -348,7 +348,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -405,7 +405,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
@@ -434,7 +434,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
-        input_size: tuple[int, ...] = (1, 3, 224, 224),
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.arch = arch
         self.lora = lora
@@ -496,7 +496,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             {"std": 0.2, "layer": "Linear", "type": "TruncNormal"},
             {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"},
         ]
-        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size[-2:], lora=self.lora)
+        vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora)
         return ImageClassifier(
             backbone=vit_backbone,
             neck=None,
diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py
index a873055354b..019c7e5cc82 100644
--- a/src/otx/algo/detection/atss.py
+++ b/src/otx/algo/detection/atss.py
@@ -39,7 +39,7 @@ class ATSS(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 800, 992),
+        input_size: tuple[int, int] = (800, 992),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -65,7 +65,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py
index 859825a143f..eb0cd9111ca 100644
--- a/src/otx/algo/detection/huggingface_model.py
+++ b/src/otx/algo/detection/huggingface_model.py
@@ -36,7 +36,8 @@ class HuggingFaceModelForDetection(OTXDetectionModel):
     Args:
         model_name_or_path (str): The name or path of the pre-trained model.
         label_info (LabelInfoTypes): The label information for the model.
-        input_size (tuple[int, ...], optional): The input size of the model. Defaults to (1, 3, 800, 992).
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (800, 992).
         optimizer (OptimizerCallable, optional): The optimizer for training the model.
             Defaults to DefaultOptimizerCallable.
         scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -61,7 +62,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=object-detection
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 800, 992),  # input size of default detection data recipe
+        input_size: tuple[int, int] = (800, 992),  # input size of default detection data recipe
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -156,7 +157,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=image_mean,  # type: ignore[arg-type]
             std=image_std,  # type: ignore[arg-type]
             resize_mode="standard",
diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py
index cf8c3d820a8..8af1d28a659 100644
--- a/src/otx/algo/detection/rtdetr.py
+++ b/src/otx/algo/detection/rtdetr.py
@@ -46,7 +46,7 @@ class RTDETR(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 640, 640),
+        input_size: tuple[int, int] = (640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -199,7 +199,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
@@ -242,13 +242,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
         encoder = HybridEncoder(
             in_channels=[128, 256, 512],
             expansion=0.5,
-            eval_spatial_size=self.input_size[-2:],
+            eval_spatial_size=self.input_size,
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             num_decoder_layers=3,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_size[-2:],
+            eval_spatial_size=self.input_size,
         )
 
         optimizer_configuration = [
@@ -266,7 +266,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
-            input_size=self.input_size[-1],
+            input_size=self.input_size[0],
         )
 
 
@@ -286,12 +286,12 @@ def _build_model(self, num_classes: int) -> nn.Module:
             norm_cfg={"type": "FBN", "name": "norm"},
         )
         encoder = HybridEncoder(
-            eval_spatial_size=self.input_size[-2:],
+            eval_spatial_size=self.input_size,
         )
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[256, 256, 256],
-            eval_spatial_size=self.input_size[-2:],
+            eval_spatial_size=self.input_size,
             num_decoder_layers=6,
         )
 
@@ -310,7 +310,7 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
-            input_size=self.input_size[-1],
+            input_size=self.input_size[0],
         )
 
 
@@ -334,13 +334,13 @@ def _build_model(self, num_classes: int) -> nn.Module:
             hidden_dim=384,
             dim_feedforward=2048,
             in_channels=[512, 1024, 2048],
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size,
         )
 
         decoder = RTDETRTransformer(
             num_classes=num_classes,
             feat_channels=[384, 384, 384],
-            eval_spatial_size=self.input_size[2:],
+            eval_spatial_size=self.input_size,
         )
 
         # no bias decay and learning rate correction for the backbone.
@@ -360,5 +360,5 @@ def _build_model(self, num_classes: int) -> nn.Module:
             decoder=decoder,
             num_classes=num_classes,
             optimizer_configuration=optimizer_configuration,
-            input_size=self.input_size[-1],
+            input_size=self.input_size[0],
         )
diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py
index b382ff65225..fb43031838b 100644
--- a/src/otx/algo/detection/rtmdet.py
+++ b/src/otx/algo/detection/rtmdet.py
@@ -41,7 +41,7 @@ class RTMDet(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 640, 640),
+        input_size: tuple[int, int] = (640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -67,7 +67,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py
index e4c095dffa2..f6aa62b6cea 100644
--- a/src/otx/algo/detection/ssd.py
+++ b/src/otx/algo/detection/ssd.py
@@ -55,7 +55,7 @@ class SSD(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 864, 864),
+        input_size: tuple[int, int] = (864, 864),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -168,11 +168,11 @@ def _get_new_anchors(self, dataset: OTXDataset, anchor_generator: SSDAnchorGener
                 if isinstance(transform, Resize):
                     target_wh = transform.scale
         if target_wh is None:
-            target_wh = self.input_size[-2:]
+            target_wh = list(reversed(self.input_size))  # type: ignore[assignment]
             msg = f"Cannot get target_wh from the dataset. Assign it with the default value: {target_wh}"
             logger.warning(msg)
         group_as = [len(width) for width in anchor_generator.widths]
-        wh_stats = self._get_sizes_from_dataset_entity(dataset, list(target_wh))
+        wh_stats = self._get_sizes_from_dataset_entity(dataset, list(target_wh))  # type: ignore[arg-type]
 
         if len(wh_stats) < sum(group_as):
             logger.warning(
@@ -297,7 +297,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py
index b21a0420d67..fd1a8765cad 100644
--- a/src/otx/algo/detection/yolox.py
+++ b/src/otx/algo/detection/yolox.py
@@ -43,7 +43,7 @@ class YOLOX(ExplainableOTXDetModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 640, 640),
+        input_size: tuple[int, int] = (640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
@@ -79,7 +79,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -150,7 +150,7 @@ class YOLOXTINY(YOLOX):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 416, 416),
+        input_size: tuple[int, int] = (416, 416),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py
index 1ab96d01bb1..67b8a772d3a 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn.py
@@ -48,7 +48,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window",
@@ -88,7 +88,7 @@ class MaskRCNNResNet50(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
+        input_size: tuple[int, int] = (1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -278,7 +278,7 @@ class MaskRCNNEfficientNet(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
+        input_size: tuple[int, int] = (1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -485,7 +485,7 @@ class MaskRCNNSwinT(MaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 1344, 1344),
+        input_size: tuple[int, int] = (1344, 1344),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/maskrcnn_tv.py b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
index 075e4bcf811..7d530cbd926 100644
--- a/src/otx/algo/instance_segmentation/maskrcnn_tv.py
+++ b/src/otx/algo/instance_segmentation/maskrcnn_tv.py
@@ -234,7 +234,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window",
@@ -275,7 +275,7 @@ class TVMaskRCNNR50(TVMaskRCNN):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
+        input_size: tuple[int, int] = (1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py
index 92a5627e5cb..02bedb89aa9 100644
--- a/src/otx/algo/instance_segmentation/rtmdet_inst.py
+++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py
@@ -45,7 +45,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -96,7 +96,7 @@ class RTMDetInstTiny(RTMDetInst):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 640, 640),
+        input_size: tuple[int, int] = (640, 640),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/algo/segmentation/dino_v2_seg.py b/src/otx/algo/segmentation/dino_v2_seg.py
index 16101a6fcad..3aef09e73af 100644
--- a/src/otx/algo/segmentation/dino_v2_seg.py
+++ b/src/otx/algo/segmentation/dino_v2_seg.py
@@ -56,7 +56,7 @@ class OTXDinoV2Seg(TorchVisionCompatibleModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 560, 560),
+        input_size: tuple[int, int] = (560, 560),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index a64798b22f4..c60964c27c8 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -65,7 +65,7 @@ def __init__(
         self,
         model_name_or_path: str,  # https://huggingface.co/models?pipeline_tag=image-segmentation
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 512, 512),  # input size of default semantic segmentation data recipe
+        input_size: tuple[int, int] = (512, 512),  # input size of default semantic segmentation data recipe
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py
index 81d9e99f57c..a056be9dda6 100644
--- a/src/otx/algo/segmentation/litehrnet.py
+++ b/src/otx/algo/segmentation/litehrnet.py
@@ -528,7 +528,7 @@ class OTXLiteHRNet(TorchVisionCompatibleModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 512, 512),
+        input_size: tuple[int, int] = (512, 512),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -599,7 +599,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.scale,
             resize_mode="standard",
diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py
index 703f5b1dfbe..4a01417f158 100644
--- a/src/otx/algo/segmentation/segnext.py
+++ b/src/otx/algo/segmentation/segnext.py
@@ -118,7 +118,7 @@ class OTXSegNext(TorchVisionCompatibleModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] = (1, 3, 512, 512),
+        input_size: tuple[int, int] = (512, 512),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
diff --git a/src/otx/algo/visual_prompting/segment_anything.py b/src/otx/algo/visual_prompting/segment_anything.py
index feb02857375..7cc9673aa4f 100644
--- a/src/otx/algo/visual_prompting/segment_anything.py
+++ b/src/otx/algo/visual_prompting/segment_anything.py
@@ -496,7 +496,7 @@ def __init__(
         self,
         backbone: Literal["tiny_vit", "vit_b"],
         label_info: LabelInfoTypes = NullLabelInfo(),
-        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
+        input_size: tuple[int, int] = (1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -509,14 +509,14 @@ def __init__(
         return_extra_metrics: bool = False,
         stability_score_offset: float = 1.0,
     ) -> None:
-        if input_size[-1] != input_size[-2]:
+        if input_size[0] != input_size[1]:
             msg = f"SAM should use square image size, but got {input_size}"
             raise ValueError(msg)
 
         self.config = {
             "backbone": backbone,
-            "image_size": input_size[-1],
-            "image_embedding_size": input_size[-1] // 16,
+            "image_size": input_size[0],
+            "image_embedding_size": input_size[0] // 16,
             "freeze_image_encoder": freeze_image_encoder,
             "freeze_prompt_encoder": freeze_prompt_encoder,
             "freeze_mask_decoder": freeze_mask_decoder,
diff --git a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
index 5edfa3aabd1..71fca59b925 100644
--- a/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
+++ b/src/otx/algo/visual_prompting/zero_shot_segment_anything.py
@@ -661,7 +661,7 @@ def __init__(  # noqa: PLR0913
         }
         super().__init__(
             label_info=label_info,
-            input_size=(1, 3, 1024, 1024),  # zero-shot visual prompting model uses fixed 1024x1024 input size
+            input_size=(1024, 1024),  # zero-shot visual prompting model uses fixed 1024x1024 input size
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 04ea994148f..d20e30a700e 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -335,8 +335,7 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # if adaptive_input_size will be executed and the model has input_size_multiplier, pass it to OTXDataModule
             if self.config[self.subcommand].data.adaptive_input_size != "none":
                 model_cls = get_model_cls_from_config(model_config)
-                if hasattr(model_cls, "input_size_multiplier"):
-                    self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier
+                self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier
 
             # Instantiate the things that don't need to special handling
             self.config_init = self.parser.instantiate_classes(self.config)
@@ -346,9 +345,8 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             # pass OTXDataModule input size to the model
             if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]:
                 # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check
-                input_size = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)  # type: ignore[assignment]
                 model_config["init_args"]["input_size"] = (
-                    tuple(model_config["init_args"]["input_size"][:-2]) + input_size  # type: ignore[operator]
+                    (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)
                 )
 
             # Instantiate the model and needed components
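
[Editor's note] With models now taking bare (H, W) tuples, the init_args assignment reduces to the normalization below (640 is an illustrative value):

input_size = 640  # hypothetical value from self.datamodule.input_size
normalized = (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)
assert normalized == (640, 640)
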
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 60d6a532c86..07777bda224 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -220,8 +220,7 @@ def area(x: tuple[int, int]) -> int:
             logger.info(f"-> Downscale only: {input_size} -> {base_input_size}")
             return base_input_size
 
-    # Closest preset
-    logger.info(f"-> Closest preset: {input_size}")
+    logger.info(f"-> Adapted input size: {input_size}")
     return input_size
 
 
diff --git a/src/otx/core/model/action_classification.py b/src/otx/core/model/action_classification.py
index b31c4e1f63e..98b26ad1bd5 100644
--- a/src/otx/core/model/action_classification.py
+++ b/src/otx/core/model/action_classification.py
@@ -37,7 +37,7 @@ class OTXActionClsModel(OTXModel[ActionClsBatchDataEntity, ActionClsBatchPredEnt
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -53,7 +53,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int, int, int]
+        self.input_size: tuple[int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
@@ -133,7 +133,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 1, 3, 8, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="standard",
@@ -165,7 +165,7 @@ def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[st
 
     def get_dummy_input(self, batch_size: int = 1) -> ActionClsBatchDataEntity:
         """Returns a dummy input for action classification model."""
-        images = torch.rand(batch_size, *self.input_size[1:])
+        images = torch.rand(batch_size, 1, 3, 8, *self.input_size)
         labels = [torch.LongTensor([0])] * batch_size
         infos = []
         for i, img in enumerate(images):
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index 1b4dc9b6acc..ac945d26b71 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -99,11 +99,12 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti
     """
 
     _OPTIMIZED_MODEL_BASE_NAME: str = "optimized_model"
+    input_size_multiplier = 1
 
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...] | None = None,
+        input_size: tuple[int, int] | None = None,
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = NullMetricCallable,
@@ -809,13 +810,11 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
         raise TypeError(label_info)
 
-    def _check_input_size(self, input_size: tuple[int, ...] | None = None) -> None:
-        if (
-            input_size is not None
-            and hasattr(self, "input_size_multiplier")
-            and (input_size[-1] % self.input_size_multiplier != 0 or input_size[-2] % self.input_size_multiplier != 0)
+    def _check_input_size(self, input_size: tuple[int, int] | None = None) -> None:
+        if input_size is not None and (
+            input_size[0] % self.input_size_multiplier != 0 or input_size[1] % self.input_size_multiplier != 0
         ):
-            msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size[-2:]} instead."
+            msg = f"Input size should be a multiple of {self.input_size_multiplier}, but got {input_size} instead."
             raise ValueError(msg)
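With input_size_multiplier now a class attribute defaulting to 1, the hasattr guard is gone and the check validates the (height, width) pair directly. A standalone sketch of the simplified check; the function name is illustrative, and 32 is an assumed multiplier in the style of stride-sensitive detectors:

    # Hypothetical standalone version of the simplified _check_input_size.
    def check_input_size(input_size, input_size_multiplier=1):
        if input_size is not None and (
            input_size[0] % input_size_multiplier != 0
            or input_size[1] % input_size_multiplier != 0
        ):
            msg = f"Input size should be a multiple of {input_size_multiplier}, but got {input_size} instead."
            raise ValueError(msg)

    check_input_size((640, 640), input_size_multiplier=32)   # passes silently
    # check_input_size((650, 640), input_size_multiplier=32) # would raise ValueError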
 
 
diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index 26a7a21843c..678203447dd 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -46,7 +46,7 @@ class OTXMulticlassClsModel(OTXModel[MulticlassClsBatchDataEntity, MulticlassCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -63,7 +63,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     def _customize_inputs(self, inputs: MulticlassClsBatchDataEntity) -> dict[str, Any]:
         if self.training:
@@ -162,7 +162,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -190,7 +190,7 @@ def _reset_prediction_layer(self, num_classes: int) -> None:
 
     def get_dummy_input(self, batch_size: int = 1) -> MulticlassClsBatchDataEntity:
         """Returns a dummy input for classification model."""
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [torch.LongTensor([0])] * batch_size
         return MulticlassClsBatchDataEntity(batch_size, images, [], labels=labels)
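The pattern repeated across these model classes: self.input_size now stores only (height, width), and exporters and dummy inputs prepend whatever fixed dimensions they need. A sketch of the convention; reading the action-classification prefix (1, 1, 3, 8) as batch, clip, channels, and frames is an assumption:

    input_size = (224, 224)               # stored on the model as (height, width)
    export_shape = (1, 3, *input_size)    # exporters prepend batch=1 and channels=3
    dummy_image_shape = (3, *input_size)  # per-image dummy tensors omit the batch dim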
 
@@ -209,7 +209,7 @@ class OTXMultilabelClsModel(OTXModel[MultilabelClsBatchDataEntity, MultilabelCls
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
@@ -223,7 +223,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     def _customize_inputs(self, inputs: MultilabelClsBatchDataEntity) -> dict[str, Any]:
         if self.training:
@@ -287,7 +287,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -314,7 +314,7 @@ def forward_for_tracing(self, image: Tensor) -> Tensor | dict[str, Tensor]:
 
     def get_dummy_input(self, batch_size: int = 1) -> MultilabelClsBatchDataEntity:
         """Returns a dummy input for classification OV model."""
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [torch.LongTensor([0])] * batch_size
         return MultilabelClsBatchDataEntity(batch_size, images, [], labels=labels)
 
@@ -327,7 +327,7 @@ class OTXHlabelClsModel(OTXModel[HlabelClsBatchDataEntity, HlabelClsBatchPredEnt
     def __init__(
         self,
         label_info: HLabelInfo,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
@@ -341,7 +341,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     def _customize_inputs(self, inputs: HlabelClsBatchDataEntity) -> dict[str, Any]:
         if self.training:
@@ -409,7 +409,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -449,7 +449,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
     def get_dummy_input(self, batch_size: int = 1) -> HlabelClsBatchDataEntity:
         """Returns a dummy input for classification OV model."""
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, self.input_size) for _ in range(batch_size)]
         labels = [torch.LongTensor([0])] * batch_size
         return HlabelClsBatchDataEntity(batch_size, images, [], labels=labels)
 
diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py
index 938715396b6..76b7c2d1538 100644
--- a/src/otx/core/model/detection.py
+++ b/src/otx/core/model/detection.py
@@ -41,7 +41,7 @@
 class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]):
     """Base class for the detection models used in OTX."""
 
-    input_size: tuple[int, int, int, int]
+    input_size: tuple[int, int]
 
     def test_step(self, batch: DetBatchDataEntity, batch_idx: int) -> None:
         """Perform a single test step on a batch of data from the test set.
@@ -368,7 +368,7 @@ def get_dummy_input(self, batch_size: int = 1) -> DetBatchDataEntity:
             msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         infos = []
         for i, img in enumerate(images):
             infos.append(
@@ -387,7 +387,7 @@ class ExplainableOTXDetModel(OTXDetectionModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MeanAveragePrecisionFMeasureCallable,
diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py
index 756a5fd7641..2a26b688920 100644
--- a/src/otx/core/model/instance_segmentation.py
+++ b/src/otx/core/model/instance_segmentation.py
@@ -49,7 +49,7 @@ class OTXInstanceSegModel(OTXModel[InstanceSegBatchDataEntity, InstanceSegBatchP
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
@@ -65,7 +65,7 @@ def __init__(
             torch_compile=torch_compile,
             tile_config=tile_config,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     def _build_model(self, num_classes: int) -> nn.Module:
         raise NotImplementedError
@@ -366,7 +366,7 @@ def get_dummy_input(self, batch_size: int = 1) -> InstanceSegBatchDataEntity:
             msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
-        images = [torch.rand(*self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         infos = []
         for i, img in enumerate(images):
             infos.append(
@@ -385,7 +385,7 @@ class ExplainableOTXInstanceSegModel(OTXInstanceSegModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MaskRLEMeanAPFMeasureCallable,
diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py
index 935e2a2215b..ba2f5963448 100644
--- a/src/otx/core/model/segmentation.py
+++ b/src/otx/core/model/segmentation.py
@@ -37,7 +37,7 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -47,7 +47,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (tuple[int, ...]): The input size of the model.
+            input_size (tuple[int, int]): Model input size in the order of height and width.
             optimizer (OptimizerCallable, optional): The optimizer to use for training.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -65,7 +65,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     @property
     def _export_parameters(self) -> TaskLevelExportParameters:
@@ -112,7 +112,7 @@ def get_dummy_input(self, batch_size: int = 1) -> SegBatchDataEntity:
             msg = f"Input size attribute is not set for {self.__class__}"
             raise ValueError(msg)
 
-        images = torch.rand(batch_size, *self.input_size[1:])
+        images = torch.rand(batch_size, 3, *self.input_size)
         infos = []
         for i, img in enumerate(images):
             infos.append(
@@ -131,7 +131,7 @@ class TorchVisionCompatibleModel(OTXSegmentationModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = SegmCallable,  # type: ignore[assignment]
@@ -146,7 +146,7 @@ def __init__(
 
         Args:
             label_info (LabelInfoTypes): The label information for the segmentation model.
-            input_size (tuple[int, ...]): The input size of the model.
+            input_size (tuple[int, int]): Model input size in the order of height and width.
             optimizer (OptimizerCallable, optional): The optimizer callable for the model.
                 Defaults to DefaultOptimizerCallable.
             scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional):
@@ -220,7 +220,7 @@ def _exporter(self) -> OTXModelExporter:
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.scale,
             resize_mode="standard",
diff --git a/src/otx/core/model/visual_prompting.py b/src/otx/core/model/visual_prompting.py
index c6d7d2010c2..6c50c2ee62b 100644
--- a/src/otx/core/model/visual_prompting.py
+++ b/src/otx/core/model/visual_prompting.py
@@ -162,7 +162,7 @@ class OTXVisualPromptingModel(OTXModel[VisualPromptingBatchDataEntity, VisualPro
     def __init__(
         self,
         label_info: LabelInfoTypes = NullLabelInfo(),
-        input_size: tuple[int, ...] = (1, 3, 1024, 1024),
+        input_size: tuple[int, int] = (1024, 1024),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = VisualPromptingMetricCallable,
@@ -178,14 +178,14 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXVisualPromptingModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="fit_to_window",
@@ -265,7 +265,7 @@ def _set_label_info(self, _: LabelInfoTypes) -> None:
 
     def get_dummy_input(self, batch_size: int = 1) -> VisualPromptingBatchDataEntity:
         """Returns a dummy input for VPT model."""
-        images = [torch.rand(self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [{"points": torch.LongTensor([0] * batch_size)}] * batch_size
         prompts = [torch.zeros((1, 2))] * batch_size
         return VisualPromptingBatchDataEntity(
@@ -287,7 +287,7 @@ class OTXZeroShotVisualPromptingModel(
 
     def __init__(
         self,
-        input_size: tuple[int, ...],
+        input_size: tuple[int, int],
         label_info: LabelInfoTypes = NullLabelInfo(),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
@@ -304,14 +304,14 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int, int, int]
+        self.input_size: tuple[int, int]
 
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXVisualPromptingModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.input_size,
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="fit_to_window",
@@ -450,7 +450,7 @@ def _set_label_info(self, _: LabelInfoTypes) -> None:
 
     def get_dummy_input(self, batch_size: int = 1) -> ZeroShotVisualPromptingBatchDataEntity:
         """Returns a dummy input for ZSL VPT model."""
-        images = [torch.rand(self.input_size[1:]) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [ZeroShotVisualPromptingLabel(prompts=torch.LongTensor([0]))] * batch_size
         prompts = [torch.zeros((1, 2))] * batch_size
         infos = []
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index b8f1280099a..b55792f8c77 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -230,8 +230,7 @@ def get_datamodule(self) -> OTXDataModule | None:
 
         if data_config.get("adaptive_input_size", "none") != "none":
             model_cls = get_model_cls_from_config(Namespace(self.config["model"]))
-            if hasattr(model_cls, "input_size_multiplier"):
-                data_config["input_size_multiplier"] = model_cls.input_size_multiplier
+            data_config["input_size_multiplier"] = model_cls.input_size_multiplier
 
         return OTXDataModule(
             train_subset=SubsetConfig(sampler=SamplerConfig(**train_config.pop("sampler", {})), **train_config),
@@ -250,7 +249,7 @@ def get_model(
         self,
         model_name: str | None = None,
         label_info: LabelInfoTypes | None = None,
-        input_size: tuple[int, ...] | int | None = None,
+        input_size: tuple[int, int] | int | None = None,
     ) -> OTXModel:
         """Retrieves the OTXModel instance based on the provided model name and meta information.
 
@@ -258,7 +257,7 @@ def get_model(
             model_name (str | None): The name of the model to retrieve. If None, the default model will be used.
             label_info (LabelInfoTypes | None): The meta information about the labels.
                 If provided, the number of classes will be updated in the model's configuration.
-            input_size (tuple[int, ...] | int | None, optional): Input size of the model. Defaults to None.
+            input_size (tuple[int, int] | int | None, optional): Input size of the model. Defaults to None.
 
         Returns:
             OTXModel: The instantiated OTXModel instance.
@@ -286,8 +285,9 @@ def get_model(
         model_config = deepcopy(self.config["model"])
 
         if input_size is not None:
-            input_size = (input_size, input_size) if isinstance(input_size, int) else input_size
-            model_config["init_args"]["input_size"] = tuple(model_config["init_args"]["input_size"][:-2]) + input_size
+            model_config["init_args"]["input_size"] = (
+                (input_size, input_size) if isinstance(input_size, int) else input_size
+            )
 
         model_cls = get_model_cls_from_config(Namespace(model_config))
 

From aee7600c09647a9838577e6d7adff41ae51673d4 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 18:15:05 +0900
Subject: [PATCH 18/42] update docstring

---
 src/otx/engine/utils/auto_configurator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index b55792f8c77..5574f1fc3d9 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -257,7 +257,9 @@ def get_model(
             model_name (str | None): The name of the model to retrieve. If None, the default model will be used.
             label_info (LabelInfoTypes | None): The meta information about the labels.
                 If provided, the number of classes will be updated in the model's configuration.
-            input_size (tuple[int, int] | int | None, optional): Input size of the model. Defaults to None.
+            input_size (tuple[int, int] | int | None, optional):
+                Model input size in the order of height and width, or a single integer for a square input.
+                Defaults to None.
 
         Returns:
             OTXModel: The instantiated OTXModel instance.

From 5d6c4815897b11f6e98b9eb29afa83329030ebc4 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 18:47:48 +0900
Subject: [PATCH 19/42] update unit test

---
 tests/unit/algo/classification/test_efficientnet.py | 6 +++---
 tests/unit/algo/classification/test_mobilenet_v3.py | 6 +++---
 tests/unit/algo/detection/test_rtmdet.py            | 2 +-
 tests/unit/algo/detection/test_yolox.py             | 4 ++--
 tests/unit/cli/test_cli.py                          | 2 +-
 tests/unit/core/model/test_visual_prompting.py      | 4 ++--
 tests/unit/engine/test_engine.py                    | 2 +-
 tests/unit/engine/utils/test_auto_configurator.py   | 2 +-
 8 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/unit/algo/classification/test_efficientnet.py b/tests/unit/algo/classification/test_efficientnet.py
index fd501ff48ed..45c0681444d 100644
--- a/tests/unit/algo/classification/test_efficientnet.py
+++ b/tests/unit/algo/classification/test_efficientnet.py
@@ -55,7 +55,7 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = EfficientNetForMulticlassCls(version="b0", label_info=10, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
 
@@ -98,7 +98,7 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = EfficientNetForMultilabelCls(version="b0", label_info=10, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
 
@@ -141,6 +141,6 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self, fxt_hlabel_data):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = EfficientNetForHLabelCls(version="b0", label_info=fxt_hlabel_data, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
diff --git a/tests/unit/algo/classification/test_mobilenet_v3.py b/tests/unit/algo/classification/test_mobilenet_v3.py
index cecfd1d919a..39d4b282b0a 100644
--- a/tests/unit/algo/classification/test_mobilenet_v3.py
+++ b/tests/unit/algo/classification/test_mobilenet_v3.py
@@ -55,7 +55,7 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = MobileNetV3ForMulticlassCls(mode="large", label_info=10, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
 
@@ -98,7 +98,7 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = MobileNetV3ForMultilabelCls(mode="large", label_info=10, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
 
@@ -141,6 +141,6 @@ def test_predict_step(self, fxt_h_label_cls_model, fxt_hlabel_cls_batch_data_ent
         assert outputs.has_xai_outputs == explain_mode
 
     def test_set_input_size(self, fxt_hlabel_data):
-        input_size = (1, 3, 300, 300)
+        input_size = (300, 300)
         model = MobileNetV3ForHLabelCls(mode="large", label_info=fxt_hlabel_data, input_size=input_size)
         assert model.model.backbone.in_size == input_size[-2:]
diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py
index 9344687894c..17f4b7ecc35 100644
--- a/tests/unit/algo/detection/test_rtmdet.py
+++ b/tests/unit/algo/detection/test_rtmdet.py
@@ -18,7 +18,7 @@ def test_init(self) -> None:
         assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXt)
         assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN)
         assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead)
-        assert otx_rtmdet_tiny.input_size == (1, 3, 640, 640)
+        assert otx_rtmdet_tiny.input_size == (640, 640)
 
     def test_exporter(self) -> None:
         otx_rtmdet_tiny = RTMDetTiny(label_info=3)
diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py
index 29ffdd8172e..fdb8e835ee7 100644
--- a/tests/unit/algo/detection/test_yolox.py
+++ b/tests/unit/algo/detection/test_yolox.py
@@ -18,10 +18,10 @@ def test_init(self) -> None:
         assert isinstance(otx_yolox_l.model.backbone, CSPDarknet)
         assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN)
         assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead)
-        assert otx_yolox_l.input_size == (1, 3, 640, 640)
+        assert otx_yolox_l.input_size == (640, 640)
 
         otx_yolox_tiny = YOLOXTINY(label_info=3)
-        assert otx_yolox_tiny.input_size == (1, 3, 416, 416)
+        assert otx_yolox_tiny.input_size == (416, 416)
 
     def test_exporter(self) -> None:
         otx_yolox_l = YOLOXL(label_info=3)
diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py
index e0b69b77a54..8fa1581f18a 100644
--- a/tests/unit/cli/test_cli.py
+++ b/tests/unit/cli/test_cli.py
@@ -129,7 +129,7 @@ def test_instantiate_classes_set_input_size(self, input_size, fxt_train_argv, mo
         cli = OTXCLI()
         cli.instantiate_classes()
 
-        assert cli.model.input_size == (1, 3, input_size, input_size)
+        assert cli.model.input_size == (input_size, input_size)
 
     @pytest.fixture()
     def mock_model_cls(self) -> MagicMock:
diff --git a/tests/unit/core/model/test_visual_prompting.py b/tests/unit/core/model/test_visual_prompting.py
index 9a3a8709529..01d245a46b9 100644
--- a/tests/unit/core/model/test_visual_prompting.py
+++ b/tests/unit/core/model/test_visual_prompting.py
@@ -36,7 +36,7 @@
 @pytest.fixture()
 def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel:
     mocker.patch.object(OTXVisualPromptingModel, "_create_model")
-    model = OTXVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024))
+    model = OTXVisualPromptingModel(label_info=1, input_size=(1024, 1024))
     model.model.image_size = 1024
     return model
 
@@ -44,7 +44,7 @@ def otx_visual_prompting_model(mocker) -> OTXVisualPromptingModel:
 @pytest.fixture()
 def otx_zero_shot_visual_prompting_model(mocker) -> OTXZeroShotVisualPromptingModel:
     mocker.patch.object(OTXZeroShotVisualPromptingModel, "_create_model")
-    model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1, 3, 1024, 1024))
+    model = OTXZeroShotVisualPromptingModel(label_info=1, input_size=(1024, 1024))
     model.model.image_size = 1024
     return model
 
diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py
index 1bd9c655cf8..e1695e2d9a2 100644
--- a/tests/unit/engine/test_engine.py
+++ b/tests/unit/engine/test_engine.py
@@ -69,7 +69,7 @@ def test_model_init(self, tmp_path, mock_datamodule):
         data_root = "tests/assets/classification_dataset"
         engine = Engine(work_dir=tmp_path, data_root=data_root)
 
-        assert engine._model.input_size == (1, 3, 1234, 1234)
+        assert engine._model.input_size == (1234, 1234)
         assert engine._model.label_info.num_classes == 4321
 
     def test_model_setter(self, fxt_engine, mocker) -> None:
diff --git a/tests/unit/engine/utils/test_auto_configurator.py b/tests/unit/engine/utils/test_auto_configurator.py
index 7bf247020c9..6627e4131ab 100644
--- a/tests/unit/engine/utils/test_auto_configurator.py
+++ b/tests/unit/engine/utils/test_auto_configurator.py
@@ -152,7 +152,7 @@ def test_get_model_set_input_size(self) -> None:
 
         model = auto_configurator.get_model(label_info=label_info, input_size=input_size)
 
-        assert model.input_size == (1, 3, input_size, input_size)
+        assert model.input_size == (input_size, input_size)
 
     def test_get_optimizer(self, fxt_task: OTXTaskType) -> None:
         if fxt_task in {

From ff8ecf9f76b619cd2e314a2ab7d304c2f91b402c Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 19:34:29 +0900
Subject: [PATCH 20/42] adaptive input size supports non-square inputs

---
 src/otx/core/data/utils/utils.py         | 53 ++++++++++++--------
 tests/unit/core/data/utils/test_utils.py | 61 ++++++++++++++++++------
 2 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 07777bda224..9e2ee03837d 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -85,14 +85,17 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     Returns:
         Dict[str, Any]: Robust avg, min, max values for images, and annotations optionally.
             ex) stat = {
-                    "image": {"avg": ...},
+                    "image": {
+                        "height" : {"avg": ...},
+                        "width" : {"avg": ...},
+                    }
                     "annotation": {
                        "num_per_image": {"avg": ...},
                        "size_of_shape": {"avg": ...},
                     }
                 }
     """
-    stat: dict = {}
+    stat: dict = {"image": {}, "annotation": {}}
     if len(dataset) == 0 or max_samples <= 0:
         return stat
 
@@ -101,14 +104,16 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
     rng = np.random.default_rng(42)
     data_ids = rng.choice(data_ids, max_image_samples, replace=False)[:max_image_samples]
 
-    image_sizes = []
+    height_arr = []
+    width_arr = []
     for idx in data_ids:
         data = dataset.get(id=idx, subset=dataset.name)
         height, width = data.media.size
-        image_sizes.append(np.sqrt(width * height))
-    stat["image"] = compute_robust_scale_statistics(np.array(image_sizes))
+        height_arr.append(height)
+        width_arr.append(width)
+    stat["image"]["height"] = compute_robust_scale_statistics(np.array(height_arr))
+    stat["image"]["width"] = compute_robust_scale_statistics(np.array(width_arr))
 
-    stat["annotation"] = {}
     num_per_images: list[int] = []
     size_of_shapes: dict[str, list] = defaultdict(list)
     for idx in data_ids:
@@ -181,12 +186,15 @@ def adapt_input_size_to_dataset(
 
     logger.info("Adapting model input size based on dataset stat")
     stat = compute_robust_dataset_statistics(train_dataset)
-    max_image_size = stat["image"].get("robust_max", 0)
+    max_image_size: list[int] = [
+        stat["image"].get("height", {}).get("robust_max", 0),
+        stat["image"].get("width", {}).get("robust_max", 0),
+    ]
     min_object_size = None
 
     logger.info(f"-> Current base input size: {base_input_size}")
 
-    if max_image_size <= 0:
+    if max_image_size[0] <= 0 or max_image_size[1] <= 0:
         return base_input_size
 
     image_size = max_image_size
@@ -197,31 +205,34 @@ def adapt_input_size_to_dataset(
     # -> "avg" size might be preferrable for efficiency
     min_object_size = stat.get("annotation", {}).get("size_of_shape", {}).get("robust_min", None)
     if min_object_size is not None and min_object_size > 0:
-        image_size = round(image_size * _MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size)
+        image_size = [round(val * _MIN_RECOGNIZABLE_OBJECT_SIZE / min_object_size) for val in image_size]
         logger.info(f"-> Based on typical small object size {min_object_size}: {image_size}")
-        if image_size > max_image_size:
+        if image_size[0] > max_image_size[0]:
             image_size = max_image_size
             logger.info(f"-> Restrict to max image size: {image_size}")
-        if image_size < _MIN_DETECTION_INPUT_SIZE:
-            image_size = _MIN_DETECTION_INPUT_SIZE
+        if image_size[0] < _MIN_DETECTION_INPUT_SIZE or image_size[1] < _MIN_DETECTION_INPUT_SIZE:
+            big_val_idx = 0 if image_size[0] > image_size[1] else 1
+            small_val_idx = 1 - big_val_idx
+            image_size[big_val_idx] = image_size[big_val_idx] * _MIN_DETECTION_INPUT_SIZE // image_size[small_val_idx]
+            image_size[small_val_idx] = _MIN_DETECTION_INPUT_SIZE
             logger.info(f"-> Based on minimum object detection input size: {image_size}")
 
-    if input_size_multiplier is not None and image_size % input_size_multiplier != 0:
-        image_size = (image_size // input_size_multiplier + 1) * input_size_multiplier
-
-    input_size = (round(image_size), round(image_size))
+    if input_size_multiplier is not None:
+        for i, val in enumerate(image_size):
+            if val % input_size_multiplier != 0:
+                image_size[i] = (val // input_size_multiplier + 1) * input_size_multiplier
 
     if downscale_only:
 
-        def area(x: tuple[int, int]) -> int:
+        def area(x: list[int] | tuple[int, int]) -> int:
             return x[0] * x[1]
 
-        if base_input_size and area(input_size) >= area(base_input_size):
-            logger.info(f"-> Downscale only: {input_size} -> {base_input_size}")
+        if base_input_size and area(image_size) >= area(base_input_size):
+            logger.info(f"-> Downscale only: {image_size} -> {base_input_size}")
             return base_input_size
 
-    logger.info(f"-> Adapted input size: {input_size}")
-    return input_size
+    logger.info(f"-> Adapted input size: {image_size}")
+    return tuple(image_size)  # type: ignore[return-value]
 
 
 def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
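A worked trace of the new rounding step, using values from the updated unit tests below; _MIN_RECOGNIZABLE_OBJECT_SIZE = 32 and _MIN_DETECTION_INPUT_SIZE = 256 are assumed stand-ins for the module constants not shown in this diff:

    # Robust max (height, width) = (150, 200), no annotation stats, multiplier 32.
    image_size = [150, 200]
    multiplier = 32
    image_size = [
        (v // multiplier + 1) * multiplier if v % multiplier != 0 else v
        for v in image_size
    ]
    assert image_size == [160, 224]  # each side rounded up to the next multiple of 32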
diff --git a/tests/unit/core/data/utils/test_utils.py b/tests/unit/core/data/utils/test_utils.py
index ace8d23250a..606b08f5ef7 100644
--- a/tests/unit/core/data/utils/test_utils.py
+++ b/tests/unit/core/data/utils/test_utils.py
@@ -109,12 +109,15 @@ def test_compute_robuste_dataset_statistics(mock_dataset):
     subset = mock_dataset.get_subset("train")
 
     stat = compute_robust_dataset_statistics(subset, max_samples=0)
-    assert len(stat) == 0
+    assert stat["image"] == {}
+    assert stat["annotation"] == {}
     stat = compute_robust_dataset_statistics(subset, max_samples=-1)
-    assert len(stat) == 0
+    assert stat["image"] == {}
+    assert stat["annotation"] == {}
 
     stat = compute_robust_dataset_statistics(subset)
-    assert np.isclose(stat["image"]["avg"], 100)
+    assert np.isclose(stat["image"]["height"]["avg"], 100)
+    assert np.isclose(stat["image"]["width"]["avg"], 100)
     assert np.isclose(stat["annotation"]["num_per_image"]["avg"], 1.0)
     assert np.isclose(stat["annotation"]["size_of_shape"]["avg"], 10.0)
 
@@ -135,22 +138,52 @@ def test_adapt_input_size_to_dataset(mocker):
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
     assert input_size == (512, 512)
 
-    mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}}
+    mock_stat.return_value = {
+        "image": {
+            "height": {"robust_max": 150},
+            "width": {"robust_max": 200},
+        },
+        "annotation": {},
+    }
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
-    assert input_size == (150, 150)
-
-    mock_stat.return_value = {"image": {"robust_max": 150}, "annotation": {}}
+    assert input_size == (150, 200)
+
+    mock_stat.return_value = {
+        "image": {
+            "height": {"robust_max": 150},
+            "width": {"robust_max": 200},
+        },
+        "annotation": {},
+    }
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512, input_size_multiplier=32)
-    assert input_size == (160, 160)
-
-    mock_stat.return_value = {"image": {"robust_max": 256}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    assert input_size == (160, 224)
+
+    mock_stat.return_value = {
+        "image": {
+            "height": {"robust_max": 224},
+            "width": {"robust_max": 240},
+        },
+        "annotation": {"size_of_shape": {"robust_min": 64}},
+    }
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
-    assert input_size == (256, 256)
-
-    mock_stat.return_value = {"image": {"robust_max": 1024}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    assert input_size == (256, 274)
+
+    mock_stat.return_value = {
+        "image": {
+            "height": {"robust_max": 1024},
+            "width": {"robust_max": 1200},
+        },
+        "annotation": {"size_of_shape": {"robust_min": 64}},
+    }
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), base_input_size=512)
     assert input_size == (512, 512)
 
-    mock_stat.return_value = {"image": {"robust_max": 2045}, "annotation": {"size_of_shape": {"robust_min": 64}}}
+    mock_stat.return_value = {
+        "image": {
+            "height": {"robust_max": 2045},
+            "width": {"robust_max": 2045},
+        },
+        "annotation": {"size_of_shape": {"robust_min": 64}},
+    }
     input_size = adapt_input_size_to_dataset(dataset=MagicMock(), downscale_only=False, base_input_size=512)
     assert input_size == (1022, 1022)
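The final expectation follows from the typical-object scaling alone: assuming _MIN_RECOGNIZABLE_OBJECT_SIZE is 32, round(2045 * 32 / 64) evaluates to 1022 in Python (banker's rounding of 1022.5) for both sides, and with downscale_only=False the 512 base input size no longer caps the result.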

From 82e41e0537d792135befb0333a25354e6fd5379b Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 19:35:44 +0900
Subject: [PATCH 21/42] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67c87e31181..806f44d8ed0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,8 @@ All notable changes to this project will be documented in this file.
   (https://github.com/openvinotoolkit/training_extensions/pull/3762)
 - Add RTMPose for Keypoint Detection Task
   (https://github.com/openvinotoolkit/training_extensions/pull/3781)
+- Support configurable input size
+  (https://github.com/openvinotoolkit/training_extensions/pull/3788)
 
 ### Enhancements
 

From 4e8ce701347e281990b19562d52c3b900546114c Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 9 Aug 2024 20:21:09 +0900
Subject: [PATCH 22/42] fix typo

---
 .../algo/classification/torchvision_model.py   |  2 +-
 src/otx/algo/segmentation/huggingface_model.py |  2 +-
 src/otx/core/exporter/visual_prompting.py      |  2 +-
 tests/unit/core/model/test_base.py             |  2 +-
 tests/unit/core/model/test_classification.py   | 18 +++++++++---------
 tests/unit/core/model/test_segmentation.py     |  4 ++--
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/otx/algo/classification/torchvision_model.py b/src/otx/algo/classification/torchvision_model.py
index e90c4776e1e..fec454381ed 100644
--- a/src/otx/algo/classification/torchvision_model.py
+++ b/src/otx/algo/classification/torchvision_model.py
@@ -404,7 +404,7 @@ class OTXTVModel(OTXModel):
         task (Literal[OTXTaskType.MULTI_CLASS_CLS, OTXTaskType.MULTI_LABEL_CLS, OTXTaskType.H_LABEL_CLS], optional):
             The type of classification task.
         train_type (Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED], optional): The type of training.
-        input_size (tuple[int, ...], optional):
+        input_size (tuple[int, int], optional):
             Model input size in the order of height and width. Defaults to (224, 224)
     """
 
diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py
index c60964c27c8..f00a7faceb6 100644
--- a/src/otx/algo/segmentation/huggingface_model.py
+++ b/src/otx/algo/segmentation/huggingface_model.py
@@ -96,7 +96,7 @@ def _create_model(self) -> nn.Module:
             if self.input_size[0] % patch_size != 0 or self.input_size[1] % patch_size != 0:
                 msg = (
                     f"It's recommended to set the input size to multiple of patch size({patch_size}). "
-                    "If not, score can decrease or model can't work."
+                    "If not, score can decrease or model may not work."
                 )
                 logger.warning(msg)
 
diff --git a/src/otx/core/exporter/visual_prompting.py b/src/otx/core/exporter/visual_prompting.py
index 6b3d3970120..ea40073686b 100644
--- a/src/otx/core/exporter/visual_prompting.py
+++ b/src/otx/core/exporter/visual_prompting.py
@@ -175,7 +175,7 @@ def get_onnx_dummy_inputs(
                     model.image_embedding_size,
                     dtype=torch.float32,
                 ),
-                "point_coords": torch.randint(low=0, high=self.input_size[-1], size=(1, 2, 2), dtype=torch.float32),
+                "point_coords": torch.randint(low=0, high=self.input_size[0], size=(1, 2, 2), dtype=torch.float32),
                 "point_labels": torch.randint(low=0, high=4, size=(1, 2), dtype=torch.float32),
                 "mask_input": torch.randn(
                     1,
diff --git a/tests/unit/core/model/test_base.py b/tests/unit/core/model/test_base.py
index d72891cf538..3a24908e99f 100644
--- a/tests/unit/core/model/test_base.py
+++ b/tests/unit/core/model/test_base.py
@@ -23,7 +23,7 @@ class TestOTXModel:
     def test_init(self, monkeypatch):
         monkeypatch.setattr(OTXModel, "input_size_multiplier", 10, raising=False)
         with pytest.raises(ValueError, match="Input size should be a multiple"):
-            OTXModel(label_info=2, input_size=(1, 3, 1024, 1024))
+            OTXModel(label_info=2, input_size=(1024, 1024))
 
     def test_smart_weight_loading(self, mocker) -> None:
         with mocker.patch.object(OTXModel, "_create_model", return_value=MockNNModule(2)):
diff --git a/tests/unit/core/model/test_classification.py b/tests/unit/core/model/test_classification.py
index 835ed854e20..352bf9a331d 100644
--- a/tests/unit/core/model/test_classification.py
+++ b/tests/unit/core/model/test_classification.py
@@ -37,7 +37,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXMulticlassClsModel(
             label_info=1,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -51,7 +51,7 @@ def test_export_parameters(
 
         model = OTXMultilabelClsModel(
             label_info=1,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -62,7 +62,7 @@ def test_export_parameters(
 
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -79,7 +79,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXMulticlassClsModel(
             label_info=1,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -110,7 +110,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXMultilabelClsModel(
             label_info=1,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -130,7 +130,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXMultilabelClsModel(
             label_info=1,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -162,7 +162,7 @@ def test_export_parameters(
     ) -> None:
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -183,7 +183,7 @@ def test_convert_pred_entity_to_compute_metric(
     ) -> None:
         model = OTXHlabelClsModel(
             label_info=fxt_hlabel_multilabel_info,
-            input_size=(1, 3, 224, 224),
+            input_size=(224, 224),
             torch_compile=False,
             optimizer=mock_optimizer,
             scheduler=mock_scheduler,
@@ -207,7 +207,7 @@ def test_convert_pred_entity_to_compute_metric(
         assert "target" in metric_input
 
     def test_set_label_info(self, fxt_hlabel_multilabel_info):
-        model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(1, 3, 224, 224))
+        model = OTXHlabelClsModel(label_info=fxt_hlabel_multilabel_info, input_size=(224, 224))
         assert model.label_info.num_multilabel_classes == fxt_hlabel_multilabel_info.num_multilabel_classes
 
         fxt_hlabel_multilabel_info.num_multilabel_classes = 0
diff --git a/tests/unit/core/model/test_segmentation.py b/tests/unit/core/model/test_segmentation.py
index 32da4815475..130aa3a96dd 100644
--- a/tests/unit/core/model/test_segmentation.py
+++ b/tests/unit/core/model/test_segmentation.py
@@ -46,7 +46,7 @@ def torch_compile():
 class TestOTXSegmentationModel:
     @pytest.fixture()
     def model(self, label_info, optimizer, scheduler, metric, torch_compile):
-        return OTXSegmentationModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile)
+        return OTXSegmentationModel(label_info, (512, 512), optimizer, scheduler, metric, torch_compile)
 
     def test_export_parameters(self, model):
         params = model._export_parameters
@@ -74,7 +74,7 @@ def test_dispatch_label_info(self, model, label_info, expected_label_info):
 class TestTorchVisionCompatibleModel:
     @pytest.fixture()
     def model(self, label_info, optimizer, scheduler, metric, torch_compile) -> TorchVisionCompatibleModel:
-        return TorchVisionCompatibleModel(label_info, (1, 3, 512, 512), optimizer, scheduler, metric, torch_compile)
+        return TorchVisionCompatibleModel(label_info, (512, 512), optimizer, scheduler, metric, torch_compile)
 
     @pytest.fixture()
     def batch_data_entity(self):

From 9260a8c5e04014f7589045d6e72a8d5882652993 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 09:15:10 +0900
Subject: [PATCH 23/42] fix typo

---
 src/otx/core/model/classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py
index 678203447dd..f48f026b585 100644
--- a/src/otx/core/model/classification.py
+++ b/src/otx/core/model/classification.py
@@ -449,7 +449,7 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo:
 
     def get_dummy_input(self, batch_size: int = 1) -> HlabelClsBatchDataEntity:
         """Returns a dummy input for classification OV model."""
-        images = [torch.rand(3, self.input_size) for _ in range(batch_size)]
+        images = [torch.rand(3, *self.input_size) for _ in range(batch_size)]
         labels = [torch.LongTensor([0])] * batch_size
         return HlabelClsBatchDataEntity(batch_size, images, [], labels=labels)
 

From d40a9f06cafaeae6fe8c29a9228d50c9040b7af1 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 10:13:35 +0900
Subject: [PATCH 24/42] update base data pipeline

---
 src/otx/recipe/_base_/data/anomaly.yaml       |  4 +++-
 .../recipe/_base_/data/classification.yaml    |  4 +++-
 src/otx/recipe/_base_/data/detection.yaml     | 12 +++++++---
 .../_base_/data/instance_segmentation.yaml    | 12 +++++++---
 .../_base_/data/keypoint_detection.yaml       | 24 +++++++++----------
 .../recipe/_base_/data/rotated_detection.yaml | 12 +++++++---
 .../_base_/data/semantic_segmentation.yaml    | 12 +++++++---
 .../_base_/data/torchvision_semisl.yaml       |  4 +++-
 .../recipe/_base_/data/visual_prompting.yaml  | 12 +++++++---
 9 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml
index 2f74b987915..29d4471d9a6 100644
--- a/src/otx/recipe/_base_/data/anomaly.yaml
+++ b/src/otx/recipe/_base_/data/anomaly.yaml
@@ -1,5 +1,4 @@
 task: ANOMALY_CLASSIFICATION
-input_size: 256
 data_format: mvtec
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
@@ -7,6 +6,7 @@ image_color_channel: RGB
 stack_images: false
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size: 256
   subset_name: train
   transform_lib_type: TORCHVISION
   to_tv_image: true
@@ -30,6 +30,7 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size: 256
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: true
@@ -53,6 +54,7 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size: 256
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: true
diff --git a/src/otx/recipe/_base_/data/classification.yaml b/src/otx/recipe/_base_/data/classification.yaml
index e8ee41bf15e..04b675d8774 100644
--- a/src/otx/recipe/_base_/data/classification.yaml
+++ b/src/otx/recipe/_base_/data/classification.yaml
@@ -1,5 +1,4 @@
 task: MULTI_CLASS_CLS
-input_size: 224
 mem_cache_size: 1GB
 mem_cache_img_max_size:
   - 500
@@ -9,6 +8,7 @@ stack_images: true
 data_format: imagenet_with_subset_dirs
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size: 224
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -34,6 +34,7 @@ train_subset:
     class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
 
 val_subset:
+  input_size: 224
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -56,6 +57,7 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size: 224
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 64
diff --git a/src/otx/recipe/_base_/data/detection.yaml b/src/otx/recipe/_base_/data/detection.yaml
index c08a5fea022..fa88d02b3fd 100644
--- a/src/otx/recipe/_base_/data/detection.yaml
+++ b/src/otx/recipe/_base_/data/detection.yaml
@@ -1,7 +1,4 @@
 task: DETECTION
-input_size:
-  - 800
-  - 992
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -9,6 +6,9 @@ stack_images: true
 data_format: coco_instances
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size:
+    - 800
+    - 992
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -35,6 +35,9 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size:
+    - 800
+    - 992
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -56,6 +59,9 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size:
+    - 800
+    - 992
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1
diff --git a/src/otx/recipe/_base_/data/instance_segmentation.yaml b/src/otx/recipe/_base_/data/instance_segmentation.yaml
index 3520f3930a7..299bf488d4f 100644
--- a/src/otx/recipe/_base_/data/instance_segmentation.yaml
+++ b/src/otx/recipe/_base_/data/instance_segmentation.yaml
@@ -1,7 +1,4 @@
 task: INSTANCE_SEGMENTATION
-input_size:
-  - 1024
-  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -10,6 +7,9 @@ data_format: coco_instances
 include_polygons: true
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -41,6 +41,9 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -66,6 +69,9 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1
diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index c466aa657bb..b42d24775b4 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -6,15 +6,15 @@ data_format: coco_person_keypoints
 unannotated_items_ratio: 0.0
 image_color_channel: RGB
 train_subset:
+  input_size:
+    - 192
+    - 256
   subset_name: train
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args:
-        input_size:
-          - 192
-          - 256
+      init_args: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
@@ -27,15 +27,15 @@ train_subset:
         mean: [123.675, 116.28, 103.53]
         std: [58.395, 57.12, 57.375]
 val_subset:
+  input_size:
+    - 192
+    - 256
   subset_name: val
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args:
-        input_size:
-          - 192
-          - 256
+      init_args: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
@@ -47,15 +47,15 @@ val_subset:
         mean: [123.675, 116.28, 103.53]
         std: [58.395, 57.12, 57.375]
 test_subset:
+  input_size:
+    - 192
+    - 256
   subset_name: test
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args:
-        input_size:
-          - 192
-          - 256
+      init_args: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
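The $(input_size) placeholder above appears to be recipe interpolation that forwards each subset's input_size into the transform's init_args; that reading is an assumption, and patch 25 below revisits this file. A toy sketch of such a substitution (hypothetical resolver, not OTX's actual one):

    # Toy interpolation: expand "$(input_size)" to the subset's configured value.
    def interpolate(init_args, subset_cfg):
        if init_args == "$(input_size)":
            return {"input_size": tuple(subset_cfg["input_size"])}
        return init_args

    assert interpolate("$(input_size)", {"input_size": [192, 256]}) == {"input_size": (192, 256)}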
diff --git a/src/otx/recipe/_base_/data/rotated_detection.yaml b/src/otx/recipe/_base_/data/rotated_detection.yaml
index 8ac4759ffc5..1d41b9b3c82 100644
--- a/src/otx/recipe/_base_/data/rotated_detection.yaml
+++ b/src/otx/recipe/_base_/data/rotated_detection.yaml
@@ -1,7 +1,4 @@
 task: ROTATED_DETECTION
-input_size:
-  - 1024
-  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -10,6 +7,9 @@ data_format: coco_instances
 include_polygons: true
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   to_tv_image: false
@@ -41,6 +41,9 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   to_tv_image: false
@@ -66,6 +69,9 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: false
diff --git a/src/otx/recipe/_base_/data/semantic_segmentation.yaml b/src/otx/recipe/_base_/data/semantic_segmentation.yaml
index 52b3dec6f63..2a9ec2d8779 100644
--- a/src/otx/recipe/_base_/data/semantic_segmentation.yaml
+++ b/src/otx/recipe/_base_/data/semantic_segmentation.yaml
@@ -1,7 +1,4 @@
 task: SEMANTIC_SEGMENTATION
-input_size:
-  - 512
-  - 512
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -10,6 +7,9 @@ include_polygons: true
 unannotated_items_ratio: 0.0
 ignore_index: 255
 train_subset:
+  input_size:
+    - 512
+    - 512
   subset_name: train
   batch_size: 8
   num_workers: 4
@@ -42,6 +42,9 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size:
+    - 512
+    - 512
   subset_name: val
   batch_size: 8
   num_workers: 4
@@ -64,6 +67,9 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size:
+    - 512
+    - 512
   subset_name: test
   num_workers: 4
   batch_size: 8
diff --git a/src/otx/recipe/_base_/data/torchvision_semisl.yaml b/src/otx/recipe/_base_/data/torchvision_semisl.yaml
index 1b5d630a1ec..25ce95252f6 100644
--- a/src/otx/recipe/_base_/data/torchvision_semisl.yaml
+++ b/src/otx/recipe/_base_/data/torchvision_semisl.yaml
@@ -1,5 +1,4 @@
 task: MULTI_CLASS_CLS
-input_size: 224
 mem_cache_size: 1GB
 mem_cache_img_max_size:
   - 500
@@ -9,6 +8,7 @@ stack_images: True
 data_format: imagenet_with_subset_dirs
 unannotated_items_ratio: 0.0
 train_subset:
+  input_size: 224
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 16
@@ -34,6 +34,7 @@ train_subset:
     class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
 
 val_subset:
+  input_size: 224
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -56,6 +57,7 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size: 224
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 64
diff --git a/src/otx/recipe/_base_/data/visual_prompting.yaml b/src/otx/recipe/_base_/data/visual_prompting.yaml
index f51287efdec..5fa9188f64f 100644
--- a/src/otx/recipe/_base_/data/visual_prompting.yaml
+++ b/src/otx/recipe/_base_/data/visual_prompting.yaml
@@ -1,7 +1,4 @@
 task: VISUAL_PROMPTING
-input_size:
-  - 1024
-  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -12,6 +9,9 @@ vpm_config:
   use_bbox: true
   use_point: false
 train_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 2
@@ -39,6 +39,9 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -66,6 +69,9 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
+  input_size:
+    - 1024
+    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1

From c049044f6cbd61122cddb32884c0f9edeabd75fd Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 10:35:46 +0900
Subject: [PATCH 25/42] update keypoint detection

---
 src/otx/algo/keypoint_detection/rtmpose.py    | 41 +++++++++++++++----
 src/otx/core/model/keypoint_detection.py      |  3 +-
 .../_base_/data/keypoint_detection.yaml       |  9 ++--
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py
index c552580b557..8388e6d8a52 100644
--- a/src/otx/algo/keypoint_detection/rtmpose.py
+++ b/src/otx/algo/keypoint_detection/rtmpose.py
@@ -13,10 +13,17 @@
 from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator
 from otx.core.exporter.native import OTXNativeModelExporter
 from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
+from otx.core.metrics import MetricCallable
+from otx.core.metrics.pck import PCKMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
+from otx.core.schedulers import LRSchedulerListCallable
+from otx.core.types.export import TaskLevelExportParameters
+from otx.core.types.label import LabelInfoTypes
 
 if TYPE_CHECKING:
     from otx.core.exporter.base import OTXModelExporter
     from otx.core.types.export import TaskLevelExportParameters
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
 
 
 class RTMPose(OTXKeypointDetectionModel):
@@ -25,13 +32,13 @@ class RTMPose(OTXKeypointDetectionModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            msg = f"Exporter should have a image_size but it is given by {self.image_size}"
+        if self.input_size is None:
+            msg = f"Exporter should have a input_size but it is given by {self.input_size}"
             raise ValueError(msg)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -60,12 +67,30 @@ class RTMPoseTiny(RTMPose):
     """RTMPose Tiny Model."""
 
     load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth"
-    image_size = (1, 3, 192, 256)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        label_info: LabelInfoTypes,
+        input_size: tuple[int, int] = (192, 256),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = PCKMeasureCallable,
+        torch_compile: bool = False,
+    ) -> None:
+        self.mean = (0.0, 0.0, 0.0)
+        self.std = (255.0, 255.0, 255.0)
+        super().__init__(
+            label_info=label_info,
+            input_size=input_size,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
+
     def _build_model(self, num_classes: int) -> RTMPose:
-        input_size = (192, 256)
         simcc_split_ratio = 2.0
         sigma = (4.9, 5.66)
 
@@ -82,13 +107,13 @@ def _build_model(self, num_classes: int) -> RTMPose:
         head = RTMCCHead(
             out_channels=num_classes,
             in_channels=384,
-            input_size=input_size,
-            in_featuremap_size=(input_size[0] // 32, input_size[1] // 32),
+            input_size=self.input_size,
+            in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32),
             simcc_split_ratio=simcc_split_ratio,
             final_layer_kernel_size=7,
             loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True),
             decoder_cfg={
-                "input_size": input_size,
+                "input_size": self.input_size,
                 "simcc_split_ratio": simcc_split_ratio,
                 "sigma": sigma,
                 "normalize": False,
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 69c05ed148e..06f22371deb 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -32,16 +32,17 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = PCKMeasureCallable,
         torch_compile: bool = False,
     ) -> None:
-        self.image_size = (1, 3, 192, 256)
         self.mean = (0.0, 0.0, 0.0)
         self.std = (255.0, 255.0, 255.0)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index b42d24775b4..e982b3ac467 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -14,7 +14,8 @@ train_subset:
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args: $(input_size)
+      init_args:
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
@@ -35,7 +36,8 @@ val_subset:
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args: $(input_size)
+      init_args:
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
@@ -55,7 +57,8 @@ test_subset:
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-      init_args: $(input_size)
+      init_args:
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
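
The exporter above now derives its 4-D export shape from the 2-D `input_size` instead of a
hard-coded `image_size`. A minimal sketch of that composition (plain Python, not the OTX code
itself):

    # Prepend batch and channel dims to the model-level 2-D input_size,
    # mirroring the `(1, 3, *self.input_size)` expression in the patch.
    def compose_export_shape(input_size: tuple[int, int]) -> tuple[int, ...]:
        return (1, 3, *input_size)

    # With the RTMPose default of (192, 256) this reproduces the old
    # hard-coded image_size of (1, 3, 192, 256).
    assert compose_export_shape((192, 256)) == (1, 3, 192, 256)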

From 9097b3d0a7ca98877bb323a45cd9d414e2e23ba2 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 10:44:33 +0900
Subject: [PATCH 26/42] align with pre-commit

---
 src/otx/algo/keypoint_detection/rtmpose.py | 11 +++++------
 src/otx/core/model/keypoint_detection.py   |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py
index 8388e6d8a52..2e932806a27 100644
--- a/src/otx/algo/keypoint_detection/rtmpose.py
+++ b/src/otx/algo/keypoint_detection/rtmpose.py
@@ -12,18 +12,17 @@
 from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss
 from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
-from otx.core.metrics import MetricCallable
 from otx.core.metrics.pck import PCKMeasureCallable
 from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
-from otx.core.schedulers import LRSchedulerListCallable
-from otx.core.types.export import TaskLevelExportParameters
-from otx.core.types.label import LabelInfoTypes
+from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
 
 if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
     from otx.core.exporter.base import OTXModelExporter
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.export import TaskLevelExportParameters
-    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMPose(OTXKeypointDetectionModel):
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 06f22371deb..02cbb652333 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -48,6 +48,7 @@ def __init__(
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int]
 
     @abstractmethod
     def _build_model(self, num_classes: int) -> nn.Module:

From 0adb7ea54bb57b761f75b423167b45772d08d416 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 11:59:37 +0900
Subject: [PATCH 27/42] update docstring

---
 src/otx/cli/cli.py               |  7 ++++---
 src/otx/core/data/module.py      | 19 ++++++++++++++++---
 src/otx/core/data/utils/utils.py |  4 +++-
 src/otx/core/model/base.py       |  5 ++++-
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index d20e30a700e..31ced921c18 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -23,7 +23,6 @@
 from otx.cli.utils.workspace import Workspace
 from otx.core.types.task import OTXTaskType
 from otx.core.utils.imports import get_otx_root_path
-from otx.utils.utils import get_model_cls_from_config
 
 if TYPE_CHECKING:
     from jsonargparse._actions import _ActionSubCommands
@@ -333,7 +332,9 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
             model_config = self.config[self.subcommand].pop("model")
 
             # if adaptive_input_size is set and the model has an input_size_multiplier, pass it to OTXDataModule
-            if self.config[self.subcommand].data.adaptive_input_size != "none":
+            if self.config[self.subcommand].data.get("adaptive_input_size") is not None:
+                from otx.utils.utils import get_model_cls_from_config
+
                 model_cls = get_model_cls_from_config(model_config)
                 self.config[self.subcommand].data.input_size_multiplier = model_cls.input_size_multiplier
 
@@ -382,7 +383,7 @@ def instantiate_model(self, model_config: Namespace) -> OTXModel:
             tuple: The model and optimizer and scheduler.
         """
         from otx.core.model.base import OTXModel
-        from otx.utils.utils import can_pass_tile_config, should_pass_label_info
+        from otx.utils.utils import can_pass_tile_config, get_model_cls_from_config, should_pass_label_info
 
         skip = set()
 
diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index f60ef4cae24..cd1209a7d64 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -39,7 +39,20 @@
 
 
 class OTXDataModule(LightningDataModule):
-    """LightningDataModule extension for OTX pipeline."""
+    """LightningDataModule extension for OTX pipeline.
+
+    Args:
+        input_size (int | tuple[int, int] | None, optional):
+            Final image or video shape of data after data transformation. It will be applied to all subset configs
+            if it is not None. Defaults to None.
+        adaptive_input_size (Literal["auto", "downscale"] | None, optional):
+            The adaptive input size mode. If set, an appropriate input size is found by analyzing the dataset.
+            "auto" can pick an input size either bigger or smaller than the current one, whereas "downscale" only
+            uses a size smaller than the default setting. Defaults to None.
+        input_size_multiplier (int, optional):
+            If set, adaptive_input_size will find a multiple of input_size_multiplier. It is useful when
+            a model requires its input_size to be a multiple of a specific value. Defaults to 1.
+    """
 
     def __init__(  # noqa: PLR0913
         self,
@@ -62,7 +75,7 @@ def __init__(  # noqa: PLR0913
         auto_num_workers: bool = False,
         device: DeviceType = DeviceType.auto,
         input_size: int | tuple[int, int] | None = None,
-        adaptive_input_size: Literal["auto", "downscale", "none"] = "none",
+        adaptive_input_size: Literal["auto", "downscale"] | None = None,
         input_size_multiplier: int = 1,
     ) -> None:
         """Constructor."""
@@ -122,7 +135,7 @@ def __init__(  # noqa: PLR0913
                 subset=self.unlabeled_subset.subset_name,
             )
 
-        if adaptive_input_size != "none":
+        if adaptive_input_size is not None:
             input_size = adapt_input_size_to_dataset(
                 dataset,
                 input_size,
diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index 9e2ee03837d..d4651ab5a9e 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -231,8 +231,10 @@ def area(x: list[int] | tuple[int, int]) -> int:
             logger.info(f"-> Downscale only: {image_size} -> {base_input_size}")
             return base_input_size
 
+    image_size = tuple(int(val) for val in image_size)  # type: ignore[assignment]
+
     logger.info(f"-> Adapted input size: {image_size}")
-    return tuple(image_size)  # type: ignore[return-value]
+    return image_size  # type: ignore[return-value]
 
 
 def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None:
diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py
index ac945d26b71..bd42c668a52 100644
--- a/src/otx/core/model/base.py
+++ b/src/otx/core/model/base.py
@@ -96,10 +96,13 @@ class OTXModel(LightningModule, Generic[T_OTXBatchDataEntity, T_OTXBatchPredEnti
 
     Attributes:
         explain_mode: If true, `self.predict_step()` will produce a XAI output as well
+        input_size_multiplier (int):
+            Multiplier for the input size a model requires. If input_size is not a multiple of this value,
+            an error is raised.
     """
 
     _OPTIMIZED_MODEL_BASE_NAME: str = "optimized_model"
-    input_size_multiplier = 1
+    input_size_multiplier: int = 1
 
     def __init__(
         self,
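
The signature change from a `"none"` string sentinel to a real `None` turns the guard into an
identity check. A small sketch of the new semantics, assuming nothing beyond the `Literal` type
in the diff:

    from __future__ import annotations

    from typing import Literal

    # After this patch, "unset" is modeled as None rather than the string "none".
    def should_adapt(adaptive_input_size: Literal["auto", "downscale"] | None) -> bool:
        return adaptive_input_size is not None

    assert should_adapt("auto") and should_adapt("downscale")
    assert not should_adapt(None)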

From 0896ee300fc0f0e3e2513b6d088569f9f04818eb Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 13:13:31 +0900
Subject: [PATCH 28/42] update unit test

---
 tests/unit/cli/test_cli.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py
index 8fa1581f18a..3b2501066ce 100644
--- a/tests/unit/cli/test_cli.py
+++ b/tests/unit/cli/test_cli.py
@@ -9,7 +9,6 @@
 import torch
 import yaml
 from otx.cli import OTXCLI, main
-from otx.cli import cli as target_file
 from rich.console import Console
 
 
@@ -145,7 +144,7 @@ def test_instantiate_classes_set_adaptive_input_size(
         mock_model_cls,
     ) -> None:
         mocker.patch("otx.cli.OTXCLI.run")
-        mocker.patch.object(target_file, "get_model_cls_from_config", return_value=mock_model_cls)
+        mocker.patch("otx.utils.utils.get_model_cls_from_config", return_value=mock_model_cls)
         fxt_train_argv.extend(["--data.adaptive_input_size", "auto"])
         monkeypatch.setattr("sys.argv", fxt_train_argv)
         mock_data_module = mocker.patch("otx.core.data.module.adapt_input_size_to_dataset", return_value=1024)
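
The test now patches `get_model_cls_from_config` at its defining module, because `cli.py` imports
it locally at call time and no module-level attribute is left on `otx.cli.cli` to patch. A
runnable illustration of the general pattern, using `unittest.mock` and a stdlib function:

    from unittest import mock

    def current_dir() -> str:
        import os  # local import: the name is resolved when the function runs
        return os.getcwd()

    # Patching the defining module ("os.getcwd") is enough, even though the
    # import statement inside current_dir() has not executed yet.
    with mock.patch("os.getcwd", return_value="/patched"):
        assert current_dir() == "/patched"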

From 05483ced884675279a047d8cab12d08860c71ad7 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 13:55:14 +0900
Subject: [PATCH 29/42] update auto_configurator to use None instead of "none"

---
 src/otx/engine/utils/auto_configurator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index 5574f1fc3d9..d95826e5c1c 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -228,7 +228,7 @@ def get_datamodule(self) -> OTXDataModule | None:
         _ = data_config.pop("__path__", {})  # Remove __path__ key that for CLI
         _ = data_config.pop("config", {})  # Remove config key that for CLI
 
-        if data_config.get("adaptive_input_size", "none") != "none":
+        if data_config.get("adaptive_input_size") is not None:
             model_cls = get_model_cls_from_config(Namespace(self.config["model"]))
             data_config["input_size_multiplier"] = model_cls.input_size_multiplier
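
A short sketch of why `data_config.get("adaptive_input_size") is not None` is the right
replacement for the old string comparison: it treats a missing key and an explicit `None` the
same way.

    # Mirrors the guard added above; data_config is a plain dict here.
    def needs_multiplier(data_config: dict) -> bool:
        return data_config.get("adaptive_input_size") is not None

    assert needs_multiplier({"adaptive_input_size": "auto"})
    assert not needs_multiplier({})                             # key absent
    assert not needs_multiplier({"adaptive_input_size": None})  # explicitly unset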
 

From 8d34d2c3c0cf5891865238881f92686df1075b8e Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 15:08:19 +0900
Subject: [PATCH 30/42] revert data module policy to apply input_size to subset
 cfg

---
 src/otx/core/data/module.py                   |  3 ++-
 src/otx/recipe/_base_/data/anomaly.yaml       |  4 +---
 .../recipe/_base_/data/classification.yaml    |  4 +---
 src/otx/recipe/_base_/data/detection.yaml     | 12 +++--------
 .../_base_/data/instance_segmentation.yaml    | 12 +++--------
 .../_base_/data/keypoint_detection.yaml       | 21 ++++++++-----------
 .../recipe/_base_/data/rotated_detection.yaml | 12 +++--------
 .../_base_/data/semantic_segmentation.yaml    | 12 +++--------
 .../_base_/data/torchvision_semisl.yaml       |  4 +---
 .../recipe/_base_/data/visual_prompting.yaml  | 12 +++--------
 src/otx/recipe/detection/yolox_tiny.yaml      | 18 ++++++----------
 tests/unit/core/data/test_module.py           | 12 +++++------
 12 files changed, 41 insertions(+), 85 deletions(-)

diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py
index cd1209a7d64..06f62f1c614 100644
--- a/src/otx/core/data/module.py
+++ b/src/otx/core/data/module.py
@@ -144,7 +144,8 @@ def __init__(  # noqa: PLR0913
             )
         if input_size is not None:
             for subset_cfg in [train_subset, val_subset, test_subset, unlabeled_subset]:
-                subset_cfg.input_size = input_size
+                if subset_cfg.input_size is None:
+                    subset_cfg.input_size = input_size
         self.input_size = input_size
 
         if self.tile_config.enable_tiler and self.tile_config.enable_adaptive_tiling:
diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml
index 29d4471d9a6..2f74b987915 100644
--- a/src/otx/recipe/_base_/data/anomaly.yaml
+++ b/src/otx/recipe/_base_/data/anomaly.yaml
@@ -1,4 +1,5 @@
 task: ANOMALY_CLASSIFICATION
+input_size: 256
 data_format: mvtec
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
@@ -6,7 +7,6 @@ image_color_channel: RGB
 stack_images: false
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size: 256
   subset_name: train
   transform_lib_type: TORCHVISION
   to_tv_image: true
@@ -30,7 +30,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size: 256
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: true
@@ -54,7 +53,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size: 256
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: true
diff --git a/src/otx/recipe/_base_/data/classification.yaml b/src/otx/recipe/_base_/data/classification.yaml
index 04b675d8774..e8ee41bf15e 100644
--- a/src/otx/recipe/_base_/data/classification.yaml
+++ b/src/otx/recipe/_base_/data/classification.yaml
@@ -1,4 +1,5 @@
 task: MULTI_CLASS_CLS
+input_size: 224
 mem_cache_size: 1GB
 mem_cache_img_max_size:
   - 500
@@ -8,7 +9,6 @@ stack_images: true
 data_format: imagenet_with_subset_dirs
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size: 224
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -34,7 +34,6 @@ train_subset:
     class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
 
 val_subset:
-  input_size: 224
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -57,7 +56,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size: 224
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 64
diff --git a/src/otx/recipe/_base_/data/detection.yaml b/src/otx/recipe/_base_/data/detection.yaml
index fa88d02b3fd..c08a5fea022 100644
--- a/src/otx/recipe/_base_/data/detection.yaml
+++ b/src/otx/recipe/_base_/data/detection.yaml
@@ -1,4 +1,7 @@
 task: DETECTION
+input_size:
+  - 800
+  - 992
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -6,9 +9,6 @@ stack_images: true
 data_format: coco_instances
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size:
-    - 800
-    - 992
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -35,9 +35,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size:
-    - 800
-    - 992
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -59,9 +56,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size:
-    - 800
-    - 992
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1
diff --git a/src/otx/recipe/_base_/data/instance_segmentation.yaml b/src/otx/recipe/_base_/data/instance_segmentation.yaml
index 299bf488d4f..3520f3930a7 100644
--- a/src/otx/recipe/_base_/data/instance_segmentation.yaml
+++ b/src/otx/recipe/_base_/data/instance_segmentation.yaml
@@ -1,4 +1,7 @@
 task: INSTANCE_SEGMENTATION
+input_size:
+  - 1024
+  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -7,9 +10,6 @@ data_format: coco_instances
 include_polygons: true
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -41,9 +41,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -69,9 +66,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1
diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index e982b3ac467..c466aa657bb 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -6,16 +6,15 @@ data_format: coco_person_keypoints
 unannotated_items_ratio: 0.0
 image_color_channel: RGB
 train_subset:
-  input_size:
-    - 192
-    - 256
   subset_name: train
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size: $(input_size)
+        input_size:
+          - 192
+          - 256
     - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
@@ -28,16 +27,15 @@ train_subset:
         mean: [123.675, 116.28, 103.53]
         std: [58.395, 57.12, 57.375]
 val_subset:
-  input_size:
-    - 192
-    - 256
   subset_name: val
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size: $(input_size)
+        input_size:
+          - 192
+          - 256
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
@@ -49,16 +47,15 @@ val_subset:
         mean: [123.675, 116.28, 103.53]
         std: [58.395, 57.12, 57.375]
 test_subset:
-  input_size:
-    - 192
-    - 256
   subset_name: test
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size: $(input_size)
+        input_size:
+          - 192
+          - 256
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
         is_numpy_to_tvtensor: true
diff --git a/src/otx/recipe/_base_/data/rotated_detection.yaml b/src/otx/recipe/_base_/data/rotated_detection.yaml
index 1d41b9b3c82..8ac4759ffc5 100644
--- a/src/otx/recipe/_base_/data/rotated_detection.yaml
+++ b/src/otx/recipe/_base_/data/rotated_detection.yaml
@@ -1,4 +1,7 @@
 task: ROTATED_DETECTION
+input_size:
+  - 1024
+  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -7,9 +10,6 @@ data_format: coco_instances
 include_polygons: true
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   to_tv_image: false
@@ -41,9 +41,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   to_tv_image: false
@@ -69,9 +66,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   to_tv_image: false
diff --git a/src/otx/recipe/_base_/data/semantic_segmentation.yaml b/src/otx/recipe/_base_/data/semantic_segmentation.yaml
index 2a9ec2d8779..52b3dec6f63 100644
--- a/src/otx/recipe/_base_/data/semantic_segmentation.yaml
+++ b/src/otx/recipe/_base_/data/semantic_segmentation.yaml
@@ -1,4 +1,7 @@
 task: SEMANTIC_SEGMENTATION
+input_size:
+  - 512
+  - 512
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -7,9 +10,6 @@ include_polygons: true
 unannotated_items_ratio: 0.0
 ignore_index: 255
 train_subset:
-  input_size:
-    - 512
-    - 512
   subset_name: train
   batch_size: 8
   num_workers: 4
@@ -42,9 +42,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size:
-    - 512
-    - 512
   subset_name: val
   batch_size: 8
   num_workers: 4
@@ -67,9 +64,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size:
-    - 512
-    - 512
   subset_name: test
   num_workers: 4
   batch_size: 8
diff --git a/src/otx/recipe/_base_/data/torchvision_semisl.yaml b/src/otx/recipe/_base_/data/torchvision_semisl.yaml
index 25ce95252f6..1b5d630a1ec 100644
--- a/src/otx/recipe/_base_/data/torchvision_semisl.yaml
+++ b/src/otx/recipe/_base_/data/torchvision_semisl.yaml
@@ -1,4 +1,5 @@
 task: MULTI_CLASS_CLS
+input_size: 224
 mem_cache_size: 1GB
 mem_cache_img_max_size:
   - 500
@@ -8,7 +9,6 @@ stack_images: True
 data_format: imagenet_with_subset_dirs
 unannotated_items_ratio: 0.0
 train_subset:
-  input_size: 224
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 16
@@ -34,7 +34,6 @@ train_subset:
     class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
 
 val_subset:
-  input_size: 224
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 64
@@ -57,7 +56,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size: 224
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 64
diff --git a/src/otx/recipe/_base_/data/visual_prompting.yaml b/src/otx/recipe/_base_/data/visual_prompting.yaml
index 5fa9188f64f..f51287efdec 100644
--- a/src/otx/recipe/_base_/data/visual_prompting.yaml
+++ b/src/otx/recipe/_base_/data/visual_prompting.yaml
@@ -1,4 +1,7 @@
 task: VISUAL_PROMPTING
+input_size:
+  - 1024
+  - 1024
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
@@ -9,9 +12,6 @@ vpm_config:
   use_bbox: true
   use_point: false
 train_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 2
@@ -39,9 +39,6 @@ train_subset:
     class_path: torch.utils.data.RandomSampler
 
 val_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 1
@@ -69,9 +66,6 @@ val_subset:
     class_path: torch.utils.data.RandomSampler
 
 test_subset:
-  input_size:
-    - 1024
-    - 1024
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 1
diff --git a/src/otx/recipe/detection/yolox_tiny.yaml b/src/otx/recipe/detection/yolox_tiny.yaml
index bdeee86606c..744dc3e72a7 100644
--- a/src/otx/recipe/detection/yolox_tiny.yaml
+++ b/src/otx/recipe/detection/yolox_tiny.yaml
@@ -37,24 +37,24 @@ overrides:
 
   gradient_clip_val: 35.0
   data:
+    input_size:
+      - 416
+      - 416
     train_subset:
-      input_size:
-        - 640
-        - 640
       batch_size: 8
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.CachedMosaic
           init_args:
             random_pop: false
             max_cached_images: 20
-            img_scale: $(input_size) # (H, W)
+            img_scale: $(input_size) * 1.538 # 640x640
         - class_path: otx.core.data.transform_libs.torchvision.RandomAffine
           init_args:
-            border: $(input_size) * -0.5
+            border: $(input_size) * 1.538 * -0.5 # 640x640 * -0.5
         - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
         - class_path: otx.core.data.transform_libs.torchvision.Resize
           init_args:
-            scale: $(input_size)
+            scale: $(input_size) * 1.538 # 640x640
             keep_ratio: true
             transform_bbox: true
         - class_path: otx.core.data.transform_libs.torchvision.RandomFlip
@@ -76,9 +76,6 @@ overrides:
         class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
 
     val_subset:
-      input_size:
-        - 416
-        - 416
       batch_size: 8
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
@@ -99,9 +96,6 @@ overrides:
             std: [58.395, 57.12, 57.375]
 
     test_subset:
-      input_size:
-        - 416
-        - 416
       batch_size: 8
       transforms:
         - class_path: otx.core.data.transform_libs.torchvision.Resize
diff --git a/tests/unit/core/data/test_module.py b/tests/unit/core/data/test_module.py
index 6cb04105b0d..e5365406ddc 100644
--- a/tests/unit/core/data/test_module.py
+++ b/tests/unit/core/data/test_module.py
@@ -144,7 +144,7 @@ def test_init_input_size(
         # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets
         mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)}
         mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets
-        fxt_config.train_subset.input_size = (1000, 1000)
+        fxt_config.train_subset.input_size = None
         fxt_config.val_subset.input_size = None
         fxt_config.test_subset.input_size = (800, 800)
 
@@ -160,7 +160,7 @@ def test_init_input_size(
 
         assert fxt_config.train_subset.input_size == (1200, 1200)
         assert fxt_config.val_subset.input_size == (1200, 1200)
-        assert fxt_config.test_subset.input_size == (1200, 1200)
+        assert fxt_config.test_subset.input_size == (800, 800)
 
     @pytest.fixture()
     def mock_adapt_input_size_to_dataset(self, mocker) -> MagicMock:
@@ -177,9 +177,9 @@ def test_init_adaptive_input_size(
         # Dataset will have "train_0", "train_1", "val_0", ..., "test_1" subsets
         mock_dm_subsets = {f"{name}_{idx}": MagicMock() for name in ["train", "val", "test"] for idx in range(2)}
         mock_dm_dataset.return_value.subsets.return_value = mock_dm_subsets
-        fxt_config.train_subset.input_size = (1000, 1000)
-        fxt_config.val_subset.input_size = None
-        fxt_config.test_subset.input_size = (800, 800)
+        fxt_config.train_subset.input_size = None
+        fxt_config.val_subset.input_size = (1000, 1000)
+        fxt_config.test_subset.input_size = None
 
         OTXDataModule(
             task=OTXTaskType.MULTI_CLASS_CLS,
@@ -192,7 +192,7 @@ def test_init_adaptive_input_size(
         )
 
         assert fxt_config.train_subset.input_size == (1234, 1234)
-        assert fxt_config.val_subset.input_size == (1234, 1234)
+        assert fxt_config.val_subset.input_size == (1000, 1000)
         assert fxt_config.test_subset.input_size == (1234, 1234)
 
     @pytest.fixture()
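
The updated tests pin down the precedence rule: a datamodule-level (or adapted) input_size only
fills subset configs that left `input_size` as `None`, while explicit subset values win. A
self-contained sketch of that rule:

    from __future__ import annotations

    from dataclasses import dataclass

    @dataclass
    class SubsetCfg:
        input_size: tuple[int, int] | None = None

    # Mirrors the loop in OTXDataModule.__init__ after this patch.
    def apply_input_size(subsets: list[SubsetCfg], input_size: tuple[int, int]) -> None:
        for subset_cfg in subsets:
            if subset_cfg.input_size is None:
                subset_cfg.input_size = input_size

    train, test = SubsetCfg(None), SubsetCfg((800, 800))
    apply_input_size([train, test], (1200, 1200))
    assert train.input_size == (1200, 1200)  # filled from the datamodule level
    assert test.input_size == (800, 800)     # explicit subset value is kept

The yolox_tiny recipe leans on the same top-level value: with `input_size` set to (416, 416),
the `$(input_size) * 1.538` expressions recover the original 640x640 mosaic scale, since
416 x 1.538 is roughly 640.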

From 982c9854114292dce2e3039ed27fba545facb80f Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 17:26:46 +0900
Subject: [PATCH 31/42] revert keypoint detection

---
 src/otx/algo/keypoint_detection/rtmpose.py    | 40 ++++---------------
 .../core/data/transform_libs/torchvision.py   | 10 ++---
 src/otx/core/model/keypoint_detection.py      |  4 +-
 .../_base_/data/keypoint_detection.yaml       | 18 ++++-----
 4 files changed, 22 insertions(+), 50 deletions(-)

diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py
index 8f06378adf1..0086acd80dd 100644
--- a/src/otx/algo/keypoint_detection/rtmpose.py
+++ b/src/otx/algo/keypoint_detection/rtmpose.py
@@ -13,18 +13,12 @@
 from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss
 from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator
 from otx.core.exporter.native import OTXNativeModelExporter
-from otx.core.metrics.pck import PCKMeasureCallable
-from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
 from torch import nn
 
 if TYPE_CHECKING:
-    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
     from otx.core.exporter.base import OTXModelExporter
-    from otx.core.metrics import MetricCallable
-    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.export import TaskLevelExportParameters
-    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMPose(OTXKeypointDetectionModel):
@@ -33,13 +27,13 @@ class RTMPose(OTXKeypointDetectionModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.input_size is None:
-            msg = f"Exporter should have a input_size but it is given by {self.input_size}"
+        if self.image_size is None:
+            msg = f"Exporter should have a image_size but it is given by {self.image_size}"
             raise ValueError(msg)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, *self.input_size),
+            input_size=self.image_size,
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -68,30 +62,12 @@ class RTMPoseTiny(RTMPose):
     """RTMPose Tiny Model."""
 
     load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth"
+    image_size = (1, 3, 192, 256)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
-    def __init__(
-        self,
-        label_info: LabelInfoTypes,
-        input_size: tuple[int, int] = (192, 256),
-        optimizer: OptimizerCallable = DefaultOptimizerCallable,
-        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
-        metric: MetricCallable = PCKMeasureCallable,
-        torch_compile: bool = False,
-    ) -> None:
-        self.mean = (0.0, 0.0, 0.0)
-        self.std = (255.0, 255.0, 255.0)
-        super().__init__(
-            label_info=label_info,
-            input_size=input_size,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            metric=metric,
-            torch_compile=torch_compile,
-        )
-
     def _build_model(self, num_classes: int) -> RTMPose:
+        input_size = (192, 256)
         simcc_split_ratio = 2.0
         sigma = (4.9, 5.66)
 
@@ -108,13 +84,13 @@ def _build_model(self, num_classes: int) -> RTMPose:
         head = RTMCCHead(
             out_channels=num_classes,
             in_channels=384,
-            input_size=self.input_size,
-            in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32),
+            input_size=input_size,
+            in_featuremap_size=(input_size[0] // 32, input_size[1] // 32),
             simcc_split_ratio=simcc_split_ratio,
             final_layer_kernel_size=7,
             loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True),
             decoder_cfg={
-                "input_size": self.input_size,
+                "input_size": input_size,
                 "simcc_split_ratio": simcc_split_ratio,
                 "sigma": sigma,
                 "normalize": False,
diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py
index 8d2ecb259a7..e29ee2989df 100644
--- a/src/otx/core/data/transform_libs/torchvision.py
+++ b/src/otx/core/data/transform_libs/torchvision.py
@@ -3508,22 +3508,20 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin):
             the specific codec for more details.
 
     Args:
-        encoder (dict | list[dict]): The codec config for keypoint encoding.
-            Both single encoder and multiple encoders (given as a list) are
-            supported
-        target_type (str, deprecated): This argument is deprecated and has no
-            effect. Defaults to ``None``
+        input_size (tuple[int, int]): Input image size in [w, h]
+        is_numpy_to_tvtensor (bool): Whether convert outputs to tensor. Defaults to False.
     """
 
     def __init__(
         self,
+        input_size: tuple[int, int],
         is_numpy_to_tvtensor: bool = False,
     ) -> None:
         super().__init__()
         from otx.algo.keypoint_detection.utils.simcc_label import SimCCLabel
 
         self.encoder = SimCCLabel(
-            input_size=(192, 256),
+            input_size=input_size,
             sigma=(4.9, 5.66),
             simcc_split_ratio=2.0,
             normalize=False,
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 02cbb652333..69c05ed148e 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -32,23 +32,21 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet
     def __init__(
         self,
         label_info: LabelInfoTypes,
-        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = PCKMeasureCallable,
         torch_compile: bool = False,
     ) -> None:
+        self.image_size = (1, 3, 192, 256)
         self.mean = (0.0, 0.0, 0.0)
         self.std = (255.0, 255.0, 255.0)
         super().__init__(
             label_info=label_info,
-            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
         )
-        self.input_size: tuple[int, int]
 
     @abstractmethod
     def _build_model(self, num_classes: int) -> nn.Module:
diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index c466aa657bb..b3ffed5b915 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -5,6 +5,9 @@ stack_images: true
 data_format: coco_person_keypoints
 unannotated_items_ratio: 0.0
 image_color_channel: RGB
+input_size:
+  - 192
+  - 256
 train_subset:
   subset_name: train
   batch_size: 32
@@ -12,12 +15,11 @@ train_subset:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size:
-          - 192
-          - 256
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
+        input_size: $(input_size)
         is_numpy_to_tvtensor: true
     - class_path: torchvision.transforms.v2.ToDtype
       init_args:
@@ -33,11 +35,10 @@ val_subset:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size:
-          - 192
-          - 256
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
+        input_size: $(input_size)
         is_numpy_to_tvtensor: true
     - class_path: torchvision.transforms.v2.ToDtype
       init_args:
@@ -53,11 +54,10 @@ test_subset:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
     - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
       init_args:
-        input_size:
-          - 192
-          - 256
+        input_size: $(input_size)
     - class_path: otx.core.data.transform_libs.torchvision.GenerateTarget
       init_args:
+        input_size: $(input_size)
         is_numpy_to_tvtensor: true
     - class_path: torchvision.transforms.v2.ToDtype
       init_args:
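
The recipe now keeps a single top-level `input_size` and references it from transforms via
`$(input_size)`. A hedged sketch of how such a placeholder could be resolved; this is an
assumption about the mechanism, not the actual OTX config parser:

    # Hypothetical resolver: substitute the "$(input_size)" token with the
    # top-level value before instantiating a transform's init_args.
    def resolve_placeholders(init_args: dict, input_size: tuple[int, int]) -> dict:
        return {k: (input_size if v == "$(input_size)" else v) for k, v in init_args.items()}

    resolved = resolve_placeholders({"input_size": "$(input_size)"}, (192, 256))
    assert resolved == {"input_size": (192, 256)}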

From f8f9e2867c1ce39677bdb910955063b4f1cb1684 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 18:09:51 +0900
Subject: [PATCH 32/42] add comments to explain the reason for the priority in
 compute_robust_dataset_statistics

---
 src/otx/core/data/utils/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py
index d4651ab5a9e..fc0c84dbf58 100644
--- a/src/otx/core/data/utils/utils.py
+++ b/src/otx/core/data/utils/utils.py
@@ -133,6 +133,9 @@ def compute_robust_dataset_statistics(dataset: DatasetSubset, max_samples: int =
             )
 
     stat["annotation"]["num_per_image"] = compute_robust_statistics(np.array(num_per_images))
+    # Polygons are preferred over other shapes based on the assumption that they are more accurate.
+    # In particular, a polygon can be used when both polygons and bboxes exist, as in the instance segmentation task.
+    # This algorithm needs to be refined to consider tasks other than instance segmentation as well.
     if "Polygon" in size_of_shapes:
         stat["annotation"]["size_of_shape"] = compute_robust_scale_statistics(np.array(size_of_shapes["Polygon"]))
     else:
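
A compact sketch of the selection rule the new comment describes: prefer Polygon statistics when
present, otherwise fall back to another shape. The fallback shown here is an assumption, since
the hunk is truncated before the `else` branch:

    import numpy as np

    def pick_shape_sizes(size_of_shapes: dict[str, list]) -> np.ndarray:
        if "Polygon" in size_of_shapes:  # assumed more accurate than bbox sizes
            return np.array(size_of_shapes["Polygon"])
        first_key = next(iter(size_of_shapes))  # hypothetical fallback
        return np.array(size_of_shapes[first_key])

    assert pick_shape_sizes({"Bbox": [32, 64], "Polygon": [30, 60]}).tolist() == [30, 60]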

From 09772d817f5afbecd45fcf1d5ade46e548ae3d49 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 12 Aug 2024 18:18:36 +0900
Subject: [PATCH 33/42] add integration test

---
 tests/integration/cli/test_cli.py | 64 +++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py
index 73908782e98..3c126e9d97b 100644
--- a/tests/integration/cli/test_cli.py
+++ b/tests/integration/cli/test_cli.py
@@ -7,6 +7,7 @@
 
 import cv2
 import pytest
+import torch
 import yaml
 from otx.core.types.task import OTXTaskType
 from otx.engine.utils.auto_configurator import DEFAULT_CONFIG_PER_TASK
@@ -555,3 +556,66 @@ def test_otx_adaptive_bs_e2e(
     ]
 
     run_main(command_cfg=command_cfg, open_subprocess=fxt_open_subprocess)
+
+
+@pytest.mark.parametrize("task", pytest.TASK_LIST)
+def test_otx_configurable_input_size_e2e(
+    task: OTXTaskType,
+    tmp_path: Path,
+    fxt_accelerator: str,
+    fxt_target_dataset_per_task: dict,
+    fxt_cli_override_command_per_task: dict,
+    fxt_open_subprocess: bool,
+) -> None:
+    """
+    Test configurable input size e2e commands with the default template of each task.
+
+    Args:
+        task (OTXTaskType): The task to run with a configurable input size.
+        tmp_path (Path): The temporary path for storing the training outputs.
+
+    Returns:
+        None
+    """
+    if task not in DEFAULT_CONFIG_PER_TASK:
+        pytest.skip(f"Task {task} is not supported in the auto-configuration.")
+    if task in [
+        OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING,
+        OTXTaskType.ANOMALY_CLASSIFICATION,
+        OTXTaskType.ANOMALY_DETECTION,
+        OTXTaskType.ANOMALY_SEGMENTATION,
+        OTXTaskType.KEYPOINT_DETECTION,
+    ]:
+        pytest.skip(f"{task} doesn't support configurable input size.")
+
+    task = task.lower()
+    tmp_path_cfg_ipt_size = tmp_path / f"otx_configurable_input_size_{task}"
+    tmp_path_cfg_ipt_size.mkdir(parents=True)
+
+    command_cfg = [
+        "otx",
+        "train",
+        "--task",
+        task.upper(),
+        "--data_root",
+        fxt_target_dataset_per_task[task],
+        "--work_dir",
+        str(tmp_path_cfg_ipt_size),
+        "--engine.device",
+        fxt_accelerator,
+        "--data.input_size",
+        str(448),
+        "--max_epoch",
+        "1",
+        *fxt_cli_override_command_per_task[task],
+    ]
+
+    run_main(command_cfg=command_cfg, open_subprocess=fxt_open_subprocess)
+
+    best_ckpt_files = list(tmp_path_cfg_ipt_size.rglob("best_checkpoint.ckpt"))
+    assert len(best_ckpt_files) != 0
+    best_ckpt = torch.load(best_ckpt_files[0])
+    assert best_ckpt["hyper_parameters"]["input_size"] == (448, 448)
+    for param_name in best_ckpt["datamodule_hyper_parameters"]:
+        if "subset" in param_name:
+            assert best_ckpt["datamodule_hyper_parameters"][param_name].input_size == 448

From 5b3f198b40b4c4d22b692a7432ff05006d599e60 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 09:20:01 +0900
Subject: [PATCH 34/42] update unit test

---
 tests/unit/core/data/transform_libs/test_torchvision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py
index 66837a2e609..202a91afb15 100644
--- a/tests/unit/core/data/transform_libs/test_torchvision.py
+++ b/tests/unit/core/data/transform_libs/test_torchvision.py
@@ -946,7 +946,7 @@ def keypoint_det_entity(self) -> KeypointDetDataEntity:
         )
 
     def test_forward(self, keypoint_det_entity) -> None:
-        transform = GenerateTarget()
+        transform = GenerateTarget(input_size=(192, 256))
         results = transform(deepcopy(keypoint_det_entity))
 
         assert hasattr(results, "keypoint_x_labels")
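
With `input_size` now required, constructing the transform follows the updated test. A usage
sketch; building a full `KeypointDetDataEntity` is elided since it needs image and keypoint
tensors:

    from otx.core.data.transform_libs.torchvision import GenerateTarget

    # input_size is forwarded to the SimCC label encoder instead of the old
    # hard-coded (192, 256).
    transform = GenerateTarget(input_size=(192, 256), is_numpy_to_tvtensor=True)
    # results = transform(keypoint_det_entity)  # entity construction elided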

From 5fe677768cbd2eec91ce7068992877d0cf9047e1 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 11:02:48 +0900
Subject: [PATCH 35/42] apply input_size to anomaly task

---
 src/otx/algo/anomaly/padim.py |  6 +++++-
 src/otx/algo/anomaly/stfpm.py |  6 +++++-
 src/otx/core/model/anomaly.py | 29 +++++++++++++++--------------
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py
index ab9a6ddb1a3..f667efa897d 100644
--- a/src/otx/algo/anomaly/padim.py
+++ b/src/otx/algo/anomaly/padim.py
@@ -34,6 +34,8 @@ class Padim(OTXAnomaly, AnomalibPadim):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (256, 256).
     """
 
     def __init__(
@@ -47,8 +49,9 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: tuple[int, int] = (256, 256),
     ) -> None:
-        OTXAnomaly.__init__(self)
+        OTXAnomaly.__init__(self, input_size)
         AnomalibPadim.__init__(
             self,
             backbone=backbone,
@@ -57,6 +60,7 @@ def __init__(
             n_features=n_features,
         )
         self.task = task
+        self.input_size = input_size
 
     def configure_optimizers(self) -> tuple[list[Optimizer], list[Optimizer]] | None:
         """PADIM doesn't require optimization, therefore returns no optimizers."""
diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py
index c9ddb4cd93c..614d3ad52f9 100644
--- a/src/otx/algo/anomaly/stfpm.py
+++ b/src/otx/algo/anomaly/stfpm.py
@@ -32,6 +32,8 @@ class Stfpm(OTXAnomaly, AnomalibStfpm):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (tuple[int, int], optional):
+            Model input size in the order of height and width. Defaults to (256, 256).
     """
 
     def __init__(
@@ -43,15 +45,17 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: tuple[int, int] = (256, 256),
         **kwargs,
     ) -> None:
-        OTXAnomaly.__init__(self)
+        OTXAnomaly.__init__(self, input_size=input_size)
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
             layers=layers,
         )
         self.task = task
+        self.input_size = input_size
 
     @property
     def trainable_model(self) -> str:
diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py
index a4f57c4fe0e..68abff41a59 100644
--- a/src/otx/core/model/anomaly.py
+++ b/src/otx/core/model/anomaly.py
@@ -50,13 +50,17 @@
 
 
 class OTXAnomaly(OTXModel):
-    """Methods used to make OTX model compatible with the Anomalib model."""
+    """Methods used to make OTX model compatible with the Anomalib model.
 
-    def __init__(self) -> None:
-        super().__init__(label_info=AnomalyLabelInfo())
+    Args:
+        input_size (tuple[int, int] | None):
+            Model input size in the order of height and width. Defaults to None.
+    """
+
+    def __init__(self, input_size: tuple[int, int]) -> None:
+        super().__init__(label_info=AnomalyLabelInfo(), input_size=input_size)
         self.optimizer: list[OptimizerCallable] | OptimizerCallable = None
         self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None
-        self._input_size: tuple[int, int] = (256, 256)
         self.trainer: Trainer
         self.model: nn.Module
         self.image_threshold: BaseThreshold
@@ -116,17 +120,15 @@ def task(self, value: OTXTaskType) -> None:
 
     def _get_values_from_transforms(
         self,
-    ) -> tuple[tuple[int, int], tuple[float, float, float], tuple[float, float, float]]:
+    ) -> tuple[tuple[float, float, float], tuple[float, float, float]]:
         """Get the value requested value from default transforms."""
-        image_size, mean_value, std_value = (256, 256), (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
+        mean_value, std_value = (123.675, 116.28, 103.53), (58.395, 57.12, 57.375)
         for transform in self.configure_transforms().transforms:  # type: ignore[attr-defined]
             name = transform.__class__.__name__
-            if "Resize" in name:
-                image_size = tuple(transform.size)  # type: ignore[assignment]
-            elif "Normalize" in name:
+            if "Normalize" in name:
                 mean_value = tuple(value * 255 for value in transform.mean)  # type: ignore[assignment]
                 std_value = tuple(value * 255 for value in transform.std)  # type: ignore[assignment]
-        return image_size, mean_value, std_value
+        return mean_value, std_value
 
     @property
     def trainable_model(self) -> str | None:
@@ -243,7 +245,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
         """Creates OTXAnomalyModelExporter object that can export anomaly models."""
         min_val = self.normalization_metrics.state_dict()["min"].cpu().numpy().tolist()
         max_val = self.normalization_metrics.state_dict()["max"].cpu().numpy().tolist()
-        image_shape, mean_values, scale_values = self._get_values_from_transforms()
+        mean_values, scale_values = self._get_values_from_transforms()
         onnx_export_configuration = {
             "opset_version": 14,
             "dynamic_axes": {"input": {0: "batch_size"}, "output": {0: "batch_size"}},
@@ -251,7 +253,7 @@ def _exporter(self) -> OTXAnomalyModelExporter:
             "output_names": ["output"],
         }
         return OTXAnomalyModelExporter(
-            image_shape=image_shape,
+            image_shape=self.input_size,
             image_threshold=self.image_threshold.value.cpu().numpy().tolist(),
             pixel_threshold=self.pixel_threshold.value.cpu().numpy().tolist(),
             task=self.task,
@@ -299,8 +301,7 @@ def export(
 
     def get_dummy_input(self, batch_size: int = 1) -> AnomalyModelInputs:
         """Returns a dummy input for anomaly model."""
-        image_size, _, _ = self._get_values_from_transforms()
-        images = torch.rand(batch_size, 3, *image_size)
+        images = torch.rand(batch_size, 3, *self.input_size)
         infos = []
         for i, img in enumerate(images):
             infos.append(

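Taken together, the anomaly changes above mean the model's resolution now comes from the
constructor-provided input_size rather than from parsing a Resize transform; only the
normalization mean/std are still read from the transforms. A minimal sketch of the
resulting behavior (the SketchAnomaly class below is illustrative, not part of OTX):

    import torch

    class SketchAnomaly:
        def __init__(self, input_size: tuple[int, int]) -> None:
            # (height, width) is supplied explicitly, not derived from transforms
            self.input_size = input_size

        def get_dummy_input(self, batch_size: int = 1) -> torch.Tensor:
            # mirrors OTXAnomaly.get_dummy_input after this patch
            return torch.rand(batch_size, 3, *self.input_size)

    assert SketchAnomaly((256, 256)).get_dummy_input().shape == (1, 3, 256, 256)
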
From 9dccf2d0550c5bff84e61d09dd080a4017ff2c0a Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 11:05:22 +0900
Subject: [PATCH 36/42] update docstring

---
 src/otx/core/data/transform_libs/torchvision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py
index e29ee2989df..3ea0b063e00 100644
--- a/src/otx/core/data/transform_libs/torchvision.py
+++ b/src/otx/core/data/transform_libs/torchvision.py
@@ -3508,7 +3508,7 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin):
             the specific codec for more details.
 
     Args:
-        input_size (tuple[int, int]): Input image size in [w, h]
+        input_size (tuple[int, int]): Input image size in [w, h]  TODO[wonjulee]: need to change order of shape
         is_numpy_to_tvtensor (bool): Whether to convert outputs to tensor. Defaults to False.
     """
 

From 081c94bc6a290edd5c133aebee3d75114852e31e Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 11:12:09 +0900
Subject: [PATCH 37/42] remove unused comment

---
 src/otx/cli/cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 31ced921c18..81b006d4d74 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -345,7 +345,6 @@ def instantiate_classes(self, instantiate_engine: bool = True) -> None:
 
             # pass OTXDataModule input size to the model
             if (input_size := self.datamodule.input_size) is not None and "input_size" in model_config["init_args"]:
-                # TODO(eunwoosh): After configurable input size is applied to anomaly, remove input_size check
                 model_config["init_args"]["input_size"] = (
                     (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)
                 )

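The normalization that instantiate_classes applies to input_size can be reproduced in
isolation; the free function below is a sketch of the same logic (the name
normalize_input_size is hypothetical):

    def normalize_input_size(input_size):
        # an int N is treated as a square (N, N); a sequence is kept as-is in (h, w) order
        return (input_size, input_size) if isinstance(input_size, int) else tuple(input_size)

    assert normalize_input_size(512) == (512, 512)
    assert normalize_input_size([256, 192]) == (256, 192)
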
From 4ee155161770a5d1c53b855c0321f63e092831b3 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 13:45:49 +0900
Subject: [PATCH 38/42] re-enable anomaly integration test

---
 tests/integration/cli/test_cli.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py
index 3c126e9d97b..603f648dd4b 100644
--- a/tests/integration/cli/test_cli.py
+++ b/tests/integration/cli/test_cli.py
@@ -579,14 +579,10 @@ def test_otx_configurable_input_size_e2e(
     """
     if task not in DEFAULT_CONFIG_PER_TASK:
         pytest.skip(f"Task {task} is not supported in the auto-configuration.")
-    if task in [
-        OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING,
-        OTXTaskType.ANOMALY_CLASSIFICATION,
-        OTXTaskType.ANOMALY_DETECTION,
-        OTXTaskType.ANOMALY_SEGMENTATION,
-        OTXTaskType.KEYPOINT_DETECTION,
-    ]:
+    if task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING:
         pytest.skip(f"{task} doesn't support configurable input size.")
+    if task == OTXTaskType.KEYPOINT_DETECTION:
+        pytest.skip(f"{task} is not ready to run the integration test yet.")
 
     task = task.lower()
     tmp_path_cfg_ipt_size = tmp_path / f"otx_configurable_input_size_{task}"

From 9127e5d152ead7cef2fc98a9b5dde77002dbdce0 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 13:47:00 +0900
Subject: [PATCH 39/42] apply configurable input size to keypoint detection

---
 .../keypoint_detection/heads/rtmcc_head.py    |  6 +--
 src/otx/algo/keypoint_detection/rtmpose.py    | 40 +++++++++++++++----
 .../keypoint_detection/utils/simcc_label.py   |  8 ++--
 .../core/data/transform_libs/torchvision.py   |  4 +-
 src/otx/core/model/keypoint_detection.py      |  4 +-
 src/otx/engine/utils/auto_configurator.py     |  2 +-
 .../_base_/data/keypoint_detection.yaml       |  2 +-
 7 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py
index 5b38b9f0661..1fdf8252c2c 100644
--- a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py
+++ b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py
@@ -31,7 +31,7 @@ class RTMCCHead(BaseModule):
         in_channels (int | sequence[int]): Number of channels in the input
             feature map.
         out_channels (int): Number of channels in the output heatmap.
-        input_size (tuple): Size of input image in shape [w, h].
+        input_size (tuple): Size of input image in shape [h, w].
         in_featuremap_size (int | sequence[int]): Size of input feature map.
         loss (nn.module): keypoint loss.
         decoder_cfg (dict): Config dict for the keypoint decoder.
@@ -87,8 +87,8 @@ def __init__(
         )
         self.mlp = nn.Sequential(ScaleNorm(flatten_dims), nn.Linear(flatten_dims, gau_cfg["in_token_dims"], bias=False))
         self.gau = RTMCCBlock(**gau_cfg)
-        self.cls_x = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[0] * self.simcc_split_ratio), bias=False)
-        self.cls_y = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[1] * self.simcc_split_ratio), bias=False)
+        self.cls_x = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[1] * self.simcc_split_ratio), bias=False)
+        self.cls_y = nn.Linear(gau_cfg["out_token_dims"], int(self.input_size[0] * self.simcc_split_ratio), bias=False)
 
     def forward(self, feats: tuple[Tensor]) -> tuple[Tensor, Tensor]:
         """Forward the network.
diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py
index 0086acd80dd..23c8ed8b22e 100644
--- a/src/otx/algo/keypoint_detection/rtmpose.py
+++ b/src/otx/algo/keypoint_detection/rtmpose.py
@@ -13,12 +13,18 @@
 from otx.algo.keypoint_detection.losses.kl_discret_loss import KLDiscretLoss
 from otx.algo.keypoint_detection.topdown import TopdownPoseEstimator
 from otx.core.exporter.native import OTXNativeModelExporter
+from otx.core.metrics.pck import PCKMeasureCallable
+from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable
 from otx.core.model.keypoint_detection import OTXKeypointDetectionModel
 from torch import nn
 
 if TYPE_CHECKING:
+    from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable
     from otx.core.exporter.base import OTXModelExporter
+    from otx.core.metrics import MetricCallable
+    from otx.core.schedulers import LRSchedulerListCallable
     from otx.core.types.export import TaskLevelExportParameters
+    from otx.core.types.label import LabelInfoTypes
 
 
 class RTMPose(OTXKeypointDetectionModel):
@@ -27,13 +33,13 @@ class RTMPose(OTXKeypointDetectionModel):
     @property
     def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
-        if self.image_size is None:
-            msg = f"Exporter should have a image_size but it is given by {self.image_size}"
+        if self.input_size is None:
+            msg = f"Exporter should have a input_size but it is given by {self.input_size}"
             raise ValueError(msg)
 
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=self.image_size,
+            input_size=(1, 3, *self.input_size),
             mean=self.mean,
             std=self.std,
             resize_mode="fit_to_window_letterbox",
@@ -62,12 +68,30 @@ class RTMPoseTiny(RTMPose):
     """RTMPose Tiny Model."""
 
     load_from = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth"
-    image_size = (1, 3, 192, 256)
     mean = (123.675, 116.28, 103.53)
     std = (58.395, 57.12, 57.375)
 
+    def __init__(
+        self,
+        label_info: LabelInfoTypes,
+        input_size: tuple[int, int] = (256, 192),
+        optimizer: OptimizerCallable = DefaultOptimizerCallable,
+        scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
+        metric: MetricCallable = PCKMeasureCallable,
+        torch_compile: bool = False,
+    ) -> None:
+        self.mean = (0.0, 0.0, 0.0)
+        self.std = (255.0, 255.0, 255.0)
+        super().__init__(
+            label_info=label_info,
+            input_size=input_size,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            metric=metric,
+            torch_compile=torch_compile,
+        )
+
     def _build_model(self, num_classes: int) -> RTMPose:
-        input_size = (192, 256)
         simcc_split_ratio = 2.0
         sigma = (4.9, 5.66)
 
@@ -84,13 +108,13 @@ def _build_model(self, num_classes: int) -> RTMPose:
         head = RTMCCHead(
             out_channels=num_classes,
             in_channels=384,
-            input_size=input_size,
-            in_featuremap_size=(input_size[0] // 32, input_size[1] // 32),
+            input_size=self.input_size,
+            in_featuremap_size=(self.input_size[0] // 32, self.input_size[1] // 32),
             simcc_split_ratio=simcc_split_ratio,
             final_layer_kernel_size=7,
             loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True),
             decoder_cfg={
-                "input_size": input_size,
+                "input_size": self.input_size,
                 "simcc_split_ratio": simcc_split_ratio,
                 "sigma": sigma,
                 "normalize": False,
diff --git a/src/otx/algo/keypoint_detection/utils/simcc_label.py b/src/otx/algo/keypoint_detection/utils/simcc_label.py
index 4f03997bb7d..429c2f427a4 100644
--- a/src/otx/algo/keypoint_detection/utils/simcc_label.py
+++ b/src/otx/algo/keypoint_detection/utils/simcc_label.py
@@ -21,7 +21,7 @@ class SimCCLabel:
         - instance number: N
         - keypoint number: K
         - keypoint dimension: D
-        - image size: [w, h]
+        - image size: [h, w]
 
     Encoded:
 
@@ -36,7 +36,7 @@ class SimCCLabel:
         - keypoint_weights (np.ndarray): The target weights in shape (N, K)
 
     Args:
-        input_size (tuple): Input image size in [w, h]
+        input_size (tuple): Input image size in [h, w]
         smoothing_type (str): The SimCC label smoothing strategy. Options are
             ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'``
         sigma (float | int | tuple): The sigma value in the Gaussian SimCC
@@ -201,7 +201,7 @@ def _generate_standard(
         Labels will be one-hot vectors if self.label_smooth_weight==0.0
         """
         batch_size, num_keypoints, _ = keypoints.shape
-        w, h = self.input_size
+        h, w = self.input_size
         x_dim = np.around(w * self.simcc_split_ratio).astype(int)
         y_dim = np.around(h * self.simcc_split_ratio).astype(int)
 
@@ -239,7 +239,7 @@ def _generate_gaussian(
     ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         """Encoding keypoints into SimCC labels with Gaussian Label Smoothing strategy."""
         batch_size, num_keypoints, _ = keypoints.shape
-        w, h = self.input_size
+        h, w = self.input_size
         x_dim = np.around(w * self.simcc_split_ratio).astype(int)
         y_dim = np.around(h * self.simcc_split_ratio).astype(int)
 
diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py
index 3ea0b063e00..00471d85be2 100644
--- a/src/otx/core/data/transform_libs/torchvision.py
+++ b/src/otx/core/data/transform_libs/torchvision.py
@@ -3447,7 +3447,7 @@ def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None:
         assert len(_inputs) == 1, "[tmp] Multiple entities are not supported yet."  # noqa: S101
         inputs = _inputs[0]
 
-        w, h = self.input_size
+        h, w = self.input_size
         warp_size = (int(w), int(h))
 
         # reshape bbox to fixed aspect ratio
@@ -3508,7 +3508,7 @@ class GenerateTarget(tvt_v2.Transform, NumpytoTVTensorMixin):
             the specific codec for more details.
 
     Args:
-        input_size (tuple[int, int]): Input image size in [w, h]  TODO[wonjulee]: need to change order of shape
+        input_size (tuple[int, int]): Input image size in [h, w]
         is_numpy_to_tvtensor (bool): Whether to convert outputs to tensor. Defaults to False.
     """
 
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 69c05ed148e..02cbb652333 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -32,21 +32,23 @@ class OTXKeypointDetectionModel(OTXModel[KeypointDetBatchDataEntity, KeypointDet
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: tuple[int, int],
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = PCKMeasureCallable,
         torch_compile: bool = False,
     ) -> None:
-        self.image_size = (1, 3, 192, 256)
         self.mean = (0.0, 0.0, 0.0)
         self.std = (255.0, 255.0, 255.0)
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
         )
+        self.input_size: tuple[int, int]  # narrow the type: keypoint models always receive an explicit input size
 
     @abstractmethod
     def _build_model(self, num_classes: int) -> nn.Module:
diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py
index d95826e5c1c..5b2b50f33bd 100644
--- a/src/otx/engine/utils/auto_configurator.py
+++ b/src/otx/engine/utils/auto_configurator.py
@@ -382,7 +382,7 @@ def get_ov_model(self, model_name: str, label_info: LabelInfo) -> OVModel:
         """
         class_path = OVMODEL_PER_TASK.get(self.task, None)
         if class_path is None:
-            msg = f"{self.task} is not support OVModel."
+            msg = f"{self.task} doesn't support OVModel."
             raise NotImplementedError(msg)
         class_module, class_name = class_path.rsplit(".", 1)
         module = __import__(class_module, fromlist=[class_name])
diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index b3ffed5b915..ddd8eaf92ea 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -6,8 +6,8 @@ data_format: coco_person_keypoints
 unannotated_items_ratio: 0.0
 image_color_channel: RGB
 input_size:
-  - 192
   - 256
+  - 192
 train_subset:
   subset_name: train
   batch_size: 32

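After the order flip, SimCC bin counts derive from width on the x axis and height on the
y axis. A standalone sketch of the dimension math used by SimCCLabel, with the default
split ratio from this patch:

    import numpy as np

    def simcc_dims(input_size, simcc_split_ratio=2.0):
        # input_size is (h, w): x bins follow width, y bins follow height
        h, w = input_size
        x_dim = int(np.around(w * simcc_split_ratio))
        y_dim = int(np.around(h * simcc_split_ratio))
        return x_dim, y_dim

    assert simcc_dims((256, 192)) == (384, 512)
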
From a6b922dec7ffa567d121c16f09d0652552b6fab2 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 13:57:28 +0900
Subject: [PATCH 40/42] update keypoint detection unit tests for [h, w] input size

---
 .../keypoint_detection/heads/test_rtmcc_head.py    | 14 +++++++-------
 .../keypoint_detection/utils/test_simcc_label.py   |  8 ++++----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py b/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py
index 0f3898c3f49..f2f620671d5 100644
--- a/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py
+++ b/tests/unit/algo/keypoint_detection/heads/test_rtmcc_head.py
@@ -17,19 +17,19 @@ class TestRTMCCHead:
     def fxt_features(self):
         batch_size = 2
         in_channels = 384  # Match the in_channels of the fxt_rtmcc_head fixture
-        input_size = (192, 256)
+        input_size = (256, 192)
         return [torch.rand(batch_size, in_channels, input_size[0] // 32, input_size[1] // 32)]
 
     @pytest.fixture()
     def fxt_gt_entity(self):
         batch_size = 2
-        img_infos = [ImageInfo(img_idx=i, img_shape=(192, 256), ori_shape=(192, 256)) for i in range(batch_size)]
+        img_infos = [ImageInfo(img_idx=i, img_shape=(256, 192), ori_shape=(256, 192)) for i in range(batch_size)]
         keypoint_x_labels = [torch.randn((1, 17, 384)) for _ in range(batch_size)]
         keypoint_y_labels = [torch.randn((1, 17, 512)) for _ in range(batch_size)]
         keypoint_weights = [torch.randn((1, 17)) for _ in range(batch_size)]
         return KeypointDetBatchDataEntity(
             batch_size=batch_size,
-            images=tv_tensors.Image(data=torch.randn((batch_size, 3, 192, 256))),
+            images=tv_tensors.Image(data=torch.randn((batch_size, 3, 256, 192))),
             imgs_info=img_infos,
             keypoint_x_labels=keypoint_x_labels,
             keypoint_y_labels=keypoint_y_labels,
@@ -45,13 +45,13 @@ def fxt_rtmcc_head(self) -> RTMCCHead:
         return RTMCCHead(
             out_channels=17,
             in_channels=384,
-            input_size=(192, 256),
+            input_size=(256, 192),
             in_featuremap_size=(6, 8),
             simcc_split_ratio=2.0,
             final_layer_kernel_size=7,
             loss=KLDiscretLoss(use_target_weight=True, beta=10.0, label_softmax=True),
             decoder_cfg={
-                "input_size": (192, 256),
+                "input_size": (256, 192),
                 "simcc_split_ratio": 2.0,
                 "sigma": (4.9, 5.66),
                 "normalize": False,
@@ -72,9 +72,9 @@ def fxt_rtmcc_head(self) -> RTMCCHead:
     def test_forward(self, fxt_rtmcc_head, fxt_features) -> None:
         pred_x, pred_y = fxt_rtmcc_head(fxt_features)
         assert pred_x.shape[1] == fxt_rtmcc_head.out_channels
-        assert pred_x.shape[2] == fxt_rtmcc_head.decoder.input_size[0] * fxt_rtmcc_head.decoder.simcc_split_ratio
+        assert pred_x.shape[2] == fxt_rtmcc_head.decoder.input_size[1] * fxt_rtmcc_head.decoder.simcc_split_ratio
         assert pred_y.shape[1] == fxt_rtmcc_head.out_channels
-        assert pred_y.shape[2] == fxt_rtmcc_head.decoder.input_size[1] * fxt_rtmcc_head.decoder.simcc_split_ratio
+        assert pred_y.shape[2] == fxt_rtmcc_head.decoder.input_size[0] * fxt_rtmcc_head.decoder.simcc_split_ratio
 
     def test_loss(self, fxt_rtmcc_head, fxt_features, fxt_gt_entity) -> None:
         losses = fxt_rtmcc_head.loss(
diff --git a/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py b/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py
index 69d432ae7a9..7eeaa18ed5d 100644
--- a/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py
+++ b/tests/unit/algo/keypoint_detection/utils/test_simcc_label.py
@@ -18,12 +18,12 @@ def fxt_keypoints_visible(self):
 
     @pytest.fixture()
     def fxt_codec_gaussian(self):
-        return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=6.0, simcc_split_ratio=2.0)
+        return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=6.0, simcc_split_ratio=2.0)
 
     @pytest.fixture()
     def fxt_codec_smoothing(self):
         return SimCCLabel(
-            input_size=(192, 256),
+            input_size=(256, 192),
             smoothing_type="standard",
             sigma=5.0,
             simcc_split_ratio=3.0,
@@ -32,11 +32,11 @@ def fxt_codec_smoothing(self):
 
     @pytest.fixture()
     def fxt_codec_dark(self):
-        return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0)
+        return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0)
 
     @pytest.fixture()
     def fxt_codec_separated_sigma(self):
-        return SimCCLabel(input_size=(192, 256), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0)
+        return SimCCLabel(input_size=(256, 192), smoothing_type="gaussian", sigma=(4.9, 5.66), simcc_split_ratio=2.0)
 
     @pytest.mark.parametrize(
         "fxt_codec",

From 255a4e08430deaa70188147c1ee434a21d2b2166 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 14:20:07 +0900
Subject: [PATCH 41/42] update GenerateTarget unit test for [h, w] input size

---
 tests/unit/core/data/transform_libs/test_torchvision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py
index 202a91afb15..6ccf71b3dcb 100644
--- a/tests/unit/core/data/transform_libs/test_torchvision.py
+++ b/tests/unit/core/data/transform_libs/test_torchvision.py
@@ -946,7 +946,7 @@ def keypoint_det_entity(self) -> KeypointDetDataEntity:
         )
 
     def test_forward(self, keypoint_det_entity) -> None:
-        transform = GenerateTarget(input_size=(192, 256))
+        transform = GenerateTarget(input_size=(256, 192))
         results = transform(deepcopy(keypoint_det_entity))
 
         assert hasattr(results, "keypoint_x_labels")

From 8e6f8f8e915cbf04f9ac713c4121031683198d8c Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Tue, 13 Aug 2024 23:43:57 +0900
Subject: [PATCH 42/42] update h-label head

---
 src/otx/algo/classification/efficientnet.py   |  9 ++++++--
 .../classification/heads/hlabel_cls_head.py   | 23 +++++++++++--------
 src/otx/algo/classification/mobilenet_v3.py   |  8 +++++--
 .../heads/test_hlabel_cls_head.py             | 19 +++++++++++++++
 4 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/src/otx/algo/classification/efficientnet.py b/src/otx/algo/classification/efficientnet.py
index c2fa5ae7784..d6ab980a3a4 100644
--- a/src/otx/algo/classification/efficientnet.py
+++ b/src/otx/algo/classification/efficientnet.py
@@ -6,7 +6,8 @@
 
 from __future__ import annotations
 
-from copy import deepcopy
+from copy import copy, deepcopy
+from math import ceil
 from typing import TYPE_CHECKING, Literal
 
 from torch import Tensor, nn
@@ -269,6 +270,10 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
+
+        copied_head_config = copy(head_config)
+        copied_head_config["step_size"] = (ceil(self.input_size[0] / 32), ceil(self.input_size[1] / 32))
+
         return ImageClassifier(
             backbone=backbone,
             neck=nn.Identity(),
@@ -276,7 +281,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
                 in_channels=backbone.num_features,
                 multiclass_loss=nn.CrossEntropyLoss(),
                 multilabel_loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"),
-                **head_config,
+                **copied_head_config,
             ),
             optimize_gap=False,
         )
diff --git a/src/otx/algo/classification/heads/hlabel_cls_head.py b/src/otx/algo/classification/heads/hlabel_cls_head.py
index 1b5767c4ace..b0f6cfb9711 100644
--- a/src/otx/algo/classification/heads/hlabel_cls_head.py
+++ b/src/otx/algo/classification/heads/hlabel_cls_head.py
@@ -419,7 +419,7 @@ class HierarchicalCBAMClsHead(HierarchicalClsHead):
         thr (float, optional): Predictions with scores under the thresholds are considered
                             as negative. Defaults to 0.5.
         init_cfg (dict | None, optional): Initialize configuration key-values, Defaults to None.
-        step_size (int, optional): Step size value for HierarchicalCBAMClsHead, Defaults to 7.
+        step_size (int | tuple[int, int], optional): Step size value for HierarchicalCBAMClsHead. Defaults to 7.
     """
 
     def __init__(
@@ -435,7 +435,7 @@ def __init__(
         multilabel_loss: nn.Module | None = None,
         thr: float = 0.5,
         init_cfg: dict | None = None,
-        step_size: int = 7,
+        step_size: int | tuple[int, int] = 7,
         **kwargs,
     ):
         super().__init__(
@@ -452,11 +452,11 @@ def __init__(
             init_cfg=init_cfg,
             **kwargs,
         )
-        self.step_size = step_size
-        self.fc_superclass = nn.Linear(in_channels * step_size * step_size, num_multiclass_heads)
-        self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * step_size * step_size)
+        self.step_size = (step_size, step_size) if isinstance(step_size, int) else tuple(step_size)
+        self.fc_superclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_multiclass_heads)
+        self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * self.step_size[0] * self.step_size[1])
         self.cbam = CBAM(in_channels)
-        self.fc_subclass = nn.Linear(in_channels * step_size * step_size, num_single_label_classes)
+        self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_single_label_classes)
 
         self._init_layers()
 
@@ -464,7 +464,7 @@ def pre_logits(self, feats: tuple[torch.Tensor] | torch.Tensor) -> torch.Tensor:
         """The process before the final classification head."""
         if isinstance(feats, Sequence):
             feats = feats[-1]
-        return feats.view(feats.size(0), self.in_channels * self.step_size * self.step_size)
+        return feats.view(feats.size(0), self.in_channels * self.step_size[0] * self.step_size[1])
 
     def _init_layers(self) -> None:
         """Iniitialize weights of classification head."""
@@ -479,10 +479,15 @@ def forward(self, feats: tuple[torch.Tensor] | torch.Tensor) -> torch.Tensor:
         attention_weights = torch.sigmoid(self.attention_fc(out_superclass))
         attended_features = pre_logits * attention_weights
 
-        attended_features = attended_features.view(pre_logits.size(0), self.in_channels, self.step_size, self.step_size)
+        attended_features = attended_features.view(
+            pre_logits.size(0),
+            self.in_channels,
+            self.step_size[0],
+            self.step_size[1],
+        )
         attended_features = self.cbam(attended_features)
         attended_features = attended_features.view(
             pre_logits.size(0),
-            self.in_channels * self.step_size * self.step_size,
+            self.in_channels * self.step_size[0] * self.step_size[1],
         )
         return self.fc_subclass(attended_features)
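
A quick way to see what the reshapes above preserve: the head flattens (N, C, H, W)
features to (N, C * H * W) and back, so only the product of the two step sizes feeds the
linear layers. A sketch under the assumption of 64 channels and a (14, 7) grid:

    import torch

    in_channels, step_size = 64, (14, 7)
    feats = torch.rand(8, in_channels, *step_size)
    flat = feats.view(feats.size(0), in_channels * step_size[0] * step_size[1])
    assert flat.shape == (8, 64 * 14 * 7)
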
diff --git a/src/otx/algo/classification/mobilenet_v3.py b/src/otx/algo/classification/mobilenet_v3.py
index 8470d398aee..40f92c594a4 100644
--- a/src/otx/algo/classification/mobilenet_v3.py
+++ b/src/otx/algo/classification/mobilenet_v3.py
@@ -6,7 +6,8 @@
 
 from __future__ import annotations
 
-from copy import deepcopy
+from copy import copy, deepcopy
+from math import ceil
 from typing import TYPE_CHECKING, Any, Literal
 
 import torch
@@ -331,6 +332,9 @@ def _build_model(self, head_config: dict) -> nn.Module:
         if not isinstance(self.label_info, HLabelInfo):
             raise TypeError(self.label_info)
 
+        copied_head_config = copy(head_config)
+        copied_head_config["step_size"] = (ceil(self.input_size[0] / 32), ceil(self.input_size[1] / 32))
+
         return ImageClassifier(
             backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size),
             neck=nn.Identity(),
@@ -338,7 +342,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
                 in_channels=960,
                 multiclass_loss=nn.CrossEntropyLoss(),
                 multilabel_loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"),
-                **head_config,
+                **copied_head_config,
             ),
             optimize_gap=False,
         )
diff --git a/tests/unit/algo/classification/heads/test_hlabel_cls_head.py b/tests/unit/algo/classification/heads/test_hlabel_cls_head.py
index 11e7191dc49..a32f9bb14d4 100644
--- a/tests/unit/algo/classification/heads/test_hlabel_cls_head.py
+++ b/tests/unit/algo/classification/heads/test_hlabel_cls_head.py
@@ -169,3 +169,22 @@ def test_pre_logits(self, fxt_hierarchical_cbam_cls_head) -> None:
         input_tensor = torch.rand((8, 64, 7, 7))
         pre_logits = fxt_hierarchical_cbam_cls_head.pre_logits(input_tensor)
         assert pre_logits.shape == (8, 64 * 7 * 7)
+
+    def test_pre_logits_tuple_step_size(self) -> None:
+        head_idx_to_logits_range = {"0": (0, 5), "1": (5, 10), "2": (10, 12)}
+        head = HierarchicalCBAMClsHead(
+            num_multiclass_heads=3,
+            num_multilabel_classes=0,
+            head_idx_to_logits_range=head_idx_to_logits_range,
+            num_single_label_classes=12,
+            empty_multiclass_head_indices=[],
+            in_channels=64,
+            num_classes=12,
+            multiclass_loss=CrossEntropyLoss(),
+            multilabel_loss=None,
+            step_size=(14, 7),
+        )
+
+        input_tensor = torch.rand((8, 64, 14, 7))
+        pre_logits = head.pre_logits(input_tensor)
+        assert pre_logits.shape == (8, 64 * 14 * 7)
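
With this last patch, step_size is derived from the input size and the backbone's overall
stride instead of being hard-coded to 7. A sketch of that derivation, assuming the stride
of 32 used in the _build_model changes above:

    from math import ceil

    def head_step_size(input_size, stride=32):
        # feature-map cells per axis after the backbone downsamples by `stride`
        return (ceil(input_size[0] / stride), ceil(input_size[1] / stride))

    assert head_step_size((224, 224)) == (7, 7)   # the old fixed step_size of 7
    assert head_step_size((448, 224)) == (14, 7)  # matches the new tuple test above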