openvinotoolkit · eunwoosh · Aug 14, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
@@ -26,6 +26,8 @@ All notable changes to this project will be documented in this file.
   (https://github.com/openvinotoolkit/training_extensions/pull/3781)
 - Update head and h-label format for hierarchical label classification
   (https://github.com/openvinotoolkit/training_extensions/pull/3810)
+- Support configurable input size
+  (https://github.com/openvinotoolkit/training_extensions/pull/3788)
 
 ### Enhancements
 

@@ -32,6 +32,7 @@ class MoViNet(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: tuple[int, int] = (224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -40,6 +41,7 @@ def __init__(
         self.load_from = "https://github.com/Atze00/MoViNet-pytorch/blob/main/weights/modelA0_statedict_v3?raw=true"
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,

@@ -33,6 +33,7 @@ class X3D(OTXActionClsModel):
     def __init__(
         self,
         label_info: LabelInfoTypes,
+        input_size: tuple[int, int] = (224, 224),
         optimizer: OptimizerCallable = DefaultOptimizerCallable,
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
@@ -41,6 +42,7 @@ def __init__(
         self.load_from = "https://download.openmmlab.com/mmaction/recognition/x3d/facebook/x3d_m_facebook_16x5x1_kinetics400_rgb_20201027-3f42382a.pth"
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,

@@ -34,6 +34,8 @@ class Padim(OTXAnomaly, AnomalibPadim):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (tuple[int, int], optional):
+            Model input size as (height, width). Defaults to (256, 256).
     """
 
     def __init__(
@@ -47,8 +49,9 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: tuple[int, int] = (256, 256),
     ) -> None:
-        OTXAnomaly.__init__(self)
+        OTXAnomaly.__init__(self, input_size)
         AnomalibPadim.__init__(
             self,
             backbone=backbone,
@@ -57,6 +60,7 @@ def __init__(
             n_features=n_features,
         )
         self.task = task
+        self.input_size = input_size
 
     def configure_optimizers(self) -> tuple[list[Optimizer], list[Optimizer]] | None:
         """PADIM doesn't require optimization, therefore returns no optimizers."""

@@ -32,6 +32,8 @@ class Stfpm(OTXAnomaly, AnomalibStfpm):
         task (Literal[
                 OTXTaskType.ANOMALY_CLASSIFICATION, OTXTaskType.ANOMALY_DETECTION, OTXTaskType.ANOMALY_SEGMENTATION
             ], optional): Task type of Anomaly Task. Defaults to OTXTaskType.ANOMALY_CLASSIFICATION.
+        input_size (tuple[int, int], optional):
+            Model input size as (height, width). Defaults to (256, 256).
     """
 
     def __init__(
@@ -43,15 +45,17 @@ def __init__(
             OTXTaskType.ANOMALY_DETECTION,
             OTXTaskType.ANOMALY_SEGMENTATION,
         ] = OTXTaskType.ANOMALY_CLASSIFICATION,
+        input_size: tuple[int, int] = (256, 256),
         **kwargs,
     ) -> None:
-        OTXAnomaly.__init__(self)
+        OTXAnomaly.__init__(self, input_size=input_size)
         AnomalibStfpm.__init__(
             self,
             backbone=backbone,
             layers=layers,
         )
         self.task = task
+        self.input_size = input_size
 
     @property
     def trainable_model(self) -> str:

@@ -572,7 +572,7 @@ class OTXEfficientNet(EfficientNet):
         in_size : tuple of two ints. Spatial size of the expected input image.
     """
 
-    def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
+    def __init__(self, version: EFFICIENTNET_VERSION, input_size: tuple[int, int] | None = None, **kwargs):
         self.model_name = "efficientnet_" + version
 
         if version == "b0":
@@ -615,6 +615,9 @@ def __init__(self, version: EFFICIENTNET_VERSION, **kwargs):
             msg = f"Unsupported EfficientNet version {version}"
             raise ValueError(msg)
 
+        if input_size is not None:
+            in_size = input_size
+
         init_block_channels = 32
         layers = [1, 2, 2, 3, 3, 4, 1]
         downsample = [1, 1, 1, 1, 0, 1, 0]

@@ -57,13 +57,15 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
     ) -> None:
         self.version = version
         self.pretrained = pretrained
 
         super().__init__(
             label_info=label_info,
+            input_size=input_size,
             optimizer=optimizer,
             scheduler=scheduler,
             metric=metric,
@@ -86,7 +88,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXEfficientNet(version=self.version, pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         neck = GlobalAveragePooling(dim=2)
         loss = nn.CrossEntropyLoss(reduction="none")
         if self.train_type == OTXTrainType.SEMI_SUPERVISED:
@@ -149,6 +151,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.version = version
         self.pretrained = pretrained
@@ -159,6 +162,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -176,7 +180,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXEfficientNet(version=self.version, pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         return ImageClassifier(
             backbone=backbone,
             neck=GlobalAveragePooling(dim=2),
@@ -229,6 +233,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.version = version
         self.pretrained = pretrained
@@ -239,6 +244,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -262,7 +268,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
         if not isinstance(self.label_info, HLabelInfo):
             raise TypeError(self.label_info)
 
-        backbone = OTXEfficientNet(version=self.version, pretrained=self.pretrained)
+        backbone = OTXEfficientNet(version=self.version, input_size=self.input_size, pretrained=self.pretrained)
         return ImageClassifier(
             backbone=backbone,
             neck=nn.Identity(),

@@ -5,11 +5,13 @@
 
 from __future__ import annotations
 
+import logging
 from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import Tensor, nn
 from transformers import AutoModelForImageClassification
+from transformers.configuration_utils import PretrainedConfig
 
 from otx.core.data.entity.base import OTXBatchLossEntity
 from otx.core.data.entity.classification import (
@@ -29,6 +31,10 @@
     from otx.core.metrics import MetricCallable
 
 
+DEFAULT_INPUT_SIZE = (224, 224)
+logger = logging.getLogger(__name__)
+
+
 class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
     """HuggingFaceModelForMulticlassCls is a class that represents a Hugging Face model for multiclass classification.
 
@@ -38,6 +44,8 @@ class HuggingFaceModelForMulticlassCls(OTXMulticlassClsModel):
         optimizer (OptimizerCallable, optional): The optimizer callable for training the model.
         scheduler (LRSchedulerCallable | LRSchedulerListCallable, optional): The learning rate scheduler callable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
+        input_size (tuple[int, int], optional):
+            Model input size as (height, width). Defaults to (224, 224).
 
     Example:
         1. API
@@ -59,6 +67,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = DEFAULT_INPUT_SIZE,
     ) -> None:
         self.model_name = model_name_or_path
 
@@ -68,13 +77,23 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
+        model_config, _ = PretrainedConfig.get_config_dict(self.model_name)
+        kwargs = {}
+        if "image_size" in model_config:
+            kwargs["image_size"] = self.input_size[0]
+        elif self.input_size != DEFAULT_INPUT_SIZE:
+            msg = "There is no 'image_size' argument in the model configuration. There may be unexpected results."
+            logger.warning(msg)
+
         return AutoModelForImageClassification.from_pretrained(
             pretrained_model_name_or_path=self.model_name,
             num_labels=self.label_info.num_classes,
             ignore_mismatched_sizes=True,
+            **kwargs,
         )
 
     def _customize_inputs(self, inputs: MulticlassClsBatchDataEntity) -> dict[str, Any]:

@@ -62,6 +62,8 @@ class MobileNetV3ForMulticlassCls(OTXMulticlassClsModel):
         metric (MetricCallable, optional): The metric callable. Defaults to MultiClassClsMetricCallable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
         freeze_backbone (bool, optional): Whether to freeze the backbone layers during training. Defaults to False.
+        input_size (tuple[int, int], optional):
+            Model input size as (height, width). Defaults to (224, 224).
     """
 
     def __init__(
@@ -72,6 +74,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiClassClsMetricCallable,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
         train_type: Literal[OTXTrainType.SUPERVISED, OTXTrainType.SEMI_SUPERVISED] = OTXTrainType.SUPERVISED,
     ) -> None:
         self.mode = mode
@@ -82,6 +85,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
             train_type=train_type,
         )
 
@@ -100,7 +104,7 @@ def _create_model(self) -> nn.Module:
         return model
 
     def _build_model(self, num_classes: int) -> nn.Module:
-        backbone = OTXMobileNetV3(mode=self.mode)
+        backbone = OTXMobileNetV3(mode=self.mode, input_size=self.input_size)
         neck = GlobalAveragePooling(dim=2)
         loss = nn.CrossEntropyLoss(reduction="none")
         in_channels = 960 if self.mode == "large" else 576
@@ -163,6 +167,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = MultiLabelClsMetricCallable,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -171,6 +176,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -189,7 +195,7 @@ def _create_model(self) -> nn.Module:
 
     def _build_model(self, num_classes: int) -> nn.Module:
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size),
             neck=GlobalAveragePooling(dim=2),
             head=MultiLabelNonLinearClsHead(
                 num_classes=num_classes,
@@ -246,7 +252,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",
@@ -292,6 +298,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallble,
         torch_compile: bool = False,
+        input_size: tuple[int, int] = (224, 224),
     ) -> None:
         self.mode = mode
         super().__init__(
@@ -300,6 +307,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            input_size=input_size,
         )
 
     def _create_model(self) -> nn.Module:
@@ -324,7 +332,7 @@ def _build_model(self, head_config: dict) -> nn.Module:
             raise TypeError(self.label_info)
 
         return ImageClassifier(
-            backbone=OTXMobileNetV3(mode=self.mode),
+            backbone=OTXMobileNetV3(mode=self.mode, input_size=self.input_size),
             neck=nn.Identity(),
             head=HierarchicalCBAMClsHead(
                 in_channels=960,
@@ -403,7 +411,7 @@ def _exporter(self) -> OTXModelExporter:
         """Creates OTXModelExporter object that can export the model."""
         return OTXNativeModelExporter(
             task_level_export_parameters=self._export_parameters,
-            input_size=(1, 3, 224, 224),
+            input_size=(1, 3, *self.input_size),
             mean=(123.675, 116.28, 103.53),
             std=(58.395, 57.12, 57.375),
             resize_mode="standard",