diff --git a/keras_hub/src/models/mit/mit_backbone.py b/keras_hub/src/models/mit/mit_backbone.py
index a6c57816c..9ce9771b4 100644
--- a/keras_hub/src/models/mit/mit_backbone.py
+++ b/keras_hub/src/models/mit/mit_backbone.py
@@ -23,13 +23,13 @@ class MiTBackbone(FeaturePyramidBackbone):
     def __init__(
         self,
-        depths,
+        layerwise_depths,
         num_layers,
-        blockwise_num_heads,
-        blockwise_sr_ratios,
+        layerwise_num_heads,
+        layerwise_sr_ratios,
         max_drop_path_rate,
-        patch_sizes,
-        strides,
+        layerwise_patch_sizes,
+        layerwise_strides,
         image_shape=(None, None, 3),
         hidden_dims=None,
         **kwargs,
@@ -43,12 +43,12 @@ def __init__(
         https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer)

         Args:
-            depths: The number of transformer encoders to be used per layer in the
+            layerwise_depths: The number of transformer encoders to be used per layer in the
                 network.
             num_layers: int. The number of Transformer layers.
-            blockwise_num_heads: list of integers, the number of heads to use
+            layerwise_num_heads: list of integers, the number of heads to use
                 in the attention computation for each layer.
-            blockwise_sr_ratios: list of integers, the sequence reduction
+            layerwise_sr_ratios: list of integers, the sequence reduction
                 ratio to perform for each layer on the sequence before key and
                 value projections. If set to > 1, a `Conv2D` layer is used to
                 reduce the length of the sequence.
@@ -82,7 +82,10 @@ def __init__(
         model.fit(images, labels, epochs=3)
         ```
         """
-        dpr = [x for x in np.linspace(0.0, max_drop_path_rate, sum(depths))]
+        dpr = [
+            x
+            for x in np.linspace(0.0, max_drop_path_rate, sum(layerwise_depths))
+        ]

         # === Layers ===
         cur = 0
@@ -93,8 +96,8 @@ def __init__(
         for i in range(num_layers):
             patch_embed_layer = OverlappingPatchingAndEmbedding(
                 project_dim=hidden_dims[i],
-                patch_size=patch_sizes[i],
-                stride=strides[i],
+                patch_size=layerwise_patch_sizes[i],
+                stride=layerwise_strides[i],
                 name=f"patch_and_embed_{i}",
             )
             patch_embedding_layers.append(patch_embed_layer)
@@ -102,15 +105,15 @@ def __init__(
             transformer_block = [
                 HierarchicalTransformerEncoder(
                     project_dim=hidden_dims[i],
-                    num_heads=blockwise_num_heads[i],
-                    sr_ratio=blockwise_sr_ratios[i],
+                    num_heads=layerwise_num_heads[i],
+                    sr_ratio=layerwise_sr_ratios[i],
                     drop_prob=dpr[cur + k],
                     name=f"hierarchical_encoder_{i}_{k}",
                 )
-                for k in range(depths[i])
+                for k in range(layerwise_depths[i])
             ]
             transformer_blocks.append(transformer_block)
-            cur += depths[i]
+            cur += layerwise_depths[i]
             layer_norms.append(keras.layers.LayerNormalization(epsilon=1e-5))

         # === Functional Model ===
@@ -120,7 +123,7 @@ def __init__(
         for i in range(num_layers):
             # Compute new height/width after the `proj`
            # call in `OverlappingPatchingAndEmbedding`
-            stride = strides[i]
+            stride = layerwise_strides[i]
             new_height, new_width = (
                 int(ops.shape(x)[1] / stride),
                 int(ops.shape(x)[2] / stride),
@@ -138,30 +141,30 @@ def __init__(
         super().__init__(inputs=image_input, outputs=x, **kwargs)

         # === Config ===
-        self.depths = depths
+        self.layerwise_depths = layerwise_depths
         self.image_shape = image_shape
         self.hidden_dims = hidden_dims
         self.pyramid_outputs = pyramid_outputs
         self.num_layers = num_layers
-        self.blockwise_num_heads = blockwise_num_heads
-        self.blockwise_sr_ratios = blockwise_sr_ratios
+        self.layerwise_num_heads = layerwise_num_heads
+        self.layerwise_sr_ratios = layerwise_sr_ratios
         self.max_drop_path_rate = max_drop_path_rate
-        self.patch_sizes = patch_sizes
-        self.strides = strides
+        self.layerwise_patch_sizes = layerwise_patch_sizes
+        self.layerwise_strides = layerwise_strides

     def get_config(self):
         config = super().get_config()
         config.update(
             {
-                "depths": self.depths,
+                "layerwise_depths": self.layerwise_depths,
                 "hidden_dims": self.hidden_dims,
                 "image_shape": self.image_shape,
                 "num_layers": self.num_layers,
-                "blockwise_num_heads": self.blockwise_num_heads,
-                "blockwise_sr_ratios": self.blockwise_sr_ratios,
+                "layerwise_num_heads": self.layerwise_num_heads,
+                "layerwise_sr_ratios": self.layerwise_sr_ratios,
                 "max_drop_path_rate": self.max_drop_path_rate,
-                "patch_sizes": self.patch_sizes,
-                "strides": self.strides,
+                "layerwise_patch_sizes": self.layerwise_patch_sizes,
+                "layerwise_strides": self.layerwise_strides,
             }
         )
         return config
diff --git a/keras_hub/src/models/mit/mit_backbone_test.py b/keras_hub/src/models/mit/mit_backbone_test.py
index 88c58e96a..1eab8107e 100644
--- a/keras_hub/src/models/mit/mit_backbone_test.py
+++ b/keras_hub/src/models/mit/mit_backbone_test.py
@@ -8,15 +8,15 @@ class MiTBackboneTest(TestCase):
     def setUp(self):
         self.init_kwargs = {
-            "depths": [2, 2],
+            "layerwise_depths": [2, 2],
             "image_shape": (32, 32, 3),
             "hidden_dims": [4, 8],
             "num_layers": 2,
-            "blockwise_num_heads": [1, 2],
-            "blockwise_sr_ratios": [8, 4],
+            "layerwise_num_heads": [1, 2],
+            "layerwise_sr_ratios": [8, 4],
             "max_drop_path_rate": 0.1,
-            "patch_sizes": [7, 3],
-            "strides": [4, 2],
+            "layerwise_patch_sizes": [7, 3],
+            "layerwise_strides": [4, 2],
         }
         self.input_size = 32
         self.input_data = np.ones(
diff --git a/keras_hub/src/models/mit/mit_image_classifier_test.py b/keras_hub/src/models/mit/mit_image_classifier_test.py
index 32055c47e..c63a45631 100644
--- a/keras_hub/src/models/mit/mit_image_classifier_test.py
+++ b/keras_hub/src/models/mit/mit_image_classifier_test.py
@@ -12,15 +12,15 @@ def setUp(self):
         self.images = np.ones((2, 32, 32, 3), dtype="float32")
         self.labels = [0, 3]
         self.backbone = MiTBackbone(
-            depths=[2, 2, 2, 2],
+            layerwise_depths=[2, 2, 2, 2],
             image_shape=(32, 32, 3),
             hidden_dims=[4, 8],
             num_layers=2,
-            blockwise_num_heads=[1, 2],
-            blockwise_sr_ratios=[8, 4],
+            layerwise_num_heads=[1, 2],
+            layerwise_sr_ratios=[8, 4],
             max_drop_path_rate=0.1,
-            patch_sizes=[7, 3],
-            strides=[4, 2],
+            layerwise_patch_sizes=[7, 3],
+            layerwise_strides=[4, 2],
         )
         self.init_kwargs = {
             "backbone": self.backbone,
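For reference, a minimal construction sketch with the renamed keyword arguments. The values mirror the small configuration used in `mit_backbone_test.py` above; the expected output shape is the one asserted in that test, and the snippet is illustrative rather than part of the change itself:

```python
import numpy as np
import keras_hub

# Small MiT configuration mirroring the test setUp above; every former
# `depths`/`blockwise_*`/`patch_sizes`/`strides` argument now uses the
# `layerwise_*` naming.
backbone = keras_hub.models.MiTBackbone(
    layerwise_depths=[2, 2],
    image_shape=(32, 32, 3),
    hidden_dims=[4, 8],
    num_layers=2,
    layerwise_num_heads=[1, 2],
    layerwise_sr_ratios=[8, 4],
    max_drop_path_rate=0.1,
    layerwise_patch_sizes=[7, 3],
    layerwise_strides=[4, 2],
)
features = backbone(np.ones((2, 32, 32, 3), dtype="float32"))
print(features.shape)  # (2, 4, 4, 8) per the backbone test expectations.
```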
"official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_ade20k_512/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b4_ade20k_512/2", }, "mit_b5_ade20k_640": { "metadata": { @@ -76,7 +76,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_ade20k_640/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b5_ade20k_640/2", }, "mit_b0_cityscapes_1024": { "metadata": { @@ -87,7 +87,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b0_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b0_cityscapes_1024/2", }, "mit_b1_cityscapes_1024": { "metadata": { @@ -98,7 +98,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b1_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b1_cityscapes_1024/2", }, "mit_b2_cityscapes_1024": { "metadata": { @@ -109,7 +109,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b2_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b2_cityscapes_1024/2", }, "mit_b3_cityscapes_1024": { "metadata": { @@ -120,7 +120,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b3_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b3_cityscapes_1024/2", }, "mit_b4_cityscapes_1024": { "metadata": { @@ -131,7 +131,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b4_cityscapes_1024/2", }, "mit_b5_cityscapes_1024": { "metadata": { @@ -142,7 +142,7 @@ "official_name": "MiT", "path": "mit", }, - "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_cityscapes_1024/1", + "kaggle_handle": "kaggle://kerashub/mix-transformer/keras/mit_b5_cityscapes_1024/2", }, } diff --git a/keras_hub/src/models/mit/mix_transformer_backbone_test.py b/keras_hub/src/models/mit/mix_transformer_backbone_test.py deleted file mode 100644 index 88c58e96a..000000000 --- a/keras_hub/src/models/mit/mix_transformer_backbone_test.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np -import pytest - -from keras_hub.src.models.mit.mit_backbone import MiTBackbone -from keras_hub.src.tests.test_case import TestCase - - -class MiTBackboneTest(TestCase): - def setUp(self): - self.init_kwargs = { - "depths": [2, 2], - "image_shape": (32, 32, 3), - "hidden_dims": [4, 8], - "num_layers": 2, - "blockwise_num_heads": [1, 2], - "blockwise_sr_ratios": [8, 4], - "max_drop_path_rate": 0.1, - "patch_sizes": [7, 3], - "strides": [4, 2], - } - self.input_size = 32 - self.input_data = np.ones( - (2, self.input_size, self.input_size, 3), dtype="float32" - ) - - def test_backbone_basics(self): - self.run_vision_backbone_test( - cls=MiTBackbone, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output_shape=(2, 4, 4, 8), - expected_pyramid_output_keys=["P1", "P2"], - expected_pyramid_image_sizes=[(8, 8), (4, 4)], - run_quantization_check=False, - run_mixed_precision_check=False, - run_data_format_check=False, - ) - - @pytest.mark.large - def test_saved_model(self): - self.run_model_saving_test( - cls=MiTBackbone, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - ) diff --git a/keras_hub/src/models/segformer/segformer_backbone_tests.py 
diff --git a/keras_hub/src/models/segformer/segformer_backbone_tests.py b/keras_hub/src/models/segformer/segformer_backbone_tests.py
index 22133763e..b0c1a4a1f 100644
--- a/keras_hub/src/models/segformer/segformer_backbone_tests.py
+++ b/keras_hub/src/models/segformer/segformer_backbone_tests.py
@@ -9,19 +9,20 @@ class SegFormerTest(TestCase):
     def setUp(self):
-        image_encoder = MiTBackbone(
-            depths=[2, 2],
-            image_shape=(224, 224, 3),
-            hidden_dims=[32, 64],
-            num_layers=2,
-            blockwise_num_heads=[1, 2],
-            blockwise_sr_ratios=[8, 4],
-            max_drop_path_rate=0.1,
-            patch_sizes=[7, 3],
-            strides=[4, 2],
-        )
+        encoder_init_kwargs = {
+            "layerwise_depths": [2, 2],
+            "image_shape": (32, 32, 3),
+            "hidden_dims": [4, 8],
+            "num_layers": 2,
+            "layerwise_num_heads": [1, 2],
+            "layerwise_sr_ratios": [8, 4],
+            "max_drop_path_rate": 0.1,
+            "layerwise_patch_sizes": [7, 3],
+            "layerwise_strides": [4, 2],
+        }
+        image_encoder = MiTBackbone(**encoder_init_kwargs)
         projection_filters = 256
-        self.input_size = 224
+        self.input_size = 32
         self.input_data = ops.ones((2, self.input_size, self.input_size, 3))

         self.init_kwargs = {
@@ -30,11 +31,7 @@ def setUp(self):
         }

     def test_segformer_backbone_construction(self):
-
-        SegFormerBackbone(
-            image_encoder=self.init_kwargs["image_encoder"],
-            projection_filters=self.init_kwargs["projection_filters"],
-        )
+        SegFormerBackbone(**self.init_kwargs)

     @pytest.mark.large
     def test_segformer_call(self):
@@ -43,34 +40,27 @@ def test_segformer_call(self):
         segformer_backbone = SegFormerBackbone(
             image_encoder=self.init_kwargs["image_encoder"],
             projection_filters=self.init_kwargs["projection_filters"],
         )

-        images = np.random.uniform(size=(2, 224, 224, 3))
+        images = np.random.uniform(size=(2, 32, 32, 3))
         segformer_output = segformer_backbone(images)
         segformer_predict = segformer_backbone.predict(images)

-        assert segformer_output.shape == (2, 56, 56, 256)
-        assert segformer_predict.shape == (2, 56, 56, 256)
+        assert segformer_output.shape == (2, 8, 8, 256)
+        assert segformer_predict.shape == (2, 8, 8, 256)

     def test_backbone_basics(self):
-
         self.run_vision_backbone_test(
             cls=SegFormerBackbone,
-            init_kwargs={**self.init_kwargs},
+            init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output_shape=(2, 56, 56, 256),
-        )
-
-    def test_task(self):
-        self.run_task_test(
-            cls=SegFormerBackbone,
-            init_kwargs={**self.init_kwargs},
-            train_data=self.input_data,
-            expected_output_shape=(2, 56, 56, 256),
+            expected_output_shape=(2, 8, 8, 256),
+            run_mixed_precision_check=False,
+            run_quantization_check=False,
         )

     @pytest.mark.large
     def test_saved_model(self):
         self.run_model_saving_test(
             cls=SegFormerBackbone,
-            init_kwargs={**self.init_kwargs},
+            init_kwargs=self.init_kwargs,
             input_data=self.input_data,
         )
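As a sanity check on the updated test configuration, a SegFormer backbone can be built around a small MiT encoder much like the new `setUp` does. This is a sketch mirroring the test values, not the test itself; the printed shape matches the updated assertions above:

```python
import numpy as np
import keras_hub

# Small MiT encoder using the renamed `layerwise_*` arguments, as in the
# updated SegFormer test setUp.
encoder = keras_hub.models.MiTBackbone(
    layerwise_depths=[2, 2],
    image_shape=(32, 32, 3),
    hidden_dims=[4, 8],
    num_layers=2,
    layerwise_num_heads=[1, 2],
    layerwise_sr_ratios=[8, 4],
    max_drop_path_rate=0.1,
    layerwise_patch_sizes=[7, 3],
    layerwise_strides=[4, 2],
)
segformer = keras_hub.models.SegFormerBackbone(
    image_encoder=encoder, projection_filters=256
)
out = segformer(np.ones((2, 32, 32, 3), dtype="float32"))
print(out.shape)  # (2, 8, 8, 256), matching the new test assertions.
```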
"kaggle://keras/segformer/keras/segformer_b2_ade20k_512", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b2_ade20k_512/2", }, "segformer_b3_ade20k_512": { "metadata": { @@ -43,7 +43,7 @@ "official_name": "SegFormerB3", "path": "segformer_b3", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b3_ade20k_512", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b3_ade20k_512/2", }, "segformer_b4_ade20k_512": { "metadata": { @@ -54,7 +54,7 @@ "official_name": "SegFormerB4", "path": "segformer_b4", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b4_ade20k_512", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b4_ade20k_512/2", }, "segformer_b5_ade20k_640": { "metadata": { @@ -65,7 +65,7 @@ "official_name": "SegFormerB5", "path": "segformer_b5", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b5_ade20k_640", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b5_ade20k_640/2", }, "segformer_b0_cityscapes_1024": { "metadata": { @@ -76,7 +76,7 @@ "official_name": "SegFormerB0", "path": "segformer_b0", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b0_cityscapes_1024", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b0_cityscapes_1024/2", }, "segformer_b1_cityscapes_1024": { "metadata": { @@ -87,7 +87,7 @@ "official_name": "SegFormerB1", "path": "segformer_b1", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b1_ade20k_512", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b1_ade20k_512/2", }, "segformer_b2_cityscapes_1024": { "metadata": { @@ -98,7 +98,7 @@ "official_name": "SegFormerB2", "path": "segformer_b2", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b2_cityscapes_1024", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b2_cityscapes_1024/2", }, "segformer_b3_cityscapes_1024": { "metadata": { @@ -109,7 +109,7 @@ "official_name": "SegFormerB3", "path": "segformer_b3", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b3_cityscapes_1024", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b3_cityscapes_1024/2", }, "segformer_b4_cityscapes_1024": { "metadata": { @@ -120,7 +120,7 @@ "official_name": "SegFormerB4", "path": "segformer_b4", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b4_cityscapes_1024", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b4_cityscapes_1024/2", }, "segformer_b5_cityscapes_1024": { "metadata": { @@ -131,6 +131,6 @@ "official_name": "SegFormerB5", "path": "segformer_b5", }, - "kaggle_handle": "kaggle://keras/segformer/keras/segformer_b5_cityscapes_1024", + "kaggle_handle": "kaggle://kerashub/segformer/keras/segformer_b5_cityscapes_1024/2", }, } diff --git a/tools/checkpoint_conversion/convert_mix_transformer.py b/tools/checkpoint_conversion/convert_mix_transformer.py index 6419cc405..3d86d04c1 100644 --- a/tools/checkpoint_conversion/convert_mix_transformer.py +++ b/tools/checkpoint_conversion/convert_mix_transformer.py @@ -27,12 +27,27 @@ MODEL_CONFIGS = { - "B0": {"hidden_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2]}, - "B1": {"hidden_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2]}, - "B2": {"hidden_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3]}, - "B3": {"hidden_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3]}, - "B4": {"hidden_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3]}, - "B5": {"hidden_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3]}, + "B0": {"hidden_dims": [32, 64, 160, 256], 
"layerwise_depths": [2, 2, 2, 2]}, + "B1": { + "hidden_dims": [64, 128, 320, 512], + "layerwise_depths": [2, 2, 2, 2], + }, + "B2": { + "hidden_dims": [64, 128, 320, 512], + "layerwise_depths": [3, 4, 6, 3], + }, + "B3": { + "hidden_dims": [64, 128, 320, 512], + "layerwise_depths": [3, 4, 18, 3], + }, + "B4": { + "hidden_dims": [64, 128, 320, 512], + "layerwise_depths": [3, 8, 27, 3], + }, + "B5": { + "hidden_dims": [64, 128, 320, 512], + "layerwise_depths": [3, 6, 40, 3], + }, } flags.DEFINE_string( @@ -144,20 +159,20 @@ def main(_): model_type = FLAGS.preset.split("_")[0] print("\n-> Instantiating KerasHub Model") keras_mit = keras_hub.models.MiTBackbone( - depths=MODEL_CONFIGS[model_type]["depths"], + layerwise_depths=MODEL_CONFIGS[model_type]["layerwise_depths"], image_shape=(224, 224, 3), hidden_dims=MODEL_CONFIGS[model_type]["hidden_dims"], num_layers=4, - blockwise_num_heads=[1, 2, 5, 8], - blockwise_sr_ratios=[8, 4, 2, 1], + layerwise_num_heads=[1, 2, 5, 8], + layerwise_sr_ratios=[8, 4, 2, 1], max_drop_path_rate=0.1, - patch_sizes=[7, 3, 3, 3], - strides=[4, 2, 2, 2], + layerwise_patch_sizes=[7, 3, 3, 3], + layerwise_strides=[4, 2, 2, 2], ) # Indices for the different patch embeddings and layer norms proj_indices, layer_norm_indices, hierarchical_encoder_indices = ( - get_indices_from_depths(MODEL_CONFIGS[model_type]["depths"]) + get_indices_from_depths(MODEL_CONFIGS[model_type]["layerwise_depths"]) ) print("\n-> Converting weights...")