Add MobileCLIP-B & conversion. Add ViTamin configs. Some refactoring of transformer module.

* Move NLD -> LND transpose into Transformer module forward().
* Started working on CustomTransformer for the MobileCLIP-S0 text tower, but the scope was too large. Leaving CustomTransformer in for potential future use.
mlfoundations · Jun 7, 2024 · 8cf653a · 8cf653a
1 parent 1d7b953
commit 8cf653a
Show file tree

Hide file tree

Showing 20 changed files with 476 additions and 44 deletions.
diff --git a/src/open_clip/convert.py b/src/open_clip/convert.py
@@ -139,11 +139,14 @@ def _convert_openclip_txt(module: TextTransformer, prefix):
 
 
 @torch.no_grad()
-def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict):
-    from timm.models.fastvit import _checkpoint_filter_fn
-
-    def _convert_timm_img(state_dict, prefix='image_encoder.'):
-        timm_state_dict = _checkpoint_filter_fn(state_dict, model.visual.trunk)
+def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict, fastvit = True):
+
+    def _convert_timm_img(state_dict):
+        if fastvit:
+            from timm.models.fastvit import checkpoint_filter_fn
+        else:
+            from timm.models.vision_transformer_hybrid import checkpoint_filter_fn
+        timm_state_dict = checkpoint_filter_fn(state_dict, model.visual.trunk)
         timm_state_dict = {'visual.trunk.' + k: v for k, v in timm_state_dict.items()}
         return timm_state_dict
 
@@ -181,5 +184,7 @@ def convert_state_dict(model: Union[CustomTextCLIP, CLIP], state_dict):
     if 'image_encoder.model.patch_embed.0.rbr_conv.0.conv.weight' in state_dict:
         # Apple MobileCLIP s1 & s2 state_dicts (s0 not currently supported; b is handled by the next check)
         state_dict = convert_mobile_clip_state_dict(model, state_dict)
-
+    if 'image_encoder.model.patch_emb.0.block.conv.weight' in state_dict:
+        # Apple MobileCLIP-B state_dict: image tower uses the timm hybrid-ViT checkpoint filter, not FastViT
+        state_dict = convert_mobile_clip_state_dict(model, state_dict, fastvit=False)
     return state_dict
diff --git a/src/open_clip/model.py b/src/open_clip/model.py
@@ -272,9 +272,7 @@ def encode_text(self, text, normalize: bool = False):
         x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]
 
         x = x + self.positional_embedding.to(cast_dtype)
-        x = x.permute(1, 0, 2)  # NLD -> LND
         x = self.transformer(x, attn_mask=self.attn_mask)
-        x = x.permute(1, 0, 2)  # LND -> NLD
         x = self.ln_final(x)  # [batch_size, n_ctx, transformer.width]
         x, _ = text_global_pool(x, text, self.text_pool_type)
         if self.text_projection is not None:

diff --git a/src/open_clip/model_configs/MobileCLIP-B.json b/src/open_clip/model_configs/MobileCLIP-B.json
@@ -0,0 +1,21 @@
+{
+    "embed_dim": 512,
+    "vision_cfg": {
+        "timm_model_name": "vit_base_mci_224",
+        "timm_model_pretrained": false,
+        "timm_pool": "token",
+        "timm_proj": null,
+        "timm_drop": 0.0,
+        "timm_drop_path": 0.0,
+        "image_size": 224
+    },
+    "text_cfg": {
+        "context_length": 77,
+        "vocab_size": 49408,
+        "width": 512,
+        "heads": 8,
+        "layers": 12,
+        "no_causal_mask": false
+    },
+    "custom_text": true
+}
diff --git a/...pen_clip/model_configs/mobileclip_s1.json → ...pen_clip/model_configs/MobileCLIP-S1.json b/...pen_clip/model_configs/mobileclip_s1.json → ...pen_clip/model_configs/MobileCLIP-S1.json
diff --git a/...pen_clip/model_configs/mobileclip_s2.json → ...pen_clip/model_configs/MobileCLIP-S2.json b/...pen_clip/model_configs/mobileclip_s2.json → ...pen_clip/model_configs/MobileCLIP-S2.json
diff --git a/src/open_clip/model_configs/ViTamin-B-LTT.json b/src/open_clip/model_configs/ViTamin-B-LTT.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_base_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-B.json b/src/open_clip/model_configs/ViTamin-B.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 512,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_base_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 512,
+      "heads": 8,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L-256.json b/src/open_clip/model_configs/ViTamin-L-256.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large_256",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 256
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L-336.json b/src/open_clip/model_configs/ViTamin-L-336.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large_336",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 336
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L.json b/src/open_clip/model_configs/ViTamin-L.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L2-256.json b/src/open_clip/model_configs/ViTamin-L2-256.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1024,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large2_256",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 256
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1024,
+      "heads": 16,
+      "layers": 24
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L2-336.json b/src/open_clip/model_configs/ViTamin-L2-336.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1024,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large2_336",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 336
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1024,
+      "heads": 16,
+      "layers": 24
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-L2.json b/src/open_clip/model_configs/ViTamin-L2.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1024,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_large2_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1024,
+      "heads": 16,
+      "layers": 24
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-S-LTT.json b/src/open_clip/model_configs/ViTamin-S-LTT.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_small_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-S.json b/src/open_clip/model_configs/ViTamin-S.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 384,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_small_224",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 224
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 384,
+      "heads": 6,
+      "layers": 12
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-XL-256.json b/src/open_clip/model_configs/ViTamin-XL-256.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1152,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_xlarge_256",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 256
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1152,
+      "heads": 16,
+      "layers": 27
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-XL-336.json b/src/open_clip/model_configs/ViTamin-XL-336.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1152,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_xlarge_336",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 336
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1152,
+      "heads": 16,
+      "layers": 27
+    },
+    "custom_text": true
+}
diff --git a/src/open_clip/model_configs/ViTamin-XL-384.json b/src/open_clip/model_configs/ViTamin-XL-384.json
@@ -0,0 +1,20 @@
+{
+    "embed_dim": 1152,
+    "vision_cfg": {
+      "timm_model_name": "vitamin_xlarge_384",
+      "timm_model_pretrained": false,
+      "timm_pool": "",
+      "timm_proj": "linear",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 384
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 1152,
+      "heads": 16,
+      "layers": 27
+    },
+    "custom_text": true
+}