fix: Fix token type ids not supported (#77)
* Fixed the error where token type ids are not supported by every model

* Updated test

* Updated test

* Changed to use inspect
Pringled authored Oct 11, 2024
1 parent 2ce3c97 commit 84570b6
Showing 3 changed files with 22 additions and 10 deletions.
24 changes: 16 additions & 8 deletions model2vec/distill/inference.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import inspect
 import logging
 from pathlib import Path
 from typing import Protocol
@@ -127,20 +128,27 @@ def create_output_embeddings_from_model_name(
     for batch_idx in tqdm(range(0, len(stacked), _DEFAULT_BATCH_SIZE)):
         batch = stacked[batch_idx : batch_idx + _DEFAULT_BATCH_SIZE].to(model.device)
         with torch.no_grad():
-            # NOTE: we create these masks because nomic embed requires them.
-            # Normally, we could set them to None
-            token_type_ids = torch.zeros_like(batch)
             attention_mask = torch.ones_like(batch)
-            encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(
-                input_ids=batch.to(device), attention_mask=attention_mask, token_type_ids=token_type_ids
-            )
-            out: torch.Tensor = encoded.last_hidden_state
+            # Prepare model inputs
+            model_inputs = {"input_ids": batch.to(device), "attention_mask": attention_mask}
+
+            # Add token_type_ids only if the model supports it
+            if "token_type_ids" in inspect.getfullargspec(model.forward).args:
+                model_inputs["token_type_ids"] = torch.zeros_like(batch)
+
+            # Perform the forward pass
+            encoded_output: BaseModelOutputWithPoolingAndCrossAttentions = model(**model_inputs)
+            out: torch.Tensor = encoded_output.last_hidden_state
             # NOTE: If the dtype is bfloat16, we convert to float32,
             # because numpy does not support bfloat16
             # See here: https://github.com/numpy/numpy/issues/19808
             if out.dtype == torch.bfloat16:
                 out = out.float()
-            intermediate_weights.append(out[:, 1].cpu().numpy())
+
+            # Add the output to the intermediate weights
+            intermediate_weights.append(out[:, 1].detach().cpu().numpy())
+
+    # Concatenate the intermediate weights
     out_weights = np.concatenate(intermediate_weights)

     return tokenizer.convert_ids_to_tokens(ids), out_weights
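
The heart of the fix: instead of always passing token_type_ids (which models such as DistilBERT do not accept), the forward signature is checked first with inspect. A minimal standalone sketch of the same check, with a hypothetical helper name and toy modules that are not part of the repo:

    import inspect

    import torch


    def build_model_inputs(model: torch.nn.Module, batch: torch.Tensor) -> dict:
        # Hypothetical helper: pass token_type_ids only when forward() accepts it.
        inputs = {"input_ids": batch, "attention_mask": torch.ones_like(batch)}
        if "token_type_ids" in inspect.getfullargspec(model.forward).args:
            inputs["token_type_ids"] = torch.zeros_like(batch)
        return inputs


    class WithTokenTypes(torch.nn.Module):
        def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
            return input_ids


    class WithoutTokenTypes(torch.nn.Module):
        def forward(self, input_ids=None, attention_mask=None):
            return input_ids


    batch = torch.ones(2, 4, dtype=torch.long)
    print(sorted(build_model_inputs(WithTokenTypes(), batch)))    # includes 'token_type_ids'
    print(sorted(build_model_inputs(WithoutTokenTypes(), batch))) # no 'token_type_ids'

An alternative would be inspect.signature(model.forward).parameters, which also covers keyword-only arguments; getfullargspec(...).args handles the positional-or-keyword parameters that transformers models use.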
6 changes: 5 additions & 1 deletion tests/conftest.py
@@ -44,7 +44,7 @@ def to(self, device: str) -> MockPreTrainedModel:
             self.device = device
             return self

-        def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        def forward(self, *args: Any, **kwargs: Any) -> Any:
             # Simulate a last_hidden_state output for a transformer model
             batch_size, seq_length = kwargs["input_ids"].shape
             # Return a tensor of shape (batch_size, seq_length, 768)
@@ -56,6 +56,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
                 },
             )

+        def __call__(self, *args: Any, **kwargs: Any) -> Any:
+            # Simply call the forward method to simulate the same behavior as transformers models
+            return self.forward(*args, **kwargs)
+
     return MockPreTrainedModel()
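
The mock change mirrors what the fix relies on: inference now inspects model.forward, so a mock that only defines __call__ would expose nothing to inspect. Real transformers models inherit from torch.nn.Module, whose __call__ dispatches to forward; the mock reproduces that shape by hand. A tiny illustration (class name hypothetical):

    import inspect


    class TinyMock:
        def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
            return input_ids

        def __call__(self, *args, **kwargs):
            # Delegate to forward, as torch.nn.Module.__call__ does.
            return self.forward(*args, **kwargs)


    print(inspect.getfullargspec(TinyMock().forward).args)
    # ['self', 'input_ids', 'attention_mask', 'token_type_ids']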


2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default.
