Merge branch 'main' of https://github.com/MinishLab/model2vec into add-deduplication-tutorial
Pringled committed Oct 11, 2024
2 parents cee0f34 + 84570b6 commit 803d794
Showing 6 changed files with 38 additions and 13 deletions.
9 changes: 8 additions & 1 deletion model2vec/distill/distillation.py
@@ -3,7 +3,6 @@

import numpy as np
from huggingface_hub import model_info
from huggingface_hub.utils._errors import RepositoryNotFoundError
from sklearn.decomposition import PCA
from tokenizers.models import BPE, Unigram
from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerFast
@@ -16,6 +15,14 @@
from model2vec.distill.utils import select_optimal_device
from model2vec.model import StaticModel

try:
# For huggingface_hub>=0.25.0
from huggingface_hub.errors import RepositoryNotFoundError
except ImportError:
# For huggingface_hub<0.25.0
from huggingface_hub.utils._errors import RepositoryNotFoundError


logger = logging.getLogger(__name__)


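For context, the fallback import above can be exercised as in the following minimal sketch; the model_exists helper and the repo id are hypothetical, added only to show where RepositoryNotFoundError ends up being caught:

try:
    # huggingface_hub>=0.25.0 exposes the exception from the public errors module
    from huggingface_hub.errors import RepositoryNotFoundError
except ImportError:
    # Older releases only provide it in the private utils module
    from huggingface_hub.utils._errors import RepositoryNotFoundError

from huggingface_hub import model_info


def model_exists(repo_id: str) -> bool:
    """Return True if the repository exists on the Hugging Face Hub."""
    try:
        model_info(repo_id)  # raises RepositoryNotFoundError for unknown repos
        return True
    except RepositoryNotFoundError:
        return False


print(model_exists("this-org/does-not-exist"))  # hypothetical repo id, prints False
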
24 changes: 16 additions & 8 deletions model2vec/distill/inference.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import inspect
import logging
from pathlib import Path
from typing import Protocol
@@ -127,20 +128,27 @@ def create_output_embeddings_from_model_name(
for batch_idx in tqdm(range(0, len(stacked), _DEFAULT_BATCH_SIZE)):
batch = stacked[batch_idx : batch_idx + _DEFAULT_BATCH_SIZE].to(model.device)
with torch.no_grad():
# NOTE: we create these masks because nomic embed requires them.
# Normally, we could set them to None
token_type_ids = torch.zeros_like(batch)
attention_mask = torch.ones_like(batch)
encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(
input_ids=batch.to(device), attention_mask=attention_mask, token_type_ids=token_type_ids
)
out: torch.Tensor = encoded.last_hidden_state
# Prepare model inputs
model_inputs = {"input_ids": batch.to(device), "attention_mask": attention_mask}

# Add token_type_ids only if the model supports it
if "token_type_ids" in inspect.getfullargspec(model.forward).args:
model_inputs["token_type_ids"] = torch.zeros_like(batch)

# Perform the forward pass
encoded_output: BaseModelOutputWithPoolingAndCrossAttentions = model(**model_inputs)
out: torch.Tensor = encoded_output.last_hidden_state
# NOTE: If the dtype is bfloat16, we convert to float32,
# because numpy does not support bfloat16
# See here: https://github.com/numpy/numpy/issues/19808
if out.dtype == torch.bfloat16:
out = out.float()
intermediate_weights.append(out[:, 1].cpu().numpy())

# Add the output to the intermediate weights
intermediate_weights.append(out[:, 1].detach().cpu().numpy())

# Concatenate the intermediate weights
out_weights = np.concatenate(intermediate_weights)

return tokenizer.convert_ids_to_tokens(ids), out_weights
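
The signature check introduced above can be illustrated in isolation. This is a standalone sketch, not project code; MiniModel is a hypothetical module whose forward does not accept token_type_ids, so that key is simply left out of the inputs:

import inspect

import torch
from torch import nn


class MiniModel(nn.Module):
    """Hypothetical model: forward takes no token_type_ids argument."""

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        return input_ids.float()


model = MiniModel()
batch = torch.ones(2, 4, dtype=torch.long)

model_inputs = {"input_ids": batch, "attention_mask": torch.ones_like(batch)}
# Pass token_type_ids only if the forward signature declares it
if "token_type_ids" in inspect.getfullargspec(model.forward).args:
    model_inputs["token_type_ids"] = torch.zeros_like(batch)

output = model(**model_inputs)
print(sorted(model_inputs))  # ['attention_mask', 'input_ids'] -- token_type_ids omitted
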
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ name = "model2vec"
description = "Distill a Small Fast Model from any Sentence Transformer"
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
version = "0.2.3"
version = "0.2.4"
requires-python = ">=3.10"
authors = [{ name = "Stéphan Tulkens", email = "[email protected]"}, {name = "Thomas van Dongen", email = "[email protected]"}]

6 changes: 5 additions & 1 deletion tests/conftest.py
@@ -44,7 +44,7 @@ def to(self, device: str) -> MockPreTrainedModel:
self.device = device
return self

def __call__(self, *args: Any, **kwargs: Any) -> Any:
def forward(self, *args: Any, **kwargs: Any) -> Any:
# Simulate a last_hidden_state output for a transformer model
batch_size, seq_length = kwargs["input_ids"].shape
# Return a tensor of shape (batch_size, seq_length, 768)
@@ -56,6 +56,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
},
)

def __call__(self, *args: Any, **kwargs: Any) -> Any:
# Simply call the forward method to simulate the same behavior as transformers models
return self.forward(*args, **kwargs)

return MockPreTrainedModel()


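The split into forward plus a delegating __call__ mirrors how transformers models behave: the distillation code now inspects model.forward, so the mock must expose a real forward signature rather than only __call__. A minimal sketch of the same pattern, independent of the project's fixture (MockModel and the 768-dimensional output are illustrative):

import inspect
from types import SimpleNamespace

import torch


class MockModel:
    """Illustrative mock: __call__ delegates to forward so that
    inspect.getfullargspec(model.forward) sees a real signature."""

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> SimpleNamespace:
        batch_size, seq_length = input_ids.shape
        # Fake a last_hidden_state of shape (batch_size, seq_length, 768)
        return SimpleNamespace(last_hidden_state=torch.rand(batch_size, seq_length, 768))

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)


model = MockModel()
print("token_type_ids" in inspect.getfullargspec(model.forward).args)  # False
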
8 changes: 7 additions & 1 deletion tests/test_distillation.py
@@ -4,13 +4,19 @@

import numpy as np
import pytest
from huggingface_hub.utils._errors import RepositoryNotFoundError
from pytest import LogCaptureFixture
from transformers import AutoModel, BertTokenizerFast

from model2vec.distill.distillation import _clean_vocabulary, _post_process_embeddings, distill, distill_from_model
from model2vec.model import StaticModel

try:
# For huggingface_hub>=0.25.0
from huggingface_hub.errors import RepositoryNotFoundError
except ImportError:
# For huggingface_hub<0.25.0
from huggingface_hub.utils._errors import RepositoryNotFoundError

rng = np.random.default_rng()


2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default.
