diff --git a/README.md b/README.md index c2b5904..740691b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,13 @@ + +
+ + Model2Vec logo + + +
+
-

Model2Vec: Distill a Small Fast Model from any Sentence Transformer

+

Distill a Small Fast Model from any Sentence Transformer

@@ -16,18 +24,25 @@ Supported Python versions Downloads +Downloads + + License - MIT
+

- Model2Vec + Model2Vec

+Model2Vec is a technique to turn any sentence transformer into a really small fast model, reducing model size by 15x and making the models up to 500x faster, with a small drop in performance. See our results [here](#classification-and-speed-benchmarks), or dive in to see how it works. + ## Table of Contents - [Quickstart](#quickstart) - [What is Model2Vec?](#what-is-model2vec) @@ -282,7 +297,7 @@ MIT If you use Model2Vec in your research, please cite the following: ```bibtex -@software{minishlab2024word2vec, +@software{minishlab2024model2vec, authors = {Stephan Tulkens, Thomas van Dongen}, title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model}, year = {2024}, diff --git a/assets/images/logo.png b/assets/images/logo.png new file mode 100644 index 0000000..6ffa6e1 Binary files /dev/null and b/assets/images/logo.png differ diff --git a/assets/images/tutorial_ezlo.png b/assets/images/tutorial_ezlo.png new file mode 100644 index 0000000..1e3e174 Binary files /dev/null and b/assets/images/tutorial_ezlo.png differ diff --git a/model2vec/model.py b/model2vec/model.py index 0764d37..44e146b 100644 --- a/model2vec/model.py +++ b/model2vec/model.py @@ -56,6 +56,7 @@ def __init__( else: self.unk_token_id = None + self.median_token_length = int(np.median([len(token) for token in self.tokens])) self.config = config self.base_model_name = base_model_name self.language = language @@ -123,6 +124,10 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple :param max_length: The maximum length of the sentence. :return: The tokens. """ + if max_length is not None: + m = max_length * self.median_token_length + sentences = [sentence[:m] for sentence in sentences] + encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False) encodings_ids = [encoding.ids for encoding in encodings] @@ -154,9 +159,11 @@ def from_pretrained( :param token: The huggingface token to use. 
:return: A StaticEmbedder """ - embeddings, tokenizer, config = load_pretrained(path, token=token) + embeddings, tokenizer, config, metadata = load_pretrained(path, token=token) - return cls(embeddings, tokenizer, config) + return cls( + embeddings, tokenizer, config, base_model_name=metadata.get("base_model"), language=metadata.get("language") + ) def encode( self, diff --git a/model2vec/utils.py b/model2vec/utils.py index d9c2875..7ec8ee7 100644 --- a/model2vec/utils.py +++ b/model2vec/utils.py @@ -102,7 +102,7 @@ def _create_model_card( def load_pretrained( folder_or_repo_path: str | Path, token: str | None = None -) -> tuple[np.ndarray, Tokenizer, dict[str, Any]]: +) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]: """ Loads a pretrained model from a folder. @@ -111,7 +111,7 @@ def load_pretrained( - If the local path is not found, we will attempt to load from the huggingface hub. :param token: The huggingface token to use. :raises: FileNotFoundError if the folder exists, but the file does not exist locally. - :return: The embeddings, tokenizer, and config. + :return: The embeddings, tokenizer, config, and metadata. """ folder_or_repo_path = Path(folder_or_repo_path) @@ -133,6 +133,10 @@ def load_pretrained( if not tokenizer_path.exists(): raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}") + # README is optional, so this is a bit finicky. + readme_path = folder_or_repo_path / "README.md" + metadata = _get_metadata_from_readme(readme_path) + else: logger.info("Folder does not exist locally, attempting to use huggingface hub.") try: @@ -148,6 +152,13 @@ def load_pretrained( # Raise original exception. raise e + try: + readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token) + metadata = _get_metadata_from_readme(Path(readme_path)) + except huggingface_hub.utils.EntryNotFoundError: + logger.info("No README found in the model folder. 
No model card loaded.") + metadata = {} + config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "config.json", token=token) tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "tokenizer.json", token=token) @@ -162,7 +173,19 @@ def load_pretrained( f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`" ) - return embeddings, tokenizer, config + return embeddings, tokenizer, config, metadata + + +def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]: + """Get metadata from a README file.""" + if not readme_path.exists(): + logger.info(f"README file not found in {readme_path}. No model card loaded.") + return {} + model_card = ModelCard.load(readme_path) + data: dict[str, Any] = model_card.data.to_dict() + if not data: + logger.info("File README.md exists, but was empty. No model card loaded.") + return data def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: str | None) -> None: diff --git a/pyproject.toml b/pyproject.toml index f636139..6d02a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "model2vec" description = "Distill a Small Fast Model from any Sentence Transformer" readme = { file = "README.md", content-type = "text/markdown" } license = { file = "LICENSE" } -version = "0.1.2" +version = "0.2.0" requires-python = ">=3.10" authors = [{ name = "Stéphan Tulkens", email = "stephantul@gmail.com"}, {name = "Thomas van Dongen", email = "thomas123@live.nl"}] diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..7ac4c70 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,25 @@ +from pathlib import Path +from tempfile import NamedTemporaryFile + +from model2vec.utils import _get_metadata_from_readme + + +def test__get_metadata_from_readme_not_exists() -> None: + """Test getting metadata from a README.""" + assert _get_metadata_from_readme(Path("zzz")) == {} + + 
+def test__get_metadata_from_readme_mocked_file() -> None: + """Test getting metadata from a README.""" + with NamedTemporaryFile() as f: + f.write(b"---\nkey: value\n---\n") + f.flush() + assert _get_metadata_from_readme(Path(f.name))["key"] == "value" + + +def test__get_metadata_from_readme_mocked_file_keys() -> None: + """Test getting metadata from a README.""" + with NamedTemporaryFile() as f: + f.write(b"") + f.flush() + assert set(_get_metadata_from_readme(Path(f.name))) == set() diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000..642935b --- /dev/null +++ b/tutorials/README.md @@ -0,0 +1,14 @@ +
+ + Tutorials + + +
+ +# Tutorials + +This is a list of all our tutorials. They are all self-contained ipython notebooks. + +| | what? | Link | +|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| +| **Recipe search** | Learn how to do lightning-fast semantic search by distilling a small model. Compare a really tiny model to a larger one with a better vocabulary. Learn what Fattoush is (delicious). | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/minishlab/model2vec/blob/master/tutorials/recipe_search.ipynb) |