
Commit

Fixed merge conflict
Pringled committed Sep 30, 2024
2 parents 35d929f + 9a887a3 commit 890095c
Showing 8 changed files with 93 additions and 9 deletions.
21 changes: 18 additions & 3 deletions README.md
@@ -1,5 +1,13 @@

<div align="center">
<picture>
<img width="50%" alt="Model2Vec logo" src="assets/images/logo.png">
</picture>
</div>

<div align="center">
<h1>Model2Vec: Distill a Small Fast Model from any Sentence Transformer</h1>
<h2>Distill a Small Fast Model from any Sentence Transformer</h2>
</div>

<div align="center">
@@ -16,18 +24,25 @@
<a href="https://pypi.org/project/model2vec/"><img src="https://img.shields.io/pypi/pyversions/model2vec" alt="Supported Python versions"></a>
<a href="https://pepy.tech/project/model2vec">
<img src="https://static.pepy.tech/badge/model2vec" alt="Downloads">
<<<<<<< HEAD
</a>
<a href="https://app.codecov.io/gh/MinishLab/model2vec">
<img src="https://codecov.io/gh/MinishLab/model2vec/graph/badge.svg?token=21TWJ6B5ET" alt="Downloads">
</a>
=======
</a>
>>>>>>> 9a887a3a464ae987213db59e2e65c5dccd5df246
<a href="https://github.com/MinishLab/model2vec/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License - MIT"></a>
</h2>
</div>

<p align="center">
<img src="assets/images/model2vec_model_diagram.png" alt="Model2Vec">
<img width="75%", src="assets/images/model2vec_model_diagram.png" alt="Model2Vec">
</p>

Model2Vec is a technique to turn any sentence transformer into a really small fast model, reducing model size by 15x and making the models up to 500x faster, with a small drop in performance. See our results [here](#classification-and-speed-benchmarks), or dive in to see how it works.

## Table of Contents
- [Quickstart](#quickstart)
- [What is Model2Vec?](#what-is-model2vec)
@@ -282,7 +297,7 @@ MIT

If you use Model2Vec in your research, please cite the following:
```bibtex
@software{minishlab2024word2vec,
@software{minishlab2024model2vec,
author = {Stephan Tulkens and Thomas van Dongen},
title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
year = {2024},
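For context, a minimal usage sketch of the `StaticEmbedder` class this commit touches. The import path is assumed from the `model2vec/model.py` layout, the checkpoint path is a placeholder, and `encode` is assumed to accept a list of sentences, like its `tokenize` helper does:

```python
from model2vec.model import StaticEmbedder  # import path assumed from the file layout in this diff

# Placeholder path or Hub repo id of an already-distilled model.
embedder = StaticEmbedder.from_pretrained("path/to/a-distilled-model")

# Encode a few sentences with the small static model.
embeddings = embedder.encode(
    ["Model2Vec makes sentence transformers tiny.", "It trades a little accuracy for a lot of speed."]
)
print(embeddings.shape)  # assumed to be an array of shape (n_sentences, dim)
```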
Binary file added assets/images/logo.png
Binary file added assets/images/tutorial_ezlo.png
11 changes: 9 additions & 2 deletions model2vec/model.py
@@ -56,6 +56,7 @@ def __init__(
        else:
            self.unk_token_id = None

        self.median_token_length = int(np.median([len(token) for token in self.tokens]))
        self.config = config
        self.base_model_name = base_model_name
        self.language = language
@@ -123,6 +124,10 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
        :param max_length: The maximum length of the sentence.
        :return: The tokens.
        """
        if max_length is not None:
            m = max_length * self.median_token_length
            sentences = [sentence[:m] for sentence in sentences]

        encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

@@ -154,9 +159,11 @@ def from_pretrained(
        :param token: The huggingface token to use.
        :return: A StaticEmbedder
        """
        embeddings, tokenizer, config = load_pretrained(path, token=token)
        embeddings, tokenizer, config, metadata = load_pretrained(path, token=token)

        return cls(embeddings, tokenizer, config)
        return cls(
            embeddings, tokenizer, config, base_model_name=metadata.get("base_model"), language=metadata.get("language")
        )

    def encode(
        self,
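The `tokenize` change above pre-truncates raw text at the character level before it reaches the tokenizer, using the median token length computed in `__init__`. A self-contained sketch of that heuristic (the token list is invented for illustration):

```python
import numpy as np

# Illustrative vocabulary; in the model this is self.tokens.
tokens = ["the", "quick", "brown", "fox", "##es"]
median_token_length = int(np.median([len(token) for token in tokens]))  # 4 here


def pre_truncate(sentences: list[str], max_length: int | None) -> list[str]:
    """Cut each sentence to max_length * median_token_length characters.

    This keeps the tokenizer from processing text that would be thrown away
    after truncation to max_length tokens anyway.
    """
    if max_length is None:
        return sentences
    m = max_length * median_token_length
    return [sentence[:m] for sentence in sentences]


print(pre_truncate(["a very long sentence " * 100], max_length=8))  # at most 32 characters survive
```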
29 changes: 26 additions & 3 deletions model2vec/utils.py
@@ -102,7 +102,7 @@ def _create_model_card(

def load_pretrained(
    folder_or_repo_path: str | Path, token: str | None = None
) -> tuple[np.ndarray, Tokenizer, dict[str, Any]]:
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
    """
    Loads a pretrained model from a folder.
@@ -111,7 +111,7 @@ def load_pretrained(
    - If the local path is not found, we will attempt to load from the huggingface hub.
    :param token: The huggingface token to use.
    :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
    :return: The embeddings, tokenizer, and config.
    :return: The embeddings, tokenizer, config, and metadata.
    """
    folder_or_repo_path = Path(folder_or_repo_path)
@@ -133,6 +133,10 @@
        if not tokenizer_path.exists():
            raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")

        # README is optional, so this is a bit finicky.
        readme_path = folder_or_repo_path / "README.md"
        metadata = _get_metadata_from_readme(readme_path)

    else:
        logger.info("Folder does not exist locally, attempting to use huggingface hub.")
        try:
@@ -148,6 +152,13 @@
            # Raise original exception.
            raise e

        try:
            readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
            metadata = _get_metadata_from_readme(Path(readme_path))
        except huggingface_hub.utils.EntryNotFoundError:
            logger.info("No README found in the model folder. No model card loaded.")
            metadata = {}

        config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "config.json", token=token)
        tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "tokenizer.json", token=token)

@@ -162,7 +173,19 @@
f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
)

return embeddings, tokenizer, config
return embeddings, tokenizer, config, metadata


def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
"""Get metadata from a README file."""
if not readme_path.exists():
logger.info(f"README file not found in {readme_path}. No model card loaded.")
return {}
model_card = ModelCard.load(readme_path)
data: dict[str, Any] = model_card.data.to_dict()
if not data:
logger.info("File README.md exists, but was empty. No model card loaded.")
return data


def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: str | None) -> None:
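The new metadata path boils down to parsing the YAML front matter of a model-card README into a plain dict, from which `from_pretrained` picks up `base_model` and `language`. A sketch of that round trip, assuming `ModelCard` here is `huggingface_hub.ModelCard` (consistent with the `model_card.data.to_dict()` call above); the front-matter values are just examples:

```python
from pathlib import Path
from tempfile import NamedTemporaryFile

from huggingface_hub import ModelCard

with NamedTemporaryFile(suffix=".md") as f:
    # Example front matter; base_model and language are the keys from_pretrained reads.
    f.write(b"---\nbase_model: sentence-transformers/all-MiniLM-L6-v2\nlanguage: en\n---\n")
    f.flush()
    card = ModelCard.load(Path(f.name))
    metadata = card.data.to_dict()

print(metadata.get("base_model"), metadata.get("language"))
```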
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ name = "model2vec"
description = "Distill a Small Fast Model from any Sentence Transformer"
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
version = "0.1.2"
version = "0.2.0"
requires-python = ">=3.10"
authors = [{ name = "Stéphan Tulkens", email = "[email protected]"}, {name = "Thomas van Dongen", email = "[email protected]"}]

25 changes: 25 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,25 @@
from pathlib import Path
from tempfile import NamedTemporaryFile

from model2vec.utils import _get_metadata_from_readme


def test__get_metadata_from_readme_not_exists() -> None:
    """Test getting metadata from a README."""
    assert _get_metadata_from_readme(Path("zzz")) == {}


def test__get_metadata_from_readme_mocked_file() -> None:
    """Test getting metadata from a README."""
    with NamedTemporaryFile() as f:
        f.write(b"---\nkey: value\n---\n")
        f.flush()
        assert _get_metadata_from_readme(Path(f.name))["key"] == "value"


def test__get_metadata_from_readme_mocked_file_keys() -> None:
    """Test getting metadata from a README."""
    with NamedTemporaryFile() as f:
        f.write(b"")
        f.flush()
        assert set(_get_metadata_from_readme(Path(f.name))) == set()
14 changes: 14 additions & 0 deletions tutorials/README.md
@@ -0,0 +1,14 @@
<div align="center">
<picture>
<img width="75%", alt="Tutorials" src="../assets/images/tutorial_ezlo.png">
</picture>
</div>

# Tutorials

This is a list of all our tutorials. They are all self-contained IPython notebooks.

| | what? | Link |
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
| **Recipe search** | Learn how to do lightning-fast semantic search by distilling a small model. Compare a really tiny model to a larger one with a better vocabulary. Learn what Fattoush is (delicious). | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/minishlab/model2vec/blob/master/tutorials/recipe_search.ipynb) |
