
Commit

Fixed merge conflict
Pringled committed Sep 30, 2024
2 parents 35d929f + 9a887a3 commit 890095c
Showing 8 changed files with 93 additions and 9 deletions.
21 changes: 18 additions & 3 deletions README.md
@@ -1,5 +1,13 @@

<div align="center">
<picture>
<img width="50%" alt="Model2Vec logo" src="assets/images/logo.png">
</picture>
</div>

<div align="center">
<h1>Model2Vec: Distill a Small Fast Model from any Sentence Transformer</h1>
<h2>Distill a Small Fast Model from any Sentence Transformer</h2>
</div>

<div align="center">
@@ -16,18 +24,25 @@
<a href="https://pypi.org/project/model2vec/"><img src="https://img.shields.io/pypi/pyversions/model2vec" alt="Supported Python versions"></a>
<a href="https://pepy.tech/project/model2vec">
<img src="https://static.pepy.tech/badge/model2vec" alt="Downloads">
<<<<<<< HEAD
</a>
<a href="https://app.codecov.io/gh/MinishLab/model2vec">
<img src="https://codecov.io/gh/MinishLab/model2vec/graph/badge.svg?token=21TWJ6B5ET" alt="Downloads">
</a>
=======
</a>
>>>>>>> 9a887a3a464ae987213db59e2e65c5dccd5df246
<a href="https://github.com/MinishLab/model2vec/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License - MIT"></a>
</h2>
</div>

<p align="center">
<img src="assets/images/model2vec_model_diagram.png" alt="Model2Vec">
<img width="75%", src="assets/images/model2vec_model_diagram.png" alt="Model2Vec">
</p>

Model2Vec is a technique to turn any sentence transformer into a really small fast model, reducing model size by 15x and making the models up to 500x faster, with a small drop in performance. See our results [here](#classification-and-speed-benchmarks), or dive in to see how it works.

## Table of Contents
- [Quickstart](#quickstart)
- [What is Model2Vec?](#what-is-model2vec)
@@ -282,7 +297,7 @@ MIT

If you use Model2Vec in your research, please cite the following:
```bibtex
@software{minishlab2024word2vec,
@software{minishlab2024model2vec,
author = {Stephan Tulkens and Thomas van Dongen},
title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
year = {2024},
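For context, a minimal usage sketch of the `StaticEmbedder` class this commit touches. The import path is assumed from the `model2vec/model.py` layout, the checkpoint path is a placeholder, and `encode` is assumed to accept a list of sentences, like its `tokenize` helper does:

```python
from model2vec.model import StaticEmbedder  # import path assumed from the file layout in this diff

# Placeholder path or Hub repo id of an already-distilled model.
embedder = StaticEmbedder.from_pretrained("path/to/a-distilled-model")

# Encode a few sentences with the small static model.
embeddings = embedder.encode(
    ["Model2Vec makes sentence transformers tiny.", "It trades a little accuracy for a lot of speed."]
)
print(embeddings.shape)  # assumed to be an array of shape (n_sentences, dim)
```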
Binary file added assets/images/logo.png
Binary file added assets/images/tutorial_ezlo.png
11 changes: 9 additions & 2 deletions model2vec/model.py
@@ -56,6 +56,7 @@ def __init__(
        else:
            self.unk_token_id = None

        self.median_token_length = int(np.median([len(token) for token in self.tokens]))
        self.config = config
        self.base_model_name = base_model_name
        self.language = language
@@ -123,6 +124,10 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
        :param max_length: The maximum length of the sentence.
        :return: The tokens.
        """
        if max_length is not None:
            m = max_length * self.median_token_length
            sentences = [sentence[:m] for sentence in sentences]

        encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

@@ -154,9 +159,11 @@ def from_pretrained(
        :param token: The huggingface token to use.
        :return: A StaticEmbedder
        """
        embeddings, tokenizer, config = load_pretrained(path, token=token)
        embeddings, tokenizer, config, metadata = load_pretrained(path, token=token)

        return cls(embeddings, tokenizer, config)
        return cls(
            embeddings, tokenizer, config, base_model_name=metadata.get("base_model"), language=metadata.get("language")
        )

    def encode(
        self,
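The `tokenize` change above pre-truncates raw text at the character level before it reaches the tokenizer, using the median token length computed in `__init__`. A self-contained sketch of that heuristic (the token list is invented for illustration):

```python
import numpy as np

# Illustrative vocabulary; in the model this is self.tokens.
tokens = ["the", "quick", "brown", "fox", "##es"]
median_token_length = int(np.median([len(token) for token in tokens]))  # 4 here


def pre_truncate(sentences: list[str], max_length: int | None) -> list[str]:
    """Cut each sentence to max_length * median_token_length characters.

    This keeps the tokenizer from processing text that would be thrown away
    after truncation to max_length tokens anyway.
    """
    if max_length is None:
        return sentences
    m = max_length * median_token_length
    return [sentence[:m] for sentence in sentences]


print(pre_truncate(["a very long sentence " * 100], max_length=8))  # at most 32 characters survive
```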
29 changes: 26 additions & 3 deletions model2vec/utils.py
@@ -102,7 +102,7 @@ def _create_model_card(

def load_pretrained(
    folder_or_repo_path: str | Path, token: str | None = None
) -> tuple[np.ndarray, Tokenizer, dict[str, Any]]:
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
    """
    Loads a pretrained model from a folder.
@@ -111,7 +111,7 @@ def load_pretrained(
    - If the local path is not found, we will attempt to load from the huggingface hub.
    :param token: The huggingface token to use.
    :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
    :return: The embeddings, tokenizer, and config.
    :return: The embeddings, tokenizer, config, and metadata.
    """
    folder_or_repo_path = Path(folder_or_repo_path)
@@ -133,6 +133,10 @@
        if not tokenizer_path.exists():
            raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")

        # README is optional, so this is a bit finicky.
        readme_path = folder_or_repo_path / "README.md"
        metadata = _get_metadata_from_readme(readme_path)

    else:
        logger.info("Folder does not exist locally, attempting to use huggingface hub.")
        try:
@@ -148,6 +152,13 @@
            # Raise original exception.
            raise e

        try:
            readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
            metadata = _get_metadata_from_readme(Path(readme_path))
        except huggingface_hub.utils.EntryNotFoundError:
            logger.info("No README found in the model folder. No model card loaded.")
            metadata = {}

        config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "config.json", token=token)
        tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "tokenizer.json", token=token)

@@ -162,7 +173,19 @@
f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
)

return embeddings, tokenizer, config
return embeddings, tokenizer, config, metadata


def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
"""Get metadata from a README file."""
if not readme_path.exists():
logger.info(f"README file not found in {readme_path}. No model card loaded.")
return {}
model_card = ModelCard.load(readme_path)
data: dict[str, Any] = model_card.data.to_dict()
if not data:
logger.info("File README.md exists, but was empty. No model card loaded.")
return data


def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: str | None) -> None:
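The new metadata path boils down to parsing the YAML front matter of a model-card README into a plain dict, from which `from_pretrained` picks up `base_model` and `language`. A sketch of that round trip, assuming `ModelCard` here is `huggingface_hub.ModelCard` (consistent with the `model_card.data.to_dict()` call above); the front-matter values are just examples:

```python
from pathlib import Path
from tempfile import NamedTemporaryFile

from huggingface_hub import ModelCard

with NamedTemporaryFile(suffix=".md") as f:
    # Example front matter; base_model and language are the keys from_pretrained reads.
    f.write(b"---\nbase_model: sentence-transformers/all-MiniLM-L6-v2\nlanguage: en\n---\n")
    f.flush()
    card = ModelCard.load(Path(f.name))
    metadata = card.data.to_dict()

print(metadata.get("base_model"), metadata.get("language"))
```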
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ name = "model2vec"
description = "Distill a Small Fast Model from any Sentence Transformer"
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
version = "0.1.2"
version = "0.2.0"
requires-python = ">=3.10"
authors = [{ name = "Stéphan Tulkens", email = "[email protected]"}, {name = "Thomas van Dongen", email = "[email protected]"}]

25 changes: 25 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,25 @@
from pathlib import Path
from tempfile import NamedTemporaryFile

from model2vec.utils import _get_metadata_from_readme


def test__get_metadata_from_readme_not_exists() -> None:
    """Test getting metadata from a README."""
    assert _get_metadata_from_readme(Path("zzz")) == {}


def test__get_metadata_from_readme_mocked_file() -> None:
    """Test getting metadata from a README."""
    with NamedTemporaryFile() as f:
        f.write(b"---\nkey: value\n---\n")
        f.flush()
        assert _get_metadata_from_readme(Path(f.name))["key"] == "value"


def test__get_metadata_from_readme_mocked_file_keys() -> None:
    """Test getting metadata from a README."""
    with NamedTemporaryFile() as f:
        f.write(b"")
        f.flush()
        assert set(_get_metadata_from_readme(Path(f.name))) == set()
14 changes: 14 additions & 0 deletions tutorials/README.md
@@ -0,0 +1,14 @@
<div align="center">
<picture>
<img width="75%", alt="Tutorials" src="../assets/images/tutorial_ezlo.png">
</picture>
</div>

# Tutorials

This is a list of all our tutorials. They are all self-contained IPython notebooks.

| | what? | Link |
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
| **Recipe search** | Learn how to do lightning-fast semantic search by distilling a small model. Compare a really tiny model to a larger one with a better vocabulary. Learn what Fattoush is (delicious). | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/minishlab/model2vec/blob/master/tutorials/recipe_search.ipynb) |
