Commit
Showing 20 changed files with 340 additions and 9 deletions.
7 changes: 7 additions & 0 deletions
sphinx_rag_search_engine/embedding/__init__.py
@@ -0,0 +1,7 @@
"""The module :mod:`sphinx_rag_search_engine.embedding` contains transformers to
embed text.
"""

from ._sentence_transformer import SentenceTransformer

__all__ = ["SentenceTransformer"]
104 changes: 104 additions & 0 deletions
sphinx_rag_search_engine/embedding/_sentence_transformer.py
@@ -0,0 +1,104 @@
"""SentenceTransformer with a scikit-learn API."""
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer as SentenceTransformerBase


class SentenceTransformer(BaseEstimator, TransformerMixin):
    """Sentence transformer that embeds sentences into dense vectors.

    This is a thin wrapper around :class:`sentence_transformers.SentenceTransformer`
    that follows the scikit-learn API and thus can be used inside a scikit-learn
    pipeline.

    Parameters
    ----------
    model_name_or_path : str, default=None
        If it is a filepath on disk, the model is loaded from that path. If it is
        not a path, a pre-trained SentenceTransformer model with that name is
        downloaded first. If that fails, a model with that name is constructed from
        the Hugging Face models repository.

    modules : iterable of nn.Module, default=None
        This parameter can be used to create custom SentenceTransformer models from
        scratch.

    device : str, default=None
        Device (e.g. "cpu", "cuda", "mps") that should be used for computation. If
        None, a GPU is used when available.

    cache_folder : str, default=None
        Path to store models.

    use_auth_token : bool or str, default=None
        Hugging Face authentication token to download private models.

    show_progress_bar : bool, default=True
        Whether or not to show a progress bar during `transform`.
    """

    _parameter_constraints = {
        "model_name_or_path": [str, None],
        "modules": "no_validation",
        "device": [str, None],
        "cache_folder": [str, None],
        "use_auth_token": [str, bool, None],
        "show_progress_bar": [bool],
    }

    def __init__(
        self,
        model_name_or_path=None,
        modules=None,
        device=None,
        cache_folder=None,
        use_auth_token=None,
        show_progress_bar=True,
    ):
        self.model_name_or_path = model_name_or_path
        self.modules = modules
        self.device = device
        self.cache_folder = cache_folder
        self.use_auth_token = use_auth_token
        self.show_progress_bar = show_progress_bar

    def fit(self, X=None, y=None):
        """No-op operation, only validate parameters.

        Parameters
        ----------
        X : None
            This parameter is ignored.

        y : None
            This parameter is ignored.

        Returns
        -------
        self
            The fitted estimator.
        """
        self._validate_params()
        self._embedding = SentenceTransformerBase(
            model_name_or_path=self.model_name_or_path,
            modules=self.modules,
            device=self.device,
            cache_folder=self.cache_folder,
            use_auth_token=self.use_auth_token,
        )
        return self

    def transform(self, X):
        """Embed sentences to vectors.

        Parameters
        ----------
        X : iterable of dict of length (n_sentences,)
            Iterable of dictionaries with at least a "text" key.

        Returns
        -------
        embedding : ndarray of shape (n_sentences, embedding_size)
            The embedding of the sentences.
        """
        return self._embedding.encode(
            [chunk["text"] for chunk in X], show_progress_bar=self.show_progress_bar
        )
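A minimal usage sketch of the wrapper above (not part of the diff), assuming the package is installed and the model can be fetched from the Hugging Face Hub; the chunk dictionaries only need a "text" key, and the "source" values here are made up for illustration:

```python
from sphinx_rag_search_engine.embedding import SentenceTransformer

# Chunks as expected by `transform`: dictionaries with at least a "text" key.
chunks = [
    {"source": "index.rst", "text": "How to install the project"},
    {"source": "usage.rst", "text": "How to run a semantic search query"},
]

embedder = SentenceTransformer(
    model_name_or_path="sentence-transformers/paraphrase-albert-small-v2",
    show_progress_bar=False,
)
embeddings = embedder.fit_transform(chunks)  # ndarray of shape (2, 768)
```

Because the estimator follows the scikit-learn API, it can also be used as a step in a `sklearn.pipeline.Pipeline` ahead of an indexing or nearest-neighbour step.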
Empty file.
Empty file.
7 changes: 7 additions & 0 deletions
...bedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/1_Pooling/config.json
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
122 changes: 122 additions & 0 deletions
...embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/README.md
@@ -0,0 +1,122 @@
---
pipeline_tag: sentence-similarity
license: apache-2.0
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers
datasets:
- flax-sentence-embeddings/stackexchange_xml
- s2orc
- ms_marco
- wiki_atomic_edits
- snli
- multi_nli
- embedding-data/altlex
- embedding-data/simple-wiki
- embedding-data/flickr30k-captions
- embedding-data/coco_captions
- embedding-data/sentence-compression
- embedding-data/QQP
- yahoo_answers_topics
---

# sentence-transformers/paraphrase-albert-small-v2

This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer

sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')
embeddings = model.encode(sentences)
print(embeddings)
```
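The card mentions clustering and semantic search; a small illustrative sketch (not part of the original model card) that ranks corpus sentences against a query using scikit-learn's cosine similarity on the encoded vectors:

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')

corpus = ["This is an example sentence", "Each sentence is converted"]
query = ["An example sentence"]

# Cosine similarity between the query embedding and each corpus embedding.
scores = cosine_similarity(model.encode(query), model.encode(corpus))[0]
best = scores.argmax()
print(corpus[best], scores[best])
```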
## Usage (HuggingFace Transformers)

Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch


# Mean Pooling - take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-albert-small-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-albert-small-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```

## Evaluation Results

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/paraphrase-albert-small-v2)

## Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 100, 'do_lower_case': False}) with Transformer model: AlbertModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```

## Citing & Authors

This model was trained by [sentence-transformers](https://www.sbert.net/).

If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):

```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}
```
32 changes: 32 additions & 0 deletions
..._engine/embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/config.json
@@ -0,0 +1,32 @@
{
  "_name_or_path": "old_models/paraphrase-albert-small-v2/0_Transformer",
  "architectures": [
    "AlbertModel"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 6,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.7.0",
  "type_vocab_size": 2,
  "vocab_size": 30000
}
7 changes: 7 additions & 0 deletions
...s/data/sentence-transformers_paraphrase-albert-small-v2/config_sentence_transformers.json
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.0.0",
    "transformers": "4.7.0",
    "pytorch": "1.9.0+cu102"
  }
}
14 changes: 14 additions & 0 deletions
...engine/embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/modules.json
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
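The two entries describe how the saved model is composed: a `Transformer` module followed by a `Pooling` module (mean pooling, per the `1_Pooling/config.json` above). As a hedged sketch (not part of the diff), an equivalent composition could be assembled by hand and passed through the wrapper's `modules` parameter, assuming the standard `sentence_transformers.models` API:

```python
from sentence_transformers import SentenceTransformer, models

# Sketch of the composition described by modules.json: Transformer -> mean Pooling.
word_embedding_model = models.Transformer(
    "sentence-transformers/paraphrase-albert-small-v2", max_seq_length=100
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean"
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```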
Binary file added (+44.6 MB, not shown)
...e/embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/pytorch_model.bin
4 changes: 4 additions & 0 deletions
...ing/tests/data/sentence-transformers_paraphrase-albert-small-v2/sentence_bert_config.json
@@ -0,0 +1,4 @@
{
  "max_seq_length": 100,
  "do_lower_case": false
}
1 change: 1 addition & 0 deletions
...dding/tests/data/sentence-transformers_paraphrase-albert-small-v2/special_tokens_map.json
@@ -0,0 +1 @@
{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
Binary file added (+742 KB, not shown)
...engine/embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/spiece.model
1 change: 1 addition & 0 deletions
...gine/embedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/tokenizer.json
Large diffs are not rendered by default.
1 change: 1 addition & 0 deletions
...bedding/tests/data/sentence-transformers_paraphrase-albert-small-v2/tokenizer_config.json
@@ -0,0 +1 @@
{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "old_models/paraphrase-albert-small-v2/0_Transformer"}
15 changes: 15 additions & 0 deletions
sphinx_rag_search_engine/embedding/tests/test_sentence_transformer.py
@@ -0,0 +1,15 @@
from pathlib import Path

from sphinx_rag_search_engine.embedding import SentenceTransformer


def test_sentence_transformer_fit_transform():
    cache_folder_path = Path(__file__).parent / "data"
    model_name_or_path = "sentence-transformers/paraphrase-albert-small-v2"

    embedder = SentenceTransformer(
        model_name_or_path=model_name_or_path, cache_folder=str(cache_folder_path)
    )
    embedding = embedder.fit_transform([{"source": "hello world", "text": "hello world"}])
    assert embedding.shape == (1, 768)