Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community: support UAE sentence embeddings #15134

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/community/langchain_community/embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,84 +11,85 @@
"""


import logging
from typing import Any

from langchain_community.embeddings.aleph_alpha import (
AlephAlphaAsymmetricSemanticEmbedding,
AlephAlphaSymmetricSemanticEmbedding,
)
from langchain_community.embeddings.awa import AwaEmbeddings
from langchain_community.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain_community.embeddings.baidu_qianfan_endpoint import (
QianfanEmbeddingsEndpoint,
)
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.bookend import BookendEmbeddings
from langchain_community.embeddings.clarifai import ClarifaiEmbeddings
from langchain_community.embeddings.cohere import CohereEmbeddings
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.embeddings.databricks import DatabricksEmbeddings
from langchain_community.embeddings.deepinfra import DeepInfraEmbeddings
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings
from langchain_community.embeddings.embaas import EmbaasEmbeddings
from langchain_community.embeddings.ernie import ErnieEmbeddings
from langchain_community.embeddings.fake import (
DeterministicFakeEmbedding,
FakeEmbeddings,
)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.embeddings.google_palm import GooglePalmEmbeddings
from langchain_community.embeddings.gpt4all import GPT4AllEmbeddings
from langchain_community.embeddings.gradient_ai import GradientEmbeddings
from langchain_community.embeddings.huggingface import (
    HuggingFaceBgeEmbeddings,
    HuggingFaceEmbeddings,
    HuggingFaceInferenceAPIEmbeddings,
    HuggingFaceInstructEmbeddings,
    HuggingFaceUaeEmbeddings,
)
from langchain_community.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain_community.embeddings.infinity import InfinityEmbeddings
from langchain_community.embeddings.javelin_ai_gateway import JavelinAIGatewayEmbeddings
from langchain_community.embeddings.jina import JinaEmbeddings
from langchain_community.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings
from langchain_community.embeddings.llamacpp import LlamaCppEmbeddings
from langchain_community.embeddings.llm_rails import LLMRailsEmbeddings
from langchain_community.embeddings.localai import LocalAIEmbeddings
from langchain_community.embeddings.minimax import MiniMaxEmbeddings
from langchain_community.embeddings.mlflow import MlflowEmbeddings
from langchain_community.embeddings.mlflow_gateway import MlflowAIGatewayEmbeddings
from langchain_community.embeddings.modelscope_hub import ModelScopeEmbeddings
from langchain_community.embeddings.mosaicml import MosaicMLInstructorEmbeddings
from langchain_community.embeddings.nlpcloud import NLPCloudEmbeddings
from langchain_community.embeddings.octoai_embeddings import OctoAIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings.sagemaker_endpoint import (
SagemakerEndpointEmbeddings,
)
from langchain_community.embeddings.self_hosted import SelfHostedEmbeddings
from langchain_community.embeddings.self_hosted_hugging_face import (
SelfHostedHuggingFaceEmbeddings,
SelfHostedHuggingFaceInstructEmbeddings,
)
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.embeddings.tensorflow_hub import TensorflowHubEmbeddings
from langchain_community.embeddings.vertexai import VertexAIEmbeddings
from langchain_community.embeddings.volcengine import VolcanoEmbeddings
from langchain_community.embeddings.voyageai import VoyageEmbeddings
from langchain_community.embeddings.xinference import XinferenceEmbeddings

logger = logging.getLogger(__name__)

Check failure on line 87 in libs/community/langchain_community/embeddings/__init__.py

View workflow job for this annotation

GitHub Actions / ci (libs/community) / lint / build (3.8)

Ruff (I001)

langchain_community/embeddings/__init__.py:14:1: I001 Import block is un-sorted or un-formatted

Check failure on line 87 in libs/community/langchain_community/embeddings/__init__.py

View workflow job for this annotation

GitHub Actions / ci (libs/community) / lint / build (3.11)

Ruff (I001)

langchain_community/embeddings/__init__.py:14:1: I001 Import block is un-sorted or un-formatted

__all__ = [
"OpenAIEmbeddings",
"AzureOpenAIEmbeddings",
"ClarifaiEmbeddings",

Check failure on line 92 in libs/community/langchain_community/embeddings/__init__.py

View workflow job for this annotation

GitHub Actions / ci (libs/community) / lint / build (3.8)

Ruff (I001)

langchain_community/embeddings/__init__.py:14:1: I001 Import block is un-sorted or un-formatted

Check failure on line 92 in libs/community/langchain_community/embeddings/__init__.py

View workflow job for this annotation

GitHub Actions / ci (libs/community) / lint / build (3.11)

Ruff (I001)

langchain_community/embeddings/__init__.py:14:1: I001 Import block is un-sorted or un-formatted
"CohereEmbeddings",
"DatabricksEmbeddings",
"ElasticsearchEmbeddings",
Expand Down Expand Up @@ -139,6 +140,7 @@
"JohnSnowLabsEmbeddings",
"VoyageEmbeddings",
"BookendEmbeddings",
"HuggingFaceUaeEmbeddings",
"VolcanoEmbeddings",
]

Expand Down
86 changes: 86 additions & 0 deletions libs/community/langchain_community/embeddings/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_UAE_MODEL = "WhereIsAI/UAE-Large-V1"
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
"Represent the question for retrieving supporting documents: "
Expand Down Expand Up @@ -341,3 +342,88 @@ def embed_query(self, text: str) -> List[float]:
Embeddings for the text.
"""
return self.embed_documents([text])[0]


class HuggingFaceUaeEmbeddings(BaseModel, Embeddings):
    """HuggingFace UAE sentence embedding models.

    Arxiv: https://arxiv.org/abs/2309.12871

    To use, you should have the ``angle_emb`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceUaeEmbeddings

            model_name = "WhereIsAI/UAE-Large-V1"
            model_kwargs = {
                "device": "cpu",
                "pooling_strategy": "cls",
            }
            encode_kwargs = {"to_numpy": True}
            prompt = None
            hf = HuggingFaceUaeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
                prompt=prompt,
            )
    """

    client: Any  #: :meta private:
    model_name: str = DEFAULT_UAE_MODEL
    """Model name to use."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the model."""
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass when calling the `encode` method of the model."""
    prompt: Optional[str] = None
    """Prompt handed to the client's ``set_prompt``. When it is a string, every
    input text is wrapped as ``{"text": ...}`` before being encoded."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def __init__(self, **kwargs: Any):
        """Validate the fields, then build the ``angle_emb`` client.

        Args:
            **kwargs: Field values for this model (``model_name``,
                ``model_kwargs``, ``encode_kwargs``, ``prompt``).

        Raises:
            ImportError: If the ``angle_emb`` package cannot be imported.
        """
        super().__init__(**kwargs)
        try:
            import angle_emb
        except ImportError as exc:
            raise ImportError(
                "Could not import angle_emb python package. "
                "Please install it with `pip install angle_emb`."
            ) from exc

        self.client = angle_emb.AnglE(self.model_name, **self.model_kwargs)
        self.client.set_prompt(prompt=self.prompt)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without calling the model.
        """
        cleaned = [text.replace("\n", " ") for text in texts]
        if not cleaned:
            # Nothing to encode; skip the model call entirely.
            return []

        # Kept separate from ``texts`` so the ``List[str]`` parameter is never
        # rebound to a list of dicts.
        inputs: List[Any] = cleaned
        if isinstance(self.prompt, str):
            # With a prompt set, each text is passed as a {"text": ...} mapping.
            inputs = [{"text": text} for text in cleaned]

        embeddings = self.client.encode(inputs, **self.encode_kwargs)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
1 change: 1 addition & 0 deletions libs/community/tests/unit_tests/embeddings/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"JohnSnowLabsEmbeddings",
"VoyageEmbeddings",
"BookendEmbeddings",
"HuggingFaceUaeEmbeddings",
"VolcanoEmbeddings",
]

Expand Down
Loading