diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 1caa1d1..b54be3b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -8,10 +8,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: 3.8 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9a1ce9a..9156225 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -12,10 +12,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: 3.8 @@ -40,6 +40,9 @@ jobs: export TOKEN="${{secrets.TOKEN}}" export EMBEDDING_URL="${{secrets.EMBEDDING_URL}}" export EMBEDDING_TOKEN="${{secrets.EMBEDDING_TOKEN}}" - poetry add --dev pytest + export SPARSE_URL="${{secrets.SPARSE_URL}}" + export SPARSE_TOKEN="${{secrets.SPARSE_TOKEN}}" + export HYBRID_URL="${{secrets.HYBRID_URL}}" + export HYBRID_TOKEN="${{secrets.HYBRID_TOKEN}}" poetry install poetry run pytest diff --git a/README.md b/README.md index 6d33e5a..afbd533 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,28 @@ # Upstash Vector Python SDK + The Upstash Vector Python client > [!NOTE] > **This project is in GA Stage.** > -> The Upstash Professional Support fully covers this project. It receives regular updates, and bug fixes. +> The Upstash Professional Support fully covers this project. It receives regular updates, and bug fixes. > The Upstash team is committed to maintaining and improving its functionality. ## Installation Install a released version from pip: + ```shell pip3 install upstash-vector ``` ## Usage -In order to use this client, head out to [Upstash Console](https://console.upstash.com) and create a vector database. 
+ +In order to use this client, head out to [Upstash Console](https://console.upstash.com) and create a vector database. There, get the `UPSTASH_VECTOR_REST_URL` and the `UPSTASH_VECTOR_REST_TOKEN` from the dashboard. ### Initializing the Index + ```python from upstash_vector import Index @@ -46,11 +50,10 @@ to be later queried or fetched. There are a couple of ways of doing upserts: ```python -# as tuples, either of the form: -# - (id, vector, metadata, data) -# - (id, vector, metadata) -# - (id, vector) - +# - dense indexes +# - (id, vector, metadata, data) +# - (id, vector, metadata) +# - (id, vector) index.upsert( vectors=[ ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), @@ -58,15 +61,38 @@ index.upsert( ("id3", [0.3, 0.4]), ] ) + +# - sparse indexes +# - (id, sparse_vector, metadata, data) +# - (id, sparse_vector, metadata) +# - (id, sparse_vector) +index.upsert( + vectors=[ + ("id1", ([0, 1], [0.1, 0.2]), {"metadata_field": "metadata_value"}, "data-value"), + ("id2", ([1, 2], [0.2, 0.2]), {"metadata_field": "metadata_value"}), + ("id3", ([2, 3, 4], [0.3, 0.4, 0.5])), + ] +) + +# - hybrid indexes +# - (id, vector, sparse_vector, metadata, data) +# - (id, vector, sparse_vector, metadata) +# - (id, vector, sparse_vector) +index.upsert( + vectors=[ + ("id1", [0.1, 0.2], ([0, 1], [0.1, 0.2]), {"metadata_field": "metadata_value"}, "data-value"), + ("id2", [0.2, 0.2], ([1, 2], [0.2, 0.2]), {"metadata_field": "metadata_value"}), + ("id3", [0.3, 0.4], ([2, 3, 4], [0.3, 0.4, 0.5])), + ] +) ``` ```python -# as dicts, either of the form: -# - {"id": id, "vector": vector, "metadata": metadata, "data": data) -# - {"id": id, "vector": vector, "metadata": metadata) -# - {"id": id, "vector": vector, "data": data) -# - {"id": id, "vector": vector} - +# - dense indexes +# - {"id": id, "vector": vector, "metadata": metadata, "data": data) +# - {"id": id, "vector": vector, "metadata": metadata) +# - {"id": id, "vector": vector, "data": data) +# - {"id": 
id, "vector": vector} index.upsert( vectors=[ {"id": "id4", "vector": [0.1, 0.2], "metadata": {"field": "value"}, "data": "value"}, @@ -75,18 +101,68 @@ index.upsert( {"id": "id7", "vector": [0.5, 0.6]}, ] ) + +# - sparse indexes +# - {"id": id, "sparse_vector": sparse_vector, "metadata": metadata, "data": data) +# - {"id": id, "sparse_vector": sparse_vector, "metadata": metadata) +# - {"id": id, "sparse_vector": sparse_vector, "data": data) +# - {"id": id, "sparse_vector": sparse_vector} +index.upsert( + vectors=[ + {"id": "id4", "sparse_vector": ([0, 1], [0.1, 0.2]), "metadata": {"field": "value"}, "data": "value"}, + {"id": "id5", "sparse_vector": ([1, 2], [0.2, 0.2]), "metadata": {"field": "value"}}, + {"id": "id6", "sparse_vector": ([2, 3, 4], [0.3, 0.4, 0.5]), "data": "value"}, + {"id": "id7", "sparse_vector": ([4], [0.3])}, + ] +) + +# - hybrid indexes +# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "metadata": metadata, "data": data) +# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "metadata": metadata) +# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "data": data) +# - {"id": id, "vector": vector, "sparse_vector": sparse_vector} +index.upsert( + vectors=[ + {"id": "id4", "vector": [0.1, 0.2], "sparse_vector": ([0], [0.1]), "metadata": {"field": "value"}, + "data": "value"}, + {"id": "id5", "vector": [0.1, 0.2], "sparse_vector": ([1, 2], [0.2, 0.2]), "metadata": {"field": "value"}}, + {"id": "id6", "vector": [0.1, 0.2], "sparse_vector": ([2, 3, 4], [0.3, 0.4, 0.5]), "data": "value"}, + {"id": "id7", "vector": [0.5, 0.6], "sparse_vector": ([4], [0.3])}, + ] +) ``` ```python -from upstash_vector import Vector +from upstash_vector import Vector, SparseVector + +# dense indexes +index.upsert( + vectors=[ + Vector(id="id5", vector=[1, 2], metadata={"field": "value"}, data="value"), + Vector(id="id6", vector=[1, 2], metadata={"field": "value"}), + Vector(id="id7", vector=[1, 2], data="value"), + Vector(id="id8", 
vector=[6, 7]), + ] +) -# as Vector objects +# sparse indexes +index.upsert( + vectors=[ + Vector(id="id5", sparse_vector=SparseVector([1], [0.1]), metadata={"field": "value"}, data="value"), + Vector(id="id6", sparse_vector=SparseVector([1, 2], [0.1, 0.2]), metadata={"field": "value"}), + Vector(id="id7", sparse_vector=SparseVector([3, 5], [0.3, 0.3]), data="value"), + Vector(id="id8", sparse_vector=SparseVector([4], [0.2])), + ] +) +# hybrid indexes index.upsert( vectors=[ - Vector(id="id5", vector=[1, 2], metadata={"field": "value"}), - Vector(id="id6", vector=[1, 2], data="value"), - Vector(id="id7", vector=[6, 7]), + Vector(id="id5", vector=[1, 2], sparse_vector=SparseVector([1], [0.1]), metadata={"field": "value"}, + data="value"), + Vector(id="id6", vector=[1, 2], sparse_vector=SparseVector([1, 2], [0.1, 0.2]), metadata={"field": "value"}), + Vector(id="id7", vector=[1, 2], sparse_vector=SparseVector([3, 5], [0.3, 0.3]), data="value"), + Vector(id="id8", vector=[6, 7], sparse_vector=SparseVector([4], [0.2])), ] ) ``` @@ -113,7 +189,7 @@ When no namespace is provided, the default namespace is used. index.upsert( vectors=[ ("id1", [0.1, 0.2]), - ("id2", [0.3,0.4]), + ("id2", [0.3, 0.4]), ], namespace="ns", ) @@ -126,7 +202,8 @@ query vector can be requested from a namespace of an index. ```python res = index.query( - vector=[0.6, 0.9], + vector=[0.6, 0.9], # for dense and hybrid indexes + sparse_vector=([0, 1], [0.1, 0.1]), # for sparse and hybrid indexes top_k=5, include_vectors=False, include_metadata=True, @@ -137,11 +214,12 @@ res = index.query( # List of query results, sorted in the descending order of similarity for r in res: print( - r.id, # The id used while upserting the vector - r.score, # The similarity score of this vector to the query vector. Higher is more similar. - r.vector, # The value of the vector, if requested. - r.metadata, # The metadata of the vector, if requested and present. - r.data, # The data of the vector, if requested and present. 
+ r.id, # The id used while upserting the vector + r.score, # The similarity score of this vector to the query vector. Higher is more similar. + r.vector, # The value of the vector, if requested (for dense and hybrid indexes). + r.sparse, # The value of the sparse vector, if requested (for sparse and hybrid indexes). + r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` @@ -160,15 +238,15 @@ res = index.query( When a filter is provided, query results are further narrowed down based on the vectors whose metadata matches with it. -See [Metadata Filtering](https://upstash.com/docs/vector/features/filtering) documentation -for more information regarding the filter syntax. +See [Metadata Filtering](https://upstash.com/docs/vector/features/filtering) documentation +for more information regarding the filter syntax. -Also, a namespace can be specified to query from. -When no namespace is provided, the default namespace is used. +Also, a namespace can be specified to query from. +When no namespace is provided, the default namespace is used. ```python res = index.query( - vector=[0.6, 0.9], + vector=[0.6, 0.9], top_k=5, namespace="ns", ) @@ -180,22 +258,23 @@ A set of vectors can be fetched from a namespace of an index. ```python res = index.fetch( - ids=["id3", "id4"], - include_vectors=False, + ids=["id3", "id4"], + include_vectors=False, include_metadata=True, include_data=True, ) # List of fetch results, one for each id passed for r in res: - if not r: # Can be None, if there is no such vector with the given id + if not r: # Can be None, if there is no such vector with the given id continue - + print( - r.id, # The id used while upserting the vector - r.vector, # The value of the vector, if requested. - r.metadata, # The metadata of the vector, if requested and present. - r.data, # The metadata of the vector, if requested and present. 
+ r.id, # The id used while upserting the vector + r.vector, # The value of the vector, if requested (for dense and hybrid indexes). + r.sparse_vector, # The value of the sparse vector, if requested (for sparse and hybrid indexes). + r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` @@ -203,28 +282,29 @@ or, for singular fetch: ```python res = index.fetch( - "id1", - include_vectors=True, + "id1", + include_vectors=True, include_metadata=True, include_data=False, ) r = res[0] -if r: # Can be None, if there is no such vector with the given id +if r: # Can be None, if there is no such vector with the given id print( - r.id, # The id used while upserting the vector - r.vector, # The value of the vector, if requested. - r.metadata, # The metadata of the vector, if requested and present. - r.data, # The metadata of the vector, if requested and present. + r.id, # The id used while upserting the vector + r.vector, # The value of the vector, if requested (for dense and hybrid indexes). + r.sparse_vector, # The value of the sparse vector, if requested (for sparse and hybrid indexes). + r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` -Also, a namespace can be specified to fetch from. When no namespace is provided, the default namespace is used. ```python res = index.fetch( - ids=["id3", "id4"], + ids=["id3", "id4"], namespace="ns", ) ``` @@ -237,37 +317,38 @@ in a page by page fashion. 
```python # Scans the vectors 100 vector at a time, res = index.range( - cursor="", # Start the scan from the beginning - limit=100, - include_vectors=False, + cursor="", # Start the scan from the beginning + limit=100, + include_vectors=False, include_metadata=True, include_data=True, ) while res.next_cursor != "": res = index.range( - cursor=res.next_cursor, - limit=100, - include_vectors=False, + cursor=res.next_cursor, + limit=100, + include_vectors=False, include_metadata=True, include_data=True, ) - + for v in res.vectors: print( - v.id, # The id used while upserting the vector - v.vector, # The value of the vector, if requested. - v.metadata, # The metadata of the vector, if requested and present. - v.data, # The data of the vector, if requested and present. + v.id, # The id used while upserting the vector + v.vector, # The value of the vector, if requested (for dense and hybrid indexes). + v.sparse_vector, # The value of the sparse vector, if requested (for sparse and hybrid indexes). + v.metadata, # The metadata of the vector, if requested and present. + v.data, # The data of the vector, if requested and present. ) ``` -Also, a namespace can be specified to range from. +Also, a namespace can be specified to range from. When no namespace is provided, the default namespace is used. ```python res = index.range( - cursor="", + cursor="", limit=100, namespace="ns", ) @@ -284,7 +365,7 @@ res = index.delete( ) print( - res.deleted, # How many vectors are deleted out of the given ids. + res.deleted, # How many vectors are deleted out of the given ids. ) ``` @@ -295,10 +376,10 @@ res = index.delete( "id1", ) -print(res) # A boolean indicating whether the vector is deleted or not. +print(res) # A boolean indicating whether the vector is deleted or not. ``` -Also, a namespace can be specified to delete from. +Also, a namespace can be specified to delete from. When no namespace is provided, the default namespace is used. 
```python @@ -310,24 +391,23 @@ res = index.delete( ### Update a Vector -Either the vector value(or data for indexes created with an embedding model) or the metadata -can be updated without needing to set the other one. +Any combination of vector value, sparse vector value, data, or metadata can be updated. ```python res = index.update( - "id1", + "id1", metadata={"new_field": "new_value"}, ) -print(res) # A boolean indicating whether the vector is updated or not. +print(res) # A boolean indicating whether the vector is updated or not. ``` -Also, a namespace can be specified to update from. +Also, a namespace can be specified to update from. When no namespace is provided, the default namespace is used. ```python res = index.update( - "id1", + "id1", metadata={"new_field": "new_value"}, namespace="ns", ) @@ -341,7 +421,7 @@ All vectors can be removed from a namespace of an index. index.reset() ``` -Also, a namespace can be specified to reset. +Also, a namespace can be specified to reset. When no namespace is provided, the default namespace is used. ```python @@ -367,18 +447,18 @@ This information also contains per-namespace status. 
```python info = index.info() print( - info.vector_count, # Total number of vectors across all namespaces - info.pending_vector_count, # Total number of vectors waiting to be indexed across all namespaces - info.index_size, # Total size of the index on disk in bytes - info.dimension, # Vector dimension - info.similarity_function, # Similarity function used + info.vector_count, # Total number of vectors across all namespaces + info.pending_vector_count, # Total number of vectors waiting to be indexed across all namespaces + info.index_size, # Total size of the index on disk in bytes + info.dimension, # Vector dimension + info.similarity_function, # Similarity function used ) for ns, ns_info in info.namespaces.items(): print( - ns, # Name of the namespace - ns_info.vector_count, # Total number of vectors in this namespaces - ns_info.pending_vector_count, # Total number of vectors waiting to be indexed in this namespaces + ns, # Name of the namespace + ns_info.vector_count, # Total number of vectors in this namespaces + ns_info.pending_vector_count, # Total number of vectors waiting to be indexed in this namespaces ) ``` @@ -389,12 +469,12 @@ All the names of active namespaces can be listed. ```python namespaces = index.list_namespaces() for ns in namespaces: - print(ns) # name of the namespace + print(ns) # name of the namespace ``` ### Delete a Namespace -A namespace can be deleted entirely. +A namespace can be deleted entirely. If no such namespace exists, and exception is raised. The default namespaces cannot be deleted. @@ -405,7 +485,9 @@ index.delete_namespace(namespace="ns") # Contributing ## Preparing the environment -This project uses [Poetry](https://python-poetry.org) for packaging and dependency management. Make sure you are able to create the poetry shell with relevant dependencies. + +This project uses [Poetry](https://python-poetry.org) for packaging and dependency management. 
Make sure you are able to +create the poetry shell with relevant dependencies. You will also need a vector database on [Upstash](https://console.upstash.com/). @@ -414,22 +496,33 @@ poetry install ``` ## Code Formatting + ```bash poetry run ruff format . ``` ## Running tests -To run all the tests, make sure the poetry virtual environment activated with all +To run all the tests, make sure the poetry virtual environment is activated with all the necessary dependencies. -Create two Vector Stores on upstash. First one should have 2 dimensions. Second one should use an embedding model. Set the necessary environment variables: +Create four Vector Stores on Upstash. First one should have 2 dimensions. Second one should use an embedding model. Set +the necessary environment variables: + +- A dense index with 2 dimensions, with cosine similarity +- A dense index with an embedding model +- A sparse index +- A hybrid index with 2 dimensions, with cosine similarity for the dense component. ``` URL=**** TOKEN=**** EMBEDDING_URL=**** EMBEDDING_TOKEN=**** +SPARSE_URL=**** +SPARSE_TOKEN=**** +HYBRID_URL=**** +HYBRID_TOKEN=**** ``` Then, run the following command to run tests: diff --git a/pyproject.toml b/pyproject.toml index dbf911d..f6e57dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,20 +34,16 @@ python = "^3.8" httpx = ">=0.23.0, <1" [tool.poetry.group.dev.dependencies] -mypy = "^1.8.0" -types-requests = "^2.31.0.20240106" -types-pygments = "^2.17.0.20240106" -types-colorama = "^0.4.15.20240106" -types-setuptools = "^69.0.0.20240115" -pytest = "^7.4.4" -pytest-asyncio = "^0.20.0" -ruff = "^0.1.13" +mypy = "^1.13.0" +pytest = "^8.3.4" +pytest-asyncio = "^0.24.0" +ruff = "^0.8.2" numpy = [ - { version = "<=1.24.4", python = "<=3.8" }, - { version = ">=1.25.0", python = ">=3.9" } + { version = "^1.24.4", python = "<=3.8" }, + { version = "^1.26.4", python = ">=3.9" } ] pandas = "^2.0.3" -pandas-stubs = "^2.0.3" +python-dotenv = "^1.0.1" [build-system] requires = 
["poetry-core"] diff --git a/tests/__init__.py b/tests/__init__.py index 146d250..e5cc488 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,11 +1,25 @@ -import time import asyncio +import os +import time + +import dotenv -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index from upstash_vector.core.index_operations import DEFAULT_NAMESPACE +dotenv.load_dotenv() + NAMESPACES = [DEFAULT_NAMESPACE, "ns"] +INDEX_URL = os.environ["URL"] +INDEX_TOKEN = os.environ["TOKEN"] +SPARSE_INDEX_URL = os.environ["SPARSE_URL"] +SPARSE_INDEX_TOKEN = os.environ["SPARSE_TOKEN"] +HYBRID_INDEX_URL = os.environ["HYBRID_URL"] +HYBRID_INDEX_TOKEN = os.environ["HYBRID_TOKEN"] +EMBEDDING_INDEX_URL = os.environ["EMBEDDING_URL"] +EMBEDDING_INDEX_TOKEN = os.environ["EMBEDDING_TOKEN"] + def assert_eventually(assertion, retry_delay=0.5, timeout=5.0): deadline = time.time() + timeout diff --git a/tests/conftest.py b/tests/conftest.py index 4e13f68..2358c34 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,15 +1,23 @@ -from os import environ - import pytest import pytest_asyncio -from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from tests import ( + EMBEDDING_INDEX_TOKEN, + EMBEDDING_INDEX_URL, + HYBRID_INDEX_TOKEN, + HYBRID_INDEX_URL, + INDEX_TOKEN, + INDEX_URL, + NAMESPACES, + SPARSE_INDEX_TOKEN, + SPARSE_INDEX_URL, +) +from upstash_vector import AsyncIndex, Index @pytest.fixture def index(): - idx = Index(environ["URL"], environ["TOKEN"]) + idx = Index(INDEX_URL, INDEX_TOKEN) for ns in NAMESPACES: idx.reset(namespace=ns) return idx @@ -17,7 +25,39 @@ def index(): @pytest_asyncio.fixture async def async_index(): - idx = AsyncIndex(environ["URL"], environ["TOKEN"]) + idx = AsyncIndex(INDEX_URL, INDEX_TOKEN) + for ns in NAMESPACES: + await idx.reset(namespace=ns) + return idx + + +@pytest.fixture +def sparse_index(): + idx = Index(SPARSE_INDEX_URL, SPARSE_INDEX_TOKEN) + for ns in NAMESPACES: + 
idx.reset(namespace=ns) + return idx + + +@pytest_asyncio.fixture +async def async_sparse_index(): + idx = AsyncIndex(SPARSE_INDEX_URL, SPARSE_INDEX_TOKEN) + for ns in NAMESPACES: + await idx.reset(namespace=ns) + return idx + + +@pytest.fixture +def hybrid_index(): + idx = Index(HYBRID_INDEX_URL, HYBRID_INDEX_TOKEN) + for ns in NAMESPACES: + idx.reset(namespace=ns) + return idx + + +@pytest_asyncio.fixture +async def async_hybrid_index(): + idx = AsyncIndex(HYBRID_INDEX_URL, HYBRID_INDEX_TOKEN) for ns in NAMESPACES: await idx.reset(namespace=ns) return idx @@ -25,7 +65,7 @@ async def async_index(): @pytest.fixture def embedding_index(): - idx = Index(environ["EMBEDDING_URL"], environ["EMBEDDING_TOKEN"]) + idx = Index(EMBEDDING_INDEX_URL, EMBEDDING_INDEX_TOKEN) for ns in NAMESPACES: idx.reset(namespace=ns) return idx @@ -33,7 +73,7 @@ def embedding_index(): @pytest_asyncio.fixture async def async_embedding_index(): - idx = AsyncIndex(environ["EMBEDDING_URL"], environ["EMBEDDING_TOKEN"]) + idx = AsyncIndex(EMBEDDING_INDEX_URL, EMBEDDING_INDEX_TOKEN) for ns in NAMESPACES: await idx.reset(namespace=ns) return idx diff --git a/tests/core/test_delete.py b/tests/core/test_delete.py index 4799a87..8242cd3 100644 --- a/tests/core/test_delete.py +++ b/tests/core/test_delete.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index @pytest.mark.parametrize("ns", NAMESPACES) diff --git a/tests/core/test_fetch.py b/tests/core/test_fetch.py index 1e7f736..aedd238 100644 --- a/tests/core/test_fetch.py +++ b/tests/core/test_fetch.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector @pytest.mark.parametrize("ns", NAMESPACES) @@ -378,3 +378,171 @@ async def test_fetch_with_data_async(async_index: AsyncIndex, ns: str): assert res[2].metadata is None assert res[2].vector == 
v3_values assert res[2].data == v3_data + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_fetch_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = sparse_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_fetch_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = hybrid_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.2] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].vector == [0.2, 0.3] + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 
0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_fetch_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = await async_sparse_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_fetch_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = await async_hybrid_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + 
assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.2] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].vector == [0.2, 0.3] + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None diff --git a/tests/core/test_info.py b/tests/core/test_info.py index 7b83345..bb88956 100644 --- a/tests/core/test_info.py +++ b/tests/core/test_info.py @@ -1,13 +1,13 @@ import pytest from tests import ( + NAMESPACES, assert_eventually, assert_eventually_async, - NAMESPACES, ensure_ns_exists, ensure_ns_exists_async, ) -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index def test_info(index: Index): diff --git a/tests/core/test_namespace_operations.py b/tests/core/test_namespace_operations.py index b2399fc..566e63b 100644 --- a/tests/core/test_namespace_operations.py +++ b/tests/core/test_namespace_operations.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES, ensure_ns_exists, ensure_ns_exists_async -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index from upstash_vector.core.index_operations import DEFAULT_NAMESPACE diff --git a/tests/core/test_query.py b/tests/core/test_query.py index 6128617..04200e4 100644 --- a/tests/core/test_query.py +++ b/tests/core/test_query.py @@ -2,8 +2,9 @@ import pandas as pd import pytest -from tests import assert_eventually, assert_eventually_async, NAMESPACES -from upstash_vector import Index, AsyncIndex +from tests import NAMESPACES, assert_eventually, assert_eventually_async +from upstash_vector import AsyncIndex, Index, 
SparseVector +from upstash_vector.types import FusionAlgorithm, WeightingStrategy @pytest.mark.parametrize("ns", NAMESPACES) @@ -1025,3 +1026,632 @@ async def assertion(): assert query_res[1].data is None await assert_eventually_async(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_sparse_index(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query( + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_sparse_index_weighting_strategy(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query( + sparse_vector=([0, 1, 3], [0.2, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert 
res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_many_sparse_index(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query_many( + queries=[ + { + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + }, + { + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 2 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.2, 0.3], + sparse_vector=([0, 1, 3], 
[0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.2, 0.3] + assert res[0].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[1].id == "id0" + assert res[1].vector == [0.1, 0.2] + assert res[1].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_hybrid_index_weighting_strategy(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.1], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.5], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.1, 0.1], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.1, 0.1], + sparse_vector=([0, 1, 3], [0.5, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.1] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.1, 0.1] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].vector == [0.9, 0.5] + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def 
test_query_hybrid_index_fusion_algorithm(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.8, 0.9], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.9], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.3, 0.9], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.9, 0.9], + sparse_vector=([0, 1, 3], [0.1, 0.1, 100]), + include_vectors=True, + include_metadata=True, + include_data=True, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.9, 0.9] + assert res[0].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.3, 0.9] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id0" + assert res[2].vector == [0.8, 0.9] + assert res[2].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_many_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.1])), + ("id1", [0.1, 0.4], ([1, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.5, 0.1], ([2, 3], [0.3, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + "fusion_algorithm": FusionAlgorithm.RRF, + "top_k": 1, + }, + { + "vector": np.array([0.5, 0.1]), + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "fusion_algorithm": FusionAlgorithm.DBSF, + 
"include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 3 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.2, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + assert_eventually(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_sparse_index_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query( + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_sparse_index_weighting_strategy_async( + async_sparse_index: AsyncIndex, ns: str +): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], 
[0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query( + sparse_vector=([0, 1, 3], [0.2, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_many_sparse_index_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query_many( + queries=[ + { + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + }, + { + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 2 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert 
res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query( + vector=[0.2, 0.3], + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.2, 0.3] + assert res[0].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[1].id == "id0" + assert res[1].vector == [0.1, 0.2] + assert res[1].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_weighting_strategy_async( + async_hybrid_index: AsyncIndex, ns: str +): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.1], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.5], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.1, 0.1], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query( + vector=[0.1, 0.1], + sparse_vector=([0, 1, 3], [0.5, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + 
include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.1] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.1, 0.1] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].vector == [0.9, 0.5] + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_fusion_algorithm_async( + async_hybrid_index: AsyncIndex, ns: str +): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.8, 0.9], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.9], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.3, 0.9], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query( + vector=[0.9, 0.9], + sparse_vector=([0, 1, 3], [0.1, 0.1, 100]), + include_vectors=True, + include_metadata=True, + include_data=True, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.9, 0.9] + assert res[0].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.3, 0.9] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id0" + assert res[2].vector == [0.8, 0.9] + assert res[2].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + 
+@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_many_hybrid_index_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.1])), + ("id1", [0.1, 0.4], ([1, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.5, 0.1], ([2, 3], [0.3, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + "fusion_algorithm": FusionAlgorithm.RRF, + "top_k": 1, + }, + { + "vector": np.array([0.5, 0.1]), + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "fusion_algorithm": FusionAlgorithm.DBSF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 3 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.2, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + await assert_eventually_async(assertion) diff --git a/tests/core/test_range.py b/tests/core/test_range.py index b8bc2de..76c15ef 100644 --- a/tests/core/test_range.py +++ b/tests/core/test_range.py @@ -4,7 +4,7 @@ from pytest import raises from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index from upstash_vector.errors import ClientError @@ -37,6 +37,9 @@ def test_range(index: Index, ns: str): assert res.vectors[i].id == f"id-{i}" assert 
res.vectors[i].metadata == {"meta": i} assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 while res.next_cursor != "": res = index.range( @@ -86,6 +89,9 @@ async def test_range_async(async_index: AsyncIndex, ns: str): assert res.vectors[i].id == f"id-{i}" assert res.vectors[i].metadata == {"meta": i} assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 while res.next_cursor != "": res = await async_index.range( @@ -103,3 +109,221 @@ async def test_range_async(async_index: AsyncIndex, ns: str): include_vectors=True, namespace=ns, ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_range_sparse(sparse_index: Index, ns: str): + vectors = [ + { + "id": f"id-{i}", + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + sparse_index.upsert(vectors=vectors, namespace=ns) + + res = sparse_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = sparse_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + sparse_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_range_sparse_async(async_sparse_index: AsyncIndex, ns: str): + vectors = [ + { + "id": f"id-{i}", + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], 
+ [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + await async_sparse_index.upsert(vectors=vectors, namespace=ns) + + res = await async_sparse_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = await async_sparse_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + await async_sparse_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_range_hybrid(hybrid_index: Index, ns: str): + vectors = [ + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + hybrid_index.upsert(vectors=vectors, namespace=ns) + + res = hybrid_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = hybrid_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert 
len(res.vectors) == 8 + + with raises(ClientError): + hybrid_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_range_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + vectors = [ + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + await async_hybrid_index.upsert(vectors=vectors, namespace=ns) + + res = await async_hybrid_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = await async_hybrid_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + await async_hybrid_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) diff --git a/tests/core/test_reset.py b/tests/core/test_reset.py index ea23cc3..5c273b4 100644 --- a/tests/core/test_reset.py +++ b/tests/core/test_reset.py @@ -1,6 +1,7 @@ import pytest + from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index @pytest.mark.parametrize("ns", NAMESPACES) diff --git a/tests/core/test_resumable_query.py b/tests/core/test_resumable_query.py index bd791a1..5f6bacf 100644 --- a/tests/core/test_resumable_query.py +++ b/tests/core/test_resumable_query.py @@ 
-1,11 +1,9 @@ -import asyncio -import time - import pytest -from tests import assert_eventually_async, assert_eventually, NAMESPACES -from upstash_vector import Index, AsyncIndex +from tests import NAMESPACES, assert_eventually, assert_eventually_async +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.errors import UpstashError +from upstash_vector.types import FusionAlgorithm, WeightingStrategy @pytest.mark.parametrize("ns", NAMESPACES) @@ -28,13 +26,12 @@ def assertion(): namespace=ns, ) - assert isinstance(result, list) - - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + with handle: + assert isinstance(result, list) - handle.stop() + assert len(result) > 0 + assert result[0].metadata is not None + assert result[0].vector is not None with pytest.raises(UpstashError): handle.fetch_next(1) @@ -42,7 +39,6 @@ def assertion(): with pytest.raises(UpstashError): handle.stop() - time.sleep(1) assert_eventually(assertion) @@ -64,12 +60,10 @@ def assertion(): namespace=ns, ) - assert len(result) == 1 - assert result[0].id == "id1" - - handle.stop() + with handle: + assert len(result) == 1 + assert result[0].id == "id1" - time.sleep(1) assert_eventually(assertion) @@ -94,16 +88,15 @@ async def assertion(): namespace=ns, ) - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None - - next_results = await handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + async with handle: + assert isinstance(result, list) + assert len(result) > 0 + assert result[0].metadata is not None + assert result[0].vector is not None - await handle.stop() + next_results = await handle.fetch_next(1) + assert isinstance(next_results, list) + assert len(next_results) == 1 with pytest.raises(UpstashError): await handle.fetch_next(1) @@ -111,7 +104,6 @@ async def assertion(): with 
pytest.raises(UpstashError): await handle.stop() - await asyncio.sleep(1) await assert_eventually_async(assertion) @@ -136,12 +128,10 @@ async def assertion(): namespace=ns, ) - assert len(result) == 1 - assert result[0].id == "id1" - - await handle.stop() + async with handle: + assert len(result) == 1 + assert result[0].id == "id1" - await asyncio.sleep(1) await assert_eventually_async(assertion) @@ -166,28 +156,26 @@ def assertion(): namespace=ns, ) - assert len(result) == 2 - assert result[0].id == "id1" - assert result[1].id == "id2" - - # Fetch next 2 results - next_results_1 = handle.fetch_next(2) - assert len(next_results_1) == 2 - assert next_results_1[0].id == "id3" - assert next_results_1[1].id == "id4" + with handle: + assert len(result) == 2 + assert result[0].id == "id1" + assert result[1].id == "id2" - # Fetch next 1 result - next_results_2 = handle.fetch_next(1) - assert len(next_results_2) == 1 - assert next_results_2[0].id == "id5" + # Fetch next 2 results + next_results_1 = handle.fetch_next(2) + assert len(next_results_1) == 2 + assert next_results_1[0].id == "id3" + assert next_results_1[1].id == "id4" - # Try to fetch more, should return empty list - next_results_3 = handle.fetch_next(1) - assert len(next_results_3) == 0 + # Fetch next 1 result + next_results_2 = handle.fetch_next(1) + assert len(next_results_2) == 1 + assert next_results_2[0].id == "id5" - handle.stop() + # Try to fetch more, should return empty list + next_results_3 = handle.fetch_next(1) + assert len(next_results_3) == 0 - time.sleep(1) assert_eventually(assertion) @@ -213,100 +201,183 @@ async def assertion(): namespace=ns, ) - assert len(result) == 2 - - next_results_1 = await handle.fetch_next(2) - assert len(next_results_1) == 2 + async with handle: + assert len(result) == 2 - next_results_2 = await handle.fetch_next(1) - assert len(next_results_2) == 1 + next_results_1 = await handle.fetch_next(2) + assert len(next_results_1) == 2 - await handle.stop() + next_results_2 
= await handle.fetch_next(1) + assert len(next_results_2) == 1 - await asyncio.sleep(1) await assert_eventually_async(assertion) @pytest.mark.parametrize("ns", NAMESPACES) -def test_resumable_query_context_manager(index: Index, ns: str): - index.upsert( +def test_resumable_query_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"field": "value1"}), - ("id2", [0.3, 0.4], {"field": "value2"}), - ("id3", [0.5, 0.6], {"field": "value3"}), + ("id0", ([0, 1], [0.3, 0.1])), + ("id1", ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), ], namespace=ns, ) def assertion(): - result, handle = index.resumable_query( - vector=[0.1, 0.2], + result, handle = sparse_index.resumable_query( + sparse_vector=([0], [0.1]), top_k=2, - include_metadata=True, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) with handle: - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - next_results = handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + assert_eventually(assertion) - # The query should be stopped automatically after exiting the context - with pytest.raises(UpstashError): - handle.fetch_next(1) +@pytest.mark.parametrize("ns", NAMESPACES) +def test_resumable_query_hybrid(hybrid_index: Index, ns: str): + 
hybrid_index.upsert( + vectors=[ + ("id0", [0.9, 0.9], ([0, 1], [0.3, 0.1])), + ("id1", [0.8, 0.9], ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.7, 0.9], ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) - with pytest.raises(UpstashError): - handle.stop() + def assertion(): + result, handle = hybrid_index.resumable_query( + vector=[0.1, 0.1], + sparse_vector=([0], [0.1]), + top_k=2, + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + with handle: + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].vector == [0.9, 0.9] + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].vector == [0.8, 0.9] + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].vector == [0.7, 0.9] + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - time.sleep(1) assert_eventually(assertion) @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) -async def test_resumable_query_async_context_manager(async_index: AsyncIndex, ns: str): - await async_index.upsert( +async def test_resumable_query_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"field": "value1"}), - ("id2", [0.3, 0.4], {"field": "value2"}), - ("id3", [0.5, 0.6], {"field": "value3"}), + ("id0", ([0, 1], [0.3, 0.1])), + ("id1", ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), ], namespace=ns, ) async def assertion(): - result, handle = await 
async_index.resumable_query( - vector=[0.1, 0.2], + result, handle = await async_sparse_index.resumable_query( + sparse_vector=([0], [0.1]), top_k=2, - include_metadata=True, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) async with handle: - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = await handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - next_results = await handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + await assert_eventually_async(assertion) - # The query should be stopped automatically after exiting the context - with pytest.raises(UpstashError): - await handle.fetch_next(1) +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_resumable_query_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.9, 0.9], ([0, 1], [0.3, 0.1])), + ("id1", [0.8, 0.9], ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.7, 0.9], ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) - with pytest.raises(UpstashError): - await handle.stop() + async def assertion(): + result, handle = await async_hybrid_index.resumable_query( + vector=[0.1, 0.1], + sparse_vector=([0], [0.1]), + top_k=2, + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + 
fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + async with handle: + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].vector == [0.9, 0.9] + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].vector == [0.8, 0.9] + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = await handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].vector == [0.7, 0.9] + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - await asyncio.sleep(1) await assert_eventually_async(assertion) diff --git a/tests/core/test_update.py b/tests/core/test_update.py index 2da0cae..3bc3a1c 100644 --- a/tests/core/test_update.py +++ b/tests/core/test_update.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.types import MetadataUpdateMode @@ -257,3 +257,123 @@ async def test_update_vector_data_async(async_index: AsyncIndex, ns: str): async def test_update_non_existing_id_async(async_index: AsyncIndex, ns: str): updated = await async_index.update("id-999", vector=[0.4, 0.5], namespace=ns) assert updated is False + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_update_sparse_vector(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + updated = sparse_index.update( + "id1", + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = sparse_index.fetch( + "id1", + include_vectors=True, + 
namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_update_hybrid_vector(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ], + namespace=ns, + ) + + updated = hybrid_index.update( + "id1", + vector=[0.5, 0.6], + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = hybrid_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].vector == [0.5, 0.6] + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_sparse_vector_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + updated = await async_sparse_index.update( + "id1", + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = await async_sparse_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_hybrid_vector_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ], + namespace=ns, + ) + + updated = await 
async_hybrid_index.update( + "id1", + vector=[0.5, 0.6], + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = await async_hybrid_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].vector == [0.5, 0.6] + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) diff --git a/tests/core/test_upsert.py b/tests/core/test_upsert.py index 5ab4389..34a439e 100644 --- a/tests/core/test_upsert.py +++ b/tests/core/test_upsert.py @@ -1,14 +1,13 @@ +import numpy as np +import pandas as pd import pytest from pytest import raises from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.errors import ClientError from upstash_vector.types import Data, Vector -import numpy as np -import pandas as pd - @pytest.mark.parametrize("ns", NAMESPACES) def test_upsert_tuple(index: Index, ns: str): @@ -766,3 +765,315 @@ async def test_invalid_payload_async(async_index: AsyncIndex, ns: str): ], namespace=ns, ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_upsert_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ( + "id0", + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + 
"sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = sparse_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.sparse_vector is not None + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_upsert_hybrid(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ( + "id0", + [0.1, 0.1], + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + np.array([0.5, 0.5]), + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + pd.array([0.1, 0.2]), + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + vector=[3.3, 1.0], + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + [1.1, 2.2], + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + [1.0, 5.0], + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "vector": [13.0, 0.5], + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "vector": np.array([4.2, 2.4]), + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "vector": [13, 5.4], + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + vector=np.array([0.9, 0.7]), + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + vector=[0.0, 1.0], + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = hybrid_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert 
r.id == f"id{i}" + assert r.vector is not None + assert len(r.vector) == 2 + assert r.sparse_vector is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_upsert_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ( + "id0", + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = await async_sparse_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.sparse_vector is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_upsert_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ( + "id0", + [0.1, 0.1], + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + np.array([0.5, 0.5]), + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + pd.array([0.1, 0.2]), + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + vector=[3.3, 1.0], + 
sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + [1.1, 2.2], + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + [1.0, 5.0], + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "vector": [13.0, 0.5], + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "vector": np.array([4.2, 2.4]), + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "vector": [13, 5.4], + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + vector=np.array([0.9, 0.7]), + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + vector=[0.0, 1.0], + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = await async_hybrid_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.vector is not None + assert len(r.vector) == 2 + assert r.sparse_vector is not None diff --git a/upstash_vector/__init__.py b/upstash_vector/__init__.py index 48cd11f..66b3e9b 100644 --- a/upstash_vector/__init__.py +++ b/upstash_vector/__init__.py @@ -1,6 +1,6 @@ __version__ = "0.6.0" -from upstash_vector.client import Index, AsyncIndex -from upstash_vector.types import Vector +from upstash_vector.client import AsyncIndex, Index +from upstash_vector.types import SparseVector, Vector -__all__ = ["Index", "AsyncIndex", "Vector"] +__all__ = ["Index", "AsyncIndex", "Vector", "SparseVector"] diff --git a/upstash_vector/client.py b/upstash_vector/client.py index aa25a8a..4187fd0 100644 --- a/upstash_vector/client.py +++ b/upstash_vector/client.py @@ -3,7 +3,7 @@ import httpx -from upstash_vector.core.index_operations import IndexOperations, AsyncIndexOperations +from 
upstash_vector.core.index_operations import AsyncIndexOperations, IndexOperations from upstash_vector.http import ( execute_with_parameters, execute_with_parameters_async, diff --git a/upstash_vector/core/index_operations.py b/upstash_vector/core/index_operations.py index a06f978..b1ffb11 100644 --- a/upstash_vector/core/index_operations.py +++ b/upstash_vector/core/index_operations.py @@ -1,33 +1,38 @@ from typing import ( - Sequence, - Union, - List, + Any, + Awaitable, + Callable, Dict, + List, Optional, - Any, + Sequence, Tuple, - Callable, - Awaitable, + Union, ) from upstash_vector.errors import ClientError from upstash_vector.types import ( Data, DeleteResult, + FetchResult, + FusionAlgorithm, + InfoResult, MetadataUpdateMode, QueryRequest, + QueryResult, RangeResult, - InfoResult, + SparseVector, SupportsToList, - FetchResult, - QueryResult, + TupleAsSparseVectorT, Vector, + WeightingStrategy, ) from upstash_vector.utils import ( - convert_query_requests_to_payload, - convert_to_list, - convert_to_vectors, - convert_to_payload, + query_requests_to_payload, + sequence_to_vectors, + to_list, + to_sparse_vector, + vectors_to_payload, ) DEFAULT_NAMESPACE = "" @@ -82,7 +87,7 @@ def upsert( vectors=[ ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), - ("id3", [0.3,0.4]), + ("id3", [0.3, 0.4]), ] ) ``` @@ -133,8 +138,8 @@ def upsert( ``` """ - vectors = convert_to_vectors(vectors) - payload, is_vector = convert_to_payload(vectors) + converted_vectors = sequence_to_vectors(vectors) + payload, is_vector = vectors_to_payload(converted_vectors) path = UPSERT_PATH if is_vector else UPSERT_DATA_PATH return self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -149,6 +154,9 @@ def query( data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + 
weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. @@ -163,6 +171,9 @@ def query( :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. Example usage: @@ -192,18 +203,38 @@ def query( "filter": filter, } - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." 
+ ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = QUERY_PATH return [ @@ -262,7 +293,7 @@ def query_many( single_result = self.query(**query, namespace=namespace) return [single_result] - has_vector_query, payload = convert_query_requests_to_payload(queries) + has_vector_query, payload = query_requests_to_payload(queries) path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH result = self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -282,6 +313,9 @@ def resumable_query( namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, max_idle: int = 3600, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> Tuple[List[QueryResult], "ResumableQueryHandle"]: """ Creates a resumable query. @@ -297,6 +331,9 @@ def resumable_query( :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting vectors will have their unstructured data or not. :param max_idle: Maximum idle time for the resumable query in seconds. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. :return: First batch of the results, along with a handle to fetch more or stop the query. 
@@ -305,7 +342,7 @@ def resumable_query( ```python result, handle = index.resumable_query( vector=[0.6, 0.9], - top_k=100, + top_k=10, include_vectors=False, include_metadata=True, ) @@ -315,13 +352,6 @@ def resumable_query( handle.stop() ``` """ - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - payload = { "topK": top_k, "includeVectors": include_vectors, @@ -331,11 +361,38 @@ def resumable_query( "maxIdle": max_idle, } + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value + if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The resumable query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = RESUMABLE_QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The resumable query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." 
+ ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = RESUMABLE_QUERY_PATH result = self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -478,11 +535,12 @@ def fetch( def update( self, id: str, - vector: Optional[List[float]] = None, + vector: Optional[Union[List[float], SupportsToList]] = None, data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, ) -> bool: """ Updates a vector value, data, or metadata for the given id. @@ -495,6 +553,7 @@ def update( :param metadata_update_mode: Whether to overwrite the whole it, or patch the metadata (insert new fields or update according to the `RFC 7396 JSON Merge Patch` algorithm. + :param sparse_vector: The sparse vector value to update to. 
Example usage: @@ -508,7 +567,14 @@ def update( } if vector is not None: - payload["vector"] = vector + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } if data is not None: payload["data"] = data @@ -577,7 +643,7 @@ async def upsert( vectors=[ ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), - ("id3", [0.3,0.4]), + ("id3", [0.3, 0.4]), ] ) ``` @@ -626,8 +692,8 @@ async def upsert( ) ``` """ - vectors = convert_to_vectors(vectors) - payload, is_vector = convert_to_payload(vectors) + converted_vectors = sequence_to_vectors(vectors) + payload, is_vector = vectors_to_payload(converted_vectors) path = UPSERT_PATH if is_vector else UPSERT_DATA_PATH return await self._execute_request_async( @@ -644,6 +710,9 @@ async def query( data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. @@ -658,6 +727,9 @@ async def query( :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. 
Example usage: @@ -687,18 +759,38 @@ async def query( "filter": filter, } - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." + ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = QUERY_PATH return [ @@ -757,7 +849,7 @@ async def query_many( single_result = await self.query(**query, namespace=namespace) return [single_result] - has_vector_query, payload = convert_query_requests_to_payload(queries) + has_vector_query, payload = query_requests_to_payload(queries) path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH result = await self._execute_request_async( payload=payload, path=_path_for(namespace, path) @@ -779,6 +871,9 @@ async def resumable_query( namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, max_idle: int = 3600, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> Tuple[List[QueryResult], 
"AsyncResumableQueryHandle"]: """ Creates a resumable query. @@ -794,6 +889,9 @@ async def resumable_query( :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting vectors will have their unstructured data or not. :param max_idle: Maximum idle time for the resumable query in seconds. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. :return: First batch of the results, along with a handle to fetch more or stop the query. @@ -802,7 +900,7 @@ async def resumable_query( ```python result, handle = await index.resumable_query( vector=[0.6, 0.9], - top_k=100, + top_k=10, include_vectors=False, include_metadata=True, ) @@ -812,13 +910,6 @@ async def resumable_query( await handle.stop() ``` """ - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - payload = { "topK": top_k, "includeVectors": include_vectors, @@ -828,11 +919,38 @@ async def resumable_query( "maxIdle": max_idle, } + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value + if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The resumable query should not have " + "`vector` or `sparse_vector` when it contains `data`." 
+ ) + payload["data"] = data path = RESUMABLE_QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The resumable query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." + ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = RESUMABLE_QUERY_PATH result = await self._execute_request_async( @@ -979,11 +1097,12 @@ async def fetch( async def update( self, id: str, - vector: Optional[List[float]] = None, + vector: Optional[Union[List[float], SupportsToList]] = None, data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, ) -> bool: """ Updates a vector value, data, or metadata for the given id. @@ -996,6 +1115,7 @@ async def update( :param metadata_update_mode: Whether to overwrite the whole it, or patch the metadata (insert new fields or update according to the `RFC 7396 JSON Merge Patch` algorithm. + :param sparse_vector: The sparse vector value to update to. 
Example usage: @@ -1009,7 +1129,14 @@ async def update( } if vector is not None: - payload["vector"] = vector + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } if data is not None: payload["data"] = data diff --git a/upstash_vector/http.py b/upstash_vector/http.py index 77dfc38..b7710c0 100644 --- a/upstash_vector/http.py +++ b/upstash_vector/http.py @@ -4,7 +4,7 @@ from platform import python_version from typing import Any, Dict -from httpx import Client, AsyncClient +from httpx import AsyncClient, Client from upstash_vector import __version__ from upstash_vector.errors import UpstashError diff --git a/upstash_vector/types.py b/upstash_vector/types.py index a022362..35653b6 100644 --- a/upstash_vector/types.py +++ b/upstash_vector/types.py @@ -1,19 +1,52 @@ import enum from dataclasses import dataclass -from typing import Optional, List, Dict, TypedDict, Union, Protocol +from typing import Dict, List, Optional, Protocol, Tuple, TypedDict, Union class SupportsToList(Protocol): - def tolist(self) -> List[float]: - ... + def tolist(self) -> List[float]: ... + + +@dataclass +class SparseVector: + indices: Union[List[int], SupportsToList] + """ + List of dimensions that have non-zero values. + + All the signed 32 bit integer range is valid + as the dimension indices. + """ + + values: Union[List[float], SupportsToList] + """ + Values of the non-zero dimensions. + + It must be of the same size as the `indices`. 
+ """ + + @classmethod + def _from_json(cls, obj: Optional[dict]) -> Optional["SparseVector"]: + if not obj: + return None + + return SparseVector( + obj["indices"], + obj["values"], + ) + + +TupleAsSparseVectorT = Tuple[ + Union[List[int], SupportsToList], Union[List[float], SupportsToList] +] @dataclass class Vector: id: Union[int, str] - vector: Union[List[float], SupportsToList] + vector: Optional[Union[List[float], SupportsToList]] = None metadata: Optional[Dict] = None data: Optional[str] = None + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None @dataclass @@ -29,6 +62,7 @@ class FetchResult: vector: Optional[List[float]] = None metadata: Optional[Dict] = None data: Optional[str] = None + sparse_vector: Optional[SparseVector] = None @classmethod def _from_json(cls, obj: dict) -> "FetchResult": @@ -37,6 +71,7 @@ def _from_json(cls, obj: dict) -> "FetchResult": vector=obj.get("vector"), metadata=obj.get("metadata"), data=obj.get("data"), + sparse_vector=SparseVector._from_json(obj.get("sparseVector")), ) @@ -47,6 +82,7 @@ class QueryResult: vector: Optional[List[float]] = None metadata: Optional[Dict] = None data: Optional[str] = None + sparse_vector: Optional[SparseVector] = None @classmethod def _from_json(cls, obj: dict) -> "QueryResult": @@ -56,6 +92,7 @@ def _from_json(cls, obj: dict) -> "QueryResult": vector=obj.get("vector"), metadata=obj.get("metadata"), data=obj.get("data"), + sparse_vector=SparseVector._from_json(obj.get("sparseVector")), ) @@ -132,19 +169,106 @@ class MetadataUpdateMode(enum.Enum): """Patch the metadata according to Merge Patch algorithm.""" +class WeightingStrategy(enum.Enum): + """ + For sparse vectors, what kind of weighting strategy + should be used while querying the matching non-zero + dimension values of the query vector with the documents. + + If not provided, no weighting will be used. + """ + + IDF = "IDF" + """ + Inverse document frequency. 
+ + It is recommended to use this weighting strategy for + BM25 sparse embedding models. + + It is calculated as + + ln(((N - n(q) + 0.5) / (n(q) + 0.5)) + 1) where + N: Total number of sparse vectors. + n(q): Total number of sparse vectors having non-zero value + for that particular dimension. + ln: Natural logarithm + + The values of N and n(q) are maintained by Upstash as the + vectors are indexed. + """ + + +class FusionAlgorithm(enum.Enum): + """ + Fusion algorithm to use while fusing scores + from dense and sparse components of a hybrid index. + + If not provided, defaults to `RRF`. + """ + + RRF = "RRF" + """ + Reciprocal rank fusion. + + Each sorted score from the dense and sparse indexes are + mapped to 1 / (rank + K), where rank is the order of the + score in the dense or sparse scores and K is a constant + with the value of 60. + + Then, scores from the dense and sparse components are + deduplicated (i.e. if a score for the same vector is present + in both dense and sparse scores, the mapped scores are + added; otherwise individual mapped scores are used) + and the final result is returned as the topK values + of this final list. + + In short, this algorithm just takes the order of the scores + into consideration. + """ + + DBSF = "DBSF" + """ + Distribution based score fusion. + + Each sorted score from the dense and sparse indexes are + normalized as + (s - (mean - 3 * stddev)) / ((mean + 3 * stddev) - (mean - 3 * stddev)) + where s is the score, (mean - 3 * stddev) is the minimum, + and (mean + 3 * stddev) is the maximum tail ends of the distribution. + + Then, scores from the dense and sparse components are + deduplicated (i.e. if a score for the same vector is present + in both dense and sparse scores, the normalized scores are + added; otherwise individual normalized scores are used) + and the final result is returned as the topK values + of this final list. 
+ + In short, this algorithm takes distribution of the scores + into consideration as well, as opposed to the `RRF`. + """ + + class QueryRequest(TypedDict, total=False): vector: Union[List[float], SupportsToList] """ The vector value to query. - Only and only one of `vector` or `data` fields must be provided. + It must be provided only for dense and hybrid indexes. + """ + + sparse_vector: Union[SparseVector, TupleAsSparseVectorT] + """ + The sparse vector value to query. + + It must be provided only for sparse or hybrid indexes. """ data: str """ - Data to query for (after embedding it to a vector). + Data to query for (after embedding it to a vector/sparse vector). - Only and only one of `vector` or `data` fields must be provided. + It must be provided only for indexes created with Upstash hosted models. + When provided, `vector` or `sparse_vector` fields should not be set. """ top_k: int @@ -181,3 +305,22 @@ class QueryRequest(TypedDict, total=False): When not specified, defaults to `""`(no filter). """ + + weighting_strategy: WeightingStrategy + """ + Weighting strategy to be used for sparse vectors. + + It must be provided only for sparse and hybrid indexes. + + When not specified, defaults to no extra weighting (i.e. 1.0). + """ + + fusion_algorithm: FusionAlgorithm + """ + Fusion algorithm to use while fusing scores + from dense and sparse components of a hybrid index. + + It must be provided only for hybrid indexes. + + When not specified, defaults to `RRF`. 
+ """ diff --git a/upstash_vector/utils.py b/upstash_vector/utils.py index ce718fc..50bbf2c 100644 --- a/upstash_vector/utils.py +++ b/upstash_vector/utils.py @@ -1,71 +1,204 @@ -from typing import List, Union, Dict, Any, Optional, Tuple +from typing import Any, Dict, List, Sequence, Tuple, Union from upstash_vector.errors import ClientError -from upstash_vector.types import Data, QueryRequest, Vector +from upstash_vector.types import ( + Data, + QueryRequest, + SparseVector, + SupportsToList, + TupleAsSparseVectorT, + Vector, +) -def convert_to_list(obj): - if isinstance(obj, list): - return obj - elif hasattr(obj, "tolist") and callable(getattr(obj, "tolist")): - return obj.tolist() +def sequence_to_vectors( + vectors: Sequence[Union[dict, tuple, Vector, Data]], +) -> List[Union[Vector, Data]]: + return [_parse_vector(vector) for vector in vectors] + + +def _parse_vector(vector: Union[dict, tuple, Vector, Data]) -> Union[Vector, Data]: + if isinstance(vector, Vector): + dense = vector.vector + if dense is not None: + vector.vector = to_list(dense) + + sparse = vector.sparse_vector + if sparse is not None: + vector.sparse_vector = to_sparse_vector(sparse) + + return vector + elif isinstance(vector, Data): + return vector + elif isinstance(vector, tuple): + return _parse_vector_from_tuple(vector) + elif isinstance(vector, dict): + return _parse_vector_from_dict(vector) + else: + raise ClientError( + f"Given object type is undefined for converting to vector: {vector}" + ) + + +def to_list(maybe_list: Union[list, SupportsToList]) -> list: + if isinstance(maybe_list, list): + return maybe_list + elif hasattr(maybe_list, "tolist") and callable(getattr(maybe_list, "tolist")): + return maybe_list.tolist() raise TypeError( - f"Expected a list or something can be converted to a list(like numpy or pandas array) but got {type(obj)}" + f"Expected a list or something can be converted to a " + f"list(like numpy or pandas array) but got {type(maybe_list)}" ) -def 
_get_payload_element( - id: Union[int, str], - payload: Union[str, List[float]], - metadata: Optional[Dict[str, Any]] = None, - data: Optional[str] = None, -) -> Union[Vector, Data]: - if isinstance(payload, str): - return Data(id=id, data=payload, metadata=metadata) +def to_sparse_vector( + maybe_sparse_vector: Union[SparseVector, TupleAsSparseVectorT], +) -> SparseVector: + if isinstance(maybe_sparse_vector, SparseVector): + maybe_sparse_vector.indices = to_list(maybe_sparse_vector.indices) + maybe_sparse_vector.values = to_list(maybe_sparse_vector.values) + return maybe_sparse_vector + elif isinstance(maybe_sparse_vector, tuple): + if len(maybe_sparse_vector) != 2: + raise ClientError( + "The tuple for sparse vector should contain two lists; " + "one for indices, and one for values." + ) - return Vector(id=id, vector=convert_to_list(payload), metadata=metadata, data=data) + sparse = SparseVector( + to_list(maybe_sparse_vector[0]), to_list(maybe_sparse_vector[1]) + ) + return sparse + else: + raise ClientError("`sparse_vector` must be a `SparseVector` or `tuple`.") -def _get_payload_element_from_dict( - id: Union[int, str], - vector: Optional[List[float]] = None, - data: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, -) -> Union[Vector, Data]: - if vector is None and data is None: +def _parse_vector_from_tuple(t: tuple) -> Union[Vector, Data]: + if len(t) < 2: raise ClientError( - "Vector dict must have one of `vector` or `data` fields defined." + "The tuple must contain at least two elements; " + "one for id, and other for vector or sparse vector." 
) - if vector is None: - # data cannot be none at this point - return Data(id=id, data=data, metadata=metadata) # type:ignore[arg-type] + id = t[0] + if isinstance(t[1], str): + return _parse_data_from_tuple(t, id) - return Vector(id=id, vector=convert_to_list(vector), metadata=metadata, data=data) + if isinstance(t[1], (SparseVector, tuple)): + return _parse_sparse_vector_from_tuple(t, id) + dense = to_list(t[1]) + if len(t) > 2 and isinstance(t[2], (SparseVector, tuple)): + return _parse_hybrid_vector_from_tuple(t, id, dense) -def _tuple_or_dict_to_vectors(vector) -> Union[Vector, Data]: - if isinstance(vector, Vector): - vector.vector = convert_to_list(vector.vector) - return vector - elif isinstance(vector, Data): - return vector - elif isinstance(vector, tuple): - return _get_payload_element(*vector) - elif isinstance(vector, dict): - return _get_payload_element_from_dict(**vector) + return _parse_dense_vector_from_tuple(t, id, dense) + + +def _parse_data_from_tuple(t: tuple, id: str) -> Data: + data = t[1] + if len(t) > 2: + metadata = t[2] + else: + metadata = None + + return Data(id=id, data=data, metadata=metadata) + + +def _parse_sparse_vector_from_tuple(t: tuple, id: str) -> Vector: + sparse = to_sparse_vector(t[1]) + + if len(t) > 2: + metadata = t[2] + if len(t) > 3: + data = t[3] + else: + data = None + else: + metadata = None + data = None + + return Vector( + id=id, + sparse_vector=sparse, + metadata=metadata, + data=data, + ) + + +def _parse_hybrid_vector_from_tuple(t: tuple, id: str, dense: List[float]) -> Vector: + sparse = to_sparse_vector(t[2]) + + if len(t) > 3: + metadata = t[3] + if len(t) > 4: + data = t[4] + else: + data = None + else: + metadata = None + data = None + + return Vector( + id=id, + vector=dense, + sparse_vector=sparse, + metadata=metadata, + data=data, + ) + + +def _parse_dense_vector_from_tuple(t: tuple, id: str, dense: List[float]) -> Vector: + if len(t) > 2: + metadata = t[2] + if len(t) > 3: + data = t[3] + else: + data 
def _parse_vector_from_dict(d: dict) -> Union[Vector, Data]:
    """Convert an upsert dict into a ``Vector`` or ``Data``.

    A dict with neither ``vector`` nor ``sparse_vector`` must carry ``data``
    and becomes a ``Data`` item; otherwise the dense/sparse payloads are
    normalized to lists and a ``Vector`` is returned.

    Raises:
        KeyError: If the required ``id`` key is absent.
        ClientError: If neither vectors nor ``data`` are present.
    """
    vector_id = d["id"]
    dense = d.get("vector")
    sparse = d.get("sparse_vector")
    data = d.get("data")
    metadata = d.get("metadata")

    if dense is None and sparse is None:
        if data is None:
            raise ClientError(
                "The dict for vector should contain `vector` "
                "and/or `sparse_vector` when it does not contain `data`."
            )
        return Data(id=vector_id, data=data, metadata=metadata)

    dense_list = to_list(dense) if dense is not None else None
    sparse_vec = to_sparse_vector(sparse) if sparse is not None else None

    return Vector(
        id=vector_id,
        vector=dense_list,
        sparse_vector=sparse_vec,
        metadata=metadata,
        data=data,
    )
+ ) - if is_vector: - payload.append( - { - "id": vector.id, - "vector": vector.vector, # type: ignore[union-attr] - "metadata": vector.metadata, - "data": vector.data, - } - ) + payload.append(_vector_to_payload(vector)) else: + if expecting_vectors: + raise ClientError( + "All items should either have the `data` or the `vector` and/or `sparse_vector` field." + " Received items from both kinds. Please send them separately." + ) + payload.append( { "id": vector.id, @@ -107,50 +238,87 @@ def convert_to_payload( return payload, expecting_vectors -def convert_query_requests_to_payload( +def _vector_to_payload(vector: Vector) -> Dict[str, Any]: + if vector.sparse_vector is not None: + sparse = { + "indices": vector.sparse_vector.indices, # type: ignore[union-attr] + "values": vector.sparse_vector.values, # type: ignore[union-attr] + } + else: + sparse = None + + return { + "id": vector.id, + "vector": vector.vector, + "sparseVector": sparse, + "metadata": vector.metadata, + "data": vector.data, + } + + +def query_requests_to_payload( queries: List[QueryRequest], ) -> Tuple[bool, List[Dict[str, Any]]]: - has_vector_query = False has_data_query = False - payloads = [] for query in queries: - payload = { - "topK": query.get("top_k", 10), - "includeVectors": query.get("include_vectors", False), - "includeMetadata": query.get("include_metadata", False), - "includeData": query.get("include_data", False), - "filter": query.get("filter", ""), - } + payload: Dict[str, Any] = {} + + if "top_k" in query: + payload["topK"] = query["top_k"] + + if "include_vectors" in query: + payload["includeVectors"] = query["include_vectors"] + + if "include_metadata" in query: + payload["includeMetadata"] = query["include_metadata"] + + if "include_data" in query: + payload["includeData"] = query["include_data"] + + if "filter" in query: + payload["filter"] = query["filter"] + + if "weighting_strategy" in query: + payload["weightingStrategy"] = query["weighting_strategy"].value + + if 
"fusion_algorithm" in query: + payload["fusionAlgorithm"] = query["fusion_algorithm"].value vector = query.get("vector") + sparse_vector = query.get("sparse_vector") data = query.get("data") - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - if data is not None: - if has_vector_query: + if vector is not None or sparse_vector is not None: raise ClientError( - "`data` and `vector` queries cannot be mixed in the same batch." + "When the query contains `data`, it can't contain `vector` or `sparse_vector`." ) has_data_query = True payload["data"] = data else: + if vector is None and sparse_vector is None: + raise ClientError( + "Query must contain at least one of `vector`, `sparse_vector`, or `data`." + ) + if has_data_query: raise ClientError( - "`data` and `vector` queries cannot be mixed in the same batch." + "`data` and `vector`/`sparse_vector` queries cannot be mixed in the same batch." ) - has_vector_query = True - payload["vector"] = convert_to_list(vector) + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } payloads.append(payload) - return has_vector_query, payloads + return not has_data_query, payloads