From 728eebf3fb96455b22568f28261bf22e64b220b1 Mon Sep 17 00:00:00 2001
From: Metin Dumandag <29387993+mdumandag@users.noreply.github.com>
Date: Fri, 6 Dec 2024 18:20:23 +0300
Subject: [PATCH] VEC-224: Add support for sparse and hybrid indexes

Added support for sparse and hybrid vectors to the upsert, query, resumable
query, fetch, update, and range APIs. Also added new query options for sparse
and hybrid indexes, namely the weighting strategy and the fusion algorithm.
---
 .github/workflows/release.yaml          |   4 +-
 .github/workflows/test.yaml             |   9 +-
 README.md                               | 259 ++++++----
 pyproject.toml                          |  18 +-
 tests/__init__.py                       |  18 +-
 tests/conftest.py                       |  56 ++-
 tests/core/test_delete.py               |   2 +-
 tests/core/test_fetch.py                | 170 ++++++-
 tests/core/test_info.py                 |   4 +-
 tests/core/test_namespace_operations.py |   2 +-
 tests/core/test_query.py                | 634 +++++++++++++++++++++++-
 tests/core/test_range.py                | 226 ++++++++-
 tests/core/test_reset.py                |   3 +-
 tests/core/test_resumable_query.py      | 271 ++++++----
 tests/core/test_update.py               | 122 ++++-
 tests/core/test_upsert.py               | 319 +++++++++++-
 upstash_vector/__init__.py              |   6 +-
 upstash_vector/client.py                |   2 +-
 upstash_vector/core/index_operations.py | 241 ++++++---
 upstash_vector/http.py                  |   2 +-
 upstash_vector/types.py                 | 157 +++++-
 upstash_vector/utils.py                 | 330 +++++++++---
 22 files changed, 2482 insertions(+), 373 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 1caa1d1..b54be3b 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -8,10 +8,10 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: 3.8

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9a1ce9a..9156225 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -12,10 +12,10 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: 3.8

@@ -40,6 +40,9 @@ jobs:
           export TOKEN="${{secrets.TOKEN}}"
           export EMBEDDING_URL="${{secrets.EMBEDDING_URL}}"
           export EMBEDDING_TOKEN="${{secrets.EMBEDDING_TOKEN}}"
-          poetry add --dev pytest
+          export SPARSE_URL="${{secrets.SPARSE_URL}}"
+          export SPARSE_TOKEN="${{secrets.SPARSE_TOKEN}}"
+          export HYBRID_URL="${{secrets.HYBRID_URL}}"
+          export HYBRID_TOKEN="${{secrets.HYBRID_TOKEN}}"
           poetry install
           poetry run pytest
diff --git a/README.md b/README.md
index 6d33e5a..afbd533 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,28 @@
 # Upstash Vector Python SDK
+
 The Upstash Vector Python client

 > [!NOTE]
 > **This project is in GA Stage.**
 >
-> The Upstash Professional Support fully covers this project. It receives regular updates, and bug fixes. 
+> This project is fully covered by Upstash Professional Support. It receives regular updates and bug fixes.
 > The Upstash team is committed to maintaining and improving its functionality.

 ## Installation

 Install a released version from pip:
+
 ```shell
 pip3 install upstash-vector
 ```

 ## Usage
+
 In order to use this client, head over to the [Upstash Console](https://console.upstash.com) and create a vector database.
 There, get the `UPSTASH_VECTOR_REST_URL` and the `UPSTASH_VECTOR_REST_TOKEN` from the dashboard.
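In addition to passing these two values directly, as shown in the next section, they can be read from the
environment. A minimal sketch (the positional URL/token form mirrors the test fixtures in this patch; the
environment variable names are the ones shown on the dashboard):

```python
import os

from upstash_vector import Index

# Assumes the two values from the dashboard were exported as
# UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN.
index = Index(
    os.environ["UPSTASH_VECTOR_REST_URL"],
    os.environ["UPSTASH_VECTOR_REST_TOKEN"],
)
```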
### Initializing the Index
+
 ```python
 from upstash_vector import Index
@@ -46,11 +50,10 @@ to be later queried or fetched. There are a couple of ways of doing upserts:

 ```python
-# as tuples, either of the form:
-# - (id, vector, metadata, data)
-# - (id, vector, metadata)
-# - (id, vector)
-
+# - dense indexes
+# - (id, vector, metadata, data)
+# - (id, vector, metadata)
+# - (id, vector)
 index.upsert(
     vectors=[
         ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"),
@@ -58,15 +61,38 @@ index.upsert(
         ("id3", [0.3, 0.4]),
     ]
 )
+
+# - sparse indexes
+# - (id, sparse_vector, metadata, data)
+# - (id, sparse_vector, metadata)
+# - (id, sparse_vector)
+index.upsert(
+    vectors=[
+        ("id1", ([0, 1], [0.1, 0.2]), {"metadata_field": "metadata_value"}, "data-value"),
+        ("id2", ([1, 2], [0.2, 0.2]), {"metadata_field": "metadata_value"}),
+        ("id3", ([2, 3, 4], [0.3, 0.4, 0.5])),
+    ]
+)
+
+# - hybrid indexes
+# - (id, vector, sparse_vector, metadata, data)
+# - (id, vector, sparse_vector, metadata)
+# - (id, vector, sparse_vector)
+index.upsert(
+    vectors=[
+        ("id1", [0.1, 0.2], ([0, 1], [0.1, 0.2]), {"metadata_field": "metadata_value"}, "data-value"),
+        ("id2", [0.2, 0.2], ([1, 2], [0.2, 0.2]), {"metadata_field": "metadata_value"}),
+        ("id3", [0.3, 0.4], ([2, 3, 4], [0.3, 0.4, 0.5])),
+    ]
+)
 ```

 ```python
-# as dicts, either of the form:
-# - {"id": id, "vector": vector, "metadata": metadata, "data": data)
-# - {"id": id, "vector": vector, "metadata": metadata)
-# - {"id": id, "vector": vector, "data": data)
-# - {"id": id, "vector": vector}
-
+# - dense indexes
+# - {"id": id, "vector": vector, "metadata": metadata, "data": data}
+# - {"id": id, "vector": vector, "metadata": metadata}
+# - {"id": id, "vector": vector, "data": data}
+# - {"id": id, "vector": vector}
 index.upsert(
     vectors=[
         {"id": "id4", "vector": [0.1, 0.2], "metadata": {"field": "value"}, "data": "value"},
@@ -75,18 +101,68 @@ index.upsert(
         {"id": "id7", "vector": [0.5, 0.6]},
     ]
 )
+
+# - sparse indexes
+# - {"id": id, "sparse_vector": sparse_vector, "metadata": metadata, "data": data}
+# - {"id": id, "sparse_vector": sparse_vector, "metadata": metadata}
+# - {"id": id, "sparse_vector": sparse_vector, "data": data}
+# - {"id": id, "sparse_vector": sparse_vector}
+index.upsert(
+    vectors=[
+        {"id": "id4", "sparse_vector": ([0, 1], [0.1, 0.2]), "metadata": {"field": "value"}, "data": "value"},
+        {"id": "id5", "sparse_vector": ([1, 2], [0.2, 0.2]), "metadata": {"field": "value"}},
+        {"id": "id6", "sparse_vector": ([2, 3, 4], [0.3, 0.4, 0.5]), "data": "value"},
+        {"id": "id7", "sparse_vector": ([4], [0.3])},
+    ]
+)
+
+# - hybrid indexes
+# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "metadata": metadata, "data": data}
+# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "metadata": metadata}
+# - {"id": id, "vector": vector, "sparse_vector": sparse_vector, "data": data}
+# - {"id": id, "vector": vector, "sparse_vector": sparse_vector}
+index.upsert(
+    vectors=[
+        {"id": "id4", "vector": [0.1, 0.2], "sparse_vector": ([0], [0.1]), "metadata": {"field": "value"},
+         "data": "value"},
+        {"id": "id5", "vector": [0.1, 0.2], "sparse_vector": ([1, 2], [0.2, 0.2]), "metadata": {"field": "value"}},
+        {"id": "id6", "vector": [0.1, 0.2], "sparse_vector": ([2, 3, 4], [0.3, 0.4, 0.5]), "data": "value"},
+        {"id": "id7", "vector": [0.5, 0.6], "sparse_vector": ([4], [0.3])},
+    ]
+)
 ```
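A sparse vector is given as a pair of equal-length lists: the first holds the indices of the non-zero
dimensions, and the second holds the values at those indices. A clarifying sketch of the tuple form used
above (the id and values are hypothetical):

```python
# Value 0.5 at dimension 3 and value 0.9 at dimension 7; every other
# dimension is implicitly zero.
index.upsert(
    vectors=[
        ("id-sparse-example", ([3, 7], [0.5, 0.9])),
    ]
)
```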
 ```python
-from upstash_vector import Vector
+from upstash_vector import Vector, SparseVector
+
+# dense indexes
+index.upsert(
+    vectors=[
+        Vector(id="id5", vector=[1, 2], metadata={"field": "value"}, data="value"),
+        Vector(id="id6", vector=[1, 2], metadata={"field": "value"}),
+        Vector(id="id7", vector=[1, 2], data="value"),
+        Vector(id="id8", vector=[6, 7]),
+    ]
+)

-# as Vector objects
+# sparse indexes
+index.upsert(
+    vectors=[
+        Vector(id="id5", sparse_vector=SparseVector([1], [0.1]), metadata={"field": "value"}, data="value"),
+        Vector(id="id6", sparse_vector=SparseVector([1, 2], [0.1, 0.2]), metadata={"field": "value"}),
+        Vector(id="id7", sparse_vector=SparseVector([3, 5], [0.3, 0.3]), data="value"),
+        Vector(id="id8", sparse_vector=SparseVector([4], [0.2])),
+    ]
+)
+
+# hybrid indexes
 index.upsert(
     vectors=[
-        Vector(id="id5", vector=[1, 2], metadata={"field": "value"}),
-        Vector(id="id6", vector=[1, 2], data="value"),
-        Vector(id="id7", vector=[6, 7]),
+        Vector(id="id5", vector=[1, 2], sparse_vector=SparseVector([1], [0.1]), metadata={"field": "value"},
+               data="value"),
+        Vector(id="id6", vector=[1, 2], sparse_vector=SparseVector([1, 2], [0.1, 0.2]), metadata={"field": "value"}),
+        Vector(id="id7", vector=[1, 2], sparse_vector=SparseVector([3, 5], [0.3, 0.3]), data="value"),
+        Vector(id="id8", vector=[6, 7], sparse_vector=SparseVector([4], [0.2])),
     ]
 )
 ```
@@ -113,7 +189,7 @@ When no namespace is provided, the default namespace is used.
 index.upsert(
     vectors=[
         ("id1", [0.1, 0.2]),
-        ("id2", [0.3,0.4]),
+        ("id2", [0.3, 0.4]),
     ],
     namespace="ns",
 )
@@ -126,7 +202,8 @@ query vector can be requested from a namespace of an index.

 ```python
 res = index.query(
-    vector=[0.6, 0.9],
+    vector=[0.6, 0.9],  # for dense and hybrid indexes
+    sparse_vector=([0, 1], [0.1, 0.1]),  # for sparse and hybrid indexes
     top_k=5,
     include_vectors=False,
     include_metadata=True,
@@ -137,11 +214,12 @@ res = index.query(
 # List of query results, sorted in the descending order of similarity
 for r in res:
     print(
-        r.id,  # The id used while upserting the vector
-        r.score,  # The similarity score of this vector to the query vector. Higher is more similar.
-        r.vector,  # The value of the vector, if requested.
-        r.metadata,  # The metadata of the vector, if requested and present.
-        r.data,  # The data of the vector, if requested and present.
+        r.id,  # The id used while upserting the vector
+        r.score,  # The similarity score of this vector to the query vector. Higher is more similar.
+        r.vector,  # The value of the vector, if requested (for dense and hybrid indexes).
+        r.sparse_vector,  # The value of the sparse vector, if requested (for sparse and hybrid indexes).
+        r.metadata,  # The metadata of the vector, if requested and present.
+        r.data,  # The data of the vector, if requested and present.
     )
 ```
@@ -160,15 +238,15 @@ res = index.query(

 When a filter is provided, query results are further narrowed down based on the
 vectors whose metadata matches with it.

-See [Metadata Filtering](https://upstash.com/docs/vector/features/filtering) documentation
-for more information regarding the filter syntax. 
+See the [Metadata Filtering](https://upstash.com/docs/vector/features/filtering) documentation
+for more information regarding the filter syntax.

-Also, a namespace can be specified to query from. 
-When no namespace is provided, the default namespace is used. 
+Also, a namespace can be specified to query from.
+When no namespace is provided, the default namespace is used.

 ```python
 res = index.query(
-    vector=[0.6, 0.9], 
+    vector=[0.6, 0.9],
     top_k=5,
     namespace="ns",
 )
 ```
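This release also adds per-query options for sparse and hybrid indexes: a weighting strategy applied to the
sparse part of the query and, for hybrid indexes, a fusion algorithm used to combine the dense and sparse
scores. A minimal sketch, using only the options exercised by the tests in this patch:

```python
from upstash_vector.types import FusionAlgorithm, WeightingStrategy

res = index.query(
    vector=[0.6, 0.9],
    sparse_vector=([0, 1], [0.1, 0.1]),
    top_k=5,
    weighting_strategy=WeightingStrategy.IDF,  # weigh sparse values by inverse document frequency
    fusion_algorithm=FusionAlgorithm.DBSF,  # FusionAlgorithm.RRF is the other tested option
)
```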
### Fetching Vectors

A set of vectors can be fetched from a namespace of an index.

```python
res = index.fetch(
-    ids=["id3", "id4"], 
-    include_vectors=False, 
+    ids=["id3", "id4"],
+    include_vectors=False,
     include_metadata=True,
     include_data=True,
 )

 # List of fetch results, one for each id passed
 for r in res:
-    if not r:  # Can be None, if there is no such vector with the given id
+    if not r:  # Can be None, if there is no such vector with the given id
         continue
-    
+
     print(
-        r.id,  # The id used while upserting the vector
-        r.vector,  # The value of the vector, if requested.
-        r.metadata,  # The metadata of the vector, if requested and present.
-        r.data,  # The metadata of the vector, if requested and present.
+        r.id,  # The id used while upserting the vector
+        r.vector,  # The value of the vector, if requested (for dense and hybrid indexes).
+        r.sparse_vector,  # The value of the sparse vector, if requested (for sparse and hybrid indexes).
+        r.metadata,  # The metadata of the vector, if requested and present.
+        r.data,  # The data of the vector, if requested and present.
     )
 ```

or, for singular fetch:

```python
res = index.fetch(
-    "id1", 
-    include_vectors=True, 
+    "id1",
+    include_vectors=True,
     include_metadata=True,
     include_data=False,
 )

 r = res[0]
-if r:  # Can be None, if there is no such vector with the given id
+if r:  # Can be None, if there is no such vector with the given id
     print(
-        r.id,  # The id used while upserting the vector
-        r.vector,  # The value of the vector, if requested.
-        r.metadata,  # The metadata of the vector, if requested and present.
-        r.data,  # The metadata of the vector, if requested and present.
+        r.id,  # The id used while upserting the vector
+        r.vector,  # The value of the vector, if requested (for dense and hybrid indexes).
+        r.sparse_vector,  # The value of the sparse vector, if requested (for sparse and hybrid indexes).
+        r.metadata,  # The metadata of the vector, if requested and present.
+        r.data,  # The data of the vector, if requested and present.
     )
 ```

-Also, a namespace can be specified to fetch from. 
+Also, a namespace can be specified to fetch from.
 When no namespace is provided, the default namespace is used.

 ```python
 res = index.fetch(
-    ids=["id3", "id4"], 
+    ids=["id3", "id4"],
     namespace="ns",
 )
 ```
@@ -237,37 +317,38 @@ in a page by page fashion.

 ```python
 # Scans the vectors 100 vectors at a time,
 res = index.range(
-    cursor="",  # Start the scan from the beginning 
-    limit=100, 
-    include_vectors=False, 
+    cursor="",  # Start the scan from the beginning
+    limit=100,
+    include_vectors=False,
     include_metadata=True,
     include_data=True,
 )

 while res.next_cursor != "":
     res = index.range(
-        cursor=res.next_cursor, 
-        limit=100, 
-        include_vectors=False, 
+        cursor=res.next_cursor,
+        limit=100,
+        include_vectors=False,
         include_metadata=True,
         include_data=True,
     )
-    
+
     for v in res.vectors:
         print(
-            v.id,  # The id used while upserting the vector
-            v.vector,  # The value of the vector, if requested.
-            v.metadata,  # The metadata of the vector, if requested and present.
-            v.data,  # The data of the vector, if requested and present.
+            v.id,  # The id used while upserting the vector
+            v.vector,  # The value of the vector, if requested (for dense and hybrid indexes).
+            v.sparse_vector,  # The value of the sparse vector, if requested (for sparse and hybrid indexes).
+            v.metadata,  # The metadata of the vector, if requested and present.
+            v.data,  # The data of the vector, if requested and present.
         )
 ```

-Also, a namespace can be specified to range from. 
+Also, a namespace can be specified to range from.
 When no namespace is provided, the default namespace is used.

 ```python
 res = index.range(
-    cursor="", 
+    cursor="",
     limit=100,
     namespace="ns",
 )
 ```
@@ -284,7 +365,7 @@ res = index.delete(
 )

 print(
-    res.deleted,  # How many vectors are deleted out of the given ids. 
+    res.deleted,  # How many vectors are deleted out of the given ids.
 )
@@ -295,10 +376,10 @@ res = index.delete(
     "id1",
 )

-print(res) # A boolean indicating whether the vector is deleted or not. 
+print(res)  # A boolean indicating whether the vector is deleted or not.
 ```

-Also, a namespace can be specified to delete from. 
+Also, a namespace can be specified to delete from.
 When no namespace is provided, the default namespace is used.

 ```python
@@ -310,24 +391,23 @@ res = index.delete(

 ### Update a Vector

-Either the vector value(or data for indexes created with an embedding model) or the metadata 
-can be updated without needing to set the other one. 
+Any combination of vector value, sparse vector value, data, or metadata can be updated.

 ```python
 res = index.update(
-    "id1", 
+    "id1",
     metadata={"new_field": "new_value"},
 )

-print(res) # A boolean indicating whether the vector is updated or not. 
+print(res)  # A boolean indicating whether the vector is updated or not.
 ```

-Also, a namespace can be specified to update from. 
+Also, a namespace can be specified to update from.
 When no namespace is provided, the default namespace is used.

 ```python
 res = index.update(
-    "id1", 
+    "id1",
     metadata={"new_field": "new_value"},
     namespace="ns",
 )
 ```
@@ -341,7 +421,7 @@ All vectors can be removed from a namespace of an index.
 index.reset()
 ```

-Also, a namespace can be specified to reset. 
+Also, a namespace can be specified to reset.
 When no namespace is provided, the default namespace is used.

 ```python
@@ -367,18 +447,18 @@ This information also contains per-namespace status.
 ```python
 info = index.info()
 print(
-    info.vector_count,  # Total number of vectors across all namespaces 
-    info.pending_vector_count,  # Total number of vectors waiting to be indexed across all namespaces 
-    info.index_size,  # Total size of the index on disk in bytes 
-    info.dimension,  # Vector dimension 
-    info.similarity_function,  # Similarity function used 
+    info.vector_count,  # Total number of vectors across all namespaces
+    info.pending_vector_count,  # Total number of vectors waiting to be indexed across all namespaces
+    info.index_size,  # Total size of the index on disk in bytes
+    info.dimension,  # Vector dimension
+    info.similarity_function,  # Similarity function used
 )

 for ns, ns_info in info.namespaces.items():
     print(
-        ns,  # Name of the namespace
-        ns_info.vector_count,  # Total number of vectors in this namespaces
-        ns_info.pending_vector_count,  # Total number of vectors waiting to be indexed in this namespaces
+        ns,  # Name of the namespace
+        ns_info.vector_count,  # Total number of vectors in this namespace
+        ns_info.pending_vector_count,  # Total number of vectors waiting to be indexed in this namespace
     )
 ```
@@ -389,12 +469,12 @@ All the names of active namespaces can be listed.
 ```python
 namespaces = index.list_namespaces()
 for ns in namespaces:
-    print(ns) # name of the namespace
+    print(ns)  # name of the namespace
 ```

 ### Delete a Namespace

-A namespace can be deleted entirely. 
+A namespace can be deleted entirely.
 If no such namespace exists, an exception is raised. The default namespace cannot be deleted.
@@ -405,7 +485,9 @@ index.delete_namespace(namespace="ns")

 # Contributing

 ## Preparing the environment
-This project uses [Poetry](https://python-poetry.org) for packaging and dependency management. Make sure you are able to create the poetry shell with relevant dependencies.
+
+This project uses [Poetry](https://python-poetry.org) for packaging and dependency management. Make sure you are able to
+create the poetry shell with relevant dependencies.

 You will also need a vector database on [Upstash](https://console.upstash.com/).

@@ -414,22 +496,33 @@
 poetry install
 ```

 ## Code Formatting
+
 ```bash
 poetry run ruff format .
 ```

 ## Running tests

-To run all the tests, make sure the poetry virtual environment activated with all
+To run all the tests, make sure the poetry virtual environment is activated with all
 the necessary dependencies.

-Create two Vector Stores on upstash. First one should have 2 dimensions. Second one should use an embedding model. Set the necessary environment variables:
+Create four vector indexes on Upstash and set the necessary environment
+variables for them:
+
+- A dense index with 2 dimensions, with cosine similarity
+- A dense index with an embedding model
+- A sparse index
+- A hybrid index with 2 dimensions, with cosine similarity for the dense component

 ```
 URL=****
 TOKEN=****
 EMBEDDING_URL=****
 EMBEDDING_TOKEN=****
+SPARSE_URL=****
+SPARSE_TOKEN=****
+HYBRID_URL=****
+HYBRID_TOKEN=****
 ```

 Then, run the following command to run tests:

diff --git a/pyproject.toml b/pyproject.toml
index dbf911d..f6e57dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,20 +34,16 @@ python = "^3.8"
 httpx = ">=0.23.0, <1"

 [tool.poetry.group.dev.dependencies]
-mypy = "^1.8.0"
-types-requests = "^2.31.0.20240106"
-types-pygments = "^2.17.0.20240106"
-types-colorama = "^0.4.15.20240106"
-types-setuptools = "^69.0.0.20240115"
-pytest = "^7.4.4"
-pytest-asyncio = "^0.20.0"
-ruff = "^0.1.13"
+mypy = "^1.13.0"
+pytest = "^8.3.4"
+pytest-asyncio = "^0.24.0"
+ruff = "^0.8.2"
 numpy = [
-    { version = "<=1.24.4", python = "<=3.8" },
-    { version = ">=1.25.0", python = ">=3.9" }
+    { version = "^1.24.4", python = "<=3.8" },
+    { version = "^1.26.4", python = ">=3.9" }
 ]
 pandas = "^2.0.3"
-pandas-stubs = "^2.0.3"
+python-dotenv = "^1.0.1"

 [build-system]
 requires = ["poetry-core"]
diff --git a/tests/__init__.py b/tests/__init__.py
index 146d250..e5cc488 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,11 +1,25 @@
-import time
 import asyncio
+import os
+import time
+
+import dotenv

-from upstash_vector import Index, AsyncIndex
+from upstash_vector import AsyncIndex, Index
 from upstash_vector.core.index_operations import DEFAULT_NAMESPACE

+dotenv.load_dotenv()
+
 NAMESPACES = [DEFAULT_NAMESPACE, "ns"]

+INDEX_URL = os.environ["URL"]
+INDEX_TOKEN = os.environ["TOKEN"]
+SPARSE_INDEX_URL = os.environ["SPARSE_URL"]
+SPARSE_INDEX_TOKEN = os.environ["SPARSE_TOKEN"]
+HYBRID_INDEX_URL = os.environ["HYBRID_URL"]
+HYBRID_INDEX_TOKEN = os.environ["HYBRID_TOKEN"]
+EMBEDDING_INDEX_URL = os.environ["EMBEDDING_URL"]
+EMBEDDING_INDEX_TOKEN = os.environ["EMBEDDING_TOKEN"]
+

 def assert_eventually(assertion, retry_delay=0.5, timeout=5.0):
     deadline = time.time() + timeout
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e13f68..2358c34 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,15 +1,23 @@
-from os import environ
-
 import pytest
 import pytest_asyncio

-from tests import NAMESPACES
-from upstash_vector import Index, 
AsyncIndex +from tests import ( + EMBEDDING_INDEX_TOKEN, + EMBEDDING_INDEX_URL, + HYBRID_INDEX_TOKEN, + HYBRID_INDEX_URL, + INDEX_TOKEN, + INDEX_URL, + NAMESPACES, + SPARSE_INDEX_TOKEN, + SPARSE_INDEX_URL, +) +from upstash_vector import AsyncIndex, Index @pytest.fixture def index(): - idx = Index(environ["URL"], environ["TOKEN"]) + idx = Index(INDEX_URL, INDEX_TOKEN) for ns in NAMESPACES: idx.reset(namespace=ns) return idx @@ -17,7 +25,39 @@ def index(): @pytest_asyncio.fixture async def async_index(): - idx = AsyncIndex(environ["URL"], environ["TOKEN"]) + idx = AsyncIndex(INDEX_URL, INDEX_TOKEN) + for ns in NAMESPACES: + await idx.reset(namespace=ns) + return idx + + +@pytest.fixture +def sparse_index(): + idx = Index(SPARSE_INDEX_URL, SPARSE_INDEX_TOKEN) + for ns in NAMESPACES: + idx.reset(namespace=ns) + return idx + + +@pytest_asyncio.fixture +async def async_sparse_index(): + idx = AsyncIndex(SPARSE_INDEX_URL, SPARSE_INDEX_TOKEN) + for ns in NAMESPACES: + await idx.reset(namespace=ns) + return idx + + +@pytest.fixture +def hybrid_index(): + idx = Index(HYBRID_INDEX_URL, HYBRID_INDEX_TOKEN) + for ns in NAMESPACES: + idx.reset(namespace=ns) + return idx + + +@pytest_asyncio.fixture +async def async_hybrid_index(): + idx = AsyncIndex(HYBRID_INDEX_URL, HYBRID_INDEX_TOKEN) for ns in NAMESPACES: await idx.reset(namespace=ns) return idx @@ -25,7 +65,7 @@ async def async_index(): @pytest.fixture def embedding_index(): - idx = Index(environ["EMBEDDING_URL"], environ["EMBEDDING_TOKEN"]) + idx = Index(EMBEDDING_INDEX_URL, EMBEDDING_INDEX_TOKEN) for ns in NAMESPACES: idx.reset(namespace=ns) return idx @@ -33,7 +73,7 @@ def embedding_index(): @pytest_asyncio.fixture async def async_embedding_index(): - idx = AsyncIndex(environ["EMBEDDING_URL"], environ["EMBEDDING_TOKEN"]) + idx = AsyncIndex(EMBEDDING_INDEX_URL, EMBEDDING_INDEX_TOKEN) for ns in NAMESPACES: await idx.reset(namespace=ns) return idx diff --git a/tests/core/test_delete.py b/tests/core/test_delete.py index 4799a87..8242cd3 100644 --- a/tests/core/test_delete.py +++ b/tests/core/test_delete.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index @pytest.mark.parametrize("ns", NAMESPACES) diff --git a/tests/core/test_fetch.py b/tests/core/test_fetch.py index 1e7f736..aedd238 100644 --- a/tests/core/test_fetch.py +++ b/tests/core/test_fetch.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector @pytest.mark.parametrize("ns", NAMESPACES) @@ -378,3 +378,171 @@ async def test_fetch_with_data_async(async_index: AsyncIndex, ns: str): assert res[2].metadata is None assert res[2].vector == v3_values assert res[2].data == v3_data + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_fetch_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = sparse_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert 
res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_fetch_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = hybrid_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.2] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].vector == [0.2, 0.3] + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_fetch_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = await async_sparse_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_fetch_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + res = await async_hybrid_index.fetch( + ids=["id0", "id1", "id2", "id3"], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 4 + + assert res[0] is not None + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.2] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1] is not None + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].vector == [0.2, 0.3] + 
assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2] is not None + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert res[3] is None diff --git a/tests/core/test_info.py b/tests/core/test_info.py index 7b83345..bb88956 100644 --- a/tests/core/test_info.py +++ b/tests/core/test_info.py @@ -1,13 +1,13 @@ import pytest from tests import ( + NAMESPACES, assert_eventually, assert_eventually_async, - NAMESPACES, ensure_ns_exists, ensure_ns_exists_async, ) -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index def test_info(index: Index): diff --git a/tests/core/test_namespace_operations.py b/tests/core/test_namespace_operations.py index b2399fc..566e63b 100644 --- a/tests/core/test_namespace_operations.py +++ b/tests/core/test_namespace_operations.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES, ensure_ns_exists, ensure_ns_exists_async -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index from upstash_vector.core.index_operations import DEFAULT_NAMESPACE diff --git a/tests/core/test_query.py b/tests/core/test_query.py index 6128617..04200e4 100644 --- a/tests/core/test_query.py +++ b/tests/core/test_query.py @@ -2,8 +2,9 @@ import pandas as pd import pytest -from tests import assert_eventually, assert_eventually_async, NAMESPACES -from upstash_vector import Index, AsyncIndex +from tests import NAMESPACES, assert_eventually, assert_eventually_async +from upstash_vector import AsyncIndex, Index, SparseVector +from upstash_vector.types import FusionAlgorithm, WeightingStrategy @pytest.mark.parametrize("ns", NAMESPACES) @@ -1025,3 +1026,632 @@ async def assertion(): assert query_res[1].data is None await assert_eventually_async(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_sparse_index(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query( + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_sparse_index_weighting_strategy(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query( + sparse_vector=([0, 1, 3], [0.2, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + 
assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_many_sparse_index(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = sparse_index.query_many( + queries=[ + { + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + }, + { + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 2 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.2, 0.3], + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.2, 0.3] + assert res[0].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[1].id == "id0" + assert res[1].vector == [0.1, 0.2] + assert res[1].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_hybrid_index_weighting_strategy(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.1], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.5], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.1, 0.1], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.1, 0.1], + sparse_vector=([0, 1, 3], [0.5, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 
0.1] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.1, 0.1] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].vector == [0.9, 0.5] + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_hybrid_index_fusion_algorithm(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.8, 0.9], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.9], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.3, 0.9], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query( + vector=[0.9, 0.9], + sparse_vector=([0, 1, 3], [0.1, 0.1, 100]), + include_vectors=True, + include_metadata=True, + include_data=True, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.9, 0.9] + assert res[0].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.3, 0.9] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id0" + assert res[2].vector == [0.8, 0.9] + assert res[2].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_many_hybrid_index(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.1])), + ("id1", [0.1, 0.4], ([1, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.5, 0.1], ([2, 3], [0.3, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + def assertion(): + res = hybrid_index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + "fusion_algorithm": FusionAlgorithm.RRF, + "top_k": 1, + }, + { + "vector": np.array([0.5, 0.1]), + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "fusion_algorithm": FusionAlgorithm.DBSF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 3 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.2, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + assert_eventually(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_sparse_index_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query( 
+ sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[1].id == "id1" + assert res[1].metadata == {"key": "value"} + assert res[1].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_sparse_index_weighting_strategy_async( + async_sparse_index: AsyncIndex, ns: str +): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query( + sparse_vector=([0, 1, 3], [0.2, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_many_sparse_index_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.1])), + ("id1", ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_sparse_index.query_many( + queries=[ + { + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + }, + { + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 2 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", [0.3, 0.4], ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def 
assertion(): + res = await async_hybrid_index.query( + vector=[0.2, 0.3], + sparse_vector=([0, 1, 3], [0.1, 0.5, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.2, 0.3] + assert res[0].sparse_vector == SparseVector([1, 2], [0.2, 0.3]) + + assert res[1].id == "id0" + assert res[1].vector == [0.1, 0.2] + assert res[1].sparse_vector == SparseVector([0, 1], [0.1, 0.2]) + + assert res[2].id == "id2" + assert res[2].metadata == {"key": "value"} + assert res[2].data == "data" + assert res[2].vector == [0.3, 0.4] + assert res[2].sparse_vector == SparseVector([2, 3], [0.3, 0.4]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_weighting_strategy_async( + async_hybrid_index: AsyncIndex, ns: str +): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.1], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.5], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.1, 0.1], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query( + vector=[0.1, 0.1], + sparse_vector=([0, 1, 3], [0.5, 0.1, 0.1]), + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id0" + assert res[0].metadata is None + assert res[0].vector == [0.1, 0.1] + assert res[0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.1, 0.1] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id1" + assert res[2].metadata == {"key": "value"} + assert res[2].vector == [0.9, 0.5] + assert res[2].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_hybrid_index_fusion_algorithm_async( + async_hybrid_index: AsyncIndex, ns: str +): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.8, 0.9], ([0, 1], [0.1, 0.1])), + ("id1", [0.9, 0.9], ([1, 2], [0.1, 0.1]), {"key": "value"}), + ("id2", [0.3, 0.9], ([2, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query( + vector=[0.9, 0.9], + sparse_vector=([0, 1, 3], [0.1, 0.1, 100]), + include_vectors=True, + include_metadata=True, + include_data=True, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + assert len(res) == 3 + + assert res[0].id == "id1" + assert res[0].metadata == {"key": "value"} + assert res[0].vector == [0.9, 0.9] + assert res[0].sparse_vector == SparseVector([1, 2], [0.1, 0.1]) + + assert res[1].id == "id2" + assert res[1].metadata == {"key": "value"} + assert res[1].data == "data" + assert res[1].vector == [0.3, 0.9] + assert res[1].sparse_vector == SparseVector([2, 3], [0.1, 0.1]) + + assert res[2].id == "id0" + assert res[2].vector == [0.8, 0.9] + assert res[2].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_many_hybrid_index_async(async_hybrid_index: AsyncIndex, ns: 
str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.1])), + ("id1", [0.1, 0.4], ([1, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.5, 0.1], ([2, 3], [0.3, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_hybrid_index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "sparse_vector": (np.array([3, 4]), pd.array([0.1, 0.2])), + "fusion_algorithm": FusionAlgorithm.RRF, + "top_k": 1, + }, + { + "vector": np.array([0.5, 0.1]), + "sparse_vector": SparseVector([0, 1], [0.5, 0.1]), + "include_vectors": True, + }, + { + "sparse_vector": SparseVector(np.array([2, 3]), [0.5, 0.5]), + "weighting_strategy": WeightingStrategy.IDF, + "fusion_algorithm": FusionAlgorithm.DBSF, + "include_metadata": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 3 + + assert len(res[0]) == 1 + assert res[0][0].id == "id2" + + assert len(res[1]) == 3 + assert res[1][0].id == "id0" + assert res[1][0].sparse_vector == SparseVector([0, 1], [0.1, 0.1]) + assert res[1][1].id == "id1" + assert res[1][1].sparse_vector == SparseVector([1, 2], [0.2, 0.1]) + + assert len(res[2]) == 2 + assert res[2][0].id == "id2" + assert res[2][0].metadata == {"key": "value"} + assert res[2][1].id == "id1" + assert res[2][1].metadata == {"key": "value"} + + await assert_eventually_async(assertion) diff --git a/tests/core/test_range.py b/tests/core/test_range.py index b8bc2de..76c15ef 100644 --- a/tests/core/test_range.py +++ b/tests/core/test_range.py @@ -4,7 +4,7 @@ from pytest import raises from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index from upstash_vector.errors import ClientError @@ -37,6 +37,9 @@ def test_range(index: Index, ns: str): assert res.vectors[i].id == f"id-{i}" assert res.vectors[i].metadata == {"meta": i} assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 while res.next_cursor != "": res = index.range( @@ -86,6 +89,9 @@ async def test_range_async(async_index: AsyncIndex, ns: str): assert res.vectors[i].id == f"id-{i}" assert res.vectors[i].metadata == {"meta": i} assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 while res.next_cursor != "": res = await async_index.range( @@ -103,3 +109,221 @@ async def test_range_async(async_index: AsyncIndex, ns: str): include_vectors=True, namespace=ns, ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_range_sparse(sparse_index: Index, ns: str): + vectors = [ + { + "id": f"id-{i}", + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + sparse_index.upsert(vectors=vectors, namespace=ns) + + res = sparse_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = sparse_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + sparse_index.range( + cursor="0", + limit=0, + 
include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_range_sparse_async(async_sparse_index: AsyncIndex, ns: str): + vectors = [ + { + "id": f"id-{i}", + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + await async_sparse_index.upsert(vectors=vectors, namespace=ns) + + res = await async_sparse_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = await async_sparse_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + await async_sparse_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_range_hybrid(hybrid_index: Index, ns: str): + vectors = [ + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + hybrid_index.upsert(vectors=vectors, namespace=ns) + + res = hybrid_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = hybrid_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + hybrid_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_range_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + vectors = [ + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "sparse_vector": ( + [random.randint(0, 10) for _ in range(2)], + [random.random() for _ in range(2)], + ), + "metadata": {"meta": i}, + "data": f"data-{i}", + } + for i in range(20) + ] + + await async_hybrid_index.upsert(vectors=vectors, namespace=ns) + + res = await async_hybrid_index.range( + cursor="", + limit=4, + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + assert len(res.vectors) == 4 + assert res.next_cursor != "" + + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + v = res.vectors[i].vector + assert v is not None + assert len(v) == 2 + assert res.vectors[i].sparse_vector is not None + + while res.next_cursor != "": + res = await async_hybrid_index.range( + cursor=res.next_cursor, + limit=8, + include_vectors=True, + 
namespace=ns, + ) + assert len(res.vectors) == 8 + + with raises(ClientError): + await async_hybrid_index.range( + cursor="0", + limit=0, + include_vectors=True, + namespace=ns, + ) diff --git a/tests/core/test_reset.py b/tests/core/test_reset.py index ea23cc3..5c273b4 100644 --- a/tests/core/test_reset.py +++ b/tests/core/test_reset.py @@ -1,6 +1,7 @@ import pytest + from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index @pytest.mark.parametrize("ns", NAMESPACES) diff --git a/tests/core/test_resumable_query.py b/tests/core/test_resumable_query.py index bd791a1..5f6bacf 100644 --- a/tests/core/test_resumable_query.py +++ b/tests/core/test_resumable_query.py @@ -1,11 +1,9 @@ -import asyncio -import time - import pytest -from tests import assert_eventually_async, assert_eventually, NAMESPACES -from upstash_vector import Index, AsyncIndex +from tests import NAMESPACES, assert_eventually, assert_eventually_async +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.errors import UpstashError +from upstash_vector.types import FusionAlgorithm, WeightingStrategy @pytest.mark.parametrize("ns", NAMESPACES) @@ -28,13 +26,12 @@ def assertion(): namespace=ns, ) - assert isinstance(result, list) - - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + with handle: + assert isinstance(result, list) - handle.stop() + assert len(result) > 0 + assert result[0].metadata is not None + assert result[0].vector is not None with pytest.raises(UpstashError): handle.fetch_next(1) @@ -42,7 +39,6 @@ def assertion(): with pytest.raises(UpstashError): handle.stop() - time.sleep(1) assert_eventually(assertion) @@ -64,12 +60,10 @@ def assertion(): namespace=ns, ) - assert len(result) == 1 - assert result[0].id == "id1" - - handle.stop() + with handle: + assert len(result) == 1 + assert result[0].id == "id1" - time.sleep(1) assert_eventually(assertion) @@ -94,16 +88,15 @@ async def assertion(): namespace=ns, ) - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None - - next_results = await handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + async with handle: + assert isinstance(result, list) + assert len(result) > 0 + assert result[0].metadata is not None + assert result[0].vector is not None - await handle.stop() + next_results = await handle.fetch_next(1) + assert isinstance(next_results, list) + assert len(next_results) == 1 with pytest.raises(UpstashError): await handle.fetch_next(1) @@ -111,7 +104,6 @@ async def assertion(): with pytest.raises(UpstashError): await handle.stop() - await asyncio.sleep(1) await assert_eventually_async(assertion) @@ -136,12 +128,10 @@ async def assertion(): namespace=ns, ) - assert len(result) == 1 - assert result[0].id == "id1" - - await handle.stop() + async with handle: + assert len(result) == 1 + assert result[0].id == "id1" - await asyncio.sleep(1) await assert_eventually_async(assertion) @@ -166,28 +156,26 @@ def assertion(): namespace=ns, ) - assert len(result) == 2 - assert result[0].id == "id1" - assert result[1].id == "id2" - - # Fetch next 2 results - next_results_1 = handle.fetch_next(2) - assert len(next_results_1) == 2 - assert next_results_1[0].id == "id3" - assert next_results_1[1].id == "id4" + with handle: + assert len(result) == 2 + assert result[0].id == "id1" + assert result[1].id == "id2" - # Fetch next 
1 result - next_results_2 = handle.fetch_next(1) - assert len(next_results_2) == 1 - assert next_results_2[0].id == "id5" + # Fetch next 2 results + next_results_1 = handle.fetch_next(2) + assert len(next_results_1) == 2 + assert next_results_1[0].id == "id3" + assert next_results_1[1].id == "id4" - # Try to fetch more, should return empty list - next_results_3 = handle.fetch_next(1) - assert len(next_results_3) == 0 + # Fetch next 1 result + next_results_2 = handle.fetch_next(1) + assert len(next_results_2) == 1 + assert next_results_2[0].id == "id5" - handle.stop() + # Try to fetch more, should return empty list + next_results_3 = handle.fetch_next(1) + assert len(next_results_3) == 0 - time.sleep(1) assert_eventually(assertion) @@ -213,100 +201,183 @@ async def assertion(): namespace=ns, ) - assert len(result) == 2 - - next_results_1 = await handle.fetch_next(2) - assert len(next_results_1) == 2 + async with handle: + assert len(result) == 2 - next_results_2 = await handle.fetch_next(1) - assert len(next_results_2) == 1 + next_results_1 = await handle.fetch_next(2) + assert len(next_results_1) == 2 - await handle.stop() + next_results_2 = await handle.fetch_next(1) + assert len(next_results_2) == 1 - await asyncio.sleep(1) await assert_eventually_async(assertion) @pytest.mark.parametrize("ns", NAMESPACES) -def test_resumable_query_context_manager(index: Index, ns: str): - index.upsert( +def test_resumable_query_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"field": "value1"}), - ("id2", [0.3, 0.4], {"field": "value2"}), - ("id3", [0.5, 0.6], {"field": "value3"}), + ("id0", ([0, 1], [0.3, 0.1])), + ("id1", ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), ], namespace=ns, ) def assertion(): - result, handle = index.resumable_query( - vector=[0.1, 0.2], + result, handle = sparse_index.resumable_query( + sparse_vector=([0], [0.1]), top_k=2, - include_metadata=True, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) with handle: - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - next_results = handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + assert_eventually(assertion) - # The query should be stopped automatically after exiting the context - with pytest.raises(UpstashError): - handle.fetch_next(1) +@pytest.mark.parametrize("ns", NAMESPACES) +def test_resumable_query_hybrid(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.9, 0.9], ([0, 1], [0.3, 0.1])), + ("id1", [0.8, 0.9], ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.7, 0.9], ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) - with pytest.raises(UpstashError): - handle.stop() + def assertion(): + result, handle = hybrid_index.resumable_query( + vector=[0.1, 0.1], + 
sparse_vector=([0], [0.1]), + top_k=2, + include_vectors=True, + include_metadata=True, + include_data=True, + weighting_strategy=WeightingStrategy.IDF, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + with handle: + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].vector == [0.9, 0.9] + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].vector == [0.8, 0.9] + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].vector == [0.7, 0.9] + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - time.sleep(1) assert_eventually(assertion) @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) -async def test_resumable_query_async_context_manager(async_index: AsyncIndex, ns: str): - await async_index.upsert( +async def test_resumable_query_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"field": "value1"}), - ("id2", [0.3, 0.4], {"field": "value2"}), - ("id3", [0.5, 0.6], {"field": "value3"}), + ("id0", ([0, 1], [0.3, 0.1])), + ("id1", ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), ], namespace=ns, ) async def assertion(): - result, handle = await async_index.resumable_query( - vector=[0.1, 0.2], + result, handle = await async_sparse_index.resumable_query( + sparse_vector=([0], [0.1]), top_k=2, - include_metadata=True, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) async with handle: - assert isinstance(result, list) - assert len(result) > 0 - assert result[0].metadata is not None - assert result[0].vector is not None + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = await handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - next_results = await handle.fetch_next(1) - assert isinstance(next_results, list) - assert len(next_results) == 1 + await assert_eventually_async(assertion) - # The query should be stopped automatically after exiting the context - with pytest.raises(UpstashError): - await handle.fetch_next(1) +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_resumable_query_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.9, 0.9], ([0, 1], [0.3, 0.1])), + ("id1", [0.8, 0.9], ([0, 2], [0.2, 0.1]), {"key": "value"}), + ("id2", [0.7, 0.9], ([0, 3], [0.1, 0.1]), {"key": "value"}, "data"), + ], + namespace=ns, + ) - with pytest.raises(UpstashError): - await handle.stop() + async def assertion(): + result, handle = await async_hybrid_index.resumable_query( + vector=[0.1, 0.1], + sparse_vector=([0], [0.1]), + top_k=2, + include_vectors=True, + include_metadata=True, + include_data=True, + 
weighting_strategy=WeightingStrategy.IDF, + fusion_algorithm=FusionAlgorithm.DBSF, + namespace=ns, + ) + + async with handle: + assert len(result) == 2 + assert result[0].id == "id0" + assert result[0].vector == [0.9, 0.9] + assert result[0].sparse_vector == SparseVector([0, 1], [0.3, 0.1]) + assert result[1].id == "id1" + assert result[1].metadata == {"key": "value"} + assert result[1].vector == [0.8, 0.9] + assert result[1].sparse_vector == SparseVector([0, 2], [0.2, 0.1]) + + next_result = await handle.fetch_next(1) + assert len(next_result) == 1 + assert next_result[0].id == "id2" + assert next_result[0].metadata == {"key": "value"} + assert next_result[0].data == "data" + assert next_result[0].vector == [0.7, 0.9] + assert next_result[0].sparse_vector == SparseVector([0, 3], [0.1, 0.1]) - await asyncio.sleep(1) await assert_eventually_async(assertion) diff --git a/tests/core/test_update.py b/tests/core/test_update.py index 2da0cae..3bc3a1c 100644 --- a/tests/core/test_update.py +++ b/tests/core/test_update.py @@ -1,7 +1,7 @@ import pytest from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.types import MetadataUpdateMode @@ -257,3 +257,123 @@ async def test_update_vector_data_async(async_index: AsyncIndex, ns: str): async def test_update_non_existing_id_async(async_index: AsyncIndex, ns: str): updated = await async_index.update("id-999", vector=[0.4, 0.5], namespace=ns) assert updated is False + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_update_sparse_vector(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + updated = sparse_index.update( + "id1", + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = sparse_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_update_hybrid_vector(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ], + namespace=ns, + ) + + updated = hybrid_index.update( + "id1", + vector=[0.5, 0.6], + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = hybrid_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].vector == [0.5, 0.6] + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_sparse_vector_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ("id0", ([0, 1], [0.1, 0.2])), + ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}), + ("id2", ([2, 3], [0.3, 0.4]), {"key": "value"}, "data"), + ], + namespace=ns, + ) + + updated = await async_sparse_index.update( + "id1", + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = await async_sparse_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + 
assert res[0] is not None + assert res[0].id == "id1" + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_hybrid_vector_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ("id0", [0.1, 0.2], ([0, 1], [0.1, 0.2])), + ("id1", [0.2, 0.3], ([1, 2], [0.2, 0.3]), {"key": "value"}), + ], + namespace=ns, + ) + + updated = await async_hybrid_index.update( + "id1", + vector=[0.5, 0.6], + sparse_vector=SparseVector([6, 7], [0.5, 0.6]), + namespace=ns, + ) + assert updated is True + + res = await async_hybrid_index.fetch( + "id1", + include_vectors=True, + namespace=ns, + ) + assert len(res) == 1 + assert res[0] is not None + assert res[0].id == "id1" + assert res[0].vector == [0.5, 0.6] + assert res[0].sparse_vector == SparseVector([6, 7], [0.5, 0.6]) diff --git a/tests/core/test_upsert.py b/tests/core/test_upsert.py index 5ab4389..34a439e 100644 --- a/tests/core/test_upsert.py +++ b/tests/core/test_upsert.py @@ -1,14 +1,13 @@ +import numpy as np +import pandas as pd import pytest from pytest import raises from tests import NAMESPACES -from upstash_vector import Index, AsyncIndex +from upstash_vector import AsyncIndex, Index, SparseVector from upstash_vector.errors import ClientError from upstash_vector.types import Data, Vector -import numpy as np -import pandas as pd - @pytest.mark.parametrize("ns", NAMESPACES) def test_upsert_tuple(index: Index, ns: str): @@ -766,3 +765,315 @@ async def test_invalid_payload_async(async_index: AsyncIndex, ns: str): ], namespace=ns, ) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_upsert_sparse(sparse_index: Index, ns: str): + sparse_index.upsert( + vectors=[ + ( + "id0", + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = sparse_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.sparse_vector is not None + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_upsert_hybrid(hybrid_index: Index, ns: str): + hybrid_index.upsert( + vectors=[ + ( + "id0", + [0.1, 0.1], + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + np.array([0.5, 0.5]), + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + pd.array([0.1, 0.2]), + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + vector=[3.3, 1.0], + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + [1.1, 2.2], + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + 
[1.0, 5.0], + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "vector": [13.0, 0.5], + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "vector": np.array([4.2, 2.4]), + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "vector": [13, 5.4], + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + vector=np.array([0.9, 0.7]), + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + vector=[0.0, 1.0], + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = hybrid_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.vector is not None + assert len(r.vector) == 2 + assert r.sparse_vector is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_upsert_sparse_async(async_sparse_index: AsyncIndex, ns: str): + await async_sparse_index.upsert( + vectors=[ + ( + "id0", + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "sparse_vector": (np.array([0, 3]), np.array([0.1, 3.0])), + }, + Vector( + "id9", + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = await async_sparse_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.sparse_vector is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_upsert_hybrid_async(async_hybrid_index: AsyncIndex, ns: str): + await async_hybrid_index.upsert( + vectors=[ + ( + "id0", + [0.1, 0.1], + ([0, 1], [0.1, 0.2]), + ), + ( + "id1", + np.array([0.5, 0.5]), + (np.array([1, 2]), [0.2, 0.3]), + {"key1": "value"}, + ), + ( + "id2", + pd.array([0.1, 0.2]), + ([2, 3], pd.array([0.3, 0.4])), + {"key2": "value"}, + "data2", + ), + Vector( + "id3", + vector=[3.3, 1.0], + sparse_vector=([10, 20, 30, 40], [1.02, 2.01, 3.3, 4.4]), + ), + ( + "id4", + [1.1, 2.2], + SparseVector([2, 39, 93], [0.3, 0.5, 0.1]), + ), + ( + "id5", + [1.0, 5.0], + SparseVector(np.array([11]), pd.array([11.5])), + ), + { + "id": "id6", + "vector": [13.0, 0.5], + "sparse_vector": ([42, 43, 44, 45, 46, 47], [42, 43, 44, 45, 46, 47]), + "metadata": {"key6": "value"}, + }, + { + "id": "id7", + "vector": np.array([4.2, 2.4]), + "sparse_vector": SparseVector([4, 5], [4.5, 5.4]), + "metadata": {"key7": "value"}, + "data": "data7", + }, + { + "id": "id8", + "vector": [13, 5.4], + "sparse_vector": (np.array([0, 3]), np.array([0.1, 
3.0])), + }, + Vector( + "id9", + vector=np.array([0.9, 0.7]), + sparse_vector=([1, 2], [1.2, 2.1]), + ), + Vector( + "id10", + vector=[0.0, 1.0], + sparse_vector=SparseVector([1], np.array([12.0])), + ), + ], + namespace=ns, + ) + + ids = [f"id{i}" for i in range(11)] + res = await async_hybrid_index.fetch( + ids, + include_vectors=True, + namespace=ns, + ) + assert len(res) == 11 + for i, r in enumerate(res): + assert r is not None + assert r.id == f"id{i}" + assert r.vector is not None + assert len(r.vector) == 2 + assert r.sparse_vector is not None diff --git a/upstash_vector/__init__.py b/upstash_vector/__init__.py index 48cd11f..66b3e9b 100644 --- a/upstash_vector/__init__.py +++ b/upstash_vector/__init__.py @@ -1,6 +1,6 @@ __version__ = "0.6.0" -from upstash_vector.client import Index, AsyncIndex -from upstash_vector.types import Vector +from upstash_vector.client import AsyncIndex, Index +from upstash_vector.types import SparseVector, Vector -__all__ = ["Index", "AsyncIndex", "Vector"] +__all__ = ["Index", "AsyncIndex", "Vector", "SparseVector"] diff --git a/upstash_vector/client.py b/upstash_vector/client.py index aa25a8a..4187fd0 100644 --- a/upstash_vector/client.py +++ b/upstash_vector/client.py @@ -3,7 +3,7 @@ import httpx -from upstash_vector.core.index_operations import IndexOperations, AsyncIndexOperations +from upstash_vector.core.index_operations import AsyncIndexOperations, IndexOperations from upstash_vector.http import ( execute_with_parameters, execute_with_parameters_async, diff --git a/upstash_vector/core/index_operations.py b/upstash_vector/core/index_operations.py index a06f978..b1ffb11 100644 --- a/upstash_vector/core/index_operations.py +++ b/upstash_vector/core/index_operations.py @@ -1,33 +1,38 @@ from typing import ( - Sequence, - Union, - List, + Any, + Awaitable, + Callable, Dict, + List, Optional, - Any, + Sequence, Tuple, - Callable, - Awaitable, + Union, ) from upstash_vector.errors import ClientError from upstash_vector.types import ( Data, DeleteResult, + FetchResult, + FusionAlgorithm, + InfoResult, MetadataUpdateMode, QueryRequest, + QueryResult, RangeResult, - InfoResult, + SparseVector, SupportsToList, - FetchResult, - QueryResult, + TupleAsSparseVectorT, Vector, + WeightingStrategy, ) from upstash_vector.utils import ( - convert_query_requests_to_payload, - convert_to_list, - convert_to_vectors, - convert_to_payload, + query_requests_to_payload, + sequence_to_vectors, + to_list, + to_sparse_vector, + vectors_to_payload, ) DEFAULT_NAMESPACE = "" @@ -82,7 +87,7 @@ def upsert( vectors=[ ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), - ("id3", [0.3,0.4]), + ("id3", [0.3, 0.4]), ] ) ``` @@ -133,8 +138,8 @@ def upsert( ``` """ - vectors = convert_to_vectors(vectors) - payload, is_vector = convert_to_payload(vectors) + converted_vectors = sequence_to_vectors(vectors) + payload, is_vector = vectors_to_payload(converted_vectors) path = UPSERT_PATH if is_vector else UPSERT_DATA_PATH return self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -149,6 +154,9 @@ def query( data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. 
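
> Reviewer note (not part of the patch): the hunks around here extend `query` with `sparse_vector`, `weighting_strategy`, and `fusion_algorithm`. A minimal usage sketch of the extended API, assuming a hybrid index and placeholder credentials:

```python
from upstash_vector import Index, SparseVector
from upstash_vector.types import FusionAlgorithm, WeightingStrategy

# Placeholder credentials; a hybrid index is assumed.
index = Index(url="<UPSTASH_VECTOR_REST_URL>", token="<UPSTASH_VECTOR_REST_TOKEN>")

# The dense and sparse components are scored separately and then
# fused with the chosen fusion algorithm.
results = index.query(
    vector=[0.1, 0.2],
    sparse_vector=SparseVector([0, 2], [0.1, 0.5]),
    top_k=5,
    include_metadata=True,
    weighting_strategy=WeightingStrategy.IDF,
    fusion_algorithm=FusionAlgorithm.RRF,
)

for result in results:
    print(result.id, result.score, result.metadata)
```
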
@@ -163,6 +171,9 @@ def query( :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. Example usage: @@ -192,18 +203,38 @@ def query( "filter": filter, } - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." + ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = QUERY_PATH return [ @@ -262,7 +293,7 @@ def query_many( single_result = self.query(**query, namespace=namespace) return [single_result] - has_vector_query, payload = convert_query_requests_to_payload(queries) + has_vector_query, payload = query_requests_to_payload(queries) path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH result = self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -282,6 +313,9 @@ def resumable_query( namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, max_idle: int = 3600, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> Tuple[List[QueryResult], "ResumableQueryHandle"]: """ Creates a resumable query. @@ -297,6 +331,9 @@ def resumable_query( :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting vectors will have their unstructured data or not. :param max_idle: Maximum idle time for the resumable query in seconds. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. :return: First batch of the results, along with a handle to fetch more or stop the query. 
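
> Reviewer note (not part of the patch): a sketch of the resumable-query flow with the new sparse options, mirroring the tests earlier in this patch; credentials are placeholders and a hybrid index is assumed:

```python
from upstash_vector import Index
from upstash_vector.types import FusionAlgorithm, WeightingStrategy

index = Index(url="<UPSTASH_VECTOR_REST_URL>", token="<UPSTASH_VECTOR_REST_TOKEN>")

results, handle = index.resumable_query(
    vector=[0.1, 0.2],
    sparse_vector=([0, 2], [0.1, 0.5]),  # plain (indices, values) tuples also work
    top_k=2,
    include_metadata=True,
    weighting_strategy=WeightingStrategy.IDF,
    fusion_algorithm=FusionAlgorithm.DBSF,
)

# The handle is a context manager that stops the server-side query on exit.
with handle:
    for result in results:
        print(result.id)

    # Keep fetching until the server returns an empty batch.
    while batch := handle.fetch_next(2):
        for result in batch:
            print(result.id)
```
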
@@ -305,7 +342,7 @@ def resumable_query( ```python result, handle = index.resumable_query( vector=[0.6, 0.9], - top_k=100, + top_k=10, include_vectors=False, include_metadata=True, ) @@ -315,13 +352,6 @@ def resumable_query( handle.stop() ``` """ - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - payload = { "topK": top_k, "includeVectors": include_vectors, @@ -331,11 +361,38 @@ def resumable_query( "maxIdle": max_idle, } + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value + if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The resumable query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = RESUMABLE_QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The resumable query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." + ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = RESUMABLE_QUERY_PATH result = self._execute_request(payload=payload, path=_path_for(namespace, path)) @@ -478,11 +535,12 @@ def fetch( def update( self, id: str, - vector: Optional[List[float]] = None, + vector: Optional[Union[List[float], SupportsToList]] = None, data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, ) -> bool: """ Updates a vector value, data, or metadata for the given id. @@ -495,6 +553,7 @@ def update( :param metadata_update_mode: Whether to overwrite the whole it, or patch the metadata (insert new fields or update according to the `RFC 7396 JSON Merge Patch` algorithm. + :param sparse_vector: The sparse vector value to update to. 
Example usage: @@ -508,7 +567,14 @@ def update( } if vector is not None: - payload["vector"] = vector + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } if data is not None: payload["data"] = data @@ -577,7 +643,7 @@ async def upsert( vectors=[ ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), - ("id3", [0.3,0.4]), + ("id3", [0.3, 0.4]), ] ) ``` @@ -626,8 +692,8 @@ async def upsert( ) ``` """ - vectors = convert_to_vectors(vectors) - payload, is_vector = convert_to_payload(vectors) + converted_vectors = sequence_to_vectors(vectors) + payload, is_vector = vectors_to_payload(converted_vectors) path = UPSERT_PATH if is_vector else UPSERT_DATA_PATH return await self._execute_request_async( @@ -644,6 +710,9 @@ async def query( data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. @@ -658,6 +727,9 @@ async def query( :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. Example usage: @@ -687,18 +759,38 @@ async def query( "filter": filter, } - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." 
+ ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = QUERY_PATH return [ @@ -757,7 +849,7 @@ async def query_many( single_result = await self.query(**query, namespace=namespace) return [single_result] - has_vector_query, payload = convert_query_requests_to_payload(queries) + has_vector_query, payload = query_requests_to_payload(queries) path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH result = await self._execute_request_async( payload=payload, path=_path_for(namespace, path) @@ -779,6 +871,9 @@ async def resumable_query( namespace: str = DEFAULT_NAMESPACE, include_data: bool = False, max_idle: int = 3600, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, + weighting_strategy: Optional[WeightingStrategy] = None, + fusion_algorithm: Optional[FusionAlgorithm] = None, ) -> Tuple[List[QueryResult], "AsyncResumableQueryHandle"]: """ Creates a resumable query. @@ -794,6 +889,9 @@ async def resumable_query( :param namespace: The namespace to use. When not specified, the default namespace is used. :param include_data: Whether the resulting vectors will have their unstructured data or not. :param max_idle: Maximum idle time for the resumable query in seconds. + :param sparse_vector: The sparse vector value to query. + :param weighting_strategy: Weighting strategy to be used for sparse vectors. + :param fusion_algorithm: Fusion algorithm to use while fusing scores from hybrid vectors. :return: First batch of the results, along with a handle to fetch more or stop the query. @@ -802,7 +900,7 @@ async def resumable_query( ```python result, handle = await index.resumable_query( vector=[0.6, 0.9], - top_k=100, + top_k=10, include_vectors=False, include_metadata=True, ) @@ -812,13 +910,6 @@ async def resumable_query( await handle.stop() ``` """ - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - payload = { "topK": top_k, "includeVectors": include_vectors, @@ -828,11 +919,38 @@ async def resumable_query( "maxIdle": max_idle, } + if weighting_strategy is not None: + payload["weightingStrategy"] = weighting_strategy.value + + if fusion_algorithm is not None: + payload["fusionAlgorithm"] = fusion_algorithm.value + if data is not None: + if vector is not None or sparse_vector is not None: + raise ClientError( + "The resumable query should not have " + "`vector` or `sparse_vector` when it contains `data`." + ) + payload["data"] = data path = RESUMABLE_QUERY_DATA_PATH else: - payload["vector"] = convert_to_list(vector) + if vector is None and sparse_vector is None: + raise ClientError( + "The resumable query should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." 
+ ) + + if vector is not None: + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } + path = RESUMABLE_QUERY_PATH result = await self._execute_request_async( @@ -979,11 +1097,12 @@ async def fetch( async def update( self, id: str, - vector: Optional[List[float]] = None, + vector: Optional[Union[List[float], SupportsToList]] = None, data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, + sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None, ) -> bool: """ Updates a vector value, data, or metadata for the given id. @@ -996,6 +1115,7 @@ async def update( :param metadata_update_mode: Whether to overwrite the whole it, or patch the metadata (insert new fields or update according to the `RFC 7396 JSON Merge Patch` algorithm. + :param sparse_vector: The sparse vector value to update to. Example usage: @@ -1009,7 +1129,14 @@ async def update( } if vector is not None: - payload["vector"] = vector + payload["vector"] = to_list(vector) + + if sparse_vector is not None: + sparse = to_sparse_vector(sparse_vector) + payload["sparseVector"] = { + "indices": sparse.indices, + "values": sparse.values, + } if data is not None: payload["data"] = data diff --git a/upstash_vector/http.py b/upstash_vector/http.py index 77dfc38..b7710c0 100644 --- a/upstash_vector/http.py +++ b/upstash_vector/http.py @@ -4,7 +4,7 @@ from platform import python_version from typing import Any, Dict -from httpx import Client, AsyncClient +from httpx import AsyncClient, Client from upstash_vector import __version__ from upstash_vector.errors import UpstashError diff --git a/upstash_vector/types.py b/upstash_vector/types.py index a022362..35653b6 100644 --- a/upstash_vector/types.py +++ b/upstash_vector/types.py @@ -1,19 +1,52 @@ import enum from dataclasses import dataclass -from typing import Optional, List, Dict, TypedDict, Union, Protocol +from typing import Dict, List, Optional, Protocol, Tuple, TypedDict, Union class SupportsToList(Protocol): - def tolist(self) -> List[float]: - ... + def tolist(self) -> List[float]: ... + + +@dataclass +class SparseVector: + indices: Union[List[int], SupportsToList] + """ + List of dimensions that have non-zero values. + + All the signed 32 bit integer range is valid + as the dimension indices. + """ + + values: Union[List[float], SupportsToList] + """ + Values of the non-zero dimensions. + + It must be of the same size as the `indices`. 
+    """
+
+    @classmethod
+    def _from_json(cls, obj: Optional[dict]) -> Optional["SparseVector"]:
+        if not obj:
+            return None
+
+        return SparseVector(
+            obj["indices"],
+            obj["values"],
+        )
+
+
+TupleAsSparseVectorT = Tuple[
+    Union[List[int], SupportsToList], Union[List[float], SupportsToList]
+]


 @dataclass
 class Vector:
     id: Union[int, str]
-    vector: Union[List[float], SupportsToList]
+    vector: Optional[Union[List[float], SupportsToList]] = None
     metadata: Optional[Dict] = None
     data: Optional[str] = None
+    sparse_vector: Optional[Union[SparseVector, TupleAsSparseVectorT]] = None


 @dataclass
@@ -29,6 +62,7 @@ class FetchResult:
     vector: Optional[List[float]] = None
     metadata: Optional[Dict] = None
     data: Optional[str] = None
+    sparse_vector: Optional[SparseVector] = None

     @classmethod
     def _from_json(cls, obj: dict) -> "FetchResult":
@@ -37,6 +71,7 @@ def _from_json(cls, obj: dict) -> "FetchResult":
             vector=obj.get("vector"),
             metadata=obj.get("metadata"),
             data=obj.get("data"),
+            sparse_vector=SparseVector._from_json(obj.get("sparseVector")),
         )


@@ -47,6 +82,7 @@ class QueryResult:
     vector: Optional[List[float]] = None
     metadata: Optional[Dict] = None
     data: Optional[str] = None
+    sparse_vector: Optional[SparseVector] = None

     @classmethod
     def _from_json(cls, obj: dict) -> "QueryResult":
@@ -56,6 +92,7 @@ def _from_json(cls, obj: dict) -> "QueryResult":
             vector=obj.get("vector"),
             metadata=obj.get("metadata"),
             data=obj.get("data"),
+            sparse_vector=SparseVector._from_json(obj.get("sparseVector")),
         )


@@ -132,19 +169,106 @@ class MetadataUpdateMode(enum.Enum):
     """Patch the metadata according to Merge Patch algorithm."""


+class WeightingStrategy(enum.Enum):
+    """
+    The weighting strategy to use for the non-zero dimension
+    values of sparse query vectors while matching them against
+    the documents.
+
+    If not provided, no weighting will be used.
+    """
+
+    IDF = "IDF"
+    """
+    Inverse document frequency.
+
+    It is recommended to use this weighting strategy for
+    BM25 sparse embedding models.
+
+    It is calculated as
+
+    ln(((N - n(q) + 0.5) / (n(q) + 0.5)) + 1) where
+    N: Total number of sparse vectors.
+    n(q): Total number of sparse vectors having a non-zero value
+    for that particular dimension.
+    ln: Natural logarithm
+
+    The values of N and n(q) are maintained by Upstash as the
+    vectors are indexed.
+    """
+
+
+class FusionAlgorithm(enum.Enum):
+    """
+    Fusion algorithm to use while fusing scores
+    from the dense and sparse components of a hybrid index.
+
+    If not provided, defaults to `RRF`.
+    """
+
+    RRF = "RRF"
+    """
+    Reciprocal rank fusion.
+
+    Each sorted score from the dense and sparse indexes is
+    mapped to 1 / (rank + K), where rank is the order of the
+    score in the dense or sparse scores and K is a constant
+    with the value of 60.
+
+    Then, the scores from the dense and sparse components are
+    deduplicated (i.e. if a score for the same vector is present
+    in both dense and sparse scores, the mapped scores are
+    added; otherwise individual mapped scores are used)
+    and the final result is returned as the topK values
+    of this final list.
+
+    In short, this algorithm takes only the order of the scores
+    into consideration.
+    """
+
+    DBSF = "DBSF"
+    """
+    Distribution based score fusion.
+
+    Each sorted score from the dense and sparse indexes is
+    normalized as
+    (s - (mean - 3 * stddev)) / ((mean + 3 * stddev) - (mean - 3 * stddev))
+    where s is the score, (mean - 3 * stddev) is the minimum,
+    and (mean + 3 * stddev) is the maximum tail end of the distribution.
+
+    Then, the scores from the dense and sparse components are
+    deduplicated (i.e. if a score for the same vector is present
+    in both dense and sparse scores, the normalized scores are
+    added; otherwise individual normalized scores are used)
+    and the final result is returned as the topK values
+    of this final list.
+
+    In short, this algorithm takes the distribution of the scores
+    into consideration as well, as opposed to `RRF`.
+    """
+
+
 class QueryRequest(TypedDict, total=False):
     vector: Union[List[float], SupportsToList]
     """
     The vector value to query.

-    Only and only one of `vector` or `data` fields must be provided.
+    It must be provided only for dense and hybrid indexes.
+    """
+
+    sparse_vector: Union[SparseVector, TupleAsSparseVectorT]
+    """
+    The sparse vector value to query.
+
+    It must be provided only for sparse and hybrid indexes.
+    """

     data: str
     """
-    Data to query for (after embedding it to a vector).
+    Data to query for (after embedding it to a vector/sparse vector).

-    Only and only one of `vector` or `data` fields must be provided.
+    It must be provided only for indexes created with Upstash hosted models.
+    When provided, `vector` or `sparse_vector` fields should not be set.
     """

     top_k: int
@@ -181,3 +305,22 @@ class QueryRequest(TypedDict, total=False):

     When not specified, defaults to `""`(no filter).
     """
+
+    weighting_strategy: WeightingStrategy
+    """
+    Weighting strategy to be used for sparse vectors.
+
+    It must be provided only for sparse and hybrid indexes.
+
+    When not specified, defaults to no extra weighting (i.e. 1.0).
+    """
+
+    fusion_algorithm: FusionAlgorithm
+    """
+    Fusion algorithm to use while fusing scores
+    from the dense and sparse components of a hybrid index.
+
+    It must be provided only for hybrid indexes.
+
+    When not specified, defaults to `RRF`.
+    """
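
> Reviewer note (not part of the patch): since `QueryRequest` is a `TypedDict`, the new fields slot into batch queries as plain dict keys. A sketch under the same placeholder-credential and hybrid-index assumptions:

```python
from upstash_vector import Index
from upstash_vector.types import FusionAlgorithm, WeightingStrategy

index = Index(url="<UPSTASH_VECTOR_REST_URL>", token="<UPSTASH_VECTOR_REST_TOKEN>")

# Each dict is a QueryRequest; data-type and vector-type queries
# cannot be mixed within a single batch.
batch_results = index.query_many(
    queries=[
        {
            "vector": [0.1, 0.2],
            "sparse_vector": ([0, 2], [0.1, 0.5]),
            "top_k": 3,
            "weighting_strategy": WeightingStrategy.IDF,
            "fusion_algorithm": FusionAlgorithm.DBSF,
        },
        {
            "sparse_vector": ([1], [0.7]),
            "top_k": 3,
        },
    ]
)
```
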
diff --git a/upstash_vector/utils.py b/upstash_vector/utils.py
index ce718fc..50bbf2c 100644
--- a/upstash_vector/utils.py
+++ b/upstash_vector/utils.py
@@ -1,71 +1,204 @@
-from typing import List, Union, Dict, Any, Optional, Tuple
+from typing import Any, Dict, List, Sequence, Tuple, Union

 from upstash_vector.errors import ClientError
-from upstash_vector.types import Data, QueryRequest, Vector
+from upstash_vector.types import (
+    Data,
+    QueryRequest,
+    SparseVector,
+    SupportsToList,
+    TupleAsSparseVectorT,
+    Vector,
+)


-def convert_to_list(obj):
-    if isinstance(obj, list):
-        return obj
-    elif hasattr(obj, "tolist") and callable(getattr(obj, "tolist")):
-        return obj.tolist()
+def sequence_to_vectors(
+    vectors: Sequence[Union[dict, tuple, Vector, Data]],
+) -> List[Union[Vector, Data]]:
+    return [_parse_vector(vector) for vector in vectors]
+
+
+def _parse_vector(vector: Union[dict, tuple, Vector, Data]) -> Union[Vector, Data]:
+    if isinstance(vector, Vector):
+        dense = vector.vector
+        if dense is not None:
+            vector.vector = to_list(dense)
+
+        sparse = vector.sparse_vector
+        if sparse is not None:
+            vector.sparse_vector = to_sparse_vector(sparse)
+
+        return vector
+    elif isinstance(vector, Data):
+        return vector
+    elif isinstance(vector, tuple):
+        return _parse_vector_from_tuple(vector)
+    elif isinstance(vector, dict):
+        return _parse_vector_from_dict(vector)
+    else:
+        raise ClientError(
+            f"The given object cannot be converted to a vector: {vector}"
+        )
+
+
+def to_list(maybe_list: Union[list, SupportsToList]) -> list:
+    if isinstance(maybe_list, list):
+        return maybe_list
+    elif hasattr(maybe_list, "tolist") and callable(getattr(maybe_list, "tolist")):
+        return maybe_list.tolist()

     raise TypeError(
-        f"Expected a list or something can be converted to a list(like numpy or pandas array) but got {type(obj)}"
+        f"Expected a list or something that can be converted to a "
+        f"list (like a numpy or pandas array) but got {type(maybe_list)}"
     )


-def _get_payload_element(
-    id: Union[int, str],
-    payload: Union[str, List[float]],
-    metadata: Optional[Dict[str, Any]] = None,
-    data: Optional[str] = None,
-) -> Union[Vector, Data]:
-    if isinstance(payload, str):
-        return Data(id=id, data=payload, metadata=metadata)
+def to_sparse_vector(
+    maybe_sparse_vector: Union[SparseVector, TupleAsSparseVectorT],
+) -> SparseVector:
+    if isinstance(maybe_sparse_vector, SparseVector):
+        maybe_sparse_vector.indices = to_list(maybe_sparse_vector.indices)
+        maybe_sparse_vector.values = to_list(maybe_sparse_vector.values)
+        return maybe_sparse_vector
+    elif isinstance(maybe_sparse_vector, tuple):
+        if len(maybe_sparse_vector) != 2:
+            raise ClientError(
+                "The tuple for sparse vector should contain two lists; "
+                "one for indices, and one for values."
+            )

-    return Vector(id=id, vector=convert_to_list(payload), metadata=metadata, data=data)
+        sparse = SparseVector(
+            to_list(maybe_sparse_vector[0]), to_list(maybe_sparse_vector[1])
+        )
+        return sparse
+    else:
+        raise ClientError("`sparse_vector` must be a `SparseVector` or `tuple`.")


-def _get_payload_element_from_dict(
-    id: Union[int, str],
-    vector: Optional[List[float]] = None,
-    data: Optional[str] = None,
-    metadata: Optional[Dict[str, Any]] = None,
-) -> Union[Vector, Data]:
-    if vector is None and data is None:
+def _parse_vector_from_tuple(t: tuple) -> Union[Vector, Data]:
+    if len(t) < 2:
         raise ClientError(
-            "Vector dict must have one of `vector` or `data` fields defined."
+ "The tuple must contain at least two elements; " + "one for id, and other for vector or sparse vector." ) - if vector is None: - # data cannot be none at this point - return Data(id=id, data=data, metadata=metadata) # type:ignore[arg-type] + id = t[0] + if isinstance(t[1], str): + return _parse_data_from_tuple(t, id) - return Vector(id=id, vector=convert_to_list(vector), metadata=metadata, data=data) + if isinstance(t[1], (SparseVector, tuple)): + return _parse_sparse_vector_from_tuple(t, id) + dense = to_list(t[1]) + if len(t) > 2 and isinstance(t[2], (SparseVector, tuple)): + return _parse_hybrid_vector_from_tuple(t, id, dense) -def _tuple_or_dict_to_vectors(vector) -> Union[Vector, Data]: - if isinstance(vector, Vector): - vector.vector = convert_to_list(vector.vector) - return vector - elif isinstance(vector, Data): - return vector - elif isinstance(vector, tuple): - return _get_payload_element(*vector) - elif isinstance(vector, dict): - return _get_payload_element_from_dict(**vector) + return _parse_dense_vector_from_tuple(t, id, dense) + + +def _parse_data_from_tuple(t: tuple, id: str) -> Data: + data = t[1] + if len(t) > 2: + metadata = t[2] + else: + metadata = None + + return Data(id=id, data=data, metadata=metadata) + + +def _parse_sparse_vector_from_tuple(t: tuple, id: str) -> Vector: + sparse = to_sparse_vector(t[1]) + + if len(t) > 2: + metadata = t[2] + if len(t) > 3: + data = t[3] + else: + data = None + else: + metadata = None + data = None + + return Vector( + id=id, + sparse_vector=sparse, + metadata=metadata, + data=data, + ) + + +def _parse_hybrid_vector_from_tuple(t: tuple, id: str, dense: List[float]) -> Vector: + sparse = to_sparse_vector(t[2]) + + if len(t) > 3: + metadata = t[3] + if len(t) > 4: + data = t[4] + else: + data = None + else: + metadata = None + data = None + + return Vector( + id=id, + vector=dense, + sparse_vector=sparse, + metadata=metadata, + data=data, + ) + + +def _parse_dense_vector_from_tuple(t: tuple, id: str, dense: List[float]) -> Vector: + if len(t) > 2: + metadata = t[2] + if len(t) > 3: + data = t[3] + else: + data = None else: + metadata = None + data = None + + return Vector( + id=id, + vector=dense, + metadata=metadata, + data=data, + ) + + +def _parse_vector_from_dict(d: dict) -> Union[Vector, Data]: + id = d["id"] + vector = d.get("vector") + sparse_vector = d.get("sparse_vector") + data = d.get("data") + metadata = d.get("metadata") + + if vector is None and sparse_vector is None and data is not None: + return Data(id=id, data=data, metadata=metadata) + + if vector is None and sparse_vector is None: raise ClientError( - f"Given object type is undefined for converting to vector: {vector}" + "The dict for vector should contain `vector` " + "and/or `sparse_vector` when it does not contain `data`." 
) + if vector is not None: + vector = to_list(vector) -def convert_to_vectors(vectors) -> List[Union[Vector, Data]]: - return [_tuple_or_dict_to_vectors(vector) for vector in vectors] + if sparse_vector is not None: + sparse_vector = to_sparse_vector(sparse_vector) + + return Vector( + id=id, + vector=vector, + sparse_vector=sparse_vector, + metadata=metadata, + data=data, + ) -def convert_to_payload( +def vectors_to_payload( vectors: List[Union[Vector, Data]], ) -> Tuple[List[Dict[str, Any]], bool]: """ @@ -79,23 +212,21 @@ def convert_to_payload( expecting_vectors = isinstance(vectors[0], Vector) payload = [] for vector in vectors: - is_vector = isinstance(vector, Vector) - if expecting_vectors != is_vector: - raise ClientError( - "All items should either have the `data` or the `vector` field." - " Received items from both kinds. Please send them separately." - ) + if isinstance(vector, Vector): + if not expecting_vectors: + raise ClientError( + "All items should either have the `data` or the `vector` and/or `sparse_vector` field." + " Received items from both kinds. Please send them separately." + ) - if is_vector: - payload.append( - { - "id": vector.id, - "vector": vector.vector, # type: ignore[union-attr] - "metadata": vector.metadata, - "data": vector.data, - } - ) + payload.append(_vector_to_payload(vector)) else: + if expecting_vectors: + raise ClientError( + "All items should either have the `data` or the `vector` and/or `sparse_vector` field." + " Received items from both kinds. Please send them separately." + ) + payload.append( { "id": vector.id, @@ -107,50 +238,87 @@ def convert_to_payload( return payload, expecting_vectors -def convert_query_requests_to_payload( +def _vector_to_payload(vector: Vector) -> Dict[str, Any]: + if vector.sparse_vector is not None: + sparse = { + "indices": vector.sparse_vector.indices, # type: ignore[union-attr] + "values": vector.sparse_vector.values, # type: ignore[union-attr] + } + else: + sparse = None + + return { + "id": vector.id, + "vector": vector.vector, + "sparseVector": sparse, + "metadata": vector.metadata, + "data": vector.data, + } + + +def query_requests_to_payload( queries: List[QueryRequest], ) -> Tuple[bool, List[Dict[str, Any]]]: - has_vector_query = False has_data_query = False - payloads = [] for query in queries: - payload = { - "topK": query.get("top_k", 10), - "includeVectors": query.get("include_vectors", False), - "includeMetadata": query.get("include_metadata", False), - "includeData": query.get("include_data", False), - "filter": query.get("filter", ""), - } + payload: Dict[str, Any] = {} + + if "top_k" in query: + payload["topK"] = query["top_k"] + + if "include_vectors" in query: + payload["includeVectors"] = query["include_vectors"] + + if "include_metadata" in query: + payload["includeMetadata"] = query["include_metadata"] + + if "include_data" in query: + payload["includeData"] = query["include_data"] + + if "filter" in query: + payload["filter"] = query["filter"] + + if "weighting_strategy" in query: + payload["weightingStrategy"] = query["weighting_strategy"].value + + if "fusion_algorithm" in query: + payload["fusionAlgorithm"] = query["fusion_algorithm"].value vector = query.get("vector") + sparse_vector = query.get("sparse_vector") data = query.get("data") - if data is None and vector is None: - raise ClientError("either `data` or `vector` values must be given") - if data is not None and vector is not None: - raise ClientError( - "`data` and `vector` values cannot be given at the same time" - ) - if data is 
not None:
-            if has_vector_query:
+            if vector is not None or sparse_vector is not None:
                 raise ClientError(
-                    "`data` and `vector` queries cannot be mixed in the same batch."
+                    "When the query contains `data`, it can't contain `vector` or `sparse_vector`."
                 )

+            if payloads and not has_data_query:
+                # Earlier queries in this batch were `vector`/`sparse_vector`
+                # queries, so a `data` query cannot be batched with them.
+                raise ClientError(
+                    "`data` and `vector`/`sparse_vector` queries cannot be mixed in the same batch."
+                )
+
             has_data_query = True
             payload["data"] = data
         else:
+            if vector is None and sparse_vector is None:
+                raise ClientError(
+                    "Query must contain at least one of `vector`, `sparse_vector`, or `data`."
+                )
+
             if has_data_query:
                 raise ClientError(
-                    "`data` and `vector` queries cannot be mixed in the same batch."
+                    "`data` and `vector`/`sparse_vector` queries cannot be mixed in the same batch."
                 )

-            has_vector_query = True
-            payload["vector"] = convert_to_list(vector)
+            if vector is not None:
+                payload["vector"] = to_list(vector)
+
+            if sparse_vector is not None:
+                sparse = to_sparse_vector(sparse_vector)
+                payload["sparseVector"] = {
+                    "indices": sparse.indices,
+                    "values": sparse.values,
+                }

         payloads.append(payload)

-    return has_vector_query, payloads
+    return not has_data_query, payloads
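
> Reviewer note (not part of the patch): taken together, an end-to-end sketch of the sparse-index surface this patch adds, assuming a sparse index and placeholder credentials:

```python
from upstash_vector import Index, SparseVector
from upstash_vector.types import WeightingStrategy

index = Index(url="<UPSTASH_VECTOR_REST_URL>", token="<UPSTASH_VECTOR_REST_TOKEN>")

# Sparse vectors can be SparseVector objects or (indices, values) tuples.
index.upsert(
    vectors=[
        ("id0", SparseVector([0, 1], [0.1, 0.2])),
        ("id1", ([1, 2], [0.2, 0.3]), {"key": "value"}),
    ]
)

results = index.query(
    sparse_vector=SparseVector([0, 1], [0.5, 0.5]),
    top_k=2,
    include_metadata=True,
    weighting_strategy=WeightingStrategy.IDF,
)

for result in results:
    print(result.id, result.score, result.metadata)
```
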