spotify · stephen29xie · Sep 10, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,7 @@ java/classpath.txt
 java/linux-build/include/*
 python/voyager-headers
 .asv/
+*.dSYM
 
 # Cmake
 CMakeLists.txt.user

diff --git a/cpp/src/Index.h b/cpp/src/Index.h
@@ -71,6 +71,11 @@ class Index {
 
   virtual hnswlib::labeltype addItem(std::vector<float> vector,
                                      std::optional<hnswlib::labeltype> id) = 0;
+
+  /**
+   * Add several vectors at once, given as a vector of float vectors (one
+   * inner vector per item) rather than as an NDArray<float, 2>.
+   *
+   * Returns a vector of labels.
+   *
+   * NOTE(review): the meaning of the defaults (empty `ids`, `numThreads` of
+   * -1) is not visible in this header; presumably they behave like the
+   * NDArray<float, 2> overload of addItems - confirm against the
+   * implementation.
+   */
+  virtual std::vector<hnswlib::labeltype>
+  addItems(std::vector<std::vector<float>> input,
+           std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) = 0;
+
   virtual std::vector<hnswlib::labeltype>
   addItems(NDArray<float, 2> input, std::vector<hnswlib::labeltype> ids = {},
            int numThreads = -1) = 0;
@@ -86,6 +91,10 @@ class Index {
   virtual std::tuple<std::vector<hnswlib::labeltype>, std::vector<float>>
   query(std::vector<float> queryVector, int k = 1, long queryEf = -1) = 0;
 
+  /**
+   * Query the index with several vectors at once, given as a vector of float
+   * vectors (one inner vector per query) rather than as an NDArray<float, 2>.
+   *
+   * Returns a tuple of two 2-D arrays: labels first, distances second.
+   *
+   * NOTE(review): the meaning of the defaults (`numThreads` of -1, `queryEf`
+   * of -1) is not visible in this header; presumably they behave like the
+   * NDArray<float, 2> overload of query - confirm against the implementation.
+   */
+  virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
+  query(std::vector<std::vector<float>> queryVectors, int k = 1,
+        int numThreads = -1, long queryEf = -1) = 0;
+
   virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
   query(NDArray<float, 2> queryVectors, int k = 1, int numThreads = -1,
         long queryEf = -1) = 0;

diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h
@@ -290,6 +290,24 @@ class TypedIndex : public Index {
     return addItems(NDArray<float, 2>(vector, {1, (int)vector.size()}), ids)[0];
   }
 
+  /**
+   * Add multiple vectors to the index, given as a vector of float vectors
+   * (one inner vector per item).
+   *
+   * The rows are flattened, in order, into one contiguous buffer, wrapped in
+   * an NDArray<float, 2> of shape {vectors.size(), vectors[0].size()} and
+   * forwarded to the NDArray overload of addItems.
+   *
+   * @param vectors    the vectors to add; every inner vector must have the
+   *                   same length as the first one.
+   * @param ids        passed through unchanged to the NDArray overload.
+   * @param numThreads passed through unchanged to the NDArray overload.
+   * @return the result of the NDArray overload of addItems.
+   * @throws std::invalid_argument if the inner vectors differ in length.
+   */
+  std::vector<hnswlib::labeltype>
+  addItems(const std::vector<std::vector<float>> vectors,
+           std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
+    // Convert the 2D array of float to NDArray<float, 2>
+    const int numVectors = static_cast<int>(vectors.size());
+    const int dimensions =
+        numVectors > 0 ? static_cast<int>(vectors[0].size()) : 0;
+    std::array<int, 2> shape = {numVectors, dimensions};
+
+    // Flatten the 2d array of floats. A ragged input would leave the buffer
+    // length out of sync with the declared shape, so reject it up front.
+    std::vector<float> flatArray;
+    flatArray.reserve(vectors.size() * (numVectors > 0 ? vectors[0].size() : 0));
+    for (const auto &vector : vectors) {
+      if (vector.size() != vectors[0].size()) {
+        throw std::invalid_argument(
+            "All vectors must have the same number of dimensions; expected " +
+            std::to_string(dimensions) + " but received a vector of size " +
+            std::to_string(vector.size()) + ".");
+      }
+      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+    }
+    NDArray<float, 2> ndarray(flatArray, shape);
+
+    return addItems(ndarray, ids, numThreads);
+  }
+
   std::vector<hnswlib::labeltype>
   addItems(NDArray<float, 2> floatInput,
            std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
@@ -502,6 +520,24 @@ class TypedIndex : public Index {
     return algorithmImpl->label_lookup_;
   }
 
+  /**
+   * Query the index with multiple vectors, given as a vector of float vectors
+   * (one inner vector per query).
+   *
+   * The rows are flattened, in order, into one contiguous buffer, wrapped in
+   * an NDArray<float, 2> of shape
+   * {floatQueryVectors.size(), floatQueryVectors[0].size()} and forwarded to
+   * the NDArray overload of query.
+   *
+   * @param floatQueryVectors the query vectors; every inner vector must have
+   *                          the same length as the first one.
+   * @param k                 passed through unchanged to the NDArray overload.
+   * @param numThreads        passed through unchanged to the NDArray overload.
+   * @param queryEf           passed through unchanged to the NDArray overload.
+   * @return the (labels, distances) tuple produced by the NDArray overload.
+   * @throws std::invalid_argument if the inner vectors differ in length.
+   */
+  std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
+  query(std::vector<std::vector<float>> floatQueryVectors, int k = 1,
+        int numThreads = -1, long queryEf = -1) {
+    // Convert the 2D array of float to NDArray<float, 2>
+    const int numVectors = static_cast<int>(floatQueryVectors.size());
+    const int dimensions =
+        numVectors > 0 ? static_cast<int>(floatQueryVectors[0].size()) : 0;
+    std::array<int, 2> shape = {numVectors, dimensions};
+
+    // Flatten the 2d array of floats. A ragged input would leave the buffer
+    // length out of sync with the declared shape, so reject it up front.
+    std::vector<float> flatArray;
+    flatArray.reserve(floatQueryVectors.size() *
+                      (numVectors > 0 ? floatQueryVectors[0].size() : 0));
+    for (const auto &vector : floatQueryVectors) {
+      if (vector.size() != floatQueryVectors[0].size()) {
+        throw std::invalid_argument(
+            "All query vectors must have the same number of dimensions; "
+            "expected " +
+            std::to_string(dimensions) + " but received a vector of size " +
+            std::to_string(vector.size()) + ".");
+      }
+      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+    }
+    NDArray<float, 2> ndarray(flatArray, shape);
+
+    return query(ndarray, k, numThreads, queryEf);
+  }
+
   std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
   query(NDArray<float, 2> floatQueryVectors, int k = 1, int numThreads = -1,
         long queryEf = -1) {

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
@@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp)  # Add any test files here
 # Create an executable for the tests
 add_executable(VoyagerTests ${TEST_FILES})
 
+# Compile the tests with debug symbols (-g)
+target_compile_options(VoyagerTests PRIVATE -g)
+
 # Link the test executable with the main project and Doctest
 # target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest)
 target_link_libraries(VoyagerTests

diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
@@ -1,49 +1,166 @@
 #include "doctest.h"
 
 #include "TypedIndex.h"
+#include "test_utils.cpp"
 #include <tuple>
 #include <type_traits>
 
+/**
+ * Assert that the index reports the expected space type, number of
+ * dimensions and storage data type. Space and storage type are compared
+ * through their toString() representations.
+ */
 template <typename dist_t, typename data_t = dist_t,
           typename scalefactor = std::ratio<1, 1>>
-void testCombination(TypedIndex<dist_t, data_t, scalefactor> &index,
-                     SpaceType spaceType, int numDimensions,
-                     StorageDataType storageType) {
-  CHECK(toString(index.getSpace()) == toString(spaceType));
-  CHECK(index.getNumDimensions() == numDimensions);
-  CHECK(toString(index.getStorageDataType()) == toString(storageType));
+void testIndexProperties(TypedIndex<dist_t, data_t, scalefactor> &index,
+                         SpaceType spaceType, int numDimensions,
+                         StorageDataType storageType) {
+  REQUIRE(toString(index.getSpace()) == toString(spaceType));
+  REQUIRE(index.getNumDimensions() == numDimensions);
+  REQUIRE(toString(index.getStorageDataType()) == toString(storageType));
 }
 
-TEST_CASE("Test combinations of different instantiations and sizes") {
-  std::vector<SpaceType> spaceTypesSet = {SpaceType::Euclidean,
-                                          SpaceType::InnerProduct};
-  std::vector<int> numDimensionsSet = {4, 16, 128, 1024};
-  std::vector<int> numElementsSet = {100, 1000, 100000};
+/**
+ * Test the query methods of the index. The index is populated with random
+ * vectors and then queried with those same vectors, first one at a time and
+ * then all at once through the bulk interface. The expected result is that
+ * each vector's nearest neighbor is itself and that the distance is zero
+ * (within `precisionTolerance`, to allow for the precision of the storage
+ * type).
+ *
+ * The label and distance assertions are skipped for E4M3 storage and for the
+ * InnerProduct space; only the result sizes/shapes are checked in those cases.
+ *
+ * @param index                  index to populate and query.
+ * @param numVectors             number of random vectors to add and query.
+ * @param numDimensions          dimensionality of each random vector.
+ * @param spaceType              space type, used to decide which assertions
+ *                               apply.
+ * @param storageType            storage data type, used to decide which
+ *                               assertions apply.
+ * @param testSingleVectorMethod if true, add the vectors one at a time with
+ *                               addItem; otherwise add them all with addItems.
+ * @param precisionTolerance     largest absolute distance accepted as "zero".
+ */
+template <typename dist_t, typename data_t = dist_t,
+          typename scalefactor = std::ratio<1, 1>>
+void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
+               int numDimensions, SpaceType spaceType,
+               StorageDataType storageType, bool testSingleVectorMethod,
+               float precisionTolerance) {
+  // create test data and ids (vector i is stored under label i)
+  const std::vector<std::vector<float>> inputData =
+      randomVectors(numVectors, numDimensions);
+  std::vector<hnswlib::labeltype> ids(numVectors);
+  for (int i = 0; i < numVectors; i++) {
+    ids[i] = i;
+  }
+
+  // add items to index
+  if (testSingleVectorMethod) {
+    for (auto id : ids) {
+      index.addItem(inputData[id], id);
+    }
+  } else {
+    index.addItems(inputData, ids, -1);
+  }
+
+  const int k = 1;
+  const float lowerBound = 0.0f - precisionTolerance;
+  const float upperBound = 0.0f + precisionTolerance;
+
+  /**
+   * E4M3 is too low precision for us to confidently assume that querying
+   * with the unquantized (fp32) vector will return the quantized vector as
+   * its NN. InnerProduct will have negative distance to the closest item,
+   * not zero. In both cases the label/distance assertions are skipped.
+   */
+  const bool expectSelfAtZeroDistance =
+      storageType != StorageDataType::E4M3 &&
+      spaceType != SpaceType::InnerProduct;
+
+  // Note: queryEf takes the values 100, 1000, ... up to numVectors, so with
+  // fewer than 100 vectors neither loop below issues any query.
+
+  // Use the single-query interface (query with a single target vector)
+  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
+    for (int i = 0; i < numVectors; i++) {
+
+      /**
+       * Use the raw inputData as target vectors for querying. We don't use the
+       * index data because once data has been added to the index, the model can
+       * change the "ground truth" by changing the data format.
+       */
+      const auto &targetVector = inputData[i];
+      const auto nearestNeighbor = index.query(targetVector, k, queryEf);
+
+      const auto &labels = std::get<0>(nearestNeighbor);
+      const auto &distances = std::get<1>(nearestNeighbor);
+      REQUIRE(static_cast<int>(labels.size()) == k);
+      REQUIRE(static_cast<int>(distances.size()) == k);
+
+      if (expectSelfAtZeroDistance) {
+        REQUIRE(labels[0] == static_cast<hnswlib::labeltype>(i));
+        REQUIRE(distances[0] >= lowerBound);
+        REQUIRE(distances[0] <= upperBound);
+      }
+    }
+  }
+
+  // Use the bulk-query interface (query with multiple target vectors at once)
+  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
+    const auto nearestNeighbors = index.query(
+        inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf);
+    const NDArray<hnswlib::labeltype, 2> &labels =
+        std::get<0>(nearestNeighbors);
+    const NDArray<dist_t, 2> &distances = std::get<1>(nearestNeighbors);
+    REQUIRE(labels.shape[0] == numVectors);
+    REQUIRE(labels.shape[1] == k);
+    REQUIRE(distances.shape[0] == numVectors);
+    REQUIRE(distances.shape[1] == k);
+
+    for (int i = 0; i < numVectors; i++) {
+      // Assumes the {numVectors, k} results are stored row by row, so the
+      // first neighbor of query i sits at offset i * k (equal to i for k == 1).
+      const auto label = labels.data[i * k];
+      const auto distance = distances.data[i * k];
+
+      if (expectSelfAtZeroDistance) {
+        REQUIRE(label == static_cast<hnswlib::labeltype>(i));
+        REQUIRE(distance >= lowerBound);
+        REQUIRE(distance <= upperBound);
+      }
+    }
+  }
+}
+
+TEST_CASE("Test combinations of different instantiations. Test that each "
+          "vector's NN is itself and distance is approximately zero.") {
+  std::unordered_map<StorageDataType, float> PRECISION_TOLERANCE_PER_DATA_TYPE =
+      {{StorageDataType::Float32, 0.00001f},
+       {StorageDataType::Float8, 0.10f},
+       {StorageDataType::E4M3, 0.20f}};
+  std::vector<SpaceType> spaceTypesSet = {
+      SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine};
+  std::vector<int> numDimensionsSet = {32};
+  std::vector<int> numVectorsSet = {2000};
   std::vector<StorageDataType> storageTypesSet = {
       StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3};
-
-  auto count = 0;
+  std::vector<bool> testSingleVectorMethods = {true, false};
 
   for (auto spaceType : spaceTypesSet) {
-    for (auto numDimensions : numDimensionsSet) {
-      for (auto numElements : numElementsSet) {
-        for (auto storageType : storageTypesSet) {
-          SUBCASE("Test instantiation ") {
-            CAPTURE(spaceType);
-            CAPTURE(numDimensions);
-            CAPTURE(numElements);
-            CAPTURE(storageType);
-
-            if (storageType == StorageDataType::Float8) {
-              auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
-                  spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
-            } else if (storageType == StorageDataType::Float32) {
-              auto index = TypedIndex<float>(spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
-            } else if (storageType == StorageDataType::E4M3) {
-              auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
+    for (auto storageType : storageTypesSet) {
+      for (auto numDimensions : numDimensionsSet) {
+        for (auto numVectors : numVectorsSet) {
+          for (auto testSingleVectorMethod : testSingleVectorMethods) {
+
+            SUBCASE("Test instantiation ") {
+              CAPTURE(spaceType);
+              CAPTURE(numDimensions);
+              CAPTURE(numVectors);
+              CAPTURE(storageType);
+              CAPTURE(testSingleVectorMethod);
+
+              if (storageType == StorageDataType::Float8) {
+                auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
+                    spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              } else if (storageType == StorageDataType::Float32) {
+                auto index = TypedIndex<float>(spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              } else if (storageType == StorageDataType::E4M3) {
+                auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              }
             }
           }
         }

diff --git a/cpp/test/test_utils.cpp b/cpp/test/test_utils.cpp
@@ -0,0 +1,70 @@
+#include <random>
+#include <vector>
+
+#include "array_utils.h"
+
+/**
+ * Build a {numVectors, dimensions} NDArray<float, 2> filled with random
+ * values that are multiples of 0.1 in the range [-1.0, 0.9].
+ *
+ * The generator is seeded from std::random_device on every call, so the
+ * values are not reproducible from one call (or test run) to the next.
+ *
+ * Declared inline because this file is pulled into tests with #include; a
+ * non-inline definition would clash as soon as a second translation unit
+ * included it.
+ */
+inline NDArray<float, 2> randomQuantizedVectorsNDArray(int numVectors,
+                                                       int dimensions) {
+  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  const int numElements = numVectors * dimensions;
+  for (int i = 0; i < numElements; ++i) {
+    // Map [0, 1) to [-10, 10), truncate toward zero, then scale to tenths.
+    vectors.data[i] = static_cast<int>(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f;
+  }
+
+  return vectors;
+}
+
+/**
+ * Build a {numVectors, dimensions} NDArray<float, 2> filled with random
+ * values drawn uniformly from [0, 1) and rescaled to the range [-1, 1].
+ *
+ * The generator is seeded from std::random_device on every call, so the
+ * values are not reproducible from one call (or test run) to the next.
+ *
+ * Declared inline because this file is pulled into tests with #include; a
+ * non-inline definition would clash as soon as a second translation unit
+ * included it.
+ */
+inline NDArray<float, 2> randomVectorsNDArray(int numVectors, int dimensions) {
+  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  const int numElements = numVectors * dimensions;
+  for (int i = 0; i < numElements; ++i) {
+    vectors.data[i] = static_cast<float>(dis(gen)) * 2 - 1;
+  }
+
+  return vectors;
+}
+
+/**
+ * Build `numVectors` vectors of `dimensions` floats each, filled with random
+ * values that are multiples of 0.1 in the range [-1.0, 0.9].
+ *
+ * The generator is seeded from std::random_device on every call, so the
+ * values are not reproducible from one call (or test run) to the next.
+ *
+ * Declared inline because this file is pulled into tests with #include; a
+ * non-inline definition would clash as soon as a second translation unit
+ * included it.
+ */
+inline std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
+                                                              int dimensions) {
+  std::vector<std::vector<float>> vectors(numVectors,
+                                          std::vector<float>(dimensions));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  for (auto &row : vectors) {
+    for (auto &value : row) {
+      // Map [0, 1) to [-10, 10), truncate toward zero, then scale to tenths.
+      value = static_cast<int>(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f;
+    }
+  }
+
+  return vectors;
+}
+
+/**
+ * Build `numVectors` vectors of `dimensions` floats each, filled with random
+ * values drawn uniformly from [0, 1) and rescaled to the range [-1, 1].
+ *
+ * The generator is seeded from std::random_device on every call, so the
+ * values are not reproducible from one call (or test run) to the next.
+ *
+ * Declared inline because this file is pulled into tests with #include; a
+ * non-inline definition would clash as soon as a second translation unit
+ * included it.
+ */
+inline std::vector<std::vector<float>> randomVectors(int numVectors,
+                                                     int dimensions) {
+  std::vector<std::vector<float>> vectors(numVectors,
+                                          std::vector<float>(dimensions));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  for (auto &row : vectors) {
+    for (auto &value : row) {
+      value = static_cast<float>(dis(gen)) * 2 - 1;
+    }
+  }
+
+  return vectors;
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,6 +19,7 @@ java/classpath.txt @@
     java/linux-build/include/*
     python/voyager-headers
     .asv/
+    *.dSYM
     # Cmake
     CMakeLists.txt.user
@@ Expand Down @@