Skip to content

Commit

Permalink
enhance: support null value in index (milvus-io#35238)
Browse files Browse the repository at this point in the history
milvus-io#31728

---------

Signed-off-by: lixinguo <[email protected]>
Co-authored-by: lixinguo <[email protected]>
  • Loading branch information
smellthemoon and lixinguo authored Aug 16, 2024
1 parent f87af9b commit 80dbe87
Show file tree
Hide file tree
Showing 28 changed files with 921 additions and 158 deletions.
2 changes: 2 additions & 0 deletions internal/core/src/common/Consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,5 @@ const int64_t DEFAULT_MAX_OUTPUT_SIZE = 67108864; // bytes, 64MB
// Default timeout for chunk manager requests; milliseconds, per the _MS suffix.
const int64_t DEFAULT_CHUNK_MANAGER_REQUEST_TIMEOUT_MS = 10000;

// Bound that BitmapIndex::Build compares its number of distinct keys against.
const int64_t DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND = 500;

// Sentinel id equal to the largest size_t value (all bits set).
// NOTE(review): presumably marks the null key in the marisa index - confirm.
const size_t MARISA_NULL_KEY_ID = static_cast<size_t>(-1);
4 changes: 2 additions & 2 deletions internal/core/src/common/FieldData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
ssize_t byte_count = (element_count + 7) / 8;
// Note: if 'nullable == true` and valid_data is nullptr
// means null_count == 0, will fill it with 0xFF
if (valid_data == nullptr) {
valid_data_.resize(byte_count, 0xFF);
if (!valid_data) {
valid_data_.assign(byte_count, 0xFF);
} else {
std::copy_n(valid_data, byte_count, valid_data_.data());
}
Expand Down
2 changes: 1 addition & 1 deletion internal/core/src/common/FieldDataInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
valid_data_.assign((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}
Expand Down
59 changes: 49 additions & 10 deletions internal/core/src/index/BitmapIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,14 @@ BitmapIndex<T>::Build(size_t n, const T* data) {
PanicInfo(DataIsEmpty, "BitmapIndex can not build null values");
}

total_num_rows_ = n;
valid_bitset = TargetBitmap(total_num_rows_, false);

T* p = const_cast<T*>(data);
for (int i = 0; i < n; ++i, ++p) {
data_[*p].add(i);
valid_bitset.set(i);
}
total_num_rows_ = n;

if (data_.size() < DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) {
for (auto it = data_.begin(); it != data_.end(); ++it) {
Expand All @@ -95,8 +98,11 @@ BitmapIndex<T>::BuildPrimitiveField(
for (const auto& data : field_datas) {
auto slice_row_num = data->get_num_rows();
for (size_t i = 0; i < slice_row_num; ++i) {
auto val = reinterpret_cast<const T*>(data->RawValue(i));
data_[*val].add(offset);
if (data->is_valid(i)) {
auto val = reinterpret_cast<const T*>(data->RawValue(i));
data_[*val].add(offset);
valid_bitset.set(offset);
}
offset++;
}
}
Expand All @@ -114,6 +120,7 @@ BitmapIndex<T>::BuildWithFieldData(
PanicInfo(DataIsEmpty, "scalar bitmap index can not build null values");
}
total_num_rows_ = total_num_rows;
valid_bitset = TargetBitmap(total_num_rows_, false);

switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
Expand Down Expand Up @@ -151,12 +158,14 @@ BitmapIndex<T>::BuildArrayField(const std::vector<FieldDataPtr>& field_datas) {
for (const auto& data : field_datas) {
auto slice_row_num = data->get_num_rows();
for (size_t i = 0; i < slice_row_num; ++i) {
auto array =
reinterpret_cast<const milvus::Array*>(data->RawValue(i));

for (size_t j = 0; j < array->length(); ++j) {
auto val = static_cast<T>(array->template get_data<GetType>(j));
data_[val].add(offset);
if (data->is_valid(i)) {
auto array =
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
for (size_t j = 0; j < array->length(); ++j) {
auto val = array->template get_data<T>(j);
data_[val].add(offset);
}
valid_bitset.set(offset);
}
offset++;
}
Expand Down Expand Up @@ -330,6 +339,9 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
}
}
}

Expand All @@ -355,6 +367,9 @@ BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
}
}
}

Expand All @@ -367,6 +382,7 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
index_meta_buffer->size);
auto index_length = index_meta.first;
total_num_rows_ = index_meta.second;
valid_bitset = TargetBitmap(total_num_rows_, false);

auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
DeserializeIndexData(index_data_buffer->data.get(), index_length);
Expand All @@ -389,7 +405,7 @@ BitmapIndex<T>::Load(milvus::tracer::TraceContext ctx, const Config& config) {
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
Expand Down Expand Up @@ -442,6 +458,8 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
}
}
// In(null) and NotIn(null) are both false, so null rows must be masked out with the not-null bitmap
res &= valid_bitset;
return res;
} else {
TargetBitmap res(total_num_rows_, false);
Expand All @@ -452,10 +470,31 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
}
res.flip();
// In(null) and NotIn(null) are both false, so null rows must be masked out with the not-null bitmap
res &= valid_bitset;
return res;
}
}

// Returns a bitmap over all total_num_rows_ rows in which a bit is set
// exactly when the matching bit of valid_bitset is clear.
// Asserts that the index has already been built.
template <typename T>
const TargetBitmap
BitmapIndex<T>::IsNull() {
    AssertInfo(is_built_, "index has not been built");
    // Start from "every row", keep only the valid rows, then invert.
    TargetBitmap null_rows(total_num_rows_, true);
    null_rows &= valid_bitset;
    null_rows.flip();
    return null_rows;
}

// Returns a bitmap over all total_num_rows_ rows in which a bit is set
// exactly when the matching bit of valid_bitset is set.
// Asserts that the index has already been built.
template <typename T>
const TargetBitmap
BitmapIndex<T>::IsNotNull() {
    AssertInfo(is_built_, "index has not been built");
    // An all-ones bitmap AND-ed with valid_bitset leaves just the valid rows.
    TargetBitmap valid_rows(total_num_rows_, true);
    valid_rows &= valid_bitset;
    return valid_rows;
}

template <typename T>
TargetBitmap
BitmapIndex<T>::RangeForBitset(const T value, const OpType op) {
Expand Down
9 changes: 9 additions & 0 deletions internal/core/src/index/BitmapIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ class BitmapIndex : public ScalarIndex<T> {
const TargetBitmap
NotIn(size_t n, const T* values) override;

// Returns a bitmap with one bit per indexed row; a bit is set when the
// row's bit in valid_bitset is clear. Asserts that the index is built.
const TargetBitmap
IsNull() override;

// Returns a bitmap with one bit per indexed row; a bit is set when the
// row's bit in valid_bitset is set. Asserts that the index is built.
const TargetBitmap
IsNotNull() override;

const TargetBitmap
Range(T value, OpType op) override;

Expand Down Expand Up @@ -205,6 +211,9 @@ class BitmapIndex : public ScalarIndex<T> {
size_t total_num_rows_{0};
proto::schema::FieldSchema schema_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;

// One bit per row, sized to total_num_rows_ when the index is built or
// loaded. A bit is set for rows treated as non-null: during build, rows
// for which is_valid(i) holds; after deserialization, rows that appear in
// any key's bitmap. Used to mask NotIn results and to answer IsNull and
// IsNotNull without scanning the per-key bitmaps.
TargetBitmap valid_bitset;
};

} // namespace index
Expand Down
2 changes: 1 addition & 1 deletion internal/core/src/index/HybridScalarIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
Expand Down
10 changes: 10 additions & 0 deletions internal/core/src/index/HybridScalarIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ class HybridScalarIndex : public ScalarIndex<T> {
return internal_index_->NotIn(n, values);
}

// Forwards to the wrapped internal_index_ and returns its IsNull() result.
const TargetBitmap
IsNull() override {
return internal_index_->IsNull();
}

// Forwards to the wrapped internal_index_ and returns its IsNotNull() result.
const TargetBitmap
IsNotNull() override {
return internal_index_->IsNotNull();
}

const TargetBitmap
Query(const DatasetPtr& dataset) override {
return internal_index_->Query(dataset);
Expand Down
88 changes: 83 additions & 5 deletions internal/core/src/index/InvertedIndexTantivy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <boost/filesystem.hpp>
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <cstddef>
#include <vector>
#include "InvertedIndexTantivy.h"

namespace milvus::index {
Expand Down Expand Up @@ -105,8 +107,14 @@ InvertedIndexTantivy<T>::finish() {
// Serializes the in-memory part of the index: the null row offsets held in
// null_offset are copied byte-for-byte into a buffer stored under the key
// "index_null_offset", and the resulting BinarySet is passed through
// milvus::Disassemble before being returned. `config` is not used here.
template <typename T>
BinarySet
InvertedIndexTantivy<T>::Serialize(const Config& config) {
    auto index_valid_data_length = null_offset.size() * sizeof(size_t);
    std::shared_ptr<uint8_t[]> index_valid_data(
        new uint8_t[index_valid_data_length]);
    // With no null rows, null_offset.data() may be a null pointer (as
    // std::vector allows), and passing a null pointer to memcpy is
    // undefined behaviour even for a zero length - so skip the copy.
    if (index_valid_data_length > 0) {
        memcpy(index_valid_data.get(),
               null_offset.data(),
               index_valid_data_length);
    }

    BinarySet res_set;
    // The entry is appended even when empty so that the
    // "index_null_offset" key is always present in the result.
    res_set.Append(
        "index_null_offset", index_valid_data, index_valid_data_length);
    milvus::Disassemble(res_set);
    return res_set;
}

Expand Down Expand Up @@ -137,7 +145,8 @@ InvertedIndexTantivy<T>::Upload(const Config& config) {
for (auto& file : remote_paths_to_size) {
ret.Append(file.first, nullptr, file.second);
}

auto binary_set = Serialize(config);
mem_file_manager_->AddFile(binary_set);
return ret;
}

Expand Down Expand Up @@ -173,6 +182,26 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
files_value.end());
disk_file_manager_->CacheIndexToDisk(files_value);
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
auto index_valid_data_file =
mem_file_manager_->GetRemoteIndexObjectPrefix() +
std::string("/index_null_offset");
std::vector<std::string> file;
file.push_back(index_valid_data_file);
auto index_datas = mem_file_manager_->LoadIndexToMemory(file);
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
auto index_valid_data = binary_set.GetByName("index_null_offset");
null_offset.resize((size_t)index_valid_data->size / sizeof(size_t));
memcpy(null_offset.data(),
index_valid_data->data.get(),
(size_t)index_valid_data->size);
}

inline void
Expand Down Expand Up @@ -212,6 +241,27 @@ InvertedIndexTantivy<T>::In(size_t n, const T* values) {
return bitset;
}

// Builds a bitmap of Count() bits and sets the bit of every row offset
// recorded in null_offset; all other bits keep the bitmap's initial value.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::IsNull() {
    TargetBitmap null_rows(Count());
    for (const auto row : null_offset) {
        null_rows.set(row);
    }
    return null_rows;
}

// Builds a bitmap of Count() bits with every bit set, then clears the bit
// of every row offset recorded in null_offset.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::IsNotNull() {
    TargetBitmap valid_rows(Count(), true);
    for (const auto row : null_offset) {
        valid_rows.reset(row);
    }
    return valid_rows;
}

template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::InApplyFilter(
Expand Down Expand Up @@ -242,6 +292,9 @@ InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
auto array = wrapper_->term_query(values[i]);
apply_hits(bitset, array, false);
}
for (size_t i = 0; i < null_offset.size(); ++i) {
bitset.reset(null_offset[i]);
}
return bitset;
}

Expand Down Expand Up @@ -378,6 +431,13 @@ template <typename T>
void
InvertedIndexTantivy<T>::BuildWithFieldData(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
if (schema_.nullable()) {
int64_t total = 0;
for (const auto& data : field_datas) {
total += data->get_null_count();
}
null_offset.reserve(total);
}
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
case proto::schema::DataType::Int8:
Expand All @@ -390,6 +450,17 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
case proto::schema::DataType::VarChar: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
if (schema_.nullable()) {
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_->add_multi_data<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i));
}
continue;
}
wrapper_->add_data<T>(static_cast<const T*>(data->Data()), n);
}
break;
Expand Down Expand Up @@ -417,9 +488,12 @@ InvertedIndexTantivy<T>::build_index_for_array(
for (int64_t i = 0; i < n; i++) {
assert(array_column[i].get_element_type() ==
static_cast<DataType>(schema_.element_type()));
if (schema_.nullable() && !data->is_valid(i)) {
null_offset.push_back(i);
}
auto length = data->is_valid(i) ? array_column[i].length() : 0;
wrapper_->template add_multi_data(
reinterpret_cast<const T*>(array_column[i].data()),
array_column[i].length());
reinterpret_cast<const T*>(array_column[i].data()), length);
}
}
}
Expand All @@ -435,12 +509,16 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
Assert(IsStringDataType(array_column[i].get_element_type()));
Assert(IsStringDataType(
static_cast<DataType>(schema_.element_type())));
if (schema_.nullable() && !data->is_valid(i)) {
null_offset.push_back(i);
}
std::vector<std::string> output;
for (int64_t j = 0; j < array_column[i].length(); j++) {
output.push_back(
array_column[i].template get_data<std::string>(j));
}
wrapper_->template add_multi_data(output.data(), output.size());
auto length = data->is_valid(i) ? output.size() : 0;
wrapper_->template add_multi_data(output.data(), length);
}
}
}
Expand Down
Loading

0 comments on commit 80dbe87

Please sign in to comment.