diff --git a/internal/core/src/common/Consts.h b/internal/core/src/common/Consts.h index 5ccf8e8b4ee7c..6b5d645064095 100644 --- a/internal/core/src/common/Consts.h +++ b/internal/core/src/common/Consts.h @@ -69,3 +69,5 @@ const int64_t DEFAULT_MAX_OUTPUT_SIZE = 67108864; // bytes, 64MB const int64_t DEFAULT_CHUNK_MANAGER_REQUEST_TIMEOUT_MS = 10000; const int64_t DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND = 500; + +const size_t MARISA_NULL_KEY_ID = -1; diff --git a/internal/core/src/common/FieldData.cpp b/internal/core/src/common/FieldData.cpp index bd913d6541567..f64e677d9a036 100644 --- a/internal/core/src/common/FieldData.cpp +++ b/internal/core/src/common/FieldData.cpp @@ -69,8 +69,8 @@ FieldDataImpl::FillFieldData( ssize_t byte_count = (element_count + 7) / 8; // Note: if 'nullable == true` and valid_data is nullptr // means null_count == 0, will fill it with 0xFF - if (valid_data == nullptr) { - valid_data_.resize(byte_count, 0xFF); + if (!valid_data) { + valid_data_.assign(byte_count, 0xFF); } else { std::copy_n(valid_data, byte_count, valid_data_.data()); } diff --git a/internal/core/src/common/FieldDataInterface.h b/internal/core/src/common/FieldDataInterface.h index 2fab8b8394193..3de529cb36a4a 100644 --- a/internal/core/src/common/FieldDataInterface.h +++ b/internal/core/src/common/FieldDataInterface.h @@ -476,7 +476,7 @@ class FieldDataJsonImpl : public FieldDataImpl { if (IsNullable()) { auto valid_data = array->null_bitmap_data(); if (valid_data == nullptr) { - valid_data_.resize((n + 7) / 8, 0xFF); + valid_data_.assign((n + 7) / 8, 0xFF); } else { std::copy_n(valid_data, (n + 7) / 8, valid_data_.data()); } diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index a2576610a9b23..0dfc2506cbc28 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -69,11 +69,14 @@ BitmapIndex::Build(size_t n, const T* data) { PanicInfo(DataIsEmpty, "BitmapIndex can not build null values"); } + total_num_rows_ = n; + valid_bitset = TargetBitmap(total_num_rows_, false); + T* p = const_cast(data); for (int i = 0; i < n; ++i, ++p) { data_[*p].add(i); + valid_bitset.set(i); } - total_num_rows_ = n; if (data_.size() < DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) { for (auto it = data_.begin(); it != data_.end(); ++it) { @@ -95,8 +98,11 @@ BitmapIndex::BuildPrimitiveField( for (const auto& data : field_datas) { auto slice_row_num = data->get_num_rows(); for (size_t i = 0; i < slice_row_num; ++i) { - auto val = reinterpret_cast(data->RawValue(i)); - data_[*val].add(offset); + if (data->is_valid(i)) { + auto val = reinterpret_cast(data->RawValue(i)); + data_[*val].add(offset); + valid_bitset.set(offset); + } offset++; } } @@ -114,6 +120,7 @@ BitmapIndex::BuildWithFieldData( PanicInfo(DataIsEmpty, "scalar bitmap index can not build null values"); } total_num_rows_ = total_num_rows; + valid_bitset = TargetBitmap(total_num_rows_, false); switch (schema_.data_type()) { case proto::schema::DataType::Bool: @@ -151,12 +158,14 @@ BitmapIndex::BuildArrayField(const std::vector& field_datas) { for (const auto& data : field_datas) { auto slice_row_num = data->get_num_rows(); for (size_t i = 0; i < slice_row_num; ++i) { - auto array = - reinterpret_cast(data->RawValue(i)); - - for (size_t j = 0; j < array->length(); ++j) { - auto val = static_cast(array->template get_data(j)); - data_[val].add(offset); + if (data->is_valid(i)) { + auto array = + reinterpret_cast(data->RawValue(i)); + for (size_t j = 0; j < array->length(); ++j) { + auto val = array->template get_data(j); + data_[val].add(offset); + } + valid_bitset.set(offset); } offset++; } @@ -330,6 +339,9 @@ BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, } else { data_[key] = value; } + for (const auto& v : value) { + valid_bitset.set(v); + } } } @@ -355,6 +367,9 @@ BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, } else { data_[key] = value; } + for (const auto& v : value) { + valid_bitset.set(v); + } } } @@ -367,6 +382,7 @@ BitmapIndex::LoadWithoutAssemble(const BinarySet& binary_set, index_meta_buffer->size); auto index_length = index_meta.first; total_num_rows_ = index_meta.second; + valid_bitset = TargetBitmap(total_num_rows_, false); auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA); DeserializeIndexData(index_data_buffer->data.get(), index_length); @@ -389,7 +405,7 @@ BitmapIndex::Load(milvus::tracer::TraceContext ctx, const Config& config) { AssembleIndexDatas(index_datas); BinarySet binary_set; for (auto& [key, data] : index_datas) { - auto size = data->Size(); + auto size = data->DataSize(); auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction auto buf = std::shared_ptr( (uint8_t*)const_cast(data->Data()), deleter); @@ -442,6 +458,8 @@ BitmapIndex::NotIn(const size_t n, const T* values) { } } } + // NotIn(null) and In(null) is both false, need to mask with IsNotNull operate + res &= valid_bitset; return res; } else { TargetBitmap res(total_num_rows_, false); @@ -452,10 +470,31 @@ BitmapIndex::NotIn(const size_t n, const T* values) { } } res.flip(); + // NotIn(null) and In(null) is both false, need to mask with IsNotNull operate + res &= valid_bitset; return res; } } +template +const TargetBitmap +BitmapIndex::IsNull() { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap res(total_num_rows_, true); + res &= valid_bitset; + res.flip(); + return res; +} + +template +const TargetBitmap +BitmapIndex::IsNotNull() { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap res(total_num_rows_, true); + res &= valid_bitset; + return res; +} + template TargetBitmap BitmapIndex::RangeForBitset(const T value, const OpType op) { diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index c37cfec1cdccb..3bf279cf8b75b 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -82,6 +82,12 @@ class BitmapIndex : public ScalarIndex { const TargetBitmap NotIn(size_t n, const T* values) override; + const TargetBitmap + IsNull() override; + + const TargetBitmap + IsNotNull() override; + const TargetBitmap Range(T value, OpType op) override; @@ -205,6 +211,9 @@ class BitmapIndex : public ScalarIndex { size_t total_num_rows_{0}; proto::schema::FieldSchema schema_; std::shared_ptr file_manager_; + + // generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate + TargetBitmap valid_bitset; }; } // namespace index diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index e9280523df9a0..84870bc09d118 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -358,7 +358,7 @@ HybridScalarIndex::Load(milvus::tracer::TraceContext ctx, AssembleIndexDatas(index_datas); BinarySet binary_set; for (auto& [key, data] : index_datas) { - auto size = data->Size(); + auto size = data->DataSize(); auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction auto buf = std::shared_ptr( (uint8_t*)const_cast(data->Data()), deleter); diff --git a/internal/core/src/index/HybridScalarIndex.h b/internal/core/src/index/HybridScalarIndex.h index 4a9c60d6bd9f7..0829afc963fbc 100644 --- a/internal/core/src/index/HybridScalarIndex.h +++ b/internal/core/src/index/HybridScalarIndex.h @@ -87,6 +87,16 @@ class HybridScalarIndex : public ScalarIndex { return internal_index_->NotIn(n, values); } + const TargetBitmap + IsNull() override { + return internal_index_->IsNull(); + } + + const TargetBitmap + IsNotNull() override { + return internal_index_->IsNotNull(); + } + const TargetBitmap Query(const DatasetPtr& dataset) override { return internal_index_->Query(dataset); diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index 6de712ecaabe3..adcfdab0c33e9 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "InvertedIndexTantivy.h" namespace milvus::index { @@ -105,8 +107,14 @@ InvertedIndexTantivy::finish() { template BinarySet InvertedIndexTantivy::Serialize(const Config& config) { + auto index_valid_data_length = null_offset.size() * sizeof(size_t); + std::shared_ptr index_valid_data( + new uint8_t[index_valid_data_length]); + memcpy(index_valid_data.get(), null_offset.data(), index_valid_data_length); BinarySet res_set; - + res_set.Append( + "index_null_offset", index_valid_data, index_valid_data_length); + milvus::Disassemble(res_set); return res_set; } @@ -137,7 +145,8 @@ InvertedIndexTantivy::Upload(const Config& config) { for (auto& file : remote_paths_to_size) { ret.Append(file.first, nullptr, file.second); } - + auto binary_set = Serialize(config); + mem_file_manager_->AddFile(binary_set); return ret; } @@ -173,6 +182,26 @@ InvertedIndexTantivy::Load(milvus::tracer::TraceContext ctx, files_value.end()); disk_file_manager_->CacheIndexToDisk(files_value); wrapper_ = std::make_shared(prefix.c_str()); + auto index_valid_data_file = + mem_file_manager_->GetRemoteIndexObjectPrefix() + + std::string("/index_null_offset"); + std::vector file; + file.push_back(index_valid_data_file); + auto index_datas = mem_file_manager_->LoadIndexToMemory(file); + AssembleIndexDatas(index_datas); + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->DataSize(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + auto index_valid_data = binary_set.GetByName("index_null_offset"); + null_offset.resize((size_t)index_valid_data->size / sizeof(size_t)); + memcpy(null_offset.data(), + index_valid_data->data.get(), + (size_t)index_valid_data->size); } inline void @@ -212,6 +241,27 @@ InvertedIndexTantivy::In(size_t n, const T* values) { return bitset; } +template +const TargetBitmap +InvertedIndexTantivy::IsNull() { + TargetBitmap bitset(Count()); + + for (size_t i = 0; i < null_offset.size(); ++i) { + bitset.set(null_offset[i]); + } + return bitset; +} + +template +const TargetBitmap +InvertedIndexTantivy::IsNotNull() { + TargetBitmap bitset(Count(), true); + for (size_t i = 0; i < null_offset.size(); ++i) { + bitset.reset(null_offset[i]); + } + return bitset; +} + template const TargetBitmap InvertedIndexTantivy::InApplyFilter( @@ -242,6 +292,9 @@ InvertedIndexTantivy::NotIn(size_t n, const T* values) { auto array = wrapper_->term_query(values[i]); apply_hits(bitset, array, false); } + for (size_t i = 0; i < null_offset.size(); ++i) { + bitset.reset(null_offset[i]); + } return bitset; } @@ -378,6 +431,13 @@ template void InvertedIndexTantivy::BuildWithFieldData( const std::vector>& field_datas) { + if (schema_.nullable()) { + int64_t total = 0; + for (const auto& data : field_datas) { + total += data->get_null_count(); + } + null_offset.reserve(total); + } switch (schema_.data_type()) { case proto::schema::DataType::Bool: case proto::schema::DataType::Int8: @@ -390,6 +450,17 @@ InvertedIndexTantivy::BuildWithFieldData( case proto::schema::DataType::VarChar: { for (const auto& data : field_datas) { auto n = data->get_num_rows(); + if (schema_.nullable()) { + for (int i = 0; i < n; i++) { + if (!data->is_valid(i)) { + null_offset.push_back(i); + } + wrapper_->add_multi_data( + static_cast(data->RawValue(i)), + data->is_valid(i)); + } + continue; + } wrapper_->add_data(static_cast(data->Data()), n); } break; @@ -417,9 +488,12 @@ InvertedIndexTantivy::build_index_for_array( for (int64_t i = 0; i < n; i++) { assert(array_column[i].get_element_type() == static_cast(schema_.element_type())); + if (schema_.nullable() && !data->is_valid(i)) { + null_offset.push_back(i); + } + auto length = data->is_valid(i) ? array_column[i].length() : 0; wrapper_->template add_multi_data( - reinterpret_cast(array_column[i].data()), - array_column[i].length()); + reinterpret_cast(array_column[i].data()), length); } } } @@ -435,12 +509,16 @@ InvertedIndexTantivy::build_index_for_array( Assert(IsStringDataType(array_column[i].get_element_type())); Assert(IsStringDataType( static_cast(schema_.element_type()))); + if (schema_.nullable() && !data->is_valid(i)) { + null_offset.push_back(i); + } std::vector output; for (int64_t j = 0; j < array_column[i].length(); j++) { output.push_back( array_column[i].template get_data(j)); } - wrapper_->template add_multi_data(output.data(), output.size()); + auto length = data->is_valid(i) ? output.size() : 0; + wrapper_->template add_multi_data(output.data(), length); } } } diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 12165c572bc8a..7c3ade38b3747 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -11,6 +11,8 @@ #pragma once +#include +#include #include "common/RegexQuery.h" #include "index/Index.h" #include "storage/FileManager.h" @@ -80,12 +82,8 @@ class InvertedIndexTantivy : public ScalarIndex { const void* values, const Config& config = {}) override; - /* - * deprecated. - * TODO: why not remove this? - */ BinarySet - Serialize(const Config& config /* not used */) override; + Serialize(const Config& config) override; BinarySet Upload(const Config& config = {}) override; @@ -101,6 +99,12 @@ class InvertedIndexTantivy : public ScalarIndex { const TargetBitmap In(size_t n, const T* values) override; + const TargetBitmap + IsNull() override; + + const TargetBitmap + IsNotNull() override; + const TargetBitmap InApplyFilter( size_t n, @@ -193,5 +197,9 @@ class InvertedIndexTantivy : public ScalarIndex { */ MemFileManagerPtr mem_file_manager_; DiskFileManagerPtr disk_file_manager_; + + // all data need to be built to align the offset + // so need to store null_offset in inverted index additionally + std::vector null_offset{}; }; } // namespace milvus::index diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index bdb576dc2fa34..1a3067256adce 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -82,6 +82,12 @@ class ScalarIndex : public IndexBase { virtual const TargetBitmap In(size_t n, const T* values) = 0; + virtual const TargetBitmap + IsNull() = 0; + + virtual const TargetBitmap + IsNotNull() = 0; + virtual const TargetBitmap InApplyFilter(size_t n, const T* values, diff --git a/internal/core/src/index/ScalarIndexSort.cpp b/internal/core/src/index/ScalarIndexSort.cpp index e2d1bdb17a4bd..3d206a697f738 100644 --- a/internal/core/src/index/ScalarIndexSort.cpp +++ b/internal/core/src/index/ScalarIndexSort.cpp @@ -68,10 +68,13 @@ ScalarIndexSort::Build(size_t n, const T* values) { PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!"); } data_.reserve(n); + total_num_rows_ = n; + valid_bitset = TargetBitmap(total_num_rows_, false); idx_to_offsets_.resize(n); T* p = const_cast(values); for (size_t i = 0; i < n; ++i) { data_.emplace_back(IndexStructure(*p++, i)); + valid_bitset.set(i); } std::sort(data_.begin(), data_.end()); for (size_t i = 0; i < data_.size(); ++i) { @@ -84,28 +87,33 @@ template void ScalarIndexSort::BuildWithFieldData( const std::vector& field_datas) { - int64_t total_num_rows = 0; + int64_t length = 0; for (const auto& data : field_datas) { - total_num_rows += data->get_num_rows(); + total_num_rows_ += data->get_num_rows(); + length += data->get_num_rows() - data->get_null_count(); } - if (total_num_rows == 0) { + if (length == 0) { PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!"); } - data_.reserve(total_num_rows); + data_.reserve(length); + valid_bitset = TargetBitmap(total_num_rows_, false); int64_t offset = 0; for (const auto& data : field_datas) { auto slice_num = data->get_num_rows(); for (size_t i = 0; i < slice_num; ++i) { - auto value = reinterpret_cast(data->RawValue(i)); - data_.emplace_back(IndexStructure(*value, offset)); + if (data->is_valid(i)) { + auto value = reinterpret_cast(data->RawValue(i)); + data_.emplace_back(IndexStructure(*value, offset)); + valid_bitset.set(offset); + } offset++; } } std::sort(data_.begin(), data_.end()); - idx_to_offsets_.resize(total_num_rows); - for (size_t i = 0; i < total_num_rows; ++i) { + idx_to_offsets_.resize(total_num_rows_); + for (size_t i = 0; i < length; ++i) { idx_to_offsets_[data_[i].idx_] = i; } is_built_ = true; @@ -124,9 +132,13 @@ ScalarIndexSort::Serialize(const Config& config) { auto index_size = data_.size(); memcpy(index_length.get(), &index_size, sizeof(size_t)); + std::shared_ptr index_num_rows(new uint8_t[sizeof(size_t)]); + memcpy(index_num_rows.get(), &total_num_rows_, sizeof(size_t)); + BinarySet res_set; res_set.Append("index_data", index_data, index_data_size); res_set.Append("index_length", index_length, sizeof(size_t)); + res_set.Append("index_num_rows", index_num_rows, sizeof(size_t)); milvus::Disassemble(res_set); @@ -158,11 +170,18 @@ ScalarIndexSort::LoadWithoutAssemble(const BinarySet& index_binary, auto index_data = index_binary.GetByName("index_data"); data_.resize(index_size); - idx_to_offsets_.resize(index_size); + auto index_num_rows = index_binary.GetByName("index_num_rows"); + memcpy(&total_num_rows_, + index_num_rows->data.get(), + (size_t)index_num_rows->size); + idx_to_offsets_.resize(total_num_rows_); + valid_bitset = TargetBitmap(total_num_rows_, false); memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size); for (size_t i = 0; i < data_.size(); ++i) { idx_to_offsets_[data_[i].idx_] = i; + valid_bitset.set(data_[i].idx_); } + is_built_ = true; } @@ -185,7 +204,7 @@ ScalarIndexSort::Load(milvus::tracer::TraceContext ctx, AssembleIndexDatas(index_datas); BinarySet binary_set; for (auto& [key, data] : index_datas) { - auto size = data->Size(); + auto size = data->DataSize(); auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction auto buf = std::shared_ptr( (uint8_t*)const_cast(data->Data()), deleter); @@ -199,7 +218,7 @@ template const TargetBitmap ScalarIndexSort::In(const size_t n, const T* values) { AssertInfo(is_built_, "index has not been built"); - TargetBitmap bitset(data_.size()); + TargetBitmap bitset(Count()); for (size_t i = 0; i < n; ++i) { auto lb = std::lower_bound( data_.begin(), data_.end(), IndexStructure(*(values + i))); @@ -221,7 +240,7 @@ template const TargetBitmap ScalarIndexSort::NotIn(const size_t n, const T* values) { AssertInfo(is_built_, "index has not been built"); - TargetBitmap bitset(data_.size(), true); + TargetBitmap bitset(Count(), true); for (size_t i = 0; i < n; ++i) { auto lb = std::lower_bound( data_.begin(), data_.end(), IndexStructure(*(values + i))); @@ -236,6 +255,27 @@ ScalarIndexSort::NotIn(const size_t n, const T* values) { bitset[lb->idx_] = false; } } + // NotIn(null) and In(null) is both false, need to mask with IsNotNull operate + bitset &= valid_bitset; + return bitset; +} + +template +const TargetBitmap +ScalarIndexSort::IsNull() { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap bitset(total_num_rows_, true); + bitset &= valid_bitset; + bitset.flip(); + return bitset; +} + +template +const TargetBitmap +ScalarIndexSort::IsNotNull() { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap bitset(total_num_rows_, true); + bitset &= valid_bitset; return bitset; } @@ -243,7 +283,7 @@ template const TargetBitmap ScalarIndexSort::Range(const T value, const OpType op) { AssertInfo(is_built_, "index has not been built"); - TargetBitmap bitset(data_.size()); + TargetBitmap bitset(Count()); auto lb = data_.begin(); auto ub = data_.end(); if (ShouldSkip(value, value, op)) { @@ -283,7 +323,7 @@ ScalarIndexSort::Range(T lower_bound_value, T upper_bound_value, bool ub_inclusive) { AssertInfo(is_built_, "index has not been built"); - TargetBitmap bitset(data_.size()); + TargetBitmap bitset(Count()); if (lower_bound_value > upper_bound_value || (lower_bound_value == upper_bound_value && !(lb_inclusive && ub_inclusive))) { diff --git a/internal/core/src/index/ScalarIndexSort.h b/internal/core/src/index/ScalarIndexSort.h index cee30ca62ef35..fb33f030c2a03 100644 --- a/internal/core/src/index/ScalarIndexSort.h +++ b/internal/core/src/index/ScalarIndexSort.h @@ -47,7 +47,7 @@ class ScalarIndexSort : public ScalarIndex { int64_t Count() override { - return data_.size(); + return total_num_rows_; } ScalarIndexType @@ -67,6 +67,12 @@ class ScalarIndexSort : public ScalarIndex { const TargetBitmap NotIn(size_t n, const T* values) override; + const TargetBitmap + IsNull() override; + + const TargetBitmap + IsNotNull() override; + const TargetBitmap Range(T value, OpType op) override; @@ -120,6 +126,9 @@ class ScalarIndexSort : public ScalarIndex { std::vector idx_to_offsets_; // used to retrieve. std::vector> data_; std::shared_ptr file_manager_; + size_t total_num_rows_{0}; + // generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate + TargetBitmap valid_bitset; }; template diff --git a/internal/core/src/index/SkipIndex.cpp b/internal/core/src/index/SkipIndex.cpp index dcf850bae27a1..11949db2bcf9c 100644 --- a/internal/core/src/index/SkipIndex.cpp +++ b/internal/core/src/index/SkipIndex.cpp @@ -33,67 +33,74 @@ SkipIndex::LoadPrimitive(milvus::FieldId field_id, int64_t chunk_id, milvus::DataType data_type, const void* chunk_data, + const bool* valid_data, int64_t count) { auto chunkMetrics = std::make_unique(); if (count > 0) { - chunkMetrics->hasValue_ = true; switch (data_type) { case DataType::INT8: { const int8_t* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } case DataType::INT16: { const int16_t* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } case DataType::INT32: { const int32_t* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } case DataType::INT64: { const int64_t* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } case DataType::FLOAT: { const float* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } case DataType::DOUBLE: { const double* typedData = static_cast(chunk_data); - std::pair minMax = - ProcessFieldMetrics(typedData, count); - chunkMetrics->min_ = Metrics(minMax.first); - chunkMetrics->max_ = Metrics(minMax.second); + auto info = + ProcessFieldMetrics(typedData, valid_data, count); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; break; } } } + chunkMetrics->hasValue_ = chunkMetrics->null_count_ == count ? false : true; std::unique_lock lck(mutex_); if (fieldChunkMetrics_.count(field_id) == 0) { fieldChunkMetrics_.insert(std::make_pair( @@ -111,21 +118,15 @@ SkipIndex::LoadString(milvus::FieldId field_id, int num_rows = var_column.NumRows(); auto chunkMetrics = std::make_unique(); if (num_rows > 0) { - chunkMetrics->hasValue_ = true; - std::string_view min_string = var_column.RawAt(0); - std::string_view max_string = var_column.RawAt(0); - for (size_t i = 1; i < num_rows; i++) { - const auto& val = var_column.RawAt(i); - if (val < min_string) { - min_string = val; - } - if (val > max_string) { - max_string = val; - } - } - chunkMetrics->min_ = Metrics(min_string); - chunkMetrics->max_ = Metrics(max_string); + auto info = ProcessStringFieldMetrics(var_column); + chunkMetrics->min_ = Metrics(info.min_); + chunkMetrics->max_ = Metrics(info.max_); + chunkMetrics->null_count_ = info.null_count_; } + + chunkMetrics->hasValue_ = + chunkMetrics->null_count_ == num_rows ? false : true; + std::unique_lock lck(mutex_); if (fieldChunkMetrics_.count(field_id) == 0) { fieldChunkMetrics_.insert(std::make_pair( diff --git a/internal/core/src/index/SkipIndex.h b/internal/core/src/index/SkipIndex.h index dba2cb1ebe89a..b6b2b33da6305 100644 --- a/internal/core/src/index/SkipIndex.h +++ b/internal/core/src/index/SkipIndex.h @@ -10,6 +10,7 @@ // or implied. See the License for the specific language governing permissions and limitations under the License #pragma once +#include #include #include "common/Types.h" @@ -29,6 +30,7 @@ struct FieldChunkMetrics { Metrics min_; Metrics max_; bool hasValue_; + int64_t null_count_; FieldChunkMetrics() : hasValue_(false){}; }; @@ -73,6 +75,7 @@ class SkipIndex { int64_t chunk_id, milvus::DataType data_type, const void* chunk_data, + const bool* valid_data, int64_t count); void @@ -217,17 +220,43 @@ class SkipIndex { return should_skip; } + // todo: support some null_count_ skip + template - std::pair - ProcessFieldMetrics(const T* data, int64_t count) { + struct metricInfo { + T min_; + T max_; + int64_t null_count_; + }; + + template + metricInfo + ProcessFieldMetrics(const T* data, const bool* valid_data, int64_t count) { //double check to avoid crush if (data == nullptr || count == 0) { return {T(), T()}; } - T minValue = data[0]; - T maxValue = data[0]; - for (size_t i = 0; i < count; i++) { + // find first not null value + int64_t start = 0; + for (int64_t i = start; i < count; i++) { + if (valid_data != nullptr && !valid_data[i]) { + start++; + continue; + } + break; + } + if (start > count - 1) { + return {T(), T(), count}; + } + T minValue = data[start]; + T maxValue = data[start]; + int64_t null_count = start; + for (int64_t i = start; i < count; i++) { T value = data[i]; + if (valid_data != nullptr && !valid_data[i]) { + null_count++; + continue; + } if (value < minValue) { minValue = value; } @@ -235,7 +264,42 @@ class SkipIndex { maxValue = value; } } - return {minValue, maxValue}; + return {minValue, maxValue, null_count}; + } + + metricInfo + ProcessStringFieldMetrics( + const milvus::VariableColumn& var_column) { + int num_rows = var_column.NumRows(); + // find first not null value + int64_t start = 0; + for (int64_t i = start; i < num_rows; i++) { + if (!var_column.IsValid(i)) { + start++; + continue; + } + break; + } + if (start > num_rows - 1) { + return {std::string_view(), std::string_view(), num_rows}; + } + std::string_view min_string = var_column.RawAt(start); + std::string_view max_string = var_column.RawAt(start); + int64_t null_count = start; + for (int64_t i = start; i < num_rows; i++) { + const auto& val = var_column.RawAt(i); + if (!var_column.IsValid(i)) { + null_count++; + continue; + } + if (val < min_string) { + min_string = val; + } + if (val > max_string) { + max_string = val; + } + } + return {min_string, max_string, null_count}; } private: diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index 9f3e455781edd..6052532fc0a87 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -83,23 +83,29 @@ StringIndexMarisa::BuildWithFieldData( for (const auto& data : field_datas) { auto slice_num = data->get_num_rows(); for (int64_t i = 0; i < slice_num; ++i) { - keyset.push_back( - (*static_cast(data->RawValue(i))).c_str()); + if (data->is_valid(i)) { + keyset.push_back( + (*static_cast(data->RawValue(i))) + .c_str()); + } } total_num_rows += slice_num; } trie_.build(keyset); // fill str_ids_ - str_ids_.resize(total_num_rows); + str_ids_.resize(total_num_rows, MARISA_NULL_KEY_ID); int64_t offset = 0; for (const auto& data : field_datas) { auto slice_num = data->get_num_rows(); for (int64_t i = 0; i < slice_num; ++i) { - auto str_id = - lookup(*static_cast(data->RawValue(i))); - AssertInfo(valid_str_id(str_id), "invalid marisa key"); - str_ids_[offset++] = str_id; + if (data->is_valid(offset)) { + auto str_id = + lookup(*static_cast(data->RawValue(i))); + AssertInfo(valid_str_id(str_id), "invalid marisa key"); + str_ids_[offset] = str_id; + } + offset++; } } @@ -228,7 +234,7 @@ StringIndexMarisa::Load(milvus::tracer::TraceContext ctx, AssembleIndexDatas(index_datas); BinarySet binary_set; for (auto& [key, data] : index_datas) { - auto size = data->Size(); + auto size = data->DataSize(); auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction auto buf = std::shared_ptr( (uint8_t*)const_cast(data->Data()), deleter); @@ -267,6 +273,32 @@ StringIndexMarisa::NotIn(size_t n, const std::string* values) { } } } + // NotIn(null) and In(null) is both false, need to mask with IsNotNull operate + auto offsets = str_ids_to_offsets_[MARISA_NULL_KEY_ID]; + for (size_t i = 0; i < offsets.size(); i++) { + bitset.reset(offsets[i]); + } + return bitset; +} + +const TargetBitmap +StringIndexMarisa::IsNull() { + TargetBitmap bitset(str_ids_.size()); + auto offsets = str_ids_to_offsets_[MARISA_NULL_KEY_ID]; + for (size_t i = 0; i < offsets.size(); i++) { + bitset.set(offsets[i]); + } + return bitset; +} + +const TargetBitmap +StringIndexMarisa::IsNotNull() { + TargetBitmap bitset(str_ids_.size()); + auto offsets = str_ids_to_offsets_[MARISA_NULL_KEY_ID]; + for (size_t i = 0; i < offsets.size(); i++) { + bitset.set(offsets[i]); + } + bitset.flip(); return bitset; } diff --git a/internal/core/src/index/StringIndexMarisa.h b/internal/core/src/index/StringIndexMarisa.h index c9e91e3217df7..a1227414a3845 100644 --- a/internal/core/src/index/StringIndexMarisa.h +++ b/internal/core/src/index/StringIndexMarisa.h @@ -69,6 +69,12 @@ class StringIndexMarisa : public StringIndex { const TargetBitmap NotIn(size_t n, const std::string* values) override; + const TargetBitmap + IsNull() override; + + const TargetBitmap + IsNotNull() override; + const TargetBitmap Range(std::string value, OpType op) override; diff --git a/internal/core/src/index/Utils.cpp b/internal/core/src/index/Utils.cpp index dfd41298b44a3..0b5702fa1dd82 100644 --- a/internal/core/src/index/Utils.cpp +++ b/internal/core/src/index/Utils.cpp @@ -242,14 +242,15 @@ void AssembleIndexDatas(std::map& index_datas) { if (index_datas.find(INDEX_FILE_SLICE_META) != index_datas.end()) { auto slice_meta = index_datas.at(INDEX_FILE_SLICE_META); - Config meta_data = Config::parse(std::string( - static_cast(slice_meta->Data()), slice_meta->Size())); + Config meta_data = Config::parse( + std::string(static_cast(slice_meta->Data()), + slice_meta->DataSize())); for (auto& item : meta_data[META]) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - // todo: support nullable index + // build index skip null value, so not need to set nullable == true auto new_field_data = storage::CreateFieldData(DataType::INT8, false, 1, total_len); @@ -258,7 +259,7 @@ AssembleIndexDatas(std::map& index_datas) { AssertInfo(index_datas.find(file_name) != index_datas.end(), "lost index slice data"); auto data = index_datas.at(file_name); - auto len = data->Size(); + auto len = data->DataSize(); new_field_data->FillFieldData(data->Data(), len); index_datas.erase(file_name); } @@ -282,13 +283,13 @@ AssembleIndexDatas(std::map& index_datas, index_datas.erase(INDEX_FILE_SLICE_META); Config metadata = Config::parse( std::string(static_cast(raw_metadata->Data()), - raw_metadata->Size())); + raw_metadata->DataSize())); for (auto& item : metadata[META]) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - // todo: support nullable index + // build index skip null value, so not need to set nullable == true auto new_field_data = storage::CreateFieldData(DataType::INT8, false, 1, total_len); @@ -299,7 +300,7 @@ AssembleIndexDatas(std::map& index_datas, auto& channel = it->second; auto data_array = storage::CollectFieldDataChannel(channel); auto data = storage::MergeFieldData(data_array); - auto len = data->Size(); + auto len = data->DataSize(); new_field_data->FillFieldData(data->Data(), len); index_datas.erase(file_name); } diff --git a/internal/core/src/index/VectorMemIndex.cpp b/internal/core/src/index/VectorMemIndex.cpp index 0f515a442d5e4..97e5752626daf 100644 --- a/internal/core/src/index/VectorMemIndex.cpp +++ b/internal/core/src/index/VectorMemIndex.cpp @@ -195,7 +195,6 @@ VectorMemIndex::Load(milvus::tracer::TraceContext ctx, std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - // todo: support nullable index auto new_field_data = milvus::storage::CreateFieldData( DataType::INT8, false, 1, total_len); diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index a6c309f472529..243fc1cd6da5d 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -245,7 +245,10 @@ class ColumnBase { bool IsValid(size_t offset) const { - return valid_data_[offset]; + if (nullable_) { + return valid_data_[offset]; + } + return true; } bool diff --git a/internal/core/src/segcore/SegmentInterface.cpp b/internal/core/src/segcore/SegmentInterface.cpp index 91ffe3e321c0d..f0bb039bef3f6 100644 --- a/internal/core/src/segcore/SegmentInterface.cpp +++ b/internal/core/src/segcore/SegmentInterface.cpp @@ -357,8 +357,10 @@ SegmentInternalInterface::LoadPrimitiveSkipIndex(milvus::FieldId field_id, int64_t chunk_id, milvus::DataType data_type, const void* chunk_data, + const bool* valid_data, int64_t count) { - skip_index_.LoadPrimitive(field_id, chunk_id, data_type, chunk_data, count); + skip_index_.LoadPrimitive( + field_id, chunk_id, data_type, chunk_data, valid_data, count); } void diff --git a/internal/core/src/segcore/SegmentInterface.h b/internal/core/src/segcore/SegmentInterface.h index 161e2e6386cee..181a301c9623e 100644 --- a/internal/core/src/segcore/SegmentInterface.h +++ b/internal/core/src/segcore/SegmentInterface.h @@ -248,6 +248,7 @@ class SegmentInternalInterface : public SegmentInterface { int64_t chunk_id, DataType data_type, const void* chunk_data, + const bool* valid_data, int64_t count); void diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index ead926812a946..be21c819f6003 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -423,8 +423,12 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { column->AppendBatch(field_data); stats_.mem_size += field_data->Size(); } - LoadPrimitiveSkipIndex( - field_id, 0, data_type, column->Span().data(), num_rows); + LoadPrimitiveSkipIndex(field_id, + 0, + data_type, + column->Span().data(), + column->Span().valid_data(), + num_rows); } AssertInfo(column->NumRows() == num_rows, diff --git a/internal/core/src/storage/DiskFileManagerImpl.cpp b/internal/core/src/storage/DiskFileManagerImpl.cpp index a84fe4fd5b357..16d01d3e86766 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.cpp +++ b/internal/core/src/storage/DiskFileManagerImpl.cpp @@ -213,7 +213,7 @@ DiskFileManagerImpl::CacheIndexToDisk( auto index_chunks = GetObjectData(rcm_.get(), batch_remote_files); for (auto& chunk : index_chunks) { auto index_data = chunk.get()->GetFieldData(); - auto index_size = index_data->Size(); + auto index_size = index_data->DataSize(); auto chunk_data = reinterpret_cast( const_cast(index_data->Data())); file.Write(chunk_data, index_size); diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index 825767047b08d..95dc7ba13193d 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -528,6 +528,7 @@ EncodeAndUploadIndexSlice(ChunkManager* chunk_manager, IndexMeta index_meta, FieldDataMeta field_meta, std::string object_key) { + // index not use valid_data, so no need to set nullable==true auto field_data = CreateFieldData(DataType::INT8, false); field_data->FillFieldData(buf, batch_size); auto indexData = std::make_shared(field_data); @@ -551,8 +552,8 @@ EncodeAndUploadFieldSlice(ChunkManager* chunk_manager, auto dim = IsSparseFloatVectorDataType(field_meta.get_data_type()) ? -1 : field_meta.get_dim(); - auto field_data = CreateFieldData( - field_meta.get_data_type(), field_meta.is_nullable(), dim, 0); + auto field_data = + CreateFieldData(field_meta.get_data_type(), false, dim, 0); field_data->FillFieldData(buf, element_count); auto insertData = std::make_shared(field_data); insertData->SetFieldDataMeta(field_data_meta); diff --git a/internal/core/unittest/test_array_bitmap_index.cpp b/internal/core/unittest/test_array_bitmap_index.cpp index d62b5cb75836c..d0a104c5bbef0 100644 --- a/internal/core/unittest/test_array_bitmap_index.cpp +++ b/internal/core/unittest/test_array_bitmap_index.cpp @@ -162,6 +162,7 @@ class ArrayBitmapIndexTest : public testing::Test { int64_t index_version) { proto::schema::FieldSchema field_schema; field_schema.set_data_type(proto::schema::DataType::Array); + field_schema.set_nullable(nullable_); proto::schema::DataType element_type; if constexpr (std::is_same_v) { element_type = proto::schema::DataType::Int8; @@ -185,9 +186,26 @@ class ArrayBitmapIndexTest : public testing::Test { segment_id, field_id, index_build_id, index_version}; data_ = GenerateArrayData(element_type, cardinality_, nb_, 10); - - auto field_data = storage::CreateFieldData(DataType::ARRAY); - field_data->FillFieldData(data_.data(), data_.size()); + auto field_data = storage::CreateFieldData(DataType::ARRAY, nullable_); + if (nullable_) { + valid_data_.reserve(nb_); + uint8_t* ptr = new uint8_t[(nb_ + 7) / 8]; + for (int i = 0; i < nb_; i++) { + int byteIndex = i / 8; + int bitIndex = i % 8; + if (i % 2 == 0) { + valid_data_.push_back(true); + ptr[byteIndex] |= (1 << bitIndex); + } else { + valid_data_.push_back(false); + ptr[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(data_.data(), ptr, data_.size()); + delete[] ptr; + } else { + field_data->FillFieldData(data_.data(), data_.size()); + } storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); insert_data.SetTimestamps(0, 100); @@ -237,6 +255,7 @@ class ArrayBitmapIndexTest : public testing::Test { SetParam() { nb_ = 10000; cardinality_ = 30; + nullable_ = false; } void @@ -293,6 +312,9 @@ class ArrayBitmapIndexTest : public testing::Test { for (size_t i = 0; i < bitset.size(); i++) { auto ref = [&]() -> bool { milvus::Array array = data_[i]; + if (nullable_ && !valid_data_[i]) { + return false; + } for (size_t j = 0; j < array.length(); ++j) { auto val = array.template get_data(j); if (s.find(val) != s.end()) { @@ -313,7 +335,9 @@ class ArrayBitmapIndexTest : public testing::Test { IndexBasePtr index_; size_t nb_; size_t cardinality_; + bool nullable_; std::vector data_; + FixedVector valid_data_; }; TYPED_TEST_SUITE_P(ArrayBitmapIndexTest); @@ -350,6 +374,7 @@ class ArrayBitmapIndexTestV1 : public ArrayBitmapIndexTest { SetParam() override { this->nb_ = 10000; this->cardinality_ = 200; + this->nullable_ = false; } virtual ~ArrayBitmapIndexTestV1() { @@ -363,10 +388,36 @@ TYPED_TEST_P(ArrayBitmapIndexTestV1, CountFuncTest) { EXPECT_EQ(count, this->nb_); } +template +class ArrayBitmapIndexTestNullable : public ArrayBitmapIndexTest { + public: + virtual void + SetParam() override { + this->nb_ = 10000; + this->cardinality_ = 30; + this->nullable_ = true; + } + + virtual ~ArrayBitmapIndexTestNullable() { + } +}; + +TYPED_TEST_SUITE_P(ArrayBitmapIndexTestNullable); + +TYPED_TEST_P(ArrayBitmapIndexTestNullable, CountFuncTest) { + auto count = this->index_->Count(); + EXPECT_EQ(count, this->nb_); +} + using BitmapTypeV1 = testing::Types; REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTestV1, CountFuncTest); +REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTestNullable, CountFuncTest); INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheckV1, ArrayBitmapIndexTestV1, + BitmapTypeV1); + +INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheckV1, + ArrayBitmapIndexTestNullable, BitmapTypeV1); \ No newline at end of file diff --git a/internal/core/unittest/test_hybrid_index.cpp b/internal/core/unittest/test_hybrid_index.cpp index b4a8c6811d33d..b4e3455bdf4c0 100644 --- a/internal/core/unittest/test_hybrid_index.cpp +++ b/internal/core/unittest/test_hybrid_index.cpp @@ -72,6 +72,7 @@ class HybridIndexTestV1 : public testing::Test { int64_t index_build_id, int64_t index_version) { proto::schema::FieldSchema field_schema; + field_schema.set_nullable(nullable_); if constexpr (std::is_same_v) { field_schema.set_data_type(proto::schema::DataType::Int8); } else if constexpr (std::is_same_v) { @@ -98,8 +99,26 @@ class HybridIndexTestV1 : public testing::Test { data_.push_back(x); } - auto field_data = storage::CreateFieldData(type_); - field_data->FillFieldData(data_.data(), data_.size()); + auto field_data = storage::CreateFieldData(type_, nullable_); + if (nullable_) { + valid_data_.reserve(nb_); + uint8_t* ptr = new uint8_t[(nb_ + 7) / 8]; + for (int i = 0; i < nb_; i++) { + int byteIndex = i / 8; + int bitIndex = i % 8; + if (i % 2 == 0) { + valid_data_.push_back(true); + ptr[byteIndex] |= (1 << bitIndex); + } else { + valid_data_.push_back(false); + ptr[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(data_.data(), ptr, data_.size()); + delete[] ptr; + } else { + field_data->FillFieldData(data_.data(), data_.size()); + } storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); insert_data.SetTimestamps(0, 100); @@ -149,6 +168,7 @@ class HybridIndexTestV1 : public testing::Test { SetParam() { nb_ = 10000; cardinality_ = 30; + nullable_ = false; } void SetUp() override { @@ -171,7 +191,7 @@ class HybridIndexTestV1 : public testing::Test { int64_t field_id = 101; int64_t index_build_id = 1000; int64_t index_version = 10000; - std::string root_path = "/tmp/test-bitmap-index/"; + std::string root_path = "/tmp/test-bitmap-index"; storage::StorageConfig storage_config; storage_config.storage_type = "local"; @@ -204,7 +224,11 @@ class HybridIndexTestV1 : public testing::Test { dynamic_cast*>(index_.get()); auto bitset = index_ptr->In(test_data.size(), test_data.data()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end()); + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end()); + } } } @@ -221,7 +245,39 @@ class HybridIndexTestV1 : public testing::Test { dynamic_cast*>(index_.get()); auto bitset = index_ptr->NotIn(test_data.size(), test_data.data()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end()); + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_NE(bitset[i], s.find(data_[i]) != s.end()); + } + } + } + + void + TestIsNullFunc() { + auto index_ptr = + dynamic_cast*>(index_.get()); + auto bitset = index_ptr->IsNull(); + for (size_t i = 0; i < bitset.size(); i++) { + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(bitset[i], true); + } else { + ASSERT_EQ(bitset[i], false); + } + } + } + + void + TestIsNotNullFunc() { + auto index_ptr = + dynamic_cast*>(index_.get()); + auto bitset = index_ptr->IsNotNull(); + for (size_t i = 0; i < bitset.size(); i++) { + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_EQ(bitset[i], true); + } } } @@ -250,9 +306,15 @@ class HybridIndexTestV1 : public testing::Test { for (size_t i = 0; i < bitset.size(); i++) { auto ans = bitset[i]; auto should = ref(i); - ASSERT_EQ(ans, should) - << "op: " << op << ", @" << i << ", ans: " << ans - << ", ref: " << should; + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(ans, false) + << "op: " << op << ", @" << i << ", ans: " << ans + << ", ref: " << should; + } else { + ASSERT_EQ(ans, should) + << "op: " << op << ", @" << i << ", ans: " << ans + << ", ref: " << should; + } } } } @@ -309,10 +371,17 @@ class HybridIndexTestV1 : public testing::Test { for (size_t i = 0; i < bitset.size(); i++) { auto ans = bitset[i]; auto should = test_case.ref(i); - ASSERT_EQ(ans, should) - << "lower:" << test_case.lower_val - << "upper:" << test_case.upper_val << ", @" << i - << ", ans: " << ans << ", ref: " << should; + if (nullable_ && !valid_data_[i]) { + ASSERT_EQ(ans, false) + << "lower:" << test_case.lower_val + << "upper:" << test_case.upper_val << ", @" << i + << ", ans: " << ans << ", ref: " << false; + } else { + ASSERT_EQ(ans, should) + << "lower:" << test_case.lower_val + << "upper:" << test_case.upper_val << ", @" << i + << ", ans: " << ans << ", ref: " << should; + } } } } @@ -325,6 +394,8 @@ class HybridIndexTestV1 : public testing::Test { size_t cardinality_; boost::container::vector data_; std::shared_ptr chunk_manager_; + bool nullable_; + FixedVector valid_data_; }; TYPED_TEST_SUITE_P(HybridIndexTestV1); @@ -342,6 +413,14 @@ TYPED_TEST_P(HybridIndexTestV1, NotINFuncTest) { this->TestNotInFunc(); } +TYPED_TEST_P(HybridIndexTestV1, IsNullFuncTest) { + this->TestIsNullFunc(); +} + +TYPED_TEST_P(HybridIndexTestV1, IsNotNullFuncTest) { + this->TestIsNotNullFunc(); +} + TYPED_TEST_P(HybridIndexTestV1, CompareValFuncTest) { this->TestCompareValueFunc(); } @@ -356,6 +435,8 @@ using BitmapType = REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV1, CountFuncTest, INFuncTest, + IsNullFuncTest, + IsNotNullFuncTest, NotINFuncTest, CompareValFuncTest, TestRangeCompareFuncTest); @@ -371,6 +452,7 @@ class HybridIndexTestV2 : public HybridIndexTestV1 { SetParam() override { this->nb_ = 10000; this->cardinality_ = 2000; + this->nullable_ = false; } virtual ~HybridIndexTestV2() { @@ -392,6 +474,14 @@ TYPED_TEST_P(HybridIndexTestV2, NotINFuncTest) { this->TestNotInFunc(); } +TYPED_TEST_P(HybridIndexTestV2, IsNullFuncTest) { + this->TestIsNullFunc(); +} + +TYPED_TEST_P(HybridIndexTestV2, IsNotNullFuncTest) { + this->TestIsNotNullFunc(); +} + TYPED_TEST_P(HybridIndexTestV2, CompareValFuncTest) { this->TestCompareValueFunc(); } @@ -400,12 +490,68 @@ TYPED_TEST_P(HybridIndexTestV2, TestRangeCompareFuncTest) { this->TestRangeCompareFunc(); } +template +class HybridIndexTestNullable : public HybridIndexTestV1 { + public: + virtual void + SetParam() override { + this->nb_ = 10000; + this->cardinality_ = 2000; + this->nullable_ = true; + } + + virtual ~HybridIndexTestNullable() { + } +}; + +TYPED_TEST_SUITE_P(HybridIndexTestNullable); + +TYPED_TEST_P(HybridIndexTestNullable, CountFuncTest) { + auto count = this->index_->Count(); + EXPECT_EQ(count, this->nb_); +} + +TYPED_TEST_P(HybridIndexTestNullable, INFuncTest) { + this->TestInFunc(); +} + +TYPED_TEST_P(HybridIndexTestNullable, NotINFuncTest) { + this->TestNotInFunc(); +} + +TYPED_TEST_P(HybridIndexTestNullable, IsNullFuncTest) { + this->TestIsNullFunc(); +} + +TYPED_TEST_P(HybridIndexTestNullable, IsNotNullFuncTest) { + this->TestIsNotNullFunc(); +} + +TYPED_TEST_P(HybridIndexTestNullable, CompareValFuncTest) { + this->TestCompareValueFunc(); +} + +TYPED_TEST_P(HybridIndexTestNullable, TestRangeCompareFuncTest) { + this->TestRangeCompareFunc(); +} + using BitmapType = testing::Types; REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2, CountFuncTest, INFuncTest, + IsNullFuncTest, + IsNotNullFuncTest, + NotINFuncTest, + CompareValFuncTest, + TestRangeCompareFuncTest); + +REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestNullable, + CountFuncTest, + INFuncTest, + IsNullFuncTest, + IsNotNullFuncTest, NotINFuncTest, CompareValFuncTest, TestRangeCompareFuncTest); @@ -413,3 +559,7 @@ REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2, INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_HighCardinality, HybridIndexTestV2, BitmapType); + +INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_Nullable, + HybridIndexTestNullable, + BitmapType); diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp index d0cbd0e80c7de..3ffc9128f0f37 100644 --- a/internal/core/unittest/test_inverted_index.cpp +++ b/internal/core/unittest/test_inverted_index.cpp @@ -32,8 +32,8 @@ gen_field_meta(int64_t collection_id = 1, int64_t segment_id = 3, int64_t field_id = 101, DataType data_type = DataType::NONE, - DataType element_type = DataType::NONE) - -> storage::FieldDataMeta { + DataType element_type = DataType::NONE, + bool nullable = false) -> storage::FieldDataMeta { auto meta = storage::FieldDataMeta{ .collection_id = collection_id, .partition_id = partition_id, @@ -44,6 +44,7 @@ gen_field_meta(int64_t collection_id = 1, static_cast(data_type)); meta.field_schema.set_element_type( static_cast(element_type)); + meta.field_schema.set_nullable(nullable); return meta; } @@ -92,7 +93,10 @@ struct ChunkManagerWrapper { }; } // namespace milvus::test -template +template void test_run() { int64_t collection_id = 1; @@ -102,8 +106,13 @@ test_run() { int64_t index_build_id = 1000; int64_t index_version = 10000; - auto field_meta = test::gen_field_meta( - collection_id, partition_id, segment_id, field_id, dtype, element_type); + auto field_meta = test::gen_field_meta(collection_id, + partition_id, + segment_id, + field_id, + dtype, + element_type, + nullable); auto index_meta = test::gen_index_meta( segment_id, field_id, index_build_id, index_version); @@ -114,6 +123,7 @@ test_run() { size_t nb = 10000; std::vector data_gen; boost::container::vector data; + FixedVector valid_data; if constexpr (!std::is_same_v) { data_gen = GenSortedArr(nb); } else { @@ -121,12 +131,36 @@ test_run() { data_gen.push_back(rand() % 2 == 0); } } + if (nullable) { + valid_data.reserve(nb); + for (size_t i = 0; i < nb; i++) { + valid_data.push_back(rand() % 2 == 0); + } + } for (auto x : data_gen) { data.push_back(x); } - auto field_data = storage::CreateFieldData(dtype); - field_data->FillFieldData(data.data(), data.size()); + auto field_data = storage::CreateFieldData(dtype, nullable); + if (nullable) { + int byteSize = (nb + 7) / 8; + uint8_t* valid_data_ = new uint8_t[byteSize]; + for (int i = 0; i < nb; i++) { + bool value = valid_data[i]; + int byteIndex = i / 8; + int bitIndex = i % 8; + if (value) { + valid_data_[byteIndex] |= (1 << bitIndex); + } else { + valid_data_[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(data.data(), valid_data_, data.size()); + delete[] valid_data_; + } else { + field_data->FillFieldData(data.data(), data.size()); + } + // std::cout << "length:" << field_data->get_num_rows() << std::endl; storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); insert_data.SetTimestamps(0, 100); @@ -197,7 +231,11 @@ test_run() { real_index->In(test_data.size(), test_data.data()); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], s.find(data[i]) != s.end()); + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_EQ(bitset[i], s.find(data[i]) != s.end()); + } } } @@ -213,7 +251,35 @@ test_run() { real_index->NotIn(test_data.size(), test_data.data()); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_NE(bitset[i], s.find(data[i]) != s.end()); + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_NE(bitset[i], s.find(data[i]) != s.end()); + } + } + } + + { + auto bitset = real_index->IsNull(); + ASSERT_EQ(cnt, bitset.size()); + for (size_t i = 0; i < bitset.size(); i++) { + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], true); + } else { + ASSERT_EQ(bitset[i], false); + } + } + } + + { + auto bitset = real_index->IsNotNull(); + ASSERT_EQ(cnt, bitset.size()); + for (size_t i = 0; i < bitset.size(); i++) { + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_EQ(bitset[i], true); + } } } } @@ -241,12 +307,16 @@ test_run() { for (const auto& [test_value, op, ref] : test_cases) { auto bitset = real_index->Range(test_value, op); ASSERT_EQ(cnt, bitset.size()); - for (size_t i = 0; i < bitset.size(); i++) { + for (size_t i = 0; i < nb; i++) { auto ans = bitset[i]; auto should = ref(i); - ASSERT_EQ(ans, should) - << "op: " << op << ", @" << i << ", ans: " << ans - << ", ref: " << should; + if (nullable && !valid_data[i]) { + ASSERT_EQ(ans, false); + } else { + ASSERT_EQ(ans, should) + << "op: " << op << ", @" << i + << ", ans: " << ans << ", ref: " << should; + } } } } @@ -287,11 +357,16 @@ test_run() { auto bitset = real_index->Range(lb, lb_inclusive, ub, ub_inclusive); ASSERT_EQ(cnt, bitset.size()); - for (size_t i = 0; i < bitset.size(); i++) { + for (size_t i = 0; i < nb; i++) { auto ans = bitset[i]; auto should = ref(i); - ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans - << ", ref: " << should; + if (nullable && !valid_data[i]) { + ASSERT_EQ(ans, false); + } else { + ASSERT_EQ(ans, should) + << "@" << i << ", ans: " << ans + << ", ref: " << should; + } } } } @@ -299,6 +374,7 @@ test_run() { } } +template void test_string() { using T = std::string; @@ -316,7 +392,8 @@ test_string() { segment_id, field_id, dtype, - DataType::NONE); + DataType::NONE, + nullable); auto index_meta = test::gen_index_meta( segment_id, field_id, index_build_id, index_version); @@ -326,12 +403,36 @@ test_string() { size_t nb = 10000; boost::container::vector data; + FixedVector valid_data; for (size_t i = 0; i < nb; i++) { data.push_back(std::to_string(rand())); } + if (nullable) { + valid_data.reserve(nb); + for (size_t i = 0; i < nb; i++) { + valid_data.push_back(rand() % 2 == 0); + } + } - auto field_data = storage::CreateFieldData(dtype, false); - field_data->FillFieldData(data.data(), data.size()); + auto field_data = storage::CreateFieldData(dtype, nullable); + if (nullable) { + int byteSize = (nb + 7) / 8; + uint8_t* valid_data_ = new uint8_t[byteSize]; + for (int i = 0; i < nb; i++) { + bool value = valid_data[i]; + int byteIndex = i / 8; + int bitIndex = i % 8; + if (value) { + valid_data_[byteIndex] |= (1 << bitIndex); + } else { + valid_data_[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(data.data(), valid_data_, data.size()); + delete[] valid_data_; + } else { + field_data->FillFieldData(data.data(), data.size()); + } storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); insert_data.SetTimestamps(0, 100); @@ -399,7 +500,11 @@ test_string() { auto bitset = real_index->In(test_data.size(), test_data.data()); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], s.find(data[i]) != s.end()); + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_EQ(bitset[i], s.find(data[i]) != s.end()); + } } } @@ -414,7 +519,11 @@ test_string() { auto bitset = real_index->NotIn(test_data.size(), test_data.data()); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_NE(bitset[i], s.find(data[i]) != s.end()); + if (nullable && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + ASSERT_NE(bitset[i], s.find(data[i]) != s.end()); + } } } @@ -441,9 +550,13 @@ test_string() { for (size_t i = 0; i < bitset.size(); i++) { auto ans = bitset[i]; auto should = ref(i); - ASSERT_EQ(ans, should) - << "op: " << op << ", @" << i << ", ans: " << ans - << ", ref: " << should; + if (nullable && !valid_data[i]) { + ASSERT_EQ(ans, false); + } else { + ASSERT_EQ(ans, should) + << "op: " << op << ", @" << i << ", ans: " << ans + << ", ref: " << should; + } } } } @@ -484,11 +597,15 @@ test_string() { auto bitset = real_index->Range(lb, lb_inclusive, ub, ub_inclusive); ASSERT_EQ(cnt, bitset.size()); - for (size_t i = 0; i < bitset.size(); i++) { + for (size_t i = 0; i < nb; i++) { auto ans = bitset[i]; auto should = ref(i); - ASSERT_EQ(ans, should) - << "@" << i << ", ans: " << ans << ", ref: " << should; + if (nullable && !valid_data[i]) { + ASSERT_EQ(ans, false); + } else { + ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans + << ", ref: " << should; + } } } } @@ -501,7 +618,11 @@ test_string() { auto bitset = real_index->Query(dataset); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix)); + auto should = boost::starts_with(data[i], prefix); + if (nullable && !valid_data[i]) { + should = false; + } + ASSERT_EQ(bitset[i], should); } } @@ -511,7 +632,11 @@ test_string() { auto bitset = real_index->RegexQuery(prefix + "(.|\n)*"); ASSERT_EQ(cnt, bitset.size()); for (size_t i = 0; i < bitset.size(); i++) { - ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix)); + auto should = boost::starts_with(data[i], prefix); + if (nullable && !valid_data[i]) { + should = false; + } + ASSERT_EQ(bitset[i], should); } } } @@ -529,4 +654,15 @@ TEST(InvertedIndex, Naive) { test_run(); test_string(); + test_run(); + test_run(); + test_run(); + test_run(); + + test_run(); + + test_run(); + test_run(); + + test_string(); } diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index 99d0071a3ffab..79772c5b3ecb9 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -1872,7 +1872,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::INT64, false, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( - pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); + pk_fid, 0, DataType::INT64, pk_field_data->Data(), nullptr, N); auto& skip_index = segment->GetSkipIndex(); bool equal_5_skip = skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::Equal, 5); @@ -1914,7 +1914,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::INT32, false, 1, 10); int32_field_data->FillFieldData(int32s.data(), N); segment->LoadPrimitiveSkipIndex( - i32_fid, 0, DataType::INT32, int32_field_data->Data(), N); + i32_fid, 0, DataType::INT32, int32_field_data->Data(), nullptr, N); less_than_1_skip = skip_index.CanSkipUnaryRange(i32_fid, 0, OpType::LessThan, 1); ASSERT_TRUE(less_than_1_skip); @@ -1925,7 +1925,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::INT16, false, 1, 10); int16_field_data->FillFieldData(int16s.data(), N); segment->LoadPrimitiveSkipIndex( - i16_fid, 0, DataType::INT16, int16_field_data->Data(), N); + i16_fid, 0, DataType::INT16, int16_field_data->Data(), nullptr, N); bool less_than_12_skip = skip_index.CanSkipUnaryRange(i16_fid, 0, OpType::LessThan, 12); ASSERT_FALSE(less_than_12_skip); @@ -1936,7 +1936,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::INT8, false, 1, 10); int8_field_data->FillFieldData(int8s.data(), N); segment->LoadPrimitiveSkipIndex( - i8_fid, 0, DataType::INT8, int8_field_data->Data(), N); + i8_fid, 0, DataType::INT8, int8_field_data->Data(), nullptr, N); bool greater_than_12_skip = skip_index.CanSkipUnaryRange( i8_fid, 0, OpType::GreaterThan, 12); ASSERT_TRUE(greater_than_12_skip); @@ -1948,7 +1948,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::FLOAT, false, 1, 10); float_field_data->FillFieldData(floats.data(), N); segment->LoadPrimitiveSkipIndex( - float_fid, 0, DataType::FLOAT, float_field_data->Data(), N); + float_fid, 0, DataType::FLOAT, float_field_data->Data(), nullptr, N); greater_than_10_skip = skip_index.CanSkipUnaryRange( float_fid, 0, OpType::GreaterThan, 10.0); ASSERT_TRUE(greater_than_10_skip); @@ -1960,7 +1960,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { storage::CreateFieldData(DataType::DOUBLE, false, 1, 10); double_field_data->FillFieldData(doubles.data(), N); segment->LoadPrimitiveSkipIndex( - double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N); + double_fid, 0, DataType::DOUBLE, double_field_data->Data(), nullptr, N); greater_than_10_skip = skip_index.CanSkipUnaryRange( double_fid, 0, OpType::GreaterThan, 10.0); ASSERT_TRUE(greater_than_10_skip); @@ -1984,7 +1984,7 @@ TEST(Sealed, SkipIndexSkipBinaryRange) { storage::CreateFieldData(DataType::INT64, false, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( - pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); + pk_fid, 0, DataType::INT64, pk_field_data->Data(), nullptr, N); auto& skip_index = segment->GetSkipIndex(); ASSERT_FALSE( skip_index.CanSkipBinaryRange(pk_fid, 0, -3, 1, true, true)); @@ -2002,6 +2002,117 @@ TEST(Sealed, SkipIndexSkipBinaryRange) { skip_index.CanSkipBinaryRange(pk_fid, 0, 10, 12, true, true)); } +TEST(Sealed, SkipIndexSkipUnaryRangeNullable) { + auto schema = std::make_shared(); + auto dim = 128; + auto metrics_type = "L2"; + auto fake_vec_fid = schema->AddDebugField( + "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type); + auto i64_fid = schema->AddDebugField("int64_field", DataType::INT64, true); + + auto dataset = DataGen(schema, 5); + auto segment = CreateSealedSegment(schema); + + //test for int64 + std::vector int64s = {1, 2, 3, 4, 5}; + uint8_t* valid_data = new uint8_t[1]{0x03}; + FixedVector valid_data_ = {true, true, false, false, false}; + auto int64s_field_data = + storage::CreateFieldData(DataType::INT64, true, 1, 5); + + int64s_field_data->FillFieldData(int64s.data(), valid_data, 5); + segment->LoadPrimitiveSkipIndex(i64_fid, + 0, + DataType::INT64, + int64s_field_data->Data(), + valid_data_.data(), + 5); + auto& skip_index = segment->GetSkipIndex(); + bool equal_5_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::Equal, 5); + bool equal_4_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::Equal, 4); + bool equal_2_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::Equal, 2); + bool equal_1_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::Equal, 1); + ASSERT_TRUE(equal_5_skip); + ASSERT_TRUE(equal_4_skip); + ASSERT_FALSE(equal_2_skip); + ASSERT_FALSE(equal_1_skip); + bool less_than_1_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::LessThan, 1); + bool less_than_5_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::LessThan, 5); + ASSERT_TRUE(less_than_1_skip); + ASSERT_FALSE(less_than_5_skip); + bool less_equal_than_1_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::LessEqual, 1); + bool less_equal_than_15_skip = + skip_index.CanSkipUnaryRange(i64_fid, 0, OpType::LessThan, 15); + ASSERT_FALSE(less_equal_than_1_skip); + ASSERT_FALSE(less_equal_than_15_skip); + bool greater_than_10_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterThan, 10); + bool greater_than_5_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterThan, 5); + bool greater_than_2_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterThan, 2); + bool greater_than_1_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterThan, 1); + ASSERT_TRUE(greater_than_10_skip); + ASSERT_TRUE(greater_than_5_skip); + ASSERT_TRUE(greater_than_2_skip); + ASSERT_FALSE(greater_than_1_skip); + bool greater_equal_than_3_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterEqual, 3); + bool greater_equal_than_2_skip = skip_index.CanSkipUnaryRange( + i64_fid, 0, OpType::GreaterEqual, 2); + ASSERT_TRUE(greater_equal_than_3_skip); + ASSERT_FALSE(greater_equal_than_2_skip); +} + +TEST(Sealed, SkipIndexSkipBinaryRangeNullable) { + auto schema = std::make_shared(); + auto dim = 128; + auto metrics_type = "L2"; + auto fake_vec_fid = schema->AddDebugField( + "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type); + auto i64_fid = schema->AddDebugField("int64_field", DataType::INT64, true); + auto dataset = DataGen(schema, 5); + auto segment = CreateSealedSegment(schema); + + //test for int64 + std::vector int64s = {1, 2, 3, 4, 5}; + uint8_t* valid_data = new uint8_t[1]{0x03}; + FixedVector valid_data_ = {true, true, false, false, false}; + auto int64s_field_data = + storage::CreateFieldData(DataType::INT64, true, 1, 5); + + int64s_field_data->FillFieldData(int64s.data(), valid_data, 5); + segment->LoadPrimitiveSkipIndex(i64_fid, + 0, + DataType::INT64, + int64s_field_data->Data(), + valid_data_.data(), + 5); + auto& skip_index = segment->GetSkipIndex(); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(i64_fid, 0, -3, 1, true, true)); + ASSERT_TRUE( + skip_index.CanSkipBinaryRange(i64_fid, 0, -3, 1, true, false)); + + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(i64_fid, 0, 1, 3, true, true)); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(i64_fid, 0, 1, 2, true, false)); + + ASSERT_TRUE( + skip_index.CanSkipBinaryRange(i64_fid, 0, 2, 3, false, true)); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(i64_fid, 0, 2, 3, true, true)); +} + TEST(Sealed, SkipIndexSkipStringRange) { auto schema = std::make_shared(); auto dim = 128;