diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp
index b3d8e3beee729..662295ddb4cd6 100644
--- a/internal/core/src/index/TextMatchIndex.cpp
+++ b/internal/core/src/index/TextMatchIndex.cpp
@@ -198,6 +198,49 @@ TextMatchIndex::AddTexts(size_t n,
     }
 }
 
+// Builds the text match index from field data that has already been loaded.
+// Rows are numbered by one running offset that continues across every entry
+// of `field_datas`.
+// schema_ may not be initialized so we need this `nullable` parameter
+void
+TextMatchIndex::BuildIndexFromFieldData(
+    const std::vector<FieldDataPtr>& field_datas, bool nullable) {
+    int64_t offset = 0;
+    if (nullable) {
+        int64_t total = 0;
+        for (const auto& data : field_datas) {
+            total += data->get_null_count();
+        }
+        null_offset.reserve(total);
+        for (const auto& data : field_datas) {
+            auto n = data->get_num_rows();
+            for (int64_t i = 0; i < n; i++) {
+                const bool valid = data->is_valid(i);
+                if (!valid) {
+                    // Record the running offset rather than the chunk-local
+                    // index `i`, so that nulls in later chunks line up with
+                    // the offsets handed to add_data below.
+                    null_offset.push_back(offset);
+                }
+                // Rows are added one at a time; an invalid row is added with
+                // count 0 but still consumes an offset.
+                wrapper_->add_data(
+                    static_cast<const std::string*>(data->RawValue(i)),
+                    valid ? 1 : 0,
+                    offset++);
+            }
+        }
+    } else {
+        // Without nulls each chunk is added with a single call.
+        for (const auto& data : field_datas) {
+            auto n = data->get_num_rows();
+            wrapper_->add_data(
+                static_cast<const std::string*>(data->Data()), n, offset);
+            offset += n;
+        }
+    }
+}
+
 void
 TextMatchIndex::Finish() {
     finish();
diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h
index 5549cc1ee2767..17d14340fc70d 100644
--- a/internal/core/src/index/TextMatchIndex.h
+++ b/internal/core/src/index/TextMatchIndex.h
@@ -58,6 +58,10 @@ class TextMatchIndex : public InvertedIndexTantivy {
              const bool* valids,
              int64_t offset_begin);
 
+    void
+    BuildIndexFromFieldData(const std::vector<FieldDataPtr>& field_datas,
+                            bool nullable);
+
     void
     Finish();
 
diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp
index 0ab4825d0e10e..d5d721cb22126 100644
--- a/internal/core/src/segcore/SegmentGrowingImpl.cpp
+++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp
@@ -274,6 +274,16 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
                 storage::GetByteSizeOfFieldDatas(field_data));
         }
 
+        // build text match index
+        if (field_meta.enable_match()) {
+            auto index = GetTextIndex(field_id);
+            index->BuildIndexFromFieldData(field_data,
+                                           field_meta.is_nullable());
+            index->Commit();
+            // Reload reader so that the index can be read immediately
+            index->Reload();
+        }
+
         // update the mem size
         stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data);
 
diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp
index a7e9260c95001..f7b6366385e1c 100644
--- a/internal/core/unittest/test_text_match.cpp
+++ b/internal/core/unittest/test_text_match.cpp
@@ -20,6 +20,8 @@
 #include "query/PlanProto.h"
 #include "query/ExecPlanNodeVisitor.h"
 #include "expr/ITypeExpr.h"
+#include "segcore/segment_c.h"
+#include "test_utils/storage_test_utils.h"
 
 using namespace milvus;
 using namespace milvus::query;
@@ -751,3 +753,64 @@ TEST(TextMatch, SealedJieBaNullable) {
         ASSERT_FALSE(final[2]);
     }
 }
+
+// Test that growing segment loading flushed binlogs will build text match index.
+TEST(TextMatch, GrowingLoadData) {
+    int64_t N = 7;
+    auto schema = GenTestSchema({}, true);
+    schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false);
+    schema->AddField(
+        FieldName("Timestamp"), FieldId(1), DataType::INT64, false);
+    std::vector<std::string> raw_str = {"football, basketball, pingpang",
+                                        "swimming, football",
+                                        "golf",
+                                        "",
+                                        "baseball",
+                                        "kungfu, football",
+                                        ""};
+    auto raw_data = DataGen(schema, N);
+    auto str_col = raw_data.raw_->mutable_fields_data()
+                       ->at(1)
+                       .mutable_scalars()
+                       ->mutable_string_data()
+                       ->mutable_data();
+    for (int64_t i = 0; i < N; i++) {
+        str_col->at(i) = raw_str[i];
+    }
+    auto str_col_valid =
+        raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
+    for (int64_t i = 0; i < N; i++) {
+        str_col_valid->at(i) = true;
+    }
+    // so we cannot match the second row
+    str_col_valid->at(1) = false;
+
+    auto storage_config = get_default_local_storage_config();
+    auto cm = storage::CreateChunkManager(storage_config);
+    auto load_info = PrepareInsertBinlog(
+        1,
+        2,
+        3,
+        storage_config.root_path + "/" + "test_growing_segment_load_data",
+        raw_data,
+        cm);
+
+    auto segment = CreateGrowingSegment(schema, empty_index_meta);
+    auto status = LoadFieldData(segment.get(), &load_info);
+    ASSERT_EQ(status.error_code, Success);
+    ASSERT_EQ(segment->get_real_count(), N);
+    ASSERT_NE(segment->get_field_avg_size(FieldId(101)), 0);
+
+    // Check whether the text index has been built.
+    auto expr = GetTextMatchExpr(schema, "football");
+    BitsetType final;
+    final = ExecuteQueryExpr(expr, segment.get(), N, MAX_TIMESTAMP);
+    ASSERT_EQ(final.size(), N);
+    ASSERT_TRUE(final[0]);
+    ASSERT_FALSE(final[1]);
+    ASSERT_FALSE(final[2]);
+    ASSERT_FALSE(final[3]);
+    ASSERT_FALSE(final[4]);
+    ASSERT_TRUE(final[5]);
+    ASSERT_FALSE(final[6]);
+}