Skip to content

Commit

Permalink
fix: build text index when loading field data (#39070) (#39113)
Browse files Browse the repository at this point in the history
fix: #39053. This may also fix
#38644, which could be caused
by #39053.

---------

Signed-off-by: SpadeA-Tang <[email protected]>
  • Loading branch information
SpadeA-Tang authored Jan 9, 2025
1 parent f896b0e commit f70262c
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 0 deletions.
33 changes: 33 additions & 0 deletions internal/core/src/index/TextMatchIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,39 @@ TextMatchIndex::AddTexts(size_t n,
}
}

// Build the text match index from a list of loaded field data chunks.
//
// schema_ may not be initialized so we need this `nullable` parameter: the
// caller states whether the field is nullable instead of this function
// reading it from the schema.
//
// Rows are numbered with a single running offset that continues across all
// entries of `field_datas`, starting at 0.
void
TextMatchIndex::BuildIndexFromFieldData(
    const std::vector<FieldDataPtr>& field_datas, bool nullable) {
    int64_t offset = 0;
    if (nullable) {
        // Pre-size null_offset with the total null count over all chunks.
        int64_t total = 0;
        for (const auto& data : field_datas) {
            total += data->get_null_count();
        }
        null_offset.reserve(total);
        for (const auto& data : field_datas) {
            const auto n = data->get_num_rows();
            for (int64_t i = 0; i < n; i++) {
                const bool valid = data->is_valid(i);
                if (!valid) {
                    // Record the running row offset, not the chunk-local
                    // index `i`: with more than one chunk, `i` restarts at 0
                    // for each chunk and would point at the wrong row.
                    null_offset.push_back(offset);
                }
                // Rows are added one at a time; an invalid row is passed
                // with a length of 0 but still consumes one offset.
                wrapper_->add_data(
                    static_cast<const std::string*>(data->RawValue(i)),
                    valid ? 1 : 0,
                    offset++);
            }
        }
    } else {
        // Non-nullable: each chunk is handed over in one call.
        for (const auto& data : field_datas) {
            auto n = data->get_num_rows();
            wrapper_->add_data(
                static_cast<const std::string*>(data->Data()), n, offset);
            offset += n;
        }
    }
}

void
TextMatchIndex::Finish() {
finish();
Expand Down
4 changes: 4 additions & 0 deletions internal/core/src/index/TextMatchIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
const bool* valids,
int64_t offset_begin);

// Build the index from the given field data chunks. `nullable` is supplied
// by the caller to say whether the field is nullable, rather than being
// read from the index's own schema.
void
BuildIndexFromFieldData(const std::vector<FieldDataPtr>& field_datas,
bool nullable);

void
Finish();

Expand Down
10 changes: 10 additions & 0 deletions internal/core/src/segcore/SegmentGrowingImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,16 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
storage::GetByteSizeOfFieldDatas(field_data));
}

// build text match index
if (field_meta.enable_match()) {
auto index = GetTextIndex(field_id);
index->BuildIndexFromFieldData(field_data,
field_meta.is_nullable());
index->Commit();
// Reload reader so that the index can be read immediately
index->Reload();
}

// update the mem size
stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data);

Expand Down
63 changes: 63 additions & 0 deletions internal/core/unittest/test_text_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
#include "segcore/segment_c.h"
#include "test_utils/storage_test_utils.h"

using namespace milvus;
using namespace milvus::query;
Expand Down Expand Up @@ -751,3 +753,64 @@ TEST(TextMatch, SealedJieBaNullable) {
ASSERT_FALSE(final[2]);
}
}

// Loading flushed binlogs into a growing segment must also build the text
// match index, so a text match query works right after LoadFieldData.
TEST(TextMatch, GrowingLoadData) {
    const int64_t N = 7;
    auto schema = GenTestSchema({}, true);
    schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false);
    schema->AddField(
        FieldName("Timestamp"), FieldId(1), DataType::INT64, false);

    const std::vector<std::string> raw_str = {"football, basketball, pingpang",
                                              "swimming, football",
                                              "golf",
                                              "",
                                              "baseball",
                                              "kungfu, football",
                                              ""};

    // Overwrite the generated string column and its validity flags in place.
    auto raw_data = DataGen(schema, N);
    auto& text_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto* texts =
        text_field.mutable_scalars()->mutable_string_data()->mutable_data();
    auto* valids = text_field.mutable_valid_data();
    for (int64_t row = 0; row < N; ++row) {
        texts->at(row) = raw_str[row];
        // Every row is valid except the second one, so the second row
        // cannot be matched even though its text contains "football".
        valids->at(row) = (row != 1);
    }

    // Write the generated data out as binlogs for the segment to load.
    auto storage_config = get_default_local_storage_config();
    auto cm = storage::CreateChunkManager(storage_config);
    const auto binlog_root =
        storage_config.root_path + "/" + "test_growing_segment_load_data";
    auto load_info = PrepareInsertBinlog(1, 2, 3, binlog_root, raw_data, cm);

    auto segment = CreateGrowingSegment(schema, empty_index_meta);
    auto status = LoadFieldData(segment.get(), &load_info);
    ASSERT_EQ(status.error_code, Success);
    ASSERT_EQ(segment->get_real_count(), N);
    ASSERT_NE(segment->get_field_avg_size(FieldId(101)), 0);

    // Check whether the text index has been built.
    auto expr = GetTextMatchExpr(schema, "football");
    BitsetType hits = ExecuteQueryExpr(expr, segment.get(), N, MAX_TIMESTAMP);
    ASSERT_EQ(hits.size(), N);

    // Expected match result per row for the term "football".
    const bool expected_hits[] = {true, false, false, false, false, true, false};
    for (int64_t row = 0; row < N; ++row) {
        ASSERT_EQ(static_cast<bool>(hits[row]), expected_hits[row]);
    }
}

0 comments on commit f70262c

Please sign in to comment.