Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: build text index when loading field data #39070

Merged
merged 5 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions internal/core/src/index/TextMatchIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,39 @@
}
}

// Builds the text match index from a list of loaded field data chunks.
//
// The chunks in `field_datas` are consumed in order, and rows are assigned
// consecutive offsets across all chunks, starting at 0.
//
// `nullable` must be supplied by the caller because schema_ may not be
// initialized when this is called. When it is true, every row is added
// individually and the offsets of invalid (null) rows are recorded in
// `null_offset`; when it is false, each chunk is added in a single call.
void
TextMatchIndex::BuildIndexFromFieldData(
    const std::vector<FieldDataPtr>& field_datas, bool nullable) {
    int64_t offset = 0;
    if (nullable) {
        // Pre-size null_offset so that the per-row push_back does not
        // repeatedly reallocate.
        int64_t total = 0;
        for (const auto& data : field_datas) {
            total += data->get_null_count();
        }
        null_offset.reserve(null_offset.size() + total);
        for (const auto& data : field_datas) {
            const auto n = data->get_num_rows();
            for (int64_t i = 0; i < n; i++) {
                const bool valid = data->is_valid(i);
                if (!valid) {
                    // Record the running offset across all chunks, not the
                    // chunk-local index `i`: the index itself is populated
                    // with `offset`, so the two must agree once there is
                    // more than one chunk.
                    null_offset.push_back(offset);
                }
                // A null row is added with length 0 so that it still
                // advances the offset without contributing a document value.
                wrapper_->add_data(
                    static_cast<const std::string*>(data->RawValue(i)),
                    valid ? 1 : 0,
                    offset);
                ++offset;
            }
        }
    } else {
        // NOTE(review): the Codecov patch check on the originating pull
        // request reported lines 225-229 of this file (this non-nullable
        // path) as not covered by tests.
        for (const auto& data : field_datas) {
            const auto n = data->get_num_rows();
            wrapper_->add_data(
                static_cast<const std::string*>(data->Data()), n, offset);
            offset += n;
        }
    }
}

void
TextMatchIndex::Finish() {
finish();
Expand Down
4 changes: 4 additions & 0 deletions internal/core/src/index/TextMatchIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
const bool* valids,
int64_t offset_begin);

// Builds the text match index from the given field data chunks.
// `nullable` tells the index whether rows may be null; it is passed
// explicitly because the index's own schema may not be initialized yet.
void
BuildIndexFromFieldData(const std::vector<FieldDataPtr>& field_datas,
bool nullable);

// NOTE(review): the visible part of the definition in TextMatchIndex.cpp
// calls finish(); presumably this finalizes index building — confirm
// against the full definition.
void
Finish();

Expand Down
10 changes: 10 additions & 0 deletions internal/core/src/segcore/SegmentGrowingImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,16 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
storage::GetByteSizeOfFieldDatas(field_data));
}

// build text match index
if (field_meta.enable_match()) {
auto index = GetTextIndex(field_id);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the index be obtained through this interface here? Has the index already been created before this point?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The index is created in the constructor of SegmentGrowingImpl.

index->BuildIndexFromFieldData(field_data,
field_meta.is_nullable());
index->Commit();
// Reload reader so that the index can be read immediately
index->Reload();
}

// update the mem size
stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data);

Expand Down
63 changes: 63 additions & 0 deletions internal/core/unittest/test_text_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
#include "segcore/segment_c.h"
#include "test_utils/storage_test_utils.h"

using namespace milvus;
using namespace milvus::query;
Expand Down Expand Up @@ -751,3 +753,64 @@ TEST(TextMatch, SealedJieBaNullable) {
ASSERT_FALSE(final[2]);
}
}

// Verifies that a growing segment builds its text match index while loading
// field data from flushed binlogs, so a text match query works right after
// LoadFieldData returns.
TEST(TextMatch, GrowingLoadData) {
    int64_t num_rows = 7;

    auto schema = GenTestSchema({}, true);
    schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false);
    schema->AddField(
        FieldName("Timestamp"), FieldId(1), DataType::INT64, false);

    // Generate a dataset, then overwrite the string column and its validity
    // bitmap with known contents.
    auto raw_data = DataGen(schema, num_rows);
    std::vector<std::string> texts = {"football, basketball, pingpang",
                                      "swimming, football",
                                      "golf",
                                      "",
                                      "baseball",
                                      "kungfu, football",
                                      ""};
    auto& text_field = raw_data.raw_->mutable_fields_data()->at(1);
    auto text_values = text_field.mutable_scalars()
                           ->mutable_string_data()
                           ->mutable_data();
    auto text_validity = text_field.mutable_valid_data();
    for (int64_t row = 0; row < num_rows; ++row) {
        text_values->at(row) = texts[row];
        // Every row is valid except the second one, which is marked null so
        // that it cannot be matched even though its text contains "football".
        text_validity->at(row) = (row != 1);
    }

    // Write the generated data out as insert binlogs.
    auto storage_config = get_default_local_storage_config();
    auto cm = storage::CreateChunkManager(storage_config);
    auto load_info = PrepareInsertBinlog(
        1,
        2,
        3,
        storage_config.root_path + "/" + "test_growing_segment_load_data",
        raw_data,
        cm);

    // Load the binlogs into a fresh growing segment.
    auto segment = CreateGrowingSegment(schema, empty_index_meta);
    auto status = LoadFieldData(segment.get(), &load_info);
    ASSERT_EQ(status.error_code, Success);
    ASSERT_EQ(segment->get_real_count(), num_rows);
    ASSERT_NE(segment->get_field_avg_size(FieldId(101)), 0);

    // Check whether the text index has been built: only the valid rows that
    // contain "football" (rows 0 and 5) may match.
    auto expr = GetTextMatchExpr(schema, "football");
    BitsetType matched;
    matched = ExecuteQueryExpr(expr, segment.get(), num_rows, MAX_TIMESTAMP);
    ASSERT_EQ(matched.size(), num_rows);

    const bool expected[] = {true, false, false, false, false, true, false};
    for (int64_t row = 0; row < num_rows; ++row) {
        if (expected[row]) {
            ASSERT_TRUE(matched[row]);
        } else {
            ASSERT_FALSE(matched[row]);
        }
    }
}
Loading