From 74970fa9df2c71dde19e8d83a6db66322e40f9cd Mon Sep 17 00:00:00 2001 From: bjwswang Date: Tue, 12 Mar 2024 08:59:37 +0000 Subject: [PATCH] feat: configure chunksize and chunkoverlap in knowledgebase Signed-off-by: bjwswang --- .../v1alpha1/document_loader_types.go | 4 +- api/base/v1alpha1/knowledgebase.go | 13 ++ api/base/v1alpha1/knowledgebase_types.go | 14 +- api/base/v1alpha1/zz_generated.deepcopy.go | 16 ++ apiserver/graph/generated/generated.go | 175 +++++++++++++++++- apiserver/graph/generated/models_gen.go | 12 ++ .../graph/impl/knowledgebase.resolvers.go | 31 +--- apiserver/graph/schema/knowledgebase.gql | 8 + apiserver/graph/schema/knowledgebase.graphqls | 29 +++ apiserver/pkg/knowledgebase/knowledgebase.go | 67 ++++++- ...ia.kubeagi.k8s.com.cn_documentloaders.yaml | 4 +- ...dia.kubeagi.k8s.com.cn_knowledgebases.yaml | 8 + controllers/base/knowledgebase_controller.go | 6 +- ...ia.kubeagi.k8s.com.cn_documentloaders.yaml | 4 +- ...dia.kubeagi.k8s.com.cn_knowledgebases.yaml | 8 + 15 files changed, 353 insertions(+), 46 deletions(-) diff --git a/api/app-node/documentloader/v1alpha1/document_loader_types.go b/api/app-node/documentloader/v1alpha1/document_loader_types.go index 6ecf8e14a..c4fc2681f 100644 --- a/api/app-node/documentloader/v1alpha1/document_loader_types.go +++ b/api/app-node/documentloader/v1alpha1/document_loader_types.go @@ -28,10 +28,10 @@ type DocumentLoaderSpec struct { // CommonSpec v1alpha1.CommonSpec `json:",inline"` // ChunkSize for text splitter - // +kubebuilder:default=2048 + // +kubebuilder:default=512 ChunkSize int `json:"chunkSize,omitempty"` // ChunkOverlap for text splitter - // +kubebuilder:default=200 + // +kubebuilder:default=100 ChunkOverlap int `json:"chunkOverlap,omitempty"` // FileExtName the type of documents, can be .pdf, .txt, .mp3, etc ... 
FileExtName string `json:"fileExtName,omitempty"` diff --git a/api/base/v1alpha1/knowledgebase.go b/api/base/v1alpha1/knowledgebase.go index a79b51e71..7d047c758 100644 --- a/api/base/v1alpha1/knowledgebase.go +++ b/api/base/v1alpha1/knowledgebase.go @@ -10,6 +10,19 @@ const ( UpdateSourceFileAnnotationKey = Group + "/update-source-file-time" ) +// EmbeddingOptions returns the spec's embedding options, substituting the defaults +// (chunk size 1024, chunk overlap 100) for values that are unset or not positive. +func (kb *KnowledgeBase) EmbeddingOptions() EmbeddingOptions { + options := kb.Spec.EmbeddingOptions + if options.ChunkSize <= 0 { + options.ChunkSize = 1024 + } + if options.ChunkOverlap <= 0 { + options.ChunkOverlap = 100 + } + return options +} + func (kb *KnowledgeBase) VectorStoreCollectionName() string { return kb.Namespace + "_" + kb.Name } diff --git a/api/base/v1alpha1/knowledgebase_types.go b/api/base/v1alpha1/knowledgebase_types.go index c7484a78b..15b007e22 100644 --- a/api/base/v1alpha1/knowledgebase_types.go +++ b/api/base/v1alpha1/knowledgebase_types.go @@ -27,13 +27,23 @@ type KnowledgeBaseSpec struct { // Embedder defines the embedder to embedding files Embedder *TypedObjectReference `json:"embedder,omitempty"` - // TODO: add EmbedderOptions - // VectorStore defines the vectorstore to store results VectorStore *TypedObjectReference `json:"vectorStore,omitempty"` // FileGroups included files Grouped by VersionedDataset FileGroups []FileGroup `json:"fileGroups,omitempty"` + + // Embedding Options + EmbeddingOptions `json:",inline"` +} + +type EmbeddingOptions struct { + // ChunkSize for text splitter + // +kubebuilder:default=1024 + ChunkSize int `json:"chunkSize,omitempty"` + // ChunkOverlap for text splitter + // +kubebuilder:default=100 + ChunkOverlap int `json:"chunkOverlap,omitempty"` } type FileGroupDetail struct { diff --git a/api/base/v1alpha1/zz_generated.deepcopy.go b/api/base/v1alpha1/zz_generated.deepcopy.go index 4d238a13a..f1661e173 100644 --- a/api/base/v1alpha1/zz_generated.deepcopy.go +++ 
b/api/base/v1alpha1/zz_generated.deepcopy.go @@ -495,6 +495,21 @@ func (in *EmbedderStatus) DeepCopy() *EmbedderStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EmbeddingOptions) DeepCopyInto(out *EmbeddingOptions) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EmbeddingOptions. +func (in *EmbeddingOptions) DeepCopy() *EmbeddingOptions { + if in == nil { + return nil + } + out := new(EmbeddingOptions) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Endpoint) DeepCopyInto(out *Endpoint) { *out = *in @@ -701,6 +716,7 @@ func (in *KnowledgeBaseSpec) DeepCopyInto(out *KnowledgeBaseSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.EmbeddingOptions = in.EmbeddingOptions } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KnowledgeBaseSpec. 
diff --git a/apiserver/graph/generated/generated.go b/apiserver/graph/generated/generated.go index 638dd4187..606a5ed45 100644 --- a/apiserver/graph/generated/generated.go +++ b/apiserver/graph/generated/generated.go @@ -370,6 +370,8 @@ type ComplexityRoot struct { KnowledgeBase struct { Annotations func(childComplexity int) int + ChunkOverlap func(childComplexity int) int + ChunkSize func(childComplexity int) int CreationTimestamp func(childComplexity int) int Creator func(childComplexity int) int Description func(childComplexity int) int @@ -2480,6 +2482,20 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.KnowledgeBase.Annotations(childComplexity), true + case "KnowledgeBase.chunkOverlap": + if e.complexity.KnowledgeBase.ChunkOverlap == nil { + break + } + + return e.complexity.KnowledgeBase.ChunkOverlap(childComplexity), true + + case "KnowledgeBase.chunkSize": + if e.complexity.KnowledgeBase.ChunkSize == nil { + break + } + + return e.complexity.KnowledgeBase.ChunkSize(childComplexity), true + case "KnowledgeBase.creationTimestamp": if e.complexity.KnowledgeBase.CreationTimestamp == nil { break @@ -6203,6 +6219,16 @@ type KnowledgeBase { fileGroupDetails为知识库中所处理的文件组的详细内容和状态 """ fileGroupDetails: [filegroupdetail] + + + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int """ 知识库整体连接状态 @@ -6250,6 +6276,16 @@ input CreateKnowledgeBaseInput{ vectorStore: TypedObjectReferenceInput """知识库文件""" fileGroups: [filegroupinput!] + + + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int } """知识库更新的输入""" @@ -6271,6 +6307,15 @@ input UpdateKnowledgeBaseInput { """更新知识库文件""" fileGroups: [filegroupinput!] 
+ + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int } """知识库分页列表查询的输入""" @@ -18820,6 +18865,88 @@ func (ec *executionContext) fieldContext_KnowledgeBase_fileGroupDetails(ctx cont return fc, nil } +func (ec *executionContext) _KnowledgeBase_chunkSize(ctx context.Context, field graphql.CollectedField, obj *KnowledgeBase) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_KnowledgeBase_chunkSize(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.ChunkSize, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*int) + fc.Result = res + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_KnowledgeBase_chunkSize(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "KnowledgeBase", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _KnowledgeBase_chunkOverlap(ctx context.Context, field graphql.CollectedField, obj *KnowledgeBase) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_KnowledgeBase_chunkOverlap(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := 
ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.ChunkOverlap, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*int) + fc.Result = res + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_KnowledgeBase_chunkOverlap(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "KnowledgeBase", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _KnowledgeBase_status(ctx context.Context, field graphql.CollectedField, obj *KnowledgeBase) (ret graphql.Marshaler) { fc, err := ec.fieldContext_KnowledgeBase_status(ctx, field) if err != nil { @@ -19010,6 +19137,10 @@ func (ec *executionContext) fieldContext_KnowledgeBaseMutation_createKnowledgeBa return ec.fieldContext_KnowledgeBase_vectorStore(ctx, field) case "fileGroupDetails": return ec.fieldContext_KnowledgeBase_fileGroupDetails(ctx, field) + case "chunkSize": + return ec.fieldContext_KnowledgeBase_chunkSize(ctx, field) + case "chunkOverlap": + return ec.fieldContext_KnowledgeBase_chunkOverlap(ctx, field) case "status": return ec.fieldContext_KnowledgeBase_status(ctx, field) case "reason": @@ -19101,6 +19232,10 @@ func (ec *executionContext) fieldContext_KnowledgeBaseMutation_updateKnowledgeBa return ec.fieldContext_KnowledgeBase_vectorStore(ctx, field) case "fileGroupDetails": return ec.fieldContext_KnowledgeBase_fileGroupDetails(ctx, field) + case "chunkSize": + return ec.fieldContext_KnowledgeBase_chunkSize(ctx, field) + case "chunkOverlap": + return 
ec.fieldContext_KnowledgeBase_chunkOverlap(ctx, field) case "status": return ec.fieldContext_KnowledgeBase_status(ctx, field) case "reason": @@ -19244,6 +19379,10 @@ func (ec *executionContext) fieldContext_KnowledgeBaseQuery_getKnowledgeBase(ctx return ec.fieldContext_KnowledgeBase_vectorStore(ctx, field) case "fileGroupDetails": return ec.fieldContext_KnowledgeBase_fileGroupDetails(ctx, field) + case "chunkSize": + return ec.fieldContext_KnowledgeBase_chunkSize(ctx, field) + case "chunkOverlap": + return ec.fieldContext_KnowledgeBase_chunkOverlap(ctx, field) case "status": return ec.fieldContext_KnowledgeBase_status(ctx, field) case "reason": @@ -33370,7 +33509,7 @@ func (ec *executionContext) unmarshalInputCreateKnowledgeBaseInput(ctx context.C asMap[k] = v } - fieldsInOrder := [...]string{"name", "namespace", "labels", "annotations", "displayName", "description", "embedder", "vectorStore", "fileGroups"} + fieldsInOrder := [...]string{"name", "namespace", "labels", "annotations", "displayName", "description", "embedder", "vectorStore", "fileGroups", "chunkSize", "chunkOverlap"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -33440,6 +33579,20 @@ func (ec *executionContext) unmarshalInputCreateKnowledgeBaseInput(ctx context.C return it, err } it.FileGroups = data + case "chunkSize": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("chunkSize")) + data, err := ec.unmarshalOInt2ᚖint(ctx, v) + if err != nil { + return it, err + } + it.ChunkSize = data + case "chunkOverlap": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("chunkOverlap")) + data, err := ec.unmarshalOInt2ᚖint(ctx, v) + if err != nil { + return it, err + } + it.ChunkOverlap = data } } @@ -36171,7 +36324,7 @@ func (ec *executionContext) unmarshalInputUpdateKnowledgeBaseInput(ctx context.C asMap[k] = v } - fieldsInOrder := [...]string{"name", "namespace", "labels", "annotations", "displayName", "description", "fileGroups"} + fieldsInOrder := 
[...]string{"name", "namespace", "labels", "annotations", "displayName", "description", "fileGroups", "chunkSize", "chunkOverlap"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -36227,6 +36380,20 @@ func (ec *executionContext) unmarshalInputUpdateKnowledgeBaseInput(ctx context.C return it, err } it.FileGroups = data + case "chunkSize": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("chunkSize")) + data, err := ec.unmarshalOInt2ᚖint(ctx, v) + if err != nil { + return it, err + } + it.ChunkSize = data + case "chunkOverlap": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("chunkOverlap")) + data, err := ec.unmarshalOInt2ᚖint(ctx, v) + if err != nil { + return it, err + } + it.ChunkOverlap = data } } @@ -39810,6 +39977,10 @@ func (ec *executionContext) _KnowledgeBase(ctx context.Context, sel ast.Selectio out.Values[i] = ec._KnowledgeBase_vectorStore(ctx, field, obj) case "fileGroupDetails": out.Values[i] = ec._KnowledgeBase_fileGroupDetails(ctx, field, obj) + case "chunkSize": + out.Values[i] = ec._KnowledgeBase_chunkSize(ctx, field, obj) + case "chunkOverlap": + out.Values[i] = ec._KnowledgeBase_chunkOverlap(ctx, field, obj) case "status": out.Values[i] = ec._KnowledgeBase_status(ctx, field, obj) case "reason": diff --git a/apiserver/graph/generated/models_gen.go b/apiserver/graph/generated/models_gen.go index e7f9f60fb..ea6852b51 100644 --- a/apiserver/graph/generated/models_gen.go +++ b/apiserver/graph/generated/models_gen.go @@ -249,6 +249,10 @@ type CreateKnowledgeBaseInput struct { VectorStore *TypedObjectReferenceInput `json:"vectorStore,omitempty"` // 知识库文件 FileGroups []*Filegroupinput `json:"fileGroups,omitempty"` + // chunkSize为知识库做文档拆分时的块大小 + ChunkSize *int `json:"chunkSize,omitempty"` + // chunkOverlap为知识库作文档拆分时相邻块的交集 + ChunkOverlap *int `json:"chunkOverlap,omitempty"` } type CreateLLMInput struct { @@ -849,6 +853,10 @@ type KnowledgeBase struct { VectorStore *TypedObjectReference 
`json:"vectorStore,omitempty"` // fileGroupDetails为知识库中所处理的文件组的详细内容和状态 FileGroupDetails []*Filegroupdetail `json:"fileGroupDetails,omitempty"` + // chunkSize为知识库做文档拆分时的块大小 + ChunkSize *int `json:"chunkSize,omitempty"` + // chunkOverlap为知识库作文档拆分时相邻块的交集 + ChunkOverlap *int `json:"chunkOverlap,omitempty"` // 知识库整体连接状态 // 规则: True 代表正常 False代表异常 // 规则: Deleting 代表删除中 @@ -1668,6 +1676,10 @@ type UpdateKnowledgeBaseInput struct { Description *string `json:"description,omitempty"` // 更新知识库文件 FileGroups []*Filegroupinput `json:"fileGroups,omitempty"` + // chunkSize为知识库做文档拆分时的块大小 + ChunkSize *int `json:"chunkSize,omitempty"` + // chunkOverlap为知识库作文档拆分时相邻块的交集 + ChunkOverlap *int `json:"chunkOverlap,omitempty"` } type UpdateLLMInput struct { diff --git a/apiserver/graph/impl/knowledgebase.resolvers.go b/apiserver/graph/impl/knowledgebase.resolvers.go index 147b6ff62..35f9e2463 100644 --- a/apiserver/graph/impl/knowledgebase.resolvers.go +++ b/apiserver/graph/impl/knowledgebase.resolvers.go @@ -7,10 +7,8 @@ package impl import ( "context" - "github.com/kubeagi/arcadia/api/base/v1alpha1" "github.com/kubeagi/arcadia/apiserver/graph/generated" "github.com/kubeagi/arcadia/apiserver/pkg/knowledgebase" - "github.com/kubeagi/arcadia/pkg/config" ) // CreateKnowledgeBase is the resolver for the createKnowledgeBase field. 
@@ -20,34 +18,7 @@ func (r *knowledgeBaseMutationResolver) CreateKnowledgeBase(ctx context.Context, return nil, err } - var filegroups []v1alpha1.FileGroup - var vectorstore v1alpha1.TypedObjectReference - vector, _ := config.GetVectorStore(ctx, c) - displayname, description, embedder := "", "", "" - if input.DisplayName != nil { - displayname = *input.DisplayName - } - if input.Description != nil { - description = *input.Description - } - if input.VectorStore != nil { - vectorstore = v1alpha1.TypedObjectReference(*input.VectorStore) - } else { - vectorstore = *vector - } - if input.Embedder != "" { - embedder = input.Embedder - } - if input.FileGroups != nil { - for _, f := range input.FileGroups { - filegroup := v1alpha1.FileGroup{ - Source: (*v1alpha1.TypedObjectReference)(&f.Source), - Paths: f.Path, - } - filegroups = append(filegroups, filegroup) - } - } - return knowledgebase.CreateKnowledgeBase(ctx, c, input.Name, input.Namespace, displayname, description, embedder, vectorstore, filegroups) + return knowledgebase.CreateKnowledgeBase(ctx, c, input) } // UpdateKnowledgeBase is the resolver for the updateKnowledgeBase field. diff --git a/apiserver/graph/schema/knowledgebase.gql b/apiserver/graph/schema/knowledgebase.gql index 90ff0d256..1442b0950 100644 --- a/apiserver/graph/schema/knowledgebase.gql +++ b/apiserver/graph/schema/knowledgebase.gql @@ -15,6 +15,8 @@ query listKnowledgeBases($input: ListKnowledgeBaseInput!){ creator displayName description + chunkSize + chunkOverlap status reason message @@ -63,6 +65,8 @@ query getKnowledgeBase($name: String!, $namespace: String!) { creator displayName description + chunkSize + chunkOverlap status reason message @@ -109,6 +113,8 @@ mutation createKnowledgeBase($input: CreateKnowledgeBaseInput!) 
{ creator displayName description + chunkSize + chunkOverlap status reason message @@ -155,6 +161,8 @@ mutation updateKnowledgeBase($input: UpdateKnowledgeBaseInput) { creator displayName description + chunkSize + chunkOverlap status reason message diff --git a/apiserver/graph/schema/knowledgebase.graphqls b/apiserver/graph/schema/knowledgebase.graphqls index 63ce68039..a2ee1a1c9 100644 --- a/apiserver/graph/schema/knowledgebase.graphqls +++ b/apiserver/graph/schema/knowledgebase.graphqls @@ -125,6 +125,16 @@ type KnowledgeBase { fileGroupDetails为知识库中所处理的文件组的详细内容和状态 """ fileGroupDetails: [filegroupdetail] + + + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int """ 知识库整体连接状态 @@ -172,6 +182,16 @@ input CreateKnowledgeBaseInput{ vectorStore: TypedObjectReferenceInput """知识库文件""" fileGroups: [filegroupinput!] + + + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int } """知识库更新的输入""" @@ -193,6 +213,15 @@ input UpdateKnowledgeBaseInput { """更新知识库文件""" fileGroups: [filegroupinput!] 
+ + """ + chunkSize为知识库做文档拆分时的块大小 + """ + chunkSize: Int + """ + chunkOverlap为知识库作文档拆分时相邻块的交集 + """ + chunkOverlap: Int } """知识库分页列表查询的输入""" diff --git a/apiserver/pkg/knowledgebase/knowledgebase.go b/apiserver/pkg/knowledgebase/knowledgebase.go index 327f7aba8..15065b8ea 100644 --- a/apiserver/pkg/knowledgebase/knowledgebase.go +++ b/apiserver/pkg/knowledgebase/knowledgebase.go @@ -29,6 +29,12 @@ import ( "github.com/kubeagi/arcadia/apiserver/graph/generated" "github.com/kubeagi/arcadia/apiserver/pkg/common" graphqlutils "github.com/kubeagi/arcadia/apiserver/pkg/utils" + "github.com/kubeagi/arcadia/pkg/config" +) + +const ( + DefaultChunkSize = 1024 + DefaultChunkOverlap = 100 ) func knowledgebase2modelConverter(ctx context.Context, c client.Client) func(obj client.Object) (generated.PageNode, error) { @@ -101,6 +107,8 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1 embedderType = string(embedderResource.Spec.Provider.GetType()) } + embeddingOptions := knowledgebase.EmbeddingOptions() + md := generated.KnowledgeBase{ ID: &id, Name: knowledgebase.GetName(), @@ -122,6 +130,8 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1 Namespace: knowledgebase.Spec.VectorStore.Namespace, }, FileGroupDetails: filegroupdetails, + ChunkSize: &embeddingOptions.ChunkSize, + ChunkOverlap: &embeddingOptions.ChunkOverlap, // Status info Status: &status, Reason: &reason, @@ -130,11 +140,49 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1 return &md, nil } -func CreateKnowledgeBase(ctx context.Context, c client.Client, name, namespace, displayname, description, embedder string, vectorstore v1alpha1.TypedObjectReference, filegroups []v1alpha1.FileGroup) (*generated.KnowledgeBase, error) { +func CreateKnowledgeBase(ctx context.Context, c client.Client, input generated.CreateKnowledgeBaseInput) (*generated.KnowledgeBase, error) { + var filegroups []v1alpha1.FileGroup + var vectorstore 
v1alpha1.TypedObjectReference + displayname, description, embedder := "", "", input.Embedder + if input.DisplayName != nil { + displayname = *input.DisplayName + } + if input.Description != nil { + description = *input.Description + } + if input.VectorStore != nil { + vectorstore = v1alpha1.TypedObjectReference(*input.VectorStore) + } else { + // No vector store in the request: use the system default and report a failed lookup + // instead of dereferencing nil. NOTE(review): assumes a nil error implies a non-nil result. + vector, err := config.GetVectorStore(ctx, c) + if err != nil { + return nil, err + } + vectorstore = *vector + } + for _, f := range input.FileGroups { + filegroup := v1alpha1.FileGroup{ + Source: (*v1alpha1.TypedObjectReference)(&f.Source), + Paths: f.Path, + } + filegroups = append(filegroups, filegroup) + } + + // Embedding options + chunkSize := DefaultChunkSize + if input.ChunkSize != nil { + chunkSize = *input.ChunkSize + } + chunkOverlap := DefaultChunkOverlap + if input.ChunkOverlap != nil { + chunkOverlap = *input.ChunkOverlap + } + knowledgebase := &v1alpha1.KnowledgeBase{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, + Name: input.Name, + Namespace: input.Namespace, }, Spec: v1alpha1.KnowledgeBaseSpec{ CommonSpec: v1alpha1.CommonSpec{ @@ -144,10 +192,14 @@ func CreateKnowledgeBase(ctx context.Context, c client.Client, name, namespace, Embedder: &v1alpha1.TypedObjectReference{ Kind: "Embedder", Name: embedder, - Namespace: &namespace, + Namespace: &input.Namespace, }, VectorStore: &vectorstore, FileGroups: filegroups, + EmbeddingOptions: v1alpha1.EmbeddingOptions{ + ChunkSize: chunkSize, + ChunkOverlap: chunkOverlap, + }, }, } common.SetCreator(ctx, &knowledgebase.Spec.CommonSpec) @@ -213,6 +265,13 @@ func UpdateKnowledgeBase(ctx context.Context, c client.Client, input *generated. 
kb.Spec.FileGroups = filegroups } + if input.ChunkSize != nil { + kb.Spec.ChunkSize = *input.ChunkSize + } + if input.ChunkOverlap != nil { + kb.Spec.ChunkOverlap = *input.ChunkOverlap + } + err = c.Update(ctx, kb) if err != nil { return nil, err diff --git a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml index bf7bb8f40..2b1f60b26 100644 --- a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml +++ b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml @@ -36,11 +36,11 @@ spec: description: DocumentLoaderSpec defines the desired state of DocumentLoader properties: chunkOverlap: - default: 200 + default: 100 description: ChunkOverlap for text splitter type: integer chunkSize: - default: 2048 + default: 512 description: ChunkSize for text splitter type: integer creator: diff --git a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml index 644676090..386ac0a2f 100644 --- a/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml +++ b/config/crd/bases/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml @@ -39,6 +39,14 @@ spec: spec: description: KnowledgeBaseSpec defines the desired state of KnowledgeBase properties: + chunkOverlap: + default: 100 + description: ChunkOverlap for text splitter + type: integer + chunkSize: + default: 1024 + description: ChunkSize for text splitter + type: integer creator: description: Creator defines datasource creator (AUTO-FILLED by webhook) type: string diff --git a/controllers/base/knowledgebase_controller.go b/controllers/base/knowledgebase_controller.go index e880a19c4..679e02a30 100644 --- a/controllers/base/knowledgebase_controller.go +++ b/controllers/base/knowledgebase_controller.go @@ -538,15 +538,17 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge } case ".html", ".htm": loader = 
documentloaders.NewHTML(dataReader) + // TODO: support .mp3,.wav default: loader = documentloaders.NewText(dataReader) } // initialize text splitter // var split textsplitter.TextSplitter + embeddingOptions := kb.EmbeddingOptions() split := textsplitter.NewRecursiveCharacter( - textsplitter.WithChunkSize(300), - textsplitter.WithChunkOverlap(30), + textsplitter.WithChunkSize(embeddingOptions.ChunkSize), + textsplitter.WithChunkOverlap(embeddingOptions.ChunkOverlap), ) // switch { // case "token": diff --git a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml index bf7bb8f40..2b1f60b26 100644 --- a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml +++ b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_documentloaders.yaml @@ -36,11 +36,11 @@ spec: description: DocumentLoaderSpec defines the desired state of DocumentLoader properties: chunkOverlap: - default: 200 + default: 100 description: ChunkOverlap for text splitter type: integer chunkSize: - default: 2048 + default: 512 description: ChunkSize for text splitter type: integer creator: diff --git a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml index 644676090..386ac0a2f 100644 --- a/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml +++ b/deploy/charts/arcadia/crds/arcadia.kubeagi.k8s.com.cn_knowledgebases.yaml @@ -39,6 +39,14 @@ spec: spec: description: KnowledgeBaseSpec defines the desired state of KnowledgeBase properties: + chunkOverlap: + default: 100 + description: ChunkOverlap for text splitter + type: integer + chunkSize: + default: 1024 + description: ChunkSize for text splitter + type: integer creator: description: Creator defines datasource creator (AUTO-FILLED by webhook) type: string