Skip to content

Commit

Permalink
Merge pull request #832 from bjwswang/main
Browse files Browse the repository at this point in the history
feat: configure chunksize and chunkoverlap in knowledgebase
  • Loading branch information
bjwswang authored Mar 12, 2024
2 parents 5dfce73 + 74970fa commit 647ba0b
Show file tree
Hide file tree
Showing 15 changed files with 353 additions and 46 deletions.
4 changes: 2 additions & 2 deletions api/app-node/documentloader/v1alpha1/document_loader_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ type DocumentLoaderSpec struct {
// CommonSpec
v1alpha1.CommonSpec `json:",inline"`
// ChunkSize for text splitter
// +kubebuilder:default=2048
// +kubebuilder:default=512
ChunkSize int `json:"chunkSize,omitempty"`
// ChunkOverlap for text splitter
// +kubebuilder:default=200
// +kubebuilder:default=100
ChunkOverlap int `json:"chunkOverlap,omitempty"`
// FileExtName the type of documents, can be .pdf, .txt, .mp3, etc ...
FileExtName string `json:"fileExtName,omitempty"`
Expand Down
13 changes: 13 additions & 0 deletions api/base/v1alpha1/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ const (
UpdateSourceFileAnnotationKey = Group + "/update-source-file-time"
)

// EmbeddingOptions returns the embedding options configured in the
// KnowledgeBase spec with defaults applied.
//
// A ChunkSize or ChunkOverlap that is zero (unset) or negative is replaced by
// its default (1024 and 100 respectively). The result is a copy; kb.Spec is
// never modified.
func (kb *KnowledgeBase) EmbeddingOptions() EmbeddingOptions {
	const (
		defaultChunkSize    = 1024
		defaultChunkOverlap = 100
	)

	// Struct assignment copies, so defaulting never writes back into kb.Spec.
	options := kb.Spec.EmbeddingOptions

	// Zero means "not set". A negative value is treated the same way rather
	// than being handed on to the text splitter.
	if options.ChunkSize <= 0 {
		options.ChunkSize = defaultChunkSize
	}
	if options.ChunkOverlap <= 0 {
		options.ChunkOverlap = defaultChunkOverlap
	}
	return options
}

// VectorStoreCollectionName builds the vector store collection name for this
// KnowledgeBase: its namespace and its name joined by an underscore.
func (kb *KnowledgeBase) VectorStoreCollectionName() string {
	const separator = "_"
	namespace, name := kb.Namespace, kb.Name
	return namespace + separator + name
}
Expand Down
14 changes: 12 additions & 2 deletions api/base/v1alpha1/knowledgebase_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,23 @@ type KnowledgeBaseSpec struct {
// Embedder defines the embedder to embedding files
Embedder *TypedObjectReference `json:"embedder,omitempty"`

// TODO: add EmbedderOptions

// VectorStore defines the vectorstore to store results
VectorStore *TypedObjectReference `json:"vectorStore,omitempty"`

// FileGroups included files Grouped by VersionedDataset
FileGroups []FileGroup `json:"fileGroups,omitempty"`

// Embedding Options
EmbeddingOptions `json:",inline"`
}

// EmbeddingOptions groups the text splitter settings of a KnowledgeBase.
// It is embedded inline in KnowledgeBaseSpec, so its fields appear directly
// in the spec's JSON.
type EmbeddingOptions struct {
// ChunkSize is the chunk size handed to the text splitter.
// NOTE(review): the unit (characters vs. tokens) is not visible here — confirm against the splitter.
// +kubebuilder:default=1024
ChunkSize int `json:"chunkSize,omitempty"`
// ChunkOverlap is the overlap between chunks handed to the text splitter.
// NOTE(review): presumably expected to be smaller than ChunkSize — confirm against the splitter.
// +kubebuilder:default=100
ChunkOverlap int `json:"chunkOverlap,omitempty"`
}

type FileGroupDetail struct {
Expand Down
16 changes: 16 additions & 0 deletions api/base/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

175 changes: 173 additions & 2 deletions apiserver/graph/generated/generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 647ba0b

Please sign in to comment.