Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
Fix: recursively search files by default, fix file ingestion with edit
Browse files Browse the repository at this point in the history
Signed-off-by: Daishan Peng <[email protected]>
  • Loading branch information
StrongMonkey committed Jun 19, 2024
1 parent 7705695 commit b4674e3
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 11 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
ingestOpts := &client.IngestPathsOpts{
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: s.Recursive,
Recursive: !s.NoRecursive,
}

retrieveOpts := &datastore.RetrieveOpts{
Expand Down
6 changes: 3 additions & 3 deletions pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ type ClientIngest struct {
type ClientIngestOpts struct {
IgnoreExtensions string `usage:"Comma-separated list of file extensions to ignore" env:"KNOW_INGEST_IGNORE_EXTENSIONS"`
Concurrency int `usage:"Number of concurrent ingestion processes" short:"c" default:"10" env:"KNOW_INGEST_CONCURRENCY"`
Recursive bool `usage:"Recursively ingest directories" short:"r" default:"false" env:"KNOW_INGEST_RECURSIVE"`
NoRecursive bool `usage:"Don't recursively ingest directories" default:"false" env:"KNOW_NO_INGEST_RECURSIVE"`
}

func (s *ClientIngest) Customize(cmd *cobra.Command) {
cmd.Use = "ingest [--dataset <dataset-id>] <path>"
cmd.Short = "Ingest a file/directory into a dataset (non-recursive)"
cmd.Short = "Ingest a file/directory into a dataset"
cmd.Args = cobra.ExactArgs(1)
}

Expand All @@ -44,7 +44,7 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
ingestOpts := &client.IngestPathsOpts{
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Recursive: s.Recursive,
Recursive: !s.NoRecursive,
TextSplitterOpts: &s.TextSplitterOpts,
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/datastore/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func (s *Datastore) DeleteDocument(ctx context.Context, documentID, datasetID st
}

// Remove from VectorStore
if err := s.Vectorstore.RemoveDocument(ctx, documentID, datasetID); err != nil {
if err := s.Vectorstore.RemoveDocument(ctx, documentID, datasetID, nil, nil); err != nil {
return fmt.Errorf("failed to remove document from VectorStore: %w", err)
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/datastore/files.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func (s *Datastore) DeleteFile(ctx context.Context, datasetID, fileID string) er

// Remove owned documents from VectorStore and Database
for _, doc := range file.Documents {
if err := s.Vectorstore.RemoveDocument(ctx, doc.ID, datasetID); err != nil {
if err := s.Vectorstore.RemoveDocument(ctx, doc.ID, datasetID, nil, nil); err != nil {
return fmt.Errorf("failed to remove document from VectorStore: %w", err)
}

Expand Down
10 changes: 9 additions & 1 deletion pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
ingestionFlow.FillDefaults(filetype, opts.TextSplitterOpts)

// Mandatory Transformation: Add filename to metadata
em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": filename}}
em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": filename, "absPath": opts.FileMetadata.AbsolutePath}}
ingestionFlow.Transformations = append(ingestionFlow.Transformations, em)

docs, err := ingestionFlow.Run(ctx, bytes.NewReader(content))
Expand All @@ -105,6 +105,14 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
return nil, nil
}

// Before adding doc, we need to remove the existing documents for duplicates or old contents
where := map[string]string{
"absPath": opts.FileMetadata.AbsolutePath,
}
if err := s.Vectorstore.RemoveDocument(ctx, "", datasetID, where, nil); err != nil {
return nil, err
}

// Add documents to VectorStore -> This generates the embeddings
slog.Debug("Ingesting documents", "count", len(docs))
docIDs, err := s.Vectorstore.AddDocuments(ctx, docs, datasetID)
Expand Down
6 changes: 3 additions & 3 deletions pkg/vectorstore/chromem/chromem.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
"github.com/philippgille/chromem-go"
)

// EmbeddingParallelThread can be set as an environment variable to control the number of parallel API calls to create embedding for documents. Default is 100
// VsChromemEmbeddingParallelThread can be set as an environment variable to control the number of parallel API calls to create embedding for documents. Default is 100
const VsChromemEmbeddingParallelThread = "VS_CHROMEM_EMBEDDING_PARALLEL_THREAD"

type Store struct {
Expand Down Expand Up @@ -145,10 +145,10 @@ func (s *Store) RemoveCollection(_ context.Context, collection string) error {
return s.db.DeleteCollection(collection)
}

func (s *Store) RemoveDocument(ctx context.Context, documentID string, collection string) error {
func (s *Store) RemoveDocument(ctx context.Context, documentID string, collection string, where, whereDocument map[string]string) error {
col := s.db.GetCollection(collection, s.embeddingFunc)
if col == nil {
return fmt.Errorf("%w: %q", errors.ErrCollectionNotFound, collection)
}
return col.Delete(ctx, nil, nil, documentID)
return col.Delete(ctx, where, whereDocument, documentID)
}
2 changes: 1 addition & 1 deletion pkg/vectorstore/vectorstores.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ type VectorStore interface {
AddDocuments(ctx context.Context, docs []Document, collection string) ([]string, error) // @return documentIDs, error
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string) ([]Document, error) //nolint:lll
RemoveCollection(ctx context.Context, collection string) error
RemoveDocument(ctx context.Context, documentID string, collection string) error
RemoveDocument(ctx context.Context, documentID string, collection string, where, whereDocument map[string]string) error
}

0 comments on commit b4674e3

Please sign in to comment.