From 98a0187caad9b7e771250cd706fb433c84f4a650 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 3 May 2024 07:26:11 +0200 Subject: [PATCH] add: reset-datastore (hidden) cmd + default dataset 'default' --- pkg/cmd/client.go | 2 +- pkg/cmd/ingest.go | 9 ++++--- pkg/cmd/main.go | 1 + pkg/cmd/reset.go | 38 ++++++++++++++++++++++++++++ pkg/cmd/retrieve.go | 11 ++++---- pkg/datastore/dataset.go | 2 +- pkg/datastore/datastore.go | 52 ++++++++++++++++++++++++++++++-------- 7 files changed, 94 insertions(+), 21 deletions(-) create mode 100644 pkg/cmd/reset.go diff --git a/pkg/cmd/client.go b/pkg/cmd/client.go index 198e3ad8..f1986d25 100644 --- a/pkg/cmd/client.go +++ b/pkg/cmd/client.go @@ -23,7 +23,7 @@ func (s *Client) getClient() (client.Client, error) { if err != nil { return nil, err } - return c, c.Datastore.Index.AutoMigrate() + return c, nil } return client.NewDefaultClient(s.Server), nil } diff --git a/pkg/cmd/ingest.go b/pkg/cmd/ingest.go index 85d39f55..c2b4bb72 100644 --- a/pkg/cmd/ingest.go +++ b/pkg/cmd/ingest.go @@ -9,13 +9,14 @@ import ( type ClientIngest struct { Client + Dataset string `usage:"Target Dataset ID" default:"default" env:"KNOW_TARGET_DATASET"` IgnoreExtensions string `usage:"Comma-separated list of file extensions to ignore" env:"KNOW_INGEST_IGNORE_EXTENSIONS"` } func (s *ClientIngest) Customize(cmd *cobra.Command) { - cmd.Use = "ingest <dataset-id> <path>" + cmd.Use = "ingest [--dataset <dataset-id>] <path>" cmd.Short = "Ingest a file/directory into a dataset (non-recursive)" - cmd.Args = cobra.ExactArgs(2) + cmd.Args = cobra.ExactArgs(1) } func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error { @@ -24,8 +25,8 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error { return err } - datasetID := args[0] - filePath := args[1] + datasetID := s.Dataset + filePath := args[0] ingestOpts := &client.IngestPathsOpts{ IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","), diff --git a/pkg/cmd/main.go b/pkg/cmd/main.go index a514eb62..ea74717f 100644 
--- a/pkg/cmd/main.go +++ b/pkg/cmd/main.go @@ -15,6 +15,7 @@ func New() *cobra.Command { new(ClientIngest), new(ClientDeleteDataset), new(ClientRetrieve), + new(ClientResetDatastore), ) } diff --git a/pkg/cmd/reset.go b/pkg/cmd/reset.go new file mode 100644 index 00000000..1a7075fa --- /dev/null +++ b/pkg/cmd/reset.go @@ -0,0 +1,38 @@ +package cmd + +import ( + "fmt" + "github.com/gptscript-ai/knowledge/pkg/datastore" + "github.com/spf13/cobra" + "os" + "strings" +) + +type ClientResetDatastore struct { + Client +} + +func (s *ClientResetDatastore) Customize(cmd *cobra.Command) { + cmd.Use = "reset-datastore" + cmd.Short = "Reset the knowledge datastore (WARNING: this deletes all datasets and ingested data)" + cmd.Args = cobra.ExactArgs(0) + cmd.Hidden = true +} + +func (s *ClientResetDatastore) Run(cmd *cobra.Command, args []string) error { + dsn, vectordbPath, err := datastore.GetDatastorePaths(s.DSN, s.VectorDBConfig.VectorDBPath) + if err != nil { + return err + } + + if err := os.RemoveAll(strings.TrimPrefix(dsn, "sqlite://")); err != nil { + return fmt.Errorf("failed to remove database file: %w", err) + } + + if err := os.RemoveAll(vectordbPath); err != nil { + return fmt.Errorf("failed to remove vector database directory: %w", err) + } + + fmt.Printf("Successfully reset datastore (DSN: %q, VectorDBPath: %q)\n", dsn, vectordbPath) + return nil +} diff --git a/pkg/cmd/retrieve.go b/pkg/cmd/retrieve.go index 2e58a173..b1393151 100644 --- a/pkg/cmd/retrieve.go +++ b/pkg/cmd/retrieve.go @@ -8,13 +8,14 @@ import ( type ClientRetrieve struct { Client - TopK int `usage:"Number of sources to retrieve" default:"3"` + Dataset string `usage:"Target Dataset ID" default:"default" env:"KNOW_TARGET_DATASET"` + TopK int `usage:"Number of sources to retrieve" default:"3"` } func (s *ClientRetrieve) Customize(cmd *cobra.Command) { - cmd.Use = "retrieve <dataset-id> <query>" + cmd.Use = "retrieve [--dataset <dataset-id>] <query>" cmd.Short = "Retrieve sources for a query from a dataset" - cmd.Args = 
cobra.ExactArgs(2) + cmd.Args = cobra.ExactArgs(1) } func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error { @@ -23,8 +24,8 @@ func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error { return err } - datasetID := args[0] - query := args[1] + datasetID := s.Dataset + query := args[0] sources, err := c.Retrieve(cmd.Context(), datasetID, query) if err != nil { diff --git a/pkg/datastore/dataset.go b/pkg/datastore/dataset.go index d52cc61e..89f231a8 100644 --- a/pkg/datastore/dataset.go +++ b/pkg/datastore/dataset.go @@ -19,7 +19,6 @@ func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error } // Create dataset - slog.Info("Creating dataset", "id", dataset.ID) tx := s.Index.WithContext(ctx).Create(&dataset) if tx.Error != nil { return tx.Error @@ -30,6 +29,7 @@ func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error if err != nil { return err } + slog.Info("Created dataset", "id", dataset.ID) return nil } diff --git a/pkg/datastore/datastore.go b/pkg/datastore/datastore.go index 40b87c37..a7ccbbe8 100644 --- a/pkg/datastore/datastore.go +++ b/pkg/datastore/datastore.go @@ -1,6 +1,8 @@ package datastore import ( + "context" + "fmt" "github.com/acorn-io/z" "github.com/adrg/xdg" "github.com/gptscript-ai/knowledge/pkg/index" @@ -16,28 +18,43 @@ type Datastore struct { Vectorstore vectorstore.VectorStore } -func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) { +func GetDatastorePaths(dsn, vectordbPath string) (string, string, error) { if dsn == "" { var err error dsn, err = xdg.DataFile("gptscript/knowledge/knowledge.db") if err != nil { - return nil, err + return "", "", err } dsn = "sqlite://" + dsn slog.Debug("Using default DSN", "dsn", dsn) } + if vectordbPath == "" { + var err error + vectordbPath, err = xdg.DataFile("gptscript/knowledge/vector.db") + if err != nil { + return "", "", err + } + slog.Debug("Using default 
VectorDBPath", "vectordbPath", vectordbPath) + } + + return dsn, vectordbPath, nil +} + +func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) { + + dsn, vectorDBPath, err := GetDatastorePaths(dsn, vectorDBPath) + if err != nil { + return nil, fmt.Errorf("failed to determine datastore paths: %w", err) + } + idx, err := index.New(dsn, automigrate) if err != nil { return nil, err } - if vectorDBPath == "" { - vectorDBPath, err = xdg.DataFile("gptscript/knowledge/vector.db") - if err != nil { - return nil, err - } - slog.Debug("Using default VectorDBPath", "vectorDBPath", vectorDBPath) + if err := idx.AutoMigrate(); err != nil { + return nil, fmt.Errorf("failed to auto-migrate index: %w", err) } vsdb, err := cg.NewPersistentDB(vectorDBPath, false) @@ -52,8 +69,23 @@ func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfi z.Pointer(true), ) - return &Datastore{ + ds := &Datastore{ Index: idx, Vectorstore: chromem.New(vsdb, embeddingFunc), - }, nil + } + + // Ensure default dataset exists + defaultDS, err := ds.GetDataset(context.Background(), "default") + if err != nil { + return nil, fmt.Errorf("failed to ensure default dataset: %w", err) + } + + if defaultDS == nil { + err = ds.NewDataset(context.Background(), types.Dataset{ID: "default", EmbedDimension: nil}) + if err != nil { + return nil, fmt.Errorf("failed to create default dataset: %w", err) + } + } + + return ds, nil }