
Commit 98a0187

add: reset-datastore (hidden) cmd + default dataset 'default'
iwilltry42 committed May 3, 2024
1 parent 2c918ef

Showing 7 changed files with 94 additions and 21 deletions.

pkg/cmd/client.go (1 addition, 1 deletion)
@@ -23,7 +23,7 @@ func (s *Client) getClient() (client.Client, error) {
         if err != nil {
             return nil, err
         }
-        return c, c.Datastore.Index.AutoMigrate()
+        return c, nil
     }
     return client.NewDefaultClient(s.Server), nil
 }
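
Note: getClient no longer triggers the index migration itself; as of this commit, idx.AutoMigrate() runs inside datastore.NewDatastore (see pkg/datastore/datastore.go below), so the standalone client simply returns.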

pkg/cmd/ingest.go (5 additions, 4 deletions)
@@ -9,13 +9,14 @@ import (
 
 type ClientIngest struct {
     Client
+    Dataset          string `usage:"Target Dataset ID" default:"default" env:"KNOW_TARGET_DATASET"`
     IgnoreExtensions string `usage:"Comma-separated list of file extensions to ignore" env:"KNOW_INGEST_IGNORE_EXTENSIONS"`
 }
 
 func (s *ClientIngest) Customize(cmd *cobra.Command) {
-    cmd.Use = "ingest <dataset-id> <path>"
+    cmd.Use = "ingest [--dataset <dataset-id>] <path>"
     cmd.Short = "Ingest a file/directory into a dataset (non-recursive)"
-    cmd.Args = cobra.ExactArgs(2)
+    cmd.Args = cobra.ExactArgs(1)
 }
 
 func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
@@ -24,8 +25,8 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
         return err
     }
 
-    datasetID := args[0]
-    filePath := args[1]
+    datasetID := s.Dataset
+    filePath := args[0]
 
     ingestOpts := &client.IngestPathsOpts{
         IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
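
The dataset is now chosen via the --dataset flag (or the KNOW_TARGET_DATASET environment variable) and falls back to the built-in "default" dataset, leaving <path> as the only positional argument. A rough before/after of the invocation, assuming the CLI binary is named knowledge:

    # before this commit
    knowledge ingest my-dataset ./docs

    # after this commit (omitting --dataset targets the "default" dataset)
    knowledge ingest --dataset my-dataset ./docs
    knowledge ingest ./docs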

pkg/cmd/main.go (1 addition)
@@ -15,6 +15,7 @@ func New() *cobra.Command {
         new(ClientIngest),
         new(ClientDeleteDataset),
         new(ClientRetrieve),
+        new(ClientResetDatastore),
     )
 }
 

pkg/cmd/reset.go (new file, 38 additions)
@@ -0,0 +1,38 @@
+package cmd
+
+import (
+    "fmt"
+    "github.com/gptscript-ai/knowledge/pkg/datastore"
+    "github.com/spf13/cobra"
+    "os"
+    "strings"
+)
+
+type ClientResetDatastore struct {
+    Client
+}
+
+func (s *ClientResetDatastore) Customize(cmd *cobra.Command) {
+    cmd.Use = "reset-datastore"
+    cmd.Short = "Reset the knowledge datastore (WARNING: this deletes all datasets and ingested data)"
+    cmd.Args = cobra.ExactArgs(0)
+    cmd.Hidden = true
+}
+
+func (s *ClientResetDatastore) Run(cmd *cobra.Command, args []string) error {
+    dsn, vectordbPath, err := datastore.GetDatastorePaths(s.DSN, s.VectorDBConfig.VectorDBPath)
+    if err != nil {
+        return err
+    }
+
+    if err := os.RemoveAll(strings.TrimPrefix(dsn, "sqlite://")); err != nil {
+        return fmt.Errorf("failed to remove database file: %w", err)
+    }
+
+    if err := os.RemoveAll(vectordbPath); err != nil {
+        return fmt.Errorf("failed to remove vector database directory: %w", err)
+    }
+
+    fmt.Printf("Successfully reset datastore (DSN: %q, VectorDBPath: %q)\n", dsn, vectordbPath)
+    return nil
+}
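
Because the command resolves paths through datastore.GetDatastorePaths (added in pkg/datastore/datastore.go below), running it without explicit configuration deletes the XDG-default locations, typically ~/.local/share/gptscript/knowledge/knowledge.db (the sqlite:// prefix is stripped before os.RemoveAll) and ~/.local/share/gptscript/knowledge/vector.db on Linux; the exact paths depend on the platform's XDG data directory.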

pkg/cmd/retrieve.go (6 additions, 5 deletions)
@@ -8,13 +8,14 @@ import (
 
 type ClientRetrieve struct {
     Client
-    TopK int `usage:"Number of sources to retrieve" default:"3"`
+    Dataset string `usage:"Target Dataset ID" default:"default" env:"KNOW_TARGET_DATASET"`
+    TopK    int    `usage:"Number of sources to retrieve" default:"3"`
 }
 
 func (s *ClientRetrieve) Customize(cmd *cobra.Command) {
-    cmd.Use = "retrieve <dataset-id> <query>"
+    cmd.Use = "retrieve [--dataset <dataset-id>] <query>"
     cmd.Short = "Retrieve sources for a query from a dataset"
-    cmd.Args = cobra.ExactArgs(2)
+    cmd.Args = cobra.ExactArgs(1)
 }
 
 func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error {
@@ -23,8 +24,8 @@ func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error {
         return err
     }
 
-    datasetID := args[0]
-    query := args[1]
+    datasetID := s.Dataset
+    query := args[0]
 
     sources, err := c.Retrieve(cmd.Context(), datasetID, query)
     if err != nil {
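
As with ingest, retrieve now reads the dataset from --dataset / KNOW_TARGET_DATASET (defaulting to "default") and keeps only the <query> positional argument. For example, again assuming the binary is named knowledge:

    knowledge retrieve --dataset my-dataset "what changed in the datastore?"
    knowledge retrieve "what changed in the datastore?"   # queries the "default" dataset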

pkg/datastore/dataset.go (1 addition, 1 deletion)
@@ -19,7 +19,6 @@ func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error {
     }
 
     // Create dataset
-    slog.Info("Creating dataset", "id", dataset.ID)
     tx := s.Index.WithContext(ctx).Create(&dataset)
     if tx.Error != nil {
         return tx.Error
@@ -30,6 +29,7 @@ func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error {
     if err != nil {
         return err
     }
+    slog.Info("Created dataset", "id", dataset.ID)
     return nil
 }
 

pkg/datastore/datastore.go (42 additions, 10 deletions)
@@ -1,6 +1,8 @@
 package datastore
 
 import (
+    "context"
+    "fmt"
     "github.com/acorn-io/z"
     "github.com/adrg/xdg"
     "github.com/gptscript-ai/knowledge/pkg/index"
@@ -16,28 +18,43 @@ type Datastore struct {
     Vectorstore vectorstore.VectorStore
 }
 
-func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) {
+func GetDatastorePaths(dsn, vectordbPath string) (string, string, error) {
     if dsn == "" {
         var err error
         dsn, err = xdg.DataFile("gptscript/knowledge/knowledge.db")
         if err != nil {
-            return nil, err
+            return "", "", err
         }
         dsn = "sqlite://" + dsn
         slog.Debug("Using default DSN", "dsn", dsn)
     }
 
+    if vectordbPath == "" {
+        var err error
+        vectordbPath, err = xdg.DataFile("gptscript/knowledge/vector.db")
+        if err != nil {
+            return "", "", err
+        }
+        slog.Debug("Using default VectorDBPath", "vectordbPath", vectordbPath)
+    }
+
+    return dsn, vectordbPath, nil
+}
+
+func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) {
+
+    dsn, vectorDBPath, err := GetDatastorePaths(dsn, vectorDBPath)
+    if err != nil {
+        return nil, fmt.Errorf("failed to determine datastore paths: %w", err)
+    }
+
     idx, err := index.New(dsn, automigrate)
     if err != nil {
         return nil, err
     }
 
-    if vectorDBPath == "" {
-        vectorDBPath, err = xdg.DataFile("gptscript/knowledge/vector.db")
-        if err != nil {
-            return nil, err
-        }
-        slog.Debug("Using default VectorDBPath", "vectorDBPath", vectorDBPath)
+    if err := idx.AutoMigrate(); err != nil {
+        return nil, fmt.Errorf("failed to auto-migrate index: %w", err)
     }
 
     vsdb, err := cg.NewPersistentDB(vectorDBPath, false)
@@ -52,8 +69,23 @@ func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) {
         z.Pointer(true),
     )
 
-    return &Datastore{
+    ds := &Datastore{
         Index:       idx,
         Vectorstore: chromem.New(vsdb, embeddingFunc),
-    }, nil
+    }
+
+    // Ensure default dataset exists
+    defaultDS, err := ds.GetDataset(context.Background(), "default")
+    if err != nil {
+        return nil, fmt.Errorf("failed to ensure default dataset: %w", err)
+    }
+
+    if defaultDS == nil {
+        err = ds.NewDataset(context.Background(), types.Dataset{ID: "default", EmbedDimension: nil})
+        if err != nil {
+            return nil, fmt.Errorf("failed to create default dataset: %w", err)
+        }
+    }
+
+    return ds, nil
 }
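
Taken together, a caller can now bootstrap a working datastore with zero configuration. The following fragment is an illustrative sketch, not part of this commit; it relies only on the NewDatastore signature shown above and assumes an empty OpenAI config and default paths are acceptable:

    // Empty DSN/vector path fall back to the XDG defaults, the index is
    // auto-migrated, and the "default" dataset is created if it is missing.
    ds, err := datastore.NewDatastore("", true, "", types.OpenAIConfig{})
    if err != nil {
        return err // handle as appropriate in the caller
    }
    // Ingesting and retrieving against dataset ID "default" now works
    // without an explicit NewDataset call.
    _ = ds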
