Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: knowledge import to import datasets from knowledge base export a…
Browse files Browse the repository at this point in the history
…rchives
  • Loading branch information
iwilltry42 committed Jul 1, 2024
1 parent 29e4400 commit ae7b5f8
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 4 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.22.3
toolchain go1.22.4

replace (
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20240627131850-b7f5672836c8 // Export selected collections
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20240701135946-49eb4988eab1 // Import/Export selected collections
github.com/tmc/langchaingo => github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b // Context-Aware Markdown Splitting
)

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,8 @@ github.com/hupe1980/golc v0.0.112 h1:aFUMfnEkqdapukuj6/Ny0zbwBDngeC/ZyvvACRdBfv4
github.com/hupe1980/golc v0.0.112/go.mod h1:w692KzkSTSvXROfyu+jYauNXB4YaL1s8zHPDMnNW88o=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/iwilltry42/chromem-go v0.0.0-20240627131850-b7f5672836c8 h1:GC4Bk6eE9u2I/uuhyM/LVe1UKAj8gG4MUlrm0d2t7j0=
github.com/iwilltry42/chromem-go v0.0.0-20240627131850-b7f5672836c8/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/chromem-go v0.0.0-20240701135946-49eb4988eab1 h1:fo4188Or4x3XwtRsAHP+qwzBrvuxDODoRbAWbeM1OaU=
github.com/iwilltry42/chromem-go v0.0.0-20240701135946-49eb4988eab1/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7 h1:g0fAGBisHaEQ0TRq1iBvemFRf+8AEWEmBESSiWB3Vsc=
github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
Expand Down
1 change: 1 addition & 0 deletions pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ type Client interface {
DeleteDocuments(ctx context.Context, datasetID string, documentIDs ...string) error
Retrieve(ctx context.Context, datasetID string, query string, opts datastore.RetrieveOpts) ([]vectorstore.Document, error)
ExportDatasets(ctx context.Context, path string, datasets ...string) error
ImportDatasets(ctx context.Context, path string, datasets ...string) error
}
5 changes: 5 additions & 0 deletions pkg/client/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,8 @@ func (c *DefaultClient) ExportDatasets(ctx context.Context, path string, dataset
// TODO: implement
panic("not implemented")
}

// ImportDatasets is not implemented for DefaultClient yet: calling it panics
// with "not implemented" instead of returning an error.
func (c *DefaultClient) ImportDatasets(ctx context.Context, path string, datasets ...string) error {
// TODO: implement
panic("not implemented")
}
4 changes: 4 additions & 0 deletions pkg/client/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,7 @@ func (c *StandaloneClient) AskDirectory(ctx context.Context, path string, query
func (c *StandaloneClient) ExportDatasets(ctx context.Context, path string, datasets ...string) error {
return c.Datastore.ExportDatasetsToFile(ctx, path, datasets...)
}

// ImportDatasets imports datasets from the archive at path by delegating to
// the client's Datastore (ImportDatasetsFromFile). The optional dataset IDs are
// forwarded unchanged and the Datastore's error is returned as-is.
func (c *StandaloneClient) ImportDatasets(ctx context.Context, path string, datasets ...string) error {
return c.Datastore.ImportDatasetsFromFile(ctx, path, datasets...)
}
30 changes: 30 additions & 0 deletions pkg/cmd/import.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package cmd

import (
"github.com/spf13/cobra"
)

// ClientImportDatasets is the command type for the "import" subcommand, which
// imports datasets from a knowledge archive. It embeds Client; presumably that
// is where the getClient method used by Run comes from (not visible here).
type ClientImportDatasets struct {
Client
}

// Customize configures the cobra command for dataset import: its usage line,
// short and long help texts, and the requirement of at least one positional
// argument (the archive path).
func (s *ClientImportDatasets) Customize(cmd *cobra.Command) {
	const (
		usage   = "import <path> [<dataset-id>...]"
		summary = "Import one or more datasets from an archive (zip) (default: all datasets)"
		details = `Import one or more datasets from an archive (zip) (default: all datasets).
## IMPORTANT: Embedding functions
Embedding functions are not part of exported knowledge base archives, so you'll have to know the embedding function used to import the archive.
This primarily concerns the choice of the embeddings provider (model) and the embedding dimension.
Note: This is only relevant if you plan to add more documents to the dataset after importing it.
`
	)

	// The archive path is mandatory; dataset IDs are optional.
	cmd.Args = cobra.MinimumNArgs(1)

	cmd.Use = usage
	cmd.Short = summary
	cmd.Long = details
}

// Run executes the import command. The first positional argument is the
// archive path; any remaining arguments are passed on as dataset IDs.
// It returns the error from obtaining the client or from the import itself.
func (s *ClientImportDatasets) Run(cmd *cobra.Command, args []string) error {
	client, err := s.getClient()
	if err != nil {
		return err
	}

	archivePath, datasetIDs := args[0], args[1:]
	return client.ImportDatasets(cmd.Context(), archivePath, datasetIDs...)
}
1 change: 1 addition & 0 deletions pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func New() *cobra.Command {
new(ClientResetDatastore),
new(ClientAskDir),
new(ClientExportDatasets),
new(ClientImportDatasets),
new(Version),
)
}
Expand Down
67 changes: 67 additions & 0 deletions pkg/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,73 @@ func (s *Datastore) ExportDatasetsToFile(ctx context.Context, path string, datas
return nil
}

// ImportDatasetsFromFile restores datasets from the knowledge archive (zip)
// at path.
//
// The archive must contain exactly two entries: one ".db" file, which is handed
// to s.Index.ImportDatasetsFromFile, and one ".gob" file, which is handed to
// s.Vectorstore.ImportCollectionsFromFile together with the optional datasets
// filter. Both entries are extracted into a temporary directory that is removed
// again before the function returns.
//
// An error is returned if the archive cannot be opened, does not have the
// expected layout, contains an entry whose name would escape the extraction
// directory, or if either import step fails.
//
// NOTE(review): the datasets filter is forwarded only to the vector store; the
// index import receives no filter. Confirm that this asymmetry is intended.
func (s *Datastore) ImportDatasetsFromFile(ctx context.Context, path string, datasets ...string) error {
	tmpDir, err := os.MkdirTemp("", "knowledge-import-")
	if err != nil {
		return fmt.Errorf("creating temporary import directory: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	r, err := zip.OpenReader(path)
	if err != nil {
		return fmt.Errorf("opening knowledge archive %q: %w", path, err)
	}
	defer r.Close()

	if len(r.File) != 2 {
		return fmt.Errorf("knowledge archive must contain exactly two files, found %d", len(r.File))
	}

	var dbFile, vectorStoreFile string
	for _, entry := range r.File {
		if entry.FileInfo().IsDir() {
			continue
		}

		// Entry names come from the (untrusted) archive: reject absolute paths
		// and anything containing ".." so nothing is written outside tmpDir.
		if !filepath.IsLocal(entry.Name) {
			return fmt.Errorf("knowledge archive entry %q has an unsafe path", entry.Name)
		}

		target := filepath.Join(tmpDir, entry.Name)
		if err := extractArchiveFile(entry, target); err != nil {
			return fmt.Errorf("extracting %q from knowledge archive: %w", entry.Name, err)
		}

		// FIXME: this should not be static as we may support multiple (vector) DBs at some point
		switch filepath.Ext(target) {
		case ".db":
			dbFile = target
		case ".gob":
			vectorStoreFile = target
		}
	}

	if dbFile == "" || vectorStoreFile == "" {
		return fmt.Errorf("knowledge archive must contain exactly one .db and one .gob file")
	}

	if err := s.Index.ImportDatasetsFromFile(ctx, dbFile); err != nil {
		return fmt.Errorf("importing dataset index: %w", err)
	}

	if err := s.Vectorstore.ImportCollectionsFromFile(ctx, vectorStoreFile, datasets...); err != nil {
		return fmt.Errorf("importing vector store collections: %w", err)
	}

	return nil
}

// extractArchiveFile copies the contents of the zip entry into a newly created
// (or truncated) file at dst. Handles are released before it returns, and a
// failure while closing the written file is reported because it can mean that
// buffered data never reached disk.
func extractArchiveFile(entry *zip.File, dst string) error {
	rc, err := entry.Open()
	if err != nil {
		return err
	}
	defer rc.Close()

	// Use fixed owner-only permissions instead of trusting the mode stored in
	// the archive, which could make the extracted file unreadable.
	out, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
	if err != nil {
		return err
	}

	if _, err := io.Copy(out, rc); err != nil {
		_ = out.Close() // the copy error is the one worth reporting
		return err
	}

	return out.Close()
}

func zipDir(src, dst string) error {
zipfile, err := os.Create(dst)
if err != nil {
Expand Down
29 changes: 29 additions & 0 deletions pkg/index/datasets.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"log/slog"
"os"
"path/filepath"
"strings"
)

func GetDataset(db *gorm.DB, id string) (*Dataset, error) {
Expand Down Expand Up @@ -58,3 +59,31 @@ func (db *DB) ExportDatasetsToFile(ctx context.Context, path string, ids ...stri

return nil
}

// ImportDatasetsFromFile copies every dataset row found in the SQLite database
// file at path into this database.
//
// path may be given with or without the "sqlite://" scheme prefix. All rows are
// inserted inside one explicit transaction: if any insert fails the transaction
// is rolled back, so a failed import does not leave a partial set of datasets
// behind. The error from opening the source database, reading its datasets,
// or writing them is returned unchanged.
func (db *DB) ImportDatasetsFromFile(ctx context.Context, path string) error {
	src, err := New("sqlite://"+strings.TrimPrefix(path, "sqlite://"), false)
	if err != nil {
		return err
	}
	defer src.Close()

	var datasets []Dataset
	if err := src.gormDB.WithContext(ctx).Find(&datasets).Error; err != nil {
		return err
	}

	// Begin a real transaction. Calling Commit on a session that never began
	// one commits nothing, and its error would go unnoticed.
	tx := db.gormDB.WithContext(ctx).Begin()
	if tx.Error != nil {
		return tx.Error
	}

	// fill new database with exported datasets
	for i := range datasets {
		if err := tx.Create(&datasets[i]).Error; err != nil {
			tx.Rollback()
			return err
		}
	}

	return tx.Commit().Error
}
10 changes: 9 additions & 1 deletion pkg/vectorstore/chromem/chromem.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,15 @@ func (s *Store) RemoveDocument(ctx context.Context, documentID string, collectio
}

// ImportCollectionsFromFile imports collections from the file at path into the
// store's underlying database via s.db.ImportFromFile, forwarding the optional
// collection names unchanged.
//
// It returns an error if path cannot be stat'ed or refers to a directory;
// otherwise it returns whatever the underlying import returns. ctx is currently
// unused.
func (s *Store) ImportCollectionsFromFile(ctx context.Context, path string, collections ...string) error {
	// The stale `return fmt.Errorf("not implemented")` that used to be the
	// first statement is gone; it made everything below unreachable.
	finfo, err := os.Stat(path)
	if err != nil {
		return fmt.Errorf("couldn't stat file %q: %w", path, err)
	}
	if finfo.IsDir() {
		return fmt.Errorf("path %q is a directory", path)
	}

	slog.Debug("Importing collections from file", "path", path)

	// NOTE(review): the empty second argument is presumably an (unused)
	// encryption key — confirm against the chromem-go ImportFromFile signature.
	return s.db.ImportFromFile(path, "", collections...)
}

func (s *Store) ExportCollectionsToFile(ctx context.Context, path string, collections ...string) error {
Expand Down

0 comments on commit ae7b5f8

Please sign in to comment.