Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: pgvector vectorstore (#140)
Browse files Browse the repository at this point in the history
Example configuration via environment variable: export KNOW_VECTOR_DSN="pgvector://knowledge:knowledge@localhost:5432/knowledge?sslmode=disable"
  • Loading branch information
iwilltry42 authored Oct 16, 2024
1 parent a5fe669 commit ae12f51
Show file tree
Hide file tree
Showing 56 changed files with 833 additions and 135 deletions.
10 changes: 10 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Local development database: PostgreSQL with the pgvector extension.
# The database name, user and password below match the example DSN
# pgvector://knowledge:knowledge@localhost:5432/knowledge?sslmode=disable
services:
  pgvector:
    container_name: pgvector
    image: pgvector/pgvector:0.7.4-pg17
    ports:
      # Publish the PostgreSQL port on the host (host:container).
      - "5432:5432"
    environment:
      # Development-only credentials; do not reuse outside local testing.
      POSTGRES_DB: knowledge
      POSTGRES_USER: knowledge
      POSTGRES_PASSWORD: knowledge
12 changes: 8 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ require (
github.com/google/uuid v1.6.0
github.com/hupe1980/golc v0.0.112
github.com/iwilltry42/bm25-go v0.0.0-20240909111832-a928590cc9da
github.com/jackc/pgx/v5 v5.7.1
github.com/jmcarbo/stopwords v1.1.9
github.com/joho/godotenv v1.5.1
github.com/knadh/koanf/parsers/json v0.1.0
Expand All @@ -38,14 +39,15 @@ require (
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
github.com/lu4p/cat v0.1.5
github.com/mitchellh/mapstructure v1.5.0
github.com/pgvector/pgvector-go v0.2.2
github.com/philippgille/chromem-go v0.6.1-0.20240811154507-a1944285b284
github.com/spf13/cobra v1.8.1
github.com/stretchr/testify v1.9.0
github.com/swaggo/files v1.0.1
github.com/swaggo/gin-swagger v1.6.0
github.com/swaggo/swag v1.16.3
github.com/tmc/langchaingo v0.1.12
golang.org/x/sync v0.7.0
golang.org/x/sync v0.8.0
gorm.io/gorm v1.25.10
sigs.k8s.io/yaml v1.4.0
)
Expand Down Expand Up @@ -109,6 +111,8 @@ require (
github.com/hupe1980/go-textractor v0.0.9 // indirect
github.com/hupe1980/go-tiktoken v0.0.9 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7 // indirect
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
Expand Down Expand Up @@ -158,11 +162,11 @@ require (
gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/crypto v0.27.0 // indirect
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/sys v0.25.0 // indirect
golang.org/x/text v0.18.0 // indirect
golang.org/x/tools v0.22.0 // indirect
google.golang.org/api v0.184.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect
Expand Down
183 changes: 173 additions & 10 deletions go.sum

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func (s *ClientAskDir) Customize(cmd *cobra.Command) {
}

func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
9 changes: 5 additions & 4 deletions pkg/cmd/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cmd

import (
"archive/zip"
"context"
"fmt"
"io"
"os"
Expand Down Expand Up @@ -89,13 +90,13 @@ func (s *Client) loadArchive() error {
return fmt.Errorf("knowledge archive must contain exactly one .db and one .gob file")
}

s.DSN = types.ArchivePrefix + dbFile
s.VectorDBPath = types.ArchivePrefix + vectorStoreFile
s.DatabaseConfig.DSN = types.ArchivePrefix + dbFile
s.VectorDBConfig.DSN = types.ArchivePrefix + vectorStoreFile

return nil
}

func (s *Client) getClient() (client.Client, error) {
func (s *Client) getClient(ctx context.Context) (client.Client, error) {
if err := s.loadArchive(); err != nil {
return nil, err
}
Expand All @@ -111,7 +112,7 @@ func (s *Client) getClient() (client.Client, error) {
return nil, err
}

ds, err := datastore.NewDatastore(s.DSN, s.AutoMigrate == "true", s.VectorDBConfig.VectorDBPath, provider)
ds, err := datastore.NewDatastore(ctx, s.DatabaseConfig.DSN, s.AutoMigrate == "true", s.VectorDBConfig.DSN, provider)
if err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/create_dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func (s *ClientCreateDataset) Customize(cmd *cobra.Command) {
}

func (s *ClientCreateDataset) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/delete_dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func (s *ClientDeleteDataset) Customize(cmd *cobra.Command) {
}

func (s *ClientDeleteDataset) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/edit_dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cmd
import (
"encoding/json"
"fmt"

"github.com/gptscript-ai/knowledge/pkg/datastore"
"github.com/gptscript-ai/knowledge/pkg/index"
"github.com/spf13/cobra"
Expand All @@ -23,7 +24,7 @@ func (s *ClientEditDataset) Customize(cmd *cobra.Command) {
}

func (s *ClientEditDataset) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cmd

import (
"fmt"

"github.com/spf13/cobra"
)

Expand All @@ -17,7 +18,7 @@ func (s *ClientExportDatasets) Customize(cmd *cobra.Command) {
}

func (s *ClientExportDatasets) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/get_dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cmd
import (
"encoding/json"
"fmt"

"github.com/spf13/cobra"
)

Expand All @@ -19,7 +20,7 @@ func (s *ClientGetDataset) Customize(cmd *cobra.Command) {
}

func (s *ClientGetDataset) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func (s *ClientImportDatasets) Customize(cmd *cobra.Command) {
}

func (s *ClientImportDatasets) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ This is a constraint of the Vector Database and Similarity Search, as different
}

func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/list_datasets.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cmd
import (
"encoding/json"
"fmt"

"github.com/spf13/cobra"
)

Expand All @@ -18,7 +19,7 @@ func (s *ClientListDatasets) Customize(cmd *cobra.Command) {
}

func (s *ClientListDatasets) Run(cmd *cobra.Command, args []string) error {
c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
1 change: 0 additions & 1 deletion pkg/cmd/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ func (s *ClientLoad) Run(cmd *cobra.Command, args []string) error {

var texts []string
for _, doc := range docs {

if len(doc.Content) == 0 {
continue
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/reset.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func (s *ClientResetDatastore) Customize(cmd *cobra.Command) {
}

func (s *ClientResetDatastore) Run(cmd *cobra.Command, args []string) error {
dsn, vectordbPath, _, err := datastore.GetDatastorePaths(s.DSN, s.VectorDBConfig.VectorDBPath)
dsn, vectordbPath, _, err := datastore.GetDefaultDSNs(s.DatabaseConfig.DSN, s.VectorDBConfig.DSN)
if err != nil {
return err
}
Expand All @@ -34,6 +34,6 @@ func (s *ClientResetDatastore) Run(cmd *cobra.Command, args []string) error {
return fmt.Errorf("failed to remove vector database directory: %w", err)
}

fmt.Printf("Successfully reset datastore (DSN: %q, VectorDBPath: %q)\n", dsn, vectordbPath)
fmt.Printf("Successfully reset datastore (DSN: %q, DSN: %q)\n", dsn, vectordbPath)
return nil
}
2 changes: 1 addition & 1 deletion pkg/cmd/retrieve.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error {
}
slog.Info("Retrieving sources for query", "query", query, "datasets", datasetIDs)

c, err := s.getClient()
c, err := s.getClient(cmd.Context())
if err != nil {
return err
}
Expand Down
7 changes: 4 additions & 3 deletions pkg/cmd/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package cmd

import (
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings"
"github.com/spf13/cobra"
"log/slog"
"os/signal"
"syscall"

"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings"
"github.com/spf13/cobra"

"github.com/gptscript-ai/knowledge/pkg/config"
"github.com/gptscript-ai/knowledge/pkg/datastore"
"github.com/gptscript-ai/knowledge/pkg/server"
Expand Down Expand Up @@ -47,7 +48,7 @@ func (s *Server) Run(cmd *cobra.Command, _ []string) error {
return err
}

ds, err := datastore.NewDatastore(s.DSN, s.AutoMigrate == "true", s.VectorDBConfig.VectorDBPath, provider)
ds, err := datastore.NewDatastore(cmd.Context(), s.DatabaseConfig.DSN, s.AutoMigrate == "true", s.VectorDBConfig.DSN, provider)
if err != nil {
return fmt.Errorf("failed to initialize datastore: %w", err)
}
Expand Down
9 changes: 5 additions & 4 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package config

import (
"fmt"
"os"
"path"

"github.com/knadh/koanf/parsers/json"
"github.com/knadh/koanf/parsers/yaml"
"github.com/knadh/koanf/providers/rawbytes"
"github.com/knadh/koanf/v2"
"os"
"path"
)

type Config struct {
Expand All @@ -25,12 +26,12 @@ type ModelProviderConfig struct {
}

type DatabaseConfig struct {
DSN string `usage:"Server database connection string (default \"sqlite://$XDG_DATA_HOME/gptscript/knowledge/knowledge.db\")" default:"" env:"KNOW_DB_DSN"`
DSN string `name:"index-dsn" usage:"Index Database Connection string (relational DB) (default \"sqlite://$XDG_DATA_HOME/gptscript/knowledge/knowledge.db\")" default:"" env:"KNOW_INDEX_DSN"`
AutoMigrate string `usage:"Auto migrate database" default:"true" env:"KNOW_DB_AUTO_MIGRATE"`
}

type VectorDBConfig struct {
VectorDBPath string `usage:"VectorDBPath to the vector database (default \"$XDG_DATA_HOME/gptscript/knowledge/vector.db\")" default:"" env:"KNOW_VECTOR_DB_PATH"`
DSN string `name:"vector-dsn" usage:"DSN to the vector database (default \"chromem:$XDG_DATA_HOME/gptscript/knowledge/vector.db\")" default:"" env:"KNOW_VECTOR_DSN"`
}

func LoadConfig(configFile string) (*Config, error) {
Expand Down
Loading

0 comments on commit ae12f51

Please sign in to comment.