Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: retrieval flow
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed May 29, 2024
1 parent 9a85b5c commit f6a45ef
Show file tree
Hide file tree
Showing 35 changed files with 1,289 additions and 101 deletions.
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ replace (
)

require (
dario.cat/mergo v1.0.0
github.com/acorn-io/cmd v0.0.0-20240404013709-34f690bde37b
github.com/acorn-io/z v0.0.0-20231104012607-4cab1b3ec5e5
github.com/adrg/xdg v0.4.0
Expand All @@ -19,6 +20,7 @@ require (
github.com/hupe1980/golc v0.0.110
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
github.com/lu4p/cat v0.1.5
github.com/mitchellh/mapstructure v1.5.0
github.com/philippgille/chromem-go v0.6.0
github.com/spf13/cobra v1.8.0
github.com/stretchr/testify v1.9.0
Expand All @@ -27,14 +29,12 @@ require (
github.com/swaggo/swag v1.16.3
github.com/tmc/langchaingo v0.1.8
golang.org/x/sync v0.7.0
gopkg.in/yaml.v3 v3.0.1
gorm.io/gorm v1.25.9
sigs.k8s.io/yaml v1.4.0
)

require (
cloud.google.com/go/ai v0.4.0 // indirect
dario.cat/mergo v1.0.0 // indirect
github.com/AssemblyAI/assemblyai-go-sdk v1.3.0 // indirect
github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
Expand Down Expand Up @@ -97,7 +97,6 @@ require (
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/microcosm-cc/bluemonday v1.0.26 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/olekukonko/tablewriter v0.0.5 // indirect
Expand Down Expand Up @@ -138,6 +137,7 @@ require (
google.golang.org/grpc v1.63.2 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
modernc.org/libc v1.22.5 // indirect
modernc.org/mathutil v1.5.0 // indirect
modernc.org/memory v1.5.0 // indirect
Expand Down
11 changes: 11 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
cloud.google.com/go v0.112.1 h1:uJSeirPke5UNZHIb4SxfZklVSiWWVqW4oXlETwZziwM=
cloud.google.com/go/ai v0.4.0 h1:hoF8+joXKfW2Ug7MKssoffXCVUSxUqMUJL0hJxVtO1Q=
cloud.google.com/go/ai v0.4.0/go.mod h1:iX72tmUodGXVDxRDCGUZEPiB9HaMeERXkOdgCkUi8sA=
cloud.google.com/go/aiplatform v1.66.0 h1:bbFYY4JInclG10czRFUYj2rjD+obhh3Gi9zVlyoMgEc=
cloud.google.com/go/aiplatform v1.66.0/go.mod h1:bPQS0UjaXaTAq57UgP3XWDCtYFOIbXXpkMsl6uP4JAc=
cloud.google.com/go/longrunning v0.5.6 h1:xAe8+0YaWoCKr9t1+aWe+OeQgN/iJK1fEgZSXmjuEaE=
cloud.google.com/go/longrunning v0.5.6/go.mod h1:vUaDrWYOMKRuhiv6JBnn49YxCPz2Ayn9GqyjaBT8/mA=
dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk=
dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
github.com/AssemblyAI/assemblyai-go-sdk v1.3.0 h1:AtOVgGxUycvK4P4ypP+1ZupecvFgnfH+Jsum0o5ILoU=
Expand Down Expand Up @@ -39,6 +44,8 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 h1:sHmMWWX5E7guWEFQ9SV
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4/go.mod h1:WjpDrhWisWOIoS9n3nk67A3Ll1vfULJ9Kq6h29HTD48=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.7.3 h1:Ch31Jl96ULFDPPe7nLHnxmImjO9I8bjlPAECb1Wrx6E=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.7.3/go.mod h1:yHTz9jyvT5dT3sUHovWTWjv332k7FvD8MPk7xWEkbnQ=
github.com/aws/aws-sdk-go-v2/service/sagemakerruntime v1.27.3 h1:bvRsyhuEYCvyg3i89U0gRqdoJqtRoH5RK1hwOmmCCsM=
github.com/aws/aws-sdk-go-v2/service/sagemakerruntime v1.27.3/go.mod h1:ljK0mx70pj1+eBVh2tyJM+VzZooiZZgQLMIT+PHJSho=
github.com/aws/aws-sdk-go-v2/service/textract v1.30.4 h1:rz8lRkt7F0iKCGRbpL3NwbqGKUyX6rPrw1Nk1bO9tKc=
github.com/aws/aws-sdk-go-v2/service/textract v1.30.4/go.mod h1:PFUGOzwfe10atRNGR9abVqiKPAPSfKgu/LF0oDBflsY=
github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw=
Expand Down Expand Up @@ -157,6 +164,8 @@ github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
github.com/gorilla/websocket v1.4.1 h1:q7AeDBpnBk8AogcD4DSag/Ukw/KV+YhzLj2bP5HvKCM=
github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/hupe1980/go-huggingface v0.0.15 h1:tTWmUGGunC/BYz4hrwS8SSVtMYVYjceG2uhL8HxeXvw=
github.com/hupe1980/go-huggingface v0.0.15/go.mod h1:IRvsik3+b9BJyw9hCfw1arI6gDObcVto1UA8f3kt8mM=
github.com/hupe1980/go-promptlayer v0.0.6 h1:cga58zaQYPz7wo7EZG1a0goBj7OzoE5s3HT2Dl1Wp6g=
github.com/hupe1980/go-promptlayer v0.0.6/go.mod h1:tiCI1t6OkHe9Qio95G5+DMPOONboRJ67sSn1F0afqcU=
github.com/hupe1980/go-textractor v0.0.9 h1:hrVh7HnANQ6SZVziLtjOhfK00Xe+prVGuGLui6/6FqQ=
Expand Down Expand Up @@ -386,6 +395,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/api v0.176.1 h1:DJSXnV6An+NhJ1J+GWtoF2nHEuqB1VNoTfnIbjNvwD4=
google.golang.org/api v0.176.1/go.mod h1:j2MaSDYcvYV1lkZ1+SMW4IeF90SrEyFA+tluDYWRrFg=
google.golang.org/genproto v0.0.0-20240325203815-454cdb8f5daa h1:ePqxpG3LVx+feAUOx8YmR5T7rc0rdzK8DyxM8cQ9zq0=
google.golang.org/genproto v0.0.0-20240325203815-454cdb8f5daa/go.mod h1:CnZenrTdRJb7jc+jOm0Rkywq+9wh0QC4U8tyiRbEPPM=
google.golang.org/genproto/googleapis/api v0.0.0-20240415180920-8c6c420018be h1:Zz7rLWqp0ApfsR/l7+zSHhY3PMiH2xqgxlfYfAfNpoU=
google.golang.org/genproto/googleapis/api v0.0.0-20240415180920-8c6c420018be/go.mod h1:dvdCTIoAGbkWbcIKBniID56/7XHTt6WfxXNMxuziJ+w=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240415180920-8c6c420018be h1:LG9vZxsWGOmUKieR8wPAUR3u3MpnYFQZROPIMaXh7/A=
Expand Down
8 changes: 2 additions & 6 deletions pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,14 @@ type IngestPathsOpts struct {
IngestionFlows []flows.IngestionFlow
}

type RetrieveOpts struct {
TopK int
}

type Client interface {
CreateDataset(ctx context.Context, datasetID string) (types.Dataset, error)
DeleteDataset(ctx context.Context, datasetID string) error
GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error)
ListDatasets(ctx context.Context) ([]types.Dataset, error)
Ingest(ctx context.Context, datasetID string, data []byte, opts datastore.IngestOpts) ([]string, error)
IngestPaths(ctx context.Context, datasetID string, opts *IngestPathsOpts, paths ...string) (int, error) // returns number of files ingested
AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *RetrieveOpts) ([]vectorstore.Document, error)
AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *datastore.RetrieveOpts) ([]vectorstore.Document, error)
DeleteDocuments(ctx context.Context, datasetID string, documentIDs ...string) error
Retrieve(ctx context.Context, datasetID string, query string, opts RetrieveOpts) ([]vectorstore.Document, error)
Retrieve(ctx context.Context, datasetID string, query string, opts datastore.RetrieveOpts) ([]vectorstore.Document, error)
}
3 changes: 2 additions & 1 deletion pkg/client/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"crypto/sha1"
"encoding/hex"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore"
"github.com/gptscript-ai/knowledge/pkg/vectorstore"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
Expand Down Expand Up @@ -102,7 +103,7 @@ func HashPath(path string) string {
return hex.EncodeToString(hashBytes)
}

func AskDir(ctx context.Context, c Client, path string, query string, opts *IngestPathsOpts, ropts *RetrieveOpts) ([]vectorstore.Document, error) {
func AskDir(ctx context.Context, c Client, path string, query string, opts *IngestPathsOpts, ropts *datastore.RetrieveOpts) ([]vectorstore.Document, error) {
abspath, err := filepath.Abs(path)
if err != nil {
return nil, fmt.Errorf("failed to get absolute path from %q: %w", path, err)
Expand Down
4 changes: 2 additions & 2 deletions pkg/client/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ func (c *DefaultClient) DeleteDocuments(_ context.Context, datasetID string, doc
return nil
}

func (c *DefaultClient) Retrieve(_ context.Context, datasetID string, query string, opts RetrieveOpts) ([]vectorstore.Document, error) {
func (c *DefaultClient) Retrieve(_ context.Context, datasetID string, query string, opts datastore.RetrieveOpts) ([]vectorstore.Document, error) {
q := types.Query{Prompt: query}

if opts.TopK != 0 {
Expand All @@ -189,7 +189,7 @@ func (c *DefaultClient) Retrieve(_ context.Context, datasetID string, query stri
return docs, nil
}

func (c *DefaultClient) AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *RetrieveOpts) ([]vectorstore.Document, error) {
func (c *DefaultClient) AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *datastore.RetrieveOpts) ([]vectorstore.Document, error) {
return AskDir(ctx, c, path, query, opts, ropts)
}

Expand Down
6 changes: 3 additions & 3 deletions pkg/client/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,10 @@ func (c *StandaloneClient) DeleteDocuments(ctx context.Context, datasetID string
return nil
}

func (c *StandaloneClient) Retrieve(ctx context.Context, datasetID string, query string, opts RetrieveOpts) ([]vectorstore.Document, error) {
return c.Datastore.Retrieve(ctx, datasetID, query, opts.TopK)
func (c *StandaloneClient) Retrieve(ctx context.Context, datasetID string, query string, opts datastore.RetrieveOpts) ([]vectorstore.Document, error) {
return c.Datastore.Retrieve(ctx, datasetID, query, opts)
}

func (c *StandaloneClient) AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *RetrieveOpts) ([]vectorstore.Document, error) {
func (c *StandaloneClient) AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *datastore.RetrieveOpts) ([]vectorstore.Document, error) {
return AskDir(ctx, c, path, query, opts, ropts)
}
16 changes: 12 additions & 4 deletions pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"github.com/acorn-io/z"
"github.com/gptscript-ai/knowledge/pkg/client"
"github.com/gptscript-ai/knowledge/pkg/datastore"
flowconfig "github.com/gptscript-ai/knowledge/pkg/flows/config"
"github.com/spf13/cobra"
"log/slog"
Expand Down Expand Up @@ -41,7 +42,7 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
Recursive: s.Recursive,
}

retrieveOpts := &client.RetrieveOpts{
retrieveOpts := &datastore.RetrieveOpts{
TopK: s.TopK,
}

Expand Down Expand Up @@ -70,11 +71,18 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
}
ingestOpts.IngestionFlows = append(ingestOpts.IngestionFlows, z.Dereference(ingestionFlow))
}

// TODO: add retrieval flows here

slog.Debug("Loaded ingestion flows from config", "flows_file", s.FlowsFile, "dataset", datasetID, "flows", len(ingestOpts.IngestionFlows))

if flow.Retrieval == nil {
slog.Info("No retrieval config in assigned flow", "flows_file", s.FlowsFile, "dataset", datasetID)
} else {
rf, err := flow.Retrieval.AsRetrievalFlow()
if err != nil {
return err
}
retrieveOpts.RetrievalFlow = rf
slog.Debug("Loaded retrieval flow from config", "flows_file", s.FlowsFile, "dataset", datasetID)
}
}

sources, err := c.AskDirectory(cmd.Context(), path, query, ingestOpts, retrieveOpts)
Expand Down
1 change: 0 additions & 1 deletion pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
}

slog.Debug("Loaded ingestion flows from config", "flows_file", s.FlowsFile, "dataset", datasetID, "flows", len(ingestOpts.IngestionFlows))

}

filesIngested, err := c.IngestPaths(cmd.Context(), datasetID, ingestOpts, filePath)
Expand Down
34 changes: 32 additions & 2 deletions pkg/cmd/retrieve.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@ package cmd
import (
"encoding/json"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/client"
"github.com/gptscript-ai/knowledge/pkg/datastore"
flowconfig "github.com/gptscript-ai/knowledge/pkg/flows/config"
"github.com/spf13/cobra"
"log/slog"
)

type ClientRetrieve struct {
Client
Dataset string `usage:"Target Dataset ID" short:"d" default:"default" env:"KNOW_TARGET_DATASET"`
ClientRetrieveOpts
FlowsFile string `usage:"Path to a YAML/JSON file containing retrieval flows" env:"KNOW_FLOWS_FILE"`
}

type ClientRetrieveOpts struct {
Expand All @@ -32,7 +35,34 @@ func (s *ClientRetrieve) Run(cmd *cobra.Command, args []string) error {
datasetID := s.Dataset
query := args[0]

sources, err := c.Retrieve(cmd.Context(), datasetID, query, client.RetrieveOpts{TopK: s.TopK})
retrieveOpts := datastore.RetrieveOpts{
TopK: s.TopK,
}

if s.FlowsFile != "" {
slog.Debug("Loading retrieval flows from config", "flows_file", s.FlowsFile, "dataset", datasetID)
flowCfg, err := flowconfig.FromFile(s.FlowsFile)
if err != nil {
return err
}
flow, err := flowCfg.ForDataset(datasetID) // get flow for the dataset
if err != nil {
return err
}

if flow.Retrieval == nil {
slog.Info("No retrieval config in assigned flow", "flows_file", s.FlowsFile, "dataset", datasetID)
} else {
rf, err := flow.Retrieval.AsRetrievalFlow()
if err != nil {
return err
}
retrieveOpts.RetrievalFlow = rf
slog.Debug("Loaded retrieval flow from config", "flows_file", s.FlowsFile, "dataset", datasetID)
}
}

sources, err := c.Retrieve(cmd.Context(), datasetID, query, retrieveOpts)
if err != nil {
return err
}
Expand Down
9 changes: 4 additions & 5 deletions pkg/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@ import (
"github.com/adrg/xdg"
"github.com/gptscript-ai/knowledge/pkg/config"
"github.com/gptscript-ai/knowledge/pkg/index"
llm2 "github.com/gptscript-ai/knowledge/pkg/llm"
"github.com/gptscript-ai/knowledge/pkg/llm"
"github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/gptscript-ai/knowledge/pkg/vectorstore/chromem"
"github.com/hupe1980/golc/schema"
cg "github.com/philippgille/chromem-go"
"log/slog"
"net/url"
)

type Datastore struct {
LLM schema.ChatModel
LLM llm.LLM
Index *index.DB
Vectorstore vectorstore.VectorStore
}
Expand Down Expand Up @@ -97,13 +96,13 @@ func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfi
)
}

llm, err := llm2.NewOpenAI(openAIConfig)
model, err := llm.NewOpenAI(openAIConfig)
if err != nil {
return nil, fmt.Errorf("failed to create LLM: %w", err)
}

ds := &Datastore{
LLM: llm,
LLM: *model,
Index: idx,
Vectorstore: chromem.New(vsdb, embeddingFunc),
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/datastore/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package defaults

const (
EmbeddingDimension int = 1536
TopK int = 5
TopK int = 10

TextSplitterTokenModel = "gpt-4"
TextSplitterChunkSize = 1024
Expand Down
1 change: 0 additions & 1 deletion pkg/datastore/filetypes/filetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ var FirstclassFileExtensions = map[string]struct{}{

// GetFiletype returns the filetype of a file based on its filename or content.
func GetFiletype(filename string, content []byte) (string, error) {

// 1. By file extension, if available and first-class supported
ext := path.Ext(filename)
if _, ok := FirstclassFileExtensions[ext]; ok {
Expand Down
41 changes: 1 addition & 40 deletions pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/transformers"
"github.com/gptscript-ai/knowledge/pkg/flows"
"github.com/gptscript-ai/knowledge/pkg/index"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"io"
"log/slog"
)

Expand Down Expand Up @@ -96,7 +94,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": filename}}
ingestionFlow.Transformations = append(ingestionFlow.Transformations, em)

docs, err := GetDocuments(ctx, bytes.NewReader(content), ingestionFlow)
docs, err := ingestionFlow.Run(ctx, bytes.NewReader(content))
if err != nil {
slog.Error("Failed to load documents", "error", err)
return nil, fmt.Errorf("failed to load documents: %w", err)
Expand Down Expand Up @@ -150,40 +148,3 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte

return docIDs, nil
}

func GetDocuments(ctx context.Context, reader io.Reader, ingestionFlow flows.IngestionFlow) ([]vs.Document, error) {
var err error
var docs []vs.Document

/*
* Load documents from the content
* For now, we're using documentloaders from both langchaingo and golc
* and translate them to our document schema.
*/

docs, err = ingestionFlow.Load(ctx, reader)
if err != nil {
slog.Error("Failed to load documents", "error", err)
return nil, fmt.Errorf("failed to load documents: %w", err)
}

/*
* Split documents - Chunking
*/
docs, err = ingestionFlow.Split(docs)
if err != nil {
slog.Error("Failed to split documents", "error", err)
return nil, fmt.Errorf("failed to split documents: %w", err)
}

/*
* Transform documents
*/
docs, err = ingestionFlow.Transform(ctx, docs)
if err != nil {
slog.Error("Failed to transform documents", "error", err)
return nil, fmt.Errorf("failed to transform documents: %w", err)
}

return docs, nil
}
2 changes: 1 addition & 1 deletion pkg/datastore/ingest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func TestExtractPDF(t *testing.T) {
em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": d.Name()}}
ingestionFlow.Transformations = append(ingestionFlow.Transformations, em)

docs, err := GetDocuments(ctx, f, ingestionFlow)
docs, err := ingestionFlow.Run(ctx, f)
require.NoError(t, err, "GetDocuments() error = %v", err)
require.NotEmpty(t, docs, "GetDocuments() returned no documents")
return nil
Expand Down
1 change: 1 addition & 0 deletions pkg/datastore/postprocessors/postprocessors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package postprocessors
Loading

0 comments on commit f6a45ef

Please sign in to comment.