From 1d37ee3cba1ea8bda3afe59a830c3c3ff3a2805d Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Mon, 27 May 2024 12:21:12 +0200 Subject: [PATCH] add: filetype detection + default flows --- pkg/datastore/filetypes/filetypes.go | 41 ++++++++++++ pkg/datastore/ingest.go | 59 ++--------------- pkg/flows/config/config.go | 64 +++++++++++++++---- pkg/flows/config/config_test.go | 5 ++ .../testdata/invalid_doubledefault.yaml | 23 +++++++ pkg/flows/config/testdata/valid.yaml | 1 + 6 files changed, 129 insertions(+), 64 deletions(-) create mode 100644 pkg/datastore/filetypes/filetypes.go create mode 100644 pkg/flows/config/testdata/invalid_doubledefault.yaml diff --git a/pkg/datastore/filetypes/filetypes.go b/pkg/datastore/filetypes/filetypes.go new file mode 100644 index 00000000..30cd4330 --- /dev/null +++ b/pkg/datastore/filetypes/filetypes.go @@ -0,0 +1,41 @@ +package filetypes + +import ( + "fmt" + "github.com/gabriel-vasile/mimetype" + "log/slog" + "path" + "strings" +) + +var FirstclassFileExtensions = map[string]struct{}{ + ".pdf": {}, + ".html": {}, + ".md": {}, + ".txt": {}, + ".docx": {}, + ".odt": {}, + ".rtf": {}, + ".csv": {}, + ".ipynb": {}, + ".json": {}, +} + +// GetFiletype returns the filetype of a file based on its filename or content. +func GetFiletype(filename string, content []byte) (string, error) { + + // 1. By file extension, if available and first-class supported + ext := path.Ext(filename) + if _, ok := FirstclassFileExtensions[ext]; ok { + return ext, nil + } + + // 2. By content (mimetype) + mt := mimetype.Detect(content) + if mt != nil { + return strings.Split(mt.String(), ";")[0], nil // remove charset (mimetype), e.g. from "text/plain; charset=utf-8" + } + + slog.Error("Failed to detect filetype", "filename", filename) + return "", fmt.Errorf("failed to detect filetype") +} diff --git a/pkg/datastore/ingest.go b/pkg/datastore/ingest.go index 018b6528..357bd5bb 100644 --- a/pkg/datastore/ingest.go +++ b/pkg/datastore/ingest.go @@ -7,9 +7,9 @@ import ( "errors" "fmt" "github.com/acorn-io/z" - "github.com/gabriel-vasile/mimetype" "github.com/google/uuid" "github.com/gptscript-ai/knowledge/pkg/datastore/documentloader" + "github.com/gptscript-ai/knowledge/pkg/datastore/filetypes" "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter" "github.com/gptscript-ai/knowledge/pkg/datastore/transformers" "github.com/gptscript-ai/knowledge/pkg/datastore/types" @@ -22,31 +22,16 @@ import ( lcgodocloaders "github.com/tmc/langchaingo/documentloaders" "io" "log/slog" - "path" "strings" ) -const () - -var firstclassFileExtensions = map[string]struct{}{ - ".pdf": {}, - ".html": {}, - ".md": {}, - ".txt": {}, - ".docx": {}, - ".odt": {}, - ".rtf": {}, - ".csv": {}, - ".ipynb": {}, - ".json": {}, -} - type IngestOpts struct { Filename *string FileMetadata *index.FileMetadata IsDuplicateFuncName string IsDuplicateFunc IsDuplicateFunc TextSplitterOpts *textsplitter.TextSplitterOpts + IngestionFlow *flows.IngestionFlow } // Ingest loads a document from a reader and adds it to the dataset. @@ -73,27 +58,11 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte /* * Detect filetype */ - reader := bytes.NewReader(content) - var filetype string - if opts.Filename != nil { - filetype = path.Ext(*opts.Filename) - if _, ok := firstclassFileExtensions[filetype]; !ok { - filetype = "" - } - } - if filetype == "" { - filetype, _, err = mimetypeFromReader(bytes.NewReader(content)) - if err != nil { - slog.Error("Failed to detect filetype", "error", err) - return nil, fmt.Errorf("failed to detect filetype: %w", err) - } - } - if filetype == "" { - slog.Error("Failed to detect filetype", "filename", *opts.Filename) - return nil, fmt.Errorf("failed to detect filetype") - } - filetype = strings.Split(filetype, ";")[0] // remove charset (mimetype), e.g. from "text/plain; charset=utf-8" + filetype, err := filetypes.GetFiletype(*opts.Filename, content) + if err != nil { + return nil, err + } /* * Set filename if not provided @@ -127,7 +96,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": *opts.Filename}} ingestionFlow.Transformations = append(ingestionFlow.Transformations, em) - docs, err := GetDocuments(ctx, reader, ingestionFlow) + docs, err := GetDocuments(ctx, bytes.NewReader(content), ingestionFlow) if err != nil { slog.Error("Failed to load documents", "error", err) return nil, fmt.Errorf("failed to load documents: %w", err) @@ -182,20 +151,6 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte return docIDs, nil } -// mimetypeFromReader returns the MIME type of input and a new reader which still has the whole input -func mimetypeFromReader(reader io.Reader) (string, io.Reader, error) { - header := bytes.NewBuffer(nil) - mtype, err := mimetype.DetectReader(io.TeeReader(reader, header)) - if err != nil { - return "", nil, err - } - - // Get back complete input reader - newReader := io.MultiReader(header, reader) - - return mtype.String(), newReader, err -} - func DefaultDocLoaderFunc(filetype string) func(ctx context.Context, reader io.Reader) ([]vs.Document, error) { switch filetype { case ".pdf", "application/pdf": diff --git a/pkg/flows/config/config.go b/pkg/flows/config/config.go index 68ed4030..8512b086 100644 --- a/pkg/flows/config/config.go +++ b/pkg/flows/config/config.go @@ -2,6 +2,7 @@ package config import ( "encoding/json" + "fmt" "github.com/gptscript-ai/knowledge/pkg/datastore/documentloader" "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter" "github.com/gptscript-ai/knowledge/pkg/flows" @@ -14,14 +15,10 @@ type FlowConfig struct { Flows map[string]FlowConfigEntry `json:"flows" yaml:"flows" mapstructure:"flows"` } -type DocumentLoaderConfig struct { - Name string `json:"name" yaml:"name" mapstructure:"name"` - Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"` -} - -type TextSplitterConfig struct { - Name string `json:"name" yaml:"name" mapstructure:"name"` - Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"` +type FlowConfigEntry struct { + Default bool `json:"default,omitempty" yaml:"default" mapstructure:"default"` + Ingestion []IngestionFlowConfig `json:"ingestion,omitempty" yaml:"ingestion" mapstructure:"ingestion"` + Retrieval *RetrievalFlowConfig `json:"retrieval,omitempty" yaml:"retrieval" mapstructure:"retrieval"` } type IngestionFlowConfig struct { @@ -33,9 +30,14 @@ type IngestionFlowConfig struct { type RetrievalFlowConfig struct{} -type FlowConfigEntry struct { - Ingestion []IngestionFlowConfig `json:"ingestion,omitempty" yaml:"ingestion" mapstructure:"ingestion"` - Retrieval RetrievalFlowConfig `json:"retrieval,omitempty" yaml:"retrieval" mapstructure:"retrieval"` +type DocumentLoaderConfig struct { + Name string `json:"name" yaml:"name" mapstructure:"name"` + Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"` +} + +type TextSplitterConfig struct { + Name string `json:"name" yaml:"name" mapstructure:"name"` + Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"` } // FromFile reads a configuration file and returns a FlowConfig. @@ -59,9 +61,45 @@ func FromFile(filename string) (*FlowConfig, error) { return nil, err } - return &config, nil + return &config, config.Validate() +} + +func (f *FlowConfig) Validate() error { + defaultCount := 0 + for name, flow := range f.Flows { + if flow.Default { + defaultCount++ + } + + if len(flow.Ingestion) == 0 && flow.Retrieval == nil { + return fmt.Errorf("flow %q has neither ingestion nor retrieval specified", name) + } + + } + if defaultCount > 1 { + return fmt.Errorf("only one flow can be default, found %d", defaultCount) + } + return nil +} + +func (f *FlowConfig) GetDefaultFlowConfigEntry() (*FlowConfigEntry, error) { + for _, flow := range f.Flows { + if flow.Default { + return &flow, nil + } + } + return nil, fmt.Errorf("default flow not found") } +func (f *FlowConfig) GetFlow(name string) (*FlowConfigEntry, error) { + flow, ok := f.Flows[name] + if !ok { + return nil, fmt.Errorf("flow %q not found", name) + } + return &flow, nil +} + +// AsIngestionFlow converts an IngestionFlowConfig to an actual flows.IngestionFlow. func (i *IngestionFlowConfig) AsIngestionFlow() (*flows.IngestionFlow, error) { flow := &flows.IngestionFlow{} if i.DocumentLoader.Name != "" { @@ -110,5 +148,7 @@ func (i *IngestionFlowConfig) AsIngestionFlow() (*flows.IngestionFlow, error) { flow.Split = splitterFunc } + // TODO: Transformers + return flow, nil } diff --git a/pkg/flows/config/config_test.go b/pkg/flows/config/config_test.go index d63ae2d6..2699ee4d 100644 --- a/pkg/flows/config/config_test.go +++ b/pkg/flows/config/config_test.go @@ -35,3 +35,8 @@ func TestLoadConfigFromNonexistentFile(t *testing.T) { assert.Error(t, err) assert.Nil(t, cfg) } + +func TestLoadConfigInvalidDoubleDefault(t *testing.T) { + _, err := FromFile("testdata/invalid_doubledefault.yaml") + assert.Error(t, err) +} diff --git a/pkg/flows/config/testdata/invalid_doubledefault.yaml b/pkg/flows/config/testdata/invalid_doubledefault.yaml new file mode 100644 index 00000000..57eebb60 --- /dev/null +++ b/pkg/flows/config/testdata/invalid_doubledefault.yaml @@ -0,0 +1,23 @@ +flows: + flow1: + default: true + ingestion: + - filetypes: [".txt", ".md"] + documentLoader: + name: "textLoader" + textSplitter: + name: "simpleSplitter" + transformers: + - "transformer1" + - "transformer2" + retrieval: {} + flow2: + default: true + ingestion: + - filetypes: [".json"] + documentLoader: + name: "jsonLoader" + textSplitter: + name: "jsonSplitter" + transformers: ["transformer3"] + retrieval: {} \ No newline at end of file diff --git a/pkg/flows/config/testdata/valid.yaml b/pkg/flows/config/testdata/valid.yaml index f587f5ad..18ac9aff 100644 --- a/pkg/flows/config/testdata/valid.yaml +++ b/pkg/flows/config/testdata/valid.yaml @@ -1,5 +1,6 @@ flows: flow1: + default: true ingestion: - filetypes: [".txt", ".md"] documentLoader: