Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: filetype detection + default flows
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed May 27, 2024
1 parent 38556c6 commit 1d37ee3
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 64 deletions.
41 changes: 41 additions & 0 deletions pkg/datastore/filetypes/filetypes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package filetypes

import (
"fmt"
"github.com/gabriel-vasile/mimetype"
"log/slog"
"path"
"strings"
)

// FirstclassFileExtensions is the set of file extensions (including the
// leading dot) that GetFiletype returns directly from the filename, without
// inspecting the file content.
var FirstclassFileExtensions = map[string]struct{}{
".pdf": {},
".html": {},
".md": {},
".txt": {},
".docx": {},
".odt": {},
".rtf": {},
".csv": {},
".ipynb": {},
".json": {},
}

// GetFiletype determines the filetype of a file from its name or, failing
// that, from its content.
//
// Detection order:
//  1. If the extension of filename, compared case-insensitively, is listed in
//     FirstclassFileExtensions, the lowercased extension is returned
//     (e.g. ".pdf" for both "report.pdf" and "REPORT.PDF").
//  2. Otherwise the MIME type detected from content is returned with any
//     parameters stripped (e.g. "text/plain" for "text/plain; charset=utf-8").
//
// An error is returned only if content detection yields no MIME type at all.
func GetFiletype(filename string, content []byte) (string, error) {
	// 1. By file extension, if available and first-class supported.
	// The extension is lowercased so that differently-cased names still hit
	// the lookup table, whose keys are all lowercase.
	ext := strings.ToLower(path.Ext(filename))
	if _, ok := FirstclassFileExtensions[ext]; ok {
		return ext, nil
	}

	// 2. By content (mimetype)
	if mt := mimetype.Detect(content); mt != nil {
		// Remove parameters such as the charset, keeping only "type/subtype".
		base, _, _ := strings.Cut(mt.String(), ";")
		return base, nil
	}

	slog.Error("Failed to detect filetype", "filename", filename)
	return "", fmt.Errorf("failed to detect filetype")
}
59 changes: 7 additions & 52 deletions pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import (
"errors"
"fmt"
"github.com/acorn-io/z"
"github.com/gabriel-vasile/mimetype"
"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
"github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter"
"github.com/gptscript-ai/knowledge/pkg/datastore/transformers"
"github.com/gptscript-ai/knowledge/pkg/datastore/types"
Expand All @@ -22,31 +22,16 @@ import (
lcgodocloaders "github.com/tmc/langchaingo/documentloaders"
"io"
"log/slog"
"path"
"strings"
)

const ()

// firstclassFileExtensions is the set of file extensions (including the
// leading dot) that Ingest accepts as the filetype straight from the
// filename; any other extension is discarded and the filetype is detected
// from the content instead.
var firstclassFileExtensions = map[string]struct{}{
".pdf": {},
".html": {},
".md": {},
".txt": {},
".docx": {},
".odt": {},
".rtf": {},
".csv": {},
".ipynb": {},
".json": {},
}

// IngestOpts carries the options read by Datastore.Ingest.
type IngestOpts struct {
// Filename is the name of the file being ingested. Ingest dereferences it
// for filetype detection and records it as "filename" metadata.
Filename *string
FileMetadata *index.FileMetadata
// NOTE(review): presumably IsDuplicateFuncName selects a registered
// duplicate check by name while IsDuplicateFunc supplies one directly —
// confirm against Ingest.
IsDuplicateFuncName string
IsDuplicateFunc IsDuplicateFunc
TextSplitterOpts *textsplitter.TextSplitterOpts
// NOTE(review): presumably IngestionFlow overrides the flow Ingest would
// otherwise pick for the detected filetype — confirm against Ingest.
IngestionFlow *flows.IngestionFlow
}

// Ingest loads a document from a reader and adds it to the dataset.
Expand All @@ -73,27 +58,11 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
/*
* Detect filetype
*/
reader := bytes.NewReader(content)
var filetype string
if opts.Filename != nil {
filetype = path.Ext(*opts.Filename)
if _, ok := firstclassFileExtensions[filetype]; !ok {
filetype = ""
}
}
if filetype == "" {
filetype, _, err = mimetypeFromReader(bytes.NewReader(content))
if err != nil {
slog.Error("Failed to detect filetype", "error", err)
return nil, fmt.Errorf("failed to detect filetype: %w", err)
}
}
if filetype == "" {
slog.Error("Failed to detect filetype", "filename", *opts.Filename)
return nil, fmt.Errorf("failed to detect filetype")
}

filetype = strings.Split(filetype, ";")[0] // remove charset (mimetype), e.g. from "text/plain; charset=utf-8"
filetype, err := filetypes.GetFiletype(*opts.Filename, content)
if err != nil {
return nil, err
}

/*
* Set filename if not provided
Expand Down Expand Up @@ -127,7 +96,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
em := &transformers.ExtraMetadata{Metadata: map[string]any{"filename": *opts.Filename}}
ingestionFlow.Transformations = append(ingestionFlow.Transformations, em)

docs, err := GetDocuments(ctx, reader, ingestionFlow)
docs, err := GetDocuments(ctx, bytes.NewReader(content), ingestionFlow)
if err != nil {
slog.Error("Failed to load documents", "error", err)
return nil, fmt.Errorf("failed to load documents: %w", err)
Expand Down Expand Up @@ -182,20 +151,6 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte
return docIDs, nil
}

// mimetypeFromReader detects the MIME type of the data behind reader and
// hands back a reader that still yields the complete input.
func mimetypeFromReader(reader io.Reader) (string, io.Reader, error) {
	// Every byte the detector consumes is mirrored into sniffed so that it
	// can be replayed afterwards.
	var sniffed bytes.Buffer
	tee := io.TeeReader(reader, &sniffed)

	detected, err := mimetype.DetectReader(tee)
	if err != nil {
		return "", nil, err
	}

	// Replay the consumed prefix first, then continue with the unread rest.
	return detected.String(), io.MultiReader(&sniffed, reader), nil
}

func DefaultDocLoaderFunc(filetype string) func(ctx context.Context, reader io.Reader) ([]vs.Document, error) {
switch filetype {
case ".pdf", "application/pdf":
Expand Down
64 changes: 52 additions & 12 deletions pkg/flows/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package config

import (
"encoding/json"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
"github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter"
"github.com/gptscript-ai/knowledge/pkg/flows"
Expand All @@ -14,14 +15,10 @@ type FlowConfig struct {
Flows map[string]FlowConfigEntry `json:"flows" yaml:"flows" mapstructure:"flows"`
}

// DocumentLoaderConfig identifies a document loader by name and carries its
// free-form options, decodable from JSON, YAML or mapstructure input.
type DocumentLoaderConfig struct {
Name string `json:"name" yaml:"name" mapstructure:"name"`
Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"`
}

type TextSplitterConfig struct {
Name string `json:"name" yaml:"name" mapstructure:"name"`
Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"`
// FlowConfigEntry is a single named flow inside a FlowConfig.
type FlowConfigEntry struct {
// Default marks this flow as the default one; Validate rejects a config
// in which more than one flow sets it.
Default bool `json:"default,omitempty" yaml:"default" mapstructure:"default"`
// Ingestion lists the ingestion flow definitions of this flow.
Ingestion []IngestionFlowConfig `json:"ingestion,omitempty" yaml:"ingestion" mapstructure:"ingestion"`
// Retrieval is a pointer; Validate treats a nil value as "not specified".
Retrieval *RetrievalFlowConfig `json:"retrieval,omitempty" yaml:"retrieval" mapstructure:"retrieval"`
}

type IngestionFlowConfig struct {
Expand All @@ -33,9 +30,14 @@ type IngestionFlowConfig struct {

// RetrievalFlowConfig describes the retrieval part of a flow. It currently
// has no fields.
type RetrievalFlowConfig struct{}

type FlowConfigEntry struct {
Ingestion []IngestionFlowConfig `json:"ingestion,omitempty" yaml:"ingestion" mapstructure:"ingestion"`
Retrieval RetrievalFlowConfig `json:"retrieval,omitempty" yaml:"retrieval" mapstructure:"retrieval"`
// DocumentLoaderConfig identifies a document loader by name and carries its
// free-form options, decodable from JSON, YAML or mapstructure input.
type DocumentLoaderConfig struct {
Name string `json:"name" yaml:"name" mapstructure:"name"`
Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"`
}

// TextSplitterConfig identifies a text splitter by name and carries its
// free-form options, decodable from JSON, YAML or mapstructure input.
type TextSplitterConfig struct {
Name string `json:"name" yaml:"name" mapstructure:"name"`
Options map[string]any `json:"options,omitempty" yaml:"options" mapstructure:"options"`
}

// FromFile reads a configuration file and returns a FlowConfig.
Expand All @@ -59,9 +61,45 @@ func FromFile(filename string) (*FlowConfig, error) {
return nil, err
}

return &config, nil
return &config, config.Validate()
}

// Validate checks the configuration for consistency.
//
// It returns an error if any flow defines neither an ingestion nor a
// retrieval section, or if more than one flow is flagged as default.
func (f *FlowConfig) Validate() error {
	var defaults int
	for flowName, entry := range f.Flows {
		if entry.Default {
			defaults++
		}

		hasIngestion := len(entry.Ingestion) > 0
		hasRetrieval := entry.Retrieval != nil
		if !hasIngestion && !hasRetrieval {
			return fmt.Errorf("flow %q has neither ingestion nor retrieval specified", flowName)
		}
	}

	if defaults <= 1 {
		return nil
	}
	return fmt.Errorf("only one flow can be default, found %d", defaults)
}

// GetDefaultFlowConfigEntry looks for a flow flagged as default and returns a
// pointer to a copy of its entry. It returns an error if no flow is flagged.
func (f *FlowConfig) GetDefaultFlowConfigEntry() (*FlowConfigEntry, error) {
	for name := range f.Flows {
		entry := f.Flows[name]
		if !entry.Default {
			continue
		}
		return &entry, nil
	}
	return nil, fmt.Errorf("default flow not found")
}

// GetFlow returns a pointer to a copy of the flow entry registered under
// name, or an error if no such flow exists.
func (f *FlowConfig) GetFlow(name string) (*FlowConfigEntry, error) {
	if entry, found := f.Flows[name]; found {
		return &entry, nil
	}
	return nil, fmt.Errorf("flow %q not found", name)
}

// AsIngestionFlow converts an IngestionFlowConfig to an actual flows.IngestionFlow.
func (i *IngestionFlowConfig) AsIngestionFlow() (*flows.IngestionFlow, error) {
flow := &flows.IngestionFlow{}
if i.DocumentLoader.Name != "" {
Expand Down Expand Up @@ -110,5 +148,7 @@ func (i *IngestionFlowConfig) AsIngestionFlow() (*flows.IngestionFlow, error) {
flow.Split = splitterFunc
}

// TODO: Transformers

return flow, nil
}
5 changes: 5 additions & 0 deletions pkg/flows/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,8 @@ func TestLoadConfigFromNonexistentFile(t *testing.T) {
assert.Error(t, err)
assert.Nil(t, cfg)
}

// TestLoadConfigInvalidDoubleDefault ensures that loading a configuration in
// which two flows are marked as default is rejected with an error.
func TestLoadConfigInvalidDoubleDefault(t *testing.T) {
	const fixture = "testdata/invalid_doubledefault.yaml"

	_, loadErr := FromFile(fixture)
	assert.Error(t, loadErr)
}
23 changes: 23 additions & 0 deletions pkg/flows/config/testdata/invalid_doubledefault.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
flows:
flow1:
default: true
ingestion:
- filetypes: [".txt", ".md"]
documentLoader:
name: "textLoader"
textSplitter:
name: "simpleSplitter"
transformers:
- "transformer1"
- "transformer2"
retrieval: {}
flow2:
default: true
ingestion:
- filetypes: [".json"]
documentLoader:
name: "jsonLoader"
textSplitter:
name: "jsonSplitter"
transformers: ["transformer3"]
retrieval: {}
1 change: 1 addition & 0 deletions pkg/flows/config/testdata/valid.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
flows:
flow1:
default: true
ingestion:
- filetypes: [".txt", ".md"]
documentLoader:
Expand Down

0 comments on commit 1d37ee3

Please sign in to comment.