From ded76747fc0f9fa3642d9a6d1192ecc40b756b51 Mon Sep 17 00:00:00 2001
From: Thorsten Klein
Date: Thu, 29 Aug 2024 21:15:33 +0200
Subject: [PATCH] add: metadata-manipulator transformer & post-processor (#97)

---
 examples/no-filenames.yaml                    | 17 +++++
 .../postprocessors/postprocessors.go          | 22 +++++-
 pkg/datastore/transformers/metadata.go        | 75 ++++++++++++++++++-
 pkg/datastore/transformers/transformers.go    |  1 +
 pkg/flows/config/config.go                    | 14 +++-
 5 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 examples/no-filenames.yaml

diff --git a/examples/no-filenames.yaml b/examples/no-filenames.yaml
new file mode 100644
index 0000000..6fed7b5
--- /dev/null
+++ b/examples/no-filenames.yaml
@@ -0,0 +1,17 @@
+flows:
+  bm25:
+    default: true
+    retrieval:
+      retriever:
+        name: basic
+        options:
+          topK: 1
+      postprocessors:
+        - name: metadata
+          options:
+            manipulations:
+              - operator: add
+                key: foobar
+                value: 42
+              - operator: remove
+                key: absPath
diff --git a/pkg/datastore/postprocessors/postprocessors.go b/pkg/datastore/postprocessors/postprocessors.go
index 3bd6b59..987dda2 100644
--- a/pkg/datastore/postprocessors/postprocessors.go
+++ b/pkg/datastore/postprocessors/postprocessors.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/gptscript-ai/knowledge/pkg/datastore/transformers"
 	"github.com/gptscript-ai/knowledge/pkg/datastore/types"
+	"github.com/mitchellh/mapstructure"
 )
 
 // Postprocessor is similar to types.DocumentTransformer, but can take into account the retrieval query
@@ -16,7 +17,7 @@ type Postprocessor interface {
 }
 
 type TransformerWrapper struct {
-	types.DocumentTransformer
+	DocumentTransformer types.DocumentTransformer
 }
 
 func NewTransformerWrapper(transformer types.DocumentTransformer) *TransformerWrapper {
@@ -38,10 +39,29 @@ func (t *TransformerWrapper) Name() string {
 	return t.DocumentTransformer.Name()
 }
 
+// Decode configures the wrapped transformer from cfg. It fetches the transformer
+// registered under t.Name(), decodes cfg into it with mapstructure and then
+// stores the result as the wrapped transformer.
+// NOTE(review): if transformers.GetTransformer hands out a shared instance, the
+// decoded options leak to every other user of that transformer — confirm.
+func (t *TransformerWrapper) Decode(cfg map[string]any) error {
+	transformerCfg, err := transformers.GetTransformer(t.Name())
+	if err != nil {
+		return err
+	}
+	err = mapstructure.Decode(cfg, &transformerCfg)
+	if err != nil {
+		return fmt.Errorf("failed to decode transformer configuration: %w", err)
+	}
+	t.DocumentTransformer = transformerCfg
+	return nil
+}
+
 var PostprocessorMap = map[string]Postprocessor{
 	transformers.ExtraMetadataName:               NewTransformerWrapper(&transformers.ExtraMetadata{}),
 	transformers.KeywordExtractorName:            NewTransformerWrapper(&transformers.KeywordExtractor{}),
 	transformers.FilterMarkdownDocsNoContentName: NewTransformerWrapper(&transformers.FilterMarkdownDocsNoContent{}),
+	transformers.MetadataManipulatorName:         NewTransformerWrapper(&transformers.MetadataManipulator{}),
 	SimilarityPostprocessorName:                  &SimilarityPostprocessor{},
 	ContentSubstringFilterPostprocessorName:      &ContentSubstringFilterPostprocessor{},
 	ContentFilterPostprocessorName:               &ContentFilterPostprocessor{},
diff --git a/pkg/datastore/transformers/metadata.go b/pkg/datastore/transformers/metadata.go
index 1e33ac9..7a1da38 100644
--- a/pkg/datastore/transformers/metadata.go
+++ b/pkg/datastore/transformers/metadata.go
@@ -2,6 +2,8 @@ package transformers
 
 import (
 	"context"
+	"fmt"
+	"log/slog"
 
 	vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
 )
@@ -13,10 +15,16 @@ type ExtraMetadata struct {
 }
 
 func (e *ExtraMetadata) Transform(_ context.Context, docs []vs.Document) ([]vs.Document, error) {
-	for _, doc := range docs {
+	for i, doc := range docs {
+		metadata := doc.Metadata
+		if metadata == nil {
+			// Writing to a nil map panics, so allocate one for documents without metadata.
+			metadata = make(map[string]any, len(e.Metadata))
+		}
 		for k, v := range e.Metadata {
-			doc.Metadata[k] = v
+			metadata[k] = v
 		}
+		docs[i].Metadata = metadata
 	}
 	return docs, nil
 }
@@ -24,3 +32,66 @@ func (e *ExtraMetadata) Transform(_ context.Context, docs []vs.Document) ([]vs.D
 func (e *ExtraMetadata) Name() string {
 	return ExtraMetadataName
 }
+
+// MetadataManipulatorName is the name under which the MetadataManipulator is registered.
+const MetadataManipulatorName = "metadata"
+
+// MetadataManipulationOperator identifies the kind of change a MetadataManipulation applies.
+type MetadataManipulationOperator string
+
+// Supported metadata manipulation operators.
+const (
+	MetadataManipulationOperatorAdd    MetadataManipulationOperator = "add"    // set the key; fails if it already exists
+	MetadataManipulationOperatorUpdate MetadataManipulationOperator = "upsert" // set the key, overwriting any existing value
+	MetadataManipulationOperatorRemove MetadataManipulationOperator = "remove" // delete the key; no-op if it is absent
+)
+
+// MetadataManipulation describes a single operation on one metadata key.
+type MetadataManipulation struct {
+	Operator MetadataManipulationOperator `json:"operator,omitempty" mapstructure:"operator"`
+	Key      string                       `json:"key,omitempty" mapstructure:"key"`
+	Value    any                          `json:"value,omitempty" mapstructure:"value"`
+}
+
+// MetadataManipulator applies its Manipulations, in order, to the metadata of every document.
+type MetadataManipulator struct {
+	Manipulations []MetadataManipulation
+}
+
+// Name returns MetadataManipulatorName.
+func (m *MetadataManipulator) Name() string {
+	return MetadataManipulatorName
+}
+
+// Transform applies every configured manipulation, in order, to the metadata
+// of each document, allocating the metadata map for documents that have none.
+// It returns an error if an "add" targets a key that already exists or if a
+// manipulation uses an unsupported operator.
+func (m *MetadataManipulator) Transform(_ context.Context, docs []vs.Document) ([]vs.Document, error) {
+	for i, doc := range docs {
+		metadata := doc.Metadata
+		if metadata == nil {
+			metadata = make(map[string]any)
+		}
+		slog.Debug("metadata manipulator", "docMetadata", metadata, "manipulations", m.Manipulations)
+		for _, manipulation := range m.Manipulations {
+			switch manipulation.Operator {
+			case MetadataManipulationOperatorAdd:
+				if _, exists := metadata[manipulation.Key]; exists {
+					return nil, fmt.Errorf("metadata key %q already exists in document", manipulation.Key)
+				}
+				metadata[manipulation.Key] = manipulation.Value
+			case MetadataManipulationOperatorUpdate:
+				metadata[manipulation.Key] = manipulation.Value
+			case MetadataManipulationOperatorRemove:
+				delete(metadata, manipulation.Key)
+			default:
+				// Reject unknown operators so that a typo in the configuration is not silently ignored.
+				return nil, fmt.Errorf("unsupported metadata manipulation operator %q", manipulation.Operator)
+			}
+		}
+		slog.Debug("metadata manipulator DONE", "docMetadata", metadata)
+		docs[i].Metadata = metadata
+	}
+	return docs, nil
+}
diff --git a/pkg/datastore/transformers/transformers.go b/pkg/datastore/transformers/transformers.go
index e22f05f..ae47b74 100644
--- a/pkg/datastore/transformers/transformers.go
+++ b/pkg/datastore/transformers/transformers.go
@@ -10,6 +10,7 @@ var TransformerMap = map[string]types.DocumentTransformer{
 	ExtraMetadataName:               &ExtraMetadata{},
 	FilterMarkdownDocsNoContentName: &FilterMarkdownDocsNoContent{},
 	KeywordExtractorName:            &KeywordExtractor{},
+	MetadataManipulatorName:         &MetadataManipulator{},
 }
 
 func GetTransformer(name string) (types.DocumentTransformer, error) {
diff --git a/pkg/flows/config/config.go b/pkg/flows/config/config.go
index 9a64066..599cacc 100644
--- a/pkg/flows/config/config.go
+++ b/pkg/flows/config/config.go
@@ -299,9 +299,19 @@ func (r *RetrievalFlowConfig) AsRetrievalFlow() (*flows.RetrievalFlow, error) {
 			if err != nil {
 				return nil, err
 			}
+
 			if len(pp.Options) > 0 {
-				if err := mapstructure.Decode(pp.Options, &postprocessor); err != nil {
-					return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
+				// if it's a transformer wrapper, call decode
+				if transformerWrapper, ok := postprocessor.(*postprocessors.TransformerWrapper); ok {
+					if err := transformerWrapper.Decode(pp.Options); err != nil {
+						return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
+					}
+					postprocessor = transformerWrapper
+				} else {
+
+					if err := mapstructure.Decode(pp.Options, &postprocessor); err != nil {
+						return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
+					}
 				}
 				slog.Debug("Postprocessor custom configuration", "name", pp.Name, "config", output.RedactSensitive(postprocessor))
 			}