From da66f3603c1e70cc0455fdc8b59039af883c7569 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Sat, 26 Oct 2024 09:52:43 +0200 Subject: [PATCH] change: finish markdown_basic splitter and set it as default --- .../markdown_basic/markdown_basic.go | 23 ++++++++++++++++++- .../textsplitter/markdown_basic/options.go | 18 +++++++++++++++ pkg/datastore/textsplitter/textsplitter.go | 16 +++++++++++++ pkg/flows/config/blueprints/default.yaml | 14 +++++++++-- 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/pkg/datastore/textsplitter/markdown_basic/markdown_basic.go b/pkg/datastore/textsplitter/markdown_basic/markdown_basic.go index e978ff46..e12f23f1 100644 --- a/pkg/datastore/textsplitter/markdown_basic/markdown_basic.go +++ b/pkg/datastore/textsplitter/markdown_basic/markdown_basic.go @@ -1,9 +1,11 @@ package markdown_basic import ( - lcgosplitter "github.com/tmc/langchaingo/textsplitter" "strings" "unicode/utf8" + + vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types" + lcgosplitter "github.com/tmc/langchaingo/textsplitter" ) // NewMarkdownTextSplitter creates a new Markdown text splitter. @@ -41,6 +43,25 @@ type MarkdownTextSplitter struct { IgnoreHeadingOnly bool } +func (sp MarkdownTextSplitter) SplitDocuments(docs []vs.Document) ([]vs.Document, error) { + var newDocs []vs.Document + for _, doc := range docs { + chunks, err := sp.SplitText(doc.Content) + if err != nil { + return nil, err + } + + for _, chunk := range chunks { + newDocs = append(newDocs, vs.Document{ + Content: chunk, + Metadata: doc.Metadata, + }) + } + } + + return newDocs, nil +} + // SplitText splits a text into multiple text. func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) { // Parse markdown line-by-line diff --git a/pkg/datastore/textsplitter/markdown_basic/options.go b/pkg/datastore/textsplitter/markdown_basic/options.go index 186bd4bd..dd7b99ce 100644 --- a/pkg/datastore/textsplitter/markdown_basic/options.go +++ b/pkg/datastore/textsplitter/markdown_basic/options.go @@ -36,6 +36,24 @@ func DefaultOptions() Options { // Option is a function that can be used to set options for a text splitter. type Option func(*Options) +func WithConfig(cfg MarkdownTextSplitter) Option { + return func(o *Options) { + if cfg.ChunkSize != 0 { + o.ChunkSize = cfg.ChunkSize + } + if cfg.ChunkOverlap != 0 { + o.ChunkOverlap = cfg.ChunkOverlap + } + if cfg.SecondSplitter != nil { + o.SecondSplitter = cfg.SecondSplitter + } + if cfg.MaxHeadingLevel != 0 { + o.MaxHeadingLevel = cfg.MaxHeadingLevel + } + o.IgnoreHeadingOnly = cfg.IgnoreHeadingOnly + } +} + // WithChunkSize sets the chunk size for a text splitter. func WithChunkSize(chunkSize int) Option { return func(o *Options) { diff --git a/pkg/datastore/textsplitter/textsplitter.go b/pkg/datastore/textsplitter/textsplitter.go index 4d417370..b287051e 100644 --- a/pkg/datastore/textsplitter/textsplitter.go +++ b/pkg/datastore/textsplitter/textsplitter.go @@ -6,6 +6,7 @@ import ( "dario.cat/mergo" "github.com/gptscript-ai/knowledge/pkg/datastore/defaults" + mdbasic "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter/markdown_basic" dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types" vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types" "github.com/mitchellh/mapstructure" @@ -57,6 +58,8 @@ func GetTextSplitterConfig(name string) (any, error) { switch name { case "text", "markdown": return TextSplitterOpts{}, nil + case "markdown_basic": + return mdbasic.MarkdownTextSplitter{}, nil default: return nil, fmt.Errorf("unknown text splitter %q", name) } @@ -94,6 +97,19 @@ func GetTextSplitter(name string, config any) (dstypes.TextSplitter, error) { } slog.Debug("MarkdownSplitter", "config", cfg) return FromLangchain(NewLcgoMarkdownSplitter(cfg)), nil + case "markdown_basic": + cfg := mdbasic.MarkdownTextSplitter{} + if config != nil { + var customCfg mdbasic.MarkdownTextSplitter + if err := mapstructure.Decode(config, &customCfg); err != nil { + return nil, fmt.Errorf("failed to decode markdown basic splitter configuration: %w", err) + } + if err := mergo.Merge(&customCfg, cfg); err != nil { + return nil, fmt.Errorf("failed to merge markdown basic splitter configuration: %w", err) + } + cfg = customCfg + } + return mdbasic.NewMarkdownTextSplitter(mdbasic.WithConfig(cfg)), nil default: return nil, fmt.Errorf("unknown text splitter %q", name) } diff --git a/pkg/flows/config/blueprints/default.yaml b/pkg/flows/config/blueprints/default.yaml index a3538cad..65c8aedc 100644 --- a/pkg/flows/config/blueprints/default.yaml +++ b/pkg/flows/config/blueprints/default.yaml @@ -4,9 +4,19 @@ flows: globals: ingestion: textsplitter: - chunkSize: 800 - chunkOverlap: 400 + chunkSize: 1024 + chunkOverlap: 256 ingestion: + - filetypes: [".md"] + documentloader: + name: plaintext + textsplitter: + name: markdown_basic + options: + maxHeadingLevel: 2 + ignoreHeadingOnly: true + transformers: + - name: filter_markdown_docs_no_content - filetypes: ["*"] retrieval: retriever: