Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
change: finish markdown_basic splitter and set it as default
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed Oct 26, 2024
1 parent 75062f8 commit da66f36
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 3 deletions.
23 changes: 22 additions & 1 deletion pkg/datastore/textsplitter/markdown_basic/markdown_basic.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package markdown_basic

import (
lcgosplitter "github.com/tmc/langchaingo/textsplitter"
"strings"
"unicode/utf8"

vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
lcgosplitter "github.com/tmc/langchaingo/textsplitter"
)

// NewMarkdownTextSplitter creates a new Markdown text splitter.
Expand Down Expand Up @@ -41,6 +43,25 @@ type MarkdownTextSplitter struct {
IgnoreHeadingOnly bool
}

// SplitDocuments splits the content of every document in docs into chunks
// using SplitText and returns one new document per chunk, in input order.
//
// Each resulting document has the chunk as its Content and the source
// document's Metadata value, which is assigned as-is rather than copied; no
// other fields are set on the new documents. If splitting any document fails,
// that error is returned unchanged together with a nil slice.
func (sp MarkdownTextSplitter) SplitDocuments(docs []vs.Document) ([]vs.Document, error) {
	var result []vs.Document

	for i := range docs {
		src := docs[i]

		parts, err := sp.SplitText(src.Content)
		if err != nil {
			return nil, err
		}

		for j := range parts {
			piece := vs.Document{Content: parts[j], Metadata: src.Metadata}
			result = append(result, piece)
		}
	}

	return result, nil
}

// SplitText splits a text into multiple chunks.
func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) {
// Parse markdown line-by-line
Expand Down
18 changes: 18 additions & 0 deletions pkg/datastore/textsplitter/markdown_basic/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,24 @@ func DefaultOptions() Options {
// Option is a function that can be used to set options for a text splitter.
type Option func(*Options)

// WithConfig returns an Option that applies the settings held in cfg to the
// options being built.
//
// ChunkSize, ChunkOverlap and MaxHeadingLevel are applied only when they are
// non-zero, and SecondSplitter only when it is non-nil, so unset fields leave
// the existing option values untouched. IgnoreHeadingOnly is always taken
// from cfg, including when it is false.
func WithConfig(cfg MarkdownTextSplitter) Option {
	return func(opts *Options) {
		// A bool has no "unset" state to test for, so this field is copied
		// unconditionally.
		opts.IgnoreHeadingOnly = cfg.IgnoreHeadingOnly

		if size := cfg.ChunkSize; size != 0 {
			opts.ChunkSize = size
		}
		if overlap := cfg.ChunkOverlap; overlap != 0 {
			opts.ChunkOverlap = overlap
		}
		if level := cfg.MaxHeadingLevel; level != 0 {
			opts.MaxHeadingLevel = level
		}
		if second := cfg.SecondSplitter; second != nil {
			opts.SecondSplitter = second
		}
	}
}

// WithChunkSize sets the chunk size for a text splitter.
func WithChunkSize(chunkSize int) Option {
return func(o *Options) {
Expand Down
16 changes: 16 additions & 0 deletions pkg/datastore/textsplitter/textsplitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"dario.cat/mergo"
"github.com/gptscript-ai/knowledge/pkg/datastore/defaults"
mdbasic "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter/markdown_basic"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
"github.com/mitchellh/mapstructure"
Expand Down Expand Up @@ -57,6 +58,8 @@ func GetTextSplitterConfig(name string) (any, error) {
switch name {
case "text", "markdown":
return TextSplitterOpts{}, nil
case "markdown_basic":
return mdbasic.MarkdownTextSplitter{}, nil
default:
return nil, fmt.Errorf("unknown text splitter %q", name)
}
Expand Down Expand Up @@ -94,6 +97,19 @@ func GetTextSplitter(name string, config any) (dstypes.TextSplitter, error) {
}
slog.Debug("MarkdownSplitter", "config", cfg)
return FromLangchain(NewLcgoMarkdownSplitter(cfg)), nil
case "markdown_basic":
cfg := mdbasic.MarkdownTextSplitter{}
if config != nil {
var customCfg mdbasic.MarkdownTextSplitter
if err := mapstructure.Decode(config, &customCfg); err != nil {
return nil, fmt.Errorf("failed to decode markdown basic splitter configuration: %w", err)
}
if err := mergo.Merge(&customCfg, cfg); err != nil {
return nil, fmt.Errorf("failed to merge markdown basic splitter configuration: %w", err)
}
cfg = customCfg
}
return mdbasic.NewMarkdownTextSplitter(mdbasic.WithConfig(cfg)), nil
default:
return nil, fmt.Errorf("unknown text splitter %q", name)
}
Expand Down
14 changes: 12 additions & 2 deletions pkg/flows/config/blueprints/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,19 @@ flows:
globals:
ingestion:
textsplitter:
chunkSize: 800
chunkOverlap: 400
chunkSize: 1024
chunkOverlap: 256
ingestion:
- filetypes: [".md"]
documentloader:
name: plaintext
textsplitter:
name: markdown_basic
options:
maxHeadingLevel: 2
ignoreHeadingOnly: true
transformers:
- name: filter_markdown_docs_no_content
- filetypes: ["*"]
retrieval:
retriever:
Expand Down

0 comments on commit da66f36

Please sign in to comment.