Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

change: finish markdown_basic splitter and set it as default #158

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion pkg/datastore/textsplitter/markdown_basic/markdown_basic.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package markdown_basic

import (
lcgosplitter "github.com/tmc/langchaingo/textsplitter"
"strings"
"unicode/utf8"

vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
lcgosplitter "github.com/tmc/langchaingo/textsplitter"
)

// NewMarkdownTextSplitter creates a new Markdown text splitter.
Expand Down Expand Up @@ -41,6 +43,25 @@ type MarkdownTextSplitter struct {
IgnoreHeadingOnly bool
}

// SplitDocuments runs SplitText over the Content of every document in docs
// and returns one new document per resulting chunk, preserving input order.
//
// Each produced document has only two fields populated: Content (the chunk)
// and Metadata (the Metadata value of the document the chunk was cut from,
// assigned as-is). If SplitText fails for any document, processing stops and
// that error is returned unchanged together with a nil slice.
//
// NOTE(review): Metadata is not copied here; if it is a reference type, all
// chunks of one document share it with the source — confirm that is intended.
func (sp MarkdownTextSplitter) SplitDocuments(docs []vs.Document) ([]vs.Document, error) {
	var result []vs.Document

	for i := range docs {
		src := docs[i]

		parts, err := sp.SplitText(src.Content)
		if err != nil {
			return nil, err
		}

		for j := range parts {
			piece := vs.Document{Content: parts[j], Metadata: src.Metadata}
			result = append(result, piece)
		}
	}

	return result, nil
}

// SplitText splits a text into multiple chunks.
func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) {
// Parse markdown line-by-line
Expand Down
18 changes: 18 additions & 0 deletions pkg/datastore/textsplitter/markdown_basic/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,24 @@ func DefaultOptions() Options {
// Option is a function that can be used to set options for a text splitter.
type Option func(*Options)

// WithConfig returns an Option that copies settings from cfg onto the
// Options it is applied to.
//
// ChunkSize, ChunkOverlap and MaxHeadingLevel are applied only when they are
// non-zero, and SecondSplitter only when it is non-nil, so fields left unset
// in cfg keep whatever value the Options already hold. IgnoreHeadingOnly is
// always overwritten with cfg's value, including false.
func WithConfig(cfg MarkdownTextSplitter) Option {
	return func(opts *Options) {
		// A bool has no distinguishable "unset" state, so it is taken verbatim.
		opts.IgnoreHeadingOnly = cfg.IgnoreHeadingOnly

		if size := cfg.ChunkSize; size != 0 {
			opts.ChunkSize = size
		}
		if overlap := cfg.ChunkOverlap; overlap != 0 {
			opts.ChunkOverlap = overlap
		}
		if level := cfg.MaxHeadingLevel; level != 0 {
			opts.MaxHeadingLevel = level
		}
		if cfg.SecondSplitter != nil {
			opts.SecondSplitter = cfg.SecondSplitter
		}
	}
}

// WithChunkSize sets the chunk size for a text splitter.
func WithChunkSize(chunkSize int) Option {
return func(o *Options) {
Expand Down
16 changes: 16 additions & 0 deletions pkg/datastore/textsplitter/textsplitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"dario.cat/mergo"
"github.com/gptscript-ai/knowledge/pkg/datastore/defaults"
mdbasic "github.com/gptscript-ai/knowledge/pkg/datastore/textsplitter/markdown_basic"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
"github.com/mitchellh/mapstructure"
Expand Down Expand Up @@ -57,6 +58,8 @@ func GetTextSplitterConfig(name string) (any, error) {
switch name {
case "text", "markdown":
return TextSplitterOpts{}, nil
case "markdown_basic":
return mdbasic.MarkdownTextSplitter{}, nil
default:
return nil, fmt.Errorf("unknown text splitter %q", name)
}
Expand Down Expand Up @@ -94,6 +97,19 @@ func GetTextSplitter(name string, config any) (dstypes.TextSplitter, error) {
}
slog.Debug("MarkdownSplitter", "config", cfg)
return FromLangchain(NewLcgoMarkdownSplitter(cfg)), nil
case "markdown_basic":
cfg := mdbasic.MarkdownTextSplitter{}
if config != nil {
var customCfg mdbasic.MarkdownTextSplitter
if err := mapstructure.Decode(config, &customCfg); err != nil {
return nil, fmt.Errorf("failed to decode markdown basic splitter configuration: %w", err)
}
if err := mergo.Merge(&customCfg, cfg); err != nil {
return nil, fmt.Errorf("failed to merge markdown basic splitter configuration: %w", err)
}
cfg = customCfg
}
return mdbasic.NewMarkdownTextSplitter(mdbasic.WithConfig(cfg)), nil
default:
return nil, fmt.Errorf("unknown text splitter %q", name)
}
Expand Down
14 changes: 12 additions & 2 deletions pkg/flows/config/blueprints/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,19 @@ flows:
globals:
ingestion:
textsplitter:
chunkSize: 800
chunkOverlap: 400
chunkSize: 1024
chunkOverlap: 256
ingestion:
- filetypes: [".md"]
documentloader:
name: plaintext
textsplitter:
name: markdown_basic
options:
maxHeadingLevel: 2
ignoreHeadingOnly: true
transformers:
- name: filter_markdown_docs_no_content
- filetypes: ["*"]
retrieval:
retriever:
Expand Down