Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: structured json dataloader (#151)
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed Oct 22, 2024
1 parent fbc53d0 commit a75f75e
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 22 deletions.
20 changes: 20 additions & 0 deletions examples/structured-ingestion/example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"metadata": {
"source": "https://example.com",
"filename": "foo.pdf"
},
"documents": [
{
"metadata": {
"page": 1
},
"content": "This is the first page of the document."
},
{
"metadata": {
"page": 2
},
"content": "This is the second page of the document."
}
]
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ require (
github.com/jackc/pgx/v5 v5.7.1
github.com/jmcarbo/stopwords v1.1.9
github.com/joho/godotenv v1.5.1
github.com/knadh/koanf/maps v0.1.1
github.com/knadh/koanf/parsers/json v0.1.0
github.com/knadh/koanf/parsers/yaml v0.1.0
github.com/knadh/koanf/providers/env v0.1.0
Expand Down Expand Up @@ -126,7 +127,6 @@ require (
github.com/kevinburke/ssh_config v1.2.0 // indirect
github.com/klauspost/compress v1.17.6 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/knadh/koanf/maps v0.1.1 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
Expand Down
2 changes: 0 additions & 2 deletions pkg/client/ignore.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ func isIgnored(ignore gitignore.Matcher, path string) bool {
}

func readDefaultIgnoreFile(dirPath string) ([]gitignore.Pattern, error) {

ignoreFilePath := filepath.Join(dirPath, DefaultIgnoreFile)
_, err := os.Stat(ignoreFilePath)
if err != nil {
Expand All @@ -38,7 +37,6 @@ func readDefaultIgnoreFile(dirPath string) ([]gitignore.Pattern, error) {
}

func useDefaultIgnoreFileIfExists(path string) ([]gitignore.Pattern, error) {

var err error
path, err = filepath.Abs(path)
if err != nil {
Expand Down
93 changes: 82 additions & 11 deletions pkg/cmd/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@ import (
"encoding/json"
"fmt"
"os"
"slices"
"strings"

"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader/structured"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
"github.com/knadh/koanf/maps"
"github.com/spf13/cobra"
)

type ClientLoad struct {
Loader string `usage:"Choose a document loader to use"`
Loader string `usage:"Choose a document loader to use"`
OutputFormat string `name:"format" usage:"Choose an output format" default:"structured"`
}

func (s *ClientLoad) Customize(cmd *cobra.Command) {
Expand All @@ -26,6 +30,10 @@ func (s *ClientLoad) Run(cmd *cobra.Command, args []string) error {
input := args[0]
output := args[1]

if !slices.Contains([]string{"structured", "markdown"}, s.OutputFormat) {
return fmt.Errorf("unsupported output format %q", s.OutputFormat)
}

inputBytes, err := os.ReadFile(input)
if err != nil {
return fmt.Errorf("failed to read input file %q: %w", input, err)
Expand Down Expand Up @@ -57,23 +65,64 @@ func (s *ClientLoad) Run(cmd *cobra.Command, args []string) error {
return fmt.Errorf("failed to load documents: %w", err)
}

var texts []string
for _, doc := range docs {
if len(doc.Content) == 0 {
continue
var text string

switch s.OutputFormat {
case "markdown":
var texts []string
for _, doc := range docs {
if len(doc.Content) == 0 {
continue
}

metadata, err := json.Marshal(doc.Metadata)
if err != nil {
return fmt.Errorf("failed to marshal metadata: %w", err)
}

content := fmt.Sprintf("!metadata %s\n%s", metadata, doc.Content)

texts = append(texts, content)
}

metadata, err := json.Marshal(doc.Metadata)
if err != nil {
return fmt.Errorf("failed to marshal metadata: %w", err)
text = strings.Join(texts, "\n---docbreak---\n")

case "structured":
var structuredInput structured.StructuredInput
structuredInput.Metadata = map[string]any{}
structuredInput.Documents = make([]structured.StructuredInputDocument, 0, len(docs))

commonMetadata := maps.Copy(docs[0].Metadata)
for _, doc := range docs {
commonMetadata = extractCommon(commonMetadata, doc.Metadata)
structuredInput.Documents = append(structuredInput.Documents, structured.StructuredInputDocument{
Metadata: doc.Metadata,
Content: doc.Content,
})
}

content := fmt.Sprintf("!metadata %s\n%s", metadata, doc.Content)
commonMetadata["source"] = input
structuredInput.Metadata = commonMetadata

texts = append(texts, content)
for i, doc := range structuredInput.Documents {
structuredInput.Documents[i].Metadata = dropCommon(doc.Metadata, commonMetadata)
}

jsonBytes := bytes.NewBuffer(nil)
encoder := json.NewEncoder(jsonBytes)
encoder.SetIndent("", " ")
if err := encoder.Encode(structuredInput); err != nil {
return fmt.Errorf("failed to encode structured input: %w", err)
}
text = jsonBytes.String()
default:
return fmt.Errorf("unsupported output format %q", s.OutputFormat)
}

text := strings.Join(texts, "\n---docbreak---\n")
if output == "-" {
fmt.Println(text)
return nil
}

outputFile, err := os.Create(output)
if err != nil {
Expand All @@ -87,3 +136,25 @@ func (s *ClientLoad) Run(cmd *cobra.Command, args []string) error {

return nil
}

// dropCommon deletes from target every key that is also present in common and
// returns target. Only key presence is checked; the associated values are not
// compared. The deletion happens in place, so the caller's map is modified.
func dropCommon(target, common map[string]any) map[string]any {
	for k := range target {
		_, shared := common[k]
		if !shared {
			continue
		}
		// Deleting the current key while ranging over a map is permitted in Go.
		delete(target, k)
	}

	return target
}

// extractCommon reduces target to the entries it shares with other: a key is
// kept only if other contains the same key with a deeply equal value. All
// remaining keys are deleted from target in place, and target is returned.
//
// Values are compared with reflect.DeepEqual rather than ==, because metadata
// values are stored as any and may hold slices or maps; comparing two
// interface values of the same uncomparable dynamic type with == panics at
// runtime.
func extractCommon(target, other map[string]any) map[string]any {
	for key, value := range target {
		if v, exists := other[key]; !exists || !reflect.DeepEqual(v, value) {
			// Deleting the current key while ranging over a map is permitted in Go.
			delete(target, key)
		}
	}

	return target
}
8 changes: 8 additions & 0 deletions pkg/datastore/documentloader/documentloaders.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"

"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader/pdf/gopdf"
"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader/structured"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"

golcdocloaders "github.com/hupe1980/golc/documentloader"
Expand Down Expand Up @@ -42,6 +43,8 @@ func GetDocumentLoaderConfig(name string) (any, error) {
return golcdocloaders.CSVOptions{}, nil
case "notebook":
return golcdocloaders.NotebookOptions{}, nil
case "structured":
return structured.Structured{}, nil
default:
return nil, fmt.Errorf("unknown document loader %q", name)
}
Expand Down Expand Up @@ -151,6 +154,11 @@ func GetDocumentLoaderFunc(name string, config any) (LoaderFunc, error) {
}
return FromLangchain(lcgodocloaders.NewText(strings.NewReader(text))).Load(ctx)
}, nil
case "structured":
if config != nil {
return nil, fmt.Errorf("structured document loader does not accept configuration")
}
return new(structured.Structured).Load, nil
default:
return nil, fmt.Errorf("unknown document loader %q", name)
}
Expand Down
41 changes: 41 additions & 0 deletions pkg/datastore/documentloader/structured/structured.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package structured

import (
"context"
"encoding/json"
"fmt"
"io"

vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
"github.com/knadh/koanf/maps"
)

// StructuredInputDocument is a single entry of the "documents" array in a
// structured JSON input file.
type StructuredInputDocument struct {
// Metadata holds the key/value pairs given under this document's "metadata" key.
Metadata map[string]any `json:"metadata"`
// Content is the document text given under the "content" key.
Content string `json:"content"`
}

// StructuredInput is the top-level shape of a structured JSON input file: one
// file-level metadata object plus a list of documents.
type StructuredInput struct {
// Metadata holds the key/value pairs given under the top-level "metadata" key.
Metadata map[string]any `json:"metadata"`
// Documents holds the entries of the "documents" array.
Documents []StructuredInputDocument `json:"documents"`
}

// Structured is the document loader for structured JSON input. It has no
// fields and therefore carries no configuration.
type Structured struct{}

// Load decodes one JSON value of the StructuredInput shape from reader and
// returns one vs.Document per entry of its "documents" array, in input order.
//
// For every document, a copy of the top-level metadata is merged into the
// document's own metadata before the document is emitted, so the returned
// metadata contains both sets of keys.
// NOTE(review): per koanf's maps.Merge (first argument is merged into the
// second), the top-level value presumably wins when both define the same
// key — confirm this precedence is intended.
//
// ctx is accepted to satisfy the loader function signature and is not used.
// An error is returned if the input cannot be decoded as JSON.
func (s *Structured) Load(ctx context.Context, reader io.Reader) ([]vs.Document, error) {
	var input StructuredInput
	if err := json.NewDecoder(reader).Decode(&input); err != nil {
		return nil, fmt.Errorf("failed to decode input: %w", err)
	}

	docs := make([]vs.Document, 0, len(input.Documents))
	for _, doc := range input.Documents {
		// A document without a "metadata" key decodes to a nil map. Merge
		// writes into its second argument, and writing to a nil map panics,
		// so make sure there is a map to merge into.
		if doc.Metadata == nil {
			doc.Metadata = make(map[string]any, len(input.Metadata))
		}

		// Merge a copy so that nested values of the shared top-level metadata
		// are not aliased between documents.
		maps.Merge(maps.Copy(input.Metadata), doc.Metadata)
		docs = append(docs, vs.Document{
			Content:  doc.Content,
			Metadata: doc.Metadata,
		})
	}

	return docs, nil
}
8 changes: 0 additions & 8 deletions pkg/vectorstore/sqlite-vec/sqlite-vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ func (v *VectorStore) Close() error {
}

func (v *VectorStore) prepareTables(ctx context.Context) error {

err := v.db.Exec(fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s
(
id TEXT PRIMARY KEY,
Expand Down Expand Up @@ -84,11 +83,9 @@ func (v *VectorStore) CreateCollection(ctx context.Context, collection string) e
embedding float[%d] distance_metric=cosine
)
`, collection, dimensionality))

}

func (v *VectorStore) AddDocuments(ctx context.Context, docs []vs.Document, collection string) ([]string, error) {

stmt, _, err := v.db.Prepare(fmt.Sprintf(`INSERT INTO %s_vec(document_id, embedding) VALUES (?, ?)`, collection))
if err != nil {
return nil, fmt.Errorf("failed to prepare statement: %w", err)
Expand Down Expand Up @@ -215,7 +212,6 @@ func (v *VectorStore) SimilaritySearch(ctx context.Context, query string, numDoc
}

return docs, nil

}

func (v *VectorStore) RemoveCollection(ctx context.Context, collection string) error {
Expand Down Expand Up @@ -265,7 +261,6 @@ func (v *VectorStore) RemoveDocument(ctx context.Context, documentID string, col
if stmt.Err() != nil {
return fmt.Errorf("failed to execute statement: %w", stmt.Err())
}

} else {
ids = []string{documentID}
}
Expand All @@ -284,7 +279,6 @@ func (v *VectorStore) RemoveDocument(ctx context.Context, documentID string, col
}

for _, id := range ids {

slog.Debug("deleting document from sqlite-vec", "id", id)

if err := embStmt.BindText(1, id); err != nil {
Expand All @@ -310,7 +304,6 @@ func (v *VectorStore) RemoveDocument(ctx context.Context, documentID string, col
if err := colStmt.Reset(); err != nil {
return fmt.Errorf("failed to reset statement: %w", err)
}

}

return nil
Expand Down Expand Up @@ -358,7 +351,6 @@ func (v *VectorStore) GetDocuments(ctx context.Context, collection string, where
if stmt.Err() != nil {
return nil, fmt.Errorf("failed to execute statement: %w", stmt.Err())
}

}
return docs, nil
}
Expand Down

0 comments on commit a75f75e

Please sign in to comment.