Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: knowledge load command to just use a documentloader for extracti…
Browse files Browse the repository at this point in the history
…ng text/markdown from a file
  • Loading branch information
iwilltry42 committed Oct 10, 2024
1 parent 2fb4119 commit cec79ef
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 1 deletion.
90 changes: 90 additions & 0 deletions pkg/cmd/load.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package cmd

import (
"bytes"
"encoding/json"
"fmt"
"os"
"strings"

"github.com/gptscript-ai/knowledge/pkg/datastore/documentloader"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
"github.com/spf13/cobra"
)

// ClientLoad implements the "load" command, which runs a document loader on a
// single input file and writes the extracted text to an output file.
type ClientLoad struct {
	// Loader optionally names the document loader to use. When left empty,
	// Run picks a default loader based on the detected file type.
	Loader string `usage:"Choose a document loader to use"`
}

// Customize sets the usage line, the short help text and the argument
// validation of the given cobra command.
func (s *ClientLoad) Customize(cmd *cobra.Command) {
	// Exactly two positional arguments are required: <input> and <output>.
	cmd.Args = cobra.ExactArgs(2)

	cmd.Short = "Load a file and transform it to markdown"
	cmd.Use = "load <input> <output>"
}

// Run reads the input file (args[0]), extracts its documents with a document
// loader and writes the resulting text to the output file (args[1]).
//
// The loader named by s.Loader is used when set; otherwise a default loader is
// selected from the detected file type. Every document with non-empty content
// is rendered as a "!metadata <json>" line followed by the content, and the
// rendered documents are joined with a "---docbreak---" separator line.
//
// The output file is created (or truncated if it already exists). An error is
// returned if the input cannot be read, its file type cannot be determined,
// no loader is available, loading fails, metadata cannot be marshalled, or the
// output file cannot be created, written or closed.
func (s *ClientLoad) Run(cmd *cobra.Command, args []string) error {
	// Customize enforces exactly two positional arguments.
	input, output := args[0], args[1]

	inputBytes, err := os.ReadFile(input)
	if err != nil {
		return fmt.Errorf("failed to read input file %q: %w", input, err)
	}

	filetype, err := filetypes.GetFiletype(input, inputBytes)
	if err != nil {
		return fmt.Errorf("failed to get filetype for input file %q: %w", input, err)
	}

	var loader documentloader.LoaderFunc
	if s.Loader == "" {
		loader = documentloader.DefaultDocLoaderFunc(filetype, documentloader.DefaultDocLoaderFuncOpts{})
	} else {
		loader, err = documentloader.GetDocumentLoaderFunc(s.Loader, nil)
		if err != nil {
			return fmt.Errorf("failed to get document loader function %q: %w", s.Loader, err)
		}
	}

	if loader == nil {
		// Report the detected file type, not just the path, so the message
		// actually tells the user what was unsupported.
		return fmt.Errorf("unsupported file type %q for input file %q", filetype, input)
	}

	docs, err := loader(cmd.Context(), bytes.NewReader(inputBytes))
	if err != nil {
		return fmt.Errorf("failed to load documents: %w", err)
	}

	texts := make([]string, 0, len(docs))
	for _, doc := range docs {
		// Documents without content are left out of the output entirely.
		if len(doc.Content) == 0 {
			continue
		}

		metadata, err := json.Marshal(doc.Metadata)
		if err != nil {
			return fmt.Errorf("failed to marshal metadata: %w", err)
		}

		texts = append(texts, fmt.Sprintf("!metadata %s\n%s", metadata, doc.Content))
	}

	text := strings.Join(texts, "\n---docbreak---\n")

	outputFile, err := os.Create(output)
	if err != nil {
		return fmt.Errorf("failed to create output file %q: %w", output, err)
	}

	if _, err := outputFile.WriteString(text); err != nil {
		// Best-effort close: the write error is the one worth reporting.
		_ = outputFile.Close()
		return fmt.Errorf("failed to write to output file %q: %w", output, err)
	}

	// Close explicitly and check the result: it releases the file handle and
	// surfaces write errors that are only reported on close.
	if err := outputFile.Close(); err != nil {
		return fmt.Errorf("failed to close output file %q: %w", output, err)
	}

	return nil
}
1 change: 1 addition & 0 deletions pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func New() *cobra.Command {
new(ClientExportDatasets),
new(ClientImportDatasets),
new(ClientEditDataset),
new(ClientLoad),
new(Version),
)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/datastore/documentloader/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ type ArchiveOpts struct {
ErrOnFailedFile bool
}

func DefaultDocLoaderFunc(filetype string, opts DefaultDocLoaderFuncOpts) func(ctx context.Context, reader io.Reader) ([]vs.Document, error) {
func DefaultDocLoaderFunc(filetype string, opts DefaultDocLoaderFuncOpts) LoaderFunc {
switch filetype {
case ".pdf", "application/pdf":
return func(ctx context.Context, reader io.Reader) ([]vs.Document, error) {
Expand Down

0 comments on commit cec79ef

Please sign in to comment.