This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

add: remote loader for git + gitignore-style file support #52

Merged
merged 3 commits into from
Jul 30, 2024
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ tests/__pycache__/
.DS_Store


vendor/
vendor/

66 changes: 65 additions & 1 deletion docs/docs/02-usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,68 @@ This mode is useful when you want to share the data with multiple clients or whe

```bash
knowledge server
```
```

## Ingestion

To ingest a document, you can use the `knowledge ingest` command:

```bash
knowledge ingest --dataset my-dataset ./path/to/my-document.txt
```

:::note

By default, the dataset will be created if it doesn't exist.
If you don't want that, you can use the `--no-create-dataset` flag.

:::

### Ignoring Files

You can ignore files by providing an ignore file, similar to `.gitignore`:

```bash
knowledge ingest --dataset my-dataset --ignore-file .knowledgeignore ./path/to/my-documents
```

Here's an example ignore file that tells knowledge to consider only Markdown files and nothing else:

```gitignore
# Ignore everything
*

# Except Markdown files in any directory
!**/*.md
```

:::note

Alternatively, you can use the `--ignore-extensions` flag to ignore files with specific extensions.

```bash
knowledge ingest --dataset my-dataset --ignore-extensions=.txt ./path/to/my-documents
```

:::


### Remote Files

You can ingest remote files by providing a URL. Currently, only Git repositories are supported:

```bash
knowledge ingest --dataset my-dataset https://github.com/gptscript-ai/knowledge
```

You may also specify a commit hash, tag, or branch to check out:

```bash
knowledge ingest --dataset my-dataset https://github.com/gptscript-ai/[email protected]
```

:::note

Here, it's advisable to use an [ignore file](#ignoring-files) to avoid ingesting Git metadata and any vendored files that may be present.

:::
1 change: 0 additions & 1 deletion docs/docs/99-cmd/knowledge.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,5 @@ knowledge [flags]
* [knowledge ingest](knowledge_ingest.md) - Ingest a file/directory into a dataset
* [knowledge list-datasets](knowledge_list-datasets.md) - List existing datasets
* [knowledge retrieve](knowledge_retrieve.md) - Retrieve sources for a query from a dataset
* [knowledge server](knowledge_server.md) -
* [knowledge version](knowledge_version.md) -

3 changes: 3 additions & 0 deletions docs/docs/99-cmd/knowledge_askdir.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ knowledge askdir [--path <path>] <query> [flags]
--flows-file string Path to a YAML/JSON file containing ingestion/retrieval flows ($KNOW_FLOWS_FILE)
-h, --help help for askdir
--ignore-extensions string Comma-separated list of file extensions to ignore ($KNOW_INGEST_IGNORE_EXTENSIONS)
--ignore-file string Path to a .gitignore style file ($KNOW_INGEST_IGNORE_FILE)
--include-hidden Include hidden files and directories ($KNOW_INGEST_INCLUDE_HIDDEN)
--no-create-dataset Do NOT create the dataset if it doesn't exist ($KNOW_INGEST_NO_CREATE_DATASET)
--no-recursive Don't recursively ingest directories ($KNOW_NO_INGEST_RECURSIVE)
-p, --path string Path to the directory to query ($KNOWLEDGE_CLIENT_ASK_DIR_PATH) (default ".")
--server string URL of the Knowledge API Server ($KNOW_SERVER_URL)
Expand Down
5 changes: 3 additions & 2 deletions docs/docs/99-cmd/knowledge_import.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ Import one or more datasets from an archive (zip) (default: all datasets)

Import one or more datasets from an archive (zip) (default: all datasets).
## IMPORTANT: Embedding functions
Embedding functions are not part of exported knowledge base archives, so you need to know which embedding function was used in order to work with an imported archive.
This primarily concerns the choice of the embedding provider (model).
When data is first ingested into a dataset, the embedding provider configured at that time is attached to the dataset.
All subsequent ingestion actions must use the same embedding provider to keep the embeddings consistent.
Most of the time, only the model has to match, as it usually determines the embedding dimensionality.
Note: This is only relevant if you plan to add more documents to the dataset after importing it.


Expand Down
15 changes: 15 additions & 0 deletions docs/docs/99-cmd/knowledge_ingest.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ title: "knowledge ingest"

Ingest a file/directory into a dataset

### Synopsis

Ingest a file or directory into a dataset.

## Important Note

The first time you ingest something into a dataset, the embedding function (model provider) you chose is attached to that dataset.
From then on, the client must use that same embedding function to ingest into this dataset.
Usually only the choice of model matters, as it commonly determines the embedding dimensionality.
This is a constraint of the vector database and similarity search: different models yield embedding vectors of different sizes and represent semantics differently.
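The dimensionality constraint can be illustrated with cosine similarity, which is undefined for vectors of different lengths (an illustrative sketch, not code from this repository):

```go
package main

import (
	"fmt"
	"math"
)

// cosine computes the cosine similarity of two embedding vectors. It fails
// on a dimension mismatch — the reason a dataset is tied to one model.
func cosine(a, b []float64) (float64, error) {
	if len(a) != len(b) {
		return 0, fmt.Errorf("dimension mismatch: %d vs %d", len(a), len(b))
	}
	var dot, na, nb float64
	for i := range a {
		dot += a[i] * b[i]
		na += a[i] * a[i]
		nb += b[i] * b[i]
	}
	return dot / (math.Sqrt(na) * math.Sqrt(nb)), nil
}

func main() {
	s, err := cosine([]float64{1, 0}, []float64{1, 0})
	fmt.Println(s, err)
	// Mixing embeddings from models with different output sizes fails:
	_, err = cosine([]float64{1, 0}, []float64{1, 0, 0})
	fmt.Println(err)
}
```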


```
knowledge ingest [--dataset <dataset-id>] <path> [flags]
```
Expand All @@ -22,6 +34,9 @@ knowledge ingest [--dataset <dataset-id>] <path> [flags]
--flows-file string Path to a YAML/JSON file containing ingestion/retrieval flows ($KNOW_FLOWS_FILE)
-h, --help help for ingest
--ignore-extensions string Comma-separated list of file extensions to ignore ($KNOW_INGEST_IGNORE_EXTENSIONS)
--ignore-file string Path to a .gitignore style file ($KNOW_INGEST_IGNORE_FILE)
--include-hidden Include hidden files and directories ($KNOW_INGEST_INCLUDE_HIDDEN)
--no-create-dataset Do NOT create the dataset if it doesn't exist ($KNOW_INGEST_NO_CREATE_DATASET)
--no-recursive Don't recursively ingest directories ($KNOW_NO_INGEST_RECURSIVE)
--server string URL of the Knowledge API Server ($KNOW_SERVER_URL)
--textsplitter-chunk-overlap int Textsplitter Chunk Overlap ($KNOW_TEXTSPLITTER_CHUNK_OVERLAP) (default 256)
Expand Down
2 changes: 2 additions & 0 deletions examples/.knowignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!**/*.yaml
6 changes: 5 additions & 1 deletion examples/configfiles/embedding_provider.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ embeddings:
apiKey: "${GOOGLE_API_KEY}"
project: "acorn-io"
# apiEndpoint: https://us-central1-aiplatform.googleapis.com
model: "text-embedding-004"
model: "text-embedding-004"
- name: local
type: ollama
config:
model: mxbai-embed-large
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ require (
github.com/gen2brain/go-fitz v1.23.7
github.com/gin-gonic/gin v1.10.0
github.com/glebarez/sqlite v1.11.0
github.com/go-viper/mapstructure/v2 v2.0.0-alpha.1
github.com/go-git/go-git/v5 v5.12.0
github.com/google/uuid v1.6.0
github.com/hupe1980/golc v0.0.112
github.com/joho/godotenv v1.5.1
Expand Down Expand Up @@ -84,7 +84,6 @@ require (
github.com/glebarez/go-sqlite v1.21.2 // indirect
github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect
github.com/go-git/go-billy/v5 v5.5.0 // indirect
github.com/go-git/go-git/v5 v5.12.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/spec v0.21.0 // indirect
Expand All @@ -93,6 +92,7 @@ require (
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.20.0 // indirect
github.com/go-resty/resty/v2 v2.3.0 // indirect
github.com/go-viper/mapstructure/v2 v2.0.0-alpha.1 // indirect
github.com/gobwas/ws v1.2.1 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
Expand Down
6 changes: 4 additions & 2 deletions pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ type IngestPathsOpts struct {
Recursive bool
TextSplitterOpts *textsplitter.TextSplitterOpts
IngestionFlows []flows.IngestionFlow
CreateDataset bool
IgnoreFile string
IncludeHidden bool
NoCreateDataset bool
}

type Client interface {
CreateDataset(ctx context.Context, datasetID string) (types.Dataset, error)
CreateDataset(ctx context.Context, datasetID string) (*index.Dataset, error)
DeleteDataset(ctx context.Context, datasetID string) error
GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error)
ListDatasets(ctx context.Context) ([]types.Dataset, error)
Expand Down
122 changes: 104 additions & 18 deletions pkg/client/common.go
Original file line number Diff line number Diff line change
@@ -1,30 +1,86 @@
package client

import (
"bufio"
"context"
"crypto/sha1"
"encoding/hex"
"fmt"
"github.com/go-git/go-git/v5/plumbing/format/gitignore"
"github.com/gptscript-ai/knowledge/pkg/datastore"
remotes "github.com/gptscript-ai/knowledge/pkg/datastore/documentloader/remote"
dstypes "github.com/gptscript-ai/knowledge/pkg/datastore/types"
"github.com/gptscript-ai/knowledge/pkg/index"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
"log/slog"
"os"
"path/filepath"
"slices"

"github.com/gptscript-ai/knowledge/pkg/datastore"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
"strings"
)

func isIgnored(ignore gitignore.Matcher, path string) bool {
return ignore.Match(strings.Split(path, string(filepath.Separator)), false)
}

func checkIgnored(path string, ignoreExtensions []string) bool {
ext := filepath.Ext(path)
slog.Debug("checking path", "path", path, "ext", ext, "ignore", ignoreExtensions)
return slices.Contains(ignoreExtensions, ext)
}

func readIgnoreFile(path string) ([]gitignore.Pattern, error) {
stat, err := os.Stat(path)
if err != nil {
return nil, fmt.Errorf("failed to stat ignore file %q: %w", path, err)
}

if stat.IsDir() {
return nil, fmt.Errorf("ignore file %q is a directory", path)
}

f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("failed to open ignore file %q: %w", path, err)
}
defer f.Close()

var ps []gitignore.Pattern
scanner := bufio.NewScanner(f)
for scanner.Scan() {
s := scanner.Text()
// Skip comments and blank lines, as in .gitignore files
if !strings.HasPrefix(s, "#") && len(strings.TrimSpace(s)) > 0 {
ps = append(ps, gitignore.ParsePattern(s, nil))
}
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("failed to read ignore file %q: %w", path, err)
}

return ps, nil
}

func ingestPaths(ctx context.Context, opts *IngestPathsOpts, ingestionFunc func(path string) error, paths ...string) (int, error) {

ingestedFilesCount := 0

var ignorePatterns []gitignore.Pattern
var err error
if opts.IgnoreFile != "" {
ignorePatterns, err = readIgnoreFile(opts.IgnoreFile)
if err != nil {
return ingestedFilesCount, fmt.Errorf("failed to read ignore file %q: %w", opts.IgnoreFile, err)
}
}

if len(opts.IgnoreExtensions) > 0 {
for _, ext := range opts.IgnoreExtensions {
p := "*." + strings.TrimPrefix(ext, ".")
ignorePatterns = append(ignorePatterns, gitignore.ParsePattern(p, nil))
}
}

slog.Debug("Ignore patterns", "patterns", ignorePatterns)

ignore := gitignore.NewMatcher(ignorePatterns)

if opts.Concurrency < 1 {
opts.Concurrency = 10
}
Expand All @@ -35,6 +91,22 @@ func ingestPaths(ctx context.Context, opts *IngestPathsOpts, ingestionFunc func(
for _, p := range paths {
path := p

if strings.HasPrefix(path, ".") && !opts.IncludeHidden {
slog.Debug("Ignoring hidden path", "path", path)
continue
}

if remotes.IsRemote(path) {
// Load remote files
remotePath, err := remotes.LoadRemote(path)
if err != nil {
return ingestedFilesCount, fmt.Errorf("failed to load from remote %q: %w", path, err)
}
path = remotePath
}

fileInfo, err := os.Stat(path)
if err != nil {
return ingestedFilesCount, fmt.Errorf("failed to get file info for %s: %w", path, err)
Expand All @@ -55,8 +127,8 @@ func ingestPaths(ctx context.Context, opts *IngestPathsOpts, ingestionFunc func(
}
return nil
}
if checkIgnored(subPath, opts.IgnoreExtensions) {
slog.Debug("Skipping ingestion of file", "path", subPath, "reason", "extension ignored")
if isIgnored(ignore, subPath) {
slog.Debug("Ignoring file", "path", subPath, "ignorefile", opts.IgnoreFile, "ignoreExtensions", opts.IgnoreExtensions)
return nil
}

Expand All @@ -68,6 +140,7 @@ func ingestPaths(ctx context.Context, opts *IngestPathsOpts, ingestionFunc func(
defer sem.Release(1)

ingestedFilesCount++
slog.Debug("Ingesting file", "path", sp)
return ingestionFunc(sp)
})
return nil
Expand All @@ -76,8 +149,8 @@ func ingestPaths(ctx context.Context, opts *IngestPathsOpts, ingestionFunc func(
return ingestedFilesCount, err
}
} else {
if checkIgnored(path, opts.IgnoreExtensions) {
slog.Debug("Skipping ingestion of file", "path", path, "reason", "extension ignored")
if isIgnored(ignore, path) {
slog.Debug("Ignoring file", "path", path, "ignorefile", opts.IgnoreFile, "ignoreExtensions", opts.IgnoreExtensions)
continue
}
// Process a file directly
Expand Down Expand Up @@ -124,23 +197,16 @@ func AskDir(ctx context.Context, c Client, path string, query string, opts *Inge
datasetID := HashPath(abspath)
slog.Debug("Directory Dataset ID hashed", "path", abspath, "id", datasetID)

// check if dataset exists
dataset, err := c.GetDataset(ctx, datasetID)
_, err = getOrCreateDataset(ctx, c, datasetID, true)
if err != nil {
return nil, fmt.Errorf("failed to get dataset %q: %w", datasetID, err)
}
if dataset == nil {
// create dataset
_, err := c.CreateDataset(ctx, datasetID)
if err != nil {
return nil, fmt.Errorf("failed to create dataset %q: %w", datasetID, err)
}
return nil, err
}

// ingest files
if opts == nil {
opts = &IngestPathsOpts{}
}

ingested, err := c.IngestPaths(ctx, datasetID, opts, path)
if err != nil {
return nil, fmt.Errorf("failed to ingest files: %w", err)
Expand All @@ -150,3 +216,23 @@ func AskDir(ctx context.Context, c Client, path string, query string, opts *Inge
// retrieve documents
return c.Retrieve(ctx, datasetID, query, *ropts)
}

func getOrCreateDataset(ctx context.Context, c Client, datasetID string, create bool) (*index.Dataset, error) {
ds, err := c.GetDataset(ctx, datasetID)
if err != nil {
return nil, err
}
if ds == nil {
if create {
ds, err = c.CreateDataset(ctx, datasetID)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("dataset %q not found", datasetID)
}
}
return ds, nil
}