This repository has been archived by the owner on Oct 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add: dataset metadata + routing retriever (#40)
- Loading branch information
1 parent
26373f2
commit f5559ed
Showing
16 changed files
with
318 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Example flow configuration that selects the "routing" retriever for the
# default flow "foo".
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm it against the original file before relying on it.
flows:
  foo:
    default: true
    retrieval:
      retriever:
        name: routing
        options:
          # LLM used to pick the dataset a query is routed to.
          model:
            openai:
              apiKey: "${OPENAI_API_KEY}"
              model: gpt-4o
              apiType: OPEN_AI
              apiBase: https://api.openai.com/v1
          # Number of documents requested from the similarity search.
          topK: 6
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package cmd | ||
|
||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"github.com/gptscript-ai/knowledge/pkg/datastore" | ||
"github.com/gptscript-ai/knowledge/pkg/index" | ||
"github.com/spf13/cobra" | ||
) | ||
|
||
type ClientEditDataset struct { | ||
Client | ||
ResetMetadata bool `usage:"reset metadata to default (empty)"` | ||
UpdateMetadata map[string]string `usage:"update metadata key-value pairs (existing metadata will be updated/preserved)"` | ||
ReplaceMetadata map[string]string `usage:"replace metadata with key-value pairs (existing metadata will be removed)"` | ||
} | ||
|
||
func (s *ClientEditDataset) Customize(cmd *cobra.Command) { | ||
cmd.Use = "edit-dataset <dataset-id>" | ||
cmd.Short = "Edit an existing dataset" | ||
cmd.Args = cobra.ExactArgs(1) | ||
cmd.MarkFlagsMutuallyExclusive("reset-metadata", "update-metadata", "replace-metadata") | ||
} | ||
|
||
func (s *ClientEditDataset) Run(cmd *cobra.Command, args []string) error { | ||
c, err := s.getClient() | ||
if err != nil { | ||
return err | ||
} | ||
|
||
datasetID := args[0] | ||
|
||
// Get current dataset | ||
dataset, err := c.GetDataset(cmd.Context(), datasetID) | ||
if err != nil { | ||
return fmt.Errorf("failed to get dataset: %w", err) | ||
} | ||
|
||
if dataset == nil { | ||
fmt.Printf("dataset not found: %q\n", datasetID) | ||
return fmt.Errorf("dataset not found: %s", datasetID) | ||
} | ||
|
||
updatedDataset := index.Dataset{ | ||
ID: dataset.ID, | ||
} | ||
|
||
// Update Metadata - since flags are mutually exclusive, this should be either an empty map, or one of the update/replace maps | ||
metadata := map[string]any{} | ||
|
||
for k, v := range s.UpdateMetadata { | ||
metadata[k] = v | ||
} | ||
|
||
for k, v := range s.ReplaceMetadata { | ||
metadata[k] = v | ||
} | ||
|
||
updatedDataset.Metadata = metadata | ||
|
||
dataset, err = c.UpdateDataset(cmd.Context(), updatedDataset, &datastore.UpdateDatasetOpts{ReplaceMedata: s.ResetMetadata || len(s.ReplaceMetadata) > 0}) | ||
if err != nil { | ||
return fmt.Errorf("failed to update dataset: %w", err) | ||
} | ||
|
||
dataset.Files = nil // Don't print files | ||
|
||
jsonOutput, err := json.Marshal(dataset) | ||
if err != nil { | ||
return fmt.Errorf("failed to marshal dataset: %w", err) | ||
} | ||
|
||
fmt.Println("Updated dataset:\n", string(jsonOutput)) | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package retrievers | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"github.com/gptscript-ai/knowledge/pkg/datastore/defaults" | ||
"github.com/gptscript-ai/knowledge/pkg/datastore/store" | ||
"github.com/gptscript-ai/knowledge/pkg/llm" | ||
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore" | ||
"log/slog" | ||
) | ||
|
||
type RoutingRetriever struct { | ||
Model llm.LLMConfig | ||
AvailableDatasets []string | ||
TopK int | ||
} | ||
|
||
// routingPromptTpl is the prompt sent to the routing model. Retrieve supplies
// the "query" value (the user's query) and the "datasets" value (a JSON map of
// dataset ID to metadata) for its placeholders. The model is instructed to
// answer with a bare JSON object, which is decoded into routingResp.
var routingPromptTpl = `The following query will be used for a vector similarity search.
Please route it to the appropriate dataset. Choose the one that fits best to the query based on the metadata.
Query: "{{.query}}"
Available datasets in a JSON map, where the key is the dataset ID and the value is a map of metadata fields:
{{ .datasets }}
Reply only in the following JSON format, without any styling or markdown syntax:
{"result": "<dataset-id>"}`
|
||
// routingResp is the JSON object the routing model is asked to reply with.
type routingResp struct {
	// Result is the ID of the dataset chosen by the model.
	Result string `json:"result"`
}
|
||
func (r *RoutingRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetID string) ([]vs.Document, error) { | ||
log := slog.With("component", "RoutingRetriever") | ||
|
||
log.Debug("Ignoring input datasetID in routing retriever, as it chooses on by itself", "query", query, "inputDataset", datasetID) | ||
|
||
if r.TopK <= 0 { | ||
log.Debug("TopK not set, using default", "default", defaults.TopK) | ||
r.TopK = defaults.TopK | ||
} | ||
|
||
if len(r.AvailableDatasets) == 0 { | ||
allDatasets, err := store.ListDatasets(ctx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
for _, ds := range allDatasets { | ||
r.AvailableDatasets = append(r.AvailableDatasets, ds.ID) | ||
} | ||
} | ||
slog.Debug("Available datasets", "datasets", r.AvailableDatasets) | ||
|
||
datasets := map[string]map[string]any{} | ||
for _, dsID := range r.AvailableDatasets { | ||
dataset, err := store.GetDataset(ctx, dsID) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if dataset == nil { | ||
return nil, fmt.Errorf("dataset not found: %q", dsID) | ||
} | ||
datasets[dataset.ID] = dataset.Metadata | ||
} | ||
|
||
datasetsJSON, err := json.Marshal(datasets) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
m, err := llm.NewFromConfig(r.Model) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
result, err := m.Prompt(context.Background(), routingPromptTpl, map[string]interface{}{"query": query, "datasets": string(datasetsJSON)}) | ||
if err != nil { | ||
return nil, err | ||
} | ||
slog.Debug("Routing result", "result", result) | ||
var resp routingResp | ||
err = json.Unmarshal([]byte(result), &resp) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
slog.Debug("Routing query to dataset", "query", query, "dataset", resp.Result) | ||
|
||
return store.SimilaritySearch(ctx, query, r.TopK, resp.Result) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package store | ||
|
||
import ( | ||
"context" | ||
"github.com/gptscript-ai/knowledge/pkg/index" | ||
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore" | ||
) | ||
|
||
type Store interface { | ||
ListDatasets(ctx context.Context) ([]index.Dataset, error) | ||
GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error) | ||
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string) ([]vs.Document, error) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.