From 2b0a9385d6d7e4b14a2a72128aec0e12c5ea2807 Mon Sep 17 00:00:00 2001 From: David Valentine Date: Wed, 26 Jul 2023 14:29:58 -0700 Subject: [PATCH] update to googledrive and identifier for identifierstring creation from ark's --- internal/common/identifier.go | 61 +++++++++++++++++++----- internal/summoner/acquire/googledrive.go | 45 ++++++++++++----- 2 files changed, 82 insertions(+), 24 deletions(-) diff --git a/internal/common/identifier.go b/internal/common/identifier.go index 326372c1..f544a24a 100644 --- a/internal/common/identifier.go +++ b/internal/common/identifier.go @@ -16,6 +16,7 @@ import ( "github.com/ohler55/ojg/oj" log "github.com/sirupsen/logrus" "github.com/spf13/viper" + "net/url" "sort" "strings" ) @@ -35,6 +36,7 @@ func GenerateIdentifier(v1 *viper.Viper, source config.Sources, jsonld string) ( // Generate calls also do the casecading aka if IdentifierSha is [] it calls JsonSha switch source.IdentifierType { + case config.IdentifierString: return GenerateIdentiferString(v1, source, jsonld) case config.IdentifierSha: @@ -48,16 +50,16 @@ func GenerateIdentifier(v1 *viper.Viper, source config.Sources, jsonld string) ( } -func GetIdentifierByPath(jsonPath string, jsonld string) (interface{}, error) { +func GetIdentifierByPath(jsonPath string, jsonld string) ([]string, error) { obj, err := oj.ParseString(jsonld) if err != nil { - return "", err + return []string{}, err } x, err := jp.ParseString(jsonPath) ys := x.Get(obj) if err != nil { - return "", err + return []string{}, err } // we need to sort the results aString := make([]string, len(ys)) @@ -90,7 +92,7 @@ has no value: https://cburgmer.github.io/json-path-comparison/results/dot_notation_on_object_without_key.html https://cburgmer.github.io/json-path-comparison/results/dot_notation_on_null_value.html */ -func GetIdentiferByPaths(jsonpaths []string, jsonld string) (interface{}, string, error) { +func GetIdentiferByPaths(jsonpaths []string, jsonld string) ([]string, string, error) { for _, jsonPath := range jsonpaths { obj, err := GetIdentifierByPath(jsonPath, jsonld) if err == nil { @@ -106,21 +108,54 @@ func GetIdentiferByPaths(jsonpaths []string, jsonld string) (interface{}, string continue } } - return "", "", errors.New("No Match") + return []string{}, "", errors.New("No Match") } -func GenerateIdentiferString(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) { - uniqueid, err := GenerateIdentifierSha(v1, source, jsonld) +func url2Path(idstring string) string { + u, err := url.Parse(idstring) + if err != nil { + return idstring + } + return u.Path[1:] +} +func encodeark(arkid string) string { + arkid = strings.Replace(arkid, ":/", "_", 1) + arkid = strings.Replace(arkid, "/", "_", 1) + return arkid +} +func safeEncodeString(idstring string) string { + // first see it is a url, if so, then take the path part. + u := url2Path(idstring) + if strings.HasPrefix(u, "ark") { + u = encodeark(u) + } + return u +} +func GenerateIdentiferString(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) { + // generate a file sha, if there is an error, we stop, or use it later + filesha, err := GenerateFileSha(v1, jsonld) if err != nil { - return uniqueid, err + return filesha, err } - if uniqueid.MatchedString != "" { - uniqueid.UniqueId = uniqueid.MatchedString - uniqueid.IdentifierType = config.IdentifierString + jsonpath := []string{"$['@id']", "$['@graph'][?(@['@type']=='schema:Dataset')]['@id']", "$.url"} + uniqueid, foundPath, err := GetIdentiferByPaths(jsonpath, jsonld) + if err == nil && fmt.Sprint(uniqueid) != "[]" { + safestring := safeEncodeString(uniqueid[0]) + id := Identifier{UniqueId: fmt.Sprint(safestring), + IdentifierType: config.IdentifierString, + MatchedPath: foundPath, + MatchedString: fmt.Sprint(uniqueid), + JsonSha: filesha.JsonSha, + } + return id, err + } else { + log.Info(config.IdentifierSha, "Action: Getting normalized sha Error:", err) + // generate a filesha + return filesha, err } - return uniqueid, err + } func GenerateIdentifierSha(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) { @@ -145,7 +180,7 @@ func GenerateIdentifierSha(v1 *viper.Viper, source config.Sources, jsonld string } uniqueid, foundPath, err := GetIdentiferByPaths(jsonpath, jsonld) - if err == nil && uniqueid != "[]" { + if err == nil && fmt.Sprint(uniqueid) != "[]" { id := Identifier{UniqueId: GetSHA(fmt.Sprint(uniqueid)), IdentifierType: config.IdentifierSha, MatchedPath: foundPath, diff --git a/internal/summoner/acquire/googledrive.go b/internal/summoner/acquire/googledrive.go index 74dcc0cb..1076e0c9 100644 --- a/internal/summoner/acquire/googledrive.go +++ b/internal/summoner/acquire/googledrive.go @@ -7,6 +7,7 @@ import ( "github.com/gleanerio/gleaner/internal/common" configTypes "github.com/gleanerio/gleaner/internal/config" "github.com/gleanerio/gleaner/internal/millers/graph" + "github.com/schollz/progressbar/v3" log "github.com/sirupsen/logrus" "github.com/spf13/viper" "google.golang.org/api/googleapi" @@ -280,6 +281,16 @@ func GetFromGDrive(mc *minio.Client, v1 *viper.Viper) (string, error) { //var results []*drive.File var results []string for _, s := range domains { + //runStats := common.NewRunStats() + //c := make(chan os.Signal) + //signal.Notify(c, os.Interrupt, syscall.SIGTERM) + //go func() { + // <-c + // runStats.StopReason = "User Interrupt or Fatal Error" + // summoner.RunStatsOutput(runStats) + // os.Exit(1) + //}() + //serviceJson := os.Getenv(s.CredentialsFile) // just use separate files for all credentials serviceJson := s.CredentialsFile srv, err := GetDriveCredentials(serviceJson) @@ -291,18 +302,27 @@ func GetFromGDrive(mc *minio.Client, v1 *viper.Viper) (string, error) { fn := filepath.Base(u.Path) log.Info("reading google folder id:", fn) l, err := GetFileList(srv, fn, false, "") - - for _, f := range l { + count := len(l) + bar := progressbar.Default(int64(count)) + for i, f := range l { + bar.Add(1) //results = append(results,f) + log.Infof("uploading %i of %i %s\n", i, len(l), f.Name) o, err := gfileProcessing(mc, v1, srv, f, s.Name, bucketName) if err != nil { continue } results = append(results, o) + //if i >= len(l)-i { + // break + //} } + log.Info(" googledrive source %s complete", s.Name) + fmt.Printf(" googledrive source %s complete", s.Name) } var count = len(results) m := fmt.Sprintf("GoogleDrives %d files processed", count) + log.Info("GoogleDrives %d files processed", count) return m, err } @@ -316,15 +336,18 @@ func gfileProcessing(mc *minio.Client, v1 *viper.Viper, srv *drive.Service, f *d // TODO, how do we quickly validate the JSON-LD files to make sure it is at least formatted well - sha := common.GetSHA(contents) // Don't normalize big files.. - - // Upload the file + //sha := common.GetSHA(contents) // Don't normalize big files.. + //sources, err := configTypes.GetSources(v1) + //source, err := configTypes.GetSourceByName(sources,sourceName ) + // + //identifier, err := common.GenerateIdentiferString(v1,*source,contents) + //// Upload the file log.Info(" file", f.Name, "downloaded. Uploading to", bucketName, ":", sourceName) - objectName := fmt.Sprintf("summoned/%s/%s.jsonld", sourceName, sha) - _, err = graph.LoadToMinio(contents, bucketName, objectName, mc) + //objectName := fmt.Sprintf("summoned/%s/%s.jsonld", sourceName, fileId) + sha, err := Upload(v1, mc, bucketName, sourceName, f.Name, contents) if err != nil { - return objectName, err + return sha, err } log.Info(" file", f.Name, "uploaded to", bucketName, "Uploaded :", sourceName) // mill the json-ld to nq and upload to minio @@ -342,7 +365,7 @@ func gfileProcessing(mc *minio.Client, v1 *viper.Viper, srv *drive.Service, f *d milledName := fmt.Sprintf("milled/%s/%s.rdf", sourceName, sha) _, err = graph.LoadToMinio(rdf, bucketName, milledName, mc) if err != nil { - return objectName, err + return f.Name, err } log.Info("Processed files Upload to", milledName, "complete:", sourceName) @@ -350,9 +373,9 @@ func gfileProcessing(mc *minio.Client, v1 *viper.Viper, srv *drive.Service, f *d log.Debug("Building prov") err = StoreProvNG(v1, mc, sourceName, sha, sourceName, "summoned") if err != nil { - return objectName, err + return f.Name, err } log.Info("Loaded:", len(contents)) - return objectName, err + return f.Name, err }