Skip to content

Commit

Permalink
Merge pull request #231 from gleanerio/dev_indentiifer_arks
Browse files Browse the repository at this point in the history
IdentifierString
  • Loading branch information
fils authored Jul 3, 2024
2 parents 256a703 + 11809d3 commit 3de7067
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 49 deletions.
63 changes: 50 additions & 13 deletions internal/common/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/ohler55/ojg/oj"
log "github.com/sirupsen/logrus"
"github.com/spf13/viper"
"net/url"
"sort"
"strings"
)
Expand All @@ -35,6 +36,7 @@ func GenerateIdentifier(v1 *viper.Viper, source config.Sources, jsonld string) (

// Generate calls also do the cascading, i.e. if IdentifierSha is [] it calls JsonSha
switch source.IdentifierType {

case config.IdentifierString:
return GenerateIdentiferString(v1, source, jsonld)
case config.IdentifierSha:
Expand All @@ -48,16 +50,16 @@ func GenerateIdentifier(v1 *viper.Viper, source config.Sources, jsonld string) (

}

func GetIdentifierByPath(jsonPath string, jsonld string) (interface{}, error) {
func GetIdentifierByPath(jsonPath string, jsonld string) ([]string, error) {
obj, err := oj.ParseString(jsonld)
if err != nil {
return "", err
return []string{}, err
}
x, err := jp.ParseString(jsonPath)
ys := x.Get(obj)

if err != nil {
return "", err
return []string{}, err
}
// we need to sort the results
aString := make([]string, len(ys))
Expand Down Expand Up @@ -90,7 +92,7 @@ has no value:
https://cburgmer.github.io/json-path-comparison/results/dot_notation_on_object_without_key.html
https://cburgmer.github.io/json-path-comparison/results/dot_notation_on_null_value.html
*/
func GetIdentiferByPaths(jsonpaths []string, jsonld string) (interface{}, string, error) {
func GetIdentiferByPaths(jsonpaths []string, jsonld string) ([]string, string, error) {
for _, jsonPath := range jsonpaths {
obj, err := GetIdentifierByPath(jsonPath, jsonld)
if err == nil {
Expand All @@ -106,21 +108,56 @@ func GetIdentiferByPaths(jsonpaths []string, jsonld string) (interface{}, string
continue
}
}
return "", "", errors.New("No Match")
return []string{}, "", errors.New("No Match")
}

func GenerateIdentiferString(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) {
uniqueid, err := GenerateIdentifierSha(v1, source, jsonld)
// url2Path returns the path component of idstring with a single leading "/"
// removed, so that a URL-style identifier can be used without its scheme and
// host.
//
// idstring is returned unchanged when it cannot be parsed as a URL, when the
// parsed URL has no path (for example an opaque identifier such as
// "doi:10.1234/abc"), or when the path consists only of "/" and would
// otherwise yield an empty identifier.
func url2Path(idstring string) string {
	u, err := url.Parse(idstring)
	if err != nil || u.Path == "" {
		return idstring
	}

	// url.Parse accepts bare strings such as "my-id" as relative paths that do
	// not start with "/". Only strip a real leading slash instead of blindly
	// dropping the first character.
	p := strings.TrimPrefix(u.Path, "/")
	if p == "" {
		// Nothing usable in the path; keep the original identifier.
		return idstring
	}
	return p
}
// encodeark rewrites an ARK identifier by replacing the first ":/" and then
// the first remaining "/" with "_". Any further slashes are left as they are.
func encodeark(arkid string) string {
	// Order matters: ":/" must be handled before the bare "/".
	for _, sep := range []string{":/", "/"} {
		arkid = strings.Replace(arkid, sep, "_", 1)
	}
	return arkid
}
// safeEncodeString reduces idstring to its URL path part via url2Path and, if
// the result starts with "ark", additionally passes it through encodeark.
func safeEncodeString(idstring string) string {
	// If the identifier is a URL, only its path part is kept.
	path := url2Path(idstring)
	if !strings.HasPrefix(path, "ark") {
		return path
	}
	return encodeark(path)
}

func GenerateIdentiferString(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) {
// generate a file sha, if there is an error, we stop, or use it later
filesha, err := GenerateFileSha(v1, jsonld)
if err != nil {
return uniqueid, err
return filesha, err
}
if uniqueid.MatchedString != "" {
uniqueid.UniqueId = uniqueid.MatchedString
uniqueid.IdentifierType = config.IdentifierString

jsonpath := []string{"$['@id']", "$['@graph'][?(@['@type']=='schema:Dataset')]['@id']", "$.url"}
uniqueid, foundPath, err := GetIdentiferByPaths(jsonpath, jsonld)
if err == nil && fmt.Sprint(uniqueid) != "[]" {
safestring := safeEncodeString(uniqueid[0])
id := Identifier{UniqueId: fmt.Sprint(safestring),
IdentifierType: config.IdentifierString,
MatchedPath: foundPath,
MatchedString: fmt.Sprint(uniqueid),
JsonSha: filesha.JsonSha,
}
return id, err
} else {
log.Info(config.IdentifierSha, "Action: Getting normalized sha Error:", err)
// generate a filesha
return filesha, err
}
return uniqueid, err

}

func GenerateIdentifierSha(v1 *viper.Viper, source config.Sources, jsonld string) (Identifier, error) {
Expand All @@ -145,7 +182,7 @@ func GenerateIdentifierSha(v1 *viper.Viper, source config.Sources, jsonld string
}
uniqueid, foundPath, err := GetIdentiferByPaths(jsonpath, jsonld)

if err == nil && uniqueid != "[]" {
if err == nil && fmt.Sprint(uniqueid) != "[]" {
id := Identifier{UniqueId: GetSHA(fmt.Sprint(uniqueid)),
IdentifierType: config.IdentifierSha,
MatchedPath: foundPath,
Expand Down
45 changes: 34 additions & 11 deletions internal/summoner/acquire/googledrive.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/gleanerio/gleaner/internal/common"
configTypes "github.com/gleanerio/gleaner/internal/config"
"github.com/gleanerio/gleaner/internal/millers/graph"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
"github.com/spf13/viper"
"google.golang.org/api/googleapi"
Expand Down Expand Up @@ -280,6 +281,16 @@ func GetFromGDrive(mc *minio.Client, v1 *viper.Viper) (string, error) {
//var results []*drive.File
var results []string
for _, s := range domains {
//runStats := common.NewRunStats()
//c := make(chan os.Signal)
//signal.Notify(c, os.Interrupt, syscall.SIGTERM)
//go func() {
// <-c
// runStats.StopReason = "User Interrupt or Fatal Error"
// summoner.RunStatsOutput(runStats)
// os.Exit(1)
//}()

//serviceJson := os.Getenv(s.CredentialsFile) // just use separate files for all credentials
serviceJson := s.CredentialsFile
srv, err := GetDriveCredentials(serviceJson)
Expand All @@ -291,18 +302,27 @@ func GetFromGDrive(mc *minio.Client, v1 *viper.Viper) (string, error) {
fn := filepath.Base(u.Path)
log.Info("reading google folder id:", fn)
l, err := GetFileList(srv, fn, false, "")

for _, f := range l {
count := len(l)
bar := progressbar.Default(int64(count))
for i, f := range l {
bar.Add(1)
//results = append(results,f)
log.Infof("uploading %i of %i %s\n", i, len(l), f.Name)
o, err := gfileProcessing(mc, v1, srv, f, s.Name, bucketName)
if err != nil {
continue
}
results = append(results, o)
//if i >= len(l)-i {
// break
//}
}
log.Info(" googledrive source %s complete", s.Name)
fmt.Printf(" googledrive source %s complete", s.Name)
}
var count = len(results)
m := fmt.Sprintf("GoogleDrives %d files processed", count)
log.Info("GoogleDrives %d files processed", count)
return m, err
}

Expand All @@ -316,15 +336,18 @@ func gfileProcessing(mc *minio.Client, v1 *viper.Viper, srv *drive.Service, f *d

// TODO, how do we quickly validate the JSON-LD files to make sure it is at least formatted well

sha := common.GetSHA(contents) // Don't normalize big files..

// Upload the file
//sha := common.GetSHA(contents) // Don't normalize big files..
//sources, err := configTypes.GetSources(v1)
//source, err := configTypes.GetSourceByName(sources,sourceName )
//
//identifier, err := common.GenerateIdentiferString(v1,*source,contents)
//// Upload the file
log.Info(" file", f.Name, "downloaded. Uploading to", bucketName, ":", sourceName)

objectName := fmt.Sprintf("summoned/%s/%s.jsonld", sourceName, sha)
_, err = graph.LoadToMinio(contents, bucketName, objectName, mc)
//objectName := fmt.Sprintf("summoned/%s/%s.jsonld", sourceName, fileId)
sha, err := Upload(v1, mc, bucketName, sourceName, f.Name, contents)
if err != nil {
return objectName, err
return sha, err
}
log.Info(" file", f.Name, "uploaded to", bucketName, "Uploaded :", sourceName)
// mill the json-ld to nq and upload to minio
Expand All @@ -342,17 +365,17 @@ func gfileProcessing(mc *minio.Client, v1 *viper.Viper, srv *drive.Service, f *d
milledName := fmt.Sprintf("milled/%s/%s.rdf", sourceName, sha)
_, err = graph.LoadToMinio(rdf, bucketName, milledName, mc)
if err != nil {
return objectName, err
return f.Name, err
}
log.Info("Processed files Upload to", milledName, "complete:", sourceName)

// build prov
log.Debug("Building prov")
err = StoreProvNG(v1, mc, sourceName, sha, sourceName, "summoned")
if err != nil {
return objectName, err
return f.Name, err
}

log.Info("Loaded:", len(contents))
return objectName, err
return f.Name, err
}
25 changes: 0 additions & 25 deletions runConfigurations/cli batch --cfgName ecrr (1).run.xml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
</ENTRIES>
</EXTENSION>
<kind value="FILE" />
<package value="github.com/gleanerio/gleaner" />
<directory value="$PROJECT_DIR$" />
<filePath value="$PROJECT_DIR$/cmd/glcon/main.go" />
<method v="2" />
Expand Down

0 comments on commit 3de7067

Please sign in to comment.