-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheprint2citation.go
134 lines (128 loc) · 4.46 KB
/
eprint2citation.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package irdmtools
import (
"fmt"
"io"
"log"
"path"
"strings"
"time"
// Caltech Library packages
"github.com/caltechlibrary/dataset/v2"
"github.com/caltechlibrary/eprinttools"
)
// This will talk to an EPrints3 database and retrieve an EPrint record
// and output a CiteProc record in JSON format.
// EPrintToCitation converts a single EPrint record into a single Citation
// struct by crosswalking it with Citation.CrosswalkEPrint.
//
// The URL handed to the crosswalk is chosen in this order:
//
//  1. eprint.ID, when it starts with "http";
//  2. eprint.OfficialURL, when it is not empty;
//  3. "http://<repoHost>/<eprint.EPrintID>", the way an EPrint URL is
//     actually formed.
//
// NOTE: the first two cases deal with the squirrelly situation of which URLs
// to use during the migration and before the feeds v2.0 implementation.
//
// resourceTypes and contributorTypes are accepted but not consulted by this
// function. The citation is returned together with whatever error the
// crosswalk reported, so callers must check the error before using it.
func EPrintToCitation(repoName string, key string, eprint *eprinttools.EPrint, repoHost string, resourceTypes map[string]string, contributorTypes map[string]string) (*Citation, error) {
	var eprintURL string
	switch {
	case strings.HasPrefix(eprint.ID, "http"):
		eprintURL = eprint.ID
	case eprint.OfficialURL != "":
		eprintURL = eprint.OfficialURL
	default:
		eprintURL = fmt.Sprintf("http://%s/%d", repoHost, eprint.EPrintID)
	}
	citation := new(Citation)
	err := citation.CrosswalkEPrint(repoName, key, eprintURL, eprint)
	return citation, err
}
// MigrateEPrintDatasetToCitationDataset takes a dataset of EPrint objects and
// migrates the ones in the id list to a citation dataset collection.
//
// Parameters:
//   - ep3CName: name of the EPrint dataset collection to read from. Its base
//     name with any ".ds" suffix removed is used as the repository name.
//   - ids: keys of the EPrint records to migrate.
//   - repoHost: hostname passed to EPrintToCitation to form the fallback URL.
//   - prefix: when not empty, citations are saved under "<repoName>:<id>"
//     instead of "<id>".
//   - citeCName: name of the citation dataset collection to write to.
//
// An error is returned only when one of the two collections cannot be opened.
// Per-record problems (read failure, status other than "archive", conversion
// failure, save failure) are logged and do not stop the run.
func MigrateEPrintDatasetToCitationDataset(ep3CName string, ids []string, repoHost string, prefix string, citeCName string) error {
	ep3, err := dataset.Open(ep3CName)
	if err != nil {
		return err
	}
	defer ep3.Close()
	cite, err := dataset.Open(citeCName)
	if err != nil {
		return err
	}
	defer cite.Close()

	// The repository name depends only on the collection name, so it is
	// computed once instead of on every pass through the loop.
	repoName := path.Base(strings.TrimSuffix(ep3CName, ".ds"))

	resourceTypes := map[string]string{}
	contributorTypes := map[string]string{}
	tot := len(ids)
	start := time.Now()
	iTime := time.Now()
	reportProgress := false
	// i counts the records that reached the save step; records skipped by a
	// `continue` below are not counted.
	i := 0
	log.Printf("%d/%d citations processed %s: %s", i, tot, time.Since(start).Truncate(time.Second).String(), ProgressETA(start, i, tot))
	for _, id := range ids {
		eprint := new(eprinttools.EPrint)
		if err := ep3.ReadObject(id, eprint); err != nil {
			log.Printf("failed to get %s (%d), %s", id, i, err)
			continue
		}
		if eprint.EPrintStatus != "archive" {
			log.Printf("skipping, status = %q, %s (%d)", eprint.EPrintStatus, id, i)
			continue
		}
		// NOTE: we want to maintain the resource type map found in the existing
		// EPrints dataset collection. We do that by accruing resourceTypes from
		// each eprint record retrieved. contributorTypes is passed along as is;
		// nothing in this function adds entries to it.
		if _, ok := resourceTypes[eprint.Type]; !ok {
			resourceTypes[eprint.Type] = eprint.Type
		}
		// key is the key the citation is stored under in the citation collection.
		// NOTE(review): prefix is only tested for being non-empty; the key is
		// built from repoName, not from the prefix value. Confirm this is intended.
		key := id
		if prefix != "" {
			key = fmt.Sprintf("%s:%s", repoName, id)
		}
		// The crosswalk receives the bare id, not the (possibly prefixed) key.
		citation, err := EPrintToCitation(repoName, id, eprint, repoHost, resourceTypes, contributorTypes)
		if err != nil {
			log.Printf("failed to convert (%d) id %s from %s to citation, %s", i, id, repoName, err)
			continue
		}
		if cite.HasKey(key) {
			err = cite.UpdateObject(key, citation)
		} else {
			err = cite.CreateObject(key, citation)
		}
		if err != nil {
			// A failed save is logged, but the record is still counted below.
			log.Printf("failed to save citation for %s (%d), %s", id, i, err)
		}
		i++
		// Report progress when CheckWaitInterval signals it or every 10000 records.
		if iTime, reportProgress = CheckWaitInterval(iTime, time.Minute); reportProgress || (i%10000) == 0 {
			log.Printf("%d/%d citations processed %s: %s", i, tot, time.Since(start).Truncate(time.Second).String(), ProgressETA(start, i, tot))
		}
	}
	log.Printf("%d/%d citations processed %s: completed", i, tot, time.Since(start).Truncate(time.Second).String())
	return nil
}
// RunEPrintDSToCitationDS migrates contents from an EPrint dataset collection
// to a citation dataset collection for a given list of ids and repository
// hostname.
//
// args must hold the EPrint collection name followed by the citation
// collection name; any further elements are treated as record ids. The ids
// parameter is appended after those. in and out are not used by this
// function; problems are reported on eout.
//
// It returns 0 on success and 1 when the collection names are missing, when
// there are no ids to process, or when the migration returns an error.
func RunEPrintDSToCitationDS(in io.Reader, out io.Writer, eout io.Writer, args []string, repoHost string, prefix string, ids []string) int {
	switch len(args) {
	case 0:
		fmt.Fprintf(eout, "missing eprint collection name and citation collection name\n")
		return 1
	case 1:
		fmt.Fprintf(eout, "missing or eprint or citation collection names\n")
		return 1
	}
	ep3CName, citeCName := args[0], args[1]

	// Ids given on the command line come first, then the ones passed in ids.
	var keys []string
	if len(args) > 2 {
		keys = args[2:]
	}
	keys = append(keys, ids...)
	if len(keys) == 0 {
		fmt.Fprintf(eout, "no ids to process, aborting\n")
		return 1
	}

	err := MigrateEPrintDatasetToCitationDataset(ep3CName, keys, repoHost, prefix, citeCName)
	if err != nil {
		fmt.Fprintf(eout, "%s\n", err)
		return 1
	}
	return 0 // OK
}