forked from OBOFoundry/OBOFoundry.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-metadata.py
executable file
·242 lines (203 loc) · 7.81 KB
/
extract-metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3
__author__ = 'cjm'
import argparse
import logging
#from yaml import load, dump
#from yaml import Loader, Dumper
import yaml
def main(argv=None):
    """
    Command-line entry point: parse the arguments and dispatch to a sub-command.

    Sub-commands:
      validate           -- validate the yaml embedded inside markdown files
      concat             -- concatenate ontology yamls into a single yaml file
      concat-principles  -- concatenate principles yamls into a single yaml file

    Arguments:
      argv -- optional list of argument strings; when None (the default)
              argparse reads the arguments from sys.argv

    Exits with a usage error when no sub-command is given.
    """
    # The original adjacent literals 'OBO' 'Helper utils for OBO' were implicitly
    # concatenated into "OBOHelper utils for OBO"
    parser = argparse.ArgumentParser(description='Helper utils for OBO',
                                     formatter_class=argparse.RawTextHelpFormatter)

    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

    # SUBCOMMAND
    parser_n = subparsers.add_parser('validate', help='validate yml inside md')
    parser_n.set_defaults(function=validate_markdown)
    parser_n.add_argument('files', nargs='*')

    # SUBCOMMAND
    parser_n = subparsers.add_parser('concat', help='concat ontology yamls')
    parser_n.add_argument('-i', '--include', help='yml file to include for header')
    parser_n.add_argument('-o', '--output', help='output yaml')
    parser_n.set_defaults(function=concat_ont_yaml)
    parser_n.add_argument('files', nargs='*')

    # SUBCOMMAND
    parser_n = subparsers.add_parser('concat-principles', help='concat principles yamls')
    parser_n.add_argument('-i', '--include', help='yml file to include for header')
    parser_n.add_argument('-o', '--output', help='output yaml')
    parser_n.set_defaults(function=concat_principles_yaml)
    parser_n.add_argument('files', nargs='*')

    args = parser.parse_args(argv)

    # Sub-commands are optional in Python 3, so 'function' is only set when one
    # was actually selected; report that as a usage error instead of crashing
    # with an AttributeError.
    func = getattr(args, 'function', None)
    if func is None:
        parser.error('a sub-command is required: validate, concat or concat-principles')
    func(args)
def validate_markdown(args):
    """
    Ensure the yaml embedded inside each markdown file is syntactically valid.

    First strip the yaml from the .md file, then use the standard python yaml parser
    to parse the embedded yaml. If it can't be parsed then an error will be thrown and a
    stack trace shown (in future we may try and catch this error and provide a
    user-friendly report).

    Basic structural checks are then applied via validate_structure. In future we may
    perform additional structural validation on the yaml - check certain fields
    are present etc. This could be done in various ways, e.g. jsonschema, programmatic
    checks. We should also check translation -> jsonld -> rdf works as expected.

    Arguments:
      args -- parsed arguments; args.files lists the markdown files to validate

    Prints progress for every file. If any structural error was collected, prints the
    errors and terminates the process with exit status 1.
    """
    errs = []
    for fn in args.files:
        print("VALIDATING:"+fn)
        # a yaml syntax error propagates out of load_md as an exception
        (obj, md) = load_md(fn)
        print("OK:"+fn)
        errs.extend(validate_structure(obj, md))
    if errs:
        print("FAILURES:")
        for e in errs:
            print("ERROR:"+e)
        # raise SystemExit directly: the exit() builtin is installed by the site
        # module and is not guaranteed to be available
        raise SystemExit(1)
def validate_structure(obj, md):
    """
    Check that a parsed metadata object carries the expected top-level fields.

    Arguments:
      obj -- the object parsed from the yaml block of a markdown file
      md  -- the markdown text of the same file (not inspected at present)

    Returns a list of error message strings; the list is empty when no problem
    was found. All problems are reported in one pass, including when the id
    itself is missing.
    """
    if not isinstance(obj, dict):
        # e.g. a file without any yaml block: there is nothing to look fields up in
        return ["No yaml metadata found"]

    errs = []
    if 'id' not in obj:
        errs.append("No id: ")
    # Fall back to a placeholder so the remaining checks still run (the original
    # raised KeyError here right after recording the missing id); str() guards
    # against ids that yaml parsed as a non-string scalar.
    ident = str(obj.get('id', '<missing id>'))
    if 'title' not in obj:
        errs.append("No title: "+ident)
    # NOTE: a check for a missing 'description' existed but is currently disabled
    if 'layout' not in obj:
        errs.append("No layout tag: "+ident+" -- this is required for proper rendering")
    return errs
def concat_ont_yaml(args):
    """
    Given arguments with files and output,
    read YAML files into an array, decorate the objects, and write an output YAML file.

    Output will be Foundry ontologies first, Library ontologies second, and obsolete last.
    Assumes that args.files is already sorted alphabetically.

    Arguments:
      args -- parsed arguments providing 'include' (optional yml file used as the
              header of the output), 'files' (markdown files to read) and
              'output' (path of the yaml file to write)

    Returns the dict that was written; the ontology objects are stored under
    the key 'ontologies'.
    """
    cfg = {}
    if args.include:
        with open(args.include, 'r') as f:
            # safe_load instead of yaml.load: recent PyYAML releases reject
            # yaml.load() without an explicit Loader, and plain metadata does
            # not need arbitrary python object construction.
            # An empty include file parses to None, so fall back to an empty header.
            cfg = yaml.safe_load(f) or {}

    foundry = []
    library = []
    obsolete = []
    for fn in args.files:
        (obj, md) = load_md(fn)
        # only an explicit true value moves an ontology to the obsolete group
        if 'is_obsolete' in obj and obj['is_obsolete'] == True:
            obsolete.append(obj)
        elif 'in_foundry_order' in obj:
            foundry.append(obj)
        else:
            library.append(obj)

    objs = foundry + library + obsolete
    cfg['ontologies'] = objs
    # decoration mutates the objects in place, so cfg sees the decorated versions
    decorate_metadata(objs)

    with open(args.output, 'w') as out:
        out.write(yaml.dump(cfg))
    return cfg
def decorate_metadata(objs):
    """
    Decorate ontology metadata objects, in place, with derived fields.

    For each object:
      - when it has a 'license' whose 'url' contains a known Creative Commons
        license path, the matching button image URL is stored as license['logo']
      - when it has at least one product, decorate_entry is applied to the object
        itself (with the suffix ".owl") and to each of its products

    See:
    https://github.com/OBOFoundry/OBOFoundry.github.io/issues/82

    Arguments:
      objs -- iterable of metadata dicts; they are modified in place
    """
    # https://creativecommons.org/about/downloads
    # TODO: decide on canonical URI to use for CC licenses;
    # ultimately this should all be specified in RDF/JSON-LD.
    # e.g. <http://creativecommons.org/licenses/by/3.0> foaf:depictedBy < ... > .
    # e.g. <http://creativecommons.org/licenses/by/3.0> owl:sameAs <https://creativecommons.org/licenses/by/3.0> .
    # (url fragment, logo) pairs, tried in order; the first match wins
    cc_logos = (
        ('creativecommons.org/licenses/by-sa',
         'https://mirrors.creativecommons.org/presskit/buttons/80x15/png/by-sa.png'),
        ('creativecommons.org/licenses/by/',
         'http://mirrors.creativecommons.org/presskit/buttons/80x15/png/by.png'),
        ('creativecommons.org/publicdomain/zero/',
         'http://mirrors.creativecommons.org/presskit/buttons/80x15/png/cc-zero.png'),
    )

    for obj in objs:
        lic = obj.get('license')
        if isinstance(lic, dict):
            # a license without a url simply gets no logo instead of raising KeyError
            lurl = str(lic.get('url') or '')
            for fragment, logo in cc_logos:
                # substring test: find(...) > 0 missed a url starting with the fragment
                if fragment in lurl:
                    lic['logo'] = logo
                    break

        # decorate top-level ontology; but only if it has at least one product
        # (an empty or null 'products' entry is skipped)
        products = obj.get('products')
        if products:
            decorate_entry(obj, ".owl")
            for product in products:
                decorate_entry(product)
def decorate_entry(obj, suffix=""):
    """
    Add an 'ontology_purl' to EITHER an ontology metadata object OR a product object.

    The 'id' of the object identifies either the ontology as a whole project
    (e.g. 'go') or one specific product (e.g. 'go.obo' or 'go.owl'). The purl is
    the OBO prefix followed by that id and the given suffix.

    Nothing is added when the object has an 'is_obsolete' key, or when it declares
    an alternate (non-OBO) prefix.
    """
    ident = obj['id']
    if 'is_obsolete' in obj or not has_obo_prefix(obj):
        return
    obj['ontology_purl'] = "http://purl.obolibrary.org/obo/" + ident + suffix
def has_obo_prefix(obj):
    """
    Return True when obj uses the OBO purl prefix.

    That is the case when obj has no 'uri_prefix' key at all, or when its
    'uri_prefix' equals the OBO prefix.
    """
    if 'uri_prefix' in obj:
        return obj['uri_prefix'] == 'http://purl.obolibrary.org/obo/'
    return True
def has_a_product(obj):
    """
    Return True when obj has a 'products' entry holding at least one product.
    """
    if 'products' not in obj:
        return False
    return len(obj['products']) > 0
def concat_principles_yaml(args):
    """
    Read the yaml block of each principles markdown file and write them all to one YAML file.

    Arguments:
      args -- parsed arguments providing 'include' (optional yml file used as the
              header of the output), 'files' (markdown files to read, kept in the
              given order) and 'output' (path of the yaml file to write)

    Returns the dict that was written; the parsed objects are stored under
    the key 'principles'.
    """
    cfg = {}
    if args.include:
        with open(args.include, 'r') as f:
            # safe_load instead of yaml.load: recent PyYAML releases reject
            # yaml.load() without an explicit Loader, and plain metadata does
            # not need arbitrary python object construction.
            # An empty include file parses to None, so fall back to an empty header.
            cfg = yaml.safe_load(f) or {}

    objs = []
    for fn in args.files:
        (obj, md) = load_md(fn)
        objs.append(obj)
    cfg['principles'] = objs

    with open(args.output, 'w') as out:
        out.write(yaml.dump(cfg))
    return cfg
def load_md(fn):
    """
    Load a yaml text blob from a markdown file and parse the blob.

    Arguments:
      fn -- path of the markdown file to read

    Returns a tuple (yaml_obj, markdown_text)
    """
    # the context manager closes the file even when reading fails
    with open(fn, 'r') as f:
        text = f.read()
    return extract(text)
def extract(mdtext):
    """
    Extract a yaml text blob from markdown text and parse the blob.

    The yaml blob is the text between the first and the second line consisting
    of '---'. All other lines are treated as markdown; the two delimiter lines
    themselves belong to neither part.

    Arguments:
      mdtext -- full text of a markdown file

    Returns a tuple (yaml_obj, markdown_text)
    """
    n_delims = 0
    ylines = []
    mlines = []
    for line in mdtext.split("\n"):
        # rstrip so that CRLF line endings or trailing blanks do not hide a delimiter.
        # Only the first two '---' lines delimit the yaml; a later '---' is markdown
        # (e.g. a horizontal rule) and is kept rather than dropped.
        if n_delims < 2 and line.rstrip() == "---":
            n_delims += 1
        elif n_delims == 1:
            ylines.append(line)
        else:
            mlines.append(line)
    yamltext = "\n".join(ylines)
    # safe_load instead of yaml.load: recent PyYAML releases reject yaml.load()
    # without an explicit Loader, and plain metadata does not need arbitrary
    # python object construction
    obj = yaml.safe_load(yamltext)
    return (obj, "\n".join(mlines))
def write_legacy_metadata_objects(onts, stream):
    """
    Write every ontology object in onts to stream in the old ontologies.txt format.

    Each object is handed, in order, to write_legacy_metadata_object.
    """
    for entry in onts:
        write_legacy_metadata_object(entry, stream)
def write_legacy_metadata_object(ont, stream):
    """
    write to the old ontologies.txt format (single object) - TODO

    Writes the 'id', 'title', 'namespace' (the upper-cased id) and 'foundry'
    properties of ont to stream, one write_pv call per property.

    Arguments:
      ont    -- ontology metadata dict; must contain 'id', 'title' and 'is_foundry'
      stream -- writable text stream
    """
    write_pv('id', ont['id'], stream)
    write_pv('title', ont['title'], stream)
    # upper() must be called: without the parentheses the bound method object
    # itself was passed as the value
    write_pv('namespace', ont['id'].upper(), stream)
    # NOTE(review): no other code in this file sets 'is_foundry' (the concat step
    # looks at 'in_foundry_order') -- confirm this is the intended key
    write_pv('foundry', ont['is_foundry'], stream)
def write_pv(k, v, s):
    """
    Write one property-value pair to a stream as a tab-separated line.

    Arguments:
      k -- property name
      v -- property value; converted with str() so that non-string values
           (e.g. booleans) can be written
      s -- writable text stream

    The line is newline-terminated so that consecutive pairs do not run together.
    """
    # the original referenced an undefined name 'p' here instead of the key 'k'
    s.write(k + "\t" + str(v) + "\n")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()