-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
166 lines (145 loc) · 6.25 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# parse imported html from esdac
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import DC, DCAT, DCTERMS, SKOS, SDO, FOAF
import sys,time,hashlib,os
sys.path.append('utils')
from database import insertRecord, dbQuery
from keyword_matching import matchCountryUri
# Load environment variables from .env file
load_dotenv()
recsPerPage = 10000
def fullurl(u):
    """Resolve a possibly-relative ESDAC link to an absolute URL."""
    base = 'https://esdac.jrc.ec.europa.eu'
    if u.startswith('http'):
        return u
    # site-absolute paths join directly; bare paths need a separator
    return base + u if u.startswith('/') else base + '/' + u
def addNS(e):
    """Map a bare property name to its namespaced RDF term.

    'thumbnailUrl' lives in schema.org; every other property is
    treated as a Dublin Core term.
    """
    return SDO[e] if e == 'thumbnailUrl' else DCTERMS[e]
def urlasid(uri, ds):
    """Derive ds['identifier'] from a URL, preferring DOIs; mutates ds in place.

    An existing identifier is kept unless the new URL is a DOI, which
    always wins.
    """
    if 'identifier' in ds and 'doi.org' not in uri:
        return  # keep what we already have
    if uri.startswith('http') and 'doi.org' in uri:
        # keep only the DOI itself, dropping the resolver prefix
        uri = uri.split('doi.org/')[-1]
    # sanitise URL metacharacters so the id is safe as a key
    for ch in '?#&=':
        uri = uri.replace(ch, '-')
    ds['identifier'] = uri
def dict2graph(d):
    """Build an rdflib Graph from a flat metadata dict.

    The subject of every triple is d['identifier']; each key becomes a
    namespaced predicate (via addNS) and list values yield one triple
    per element, all as plain Literals.
    """
    g = Graph()
    subject = URIRef(d['identifier'])
    for key, value in d.items():
        values = value if isinstance(value, list) else [value]
        for item in values:
            g.add((subject, addNS(key), Literal(item)))
    return g
def parseEUDASM(s2):
    """Extract a metadata dict from a EUDASM map page.

    s2: BeautifulSoup tree of the (partial) page HTML.
    Returns a dict of Dublin-Core-style fields; 'relation' and 'subject'
    are list-valued. 'identifier' is set to the (last) File link URL and
    may be absent if the page has no File links.
    """
    ds = {'relation':[],'subject':[],'type':'dataset','isReferencedBy':'EUDASM'}
    # the first <img> carries the map title and thumbnail
    for i in s2.find_all("img"):
        ds['title'] = i.get('title')
        ds['thumbnailUrl'] = fullurl(i.get('src'))
        break
    for l in s2.find_all('a',{'typeof':"skos:Concept"}):
        for kw in l.text.split('; '):
            ds['subject'].append(kw)
    for d in s2.find_all("span",{"property":"dc:date"}):
        ds['date'] = d.text
    # NOTE: removed a dead loop that assigned an unused local
    # (`section = s.text`) — it had no effect.
    # country names double as subjects, plus a matched vocabulary URI
    for l in s2.find_all("span",{"class":"country"}):
        ds['subject'].append(l.text)
        country_uri = matchCountryUri(l.text)
        if country_uri:
            ds['subject'].append(country_uri)
    for f in s2.find_all("a",{"title":"File"}):
        ds['relation'].append(fullurl(f.get('href')))
        ds['identifier'] = fullurl(f.get('href')) # id = dataset file url
    for p in s2.find_all("a",{"title":"PDF"}):
        ds['relation'].append(fullurl(p.get('href')))
    # publisher/author are embedded as "<b>Label:</b> value<br/>" in cells
    for t in s2.find_all("td",{"valign":"top"}):
        for b in str(t).split('<b>'):
            if b.split('</b>')[0].strip() == 'Publisher:':
                ds['publisher'] = b.split('</b>')[1].split('<br/>')[0]
            elif b.split('</b>')[0].strip() == 'Author:':
                ds['creator'] = b.split('</b>')[1].split('<br/>')[0]
    return ds
def parseDOC(s2):
    """Extract a metadata dict from an ESDAC document page.

    s2: BeautifulSoup tree of the (partial) page HTML.
    'identifier' is derived from download / linked URLs via urlasid
    (DOIs preferred); 'relation' and 'subject' are list-valued.
    """
    ds = {'relation':[],'subject':[],'type':'document','isReferencedBy':'ESDAC'}
    # the first <img> carries the document title and thumbnail
    for i in s2.find_all("img"):
        ds['title'] = i.get('title')
        ds['thumbnailUrl'] = fullurl(i.get('src'))
        break
    for l in s2.find_all('a',{'typeof':"skos:Concept"}):
        for kw in l.text.split('; '):
            ds['subject'].append(kw)
    for d in s2.find_all("span",{"property":"dc:date"}):
        ds['date'] = d.text
    for f in s2.find_all("a",{"aria-label":"Download"}):
        if f.get('href'):
            ds['relation'].append(fullurl(f.get('href')))
            urlasid(f.get('href'),ds)
    for f in s2.find_all("span",{"class":"file"}):
        for fl in f.find_all("a"):
            if fl.get('href'):
                ds['relation'].append(fullurl(fl.get('href')))
                urlasid(fl.get('href'),ds)
    for desc in s2.find_all("div",{"class":"details"}):
        for desc2 in desc.find_all("p"):
            # fix: use the first paragraph's text (desc2), not the whole
            # details div — desc2 was previously unused, contradicting
            # the "only first" intent below
            ds['description'] = desc2.text
            break # only first paragraph
        for a in desc.find_all("a"):
            if a.get('href'):
                urlasid(a.get('href'),ds)
                ds['relation'].append(fullurl(a.get('href')))
    # fallback description when the details div yielded nothing
    if 'description' not in ds or ds['description'] in [None,'']:
        for desc in s2.find_all("div",{"class":"field-content"}):
            ds['description'] = desc.text
            break
    return ds
def parseESDAC(s2):
    """Extract a metadata dict from a full ESDAC dataset page.

    s2: BeautifulSoup tree of the page HTML.
    NOTE(review): reads the module-level global `ttl` (the record title
    bound by the driver loop below) for the 'title' field — confirm this
    coupling is intended before refactoring.
    """
    ds = {'relation':[],'subject':[],'source':[],'type':'dataset','title':ttl,'isReferencedBy':'ESDAC'}
    for desc in s2.find_all('div',{'property':"dct:description"}):
        ds['description'] = desc.text
    for img in s2.find_all('img',{'typeof':"foaf:Image"}):
        src = img.get('src')
        # fix: get('src') may return None, which made the 'in' test raise
        # TypeError; also skip the captcha image
        if src and 'image_captcha' not in src:
            ds['thumbnailUrl'] = fullurl(src)
    for uc in s2.find_all("div",{"class":"field-name-field-data-dataset-notification"}):
        ds['accessRights'] = uc.text
    for ref in s2.find_all("div",{"class":"field-name-field-data-dataset-references"}):
        for ref2 in ref.find_all("li"):
            ds['source'].append(ref2.text)
    for l in s2.find_all('a',{'typeof':"skos:Concept"}):
        for kw in l.text.split('; '):
            ds['subject'].append(kw)
    for dt in s2.find_all("div",{"class":"field-name-field-data-publication-year"}):
        for d in dt.find_all("div",{"class":"field-item"}):
            ds['date'] = d.text
    for ct in s2.find_all("div",{"class":"field-name-field-data-publisher"}):
        for c in ct.find_all("div",{"class":"field-item"}):
            ds['publisher'] = c.text
    return ds
# for dev only
# dbQuery(f"update harvest.items set turtle=Null where identifier='https://esdac.jrc.ec.europa.eu/resource-type/national-soil-maps-eudasm?page=122/6'",(),False)
# retrieve records not yet converted to RDF (turtle column empty)
unparsed = dbQuery(f"select identifier,resultobject,resulttype,title,itemtype from harvest.items where source = 'ESDAC' and (turtle is Null or turtle = '') limit {recsPerPage}")
for rec in sorted(unparsed):
    rid, res, restype, ttl, itemtype = rec
    print(f'Parse {rid}')
    s2 = BeautifulSoup(res, 'html.parser')
    if res.startswith('<div'):  # partial page: EUDASM map or document
        if itemtype == 'dataset':  # EUDASM
            ds = parseEUDASM(s2)
        else:  # document (removed a stray dead `None` statement here)
            ds = parseDOC(s2)
    else:  # full dataset page; parseESDAC reads the global ttl bound above
        ds = parseESDAC(s2)
    # fall back to the harvest identifier when parsing found none
    if 'identifier' not in ds or ds['identifier'] in [None, ""]:
        ds['identifier'] = rid
    dsRDF = dict2graph(ds)
    triples = dsRDF.serialize(format='turtle')  # todo: strip the namespaces?
    # values go through %s placeholders, so no f-string is needed here
    dbQuery("update harvest.items set uri=%s, turtle=%s where identifier=%s", (ds['identifier'], triples, rid), False)