USAP adapter update #26

Open · wants to merge 3 commits into base: master
6 changes: 0 additions & 6 deletions ieda/config.py
@@ -22,12 +22,6 @@
         'cert_pem_path': 'client_cert.pem',
         'cert_key_path': 'client_key_nopassword.pem',
     },
-    {
-        'sitemap': 'usap_sitemap.xml',
-        'gmn_path': 'usap',
-        'cert_pem_path': 'client_cert.pem',
-        'cert_key_path': 'client_key_nopassword.pem',
-    },
 ]

 # The following set the rightsholder, submitter and authoritative member node in the
253 changes: 253 additions & 0 deletions ieda/usap_adapter.py
@@ -0,0 +1,253 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This work was created by participants in the DataONE project, and is
# jointly copyrighted by participating institutions in DataONE. For
# more information on DataONE, see our web site at http://dataone.org.
#
# Copyright 2009-2017 DataONE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""USAP Slender Node Adapter

Sitemap: https://www.usap-dc.org/view/dataset/sitemap.xml

The links in the sitemap point to dataset landing pages that embed schema.org
(SDO) JSON-LD scripts in the HTML header.

Note that this approach cannot detect archived records (unless one tracks when
a record identifier that used to appear in the sitemap no longer does).

USAP is no longer served by IEDA, so the USAP handling has been split out of
the existing IEDA adapter into this separate adapter.
"""
import datetime
import io
import logging
import os
import pprint
import sys
import xml.etree.ElementTree as ET

import requests

import d1_client.mnclient_2_0
import d1_common.checksum
import d1_common.const
import d1_common.date_time
import d1_common.system_metadata
import d1_common.types.dataoneTypes_v2_0 as v2
import d1_common.types.exceptions
import d1_common.url
import d1_common.util
import d1_common.wrap.access_policy
import d1_common.xml

import schema_org

from usap_config import *

SCIMETA_FORMAT_ID = 'http://www.isotc211.org/2005/gmd'

NS_DICT = {
    'gmd': 'http://www.isotc211.org/2005/gmd',
    'gco': 'http://www.isotc211.org/2005/gco',
}


def main():
    d1_common.util.log_setup()

    for sitemap_idx, site_dict in enumerate(SITEMAP_LIST):
        try:
            proc_sitemap(sitemap_idx, len(SITEMAP_LIST), site_dict)
        except Exception:
            logging.exception(
                'Unable to process sitemap. site_dict={}'.format(site_dict))


def proc_sitemap(sitemap_idx, sitemap_count, site_dict):
    logging.info('#' * 100)
    logging.info('Processing site. idx={}/{}. site_dict={}'.format(
        sitemap_idx + 1, sitemap_count, pprint.pformat(site_dict))
    )

    sitemap_url = d1_common.url.joinPathElements(SITEMAP_ROOT, site_dict['sitemap'])
    gmn_base_url = d1_common.url.joinPathElements(GMN_BASE_URL, site_dict['gmn_path'])
    cert_pem_path = os.path.join(CERT_PATH_ROOT, site_dict['cert_pem_path'])
    cert_key_path = os.path.join(CERT_PATH_ROOT, site_dict['cert_key_path'])

    logging.info('sitemap_url="{}"'.format(sitemap_url))
    logging.info('gmn_base_url="{}"'.format(gmn_base_url))
    logging.info('cert_pem_path="{}"'.format(cert_pem_path))
    logging.info('cert_key_path="{}"'.format(cert_key_path))

    resource_list = schema_org.load_resources_from_sitemap(sitemap_url)

    logging.info('Found resources: {}'.format(len(resource_list)))

    gmn_client = create_gmn_client(gmn_base_url, cert_pem_path, cert_key_path)

    for resource_idx, resource_dict in enumerate(resource_list):
        try:
            proc_resource(resource_idx, len(resource_list), resource_dict, gmn_client)
        except Exception:
            logging.exception(
                'Unable to process resource. resource_idx={}'.format(resource_idx))


def proc_resource(resource_idx, resource_count, resource_dict, gmn_client):
    logging.info('-' * 100)
    logging.info('Processing resource. idx={}/{}. resource_dict={}'.format(
        resource_idx + 1, resource_count, pprint.pformat(resource_dict))
    )

    entry_dict = schema_org.load_schema_org(resource_dict)

    result_dict = {
        **resource_dict,
        **entry_dict,
    }

    # Example of a merged result_dict:
    # {
    #     'date_modified': '2018-01-25T15:55:08-05:00',
    #     'id': 'doi:10.7265/N5F47M23',
    #     'metadata_format': None,
    #     'metadata_url': 'http://get.iedadata.org/metadata/iso/usap/609539iso.xml',
    #     'url': 'http://get.iedadata.org/metadata/iso/609539'
    # }

    logging.info('result_dict={}'.format(pprint.pformat(result_dict)))

    if 'error' in result_dict:
        logging.error(
            'error="{}" url="{}"'.format(result_dict['error'], result_dict['url'])
        )
        return

    sid = result_dict['id']
    pid = result_dict['url']
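
    # Note: the schema.org 'id' (typically a DOI, as in the example above)
    # serves as the DataONE series ID (SID), and the resource URL serves as
    # the object identifier (PID); objects sharing a SID form an obsolescence
    # chain on the Member Node.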

    logging.info('schema.org. sid="{}" pid="{}"'.format(sid, pid))

    if is_in_gmn(gmn_client, pid):
        logging.info('Skipped. Already in GMN. sid="{}" pid="{}"'.format(sid, pid))
        return

    scimeta_xml_bytes = download_scimeta_xml(result_dict['metadata_url'])

    pid_sysmeta_pyxb = generate_system_metadata(scimeta_xml_bytes, pid, sid)

    logging.info('pid_sysmeta_pyxb:')
    logging.info(d1_common.xml.serialize_to_xml_str(pid_sysmeta_pyxb))

    head_sysmeta_pyxb = get_sysmeta(gmn_client, sid)

    if head_sysmeta_pyxb:
        logging.info('head_sysmeta_pyxb:')
        logging.info(d1_common.xml.serialize_to_xml_str(head_sysmeta_pyxb))

        head_pid = head_sysmeta_pyxb.identifier.value()
        logging.info(
            'SID already exists on GMN. Adding to chain. head_pid="{}"'
            .format(head_pid)
        )
        gmn_client.update(
            head_pid, io.BytesIO(scimeta_xml_bytes), pid, pid_sysmeta_pyxb
        )
    else:
        logging.info(
            'SID does not exist on GMN. Starting new chain. pid="{}"'.format(pid)
        )
        gmn_client.create(pid, io.BytesIO(scimeta_xml_bytes), pid_sysmeta_pyxb)


def download_scimeta_xml(scimeta_url):
    try:
        response = requests.get(scimeta_url)
        # requests does not raise on HTTP error status codes by itself, so
        # check explicitly before returning the body.
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        raise AdapterException(
            'Unable to download SciMeta. error="{}"'.format(str(e))
        )


def parse_doi(iso_xml):
    """Get the DOI from an ISO XML doc"""
    tree = ET.parse(iso_xml)
    root = tree.getroot()
    doi_el = root.findall(
        './gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/'
        'gmd:CI_Citation/gmd:identifier/gmd:MD_Identifier/gmd:code/'
        'gco:CharacterString', NS_DICT
    )[0]
    return doi_el.text.strip()


def create_gmn_client(gmn_base_url, cert_pem_path, cert_key_path):
    return d1_client.mnclient_2_0.MemberNodeClient_2_0(
        base_url=gmn_base_url,
        cert_pem_path=cert_pem_path,
        cert_key_path=cert_key_path,
    )


def is_in_gmn(gmn_client, did):
    try:
        gmn_client.getSystemMetadata(did)
    except d1_common.types.exceptions.NotFound:
        return False
    return True


def get_sysmeta(gmn_client, sid):
    try:
        return gmn_client.getSystemMetadata(sid)
    except d1_common.types.exceptions.NotFound:
        return None


def generate_system_metadata(scimeta_bytes, pid, sid):
    sysmeta_pyxb = v2.systemMetadata()
    sysmeta_pyxb.seriesId = sid
    sysmeta_pyxb.formatId = SCIMETA_FORMAT_ID
    sysmeta_pyxb.size = len(scimeta_bytes)
    sysmeta_pyxb.checksum = d1_common.checksum.create_checksum_object_from_stream(
        io.BytesIO(scimeta_bytes))
    sysmeta_pyxb.identifier = pid
    sysmeta_pyxb.dateUploaded = d1_common.date_time.utc_now()
    # Use UTC here as well, for consistency with dateUploaded.
    sysmeta_pyxb.dateSysMetadataModified = d1_common.date_time.utc_now()
    sysmeta_pyxb.rightsHolder = SCIMETA_RIGHTS_HOLDER
    sysmeta_pyxb.submitter = SCIMETA_SUBMITTER
    sysmeta_pyxb.authoritativeMemberNode = SCIMETA_AUTHORITATIVE_MEMBER_NODE
    sysmeta_pyxb.originMemberNode = SCIMETA_AUTHORITATIVE_MEMBER_NODE
    sysmeta_pyxb.accessPolicy = v2.AccessPolicy()

    with d1_common.wrap.access_policy.wrap_sysmeta_pyxb(sysmeta_pyxb) as ap:
        ap.clear()
        ap.add_public_read()

    return sysmeta_pyxb


class AdapterException(Exception):
    pass


if __name__ == '__main__':
    sys.exit(main())
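
The adapter delegates sitemap parsing to schema_org.load_resources_from_sitemap().
As a rough sketch of what that step likely involves (an assumption; the actual
schema_org module in this repository may differ), it comes down to reading the
<loc> and <lastmod> elements from the sitemap XML:

```python
import xml.etree.ElementTree as ET

import requests

SITEMAP_NS = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}


def load_resources_from_sitemap_sketch(sitemap_url):
    """Hypothetical stand-in for schema_org.load_resources_from_sitemap()."""
    root = ET.fromstring(requests.get(sitemap_url).content)
    resource_list = []
    for url_el in root.findall('sm:url', SITEMAP_NS):
        loc_el = url_el.find('sm:loc', SITEMAP_NS)
        lastmod_el = url_el.find('sm:lastmod', SITEMAP_NS)
        resource_list.append({
            'url': loc_el.text.strip(),
            'date_modified': (
                lastmod_el.text.strip() if lastmod_el is not None else None),
        })
    return resource_list
```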
25 changes: 25 additions & 0 deletions ieda/usap_config.py
@@ -0,0 +1,25 @@
# USAP adapter production config

# USAP sitemap root URL. The 'sitemap' paths below are appended to this URL.
SITEMAP_ROOT = 'https://www.usap-dc.org/view/dataset/'
GMN_BASE_URL = 'https://gmn.dataone.org/ieda'

# Absolute path to the root of the certificate storage. The certificate paths below are
# relative to this directory.
CERT_PATH_ROOT = ''

# List of sites from which metadata is extracted by the adapter.
SITEMAP_LIST = [
    {
        'sitemap': 'sitemap.xml',
        'gmn_path': 'usap',
        'cert_pem_path': 'client_cert.pem',
        'cert_key_path': 'client_key_nopassword.pem',
    },
]

# The following set the rightsholder, submitter and authoritative member node in the
# DataONE System Metadata for the generated metadata objects.
SCIMETA_RIGHTS_HOLDER = 'CN=urn:node:IEDA,DC=dataone,DC=org'
SCIMETA_SUBMITTER = 'CN=urn:node:IEDA,DC=dataone,DC=org'
SCIMETA_AUTHORITATIVE_MEMBER_NODE = 'urn:node:IEDA'
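
For reference, a minimal sketch of how these settings compose into the URLs the
adapter logs (assuming joinPathElements joins its arguments with single
slashes, which is how the adapter uses it):

```python
import d1_common.url

sitemap_url = d1_common.url.joinPathElements(
    'https://www.usap-dc.org/view/dataset/', 'sitemap.xml')
# Expected: 'https://www.usap-dc.org/view/dataset/sitemap.xml'
# (the sitemap URL named in the usap_adapter.py docstring)

gmn_base_url = d1_common.url.joinPathElements(
    'https://gmn.dataone.org/ieda', 'usap')
# Expected: 'https://gmn.dataone.org/ieda/usap'
```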