Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Library Usage #118

Merged
merged 21 commits into from
Aug 24, 2023
Merged
Changes from 11 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2df147a
Add validate() function for programmatic access
kchason Jul 25, 2023
f485161
Fix pre-commit formatting
kchason Jul 25, 2023
b509dc5
Fix property reference
kchason Jul 25, 2023
8c40df1
Make type generic to account for multiple return types
kchason Jul 25, 2023
f6d48e2
Fix List vs list for casting
kchason Jul 25, 2023
2d3a65b
Feedback from PR
kchason Jul 26, 2023
bfe9992
Merge branch 'develop' into library-usage
kchason Jul 26, 2023
86ed417
Instantiate properties as instance variables instead of class
kchason Jul 26, 2023
40be311
Fix None vs "none" ontology version specification
ajnelson-nist Aug 15, 2023
ae5f077
Add explicit `-> None` on `__init__`
ajnelson-nist Aug 21, 2023
97d7fbb
Constrain `ValidationResult.graph` type to `pyshacl.validate(...)[1]`…
ajnelson-nist Aug 21, 2023
64bd95c
Merge branch 'develop' into library-usage
kchason Aug 22, 2023
4208a99
Wrap errors and positional arg signature support
kchason Aug 22, 2023
73e4683
Separate types and utils into discrete files
kchason Aug 22, 2023
d41a418
Fix import reference
kchason Aug 22, 2023
8f957dc
Forward arguments with unpacking syntax
ajnelson-nist Aug 23, 2023
6f9b6c9
Merge branch 'develop' into library-usage
ajnelson-nist Aug 23, 2023
00f1360
Default case_validate.validate inference parameter to None rather tha…
ajnelson-nist Aug 23, 2023
15f00c9
Consolidate case_validate CLI validation logic into case_validate.val…
ajnelson-nist Aug 23, 2023
eccaad4
Add new case_validate source files to Make dependencies
ajnelson-nist Aug 23, 2023
90f5c8c
case_validate: Update NIST inlined license text
ajnelson-nist Aug 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 186 additions & 73 deletions case_utils/case_validate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,11 @@
import os
import sys
import warnings
from typing import Dict, Set, Tuple, Union
from typing import Dict, List, Optional, Set, Tuple, Union

import pyshacl # type: ignore
import rdflib
from rdflib import Graph

import case_utils.ontology
from case_utils.ontology.version_info import (
Expand All @@ -64,13 +65,189 @@ class NonExistentCDOConceptWarning(UserWarning):
pass


class ValidationResult:
    """
    Plain holder for the outcome of one validation run.

    The constructor only stores its arguments; no copying or checking is done.
    In this module, validate() fills the first three fields from the three
    members of the tuple returned by pyshacl.validate, in order, and the
    fourth from get_invalid_cdo_concepts().

    :param conforms: First member of the pyshacl.validate result.
    :param graph: Second member of the pyshacl.validate result.
    :param text: Third member of the pyshacl.validate result.
    :param undefined_concepts: IRIs reported by get_invalid_cdo_concepts().
    """

    def __init__(
        self,
        conforms: bool,
        graph: Union[Exception, bytes, str, rdflib.Graph],
        text: str,
        undefined_concepts: Set[rdflib.URIRef],
    ) -> None:
        # Assigned on the instance (not the class) so results never share state.
        self.conforms: bool = conforms
        self.graph: Union[Exception, bytes, str, rdflib.Graph] = graph
        self.text: str = text
        self.undefined_concepts: Set[rdflib.URIRef] = undefined_concepts


def concept_is_cdo_concept(n_concept: rdflib.URIRef) -> bool:
    """
    Determine if a concept is part of the CDO ontologies.

    The test is purely lexical: the concept's IRI, as a string, must begin
    with the UCO or the CASE ontology IRI prefix.  No graph lookup is done.

    :param n_concept: The concept to check.
    :return: Whether the concept's IRI starts with one of the two CDO prefixes.
    """
    # str.startswith accepts a tuple of prefixes, so both namespaces are
    # checked in a single call.
    return str(n_concept).startswith(
        (
            "https://ontology.unifiedcyberontology.org/",
            "https://ontology.caseontology.org/",
        )
    )


def get_ontology_graph(
    case_version: Optional[str] = None, supplemental_graphs: Optional[List[str]] = None
) -> rdflib.Graph:
    """
    Build the ontology graph used as the validation target.

    :param case_version: Which bundled CASE ontology build to load. None selects CURRENT_CASE_VERSION. The string "none" skips loading any bundled build.
    :param supplemental_graphs: Sources of additional ontology graphs, each handed to rdflib.Graph.parse in order. None or an empty list adds nothing.
    :return: One graph holding the bundled ontology (unless skipped) plus every supplemental graph.
    """
    combined_graph = rdflib.Graph()

    if case_version != "none":
        # Bundled builds are package resources named "<version>.ttl".
        bundled_version = CURRENT_CASE_VERSION if case_version is None else case_version
        ttl_filename = bundled_version + ".ttl"
        _logger.debug("ttl_filename = %r.", ttl_filename)
        combined_graph.parse(
            data=importlib.resources.read_text(case_utils.ontology, ttl_filename),
            format="turtle",
        )

    # A None or empty list simply yields no iterations.
    for supplemental_source in supplemental_graphs or []:
        _logger.debug("arg_ontology_graph = %r.", supplemental_source)
        combined_graph.parse(supplemental_source)

    return combined_graph


def get_invalid_cdo_concepts(
    data_graph: Graph, ontology_graph: Graph
) -> Set[rdflib.URIRef]:
    """
    Find CDO-namespaced IRIs used in the data graph that the ontology does not define.

    An IRI counts as "known" when the ontology graph types it with one of the
    structural OWL/RDFS/SHACL classes listed below (and it is in a CDO
    namespace), when it is the subject or object of an ontology-level
    versioning/import predicate, when it is typed owl:Ontology, or when it is
    listed in the bundled "ontology_and_version_iris.txt" resource.

    :param data_graph: The data graph whose terms are reviewed.
    :param ontology_graph: The ontology graph supplying the known concepts.
    :return: CDO-namespaced IRIs (including literal datatypes) found in the data graph but absent from the known set.
    """
    known_concepts: Set[rdflib.URIRef] = set()

    # Structural declarations: only CDO-namespaced IRI subjects are recorded.
    structural_classes = (
        NS_OWL.Class,
        NS_OWL.AnnotationProperty,
        NS_OWL.DatatypeProperty,
        NS_OWL.ObjectProperty,
        NS_RDFS.Datatype,
        NS_SH.NodeShape,
        NS_SH.PropertyShape,
        NS_SH.Shape,
    )
    for n_structural_class in structural_classes:
        for n_subject, _, _ in ontology_graph.triples(
            (None, NS_RDF.type, n_structural_class)
        ):
            if isinstance(n_subject, rdflib.URIRef) and concept_is_cdo_concept(
                n_subject
            ):
                known_concepts.add(n_subject)

    # Ontology-level predicates: both ends are recorded, with no namespace
    # filter.  Both ends are asserted to be IRIs before either is recorded.
    ontology_predicates = (
        NS_OWL.backwardCompatibleWith,
        NS_OWL.imports,
        NS_OWL.incompatibleWith,
        NS_OWL.priorVersion,
        NS_OWL.versionIRI,
    )
    for n_ontology_predicate in ontology_predicates:
        for n_subject, _, n_object in ontology_graph.triples(
            (None, n_ontology_predicate, None)
        ):
            assert isinstance(n_subject, rdflib.URIRef)
            assert isinstance(n_object, rdflib.URIRef)
            known_concepts.update((n_subject, n_object))

    # Ontology IRIs themselves; non-IRI subjects are skipped.
    known_concepts.update(
        n_subject
        for n_subject, _, _ in ontology_graph.triples(
            (None, NS_RDF.type, NS_OWL.Ontology)
        )
        if isinstance(n_subject, rdflib.URIRef)
    )

    # Also load historical ontology and version IRIs, one per non-blank line.
    historical_iris_text = importlib.resources.read_text(
        case_utils.ontology, "ontology_and_version_iris.txt"
    )
    known_concepts.update(
        rdflib.URIRef(iri_text)
        for iri_text in map(str.strip, historical_iris_text.split("\n"))
        if iri_text != ""
    )

    # Collect every CDO-namespaced IRI the data graph mentions, in any triple
    # position, plus CDO-namespaced datatypes attached to literals.
    used_concepts: Set[rdflib.URIRef] = set()
    for data_triple in data_graph.triples((None, None, None)):
        for n_term in data_triple:
            if isinstance(n_term, rdflib.Literal):
                n_candidate = n_term.datatype
            else:
                n_candidate = n_term
            if isinstance(n_candidate, rdflib.URIRef) and concept_is_cdo_concept(
                n_candidate
            ):
                used_concepts.add(n_candidate)

    return used_concepts - known_concepts


def validate(
    input_file: Union[List[str], str],
    case_version: Optional[str] = None,
    supplemental_graphs: Optional[List[str]] = None,
    abort_on_first: bool = False,
    inference: Optional[str] = "none",
) -> ValidationResult:
    """
    Validate the given data graph against the given CASE ontology version and supplemental graphs.

    :param input_file: The source of the data graph to validate, or a list (or tuple) of sources whose contents are merged into a single data graph before validation. Each source is handed to rdflib.Graph.parse as-is, so a string is read as a location (e.g. a file path), not as inline graph text.
    :param case_version: The version of the CASE ontology to use. If None, the version named by CURRENT_CASE_VERSION is used. If "none" (the string), no bundled CASE ontology is loaded.
    :param supplemental_graphs: The supplemental graphs to use. If None, no supplemental graphs will be used.
    :param abort_on_first: Passed to pyshacl.validate; whether to abort on the first validation error.
    :param inference: Passed to pyshacl.validate; the type of inference to use. If "none", no inference will be used.
    :return: The validation result object holding the three members of pyshacl.validate's result plus the undefined CDO concepts found in the data graph.
    """
    # Normalize to a list so one or many sources share the same code path.
    # Anything that is not a list/tuple is treated as one source, which keeps
    # the previous single-argument behavior unchanged.
    if isinstance(input_file, (list, tuple)):
        data_graph_sources = list(input_file)
    else:
        data_graph_sources = [input_file]

    # Parse every source into one rdflib.Graph object.
    data_graph = rdflib.Graph()
    for data_graph_source in data_graph_sources:
        data_graph.parse(data_graph_source)

    # Get the ontology graph from the case_version and supplemental_graphs arguments
    ontology_graph: Graph = get_ontology_graph(case_version, supplemental_graphs)

    # Get the undefined CDO concepts
    undefined_cdo_concepts = get_invalid_cdo_concepts(data_graph, ontology_graph)

    # Validate data graph against ontology graph.  The same graph serves as
    # both the SHACL shapes graph and the ontology graph.
    validate_result: Tuple[
        bool, Union[Exception, bytes, str, rdflib.Graph], str
    ] = pyshacl.validate(
        data_graph,
        shacl_graph=ontology_graph,
        ont_graph=ontology_graph,
        inference=inference,
        meta_shacl=False,
        abort_on_first=abort_on_first,
        allow_infos=False,
        allow_warnings=False,
        debug=False,
        do_owl_imports=False,
    )

    # Relieve RAM of the data graph after validation has run.
    del data_graph

    return ValidationResult(
        validate_result[0],
        validate_result[1],
        validate_result[2],
        undefined_cdo_concepts,
    )


def main() -> None:
parser = argparse.ArgumentParser(
description="CASE wrapper to pySHACL command line tool."
Expand Down Expand Up @@ -170,78 +347,13 @@ def main() -> None:
_logger.debug("in_graph = %r.", in_graph)
data_graph.parse(in_graph)

ontology_graph = rdflib.Graph()
if args.built_version != "none":
ttl_filename = args.built_version + ".ttl"
_logger.debug("ttl_filename = %r.", ttl_filename)
ttl_data = importlib.resources.read_text(case_utils.ontology, ttl_filename)
ontology_graph.parse(data=ttl_data, format="turtle")
if args.ontology_graph:
for arg_ontology_graph in args.ontology_graph:
_logger.debug("arg_ontology_graph = %r.", arg_ontology_graph)
ontology_graph.parse(arg_ontology_graph)

# Construct set of CDO concepts for data graph concept-existence review.
cdo_concepts: Set[rdflib.URIRef] = set()

for n_structural_class in [
NS_OWL.Class,
NS_OWL.AnnotationProperty,
NS_OWL.DatatypeProperty,
NS_OWL.ObjectProperty,
NS_RDFS.Datatype,
NS_SH.NodeShape,
NS_SH.PropertyShape,
NS_SH.Shape,
]:
for ontology_triple in ontology_graph.triples(
(None, NS_RDF.type, n_structural_class)
):
if not isinstance(ontology_triple[0], rdflib.URIRef):
continue
if concept_is_cdo_concept(ontology_triple[0]):
cdo_concepts.add(ontology_triple[0])
for n_ontology_predicate in [
NS_OWL.backwardCompatibleWith,
NS_OWL.imports,
NS_OWL.incompatibleWith,
NS_OWL.priorVersion,
NS_OWL.versionIRI,
]:
for ontology_triple in ontology_graph.triples(
(None, n_ontology_predicate, None)
):
assert isinstance(ontology_triple[0], rdflib.URIRef)
assert isinstance(ontology_triple[2], rdflib.URIRef)
cdo_concepts.add(ontology_triple[0])
cdo_concepts.add(ontology_triple[2])
for ontology_triple in ontology_graph.triples((None, NS_RDF.type, NS_OWL.Ontology)):
if not isinstance(ontology_triple[0], rdflib.URIRef):
continue
cdo_concepts.add(ontology_triple[0])

# Also load historical ontology and version IRIs.
ontology_and_version_iris_data = importlib.resources.read_text(
case_utils.ontology, "ontology_and_version_iris.txt"
# Get the ontology graph based on the CASE version and supplemental graphs specified by the CLI
ontology_graph = get_ontology_graph(
case_version=args.built_version, supplemental_graphs=args.ontology_graph
)
for line in ontology_and_version_iris_data.split("\n"):
cleaned_line = line.strip()
if cleaned_line == "":
continue
cdo_concepts.add(rdflib.URIRef(cleaned_line))

data_cdo_concepts: Set[rdflib.URIRef] = set()
for data_triple in data_graph.triples((None, None, None)):
for data_triple_member in data_triple:
if isinstance(data_triple_member, rdflib.URIRef):
if concept_is_cdo_concept(data_triple_member):
data_cdo_concepts.add(data_triple_member)
elif isinstance(data_triple_member, rdflib.Literal):
if isinstance(data_triple_member.datatype, rdflib.URIRef):
if concept_is_cdo_concept(data_triple_member.datatype):
data_cdo_concepts.add(data_triple_member.datatype)

undefined_cdo_concepts = data_cdo_concepts - cdo_concepts
# Get the list of undefined CDO concepts in the graph
undefined_cdo_concepts = get_invalid_cdo_concepts(data_graph, ontology_graph)
for undefined_cdo_concept in sorted(undefined_cdo_concepts):
warnings.warn(undefined_cdo_concept, NonExistentCDOConceptWarning)
undefined_cdo_concepts_message = (
Expand All @@ -259,8 +371,9 @@ def main() -> None:
if args.format != "human":
validator_kwargs["serialize_report_graph"] = args.format

validate_result: Tuple[bool, Union[Exception, bytes, str, rdflib.Graph], str]
validate_result = pyshacl.validate(
validate_result: Tuple[
bool, Union[Exception, bytes, str, rdflib.Graph], str
ajnelson-nist marked this conversation as resolved.
Show resolved Hide resolved
] = pyshacl.validate(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this call needs to be replaced with the validate() function this PR is adding to this file, but only after logistics related to #123 are settled.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is addressed in 15f00c9.

data_graph,
shacl_graph=ontology_graph,
ont_graph=ontology_graph,
Expand Down
Loading