Skip to content

Commit

Permalink
Use jsonpath to collect values when downloading a file (#425)
Browse files Browse the repository at this point in the history
* Use jsonpath to collect values when downloading a file

* keep the default headers when downloading

* remove unnecessary "if" in collect_values_jp
  • Loading branch information
crisely09 authored Oct 31, 2024
1 parent bf14555 commit 0451e90
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 7 deletions.
8 changes: 6 additions & 2 deletions kgforge/core/archetypes/read_only_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
)
from kgforge.core.commons.execution import not_supported
from kgforge.core.commons.sparql_query_builder import SPARQLQueryBuilder
from kgforge.core.reshaping import collect_values
from kgforge.core.reshaping import collect_values, collect_values_jp
from kgforge.core.wrappings import Filter
from kgforge.core.wrappings.dict import DictWrapper

Expand Down Expand Up @@ -107,9 +107,13 @@ def download(
# path: DirPath.
urls = []
store_metadata = []
constraint_dict = None
if content_type:
constraint_dict = {'encodingFormat': content_type}
to_download = [data] if isinstance(data, Resource) else data
for d in to_download:
collected_values = collect_values(d, follow, DownloadingError)
# collected_values = collect_values(d, follow, DownloadingError)
collected_values = collect_values_jp(d, follow, DownloadingError, constraint_dict)
urls.extend(collected_values)
store_metadata.extend(
[d._store_metadata for _ in range(len(collected_values))]
Expand Down
28 changes: 27 additions & 1 deletion kgforge/core/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# You should have received a copy of the GNU Lesser General Public License
# along with Blue Brain Nexus Forge. If not, see <https://choosealicense.com/licenses/lgpl-3.0/>.

from typing import Dict, Iterator, List, Union, Type
from typing import Dict, Iterator, List, Union, Type, Optional
import jsonpath_ng as jp

from kgforge.core.resource import Resource
from kgforge.core.commons.attributes import repr_class
Expand Down Expand Up @@ -106,3 +107,28 @@ def _collect(things: List) -> Iterator[str]:
raise exception(
f"An error occur when collecting values for path to follow '{follow}': {str(e)}"
) from e


def collect_values_jp(data: Resource, follow: str,
                      exception: Type[Exception] = Exception,
                      constraint_dict: Optional[Dict] = None) -> List[str]:
    """Collect the values found at a dotted property path, using JSONPath.

    The dotted path ``follow`` (e.g. ``"hasPart.url"``) is translated into the
    JSONPath expression ``$.hasPart[*].url`` and evaluated against the JSON
    form of ``data`` produced by ``as_json``.

    :param data: the resource to collect values from.
    :param follow: dot-separated property path to follow.
    :param exception: exception type raised for any failure (path not found,
        unsupported constraint, conversion or parsing error).
    :param constraint_dict: optional single ``{key: value}`` constraint. When
        given, a value is kept only if the object that directly holds the
        matched property also has ``key`` equal to ``value``. Objects that do
        not define ``key`` are filtered out rather than treated as an error.
    :return: the list of matched values, in the order reported by jsonpath_ng.
    :raises exception: if nothing matches the path, if more than one
        constraint is given, or if any other error occurs while collecting.
    """
    try:
        # "a.b.c" -> "$.a[*].b[*].c": a wildcard step is inserted after every
        # intermediate property.
        pattern = "$." + "[*].".join(follow.split('.'))
        jp_query = jp.parse(pattern)
        # NOTE(review): positional flags are passed through to as_json exactly
        # as before; their meaning is defined by as_json, not visible here.
        json_data = as_json(data, False, False, None, None, None)
        results = jp_query.find(json_data)
        if not results:
            raise exception("Path not found")
        if not constraint_dict:
            return [result.value for result in results]
        if len(constraint_dict) != 1:
            raise NotImplementedError("Only one constraint can be imposed at the moment")
        [(key, expected)] = constraint_dict.items()

        def _satisfies_constraint(match) -> bool:
            # match.context.value is the parent object of the matched property.
            # A parent lacking the constraint key simply does not satisfy it;
            # it must not abort the collection of the other values.
            parent = match.context.value
            return key in parent and parent[key] == expected

        return [result.value for result in results if _satisfies_constraint(result)]

    except Exception as e:
        raise exception(
            f"An error occur when collecting values for path to follow '{follow}': {str(e)}"
        ) from e
2 changes: 0 additions & 2 deletions kgforge/specializations/stores/bluebrain_nexus.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,8 +601,6 @@ def _download_one(
params_download = copy.deepcopy(self.service.params.get("download", {}))
headers = (
self.service.headers_download
if not content_type
else update_dict(self.service.headers_download, {"Accept": content_type})
)

response = requests.get(
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
"owlrl>=5.2.3",
"elasticsearch_dsl==7.4.0",
"requests==2.32.0",
"typing-extensions"
"typing-extensions",
"jsonpath-ng"
],
extras_require={
"dev": [
Expand Down
28 changes: 27 additions & 1 deletion tests/core/test_reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pytest
from kgforge.core.resource import Resource
from kgforge.core.forge import KnowledgeGraphForge
from kgforge.core.reshaping import collect_values, Reshaper
from kgforge.core.reshaping import collect_values, collect_values_jp, Reshaper


def test_collect_values():
Expand All @@ -43,6 +43,32 @@ def test_collect_values():
collect_values(None, "hasPart.url",ValueError)


def test_collect_values_jasonpath():
    """Exercise collect_values_jp on flat, nested, list-valued and constrained paths."""
    # A property sitting directly on the resource.
    flat = Resource(type="Experiment", url="file.gz")
    collected = collect_values_jp(flat, "url")
    assert flat.url in collected, "url should be in the list"

    # A property reached through two levels of nested resources.
    innermost = Resource(url="file.gz")
    nested = Resource(type="Experiment", level1=Resource(level2=innermost))
    collected = collect_values_jp(nested, "level1.level2.url")
    assert nested.level1.level2.url in collected, "url should be in the list"

    # A list of parts: three expose `url`, one exposes `contentUrl`.
    parts = []
    for index in range(3):
        parts.append(Resource(type="Experiment", url=f"file{index}"))
    parts.append(Resource(type="Experiment", contentUrl="file3"))
    dataset = Resource(type="Dataset", hasPart=parts)
    collected = collect_values_jp(dataset, "hasPart.contentUrl")
    assert ["file3"] == collected, "one element should be in the list"
    collected = collect_values_jp(dataset, "hasPart.url")
    assert ["file0", "file1", "file2"] == collected, "three elements should be in the list"

    # Filtering the collected values on the encoding format of their parent.
    parts = [
        Resource(type="Experiment", url=f"file{index}", encodingFormat=f"application/{extension}")
        for index, extension in enumerate(['csv', 'swc'])
    ]
    dataset = Resource(type="Dataset", hasPart=parts)
    swc_only = {'encodingFormat': 'application/swc'}
    collected = collect_values_jp(dataset, "hasPart.url", constraint_dict=swc_only)
    assert ["file1"] == collected, "only the file with encodingFormat `application/swc` must be returned"

    # More than one constraint is not supported.
    two_constraints = {'encodingFormat': 'application/swc', 'contentUrl': 'something'}
    with pytest.raises(Exception):
        collect_values_jp(dataset, "hasPart.url", constraint_dict=two_constraints)

    # A path that matches nothing is an error.
    with pytest.raises(Exception):
        collect_values_jp(dataset, "fake.path")

    # Failures are reported with the exception type requested by the caller.
    with pytest.raises(ValueError):
        collect_values_jp(None, "hasPart.url", ValueError)


def test_reshape(config):
forge = KnowledgeGraphForge(config)
reshaper = Reshaper(versioned_id_template="{x.id}?_version={x._store_metadata.version}")
Expand Down

0 comments on commit 0451e90

Please sign in to comment.