From 637169f83bea955e3c262bae06c856c76a26ae8d Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Tue, 19 Sep 2023 16:18:58 -0400 Subject: [PATCH 01/11] added utils.download and basic tests for http_download --- pyvo/utils/__init__.py | 1 + pyvo/utils/download.py | 303 ++++++++++++++++++++++++++++++ pyvo/utils/tests/data/basic.xml | 28 +++ pyvo/utils/tests/test_download.py | 108 +++++++++++ 4 files changed, 440 insertions(+) create mode 100644 pyvo/utils/download.py create mode 100644 pyvo/utils/tests/data/basic.xml create mode 100644 pyvo/utils/tests/test_download.py diff --git a/pyvo/utils/__init__.py b/pyvo/utils/__init__.py index 96be2b9e3..46f1f05c7 100644 --- a/pyvo/utils/__init__.py +++ b/pyvo/utils/__init__.py @@ -1,2 +1,3 @@ from .compat import * from .prototype import prototype_feature, activate_features +from .download import http_download, aws_download \ No newline at end of file diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py new file mode 100644 index 000000000..972dc3bc6 --- /dev/null +++ b/pyvo/utils/download.py @@ -0,0 +1,303 @@ +""" +Utilties for downloading data +""" +import os +from urllib.parse import urlparse, unquote, parse_qs +import threading +from warnings import warn + +import astropy +from astropy.utils.console import ProgressBarOrSpinner +from astropy.utils.exceptions import AstropyUserWarning + + +from .http import use_session + +__all__ = ['http_download', 'aws_download'] + + +class PyvoUserWarning(AstropyUserWarning): + pass + + +# adapted from astroquery._download_file. +def http_download(url, + local_filepath=None, + cache=True, + timeout=None, + session=None, + verbose=False, + **kwargs): + """Download file from http(s) url + + Parameters + ---------- + url: str + The URL of the file to download + local_filepath: str + Local path, including filename, where the file is to be downloaded. + cache : bool + If True, check if a cached file exists before download + timeout: int + Time to attempt download before failing + session: requests.Session + Session to use. If None, create a new one. + verbose: bool + If True, print progress and debug text + + Keywords + -------- + additional keywords to be passed to session.request() + + Return + ------ + local_filepath: path to the downloaded file + + """ + + _session = use_session(session) + method = 'GET' + + if not local_filepath: + local_filepath = _filename_from_url(url) + + + response = _session.request(method, url, timeout=timeout, + stream=True, **kwargs) + + + response.raise_for_status() + if 'content-length' in response.headers: + length = int(response.headers['content-length']) + if length == 0: + if verbose: + print(f'URL {url} has length=0') + else: + length = None + + + if cache and os.path.exists(local_filepath): + if length is not None and os.path.getsize(local_filepath) != length: + warn(f'Found cached file but it has the wrong size. Overwriting ...', + category=PyvoUserWarning) + else: + if verbose: + print(f'Found cached file {local_filepath}.') + response.close() + return local_filepath + + response = _session.request(method, url, timeout=timeout, + stream=True, **kwargs) + response.raise_for_status() + + + blocksize = astropy.utils.data.conf.download_block_size + n_bytes = 0 + with ProgressBarOrSpinner(length, f'Downloading URL {url} to {local_filepath} ...') as pb: + with open(local_filepath, 'wb') as f: + for block in response.iter_content(blocksize): + f.write(block) + n_bytes += len(block) + if length is not None: + pb.update(min(n_bytes, length)) + else: + pb.update(n_bytes) + response.close() + return local_filepath + + +def _s3_is_accessible(s3_resource, bucket_name, key): + """Do a head_object call to test access + + Paramters + --------- + s3_resource : s3.ServiceResource + the service resource used for s3 connection. + bucket_name : str + bucket name. + key : str + key to file to test. + + Return + ----- + (accessible, msg) where accessible is a bool and msg is the failure message + + """ + + s3_client = s3_resource.meta.client + + try: + header_info = s3_client.head_object(Bucket=bucket_name, Key=key) + accessible, msg = True, '' + except Exception as e: + accessible = False + msg = str(e) + + return accessible, msg + + +# adapted from astroquery.mast. +def aws_download(uri=None, + bucket_name=None, + key=None, + local_filepath=None, + cache=False, + timeout=None, + aws_profile=None, + session=None, + verbose=False): + """Download file from AWS. + + Adapted from astroquery.mast + + Parameters + ---------- + uri: str + The URI for s3 location of the form: s3://bucket-name/key. + If given, bucket_name and key are ignored. + if None, both bucket_name and key are required. + bucket_name: str + Name of the s3 bucket + key: str + s3 key to the file. + local_filepath: str + Local path, including filename, where the file is to be downloaded. + cache : bool + If True, check if a cached file exists before download + timeout: int + Time to attempt download before failing + aws_profile: str + name of the user's profile for credentials in ~/.aws/config + or ~/.aws/credentials. Use to authenticate the AWS user with boto3. + session: boto3.session.Session + Session to use that include authentication if needed. + If None, create an annonymous one. If given, aws_profile is ignored + verbose: bool + If True, print progress and debug text + + Return + ------ + local_filepath: path to the downloaded file + + """ + try: + import boto3 + import botocore + except ImportError: + raise ImportError('aws_download requires boto3. Make sure it is installed first') + + if uri is None and (bucket_name is None and key is None): + raise ValueError('Either uri or both bucket_name and key must be given') + + if uri: + parsed = urlparse(uri, allow_fragments=False) + bucket_name = parsed.netloc + key = parsed.path[1:] + if verbose: + print(f'bucket: {bucket_name}, key: {key}') + + if not local_filepath: + local_filepath = _filename_from_url(f's3://{bucket_name}/{key}') + + + if session: + if not isinstance(session, boto3.session.Session): + raise ValueError('session has to be instance of boto3.session.Session') + s3_config = botocore.client.Config(connect_timeout=timeout) + s3_resource = session.resource(service_name='s3', config=s3_config) + else: + if aws_profile is None: + s3_config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=timeout) + s3_resource = boto3.resource(service_name='s3', config=s3_config) + else: + session = boto3.session.Session(profile_name=aws_profile) + s3_config = botocore.client.Config(connect_timeout=timeout) + s3_resource = session.resource(service_name='s3', config=s3_config) + + # check access + accessible, message1 = _s3_is_accessible(s3_resource, bucket_name, key) + if verbose: + print(f'Access with profile or annonymous: {accessible}. Message: {message1}') + + # If access with profile fails, attemp to use any credientials + # in the user system e.g. environment variables etc. boto3 should find them. + if not accessible: + s3_resource = boto3.resource(service_name='s3') + accessible, message2 = _s3_is_accessible(s3_resource, bucket_name, key) + if verbose: + print(f'Access with system credentials: {accessible}. Message: {message1}') + # is still not accessible, fail + if not accessible: + raise PermissionError((f'{key} in {bucket_name} is ' + f'inaccessible:\n{message1}\n{message2}')) + + # proceed with download + s3_client = s3_resource.meta.client + bkt = s3_resource.Bucket(bucket_name) + + # Ask the webserver what the expected content length is and use that. + info_lookup = s3_client.head_object(Bucket=bucket_name, Key=key) + length = info_lookup["ContentLength"] + + # if we have cache, use it and return, otherwise download data + if cache and os.path.exists(local_filepath): + + if length is not None: + statinfo = os.stat(local_filepath) + if statinfo.st_size == length: + # found cached file with expected size. Stop + if verbose: + print(f'Found cached file {local_filepath}.') + return local_filepath + if verbose: + print(f'Found cached file {local_filepath} with size {statinfo.st_size} ' + f'that is different from expected size {length}') + + with ProgressBarOrSpinner(length, (f'Downloading {key} to {local_filepath} ...')) as pb: + + # Bytes read tracks how much data has been received so far + # This variable will be updated in multiple threads below + global bytes_read + bytes_read = 0 + + progress_lock = threading.Lock() + + def progress_callback(numbytes): + # Boto3 calls this from multiple threads pulling the data from S3 + global bytes_read + + # This callback can be called in multiple threads + # Access to updating the console needs to be locked + with progress_lock: + bytes_read += numbytes + pb.update(bytes_read) + + bkt.download_file(key, local_filepath, Callback=progress_callback) + return local_filepath + +def _filename_from_url(url): + """Extract file name from uri/url + handle cases of urls of the form: + - https://example.com/files/myfile.pdf?user_id=123 + - https://example.com/files/myfile.pdf + - http://example.com/service?file=/location/myfile.pdf&size=large + + Parameters + ---------- + url: str + url of the file + + """ + parsed_url = urlparse(url) + path = parsed_url.path + path_parts = path.split('/') + filename = path_parts[-1] + if '.' not in filename: + query_params = parse_qs(parsed_url.query) + for param, values in query_params.items(): + if len(values) > 0: + filename = os.path.basename(unquote(values[0])) + if '.' in filename: + break + + return filename \ No newline at end of file diff --git a/pyvo/utils/tests/data/basic.xml b/pyvo/utils/tests/data/basic.xml new file mode 100644 index 000000000..cef585839 --- /dev/null +++ b/pyvo/utils/tests/data/basic.xml @@ -0,0 +1,28 @@ + + + + + OK + + + + + + + + + + + + + + + + + + + +
23Illuminatus
42Don't panic, and always carry a towel
1337Elite
+
+
diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py new file mode 100644 index 000000000..ca833dc28 --- /dev/null +++ b/pyvo/utils/tests/test_download.py @@ -0,0 +1,108 @@ +# Licensed under a 3-clause BSD style license - see LICENSE.rst +""" +Tests for pyvo.utils.download +""" +import os +from functools import partial +import re +import pytest +import requests_mock +from contextlib import contextmanager +from urllib.error import URLError + + +from astropy.utils.data import get_pkg_data_contents + +from pyvo.utils.download import _filename_from_url, PyvoUserWarning, http_download + + + + +class ContextAdapter(requests_mock.Adapter): + """ + requests_mock adapter where ``register_uri`` returns a context manager + """ + @contextmanager + def register_uri(self, *args, **kwargs): + matcher = super().register_uri(*args, **kwargs) + + yield matcher + + self.remove_matcher(matcher) + + def remove_matcher(self, matcher): + if matcher in self._matchers: + self._matchers.remove(matcher) + + +@pytest.fixture(scope='function') +def mocker(): + with requests_mock.Mocker( + adapter=ContextAdapter(case_sensitive=True) + ) as mocker_ins: + yield mocker_ins + + + + +get_pkg_data_contents = partial( + get_pkg_data_contents, package=__package__, encoding='binary') + +data_re = re.compile('http://example.com/data.*') + + +def test__filename_from_url(): + urls = [ + 'https://example.com/files/myfile.pdf?user_id=123', + 'https://example.com/files/myfile.pdf', + 'http://somesite.com/service?file=/location/myfile.pdf&size=large' + ] + + for url in urls: + filename = _filename_from_url(url) + assert(filename == 'myfile.pdf') + + +@pytest.fixture(name='http_mock') +def _data_downloader(mocker): + def callback(request, context): + #print(context.headers);raise ValueError + fname = request.path.split('/')[-1] + return get_pkg_data_contents(f'data/{fname}') + + with mocker.register_uri( + 'GET', data_re, content=callback, headers={'content-length':'901'}, + ) as matcher: + yield matcher + +@pytest.mark.usefixtures('http_mock') +def test_http_download__noPath(): + filename = http_download('http://example.com/data/basic.xml', + local_filepath=None, cache=False) + assert(filename == 'basic.xml') + os.remove('basic.xml') + +@pytest.mark.usefixtures('http_mock') +def test_http_download__noFile(): + with pytest.raises(URLError): + filename = http_download('http://example.com/data/nofile.fits') + + +@pytest.mark.usefixtures('http_mock') +def test_http_download__wPath(): + filename = http_download('http://example.com/data/basic.xml', + local_filepath='basic2.xml', cache=False) + assert(filename == 'basic2.xml') + os.remove('basic2.xml') + +@pytest.mark.usefixtures('http_mock') +def test_http_download__wrong_cache(): + # get the file first + with open('basic.xml', 'w') as fp: + fp.write('some content') + # get it from cache + with pytest.warns(PyvoUserWarning): + filename = http_download('http://example.com/data/basic.xml', + local_filepath='basic.xml', cache=True) + assert(os.path.getsize('basic.xml') == 901) + os.remove('basic.xml') \ No newline at end of file From ecf2782049be5103ec24addfb40a621fcee86279 Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Tue, 19 Sep 2023 22:20:47 -0400 Subject: [PATCH 02/11] added tests for utils.aws_download --- pyvo/utils/__init__.py | 2 +- pyvo/utils/download.py | 33 +++++---- pyvo/utils/tests/test_download.py | 116 ++++++++++++++++++++++++------ 3 files changed, 114 insertions(+), 37 deletions(-) diff --git a/pyvo/utils/__init__.py b/pyvo/utils/__init__.py index 46f1f05c7..d74f69100 100644 --- a/pyvo/utils/__init__.py +++ b/pyvo/utils/__init__.py @@ -1,3 +1,3 @@ from .compat import * from .prototype import prototype_feature, activate_features -from .download import http_download, aws_download \ No newline at end of file +from .download import http_download, aws_download diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index 972dc3bc6..1a63cd1cd 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -120,20 +120,21 @@ def _s3_is_accessible(s3_resource, bucket_name, key): Return ----- - (accessible, msg) where accessible is a bool and msg is the failure message + (accessible, exception) where accessible is a bool and exception is the error exception """ s3_client = s3_resource.meta.client + exception = None try: header_info = s3_client.head_object(Bucket=bucket_name, Key=key) - accessible, msg = True, '' + accessible = True except Exception as e: accessible = False - msg = str(e) + exception = e - return accessible, msg + return accessible, exception # adapted from astroquery.mast. @@ -205,31 +206,34 @@ def aws_download(uri=None, raise ValueError('session has to be instance of boto3.session.Session') s3_config = botocore.client.Config(connect_timeout=timeout) s3_resource = session.resource(service_name='s3', config=s3_config) + msg1 = 'Access with session' else: if aws_profile is None: s3_config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=timeout) s3_resource = boto3.resource(service_name='s3', config=s3_config) + msg1 = 'Annonymous access' else: session = boto3.session.Session(profile_name=aws_profile) s3_config = botocore.client.Config(connect_timeout=timeout) s3_resource = session.resource(service_name='s3', config=s3_config) + msg1 = f'Access with profile ({aws_profile})' # check access - accessible, message1 = _s3_is_accessible(s3_resource, bucket_name, key) + accessible, exception1 = _s3_is_accessible(s3_resource, bucket_name, key) if verbose: - print(f'Access with profile or annonymous: {accessible}. Message: {message1}') + print(f'{msg1}: {accessible}. Message: {str(exception1)}') # If access with profile fails, attemp to use any credientials # in the user system e.g. environment variables etc. boto3 should find them. if not accessible: + msg2 = 'Access with system credentials' s3_resource = boto3.resource(service_name='s3') - accessible, message2 = _s3_is_accessible(s3_resource, bucket_name, key) + accessible, exception2 = _s3_is_accessible(s3_resource, bucket_name, key) if verbose: - print(f'Access with system credentials: {accessible}. Message: {message1}') + print(f'{msg2}: {accessible}. Message: {str(exception2)}') # is still not accessible, fail if not accessible: - raise PermissionError((f'{key} in {bucket_name} is ' - f'inaccessible:\n{message1}\n{message2}')) + raise exception2 # proceed with download s3_client = s3_resource.meta.client @@ -249,9 +253,9 @@ def aws_download(uri=None, if verbose: print(f'Found cached file {local_filepath}.') return local_filepath - if verbose: - print(f'Found cached file {local_filepath} with size {statinfo.st_size} ' - f'that is different from expected size {length}') + else: + warn(f'Found cached file but it has the wrong size. Overwriting ...', + category=PyvoUserWarning) with ProgressBarOrSpinner(length, (f'Downloading {key} to {local_filepath} ...')) as pb: @@ -275,6 +279,7 @@ def progress_callback(numbytes): bkt.download_file(key, local_filepath, Callback=progress_callback) return local_filepath + def _filename_from_url(url): """Extract file name from uri/url handle cases of urls of the form: @@ -300,4 +305,4 @@ def _filename_from_url(url): if '.' in filename: break - return filename \ No newline at end of file + return filename diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index ca833dc28..6882e2e74 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -8,14 +8,25 @@ import pytest import requests_mock from contextlib import contextmanager -from urllib.error import URLError +import urllib +import boto3 +from botocore.exceptions import ClientError +from moto import mock_s3 from astropy.utils.data import get_pkg_data_contents -from pyvo.utils.download import _filename_from_url, PyvoUserWarning, http_download - +from pyvo.utils.download import (_filename_from_url, PyvoUserWarning, _s3_is_accessible, + http_download, aws_download) +## Useful variables +# For http_download: +get_pkg_data_contents = partial( + get_pkg_data_contents, package=__package__, encoding='binary') +data_re = re.compile('http://example.com/data.*') +# For aws_download: +s3_bucket = 'pyvo-bucket' +s3_key = 'key1/key2/somekey.txt' class ContextAdapter(requests_mock.Adapter): @@ -43,14 +54,6 @@ def mocker(): yield mocker_ins - - -get_pkg_data_contents = partial( - get_pkg_data_contents, package=__package__, encoding='binary') - -data_re = re.compile('http://example.com/data.*') - - def test__filename_from_url(): urls = [ 'https://example.com/files/myfile.pdf?user_id=123', @@ -66,7 +69,6 @@ def test__filename_from_url(): @pytest.fixture(name='http_mock') def _data_downloader(mocker): def callback(request, context): - #print(context.headers);raise ValueError fname = request.path.split('/')[-1] return get_pkg_data_contents(f'data/{fname}') @@ -75,28 +77,28 @@ def callback(request, context): ) as matcher: yield matcher -@pytest.mark.usefixtures('http_mock') -def test_http_download__noPath(): + +def test_http_download__noPath(http_mock): filename = http_download('http://example.com/data/basic.xml', local_filepath=None, cache=False) assert(filename == 'basic.xml') os.remove('basic.xml') -@pytest.mark.usefixtures('http_mock') -def test_http_download__noFile(): - with pytest.raises(URLError): + +def test_http_download__noFile(http_mock): + with pytest.raises(urllib.error.URLError): filename = http_download('http://example.com/data/nofile.fits') -@pytest.mark.usefixtures('http_mock') -def test_http_download__wPath(): +def test_http_download__wPath(http_mock): filename = http_download('http://example.com/data/basic.xml', local_filepath='basic2.xml', cache=False) assert(filename == 'basic2.xml') + assert(os.path.exists('basic2.xml')) os.remove('basic2.xml') -@pytest.mark.usefixtures('http_mock') -def test_http_download__wrong_cache(): + +def test_http_download__wrong_cache(http_mock): # get the file first with open('basic.xml', 'w') as fp: fp.write('some content') @@ -105,4 +107,74 @@ def test_http_download__wrong_cache(): filename = http_download('http://example.com/data/basic.xml', local_filepath='basic.xml', cache=True) assert(os.path.getsize('basic.xml') == 901) - os.remove('basic.xml') \ No newline at end of file + os.remove('basic.xml') + + +@pytest.fixture(name='s3_mock') +def _s3_mock(mocker): + with mock_s3(): + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=s3_bucket) + s3_client = conn.meta.client + s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body='my content') + yield conn + + +def test_s3_mock_basic(s3_mock): + s3 = s3_mock.meta.client + body = s3_mock.Object(s3_bucket, s3_key).get()['Body'] + content = body.read().decode('utf-8') + assert content == 'my content' + + +def test__s3_is_accessible_yes(s3_mock): + accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, s3_key) + assert(accessible) + assert(exc is None) + + +def test__s3_is_accessible_no_bucket(s3_mock): + accessible, exc = _s3_is_accessible(s3_mock, 'some-bucket', s3_key) + assert(not accessible) + assert('NoSuchBucket' in str(exc)) + + +def test__s3_is_accessible_no_key(s3_mock): + accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, 'does/not/exist') + assert(not accessible) + assert(isinstance(exc, ClientError)) + errmsg = str(exc) + assert('Not Found' in errmsg and '404' in errmsg) + + +def test_s3_download__noPath(s3_mock): + filename = aws_download(f's3://{s3_bucket}/{s3_key}', + local_filepath=None, cache=False) + fname = s3_key.split('/')[-1] + assert(filename == fname) + os.remove(fname) + + +def test_s3_download__noKey(s3_mock): + with pytest.raises(ClientError): + filename = aws_download(f's3://{s3_bucket}/does/not/exist') + + +def test_s3_download__wPath(s3_mock): + filename = aws_download(f's3://{s3_bucket}/{s3_key}', + local_filepath='newkey.txt', cache=False) + assert(filename == 'newkey.txt') + assert(os.path.exists('newkey.txt')) + os.remove(filename) + + +def test_aws_download__wrong_cache(s3_mock): + # get the file first + with open('somekey.txt', 'w') as fp: + fp.write('not my content') + # get it from cache + with pytest.warns(PyvoUserWarning): + filename = aws_download(f's3://{s3_bucket}/{s3_key}', + local_filepath='somekey.txt', cache=True) + assert(os.path.getsize('somekey.txt') == 10) + os.remove('somekey.txt') From fa27c1910a53e0f3aa057bb148664112591b6e9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Mon, 25 Sep 2023 13:19:11 -0700 Subject: [PATCH 03/11] Handling optional dependencies --- pyvo/utils/tests/test_download.py | 37 ++++++++++++++++++++++--------- setup.cfg | 4 ++++ tox.ini | 1 + 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index 6882e2e74..500038b59 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -9,17 +9,24 @@ import requests_mock from contextlib import contextmanager import urllib -import boto3 -from botocore.exceptions import ClientError -from moto import mock_s3 - from astropy.utils.data import get_pkg_data_contents from pyvo.utils.download import (_filename_from_url, PyvoUserWarning, _s3_is_accessible, http_download, aws_download) -## Useful variables +try: + # Both boto3, botocore and moto are optional dependencies, but the former 2 are + # dependencies of the latter, so it's enough to handle them with one variable + import boto3 + from botocore.exceptions import ClientError + from moto import mock_s3 + HAS_MOTO = True +except ImportError: + HAS_MOTO = False + + +# Useful variables # For http_download: get_pkg_data_contents = partial( get_pkg_data_contents, package=__package__, encoding='binary') @@ -60,7 +67,7 @@ def test__filename_from_url(): 'https://example.com/files/myfile.pdf', 'http://somesite.com/service?file=/location/myfile.pdf&size=large' ] - + for url in urls: filename = _filename_from_url(url) assert(filename == 'myfile.pdf') @@ -79,11 +86,11 @@ def callback(request, context): def test_http_download__noPath(http_mock): - filename = http_download('http://example.com/data/basic.xml', + filename = http_download('http://example.com/data/basic.xml', local_filepath=None, cache=False) assert(filename == 'basic.xml') os.remove('basic.xml') - + def test_http_download__noFile(http_mock): with pytest.raises(urllib.error.URLError): @@ -91,7 +98,7 @@ def test_http_download__noFile(http_mock): def test_http_download__wPath(http_mock): - filename = http_download('http://example.com/data/basic.xml', + filename = http_download('http://example.com/data/basic.xml', local_filepath='basic2.xml', cache=False) assert(filename == 'basic2.xml') assert(os.path.exists('basic2.xml')) @@ -104,12 +111,14 @@ def test_http_download__wrong_cache(http_mock): fp.write('some content') # get it from cache with pytest.warns(PyvoUserWarning): - filename = http_download('http://example.com/data/basic.xml', + filename = http_download('http://example.com/data/basic.xml', local_filepath='basic.xml', cache=True) assert(os.path.getsize('basic.xml') == 901) os.remove('basic.xml') + +@pytest.mark.skipif('not HAS_MOTO') @pytest.fixture(name='s3_mock') def _s3_mock(mocker): with mock_s3(): @@ -120,6 +129,7 @@ def _s3_mock(mocker): yield conn +@pytest.mark.skipif('not HAS_MOTO') def test_s3_mock_basic(s3_mock): s3 = s3_mock.meta.client body = s3_mock.Object(s3_bucket, s3_key).get()['Body'] @@ -127,18 +137,21 @@ def test_s3_mock_basic(s3_mock): assert content == 'my content' +@pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_yes(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, s3_key) assert(accessible) assert(exc is None) +@pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_no_bucket(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, 'some-bucket', s3_key) assert(not accessible) assert('NoSuchBucket' in str(exc)) +@pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_no_key(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, 'does/not/exist') assert(not accessible) @@ -147,6 +160,7 @@ def test__s3_is_accessible_no_key(s3_mock): assert('Not Found' in errmsg and '404' in errmsg) +@pytest.mark.skipif('not HAS_MOTO') def test_s3_download__noPath(s3_mock): filename = aws_download(f's3://{s3_bucket}/{s3_key}', local_filepath=None, cache=False) @@ -155,11 +169,13 @@ def test_s3_download__noPath(s3_mock): os.remove(fname) +@pytest.mark.skipif('not HAS_MOTO') def test_s3_download__noKey(s3_mock): with pytest.raises(ClientError): filename = aws_download(f's3://{s3_bucket}/does/not/exist') +@pytest.mark.skipif('not HAS_MOTO') def test_s3_download__wPath(s3_mock): filename = aws_download(f's3://{s3_bucket}/{s3_key}', local_filepath='newkey.txt', cache=False) @@ -168,6 +184,7 @@ def test_s3_download__wPath(s3_mock): os.remove(filename) +@pytest.mark.skipif('not HAS_MOTO') def test_aws_download__wrong_cache(s3_mock): # get the file first with open('somekey.txt', 'w') as fp: diff --git a/setup.cfg b/setup.cfg index 0919f9e0d..5aa68fb6a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,10 +67,14 @@ python_requires = >=3.8 [options.extras_require] all = pillow + boto3 + botocore test = pytest-doctestplus>=0.13 pytest-astropy requests-mock +test_all = + moto docs = sphinx-astropy diff --git a/tox.ini b/tox.ini index 6fab0c398..be23c32c2 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ requires = extras = test: test alldeps: all + alldeps: test_all description = run tests From b9938f2f2470f690a41c66e166f96182796127a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Mon, 25 Sep 2023 13:19:32 -0700 Subject: [PATCH 04/11] TST: fixing code style CI job --- pyvo/utils/download.py | 29 +++++++--------- pyvo/utils/tests/test_download.py | 55 +++++++++++++++---------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index 1a63cd1cd..ef67fdc99 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -29,7 +29,7 @@ def http_download(url, verbose=False, **kwargs): """Download file from http(s) url - + Parameters ---------- url: str @@ -44,7 +44,7 @@ def http_download(url, Session to use. If None, create a new one. verbose: bool If True, print progress and debug text - + Keywords -------- additional keywords to be passed to session.request() @@ -61,11 +61,9 @@ def http_download(url, if not local_filepath: local_filepath = _filename_from_url(url) - response = _session.request(method, url, timeout=timeout, - stream=True, **kwargs) + stream=True, **kwargs) - response.raise_for_status() if 'content-length' in response.headers: length = int(response.headers['content-length']) @@ -75,10 +73,9 @@ def http_download(url, else: length = None - if cache and os.path.exists(local_filepath): if length is not None and os.path.getsize(local_filepath) != length: - warn(f'Found cached file but it has the wrong size. Overwriting ...', + warn('Found cached file but it has the wrong size. Overwriting ...', category=PyvoUserWarning) else: if verbose: @@ -90,7 +87,6 @@ def http_download(url, stream=True, **kwargs) response.raise_for_status() - blocksize = astropy.utils.data.conf.download_block_size n_bytes = 0 with ProgressBarOrSpinner(length, f'Downloading URL {url} to {local_filepath} ...') as pb: @@ -128,11 +124,11 @@ def _s3_is_accessible(s3_resource, bucket_name, key): exception = None try: - header_info = s3_client.head_object(Bucket=bucket_name, Key=key) + _ = s3_client.head_object(Bucket=bucket_name, Key=key) accessible = True - except Exception as e: + except Exception as ex: accessible = False - exception = e + exception = ex return accessible, exception @@ -171,7 +167,7 @@ def aws_download(uri=None, name of the user's profile for credentials in ~/.aws/config or ~/.aws/credentials. Use to authenticate the AWS user with boto3. session: boto3.session.Session - Session to use that include authentication if needed. + Session to use that include authentication if needed. If None, create an annonymous one. If given, aws_profile is ignored verbose: bool If True, print progress and debug text @@ -200,7 +196,6 @@ def aws_download(uri=None, if not local_filepath: local_filepath = _filename_from_url(f's3://{bucket_name}/{key}') - if session: if not isinstance(session, boto3.session.Session): raise ValueError('session has to be instance of boto3.session.Session') @@ -254,7 +249,7 @@ def aws_download(uri=None, print(f'Found cached file {local_filepath}.') return local_filepath else: - warn(f'Found cached file but it has the wrong size. Overwriting ...', + warn('Found cached file but it has the wrong size. Overwriting ...', category=PyvoUserWarning) with ProgressBarOrSpinner(length, (f'Downloading {key} to {local_filepath} ...')) as pb: @@ -286,12 +281,12 @@ def _filename_from_url(url): - https://example.com/files/myfile.pdf?user_id=123 - https://example.com/files/myfile.pdf - http://example.com/service?file=/location/myfile.pdf&size=large - + Parameters ---------- url: str url of the file - + """ parsed_url = urlparse(url) path = parsed_url.path @@ -304,5 +299,5 @@ def _filename_from_url(url): filename = os.path.basename(unquote(values[0])) if '.' in filename: break - + return filename diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index 500038b59..6f50b5837 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -25,11 +25,10 @@ except ImportError: HAS_MOTO = False - # Useful variables # For http_download: get_pkg_data_contents = partial( - get_pkg_data_contents, package=__package__, encoding='binary') + get_pkg_data_contents, package=__package__, encoding='binary') data_re = re.compile('http://example.com/data.*') # For aws_download: s3_bucket = 'pyvo-bucket' @@ -70,7 +69,7 @@ def test__filename_from_url(): for url in urls: filename = _filename_from_url(url) - assert(filename == 'myfile.pdf') + assert filename == 'myfile.pdf' @pytest.fixture(name='http_mock') @@ -80,7 +79,7 @@ def callback(request, context): return get_pkg_data_contents(f'data/{fname}') with mocker.register_uri( - 'GET', data_re, content=callback, headers={'content-length':'901'}, + 'GET', data_re, content=callback, headers={'content-length': '901'}, ) as matcher: yield matcher @@ -88,20 +87,20 @@ def callback(request, context): def test_http_download__noPath(http_mock): filename = http_download('http://example.com/data/basic.xml', local_filepath=None, cache=False) - assert(filename == 'basic.xml') + assert filename == 'basic.xml' os.remove('basic.xml') def test_http_download__noFile(http_mock): with pytest.raises(urllib.error.URLError): - filename = http_download('http://example.com/data/nofile.fits') + http_download('http://example.com/data/nofile.fits') def test_http_download__wPath(http_mock): filename = http_download('http://example.com/data/basic.xml', local_filepath='basic2.xml', cache=False) - assert(filename == 'basic2.xml') - assert(os.path.exists('basic2.xml')) + assert filename == 'basic2.xml' + assert os.path.exists('basic2.xml') os.remove('basic2.xml') @@ -111,13 +110,12 @@ def test_http_download__wrong_cache(http_mock): fp.write('some content') # get it from cache with pytest.warns(PyvoUserWarning): - filename = http_download('http://example.com/data/basic.xml', - local_filepath='basic.xml', cache=True) - assert(os.path.getsize('basic.xml') == 901) + http_download('http://example.com/data/basic.xml', + local_filepath='basic.xml', cache=True) + assert os.path.getsize('basic.xml') == 901 os.remove('basic.xml') - @pytest.mark.skipif('not HAS_MOTO') @pytest.fixture(name='s3_mock') def _s3_mock(mocker): @@ -131,7 +129,6 @@ def _s3_mock(mocker): @pytest.mark.skipif('not HAS_MOTO') def test_s3_mock_basic(s3_mock): - s3 = s3_mock.meta.client body = s3_mock.Object(s3_bucket, s3_key).get()['Body'] content = body.read().decode('utf-8') assert content == 'my content' @@ -140,47 +137,47 @@ def test_s3_mock_basic(s3_mock): @pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_yes(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, s3_key) - assert(accessible) - assert(exc is None) + assert accessible + assert exc is None @pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_no_bucket(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, 'some-bucket', s3_key) - assert(not accessible) - assert('NoSuchBucket' in str(exc)) + assert not accessible + assert 'NoSuchBucket' in str(exc) @pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_no_key(s3_mock): accessible, exc = _s3_is_accessible(s3_mock, s3_bucket, 'does/not/exist') - assert(not accessible) - assert(isinstance(exc, ClientError)) + assert not accessible + assert isinstance(exc, ClientError) errmsg = str(exc) - assert('Not Found' in errmsg and '404' in errmsg) + assert 'Not Found' in errmsg and '404' in errmsg @pytest.mark.skipif('not HAS_MOTO') def test_s3_download__noPath(s3_mock): filename = aws_download(f's3://{s3_bucket}/{s3_key}', - local_filepath=None, cache=False) + local_filepath=None, cache=False) fname = s3_key.split('/')[-1] - assert(filename == fname) + assert filename == fname os.remove(fname) @pytest.mark.skipif('not HAS_MOTO') def test_s3_download__noKey(s3_mock): with pytest.raises(ClientError): - filename = aws_download(f's3://{s3_bucket}/does/not/exist') + aws_download(f's3://{s3_bucket}/does/not/exist') @pytest.mark.skipif('not HAS_MOTO') def test_s3_download__wPath(s3_mock): filename = aws_download(f's3://{s3_bucket}/{s3_key}', - local_filepath='newkey.txt', cache=False) - assert(filename == 'newkey.txt') - assert(os.path.exists('newkey.txt')) + local_filepath='newkey.txt', cache=False) + assert filename == 'newkey.txt' + assert os.path.exists('newkey.txt') os.remove(filename) @@ -191,7 +188,7 @@ def test_aws_download__wrong_cache(s3_mock): fp.write('not my content') # get it from cache with pytest.warns(PyvoUserWarning): - filename = aws_download(f's3://{s3_bucket}/{s3_key}', - local_filepath='somekey.txt', cache=True) - assert(os.path.getsize('somekey.txt') == 10) + aws_download(f's3://{s3_bucket}/{s3_key}', + local_filepath='somekey.txt', cache=True) + assert os.path.getsize('somekey.txt') == 10 os.remove('somekey.txt') From b801fb301c7720a4513eed9ffc663e6f9786c3de Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Mon, 25 Sep 2023 21:20:02 -0400 Subject: [PATCH 05/11] fix windows Ci --- pyvo/utils/tests/test_download.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index 6f50b5837..ae071561d 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -112,7 +112,10 @@ def test_http_download__wrong_cache(http_mock): with pytest.warns(PyvoUserWarning): http_download('http://example.com/data/basic.xml', local_filepath='basic.xml', cache=True) - assert os.path.getsize('basic.xml') == 901 + with open('basic.xml') as fp: + lines = fp.readlines() + assert len(lines) == 28 + assert '' in lines[0] os.remove('basic.xml') From c657172ffa993b831a04d6cf0ea4de9097fdf719 Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Mon, 16 Oct 2023 14:03:49 -0400 Subject: [PATCH 06/11] removed kwargs and unnecessary request call --- pyvo/utils/download.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index ef67fdc99..0f0ed1f4b 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -26,8 +26,7 @@ def http_download(url, cache=True, timeout=None, session=None, - verbose=False, - **kwargs): + verbose=False): """Download file from http(s) url Parameters @@ -45,9 +44,6 @@ def http_download(url, verbose: bool If True, print progress and debug text - Keywords - -------- - additional keywords to be passed to session.request() Return ------ @@ -62,7 +58,7 @@ def http_download(url, local_filepath = _filename_from_url(url) response = _session.request(method, url, timeout=timeout, - stream=True, **kwargs) + stream=True) response.raise_for_status() if 'content-length' in response.headers: @@ -83,10 +79,6 @@ def http_download(url, response.close() return local_filepath - response = _session.request(method, url, timeout=timeout, - stream=True, **kwargs) - response.raise_for_status() - blocksize = astropy.utils.data.conf.download_block_size n_bytes = 0 with ProgressBarOrSpinner(length, f'Downloading URL {url} to {local_filepath} ...') as pb: From 15190b506abdddbe9d0a050ca8496387508c247b Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Mon, 16 Oct 2023 14:16:04 -0400 Subject: [PATCH 07/11] add auth to http_download and use PyvoUserWarning from dal.exceptions --- pyvo/utils/download.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index 0f0ed1f4b..af623a2d8 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -10,15 +10,13 @@ from astropy.utils.console import ProgressBarOrSpinner from astropy.utils.exceptions import AstropyUserWarning +from pyvo.dal.exceptions import PyvoUserWarning +from pyvo.utils.http import use_session -from .http import use_session __all__ = ['http_download', 'aws_download'] -class PyvoUserWarning(AstropyUserWarning): - pass - # adapted from astroquery._download_file. def http_download(url, @@ -26,6 +24,7 @@ def http_download(url, cache=True, timeout=None, session=None, + auth=None, verbose=False): """Download file from http(s) url @@ -41,6 +40,8 @@ def http_download(url, Time to attempt download before failing session: requests.Session Session to use. If None, create a new one. + auth: dict or None + to be passed to seesionr.request for authentication when needed. verbose: bool If True, print progress and debug text @@ -58,19 +59,20 @@ def http_download(url, local_filepath = _filename_from_url(url) response = _session.request(method, url, timeout=timeout, - stream=True) + stream=True, auth=auth) response.raise_for_status() if 'content-length' in response.headers: length = int(response.headers['content-length']) if length == 0: if verbose: - print(f'URL {url} has length=0') + warn(f'URL {url} has length=0', category=PyvoUserWarning) else: length = None if cache and os.path.exists(local_filepath): - if length is not None and os.path.getsize(local_filepath) != length: + local_size = os.stat(local_filepath).st_size + if length is not None and local_size != length: warn('Found cached file but it has the wrong size. Overwriting ...', category=PyvoUserWarning) else: From 1ad2b969a6208cd1a76c522f261990dfdc23a495 Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Mon, 16 Oct 2023 15:35:56 -0400 Subject: [PATCH 08/11] added remote tests for utils.download --- pyvo/utils/download.py | 2 +- pyvo/utils/tests/test_download.py | 105 ++++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 6 deletions(-) diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index af623a2d8..224d4a6df 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -190,7 +190,7 @@ def aws_download(uri=None, if not local_filepath: local_filepath = _filename_from_url(f's3://{bucket_name}/{key}') - if session: + if session is not None: if not isinstance(session, boto3.session.Session): raise ValueError('session has to be instance of boto3.session.Session') s3_config = botocore.client.Config(connect_timeout=timeout) diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index ae071561d..31dbc810a 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -7,6 +7,7 @@ import re import pytest import requests_mock +import requests from contextlib import contextmanager import urllib @@ -19,6 +20,7 @@ # Both boto3, botocore and moto are optional dependencies, but the former 2 are # dependencies of the latter, so it's enough to handle them with one variable import boto3 + import botocore from botocore.exceptions import ClientError from moto import mock_s3 HAS_MOTO = True @@ -91,11 +93,6 @@ def test_http_download__noPath(http_mock): os.remove('basic.xml') -def test_http_download__noFile(http_mock): - with pytest.raises(urllib.error.URLError): - http_download('http://example.com/data/nofile.fits') - - def test_http_download__wPath(http_mock): filename = http_download('http://example.com/data/basic.xml', local_filepath='basic2.xml', cache=False) @@ -104,6 +101,18 @@ def test_http_download__wPath(http_mock): os.remove('basic2.xml') +def test_http_download__wCache(http_mock, capsys): + filename1 = http_download('http://example.com/data/basic.xml', + local_filepath=None, cache=False) + assert filename1 == 'basic.xml' + + filename2 = http_download('http://example.com/data/basic.xml', + local_filepath=None, cache=True, verbose=True) + assert filename1 == filename2 + assert 'Found cached file' in capsys.readouterr().out + os.remove('basic.xml') + + def test_http_download__wrong_cache(http_mock): # get the file first with open('basic.xml', 'w') as fp: @@ -166,6 +175,7 @@ def test_s3_download__noPath(s3_mock): local_filepath=None, cache=False) fname = s3_key.split('/')[-1] assert filename == fname + assert os.path.exists(filename) os.remove(fname) @@ -195,3 +205,88 @@ def test_aws_download__wrong_cache(s3_mock): local_filepath='somekey.txt', cache=True) assert os.path.getsize('somekey.txt') == 10 os.remove('somekey.txt') + +## ---------------------- ## +## ---- Remote Tests ---- ## + +@pytest.mark.remote_data +def test_http_download__noFile(): + with pytest.raises(requests.exceptions.HTTPError): + http_download('https://heasarc.gsfc.nasa.gov/FTP/data/nofile.fits') + + +@pytest.mark.remote_data +def test_http_download__remote(): + url = 'https://heasarc.gsfc.nasa.gov/FTP/asca/README' + filename1 = http_download(url, local_filepath=None, cache=False) + assert filename1 == 'README' + with open('README', 'r') as fp: + lines = fp.readlines() + assert len(lines) == 26 + assert 'The asca directory' in lines[1] + os.remove('README') + + +@pytest.mark.remote_data +def test_http_download__remote_cache(capsys): + url = 'https://heasarc.gsfc.nasa.gov/FTP/asca/README' + filename1 = http_download(url, local_filepath=None, cache=False) + + filename2 = http_download(url, local_filepath=None, cache=True, verbose=True) + assert filename1 == filename2 + assert 'Found cached file' in capsys.readouterr().out + os.remove('README') + + +@pytest.mark.remote_data +@pytest.mark.skipif('not HAS_MOTO') +def test__s3_is_accessible_no_bucket_remote(): + s3config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=100) + s3_resource = boto3.resource(service_name='s3', config=s3config) + accessible, exc = _s3_is_accessible(s3_resource, 'pyvo-nonexistent-bucket', s3_key) + assert not accessible + assert '404' in str(exc) + +@pytest.mark.remote_data +@pytest.mark.skipif('not HAS_MOTO') +def test__s3_is_accessible_no_key_remote(): + s3config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=100) + s3_resource = boto3.resource(service_name='s3', config=s3config) + accessible, exc = _s3_is_accessible(s3_resource, 'nasa-heasarc', 'README') + assert not accessible + assert isinstance(exc, ClientError) + errmsg = str(exc) + assert 'Not Found' in errmsg and '404' in errmsg + + +@pytest.mark.remote_data +@pytest.mark.skipif('not HAS_MOTO') +def test__s3_is_accessible_yes_remote(): + s3config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=100) + s3_resource = boto3.resource(service_name='s3', config=s3config) + key = 'asca/data/rev2/97006000/aux/ad97006000_002_source_info.html.gz' + accessible, exc = _s3_is_accessible(s3_resource, 'nasa-heasarc', key) + assert accessible + assert exc is None + +@pytest.mark.remote_data +@pytest.mark.skipif('not HAS_MOTO') +def test__s3_is_accessible_download_remote(): + s3config = botocore.client.Config(signature_version=botocore.UNSIGNED, connect_timeout=100) + s3_resource = boto3.resource(service_name='s3', config=s3config) + key = 'asca/data/rev2/97006000/aux/ad97006000_002_source_info.html.gz' + accessible, exc = _s3_is_accessible(s3_resource, 'nasa-heasarc', key) + assert accessible + assert exc is None + + +@pytest.mark.remote_data +@pytest.mark.skipif('not HAS_MOTO') +def test_s3_download__noPath_remote(): + key = 'asca/data/rev2/97006000/aux/ad97006000_002_source_info.html.gz' + filename = aws_download(f's3://nasa-heasarc/{key}', + local_filepath=None, cache=False) + fname = 'ad97006000_002_source_info.html.gz' + assert filename == fname + assert os.path.exists(filename) + os.remove(fname) From f74ff6d05684f3a74a83c1097a911db027756b63 Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Mon, 16 Oct 2023 15:40:00 -0400 Subject: [PATCH 09/11] add utils.download to CHANGES.rst --- CHANGES.rst | 3 +++ pyvo/utils/download.py | 2 -- pyvo/utils/tests/test_download.py | 8 +++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index b3182cad7..26464c4fa 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -36,6 +36,9 @@ - Output of ``repr`` for DALResults instance now clearly shows it is a DALResultsTable and not a generic astropy Table. [#478] +- Add ``utils.http_download`` and ``utils.aws_download`` for downling data + files from on-prem servers and the aws cloud. [#489] + 1.4.3 (unreleased) ================== diff --git a/pyvo/utils/download.py b/pyvo/utils/download.py index 224d4a6df..e14be3f69 100644 --- a/pyvo/utils/download.py +++ b/pyvo/utils/download.py @@ -8,7 +8,6 @@ import astropy from astropy.utils.console import ProgressBarOrSpinner -from astropy.utils.exceptions import AstropyUserWarning from pyvo.dal.exceptions import PyvoUserWarning from pyvo.utils.http import use_session @@ -17,7 +16,6 @@ __all__ = ['http_download', 'aws_download'] - # adapted from astroquery._download_file. def http_download(url, local_filepath=None, diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index 31dbc810a..a94061153 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -9,7 +9,6 @@ import requests_mock import requests from contextlib import contextmanager -import urllib from astropy.utils.data import get_pkg_data_contents @@ -206,8 +205,9 @@ def test_aws_download__wrong_cache(s3_mock): assert os.path.getsize('somekey.txt') == 10 os.remove('somekey.txt') -## ---------------------- ## -## ---- Remote Tests ---- ## +# ---------------------- +# ---- Remote Tests ---- + @pytest.mark.remote_data def test_http_download__noFile(): @@ -247,6 +247,7 @@ def test__s3_is_accessible_no_bucket_remote(): assert not accessible assert '404' in str(exc) + @pytest.mark.remote_data @pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_no_key_remote(): @@ -269,6 +270,7 @@ def test__s3_is_accessible_yes_remote(): assert accessible assert exc is None + @pytest.mark.remote_data @pytest.mark.skipif('not HAS_MOTO') def test__s3_is_accessible_download_remote(): From 398be70e2ba7c8c15a061fbd92e2b51f1c4f1f24 Mon Sep 17 00:00:00 2001 From: Abdu Zoghbi Date: Tue, 17 Oct 2023 22:36:19 -0400 Subject: [PATCH 10/11] add docs for utils.download --- docs/index.rst | 1 + docs/utils/download.rst | 62 +++++++++++++++++++++++++++++++ pyvo/utils/tests/test_download.py | 12 ------ 3 files changed, 63 insertions(+), 12 deletions(-) create mode 100644 docs/utils/download.rst diff --git a/docs/index.rst b/docs/index.rst index 91262306d..53af141b5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -135,3 +135,4 @@ Using `pyvo` io/index auth/index utils/prototypes + utils/download diff --git a/docs/utils/download.rst b/docs/utils/download.rst new file mode 100644 index 000000000..3cbcf984a --- /dev/null +++ b/docs/utils/download.rst @@ -0,0 +1,62 @@ +.. _pyvo-download: + +************************************************** +Download Utilities (`pyvo.utils.download`) +************************************************** + +The download utilities provide methods for downloading data once a link to it +it obtained. These can be considered an advanced version of `~pyvo.dal.Record.getdataset` that can handle +data from standard on-prem servers as well as cloud data. For now only AWS is supported. + +There two methods with the same call signature: `~pyvo.utils.download.http_download` and `~pyvo.utils.download.aws_download`. The first handles standard links from on-prem servers, while the second downloads data from the `Amazon S3 storage`_. + + +.. _pyvo-download-examples: + +Example Usage +============== +This is an example of downloading data when an URL or URI are available: + +.. doctest-remote-data:: + + >>> from pyvo.utils import aws_download, http_download + >>> data_url = 'https://heasarc.gsfc.nasa.gov/FTP/chandra/data/byobsid/2/3052/primary/acisf03052N004_cntr_img2.jpg' + >>> image_file = http_download(url=data_url) + >>> s3_uri = 's3://nasa-heasarc/chandra/data/byobsid/2/3052/primary/acisf03052N004_cntr_img2.jpg' + >>> image2_file = aws_download(uri=s3_uri) + Downloading chandra/data/byobsid/2/3052/primary/acisf03052N004_cntr_img2.jpg to acisf03052N004_cntr_img2.jpg ... [Done] + + +A bucket name and a key can also be passed to `~pyvo.utils.download.aws_download` instead of an URI: + +.. doctest-remote-data:: + >>> from pyvo.utils import aws_download + >>> s3_key = 'chandra/data/byobsid/2/3052/primary/acisf03052N004_cntr_img2.jpg' + >>> s3_bucket = 'nasa-heasarc' + >>> image2_file = aws_download(bucket_name=s3_bucket, key=s3_key) + Downloading chandra/data/byobsid/2/3052/primary/acisf03052N004_cntr_img2.jpg to acisf03052N004_cntr_img2.jpg ... [Done] + + +If the aws data requires authentication, a credential profile (e.g. ``aws_user`` profile in ``~/.aws/credentials``) can be passed: + +.. doctest-skip:: + >>> image2_file = aws_download(bucket=s3_bucket, key=s3_key, aws_profile='aws_user') + + +A session (instance of ``boto3.session.Session``) can also be passed instead (see detials in `AWS session documentation`_): + +.. doctest-skip:: + >>> s3_session = boto3.session.Session(aws_access_key_id, aws_secret_access_key) + >>> image2_file = aws_download(bucket=s3_bucket, key=s3_key, session=s3_session) + + +.. _pyvo-download-api: + +Reference/API +============= + +.. automodapi:: pyvo.utils.download + + +.. _Amazon S3 storage: https://aws.amazon.com/s3/ +.. _AWS session documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html \ No newline at end of file diff --git a/pyvo/utils/tests/test_download.py b/pyvo/utils/tests/test_download.py index a94061153..915df1640 100644 --- a/pyvo/utils/tests/test_download.py +++ b/pyvo/utils/tests/test_download.py @@ -100,18 +100,6 @@ def test_http_download__wPath(http_mock): os.remove('basic2.xml') -def test_http_download__wCache(http_mock, capsys): - filename1 = http_download('http://example.com/data/basic.xml', - local_filepath=None, cache=False) - assert filename1 == 'basic.xml' - - filename2 = http_download('http://example.com/data/basic.xml', - local_filepath=None, cache=True, verbose=True) - assert filename1 == filename2 - assert 'Found cached file' in capsys.readouterr().out - os.remove('basic.xml') - - def test_http_download__wrong_cache(http_mock): # get the file first with open('basic.xml', 'w') as fp: From ac47c96cf847d687fe4ac2d1c5dbc2b52a700b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Wed, 18 Oct 2023 12:24:51 -0700 Subject: [PATCH 11/11] MAINT: ignore deprecation triggered in dateutil by botocore --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5aa68fb6a..987adc58a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,8 @@ filterwarnings = # Numpy 2.0 deprecations triggered by upstream libraries. # Exact warning messages differ, thus using a super generic filter. ignore:numpy.core:DeprecationWarning - +# botocore triggers dateutil deprecation + ignore:.*. Use timezone-aware objects to represent datetimes in UTC:DeprecationWarning [flake8] max-line-length = 110