From 4781ad5b70e5a1d93b8cc8cdd5aed6a97542e35d Mon Sep 17 00:00:00 2001 From: Will Ronchetti Date: Thu, 27 Feb 2020 08:13:24 -0500 Subject: [PATCH] C4-24 Give ff_utils a lucene API (#58) * C4-24 first pass at API * C4-24 second pass at API * C4-24 fix flake * C4-24 add test * C4-24 address Kent's points * flake * C4-24 add ability to use generator * QOL change * up test_get_metadata flakiness to 3 --- dcicutils/es_utils.py | 38 +++++++++++++++++++++++++++---- dcicutils/ff_utils.py | 52 +++++++++++++++++++++++++++++++++++++++++++ test/conftest.py | 4 +++- test/test_es_utils.py | 29 ++++++++++++++++++++++++ test/test_ff_utils.py | 37 +++++++++++++++++++++++++++++- 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100644 test/test_es_utils.py diff --git a/dcicutils/es_utils.py b/dcicutils/es_utils.py index e6d630dc0..eb9f57cac 100644 --- a/dcicutils/es_utils.py +++ b/dcicutils/es_utils.py @@ -1,10 +1,11 @@ +import logging from elasticsearch import Elasticsearch, RequestsHttpConnection from aws_requests_auth.boto_utils import BotoAWSRequestsAuth -''' -info about snapshots on AWS -https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/es-managedomains-snapshots.html -''' + +logging.basicConfig() +logger = logging.getLogger('logger') +logger.setLevel(logging.INFO) def create_es_client(es_url, use_aws_auth=True, **options): @@ -48,6 +49,12 @@ def get_index_list(client, name, days_old=0, timestring='%Y.%m.%d', ilo=None): def create_snapshot_repo(client, repo_name, s3_bucket): + """ + Creates a repo to store ES snapshots on + + info about snapshots on AWS + https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/es-managedomains-snapshots.html + """ snapshot_body = {'type': 's3', 'settings': { 'bucket': s3_bucket, @@ -56,3 +63,26 @@ def create_snapshot_repo(client, repo_name, s3_bucket): } } return client.snapshot.create_repository(repository=repo_name, body=snapshot_body) + + +def execute_lucene_query_on_es(client, index, query): + """ + Executes the given lucene query (in dictionary form) + + :arg client: elasticsearch client + :arg index: index to search under + :arg query: dictionary of query + + :returns: result of query or None + """ + try: + raw_result = client.search(body=query, index=index) + except Exception as e: + logger.error('Failed to execute search on index %s with query %s.\n Exception: %s' % (index, query, str(e))) + return None + try: + result = raw_result['hits']['hits'] + return result + except KeyError: + logger.error('Searching index %s with query %s gave no results' % (index, query)) + return None diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index e4358224b..02264d495 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -879,6 +879,58 @@ def get_health_page(key=None, ff_env=None): return ret +class SearchESMetadataHandler(object): + """ + Wrapper class for executing lucene queries directly on ES. + Resolves ES instance location via health page of the given + environment. Requires AWS permissions to use. + Can be used directly but is used through search_es_metadata. + + NOTE: use this class directly if you plan on making multiple subsequent requests + to the same environment. + """ + + def __init__(self, key=None, ff_env=None): + self.health = get_health_page(key, ff_env) # expensive - do not call repeatedly! + self.es_url = self.health['elasticsearch'] + self.client = es_utils.create_es_client(self.es_url) + + def execute_search(self, index, query, is_generator=False, page_size=200): + """ + Executes lucene query on this client's index. + + :arg index: index to search under + :arg query: query to run + :arg is_generator: boolean on whether or not to use a generator + :arg page_size: if using a generator, how many results to give per request + + :returns: list of results of query or None + """ + if not is_generator: + return es_utils.execute_lucene_query_on_es(self.client, index=index, query=query) + return search_result_generator(get_es_search_generator(self.client, index, query, page_size=page_size)) + + +def search_es_metadata(index, query, key=None, ff_env=None, is_generator=False): + """ + Executes a lucene search query on on the ES Instance for this + environment. + + NOTE: It is okay to use this function directly but for repeat usage please use + SearchESMetadataHandler as it caches an expensive API request to AWS + + :arg index: index to search under + :arg query: dictionary of query + :arg key: optional, 2-tuple authentication key (access_key_id, secret) + :arg ff_env: ff_env to use + :arg is_generator: boolean on whether or not to use a generator + + :returns: list of results of query or None + """ + search_handler = SearchESMetadataHandler(key, ff_env) + return search_handler.execute_search(index, query, is_generator) + + ##################### # Utility functions # ##################### diff --git a/test/conftest.py b/test/conftest.py index 7be818c57..e9997072d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,8 +4,9 @@ from dcicutils.s3_utils import s3Utils from dcicutils.ff_utils import authorized_request -# this is the ff_env we use for integrated tests +# XXX: Refactor to config INTEGRATED_ENV = 'fourfront-mastertest' +INTEGRATED_ES = 'https://search-fourfront-mastertest-wusehbixktyxtbagz5wzefffp4.us-east-1.es.amazonaws.com' @pytest.fixture(scope='session') @@ -27,6 +28,7 @@ def integrated_ff(): integrated['ff_key'] = s3.get_access_keys() integrated['higlass_key'] = s3.get_higlass_key() integrated['ff_env'] = INTEGRATED_ENV + integrated['es_url'] = INTEGRATED_ES # do this to make sure env is up (will error if not) res = authorized_request(integrated['ff_key']['server'], auth=integrated['ff_key']) if res.status_code != 200: diff --git a/test/test_es_utils.py b/test/test_es_utils.py new file mode 100644 index 000000000..09c93f508 --- /dev/null +++ b/test/test_es_utils.py @@ -0,0 +1,29 @@ +import pytest + +from dcicutils.es_utils import create_es_client, execute_lucene_query_on_es + + +@pytest.fixture +def es_client_fixture(integrated_ff): + """ Fixture that creates an es client to mastertest """ + return create_es_client(integrated_ff['es_url']) + + +@pytest.mark.integrated +def test_lucene_query_basic(es_client_fixture): + """ Tests basic lucene queries via the underlying endpoint on mastertest """ + results = execute_lucene_query_on_es(client=es_client_fixture, index='fourfront-mastertestuser', query={}) + assert len(results) == 10 + test_query = { + 'query': { + 'bool': { + 'must': [ # search for will's user insert + {'terms': {'_id': ['1a12362f-4eb6-4a9c-8173-776667226988']}} + ], + 'must_not': [] + } + }, + 'sort': [{'_uid': {'order': 'desc'}}] + } + results = execute_lucene_query_on_es(client=es_client_fixture, index='fourfront-mastertestuser', query=test_query) + assert len(results) == 1 \ No newline at end of file diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 8059d30e4..4eb88d0b8 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -215,7 +215,7 @@ def test_authorized_request_integrated(integrated_ff): @pytest.mark.integrated -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3) # very flaky for some reason def test_get_metadata(integrated_ff, basestring): # use this test biosource test_item = '331111bc-8535-4448-903e-854af460b254' @@ -931,6 +931,41 @@ def clear_folder(folder): clear_folder(test_folder) +@pytest.mark.integrated +def test_search_es_metadata(integrated_ff): + """ Tests search_es_metadata on mastertest """ + res = ff_utils.search_es_metadata('fourfront-mastertestuser', {'size': '1000'}, + key=integrated_ff['ff_key'], ff_env=integrated_ff['ff_env']) + assert len(res) == 27 + test_query = { + 'query': { + 'bool': { + 'must': [ # search for will's user insert + {'terms': {'_id': ['1a12362f-4eb6-4a9c-8173-776667226988']}} + ], + 'must_not': [] + } + }, + 'sort': [{'_uid': {'order': 'desc'}}] + } + res = ff_utils.search_es_metadata('fourfront-mastertestuser', test_query, + key=integrated_ff['ff_key'], ff_env=integrated_ff['ff_env']) + assert len(res) == 1 + + +@pytest.mark.integrated +def test_search_es_metadata_generator(integrated_ff): + """ Tests SearchESMetadataHandler both normally and with a generator, verifies consistent results """ + handler = ff_utils.SearchESMetadataHandler(key=integrated_ff['ff_key'], ff_env=integrated_ff['ff_env']) + no_gen_res = ff_utils.search_es_metadata('fourfront-mastertestuser', {'size': '1000'}, + key=integrated_ff['ff_key'], ff_env=integrated_ff['ff_env']) + res = handler.execute_search('fourfront-mastertestuser', {'size': '1000'}, is_generator=True, page_size=5) + count = 0 + for _ in res: + count += 1 + assert count == len(no_gen_res) + + def test_convert_param(): """ Very basic test that illustrates what convert_param should do """ params = {'param1': 5}