From 19490c982b1cc6ac7a05634627c86d3cea7ac4c4 Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 13:40:44 -0400 Subject: [PATCH 1/7] Allow filters kwarg in get_es_metadata --- dcicutils/ff_utils.py | 44 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index c4a68b52d..8986fa199 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -348,13 +348,21 @@ def get_es_search_generator(es_client, index, body, page_size=50): yield es_hits -def get_es_metadata(uuids, es_client=None, key=None, ff_env=None): +def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): """ Given a list of string item uuids, will return a dictionary response of the full ES record for those items (or an empty dictionary if the items don't exist/ are not indexed) You can pass in an Elasticsearch client (initialized by create_es_client) through the es_client param to save init time. + Advanced users can optionally pass a dict of filters that will be added + to the Elasticsearch query. + For example: filters={'status': 'released'} + You can also specify NOT fields: + example: filters={'status': '!released'} + You can also specifiy lists of values for fields: + example: filters={'status': ['released', '!archived']} + NOTE: filters are always combined using AND queries (must all match) Same auth mechanism as the other metadata functions """ if es_client is None: @@ -366,8 +374,38 @@ def get_es_metadata(uuids, es_client=None, key=None, ff_env=None): es_res = [] for i in range(0, len(uuids), 100): query_uuids = uuids[i:i + 100] - es_query = {'query': {'terms': {'_id': query_uuids}}, - 'sort': [{'_uid': {'order': 'desc'}}]} + es_query = { + 'query': { + 'bool': { + 'must': [ + {'terms': {'_id': query_uuids}} + ], + 'must_not': [] + } + }, + 'sort': [{'_uid': {'order': 'desc'}}] + } + if filters: + if not isinstance(filters, dict): + print('Invalid filter for get_es_metadata: %s' % filters) + else: + for k, v in filters.items(): + key_terms = [] + key_not_terms = [] + iter_terms = [v] if not isinstance(v, list) else v + for val in iter_terms: + if val.startswith('!'): + key_not_terms.append(val[1:]) + else: + key_terms.append(val) + if key_terms: + es_query['query']['bool']['must'].append( + {'terms': {'embedded.' + k + '.raw': key_terms}} + ) + if key_not_terms: + es_query['query']['bool']['must_not'].append( + {'terms': {'embedded.' + k + '.raw': key_not_terms}} + ) for es_page in get_es_search_generator(es_client, '_all', es_query): # return the document source only; eliminate es metadata es_res.extend([hit['_source'] for hit in es_page]) From 77d8bbef775ae67c086f76df39d4929c3a628f64 Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 14:29:25 -0400 Subject: [PATCH 2/7] Added a test for get_es_metadata filters. Incremented package version --- dcicutils/_version.py | 2 +- test/test_ff_utils.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/dcicutils/_version.py b/dcicutils/_version.py index 41db4b01f..c7ccaa718 100644 --- a/dcicutils/_version.py +++ b/dcicutils/_version.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.5.1" +__version__ = "0.5.2" diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 114b59ca5..d43f5f954 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -382,6 +382,24 @@ def test_get_es_metadata(integrated_ff): all_es_uuids = [item['uuid'] for item in all_es] assert set(all_es_uuids) == set(all_uuids) + # make sure filters work with the search + bios_in_rev = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=in+review+by+lab', + key=integrated_ff['ff_key']) + bios_replaced = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=replaced', + key=integrated_ff['ff_key']) + bios_uuids = [item['uuid'] for item in bios_in_rev + bios_replaced] + all_uuids.extend(bios_uuids) # add the replaced biosample uuids + filters = {'status': ['in review by lab', 'replaced'], '@type': ['Biosample']} + bios_es = ff_utils.get_es_metadata(all_uuids, filters=filters, key=integrated_ff['ff_key']) + assert set([item['uuid'] for item in bios_es]) == set(bios_uuids) + + bios_neg_search = '/search/?type=Biosample&frame=object&status=in+review+by+lab&modifications.modification_type!=Other' + bios_neg_res = ff_utils.search_metadata(bios_neg_search, key=integrated_ff['ff_key']) + filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']} + bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key']) + assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res) + + @pytest.mark.integrated def test_get_es_search_generator(integrated_ff): From 7361c0574080a619be9a54b01e6e055861aaacce Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 14:38:07 -0400 Subject: [PATCH 3/7] Improved a random test --- test/test_ff_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index d43f5f954..fd46e01a2 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1,5 +1,6 @@ from dcicutils import ff_utils import pytest +import json pytestmark = pytest.mark.working @@ -218,6 +219,11 @@ def test_get_metadata(integrated_ff, basestring): # testing check_queues functionality requires patching ff_utils.patch_metadata({'description': 'test description'}, obj_id=test_item, key=integrated_ff['ff_key']) + # add a bunch more stuff to the queue + idx_body = json.dumps({'uuids': [test_item], 'target_queue': 'secondary'}) + for i in range(10): + ff_utils.authorized_request(integrated_ff['ff_key']['server'] + '/queue_indexing', + auth=integrated_ff['ff_key'], verb='POST', data=idx_body) res_w_check = ff_utils.get_metadata(test_item, key=integrated_ff['ff_key'], ff_env=integrated_ff['ff_env'], check_queue=True) res_db = ff_utils.get_metadata(test_item, key=integrated_ff['ff_key'], From f2b47c11c9992e3f50cc4e3a590a5fa8ecefa287 Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 14:48:04 -0400 Subject: [PATCH 4/7] Fix flake complaints --- test/test_ff_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index fd46e01a2..9c8f6cd12 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -390,23 +390,23 @@ def test_get_es_metadata(integrated_ff): # make sure filters work with the search bios_in_rev = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=in+review+by+lab', - key=integrated_ff['ff_key']) + key=integrated_ff['ff_key']) bios_replaced = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=replaced', - key=integrated_ff['ff_key']) + key=integrated_ff['ff_key']) bios_uuids = [item['uuid'] for item in bios_in_rev + bios_replaced] all_uuids.extend(bios_uuids) # add the replaced biosample uuids filters = {'status': ['in review by lab', 'replaced'], '@type': ['Biosample']} bios_es = ff_utils.get_es_metadata(all_uuids, filters=filters, key=integrated_ff['ff_key']) assert set([item['uuid'] for item in bios_es]) == set(bios_uuids) - bios_neg_search = '/search/?type=Biosample&frame=object&status=in+review+by+lab&modifications.modification_type!=Other' + bios_neg_search = ('/search/?type=Biosample&frame=object&status=in+review+by+lab' + '&modifications.modification_type!=Other') bios_neg_res = ff_utils.search_metadata(bios_neg_search, key=integrated_ff['ff_key']) filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']} bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key']) assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res) - @pytest.mark.integrated def test_get_es_search_generator(integrated_ff): from dcicutils import es_utils From 84e46b7316c29a4facbc7d467125574211dc2e58 Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 17:01:16 -0400 Subject: [PATCH 5/7] Increased chunk_size and page_size associated with get_es_metadata and underlying generator --- dcicutils/ff_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 8986fa199..0b0fd93b1 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -331,7 +331,7 @@ def delete_field(obj_id, del_field, key=None, ff_env=None): return get_response_json(response) -def get_es_search_generator(es_client, index, body, page_size=50): +def get_es_search_generator(es_client, index, body, page_size=1000): """ Simple generator behind get_es_metada which takes an es_client (from es_utils create_es_client), a string index, and a dict query body. @@ -372,8 +372,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): # sending in too many uuids in the terms query can crash es; break them up # into groups of max size 100 es_res = [] - for i in range(0, len(uuids), 100): - query_uuids = uuids[i:i + 100] + chunk_size = 1000 # number of uuids to batch in one ES terms query + for i in range(0, len(uuids), chunk_size): + query_uuids = uuids[i:i + chunk_size] es_query = { 'query': { 'bool': { From 1dafabce12b9d23061ebee041a9f494d5acc4bff Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 17:34:35 -0400 Subject: [PATCH 6/7] Allow control of chunk_size in get_es_metadata and pass that to page_limit of get_es_search_generator. Also decrease default size for mastertest --- dcicutils/ff_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 0b0fd93b1..9be31d43d 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -331,7 +331,7 @@ def delete_field(obj_id, del_field, key=None, ff_env=None): return get_response_json(response) -def get_es_search_generator(es_client, index, body, page_size=1000): +def get_es_search_generator(es_client, index, body, page_size=200): """ Simple generator behind get_es_metada which takes an es_client (from es_utils create_es_client), a string index, and a dict query body. @@ -348,7 +348,8 @@ def get_es_search_generator(es_client, index, body, page_size=1000): yield es_hits -def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): +def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200, + key=None, ff_env=None): """ Given a list of string item uuids, will return a dictionary response of the full ES record for those items (or an empty @@ -363,6 +364,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): You can also specifiy lists of values for fields: example: filters={'status': ['released', '!archived']} NOTE: filters are always combined using AND queries (must all match) + Int chunk_size may be used to control the number of uuids that are + passed to Elasticsearch in each query; setting this too high may cause + ES reads to timeout. Same auth mechanism as the other metadata functions """ if es_client is None: @@ -372,7 +376,6 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): # sending in too many uuids in the terms query can crash es; break them up # into groups of max size 100 es_res = [] - chunk_size = 1000 # number of uuids to batch in one ES terms query for i in range(0, len(uuids), chunk_size): query_uuids = uuids[i:i + chunk_size] es_query = { @@ -407,7 +410,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None): es_query['query']['bool']['must_not'].append( {'terms': {'embedded.' + k + '.raw': key_not_terms}} ) - for es_page in get_es_search_generator(es_client, '_all', es_query): + # use chunk_limit as page size for performance reasons + for es_page in get_es_search_generator(es_client, '_all', es_query, + page_size=chunk_size): # return the document source only; eliminate es metadata es_res.extend([hit['_source'] for hit in es_page]) return es_res From b3ee2b423c8885bec2ebb57bdd59f2154c9a6d3c Mon Sep 17 00:00:00 2001 From: Carl Vitzthum Date: Thu, 25 Oct 2018 17:55:49 -0400 Subject: [PATCH 7/7] Made docs more clear --- dcicutils/ff_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 9be31d43d..6b8507a6b 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -362,9 +362,12 @@ def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200, You can also specify NOT fields: example: filters={'status': '!released'} You can also specifiy lists of values for fields: - example: filters={'status': ['released', '!archived']} - NOTE: filters are always combined using AND queries (must all match) - Int chunk_size may be used to control the number of uuids that are + example: filters={'status': ['released', archived']} + NOTES: + - different filter field are combined using AND queries (must all match) + example: filters={'status': ['released'], 'public_release': ['2018-01-01']} + - values for the same field and combined with OR (such as multiple statuses) + Integer chunk_size may be used to control the number of uuids that are passed to Elasticsearch in each query; setting this too high may cause ES reads to timeout. Same auth mechanism as the other metadata functions