From 19490c982b1cc6ac7a05634627c86d3cea7ac4c4 Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 13:40:44 -0400
Subject: [PATCH 1/7] Allow filters kwarg in get_es_metadata

---
 dcicutils/ff_utils.py | 44 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index c4a68b52d..8986fa199 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -348,13 +348,21 @@ def get_es_search_generator(es_client, index, body, page_size=50):
         yield es_hits
 
 
-def get_es_metadata(uuids, es_client=None, key=None, ff_env=None):
+def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
     """
     Given a list of string item uuids, will return a
     dictionary response of the full ES record for those items (or an empty
     dictionary if the items don't exist/ are not indexed)
     You can pass in an Elasticsearch client (initialized by create_es_client)
     through the es_client param to save init time.
+    Advanced users can optionally pass a dict of filters that will be added
+    to the Elasticsearch query.
+        For example: filters={'status': 'released'}
+        You can also specify NOT fields:
+            example: filters={'status': '!released'}
+        You can also specifiy lists of values for fields:
+            example: filters={'status': ['released', '!archived']}
+    NOTE: filters are always combined using AND queries (must all match)
     Same auth mechanism as the other metadata functions
     """
     if es_client is None:
@@ -366,8 +374,38 @@ def get_es_metadata(uuids, es_client=None, key=None, ff_env=None):
     es_res = []
     for i in range(0, len(uuids), 100):
         query_uuids = uuids[i:i + 100]
-        es_query = {'query': {'terms': {'_id': query_uuids}},
-                    'sort': [{'_uid': {'order': 'desc'}}]}
+        es_query = {
+            'query': {
+                'bool': {
+                    'must': [
+                        {'terms': {'_id': query_uuids}}
+                    ],
+                    'must_not': []
+                }
+            },
+            'sort': [{'_uid': {'order': 'desc'}}]
+        }
+        if filters:
+            if not isinstance(filters, dict):
+                print('Invalid filter for get_es_metadata: %s' % filters)
+            else:
+                for k, v in filters.items():
+                    key_terms = []
+                    key_not_terms = []
+                    iter_terms = [v] if not isinstance(v, list) else v
+                    for val in iter_terms:
+                        if val.startswith('!'):
+                            key_not_terms.append(val[1:])
+                        else:
+                            key_terms.append(val)
+                    if key_terms:
+                        es_query['query']['bool']['must'].append(
+                            {'terms': {'embedded.' + k + '.raw': key_terms}}
+                        )
+                    if key_not_terms:
+                        es_query['query']['bool']['must_not'].append(
+                            {'terms': {'embedded.' + k + '.raw': key_not_terms}}
+                        )
         for es_page in get_es_search_generator(es_client, '_all', es_query):
             # return the document source only; eliminate es metadata
             es_res.extend([hit['_source'] for hit in es_page])

From 77d8bbef775ae67c086f76df39d4929c3a628f64 Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 14:29:25 -0400
Subject: [PATCH 2/7] Added a test for get_es_metadata filters. Incremented
 package version

---
 dcicutils/_version.py |  2 +-
 test/test_ff_utils.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/dcicutils/_version.py b/dcicutils/_version.py
index 41db4b01f..c7ccaa718 100644
--- a/dcicutils/_version.py
+++ b/dcicutils/_version.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.5.1"
+__version__ = "0.5.2"
diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
index 114b59ca5..d43f5f954 100644
--- a/test/test_ff_utils.py
+++ b/test/test_ff_utils.py
@@ -382,6 +382,24 @@ def test_get_es_metadata(integrated_ff):
     all_es_uuids = [item['uuid'] for item in all_es]
     assert set(all_es_uuids) == set(all_uuids)
 
+    # make sure filters work with the search
+    bios_in_rev = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=in+review+by+lab',
+                                               key=integrated_ff['ff_key'])
+    bios_replaced = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=replaced',
+                                                 key=integrated_ff['ff_key'])
+    bios_uuids = [item['uuid'] for item in bios_in_rev + bios_replaced]
+    all_uuids.extend(bios_uuids)  # add the replaced biosample uuids
+    filters = {'status': ['in review by lab', 'replaced'], '@type': ['Biosample']}
+    bios_es = ff_utils.get_es_metadata(all_uuids, filters=filters, key=integrated_ff['ff_key'])
+    assert set([item['uuid'] for item in bios_es]) == set(bios_uuids)
+
+    bios_neg_search = '/search/?type=Biosample&frame=object&status=in+review+by+lab&modifications.modification_type!=Other'
+    bios_neg_res = ff_utils.search_metadata(bios_neg_search, key=integrated_ff['ff_key'])
+    filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']}
+    bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key'])
+    assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res)
+
+
 
 @pytest.mark.integrated
 def test_get_es_search_generator(integrated_ff):

From 7361c0574080a619be9a54b01e6e055861aaacce Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 14:38:07 -0400
Subject: [PATCH 3/7] Improved a random test

---
 test/test_ff_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
index d43f5f954..fd46e01a2 100644
--- a/test/test_ff_utils.py
+++ b/test/test_ff_utils.py
@@ -1,5 +1,6 @@
 from dcicutils import ff_utils
 import pytest
+import json
 pytestmark = pytest.mark.working
 
 
@@ -218,6 +219,11 @@ def test_get_metadata(integrated_ff, basestring):
 
     # testing check_queues functionality requires patching
     ff_utils.patch_metadata({'description': 'test description'}, obj_id=test_item, key=integrated_ff['ff_key'])
+    # add a bunch more stuff to the queue
+    idx_body = json.dumps({'uuids': [test_item], 'target_queue': 'secondary'})
+    for i in range(10):
+        ff_utils.authorized_request(integrated_ff['ff_key']['server'] + '/queue_indexing',
+                                    auth=integrated_ff['ff_key'], verb='POST', data=idx_body)
     res_w_check = ff_utils.get_metadata(test_item, key=integrated_ff['ff_key'],
                                         ff_env=integrated_ff['ff_env'], check_queue=True)
     res_db = ff_utils.get_metadata(test_item, key=integrated_ff['ff_key'],

From f2b47c11c9992e3f50cc4e3a590a5fa8ecefa287 Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 14:48:04 -0400
Subject: [PATCH 4/7] Fix flake complaints

---
 test/test_ff_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
index fd46e01a2..9c8f6cd12 100644
--- a/test/test_ff_utils.py
+++ b/test/test_ff_utils.py
@@ -390,23 +390,23 @@ def test_get_es_metadata(integrated_ff):
 
     # make sure filters work with the search
     bios_in_rev = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=in+review+by+lab',
-                                               key=integrated_ff['ff_key'])
+                                           key=integrated_ff['ff_key'])
     bios_replaced = ff_utils.search_metadata('/search/?type=Biosample&frame=object&status=replaced',
-                                                 key=integrated_ff['ff_key'])
+                                             key=integrated_ff['ff_key'])
     bios_uuids = [item['uuid'] for item in bios_in_rev + bios_replaced]
     all_uuids.extend(bios_uuids)  # add the replaced biosample uuids
     filters = {'status': ['in review by lab', 'replaced'], '@type': ['Biosample']}
     bios_es = ff_utils.get_es_metadata(all_uuids, filters=filters, key=integrated_ff['ff_key'])
     assert set([item['uuid'] for item in bios_es]) == set(bios_uuids)
 
-    bios_neg_search = '/search/?type=Biosample&frame=object&status=in+review+by+lab&modifications.modification_type!=Other'
+    bios_neg_search = ('/search/?type=Biosample&frame=object&status=in+review+by+lab'
+                       '&modifications.modification_type!=Other')
     bios_neg_res = ff_utils.search_metadata(bios_neg_search, key=integrated_ff['ff_key'])
     filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']}
     bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key'])
     assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res)
 
 
-
 @pytest.mark.integrated
 def test_get_es_search_generator(integrated_ff):
     from dcicutils import es_utils

From 84e46b7316c29a4facbc7d467125574211dc2e58 Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 17:01:16 -0400
Subject: [PATCH 5/7] Increased chunk_size and page_size associated with
 get_es_metadata and underlying generator

---
 dcicutils/ff_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index 8986fa199..0b0fd93b1 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -331,7 +331,7 @@ def delete_field(obj_id, del_field, key=None, ff_env=None):
     return get_response_json(response)
 
 
-def get_es_search_generator(es_client, index, body, page_size=50):
+def get_es_search_generator(es_client, index, body, page_size=1000):
     """
     Simple generator behind get_es_metada which takes an es_client (from
     es_utils create_es_client), a string index, and a dict query body.
@@ -372,8 +372,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
     # sending in too many uuids in the terms query can crash es; break them up
     # into groups of max size 100
     es_res = []
-    for i in range(0, len(uuids), 100):
-        query_uuids = uuids[i:i + 100]
+    chunk_size = 1000  # number of uuids to batch in one ES terms query
+    for i in range(0, len(uuids), chunk_size):
+        query_uuids = uuids[i:i + chunk_size]
         es_query = {
             'query': {
                 'bool': {

From 1dafabce12b9d23061ebee041a9f494d5acc4bff Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 17:34:35 -0400
Subject: [PATCH 6/7] Allow control of chunk_size in get_es_metadata and pass
 that to page_limit of get_es_search_generator. Also decrease default size for
 mastertest

---
 dcicutils/ff_utils.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index 0b0fd93b1..9be31d43d 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -331,7 +331,7 @@ def delete_field(obj_id, del_field, key=None, ff_env=None):
     return get_response_json(response)
 
 
-def get_es_search_generator(es_client, index, body, page_size=1000):
+def get_es_search_generator(es_client, index, body, page_size=200):
     """
     Simple generator behind get_es_metada which takes an es_client (from
     es_utils create_es_client), a string index, and a dict query body.
@@ -348,7 +348,8 @@ def get_es_search_generator(es_client, index, body, page_size=1000):
         yield es_hits
 
 
-def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
+def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200,
+                    key=None, ff_env=None):
     """
     Given a list of string item uuids, will return a
     dictionary response of the full ES record for those items (or an empty
@@ -363,6 +364,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
         You can also specifiy lists of values for fields:
             example: filters={'status': ['released', '!archived']}
     NOTE: filters are always combined using AND queries (must all match)
+    Int chunk_size may be used to control the number of uuids that are
+    passed to Elasticsearch in each query; setting this too high may cause
+    ES reads to timeout.
     Same auth mechanism as the other metadata functions
     """
     if es_client is None:
@@ -372,7 +376,6 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
     # sending in too many uuids in the terms query can crash es; break them up
     # into groups of max size 100
     es_res = []
-    chunk_size = 1000  # number of uuids to batch in one ES terms query
     for i in range(0, len(uuids), chunk_size):
         query_uuids = uuids[i:i + chunk_size]
         es_query = {
@@ -407,7 +410,9 @@ def get_es_metadata(uuids, es_client=None, filters={}, key=None, ff_env=None):
                         es_query['query']['bool']['must_not'].append(
                             {'terms': {'embedded.' + k + '.raw': key_not_terms}}
                         )
-        for es_page in get_es_search_generator(es_client, '_all', es_query):
+        # use chunk_limit as page size for performance reasons
+        for es_page in get_es_search_generator(es_client, '_all', es_query,
+                                               page_size=chunk_size):
             # return the document source only; eliminate es metadata
             es_res.extend([hit['_source'] for hit in es_page])
     return es_res

From b3ee2b423c8885bec2ebb57bdd59f2154c9a6d3c Mon Sep 17 00:00:00 2001
From: Carl Vitzthum <carl.vitzthum@gmail.com>
Date: Thu, 25 Oct 2018 17:55:49 -0400
Subject: [PATCH 7/7] Made docs more clear

---
 dcicutils/ff_utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index 9be31d43d..6b8507a6b 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -362,9 +362,12 @@ def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200,
         You can also specify NOT fields:
             example: filters={'status': '!released'}
         You can also specifiy lists of values for fields:
-            example: filters={'status': ['released', '!archived']}
-    NOTE: filters are always combined using AND queries (must all match)
-    Int chunk_size may be used to control the number of uuids that are
+            example: filters={'status': ['released', archived']}
+    NOTES:
+        - different filter field are combined using AND queries (must all match)
+            example: filters={'status': ['released'], 'public_release': ['2018-01-01']}
+        - values for the same field and combined with OR (such as multiple statuses)
+    Integer chunk_size may be used to control the number of uuids that are
     passed to Elasticsearch in each query; setting this too high may cause
     ES reads to timeout.
     Same auth mechanism as the other metadata functions