From 42953191074ab310aa8572dac100650936026eb6 Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Thu, 13 Jun 2024 12:03:46 -0700 Subject: [PATCH 1/6] chore: unfreeze dev (#7186) --- .github/workflows/build-images-and-create-deployment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-images-and-create-deployment.yml b/.github/workflows/build-images-and-create-deployment.yml index 9a3ae1a7cffc8..fbe35ab94cdd7 100644 --- a/.github/workflows/build-images-and-create-deployment.yml +++ b/.github/workflows/build-images-and-create-deployment.yml @@ -45,7 +45,7 @@ jobs: uses: avakar/create-deployment@v1 # To stop deployment to a specific DEPLOYMENT_STAGE remove it from condition below. # The DEPLOYMENT_STAGE that should be present are dev, stage, prod. - if: env.DEPLOYMENT_STAGE == 'prod' || env.DEPLOYMENT_STAGE == 'stage' + if: env.DEPLOYMENT_STAGE == 'prod' || env.DEPLOYMENT_STAGE == 'stage' || env.DEPLOYMENT_STAGE == 'dev' with: auto_merge: false environment: ${{ env.DEPLOYMENT_STAGE }} From ed8cd5e55924ba8bca13a187542b9fa73f8a563b Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Fri, 14 Jun 2024 15:14:23 -0400 Subject: [PATCH 2/6] feat: remove auto_publish from migration (#7156) --- .../modules/schema_migration/main.tf | 23 ++--- backend/layers/processing/schema_migration.py | 85 +++++++---------- .../test_collection_migrate.py | 91 +++++++------------ .../test_gather_collections.py | 23 ++--- ...anup.py => test_log_errors_and_cleanup.py} | 55 +++-------- 5 files changed, 91 insertions(+), 186 deletions(-) rename tests/unit/processing/schema_migration/{test_publish_and_cleanup.py => test_log_errors_and_cleanup.py} (71%) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index aa15694b5c9b5..c10f3904556ea 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -180,7 +180,6 @@ resource aws_sfn_state_machine sfn_schema_migration { "Next": "ApplyDefaults", "ResultPath": "$.inputDefaults", "Parameters": { - "auto_publish": "False", "limit_migration": "0" } }, @@ -221,10 +220,6 @@ resource aws_sfn_state_machine sfn_schema_migration { "Name": "EXECUTION_ID", "Value.$": "$$.Execution.Name" }, - { - "Name": "AUTO_PUBLISH", - "Value.$": "$.auto_publish" - }, { "Name": "LIMIT_MIGRATION", "Value.$": "$.limit_migration" @@ -286,10 +281,6 @@ resource aws_sfn_state_machine sfn_schema_migration { "Name": "COLLECTION_VERSION_ID", "Value.$": "$.collection_version_id" }, - { - "Name": "CAN_PUBLISH", - "Value.$": "$.can_publish" - }, { "Name": "TASK_TOKEN", "Value.$": "$$.Task.Token" @@ -314,7 +305,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "ResultPath": null, - "Next": "CollectionPublish" + "Next": "CollectionCleanup" } ] }, @@ -324,17 +315,17 @@ resource aws_sfn_state_machine sfn_schema_migration { { "Variable": "$.key_name", "IsPresent": false, - "Next": "CollectionPublish" + "Next": "CollectionCleanup" } ], "Default": "SpanDatasets" }, - "CollectionPublish": { + "CollectionCleanup": { "Type": "Task", "Resource": "arn:aws:states:::batch:submitJob.sync", "Parameters": { "JobDefinition": "${resource.aws_batch_job_definition.schema_migrations.arn}", - "JobName": "Collection_publish", + "JobName": "Collection_cleanup", "JobQueue": "${var.job_queue_arn}", "Timeout": { "AttemptDurationSeconds": 600 @@ -343,7 +334,7 @@ 
resource aws_sfn_state_machine sfn_schema_migration { "Environment": [ { "Name": "STEP_NAME", - "Value": "collection_publish" + "Value": "collection_cleanup" }, { "Name": "MIGRATE", @@ -497,14 +488,14 @@ resource aws_sfn_state_machine sfn_schema_migration { "Key.$": "$.key_name" } }, - "Next": "CollectionPublish", + "Next": "CollectionCleanup", "MaxConcurrency": 10, "Catch": [ { "ErrorEquals": [ "States.ALL" ], - "Next": "CollectionPublish", + "Next": "CollectionCleanup", "ResultPath": null } ], diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 5ac14a516580c..edfddc8dcb89d 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -43,53 +43,35 @@ def fetch_collections(self) -> Iterable[CollectionVersion]: unpublished_collections = [*self.business_logic.get_collections(CollectionQueryFilter(is_published=False))] return itertools.chain(unpublished_collections, published_collections) - def gather_collections(self, auto_publish: bool) -> Tuple[Dict[str, str], Dict[str, str]]: + def gather_collections(self) -> Tuple[Dict[str, str], Dict[str, str]]: """ This function is used to gather all the collections and their datasets that will be migrated A json file is created and uploaded to S3 with the list of collections and datasets that will be migrated. It has the following structure: [ - {"can_publish": "true", "collection_id": "", "collection_version_id": - ""}, - {"can_publish": "false", "collection_id": "", "collection_version_id": - ""} + {"collection_id": "", "collection_version_id": ""}, + {"collection_id": "", "collection_version_id": ""} ... ] - :param auto_publish: bool - if False, coerce can_publish to False for all collections. if True, determine - can_publish on collection-by-collection basis based on business logic - :return: the response retuned to the step function and the list of collections to be migrated + :return: the response returned to the step function and the list of collections to be migrated """ response_for_span_collections = [] has_migration_revision = set() # iterates over unpublished collections first, so published versions are skipped if there is an active revision for collection in self.fetch_collections(): - _resp = {} if collection.is_published() and collection.collection_id.id in has_migration_revision: continue - if collection.is_published(): - # published collection without an active revision - _resp["can_publish"] = str(True) - elif collection.is_unpublished_version(): - # active revision of a published collection. - _resp["can_publish"] = str(False) - if collection.is_auto_version: - has_migration_revision.add(collection.collection_id.id) # migration revision found, skip published - _resp["can_publish"] = str(True) - elif collection.is_initial_unpublished_version(): - # unpublished collection - _resp["can_publish"] = str(False) - - if not auto_publish: - # auto_publish is off for this migration, overwrite "can_publish" as false in all cases. 
-                _resp["can_publish"] = str(False)
-            _resp.update(
-                collection_id=collection.collection_id.id,
-                collection_version_id=collection.version_id.id,
-                execution_id=self.execution_id,
-            )
+            if collection.is_auto_version:
+                has_migration_revision.add(collection.collection_id.id)  # migration revision found, skip published
+
+            _resp = {
+                "collection_id": collection.collection_id.id,
+                "collection_version_id": collection.version_id.id,
+                "execution_id": self.execution_id,
+            }
             response_for_span_collections.append(_resp)

         # For testing purposes, only migrate a randomly sampled subset of the collections gathered
@@ -139,16 +121,17 @@ def dataset_migrate(
         }

     def collection_migrate(
-        self, collection_id: str, collection_version_id: str, can_publish: bool
+        self,
+        collection_id: str,
+        collection_version_id: str,
     ) -> Tuple[Dict[str, str], Dict[str, str], List[Dict[str, str]]]:
         """
         This function is used to migrate a collection and its datasets to the latest schema version.

         :param collection_id: the canonical collection id
         :param collection_version_id: the collection version to migrate
-        :param can_publish: if True, the collection will be published after migration
-        :return: the response retuned to the step function, the response for the publish_and_cleanup step function, and
-        the list of datasets to be migrated
+        :return: the response returned to the step function, the response for the log_errors_and_cleanup step function,
+        and the list of datasets to be migrated
         """
         # Get datasets from collection
         version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id))
@@ -167,9 +150,7 @@ def collection_migrate(
                 "All datasets in the collection have been migrated", extra={"dataset_count": len(version.datasets)}
             )
             response_for_dataset_migrate = []
-            response_for_publish_and_cleanup = {
-                "can_publish": str(False),  # skip publishing, because the collection is already published and no
-                # revision is created, or the collection is private or a revision.
+ response_for_log_errors_and_cleanup = { "collection_version_id": collection_version_id, } response_for_sfn = {"collection_version_id": collection_version_id} @@ -195,30 +176,31 @@ def collection_migrate( if dataset.status.processing_status == DatasetProcessingStatus.SUCCESS # Filter out datasets that are not successfully processed ] - response_for_publish_and_cleanup = { - "can_publish": str(can_publish), + response_for_log_errors_and_cleanup = { "collection_version_id": private_collection_version_id, } response_for_sfn = {"collection_version_id": private_collection_version_id} - response_for_publish_and_cleanup["datasets"] = response_for_dataset_migrate - response_for_publish_and_cleanup["collection_url"] = collection_url + response_for_log_errors_and_cleanup["datasets"] = response_for_dataset_migrate + response_for_log_errors_and_cleanup["collection_url"] = collection_url response_for_sfn["execution_id"] = self.execution_id - self._store_sfn_response("publish_and_cleanup", version.collection_id.id, response_for_publish_and_cleanup) + self._store_sfn_response( + "log_errors_and_cleanup", version.collection_id.id, response_for_log_errors_and_cleanup + ) if response_for_dataset_migrate: key_name = self._store_sfn_response("span_datasets", version.collection_id.id, response_for_dataset_migrate) response_for_sfn["key_name"] = key_name - return (response_for_sfn, response_for_publish_and_cleanup, response_for_dataset_migrate) + return (response_for_sfn, response_for_log_errors_and_cleanup, response_for_dataset_migrate) - def publish_and_cleanup(self, collection_version_id: str) -> list: + def log_errors_and_cleanup(self, collection_version_id: str) -> list: errors = [] collection_version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id)) object_keys_to_delete = [] # Get the datasets that were processed - extra_info = self._retrieve_sfn_response("publish_and_cleanup", collection_version.collection_id.id) + extra_info = self._retrieve_sfn_response("log_errors_and_cleanup", collection_version.collection_id.id) processed_datasets = {d["dataset_id"]: d["dataset_version_id"] for d in extra_info["datasets"]} # Process datasets errors @@ -275,8 +257,6 @@ def publish_and_cleanup(self, collection_version_id: str) -> list: self.s3_provider.delete_files(self.artifact_bucket, object_keys_to_delete) if errors: self._store_sfn_response("report/errors", collection_version_id, errors) - elif extra_info["can_publish"].lower() == "true": - self.business_logic.publish_collection_version(collection_version.version_id) return errors def _store_sfn_response(self, directory: str, file_name: str, response: Dict[str, str]): @@ -379,17 +359,14 @@ def migrate(self, step_name) -> bool: self.logger.info(f"Starting {step_name}", extra={"step": step_name}) if step_name == "gather_collections": gather_collections = self.error_wrapper(self.gather_collections, "gather_collections") - auto_publish = os.environ["AUTO_PUBLISH"].lower() == "true" - response, _ = gather_collections(auto_publish) + response, _ = gather_collections() elif step_name == "collection_migrate": collection_id = os.environ["COLLECTION_ID"] collection_version_id = os.environ["COLLECTION_VERSION_ID"] - can_publish = os.environ["CAN_PUBLISH"].lower() == "true" collection_migrate = self.error_wrapper(self.collection_migrate, collection_id) response, _, _ = collection_migrate( collection_id=collection_id, collection_version_id=collection_version_id, - can_publish=can_publish, ) elif step_name == "dataset_migrate": 
collection_version_id = os.environ["COLLECTION_VERSION_ID"] @@ -403,10 +380,10 @@ def migrate(self, step_name) -> bool: dataset_id=dataset_id, dataset_version_id=dataset_version_id, ) - elif step_name == "collection_publish": + elif step_name == "collection_cleanup": collection_version_id = os.environ["COLLECTION_VERSION_ID"] - publish_and_cleanup = self.error_wrapper(self.publish_and_cleanup, collection_version_id) - response = publish_and_cleanup(collection_version_id=collection_version_id) + log_errors_and_cleanup = self.error_wrapper(self.log_errors_and_cleanup, collection_version_id) + response = log_errors_and_cleanup(collection_version_id=collection_version_id) elif step_name == "report": response = self.report() self.logger.info("output", extra={"response": response}) diff --git a/tests/unit/processing/schema_migration/test_collection_migrate.py b/tests/unit/processing/schema_migration/test_collection_migrate.py index ac6d7d3be21d7..ed45ad3e5da08 100644 --- a/tests/unit/processing/schema_migration/test_collection_migrate.py +++ b/tests/unit/processing/schema_migration/test_collection_migrate.py @@ -4,7 +4,7 @@ class TestCollectionMigrate: - def test_can_publish_true(self, schema_migrate_and_collections): + def test_migrate_published_collection(self, schema_migrate_and_collections): schema_migrate, collections = schema_migrate_and_collections schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) schema_migrate.schema_version = "0.0.0" @@ -20,18 +20,19 @@ def test_can_publish_true(self, schema_migrate_and_collections): } for dataset in published.datasets ] - response, response_for_publish_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( - published.collection_id.id, published.version_id.id, True + response, response_for_log_errors_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( + published.collection_id.id, + published.version_id.id, ) schema_migrate._store_sfn_response.assert_any_call( - "publish_and_cleanup", published.collection_id.id, response_for_publish_and_cleanup + "log_errors_and_cleanup", published.collection_id.id, response_for_log_errors_and_cleanup ) schema_migrate._store_sfn_response.assert_any_call( "span_datasets", published.collection_id.id, response_for_span_datasets ) - assert response_for_publish_and_cleanup["collection_version_id"] == collection_version_id.id + assert response_for_log_errors_and_cleanup["collection_version_id"] == collection_version_id.id assert ( - response_for_publish_and_cleanup["collection_url"] + response_for_log_errors_and_cleanup["collection_url"] == f"https://collections_domain/collections/{published.collection_id.id}" ) assert "key_name" in response @@ -40,7 +41,7 @@ def test_can_publish_true(self, schema_migrate_and_collections): assert response_for_span_datasets[i].pop("collection_id") == published.collection_id.id assert response_for_span_datasets[i] == datasets[i] - def test_can_publish_false(self, schema_migrate_and_collections): + def test_migrate_private_collection(self, schema_migrate_and_collections): schema_migrate, collections = schema_migrate_and_collections schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) schema_migrate.schema_version = "0.0.0" @@ -54,20 +55,21 @@ def test_can_publish_false(self, schema_migrate_and_collections): } for dataset in private.datasets ] - response, response_for_publish_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( - private.collection_id.id, 
private.version_id.id, False + response, response_for_log_errors_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( + private.collection_id.id, + private.version_id.id, ) schema_migrate._store_sfn_response.assert_any_call( - "publish_and_cleanup", private.collection_id.id, response_for_publish_and_cleanup + "log_errors_and_cleanup", private.collection_id.id, response_for_log_errors_and_cleanup ) schema_migrate._store_sfn_response.assert_any_call( "span_datasets", private.collection_id.id, response_for_span_datasets ) - # verify response_for_publish_and_cleanup - assert response_for_publish_and_cleanup["collection_version_id"] == private.version_id.id + # verify response_for_log_errors_and_cleanup + assert response_for_log_errors_and_cleanup["collection_version_id"] == private.version_id.id assert ( - response_for_publish_and_cleanup["collection_url"] + response_for_log_errors_and_cleanup["collection_url"] == f"https://collections_domain/collections/{private.collection_id.id}" ) @@ -82,50 +84,24 @@ def test_can_publish_false(self, schema_migrate_and_collections): assert response_for_span_datasets[i].pop("collection_id") == private.collection_id.id assert response_for_span_datasets[i] == datasets[i] - def test_can_publish_false_and_no_datasets(self, schema_migrate_and_collections): + def test_filter_schema_version(self, schema_migrate_and_collections): schema_migrate, collections = schema_migrate_and_collections schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) - schema_migrate.schema_version = "0.0.0" published = collections["published"][0] - published.datasets = [] schema_migrate.business_logic.create_collection_version.return_value = Mock(version_id=CollectionVersionId()) - response, response_for_publish_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( - published.collection_id.id, published.version_id.id, False + response, response_for_log_errors_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( + published.collection_id.id, + published.version_id.id, ) schema_migrate._store_sfn_response.assert_called_once_with( - "publish_and_cleanup", published.collection_id.id, response_for_publish_and_cleanup - ) - - # verify response_for_publish_and_cleanup - assert response_for_publish_and_cleanup["collection_version_id"] == published.version_id.id - assert ( - response_for_publish_and_cleanup["collection_url"] - == f"https://collections_domain/collections/{published.collection_id.id}" + "log_errors_and_cleanup", published.collection_id.id, response_for_log_errors_and_cleanup ) - # verify response_for_span_datasets - assert not response_for_span_datasets - - # verify response - assert "key_name" not in response - assert response["collection_version_id"] == published.version_id.id - assert response["execution_id"] == "test-execution-arn" - def test_can_publish_true_and_filtered_schema_version(self, schema_migrate_and_collections): - schema_migrate, collections = schema_migrate_and_collections - schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) - published = collections["published"][0] - schema_migrate.business_logic.create_collection_version.return_value = Mock(version_id=CollectionVersionId()) - response, response_for_publish_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( - published.collection_id.id, published.version_id.id, False - ) - schema_migrate._store_sfn_response.assert_called_once_with( - "publish_and_cleanup", 
published.collection_id.id, response_for_publish_and_cleanup - ) + # verify response_for_log_errors_and_cleanup - # verify response_for_publish_and_cleanup - assert response_for_publish_and_cleanup["collection_version_id"] == published.version_id.id + assert response_for_log_errors_and_cleanup["collection_version_id"] == published.version_id.id assert ( - response_for_publish_and_cleanup["collection_url"] + response_for_log_errors_and_cleanup["collection_url"] == f"https://collections_domain/collections/{published.collection_id.id}" ) @@ -143,17 +119,18 @@ def test_no_datasets(self, schema_migrate_and_collections): published = collections["published"][0] published.datasets = [] schema_migrate.business_logic.create_collection_version.return_value = Mock(version_id=CollectionVersionId()) - response, response_for_publish_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( - published.collection_id.id, published.version_id.id, False + response, response_for_log_errors_and_cleanup, response_for_span_datasets = schema_migrate.collection_migrate( + published.collection_id.id, + published.version_id.id, ) schema_migrate._store_sfn_response.assert_called_once_with( - "publish_and_cleanup", published.collection_id.id, response_for_publish_and_cleanup + "log_errors_and_cleanup", published.collection_id.id, response_for_log_errors_and_cleanup ) - # verify response_for_publish_and_cleanup - assert response_for_publish_and_cleanup["collection_version_id"] == published.version_id.id + # verify response_for_log_errors_and_cleanup + assert response_for_log_errors_and_cleanup["collection_version_id"] == published.version_id.id assert ( - response_for_publish_and_cleanup["collection_url"] + response_for_log_errors_and_cleanup["collection_url"] == f"https://collections_domain/collections/{published.collection_id.id}" ) @@ -175,7 +152,7 @@ def test_create_migration_revision__private(self, schema_migrate_and_collections ) # only call create_collection_version if the collection is published - schema_migrate.collection_migrate(private.collection_id.id, private.version_id.id, False) + schema_migrate.collection_migrate(private.collection_id.id, private.version_id.id) schema_migrate.business_logic.create_collection_version.assert_not_called() def test_create_migration_revision__published_with_revision(self, schema_migrate_and_collections): @@ -187,10 +164,10 @@ def test_create_migration_revision__published_with_revision(self, schema_migrate return_value=Mock(version_id=CollectionVersionId()) ) - schema_migrate.collection_migrate(revision.collection_id.id, revision.version_id.id, False) + schema_migrate.collection_migrate(revision.collection_id.id, revision.version_id.id) schema_migrate.business_logic.create_collection_version.assert_not_called() - schema_migrate.collection_migrate(published.collection_id.id, published.version_id.id, False) + schema_migrate.collection_migrate(published.collection_id.id, published.version_id.id) schema_migrate.business_logic.create_collection_version.assert_called_once() def test_create_migration_revision__published_no_revision(self, schema_migrate_and_collections): @@ -203,5 +180,5 @@ def test_create_migration_revision__published_no_revision(self, schema_migrate_a return_value=Mock(version_id=CollectionVersionId()) ) - schema_migrate.collection_migrate(published.collection_id.id, published.version_id.id, False) + schema_migrate.collection_migrate(published.collection_id.id, published.version_id.id) 
schema_migrate.business_logic.create_collection_version.assert_called_once() diff --git a/tests/unit/processing/schema_migration/test_gather_collections.py b/tests/unit/processing/schema_migration/test_gather_collections.py index c270b8f62a2f0..887be2fb8f7b5 100644 --- a/tests/unit/processing/schema_migration/test_gather_collections.py +++ b/tests/unit/processing/schema_migration/test_gather_collections.py @@ -5,16 +5,14 @@ def test_with_revision(self, schema_migrate_and_collections): published, revision = collections["revision"] # get_collections is called twice schema_migrate.business_logic.get_collections.side_effect = [[published], [revision]] - _, response = schema_migrate.gather_collections(auto_publish=True) + _, response = schema_migrate.gather_collections() assert len(response) == 2 assert { - "can_publish": "True", "collection_id": published.collection_id.id, "collection_version_id": published.version_id.id, "execution_id": "test-execution-arn", } in response assert { - "can_publish": "False", "collection_id": revision.collection_id.id, "collection_version_id": revision.version_id.id, "execution_id": "test-execution-arn", @@ -25,16 +23,14 @@ def test_with_migration_revision(self, schema_migrate_and_collections): published, migration_revision = collections["migration_revision"] # get_collections is called twice schema_migrate.business_logic.get_collections.side_effect = [[published], [migration_revision]] - _, response = schema_migrate.gather_collections(auto_publish=True) + _, response = schema_migrate.gather_collections() assert len(response) == 1 assert { - "can_publish": "True", "collection_id": published.collection_id.id, "collection_version_id": published.version_id.id, "execution_id": "test-execution-arn", } not in response assert { - "can_publish": "True", "collection_id": migration_revision.collection_id.id, "collection_version_id": migration_revision.version_id.id, "execution_id": "test-execution-arn", @@ -45,9 +41,8 @@ def test_with_published(self, schema_migrate_and_collections): published = collections["published"] # get_collections is called twice schema_migrate.business_logic.get_collections.side_effect = [published, []] - _, response = schema_migrate.gather_collections(auto_publish=True) + _, response = schema_migrate.gather_collections() assert { - "can_publish": "True", "collection_id": published[0].collection_id.id, "collection_version_id": published[0].version_id.id, "execution_id": "test-execution-arn", @@ -59,16 +54,15 @@ def test_with_private(self, schema_migrate_and_collections): private = collections["private"] # get_collections is called twice schema_migrate.business_logic.get_collections.side_effect = [[], private] - _, response = schema_migrate.gather_collections(auto_publish=True) + _, response = schema_migrate.gather_collections() assert { - "can_publish": "False", "collection_id": private[0].collection_id.id, "collection_version_id": private[0].version_id.id, "execution_id": "test-execution-arn", } in response assert len(response) == 1 - def test_with_auto_publish_false(self, schema_migrate_and_collections): + def test_with_mixed_collection_types(self, schema_migrate_and_collections): schema_migrate, collections = schema_migrate_and_collections published_no_revision = collections["published"][0] private = collections["private"][0] @@ -79,35 +73,30 @@ def test_with_auto_publish_false(self, schema_migrate_and_collections): [published_no_revision, published_with_non_migration_revision, published_with_migration_revision], [private, revision, 
migration_revision], ] - _, response = schema_migrate.gather_collections(auto_publish=False) + _, response = schema_migrate.gather_collections() assert len(response) == 5 for obj in [ { - "can_publish": "False", "collection_id": published_no_revision.collection_id.id, "collection_version_id": published_no_revision.version_id.id, "execution_id": "test-execution-arn", }, { - "can_publish": "False", "collection_id": private.collection_id.id, "collection_version_id": private.version_id.id, "execution_id": "test-execution-arn", }, { - "can_publish": "False", "collection_id": revision.collection_id.id, "collection_version_id": revision.version_id.id, "execution_id": "test-execution-arn", }, { - "can_publish": "False", "collection_id": published_with_non_migration_revision.collection_id.id, "collection_version_id": published_with_non_migration_revision.version_id.id, "execution_id": "test-execution-arn", }, { - "can_publish": "False", "collection_id": migration_revision.collection_id.id, "collection_version_id": migration_revision.version_id.id, "execution_id": "test-execution-arn", diff --git a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py b/tests/unit/processing/schema_migration/test_log_errors_and_cleanup.py similarity index 71% rename from tests/unit/processing/schema_migration/test_publish_and_cleanup.py rename to tests/unit/processing/schema_migration/test_log_errors_and_cleanup.py index 071b4b0aa4a49..89dd66e8e7761 100644 --- a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py +++ b/tests/unit/processing/schema_migration/test_log_errors_and_cleanup.py @@ -6,10 +6,9 @@ from tests.unit.processing.schema_migration.conftest import make_mock_collection_version, make_mock_dataset_version -def factory_download_file(can_publish): +def factory_download_file(): def download_file(bucket, key_name, local_path): contents = { - "can_publish": str(can_publish), "datasets": [ { "dataset_id": "dataset_id_1", @@ -24,7 +23,7 @@ def download_file(bucket, key_name, local_path): "dataset_version_id": "prev_non_migrated_dataset_version_id", }, ], - # these datasets populate the processed_dataset variable in the publish_and_cleanup function + # these datasets populate the processed_dataset variable in the log_errors_and_cleanup function } with open(local_path, "w") as f: f.write(json.dumps(contents)) @@ -41,9 +40,9 @@ def mock_json_dump(response, f, **kwargs): @patch("backend.layers.processing.schema_migration.json.dump", side_effect=mock_json_dump) -class TestPublishAndCleanup: +class TestLogErrorsAndCleanup: def test_OK(self, mock_json, schema_migrate): - schema_migrate.business_logic.s3_provider.download_file = factory_download_file(True) + schema_migrate.business_logic.s3_provider.download_file = factory_download_file() schema_migrate.business_logic.s3_provider.delete_files = Mock() datasets = [ make_mock_dataset_version( @@ -55,11 +54,10 @@ def test_OK(self, mock_json, schema_migrate): collection_version = make_mock_collection_version(datasets) schema_migrate.business_logic.get_collection_version.return_value = collection_version - errors = schema_migrate.publish_and_cleanup(collection_version.version_id.id) + errors = schema_migrate.log_errors_and_cleanup(collection_version.version_id.id) assert errors == [] - schema_migrate.business_logic.publish_collection_version.assert_called_once_with(collection_version.version_id) schema_migrate.s3_provider.delete_files.assert_any_call( - "artifact-bucket", 
["schema_migration/test-execution-arn/publish_and_cleanup/collection_id.json"] + "artifact-bucket", ["schema_migration/test-execution-arn/log_errors_and_cleanup/collection_id.json"] ) schema_migrate.s3_provider.delete_files.assert_any_call( "artifact-bucket", @@ -67,7 +65,7 @@ def test_OK(self, mock_json, schema_migrate): ) def test_with_errors(self, mock_json, schema_migrate): - schema_migrate.business_logic.s3_provider.download_file = factory_download_file(True) + schema_migrate.business_logic.s3_provider.download_file = factory_download_file() schema_migrate.business_logic.s3_provider.delete_files = Mock() failed_dataset = make_mock_dataset_version( dataset_id="dataset_id_2", @@ -87,7 +85,7 @@ def test_with_errors(self, mock_json, schema_migrate): collection_version = make_mock_collection_version(datasets) schema_migrate.business_logic.get_collection_version.return_value = collection_version - errors = schema_migrate.publish_and_cleanup(collection_version.version_id.id) + errors = schema_migrate.log_errors_and_cleanup(collection_version.version_id.id) assert len(errors) == 2 assert { "message": failed_dataset.status.validation_message, @@ -107,9 +105,8 @@ def test_with_errors(self, mock_json, schema_migrate): "dataset_id": non_migrated_dataset.dataset_id.id, "rollback": False, } in errors - schema_migrate.business_logic.publish_collection_version.assert_not_called() schema_migrate.s3_provider.delete_files.assert_any_call( - "artifact-bucket", ["schema_migration/test-execution-arn/publish_and_cleanup/collection_id.json"] + "artifact-bucket", ["schema_migration/test-execution-arn/log_errors_and_cleanup/collection_id.json"] ) schema_migrate.s3_provider.delete_files.assert_any_call( "artifact-bucket", @@ -119,45 +116,19 @@ def test_with_errors(self, mock_json, schema_migrate): ], ) - def test_can_not_publish(self, mock_json, schema_migrate): - schema_migrate.business_logic.s3_provider.download_file = factory_download_file(False) - schema_migrate.business_logic.s3_provider.delete_files = Mock() - dataset_status = dict(processing_status=DatasetProcessingStatus.SUCCESS) - metadata = dict(schema_version="1.0.0") - collection_version = make_mock_collection_version( - [ - make_mock_dataset_version( - dataset_id="dataset_id_1", - version_id="new_successful_dataset_version_id", - status=dataset_status, - metadata=metadata, - ) - ] - ) - schema_migrate.business_logic.get_collection_version.return_value = collection_version - errors = schema_migrate.publish_and_cleanup(collection_version.version_id.id) - assert errors == [] - schema_migrate.business_logic.publish_collection_version.assert_not_called() - schema_migrate.s3_provider.delete_files.assert_any_call( - "artifact-bucket", ["schema_migration/test-execution-arn/publish_and_cleanup/collection_id.json"] - ) - schema_migrate.s3_provider.delete_files.assert_any_call( - "artifact-bucket", ["prev_successful_dataset_version_id/migrated.h5ad"] - ) - def test_skip_unprocessed_datasets(self, mock_json, schema_migrate): """ - Test that datasets that do not appear in the processed_datasets variable in publish_and_cleanup are skipped + Test that datasets that do not appear in the processed_datasets variable in log_errors_and_cleanup are skipped """ - schema_migrate.business_logic.s3_provider.download_file = factory_download_file(False) + schema_migrate.business_logic.s3_provider.download_file = factory_download_file() schema_migrate.business_logic.s3_provider.delete_files = Mock() collection_version = 
make_mock_collection_version([make_mock_dataset_version()]) schema_migrate.business_logic.get_collection_version.return_value = collection_version schema_migrate.check_dataset_is_latest_schema_version = Mock(return_value=True) - errors = schema_migrate.publish_and_cleanup(collection_version.version_id.id) + errors = schema_migrate.log_errors_and_cleanup(collection_version.version_id.id) assert errors == [] schema_migrate.check_dataset_is_latest_schema_version.assert_not_called() schema_migrate.s3_provider.delete_files.assert_any_call( - "artifact-bucket", ["schema_migration/test-execution-arn/publish_and_cleanup/collection_id.json"] + "artifact-bucket", ["schema_migration/test-execution-arn/log_errors_and_cleanup/collection_id.json"] ) schema_migrate.s3_provider.delete_files.assert_any_call("artifact-bucket", []) From 5afa5396d9ac97e2f16813466018a4c0480f1146 Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Mon, 17 Jun 2024 14:56:39 -0700 Subject: [PATCH 3/6] fix: optimize database queries (#7191) --- backend/api_server/db.py | 2 + backend/common/utils/db_session.py | 1 + backend/common/utils/timer.py | 16 ++ .../api/v1/curation/collections/actions.py | 4 +- .../api/v1/curation/collections/common.py | 16 +- backend/layers/persistence/persistence.py | 165 ++++++++++-------- 6 files changed, 131 insertions(+), 73 deletions(-) create mode 100644 backend/common/utils/timer.py diff --git a/backend/api_server/db.py b/backend/api_server/db.py index 50ae640557af7..a15e136d36379 100644 --- a/backend/api_server/db.py +++ b/backend/api_server/db.py @@ -4,6 +4,8 @@ from backend.common.utils.db_session import db_session_manager +# TODO remove from code base + def dbconnect(func): @wraps(func) diff --git a/backend/common/utils/db_session.py b/backend/common/utils/db_session.py index 95b7fc4159f1d..b5025d2501101 100644 --- a/backend/common/utils/db_session.py +++ b/backend/common/utils/db_session.py @@ -1,3 +1,4 @@ +# TODO remove from code base import logging from contextlib import contextmanager diff --git a/backend/common/utils/timer.py b/backend/common/utils/timer.py new file mode 100644 index 0000000000000..f1f82a34373b0 --- /dev/null +++ b/backend/common/utils/timer.py @@ -0,0 +1,16 @@ +import logging +import time +from contextlib import contextmanager + +logging.basicConfig(level=logging.INFO) + + +@contextmanager +def log_time_taken(description: str = "Code block"): + start_time = time.time() + try: + yield + finally: + end_time = time.time() + elapsed_time = end_time - start_time + logging.info(dict(type="METRIC", details=dict(description=description, time=elapsed_time, unit="seconds"))) diff --git a/backend/curation/api/v1/curation/collections/actions.py b/backend/curation/api/v1/curation/collections/actions.py index 0554dea51e39d..f52e1d8c07c46 100644 --- a/backend/curation/api/v1/curation/collections/actions.py +++ b/backend/curation/api/v1/curation/collections/actions.py @@ -1,3 +1,5 @@ +import logging + from flask import jsonify, make_response import backend.common.doi as doi @@ -37,7 +39,7 @@ def get(visibility: str, token_info: dict, curator: str = None): else: filters["curator_name"] = curator - print(filters) + logging.info(filters) resp_collections = [] for collection_version in get_business_logic().get_collections(CollectionQueryFilter(**filters)): diff --git a/backend/curation/api/v1/curation/collections/common.py b/backend/curation/api/v1/curation/collections/common.py index f721acf53a7ef..852ab329a5208 100644 --- 
a/backend/curation/api/v1/curation/collections/common.py +++ b/backend/curation/api/v1/curation/collections/common.py @@ -195,11 +195,20 @@ def reshape_datasets_for_curation_api( as_version: bool = False, is_published: bool = False, ) -> List[dict]: + business_logic = get_business_logic() active_datasets = [] + dataset_version_ids = [] + dataset_versions = [] for dv in datasets: - dataset_version = get_business_logic().get_dataset_version(dv) if isinstance(dv, DatasetVersionId) else dv + if isinstance(dv, DatasetVersion): + dataset_versions.append(dv) + else: + dataset_version_ids.append(dv) + if dataset_version_ids: + dataset_versions.extend(business_logic.database_provider.get_dataset_versions_by_id(dataset_version_ids)) + for dv in dataset_versions: reshaped_dataset = reshape_dataset_for_curation_api( - dataset_version, use_canonical_url, preview, as_canonical=not as_version, is_published=is_published + dv, use_canonical_url, preview, as_canonical=not as_version, is_published=is_published ) active_datasets.append(reshaped_dataset) return active_datasets @@ -275,7 +284,8 @@ def reshape_dataset_for_curation_datasets_index_api( """ Create the response shape for the curation datasets index API response. Handles shape for both public and private requests. - :param visibility: the requested visibility of the datasets to be included in the dataset index response; either PUBLIC or PRIVATE. + :param visibility: the requested visibility of the datasets to be included in the dataset index response; either + PUBLIC or PRIVATE. :param collection_version: the collection version of the dataset to be included in the API response. :param dataset_version: a dataset version to be included in the API response. :return: A dictionary shaped for inclusion in the datasets index API response. 
diff --git a/backend/layers/persistence/persistence.py b/backend/layers/persistence/persistence.py index 90c5e5e5d5ced..d9e6393cbfd62 100644 --- a/backend/layers/persistence/persistence.py +++ b/backend/layers/persistence/persistence.py @@ -6,7 +6,8 @@ from datetime import datetime from typing import Any, Iterable, List, Optional, Tuple, Union -from sqlalchemy import create_engine, delete +from server_timing import Timing as ServerTiming +from sqlalchemy import create_engine, delete, update from sqlalchemy.exc import ProgrammingError, SQLAlchemyError from sqlalchemy.orm import Session, sessionmaker from tenacity import retry @@ -14,6 +15,7 @@ from tenacity.wait import wait_fixed from backend.common.corpora_config import CorporaDbConfig +from backend.common.utils.timer import log_time_taken from backend.layers.business.exceptions import CollectionIsPublishedException, DatasetIsPublishedException from backend.layers.common.entities import ( CanonicalCollection, @@ -281,16 +283,10 @@ def get_dataset_versions_by_id( canonical_ids.append(version.dataset_id) artifact_ids.extend(version.artifacts) - if get_tombstoned: - canonical_dataset_query = session.query(DatasetTable).filter(DatasetTable.id.in_(canonical_ids)) - else: - canonical_dataset_query = ( - session.query(DatasetTable) - .filter(DatasetTable.id.in_(canonical_ids)) - .filter(DatasetTable.tombstone.is_(False)) - ) + canonical_dataset_query = session.query(DatasetTable).filter(DatasetTable.id.in_(canonical_ids)) + if not get_tombstoned: + canonical_dataset_query = canonical_dataset_query.filter(DatasetTable.tombstone.is_(False)) canonical_datasets = canonical_dataset_query.all() - canonical_map = {canonical_dataset.id: canonical_dataset for canonical_dataset in canonical_datasets} artifacts = session.query(DatasetArtifactTable).filter(DatasetArtifactTable.id.in_(artifact_ids)).all() @@ -377,12 +373,15 @@ def get_all_versions_for_collection( with self._manage_session() as session: version_rows = session.query(CollectionVersionTable).filter_by(collection_id=collection_id.id).all() canonical_collection = self.get_canonical_collection(collection_id) - versions = list() - for i in range(len(version_rows)): - datasets = self.get_dataset_versions_by_id( - [DatasetVersionId(str(id)) for id in version_rows[i].datasets], get_tombstoned=get_tombstoned - ) - version = self._row_to_collection_version_with_datasets(version_rows[i], canonical_collection, datasets) + versions = [] + dataset_version_ids = [DatasetVersionId(str(_id)) for vr in version_rows for _id in vr.datasets] + datasets = { + str(dv.version_id): dv + for dv in self.get_dataset_versions_by_id(dataset_version_ids, get_tombstoned=get_tombstoned) + } + for row in version_rows: + ds = [datasets[str(id)] for id in row.datasets] + version = self._row_to_collection_version_with_datasets(row, canonical_collection, ds) versions.append(version) return versions @@ -397,12 +396,15 @@ def get_unpublished_versions_for_collection( session.query(CollectionVersionTable).filter_by(collection_id=collection_id.id, published_at=None).all() ) canonical_collection = self.get_canonical_collection(collection_id) - versions = list() - for i in range(len(version_rows)): - datasets = self.get_dataset_versions_by_id( - [DatasetVersionId(str(id)) for id in version_rows[i].datasets], get_tombstoned=get_tombstoned - ) - version = self._row_to_collection_version_with_datasets(version_rows[i], canonical_collection, datasets) + versions = [] + dataset_version_ids = [DatasetVersionId(str(_id)) for vr in version_rows for 
_id in vr.datasets] + datasets = { + str(dv.version_id): dv + for dv in self.get_dataset_versions_by_id(dataset_version_ids, get_tombstoned=get_tombstoned) + } + for row in version_rows: + ds = [datasets[str(id)] for id in row.datasets] + version = self._row_to_collection_version_with_datasets(row, canonical_collection, ds) versions.append(version) return versions @@ -412,33 +414,42 @@ def get_all_collections_versions(self, get_tombstoned: bool = False) -> Iterable TODO: for performance reasons, it might be necessary to add a filtering parameter here. """ with self._manage_session() as session: - versions = session.query(CollectionVersionTable).all() + with ServerTiming.time("get-cv-all"), log_time_taken("get-cv-all"): + versions = session.query(CollectionVersionTable).all() + + with ServerTiming.time("get-cc-by-id"), log_time_taken("get-cc-by-id"): + if get_tombstoned: + all_canonical_collections = session.query(CollectionTable) + else: + all_canonical_collections = session.query(CollectionTable).filter( + CollectionTable.tombstone.isnot(True) + ) - # Create a canonical mapping - if get_tombstoned: - all_canonical_collections = session.query(CollectionTable) - else: - all_canonical_collections = session.query(CollectionTable).filter(CollectionTable.tombstone.isnot(True)) - - all_canonical_map = dict() - for collection_row in all_canonical_collections.all(): - all_canonical_map[str(collection_row.id)] = CanonicalCollection( - CollectionId(str(collection_row.id)), - None if collection_row.version_id is None else CollectionVersionId(str(collection_row.version_id)), - collection_row.originally_published_at, - collection_row.revised_at, - collection_row.tombstone, - ) + all_canonical_map = dict() + for collection_row in all_canonical_collections.all(): + all_canonical_map[str(collection_row.id)] = CanonicalCollection( + CollectionId(str(collection_row.id)), + ( + None + if collection_row.version_id is None + else CollectionVersionId(str(collection_row.version_id)) + ), + collection_row.originally_published_at, + collection_row.revised_at, + collection_row.tombstone, + ) result = [] - all_dataset_tombstones = { - str(dataset.id) - for dataset in session.query(DatasetTable).filter(DatasetTable.tombstone.is_(True)).all() - } - all_dataset_version_mappings = { - str(dataset_version.id): str(dataset_version.dataset_id) - for dataset_version in session.query(DatasetVersionTable).all() - } + with ServerTiming.time("get-cdt-all"), log_time_taken("get-cdt-all"): + all_dataset_tombstones = { + str(dataset.id) + for dataset in session.query(DatasetTable).filter(DatasetTable.tombstone.is_(True)).all() + } + with ServerTiming.time("get-dv-all"), log_time_taken("get-dv-all"): + all_dataset_version_mappings = { + str(dataset_version.id): str(dataset_version.dataset_id) + for dataset_version in session.query(DatasetVersionTable).all() + } for v in versions: include_dataset_version_ids = [] if str(v.collection_id) in all_canonical_map: @@ -461,21 +472,25 @@ def get_all_mapped_collection_versions(self, get_tombstoned: bool = False) -> It will be present in the CollectionVersion.datasets array for active (mapped) Collection versions. 
""" with self._manage_session() as session: - if get_tombstoned: - canonical_collections = session.query(CollectionTable).filter(CollectionTable.version_id.isnot(None)) - else: - canonical_collections = ( - session.query(CollectionTable) - .filter(CollectionTable.version_id.isnot(None)) - .filter_by(tombstone=False) - ) + with ServerTiming.time("get-cc-all"), log_time_taken("get-cc-all"): + if get_tombstoned: + canonical_collections = session.query(CollectionTable).filter( + CollectionTable.version_id.isnot(None) + ) + else: + canonical_collections = ( + session.query(CollectionTable) + .filter(CollectionTable.version_id.isnot(None)) + .filter_by(tombstone=False) + ) - mapped_version_ids = {cc.version_id: cc for cc in canonical_collections.all()} - versions = ( - session.query(CollectionVersionTable) - .filter(CollectionVersionTable.id.in_(mapped_version_ids.keys())) - .all() - ) # noqa + mapped_version_ids = {cc.version_id: cc for cc in canonical_collections.all()} + with ServerTiming.time("get-cv-by-id"), log_time_taken("get-cv-by-id"): + versions = ( + session.query(CollectionVersionTable) + .filter(CollectionVersionTable.id.in_(mapped_version_ids.keys())) + .all() + ) # noqa for version in versions: canonical_row = mapped_version_ids[version.id] @@ -620,11 +635,18 @@ def _delete_dataset_version_and_artifact_rows( """ Delete DatasetVersionTable rows (and their dependent DatasetArtifactTable rows) """ + dataset_version_ids = [] + artifact_ids = [] for d_v_row in dataset_version_rows: - ids = [str(_id) for _id in d_v_row.artifacts] - artifact_delete_statement = delete(DatasetArtifactTable).where(DatasetArtifactTable.id.in_(ids)) - session.execute(artifact_delete_statement) - session.delete(d_v_row) + dataset_version_ids.append(str(d_v_row.id)) + for artifact in d_v_row.artifacts: + artifact_ids.append(str(artifact)) + artifact_delete_statement = delete(DatasetArtifactTable).where(DatasetArtifactTable.id.in_(artifact_ids)) + session.execute(artifact_delete_statement) + dataset_version_delete_statement = delete(DatasetVersionTable).where( + DatasetVersionTable.id.in_(dataset_version_ids) + ) + session.execute(dataset_version_delete_statement) session.flush() def finalize_collection_version( @@ -679,11 +701,16 @@ def finalize_collection_version( # get all dataset versions for the datasets that are being tombstoned dataset_version_ids_to_delete_from_s3 = [] if dataset_ids_to_tombstone: - datasets = session.query(DatasetTable).filter(DatasetTable.id.in_(dataset_ids_to_tombstone)).all() - for dataset in datasets: - dataset.tombstone = True - dataset_all_versions = session.query(DatasetVersionTable).filter_by(dataset_id=dataset.id).all() - dataset_version_ids_to_delete_from_s3.extend([dv.id for dv in dataset_all_versions]) + tombstone_dataset_statement = ( + update(DatasetTable).where(DatasetTable.id.in_(dataset_ids_to_tombstone)).values(tombstone=True) + ) + session.execute(tombstone_dataset_statement) + dataset_all_version_ids = ( + session.query(DatasetVersionTable.id) + .filter(DatasetVersionTable.dataset_id.in_(dataset_ids_to_tombstone)) + .all() + ) + dataset_version_ids_to_delete_from_s3.extend(str(dv_id) for dv_id in dataset_all_version_ids) # update dataset versions for datasets that are not being tombstoned dataset_version_ids = session.query(CollectionVersionTable.datasets).filter_by(id=version_id.id).one()[0] From 71d403677ae2ecd643994eb0172fe1b4444b134f Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:04:23 -0700 
Subject: [PATCH 4/6] fix: remove ServerTiming from persistence layer (#7207) merging to fix dev --- backend/layers/persistence/persistence.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/backend/layers/persistence/persistence.py b/backend/layers/persistence/persistence.py index d9e6393cbfd62..02ef4ea03d41a 100644 --- a/backend/layers/persistence/persistence.py +++ b/backend/layers/persistence/persistence.py @@ -6,7 +6,6 @@ from datetime import datetime from typing import Any, Iterable, List, Optional, Tuple, Union -from server_timing import Timing as ServerTiming from sqlalchemy import create_engine, delete, update from sqlalchemy.exc import ProgrammingError, SQLAlchemyError from sqlalchemy.orm import Session, sessionmaker @@ -414,10 +413,10 @@ def get_all_collections_versions(self, get_tombstoned: bool = False) -> Iterable TODO: for performance reasons, it might be necessary to add a filtering parameter here. """ with self._manage_session() as session: - with ServerTiming.time("get-cv-all"), log_time_taken("get-cv-all"): + with log_time_taken("get-cv-all"): versions = session.query(CollectionVersionTable).all() - with ServerTiming.time("get-cc-by-id"), log_time_taken("get-cc-by-id"): + with log_time_taken("get-cc-by-id"): if get_tombstoned: all_canonical_collections = session.query(CollectionTable) else: @@ -440,12 +439,12 @@ def get_all_collections_versions(self, get_tombstoned: bool = False) -> Iterable ) result = [] - with ServerTiming.time("get-cdt-all"), log_time_taken("get-cdt-all"): + with log_time_taken("get-cdt-all"): all_dataset_tombstones = { str(dataset.id) for dataset in session.query(DatasetTable).filter(DatasetTable.tombstone.is_(True)).all() } - with ServerTiming.time("get-dv-all"), log_time_taken("get-dv-all"): + with log_time_taken("get-dv-all"): all_dataset_version_mappings = { str(dataset_version.id): str(dataset_version.dataset_id) for dataset_version in session.query(DatasetVersionTable).all() @@ -472,7 +471,7 @@ def get_all_mapped_collection_versions(self, get_tombstoned: bool = False) -> It will be present in the CollectionVersion.datasets array for active (mapped) Collection versions. """ with self._manage_session() as session: - with ServerTiming.time("get-cc-all"), log_time_taken("get-cc-all"): + with log_time_taken("get-cc-all"): if get_tombstoned: canonical_collections = session.query(CollectionTable).filter( CollectionTable.version_id.isnot(None) @@ -485,7 +484,7 @@ def get_all_mapped_collection_versions(self, get_tombstoned: bool = False) -> It ) mapped_version_ids = {cc.version_id: cc for cc in canonical_collections.all()} - with ServerTiming.time("get-cv-by-id"), log_time_taken("get-cv-by-id"): + with log_time_taken("get-cv-by-id"): versions = ( session.query(CollectionVersionTable) .filter(CollectionVersionTable.id.in_(mapped_version_ids.keys())) From 766110e11b96c59ef834da9d20ffea2a1d2da68f Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Thu, 20 Jun 2024 07:37:58 -0700 Subject: [PATCH 5/6] fix: cloudfront 504 error for long backend requests. 
(#7189) --- .happy/terraform/modules/ecs-stack/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.happy/terraform/modules/ecs-stack/main.tf b/.happy/terraform/modules/ecs-stack/main.tf index 3710ee1f47c91..adcb1277a5331 100644 --- a/.happy/terraform/modules/ecs-stack/main.tf +++ b/.happy/terraform/modules/ecs-stack/main.tf @@ -31,7 +31,7 @@ locals { backend_wmg_workers = var.backend_wmg_workers backend_cmd = ["gunicorn", "--worker-class", "gevent", "--workers", "${local.backend_workers}", "--bind", "0.0.0.0:5000", "backend.api_server.app:app", "--max-requests", "10000", "--timeout", "180", - "--keep-alive", "61", "--log-level", "info"] + "--keep-alive", "100", "--log-level", "info"] backend_de_cmd = ["gunicorn", "--worker-class", "gevent", "--workers", "${local.backend_de_workers}", "--bind", "0.0.0.0:5000", "backend.de.server.app:app", "--max-requests", "10000", "--timeout", "540", "--keep-alive", "61", "--log-level", "info"] From 762aeefb5858535674bc96299988aa58963f6c6d Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Thu, 20 Jun 2024 09:06:54 -0700 Subject: [PATCH 6/6] fix: add more ignored keys in the Census Models page (#7215) --- .../components/Project/ProjectButtons/index.tsx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/index.tsx b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/index.tsx index 755804501202f..0ce5f2f37b2fe 100644 --- a/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/index.tsx +++ b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/index.tsx @@ -7,7 +7,12 @@ import { ButtonsColumn, ButtonsRow, StyledButton } from "./style"; import DetailItem from "../../DetailItem"; import { DATA_TYPE_TO_EMBEDDING } from ".."; -const IGNORE_DIFFERENT_METADATA_KEYS = ["model_link", "id"]; +const IGNORE_DIFFERENT_METADATA_KEYS = [ + "model_link", + "id", + "relative_uri", + "indexes", +]; const ATTRIBUTE_TO_LABEL: Record = { experiment_name: "Organism", n_cells: "Cells",