From 2b8fc0ce011d2d1bb65dfd9a587dc942a52d6a7a Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Fri, 7 Jun 2024 18:39:52 -0400 Subject: [PATCH 1/8] chore: bump cellxgene-schema 5.1.0 related pinned versions (#7171) --- python_dependencies/processing/requirements.txt | 6 +++--- tests/unit/processing/test_h5ad_data_file.py | 4 ++-- tests/unit/processing/test_spatial_assets_utils.py | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python_dependencies/processing/requirements.txt b/python_dependencies/processing/requirements.txt index 5396acfd9dc69..972c9add3f136 100644 --- a/python_dependencies/processing/requirements.txt +++ b/python_dependencies/processing/requirements.txt @@ -5,8 +5,8 @@ botocore>=1.14.17 cellxgene-schema dataclasses-json ddtrace==2.1.4 -numba==0.56.2 -numpy==1.23.2 +numba==0.59.1 +numpy==1.26.4 pandas==1.4.4 psutil>=5.9.0 psycopg2-binary==2.* @@ -18,7 +18,7 @@ requests>=2.22.0 rpy2==3.5.16 rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability s3fs==0.4.2 -scanpy==1.9.3 +scanpy==1.9.8 SQLAlchemy==2.* tenacity tiledb==0.25.0 # Portal's tiledb version should always be the same or older than Explorer's diff --git a/tests/unit/processing/test_h5ad_data_file.py b/tests/unit/processing/test_h5ad_data_file.py index f3d01c06ec47e..7fbeb0e7bcceb 100644 --- a/tests/unit/processing/test_h5ad_data_file.py +++ b/tests/unit/processing/test_h5ad_data_file.py @@ -171,7 +171,7 @@ def test__slash_in_attribute_name(self): col_name = "fo/o" - attrs = [tiledb.Attr(name=col_name, dtype=np.int)] + attrs = [tiledb.Attr(name=col_name, dtype=int)] domain = tiledb.Domain(tiledb.Dim(domain=(0, 99), tile=100, dtype=np.uint32)) schema = tiledb.ArraySchema( domain=domain, sparse=False, attrs=attrs, cell_order="row-major", tile_order="row-major" @@ -181,7 +181,7 @@ def test__slash_in_attribute_name(self): try: with tiledb.open("foo", mode="w") as A: value = dict() - value[col_name] = np.zeros((100,), dtype=np.int) + value[col_name] = np.zeros((100,), dtype=int) A[:] = value # if there's a regression, this statement will throw a TileDBError # if we get here we're good finally: diff --git a/tests/unit/processing/test_spatial_assets_utils.py b/tests/unit/processing/test_spatial_assets_utils.py index 2a69f8ff012f7..cc40bb43dface 100644 --- a/tests/unit/processing/test_spatial_assets_utils.py +++ b/tests/unit/processing/test_spatial_assets_utils.py @@ -248,7 +248,9 @@ def test__upload_assets_failure(spatial_processor, asset_folder, dataset_version mock_upload.assert_called_once_with(asset_folder, expected_s3_uri) -def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatial_data, dataset_version_id, mocker): +def test__create_deep_zoom_assets( + spatial_processor, cxg_container, valid_spatial_data, dataset_version_id, mocker, tmpdir +): mock_fetch_image = mocker.patch.object(spatial_processor, "_fetch_image") mock_process_and_flip_image = mocker.patch.object(spatial_processor, "_process_and_flip_image") mock_generate_deep_zoom_assets = mocker.patch.object(spatial_processor, "_generate_deep_zoom_assets") @@ -256,8 +258,7 @@ def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # mock the TemporaryDirectory context manager mock_temp_dir = mocker.patch("tempfile.TemporaryDirectory") - temp_dir_name = "/mock/temp/dir" - mock_temp_dir.return_value.__enter__.return_value = temp_dir_name + mock_temp_dir.return_value.__enter__.return_value = tmpdir # mock return values 
for the internal methods mock_fetch_image.return_value = (np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8), "fullres") @@ -266,7 +267,7 @@ def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # call the method under test spatial_processor.create_deep_zoom_assets(cxg_container, valid_spatial_data, dataset_version_id) - assets_folder = os.path.join(temp_dir_name, cxg_container.replace(".cxg", "")) + assets_folder = os.path.join(tmpdir, cxg_container.replace(".cxg", "")) # assertions to ensure each step is called mock_fetch_image.assert_called_once_with(valid_spatial_data) From b2a2317f2c99b85f3213316050962a25f6941457 Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:16:36 -0400 Subject: [PATCH 2/8] feat: add summary citation to discover API + DP API (#7166) --- .../canonical_markers.py | 9 +- .../pipeline/canonical_marker_genes/utils.py | 131 ------------------ .../source_collections_generator.py | 2 +- backend/common/citation.py | 52 +++++++ backend/{layers => }/common/doi.py | 26 ++++ .../providers}/crossref_provider.py | 43 +++++- backend/curation/api/curation-api.yml | 7 + .../api/v1/curation/collections/actions.py | 2 +- .../collections/collection_id/actions.py | 2 +- .../api/v1/curation/collections/common.py | 4 + backend/layers/business/business.py | 12 +- backend/layers/processing/submissions/app.py | 2 +- backend/portal/api/portal-api.yml | 11 ++ backend/portal/api/portal_api.py | 8 +- backend/portal/api/providers.py | 2 +- frontend/src/common/entities.ts | 2 +- frontend/src/common/queries/collections.ts | 6 - frontend/src/common/queries/filter.ts | 61 ++------ .../common/Filter/common/constants.ts | 2 +- .../common/Filter/common/entities.ts | 2 +- frontend/src/views/Collection/index.tsx | 2 +- frontend/src/views/Collection/utils.ts | 12 +- .../src/views/Collections/common/constants.ts | 2 +- frontend/src/views/Collections/index.tsx | 4 +- frontend/src/views/Datasets/index.tsx | 2 +- frontend/tests/features/filter/filter.test.ts | 36 ----- .../pipeline/fixtures/source_collections.json | 2 +- .../pipeline/test_canonical_marker_genes.py | 73 +--------- tests/unit/backend/common/test_citation.py | 31 +++++ .../test_crossref_provider.py | 57 +++++--- tests/unit/backend/common/test_doi.py | 18 +++ .../backend/layers/api/test_curation_api.py | 12 +- .../backend/layers/api/test_portal_api.py | 10 +- .../backend/layers/business/test_business.py | 10 +- tests/unit/backend/layers/common/base_test.py | 2 +- 35 files changed, 301 insertions(+), 358 deletions(-) delete mode 100644 backend/cellguide/pipeline/canonical_marker_genes/utils.py create mode 100644 backend/common/citation.py rename backend/{layers => }/common/doi.py (78%) rename backend/{layers/thirdparty => common/providers}/crossref_provider.py (82%) create mode 100644 tests/unit/backend/common/test_citation.py rename tests/unit/backend/{layers/thirdparty => common}/test_crossref_provider.py (85%) create mode 100644 tests/unit/backend/common/test_doi.py diff --git a/backend/cellguide/pipeline/canonical_marker_genes/canonical_markers.py b/backend/cellguide/pipeline/canonical_marker_genes/canonical_markers.py index 31090b5fd4b1a..374ceabfb68fb 100644 --- a/backend/cellguide/pipeline/canonical_marker_genes/canonical_markers.py +++ b/backend/cellguide/pipeline/canonical_marker_genes/canonical_markers.py @@ -12,14 +12,12 @@ ParsedAsctbTableEntry, Reference, ) -from backend.cellguide.pipeline.canonical_marker_genes.utils import ( - 
clean_doi, - get_title_and_citation_from_doi, -) from backend.cellguide.pipeline.constants import ASCTB_MASTER_SHEET_URL, CELLGUIDE_PIPELINE_NUM_CPUS from backend.common.census_cube.data.ontology_labels import ontology_term_label from backend.common.census_cube.utils import setup_retry_session +from backend.common.doi import clean_doi from backend.common.marker_genes.marker_gene_files.gene_metadata import get_gene_id_to_name_and_symbol +from backend.common.providers.crossref_provider import CrossrefProvider logger = logging.getLogger(__name__) @@ -53,6 +51,7 @@ def __init__(self, *, wmg_tissues: list[str], wmg_human_genes: list[str]): gene_metadata = get_gene_id_to_name_and_symbol() self.gene_id_to_name = gene_metadata.gene_id_to_name + self.crossref_provider = CrossrefProvider() def _get_tissue_id(self, anatomical_structures: list[AnatomicalStructure]) -> str: """ @@ -132,7 +131,7 @@ def fetch_doi_info(ref): doi = clean_doi(ref.doi) if doi: if doi not in doi_to_citation: - title = get_title_and_citation_from_doi(doi) + title = self.crossref_provider.get_title_and_citation_from_doi(doi) doi_to_citation[doi] = title else: title = doi_to_citation[doi] diff --git a/backend/cellguide/pipeline/canonical_marker_genes/utils.py b/backend/cellguide/pipeline/canonical_marker_genes/utils.py deleted file mode 100644 index a5c88658174c8..0000000000000 --- a/backend/cellguide/pipeline/canonical_marker_genes/utils.py +++ /dev/null @@ -1,131 +0,0 @@ -from requests import Response - -from backend.common.census_cube.utils import setup_retry_session - - -def clean_doi(doi: str) -> str: - """ - Cleans the DOI string. - - Parameters - ---------- - doi : str - The DOI string to be cleaned. - - Returns - ------- - str - The cleaned DOI string. - """ - doi = doi.strip() - if doi == "No DOI": - return "" - - if doi != "" and doi[-1] == ".": - doi = doi[:-1] - if " " in doi: - doi = doi.split(" ")[1] # this handles cases where the DOI string is "DOI: {doi}" - doi = doi.strip() - return doi - - -def _get_response_from_url(url: str) -> Response: - # this wrapper is necessary to patch with a mock - # requests function in relevant unit tests - session = setup_retry_session() - return session.get(url) - - -def get_title_and_citation_from_doi(doi: str) -> str: - """ - Retrieves the title and citation from a DOI. - - Parameters - ---------- - doi : str - The DOI string. - - Returns - ------- - str - The title and citation associated with the DOI. - """ - - url = f"https://api.crossref.org/works/{doi}" - - # for some reason crossref fails sporadically for valid DOIs - for _ in range(5): - # Send a GET request to the API - response = _get_response_from_url(url) - # If the GET request is successful, the status code will be 200 - if response.status_code == 200: - # Get the response data - data = response.json() - - # Get the title and citation count from the data - try: - title = data["message"]["title"][0] - citation = format_citation_crossref(data["message"]) - except Exception: - try: - title = data["message"]["items"][0]["title"][0] - citation = format_citation_crossref(data["message"]["items"][0]) - except Exception: - return doi - return f"{title}\n\n - {citation}" - - return doi - - -def format_citation_dp(message: dict) -> str: - """ - Formats the citation message. - - Parameters - ---------- - message : dict - The message containing publisher_metadata from the /collections API. - - Returns - ------- - str - The formatted citation string. 
- """ - - first_author = message["authors"][0] - if "family" in first_author: - author_str = f"{first_author['family']}, {first_author['given']} et al." - else: - author_str = f"{first_author['name']} et al." - - journal = " " + message["journal"] if message["journal"] else "" - year = f"{message['published_year']}" - - return f"{author_str} ({year}){journal}" - - -def format_citation_crossref(message: dict) -> str: - """ - Formats the citation message. - - Parameters - ---------- - message : dict - The message containing citation details output from CrossRef. - - Returns - ------- - str - The formatted citation string. - """ - - first_author = message["author"][0] - if "family" in first_author: - author_str = f"{first_author['family']}, {first_author['given']} et al." - else: - author_str = f"{first_author['name']} et al." - - journal = " " + message["container-title"][0] if len(message["container-title"]) else "" - year = message["created"]["date-parts"][0][0] - - return f"{author_str} ({year}){journal}" diff --git a/backend/cellguide/pipeline/source_collections/source_collections_generator.py b/backend/cellguide/pipeline/source_collections/source_collections_generator.py index 3139e64d7c4f3..e220834fdd1c7 100644 --- a/backend/cellguide/pipeline/source_collections/source_collections_generator.py +++ b/backend/cellguide/pipeline/source_collections/source_collections_generator.py @@ -1,10 +1,10 @@ -from backend.cellguide.pipeline.canonical_marker_genes.utils import format_citation_dp from backend.cellguide.pipeline.source_collections.types import SourceCollectionsData from backend.common.census_cube.utils import ( descendants, get_collections_from_discover_api, get_datasets_from_discover_api, ) +from backend.common.citation import format_citation_dp def generate_source_collections_data(all_cell_type_ids_in_corpus: list[str]) -> dict[str, list[SourceCollectionsData]]: diff --git a/backend/common/citation.py b/backend/common/citation.py new file mode 100644 index 0000000000000..fb8301c25a8c2 --- /dev/null +++ b/backend/common/citation.py @@ -0,0 +1,52 @@ +def format_citation_dp(message: dict) -> str: + """ + Formats the citation message. + + Parameters + ---------- + message : dict + The message containing publisher_metadata from the /collections API. + + Returns + ------- + str + The formatted citation string. + """ + author_str_suffix = "" + if len(message["authors"]) > 1: + author_str_suffix = " et al." + first_author = message["authors"][0] + author_str = f"{first_author['family']}" if "family" in first_author else f"{first_author['name']}" + author_str += author_str_suffix + + journal = message["journal"] if message["journal"] else "" + year = f"{message['published_year']}" + + return f"{author_str} ({year}) {journal}" + + +def format_citation_crossref(message: dict) -> str: + """ + Formats the citation message. + + Parameters + ---------- + message : dict + The message containing citation details output from CrossRef. + + Returns + ------- + str + The formatted citation string. + """ + author_str_suffix = "" + if len(message["author"]) > 1: + author_str_suffix = " et al." 
+ first_author = message["author"][0] + author_str = f"{first_author['family']}" if "family" in first_author else f"{first_author['name']}" + author_str += author_str_suffix + + journal = message["container-title"][0] if len(message["container-title"]) else "" + year = message["created"]["date-parts"][0][0] + + return f"{author_str} ({year}) {journal}" diff --git a/backend/layers/common/doi.py b/backend/common/doi.py similarity index 78% rename from backend/layers/common/doi.py rename to backend/common/doi.py index 8cc0717606351..989ebbd260c8a 100644 --- a/backend/layers/common/doi.py +++ b/backend/common/doi.py @@ -51,3 +51,29 @@ def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str] return None doi_url = f"https://doi.org/{parsed_doi}" return doi_url + + +def clean_doi(doi: str) -> str: + """ + Cleans the DOI string. + + Parameters + ---------- + doi : str + The DOI string to be cleaned. + + Returns + ------- + str + The cleaned DOI string. + """ + doi = doi.strip() + if doi == "No DOI": + return "" + + if doi != "" and doi[-1] == ".": + doi = doi[:-1] + if " " in doi: + doi = doi.split(" ")[1] # this handles cases where the DOI string is "DOI: {doi}" + doi = doi.strip() + return doi diff --git a/backend/layers/thirdparty/crossref_provider.py b/backend/common/providers/crossref_provider.py similarity index 82% rename from backend/layers/thirdparty/crossref_provider.py rename to backend/common/providers/crossref_provider.py index 0c6c65ae76a7f..5e9d778cb4f7e 100644 --- a/backend/layers/thirdparty/crossref_provider.py +++ b/backend/common/providers/crossref_provider.py @@ -5,8 +5,9 @@ import requests +from backend.common.citation import format_citation_crossref from backend.common.corpora_config import CorporaConfig -from backend.layers.common.doi import doi_curie_from_link +from backend.common.doi import doi_curie_from_link class CrossrefProviderInterface: @@ -16,6 +17,12 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]: def fetch_preprint_published_doi(self, doi): pass + def _fetch_crossref_payload(self, doi): + pass + + def get_title_and_citation_from_doi(self, doi: str) -> str: + pass + class CrossrefException(Exception): pass @@ -74,6 +81,36 @@ def _fetch_crossref_payload(self, doi): return res + def get_title_and_citation_from_doi(self, doi: str) -> str: + """ + Retrieves the title and citation from a DOI. + + Parameters + ---------- + doi : str + The DOI string. + + Returns + ------- + str + The title and citation associated with the DOI. + """ + + response = self._fetch_crossref_payload(doi) + data = response.json() + + # Get the title and citation count from the data + try: + title = data["message"]["title"][0] + citation = format_citation_crossref(data["message"]) + except Exception: + try: + title = data["message"]["items"][0]["title"][0] + citation = format_citation_crossref(data["message"]["items"][0]) + except Exception: + return doi + return f"{title}\n\n - {citation}" + def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]: """ Fetches and extracts publisher metadata from Crossref for a specified DOI. 
@@ -109,12 +146,16 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]: # Journal try: + raw_journal = None if "short-container-title" in message and message["short-container-title"]: raw_journal = message["short-container-title"][0] elif "container-title" in message and message["container-title"]: raw_journal = message["container-title"][0] elif "institution" in message: raw_journal = message["institution"][0]["name"] + + if raw_journal is None: + raise CrossrefParseException("Journal node missing") except Exception: raise CrossrefParseException("Journal node missing") from None diff --git a/backend/curation/api/curation-api.yml b/backend/curation/api/curation-api.yml index 54a8a650fa26b..2d4578ddbe668 100644 --- a/backend/curation/api/curation-api.yml +++ b/backend/curation/api/curation-api.yml @@ -1125,6 +1125,8 @@ components: $ref: "#/components/schemas/citation" collection_doi: $ref: "#/components/schemas/doi" + collection_doi_label: + $ref: "#/components/schemas/doi_label" collection_id: $ref: "#/components/schemas/collection_id" collection_name: @@ -1409,6 +1411,11 @@ components: type: string nullable: true example: "10.1093/ajae/aaq063" + doi_label: + description: Summary citation for the given DOI + type: string + nullable: true + example: "Ahern et al. (2022) Cell" donor_id: description: Free-text that identifies a unique individual that data were derived from. type: array diff --git a/backend/curation/api/v1/curation/collections/actions.py b/backend/curation/api/v1/curation/collections/actions.py index 8edfeb5928733..0554dea51e39d 100644 --- a/backend/curation/api/v1/curation/collections/actions.py +++ b/backend/curation/api/v1/curation/collections/actions.py @@ -1,11 +1,11 @@ from flask import jsonify, make_response +import backend.common.doi as doi from backend.common.utils.http_exceptions import ForbiddenHTTPException, InvalidParametersHTTPException from backend.curation.api.v1.curation.collections.common import reshape_for_curation_api from backend.layers.auth.user_info import UserInfo from backend.layers.business.entities import CollectionQueryFilter from backend.layers.business.exceptions import CollectionCreationException, InvalidMetadataException -from backend.layers.common import doi from backend.layers.common.entities import CollectionLinkType, CollectionMetadata, Link, Visibility from backend.portal.api.providers import get_business_logic diff --git a/backend/curation/api/v1/curation/collections/collection_id/actions.py b/backend/curation/api/v1/curation/collections/collection_id/actions.py index 7b5169d116f64..d6efbd20d1d1c 100644 --- a/backend/curation/api/v1/curation/collections/collection_id/actions.py +++ b/backend/curation/api/v1/curation/collections/collection_id/actions.py @@ -2,6 +2,7 @@ from flask import Response, jsonify, make_response +import backend.common.doi as doi from backend.common.utils.http_exceptions import InvalidParametersHTTPException, MethodNotAllowedException from backend.curation.api.v1.curation.collections.common import ( extract_doi_from_links, @@ -12,7 +13,6 @@ from backend.layers.auth.user_info import UserInfo from backend.layers.business.entities import CollectionMetadataUpdate from backend.layers.business.exceptions import CollectionUpdateException, InvalidMetadataException -from backend.layers.common import doi from backend.layers.common.entities import CollectionId, CollectionLinkType, Link from backend.portal.api.providers import get_business_logic diff --git 
a/backend/curation/api/v1/curation/collections/common.py b/backend/curation/api/v1/curation/collections/common.py index cd35121ceb3e3..f721acf53a7ef 100644 --- a/backend/curation/api/v1/curation/collections/common.py +++ b/backend/curation/api/v1/curation/collections/common.py @@ -3,6 +3,7 @@ from urllib.parse import urlparse from uuid import UUID +from backend.common.citation import format_citation_dp from backend.common.corpora_config import CorporaConfig from backend.common.utils.http_exceptions import ( ForbiddenHTTPException, @@ -294,6 +295,9 @@ def reshape_dataset_for_curation_datasets_index_api( ds["collection_name"] = collection_version.metadata.name doi, _ = extract_doi_from_links(collection_version.metadata.links) ds["collection_doi"] = doi + ds["collection_doi_label"] = ( + format_citation_dp(collection_version.publisher_metadata) if collection_version.publisher_metadata else None + ) # At this point, dataset shape for public requests are complete. if is_dataset_visibility_public: diff --git a/backend/layers/business/business.py b/backend/layers/business/business.py index 3932a4624bd0f..de8b38a97eddc 100644 --- a/backend/layers/business/business.py +++ b/backend/layers/business/business.py @@ -8,7 +8,13 @@ from backend.common.constants import DATA_SUBMISSION_POLICY_VERSION from backend.common.corpora_config import CorporaConfig +from backend.common.doi import doi_curie_from_link from backend.common.feature_flag import FeatureFlagService, FeatureFlagValues +from backend.common.providers.crossref_provider import ( + CrossrefDOINotFoundException, + CrossrefException, + CrossrefProviderInterface, +) from backend.layers.business.business_interface import BusinessLogicInterface from backend.layers.business.entities import ( CollectionMetadataUpdate, @@ -38,7 +44,6 @@ ) from backend.layers.common import validation from backend.layers.common.cleanup import sanitize -from backend.layers.common.doi import doi_curie_from_link from backend.layers.common.entities import ( CanonicalCollection, CollectionId, @@ -74,11 +79,6 @@ from backend.layers.common.regex import S3_URI_REGEX from backend.layers.persistence.persistence_interface import DatabaseProviderInterface from backend.layers.thirdparty.batch_job_provider import BatchJobProviderInterface -from backend.layers.thirdparty.crossref_provider import ( - CrossrefDOINotFoundException, - CrossrefException, - CrossrefProviderInterface, -) from backend.layers.thirdparty.s3_exceptions import S3DeleteException from backend.layers.thirdparty.s3_provider_interface import S3ProviderInterface from backend.layers.thirdparty.step_function_provider import StepFunctionProviderInterface diff --git a/backend/layers/processing/submissions/app.py b/backend/layers/processing/submissions/app.py index 43750d16018bf..205c5545caf84 100644 --- a/backend/layers/processing/submissions/app.py +++ b/backend/layers/processing/submissions/app.py @@ -8,6 +8,7 @@ from pythonjsonlogger import jsonlogger from backend.common.logging_config import DATETIME_FORMAT, LOG_FORMAT +from backend.common.providers.crossref_provider import CrossrefProvider from backend.common.utils.exceptions import ( CorporaException, NonExistentCollectionException, @@ -18,7 +19,6 @@ from backend.layers.business.exceptions import CollectionNotFoundException, DatasetNotFoundException from backend.layers.persistence.persistence import DatabaseProvider from backend.layers.thirdparty.batch_job_provider import BatchJobProvider -from backend.layers.thirdparty.crossref_provider import CrossrefProvider from 
backend.layers.thirdparty.s3_provider import S3Provider from backend.layers.thirdparty.step_function_provider import StepFunctionProvider from backend.layers.thirdparty.uri_provider import UriProvider diff --git a/backend/portal/api/portal-api.yml b/backend/portal/api/portal-api.yml index 0c80e014a4c14..6770fad3ec49d 100644 --- a/backend/portal/api/portal-api.yml +++ b/backend/portal/api/portal-api.yml @@ -445,6 +445,8 @@ paths: type: string publisher_metadata: $ref: "#/components/schemas/publisher_metadata" + summary_citation: + $ref: "#/components/schemas/summary_citation" published_at: type: number revised_at: @@ -490,6 +492,8 @@ paths: type: string publisher_metadata: $ref: "#/components/schemas/publisher_metadata" + summary_citation: + $ref: "#/components/schemas/summary_citation" published_at: type: number nullable: true @@ -1124,6 +1128,8 @@ components: $ref: "#/components/schemas/publisher_metadata" revising_in: $ref: "#/components/schemas/revising_in" + summary_citation: + $ref: "#/components/schemas/summary_citation" datasets: type: array items: @@ -1285,6 +1291,11 @@ components: type: string detail: type: string + summary_citation: + description: Summary citation for the associated DOI + type: string + nullable: true + example: "Ahern et al. (2022) Cell" dataset_identifiers: type: object description: Information for identifying and accessing a datatset diff --git a/backend/portal/api/portal_api.py b/backend/portal/api/portal_api.py index 2478d455a87ba..fe7a621546b72 100644 --- a/backend/portal/api/portal_api.py +++ b/backend/portal/api/portal_api.py @@ -6,6 +6,8 @@ from flask import Response, jsonify, make_response +from backend.common import doi +from backend.common.citation import format_citation_dp from backend.common.constants import DATA_SUBMISSION_POLICY_VERSION from backend.common.utils.http_exceptions import ( ConflictException, @@ -35,7 +37,6 @@ InvalidURIException, MaxFileSizeExceededException, ) -from backend.layers.common import doi from backend.layers.common.entities import ( CollectionId, CollectionMetadata, @@ -290,6 +291,9 @@ def _collection_to_response(collection: CollectionVersionWithDatasets, access_ty "name": collection.metadata.name, "published_at": collection.canonical_collection.originally_published_at, "publisher_metadata": collection.publisher_metadata, # TODO: convert + "summary_citation": ( + format_citation_dp(collection.publisher_metadata) if collection.publisher_metadata else None + ), "revision_of": revision_of, "revising_in": revising_in, "updated_at": collection.published_at or collection.created_at, @@ -430,6 +434,7 @@ def get_collection_index(): transformed_collection["publisher_metadata"] = _publisher_metadata_to_response( collection.publisher_metadata ) + transformed_collection["summary_citation"] = format_citation_dp(collection.publisher_metadata) transformed_collection["published_at"] = collection.canonical_collection.originally_published_at transformed_collection["revised_at"] = collection.published_at @@ -484,6 +489,7 @@ def get_user_collection_index(token_info): transformed_collection["publisher_metadata"] = _publisher_metadata_to_response( collection.publisher_metadata ) + transformed_collection["summary_citation"] = format_citation_dp(collection.publisher_metadata) transformed_collection["consortia"] = collection.metadata.consortia diff --git a/backend/portal/api/providers.py b/backend/portal/api/providers.py index e902b67f2ba5f..08ac477e06f22 100644 --- a/backend/portal/api/providers.py +++ b/backend/portal/api/providers.py @@ -1,8 
+1,8 @@ +from backend.common.providers.crossref_provider import CrossrefProvider from backend.layers.business.business import BusinessLogic from backend.layers.persistence.persistence import DatabaseProvider from backend.layers.thirdparty.batch_job_provider import BatchJobProvider from backend.layers.thirdparty.cloudfront_provider import CloudfrontProvider -from backend.layers.thirdparty.crossref_provider import CrossrefProvider from backend.layers.thirdparty.s3_provider import S3Provider from backend.layers.thirdparty.step_function_provider import StepFunctionProvider from backend.layers.thirdparty.uri_provider import UriProvider diff --git a/frontend/src/common/entities.ts b/frontend/src/common/entities.ts index c89a1aae70cb7..3da5affb2c02a 100644 --- a/frontend/src/common/entities.ts +++ b/frontend/src/common/entities.ts @@ -102,7 +102,7 @@ export interface Collection { updated_at: number; publisher_metadata: PublisherMetadata; revision_diff: boolean; - summaryCitation: string; + summary_citation: string; tombstone?: boolean; revising_in?: Collection["id"]; revision_of?: Collection["id"]; diff --git a/frontend/src/common/queries/collections.ts b/frontend/src/common/queries/collections.ts index 2c6087f07e6ea..8e5a4211de8e1 100644 --- a/frontend/src/common/queries/collections.ts +++ b/frontend/src/common/queries/collections.ts @@ -7,7 +7,6 @@ import { } from "react-query"; import { Collection, COLLECTION_STATUS, Dataset } from "src/common/entities"; import { - buildSummaryCitation, createTaggedTissueOntology, ProcessedCollectionResponse, TissueOntology, @@ -206,11 +205,6 @@ function fetchCollection() { ); } - // Add summary citation to collection. - collection.summaryCitation = buildSummaryCitation( - collection.publisher_metadata - ); - return collection; }; } diff --git a/frontend/src/common/queries/filter.ts b/frontend/src/common/queries/filter.ts index c3ada64b7a1cd..f6c0aa069a7dc 100644 --- a/frontend/src/common/queries/filter.ts +++ b/frontend/src/common/queries/filter.ts @@ -3,10 +3,8 @@ import { QueryStatus, useQuery, UseQueryResult } from "react-query"; import { API } from "src/common/API"; import { ACCESS_TYPE, - Author, Collection, COLLECTION_STATUS, - Consortium, IS_PRIMARY_DATA, Ontology, PublisherMetadata, @@ -84,6 +82,7 @@ export interface CollectionResponse { name: string; published_at: number; publisher_metadata: PublisherMetadata; + summary_citation: string; revised_at: number; } @@ -109,7 +108,7 @@ export type ProcessedCollectionResponse = ( publicationDateValues: number[]; revisedBy?: string; status?: COLLECTION_STATUS[]; - summaryCitation: string; + summary_citation: string; }; /** @@ -538,8 +537,8 @@ function buildDatasetRow( isOverMaxCellCount: checkIsOverMaxCellCount(dataset.cell_count), publicationDateValues, recency, - summaryCitation: - collection?.summaryCitation ?? CATEGORY_VALUE_KEY.NO_PUBLICATION, + summary_citation: + collection?.summary_citation ?? CATEGORY_VALUE_KEY.NO_PUBLICATION, }; return sortCategoryValues(datasetRow); } @@ -556,42 +555,6 @@ function buildConsortia(collection?: ProcessedCollectionResponse): string[] { return [CATEGORY_VALUE_KEY.NO_CONSORTIUM]; } -/** - * Build summary citation format from given publisher metadata: - * Last name of first author (publication year) journal abbreviation such as Ren et al. (2021) Cell. - * @param publisherMetadata - Publication metadata of a collection. 
- */ -export function buildSummaryCitation( - publisherMetadata?: PublisherMetadata -): string { - if (!publisherMetadata) { - return CATEGORY_VALUE_KEY.NO_PUBLICATION; - } - - const citationTokens = []; - - // Add author to citation - family name if first author is a person, name if first author is a consortium. - const { authors, journal, published_year: publishedYear } = publisherMetadata; - const [firstAuthor] = authors ?? []; - if (firstAuthor) { - if (isAuthorPerson(firstAuthor)) { - citationTokens.push(firstAuthor.family); - } else { - citationTokens.push(firstAuthor.name); - } - - if (authors.length > 1) { - citationTokens.push("et al."); - } - } - - // Add year and journal. - citationTokens.push(`(${publishedYear})`); - citationTokens.push(journal); - - return citationTokens.join(" "); -} - /** * Determine date bins that publication date falls within. Date bins are recency-based: * - Published 1 month ago, all date bins are applicable (1 month to 3 years). @@ -929,16 +892,6 @@ function groupDatasetRowsByCollection( }, new Map()); } -/** - * Publication authors can be a person or a consortium; determine if the given author is in fact a person (and not a - * consortium). - * @param author - Person or consortium associated with a publication. - * @returns True if author is a person and not a consortium. - */ -function isAuthorPerson(author: Author | Consortium): author is Author { - return (author as Author).given !== undefined; -} - /** * Returns true if the given self reported ethnicity ontology term contains * mutliple ethnicities. @@ -997,13 +950,15 @@ function processCollectionResponse( ); // Build the summary citation from the collection's publication metadata, if any. - const summaryCitation = buildSummaryCitation(collection?.publisher_metadata); + const summary_citation = collection.summary_citation ?? 
[ + CATEGORY_VALUE_KEY.NO_PUBLICATION, + ]; return { ...collection, consortia, publicationDateValues, - summaryCitation, + summary_citation, }; } diff --git a/frontend/src/components/common/Filter/common/constants.ts b/frontend/src/components/common/Filter/common/constants.ts index 5619c71a9d35b..18176c3862008 100644 --- a/frontend/src/components/common/Filter/common/constants.ts +++ b/frontend/src/components/common/Filter/common/constants.ts @@ -1039,7 +1039,7 @@ const CATEGORY_FILTER_CONFIGS: CategoryFilterConfig[] = [ { analyticsEvent: EVENTS.FILTER_SELECT_PUBLICATION, categoryFilterId: CATEGORY_FILTER_ID.PUBLICATION, - filterOnKey: "summaryCitation", + filterOnKey: "summary_citation", label: "Publication", labelKind: "VALUE", matchKind: "INCLUDES_SOME", diff --git a/frontend/src/components/common/Filter/common/entities.ts b/frontend/src/components/common/Filter/common/entities.ts index 0320d36848d13..1b01105354481 100644 --- a/frontend/src/components/common/Filter/common/entities.ts +++ b/frontend/src/components/common/Filter/common/entities.ts @@ -696,7 +696,7 @@ export enum PUBLICATION_DATE_LABELS { */ export interface PublisherMetadataCategories { publicationDateValues?: number[]; // Set of date bins that publication date falls within - summaryCitation: string; + summary_citation: string; } /** diff --git a/frontend/src/views/Collection/index.tsx b/frontend/src/views/Collection/index.tsx index 4557f55474ec9..65e0e19f0d703 100644 --- a/frontend/src/views/Collection/index.tsx +++ b/frontend/src/views/Collection/index.tsx @@ -108,7 +108,7 @@ const Collection: FC = () => { const collectionConsortia = collection.consortia; const collectionMetadataLinks = buildCollectionMetadataLinks( collection.links, - collection.summaryCitation, + collection.summary_citation, collection.contact_name, collection.contact_email ); diff --git a/frontend/src/views/Collection/utils.ts b/frontend/src/views/Collection/utils.ts index d4d0639d2ab2b..c7e747ef0c8c1 100644 --- a/frontend/src/views/Collection/utils.ts +++ b/frontend/src/views/Collection/utils.ts @@ -36,14 +36,14 @@ const LINK_ORDER: COLLECTION_LINK_TYPE[] = [ /** * Returns collection metadata in preferred order of display. * @param links - links associated with collection. - * @param summaryCitation - Summary citation format of collection publication metadata. + * @param summary_citation - Summary citation format of collection publication metadata. * @param contactName - Name of collection contact. * @param contactEmail - Email of collection contact. * @returns Array of collection metadata in preferred order of display. */ export function buildCollectionMetadataLinks( links: Link[], - summaryCitation: string, + summary_citation: string, contactName?: Collection["contact_name"], contactEmail?: Collection["contact_email"] ): CollectionMetadataLink[] { @@ -56,7 +56,7 @@ export function buildCollectionMetadataLinks( /* If collection has an associated DOI, display either the summary citation or the DOI itself. */ const doiLink = links.find((link: Link) => link.link_type === "DOI"); if (doiLink) { - const doiMetadataLink = buildDoiMetadataLink(doiLink, summaryCitation); + const doiMetadataLink = buildDoiMetadataLink(doiLink, summary_citation); if (doiMetadataLink) { collectionMetadataLinks.push(doiMetadataLink); @@ -97,7 +97,7 @@ export function buildCollectionMetadataLinks( */ function buildDoiMetadataLink( doiLink: Link, - summaryCitation: string + summary_citation: string ): CollectionMetadataLink | undefined { // Build display model of DOI link. 
const doiMetadataLink = buildCollectionMetadataLink(doiLink); @@ -106,7 +106,7 @@ function buildDoiMetadataLink( } // If there's no summary citation for the collection, return the DOI link with the label "DOI". - if (summaryCitation === CATEGORY_VALUE_KEY.NO_PUBLICATION) { + if (summary_citation === CATEGORY_VALUE_KEY.NO_PUBLICATION) { return { ...doiMetadataLink, label: "DOI", @@ -117,7 +117,7 @@ function buildDoiMetadataLink( return { ...doiMetadataLink, label: "Publication", - value: summaryCitation, + value: summary_citation, }; } diff --git a/frontend/src/views/Collections/common/constants.ts b/frontend/src/views/Collections/common/constants.ts index 1eaa0dcf38bbd..a4492105f7b64 100644 --- a/frontend/src/views/Collections/common/constants.ts +++ b/frontend/src/views/Collections/common/constants.ts @@ -34,7 +34,7 @@ export const COLLECTION_REVISED_BY = "revisedBy"; /** * Collection summary citation object key. */ -export const COLLECTION_SUMMARY_CITATION = "summaryCitation"; +export const COLLECTION_SUMMARY_CITATION = "summary_citation"; /** * Collection status object key. diff --git a/frontend/src/views/Collections/index.tsx b/frontend/src/views/Collections/index.tsx index e4013efc797a3..d45e98eaf8dab 100644 --- a/frontend/src/views/Collections/index.tsx +++ b/frontend/src/views/Collections/index.tsx @@ -125,7 +125,7 @@ export default function Collections(): JSX.Element { // Viewable in collections mode, hidden in curator mode. { Cell: ({ row }: RowPropsValue) => { - return
<div>{row.values.summaryCitation}</div>; + return <div>{row.values.summary_citation}</div>
; }, Header: "Publication", accessor: COLLECTION_SUMMARY_CITATION, @@ -222,7 +222,7 @@ export default function Collections(): JSX.Element { }, // Hidden, required for filter. { - accessor: "summaryCitation", + accessor: "summary_citation", filter: "includesSome", id: CATEGORY_FILTER_ID.PUBLICATION, }, diff --git a/frontend/src/views/Datasets/index.tsx b/frontend/src/views/Datasets/index.tsx index feaff08b0bfd2..fbaa85cc589d2 100644 --- a/frontend/src/views/Datasets/index.tsx +++ b/frontend/src/views/Datasets/index.tsx @@ -247,7 +247,7 @@ export default function Datasets(): JSX.Element { }, // Hidden, required for filter. { - accessor: "summaryCitation", + accessor: "summary_citation", filter: "includesSome", id: CATEGORY_FILTER_ID.PUBLICATION, }, diff --git a/frontend/tests/features/filter/filter.test.ts b/frontend/tests/features/filter/filter.test.ts index c03b13ca2e885..273dca6f6a01a 100644 --- a/frontend/tests/features/filter/filter.test.ts +++ b/frontend/tests/features/filter/filter.test.ts @@ -4,9 +4,7 @@ // App dependencies import { expect } from "@playwright/test"; -import { PublisherMetadata } from "src/common/entities"; import { - buildSummaryCitation, calculateMonthsSincePublication, calculateRecency, CollectionResponse, @@ -142,40 +140,6 @@ describe("filter", () => { expect(recency).toEqual(publishedAt); }); }); - describe("Build Summary Citation", () => { - test("builds summary citation for consortium first author", () => { - const consortiumName = "The Tabula Sapiens Consortium"; - const journal = "bioRxiv"; - const year = 2022; - const publisherMetadata = { - authors: [ - { name: consortiumName }, - { family: "Quake", given: "Stephen R" }, - ], - journal: journal, - published_year: year, - } as PublisherMetadata; - const summaryCitation = buildSummaryCitation(publisherMetadata); - expect(summaryCitation).toEqual( - `${consortiumName} et al. (${year}) ${journal}` - ); - }); - test("builds summary citation for person first author", () => { - const family = "Quake"; - const journal = "bioRxiv"; - const year = 2022; - const publisherMetadata = { - authors: [ - { family: family, given: "Stephen R" }, - { name: "The Tabula Sapiens Consortium" }, - ], - journal: journal, - published_year: year, - } as PublisherMetadata; - const summaryCitation = buildSummaryCitation(publisherMetadata); - expect(summaryCitation).toEqual(`${family} et al. (${year}) ${journal}`); - }); - }); describe("Process tissue type", () => { test("handles 3.x.x format", () => { // TODO remove this test with #6266. diff --git a/tests/unit/backend/cellguide/pipeline/fixtures/source_collections.json b/tests/unit/backend/cellguide/pipeline/fixtures/source_collections.json index c4e0e14e6f3b6..2ba82a247e6e4 100644 --- a/tests/unit/backend/cellguide/pipeline/fixtures/source_collections.json +++ b/tests/unit/backend/cellguide/pipeline/fixtures/source_collections.json @@ -1 +1 @@ -{"CL:0000066": [{"collection_name": "HTAN MSK - Single cell profiling reveals novel tumor and myeloid subpopulations in small cell lung cancer", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/62e8f058-9c37-48bc-9200-e767f318a8ec", "publication_url": "10.1016/j.ccell.2021.09.008", "publication_title": "Chan, Joseph M. et al. 
(2021) Cancer Cell", "tissue": [{"label": "pleural effusion", "ontology_term_id": "UBERON:0000175"}], "disease": [{"label": "small cell lung carcinoma", "ontology_term_id": "MONDO:0008433"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}], "CL:0000084": [], "CL:0000097": [], "CL:0000115": [], "CL:0000148": [], "CL:0000169": [], "CL:0000171": [], "CL:0000173": [], "CL:0000187": [], "CL:0000188": [], "CL:0000192": [], "CL:0000235": [], "CL:0000453": [], "CL:0000492": [], "CL:0000499": [], "CL:0000738": [], "CL:0000786": [], "CL:0000787": [], "CL:0000788": [], "CL:0000794": [], "CL:0000814": [], "CL:0000815": [], "CL:0000895": [], "CL:0000897": [], "CL:0000900": [], "CL:0000909": [], "CL:0002064": [], "CL:0002079": [], "CL:0002275": [], "CL:0002394": [], "CL:0002399": [], "CL:0002410": [], "CL:0000000": [{"collection_name": "Spatial multiomics map of trophoblast development in early pregnancy", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/e2c257e7-6f79-487c-b81c-39451cd4ab3c", "publication_url": "10.1038/s41586-023-05869-0", "publication_title": "Arutyunyan, Anna et al. (2023) Nature", "tissue": [{"label": "trophoblast cell (cell culture)", "ontology_term_id": "CL:0000351 (cell culture)"}], "disease": [{"label": "normal", "ontology_term_id": "PATO:0000461"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}, {"collection_name": "HTAN MSK - Single cell profiling reveals novel tumor and myeloid subpopulations in small cell lung cancer", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/62e8f058-9c37-48bc-9200-e767f318a8ec", "publication_url": "10.1016/j.ccell.2021.09.008", "publication_title": "Chan, Joseph M. et al. (2021) Cancer Cell", "tissue": [{"label": "pleural effusion", "ontology_term_id": "UBERON:0000175"}], "disease": [{"label": "small cell lung carcinoma", "ontology_term_id": "MONDO:0008433"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}, {"collection_name": "Massively multiplex chemical transcriptomics at single-cell resolution", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/00109df5-7810-4542-8db5-2288c46e0424", "publication_url": "10.1126/science.aax6234", "publication_title": "Srivatsan, Sanjay R. et al. 
(2020) Science", "tissue": [{"label": "mammary gland epithelial cell (cell culture)", "ontology_term_id": "CL:0002327 (cell culture)"}, {"label": "cultured cell (cell culture)", "ontology_term_id": "CL:0000010 (cell culture)"}], "disease": [{"label": "normal", "ontology_term_id": "PATO:0000461"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}], "CL:0000057": [], "CL:0000255": [], "CL:0000152": [], "CL:0000988": [], "CL:0000763": [], "CL:0000145": [], "CL:0002274": [], "CL:0000151": [], "CL:0002320": [], "CL:0000068": [], "CL:0000069": [], "CL:0002078": [], "CL:0000215": [], "CL:0000213": [], "CL:0000083": [], "CL:0000542": [], "CL:0000945": [], "CL:0000766": [], "CL:0000518": [], "CL:0000234": [], "CL:0000113": [], "CL:0000842": [], "CL:0000325": [], "CL:0000147": [], "CL:0000150": [], "CL:0000154": [], "CL:0000163": [], "CL:0000164": [], "CL:0000167": [], "CL:0000168": [], "CL:0008024": [], "CL:0000170": [], "CL:0002067": [], "CL:0000172": [], "CL:0000502": [], "CL:0000183": [], "CL:0000393": [], "CL:0008000": [], "CL:0008007": [], "CL:0000211": [], "CL:0000219": [], "CL:0002242": [], "CL:0000226": [], "CL:0000473": [], "CL:0000236": [], "CL:0000457": [], "CL:0000451": [], "CL:0000990": [], "CL:0000624": [], "CL:0000622": [], "CL:0000791": [], "CL:0000625": [], "CL:0000696": [], "CL:0000782": [], "CL:0000784": [], "CL:0000785": [], "CL:0001201": [], "CL:0000946": [], "CL:0000789": [], "CL:0002419": [], "CL:0000813": [], "CL:0000898": [], "CL:0001200": [], "CL:1001599": [], "CL:1001433": []} \ No newline at end of file +{"CL:0000066": [{"collection_name": "HTAN MSK - Single cell profiling reveals novel tumor and myeloid subpopulations in small cell lung cancer", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/62e8f058-9c37-48bc-9200-e767f318a8ec", "publication_url": "10.1016/j.ccell.2021.09.008", "publication_title": "Chan et al. (2021) Cancer Cell", "tissue": [{"label": "pleural effusion", "ontology_term_id": "UBERON:0000175"}], "disease": [{"label": "small cell lung carcinoma", "ontology_term_id": "MONDO:0008433"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}], "CL:0000084": [], "CL:0000097": [], "CL:0000115": [], "CL:0000148": [], "CL:0000169": [], "CL:0000171": [], "CL:0000173": [], "CL:0000187": [], "CL:0000188": [], "CL:0000192": [], "CL:0000235": [], "CL:0000453": [], "CL:0000492": [], "CL:0000499": [], "CL:0000738": [], "CL:0000786": [], "CL:0000787": [], "CL:0000788": [], "CL:0000794": [], "CL:0000814": [], "CL:0000815": [], "CL:0000895": [], "CL:0000897": [], "CL:0000900": [], "CL:0000909": [], "CL:0002064": [], "CL:0002079": [], "CL:0002275": [], "CL:0002394": [], "CL:0002399": [], "CL:0002410": [], "CL:0000000": [{"collection_name": "Spatial multiomics map of trophoblast development in early pregnancy", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/e2c257e7-6f79-487c-b81c-39451cd4ab3c", "publication_url": "10.1038/s41586-023-05869-0", "publication_title": "Arutyunyan et al. 
(2023) Nature", "tissue": [{"label": "trophoblast cell (cell culture)", "ontology_term_id": "CL:0000351 (cell culture)"}], "disease": [{"label": "normal", "ontology_term_id": "PATO:0000461"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}, {"collection_name": "HTAN MSK - Single cell profiling reveals novel tumor and myeloid subpopulations in small cell lung cancer", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/62e8f058-9c37-48bc-9200-e767f318a8ec", "publication_url": "10.1016/j.ccell.2021.09.008", "publication_title": "Chan et al. (2021) Cancer Cell", "tissue": [{"label": "pleural effusion", "ontology_term_id": "UBERON:0000175"}], "disease": [{"label": "small cell lung carcinoma", "ontology_term_id": "MONDO:0008433"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}, {"collection_name": "Massively multiplex chemical transcriptomics at single-cell resolution", "collection_url": "https://cellxgene.staging.single-cell.czi.technology/collections/00109df5-7810-4542-8db5-2288c46e0424", "publication_url": "10.1126/science.aax6234", "publication_title": "Srivatsan et al. (2020) Science", "tissue": [{"label": "mammary gland epithelial cell (cell culture)", "ontology_term_id": "CL:0002327 (cell culture)"}, {"label": "cultured cell (cell culture)", "ontology_term_id": "CL:0000010 (cell culture)"}], "disease": [{"label": "normal", "ontology_term_id": "PATO:0000461"}], "organism": [{"label": "Homo sapiens", "ontology_term_id": "NCBITaxon:9606"}]}], "CL:0000057": [], "CL:0000255": [], "CL:0000152": [], "CL:0000988": [], "CL:0000763": [], "CL:0000145": [], "CL:0002274": [], "CL:0000151": [], "CL:0002320": [], "CL:0000068": [], "CL:0000069": [], "CL:0002078": [], "CL:0000215": [], "CL:0000213": [], "CL:0000083": [], "CL:0000542": [], "CL:0000945": [], "CL:0000766": [], "CL:0000518": [], "CL:0000234": [], "CL:0000113": [], "CL:0000842": [], "CL:0000325": [], "CL:0000147": [], "CL:0000150": [], "CL:0000154": [], "CL:0000163": [], "CL:0000164": [], "CL:0000167": [], "CL:0000168": [], "CL:0008024": [], "CL:0000170": [], "CL:0002067": [], "CL:0000172": [], "CL:0000502": [], "CL:0000183": [], "CL:0000393": [], "CL:0008000": [], "CL:0008007": [], "CL:0000211": [], "CL:0000219": [], "CL:0002242": [], "CL:0000226": [], "CL:0000473": [], "CL:0000236": [], "CL:0000457": [], "CL:0000451": [], "CL:0000990": [], "CL:0000624": [], "CL:0000622": [], "CL:0000791": [], "CL:0000625": [], "CL:0000696": [], "CL:0000782": [], "CL:0000784": [], "CL:0000785": [], "CL:0001201": [], "CL:0000946": [], "CL:0000789": [], "CL:0002419": [], "CL:0000813": [], "CL:0000898": [], "CL:0001200": [], "CL:1001599": [], "CL:1001433": []} \ No newline at end of file diff --git a/tests/unit/backend/cellguide/pipeline/test_canonical_marker_genes.py b/tests/unit/backend/cellguide/pipeline/test_canonical_marker_genes.py index c8fcedd585913..daf09bd7a74dc 100644 --- a/tests/unit/backend/cellguide/pipeline/test_canonical_marker_genes.py +++ b/tests/unit/backend/cellguide/pipeline/test_canonical_marker_genes.py @@ -1,14 +1,8 @@ import json import unittest -from unittest.mock import Mock, patch +from unittest.mock import patch from backend.cellguide.pipeline.canonical_marker_genes.canonical_markers import CanonicalMarkerGenesCompiler -from backend.cellguide.pipeline.canonical_marker_genes.utils import ( - clean_doi, - format_citation_crossref, - format_citation_dp, - get_title_and_citation_from_doi, -) from backend.cellguide.pipeline.utils import 
convert_dataclass_to_dict_and_strip_nones from tests.test_utils import compare_dicts from tests.test_utils.mocks import mock_get_asctb_master_sheet, mock_get_title_and_citation_from_doi @@ -41,10 +35,11 @@ def test__canonical_marker_genes(self): new=mock_get_asctb_master_sheet, ), patch( - "backend.cellguide.pipeline.canonical_marker_genes.canonical_markers.get_title_and_citation_from_doi", - new=mock_get_title_and_citation_from_doi, - ), + "backend.cellguide.pipeline.canonical_marker_genes.canonical_markers.CrossrefProvider", + ) as MockCrossrefProvider, ): + mock_instance = MockCrossrefProvider.return_value + mock_instance.get_title_and_citation_from_doi.side_effect = mock_get_title_and_citation_from_doi marker_gene_compiler = CanonicalMarkerGenesCompiler( wmg_tissues=wmg_tissues, wmg_human_genes=wmg_human_genes ) @@ -53,61 +48,3 @@ def test__canonical_marker_genes(self): marker_gene_compiler.get_processed_asctb_table_entries() ) self.assertTrue(compare_dicts(canonical_marker_genes, expected__canonical_marker_genes)) - - -class CanonicalMarkerGeneCompilerUtilsTests(unittest.TestCase): - def test__clean_doi(self): - test_cases = [ - ("10.1016/j.cell.2019.11.025", "10.1016/j.cell.2019.11.025"), - ("DOI: 10.1016/j.cell.2019.11.025.", "10.1016/j.cell.2019.11.025"), - (" DOI: 10.1016/j.cell.2019.11.025 ", "10.1016/j.cell.2019.11.025"), - ("10.1016/j.cell.2019.11.025. ", "10.1016/j.cell.2019.11.025"), - ("", ""), - ] - - for doi, expected in test_cases: - with self.subTest(doi=doi): - self.assertEqual(clean_doi(doi), expected) - - @patch("backend.cellguide.pipeline.canonical_marker_genes.utils._get_response_from_url") - def test__get_title_and_citation_from_doi(self, mock_get): - mock_response = Mock() - mock_response.status_code = 200 - mock_response.json.return_value = { - "message": { - "title": ["Test Title"], - "author": [{"family": "Doe", "given": "John"}], - "container-title": ["Test Journal"], - "created": {"date-parts": [[2022]]}, - } - } - mock_get.return_value = mock_response - - result = get_title_and_citation_from_doi("10.1016/j.cell.2019.11.025") - self.assertEqual(result, "Test Title\n\n - Doe, John et al. (2022) Test Journal") - - def test__format_citation(self): - message = { - "author": [{"family": "Doe", "given": "John"}], - "container-title": ["Test Journal"], - "created": {"date-parts": [[2022]]}, - } - result = format_citation_crossref(message) - self.assertEqual(result, "Doe, John et al. (2022) Test Journal") - - message = { - "authors": [ - {"family": "Gabitto", "given": "Mariano I."}, - {"family": "Travaglini", "given": "Kyle J."}, - {"family": "Rachleff", "given": "Victoria M."}, - {"family": "Kaplan", "given": "Eitan S."}, - ], - "is_preprint": True, - "journal": "bioRxiv", - "published_at": 1683590400.0, - "published_day": 9, - "published_month": 5, - "published_year": 2023, - } - result = format_citation_dp(message) - self.assertEqual(result, "Gabitto, Mariano I. et al. 
(2023) bioRxiv") diff --git a/tests/unit/backend/common/test_citation.py b/tests/unit/backend/common/test_citation.py new file mode 100644 index 0000000000000..b54f63b299f39 --- /dev/null +++ b/tests/unit/backend/common/test_citation.py @@ -0,0 +1,31 @@ +import unittest + +from backend.common.citation import format_citation_crossref, format_citation_dp + + +class TestCitation(unittest.TestCase): + def test__format_citation(self): + message = { + "author": [{"family": "Doe", "given": "John"}], + "container-title": ["Test Journal"], + "created": {"date-parts": [[2022]]}, + } + result = format_citation_crossref(message) + self.assertEqual(result, "Doe (2022) Test Journal") + + message = { + "authors": [ + {"family": "Gabitto", "given": "Mariano I."}, + {"family": "Travaglini", "given": "Kyle J."}, + {"family": "Rachleff", "given": "Victoria M."}, + {"family": "Kaplan", "given": "Eitan S."}, + ], + "is_preprint": True, + "journal": "bioRxiv", + "published_at": 1683590400.0, + "published_day": 9, + "published_month": 5, + "published_year": 2023, + } + result = format_citation_dp(message) + self.assertEqual(result, "Gabitto et al. (2023) bioRxiv") diff --git a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py b/tests/unit/backend/common/test_crossref_provider.py similarity index 85% rename from tests/unit/backend/layers/thirdparty/test_crossref_provider.py rename to tests/unit/backend/common/test_crossref_provider.py index 1fbcb8986e27a..e8432a1d04002 100644 --- a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py +++ b/tests/unit/backend/common/test_crossref_provider.py @@ -1,12 +1,12 @@ import copy import json import unittest -from unittest.mock import patch +from unittest.mock import Mock, patch from requests import RequestException from requests.models import HTTPError, Response -from backend.layers.thirdparty.crossref_provider import ( +from backend.common.providers.crossref_provider import ( CrossrefDOINotFoundException, CrossrefException, CrossrefFetchException, @@ -16,15 +16,15 @@ class TestCrossrefProvider(unittest.TestCase): - @patch("backend.layers.thirdparty.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.requests.get") def test__provider_does_not_call_crossref_in_test(self, mock_get): provider = CrossrefProvider() metadata, doi = provider.fetch_metadata("test_doi") self.assertIsNone(metadata) mock_get.assert_not_called() - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get): # Defining a mocked CorporaConfig will allow the provider to consider the `crossref_api_key` # not None, so it will go ahead and do the mocked call. 
@@ -73,8 +73,8 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get self.assertDictEqual(expected_response, res) - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__published_doi_used_if_exists_for_preprint(self, mock_config, mock_get): # Defining a mocked CorporaConfig will allow the provider to consider the `crossref_api_key` @@ -172,8 +172,8 @@ def make_response(content): self.assertDictEqual(expected_response, res) - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_get): response = Response() response.status_code = 200 @@ -243,8 +243,8 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge self.assertDictEqual(expected_response, res) - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_unescapes_journal_correctly(self, mock_config, mock_get): response = Response() response.status_code = 200 @@ -282,8 +282,8 @@ def test__provider_unescapes_journal_correctly(self, mock_config, mock_get): self.assertDictEqual(expected_response, author_data) - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_throws_exception_if_request_fails(self, mock_config, mock_get): """ Asserts a CrossrefFetchException if the GET request fails for any reason @@ -299,8 +299,8 @@ def test__provider_throws_exception_if_request_fails(self, mock_config, mock_get with self.assertRaises(CrossrefException): provider.fetch_metadata("test_doi") - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_throws_exception_if_request_fails_with_404(self, mock_config, mock_get): """ Asserts a CrossrefFetchException if the GET request fails for any reason @@ -314,8 +314,8 @@ def test__provider_throws_exception_if_request_fails_with_404(self, mock_config, with self.assertRaises(CrossrefDOINotFoundException): provider.fetch_metadata("test_doi") - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_throws_exception_if_request_fails_with_non_2xx_code(self, mock_config, mock_get): """ Asserts a CrossrefFetchException if the GET request return a 500 error (any non 2xx will work) @@ -330,8 +330,8 @@ def 
test__provider_throws_exception_if_request_fails_with_non_2xx_code(self, moc with self.assertRaises(CrossrefFetchException): provider.fetch_metadata("test_doi") - @patch("backend.layers.thirdparty.crossref_provider.requests.get") - @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + @patch("backend.common.providers.crossref_provider.requests.get") + @patch("backend.common.providers.crossref_provider.CorporaConfig") def test__provider_throws_exception_if_request_cannot_be_parsed(self, mock_config, mock_get): """ Asserts an CrossrefParseException if the GET request succeeds but cannot be parsed @@ -352,6 +352,25 @@ def test__provider_throws_exception_if_request_cannot_be_parsed(self, mock_confi with self.assertRaises(CrossrefException): provider.fetch_metadata("test_doi") + @patch("backend.common.providers.crossref_provider.requests.get") + def test__get_title_and_citation_from_doi(self, mock_get): + provider = CrossrefProvider() + provider.crossref_api_key = "test_key" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "message": { + "title": ["Test Title"], + "author": [{"family": "Doe", "given": "John"}], + "container-title": ["Test Journal"], + "created": {"date-parts": [[2022]]}, + } + } + mock_get.return_value = mock_response + + result = provider.get_title_and_citation_from_doi("10.1016/j.cell.2019.11.025") + self.assertEqual(result, "Test Title\n\n - Doe (2022) Test Journal") + if __name__ == "__main__": unittest.main() diff --git a/tests/unit/backend/common/test_doi.py b/tests/unit/backend/common/test_doi.py new file mode 100644 index 0000000000000..d98400e601657 --- /dev/null +++ b/tests/unit/backend/common/test_doi.py @@ -0,0 +1,18 @@ +import unittest + +from backend.common.doi import clean_doi + + +class TestDoi(unittest.TestCase): + def test__clean_doi(self): + test_cases = [ + ("10.1016/j.cell.2019.11.025", "10.1016/j.cell.2019.11.025"), + ("DOI: 10.1016/j.cell.2019.11.025.", "10.1016/j.cell.2019.11.025"), + (" DOI: 10.1016/j.cell.2019.11.025 ", "10.1016/j.cell.2019.11.025"), + ("10.1016/j.cell.2019.11.025. 
", "10.1016/j.cell.2019.11.025"), + ("", ""), + ] + + for doi, expected in test_cases: + with self.subTest(doi=doi): + self.assertEqual(clean_doi(doi), expected) diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py index cff55805528c9..5cf7b564ea138 100644 --- a/tests/unit/backend/layers/api/test_curation_api.py +++ b/tests/unit/backend/layers/api/test_curation_api.py @@ -6,6 +6,7 @@ from dataclasses import asdict from unittest.mock import Mock, patch +from backend.common.providers.crossref_provider import CrossrefDOINotFoundException from backend.common.utils.api_key import generate from backend.curation.api.v1.curation.collections.common import EntityColumns from backend.layers.common.entities import ( @@ -23,7 +24,6 @@ SpatialMetadata, Visibility, ) -from backend.layers.thirdparty.crossref_provider import CrossrefDOINotFoundException from tests.unit.backend.layers.api.test_portal_api import generate_mock_publisher_metadata from tests.unit.backend.layers.common.base_api_test import BaseAPIPortalTest from tests.unit.backend.layers.common.base_test import DatasetArtifactUpdate, DatasetStatusUpdate @@ -1806,7 +1806,7 @@ def test_get_all_datasets_200(self): ), ) self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + return_value=(generate_mock_publisher_metadata(journal_override="Science"), "78.91011/j.celrep") ) published_collection_2 = self.generate_published_collection( owner="other owner", @@ -1834,7 +1834,9 @@ def test_get_all_datasets_200(self): with self.subTest("With no credentials"): self.assertEqual(3, len(response.json)) - with self.subTest("Contains collection_id, collection_version_id, collection_name and collection_doi"): + with self.subTest( + "Contains collection_id, collection_version_id, collection_name, collection_doi, and collection_doi_label" + ): collection_ids = {published_collection_1.collection_id.id, published_collection_2.collection_id.id} collection__version_ids = { published_collection_1.version_id.id, @@ -1842,21 +1844,25 @@ def test_get_all_datasets_200(self): } collection_names = {published_collection_1.metadata.name, published_collection_2.metadata.name} expected_collection_dois = {"12.3456/j.celrep", "78.91011/j.celrep"} + expected_collection_doi_labels = {"Doe et al. (2021) Nature", "Doe et al. 
(2021) Science"} received_collection_ids = set() received_collection_version_ids = set() received_collection_names = set() received_collection_dois = set() + received_collection_doi_labels = set() for dataset in response.json: received_collection_ids.add(dataset["collection_id"]) received_collection_version_ids.add(dataset["collection_version_id"]) received_collection_names.add(dataset["collection_name"]) received_collection_dois.add(dataset["collection_doi"]) + received_collection_doi_labels.add(dataset["collection_doi_label"]) self.assertEqual(collection_ids, received_collection_ids) self.assertEqual(collection__version_ids, received_collection_version_ids) self.assertEqual(collection_names, received_collection_names) self.assertEqual(expected_collection_dois, received_collection_dois) + self.assertEqual(expected_collection_doi_labels, received_collection_doi_labels) self.generate_dataset( collection_version=revision, diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py index fb52219a9fbe0..4fc9078cef781 100644 --- a/tests/unit/backend/layers/api/test_portal_api.py +++ b/tests/unit/backend/layers/api/test_portal_api.py @@ -9,6 +9,7 @@ from furl import furl from backend.common.constants import DATA_SUBMISSION_POLICY_VERSION +from backend.common.providers.crossref_provider import CrossrefDOINotFoundException, CrossrefFetchException from backend.layers.business.entities import DatasetArtifactDownloadData from backend.layers.common.entities import ( CollectionId, @@ -22,7 +23,6 @@ OntologyTermId, TissueOntologyTermId, ) -from backend.layers.thirdparty.crossref_provider import CrossrefDOINotFoundException, CrossrefFetchException from backend.layers.thirdparty.uri_provider import FileInfo, FileInfoException from tests.unit.backend.layers.api.fixture import generate_mock_publisher_metadata from tests.unit.backend.layers.common.base_api_test import BaseAPIPortalTest @@ -693,6 +693,7 @@ def test__can_retrieve_created_collection(self): self.assertEqual(body["contact_name"], data["contact_name"]) self.assertEqual(body["contact_email"], data["contact_email"]) self.assertEqual(body["consortia"], []) + self.assertEqual(body["summary_citation"], "Doe et al. (2021) Nature") # test that non owners only have read access no_cookie_headers = {"host": "localhost", "Content-Type": "application/json"} @@ -814,8 +815,8 @@ def test__get_all_collections_for_index(self): """ The `collections/index` endpoint should only return public collections """ - - collection = self.generate_published_collection() + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "123")) + collection = self.generate_published_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) collection_to_tombstone = self.generate_published_collection() private_collection = self.generate_unpublished_collection() @@ -842,6 +843,9 @@ def test__get_all_collections_for_index(self): # Both `published_at` and `revised_at` should point to the same timestamp self.assertEqual(actual_collection["published_at"], collection.published_at.timestamp()) self.assertEqual(actual_collection["revised_at"], collection.published_at.timestamp()) + # assert publisher metadata /summary citation are included + self.assertDictEqual(actual_collection["publisher_metadata"], collection.publisher_metadata) + self.assertEqual(actual_collection["summary_citation"], "Doe et al. 
(2021) Nature") # test that the owner of a private collection can not see the private collection headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py index 7db3c48768d14..faa1c59e42a79 100644 --- a/tests/unit/backend/layers/business/test_business.py +++ b/tests/unit/backend/layers/business/test_business.py @@ -9,6 +9,11 @@ from backend.common.constants import DATA_SUBMISSION_POLICY_VERSION from backend.common.corpora_config import CorporaConfig +from backend.common.providers.crossref_provider import ( + CrossrefDOINotFoundException, + CrossrefException, + CrossrefProviderInterface, +) from backend.layers.business.business import ( BusinessLogic, CollectionMetadataUpdate, @@ -52,11 +57,6 @@ from backend.layers.persistence.persistence import DatabaseProvider from backend.layers.persistence.persistence_mock import DatabaseProviderMock from backend.layers.thirdparty.batch_job_provider import BatchJobProviderInterface -from backend.layers.thirdparty.crossref_provider import ( - CrossrefDOINotFoundException, - CrossrefException, - CrossrefProviderInterface, -) from backend.layers.thirdparty.s3_exceptions import S3DeleteException from backend.layers.thirdparty.s3_provider_mock import MockS3Provider from backend.layers.thirdparty.step_function_provider import StepFunctionProviderInterface diff --git a/tests/unit/backend/layers/common/base_test.py b/tests/unit/backend/layers/common/base_test.py index ff310812428e2..026dd9467f21d 100644 --- a/tests/unit/backend/layers/common/base_test.py +++ b/tests/unit/backend/layers/common/base_test.py @@ -7,6 +7,7 @@ from unittest.mock import Mock from backend.common.corpora_config import CorporaConfig +from backend.common.providers.crossref_provider import CrossrefProviderInterface from backend.layers.business.business import BusinessLogic from backend.layers.common.entities import ( CollectionId, @@ -26,7 +27,6 @@ from backend.layers.persistence.persistence import DatabaseProvider from backend.layers.persistence.persistence_mock import DatabaseProviderMock from backend.layers.thirdparty.batch_job_provider import BatchJobProviderInterface -from backend.layers.thirdparty.crossref_provider import CrossrefProviderInterface from backend.layers.thirdparty.s3_provider_interface import S3ProviderInterface from backend.layers.thirdparty.step_function_provider import StepFunctionProviderInterface from backend.layers.thirdparty.uri_provider import FileInfo, UriProviderInterface From 09f9d4a10b1ab1c6e70426654865e927191ab158 Mon Sep 17 00:00:00 2001 From: atarashansky Date: Mon, 10 Jun 2024 10:58:33 -0700 Subject: [PATCH 3/8] chore: fix weird _get_stats bug (#7181) --- backend/wmg/pipeline/pipeline.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/backend/wmg/pipeline/pipeline.py b/backend/wmg/pipeline/pipeline.py index 4fb675e5acd2c..b0a82c6a4f4c0 100644 --- a/backend/wmg/pipeline/pipeline.py +++ b/backend/wmg/pipeline/pipeline.py @@ -29,7 +29,6 @@ PRIMARY_FILTER_DIMENSIONS_CREATED_FLAG, ) from backend.wmg.pipeline.dataset_metadata import create_dataset_metadata -from backend.wmg.pipeline.errors import PipelineStepMissing from backend.wmg.pipeline.expression_summary_and_cell_counts import create_expression_summary_and_cell_counts_cubes from backend.wmg.pipeline.expression_summary_and_cell_counts_diffexp import ( 
create_expression_summary_and_cell_counts_diffexp_cubes, @@ -93,7 +92,13 @@ def run_pipeline(corpus_path: Optional[str] = None, skip_validation: bool = Fals snapshot_id=snapshot_id, is_snapshot_validation_successful=is_valid, ) - stats = _get_stats(corpus_path) + # get dataset count + with tiledb.open(os.path.join(corpus_path, CELL_COUNTS_CUBE_NAME)) as cc_cube: + cell_counts_df = cc_cube.df[:] + dataset_count = len(cell_counts_df["dataset_id"].unique()) + cell_count = int(cell_counts_df["n_cells"].sum()) + + stats = {"dataset_count": dataset_count, "cell_count": cell_count} if is_valid: logger.info(f"Updated latest_snapshot_identifier in s3. Current snapshot location: {cube_data_s3_path}") @@ -131,17 +136,3 @@ def main(): if __name__ == "__main__": main() sys.exit() - - -def _get_stats(corpus_path: str) -> dict[str, int]: - pipeline_state = load_pipeline_state(corpus_path) - if not pipeline_state.get(EXPRESSION_SUMMARY_AND_CELL_COUNTS_CUBE_CREATED_FLAG): - raise PipelineStepMissing("cell_counts") - - # get dataset count - with tiledb.open(os.path.join(corpus_path, CELL_COUNTS_CUBE_NAME)) as cc_cube: - cell_counts_df = cc_cube.df[:] - dataset_count = len(cell_counts_df["dataset_id"].unique()) - cell_count = int(cell_counts_df["n_cells"].sum()) - - return {"dataset_count": dataset_count, "cell_count": cell_count} From fc18558e9306267651e6ed532e8a4c956d253fb8 Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:40:37 -0700 Subject: [PATCH 4/8] chore: remove unused commands from cxg_admin (#7163) --- scripts/cxg_admin.py | 102 --------------------------- scripts/cxg_admin_scripts/migrate.py | 0 2 files changed, 102 deletions(-) delete mode 100644 scripts/cxg_admin_scripts/migrate.py diff --git a/scripts/cxg_admin.py b/scripts/cxg_admin.py index 2e3cd4dc27bbf..fe5219ef6a951 100755 --- a/scripts/cxg_admin.py +++ b/scripts/cxg_admin.py @@ -24,7 +24,6 @@ from scripts.cxg_admin_scripts import ( dataset_details, deletions, - migrate, reprocess_datafile, schema_migration, tombstones, @@ -230,66 +229,6 @@ def refresh_preprint_doi(ctx): updates.refresh_preprint_doi(ctx) -# Commands to migrate the data, typically one off scripts run to populate db for existing rows after adding a new field - - -@cli.command() -@click.pass_context -def create_cxg_artifacts(ctx): - """ - Create cxg artifacts for all datasets in the database based on their explorer_url - DO NOT run/use once dataset updates have shipped -- the s3 location will no longer be - based on the explorer_url in all cases. - To run - ./scripts/cxg_admin.py --deployment prod create-cxg-artifacts - """ - migrate.create_cxg_artifacts(ctx) - - -@cli.command() -@click.pass_context -def migrate_schema_version(ctx): - """ - Populates `schema_version` for each existing dataset. Since the schema version only exists - in the cxg file and we don't want to open them, we will call the cellxgene explorer endpoint - which contains the version. This is a one-off procedure since new datasets will have - the version already set. - """ - - migrate.migrate_schema_version(ctx) - - -@cli.command() -@click.pass_context -def migrate_published_at(ctx): - """ - Populates `published_at` for each existing collection and dataset. This is a - one-off procedure since published_at will be set for collections and new - datasets when they are first published. 
- """ - migrate.migrate_published_at(ctx) - - -@cli.command() -@click.pass_context -def populate_revised_at(ctx): - """ - Populates `revised_at` for each existing collection and dataset with the - current datetime (UTC). This is a one-off procedure since revised_at will - be set for collections and datasets when they are updated. - """ - migrate.populate_revised_at(ctx) - - -@cli.command() -@click.pass_context -def backfill_processing_status_for_datasets(ctx): - """ - Backfills the `dataset_processing_status` table for datasets that do not have a matching record. - """ - migrate.backfill_processing_status_for_datasets(ctx) - - # Commands to reprocess dataset artifacts (seurat or cxg) @@ -339,47 +278,6 @@ def get_public_datasets(ctx): print(json.dumps(published_datasets, indent=2)) -@cli.command() -@click.pass_context -def migrate_redesign_read(ctx): - """ - Dumps the existing database to a set of .json files that match to the new schema. Files will be in the migration - folder - ./scripts/cxg_admin.py --deployment dev migrate-redesign-read - """ - migrate.migrate_redesign_read(ctx) - - -@cli.command() -@click.pass_context -def migrate_redesign_write(ctx): - """ - Reads the files generated by `migrate-redesign-read` and saves them to the desired environment - ./scripts/cxg_admin.py --deployment dev migrate-redesign-write - """ - migrate.migrate_redesign_write(ctx) - - -@cli.command() -@click.pass_context -def migrate_redesign_debug(ctx): - """ - Used to debug the schema migration process. To be removed. - ./scripts/cxg_admin.py --deployment dev migrate-redesign-debug - """ - migrate.migrate_redesign_debug(ctx) - - -@cli.command() -@click.pass_context -def migrate_redesign_correct_published_at(ctx): - """ - Used to debug the schema migration process. To be removed. - ./scripts/cxg_admin.py --deployment dev migrate-redesign-debug - """ - migrate.migrate_redesign_correct_published_at(ctx) - - @cli.command() @click.pass_context @click.argument("report_patj", type=click.Path(exists=True)) diff --git a/scripts/cxg_admin_scripts/migrate.py b/scripts/cxg_admin_scripts/migrate.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From e7cb2176fbb24dbb7b6b49448aadcb110cf9a36e Mon Sep 17 00:00:00 2001 From: Mim Hastie Date: Wed, 12 Jun 2024 07:26:36 -0700 Subject: [PATCH 5/8] feat: 5.1.0 references (#7024) Co-authored-by: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> --- backend/curation/api/curation-api.yml | 48 +++++++++---------- .../032__Contribute and Publish Data.mdx | 2 +- .../4_2_2__Cell Type and Gene Ordering.mdx | 2 +- .../components/CopyCaption/index.tsx | 4 +- tests/unit/processing/test_handle_error.py | 2 +- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/backend/curation/api/curation-api.yml b/backend/curation/api/curation-api.yml index 2d4578ddbe668..ad618ee536285 100644 --- a/backend/curation/api/curation-api.yml +++ b/backend/curation/api/curation-api.yml @@ -725,7 +725,7 @@ components: batch_condition: description: | These keys define the batches that a normalization or integration algorithm should be aware of. 
- [batch condition schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#batch_condition) + [batch condition schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#batch_condition) type: array items: @@ -737,7 +737,7 @@ components: Citation that includes downloadable permalink to h5ad artifact for this dataset, a permalink to collection it belongs to in CZ CELLxGENE Discover, and--if applicable--the Publication DOI associated with the dataset. See details about the exact format in the - [schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#citation) + [schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#citation) type: string nullable: true collection_list: @@ -1008,7 +1008,7 @@ components: type: integer cell_type: description: | - [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#cell_type) + [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#cell_type) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1023,7 +1023,7 @@ components: $ref: "#/components/schemas/default_embedding" development_stage: description: | - [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#development_stage) + [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#development_stage) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1050,7 +1050,7 @@ components: type: number title: description: | - [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#title) + [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#title) nullable: true type: string organism: @@ -1077,14 +1077,14 @@ components: type: string self_reported_ethnicity: description: | - [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#self_reported_ethnicity) + [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#self_reported_ethnicity) default: [] items: $ref: "#/components/schemas/ontology_element" type: array sex: description: | - [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#sex) + [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#sex) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1116,7 +1116,7 @@ components: type: integer cell_type: description: | - [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#cell_type) + [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#cell_type) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1139,7 +1139,7 @@ components: $ref: "#/components/schemas/dataset_version_id" development_stage: description: | - [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#development_stage) + [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#development_stage) default: [] items: $ref: 
"#/components/schemas/ontology_element" @@ -1158,7 +1158,7 @@ components: type: number title: description: | - [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#title) + [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#title) nullable: true type: string organism: @@ -1186,14 +1186,14 @@ components: type: string self_reported_ethnicity: description: | - [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#self_reported_ethnicity) + [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#self_reported_ethnicity) default: [] items: $ref: "#/components/schemas/ontology_element" type: array sex: description: | - [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#sex) + [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#sex) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1225,7 +1225,7 @@ components: type: integer cell_type: description: | - [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#cell_type) + [cell type label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#cell_type) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1247,7 +1247,7 @@ components: $ref: "#/components/schemas/default_embedding" development_stage: description: | - [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#development_stage) + [development stage label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#development_stage) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1272,7 +1272,7 @@ components: type: number title: description: | - [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#title) + [title](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#title) nullable: true type: string organism: @@ -1289,14 +1289,14 @@ components: type: string self_reported_ethnicity: description: | - [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#self_reported_ethnicity) + [self reported ethnicity label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#self_reported_ethnicity) default: [] items: $ref: "#/components/schemas/ontology_element" type: array sex: description: | - [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#sex) + [sex label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#sex) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1312,21 +1312,21 @@ components: type: object dataset_assay: description: | - [assay label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#assay) + [assay label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#assay) default: [] items: $ref: "#/components/schemas/ontology_element" type: array dataset_disease: description: | - [disease label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#disease) + [disease 
label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#disease) default: [] items: $ref: "#/components/schemas/ontology_element" type: array dataset_organism: description: | - [organism label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#organism) + [organism label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism) default: [] items: $ref: "#/components/schemas/ontology_element" @@ -1339,7 +1339,7 @@ components: type: string dataset_tissue: description: | - [tissue label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#tissue) + [tissue label](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#tissue) default: [] items: allOf: @@ -1400,7 +1400,7 @@ components: CELLxGENE Discover runs a heuristic to detect the approximate distribution of the data in X so that it can accurately calculate statistical properties of the data. This field enables the curator to override this heuristic and - specify the data distribution explicitly. [x_approximate_distribution](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#x_approximate_distribution) + specify the data distribution explicitly. [x_approximate_distribution](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x_approximate_distribution) enum: - COUNT - NORMAL @@ -1461,7 +1461,7 @@ components: nullable: true is_primary_data: description: | - [is_primary_data](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#is_primary_data) + [is_primary_data](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#is_primary_data) Describes whether cellular observations for this Dataset are all canonical (True), all non-canonical (False), or contain a mixture (True, False). @@ -1596,7 +1596,7 @@ components: type: boolean suspension_type: description: | - [suspension_type](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#suspension_type) + [suspension_type](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#suspension_type) List of unique suspension types represented in the dataset, corresponding to dataset's assay(s). Possible item values are 'nucleus', 'cell', and/or 'na'. diff --git a/frontend/doc-site/032__Contribute and Publish Data.mdx b/frontend/doc-site/032__Contribute and Publish Data.mdx index f4ce48e23829f..acbcd47395f0e 100644 --- a/frontend/doc-site/032__Contribute and Publish Data.mdx +++ b/frontend/doc-site/032__Contribute and Publish Data.mdx @@ -77,7 +77,7 @@ Each dataset needs the following information added to a single h5ad (AnnData 0.8 - tissue_ontology_term_id: [UBERON](https://www.ebi.ac.uk/ols/ontologies/uberon) - cell_type_ontology_term_id: [CL](https://www.ebi.ac.uk/ols/ontologies/cl) - assay_ontology_term_id: [EFO](https://www.ebi.ac.uk/ols/ontologies/efo) - - suspension_type: `cell`, `nucleus`, or `na`, as corresponding to assay. Use [this table](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#suspension_type) defined in the data schema for guidance. If the assay does not appear in this table, the most appropriate value MUST be selected and the [curation team informed](mailto:cellxgene@chanzuckerberg.com) during submission so that the assay can be added to the table. 
+ - suspension_type: `cell`, `nucleus`, or `na`, as corresponding to assay. Use [this table](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#suspension_type) defined in the data schema for guidance. If the assay does not appear in this table, the most appropriate value MUST be selected and the [curation team informed](mailto:cellxgene@chanzuckerberg.com) during submission so that the assay can be added to the table. - **Embeddings in obsm**: - One or more two-dimensional embeddings, prefixed with 'X\_' - **Features in var & raw.var (if present)**: diff --git a/frontend/doc-site/04__Analyze Public Data/4_2__Gene Expression Documentation/4_2_2__Cell Type and Gene Ordering.mdx b/frontend/doc-site/04__Analyze Public Data/4_2__Gene Expression Documentation/4_2_2__Cell Type and Gene Ordering.mdx index a9c3a8983b268..c7b9565738a63 100644 --- a/frontend/doc-site/04__Analyze Public Data/4_2__Gene Expression Documentation/4_2_2__Cell Type and Gene Ordering.mdx +++ b/frontend/doc-site/04__Analyze Public Data/4_2__Gene Expression Documentation/4_2_2__Cell Type and Gene Ordering.mdx @@ -4,7 +4,7 @@ Cell types and genes shown in a dot plot can be arranged in different ways as de # Cell Type Ordering -Cell types are annotated by the original data contributors and mapped to the closest cell ontology (CL) term as defined in the [data schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#cell_type_ontology_term_id). +Cell types are annotated by the original data contributors and mapped to the closest cell ontology (CL) term as defined in the [data schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#cell_type_ontology_term_id). In some cases there are cell types annotated with a high-level term whereas in some other cases they can be very granularly annotated. For example, there are some cells annotated as "T cells" and others annotated with children terms like "effector CD8-positive, alpha-beta T cell". All of these cell types are shown in the dot plot and they should not be interpreted as one being a subset of the other. 
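The 5.0.0-to-5.1.0 permalink bump in this patch is uniform across the YAML, MDX, and TSX sources. A bump like this can be applied and then double-checked with a short throwaway script along these lines; the extension list is illustrative, not the exact set of files this patch touches.

    import pathlib

    OLD, NEW = "schema/5.0.0/", "schema/5.1.0/"

    for path in pathlib.Path(".").rglob("*"):
        # Only rewrite the text formats that carry schema permalinks.
        if not (path.is_file() and path.suffix in {".yml", ".mdx", ".tsx", ".py"}):
            continue
        text = path.read_text(encoding="utf-8")
        if OLD in text:
            path.write_text(text.replace(OLD, NEW), encoding="utf-8")

A final grep for "schema/5.0.0" across backend, frontend, and tests then confirms nothing was missed (note that bare version strings such as schema_version="5.0.0" need a separate pass).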
diff --git a/frontend/src/components/Collections/components/Dataset/components/DownloadDataset/components/Content/components/DownloadLink/components/CopyCaption/index.tsx b/frontend/src/components/Collections/components/Dataset/components/DownloadDataset/components/Content/components/DownloadLink/components/CopyCaption/index.tsx index 78596a0679024..838c64830ad38 100644 --- a/frontend/src/components/Collections/components/Dataset/components/DownloadDataset/components/Content/components/DownloadLink/components/CopyCaption/index.tsx +++ b/frontend/src/components/Collections/components/Dataset/components/DownloadDataset/components/Content/components/DownloadLink/components/CopyCaption/index.tsx @@ -4,9 +4,9 @@ import { Link } from "@czi-sds/components"; const DISCOVER_API_URL = "https://api.cellxgene.cziscience.com/curation/ui/#/"; const SCHEMA_URL = - "https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md"; + "https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md"; const SEURAT_SCHEMA_URL = - "https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/seurat_encoding.md"; + "https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/seurat_encoding.md"; interface Props { selectedFormat: DATASET_ASSET_FORMAT | ""; diff --git a/tests/unit/processing/test_handle_error.py b/tests/unit/processing/test_handle_error.py index e8a1b98fbc7d1..6b18d35fbf4da 100644 --- a/tests/unit/processing/test_handle_error.py +++ b/tests/unit/processing/test_handle_error.py @@ -89,7 +89,7 @@ def get_collection_version_mock(): publisher_metadata={}, published_at=None, created_at=datetime.datetime.now(), - schema_version="5.0.0", + schema_version="5.1.0", canonical_collection=CanonicalCollection( id=CollectionId("collection123"), version_id=None, From 2d7588455c4eb1cd00545eff6f796dc03e430859 Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Wed, 12 Jun 2024 16:34:21 -0400 Subject: [PATCH 6/8] fix: accept NoneType do_not_publish_list (#7185) --- backend/layers/processing/publish_revisions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/layers/processing/publish_revisions.py b/backend/layers/processing/publish_revisions.py index a3168f6729245..0e49ab1fb58b0 100644 --- a/backend/layers/processing/publish_revisions.py +++ b/backend/layers/processing/publish_revisions.py @@ -105,5 +105,7 @@ def run(self): S3Provider(), UriProvider(), ) - do_not_publish_list = os.environ.get("DO_NOT_PUBLISH_LIST", None).split(",") + do_not_publish_list = os.environ.get("DO_NOT_PUBLISH_LIST", None) + if do_not_publish_list is not None: + do_not_publish_list = do_not_publish_list.split(",") PublishRevisions(business_logic, do_not_publish_list).run() From 314a060b00957df96c242c1517ae9a12278381bf Mon Sep 17 00:00:00 2001 From: Mim Hastie Date: Wed, 12 Jun 2024 14:59:13 -0700 Subject: [PATCH 7/8] feat: update publisher metadata on publish revision (#7108) --- backend/common/providers/crossref_provider.py | 29 +- backend/layers/business/business.py | 75 +++- backend/layers/business/business_interface.py | 2 +- .../backend/common/test_crossref_provider.py | 18 +- .../backend/layers/api/test_curation_api.py | 50 ++- .../backend/layers/api/test_portal_api.py | 58 ++- .../backend/layers/business/test_business.py | 347 +++++++++++++++++- .../unit/processing/test_process_validate.py | 2 +- 8 files changed, 511 insertions(+), 70 deletions(-) diff --git 
a/backend/common/providers/crossref_provider.py b/backend/common/providers/crossref_provider.py
index 5e9d778cb4f7e..79c356729290c 100644
--- a/backend/common/providers/crossref_provider.py
+++ b/backend/common/providers/crossref_provider.py
@@ -11,8 +11,8 @@ class CrossrefProviderInterface:
-    def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
-        return None, None
+    def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str], Optional[float]]:
+        return None, None, None

     def fetch_preprint_published_doi(self, doi):
         pass

@@ -111,7 +111,7 @@ def get_title_and_citation_from_doi(self, doi: str) -> str:
             return doi
         return f"{title}\n\n - {citation}"

-    def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
+    def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str], Optional[float]]:
         """
         Fetches and extracts publisher metadata from Crossref for a specified DOI.
         If the Crossref API URI isn't in the configuration, we will just return an empty object.
@@ -119,11 +119,12 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
         :param doi: str - DOI uri link or curie identifier
         return: tuple - publisher metadata dict and DOI curie identifier
         """
+
         doi_curie = doi_curie_from_link(doi)
         res = self._fetch_crossref_payload(doi_curie)
         if not res:
-            return None, None
+            return None, None, None

         try:
             message = res.json()["message"]
@@ -138,11 +139,10 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:

             published_year, published_month, published_day = self.parse_date_parts(published_date)

-            dates = []
-            for k, v in message.items():
-                if isinstance(v, dict) and "date-parts" in v:
-                    dt = v["date-parts"][0]
-                    dates.append(f"{k}: {dt}")
+            # Calculate the deposited date; used when checking for updates.
+            deposited_at = None
+            if "deposited" in message and (deposited_timestamp := message["deposited"].get("timestamp")) is not None:
+                deposited_at = deposited_timestamp / 1000

             # Journal
             try:
@@ -179,9 +179,9 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
             # Preprint
             is_preprint = message.get("subtype") == "preprint"
             if is_preprint:
-                published_metadata, published_doi_curie = self.fetch_published_metadata(message)
+                published_metadata, published_doi_curie, published_deposited_at = self.fetch_published_metadata(message)
                 if published_metadata and published_doi_curie:  # if not, use preprint doi curie
-                    return published_metadata, published_doi_curie
+                    return published_metadata, published_doi_curie, published_deposited_at

             return (
                 {
@@ -194,11 +194,14 @@ def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
                     "is_preprint": is_preprint,
                 },
                 doi_curie,
+                deposited_at,
             )
         except Exception as e:
             raise CrossrefParseException("Cannot parse metadata from Crossref") from e

-    def fetch_published_metadata(self, doi_response_message: dict) -> Tuple[Optional[dict], Optional[str]]:
+    def fetch_published_metadata(
+        self, doi_response_message: dict
+    ) -> Tuple[Optional[dict], Optional[str], Optional[float]]:
         try:
             published_doi = doi_response_message["relation"]["is-preprint-of"]  # the new DOI to query for ...
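With this change, every fetch_metadata caller receives a third element: Crossref's "deposited" timestamp, converted from milliseconds to epoch seconds. A minimal consumer sketch (the DOI is borrowed from the tests in this series; the variable names are illustrative):

    from datetime import datetime

    from backend.common.providers.crossref_provider import CrossrefProvider

    provider = CrossrefProvider()
    metadata, doi_curie, deposited_at = provider.fetch_metadata("10.1016/j.cell.2019.11.025")
    if deposited_at is not None:
        # Epoch seconds convert directly to a datetime, which is what the
        # publish-time freshness check in business.py compares against.
        last_deposited = datetime.fromtimestamp(deposited_at)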
@@ -206,4 +209,4 @@ def fetch_published_metadata(self, doi_response_message: dict) -> Tuple[Optional if entity["id-type"] == "doi": return self.fetch_metadata(entity["id"]) except Exception: # if fetch of published doi errors out, just use preprint doi - return None, None + return None, None, None diff --git a/backend/layers/business/business.py b/backend/layers/business/business.py index de8b38a97eddc..32784b481cb35 100644 --- a/backend/layers/business/business.py +++ b/backend/layers/business/business.py @@ -166,7 +166,7 @@ def trigger_dataset_artifact_update( current_dataset_version_id, new_dataset_version_id, metadata_update ) - def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]: + def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str], Optional[float]]: """ Retrieves publisher metadata from Crossref. """ @@ -176,7 +176,7 @@ def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict errors.append({"link_type": CollectionLinkType.DOI, "reason": "DOI cannot be found on Crossref"}) except CrossrefException as e: logging.warning(f"CrossrefException on create_collection: {e}. Will ignore metadata.") - return None, None + return None, None, None def create_collection( self, owner: str, curator_name: str, collection_metadata: CollectionMetadata @@ -197,7 +197,7 @@ def create_collection( publisher_metadata = None if doi_link: - publisher_metadata, doi_curie_from_crossref = self._get_publisher_metadata(doi_link.uri, errors) + publisher_metadata, doi_curie_from_crossref, _ = self._get_publisher_metadata(doi_link.uri, errors) # Ensure DOI link has correct hyperlink formation a la https://doi.org/{curie_identifier} # DOI returned from Crossref may be a different (published) DOI altogether if submitted DOI is preprint doi_link.uri = f"https://doi.org/{doi_curie_from_crossref}" @@ -404,7 +404,7 @@ def update_collection_version( unset_publisher_metadata = True elif new_doi and new_doi != current_doi: # If the DOI has changed, fetch and update the metadata - publisher_metadata_to_set, doi_curie_from_crossref = self._get_publisher_metadata(new_doi, errors) + publisher_metadata_to_set, doi_curie_from_crossref, _ = self._get_publisher_metadata(new_doi, errors) new_doi = f"https://doi.org/{doi_curie_from_crossref}" next((link for link in body.links if link.type == "DOI")).uri = new_doi # noqa - DOI link exists @@ -962,7 +962,8 @@ def publish_collection_version( has_dataset_revisions = False # if collection is a revision and has no changes to previous version's datasets--don't update 'revised_at' # used for cases where revision only contains collection-level metadata changes - if version.canonical_collection.version_id is not None: + is_revision = version.canonical_collection.version_id is not None + if is_revision: date_of_last_publish = ( version.canonical_collection.revised_at or version.canonical_collection.originally_published_at ) @@ -972,6 +973,19 @@ def publish_collection_version( if canonical_datasets != version_datasets: has_dataset_revisions = True + # Check Crossref for updates in publisher metadata since last publish of revision, or since private collection was created. + # Raise exception if DOI has moved from pre-print to published, forcing curators to re-publish the collection once corresponding + # artifacts update is complete. 
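# Standalone illustration of the guard introduced just below (the DOIs are
# made up): publish fails fast when the DOI stored on the collection no
# longer matches what Crossref reports, e.g. a preprint that has since been
# published in a journal.
current_doi = "https://doi.org/10.1101/2023.05.01.123456"  # preprint DOI on the collection
crossref_doi_curie = "10.1016/j.cell.2024.01.001"  # curie Crossref now returns
doi_moved = f"https://doi.org/{crossref_doi_curie}" != current_doi
assert doi_moved  # -> CollectionPublishException; curator retries once artifact updates finish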
+        last_action_at = date_of_last_publish if is_revision else version.created_at
+        doi_update = self._update_crossref_metadata(version, last_action_at)
+        if doi_update:
+            raise CollectionPublishException(
+                [
+                    f"DOI was updated from {doi_update[0]} to {doi_update[1]} requiring updates to corresponding artifacts. "
+                    "Retry publish once artifact updates are complete."
+                ]
+            )
+
         # Finalize Collection publication and delete any tombstoned assets
         dataset_version_ids_to_delete_from_s3 = self.database_provider.finalize_collection_version(
             version.collection_id,
@@ -1087,6 +1101,57 @@ def delete_artifacts(self, artifacts: List[DatasetArtifact]) -> None:
         except S3DeleteException as e:
             raise CollectionDeleteException("Attempt to delete public Datasets failed") from e

+    def _update_crossref_metadata(
+        self, version: CollectionVersion, last_action_at: datetime
+    ) -> Optional[Tuple[str, str]]:
+        """
+        Call Crossref for the latest publisher metadata and:
+        - if a DOI has moved from pre-print to published, trigger update to collection (and artifacts), otherwise,
+        - if Crossref has been updated since last publish of revision or since private collection was created, update collection
+        version publisher metadata.
+
+        :param version: The collection version (either a revision or a private collection) to check publisher updates for.
+        :param last_action_at: The originally published at or revised at date of revision, or the created at date of a private collection.
+        :return: Tuple of current DOI and DOI returned from Crossref if DOI has changed, otherwise None.
+        """
+        # Get the DOI from the collection version metadata; exit if no DOI.
+        links = version.metadata.links
+        link_doi = next((link for link in links if link.type == "DOI"), None)
+        if not link_doi:
+            return
+
+        # Fetch the latest publisher metadata from Crossref. Ignore errors, and exit if no metadata is found.
+        publisher_metadata, crossref_doi_curie, deposited_at_timestamp = self._get_publisher_metadata(link_doi.uri, [])
+        if not publisher_metadata or not crossref_doi_curie:
+            return
+
+        # Handle change in publisher metadata from pre-print to published.
+        crossref_doi = f"https://doi.org/{crossref_doi_curie}"
+        if crossref_doi != link_doi.uri:
+
+            # Set the DOI in the collection version metadata links to be the returned DOI and update collection
+            # version (subsequently triggering update of artifacts).
+            updated_links = [Link(link.name, link.type, crossref_doi) if link.type == "DOI" else link for link in links]
+            update = CollectionMetadataUpdate(
+                name=None,
+                description=None,
+                contact_name=None,
+                contact_email=None,
+                links=updated_links,
+                consortia=None,
+            )
+            self.update_collection_version(version.version_id, update, True)
+
+            # Return the existing DOI and the DOI returned from Crossref.
+            return link_doi.uri, crossref_doi
+
+        # Otherwise, DOI is unchanged: check if there are updates in Crossref that have occurred since last publish of
+        # revision or since private collection was created, and update collection metadata if so.
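# Standalone illustration of the freshness check that follows (the
# timestamps are made up): Crossref re-deposited the record in May 2024,
# after the collection's last publish, so the stored publisher metadata is
# refreshed in place via save_collection_publisher_metadata.
from datetime import datetime

deposited_at_timestamp = 1716932866.440  # epoch seconds, as returned by fetch_metadata
last_action_at = datetime(2024, 1, 15)  # revised_at / originally_published_at / created_at
assert datetime.fromtimestamp(deposited_at_timestamp) > last_action_at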
+ if deposited_at_timestamp and datetime.fromtimestamp(deposited_at_timestamp) > last_action_at: + self.database_provider.save_collection_publisher_metadata(version.version_id, publisher_metadata) + + return None + def _get_collection_and_dataset( self, collection_id: str, dataset_id: str ) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]: diff --git a/backend/layers/business/business_interface.py b/backend/layers/business/business_interface.py index f78e537bd1c15..4031793b9b1a7 100644 --- a/backend/layers/business/business_interface.py +++ b/backend/layers/business/business_interface.py @@ -62,7 +62,7 @@ def get_collection_version_from_canonical( ) -> Optional[CollectionVersionWithDatasets]: pass - def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]: + def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str], Optional[float]]: pass def create_collection( diff --git a/tests/unit/backend/common/test_crossref_provider.py b/tests/unit/backend/common/test_crossref_provider.py index e8432a1d04002..20506dca8a111 100644 --- a/tests/unit/backend/common/test_crossref_provider.py +++ b/tests/unit/backend/common/test_crossref_provider.py @@ -19,7 +19,7 @@ class TestCrossrefProvider(unittest.TestCase): @patch("backend.common.providers.crossref_provider.requests.get") def test__provider_does_not_call_crossref_in_test(self, mock_get): provider = CrossrefProvider() - metadata, doi = provider.fetch_metadata("test_doi") + metadata, doi, _ = provider.fetch_metadata("test_doi") self.assertIsNone(metadata) mock_get.assert_not_called() @@ -57,7 +57,7 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get mock_get.return_value = response provider = CrossrefProvider() - res, doi_curie_from_crossref = provider.fetch_metadata("test_doi") + res, doi_curie_from_crossref, _ = provider.fetch_metadata("test_doi") self.assertEqual("test_doi", doi_curie_from_crossref) mock_get.assert_called_once() @@ -101,6 +101,7 @@ def make_response(content): "sequence": "additional", }, ], + "deposited": {"timestamp": 1716932866440}, "published-online": {"date-parts": [[2021, 11, 10]]}, "container-title": ["Nature"], "subtype": "preprint", @@ -129,7 +130,7 @@ def make_response(content): responses = [response_published, response_preprint] mock_get.side_effect = lambda *x, **y: responses.pop() - res, doi_curie_from_crossref = provider.fetch_metadata("preprint_doi") + res, doi_curie_from_crossref, _ = provider.fetch_metadata("preprint_doi") self.assertEqual("published_doi", doi_curie_from_crossref) expected_response = { "authors": [{"given": "Jonathan", "family": "Doe"}, {"given": "Jane", "family": "Doe"}], @@ -147,7 +148,7 @@ def make_response(content): self.subTest("Preprint DOI is used when published is referenced but cannot be retrieved") and patch.object(provider, "fetch_published_metadata") as fetch_published_metadata_mock ): - fetch_published_metadata_mock.return_value = (None, None) + fetch_published_metadata_mock.return_value = (None, None, None) preprint_body = copy.deepcopy(body) response_preprint = make_response(preprint_body) @@ -158,7 +159,7 @@ def make_response(content): responses = [response_published, response_preprint] mock_get.side_effect = lambda *x, **y: responses.pop() - res, doi_curie_from_crossref = provider.fetch_metadata("preprint_doi") + res, doi_curie_from_crossref, _ = provider.fetch_metadata("preprint_doi") self.assertEqual("preprint_doi", doi_curie_from_crossref) expected_response = { 
"authors": [{"given": "John", "family": "Doe"}, {"given": "Jane", "family": "Doe"}], @@ -177,6 +178,7 @@ def make_response(content): def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_get): response = Response() response.status_code = 200 + deposited_timestamp = 17169328664 response._content = str.encode( json.dumps( { @@ -212,6 +214,7 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge "name": "Bat consortium", }, ], + "deposited": {"timestamp": deposited_timestamp}, "published-online": {"date-parts": [[2021, 11]]}, "container-title": ["Nature"], }, @@ -221,7 +224,7 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge mock_get.return_value = response provider = CrossrefProvider() - res, _ = provider.fetch_metadata("test_doi") + res, _, deposited_at = provider.fetch_metadata("test_doi") mock_get.assert_called_once() expected_response = { @@ -242,6 +245,7 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge } self.assertDictEqual(expected_response, res) + self.assertEqual(deposited_timestamp / 1000, deposited_at) @patch("backend.common.providers.crossref_provider.requests.get") @patch("backend.common.providers.crossref_provider.CorporaConfig") @@ -265,7 +269,7 @@ def test__provider_unescapes_journal_correctly(self, mock_config, mock_get): mock_get.return_value = response provider = CrossrefProvider() - author_data, _ = provider.fetch_metadata("test_doi") + author_data, _, _ = provider.fetch_metadata("test_doi") mock_get.assert_called_once() expected_response = { diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py index 5cf7b564ea138..aa3929ddb575b 100644 --- a/tests/unit/backend/layers/api/test_curation_api.py +++ b/tests/unit/backend/layers/api/test_curation_api.py @@ -1300,12 +1300,16 @@ def test__update_collection__doi__OK(self): {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] new_doi = "10.1016" # a real DOI (CURIE reference) - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.2020", 17169328.664) + ) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_doi, original_collection["doi"]) metadata = {"doi": new_doi} - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) response = self.app.patch( f"/curation/v1/collections/{collection_id}", json=metadata, @@ -1320,7 +1324,9 @@ def test__update_collection__consortia__OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.2020", 17169328.664) + ) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, 
original_collection["consortia"]) @@ -1339,7 +1345,9 @@ def test__remove_collection__consortia__OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.2020", 17169328.664) + ) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, original_collection["consortia"]) @@ -1358,7 +1366,9 @@ def test__update_public_collection_verify_fix_consortia_sort_order_OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.2020", 17169328.664) + ) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, original_collection["consortia"]) @@ -1376,7 +1386,7 @@ def test__update_collection__doi_is_not_CURIE_reference__BAD_REQUEST(self): {"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"}, ] self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "10.1011/something") + return_value=(generate_mock_publisher_metadata(), "10.1011/something", 17169328.664) ) collection = self.generate_collection(links=links, visibility="PRIVATE") collection_id = collection.collection_id @@ -1397,7 +1407,9 @@ def test__update_collection__links_None_does_not_remove_publisher_metadata(self) {"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"}, ] mock_publisher_metadata = generate_mock_publisher_metadata() - self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(mock_publisher_metadata, "10.1011/something", 17169328.664) + ) collection = self.generate_collection(links=links, visibility="PRIVATE") collection_id = collection.collection_id @@ -1424,7 +1436,9 @@ def test__update_collection__doi_does_not_exist__BAD_REQUEST(self): {"link_name": "new link", "link_type": "RAW_DATA", "link_url": "http://brand_new_link.place"}, ] mock_publisher_metadata = generate_mock_publisher_metadata() - self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(mock_publisher_metadata, "10.1011/something", 17169328.664) + ) collection = self.generate_collection(links=links, visibility="PRIVATE") self.assertIsNotNone(collection.publisher_metadata) @@ -1791,9 +1805,8 @@ def test_get_dataset_no_assets(self): self.assertEqual([], body["assets"]) def test_get_all_datasets_200(self): - self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep") - ) + crossref_return_value_1 = (generate_mock_publisher_metadata(), "12.3456/j.celrep", 17169328.664) + self.crossref_provider.fetch_metadata = Mock(return_value=crossref_return_value_1) published_collection_1 = 
self.generate_published_collection( add_datasets=2, metadata=CollectionMetadata( @@ -1806,7 +1819,11 @@ def test_get_all_datasets_200(self): ), ) self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(journal_override="Science"), "78.91011/j.celrep") + return_value=( + generate_mock_publisher_metadata(journal_override="Science"), + "78.91011/j.celrep", + 17169328.664, + ) ) published_collection_2 = self.generate_published_collection( owner="other owner", @@ -1864,6 +1881,9 @@ def test_get_all_datasets_200(self): self.assertEqual(expected_collection_dois, received_collection_dois) self.assertEqual(expected_collection_doi_labels, received_collection_doi_labels) + # Mock Crossref return value for revision. + self.crossref_provider.fetch_metadata = Mock(return_value=crossref_return_value_1) + self.generate_dataset( collection_version=revision, replace_dataset_version_id=revision.datasets[0].version_id, @@ -2165,7 +2185,7 @@ def _fetch_datasets(self, visibility=None, headers=None, schema_version=None, st def test_get_datasets_by_schema_200(self): self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep") + return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep", 17169328.664) ) published_collection_1 = self.generate_published_collection( add_datasets=2, @@ -2179,7 +2199,7 @@ def test_get_datasets_by_schema_200(self): ), ) self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep", 17169328.664) ) published_collection_2 = self.generate_published_collection( owner="other owner", @@ -2196,7 +2216,7 @@ def test_get_datasets_by_schema_200(self): dataset_schema_version="3.1.0", ) self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep", 17169328.664) ) published_collection_3 = self.generate_published_collection( owner="other owner", diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py index 4fc9078cef781..13bfb091f9f1a 100644 --- a/tests/unit/backend/layers/api/test_portal_api.py +++ b/tests/unit/backend/layers/api/test_portal_api.py @@ -359,7 +359,9 @@ def test__post_collection_returns_id_on_success(self): "links": [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], "consortia": ["Consortia 1"], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -418,7 +420,9 @@ def test__post_collection_sorts_consortia(self): ], "consortia": ["Consortia 3", "Consortia 1"], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016/foo")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016/foo", 17169328.664) + ) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -444,7 +448,9 @@ def test__post_collection_normalizes_doi(self): ], "consortia": ["Consortia 1"], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016/foo")) 
+ self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016/foo", 17169328.664) + ) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -592,7 +598,9 @@ def test__post_collection_adds_publisher_metadata(self): "links": [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], "consortia": ["Consortia 1"], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -677,7 +685,9 @@ def test__can_retrieve_created_collection(self): {"link_name": "DOI Link 2", "link_url": "http://doi.org/10.1017", "link_type": "DOI"}, ], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1017")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1017", 17169328.664) + ) response = self.app.post(test_url.url, headers=headers, data=json.dumps(data)) self.assertEqual(201, response.status_code) collection_id = json.loads(response.data)["collection_id"] @@ -729,7 +739,9 @@ def test__create_collection_strip_string_fields(self): ], "consortia": ["Consortia 1 "], } - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1017")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1017", 17169328.664) + ) response = self.app.post(test_url.url, headers=headers, data=json.dumps(data)) self.assertEqual(201, response.status_code) collection_id = json.loads(response.data)["collection_id"] @@ -815,7 +827,9 @@ def test__get_all_collections_for_index(self): """ The `collections/index` endpoint should only return public collections """ - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "123")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "123", 17169328.664) + ) collection = self.generate_published_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) collection_to_tombstone = self.generate_published_collection() private_collection = self.generate_unpublished_collection() @@ -1182,7 +1196,9 @@ def test__update_collection__OK(self): "consortia", ] headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) # Update the collection expected_body = { @@ -1206,7 +1222,9 @@ def test__update_collection__OK(self): def test__update_collection_strip_string_fields__OK(self): collection = self.generate_unpublished_collection() headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) # Update the collection new_body = { @@ -1231,7 +1249,9 @@ def test__update_collection_strip_string_fields__OK(self): ) def 
test__update_collection_partial__OK(self): - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "123")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "123", 17169328.664) + ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} @@ -1370,7 +1390,9 @@ def test__update_with_invalid_collection_consortia(self): def test__update_collection_links__OK(self): collection = self.generate_unpublished_collection() headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} - self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664) + ) # add links links = [{"link_name": "DOI Link", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}] @@ -1415,7 +1437,7 @@ def test__update_collection_links__OK(self): def test__update_collection_new_doi_updates_metadata(self): # Generate a collection with "Old Journal" as publisher metadata self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + return_value=(generate_mock_publisher_metadata("Old Journal"), "123", 17169328.664) ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) @@ -1428,7 +1450,7 @@ def test__update_collection_new_doi_updates_metadata(self): headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata("New Journal"), "10.1234/5678") + return_value=(generate_mock_publisher_metadata("New Journal"), "10.1234/5678", 17169328.664) ) response = self.app.put( f"/dp/v1/collections/{collection.version_id}", @@ -1448,7 +1470,7 @@ def test__update_collection_new_doi_updates_metadata(self): def test__update_collection_remove_doi_deletes_metadata(self): # Generate a collection with "Old Journal" as publisher metadata self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + return_value=(generate_mock_publisher_metadata("Old Journal"), "123", 17169328.664) ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "https://doi.org/123")]) @@ -1459,7 +1481,7 @@ def test__update_collection_remove_doi_deletes_metadata(self): # From now on, Crossref will return `New Journal` self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata("This will"), "not be called") + return_value=(generate_mock_publisher_metadata("This will"), "not be called", 17169328.664) ) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} @@ -1481,7 +1503,7 @@ def test__update_collection_remove_doi_deletes_metadata(self): # ✅ def test__update_collection_same_doi_does_not_update_metadata(self): self.crossref_provider.fetch_metadata = Mock( - return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + return_value=(generate_mock_publisher_metadata("Old Journal"), "123", 17169328.664) ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) @@ -2439,7 
+2461,9 @@ def test__start_revision_of_a_collection_w_links__201(self):
             Link("Link 1", "OTHER", "http://link.good"),
             Link("DOI Link", "DOI", "http://doi.org/10.1016"),
         ]
-        self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016"))
+        self.crossref_provider.fetch_metadata = Mock(
+            return_value=(generate_mock_publisher_metadata(), "10.1016", 17169328.664)
+        )
         published_collection = self.generate_published_collection(links=links, add_datasets=2)
 
         # Starts a revision
diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py
index faa1c59e42a79..6dcc250b77ebd 100644
--- a/tests/unit/backend/layers/business/test_business.py
+++ b/tests/unit/backend/layers/business/test_business.py
@@ -194,15 +194,21 @@ def tearDownClass(cls) -> None:
         os.unsetenv("DATASETS_BUCKET")
 
     def initialize_empty_unpublished_collection(
-        self, owner: str = test_user_name, curator_name: str = test_curator_name
+        self, owner: str = test_user_name, curator_name: str = test_curator_name, links: List[Link] = None
     ) -> CollectionVersion:
         """
         Initializes an unpublished collection to be used for testing, with no datasets
         """
+        if links:
+            metadata = deepcopy(self.sample_collection_metadata)
+            metadata.links = links
+        else:
+            metadata = self.sample_collection_metadata
+
         version = self.database_provider.create_canonical_collection(
             owner,
             curator_name,
-            self.sample_collection_metadata,
+            metadata,
         )
         return version
 
@@ -212,13 +218,14 @@ def initialize_unpublished_collection(
         self,
         owner: str = test_user_name,
         curator_name: str = test_curator_name,
         complete_dataset_ingestion: bool = True,
         num_datasets: int = 2,
+        links: List[Link] = None,
     ) -> CollectionVersionWithDatasets:
         """
         Initializes an unpublished collection to be used for testing, with two datasets.
         By default also completes dataset ingestion (normally, a process that would be done asynchronously).
         Pass `complete_dataset_ingestion=False` if you want to initialize datasets only.
""" - version = self.initialize_empty_unpublished_collection(owner, curator_name) + version = self.initialize_empty_unpublished_collection(owner, curator_name, links) for _ in range(num_datasets): dataset_version = self.database_provider.create_canonical_dataset( version.version_id, @@ -237,11 +244,12 @@ def initialize_published_collection( curator_name: str = test_curator_name, published_at: datetime = None, num_datasets: int = 2, + links: List[Link] = None, ) -> CollectionVersionWithDatasets: """ Initializes a published collection to be used for testing, with a single dataset """ - version = self.initialize_unpublished_collection(owner, curator_name, num_datasets=num_datasets) + version = self.initialize_unpublished_collection(owner, curator_name, num_datasets=num_datasets, links=links) # published_at must be later than created_at for constituent Dataset Versions if published_at: assert published_at >= datetime.utcnow() @@ -278,9 +286,10 @@ def initialize_collection_with_an_unpublished_revision( curator_name: str = test_curator_name, published_at: datetime = None, num_datasets: int = 2, + links: List[Link] = None, ) -> Tuple[CollectionVersionWithDatasets, CollectionVersionWithDatasets]: # Published with an unpublished revision - published_version = self.initialize_published_collection(owner, curator_name, published_at, num_datasets) + published_version = self.initialize_published_collection(owner, curator_name, published_at, num_datasets, links) revision = self.business_logic.create_collection_version(published_version.collection_id) return ( self.database_provider.get_collection_version_with_datasets(published_version.version_id), @@ -372,7 +381,9 @@ def test_create_collection_with_valid_doi_ok(self): self.sample_collection_metadata.links = links_with_doi expected_publisher_metadata = {"authors": ["Test Author"]} - self.crossref_provider.fetch_metadata = Mock(return_value=(expected_publisher_metadata, "good/doi")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, "good/doi", 17169328.664) + ) collection = self.business_logic.create_collection( test_user_name, test_curator_name, self.sample_collection_metadata @@ -647,7 +658,9 @@ def test_update_collection_same_doi(self): metadata.links = links expected_publisher_metadata = {"authors": ["Test Author"]} - self.crossref_provider.fetch_metadata = Mock(return_value=(expected_publisher_metadata, "test/doi")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, "test/doi", 17169328.664) + ) # We need to call `business_logic.create_collection` so that the publisher metadata is populated version = self.business_logic.create_collection(test_user_name, test_curator_name, metadata) @@ -677,7 +690,9 @@ def test_update_collection_change_doi(self): links = [Link("test doi", "DOI", "http://doi.org/test.doi")] metadata.links = links - self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["Test Author"]}, "test.doi")) + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, "test.doi", 17169328.664) + ) # We need to call `business_logic.create_collection` so that the publisher metadata is populated version = self.business_logic.create_collection(test_user_name, test_curator_name, metadata) @@ -693,7 +708,7 @@ def test_update_collection_change_doi(self): consortia=None, ) - expected_updated_publisher_metadata = ({"authors": ["New Test Author"]}, "new.test.doi") + expected_updated_publisher_metadata = ({"authors": ["New 
Test Author"]}, "new.test.doi", 17169328.664) self.crossref_provider.fetch_metadata = Mock(return_value=expected_updated_publisher_metadata) self.batch_job_provider.start_metadata_update_batch_job = Mock() self.business_logic.update_collection_version(version.version_id, body) @@ -720,7 +735,9 @@ def test_update_collection_change_doi__trigger_dataset_artifact_updates(self): ) self.batch_job_provider.start_metadata_update_batch_job = Mock() self.business_logic.generate_dataset_citation = Mock(return_value="test citation") - self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["New Test Author"]}, "new.test.doi")) + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["New Test Author"]}, "new.test.doi", 17169328.664) + ) self.business_logic.update_collection_version(revision.version_id, body) assert self.batch_job_provider.start_metadata_update_batch_job.call_count == 2 @@ -749,7 +766,9 @@ def test_update_published_collection_fail__dataset_status_pending(self): links=[Link("new test doi", "DOI", "http://doi.org/new.test.doi")], consortia=None, ) - self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["New Test Author"]}, "new.test.doi")) + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["New Test Author"]}, "new.test.doi", 17169328.664) + ) with self.assertRaises(CollectionUpdateException): self.business_logic.update_collection_version(revision.version_id, body) @@ -2256,6 +2275,312 @@ def test_dataset_published_at_with_replaced_dataset_ok(self): self.assertEqual(dataset_0.canonical_dataset.published_at, published_collection.published_at) self.assertEqual(dataset_1.canonical_dataset.published_at, published_collection.published_at) + def test_publish_new_collection_no_doi_crossref_not_checked_ok(self): + """ + When publishing a new collection without a DOI, no Crossref check should be made. + """ + + # Create an unpublished collection without a DOI. + collection = self.initialize_unpublished_collection() + + # Mock the Crossref check. + self.crossref_provider.fetch_metadata = Mock() + + # Publish the collection. + self.business_logic.publish_collection_version(collection.version_id) + + # Confirm Crossref was not called. + self.crossref_provider.fetch_metadata.assert_not_called() + + def test_publish_private_collection_crossref_checked_ok(self): + """ + When publishing a new collection with a DOI, a Crossref check should be made. + """ + + # Mock the Crossref check. + doi_curie = "test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, doi_curie, 17169328.664) + ) + + # Create a private collection with a DOI. + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + collection_version = self.initialize_unpublished_collection(links=links) + + # Publish the private collection. + self.business_logic.publish_collection_version(collection_version.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + def test_publish_private_collection_same_doi_publisher_metadata_not_updated_ok(self): + """ + When publishing a private collection - and there is no change in DOI - publisher metadata should not + be updated if the deposited date is before the created at date of the corresponding collection. + """ + + # Get the time before the collection is created. + before_created_at = datetime.utcnow().timestamp() + + # Create a private collection with a DOI. 
+ doi_curie = "test/doi" + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + collection_version = self.initialize_unpublished_collection(links=links) + + # Mock the Crossref update, and set the deposited date to be before the created at date of the collection. + expected_publisher_metadata = {"authors": ["Test Author"]} + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, doi_curie, before_created_at) + ) + + # Mock the publisher metadata update. + self.database_provider.save_collection_publisher_metadata = Mock() + + # Publish the private collection. + self.business_logic.publish_collection_version(collection_version.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + # Confirm the metadata was not updated. + self.database_provider.save_collection_publisher_metadata.assert_not_called() + + def test_publish_private_collection_same_doi_publisher_metadata_updated_ok(self): + """ + When publishing a private collection - and there is no change in DOI - publisher metadata should be + updated if the deposited date is after the created at date of the corresponding collection. + """ + + # Create a private collection. + doi_curie = "test/doi" + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + collection_version = self.initialize_unpublished_collection(links=links) + + # Get the time after the collection has been created. + after_created_at = datetime.utcnow().timestamp() + + # Mock the Crossref update, and set the deposited date to be after the created at date of the collection. + expected_publisher_metadata = {"authors": ["Test Author"]} + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, doi_curie, after_created_at) + ) + + # Mock the publisher metadata update. + self.database_provider.save_collection_publisher_metadata = Mock() + + # Publish the private collection. + self.business_logic.publish_collection_version(collection_version.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + # Confirm the publisher metadata was updated. + self.database_provider.save_collection_publisher_metadata.assert_called_once() + + def test_publish_revision_no_doi_crossref_not_checked_ok(self): + """ + Crossref should not be called to check for updates to the publisher metadata when publishing + a revision that has no DOI. + """ + + # Create a revision without a DOI. + _, revision = self.initialize_collection_with_an_unpublished_revision() + + # Mock the Crossref check. + self.crossref_provider.fetch_metadata = Mock() + + # Publish the revision. + self.business_logic.publish_collection_version(revision.version_id) + + # Confirm Crossref was not called. + self.crossref_provider.fetch_metadata.assert_not_called() + + def test_publish_revision_crossref_checked_ok(self): + """ + Crossref should be called to check for updates to the publisher metadata when publishing + a revision that has a DOI. + """ + + # Mock the Crossref check. + doi_curie = "test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, doi_curie, 17169328.664) + ) + + # Create a revision with a DOI. + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + _, revision = self.initialize_collection_with_an_unpublished_revision(links=links) + + # Reset the Crossref check so we can check it on publish. 
+ self.crossref_provider.fetch_metadata.reset_mock() + + # Publish the revision. + self.business_logic.publish_collection_version(revision.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + def test_publish_revision_changed_doi_collection_updated_ok(self): + """ + When publishing a revision - and there is a change in DOI - collection and artifacts should be updated. + """ + + # Mock the Crossref check. + doi_curie = "test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, doi_curie, 17169328.664) + ) + + # Create a revision with a DOI. + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + _, revision = self.initialize_collection_with_an_unpublished_revision(links=links) + + # Mock a DOI change from Crossref. + new_doi_curie = "new/test/doi" + self.crossref_provider.fetch_metadata.reset_mock() + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, new_doi_curie, 17169328.664) + ) + + # Mock the call to update the collection and artifacts. + self.business_logic.update_collection_version = Mock() + + # Mock the publisher metadata update. + self.database_provider.save_collection_publisher_metadata = Mock() + + # Publish the revision. + with self.assertRaises(CollectionPublishException): + self.business_logic.publish_collection_version(revision.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + # Confirm collection (and artifacts) update was called. + self.business_logic.update_collection_version.assert_called_once() + + # Confirm the metadata was not updated. + self.database_provider.save_collection_publisher_metadata.assert_not_called() + + def test_publish_revision_same_doi_publisher_metadata_not_updated_ok(self): + """ + When publishing a revision - and there is no change in DOI - publisher metadata should not be + updated if the deposited date is before the revised at or originally published at date of the + corresponding collection. + """ + + # Get the time before the collection is published. + before_published_at = datetime.utcnow().timestamp() + + # Create a revision with a DOI. + doi_curie = "test/doi" + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + _, revision = self.initialize_collection_with_an_unpublished_revision(links=links) + + # Mock the Crossref update, and set the deposited date to be before the published date of the collection. + expected_publisher_metadata = {"authors": ["Test Author"]} + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, doi_curie, before_published_at) + ) + + # Mock the publisher metadata update. + self.database_provider.save_collection_publisher_metadata = Mock() + + # Publish the revision. + self.business_logic.publish_collection_version(revision.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + # Confirm the metadata was not updated. + self.database_provider.save_collection_publisher_metadata.assert_not_called() + + def test_publish_revision_same_doi_publisher_metadata_updated_ok(self): + """ + When publishing a revision - and there is no change in DOI - publisher metadata should be updated + if the deposited date is after the revised at or published at date of the corresponding collection. + """ + + # Create a published collection. 
+ doi_curie = "test/doi" + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + collection = self.initialize_published_collection(links=links) + + # Get the time after the collection has been published. + after_published_at = datetime.utcnow().timestamp() + + # Create a revision. + revision = self.business_logic.create_collection_version(collection.collection_id) + + # Mock the Crossref update, and set the deposited date to be after the published date of the collection. + expected_publisher_metadata = {"authors": ["Test Author"]} + self.crossref_provider.fetch_metadata = Mock( + return_value=(expected_publisher_metadata, doi_curie, after_published_at) + ) + + # Mock the publisher metadata update. + self.database_provider.save_collection_publisher_metadata = Mock() + + # Publish the revision. + self.business_logic.publish_collection_version(revision.version_id) + + # Confirm Crossref was called. + self.crossref_provider.fetch_metadata.assert_called_once() + + # Confirm the publisher metadata was updated. + self.database_provider.save_collection_publisher_metadata.assert_called_once() + + def test_update_crossref_metadata_same_doi_returns_none_ok(self): + """ + The check for changes in Crossref returns None if there is no change between the existing DOI and the + DOI returned from Crossref. + """ + + # Mock the Crossref check. + doi_curie = "test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, doi_curie, 17169328.664) + ) + + # Create a revision with a DOI. + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + _, revision = self.initialize_collection_with_an_unpublished_revision(links=links) + + # Check Crossref for updates. + doi_update = self.business_logic._update_crossref_metadata(revision, datetime.utcnow()) + + # Confirm the returned DOI update is None. + self.assertIsNone(doi_update) + + def test_update_crossref_metadata_changed_doi_returns_doi_update_ok(self): + """ + The check for changes in Crossref returns the existing DOI and the returned Crossref DOI if there is a + change between the two. + """ + + # Mock the Crossref check. + doi_curie = "test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, doi_curie, 17169328.664) + ) + + # Create a revision with a DOI. + links = [Link("test doi", "DOI", f"https://doi.org/{doi_curie}")] + _, revision = self.initialize_collection_with_an_unpublished_revision(links=links) + + # Mock a DOI change from Crossref. + new_doi_curie = "new/test/doi" + self.crossref_provider.fetch_metadata = Mock( + return_value=({"authors": ["Test Author"]}, new_doi_curie, 17169328.664) + ) + + # Check Crossref for updates. + doi_update = self.business_logic._update_crossref_metadata(revision, datetime.utcnow()) + + # Confirm the DOI update is not None, and the existing and new DOI are returned. + self.assertIsNotNone(doi_update) + self.assertEqual(doi_update[0], f"https://doi.org/{doi_curie}") + self.assertEqual(doi_update[1], f"https://doi.org/{new_doi_curie}") + class TestCollectionUtilities(BaseBusinessLogicTestCase): def test__delete_datasets_from_public_access_bucket(self): diff --git a/tests/unit/processing/test_process_validate.py b/tests/unit/processing/test_process_validate.py index f8e577c2bf0dd..79bc6fdcaafe6 100644 --- a/tests/unit/processing/test_process_validate.py +++ b/tests/unit/processing/test_process_validate.py @@ -27,7 +27,7 @@ def test_process_download_validate_success(self, mock_read_h5ad): 6. 
upload the labeled file to S3 """ dropbox_uri = "https://www.dropbox.com/s/fake_location/test.h5ad?dl=0" - self.crossref_provider.fetch_metadata = Mock(return_value=({}, "12.2345")) + self.crossref_provider.fetch_metadata = Mock(return_value=({}, "12.2345", 17169328.664)) collection = self.generate_unpublished_collection( links=[Link(name=None, type="DOI", uri="http://doi.org/12.2345")] From f7a6f57f37543d8dae182ea2c1ab72ead72d84cb Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:13:11 -0400 Subject: [PATCH 8/8] feat: support additional "migration revision" if collection already has an open revision during a migration (#6943) --- ...29561f38_add_is_migration_revision_flag.py | 28 ++++ backend/layers/business/business.py | 31 ++-- backend/layers/common/entities.py | 1 + backend/layers/persistence/orm.py | 1 + backend/layers/persistence/persistence.py | 27 +++- .../persistence/persistence_interface.py | 9 +- .../layers/persistence/persistence_mock.py | 18 ++- .../layers/processing/publish_revisions.py | 2 +- backend/layers/processing/schema_migration.py | 11 +- .../backend/layers/business/test_business.py | 133 +++++++++++++++++- .../processing/schema_migration/conftest.py | 41 +++++- .../test_collection_migrate.py | 41 ++++++ .../test_gather_collections.py | 44 +++++- .../test_publish_revisions.py | 27 ++-- tests/unit/processing/test_handle_error.py | 1 + 15 files changed, 378 insertions(+), 37 deletions(-) create mode 100644 backend/database/versions/07_e31a29561f38_add_is_migration_revision_flag.py diff --git a/backend/database/versions/07_e31a29561f38_add_is_migration_revision_flag.py b/backend/database/versions/07_e31a29561f38_add_is_migration_revision_flag.py new file mode 100644 index 0000000000000..842e2d9b27c4c --- /dev/null +++ b/backend/database/versions/07_e31a29561f38_add_is_migration_revision_flag.py @@ -0,0 +1,28 @@ +"""add is_auto_version flag + +Revision ID: 07_e31a29561f38 +Revises: 06_8bced1b1470b +Create Date: 2024-04-23 10:32:09.778855 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "07_e31a29561f38" +down_revision = "06_8bced1b1470b" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "CollectionVersion", + sa.Column("is_auto_version", sa.BOOLEAN(), nullable=True), + schema="persistence_schema", + ) + + +def downgrade(): + op.drop_column("CollectionVersion", "is_auto_version", schema="persistence_schema") diff --git a/backend/layers/business/business.py b/backend/layers/business/business.py index 32784b481cb35..eb61f5aaddb05 100644 --- a/backend/layers/business/business.py +++ b/backend/layers/business/business.py @@ -799,20 +799,30 @@ def update_dataset_artifact(self, artifact_id: DatasetArtifactId, artifact_uri: """ self.database_provider.update_dataset_artifact(artifact_id, artifact_uri) - def create_collection_version(self, collection_id: CollectionId) -> CollectionVersionWithDatasets: + def create_collection_version( + self, collection_id: CollectionId, is_auto_version: bool = False + ) -> CollectionVersionWithDatasets: """ Creates a collection version for an existing canonical collection. - Also ensures that the collection does not have any active, unpublished version + If is_auto_version is False, ensures that the collection does not have any active, unpublished version. + If is_auto_version is True, ensures that the collection does not have an active, unpublished migration + revision. 
""" all_versions = self.database_provider.get_all_versions_for_collection(collection_id) if not all_versions: raise CollectionVersionException(f"Collection {collection_id} cannot be found") - if any(v for v in all_versions if v.published_at is None): - raise CollectionVersionException(f"Collection {collection_id} already has an unpublished version") + unpublished_versions = [v for v in all_versions if v.published_at is None] + if unpublished_versions: + if is_auto_version and any(v.is_auto_version for v in unpublished_versions): + raise CollectionVersionException( + f"Collection {collection_id} already has an unpublished migration revision." + ) + elif not is_auto_version: + raise CollectionVersionException(f"Collection {collection_id} already has an unpublished version") - added_version_id = self.database_provider.add_collection_version(collection_id) + added_version_id = self.database_provider.add_collection_version(collection_id, is_auto_version) return self.get_collection_version(added_version_id) def delete_collection_version(self, collection_version: CollectionVersionWithDatasets) -> None: @@ -987,6 +997,7 @@ def publish_collection_version( ) # Finalize Collection publication and delete any tombstoned assets + is_auto_version = version.is_auto_version dataset_version_ids_to_delete_from_s3 = self.database_provider.finalize_collection_version( version.collection_id, version_id, @@ -997,12 +1008,16 @@ def publish_collection_version( self.delete_dataset_versions_from_public_bucket(dataset_version_ids_to_delete_from_s3) # Handle cleanup of unpublished versions + versions_to_keep = {dv.version_id.id for dv in version.datasets} + # If version published was an auto_version, there may be an open revision with dataset versions to keep + if is_auto_version: + open_revision = self.database_provider.get_unpublished_versions_for_collection(version.collection_id) + if open_revision: + versions_to_keep.update({dv.version_id.id for dv in open_revision[0].datasets}) dataset_versions = self.database_provider.get_all_dataset_versions_for_collection( version.collection_id, from_date=date_of_last_publish ) - versions_to_delete = list( - filter(lambda dv: dv.version_id.id not in {dv.version_id.id for dv in version.datasets}, dataset_versions) - ) + versions_to_delete = list(filter(lambda dv: dv.version_id.id not in versions_to_keep, dataset_versions)) self.delete_dataset_version_assets(versions_to_delete) self.database_provider.delete_dataset_versions(versions_to_delete) diff --git a/backend/layers/common/entities.py b/backend/layers/common/entities.py index c60d6cd7646dc..61227dae938b0 100644 --- a/backend/layers/common/entities.py +++ b/backend/layers/common/entities.py @@ -297,6 +297,7 @@ class CollectionVersionBase: schema_version: str canonical_collection: CanonicalCollection has_custom_dataset_order: bool + is_auto_version: Optional[bool] data_submission_policy_version: str def is_published(self) -> bool: diff --git a/backend/layers/persistence/orm.py b/backend/layers/persistence/orm.py index fa44c4a56982c..9fd7b7390e00f 100644 --- a/backend/layers/persistence/orm.py +++ b/backend/layers/persistence/orm.py @@ -36,6 +36,7 @@ class CollectionVersionTable: schema_version = Column(String) datasets = Column(ARRAY(UUID(as_uuid=True))) has_custom_dataset_order = Column(BOOLEAN) + is_auto_version = Column(BOOLEAN) data_submission_policy_version = Column(String) diff --git a/backend/layers/persistence/persistence.py b/backend/layers/persistence/persistence.py index 97a0af873c3f0..90c5e5e5d5ced 100644 --- 
a/backend/layers/persistence/persistence.py
+++ b/backend/layers/persistence/persistence.py
@@ -116,6 +116,7 @@ def _row_to_collection_version(self, row: Any, canonical_collection: CanonicalCo
             schema_version=row.schema_version,
             canonical_collection=canonical_collection,
             has_custom_dataset_order=row.has_custom_dataset_order,
+            is_auto_version=row.is_auto_version,
             data_submission_policy_version=row.data_submission_policy_version,
         )
 
@@ -136,6 +137,7 @@ def _row_to_collection_version_with_datasets(
             schema_version=row.schema_version,
             canonical_collection=canonical_collection,
             has_custom_dataset_order=row.has_custom_dataset_order,
+            is_auto_version=row.is_auto_version,
             data_submission_policy_version=row.data_submission_policy_version,
         )
 
@@ -243,6 +245,7 @@ def create_canonical_collection(
             schema_version=None,
             datasets=list(),
             has_custom_dataset_order=False,
+            is_auto_version=False,
             data_submission_policy_version=None,
         )
 
@@ -383,6 +386,26 @@ def get_all_versions_for_collection(
                 versions.append(version)
             return versions
 
+    def get_unpublished_versions_for_collection(
+        self, collection_id: CollectionId, get_tombstoned: bool = False
+    ) -> List[CollectionVersionWithDatasets]:
+        """
+        Retrieves all versions of a specific collection that have published_at set to None
+        """
+        with self._manage_session() as session:
+            version_rows = (
+                session.query(CollectionVersionTable).filter_by(collection_id=collection_id.id, published_at=None).all()
+            )
+            canonical_collection = self.get_canonical_collection(collection_id)
+            versions = list()
+            for i in range(len(version_rows)):
+                datasets = self.get_dataset_versions_by_id(
+                    [DatasetVersionId(str(id)) for id in version_rows[i].datasets], get_tombstoned=get_tombstoned
+                )
+                version = self._row_to_collection_version_with_datasets(version_rows[i], canonical_collection, datasets)
+                versions.append(version)
+            return versions
+
     def get_all_collections_versions(self, get_tombstoned: bool = False) -> Iterable[CollectionVersion]:
         """
         Retrieves all versions of all collections.
@@ -515,7 +538,7 @@ def save_collection_publisher_metadata(
             version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one()
             version.publisher_metadata = json.dumps(publisher_metadata)
 
-    def add_collection_version(self, collection_id: CollectionId) -> CollectionVersionId:
+    def add_collection_version(self, collection_id: CollectionId, is_auto_version: bool) -> CollectionVersionId:
         """
         Adds a collection version to an existing canonical collection.
The new version copies all data from the previous version except version_id, schema_version, data_submission_policy_version and datetime-based
@@ -538,6 +561,7 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi
             schema_version=None,
             datasets=current_version.datasets,
             has_custom_dataset_order=current_version.has_custom_dataset_order,
+            is_auto_version=is_auto_version,
             data_submission_policy_version=None,
         )
         session.add(new_version)
@@ -634,6 +658,7 @@ def finalize_collection_version(
             collection_version.published_at = published_at
             collection_version.schema_version = schema_version
             collection_version.data_submission_policy_version = data_submission_policy_version
+            collection_version.is_auto_version = False
 
             dataset_ids_for_new_collection_version = [
                 d.dataset_id.id for d in self.get_collection_version_with_datasets(version_id).datasets
diff --git a/backend/layers/persistence/persistence_interface.py b/backend/layers/persistence/persistence_interface.py
index 7d111a14f38a3..84743f086c941 100644
--- a/backend/layers/persistence/persistence_interface.py
+++ b/backend/layers/persistence/persistence_interface.py
@@ -81,6 +81,13 @@ def get_all_mapped_collection_versions(self) -> Iterable[CollectionVersion]:
         Retrieves all the collection versions that are mapped to a canonical collection.
         """
 
+    def get_unpublished_versions_for_collection(
+        self, collection_id: CollectionId, get_tombstoned: bool = False
+    ) -> List[CollectionVersionWithDatasets]:
+        """
+        Retrieves all versions of a specific collection that have published_at set to None
+        """
+
     def tombstone_collection(self, collection_id: CollectionId) -> None:
         """
         Deletes (tombstones) a canonical collection.
@@ -105,7 +112,7 @@ def save_collection_publisher_metadata(
         Saves publisher metadata for a collection version. Specify None to remove it
         """
 
-    def add_collection_version(self, collection_id: CollectionId) -> CollectionVersionId:
+    def add_collection_version(self, collection_id: CollectionId, is_auto_version: bool) -> CollectionVersionId:
         """
         Adds a collection version to an existing canonical collection.
         The new version copies the following data from the previous version: owner, metadata, publisher_metadata, datasets (IDs).
diff --git a/backend/layers/persistence/persistence_mock.py b/backend/layers/persistence/persistence_mock.py index d952d21e54483..61015e939141b 100644 --- a/backend/layers/persistence/persistence_mock.py +++ b/backend/layers/persistence/persistence_mock.py @@ -79,6 +79,7 @@ def create_canonical_collection( canonical_collection=canonical, datasets=[], has_custom_dataset_order=False, + is_auto_version=False, data_submission_policy_version=None, ) self.collections_versions[version_id.id] = version @@ -187,7 +188,7 @@ def save_collection_publisher_metadata( ) -> None: self.collections_versions[version_id.id].publisher_metadata = copy.deepcopy(publisher_metadata) - def add_collection_version(self, collection_id: CollectionId) -> CollectionVersionId: + def add_collection_version(self, collection_id: CollectionId, is_auto_version: bool) -> CollectionVersionId: cc = self.collections[collection_id.id] current_version_id = cc.version_id current_version = self.collections_versions[current_version_id.id] @@ -209,6 +210,7 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi schema_version=None, canonical_collection=cc, has_custom_dataset_order=current_version.has_custom_dataset_order, + is_auto_version=is_auto_version, data_submission_policy_version=None, ) self.collections_versions[new_version_id.id] = collection_version @@ -269,6 +271,19 @@ def get_all_versions_for_collection( ) return versions + def get_unpublished_versions_for_collection( + self, collection_id: CollectionId, get_tombstoned: bool = False + ) -> Iterable[CollectionVersionWithDatasets]: + versions = [] + for collection_version in self.collections_versions.values(): + if collection_version.collection_id == collection_id and collection_version.published_at is None: + versions.append( + self._update_version_with_canonical( + collection_version, update_datasets=True, get_tombstoned=get_tombstoned + ) + ) + return versions + def get_collection_version_with_datasets( self, version_id: CollectionVersionId, get_tombstoned: bool = False ) -> CollectionVersionWithDatasets: @@ -336,6 +351,7 @@ def finalize_collection_version( self.collections_versions[version_id.id].published_at = published_at self.collections_versions[version_id.id].schema_version = schema_version self.collections_versions[version_id.id].data_submission_policy_version = data_submission_policy_version + self.collections_versions[version_id.id].is_auto_version = False return dataset_version_ids_to_delete_from_s3 diff --git a/backend/layers/processing/publish_revisions.py b/backend/layers/processing/publish_revisions.py index 0e49ab1fb58b0..732923566a638 100644 --- a/backend/layers/processing/publish_revisions.py +++ b/backend/layers/processing/publish_revisions.py @@ -67,7 +67,7 @@ def check_datasets(self, collection_version: CollectionVersionWithDatasets) -> L def run(self): for collection_version in self.business_logic.get_collections(CollectionQueryFilter(is_published=False)): - if collection_version.is_unpublished_version(): + if collection_version.is_unpublished_version() and collection_version.is_auto_version: if collection_version.version_id.id in self.do_not_publish_list: self.logger.info( "Skipping collection version, it is in the do not publish list", diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index fe0f8bda71d47..5ac14a516580c 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -62,11 +62,11 @@ def 
gather_collections(self, auto_publish: bool) -> Tuple[Dict[str, str], Dict[s
         """
         response_for_span_collections = []
-        has_revision = set()
+        has_migration_revision = set()
         # iterates over unpublished collections first, so published versions are skipped if there is an active revision
         for collection in self.fetch_collections():
             _resp = {}
-            if collection.is_published() and collection.collection_id.id in has_revision:
+            if collection.is_published() and collection.collection_id.id in has_migration_revision:
                 continue
 
             if collection.is_published():
@@ -74,8 +74,10 @@ def gather_collections(self, auto_publish: bool) -> Tuple[Dict[str, str], Dict[s
                 _resp["can_publish"] = str(True)
             elif collection.is_unpublished_version():
                 # active revision of a published collection.
-                has_revision.add(collection.collection_id.id)  # revision found, skip published version
                 _resp["can_publish"] = str(False)
+                if collection.is_auto_version:
+                    has_migration_revision.add(collection.collection_id.id)  # migration revision found, skip published
+                    _resp["can_publish"] = str(True)
             elif collection.is_initial_unpublished_version():
                 # unpublished collection
                 _resp["can_publish"] = str(False)
@@ -175,7 +177,8 @@ def collection_migrate(
             if version.is_published():
                 # Create a new collection version (revision) if the collection is already published
                 private_collection_version_id = self.business_logic.create_collection_version(
-                    CollectionId(collection_id)
+                    CollectionId(collection_id),
+                    is_auto_version=True,
                 ).version_id.id
             else:
                 private_collection_version_id = collection_version_id
diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py
index 6dcc250b77ebd..df9ed55291d63 100644
--- a/tests/unit/backend/layers/business/test_business.py
+++ b/tests/unit/backend/layers/business/test_business.py
@@ -1756,16 +1756,101 @@ def test_create_collection_version_ok(self):
         self.assertEqual(version.version_id, published_collection.version_id)
         self.assertNotEqual(version.version_id, new_version.version_id)
 
-    def test_create_collection_version_fails_if_other_versions(self):
+    def test_create_collection_version_multiple_auto_versions_false_fails(self):
         """
-        A collection version can only be created if there are no other unpublished versions
-        for that specific canonical collection
+        Given a canonical collection with a published version and an unpublished version with is_auto_version == False,
+        creating an unpublished collection version where is_auto_version is False will fail.
         """
         published_collection = self.initialize_published_collection()
-        self.business_logic.create_collection_version(published_collection.collection_id)
+        self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=False)
 
         with self.assertRaises(CollectionVersionException):
-            self.business_logic.create_collection_version(published_collection.collection_id)
+            self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=False)
+
+    def test_create_collection_version_multiple_auto_versions_fails(self):
+        """
+        Given a canonical collection with a published version and an unpublished version with is_auto_version == True,
+        creating another unpublished collection version where is_auto_version is False will fail.
+        """
+        published_collection = self.initialize_published_collection()
+        self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=True)
+
+        with self.assertRaises(CollectionVersionException):
+            self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=False)
+
+    def test_create_collection_version_multiple_auto_versions_true_fails(self):
+        """
+        Given a canonical collection with a published version and an unpublished version where is_auto_version is True,
+        creating another unpublished collection version where is_auto_version is True will fail.
+        """
+        published_collection = self.initialize_published_collection()
+        self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=True)
+
+        with self.assertRaises(CollectionVersionException):
+            self.business_logic.create_collection_version(published_collection.collection_id, is_auto_version=True)
+
+    def test_create_collection_version_is_auto_version_succeeds(self):
+        """
+        Given a canonical collection with a published version and no unpublished versions,
+        creating an unpublished collection version where is_auto_version is True will succeed.
+        """
+        published_collection = self.initialize_published_collection()
+        new_version = self.business_logic.create_collection_version(
+            published_collection.collection_id, is_auto_version=True
+        )
+
+        # The new version has a different version_id
+        self.assertNotEqual(published_collection.version_id, new_version.version_id)
+
+        # The new version has the same collection_id, collection metadata, and datasets
+        self.assertEqual(published_collection.collection_id, new_version.collection_id)
+        self.assertEqual(published_collection.metadata, new_version.metadata)
+        self.assertEqual(published_collection.datasets, new_version.datasets)
+
+        # The new version should not be published (aka Private)
+        self.assertIsNone(new_version.published_at)
+
+        # The canonical collection should be published
+        self.assertIsNotNone(new_version.canonical_collection.originally_published_at)
+
+        # get_collection still retrieves the original version
+        version = self.business_logic.get_published_collection_version(published_collection.collection_id)
+        self.assertEqual(version.version_id, published_collection.version_id)
+        self.assertNotEqual(version.version_id, new_version.version_id)
+
+        self.assertTrue(new_version.is_auto_version)
+
+    def test_create_collection_version_is_auto_version_succeeds__with_revision(self):
+        """
+        Given a canonical collection with a published version and an unpublished version where is_auto_version is False,
+        creating another unpublished collection version where is_auto_version is True will succeed.
+        """
+        published_collection = self.initialize_published_collection()
+        existing_revision = self.business_logic.create_collection_version(
+            published_collection.collection_id, is_auto_version=False
+        )
+        auto_revision = self.business_logic.create_collection_version(
+            published_collection.collection_id, is_auto_version=True
+        )
+
+        # The new version has a different version_id
+        self.assertNotEqual(published_collection.version_id, auto_revision.version_id)
+        self.assertNotEqual(existing_revision.version_id, auto_revision.version_id)
+
+        # Both revisions are linked to the same canonical collection
+        self.assertEqual(published_collection.collection_id, existing_revision.collection_id)
+        self.assertEqual(published_collection.collection_id, auto_revision.collection_id)
+
+        # The new version has the same collection metadata and datasets
+        self.assertEqual(published_collection.metadata, auto_revision.metadata)
+        self.assertEqual(published_collection.datasets, auto_revision.datasets)
+
+        # The new version should not be published (aka Private)
+        self.assertIsNone(auto_revision.published_at)
+
+        # The new version has is_auto_version == True, while existing is False
+        self.assertFalse(existing_revision.is_auto_version)
+        self.assertTrue(auto_revision.is_auto_version)
 
     def test_create_collection_version_fails_if_collection_not_exists(self):
         """
@@ -1917,6 +2002,7 @@ def test_publish_version_ok(self):
         self.assertIsNotNone(published_version.canonical_collection.originally_published_at)
         self.assertEqual(published_version.published_at, published_version.canonical_collection.originally_published_at)
         self.assertEqual(published_version.data_submission_policy_version, DATA_SUBMISSION_POLICY_VERSION)
+        self.assertEqual(published_version.is_auto_version, False)
 
         # The published and unpublished collection have the same collection_id and version_id
         self.assertEqual(published_version.collection_id, unpublished_collection.collection_id)
@@ -2139,6 +2225,43 @@ def test_publish_version_does_not_change_original_published_at_ok(self):
         canonical = self.business_logic.get_collection_version(second_version.version_id).canonical_collection
         self.assertEqual(canonical.originally_published_at, first_version.published_at)
 
+    def test_publish_version_sets_is_auto_version_False(self):
+        """
+        When publishing a collection version, is_auto_version should be set to False
+        """
+        published_collection = self.initialize_published_collection()
+        new_version = self.business_logic.create_collection_version(
+            published_collection.collection_id, is_auto_version=True
+        )
+        self.assertTrue(new_version.is_auto_version)
+        self.business_logic.publish_collection_version(new_version.version_id)
+
+        published_new_version = self.business_logic.get_collection_version(new_version.version_id)
+        self.assertFalse(published_new_version.is_auto_version)
+
+    def test_publish_version_cleanup_with_auto_version(self):
+        """
+        When publishing an auto_version, do NOT clean up dataset versions from an open revision
+        """
+        published_collection = self.initialize_published_collection()
+        collection_open_revision = self.business_logic.create_collection_version(
+            published_collection.collection_id, is_auto_version=False
+        )
+        dataset_to_replace = collection_open_revision.datasets[0]
+        new_dataset_version_in_open_revision = self.database_provider.replace_dataset_in_collection_version(
+            collection_open_revision.version_id, dataset_to_replace.version_id
+        )
+        auto_version = self.business_logic.create_collection_version(
+            published_collection.collection_id,
is_auto_version=True + ) + self.business_logic.publish_collection_version(auto_version.version_id) + + # open revision's new dataset and its artifacts should still be persisted + self.assertIsNotNone( + self.database_provider.get_dataset_version(new_dataset_version_in_open_revision.version_id) + ) + [self.assertTrue(self.s3_provider.uri_exists(a.uri)) for a in dataset_to_replace.artifacts] + def test_get_all_collections_published_does_not_retrieve_old_versions(self): """ `get_collections` with is_published=True should not return versions that were previously diff --git a/tests/unit/processing/schema_migration/conftest.py b/tests/unit/processing/schema_migration/conftest.py index 6a8514242837e..52e93970b4d58 100644 --- a/tests/unit/processing/schema_migration/conftest.py +++ b/tests/unit/processing/schema_migration/conftest.py @@ -52,6 +52,7 @@ def published_collection(): collection.is_initial_unpublished_version.return_value = False collection.collection_id = CollectionId() collection.version_id = CollectionVersionId() + collection.is_auto_version = False collection.datasets = [] for _i in range(2): collection.datasets.append(make_mock_dataset_version()) @@ -66,6 +67,7 @@ def revision(): published_collection_with_revision.is_initial_unpublished_version.return_value = False published_collection_with_revision.collection_id = CollectionId() published_collection_with_revision.version_id = CollectionVersionId() + published_collection_with_revision.is_auto_version = False published_collection_with_revision.datasets = [] for _i in range(2): published_collection_with_revision.datasets.append(make_mock_dataset_version()) @@ -76,6 +78,33 @@ def revision(): collection.is_initial_unpublished_version.return_value = False collection.collection_id = published_collection_with_revision.collection_id collection.version_id = CollectionVersionId() + collection.is_auto_version = False + collection.datasets = [] + for _dataset in published_collection_with_revision.datasets: + collection.datasets.append(make_mock_dataset_version(dataset_id=_dataset.dataset_id.id)) + return [published_collection_with_revision, collection] + + +@pytest.fixture +def migration_revision(): + published_collection_with_revision = mock.Mock(spec=CollectionVersionWithDatasets, name="published_with_revision") + published_collection_with_revision.is_published.return_value = True + published_collection_with_revision.is_unpublished_version.return_value = False + published_collection_with_revision.is_initial_unpublished_version.return_value = False + published_collection_with_revision.collection_id = CollectionId() + published_collection_with_revision.version_id = CollectionVersionId() + published_collection_with_revision.is_auto_version = False + published_collection_with_revision.datasets = [] + for _i in range(2): + published_collection_with_revision.datasets.append(make_mock_dataset_version()) + + collection = mock.Mock(spec=CollectionVersionWithDatasets, name="revision") + collection.is_published.return_value = False + collection.is_unpublished_version.return_value = True + collection.is_initial_unpublished_version.return_value = False + collection.collection_id = published_collection_with_revision.collection_id + collection.version_id = CollectionVersionId() + collection.is_auto_version = True collection.datasets = [] for _dataset in published_collection_with_revision.datasets: collection.datasets.append(make_mock_dataset_version(dataset_id=_dataset.dataset_id.id)) @@ -91,6 +120,7 @@ def private(): collection.collection_id = 
CollectionId() collection.version_id = CollectionVersionId() collection.datasets = [] + collection.is_auto_version = False for _i in range(2): collection.datasets.append(make_mock_dataset_version()) return collection @@ -109,12 +139,14 @@ def schema_migrate(tmpdir): @pytest.fixture def schema_migrate_and_collections( - tmpdir, schema_migrate, published_collection, revision, private + tmpdir, schema_migrate, published_collection, revision, migration_revision, private ) -> Tuple[SchemaMigrate, Dict[str, List]]: db = { published_collection.version_id.id: published_collection, revision[0].version_id.id: revision[0], revision[1].version_id.id: revision[1], + migration_revision[0].version_id.id: migration_revision[0], + migration_revision[1].version_id.id: migration_revision[1], private.version_id.id: private, } @@ -126,4 +158,9 @@ def _get_collection_url(collection_id): schema_migrate.business_logic.get_collection_version = _get_collection_version schema_migrate.business_logic.get_collection_url = _get_collection_url - return schema_migrate, {"published": [published_collection], "revision": revision, "private": [private]} + return schema_migrate, { + "published": [published_collection], + "revision": revision, + "migration_revision": migration_revision, + "private": [private], + } diff --git a/tests/unit/processing/schema_migration/test_collection_migrate.py b/tests/unit/processing/schema_migration/test_collection_migrate.py index cec2d8d5d823e..ac6d7d3be21d7 100644 --- a/tests/unit/processing/schema_migration/test_collection_migrate.py +++ b/tests/unit/processing/schema_migration/test_collection_migrate.py @@ -164,3 +164,44 @@ def test_no_datasets(self, schema_migrate_and_collections): assert "key_name" not in response assert response["collection_version_id"] == published.version_id.id assert response["execution_id"] == "test-execution-arn" + + def test_create_migration_revision__private(self, schema_migrate_and_collections): + schema_migrate, collections = schema_migrate_and_collections + schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) + schema_migrate.schema_version = "0.0.0" + private = collections["private"][0] + schema_migrate.business_logic.create_collection_version = Mock( + return_value=Mock(version_id=CollectionVersionId()) + ) + + # only call create_collection_version if the collection is published + schema_migrate.collection_migrate(private.collection_id.id, private.version_id.id, False) + schema_migrate.business_logic.create_collection_version.assert_not_called() + + def test_create_migration_revision__published_with_revision(self, schema_migrate_and_collections): + schema_migrate, collections = schema_migrate_and_collections + schema_migrate._store_sfn_response = Mock(wraps=schema_migrate._store_sfn_response) + schema_migrate.schema_version = "0.0.0" + published, revision = collections["revision"] + schema_migrate.business_logic.create_collection_version = Mock( + return_value=Mock(version_id=CollectionVersionId()) + ) + + schema_migrate.collection_migrate(revision.collection_id.id, revision.version_id.id, False) + schema_migrate.business_logic.create_collection_version.assert_not_called() + + schema_migrate.collection_migrate(published.collection_id.id, published.version_id.id, False) + schema_migrate.business_logic.create_collection_version.assert_called_once() + + def test_create_migration_revision__published_no_revision(self, schema_migrate_and_collections): + schema_migrate, collections = schema_migrate_and_collections + 
diff --git a/tests/unit/processing/schema_migration/test_gather_collections.py b/tests/unit/processing/schema_migration/test_gather_collections.py
index 3003d999b6eb8..c270b8f62a2f0 100644
--- a/tests/unit/processing/schema_migration/test_gather_collections.py
+++ b/tests/unit/processing/schema_migration/test_gather_collections.py
@@ -6,19 +6,39 @@ def test_with_revision(self, schema_migrate_and_collections):
         # get_collections is called twice
         schema_migrate.business_logic.get_collections.side_effect = [[published], [revision]]
         _, response = schema_migrate.gather_collections(auto_publish=True)
+        assert len(response) == 2
         assert {
             "can_publish": "True",
             "collection_id": published.collection_id.id,
-            "collection_version_id": published.collection_id.id,
+            "collection_version_id": published.version_id.id,
             "execution_id": "test-execution-arn",
-        } not in response
+        } in response
         assert {
             "can_publish": "False",
             "collection_id": revision.collection_id.id,
             "collection_version_id": revision.version_id.id,
             "execution_id": "test-execution-arn",
         } in response
+
+    def test_with_migration_revision(self, schema_migrate_and_collections):
+        schema_migrate, collections = schema_migrate_and_collections
+        published, migration_revision = collections["migration_revision"]
+        # get_collections is called twice
+        schema_migrate.business_logic.get_collections.side_effect = [[published], [migration_revision]]
+        _, response = schema_migrate.gather_collections(auto_publish=True)
         assert len(response) == 1
+        assert {
+            "can_publish": "True",
+            "collection_id": published.collection_id.id,
+            "collection_version_id": published.version_id.id,
+            "execution_id": "test-execution-arn",
+        } not in response
+        assert {
+            "can_publish": "True",
+            "collection_id": migration_revision.collection_id.id,
+            "collection_version_id": migration_revision.version_id.id,
+            "execution_id": "test-execution-arn",
+        } in response
 
     def test_with_published(self, schema_migrate_and_collections):
         schema_migrate, collections = schema_migrate_and_collections
@@ -52,13 +72,15 @@ def test_with_auto_publish_false(self, schema_migrate_and_collections):
         schema_migrate, collections = schema_migrate_and_collections
         published_no_revision = collections["published"][0]
         private = collections["private"][0]
-        published_with_revision, revision = collections["revision"]
+        published_with_non_migration_revision, revision = collections["revision"]
+        published_with_migration_revision, migration_revision = collections["migration_revision"]
         # get_collections is called twice
         schema_migrate.business_logic.get_collections.side_effect = [
-            [published_no_revision, published_with_revision],
-            [private, revision],
+            [published_no_revision, published_with_non_migration_revision, published_with_migration_revision],
+            [private, revision, migration_revision],
         ]
         _, response = schema_migrate.gather_collections(auto_publish=False)
+        assert len(response) == 5
         for obj in [
             {
                 "can_publish": "False",
@@ -78,5 +100,17 @@
                 "collection_version_id": revision.version_id.id,
                 "execution_id": "test-execution-arn",
             },
+            {
+                "can_publish": "False",
+                "collection_id": published_with_non_migration_revision.collection_id.id,
+                "collection_version_id": published_with_non_migration_revision.version_id.id,
+                "execution_id": "test-execution-arn",
+            },
+            {
+                "can_publish": "False",
+                "collection_id": migration_revision.collection_id.id,
+                "collection_version_id": migration_revision.version_id.id,
+                "execution_id": "test-execution-arn",
+            },
         ]:
             assert obj in response
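The stringly-typed `can_publish` values these tests assert on amount to a per-collection classification. A condensed sketch of the rule they encode (a hypothetical helper, not the real `gather_collections`, which also attaches collection ids and the execution ARN to each row):

    def rows_for_collection(published_version, open_revision, auto_publish):
        # an auto-migration revision exists: migrate only it, and let it
        # auto-publish; the published version is skipped entirely
        if open_revision is not None and open_revision.is_auto_version:
            return [{"version": open_revision, "can_publish": str(auto_publish)}]
        rows = []
        if published_version is not None:
            rows.append({"version": published_version, "can_publish": str(auto_publish)})
        if open_revision is not None:
            # manually opened revisions are migrated but never auto-published
            rows.append({"version": open_revision, "can_publish": "False"})
        return rows

Modeling the private collection as its own open revision, this reproduces the counts asserted above: two rows for a manual revision under auto_publish=True, one row for a migration revision, and five rows in the auto_publish=False case.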
diff --git a/tests/unit/processing/schema_migration/test_publish_revisions.py b/tests/unit/processing/schema_migration/test_publish_revisions.py
index f299cd8dd17cb..2e0472bbb24c7 100644
--- a/tests/unit/processing/schema_migration/test_publish_revisions.py
+++ b/tests/unit/processing/schema_migration/test_publish_revisions.py
@@ -19,12 +19,14 @@ def publish_revisions() -> PublishRevisions:
 
 @pytest.fixture
 def publish_revisions_and_collections(
-    tmpdir, publish_revisions, published_collection, revision, private
+    tmpdir, publish_revisions, published_collection, revision, migration_revision, private
 ) -> Tuple[PublishRevisions, Dict[str, List]]:
     db = {
         published_collection.version_id.id: published_collection,
         revision[0].version_id.id: revision[0],
         revision[1].version_id.id: revision[1],
+        migration_revision[0].version_id.id: migration_revision[0],
+        migration_revision[1].version_id.id: migration_revision[1],
         private.version_id.id: private,
     }
 
@@ -36,7 +38,12 @@ def _get_collections(filter):
     publish_revisions.business_logic.get_collection_version = _get_collection_version
     publish_revisions.business_logic.get_collections = _get_collections
 
-    return publish_revisions, {"published": [published_collection], "revision": revision, "private": [private]}
+    return publish_revisions, {
+        "published": [published_collection],
+        "revision": revision,
+        "migration_revision": migration_revision,
+        "private": [private],
+    }
 
 
 class TestPublishRevisions:
@@ -89,7 +96,7 @@ def test_run_pos(self, publish_revisions_and_collections, caplog):
         # Mock necessary objects and methods
         caplog.set_level(logging.INFO)
         publish_revisions, collections = publish_revisions_and_collections
-        _, revision = collections["revision"]
+        _, migration_revision = collections["migration_revision"]
         publish_revisions.business_logic.get_collections.return_value = [collections]
         publish_revisions.check_datasets = Mock(return_value=[])
 
@@ -97,16 +104,18 @@
         publish_revisions.run()
 
         # Assert the calls and behavior
-        publish_revisions.check_datasets.assert_called_once_with(revision)
+        publish_revisions.check_datasets.assert_called_once_with(migration_revision)
         assert "Publishing collection version." in caplog.messages
-        publish_revisions.business_logic.publish_collection_version.assert_called_once_with(revision.version_id)
+        publish_revisions.business_logic.publish_collection_version.assert_called_once_with(
+            migration_revision.version_id
+        )
 
     def test_run_neg(self, publish_revisions_and_collections, caplog):
         """Run method when check_datasets returns errors."""
         # Mock necessary objects and methods
         caplog.set_level(logging.INFO)
         publish_revisions, collections = publish_revisions_and_collections
-        _, revision = collections["revision"]
+        _, migration_revision = collections["migration_revision"]
         publish_revisions.business_logic.get_collections.return_value = [collections]
         publish_revisions.check_datasets = Mock(return_value=["error message"])
 
@@ -114,7 +123,7 @@ def test_run_neg(self, publish_revisions_and_collections, caplog):
         publish_revisions.run()
 
         # Assert the calls and behavior
-        publish_revisions.check_datasets.assert_called_once_with(revision)
+        publish_revisions.check_datasets.assert_called_once_with(migration_revision)
         assert "Unable to publish collection version." in caplog.messages
         publish_revisions.business_logic.publish_collection_version.assert_not_called()
 
@@ -123,9 +132,9 @@ def test_run_pos__do_not_publish_list(self, publish_revisions_and_collections, c
         # Mock necessary objects and methods
         caplog.set_level(logging.INFO)
         publish_revisions, collections = publish_revisions_and_collections
-        _, revision = collections["revision"]
+        _, migration_revision = collections["migration_revision"]
         publish_revisions.business_logic.get_collections.return_value = [collections]
-        publish_revisions.do_not_publish_list = [revision.version_id.id]
+        publish_revisions.do_not_publish_list = [migration_revision.version_id.id]
         publish_revisions.check_datasets = Mock(return_value=[])
 
         # Call the method
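These tests exercise `run()` only against the migration revision, so the loop they imply is narrow: skip anything on `do_not_publish_list`, publish when `check_datasets` reports no errors, and log either way. A rough free-function sketch of that behavior (the collection-fetching filter and logger wiring are elided guesses, and the real method is PublishRevisions.run):

    import logging

    logger = logging.getLogger(__name__)

    def publish_ready_revisions(business_logic, revisions, do_not_publish_list, check_datasets):
        for version in revisions:  # assumed pre-filtered to auto-migration revisions
            if version.version_id.id in do_not_publish_list:
                continue  # operator opt-out: never auto-publish these versions
            if check_datasets(version):
                logger.info("Unable to publish collection version.")
            else:
                logger.info("Publishing collection version.")
                business_logic.publish_collection_version(version.version_id)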
diff --git a/tests/unit/processing/test_handle_error.py b/tests/unit/processing/test_handle_error.py
index 6b18d35fbf4da..446f951c27897 100644
--- a/tests/unit/processing/test_handle_error.py
+++ b/tests/unit/processing/test_handle_error.py
@@ -98,6 +98,7 @@ def get_collection_version_mock():
                     tombstoned=False,
                 ),
                 has_custom_dataset_order=False,
+                is_auto_version=False,
                 data_submission_policy_version="2.0",
             )
         )