From e22bf1a7c3625946b33f66d7a71dae4d1f1676c6 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 2 Oct 2024 11:31:34 +0100 Subject: [PATCH 1/7] Added validation for required attributes --- .../production/metadata/api/models/dataset.py | 3 +- .../production/metadata/updater/core.py | 38 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 8668de9..1a9e2a7 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -16,7 +16,7 @@ import sqlalchemy from sqlalchemy import Column, Integer, String, text, ForeignKey, Index, JSON -from sqlalchemy.dialects.mysql import DATETIME +from sqlalchemy.dialects.mysql import DATETIME, TINYINT from sqlalchemy.orm import relationship, backref from sqlalchemy.sql import func from sqlalchemy.types import Enum @@ -49,6 +49,7 @@ class Attribute(LoadAble, Base): name = Column(String(128), nullable=False) label = Column(String(128), nullable=False) description = Column(String(255)) + required = Column(TINYINT(1), nullable=False, default=0) type = Column(Enum('string', 'percent', 'float', 'integer', 'bp', 'number'), server_default=text("'string'")) # One to many relationships # attribute_id within dataset attribute diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index e0354b0..2001ce9 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -35,24 +35,56 @@ logger = logging.getLogger(__name__) +# TODO: +# test_update_unreleased_no_force +# test_update_released_force +# dataset_attribute collumn with the mandatory. +# genebuild.start_date should not be required. +# Test for required keys. class CoreMetaUpdater(BaseMetaUpdater): def __init__(self, db_uri, metadata_uri, release=None, force=None): super().__init__(db_uri, metadata_uri, release, force) self.db_type = 'core' # Single query to get all of the metadata information. self.meta_dict = {} + self._load_meta_dict() + self._validate_required_attributes() + + def _load_meta_dict(self): + """Load metadata into meta_dict from the database.""" with self.db.session_scope() as session: - results = session.query(Meta).all() + results = session.query(Meta).filter(Meta.meta_value.isnot(None), + Meta.meta_value.notin_(['', 'Null', 'NULL'])).all() for result in results: species_id = result.species_id meta_key = result.meta_key meta_value = result.meta_value - if species_id not in self.meta_dict: self.meta_dict[species_id] = {} # WARNING! Duplicated meta_keys for a species_id will not error out!. A datacheck is necessary for key values. self.meta_dict[species_id][meta_key] = meta_value + def _validate_required_attributes(self): + """Check if all required attributes are present in the meta_dict for each species.""" + required_attribute_names = [] + with self.metadata_db.session_scope() as session: + # Query the attribute table to get all required attributes + required_attributes = session.query(Attribute.name).filter(Attribute.required == 1).all() + required_attribute_names = {attr.name for attr in required_attributes} + + with self.db.session_scope() as session: + # Check each species_id in meta_dict + missing_attributes = {} + for species_id, meta in self.meta_dict.items(): + missing = required_attribute_names - set(meta.keys()) + if missing: + missing_attributes[species_id] = missing + + if missing_attributes: + # TODO: TEST THIS RETURN VALUE + exceptions.MissingMetaException( + "Species ID {species_id} is missing required attributes: {missing_attributes}") + # Basic API for the meta table in the submission database. def get_meta_single_meta_key(self, species_id, parameter): species_meta = self.meta_dict.get(species_id) @@ -317,6 +349,7 @@ def get_or_new_organism(self, species_id, meta_session): # Getting the common name from the meta table, otherwise we grab it from ncbi. common_name = self.get_meta_single_meta_key(species_id, "species.common_name") + #TODO: Taxid should be required. taxid = self.get_meta_single_meta_key(species_id, "species.taxonomy_id") if common_name is None or common_name == "": @@ -432,6 +465,7 @@ def get_assembly_sequences(self, species_id, assembly): accession_info = defaultdict( # The None's here are improper, but they break far too much for this update if they are changed. # When accession is decided I will fix them. + # TODO: Just delete the comment. No one cares about the assembly sequence table. lambda: { "names": set(), "accession": None, "length": None, "location": None, "chromosomal": None, "karyotype_rank": None From 776b7fd3e690f731e457cac328757ddcd19f022d Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 10 Oct 2024 11:24:36 +0100 Subject: [PATCH 2/7] Added validation for required attributes --- .../production/metadata/updater/core.py | 62 ++++++++----------- src/tests/databases/core_1/meta.txt | 2 +- src/tests/databases/core_2/meta.txt | 2 +- src/tests/databases/core_3/meta.txt | 2 +- src/tests/databases/core_4/meta.txt | 2 +- src/tests/databases/core_5/meta.txt | 2 +- src/tests/databases/core_6/meta.txt | 2 +- src/tests/databases/core_7/meta.txt | 2 +- src/tests/databases/core_8/meta.txt | 2 +- src/tests/databases/core_9/meta.txt | 2 +- 10 files changed, 35 insertions(+), 45 deletions(-) diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 2001ce9..eb1a9e2 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -10,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License.` import logging -import re import uuid from collections import defaultdict @@ -36,11 +35,12 @@ # TODO: -# test_update_unreleased_no_force -# test_update_released_force -# dataset_attribute collumn with the mandatory. -# genebuild.start_date should not be required. -# Test for required keys. +# Fix all the tests. +# dataset_attribute collumn with the mandatory. DONE +# genebuild.start_date should not be required. DONE +# Organism rather than species +# Test for required keys. DONE +# Remove division requirment. class CoreMetaUpdater(BaseMetaUpdater): def __init__(self, db_uri, metadata_uri, release=None, force=None): super().__init__(db_uri, metadata_uri, release, force) @@ -101,11 +101,11 @@ def get_meta_list_from_prefix_meta_key(self, species_id, prefix): def process_core(self, **kwargs): # Special case for loading a single species from a collection database. Can be removed in a future release - sel_species = kwargs.get('species', None) + sel_species = kwargs.get('organism', None) if sel_species: with self.db.session_scope() as session: multi_species = session.execute( - select(Meta.species_id).filter(Meta.meta_key == "species.production_name").filter( + select(Meta.species_id).filter(Meta.meta_key == "organism.production_name").filter( Meta.meta_value == sel_species).distinct() ) else: @@ -113,7 +113,7 @@ def process_core(self, **kwargs): # Handle multi-species databases and run an update for each species with self.db.session_scope() as session: multi_species = session.execute( - select(Meta.species_id).filter(Meta.meta_key == "species.production_name").distinct() + select(Meta.species_id).filter(Meta.meta_key == "organism.production_name").distinct() ) multi_species = [multi_species for multi_species, in multi_species] @@ -283,16 +283,11 @@ def concurrent_commit_genome_uuid(self, meta_session, species_id, genome_uuid): f"but it successfully updated the metadata. ") def new_genome(self, meta_session, species_id, organism, assembly, assembly_dataset, genebuild_dataset): - production_name = self.get_meta_single_meta_key(species_id, "species.production_name") + production_name = self.get_meta_single_meta_key(species_id, "organism.production_name") genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") genebuild_date = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") if genebuild_date is None: - start_date_str = self.get_meta_single_meta_key(species_id, "genebuild.start_date") - match = re.search(r'^(\d{4}-\d{2})', start_date_str) - if match: - genebuild_date = match.group(0) - else: - raise exceptions.MetadataUpdateException(f"Unable to parse genebuild.start_date from meta") + raise exceptions.MetadataUpdateException(f"Unable to parse genebuild.last_geneset_update from meta") # get next release inline to attach the genome to planned_release = get_or_new_release(self.metadata_uri) new_genome = Genome( @@ -342,15 +337,15 @@ def get_or_new_organism(self, species_id, meta_session): Get an existing Organism instance or create a new one, depending on the information from the metadata database. """ # Fetch the Ensembl name of the organism from metadata using either 'species.biosample_id' - # or 'species.production_name' as the key. + # or 'organism.production_name' as the key. biosample_id = self.get_meta_single_meta_key(species_id, "organism.biosample_id") if biosample_id is None: - biosample_id = self.get_meta_single_meta_key(species_id, "species.production_name") + biosample_id = self.get_meta_single_meta_key(species_id, "organism.production_name") # Getting the common name from the meta table, otherwise we grab it from ncbi. - common_name = self.get_meta_single_meta_key(species_id, "species.common_name") + common_name = self.get_meta_single_meta_key(species_id, "organism.common_name") #TODO: Taxid should be required. - taxid = self.get_meta_single_meta_key(species_id, "species.taxonomy_id") + taxid = self.get_meta_single_meta_key(species_id, "organism.taxonomy_id") if common_name is None or common_name == "": with self.metadata_db.session_scope() as session: @@ -361,19 +356,19 @@ def get_or_new_organism(self, species_id, meta_session): common_name = common_name.name if common_name is not None else '-' # Ensure that the first character is upper case. common_name = common_name[0].upper() + common_name[1:] - species_taxonomy_id = self.get_meta_single_meta_key(species_id, "species.species_taxonomy_id") + species_taxonomy_id = self.get_meta_single_meta_key(species_id, "organism.species_taxonomy_id") if species_taxonomy_id is None: species_taxonomy_id = taxid # Instantiate a new Organism object using data fetched from metadata. new_organism = Organism( species_taxonomy_id=species_taxonomy_id, - taxonomy_id=self.get_meta_single_meta_key(species_id, "species.taxonomy_id"), + taxonomy_id=self.get_meta_single_meta_key(species_id, "organism.taxonomy_id"), common_name=common_name, - scientific_name=self.get_meta_single_meta_key(species_id, "species.scientific_name"), + scientific_name=self.get_meta_single_meta_key(species_id, "organism.scientific_name"), biosample_id=biosample_id, - strain=self.get_meta_single_meta_key(species_id, "species.strain"), - strain_type=self.get_meta_single_meta_key(species_id, "strain.type"), - scientific_parlance_name=self.get_meta_single_meta_key(species_id, "species.parlance_name") + strain=self.get_meta_single_meta_key(species_id, "organism.strain"), + strain_type=self.get_meta_single_meta_key(species_id, "organism.type"), + scientific_parlance_name=self.get_meta_single_meta_key(species_id, "organism.scientific_parlance_name") ) # Query the metadata database to find if an Organism with the same Ensembl name already exists. @@ -607,15 +602,13 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") last_geneset_update = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") - start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date") - if None in (provider_name, last_geneset_update, start_date): + if None in (provider_name, last_geneset_update): exceptions.MissingMetaException( - "genebuild.provider_name, genebuild.last_geneset_update, genebuild.start_date are required keys") - # There should not exist an existing genome with assembly_accesion/provider_name/last_geneset_update and start_date. + "genebuild.provider_name, genebuild.last_geneset_update are required keys") + # There should not exist an existing genome with assembly_accesion/provider_name/last_geneset_update provider_name_attr = aliased(DatasetAttribute) last_geneset_update_attr = aliased(DatasetAttribute) - start_date_attr = aliased(DatasetAttribute) existing_combination = ( meta_session.query(Genome) @@ -624,22 +617,19 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F .join(Assembly, Genome.assembly_id == Assembly.assembly_id) .join(provider_name_attr, Dataset.dataset_id == provider_name_attr.dataset_id) .join(last_geneset_update_attr, Dataset.dataset_id == last_geneset_update_attr.dataset_id) - .join(start_date_attr, Dataset.dataset_id == start_date_attr.dataset_id) .filter( Dataset.name == "genebuild", Assembly.accession == assembly_accession, # Correctly linking the assembly_accession provider_name_attr.value == provider_name, last_geneset_update_attr.value == last_geneset_update, - start_date_attr.value == start_date, provider_name_attr.attribute.has(Attribute.name == "genebuild.provider_name"), last_geneset_update_attr.attribute.has(Attribute.name == "genebuild.last_geneset_update"), - start_date_attr.attribute.has(Attribute.name == "genebuild.start_date") ) .exists() ) if meta_session.query(existing_combination).scalar(): exceptions.MetaException( - "genebuild.provider_name, genebuild.last_geneset_update, genebuild.start_date and assembly.accession can" + "genebuild.provider_name, genebuild.last_geneset_update, and assembly.accession can" " not match existing records. If this is an update, please update genebuild.last_geneset_update with the " "current date. " ) @@ -680,7 +670,7 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F def new_homology(self, meta_session, species_id, genome=None, source=None, dataset_attributes=None, version="1.0"): if source is None: - production_name = self.get_meta_single_meta_key(species_id, "species.production_name") + production_name = self.get_meta_single_meta_key(species_id, "organism.production_name") db_version = self.get_meta_single_meta_key(None, "schema_version") compara_name = production_name + "_compara_" + db_version dataset_source = self.get_or_new_source(meta_session, "compara", name=compara_name) diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 2fe36e3..1db6729 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -5,7 +5,7 @@ 15 1 gencode.version 999 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 10029 8 1 species.strain reference diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index c066f90..ad00f68 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -6,7 +6,7 @@ 16 1 genebuild.last_geneset_update 01 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 12096d8..de0b9b5 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -5,7 +5,7 @@ 15 1 genebuild.last_geneset_update 2024-02 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 3e97320..993b0d3 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -5,7 +5,7 @@ 15 1 gencode.version 999 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index c30703d..ea99e2c 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -3,7 +3,7 @@ 13 1 assembly.name test1 11 1 assembly.ucsc_alias test1 7 1 species.division Ensembl_TEST -6 1 species.production_name test_case_5 +6 1 organism.production_name test_case_5 4 1 species.scientific_name Hominoide 8 1 species.strain reference 9 1 species.strain_group Hominoide diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index 0106f13..d27d3e4 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -6,7 +6,7 @@ 16 1 genebuild.last_geneset_update 01 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index 579ae4d..91b41f6 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -6,7 +6,7 @@ 16 1 genebuild.last_geneset_update 01 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST -6 1 species.production_name Jabberwocky +6 1 organism.production_name Jabberwocky 4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index 7272c90..caf553f 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -6,7 +6,7 @@ 16 1 genebuild.last_geneset_update 01 3 1 species.common_name Caenorhabditis elegans (PRJNA13758) 7 1 species.division Ensembl_TEST -6 1 species.production_name Caenorhabditis_elegans +6 1 organism.production_name Caenorhabditis_elegans 4 1 species.scientific_name Caenorhabditis elegans 1 1 species.species_taxonomy_id 6239 8 1 species.strain N2 diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index 1818c24..1ba0aef 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -6,7 +6,7 @@ 16 1 genebuild.last_geneset_update 01 3 1 species.common_name Caenorhabditis elegans (PRJNA13758) 7 1 species.division Ensembl_TEST -6 1 species.production_name Caenorhabditis_elegans +6 1 organism.production_name Caenorhabditis_elegans 4 1 species.scientific_name Caenorhabditis elegans 1 1 species.species_taxonomy_id 6239 8 1 species.strain N2 From a86574811a8fe731d50dc1fe038226461eaf87d1 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 15 Oct 2024 10:59:32 +0100 Subject: [PATCH 3/7] Warnings on dataset factory --- src/ensembl/production/metadata/api/factories/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index 22438fc..e9ceae3 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -15,7 +15,6 @@ import sqlalchemy.orm from ensembl.utils.database.dbconnection import DBConnection -from sqlalchemy.engine import make_url from sqlalchemy.sql import func from ensembl.production.metadata.api.exceptions import * @@ -41,6 +40,7 @@ def create_all_child_datasets(self, dataset_uuid: str, topic: str = 'production_process', status: DatasetStatus = None, release: EnsemblRelease = None): + # CURRENTLY BROKEN FOR STATUS AND RELEASE. Marc broke it with his last update. Trace back to fix. # Retrieve the top-level dataset # Will not work on datasets that are tied to multiple genomes! # !!!! WILL CREATE THE DATASETS EVEN IF THEY ALREADY EXIST From fac4caca47e18a9845bf59566ce1c080ad73f6f3 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 15 Oct 2024 13:14:41 +0100 Subject: [PATCH 4/7] Fixed all tests, removed force fixed genebuild update --- .../metadata/api/factories/datasets.py | 2 +- .../production/metadata/api/factory.py | 2 +- .../production/metadata/updater/base.py | 3 +- .../production/metadata/updater/core.py | 160 +++++--------- .../metadata/updater/updater_utils.py | 12 +- src/tests/databases/core_1/meta.txt | 21 +- src/tests/databases/core_2/meta.txt | 21 +- src/tests/databases/core_3/meta.txt | 21 +- src/tests/databases/core_4/meta.txt | 21 +- src/tests/databases/core_5/meta.txt | 17 +- src/tests/databases/core_6/meta.txt | 21 +- src/tests/databases/core_7/meta.txt | 21 +- src/tests/databases/core_8/meta.txt | 21 +- src/tests/databases/core_9/meta.txt | 19 +- .../ensembl_genome_metadata/attribute.txt | 208 +++++++++--------- .../ensembl_genome_metadata/table.sql | 1 + src/tests/test_updater.py | 71 +----- 17 files changed, 270 insertions(+), 372 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index e9ceae3..5ce2431 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -178,7 +178,7 @@ def __create_child_datasets_recursive(self, session, parent_dataset, topic=None, version = parent_dataset.version # Create the child dataset if not exist_ds: - logger.debug(f"Creating dataset {dataset_type.name}/{dataset_source.name}/{status.value}/{release}") + # logger.debug(f"Creating dataset {dataset_type.name}/{dataset_source.name}/{status.value}/{release}") child_uuid, dataset, attributes, g_dataset = self.create_dataset(session=session, genome_input=genome_uuid, dataset_source=dataset_source, diff --git a/src/ensembl/production/metadata/api/factory.py b/src/ensembl/production/metadata/api/factory.py index f983fdf..9819143 100644 --- a/src/ensembl/production/metadata/api/factory.py +++ b/src/ensembl/production/metadata/api/factory.py @@ -30,7 +30,7 @@ def meta_factory(db_uri, metadata_uri, force=False): elif '_funcgen_' in db_url.database: raise Exception("funcgen not implemented yet") elif '_core_' in db_url.database: - return CoreMetaUpdater(db_uri, metadata_uri, force=force) + return CoreMetaUpdater(db_uri, metadata_uri) elif '_otherfeatures_' in db_url.database: raise Exception("otherfeatures not implemented yet") elif '_rnaseq_' in db_url.database: diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 8714e47..35c69f5 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -18,9 +18,8 @@ class BaseMetaUpdater: - def __init__(self, db_uri, metadata_uri, release=None, force=None): + def __init__(self, db_uri, metadata_uri, release=None): self.db_uri = db_uri - self.force = force self.metadata_uri = metadata_uri self.db = DBConnection(self.db_uri) self.metadata_db = DBConnection(metadata_uri) diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index eb1a9e2..db77479 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -34,16 +34,9 @@ logger = logging.getLogger(__name__) -# TODO: -# Fix all the tests. -# dataset_attribute collumn with the mandatory. DONE -# genebuild.start_date should not be required. DONE -# Organism rather than species -# Test for required keys. DONE -# Remove division requirment. class CoreMetaUpdater(BaseMetaUpdater): - def __init__(self, db_uri, metadata_uri, release=None, force=None): - super().__init__(db_uri, metadata_uri, release, force) + def __init__(self, db_uri, metadata_uri, release=None): + super().__init__(db_uri, metadata_uri, release) self.db_type = 'core' # Single query to get all of the metadata information. self.meta_dict = {} @@ -81,7 +74,6 @@ def _validate_required_attributes(self): missing_attributes[species_id] = missing if missing_attributes: - # TODO: TEST THIS RETURN VALUE exceptions.MissingMetaException( "Species ID {species_id} is missing required attributes: {missing_attributes}") @@ -127,7 +119,7 @@ def process_species(self, species_id): """ with self.metadata_db.session_scope() as meta_session: - organism, division, organism_group_member = self.get_or_new_organism(species_id, meta_session) + organism = self.get_or_new_organism(species_id, meta_session) assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source = self.get_or_new_assembly( species_id, meta_session) genebuild_dataset, genebuild_dataset_attributes = self.get_or_new_genebuild(species_id, meta_session, @@ -140,15 +132,9 @@ def process_species(self, species_id): Genome.genome_uuid == old_genome_uuid).one_or_none() # Logic for existing key in database. if old_genome is not None: - if self.force is False: - raise exceptions.MetadataUpdateException( - "Core database contains a genome.genome_uuid which matches an entry in the meta table. " - "The force flag was not specified so the core was not updated.") - elif self.is_object_new(organism) or self.is_object_new(assembly): - raise exceptions.ExistingGenomeIdCoreException( - f"Core contains a genome.genome_uuid {old_genome_uuid} which matches an existing entry. " - "The assembly data or organism data is new and requires the creation a new uuid. Delete " - "the old uuid from the core to continue") + raise exceptions.MetadataUpdateException( + "Core database contains a genome.genome_uuid which matches an entry in the meta table. " + "Please update the genebuild.version, genebuild.last_genset_update, delete the genome.genome_uuid and resubmit.") else: raise exceptions.MetadataUpdateException( "Database contains a Genome.genome_uuid, but the corresponding data is not in" @@ -229,15 +215,13 @@ def process_species(self, species_id): else: # Check if the data has been released: - if check_release_status(self.metadata_db, genebuild_dataset.dataset_uuid) and not self.force: + if check_release_status(self.metadata_db, genebuild_dataset.dataset_uuid): raise exceptions.WrongReleaseException( "Existing Organism, Assembly, and Datasets within a release. " - "To update released data set force=True. This will force assembly " - "and genebuild" - "dataset updates and assembly sequences.") + "Please update genebuild.version and genebuild.last_geneset_update for new release. ") else: - logger.info('Rewrite of existing datasets. Only assembly dataset attributes, genebuild ' - 'dataset, dataset attributes, and assembly sequences are modified.') + logger.info('Rewrite of existing datasets. Only genebuild ' + 'dataset, dataset attributes are modified.') # TODO: We need to review this process, because if some Variation / Regulation / Compara datasets # exists we'll expect either to refuse the updates - imagine this was a fix in sequences! OR we # decide to delete the other datasets to force their recompute. In this case, we want to rewrite @@ -250,10 +234,8 @@ def process_species(self, species_id): existing=genebuild_dataset) # #Update assembly_dataset - meta_session.query(DatasetAttribute).filter( - DatasetAttribute.dataset_id == assembly_dataset.dataset_id).delete() self.get_or_new_assembly( - species_id, meta_session, source=dataset_source, existing=assembly_dataset) + species_id, meta_session, source=dataset_source) def concurrent_commit_genome_uuid(self, meta_session, species_id, genome_uuid): # Currently impossible with myisam without two phase commit (requires full refactor) @@ -344,10 +326,10 @@ def get_or_new_organism(self, species_id, meta_session): # Getting the common name from the meta table, otherwise we grab it from ncbi. common_name = self.get_meta_single_meta_key(species_id, "organism.common_name") - #TODO: Taxid should be required. taxid = self.get_meta_single_meta_key(species_id, "organism.taxonomy_id") - if common_name is None or common_name == "": - + if taxid is None: + raise exceptions.MissingMetaException("organism.taxid is required") + if common_name is None: with self.metadata_db.session_scope() as session: common_name = session.query(NCBITaxaName).filter( NCBITaxaName.taxon_id == taxid, @@ -374,16 +356,10 @@ def get_or_new_organism(self, species_id, meta_session): # Query the metadata database to find if an Organism with the same Ensembl name already exists. old_organism = meta_session.query(Organism).filter( Organism.biosample_id == new_organism.biosample_id).one_or_none() - division_name = self.get_meta_single_meta_key(species_id, "species.division") - division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() # If an existing Organism is found, return it and indicate that it already existed. if old_organism: - organism_group_member = meta_session.query(OrganismGroupMember).filter( - OrganismGroupMember.organism_id == old_organism.organism_id, - OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() - - return old_organism, division, organism_group_member + return old_organism else: # If no existing Organism is found, conduct additional checks before creating a new one. @@ -401,29 +377,9 @@ def get_or_new_organism(self, species_id, meta_session): if assembly_test is not None: logger.info("Assembly Accession already exists for a different organism.") - # Fetch the division name of the new organism from metadata. - if division_name is None: - exceptions.MissingMetaException("No species.division found in meta table") - - # Query the metadata database to find if an OrganismGroup with the same division name already exists. - if division is None: - # If no such OrganismGroup exists, create a new one. - division = OrganismGroup( - type="Division", - name=division_name, - ) - meta_session.add(division) - - # Create a new OrganismGroupMember linking the new Organism to the division group. - organism_group_member = OrganismGroupMember( - is_reference=0, - organism=new_organism, - organism_group=division, - ) meta_session.add(new_organism) - meta_session.add(organism_group_member) # Return the newly created Organism and indicate that it is new. - return new_organism, division, organism_group_member + return new_organism def get_assembly_sequences(self, species_id, assembly): """ @@ -528,7 +484,7 @@ def get_assembly_sequences(self, species_id, assembly): assembly_sequences.append(assembly_sequence) return assembly_sequences - def get_or_new_assembly(self, species_id, meta_session, source=None, existing=None): + def get_or_new_assembly(self, species_id, meta_session, source=None): # Get the new assembly accession from the core handed over assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession") assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none() @@ -538,57 +494,46 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No dataset_source = source # This should return the existing objects - if assembly is not None and existing is None: + if assembly is not None: # Get the existing assembly dataset assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == assembly_accession).one_or_none() # I should not need this, but double check on database updating. assembly_dataset_attributes = assembly_dataset.dataset_attributes assembly_sequences = assembly.assembly_sequences return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source - else: - attributes = self.get_meta_list_from_prefix_meta_key(species_id, "assembly") + is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 + with self.db.session_scope() as session: + level = (session.execute(db.select(CoordSystem.name).filter( + CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] + tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") + assembly = Assembly( + ucsc_name=self.get_meta_single_meta_key(species_id, "assembly.ucsc_alias"), + accession=self.get_meta_single_meta_key(species_id, "assembly.accession"), + level=level, + name=self.get_meta_single_meta_key(species_id, "assembly.name"), + accession_body=self.get_meta_single_meta_key(species_id, "assembly.provider"), + assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), + tol_id=tol_id, + alt_accession=self.get_meta_single_meta_key(species_id, "assembly.alt_accession"), + created=func.now(), + assembly_uuid=str(uuid.uuid4()), + url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), + is_reference=is_reference + ) + dataset_factory = DatasetFactory(self.metadata_uri) + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() + (dataset_uuid, assembly_dataset, assembly_dataset_attributes, + new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, + dataset_type, attributes, "assembly", + assembly.accession, None, + DatasetStatus.PROCESSED) + meta_session.add(assembly) + meta_session.add(assembly_dataset) + assembly_sequences = self.get_assembly_sequences(species_id, assembly) + meta_session.add_all(assembly_sequences) - if existing is None: - is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 - with self.db.session_scope() as session: - level = (session.execute(db.select(CoordSystem.name).filter( - CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] - tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") - assembly = Assembly( - ucsc_name=self.get_meta_single_meta_key(species_id, "assembly.ucsc_alias"), - accession=self.get_meta_single_meta_key(species_id, "assembly.accession"), - level=level, - name=self.get_meta_single_meta_key(species_id, "assembly.name"), - accession_body=self.get_meta_single_meta_key(species_id, "assembly.provider"), - assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), - tol_id=tol_id, - alt_accession=self.get_meta_single_meta_key(species_id, "assembly.alt_accession"), - created=func.now(), - assembly_uuid=str(uuid.uuid4()), - url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), - is_reference=is_reference - ) - dataset_factory = DatasetFactory(self.metadata_uri) - dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() - (dataset_uuid, assembly_dataset, assembly_dataset_attributes, - new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, - dataset_type, attributes, "assembly", - assembly.accession, None, - DatasetStatus.PROCESSED) - meta_session.add(assembly) - meta_session.add(assembly_dataset) - assembly_sequences = self.get_assembly_sequences(species_id, assembly) - meta_session.add_all(assembly_sequences) - else: - assembly_dataset = existing - assembly_dataset.dataset_source = dataset_source - for dataset_attribute in assembly_dataset.dataset_attributes: - meta_session.delete(dataset_attribute) - assembly_dataset_attributes = update_attributes(assembly_dataset, attributes, meta_session) - assembly_sequences = meta_session.query(AssemblySequence).filter( - AssemblySequence.assembly_id == assembly.assembly_id) meta_session.add_all(assembly_dataset_attributes) return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source @@ -603,10 +548,6 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") last_geneset_update = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") - if None in (provider_name, last_geneset_update): - exceptions.MissingMetaException( - "genebuild.provider_name, genebuild.last_geneset_update are required keys") - # There should not exist an existing genome with assembly_accesion/provider_name/last_geneset_update provider_name_attr = aliased(DatasetAttribute) last_geneset_update_attr = aliased(DatasetAttribute) @@ -634,7 +575,6 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F "current date. " ) - # The genebuild accession is formed by combining the assembly accession and the genebuild version genebuild_accession = assembly_accession + "_" + genebuild_version if source is None: @@ -662,9 +602,7 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F genebuild_dataset.label = genebuild_accession genebuild_dataset.dataset_source = dataset_source genebuild_dataset.version = genebuild_version - for dataset_attribute in genebuild_dataset.dataset_attributes: - meta_session.delete(dataset_attribute) - genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session) + genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session, replace=True) return genebuild_dataset, genebuild_dataset_attributes diff --git a/src/ensembl/production/metadata/updater/updater_utils.py b/src/ensembl/production/metadata/updater/updater_utils.py index 543c10c..1dd5145 100644 --- a/src/ensembl/production/metadata/updater/updater_utils.py +++ b/src/ensembl/production/metadata/updater/updater_utils.py @@ -13,16 +13,22 @@ from ensembl.production.metadata.api.models import Attribute, DatasetAttribute -def update_attributes(dataset, attributes, session): +def update_attributes(dataset, attributes, session, replace=False): # TODO If attributes already exist, update them. Add option to replace all. dataset_attributes = [] + if replace: + for dataset_attribute in dataset.dataset_attributes: + session.delete(dataset_attribute) + session.flush() for attribute, value in attributes.items(): meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: raise UpdaterException(f"{attribute} does not exist. Add it to the database and reload.") - dataset_attributes.append(DatasetAttribute( + new_dataset_attribute = DatasetAttribute( value=value, dataset=dataset, attribute=meta_attribute, - )) + ) + session.add(new_dataset_attribute) + dataset_attributes.append(new_dataset_attribute) return dataset_attributes \ No newline at end of file diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 1db6729..87844e8 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -3,16 +3,16 @@ 13 1 assembly.name jaber01 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 10029 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 10029 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 10029 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 10029 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 @@ -20,4 +20,5 @@ 23 1 genebuild.provider_name test 24 1 genebuild.start_date 2023-07-Ensembl 25 1 assembly.alt_accession GCA_0000012345.3 -26 \N schema_version 110 \ No newline at end of file +26 \N schema_version 110 +27 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index ad00f68..29b7d70 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -4,16 +4,16 @@ 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 6666666 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 666668 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 6666666 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 666668 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 @@ -21,4 +21,5 @@ 21 1 genome.genome_uuid test 23 1 genebuild.provider_name test2 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index de0b9b5..28ddd53 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -3,20 +3,21 @@ 11 1 assembly.ucsc_alias SCARYIER 14 1 gencode.version 999 15 1 genebuild.last_geneset_update 2024-02 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 6666666 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 666668 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 6666666 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 666668 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 993b0d3..c0ecec6 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -3,20 +3,21 @@ 13 1 assembly.name jaber01 11 1 assembly.ucsc_alias SCARYIER 15 1 gencode.version 999 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 6666666 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 666668 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 6666666 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 666668 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS02 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index ea99e2c..3432d48 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -2,18 +2,19 @@ 14 1 assembly.default test846 13 1 assembly.name test1 11 1 assembly.ucsc_alias test1 -7 1 species.division Ensembl_TEST +7 1 organism.division Ensembl_TEST 6 1 organism.production_name test_case_5 -4 1 species.scientific_name Hominoide -8 1 species.strain reference -9 1 species.strain_group Hominoide -2 1 species.taxonomy_id 9940 -10 1 species.type monsters -5 1 species.url Hominoide +4 1 organism.scientific_name Hominoide +8 1 organism.strain reference +9 1 organism.strain_group Hominoide +2 1 organism.taxonomy_id 9940 +10 1 organism.type monsters +5 1 organism.url Hominoide 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name removed_for_test 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index d27d3e4..e86f1e1 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -4,16 +4,16 @@ 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 6666666 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 666668 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 6666666 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 666668 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 @@ -21,4 +21,5 @@ 21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 23 1 genebuild.provider_name test 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index 91b41f6..61f951a 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -4,16 +4,16 @@ 11 1 assembly.ucsc_alias test_alias 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 -3 1 species.common_name jabberwocky -7 1 species.division Ensembl_TEST +3 1 organism.common_name jabberwocky +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Jabberwocky -4 1 species.scientific_name carol_jabberwocky -1 1 species.species_taxonomy_id 6666666 -8 1 species.strain reference -9 1 species.strain_group testing -2 1 species.taxonomy_id 666668 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name carol_jabberwocky +1 1 organism.species_taxonomy_id 6666666 +8 1 organism.strain reference +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 666668 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 @@ -22,4 +22,5 @@ 24 1 genebuild.start_date 2023-08-Ensembl 25 1 genebuild.havana_datafreeze_date test2 26 \N schema_version 110 -27 1 assembly.total_coding_sequence_length 8989 \ No newline at end of file +27 1 assembly.total_coding_sequence_length 8989 +28 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index caf553f..e157a3d 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -4,20 +4,21 @@ 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 -3 1 species.common_name Caenorhabditis elegans (PRJNA13758) -7 1 species.division Ensembl_TEST +3 1 organism.common_name Caenorhabditis elegans (PRJNA13758) +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Caenorhabditis_elegans -4 1 species.scientific_name Caenorhabditis elegans -1 1 species.species_taxonomy_id 6239 -8 1 species.strain N2 -9 1 species.strain_group testing -2 1 species.taxonomy_id 6239 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name Caenorhabditis elegans +1 1 organism.species_taxonomy_id 6239 +8 1 organism.strain N2 +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 6239 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test 24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 \ No newline at end of file +25 \N schema_version 110 +29 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index 1ba0aef..50446d5 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -4,16 +4,16 @@ 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 -3 1 species.common_name Caenorhabditis elegans (PRJNA13758) -7 1 species.division Ensembl_TEST +3 1 organism.common_name Caenorhabditis elegans (PRJNA13758) +7 1 organism.division Ensembl_TEST 6 1 organism.production_name Caenorhabditis_elegans -4 1 species.scientific_name Caenorhabditis elegans -1 1 species.species_taxonomy_id 6239 -8 1 species.strain N2 -9 1 species.strain_group testing -2 1 species.taxonomy_id 6239 -10 1 species.type monsters -5 1 species.url Jabbe +4 1 organism.scientific_name Caenorhabditis elegans +1 1 organism.species_taxonomy_id 6239 +8 1 organism.strain N2 +9 1 organism.strain_group testing +2 1 organism.taxonomy_id 6239 +10 1 organism.type monsters +5 1 organism.url Jabbe 17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 @@ -23,3 +23,4 @@ 25 1 assembly.total_genome_length 546 26 1 genebuild.start_date 2023-07-Ensembl 27 \N schema_version 110 +28 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/ensembl_genome_metadata/attribute.txt b/src/tests/databases/ensembl_genome_metadata/attribute.txt index 4f756f2..fbdb239 100644 --- a/src/tests/databases/ensembl_genome_metadata/attribute.txt +++ b/src/tests/databases/ensembl_genome_metadata/attribute.txt @@ -1,104 +1,104 @@ -1 assembly.accession assembly.accession assembly.accession string -2 assembly.chromosomes Chromosomes or plasmids Number of structures in cells containing DNA integer -3 assembly.component_sequences Component sequences Part of the primary sequences in assembly integer -4 assembly.contig_n50 Contig N50 Median size of contigs in a genome assembly bp -5 assembly.date assembly.date assembly.date string -6 assembly.default assembly.default assembly.default string -7 assembly.gc_percentage Average GC content Percentage of nucleotides in DNA that are G or C percent -8 assembly.is_reference assembly.is_reference assembly.is_reference string -9 assembly.level assembly.level assembly.level string -10 assembly.mapping assembly.mapping assembly.mapping string -11 assembly.name assembly.name assembly.name string -12 assembly.provider_name assembly.provider_name assembly.provider_name string -13 assembly.provider_url assembly.provider_url assembly.provider_url string -14 assembly.spanned_gaps Spanned gaps Number of gaps covered by sequencing reads integer -15 assembly.tolid assembly.tolid assembly.tolid string -16 assembly.toplevel_sequences Top level sequences Primary sequences in a genome assembly integer -17 assembly.total_coding_sequence_length Total coding sequence length Total length of all coding sequences bp -18 assembly.total_gap_length Total gap length Total length of all gaps in a genome assembly bp -19 assembly.total_genome_length Total genome length Total length of all genomic sequences bp -20 assembly.ucsc_alias assembly.ucsc_alias assembly.ucsc_alias string -21 genebuild.average_cds_length Average CDS length Average length of coding sequences float -22 genebuild.average_coding_exons_per_coding_gene Average coding exons per coding gene Average coding exons per coding gene string -23 genebuild.average_coding_exons_per_transcript Average coding exons per transcript Average coding exons per coding transcript float -24 genebuild.average_coding_exon_length Average exon length per coding gene Average length of coding exons bp -25 genebuild.average_exon_length Average exon length Average length of exons bp -26 genebuild.average_genomic_span Average coding genomic span Average length of all genomic regions bp -27 genebuild.average_intron_length Average intron length Average intron length per coding gene bp -28 genebuild.average_sequence_legth Average coding sequence length Average length of sequences in genome bp -29 genebuild.coding_genes Coding genes Genes that code for proteins integer -30 genebuild.coding_transcripts Coding transcripts Transcripts that code for proteins integer -31 genebuild.coding_transcripts_per_gene Average coding transcripts per gene Average coding transcripts per gene float -32 genebuild.hash genebuild.hash genebuild.hash string -33 genebuild.initial_release_date genebuild.initial_release_date genebuild.initial_release_date string -34 genebuild.last_geneset_update genebuild.last_geneset_update genebuild.last_geneset_update string -35 genebuild.level genebuild.level genebuild.level string -36 genebuild.longest_gene_length Longest coding gene Length of longest gene bp -37 genebuild.method genebuild.method genebuild.method string -38 genebuild.method_display genebuild.method_display genebuild.method_display string -39 genebuild.nc_average_exons_per_transcript Average exons per non-coding transcript Mean exon count per transcript float -40 genebuild.nc_average_exon_length Average exon length per non-coding transcript Mean exon length bp -41 genebuild.nc_average_genomic_span Average non-coding genomic span Mean length of all genomic regions bp -42 genebuild.nc_average_sequence_length Average non-coding sequence length Mean length of all sequences bp -43 genebuild.nc_longest_gene_length Longest non-coding gene Length of longest non-coding gene bp -44 genebuild.nc_long_non_coding_genes Long non-coding genes Long genes not coding for proteins integer -45 genebuild.nc_misc_non_coding_genes Misc. non-coding genes Miscellaneous non-coding genes integer -46 genebuild.nc_non_coding_genes Non-coding genes Genes that don't code for proteins integer -47 genebuild.nc_shortest_gene_length Shortest non-coding gene Length of shortest gene bp -48 genebuild.nc_small_non_coding_genes Small non-coding genes Small genes not coding for proteins integer -49 genebuild.nc_total_introns Introns in non-coding genes Total intron count integer -50 genebuild.nc_total_transcripts Non-coding transcripts Total RNA transcript count integer -51 genebuild.nc_transcripts_per_gene Average transcripts per non-coding gene Mean transcripts count per gene float -52 genebuild.ps_average_exons_per_transcript Average intron length per pseudogene Mean exon count per pseudogene transcript float -53 genebuild.ps_average_exon_length Average exon length per pseudogene Mean pseudogene exon length bp -54 genebuild.ps_average_genomic_span Average pseudogene genomic span Mean length of pseudogene regions bp -55 genebuild.ps_average_intron_length Average intron length per pseudogene Mean pseudogene intron length bp -56 genebuild.ps_average_sequence_length Average pseudogene sequence length Mean length of pseudogene sequences bp -57 genebuild.ps_longest_gene_length Longest pseudogene Length of longest pseudogene bp -58 genebuild.ps_pseudogenes Pseudogenes Genes which don't code functional protiens integer -59 genebuild.ps_shortest_gene_length Shortest pseudogene Length of shortest pseudogene bp -60 genebuild.ps_total_exons Exons in pseudogenes Total exon count in pseudogenes integer -61 genebuild.ps_total_introns Introns in pseudogenes Total intron count in pseudogenes integer -62 genebuild.ps_total_transcripts Transcripts in pseudogenes Total pseudogene RNA transcript count integer -63 genebuild.ps_transcripts_per_gene Average transcripts per pseudogene Mean pseudogene transcripts count per pseudogene float -64 genebuild.shortest_gene_length Shortest coding gene Length of shortest gene bp -65 genebuild.start_date genebuild.start_date genebuild.start_date string -66 genebuild.total_coding_exons Exons in coding genes Total number of coding exons integer -67 genebuild.total_exons Exons in genes Total number of exons integer -68 genebuild.total_introns Introns in coding genes Total number of introns integer -69 genebuild.total_transcripts Transcripts in coding genes Total number of RNA transcripts integer -70 genebuild.transcripts_per_gene Average transcripts per coding gene Average number of transcripts per gene float -71 genebuild.version genebuild.version genebuild.version string -72 genebuild.sample_gene genebuild.sample_gene Sample Gene Data string -73 genebuild.sample_location genebuild.sample_location Sample Location Data string -74 assembly.coverage_depth assembly.coverage_depth assembly.coverage_depth string -75 assembly.web_accession_source assembly.web_accession_source assembly.web_accession_source string -76 assembly.web_accession_type assembly.web_accession_type assembly.web_accession_type string -77 genebuild.id genebuild.id genebuild.id string -78 genebuild.nc_average_intron_length Average intron length per non-coding transcript Mean intron length bp -79 genebuild.projection_source_db genebuild.projection_source_db genebuild.projection_source_db string -80 assembly.long_name assembly.long_name assembly.long_name string -81 assembly.url_name assembly.url_name assembly.url_name string -82 genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date string -83 assembly.version assembly.version assembly.version string -84 genebuild.provider_name genebuild.provider_name genebuild.provider_name string -85 genebuild.provider_url genebuild.provider_url genebuild.provider_url string -119 variation.short_variants Short variants Small-scale genetic variations integer -120 variation.sample_variant variation.sample_variant variation.sample_variant string -123 variation.short_variants_with_phenotype_assertions Short variation with phenotype assertion Short variation with phenotype assertion string -161 compara.homology_coverage compara.homology_coverage compara.homology_coverage float -162 compara.homology_reference_species compara.homology_reference_species compara.homology_reference_species string -163 regulation.open_chromatin_count regulation.open_chromatin_count Number of open chromatin regions integer -164 regulation.promoter_count regulation.promoter_count Number of promoters integer -165 regulation.enhancer_count regulation.enhancer_count Number of enhancers integer -166 regulation.ctcf_count regulation.ctcf_count Number of CTCF binding sites integer -167 regulation.tfbs_count regulation.tfbs_count Number of regions enriched for transcription factor binding integer -168 assembly.tol_id assembly.tol_id assembly.tol_id string -169 genebuild.annotation_source genebuild.annotation_source genebuild.annotation_source string -170 genebuild.nc_total_exons Exons in non-coding genes Total exon count integer -179 assembly.description assembly.description assembly.description string -180 assembly.master_accession assembly.master_accession assembly.master_accession string -181 assembly.alt_accession assembly.alt_accession assembly.alt_accession string -182 dataset.build_start Dataset Build start date Dataset Build start date string -183 dataset.build_end Dataset Build completed Dataset Build completed string -197 genebuild.provider_version genebuild.provider_version genebuild.provider_version string +1 assembly.accession assembly.accession assembly.accession string 1, +2 assembly.chromosomes Chromosomes or plasmids Number of structures in cells containing DNA integer 0, +3 assembly.component_sequences Component sequences Part of the primary sequences in assembly integer 0, +4 assembly.contig_n50 Contig N50 Median size of contigs in a genome assembly bp 0, +5 assembly.date assembly.date assembly.date string 0, +6 assembly.default assembly.default assembly.default string 0, +7 assembly.gc_percentage Average GC content Percentage of nucleotides in DNA that are G or C percent 0, +8 assembly.is_reference assembly.is_reference assembly.is_reference string 0, +9 assembly.level assembly.level assembly.level string 0, +10 assembly.mapping assembly.mapping assembly.mapping string 0, +11 assembly.name assembly.name assembly.name string 1, +12 assembly.provider_name assembly.provider_name assembly.provider_name string 0, +13 assembly.provider_url assembly.provider_url assembly.provider_url string 0, +14 assembly.spanned_gaps Spanned gaps Number of gaps covered by sequencing reads integer 0, +15 assembly.tolid assembly.tolid assembly.tolid string 0, +16 assembly.toplevel_sequences Top level sequences Primary sequences in a genome assembly integer 0, +17 assembly.total_coding_sequence_length Total coding sequence length Total length of all coding sequences bp 0, +18 assembly.total_gap_length Total gap length Total length of all gaps in a genome assembly bp 0, +19 assembly.total_genome_length Total genome length Total length of all genomic sequences bp 0, +20 assembly.ucsc_alias assembly.ucsc_alias assembly.ucsc_alias string 0, +21 genebuild.average_cds_length Average CDS length Average length of coding sequences float 0, +22 genebuild.average_coding_exons_per_coding_gene Average coding exons per coding gene Average coding exons per coding gene string 0, +23 genebuild.average_coding_exons_per_transcript Average coding exons per transcript Average coding exons per coding transcript float 0, +24 genebuild.average_coding_exon_length Average exon length per coding gene Average length of coding exons bp 0, +25 genebuild.average_exon_length Average exon length Average length of exons bp 0, +26 genebuild.average_genomic_span Average coding genomic span Average length of all genomic regions bp 0, +27 genebuild.average_intron_length Average intron length Average intron length per coding gene bp 0, +28 genebuild.average_sequence_legth Average coding sequence length Average length of sequences in genome bp 0, +29 genebuild.coding_genes Coding genes Genes that code for proteins integer 0, +30 genebuild.coding_transcripts Coding transcripts Transcripts that code for proteins integer 0, +31 genebuild.coding_transcripts_per_gene Average coding transcripts per gene Average coding transcripts per gene float 0, +32 genebuild.hash genebuild.hash genebuild.hash string 0, +33 genebuild.initial_release_date genebuild.initial_release_date genebuild.initial_release_date string 0, +34 genebuild.last_geneset_update genebuild.last_geneset_update genebuild.last_geneset_update string 1, +35 genebuild.level genebuild.level genebuild.level string 0, +36 genebuild.longest_gene_length Longest coding gene Length of longest gene bp 0, +37 genebuild.method genebuild.method genebuild.method string 0, +38 genebuild.method_display genebuild.method_display genebuild.method_display string 0, +39 genebuild.nc_average_exons_per_transcript Average exons per non-coding transcript Mean exon count per transcript float 0, +40 genebuild.nc_average_exon_length Average exon length per non-coding transcript Mean exon length bp 0, +41 genebuild.nc_average_genomic_span Average non-coding genomic span Mean length of all genomic regions bp 0, +42 genebuild.nc_average_sequence_length Average non-coding sequence length Mean length of all sequences bp 0, +43 genebuild.nc_longest_gene_length Longest non-coding gene Length of longest non-coding gene bp 0, +44 genebuild.nc_long_non_coding_genes Long non-coding genes Long genes not coding for proteins integer 0, +45 genebuild.nc_misc_non_coding_genes Misc. non-coding genes Miscellaneous non-coding genes integer 0, +46 genebuild.nc_non_coding_genes Non-coding genes Genes that don't code for proteins integer 0, +47 genebuild.nc_shortest_gene_length Shortest non-coding gene Length of shortest gene bp 0, +48 genebuild.nc_small_non_coding_genes Small non-coding genes Small genes not coding for proteins integer 0, +49 genebuild.nc_total_introns Introns in non-coding genes Total intron count integer 0, +50 genebuild.nc_total_transcripts Non-coding transcripts Total RNA transcript count integer 0, +51 genebuild.nc_transcripts_per_gene Average transcripts per non-coding gene Mean transcripts count per gene float 0, +52 genebuild.ps_average_exons_per_transcript Average intron length per pseudogene Mean exon count per pseudogene transcript float 0, +53 genebuild.ps_average_exon_length Average exon length per pseudogene Mean pseudogene exon length bp 0, +54 genebuild.ps_average_genomic_span Average pseudogene genomic span Mean length of pseudogene regions bp 0, +55 genebuild.ps_average_intron_length Average intron length per pseudogene Mean pseudogene intron length bp 0, +56 genebuild.ps_average_sequence_length Average pseudogene sequence length Mean length of pseudogene sequences bp 0, +57 genebuild.ps_longest_gene_length Longest pseudogene Length of longest pseudogene bp 0, +58 genebuild.ps_pseudogenes Pseudogenes Genes which don't code functional protiens integer 0, +59 genebuild.ps_shortest_gene_length Shortest pseudogene Length of shortest pseudogene bp 0, +60 genebuild.ps_total_exons Exons in pseudogenes Total exon count in pseudogenes integer 0, +61 genebuild.ps_total_introns Introns in pseudogenes Total intron count in pseudogenes integer 0, +62 genebuild.ps_total_transcripts Transcripts in pseudogenes Total pseudogene RNA transcript count integer 0, +63 genebuild.ps_transcripts_per_gene Average transcripts per pseudogene Mean pseudogene transcripts count per pseudogene float 0, +64 genebuild.shortest_gene_length Shortest coding gene Length of shortest gene bp 0, +65 genebuild.start_date genebuild.start_date genebuild.start_date string 1, +66 genebuild.total_coding_exons Exons in coding genes Total number of coding exons integer 0, +67 genebuild.total_exons Exons in genes Total number of exons integer 0, +68 genebuild.total_introns Introns in coding genes Total number of introns integer 0, +69 genebuild.total_transcripts Transcripts in coding genes Total number of RNA transcripts integer 0, +70 genebuild.transcripts_per_gene Average transcripts per coding gene Average number of transcripts per gene float 0, +71 genebuild.version genebuild.version genebuild.version string 1, +72 genebuild.sample_gene genebuild.sample_gene Sample Gene Data string 1, +73 genebuild.sample_location genebuild.sample_location Sample Location Data string 1, +74 assembly.coverage_depth assembly.coverage_depth assembly.coverage_depth string 0, +75 assembly.web_accession_source assembly.web_accession_source assembly.web_accession_source string 0, +76 assembly.web_accession_type assembly.web_accession_type assembly.web_accession_type string 0, +77 genebuild.id genebuild.id genebuild.id string 0, +78 genebuild.nc_average_intron_length Average intron length per non-coding transcript Mean intron length bp 0, +79 genebuild.projection_source_db genebuild.projection_source_db genebuild.projection_source_db string 0, +80 assembly.long_name assembly.long_name assembly.long_name string 0, +81 assembly.url_name assembly.url_name assembly.url_name string 0, +82 genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date string 0, +83 assembly.version assembly.version assembly.version string 0, +84 genebuild.provider_name genebuild.provider_name genebuild.provider_name string 1, +85 genebuild.provider_url genebuild.provider_url genebuild.provider_url string 1, +119 variation.short_variants Short variants Small-scale genetic variations integer 0, +120 variation.sample_variant variation.sample_variant variation.sample_variant string 0, +123 variation.short_variants_with_phenotype_assertions Short variation with phenotype assertion Short variation with phenotype assertion string 0, +161 compara.homology_coverage compara.homology_coverage compara.homology_coverage float 0, +162 compara.homology_reference_species compara.homology_reference_species compara.homology_reference_species string 0, +163 regulation.open_chromatin_count regulation.open_chromatin_count Number of open chromatin regions integer 0, +164 regulation.promoter_count regulation.promoter_count Number of promoters integer 0, +165 regulation.enhancer_count regulation.enhancer_count Number of enhancers integer 0, +166 regulation.ctcf_count regulation.ctcf_count Number of CTCF binding sites integer 0, +167 regulation.tfbs_count regulation.tfbs_count Number of regions enriched for transcription factor binding integer 0, +168 assembly.tol_id assembly.tol_id assembly.tol_id string 0, +169 genebuild.annotation_source genebuild.annotation_source genebuild.annotation_source string 1, +170 genebuild.nc_total_exons Exons in non-coding genes Total exon count integer 0, +179 assembly.description assembly.description assembly.description string 0, +180 assembly.master_accession assembly.master_accession assembly.master_accession string 0, +181 assembly.alt_accession assembly.alt_accession assembly.alt_accession string 0, +182 dataset.build_start Dataset Build start date Dataset Build start date string 0, +183 dataset.build_end Dataset Build completed Dataset Build completed string 0, +197 genebuild.provider_version genebuild.provider_version genebuild.provider_version string 0, diff --git a/src/tests/databases/ensembl_genome_metadata/table.sql b/src/tests/databases/ensembl_genome_metadata/table.sql index 1fd37e3..b7dee88 100644 --- a/src/tests/databases/ensembl_genome_metadata/table.sql +++ b/src/tests/databases/ensembl_genome_metadata/table.sql @@ -56,6 +56,7 @@ CREATE TABLE attribute label varchar(128) not null, description varchar(255) null, type enum ('integer', 'float', 'percent', 'string', 'bp') default 'string' null, + required tinyint(1) DEFAULT '0' not null, constraint name unique (name), constraint name_2 diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 8977eb3..4929afd 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -65,7 +65,7 @@ def test_new_organism(self, test_dbs): assembly = session.query(Assembly).where(Assembly.name == 'jaber01').first() assert organism.scientific_name == 'carol_jabberwocky' assert organism.genomes[0].genebuild_version == 'ENS01' - assert organism.genomes[0].genebuild_date == '2023-07' + assert organism.genomes[0].genebuild_date == '2023-01' # Test the Assembly assert assembly.accession == 'GCF_1111111123.3' assert assembly.alt_accession == 'GCA_0000012345.3' @@ -133,7 +133,7 @@ def test_update_geneset(self, test_dbs): assert dataset.dataset_source.type == "core" assert dataset.dataset_type.name == "genebuild" assert dataset.genome_datasets[0].genome.genebuild_version == 'ENS02' - assert dataset.genome_datasets[0].genome.genebuild_date == '2023-07' + assert dataset.genome_datasets[0].genome.genebuild_date == '2023-01' assert dataset.genome_datasets[0].genome.genome_releases is not None def test_taxonomy_common_name(self, test_dbs): @@ -151,32 +151,17 @@ def test_fail_existing_genome_uuid_data_not_match(self, test_dbs): assert ("Core database contains a genome.genome_uuid which matches an entry in the meta table. " "The force flag was not specified so the core was not updated." in str(exif.value)) - #TODO: fix this test case - @pytest.mark.xfail(strict=False) def test_update_unreleased_no_force(self, test_dbs): test = meta_factory(test_dbs['core_7'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) test.process_core() metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: - # Test that assembly seqs have been updated - new_seq = session.query(AssemblySequence).filter( - AssemblySequence.name == 'TEST1_seq_update').one_or_none() - assert new_seq is None - old_seq = session.query(AssemblySequence).where( - (AssemblySequence.name == 'TEST1_seqA')).first() - assert old_seq is not None # Check that the old datasets have been removed genebuild_test = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( DatasetSource.name.like('%core_5'), ).filter(DatasetType.name == "genebuild").one_or_none() assert genebuild_test is None - # Check that the old attributes are gone - count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'assembly.default', - DatasetAttribute.value == 'NewTest' - ).count() - assert count == 1 count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'genebuild.provider_name', DatasetAttribute.value == 'removed_for_test' @@ -188,68 +173,28 @@ def test_update_unreleased_no_force(self, test_dbs): DatasetSource.name.like('%core_7'), DatasetType.name == 'assembly' ).count() - assert count == 1 + assert count == 0 count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( DatasetSource.name.like('%core_7'), DatasetType.name == 'genebuild' ).count() assert count == 1 - # Check that the new attribute values are present + # Check that new assembly attribute values are not present count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'assembly.ucsc_alias', DatasetAttribute.value == 'test_alias' ).count() - assert count > 0 - + assert count == 0 + # Check that new genebuild attribute values are present count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'genebuild.havana_datafreeze_date', DatasetAttribute.value == 'test2' ).count() assert count > 0 - def test_update_released_no_force(self, test_dbs): + def test_update_released(self, test_dbs): test = meta_factory(test_dbs['core_8'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) with pytest.raises(Exception) as exif: test.process_core() assert ("Existing Organism, Assembly, and Datasets within a release. " - "To update released data set force=True. " - "This will force assembly and genebuilddataset updates and assembly sequences." in str(exif.value)) - - #TODO: fix this test case - @pytest.mark.xfail(strict=False) - def test_update_released_force(self, test_dbs): - test = meta_factory(test_dbs['core_9'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url, force=True) - test.process_core() - metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) - with metadata_db.session_scope() as session: - # Test that assembly seqs have not been updated - new_seq = session.query(AssemblySequence).where( - (AssemblySequence.name == 'TEST1_seq_BAD')).first() - assert new_seq is None - old_seq = session.query(AssemblySequence).where( - (AssemblySequence.accession == 'MtDNA')).first() - assert old_seq is not None - # Check that the old datasets have been removed - - count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - DatasetSource.name.like('%core_7'), - DatasetType.name == 'assembly' - ).count() - assert count == 0 - # Check that the new datasets exist - count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - DatasetSource.name.like('%core_9'), - DatasetType.name == 'assembly' - ).count() - assert count == 1 - # Check that the old attributes are gone - count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'assembly.total_coding_sequence_length', - DatasetAttribute.value == '8989' - ).count() - assert count == 0 - count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'genebuild.havana_datafreeze_date', - DatasetAttribute.value == 'test2' - ).count() - assert count == 1 + "Please update genebuild.version and genebuild.last_geneset_update. " in str(exif.value)) From 6b21462f9abeb89b46ca2c609d3596df9b069986 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 15 Oct 2024 13:31:44 +0100 Subject: [PATCH 5/7] updated VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 50aea0e..a4f52a5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.0 \ No newline at end of file +3.2.0 \ No newline at end of file From 7549a695f85e32ba428be70acc31f930b2cf5648 Mon Sep 17 00:00:00 2001 From: Daniel Poppleton <111403332+dpopleton@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:09:25 +0100 Subject: [PATCH 6/7] Update test_updater.py bugfix --- src/tests/test_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 7e26d75..86b9cad 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -196,4 +196,4 @@ def test_update_released(self, test_dbs): test = meta_factory(test_dbs['core_8'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) with pytest.raises(Exception) as exif: test.process_core() - assert ("Existing Organism, Assembly, and Datasets within a release. " + assert ("Existing Organism, Assembly, and Datasets within a release. ") From acfa9c97c272266baed25fafa63e89b5491e7993 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 15 Oct 2024 15:59:50 +0100 Subject: [PATCH 7/7] Fixed test. Again --- src/tests/databases/core_8/meta.txt | 3 +- src/tests/databases/core_9/attrib_type.txt | 3 - src/tests/databases/core_9/coord_system.txt | 1 - src/tests/databases/core_9/meta.txt | 26 ------ src/tests/databases/core_9/seq_region.txt | 3 - .../databases/core_9/seq_region_attrib.txt | 8 -- .../databases/core_9/seq_region_synonym.txt | 3 - src/tests/databases/core_9/table.sql | 86 ------------------- 8 files changed, 2 insertions(+), 131 deletions(-) delete mode 100644 src/tests/databases/core_9/attrib_type.txt delete mode 100644 src/tests/databases/core_9/coord_system.txt delete mode 100644 src/tests/databases/core_9/meta.txt delete mode 100644 src/tests/databases/core_9/seq_region.txt delete mode 100644 src/tests/databases/core_9/seq_region_attrib.txt delete mode 100644 src/tests/databases/core_9/seq_region_synonym.txt delete mode 100644 src/tests/databases/core_9/table.sql diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index e157a3d..725a1ac 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -4,6 +4,7 @@ 11 1 assembly.ucsc_alias SCARY 15 1 gencode.version 999 16 1 genebuild.last_geneset_update 01 +3 1 organism.biosample_id SAMN04256190 3 1 organism.common_name Caenorhabditis elegans (PRJNA13758) 7 1 organism.division Ensembl_TEST 6 1 organism.production_name Caenorhabditis_elegans @@ -14,7 +15,7 @@ 2 1 organism.taxonomy_id 6239 10 1 organism.type monsters 5 1 organism.url Jabbe -17 1 genebuild.version ENS01 +17 1 genebuild.version EXT01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test diff --git a/src/tests/databases/core_9/attrib_type.txt b/src/tests/databases/core_9/attrib_type.txt deleted file mode 100644 index 59c569d..0000000 --- a/src/tests/databases/core_9/attrib_type.txt +++ /dev/null @@ -1,3 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". -316 circular_seq Circular sequence Circular chromosome or plasmid molecule \ No newline at end of file diff --git a/src/tests/databases/core_9/coord_system.txt b/src/tests/databases/core_9/coord_system.txt deleted file mode 100644 index 51314bf..0000000 --- a/src/tests/databases/core_9/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt deleted file mode 100644 index 9bb50da..0000000 --- a/src/tests/databases/core_9/meta.txt +++ /dev/null @@ -1,26 +0,0 @@ -12 1 assembly.accession test1 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARY -15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 01 -3 1 organism.common_name Caenorhabditis elegans (PRJNA13758) -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Caenorhabditis_elegans -4 1 organism.scientific_name Caenorhabditis elegans -1 1 organism.species_taxonomy_id 6239 -8 1 organism.strain N2 -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 6239 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 -24 1 genebuild.havana_datafreeze_date test2 -25 1 assembly.stats.total_genome_length 546 -26 1 genebuild.start_date 2023-07-Ensembl -27 \N schema_version 110 -28 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_9/seq_region.txt b/src/tests/databases/core_9/seq_region.txt deleted file mode 100644 index d79f754..0000000 --- a/src/tests/databases/core_9/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq_BAD 1 666666 -2 TEST2_seqB 1 666 -3 TEST3_seqC 1 1666666 diff --git a/src/tests/databases/core_9/seq_region_attrib.txt b/src/tests/databases/core_9/seq_region_attrib.txt deleted file mode 100644 index aad2591..0000000 --- a/src/tests/databases/core_9/seq_region_attrib.txt +++ /dev/null @@ -1,8 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome -1 316 1 -2 316 0 \ No newline at end of file diff --git a/src/tests/databases/core_9/seq_region_synonym.txt b/src/tests/databases/core_9/seq_region_synonym.txt deleted file mode 100644 index de43d91..0000000 --- a/src/tests/databases/core_9/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_9/table.sql b/src/tests/databases/core_9/table.sql deleted file mode 100644 index 953da98..0000000 --- a/src/tests/databases/core_9/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file