diff --git a/ensembl_genes/notebooks/ensembl_genes_eda.ipynb b/ensembl_genes/notebooks/ensembl_genes_eda.ipynb index 1b01c0e..333ec5d 100644 --- a/ensembl_genes/notebooks/ensembl_genes_eda.ipynb +++ b/ensembl_genes/notebooks/ensembl_genes_eda.ipynb @@ -43,7 +43,7 @@ "source": [ "# parameters cell\n", "species = \"human\"\n", - "release = \"104\"" + "release = \"111\"" ] }, { @@ -154,7 +154,7 @@ "metadata": {}, "outputs": [], "source": [ - "ensg.gene_df.seq_region_exc_type.value_counts(dropna=False)" + "pd.crosstab(ensg.gene_df.coord_system, ensg.gene_df.primary_assembly, margins=True)" ] }, { @@ -597,7 +597,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/ensembl_genes/queries/gene_alt_alleles.sql b/ensembl_genes/queries/gene_alt_alleles.sql index c3e4b42..7e70ce9 100644 --- a/ensembl_genes/queries/gene_alt_alleles.sql +++ b/ensembl_genes/queries/gene_alt_alleles.sql @@ -1,8 +1,13 @@ +-- Get alt allele groups: genes with multiple alleles SELECT gene.stable_id AS ensembl_gene_id, alt_allele.alt_allele_group_id, alt_allele_attrib.attrib = "IS_REPRESENTATIVE" AS alt_allele_is_representative, - assembly_exception.exc_seq_region_id IS NULL AS primary_assembly, + -- we used to use assembly_exception to determine primary assembly, but this table is now empty + -- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773 + -- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'), + -- even though this is a flawed method that would miss scaffolds that are primary assemblies. + LENGTH(seq_region.name) <= 3 AS primary_assembly, seq_region.name AS seq_region, alt_allele_attrib.attrib AS alt_allele_attrib, gene.created_date AS ensembl_created_date @@ -13,11 +18,6 @@ INNER JOIN alt_allele_attrib ON alt_allele.alt_allele_id = alt_allele_attrib.alt_allele_id INNER JOIN seq_region ON gene.seq_region_id = seq_region.seq_region_id -LEFT JOIN assembly_exception - ON seq_region.seq_region_id = assembly_exception.seq_region_id - -- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP) - -- refs internal Related Sciences issue 606. - AND NOT assembly_exception.exc_type <=> "PAR" -- all genes were current when query was written, ensure this is always the case WHERE gene.is_current ORDER BY alt_allele_group_id, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id diff --git a/ensembl_genes/queries/genes.sql b/ensembl_genes/queries/genes.sql index 3c8c80b..e48ec24 100644 --- a/ensembl_genes/queries/genes.sql +++ b/ensembl_genes/queries/genes.sql @@ -15,26 +15,24 @@ SELECT gene.modified_date AS ensembl_modified_date, coord_system.version AS coord_system_version, coord_system.name AS coord_system, - -- get chromosome: refs internal Related Sciences issue 606. - CASE WHEN coord_system.name = "chromosome" - THEN COALESCE(exc_seq_region.name, seq_region.name) + -- we are not able to determine the chromosomes not on the primary assembly + CASE WHEN LENGTH(seq_region.name) <= 3 + THEN seq_region.name END AS chromosome, - assembly_exception.exc_type AS seq_region_exc_type, seq_region.name AS seq_region, gene.seq_region_start AS seq_region_start, gene.seq_region_end AS seq_region_end, gene.seq_region_strand AS seq_region_strand, - assembly_exception.exc_seq_region_id IS NULL AS primary_assembly + -- we used to use assembly_exception to determine primary assembly, but this table is now empty + -- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773 + -- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1') + -- even though this is a flawed method that would miss scaffolds that are primary assemblies. + LENGTH(seq_region.name) <= 3 AS primary_assembly FROM gene LEFT JOIN xref ON xref.xref_id = gene.display_xref_id LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id LEFT JOIN seq_region ON gene.seq_region_id = seq_region.seq_region_id LEFT JOIN coord_system ON seq_region.coord_system_id = coord_system.coord_system_id -LEFT JOIN assembly_exception ON seq_region.seq_region_id = assembly_exception.seq_region_id - -- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP) - -- refs internal Related Sciences issue 606. - AND NOT assembly_exception.exc_type <=> "PAR" -LEFT JOIN seq_region AS exc_seq_region ON assembly_exception.exc_seq_region_id = exc_seq_region.seq_region_id WHERE -- all genes were current when query was written, ensure this is always the case gene.is_current AND