Skip to content

Commit

Permalink
remove sql usage of assembly_exception table
Browse files Browse the repository at this point in the history
refs #25
refs #22

As of Ensembl 110, this table was emptied, causing our detection of
the primary assembly to be flawed. This commit adopts a slightly flawed
approach for detecting the primary assembly, but one that should unblock
builds. Chromosome detection is also now incomplete.
  • Loading branch information
dhimmel committed Feb 15, 2024
1 parent 4b1a4d8 commit eb7c779
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 19 deletions.
6 changes: 3 additions & 3 deletions ensembl_genes/notebooks/ensembl_genes_eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"source": [
"# parameters cell\n",
"species = \"human\"\n",
"release = \"104\""
"release = \"111\""
]
},
{
Expand Down Expand Up @@ -154,7 +154,7 @@
"metadata": {},
"outputs": [],
"source": [
"ensg.gene_df.seq_region_exc_type.value_counts(dropna=False)"
"pd.crosstab(ensg.gene_df.coord_system, ensg.gene_df.primary_assembly, margins=True)"
]
},
{
Expand Down Expand Up @@ -597,7 +597,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down
12 changes: 6 additions & 6 deletions ensembl_genes/queries/gene_alt_alleles.sql
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
-- Get alt allele groups: genes with multiple alleles
SELECT
gene.stable_id AS ensembl_gene_id,
alt_allele.alt_allele_group_id,
alt_allele_attrib.attrib = "IS_REPRESENTATIVE" AS alt_allele_is_representative,
assembly_exception.exc_seq_region_id IS NULL AS primary_assembly,
-- we used to use assembly_exception to determine primary assembly, but this table is now empty
-- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773
-- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'),
-- even though this is a flawed method that would miss scaffolds that are primary assemblies.
LENGTH(seq_region.name) <= 3 AS primary_assembly,
seq_region.name AS seq_region,
alt_allele_attrib.attrib AS alt_allele_attrib,
gene.created_date AS ensembl_created_date
Expand All @@ -13,11 +18,6 @@ INNER JOIN alt_allele_attrib
ON alt_allele.alt_allele_id = alt_allele_attrib.alt_allele_id
INNER JOIN seq_region
ON gene.seq_region_id = seq_region.seq_region_id
LEFT JOIN assembly_exception
ON seq_region.seq_region_id = assembly_exception.seq_region_id
-- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP)
-- refs internal Related Sciences issue 606.
AND NOT assembly_exception.exc_type <=> "PAR"
-- all genes were current when query was written, ensure this is always the case
WHERE gene.is_current
ORDER BY alt_allele_group_id, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id
Expand Down
18 changes: 8 additions & 10 deletions ensembl_genes/queries/genes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,24 @@ SELECT
gene.modified_date AS ensembl_modified_date,
coord_system.version AS coord_system_version,
coord_system.name AS coord_system,
-- get chromosome: refs internal Related Sciences issue 606.
CASE WHEN coord_system.name = "chromosome"
THEN COALESCE(exc_seq_region.name, seq_region.name)
-- we are not able to determine the chromosome for genes that are not on the primary assembly
CASE WHEN LENGTH(seq_region.name) <= 3
THEN seq_region.name
END AS chromosome,
assembly_exception.exc_type AS seq_region_exc_type,
seq_region.name AS seq_region,
gene.seq_region_start AS seq_region_start,
gene.seq_region_end AS seq_region_end,
gene.seq_region_strand AS seq_region_strand,
assembly_exception.exc_seq_region_id IS NULL AS primary_assembly
-- we used to use assembly_exception to determine primary assembly, but this table is now empty
-- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773
-- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'),
-- even though this is a flawed method that would miss scaffolds that are primary assemblies.
LENGTH(seq_region.name) <= 3 AS primary_assembly
FROM gene
LEFT JOIN xref ON xref.xref_id = gene.display_xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
LEFT JOIN seq_region ON gene.seq_region_id = seq_region.seq_region_id
LEFT JOIN coord_system ON seq_region.coord_system_id = coord_system.coord_system_id
LEFT JOIN assembly_exception ON seq_region.seq_region_id = assembly_exception.seq_region_id
-- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP)
-- refs internal Related Sciences issue 606.
AND NOT assembly_exception.exc_type <=> "PAR"
LEFT JOIN seq_region AS exc_seq_region ON assembly_exception.exc_seq_region_id = exc_seq_region.seq_region_id
WHERE
-- all genes were current when query was written, ensure this is always the case
gene.is_current AND
Expand Down

0 comments on commit eb7c779

Please sign in to comment.