Skip to content

Commit

Permalink
abstract species to module
Browse files Browse the repository at this point in the history
refs #4
  • Loading branch information
dhimmel committed Oct 13, 2021
1 parent 948c0b3 commit 925fb98
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 28 deletions.
46 changes: 18 additions & 28 deletions ensembl_genes/ensembl_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,7 @@

import pandas as pd

ensembl_human_gene_pattern = r"^ENSG[0-9]{11}$"
"""
Regex pattern that valid human ensembl gene IDs should match.
https://bioinformatics.stackexchange.com/a/15044/9750
"""


ensembl_reference_chromosomes = [*map(str, range(1, 23)), "X", "Y", "MT"]
"""
Chromosome names applied to genes on the primary assembly rather than alternative sequences.
Refs internal Related Sciences issue 241.
"""
from .species import Species, human


class GeneForMHC(NamedTuple):
Expand All @@ -40,15 +29,15 @@ class Ensembl_Gene_Queries:
pandas.read_sql does not always preserve column types (e.g. converts bool columns to int).
Convert columns listed to the specified type.
"""
species: str = "homo_sapiens"
species: Species = human
"""Which species to query (only homo_sapiens is currently supported)"""
reference_genome: str = "38"
"""GRCh38"""

def __init__(self, release: str):
"""Example release '104'."""
self.release = release
self.database = f"{self.species}_core_{self.release}_{self.reference_genome}"
self.database = (
f"{self.species.name}_core_{self.release}_{self.species.reference_genome}"
)

@property
def connection_url(self) -> str:
Expand Down Expand Up @@ -81,21 +70,23 @@ def run_query(self, name: str) -> pd.DataFrame:
df[column] = df[column].astype(dtype)
return df

@staticmethod
def get_mhc_category(gene: GeneForMHC) -> str:
@classmethod
def get_mhc_category(cls, gene: GeneForMHC) -> str:
"""Assign MHC status of MHC, xMHC, or no to an ensembl gene record."""
chromosome: str = gene.chromosome
start: int = gene.seq_region_start
end: int = gene.seq_region_end
if chromosome != "6":
if chromosome != cls.species.mhc_chromosome:
return "no"
# Ensembl uses 1 based indexing, such that the interval should include
# the end (closed) as per https://www.biostars.org/p/84686/.
gene_interval = pd.Interval(left=start, right=end, closed="both")
# Refs boundary discussion internal Related Sciences issue 127.
# https://bioinformatics.stackexchange.com/a/14719/9750
mhc = pd.Interval(left=28_510_120, right=33_480_577, closed="both")
xmhc = pd.Interval(left=25_726_063, right=33_410_226, closed="both")
mhc = pd.Interval(
left=cls.species.mhc_lower, right=cls.species.mhc_upper, closed="both"
)
xmhc = pd.Interval(
left=cls.species.xmhc_lower, right=cls.species.xmhc_upper, closed="both"
)
if gene_interval.overlaps(mhc):
return "MHC"
if gene_interval.overlaps(xmhc):
Expand Down Expand Up @@ -125,15 +116,15 @@ def gene_df(self) -> pd.DataFrame:
self._check_gene_df(gene_repr_df)
return gene_repr_df

@staticmethod
def _check_gene_df(gene_df: pd.DataFrame) -> None:
@classmethod
def _check_gene_df(cls, gene_df: pd.DataFrame) -> None:
# ensure genes are distinct by their ensembl_gene_id
gene_duplicate_df = gene_df[gene_df.ensembl_gene_id.duplicated(keep=False)]
assert gene_duplicate_df.empty
# check all ensembl IDs match the expected pattern
# ensures LRG gene IDs are filtered (e.g. LRG_47)
invalid_id_df = gene_df[
~gene_df.ensembl_gene_id.str.fullmatch(ensembl_human_gene_pattern)
~gene_df.ensembl_gene_id.str.fullmatch(cls.species.ensembl_gene_pattern)
]
assert invalid_id_df.empty
# ensure all genes have a symbol
Expand All @@ -144,7 +135,7 @@ def _check_gene_df(gene_df: pd.DataFrame) -> None:
# Refs internal Related Sciences issue 606#issuecomment-929609041.
bad_chromosome_df = gene_df.dropna(subset=["chromosome"])
bad_chromosome_df = bad_chromosome_df[
~bad_chromosome_df.chromosome.str.match(r"^([0-9]{1,2}|X|Y|MT)$")
~bad_chromosome_df.chromosome.isin(cls.species.chromosomes)
]
assert bad_chromosome_df.empty

Expand Down Expand Up @@ -419,7 +410,6 @@ class DatasetExport:

class Ensembl_Gene_Catalog_Writer(Ensembl_Gene_Queries):

artifact_id: str = "ensembl_genes_human"
exports: List[DatasetExport] = [
DatasetExport(
name="genes",
Expand Down
38 changes: 38 additions & 0 deletions ensembl_genes/species.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from dataclasses import dataclass


@dataclass
class Species:
name: str
common_name: str
reference_genome: str
ensembl_gene_pattern: str
mhc_chromosome: str
mhc_lower: int
mhc_upper: int
xmhc_lower: int
xmhc_upper: int
chromosomes: list[str]


human = Species(
name="homo_sapiens",
common_name="human",
# GRCh38
reference_genome="38",
# Regex pattern that valid human ensembl gene IDs should match.
# https://bioinformatics.stackexchange.com/a/15044/9750
ensembl_gene_pattern=r"^ENSG[0-9]{11}$",
# Refs MHC boundary discussion internal Related Sciences issue 127.
# https://bioinformatics.stackexchange.com/a/14719/9750
mhc_chromosome="6",
mhc_lower=28_510_120,
mhc_upper=33_480_577,
xmhc_lower=25_726_063,
xmhc_upper=33_410_226,
# Chromosome names applied to genes on the primary assembly rather than alternative sequences.
# Refs internal Related Sciences issue 241.
chromosomes=[*map(str, range(1, 23)), "X", "Y", "MT"],
)

species = [human]
1 change: 1 addition & 0 deletions tests/ensembl_gene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
],
)
def test_get_mhc_category(gene: Gene, category: str) -> None:
assert Ensembl_Gene_Queries.species.name == "homo_sapiens"
assert Ensembl_Gene_Queries.get_mhc_category(gene) == category


Expand Down

0 comments on commit 925fb98

Please sign in to comment.