Skip to content

Commit

Permalink
issue #83 - switch to GTF for Ensembl
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Aug 29, 2024
1 parent 923e452 commit f95b1a5
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 27 deletions.
2 changes: 1 addition & 1 deletion cdot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__version__ = "0.2.26"

# Data version is kept in generate_transcript_version.json_schema_version

def get_data_schema_int(version: str) -> int:
""" Return an int which increments upon breaking changes - ie anything other than patch """
Expand Down
4 changes: 2 additions & 2 deletions generate_transcript_data/ensembl_transcripts_chm13v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ fi

merge_args=()
for release in 2022_06 2022_07; do
filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gff3.gz
filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gtf.gz
url=https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename}
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done
Expand Down
6 changes: 3 additions & 3 deletions generate_transcript_data/ensembl_transcripts_grch37.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ fi
merge_args=()
for release in 82 85 87; do
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh37.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
filename=Homo_sapiens.GRCh37.${release}.gtf.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gtf/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done
Expand Down
8 changes: 4 additions & 4 deletions generate_transcript_data/ensembl_transcripts_grch38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ fi
#81 is first GFF3 for GRCh38
merge_args=()
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112; do
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh38.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
# Switched to using GTFs as they contain protein version while Ensembl GFF3s do not (required for c_to_p)
filename=Homo_sapiens.GRCh38.${release}.gtf.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done
Expand Down
58 changes: 42 additions & 16 deletions generate_transcript_data/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, filename, genome_build, url,
self.transcript_proteins = {}
# Store features in separate dict as we don't need to write all as JSON
self.transcript_features_by_type = defaultdict(lambda: defaultdict(list))

self._warned_about_htseq_tag_attributes = False

name_ac_map = {}
if not no_contig_conversion:
Expand Down Expand Up @@ -345,6 +345,40 @@ def _get_gene_accession(feature) -> Optional[str]:
gene_accession = gene_id
return gene_accession

def _handle_protein_version(self, transcript_accession, feature):
# RefSeq GFF3: CDS protein_id = NP_001659.1
# RefSeq GTF: CDS protein_id = NP_001659.1
# Ensembl GTF: CDS protein_id = ENSP00000477624 protein_version = 1
# Ensembl GTF: CDS protein_id = ENSP00000477624 <----- can't use this one as no version
if feature.type == "CDS":
if protein_accession := feature.attr.get("protein_id"):
if protein_version := feature.attr.get("protein_version"):
protein_accession = f"{protein_accession}.{protein_version}"
if not "." in protein_accession:
raise ValueError(f"Protein '{protein_accession}' missing version")
self.transcript_proteins[transcript_accession] = protein_accession

def _add_tags_to_transcript_data(self, transcript_data, feature):
# Ideally we only want to get this once per transcript
# So we want to pick something like mRNA or transcript not CDS or exon
if feature.type in ('mRNA', 'transcript'):
attr_tuples = getattr(feature, "attr_tuples", None)
if attr_tuples is None:
attr_tuples = feature.attr.items()
if not self._warned_about_htseq_tag_attributes:
self._warned_about_htseq_tag_attributes = True
htseq_version = importlib.metadata.version('HTSeq')
logging.warning("Your version of HTSeq (%s) can not handle duplicated tags. Some will be lost "
"See https://github.com/htseq/htseq/issues/83", htseq_version)

attr_list_vals = defaultdict(list)
for tag, value in attr_tuples:
attr_list_vals[tag].append(value)

if tag_list := attr_list_vals.get("tag"):
transcript_data["tag"] = ",".join(tag_list)


def get_genes_and_transcripts(self):
self._parse()
self._finish()
Expand All @@ -356,6 +390,9 @@ class GTFParser(GFFParser):
""" GTF (GFF2) - used by Ensembl, @see http://gmod.org/wiki/GFF2
GFF2 only has 2 levels of feature hierarchy, so we have to build or 3 levels of gene/transcript/exons ourselves
We *have* to use GTF as Ensembl GFF3s don't include the protein version (just the ID)
"""
GTF_TRANSCRIPTS_DATA = GFFParser.CODING_FEATURES | {"exon"}
FEATURE_ALLOW_LIST = GTF_TRANSCRIPTS_DATA | {"gene", "transcript"}
Expand Down Expand Up @@ -392,14 +429,8 @@ def handle_feature(self, feature):
gene_data["biotype"].add(biotype)
transcript["biotype"].add(biotype)

if feature.type == "CDS":
if protein := feature.attr.get("protein_id"):
if protein_version := feature.attr.get("protein_version"):
protein = f"{protein}.{protein_version}"
self.transcript_proteins[transcript_accession] = protein
elif feature.type == "transcript":
if tag := feature.attr.get("tag"):
transcript["tag"] = tag
self._handle_protein_version(transcript_accession, feature)
self._add_tags_to_transcript_data(transcript, feature)


class GFF3Parser(GFFParser):
Expand Down Expand Up @@ -461,15 +492,11 @@ def handle_feature(self, feature):
# Some exons etc may be for miRNAs that have no transcript ID, so skip those (won't have parent)
if parent_id:
transcript_accession = self.transcript_accession_by_feature_id.get(parent_id)
self._handle_protein_version(transcript_accession, feature)
else:
logging.warning("Transcript data has no parent: %s" % feature.get_gff_line())
transcript_accession = None

if feature.type == "CDS":
dbxref = self._get_dbxref(feature)
if genbank := (dbxref.get("Genbank") or dbxref.get("GenBank")):
self.transcript_proteins[transcript_accession] = genbank

if transcript_accession:
transcript = self.transcript_data_by_accession.get(transcript_accession)
if not transcript:
Expand Down Expand Up @@ -516,8 +543,7 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):
if feature.attr.get("partial"):
transcript_data["partial"] = 1

if tag := feature.attr.get("tag"):
transcript_data["tag"] = tag
self._add_tags_to_transcript_data(transcript_data, feature)

self.transcript_data_by_accession[transcript_accession] = transcript_data
self.transcript_accession_by_feature_id[feature.attr["ID"]] = transcript_accession
Expand Down
2 changes: 1 addition & 1 deletion generate_transcript_data/json_schema_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# After 0.2.22 we split version into separate code (pip) and data schema versions
# The cdot client will use its own major/minor to determine whether it can read these data files
JSON_SCHEMA_VERSION = "0.2.26"
JSON_SCHEMA_VERSION = "0.2.27"

0 comments on commit f95b1a5

Please sign in to comment.