From 14ce007228b62d6781d7dc5b2c5a562ae5a1864c Mon Sep 17 00:00:00 2001 From: Manuel Lera Ramirez Date: Fri, 4 Aug 2023 19:05:59 +0200 Subject: [PATCH] transvar works with genes that have lowercase c in them + update gtf file with single gene rows --- api.py | 10 +++++----- data/pombe_genome.gtf | 16 +++------------- set_up_transvar.sh | 3 +++ transvar_functions.py | 14 ++++++++++++++ 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/api.py b/api.py index ab9bfc5..3cd74a1 100644 --- a/api.py +++ b/api.py @@ -353,15 +353,15 @@ async def get_residue_at_position(systematic_id: str = Query(example='SPAPB1A10. return PlainTextResponse(gene['peptide'][position - 1]) -@ app.get("/ganno") -async def ganno(variant_description: str = Query(example="I:g.2832796A>T", description='Variant described at the genome level (gDNA)')) -> list[TransvarAnnotation]: +@ app.get("/ganno", summary='Variant described at the genome level (gDNA)', response_model=list[TransvarAnnotation]) +async def ganno(variant_description: str = Query(example="II:g.178497T>A", description='Variant described at the genome level (gDNA)')) -> list[TransvarAnnotation]: try: return parse_transvar_string(get_transvar_str_annotation('ganno', variant_description)) except ValueError as e: raise HTTPException(400, str(e)) -@ app.get("/canno") +@ app.get("/canno", summary='Variant described at the coding DNA level (cDNA)', response_model=list[TransvarAnnotation]) async def canno(variant_description: str = Query(example="SPAC3F10.09:c.5A>T", description='Variant described at the coding DNA level (cDNA)')) -> list[TransvarAnnotation]: try: return parse_transvar_string(get_transvar_str_annotation('canno', variant_description)) @@ -369,8 +369,8 @@ async def canno(variant_description: str = Query(example="SPAC3F10.09:c.5A>T", d raise HTTPException(400, str(e)) -@ app.get("/panno") -async def panno(variant_description: str = Query(example="SPAC3F10.09:p.E2L", description='Variant described at the protein 
level')) -> list[TransvarAnnotation]: +@ app.get("/panno", summary='Variant described at the protein level', response_model=list[TransvarAnnotation]) +async def panno(variant_description: str = Query(example="SPBC1198.04c:p.N3A", description='Variant described at the protein level')) -> list[TransvarAnnotation]: try: return parse_transvar_string(get_transvar_str_annotation('panno', variant_description)) except ValueError as e: diff --git a/data/pombe_genome.gtf b/data/pombe_genome.gtf index 7d8ea08..2127182 100644 --- a/data/pombe_genome.gtf +++ b/data/pombe_genome.gtf @@ -4936,7 +4936,6 @@ I PomBase CDS 1170504 1171270 . - 1 transcript_id "SPAC22A12.08c.1"; gene_id "SP I PomBase CDS 1171362 1171856 . - 1 transcript_id "SPAC22A12.08c.1"; gene_id "SPAC22A12.08c"; I PomBase CDS 1171900 1171956 . - 1 transcript_id "SPAC22A12.08c.1"; gene_id "SPAC22A12.08c"; I PomBase CDS 1171999 1172126 . - 0 transcript_id "SPAC22A12.08c.1"; gene_id "SPAC22A12.08c"; -I PomBase gene 1170705 1172209 . - . gene_id "SPAC22A12.08c"; gene_biotype "protein_coding"; I PomBase transcript 1170705 1172209 . - . transcript_id "SPAC22A12.08c.2"; gene_id "SPAC22A12.08c"; transcript_biotype "protein_coding"; I PomBase exon 1170705 1170724 . - . transcript_id "SPAC22A12.08c.2"; gene_id "SPAC22A12.08c"; I PomBase exon 1170801 1171270 . - . transcript_id "SPAC22A12.08c.2"; gene_id "SPAC22A12.08c"; @@ -24150,7 +24149,6 @@ II PomBase CDS 179322 179387 . - 0 transcript_id "SPBC1198.04c.1"; gene_id "SPBC II PomBase CDS 179440 180143 . - 2 transcript_id "SPBC1198.04c.1"; gene_id "SPBC1198.04c"; II PomBase CDS 180300 180416 . - 2 transcript_id "SPBC1198.04c.1"; gene_id "SPBC1198.04c"; II PomBase CDS 180461 180608 . - 0 transcript_id "SPBC1198.04c.1"; gene_id "SPBC1198.04c"; -II PomBase gene 177469 180628 . - . gene_id "SPBC1198.04c"; gene_biotype "protein_coding"; II PomBase transcript 177469 180628 . - . 
transcript_id "SPBC1198.04c.2"; gene_id "SPBC1198.04c"; transcript_biotype "protein_coding"; II PomBase exon 177469 177608 . - . transcript_id "SPBC1198.04c.2"; gene_id "SPBC1198.04c"; II PomBase exon 177656 178291 . - . transcript_id "SPBC1198.04c.2"; gene_id "SPBC1198.04c"; @@ -24843,7 +24841,6 @@ II PomBase exon 339378 340150 . + 0 transcript_id "SPNCRNA.4674.1"; gene_id "SPN II PomBase gene 340209 340750 . + . gene_id "SPNCRNA.1342"; gene_biotype "ncRNA"; II PomBase transcript 340209 340750 . + . transcript_id "SPNCRNA.1342.1"; gene_id "SPNCRNA.1342"; transcript_biotype "ncRNA"; II PomBase exon 340209 340750 . + 0 transcript_id "SPNCRNA.1342.1"; gene_id "SPNCRNA.1342"; -II PomBase gene 340399 340907 . - . gene_id "SPNCRNA.103"; gene_biotype "ncRNA"; II PomBase transcript 340399 340907 . - . transcript_id "SPNCRNA.103.2"; gene_id "SPNCRNA.103"; transcript_biotype "ncRNA"; II PomBase exon 340399 340907 . - 0 transcript_id "SPNCRNA.103.2"; gene_id "SPNCRNA.103"; II PomBase gene 340696 341725 . - . gene_id "SPBC1271.13"; gene_biotype "protein_coding"; @@ -26443,7 +26440,6 @@ II PomBase CDS 719806 720252 . + 0 transcript_id "SPBC119.04.1"; gene_id "SPBC11 II PomBase gene 719699 720288 . - . gene_id "SPNCRNA.4843"; gene_biotype "ncRNA"; II PomBase transcript 719699 720288 . - . transcript_id "SPNCRNA.4843.1"; gene_id "SPNCRNA.4843"; transcript_biotype "ncRNA"; II PomBase exon 719699 720288 . - 0 transcript_id "SPNCRNA.4843.1"; gene_id "SPNCRNA.4843"; -II PomBase gene 719719 720990 . + . gene_id "SPBC119.04"; gene_biotype "protein_coding"; II PomBase transcript 719719 720990 . + . transcript_id "SPBC119.04.2"; gene_id "SPBC119.04"; transcript_biotype "protein_coding"; II PomBase exon 719719 720990 . + . transcript_id "SPBC119.04.2"; gene_id "SPBC119.04"; II PomBase CDS 719806 720252 . + 0 transcript_id "SPBC119.04.2"; gene_id "SPBC119.04"; @@ -29009,10 +29005,9 @@ II PomBase exon 1306760 1307341 . 
+ 0 transcript_id "SPNCRNA.5123.1"; gene_id "S II PomBase gene 1307004 1307502 . - . gene_id "SPNCRNA.5124"; gene_biotype "ncRNA"; II PomBase transcript 1307004 1307502 . - . transcript_id "SPNCRNA.5124.1"; gene_id "SPNCRNA.5124"; transcript_biotype "ncRNA"; II PomBase exon 1307004 1307502 . - 0 transcript_id "SPNCRNA.5124.1"; gene_id "SPNCRNA.5124"; -II PomBase gene 1307681 1308192 . + . gene_id "SPNCRNA.1715"; gene_biotype "ncRNA"; +II PomBase gene 1307681 1308359 . + . gene_id "SPNCRNA.1715"; gene_biotype "ncRNA"; II PomBase transcript 1307681 1308192 . + . transcript_id "SPNCRNA.1715.2"; gene_id "SPNCRNA.1715"; transcript_biotype "ncRNA"; II PomBase exon 1307681 1308192 . + 0 transcript_id "SPNCRNA.1715.2"; gene_id "SPNCRNA.1715"; -II PomBase gene 1307681 1308359 . + . gene_id "SPNCRNA.1715"; gene_biotype "ncRNA"; II PomBase transcript 1307681 1308359 . + . transcript_id "SPNCRNA.1715.1"; gene_id "SPNCRNA.1715"; transcript_biotype "ncRNA"; II PomBase exon 1307681 1308359 . + 0 transcript_id "SPNCRNA.1715.1"; gene_id "SPNCRNA.1715"; II PomBase gene 1308634 1308745 . + . gene_id "SPSNORNA.21"; gene_biotype "snoRNA"; @@ -29503,7 +29498,6 @@ II PomBase exon 1414254 1414547 . + . transcript_id "SPBC17A3.07.1"; gene_id "SP II PomBase exon 1414603 1416156 . + . transcript_id "SPBC17A3.07.1"; gene_id "SPBC17A3.07"; II PomBase CDS 1414496 1414547 . + 0 transcript_id "SPBC17A3.07.1"; gene_id "SPBC17A3.07"; II PomBase CDS 1414603 1415945 . + 2 transcript_id "SPBC17A3.07.1"; gene_id "SPBC17A3.07"; -II PomBase gene 1414440 1416156 . + . gene_id "SPBC17A3.07"; gene_biotype "protein_coding"; II PomBase transcript 1414440 1416156 . + . transcript_id "SPBC17A3.07.2"; gene_id "SPBC17A3.07"; transcript_biotype "protein_coding"; II PomBase exon 1414440 1414547 . + . transcript_id "SPBC17A3.07.2"; gene_id "SPBC17A3.07"; II PomBase exon 1414603 1416156 . + . transcript_id "SPBC17A3.07.2"; gene_id "SPBC17A3.07"; @@ -44018,7 +44012,7 @@ III PomBase exon 220148 220286 . 
- 0 transcript_id "SPCC548.02c.1"; gene_id "SPC III PomBase gene 219734 220102 . + . gene_id "SPNCRNA.6748"; gene_biotype "ncRNA"; III PomBase transcript 219734 220102 . + . transcript_id "SPNCRNA.6748.1"; gene_id "SPNCRNA.6748"; transcript_biotype "ncRNA"; III PomBase exon 219734 220102 . + 0 transcript_id "SPNCRNA.6748.1"; gene_id "SPNCRNA.6748"; -III PomBase gene 221149 222403 . - . gene_id "SPCC548.03c"; gene_biotype "protein_coding"; +III PomBase gene 221149 222935 . - . gene_id "SPCC548.03c"; gene_biotype "protein_coding"; III PomBase transcript 221149 222403 . - . transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; transcript_biotype "protein_coding"; III PomBase exon 221149 221344 . - . transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; III PomBase exon 221395 221457 . - . transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; @@ -44030,7 +44024,6 @@ III PomBase CDS 221395 221457 . - 2 transcript_id "SPCC548.03c.2"; gene_id "SPCC III PomBase CDS 221498 221689 . - 2 transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; III PomBase CDS 221726 222016 . - 2 transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; III PomBase CDS 222065 222347 . - 0 transcript_id "SPCC548.03c.2"; gene_id "SPCC548.03c"; -III PomBase gene 221149 222935 . - . gene_id "SPCC548.03c"; gene_biotype "protein_coding"; III PomBase transcript 221149 222935 . - . transcript_id "SPCC548.03c.1"; gene_id "SPCC548.03c"; transcript_biotype "protein_coding"; III PomBase exon 221149 221344 . - . transcript_id "SPCC548.03c.1"; gene_id "SPCC548.03c"; III PomBase exon 221395 221457 . - . transcript_id "SPCC548.03c.1"; gene_id "SPCC548.03c"; @@ -49593,7 +49586,7 @@ III PomBase gene 1578091 1579074 . - . gene_id "SPCC162.05"; gene_biotype "prote III PomBase transcript 1578091 1579074 . - . transcript_id "SPCC162.05.1"; gene_id "SPCC162.05"; transcript_biotype "protein_coding"; III PomBase exon 1578091 1579074 . - . transcript_id "SPCC162.05.1"; gene_id "SPCC162.05"; III PomBase CDS 1578220 1579044 . 
- 0 transcript_id "SPCC162.05.1"; gene_id "SPCC162.05"; -III PomBase gene 1579591 1581396 . - . gene_id "SPCC162.04c"; gene_biotype "protein_coding"; +III PomBase gene 1579591 1581928 . - . gene_id "SPCC162.04c"; gene_biotype "protein_coding"; III PomBase transcript 1579591 1581396 . - . transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; transcript_biotype "protein_coding"; III PomBase exon 1579591 1580268 . - . transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; III PomBase exon 1580319 1580381 . - . transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; @@ -49605,7 +49598,6 @@ III PomBase CDS 1580319 1580381 . - 2 transcript_id "SPCC162.04c.2"; gene_id "SP III PomBase CDS 1580422 1580613 . - 2 transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; III PomBase CDS 1580650 1581006 . - 2 transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; III PomBase CDS 1581055 1581337 . - 0 transcript_id "SPCC162.04c.2"; gene_id "SPCC162.04c"; -III PomBase gene 1579591 1581928 . - . gene_id "SPCC162.04c"; gene_biotype "protein_coding"; III PomBase transcript 1579591 1581928 . - . transcript_id "SPCC162.04c.1"; gene_id "SPCC162.04c"; transcript_biotype "protein_coding"; III PomBase exon 1579591 1580268 . - . transcript_id "SPCC162.04c.1"; gene_id "SPCC162.04c"; III PomBase exon 1580319 1580381 . - . transcript_id "SPCC162.04c.1"; gene_id "SPCC162.04c"; @@ -51317,7 +51309,6 @@ III PomBase CDS 2019206 2019288 . + 2 transcript_id "SPCC1906.03.1"; gene_id "SP III PomBase gene 2017583 2017791 . - . gene_id "SPNCRNA.7588"; gene_biotype "ncRNA"; III PomBase transcript 2017583 2017791 . - . transcript_id "SPNCRNA.7588.1"; gene_id "SPNCRNA.7588"; transcript_biotype "ncRNA"; III PomBase exon 2017583 2017791 . - 0 transcript_id "SPNCRNA.7588.1"; gene_id "SPNCRNA.7588"; -III PomBase gene 2018018 2019686 . + . gene_id "SPCC1906.03"; gene_biotype "protein_coding"; III PomBase transcript 2018018 2019686 . + . 
transcript_id "SPCC1906.03.2"; gene_id "SPCC1906.03"; transcript_biotype "protein_coding"; III PomBase exon 2018018 2018397 . + . transcript_id "SPCC1906.03.2"; gene_id "SPCC1906.03"; III PomBase exon 2018447 2018824 . + . transcript_id "SPCC1906.03.2"; gene_id "SPCC1906.03"; @@ -51918,7 +51909,6 @@ III PomBase CDS 2146492 2146770 . + 2 transcript_id "SPCC1620.02.1"; gene_id "SP III PomBase CDS 2146807 2146998 . + 2 transcript_id "SPCC1620.02.1"; gene_id "SPCC1620.02"; III PomBase CDS 2147039 2147101 . + 2 transcript_id "SPCC1620.02.1"; gene_id "SPCC1620.02"; III PomBase CDS 2147152 2147258 . + 2 transcript_id "SPCC1620.02.1"; gene_id "SPCC1620.02"; -III PomBase gene 2146063 2147814 . + . gene_id "SPCC1620.02"; gene_biotype "protein_coding"; III PomBase transcript 2146063 2147814 . + . transcript_id "SPCC1620.02.2"; gene_id "SPCC1620.02"; transcript_biotype "protein_coding"; III PomBase exon 2146063 2146442 . + . transcript_id "SPCC1620.02.2"; gene_id "SPCC1620.02"; III PomBase exon 2146492 2146770 . + . 
transcript_id "SPCC1620.02.2"; gene_id "SPCC1620.02"; diff --git a/set_up_transvar.sh b/set_up_transvar.sh index c9a968b..7211ecd 100644 --- a/set_up_transvar.sh +++ b/set_up_transvar.sh @@ -13,3 +13,6 @@ transvar ganno -i 'I:g.2832795T>A' --ensembl data/pombe_genome.gtf.transvardb -- # Hacky way to use the functions inside another script cp $(which transvar) ./transvar_main_script.py + +transvar panno -i 'SPBC1198.04c.1:p.T566S' --ensembl data/pombe_genome.gtf.transvardb --reference data/pombe_genome.fa +transvar panno -i 'SPAPB1A10.09:p.S372_N374delinsAAA' --ensembl data/pombe_genome.gtf.transvardb --reference data/pombe_genome.fa --gseq \ No newline at end of file diff --git a/transvar_functions.py b/transvar_functions.py index d0300df..2d5cea8 100644 --- a/transvar_functions.py +++ b/transvar_functions.py @@ -30,6 +30,19 @@ def parse_transvar_string(transvar_str: str) -> list[TransvarAnnotation]: return [TransvarAnnotation.from_list(t.split('\t')) for t in transvar_list] +class TransvarCustomString(str): + """Hacky class to circumvent https://github.com/zwdzwd/transvar/issues/59 + """ + def upper(self): + return self + + def strip(self, __chars=None): + return TransvarCustomString(str.strip(self, __chars)) + + def split(self, __sep=None, __maxsplit=-1): + return [TransvarCustomString(x) for x in str.split(self, __sep, __maxsplit)] + + def get_transvar_str_annotation(variant_type: str, variant_description: str) -> str: if variant_type not in ['ganno', 'canno', 'panno']: @@ -56,6 +69,7 @@ def get_transvar_str_annotation(variant_type: str, variant_description: str) -> p.set_defaults(func=partial(main_anno, at='p')) args = parser.parse_args([variant_type, '-i', variant_description, '--ensembl', 'data/pombe_genome.gtf.transvardb', '--reference', 'data/pombe_genome.fa']) + args.i = TransvarCustomString(args.i) output_stream = io.StringIO() with redirect_stdout(output_stream):