Skip to content

Commit

Permalink
Issue 78 (#84)
Browse files Browse the repository at this point in the history
* transition grammar with tests. Transforms old syntax into new

* removed check_invalid

* closes #82

* new rules added to the grammar, but the replace_allele_features function still needs to be modified

* new syntax working for amino acids

* new syntax working for nucleotides

* manual fixes related to #76

* more changes related to #76

* new grammar with tests

* committed missing test files

* simplify transition grammars

* fix errors on tests + move transition grammar to the end of the file

* transition working both ways including tests

* rename current grammar to old, and remove new from current

* fix #48
  • Loading branch information
manulera authored Jul 31, 2023
1 parent 0e344f7 commit 83ba3c7
Show file tree
Hide file tree
Showing 26 changed files with 7,561 additions and 222 deletions.
4 changes: 2 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.linting.flake8Path": ".venv/bin/flake8",
"python.linting.flake8Args": ["--ignore", "E501,W605"],
"python.linting.flake8Args": ["--ignore", "E501,W605,E731"],
"python.formatting.provider": "autopep8",
"python.formatting.autopep8Args": ["--ignore", "E501,W605"]
"python.formatting.autopep8Args": ["--ignore", "E501,W605,E731"]
}
4 changes: 2 additions & 2 deletions allele_auto_fix.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas
from grammar import aminoacid_grammar
from grammar import aminoacid_grammar_old
from models import SyntaxRule
from refinement_functions import split_multiple_aa, join_multiple_aa
import pickle
Expand All @@ -15,7 +15,7 @@
with open('data/coordinate_changes_dict.json') as ins:
coordinate_changes_dict = json.load(ins)

syntax_rules = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar_old]
syntax_rules_dict = {f'{r.type}:{r.rule_name}': r for r in syntax_rules}

data = pandas.read_csv('results/allele_results_errors.tsv', sep='\t', na_filter=False)
Expand Down
10 changes: 6 additions & 4 deletions allele_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@
results/allele_results_errors_summarised.tsv
"""

from models import SyntaxRule
from models import SyntaxRule, AllowedTypes
from refinement_functions import check_allele_description
from grammar import allowed_types, aminoacid_grammar, nucleotide_grammar, disruption_grammar
from grammar import allowed_types_dict, composed_types_dict, aminoacid_grammar_old, nucleotide_grammar_old, disruption_grammar
import pickle
import pandas
import argparse
from common_autofix_functions import print_warnings
from genome_functions import process_systematic_id
import re


def empty_dict():
"""
Return the dictionary with error
Expand Down Expand Up @@ -109,9 +110,10 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
genome = pickle.load(ins)

allele_data = pandas.read_csv(args.alleles, delimiter='\t', na_filter=False)
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar]
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar_old]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar_old]
syntax_rules_disruption = [SyntaxRule.parse_obj(r) for r in disruption_grammar]
allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)

extra_cols = allele_data.apply(lambda row: check_fun(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides, syntax_rules_disruption, allowed_types), axis=1, result_type='expand')
output_data = pandas.concat([allele_data, extra_cols], axis=1)
Expand Down
9 changes: 5 additions & 4 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from starlette.responses import RedirectResponse, PlainTextResponse, FileResponse
from pydantic import BaseModel
import pickle
from grammar import allowed_types, aminoacid_grammar, nucleotide_grammar, disruption_grammar
from models import SyntaxRule, find_rule
from grammar import allowed_types_dict, composed_types_dict, aminoacid_grammar_old, nucleotide_grammar_old, disruption_grammar
from models import SyntaxRule, find_rule, AllowedTypes
from refinement_functions import check_allele_description, split_multiple_aa
from enum import Enum
from allele_fixes import multi_shift_fix, old_coords_fix, primer_mutagenesis as primer_mutagenesis_func
Expand All @@ -19,10 +19,11 @@
from genome_functions import extract_main_feature_and_strand, process_systematic_id
from Bio.SeqRecord import SeqRecord

syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar]
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar_old]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar_old]
syntax_rules_disruption = [SyntaxRule.parse_obj(r) for r in disruption_grammar]
multi_aa_rule = find_rule(syntax_rules_aminoacids, 'amino_acid_mutation', 'multiple_aa')
allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)


class DNAorProtein(str, Enum):
Expand Down
30 changes: 0 additions & 30 deletions change_log/allele_auto_fix_19042023.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -121,36 +121,6 @@ SPAC30.03c R210G tsn1-R210G amino_acid_mutation R194G tsn1-R194G old_coords_fix
SPAC30.03c R211G tsn1-R211G amino_acid_mutation R195G tsn1-R195G old_coords_fix, revision 20080123: complement(join(4394707..4394741,4394799..4394967,4395010..4395092,4395131..4395491,4395530..4395592)) R211 R211G amino_acid_mutation:single_aa PMID:20081200
SPAC328.01c msn5::ura4+ disruption msn5::ura4 syntax_error msn5::ura4+ disruption:usual PMID:22496451
SPAC343.08c G31D mrp17-810 amino_acid_mutation G33D old_coords_fix, revision 20110324: complement(join(1653288..1653306,1653356..1653606,1653724..1653783)) G31 G31D amino_acid_mutation:single_aa PMID:33823662
SPAC3A12.14 E31V,E67V,E104V,E140V cam1-E0 amino_acid_mutation E13V,E49V,E86V,E122V multi_shift_fix E31|E67|E104|E140 0 E31V|E67V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V,E104V,E140V cam1-E0 amino_acid_mutation E33V,E69V,E106V,E142V multi_shift_fix E31|E67|E104|E140 1 E31V|E67V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V,E104V,E140V cam1-E1 amino_acid_mutation E49V,E86V,E122V multi_shift_fix E67|E104|E140 0 E67V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V,E104V,E140V cam1-E1 amino_acid_mutation E69V,E106V,E142V multi_shift_fix E67|E104|E140 1 E67V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E104V,E140V cam1-E12 amino_acid_mutation E86V,E122V multi_shift_fix E104|E140 0 E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E104V,E140V cam1-E12 amino_acid_mutation E106V,E142V multi_shift_fix E104|E140 1 E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E140V cam1-E123 amino_acid_mutation E122V multi_shift_fix E140 0 E140V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E140V cam1-E123 amino_acid_mutation E142V multi_shift_fix E140 1 E140V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E104V cam1-E124 amino_acid_mutation E86V multi_shift_fix E104 0 E104V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E104V cam1-E124 amino_acid_mutation E106V multi_shift_fix E104 1 E104V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V,E140V cam1-E13 amino_acid_mutation E49V,E122V multi_shift_fix E67|E140 0 E67V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V,E140V cam1-E13 amino_acid_mutation E69V,E142V multi_shift_fix E67|E140 1 E67V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V cam1-E134 amino_acid_mutation E49V multi_shift_fix E67 0 E67V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V cam1-E134 amino_acid_mutation E69V multi_shift_fix E67 1 E67V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E67V,E104V cam1-E14 amino_acid_mutation E49V,E86V multi_shift_fix E67|E104 0 E67V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644,PMID:9264467
SPAC3A12.14 E67V,E104V cam1-E14 amino_acid_mutation E69V,E106V multi_shift_fix E67|E104 1 E67V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644,PMID:9264467
SPAC3A12.14 E31V,E104V,E140V cam1-E2 amino_acid_mutation E13V,E86V,E122V multi_shift_fix E31|E104|E140 0 E31V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E104V,E140V cam1-E2 amino_acid_mutation E33V,E106V,E142V multi_shift_fix E31|E104|E140 1 E31V|E104V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E140V cam1-E23 amino_acid_mutation E13V,E122V multi_shift_fix E31|E140 0 E31V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E140V cam1-E23 amino_acid_mutation E33V,E142V multi_shift_fix E31|E140 1 E31V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V cam1-E234 amino_acid_mutation E13V multi_shift_fix E31 0 E31V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V cam1-E234 amino_acid_mutation E33V multi_shift_fix E31 1 E31V amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E104V cam1-E24 amino_acid_mutation E13V,E86V multi_shift_fix E31|E104 0 E31V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E104V cam1-E24 amino_acid_mutation E33V,E106V multi_shift_fix E31|E104 1 E31V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V,E140V cam1-E3 amino_acid_mutation E13V,E49V,E122V multi_shift_fix E31|E67|E140 0 E31V|E67V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V,E140V cam1-E3 amino_acid_mutation E33V,E69V,E142V multi_shift_fix E31|E67|E140 1 E31V|E67V|E140V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V cam1-E34 amino_acid_mutation E13V,E49V multi_shift_fix E31|E67 0 E31V|E67V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V cam1-E34 amino_acid_mutation E33V,E69V multi_shift_fix E31|E67 1 E31V|E67V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V,E104V cam1-E4 amino_acid_mutation E13V,E49V,E86V multi_shift_fix E31|E67|E104 0 E31V|E67V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3A12.14 E31V,E67V,E104V cam1-E4 amino_acid_mutation E33V,E69V,E106V multi_shift_fix E31|E67|E104 1 E31V|E67V|E104V amino_acid_mutation:single_aa|amino_acid_mutation:single_aa|amino_acid_mutation:single_aa PMID:7657644
SPAC3C7.11c cnx1::ura4+ disruption cnx1::ura4 syntax_error cnx1::ura4+ disruption:usual PMID:7876257
SPAC3F10.04 -365--234 gsa1--365--234 partial_nucleotide_deletion (-365)-(-234) syntax_error -365--234 partial_nucleotide_deletion:usual PMID:15529002
SPAC3G6.01 994–1049 hrp3-deltaSANT partial_amino_acid_deletion 994-1049 syntax_error 994–1049 partial_amino_acid_deletion:multiple_aa PMID:33670267
Expand Down
6 changes: 3 additions & 3 deletions check_manual_changes.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import sys
import pandas
from refinement_functions import check_allele_description
from grammar import allowed_types, aminoacid_grammar, nucleotide_grammar
from grammar import allowed_types, aminoacid_grammar_old, nucleotide_grammar_old
from models import SyntaxRule
import pickle


def main(input_file):
with open('data/genome.pickle', 'rb') as ins:
genome = pickle.load(ins)
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar]
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar_old]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar_old]
data = pandas.read_csv(input_file, sep='\t')
data.fillna('', inplace=True)
for i, line in data.iterrows():
Expand Down
Loading

0 comments on commit 83ba3c7

Please sign in to comment.