Skip to content

Commit

Permalink
fix errors on tests + move transition grammar to the end of the file
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Jul 31, 2023
1 parent f213a97 commit 11b70ac
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 25 deletions.
49 changes: 24 additions & 25 deletions grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,31 +279,6 @@ def format_negatives(input_list: list[str], indexes: list[int]):
]


# Transition grammars ==================================================

# Transition grammar for amino acids: start from a deep copy of the amino-acid
# grammar (so the source rules are left untouched) and override `apply_syntax`
# on selected rules so that matches in the old syntax are written in the new style.
transition_old2new_aminoacid_grammar = copy.deepcopy(aminoacid_grammar)

for rule in transition_old2new_aminoacid_grammar:
    if rule['type'] == 'amino_acid_mutation':
        if rule['rule_name'] == 'multiple_aa':
            # Concatenate all captured groups and upper-case the result.
            rule['apply_syntax'] = lambda g: ''.join(g).upper()
    elif rule['type'] == 'amino_acid_insertion':
        # The 'single' and 'multiple' insertion rules share the same rewrite:
        # group 0, group 1, group 0 again, then group 2, upper-cased.
        if rule['rule_name'] in ('single', 'multiple'):
            rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()


# Transition grammar for nucleotides: same idea as the amino-acid transition
# grammar. It must be copied from `nucleotide_grammar` (not `aminoacid_grammar`):
# the overrides below only match rules whose type is `nucleotide_*`, so copying
# the amino-acid grammar would leave every rule unmodified.
transition_old2new_nucleotide_grammar = copy.deepcopy(nucleotide_grammar)

for rule in transition_old2new_nucleotide_grammar:
    if rule['type'] == 'nucleotide_mutation':
        if rule['rule_name'] == 'multiple_nt':
            # Join the groups after passing index 1 through format_negatives
            # (presumably the position group -- confirm against format_negatives),
            # then upper-case and replace U with T.
            rule['apply_syntax'] = lambda g: (''.join(format_negatives(g, [1]))).upper().replace('U', 'T')
    elif rule['type'] == 'nucleotide_insertion':
        # The 'single' and 'multiple' insertion rules share the same rewrite:
        # group 0, formatted group 1, group 0 again, then group 2; upper-cased, U -> T.
        if rule['rule_name'] in ('single', 'multiple'):
            rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')


# New grammars - many regexes are reused here, so we store them in variables to avoid repetition

# Raw f-string so that regex escapes (\b, \d) are not interpreted as Python
# string escapes; `\d` in a non-raw literal is an invalid escape sequence.
# The resulting pattern text is unchanged.
multi_aa_regex = rf'(?<=\b)({aa}+)-?(\d+)-?({aa}+)(?=\b)'
Expand Down Expand Up @@ -456,3 +431,27 @@ def format_negatives(input_list: list[str], indexes: list[int]):
'check_sequence': lambda groups, gene: check_multiple_positions_dont_exist(groups[:1], gene, 'dna'),
},
]

# Transition grammars ==================================================

# Transition grammar for amino acids: start from a deep copy of the amino-acid
# grammar (so the source rules are left untouched) and override `apply_syntax`
# on selected rules so that matches in the old syntax are written in the new style.
transition_old2new_aminoacid_grammar = copy.deepcopy(aminoacid_grammar)

for rule in transition_old2new_aminoacid_grammar:
    if rule['type'] == 'amino_acid_mutation':
        if rule['rule_name'] == 'multiple_aa':
            # Concatenate all captured groups and upper-case the result.
            rule['apply_syntax'] = lambda g: ''.join(g).upper()
    elif rule['type'] == 'amino_acid_insertion':
        # The 'single' and 'multiple' insertion rules share the same rewrite:
        # group 0, group 1, group 0 again, then group 2, upper-cased.
        if rule['rule_name'] in ('single', 'multiple'):
            rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()


# Transition grammar for nucleotides: deep-copy the nucleotide grammar and
# override `apply_syntax` on selected rules, mirroring the amino-acid
# transition grammar.
transition_old2new_nucleotide_grammar = copy.deepcopy(nucleotide_grammar)

for rule in transition_old2new_nucleotide_grammar:
    if rule['type'] == 'nucleotide_mutation':
        if rule['rule_name'] == 'multiple_nt':
            # Join the groups after passing index 1 through format_negatives
            # (presumably the position group -- confirm against format_negatives),
            # then upper-case and replace U with T.
            rule['apply_syntax'] = lambda g: (''.join(format_negatives(g, [1]))).upper().replace('U', 'T')
    elif rule['type'] == 'nucleotide_insertion':
        # The 'single' and 'multiple' insertion rules share the same rewrite:
        # group 0, formatted group 1, group 0 again, then group 2; upper-cased, U -> T.
        if rule['rule_name'] in ('single', 'multiple'):
            rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')
27 changes: 27 additions & 0 deletions test_transition_grammars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from models import SyntaxRule, AllowedTypes
from grammar import aminoacid_grammar, allowed_types_dict, composed_types_dict, nucleotide_grammar, transition_old2new_aminoacid_grammar, transition_old2new_nucleotide_grammar, disruption_grammar
import pandas
from allele_qc import check_fun
import unittest
import pickle


class TransitionGrammarsTest(unittest.TestCase):
    """Run the old-to-new transition grammars over the allele data.

    The original intent was to test that the conversion is reversible.
    NOTE(review): the test currently makes no assertions; it only prints the
    rows for which a corrected description was produced.
    """

    def test_transition_grammar(self):
        """Apply `check_fun` to every allele row and print the proposed fixes."""
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable if data/genome.pickle is a trusted, locally generated
        # fixture -- verify before pointing this at any other file.
        with open('data/genome.pickle', 'rb') as ins:
            genome = pickle.load(ins)

        # na_filter=False keeps empty cells as '' instead of NaN.
        allele_data = pandas.read_csv('data/alleles.tsv', delimiter='\t', na_filter=False)

        syntax_rules_aminoacids_old2new = [SyntaxRule.parse_obj(r) for r in transition_old2new_aminoacid_grammar]
        syntax_rules_nucleotides_old2new = [SyntaxRule.parse_obj(r) for r in transition_old2new_nucleotide_grammar]
        syntax_rules_disruption = [SyntaxRule.parse_obj(r) for r in disruption_grammar]
        allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)

        # One call per row; result_type='expand' turns each returned mapping/sequence
        # into extra columns aligned with allele_data.
        extra_cols = allele_data.apply(
            lambda row: check_fun(
                row,
                genome,
                syntax_rules_aminoacids_old2new,
                syntax_rules_nucleotides_old2new,
                syntax_rules_disruption,
                allowed_types,
            ),
            axis=1,
            result_type='expand',
        )
        new_fixes = pandas.concat([allele_data, extra_cols], axis=1)

        # Keep only those with corrections: a non-empty proposed description
        # and no pattern or validity errors.
        has_correction = new_fixes['change_description_to'] != ''
        no_pattern_error = new_fixes['pattern_error'] == ''
        no_invalid_error = new_fixes['invalid_error'] == ''
        new_fixes = new_fixes[has_correction & no_pattern_error & no_invalid_error]
        print(new_fixes)

0 comments on commit 11b70ac

Please sign in to comment.