fix errors on tests + move transition grammar to the end of the file

pombase · Jul 31, 2023 · 11b70ac · 11b70ac
1 parent f213a97
commit 11b70ac
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 25 deletions.
diff --git a/grammar.py b/grammar.py
@@ -279,31 +279,6 @@ def format_negatives(input_list: list[str], indexes: list[int]):
 ]
 
 
-# Transition grammars ==================================================
-
-# This grammar recognises the old syntax, and apply_syntax applies the new style
-transition_old2new_aminoacid_grammar = copy.deepcopy(aminoacid_grammar)
-
-for rule in transition_old2new_aminoacid_grammar:
-    if rule['type'] == 'amino_acid_mutation' and rule['rule_name'] == 'multiple_aa':
-        rule['apply_syntax'] = lambda g: ''.join(g).upper()
-    elif rule['type'] == 'amino_acid_insertion' and rule['rule_name'] == 'single':
-        rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()
-    elif rule['type'] == 'amino_acid_insertion' and rule['rule_name'] == 'multiple':
-        rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()
-
-
-# Same for nucleotides
-transition_old2new_nucleotide_grammar = copy.deepcopy(aminoacid_grammar)
-for rule in transition_old2new_nucleotide_grammar:
-    if rule['type'] == 'nucleotide_mutation' and rule['rule_name'] == 'multiple_nt':
-        rule['apply_syntax'] = lambda g: (''.join(format_negatives(g, [1]))).upper().replace('U', 'T')
-    elif rule['type'] == 'nucleotide_insertion' and rule['rule_name'] == 'single':
-        rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')
-    elif rule['type'] == 'nucleotide_insertion' and rule['rule_name'] == 'multiple':
-        rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')
-
-
 # New grammars - here there are a lot of re-used regex, so we use variables to avoid repetition
 
 multi_aa_regex = f'(?<=\\b)({aa}+)-?(\d+)-?({aa}+)(?=\\b)'
@@ -456,3 +431,27 @@ def format_negatives(input_list: list[str], indexes: list[int]):
         'check_sequence': lambda groups, gene: check_multiple_positions_dont_exist(groups[:1], gene, 'dna'),
     },
 ]
+
+# Transition grammars ==================================================
+
+# This grammar recognises the old syntax; apply_syntax is overridden on the rules matched below so that they output the new style
+transition_old2new_aminoacid_grammar = copy.deepcopy(aminoacid_grammar)
+
+for rule in transition_old2new_aminoacid_grammar:
+    if rule['type'] == 'amino_acid_mutation' and rule['rule_name'] == 'multiple_aa':
+        rule['apply_syntax'] = lambda g: ''.join(g).upper()  # captured groups concatenated, upper-cased
+    elif rule['type'] == 'amino_acid_insertion' and rule['rule_name'] == 'single':
+        rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()  # g[0] is emitted twice
+    elif rule['type'] == 'amino_acid_insertion' and rule['rule_name'] == 'multiple':
+        rule['apply_syntax'] = lambda g: f'{g[0]}{g[1]}{g[0]}{g[2]}'.upper()  # same formatter as 'single'
+
+
+# Same for nucleotides: works on a deep copy of nucleotide_grammar; output is upper-cased with 'U' replaced by 'T'
+transition_old2new_nucleotide_grammar = copy.deepcopy(nucleotide_grammar)
+for rule in transition_old2new_nucleotide_grammar:
+    if rule['type'] == 'nucleotide_mutation' and rule['rule_name'] == 'multiple_nt':
+        rule['apply_syntax'] = lambda g: (''.join(format_negatives(g, [1]))).upper().replace('U', 'T')
+    elif rule['type'] == 'nucleotide_insertion' and rule['rule_name'] == 'single':
+        rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')
+    elif rule['type'] == 'nucleotide_insertion' and rule['rule_name'] == 'multiple':
+        rule['apply_syntax'] = lambda g: f'{g[0]}{format_negatives(g[1:2],[0])[0]}{g[0]}{g[2]}'.upper().replace('U', 'T')
diff --git a/test_transition_grammars.py b/test_transition_grammars.py
@@ -0,0 +1,27 @@
+from models import SyntaxRule, AllowedTypes
+from grammar import aminoacid_grammar, allowed_types_dict, composed_types_dict, nucleotide_grammar, transition_old2new_aminoacid_grammar, transition_old2new_nucleotide_grammar, disruption_grammar
+import pandas
+from allele_qc import check_fun
+import unittest
+import pickle
+
+
+class TransitionGrammarsTest(unittest.TestCase):
+    # Intended to test that conversion is reversible. NOTE(review): the method below only prints the filtered rows and asserts nothing.
+
+    def test_transition_grammar(self):
+        allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)
+
+        with open('data/genome.pickle', 'rb') as ins:
+            genome = pickle.load(ins)  # NOTE(review): pickle.load can execute arbitrary code; the data file must be trusted
+        allele_data = pandas.read_csv('data/alleles.tsv', delimiter='\t', na_filter=False)
+        syntax_rules_aminoacids_old2new = [SyntaxRule.parse_obj(r) for r in transition_old2new_aminoacid_grammar]
+        syntax_rules_nucleotides_old2new = [SyntaxRule.parse_obj(r) for r in transition_old2new_nucleotide_grammar]
+        syntax_rules_disruption = [SyntaxRule.parse_obj(r) for r in disruption_grammar]
+        allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)  # NOTE(review): repeats the first assignment in this method
+
+        extra_cols = allele_data.apply(lambda row: check_fun(row, genome, syntax_rules_aminoacids_old2new, syntax_rules_nucleotides_old2new, syntax_rules_disruption, allowed_types), axis=1, result_type='expand')
+        new_fixes = pandas.concat([allele_data, extra_cols], axis=1)
+        # Keep only rows with a non-empty change_description_to and empty pattern_error and invalid_error
+        new_fixes = new_fixes[(new_fixes['change_description_to'] != '') & (new_fixes['pattern_error'] == '') & (new_fixes['invalid_error'] == '')]
+        print(new_fixes)