fix for pombase/canto#2807 (#115)

pombase · Mar 18, 2024 · 64a4aba · 64a4aba
1 parent e34f4ea
commit 64a4aba
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 5 deletions.
diff --git a/protein_modification_qc.py b/protein_modification_qc.py
@@ -59,8 +59,17 @@ def check_func(row, genome, allowed_mod_dict):
         type='dummy',
         rule_name='dummy',
         regex=f'(?<!{aa})({aa})(\d+){aa}?',
+        apply_syntax=lambda x: f'{x[0]}{x[1]}',
     )
-    result = replace_allele_features_with_syntax_rules([dummy_rule], [row['sequence_position']], [], gene)
+    # Special abbreviations for CTD modifications
+    ctd_rule = SyntaxRule(
+        type='ctd_abbreviations',
+        rule_name='ctd_abbreviations',
+        regex='(CTD_S2|CTD_T4|CTD_S5|CTD_S7)',
+        apply_syntax=lambda x: x[0]
+    )
+
+    result = replace_allele_features_with_syntax_rules([dummy_rule, ctd_rule], [row['sequence_position']], [], gene)
 
     # Extract the matched and unmatched elements
     match_groups: list[tuple[re.Match, SyntaxRule]] = list(filter(lambda x: type(x) != str, result))
@@ -70,21 +79,38 @@ def check_func(row, genome, allowed_mod_dict):
     if len(unmatched):
         return 'pattern_error', ''
 
-    correct_name = ','.join(''.join(match_group[0].groups()) for match_group in match_groups)
+    correct_name_list = list()
+    for match_group in match_groups:
+        groups_from_match = match_group[0].groups()
+        syntax_rule = match_group[1]
+        correct_name_list.append(syntax_rule.apply_syntax(groups_from_match))
+
+    correct_name = ','.join(correct_name_list)
 
     change_sequence_position_to = ''
     if correct_name != row['sequence_position']:
         change_sequence_position_to = correct_name
 
-    errors = [check_sequence_single_pos(match_group[0].groups(), gene, 'peptide') for match_group in match_groups]
+    # Error handling ommitted for CTD
+    errors = list()
+    for match_group in match_groups:
+        if match_group[1].rule_name == 'ctd_abbreviations':
+            if systematic_id == 'SPBC28F2.12':
+                continue
+            else:
+                errors.append('no_ctd')
+        else:
+            errors.append(check_sequence_single_pos(match_group[0].groups(), gene, 'peptide'))
 
     if any(errors):
         return '|'.join(errors), change_sequence_position_to
 
     # If there are restriction for this particular MOD, check for those
     if allowed_mod_dict[row['modification']]:
         # Get all letters in the sequence_position
-        residues = set(x for x in re.findall('[a-zA-Z]', row['sequence_position']))
+        # We use ([A-Za-z])(?=\d) instead of [A-Za-z] so that CTD abbreviations such as CTD_S2
+        # are also supported
+        residues = set(x for x in re.findall('([A-Za-z])(?=\d)', row['sequence_position']))
         if any(residue not in allowed_mod_dict[row['modification']] for residue in residues):
             return 'residue_not_allowed', change_sequence_position_to
 

diff --git a/protein_modification_transvar.py b/protein_modification_transvar.py
@@ -15,6 +15,20 @@
 tqdm.pandas()
 
 
+def expand_CTD_abbreviations(sequence_position: str) -> str:
+    """Expand CTD abbreviations to all positions"""
+
+    abbreviations = {
+        "CTD_S2": "S1559,S1566,S1579,S1586,S1593,S1600,S1607,S1614,S1621,S1628,S1635,S1642,S1649,S1656,S1663,S1670,S1677,S1684,S1691,S1698,S1705,S1712,S1719,S1726,S1733,S1740,S1747",
+        "CTD_T4": "T1554,T1567,T1581,T1588,T1595,T1602,T1609,T1616,T1623,T1630,T1637,T1644,T1651,T1658,T1665,T1672,T1679,T1686,T1693,T1700,T1707,T1714,T1721,T1728,T1735,T1742,T1749",
+        "CTD_S5": "S1555,S1562,S1568,S1575,S1582,S1589,S1596,S1603,S1610,S1617,S1624,S1631,S1638,S1645,S1652,S1659,S1666,S1673,S1680,S1687,S1694,S1701,S1708,S1715,S1722,S1729,S1736,S1743,S1750",
+        "CTD_S7": "S1557,S1577,S1584,S1591,S1598,S1605,S1612,S1619,S1626,S1633,S1640,S1647,S1654,S1661,S1668,S1675,S1682,S1689,S1696,S1703,S1710,S1717,S1724,S1731,S1738,S1745,S1752"
+    }
+    for key in abbreviations:
+        sequence_position = sequence_position.replace(key, abbreviations[key])
+    return sequence_position
+
+
 def format_for_transvar(row, genome):
 
     # Transvar uses only gene_ids, while the pipeline uses a mix to handle multi-transcripts
@@ -42,6 +56,7 @@ def get_transvar_coordinates(row, db, genome, exclude_transcripts):
     # print(row['systematic_id'], '<<<>>>', row['transvar_input'])
     qc_id = process_systematic_id(row['systematic_id'], genome, 'first')
     transcript_id = None if (qc_id == row['systematic_id']) else qc_id
+
     try:
         transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno', row['transvar_input'], db))
         return get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id)
@@ -69,6 +84,9 @@ def main(genome_file, protein_modification_results_file, exclude_transcripts_fil
     data['exploded_sequence_position'] = data['sequence_position']
     data.loc[data['change_sequence_position_to'] != '', 'exploded_sequence_position'] = data['change_sequence_position_to']
 
+    # Expand CTD abbreviations
+    data['exploded_sequence_position'] = data['sequence_position'].apply(expand_CTD_abbreviations)
+
     # Explode the sequence_position and the rules_applied
     data_exploded = data[['systematic_id', 'sequence_position', 'exploded_sequence_position']].copy()
     data_exploded.drop_duplicates(inplace=True)

diff --git a/transvar_functions.py b/transvar_functions.py
@@ -110,7 +110,7 @@ def get_transvar_str_annotation(variant_type: str, variant_description: str, db:
         transvar_fields_first_row = output_str.split('\n')[1].split('\t')
         if transvar_fields_first_row[-3] == '././.':
             if (transvar_fields_first_row[-1] == 'no_valid_transcript_found') and not variant_description.startswith('Q00'):
-                raise ValueError('no_valid_transcript_found')
+                raise ValueError('no_valid_transcript_found', variant_description)
             else:
                 raise ValueError('Unknown error: ', transvar_fields_first_row[-1])