Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera authored Mar 18, 2024
1 parent e34f4ea commit 64a4aba
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 5 deletions.
34 changes: 30 additions & 4 deletions protein_modification_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,17 @@ def check_func(row, genome, allowed_mod_dict):
type='dummy',
rule_name='dummy',
regex=f'(?<!{aa})({aa})(\d+){aa}?',
apply_syntax=lambda x: f'{x[0]}{x[1]}',
)
result = replace_allele_features_with_syntax_rules([dummy_rule], [row['sequence_position']], [], gene)
# Special abbreviations for CTD modifications
ctd_rule = SyntaxRule(
type='ctd_abbreviations',
rule_name='ctd_abbreviations',
regex='(CTD_S2|CTD_T4|CTD_S5|CTD_S7)',
apply_syntax=lambda x: x[0]
)

result = replace_allele_features_with_syntax_rules([dummy_rule, ctd_rule], [row['sequence_position']], [], gene)

# Extract the matched and unmatched elements
match_groups: list[tuple[re.Match, SyntaxRule]] = list(filter(lambda x: type(x) != str, result))
Expand All @@ -70,21 +79,38 @@ def check_func(row, genome, allowed_mod_dict):
if len(unmatched):
return 'pattern_error', ''

correct_name = ','.join(''.join(match_group[0].groups()) for match_group in match_groups)
correct_name_list = list()
for match_group in match_groups:
groups_from_match = match_group[0].groups()
syntax_rule = match_group[1]
correct_name_list.append(syntax_rule.apply_syntax(groups_from_match))

correct_name = ','.join(correct_name_list)

change_sequence_position_to = ''
if correct_name != row['sequence_position']:
change_sequence_position_to = correct_name

errors = [check_sequence_single_pos(match_group[0].groups(), gene, 'peptide') for match_group in match_groups]
# Error handling ommitted for CTD
errors = list()
for match_group in match_groups:
if match_group[1].rule_name == 'ctd_abbreviations':
if systematic_id == 'SPBC28F2.12':
continue
else:
errors.append('no_ctd')
else:
errors.append(check_sequence_single_pos(match_group[0].groups(), gene, 'peptide'))

if any(errors):
return '|'.join(errors), change_sequence_position_to

# If there are restriction for this particular MOD, check for those
if allowed_mod_dict[row['modification']]:
# Get all letters in the sequence_position
residues = set(x for x in re.findall('[a-zA-Z]', row['sequence_position']))
# We use ([A-Za-z])(?=\d) instead of [A-Za-z] so that CTD abbreviations such as CTD_S2
# are also supported
residues = set(x for x in re.findall('([A-Za-z])(?=\d)', row['sequence_position']))
if any(residue not in allowed_mod_dict[row['modification']] for residue in residues):
return 'residue_not_allowed', change_sequence_position_to

Expand Down
18 changes: 18 additions & 0 deletions protein_modification_transvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@
tqdm.pandas()


def expand_CTD_abbreviations(sequence_position: str) -> str:
"""Expand CTD abbreviations to all positions"""

abbreviations = {
"CTD_S2": "S1559,S1566,S1579,S1586,S1593,S1600,S1607,S1614,S1621,S1628,S1635,S1642,S1649,S1656,S1663,S1670,S1677,S1684,S1691,S1698,S1705,S1712,S1719,S1726,S1733,S1740,S1747",
"CTD_T4": "T1554,T1567,T1581,T1588,T1595,T1602,T1609,T1616,T1623,T1630,T1637,T1644,T1651,T1658,T1665,T1672,T1679,T1686,T1693,T1700,T1707,T1714,T1721,T1728,T1735,T1742,T1749",
"CTD_S5": "S1555,S1562,S1568,S1575,S1582,S1589,S1596,S1603,S1610,S1617,S1624,S1631,S1638,S1645,S1652,S1659,S1666,S1673,S1680,S1687,S1694,S1701,S1708,S1715,S1722,S1729,S1736,S1743,S1750",
"CTD_S7": "S1557,S1577,S1584,S1591,S1598,S1605,S1612,S1619,S1626,S1633,S1640,S1647,S1654,S1661,S1668,S1675,S1682,S1689,S1696,S1703,S1710,S1717,S1724,S1731,S1738,S1745,S1752"
}
for key in abbreviations:
sequence_position = sequence_position.replace(key, abbreviations[key])
return sequence_position


def format_for_transvar(row, genome):

# Transvar uses only gene_ids, while the pipeline uses a mix to handle multi-transcripts
Expand Down Expand Up @@ -42,6 +56,7 @@ def get_transvar_coordinates(row, db, genome, exclude_transcripts):
# print(row['systematic_id'], '<<<>>>', row['transvar_input'])
qc_id = process_systematic_id(row['systematic_id'], genome, 'first')
transcript_id = None if (qc_id == row['systematic_id']) else qc_id

try:
transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno', row['transvar_input'], db))
return get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id)
Expand Down Expand Up @@ -69,6 +84,9 @@ def main(genome_file, protein_modification_results_file, exclude_transcripts_fil
data['exploded_sequence_position'] = data['sequence_position']
data.loc[data['change_sequence_position_to'] != '', 'exploded_sequence_position'] = data['change_sequence_position_to']

# Expand CTD abbreviations
data['exploded_sequence_position'] = data['sequence_position'].apply(expand_CTD_abbreviations)

# Explode the sequence_position and the rules_applied
data_exploded = data[['systematic_id', 'sequence_position', 'exploded_sequence_position']].copy()
data_exploded.drop_duplicates(inplace=True)
Expand Down
2 changes: 1 addition & 1 deletion transvar_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_transvar_str_annotation(variant_type: str, variant_description: str, db:
transvar_fields_first_row = output_str.split('\n')[1].split('\t')
if transvar_fields_first_row[-3] == '././.':
if (transvar_fields_first_row[-1] == 'no_valid_transcript_found') and not variant_description.startswith('Q00'):
raise ValueError('no_valid_transcript_found')
raise ValueError('no_valid_transcript_found', variant_description)
else:
raise ValueError('Unknown error: ', transvar_fields_first_row[-1])

Expand Down

0 comments on commit 64a4aba

Please sign in to comment.