-
Notifications
You must be signed in to change notification settings - Fork 1
/
ctd_support.py
139 lines (116 loc) · 5.54 KB
/
ctd_support.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import re
aa = 'GPAVLIMCFYWHKRQNEDST'
aa = aa + aa.lower()
aa = f'[{aa}]'
ctd_repetition_regex = r'\(r\d+-r\d+(?:-\d+)?\)'
ctd_repetition_regex_with_groups = r'\(r(\d+)-r(\d+)(?:-(\d+))?\)'
ctd_mutation_regex = f'{aa}\d+{aa}(?:{ctd_repetition_regex})?'
ctd_deletion_regex = f'(?:delta|\u0394|∆)(?:{ctd_repetition_regex})?'
# This dictionary has keys that are the systematic IDs of the genes that have CTDs,
# the value is a dictionary with:
# - positions: a dictionary with keys that are the residues of the canonical repeat
# e.g. YSPTSPS for SPBC28F2.12, and the value is the repeats in which the residue is
# present, zero-based index.
# - shifts: a tuple of two integers the first value is the position in the protein where
# an aminoacid is missing (-1) or inserted (+1) with respect to the canonical repeat.
# There is only one in rpb1, but a list of tuples could be used if more are needed.
# - start: the position of the first residue of the CTD in the protein (zero-based index)
# - length: the length of the CTD
rpb1_dictionary = {
'positions': {
'Y1': range(29),
'S2': [1, 2] + list(range(4, 29)),
'P3': [1, ] + list(range(3, 8)) + list(range(9, 29)),
'T4': [0, 2] + list(range(4, 29)),
'S5': range(29),
'P6': range(29),
'S7': [0, 3] + list(range(4, 29)),
},
'shift': (1566, -1),
'start': 1550,
'length': 202,
'nb_repeats': 29,
}
def apply_shift(match: re.match, shift: tuple[int, int]):
num_str = match.group()
num = int(num_str)
# If the number is smaller than the position of the missing aminoacid,
# return the number
if num <= shift[0]:
return num_str
# If the number is bigger than the position of the missing aminoacid,
# return the number plus the shift
return str(shift[1] + num)
def ctd_further_check(g, gg):
return gg['CDS'].qualifiers['systematic_id'][0] in ['SPBC28F2.12']
def ctd_convert_to_normal_variant(ctd_substring: str):
mutations = re.findall(ctd_mutation_regex, ctd_substring)
deletions= re.findall(ctd_deletion_regex, ctd_substring)
starting_position = rpb1_dictionary['start']
repeat_length = len(rpb1_dictionary['positions'])
out_list = []
deleted_repeats = list()
for deletion in deletions:
# Entire deletion, always correct, and takes over everything else
if '(' not in deletion:
return '{}-{}'.format(rpb1_dictionary['start'] + 1, rpb1_dictionary['start'] + rpb1_dictionary['length'])
# Deletion with repeat number
match = re.search(ctd_repetition_regex_with_groups, deletion)
start, stop, step = match.groups()
step = int(step) if step is not None else 1
start, stop = int(start), int(stop)
deleted_ranges = list()
for repeat_number in range(start, stop + 1, step):
deleted_repeats.append(repeat_number)
repeat_start = (repeat_number - 1) * repeat_length + starting_position + 1
repeat_end = repeat_start + repeat_length - 1
deleted_ranges.append((repeat_start, repeat_end))
if step == 1:
out_list.append('{}-{}'.format(deleted_ranges[0][0], deleted_ranges[-1][1]))
else:
out_list.append(','.join(['{}-{}'.format(*x) for x in deleted_ranges]))
for mutation in mutations:
original_residue, index_in_repeat, replaced_by = re.search(r'([A-Za-z])(\d+)([A-Za-z])', mutation).groups()
index_in_repeat = int(index_in_repeat)
repeats_where_residue_is_present = rpb1_dictionary['positions'][mutation[:2]]
match = re.search(ctd_repetition_regex_with_groups, mutation)
if match is None:
start, stop, step = 1, rpb1_dictionary['nb_repeats'], 1
else:
start, stop, step = match.groups()
step = int(step) if step is not None else 1
start, stop = int(start), int(stop)
for repeat_number in range(start, stop + 1, step):
if repeat_number in deleted_repeats:
continue
if repeat_number - 1 not in repeats_where_residue_is_present:
continue
mutated_position = index_in_repeat + (repeat_number - 1) * repeat_length + starting_position
out_list.append('{}{}{}'.format(original_residue, mutated_position, replaced_by))
out_str = ','.join(out_list)
return re.sub(r'(\d+)', lambda x: apply_shift(x, rpb1_dictionary['shift']), out_str)
def ctd_check_sequence(ctd_substring: str):
sequence_errors = []
mutations = re.findall(ctd_mutation_regex, ctd_substring)
for mutation in mutations:
if mutation[:2] not in rpb1_dictionary['positions']:
sequence_errors.append('CTD-' + mutation[:2])
match = re.search(ctd_repetition_regex_with_groups, ctd_substring)
start, stop, step = match.groups()
if int(stop) > rpb1_dictionary['nb_repeats']:
sequence_errors.append('CTD-r' + stop)
if int(start) > rpb1_dictionary['nb_repeats']:
sequence_errors.append('CTD-r' + start)
return '/'.join(sequence_errors)
def ctd_format_for_transvar(capture_groups: list[str], gene: dict) -> list[str]:
ctd_substring = capture_groups[0]
result = list()
for ele in ctd_convert_to_normal_variant(ctd_substring).split(','):
if '-' in ele:
result.append('p.{}_{}del'.format(*ele.split('-')))
else:
result.append('p.{}'.format(ele))
return result
def ctd_apply_syntax(ctd_substring: str):
ctd_bits = re.findall(f'({ctd_mutation_regex}|{ctd_deletion_regex})', ctd_substring)
return 'CTD-' + ','.join(ctd_bits)