-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* getting there * working in the pipeline and the API * changed convert_ctd api endpoint response type
- Loading branch information
Showing
12 changed files
with
374 additions
and
125 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
import re | ||
|
||
# Regex character class matching a single one-letter amino acid code,
# in either upper or lower case.
aa = 'GPAVLIMCFYWHKRQNEDST'
aa = aa + aa.lower()
aa = f'[{aa}]'

# Repeat specification such as "(r1-r12)" or "(r1-r12-2)": first repeat,
# last repeat and an optional third number (used as a step by the functions
# that parse it). The "_with_groups" variant captures the three numbers.
ctd_repetition_regex = r'\(r\d+-r\d+(?:-\d+)?\)'
ctd_repetition_regex_with_groups = r'\(r(\d+)-r(\d+)(?:-(\d+))?\)'
# Residue + number + residue (e.g. "Y1F"), optionally followed by a repeat
# specification. Raw f-string so that "\d" is not an invalid string escape.
ctd_mutation_regex = rf'{aa}\d+{aa}(?:{ctd_repetition_regex})?'
# "delta" or one of the two delta-like characters (U+0394, U+2206),
# optionally followed by a repeat specification.
ctd_deletion_regex = f'(?:delta|\u0394|∆)(?:{ctd_repetition_regex})?'
|
||
# Description of the CTD of rpb1 (SPBC28F2.12). The dictionary contains:
# - positions: maps each residue of the canonical repeat (YSPTSPS), written as
#   the residue letter followed by its one-based index in the repeat (e.g. 'Y1'),
#   to the repeats in which that residue is present (zero-based repeat index).
# - shift: a tuple of two integers. The first value is the position in the protein
#   where an amino acid is missing or inserted with respect to the canonical
#   repeat; the second is -1 if it is missing and +1 if it is inserted.
#   There is only one in rpb1, but a list of tuples could be used if more are needed.
# - start: the position of the first residue of the CTD in the protein (zero-based index)
# - length: the length of the CTD
# - nb_repeats: the number of repeats in the CTD

rpb1_dictionary = dict(
    positions=dict(
        Y1=range(29),
        S2=[1, 2, *range(4, 29)],
        P3=[1, *range(3, 8), *range(9, 29)],
        T4=[0, 2, *range(4, 29)],
        S5=range(29),
        P6=range(29),
        S7=[0, 3, *range(4, 29)],
    ),
    shift=(1566, -1),
    start=1550,
    length=202,
    nb_repeats=29,
)
|
||
|
||
def apply_shift(match: re.Match, shift: tuple[int, int]) -> str:
    """Return the number matched by *match*, corrected by *shift*.

    Intended as the replacement callback of ``re.sub`` over a pattern that
    matches integers.

    Args:
        match: a regex match whose whole matched text is an integer.
        shift: ``(position, offset)``; numbers strictly greater than
            ``position`` have ``offset`` added to them.

    Returns:
        The matched text unchanged when the number is smaller than or equal
        to ``position``, otherwise the shifted number as a string.
    """
    num_str = match.group()
    num = int(num_str)
    # Up to and including the shift position the numbering is unaffected;
    # return the original text so its formatting is preserved.
    if num <= shift[0]:
        return num_str
    # Past the shift position, apply the offset.
    return str(shift[1] + num)
|
||
|
||
def ctd_further_check(g, gg):
    """Tell whether the gene described by *gg* is one that has a CTD.

    The first argument is not used. ``gg['CDS']`` must expose a ``qualifiers``
    mapping whose ``'systematic_id'`` entry is a sequence; its first element
    is compared against the supported systematic IDs.
    NOTE(review): ``gg['CDS']`` looks like a sequence-feature object from a
    genome annotation library - confirm against the caller.
    """
    supported_ids = ['SPBC28F2.12']
    systematic_id = gg['CDS'].qualifiers['systematic_id'][0]
    return systematic_id in supported_ids
|
||
|
||
def _ctd_repeat_bounds(spec_match):
    """Convert a match of the grouped repeat regex into (first, last, step) integers."""
    first, last, stride = spec_match.groups()
    # The third number is optional in the syntax; a missing one means step 1.
    return int(first), int(last), 1 if stride is None else int(stride)


def ctd_convert_to_normal_variant(ctd_substring: str):
    """Translate a CTD description into protein-coordinate variants.

    Every deletion and mutation found in *ctd_substring* is expanded over the
    repeats it applies to (all repeats when a mutation has no repeat
    specification) and expressed in protein positions using
    ``rpb1_dictionary``.

    Returns:
        A comma-separated string whose elements are either ``first-last``
        (a deleted stretch) or ``<residue><position><replacement>`` (a
        substitution). Numbers in the result are corrected with
        ``apply_shift``, except for a deletion without repeat specification,
        which is returned immediately as the range covering the whole CTD.
    """
    ctd_start = rpb1_dictionary['start']
    residues_per_repeat = len(rpb1_dictionary['positions'])

    mutations = re.findall(ctd_mutation_regex, ctd_substring)
    variants = []
    removed_repeats = []

    for deletion in re.findall(ctd_deletion_regex, ctd_substring):
        # A deletion without repeat specification removes the whole CTD and
        # takes over everything else.
        if '(' not in deletion:
            return '{}-{}'.format(ctd_start + 1, ctd_start + rpb1_dictionary['length'])

        first, last, stride = _ctd_repeat_bounds(re.search(ctd_repetition_regex_with_groups, deletion))
        repeats = range(first, last + 1, stride)
        removed_repeats.extend(repeats)

        # One-based (first, last) residue positions of every deleted repeat.
        spans = []
        for repeat_number in repeats:
            span_start = (repeat_number - 1) * residues_per_repeat + ctd_start + 1
            spans.append((span_start, span_start + residues_per_repeat - 1))

        if stride == 1:
            # Consecutive repeats collapse into a single stretch.
            variants.append(f'{spans[0][0]}-{spans[-1][1]}')
        else:
            variants.append(','.join(f'{lo}-{hi}' for lo, hi in spans))

    for mutation in mutations:
        residue, index_text, replacement = re.search(r'([A-Za-z])(\d+)([A-Za-z])', mutation).groups()
        index_in_repeat = int(index_text)
        repeats_with_residue = rpb1_dictionary['positions'][mutation[:2]]

        spec = re.search(ctd_repetition_regex_with_groups, mutation)
        if spec is None:
            first, last, stride = 1, rpb1_dictionary['nb_repeats'], 1
        else:
            first, last, stride = _ctd_repeat_bounds(spec)

        for repeat_number in range(first, last + 1, stride):
            # Skip repeats that were deleted, and repeats that lack this residue
            # (the positions table uses zero-based repeat indexes).
            if repeat_number in removed_repeats or repeat_number - 1 not in repeats_with_residue:
                continue
            position = index_in_repeat + (repeat_number - 1) * residues_per_repeat + ctd_start
            variants.append(f'{residue}{position}{replacement}')

    # Correct every number for the deviation from the canonical repeat.
    return re.sub(r'(\d+)', lambda m: apply_shift(m, rpb1_dictionary['shift']), ','.join(variants))
|
||
|
||
def ctd_check_sequence(ctd_substring: str) -> str:
    """Report the parts of a CTD description that do not fit the rpb1 CTD.

    Two checks are made:
    - every mutation must start with a residue/index pair that is a key of
      ``rpb1_dictionary['positions']`` (reported as ``CTD-<pair>``);
    - in every repeat specification, the last and the first repeat numbers
      must not exceed ``rpb1_dictionary['nb_repeats']`` (reported as
      ``CTD-r<number>``, last repeat first).

    Returns:
        The errors joined by ``/``, or an empty string if there are none.
    """
    sequence_errors = []

    for mutation in re.findall(ctd_mutation_regex, ctd_substring):
        residue_key = mutation[:2]
        if residue_key not in rpb1_dictionary['positions']:
            sequence_errors.append('CTD-' + residue_key)

    # Check every repeat specification. A description may contain none
    # (e.g. a plain "Y1F"), which must not be treated as an error, or several.
    nb_repeats = rpb1_dictionary['nb_repeats']
    for match in re.finditer(ctd_repetition_regex_with_groups, ctd_substring):
        start, stop, _step = match.groups()
        if int(stop) > nb_repeats:
            sequence_errors.append('CTD-r' + stop)
        if int(start) > nb_repeats:
            sequence_errors.append('CTD-r' + start)

    return '/'.join(sequence_errors)
|
||
|
||
def ctd_format_for_transvar(capture_groups: list[str], gene: dict) -> list[str]:
    """Express a CTD description as a list of ``p.`` variant strings.

    Args:
        capture_groups: regex capture groups; only the first one, the CTD
            description, is used.
        gene: not used by this function.

    Returns:
        One string per variant produced by ``ctd_convert_to_normal_variant``:
        ``p.<first>_<last>del`` for a deleted stretch and
        ``p.<residue><position><replacement>`` for a substitution.
    """
    ctd_substring = capture_groups[0]
    result = []
    for ele in ctd_convert_to_normal_variant(ctd_substring).split(','):
        # The conversion can yield nothing (e.g. the mutated residue is absent
        # from every requested repeat); ''.split(',') is [''], which would
        # otherwise be emitted as the meaningless variant 'p.'.
        if not ele:
            continue
        if '-' in ele:
            result.append('p.{}_{}del'.format(*ele.split('-')))
        else:
            result.append('p.{}'.format(ele))
    return result
|
||
|
||
def ctd_apply_syntax(ctd_substring: str):
    """Rewrite a CTD description in its normalised form.

    Every mutation or deletion recognised in *ctd_substring* is kept, in
    order of appearance, and the pieces are joined with commas after a
    ``CTD-`` prefix; any other text is dropped.
    """
    pattern = f'({ctd_mutation_regex}|{ctd_deletion_regex})'
    # The single capturing group wraps the whole alternation, so group 1 is
    # the full text of each hit.
    pieces = [hit.group(1) for hit in re.finditer(pattern, ctd_substring)]
    return 'CTD-' + ','.join(pieces)
Oops, something went wrong.