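"""Apply automatic fixes to allele descriptions that have sequence errors.

Takes the output of allele_qc.py, aggregates the amino-acid mutations of each
gene and publication, and tries the available fixes (old-coordinates,
multi-shift and histone fixes, see common_autofix_functions.py). It writes
three files to the output directory:

- allele_auto_fix.tsv: alleles for which a fix is proposed (or that only
  needed a syntax or type fix)
- allele_cannot_fix_sequence_errors.tsv: sequence errors for which no fix was found
- allele_cannot_fix_other_errors.tsv: the remaining pattern / invalid errors

All arguments have defaults, e.g.:

    python allele_auto_fix.py --allele_results results/allele_results.tsv --output_dir results/
"""
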
import argparse
import json
import pickle
import re

import pandas

from common_autofix_functions import apply_multi_shift_fix, apply_old_coords_fix, apply_histone_fix, get_preferred_fix, apply_name_fix
from grammar import aminoacid_grammar
from models import SyntaxRule
from refinement_functions import split_multiple_aa, join_multiple_aa


def main(genome_file, coordinate_changes_file, allele_results_file, output_dir):
    with open(genome_file, 'rb') as ins:
        genome = pickle.load(ins)

    with open(coordinate_changes_file) as ins:
        coordinate_changes_dict = json.load(ins)

    syntax_rules = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
    syntax_rules_dict = {f'{r.type}:{r.rule_name}': r for r in syntax_rules}

    data = pandas.read_csv(allele_results_file, sep='\t', na_filter=False)

    # We only want amino-acid alleles. We don't remove the ones without errors yet, because if there are errors in a session
    # for a given allele, other positions may be silent errors (the residue matches by chance), so we aggregate both with and without errors.
    # We exclude CTD alleles from the fixing, because they would be too complicated to include (they may contain commas, for instance).
    # The same applies (for now) to alleles matched by multi_aa rules (see the extra filter below).
    aminoacid_alleles = (data['rules_applied'].str.contains('amino_acid') | data['rules_applied'].str.contains('nonsense')) & ~data['rules_applied'].str.contains('CTD')

    # Extra filter for multi_aa until a cleaner solution is found (see https://github.com/pombase/allele_qc/issues/102)
    for exclude in ['amino_acid_deletion_and_mutation:multiple_aa', 'amino_acid_insertion_and_mutation:multiple_aa']:
        exclude_rows = data['rules_applied'].str.contains(exclude)
        if any(exclude_rows):
            print(f'Excluding {sum(exclude_rows)} rows with {exclude}, see https://github.com/pombase/allele_qc/issues/102')
            print('\n'.join(data.loc[exclude_rows, 'allele_description']))
            aminoacid_alleles = aminoacid_alleles & ~exclude_rows
    aminoacid_alleles = aminoacid_alleles & ~data['rules_applied'].str.contains('multi_aa')

    data_subset = data.loc[aminoacid_alleles, ['systematic_id', 'allele_description', 'allele_name', 'reference', 'change_description_to', 'rules_applied', 'sequence_error']]

    # Explode the references
    data_subset.loc[:, 'reference'] = data_subset['reference'].apply(str.split, args=[','])
    data_subset = data_subset.explode('reference')
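    # After the explode, each row corresponds to a single (allele, reference) pair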
    # Keep correct description only
    data_subset.loc[data_subset['change_description_to'] != '', 'allele_description'] = data_subset.loc[data_subset['change_description_to'] != '', 'change_description_to']
    data_subset.drop(columns='change_description_to', inplace=True)

    # Copy allele_description to explode it
    data_subset['allele_description_exploded'] = data_subset['allele_description'].copy()

    # Explode
    data_subset.loc[:, 'allele_description_exploded'] = data_subset['allele_description_exploded'].apply(str.split, args=[','])
    data_subset.loc[:, 'rules_applied'] = data_subset['rules_applied'].apply(str.split, args=['|'])
    # Hack to be able to explode sequence_error when it's empty
    data_subset.loc[:, 'sequence_error'] = data_subset.apply(lambda x: ['' for i in x['rules_applied']] if x['sequence_error'] == '' else x['sequence_error'].split('|'), axis=1)
    data_subset = data_subset.explode(['allele_description_exploded', 'rules_applied', 'sequence_error'])

    # Explode again the multiple_aa
    multi_aa = data_subset['rules_applied'] == 'amino_acid_mutation:multiple_aa'
    data_subset['allele_description_exploded2'] = data_subset['allele_description_exploded'].copy()
    data_subset.loc[multi_aa, 'allele_description_exploded2'] = data_subset.loc[multi_aa, 'allele_description_exploded2'].apply(split_multiple_aa, args=[syntax_rules_dict['amino_acid_mutation:multiple_aa'].regex])
    data_subset = data_subset.explode('allele_description_exploded2')
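    # Each row now describes a single mutation (one part of the allele description)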
    # Sort the mutations by the first number in them (there might be two numbers in the deletions)
    data_subset.loc[:, 'sorting_col'] = data_subset['allele_description_exploded2'].apply(lambda x: int(re.search(r'\d+', x).group()))
    data_subset.sort_values('sorting_col', inplace=True)
    data_subset.drop(columns='sorting_col', inplace=True)
    data_subset.drop_duplicates(inplace=True)

    # Make a copy for matching the proposed fixes later
    data_for_fixing = data_subset.copy()

    # Drop duplicates that may have arisen when splitting the allele description at either step
    data_subset.drop(columns=['rules_applied', 'allele_description', 'allele_description_exploded'], inplace=True)
    data_subset.drop_duplicates(inplace=True)

    # Aggregate by publication and systematic id
    aggregated_data = data_subset.groupby(['systematic_id', 'reference'], as_index=False).agg({'allele_description_exploded2': ','.join, 'sequence_error': any})

    # Here is where we filter for those that have sequence errors
    aggregated_data = aggregated_data.loc[aggregated_data['sequence_error'], :]

    print('applying fixes...')
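    # Candidate fixes are computed on the comma-separated list of mutations of each (systematic_id, reference) group:
    # an old-coordinates fix, a multi-shift fix and a histone fix (see common_autofix_functions); get_preferred_fix then picks one of them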
    extra_cols = aggregated_data.apply(apply_old_coords_fix, axis=1, result_type='expand', args=[coordinate_changes_dict, 'allele_description_exploded2'])
    aggregated_data.loc[:, 'old_coords_fix'] = extra_cols.iloc[:, 0]
    aggregated_data.loc[:, 'old_coords_revision'] = extra_cols.iloc[:, 1]
    aggregated_data.loc[:, 'old_coords_location'] = extra_cols.iloc[:, 2]

    aggregated_data['multi_shift_fix'] = aggregated_data.apply(apply_multi_shift_fix, axis=1, args=[genome, 'allele_description_exploded2'])
    aggregated_data['histone_fix'] = aggregated_data.apply(apply_histone_fix, axis=1, args=[genome, 'allele_description_exploded2'])

    extra_cols = aggregated_data.apply(get_preferred_fix, axis=1, result_type='expand')
    aggregated_data.loc[:, 'auto_fix_to'] = extra_cols.iloc[:, 0]
    aggregated_data.loc[:, 'auto_fix_comment'] = extra_cols.iloc[:, 1]
    # Old intermediary file for checks
    # aggregated_data.to_csv(f'{output_dir}/allele_auto_fix_info.tsv', sep='\t', index=False)

    # Re-explode the columns that have multiple solutions (now they are aggregated as '|'-separated strings)
    aggregated_data_with_fixes = aggregated_data.loc[aggregated_data['auto_fix_to'] != '', :].copy()

    # Deaggregate multiple solutions
    aggregated_data_with_fixes.loc[:, 'auto_fix_to'] = aggregated_data_with_fixes['auto_fix_to'].apply(str.split, args=['|'])

    # We add an extra column with the index of the solution if there is more than one
    # We convert to strings because the missing values may give unexpected behaviour in .groupby or .agg
    aggregated_data_with_fixes.loc[:, 'solution_index'] = aggregated_data_with_fixes['auto_fix_to'].apply(lambda x: list(range(len(x))) if len(x) > 1 else [''])
    aggregated_data_with_fixes.loc[:, 'solution_index'] = aggregated_data_with_fixes['solution_index'].apply(lambda x: [str(i) for i in x])
    aggregated_data_with_fixes = aggregated_data_with_fixes.explode(['auto_fix_to', 'solution_index'])
    # Deaggregate the parts of each solution
    aggregated_data_with_fixes.loc[:, 'allele_description_exploded2'] = aggregated_data_with_fixes['allele_description_exploded2'].apply(str.split, args=[','])
    aggregated_data_with_fixes.loc[:, 'auto_fix_to'] = aggregated_data_with_fixes['auto_fix_to'].apply(str.split, args=[','])
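    # Sanity check: each solution should contain one replacement per mutation; mismatches are only printed here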
    for i, row in aggregated_data_with_fixes.iterrows():
        if len(row['allele_description_exploded2']) != len(row['auto_fix_to']):
            print(row['allele_description_exploded2'], row['auto_fix_to'])

    data2merge = aggregated_data_with_fixes.explode(['allele_description_exploded2', 'auto_fix_to'])
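    # Match the proposed fixes back to the un-aggregated rows (one row per mutation) of data_for_fixing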
    data_for_fixing = data_for_fixing.merge(data2merge[['systematic_id', 'reference', 'allele_description_exploded2', 'auto_fix_to', 'auto_fix_comment', 'solution_index']], on=['systematic_id', 'reference', 'allele_description_exploded2'])

    # Here we filter again for those that have sequence errors
    data_for_fixing = data_for_fixing.loc[data_for_fixing['sequence_error'] != '', :]

    # Aggregate the multiple_aa: they only differ in the columns allele_description_exploded2 and auto_fix_to, so we aggregate on everything else.
    # We also drop allele_description_exploded, allele_description_exploded2, sequence_error and rules_applied after the aggregation, as they are no longer needed.
    multi_aa = data_for_fixing['rules_applied'] == 'amino_acid_mutation:multiple_aa'
    groupby_columns = list(data_for_fixing.columns)
    groupby_columns.remove('allele_description_exploded2')
    groupby_columns.remove('auto_fix_to')
    new_rows = data_for_fixing[multi_aa].groupby(groupby_columns, as_index=False).agg({'auto_fix_to': lambda x: join_multiple_aa(x.values)})
    data_for_fixing = data_for_fixing[~multi_aa]
    data_for_fixing = pandas.concat([data_for_fixing, new_rows])
    data_for_fixing.drop(columns=['allele_description_exploded', 'allele_description_exploded2', 'sequence_error', 'rules_applied'], inplace=True)
    data_for_fixing.drop_duplicates(inplace=True)
    # Sort again by the first number in the string (the concat step may mess up the order,
    # which is important for the next aggregation)
    data_for_fixing.loc[:, 'sorting_col'] = data_for_fixing['auto_fix_to'].apply(lambda x: int(re.search(r'\d+', x).group()) if x != '?' else 0)
    data_for_fixing.sort_values('sorting_col', inplace=True)
    data_for_fixing.drop(columns='sorting_col', inplace=True)

    # Apply the fix by joining the individual auto_fix_to values back into a comma-separated description
    groupby_columns = list(data_for_fixing.columns)
    groupby_columns.remove('auto_fix_to')
    data_for_fixing = data_for_fixing.groupby(groupby_columns, as_index=False).agg({'auto_fix_to': ','.join})

    # Merge solutions from different PMIDs that are the same
    groupby_columns = list(data_for_fixing.columns)
    groupby_columns.remove('reference')
    data_for_fixing = data_for_fixing.groupby(groupby_columns, as_index=False).agg({'reference': ','.join})
    # Here we check whether multiple solutions were found in different references; if so, give a warning
    groupby_columns.remove('auto_fix_to')
    different_solutions = data_for_fixing[data_for_fixing[groupby_columns].duplicated(keep=False)]
    if not different_solutions.empty:
        print('Different solutions have been found in different papers')
        print(different_solutions)

    # Finally, we merge with the original data
    data = data.merge(data_for_fixing[['systematic_id', 'allele_name', 'auto_fix_comment', 'solution_index', 'auto_fix_to']], on=['systematic_id', 'allele_name'], how='outer')
    data.fillna('', inplace=True)

    # Set the auto_fix_to value in change_description_to, and drop the column
    fixed_sequence_errors = data['auto_fix_to'] != ''
    data.loc[fixed_sequence_errors, 'change_description_to'] = data.loc[fixed_sequence_errors, 'auto_fix_to']
    data.drop(columns='auto_fix_to', inplace=True)
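    # Rows considered auto-fixed: those with a fix for a sequence error, plus those without sequence errors that only need a description (syntax) or type fix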
    columns_auto_fixed = fixed_sequence_errors | \
        ((data['sequence_error'] == '') & ((data['change_type_to'] != '') | (data['change_description_to'] != '')))

    autofixed_data = data[columns_auto_fixed].copy()
    # Fill comments for the rest of error types
    syntax_error = (autofixed_data.auto_fix_comment == '') & (autofixed_data.change_description_to != '')
    type_error = (autofixed_data.auto_fix_comment == '') & (autofixed_data.change_type_to != '')
    autofixed_data.loc[syntax_error, 'auto_fix_comment'] = 'syntax_error'
    autofixed_data.loc[type_error, 'auto_fix_comment'] = 'type_error'
    autofixed_data.loc[syntax_error & type_error, 'auto_fix_comment'] = 'syntax_and_type_error'
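    # Propose a matching fix for the allele name (see apply_name_fix in common_autofix_functions)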
    autofixed_data['change_name_to'] = autofixed_data.apply(apply_name_fix, axis=1)

    autofixed_data = autofixed_data[['systematic_id', 'allele_id', 'allele_name', 'allele_description', 'allele_type', 'change_description_to', 'change_name_to', 'change_type_to', 'auto_fix_comment', 'sequence_error', 'solution_index', 'allele_parts', 'rules_applied', 'reference']].copy()
    autofixed_data.sort_values(['systematic_id', 'allele_name'], inplace=True)
    autofixed_data.to_csv(f'{output_dir}/allele_auto_fix.tsv', sep='\t', index=False)

    errors_cannot_fix = data[~columns_auto_fixed].drop(columns=['auto_fix_comment', 'solution_index'])

    # Separate into error types
    error_cols = {'pattern_error', 'invalid_error', 'sequence_error'}
    id_cols = set(errors_cannot_fix.columns) - error_cols
    errors_cannot_fix = errors_cannot_fix.melt(id_vars=id_cols, value_vars=error_cols, var_name='error_type', value_name='error_info')
    errors_cannot_fix = errors_cannot_fix[errors_cannot_fix['error_info'] != '']
    errors_cannot_fix = errors_cannot_fix[['systematic_id', 'allele_id', 'allele_name', 'allele_description', 'error_type', 'error_info']].copy()
    errors_cannot_fix.sort_values(['systematic_id', 'allele_name'], inplace=True)
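    # Sequence errors that could not be fixed go to their own file; all other error types are written together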
    sequence_errors = errors_cannot_fix[errors_cannot_fix.error_type == 'sequence_error'].drop(columns=['error_type']).rename(columns={'error_info': 'sequence_error'})
    sequence_errors.to_csv(f'{output_dir}/allele_cannot_fix_sequence_errors.tsv', sep='\t', index=False)
    errors_cannot_fix[errors_cannot_fix.error_type != 'sequence_error'].to_csv(f'{output_dir}/allele_cannot_fix_other_errors.tsv', sep='\t', index=False)


if __name__ == '__main__':
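    # Help formatter that shows argument defaults and keeps the docstring formatting unchanged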
    class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
    parser.add_argument('--genome', default='data/genome.pickle', help='input: genome dictionary built from contig files (see load_genome.py).')
    parser.add_argument('--coordinate_changes_dict', default='data/coordinate_changes_dict.json', help='input: coordinate changes dictionary (see build_alignment_dict_from_genome.py -PomBase- or build_alignment_dict_from_peptides.py -SGD-).')
    parser.add_argument('--allele_results', default='results/allele_results.tsv', help='input: file output by allele_qc.py.')
    parser.add_argument('--output_dir', default='results/', help='output directory, will create the files allele_auto_fix.tsv, allele_cannot_fix_sequence_errors.tsv and allele_cannot_fix_other_errors.tsv.')
    args = parser.parse_args()

    main(args.genome, args.coordinate_changes_dict, args.allele_results, args.output_dir)