forked from UofUMetabolomicsCore/QSRR_Automator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
__main__.py
481 lines (444 loc) · 34.9 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
Copyright (c) 2020 Bradley Naylor, James Cox and University of Utah
All rights reserved.
Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:
* Redistributions of source code must retain the
above copyright notice, this list of conditions
and the following disclaimer.
* Redistributions in binary form must reproduce
the above copyright notice, this list of conditions
and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the author nor the names of any contributors
may be used to endorse or promote products derived
from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from copy import deepcopy
import multiprocessing as mp
import os, sys
from collections import Counter
import time
import pandas as pd
from PyQt5 import QtCore, QtWidgets, uic
import generic_functions as gf
import create_machine_learning_model as cmlm
import assign_molecular_descriptors as amd
import collect_default_values as cdv
import settings_menu
# Folder that contains this script. It is used below to find the .ui form and is
# handed to cdv.get_default_values() when the main window is created.
location = os.path.dirname(os.path.abspath(__file__))
# Alternative kept by the original author (presumably for a frozen/packaged build - TODO confirm):
#location = os.path.dirname(os.path.abspath(sys.executable))
# Column headers expected in the user's input files.
SMILE_NAME = "SMILES"
RT_NAME = "RT"
NAME_NAME = "Compound Name"
# Headers of the training template. Defined here so the template creator and the
# descriptor (mordred) query can be adjusted in one place; specific columns are
# still required, but extras can be added easily.
Descriptor_Input_Headers = [NAME_NAME, RT_NAME, SMILE_NAME]
# Headers of the prediction template (no retention time column).
Final_Input_Headers = [NAME_NAME, SMILE_NAME]
# The four lists below name the output files that a run may create; they are only
# used to warn the user about overwriting existing files in the output folder.
# Original author's caveat: the lists were collected from the main function calls;
# the helpers those calls use (descriptor creation, model building, prediction
# errors) have not been fully audited, so more names may need to be added.
# NOTE(review): "cross_validation_and_final_sores.csv" looks like a typo of "scores",
# but it must match the name actually written by the model-saving code - verify
# there before changing it here.
NEW_MODEL_CREATION_POTENTIAL_FILENAMES_EXCEL_FILES = ["Training Processing Steps.xlsx", "Successful_Graph.pdf", "saved_model.joblib", "saved_model_settings.csv", "chosen_features.csv", "cross_validation_and_final_sores.csv", "Descriptors for training.xlsx"]
NEW_MODEL_CREATION_POTENTIAL_FILENAMES_CSV_FILES = ["Successful_Graph.pdf", "saved_model.joblib", "saved_model_settings.csv", "chosen_features.csv", "cross_validation_and_final_sores.csv", "Descriptors for training.csv", "Model building settings.csv", "Model building programmer settings.csv", "Model building numerical coercion.csv", "Model building bad samples removed.csv", "Model building Final Descriptors.csv", "Model building replicates removed.csv"]
RT_PREDICTIONS_POTENTIAL_FILENAMES_EXCEL_FILES =["Final RT Prediction.xlsx", "RT Prediction Steps.xlsx", "Descriptors for prediction.xlsx", "Full Error File.xlsx", "Relevant Error File.xlsx"]
RT_PREDICTIONS_POTENTIAL_FILENAMES_CSV_FILES =["Final RT Prediction.csv", "Descriptors for prediction.csv", "Full Error File.csv", "Relevant Error File.csv", "RT prediction settings.csv", "RT prediction programmer settings.csv", "RT prediction numerical coercion.csv"]
# Qt Designer form class generated at import time; element [0] of loadUiType's
# result is the form class that the main window inherits from.
form_class = uic.loadUiType(os.path.join(location,"Generic_RT_Predictor_Main_Menu.ui"))[0]
class Main_Machine_Learning_Window(QtWidgets.QMainWindow, form_class):
    """Main menu window of the retention time predictor.

    The widgets come from the Qt Designer form loaded into ``form_class``; the
    constructor connects them to the handlers for model training/loading, RT
    prediction, template creation, output folder selection, settings and exit.
    """

    def __init__(self, parent = None):
        """Build the window, load both settings sets and choose a usable output folder.

        ``cdv.get_default_values`` is called once with ``True`` (result stored as
        ``self.settings``) and once with ``False`` (``self.programmer_settings``).
        A string result is treated as an error message: it is shown to the user,
        the window is closed and the process exits with status 0.

        Args:
            parent: optional Qt parent, forwarded to ``QMainWindow.__init__``.
        """
        QtWidgets.QMainWindow.__init__(self, parent)
        self.setupUi(self)

        loaded_settings = []
        for settings_flag in (True, False):
            values = cdv.get_default_values(location, settings_flag)
            if isinstance(values, str):
                QtWidgets.QMessageBox.information(self, "Error", values)
                # Original author's note: both calls are kept on purpose. With only
                # one of them, running from a command line stopped allowing reruns
                # after the second error; doing both has no known downside.
                self.close()
                sys.exit(0)
            loaded_settings.append(values)
        settings, programmer_settings = loaded_settings

        self.settings = settings
        # Untouched copy so the settings menu can restore the defaults.
        self.default_settings = deepcopy(settings)
        self.programmer_settings = programmer_settings
        self.settings_series_to_write = gf.make_series_for_saving(self.settings)
        self.programmer_settings_series_to_write = gf.make_series_for_saving(self.programmer_settings)

        # No model is available until one is trained or loaded successfully.
        self.current_model = False
        self.current_model_descriptors = False
        self.current_model_good = False
        self.loaded_model_feature_selection = False

        self.program_location = location
        try:
            # Input and output start in the same folder, so this checks both.
            self.input_location = self.settings["Output Folder"]
            gf.make_folder(self.input_location)
            self.initial_output_loc = True
        except Exception:
            # `except Exception` instead of a bare `except` so that SystemExit and
            # KeyboardInterrupt are not swallowed here.
            # Expected cause (per the original author): a permission error while
            # creating the output folder. The folder cannot be trusted to exist,
            # so fall back to the program folder, which must exist. Templates, the
            # output folder chooser, training and prediction all do their own
            # permission checks, so merely pointing at this location is safe.
            self.initial_output_loc = False
            self.input_location = self.program_location
            self.settings["Output Folder"] = self.program_location

        self.model_builder.clicked.connect(self.training_or_loading)
        self.RT_prediction_button.clicked.connect(self.rt_predictor)
        self.training_template_creator.clicked.connect(lambda: self.template_creator(True))
        self.prediction_template_creator.clicked.connect(lambda: self.template_creator(False))
        self.actionChoose_Output_Folder.triggered.connect(self.choose_output_folder)
        self.actionSettings.triggered.connect(self.settings_option)
        self.actionExit.triggered.connect(self.ExitButton)
def check_for_overwriting(self, folder, files_to_check):
    """Return the entries of ``files_to_check`` that already exist as files in ``folder``.

    Args:
        folder: directory that is about to receive output files.
        files_to_check: iterable of bare file names to look for.

    Returns:
        A list of the names that are present as regular files, in the order
        given. Directories with a matching name are not reported.
    """
    return [
        candidate
        for candidate in files_to_check
        if os.path.isfile(os.path.join(folder, candidate))
    ]
def template_creator(self, need_rt):
    """Ask for a save location and write an empty, header-only template file.

    Args:
        need_rt: True writes the training template (``Descriptor_Input_Headers``,
            which includes the retention time column); False writes the
            prediction template (``Final_Input_Headers``).

    The dialog is shown again after a failed write. Cancelling the dialog ends
    the method without writing anything. On success ``self.input_location`` is
    moved to the folder of the saved file.
    """
    excel_filter = "XlSX (*.xlsx)"
    csv_filter = "CSV (*.csv)"
    # The first filter in the list is the one preselected in the dialog.
    if self.settings["Excel Writing"]:
        priority = "XlSX (*.xlsx);; CSV (*.csv)"
    else:
        priority = "CSV (*.csv);; XlSX (*.xlsx)"

    headers = Descriptor_Input_Headers if need_rt else Final_Input_Headers
    template_panda = pd.DataFrame(columns = headers)

    while True:
        QtWidgets.QMessageBox.information(self, "Info", "Please select the filename and location to save your template.")
        # The dialog itself asks whether an existing file should be overwritten.
        save_file, file_type = QtWidgets.QFileDialog.getSaveFileName(self, "Provide Save File", self.input_location, priority)
        if save_file == "":
            return

        # The filter list is separated by ";; " (with a space), so the second
        # entry can be reported with a leading space. Without the strip, neither
        # comparison matches and nothing is written although success is reported.
        chosen_filter = file_type.strip()
        if chosen_filter == excel_filter:
            write_excel = True
        elif chosen_filter == csv_filter:
            write_excel = False
        else:
            # Unrecognised filter text: decide from the file name instead of
            # silently skipping the write.
            write_excel = save_file.lower().endswith(".xlsx")

        try:
            if write_excel:
                template_panda.to_excel(save_file, index = False)
            else:
                template_panda.to_csv(save_file, index = False)
        except Exception:
            # Windows usually blocks unwritable locations in the dialog already;
            # this is a safety net (original author's note) for when it does not.
            QtWidgets.QMessageBox.information(self, "Error", "File {} cannot be written likely due to permission issue in folder {}. Please try again in a different location".format(save_file, os.path.dirname(save_file)))
            continue

        self.input_location = os.path.dirname(save_file)
        QtWidgets.QMessageBox.information(self, "Info", "Your template was successfully created")
        return
def descriptor_assignment(self, need_rt, input_file, excel_writer_object = None):
# Build the descriptor table for one input file and write the intermediate/final files.
#
# Parameters:
#   need_rt             - True when preparing training data (RT column required, descriptors are
#                         filtered with amd.filter_descriptors); False when preparing prediction
#                         data (columns are restricted to self.current_model_descriptors).
#   input_file          - path of the filled-in template chosen by the user.
#   excel_writer_object - open pandas ExcelWriter that receives the step sheets; only used
#                         when self.settings["Excel Writing"] is truthy.
#
# Returns a 2-tuple whose first element tells the caller what happened:
#   (DataFrame, final_filename) - success; final_filename is the "Descriptors for ..." file.
#   (str, "") or (str, None)    - an error message to show the user.
#   ("", "")                    - the user answered No to the multiple-retention-time warning.
#   (None, None)                - the user cancelled the "existing descriptors" dialog.
#
# NOTE(review): the original indentation of this file was lost, so the nesting of the
# statements below has to be reconstructed from context before this method can run.
to_mordred, input_dataframe = amd.initial_preparation_machine_learning_input(input_file, self.settings, self.programmer_settings, need_rt, self.current_model_descriptors, self.loaded_model_feature_selection)
if type(to_mordred) == str: return to_mordred, "" #will be an error message if a string
#quick test to see if user wants to proceed if they have one compound with multiple RTs
if need_rt:
instances_of_smiles = Counter(input_dataframe[SMILE_NAME])
potential_issues = [k for k in instances_of_smiles.keys() if instances_of_smiles[k] >1]
#smiles are the same so this is not going to be complex. if there are more rts than names, then there is a duplicate name without a duplicate rt
#won't have the specific target but should work for a basic "Should we continue" warning
for p in potential_issues:
small_df = input_dataframe[input_dataframe[SMILE_NAME] == p]
temp_names = list(small_df[NAME_NAME])
temp_rts=list(small_df[RT_NAME])
if len(set(temp_names)) < len(set(temp_rts)):
reply = QtWidgets.QMessageBox.question(self, "Warning", "There are multiple instances of a compound with different Retention Times. This can cause poor machine learning fits or have no real effect. Would you like to continue?", QtWidgets.QMessageBox.Yes, QtWidgets.QMessageBox.No)
if reply == QtWidgets.QMessageBox.No:
return "", ""
else:
# the user is asked at most once: stop checking after the first Yes
break
#in this case we have extra columns we are going to assume are descriptors. confirm with the user. if so great, don't do mordred (though still do error checks. otherwise purge these columns
if not to_mordred:
custom_msg_box = QtWidgets.QMessageBox()
custom_msg_box.setText("You already have descriptors in this input file. Do you wish to use them or recalculate?")
cancel_button = custom_msg_box.addButton("Cancel", QtWidgets.QMessageBox.RejectRole) #needed because otherwise the red x will actually do one of the buttons, which the user likely doesn't want.
keep_button = custom_msg_box.addButton("Keep Existing", QtWidgets.QMessageBox.YesRole)
recalculate_button = custom_msg_box.addButton("Recalculate", QtWidgets.QMessageBox.NoRole)
custom_msg_box.exec_()
if custom_msg_box.clickedButton() == cancel_button:
return None, None
elif custom_msg_box.clickedButton() == keep_button:
to_mordred = False
elif custom_msg_box.clickedButton() == recalculate_button:
to_mordred = True
#if we're recalculating purge the other columns
if need_rt:
input_dataframe = input_dataframe[Descriptor_Input_Headers]
else:
input_dataframe = input_dataframe[Final_Input_Headers]
if to_mordred:
numerical_df = amd.actually_add_descriptors(input_dataframe)
# a string result is an error message from the descriptor calculation
if type(numerical_df) == str: return numerical_df, ""
#if we are assuming things are correct we need to remove RT and names and make the smiles the index and
else:
numerical_df = input_dataframe.set_index(SMILE_NAME)
numerical_df = numerical_df.drop(NAME_NAME, axis = 1)
if need_rt: numerical_df = numerical_df.drop(RT_NAME, axis = 1)
#need to drop duplicates of smiles with same descriptors (can cause issues with filters)
numerical_df = numerical_df.drop_duplicates()
#need to ensure that all duplicate smiles are removed (if it didn't drop, smiles are same but descriptors are not. RT is dropped at this point)
instances_of_smiles = Counter(numerical_df.index)
problematic_smiles = [k for k in instances_of_smiles.keys() if instances_of_smiles[k] >1]
if problematic_smiles:
# NOTE(review): this is the only error return whose second element is None instead of ""
return "The following SMILES have duplicate SMILES with different descriptors (retention time not included): {}. Please correct to continue".format(problematic_smiles), None
# from here on input_dataframe only carries the template columns; the descriptors live in numerical_df
if need_rt: input_dataframe = input_dataframe[Descriptor_Input_Headers]
else: input_dataframe = input_dataframe[Final_Input_Headers]
# non-numeric descriptor values become NaN ('coerce' is passed on to pd.to_numeric)
numerical_df = numerical_df.apply(pd.to_numeric, args = ['coerce'])
#now we need to actually write out the data. need to make the excel object (write out the end separately as well so they can use that as a read in)
# NOTE(review): the sheet name is passed positionally to to_excel in this method; newer pandas
# versions warn about that and expect sheet_name=... - confirm against the pinned pandas version
if self.settings["Excel Writing"]:
input_dataframe.to_excel(excel_writer_object, "Initial Input", index = False)
self.settings_series_to_write.to_excel(excel_writer_object, "User Settings")
self.programmer_settings_series_to_write.to_excel(excel_writer_object, "Programmer Settings")
numerical_df.to_excel(excel_writer_object, "Added Descriptors")
else:
things_to_write = [self.settings_series_to_write, self.programmer_settings_series_to_write, numerical_df]
if need_rt: file_names = ["Model building settings.csv", "Model building programmer settings.csv", "Model building numerical coercion.csv"]
else: file_names = ["RT prediction settings.csv", "RT prediction programmer settings.csv", "RT prediction numerical coercion.csv"]
for i in range(len(things_to_write)):
write_attempt = gf.write_single_file(os.path.join(self.settings["Output Folder"], file_names[i]), things_to_write[i], True, False)
# anything other than "Success" is an error message for the user
if write_attempt != "Success": return write_attempt, ""
# ---- training path: filter the descriptors, then write "Descriptors for training" ----
if need_rt:
if self.settings["Excel Writing"]:
trimmed_dataframe = amd.filter_descriptors(numerical_df, self.settings, self.programmer_settings, self.program_location, excel_writer_object)
else:
trimmed_dataframe = amd.filter_descriptors(numerical_df, self.settings, self.programmer_settings, self.program_location)
if type(trimmed_dataframe) == str:
return trimmed_dataframe, ""
# re-attach name/RT/SMILES columns; the descriptor table is indexed by SMILES
trimmed_dataframe = pd.merge(input_dataframe, trimmed_dataframe, right_index = True, left_on = SMILE_NAME, how = 'inner')
if self.settings["Excel Writing"]:
final_filename = os.path.join(self.settings["Output Folder"], "Descriptors for training.xlsx")
trimmed_dataframe.to_excel(excel_writer_object, "Final Descriptor Output", index = False)
write_error = gf.write_single_file(final_filename, trimmed_dataframe, False, True)
else:
final_filename =os.path.join(self.settings["Output Folder"], "Descriptors for training.csv")
write_error = gf.write_single_file(final_filename, trimmed_dataframe, False, False)
if write_error != "Success": return write_error, ""
# ---- prediction path: keep only the descriptors the current model was trained with ----
else:
try:
descriptor_trimmed_dataframe = numerical_df[self.current_model_descriptors]
except KeyError:
return "Required descriptors for your model not in calculated descriptors", ""
trimmed_dataframe = pd.merge(input_dataframe, descriptor_trimmed_dataframe, right_index = True, left_on = SMILE_NAME, how = 'inner')
if self.settings["Excel Writing"]:
final_filename = os.path.join(self.settings["Output Folder"], "Descriptors for prediction.xlsx")
trimmed_dataframe.to_excel(excel_writer_object, "Final Descriptor Output", index = False)
write_error = gf.write_single_file(final_filename, trimmed_dataframe, False, True)
else:
final_filename = os.path.join(self.settings["Output Folder"], "Descriptors for prediction.csv")
write_error = gf.write_single_file(final_filename, trimmed_dataframe, False, False)
if write_error != "Success": return write_error, ""
if descriptor_trimmed_dataframe.isnull().values.any():
#write out all the descriptors in all cases in case the user wishes to know
true_false_df = pd.merge(input_dataframe, descriptor_trimmed_dataframe.isnull(), right_index = True, left_on = SMILE_NAME, how = 'inner')
# NOTE(review): the result of write_single_file is ignored for the error files
if self.settings["Excel Writing"]:
problem_file = os.path.join(self.settings["Output Folder"], "Full Error File.xlsx")
gf.write_single_file(problem_file,true_false_df, False, True)
else:
problem_file = os.path.join(self.settings["Output Folder"], "Full Error File.csv")
gf.write_single_file(problem_file,true_false_df, False, False)
#$we're going to test if the missing compounds are actually necessary. if not we'll fill them with a dummy and see what happens
#$ to undo this get rid of the if and all under it and put the return in the else back a tab
# first we need the actual descriptors we need
if self.loaded_model_feature_selection == "Automatic" or self.loaded_model_feature_selection == "Manual": #with None everything that remains should be good since there is no feature_selection_step
#need the true/false mask in order to find the chosen features
try:
chosen_value_mask = list(self.current_model.named_steps["feature_selection_step"].support_)
except KeyError:
# no top-level feature selection step: look inside the grid search's best estimator instead
chosen_value_mask = list(self.current_model.named_steps["gridsearchcv_step"].best_estimator_.named_steps["feature_selection_step"].get_support())
#turn the mask into a list of columns (otherwise it will be interpreted as an index in the next step instead of columns)
chosen_compounds = descriptor_trimmed_dataframe.columns[chosen_value_mask]
#data frame of only the needed compounds for the model
only_chosen_values = descriptor_trimmed_dataframe[chosen_compounds]
if only_chosen_values.isnull().values.any():#if the needed compounds have nans we can't predict (we could drop, but the user is better able to determine what needs to be done here)
#if there is a problem we need to help the user find the problem (otherwise we may end up with many failed descriptors that aren't needed by the model)
true_false_df = pd.merge(input_dataframe, only_chosen_values.isnull(), right_index = True, left_on = SMILE_NAME, how = 'inner')
if self.settings["Excel Writing"]:
problem_file = os.path.join(self.settings["Output Folder"], "Relevant Error File.xlsx")
gf.write_single_file(problem_file,true_false_df, False, True)
else:
# NOTE(review): unlike the Excel branch above, no gf.write_single_file call follows here, so
# "Relevant Error File.csv" is named in the message below but is never written - looks like a bug
problem_file = os.path.join(self.settings["Output Folder"], "Relevant Error File.csv")
return "some compounds were missing values for needed descriptors. Please look at {} for the problematic features (all should be False. True indicates a problem). Please remove the problematic compounds or retrain the model and try again.".format(problem_file), ""
else: #we will give a warning and then prevent further erroring out
# here problem_file still points at the "Full Error File" written above
# NOTE(review): `reply` is assigned but never read
reply = QtWidgets.QMessageBox.information(self, "Warning", "There are compounds which are lacking descriptors found in your training data. These features are not used in your model so prediction will proceed, but this may indicate problematic differences between compounds. Can observe which descriptors have an issue in {} (all should be False. True indicates a problem)".format(problem_file))
trimmed_dataframe = trimmed_dataframe.fillna(0) #allows us to proceed without issues
else:
return "some compounds were missing values for needed descriptors. Please look at {} for the problematic features (all should be False. True indicates a problem). Please remove the problematic compounds or retrain the model and try again.".format(problem_file), ""
return trimmed_dataframe, final_filename
def training_or_loading(self):
    """Ask whether to train a new model or load a saved one, then dispatch.

    A custom message box is used because the stock question box cannot carry
    the two button labels needed here (approach based on
    https://stackoverflow.com/questions/49155926/how-to-customise-a-pyqt-message-box,
    answer 1, accessed 12/14/18). The box is rebuilt on every call, which is
    cheap; it could be built once in ``__init__`` if that ever matters.
    """
    chooser = QtWidgets.QMessageBox()
    chooser.setText("Do you want to train a New Method or Load a previous Method?")
    new_button = chooser.addButton("New Model", QtWidgets.QMessageBox.YesRole)
    load_button = chooser.addButton("Load Model", QtWidgets.QMessageBox.NoRole)
    chooser.exec_()

    picked = chooser.clickedButton()
    if picked == new_button:
        # Training writes files, so refuse early when the output folder is unusable.
        if self.initial_output_loc:
            self.train_the_model()
        else:
            QtWidgets.QMessageBox.information(self, "Error", "QSRR Automator lacks permission to write in current output folder. Please use the \"Choose Output Folder\" folder in the pull down menu in the top left corner to select a new folder and try again")
        return
    if picked == load_button:
        self.load_a_model()
    # No final else on purpose: closing the box with the window's x must not
    # start either action.
def train_the_model(self):
    """Train a new model from a filled-in training template and save it.

    Steps: warn about files that may be overwritten, let the user pick the
    input file, build the descriptor table (``descriptor_assignment``), train
    (``cmlm.model_training_manager``), save (``cmlm.save_data``) and, in Excel
    mode, finish the "Training Processing Steps.xlsx" workbook.

    The window's model state (``current_model``, ``current_model_descriptors``,
    ``loaded_model_feature_selection``, ``current_model_good``) is only updated
    once every step has succeeded, so a failed save can no longer leave a new
    pipeline combined with the previous model's feature-selection flag. A
    descriptor error still clears the model state.
    """
    # Warn before anything is written.
    if self.settings["Excel Writing"]:
        possible_outputs = NEW_MODEL_CREATION_POTENTIAL_FILENAMES_EXCEL_FILES
    else:
        possible_outputs = NEW_MODEL_CREATION_POTENTIAL_FILENAMES_CSV_FILES
    overwrite_warning_list = self.check_for_overwriting(self.settings["Output Folder"], possible_outputs)
    if overwrite_warning_list:
        reply = QtWidgets.QMessageBox.question(self, "Overwrite Warning", "The following files are located in folder {} may be overwritten if you proceed in this folder:\n{}\nDo you wish to continue?".format(self.settings["Output Folder"], "\n".join(overwrite_warning_list)), QtWidgets.QMessageBox.Yes, QtWidgets.QMessageBox.No)
        if reply == QtWidgets.QMessageBox.No:
            return

    # The reported time starts here, so it includes the file dialog.
    start = time.time()
    input_file, filter_ = QtWidgets.QFileDialog.getOpenFileName(self, "Select filled in \"Training Template\" file", self.settings["Output Folder"], "(*.csv *.xlsx)")
    if not input_file:
        return
    self.input_location = os.path.dirname(input_file)

    # excel_writer stays None in CSV mode; None is also descriptor_assignment's
    # default, so one call covers both modes.
    excel_writer = None
    if self.settings["Excel Writing"]:
        excel_writer = pd.ExcelWriter(os.path.join(self.settings["Output Folder"], "Training Processing Steps.xlsx"))
    training_data, final_filename = self.descriptor_assignment(True, input_file, excel_writer)

    if training_data is None:
        # The user cancelled inside descriptor_assignment. `is None` is an identity
        # check, so it is safe even when training_data is a DataFrame.
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer, True)
        return
    if isinstance(training_data, str):
        # A non-empty string is an error message; "" means the user chose to stop.
        if training_data != "":
            QtWidgets.QMessageBox.information(self, "Error", training_data)
        self.current_model_good = False
        self.loaded_model_feature_selection = False
        self.current_model = False
        self.current_model_descriptors = False
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer, True)
        return

    user_save, model_object, series_to_save, score_series = cmlm.model_training_manager(training_data, final_filename, self.settings, self.programmer_settings)
    if not user_save:
        # Only reached through an explicit user choice made after a warning.
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer)
        return

    if excel_writer is not None:
        failures = cmlm.save_data(model_object, self.settings, series_to_save, score_series, excel_writer)
    else:
        failures = cmlm.save_data(model_object, self.settings, series_to_save, score_series)
    if failures != "Success":
        QtWidgets.QMessageBox.information(self, "Error", failures)
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer, True)
        return

    if excel_writer is not None:
        stuff = gf.deal_with_writer_object(excel_writer)
        if stuff == "Permission Error":
            QtWidgets.QMessageBox.information(self, "Error", "{} is open in another program. Please close and try again.".format(os.path.join(self.settings["Output Folder"], "Training Processing Steps.xlsx")))  #$ may need to add a proceed option here
            return

    # Everything succeeded: publish the new model in one go.
    self.current_model = model_object.main_pipeline
    self.current_model_descriptors = model_object.all_required_features
    self.loaded_model_feature_selection = self.settings["Model To Use"]
    self.current_model_good = True
    QtWidgets.QMessageBox.information(self, "Success", "Model has been trained and saved. Time taken: {} min".format((time.time()-start)/60))  #$ may need to add a proceed option here
def load_a_model(self):
    """Let the user pick a folder holding a saved model and load it.

    ``cmlm.load_data`` returns a string as its first value on failure; the
    message is shown and the window's model state is cleared. Otherwise the
    model, its descriptors and the feature selection method are stored and the
    model is marked as usable. Cancelling the folder dialog changes nothing.
    """
    output_folder = self.settings["Output Folder"]
    gf.make_folder(output_folder)
    QtWidgets.QMessageBox.information(self, "Info", "Please choose the folder with your model in it. This will be the export folder of the run you trained the model in")
    folder = QtWidgets.QFileDialog.getExistingDirectory(self, "Select a Folder", output_folder, QtWidgets.QFileDialog.ShowDirsOnly)
    if not folder:
        return

    loaded, descriptors, feature_selection_method_used = cmlm.load_data(folder, self.programmer_settings)
    if type(loaded) == str:
        # Loading failed: `loaded` is the error text.
        QtWidgets.QMessageBox.information(self, "Error", loaded)
        self.current_model = False
        self.current_model_descriptors = False
        self.loaded_model_feature_selection = False
        self.current_model_good = False
        return

    self.current_model = loaded.ml_object
    self.current_model_descriptors = descriptors
    self.current_model_good = True
    self.loaded_model_feature_selection = feature_selection_method_used
    QtWidgets.QMessageBox.information(self, "Success", "Your model was loaded successfully")
def rt_predictor(self):
    """Predict retention times for a user-chosen file with the current model.

    Requires a successfully trained or loaded model and a writable output
    folder. Warns about files that may be overwritten, builds the descriptor
    table via ``descriptor_assignment``, predicts with
    ``cmlm.analyze_user_data`` and writes "Final RT Prediction" (plus, in Excel
    mode, the "RT prediction" sheet of "RT Prediction Steps.xlsx").
    """
    if not self.current_model_good:
        QtWidgets.QMessageBox.information(self, "Error", "You have not built or loaded a model, or your latest attempt to do so was unsuccessful. Please successfully build or load a model before proceeding.")
        return
    # Only trips when the initial output folder could not be created and the user
    # has not chosen another one since.
    if not self.initial_output_loc:
        QtWidgets.QMessageBox.information(self, "Error", "QSRR Automator lacks permission to write in current output folder. Please use the \"Choose Output Folder\" folder in the pull down menu in the top left corner to select a new folder and try again")
        return

    # Warn before anything is written.
    if self.settings["Excel Writing"]:
        possible_outputs = RT_PREDICTIONS_POTENTIAL_FILENAMES_EXCEL_FILES
    else:
        possible_outputs = RT_PREDICTIONS_POTENTIAL_FILENAMES_CSV_FILES
    overwrite_warning_list = self.check_for_overwriting(self.settings["Output Folder"], possible_outputs)
    if overwrite_warning_list:
        reply = QtWidgets.QMessageBox.question(self, "Overwrite Warning", "The following files are located in folder {} may be overwritten if you proceed in this folder:\n{}\nDo you wish to continue?".format(self.settings["Output Folder"], "\n".join(overwrite_warning_list)), QtWidgets.QMessageBox.Yes, QtWidgets.QMessageBox.No)
        if reply == QtWidgets.QMessageBox.No:
            return

    gf.make_folder(self.settings["Output Folder"])
    QtWidgets.QMessageBox.information(self, "Info", "Please choose file containing compounds for RT prediction. this should be based on the output of the \"Prediction Template\" button")
    input_file, filter_ = QtWidgets.QFileDialog.getOpenFileName(self, "Select RT prediction input file", self.settings["Output Folder"], "(*.csv *.xlsx)")
    if not input_file:
        return
    self.input_location = os.path.dirname(input_file)

    # excel_writer stays None in CSV mode; None is also descriptor_assignment's
    # default, so one call covers both modes.
    excel_writer = None
    if self.settings["Excel Writing"]:
        excel_writer = pd.ExcelWriter(os.path.join(self.settings["Output Folder"], "RT Prediction Steps.xlsx"))
    test_data, _final_filename = self.descriptor_assignment(False, input_file, excel_writer)

    if test_data is None:
        # The user cancelled. `is None` is an identity check, so it is safe even
        # when test_data is a DataFrame (an `==` comparison would not be).
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer, True)
        return
    if isinstance(test_data, str):
        # A non-empty string is an error message; "" means the user chose to stop.
        if test_data != "":
            QtWidgets.QMessageBox.information(self, "Error", test_data)
        # NOTE(review): train_the_model passes True as second argument on its error
        # path; here the writer is finished without it. Kept as in the original -
        # confirm which behaviour is intended.
        if excel_writer is not None:
            gf.deal_with_writer_object(excel_writer)
        return

    predicted_rt = cmlm.analyze_user_data(test_data, self.current_model, self.settings)
    # .copy() gives an independent frame, so adding the RT column below does not
    # trigger pandas' SettingWithCopy warning.
    predicted_data = test_data[Final_Input_Headers].copy()
    predicted_data[RT_NAME] = predicted_rt

    if excel_writer is not None:
        # sheet_name is passed by keyword: positional use is deprecated in recent pandas.
        predicted_data.to_excel(excel_writer, sheet_name = "RT prediction", index = False)
        excel_error = gf.deal_with_writer_object(excel_writer)
        if excel_error is not None:
            QtWidgets.QMessageBox.information(self, "Error", "{} is open in another program. Please close and try again or change settings.".format(os.path.join(self.settings["Output Folder"], "RT Prediction Steps.xlsx")))
            return
        error = gf.write_single_file(os.path.join(self.settings["Output Folder"], "Final RT Prediction.xlsx"), predicted_data, False, True)
    else:
        error = gf.write_single_file(os.path.join(self.settings["Output Folder"], "Final RT Prediction.csv"), predicted_data, False, False)

    if error == "Success":
        QtWidgets.QMessageBox.information(self, "Success", "Your retention times were predicted.")
    else:
        QtWidgets.QMessageBox.information(self, "Error", error)
#functional
def choose_output_folder(self):
    """Let the user pick a new output folder and verify that it is writable.

    The dialog is shown again until a writable folder is chosen or the user
    cancels. On success the output folder setting and ``self.input_location``
    are set to the chosen folder and ``self.initial_output_loc`` becomes True.
    """
    # Local import: only this method needs it.
    import tempfile

    gf.make_folder(self.settings["Output Folder"])
    while True:
        folder = QtWidgets.QFileDialog.getExistingDirectory(self, "Select a Folder", self.settings["Output Folder"], QtWidgets.QFileDialog.ShowDirsOnly)
        if not folder:
            return
        try:
            # Writability probe. A uniquely named temporary file is used so an
            # existing user file in the folder can never be overwritten or
            # deleted; the file is removed when the with-block exits.
            with tempfile.NamedTemporaryFile(dir = folder):
                pass
        except OSError:
            QtWidgets.QMessageBox.information(self, "Error", "Folder {} cannot be written to likely due to permission issue. Please move output folder to a location that can be written to proceed".format(folder))
            continue
        self.settings["Output Folder"] = folder
        self.input_location = folder
        self.initial_output_loc = True
        return
def settings_option(self):
    """Open the settings window for the current and the default settings."""
    menu = settings_menu.Settings_Menu(self, self.settings, self.default_settings)
    # Stored on the instance so the window object stays referenced after this
    # method returns.
    self.set_menu = menu
    menu.show()
def ExitButton(self):
    """Ask for confirmation and close the window only if the user answers Yes."""
    # Prompt-on-exit pattern from
    # http://stackoverflow.com/questions/1414781/prompt-on-exit-in-pyqt-application
    box = QtWidgets.QMessageBox
    answer = box.question(self, "Quit Option", "Are you sure you wish to exit?", box.Yes, box.No)
    if answer != box.Yes:
        return
    self.close()
if __name__ == "__main__":
    # Must be the first call: mordred and random forest use multiprocessing
    # (original author's note), which needs this in a frozen executable.
    mp.freeze_support()
    app = QtWidgets.QApplication(sys.argv)
    interaction_gui = Main_Machine_Learning_Window(None)
    interaction_gui.show()
    # Hand the event loop's return code to the interpreter so the process exit
    # status reflects how the application ended.
    sys.exit(app.exec_())