forked from UofUMetabolomicsCore/QSRR_Automator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_check_previous_model_module.py
137 lines (127 loc) · 7.86 KB
/
load_check_previous_model_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Copyright (c) 2020 Bradley Naylor, James Cox and University of Utah
All rights reserved.
Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:
* Redistributions of source code must retain the
above copyright notice, this list of conditions
and the following disclaimer.
* Redistributions in binary form must reproduce
the above copyright notice, this list of conditions
and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the author nor the names of any contributors
may be used to endorse or promote products derived
from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
#this is the class that loads a previously trained model and checks it against its training data before it is used to predict the retention times of unknowns
import pandas as pd
#import sklearn.pipeline
import joblib
class load_check_previous_model(object):
    """Load a saved model and check that it still agrees with its training data.

    Construction never raises for the failures handled here; instead the
    first problem found is written to ``self.error`` (an empty string means
    every check passed). Callers are expected to inspect ``self.error``
    after creating the object.

    Attributes set along the way (only those reached before an error exist):
        error: human readable description of the first failure, or "".
        allowed_diffence_in_score: tolerated relative change of the score.
        ml_object: the object returned by ``joblib.load``.
        training_file: path of the training table, read from the settings.
        training_score: score stored in the settings file.
        feature_selection_method: value stored in the settings file.
        needed_features: feature names read from the chosen-features file
            (only for "Manual" / "Automatic" feature selection).
        all_descriptors: descriptor column names of the training table
            (only when every check passed).
    """

    def __init__(self, machine_learning_object_file, settings_file_loc, allowed_difference, chosen_features_file):
        """Run the load and the checks in order, stopping at the first error.

        Args:
            machine_learning_object_file: path of the joblib file to load.
            settings_file_loc: path of the settings .csv written at training.
            allowed_difference: tolerated relative difference between the
                stored training score and the freshly computed score.
            chosen_features_file: path of the .csv listing the chosen
                features, or None when no such file is available.
        """
        # Main error reporter: every step below may set it, and each later
        # step is skipped once it is no longer empty.
        self.error = ""
        self.allowed_diffence_in_score = allowed_difference
        try:
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code. Only load model files from a trusted source.
            self.ml_object = joblib.load(machine_learning_object_file)
        except Exception:
            self.error = "Unable to open {}. may have been corrupted or deleted. Please correct".format(machine_learning_object_file)
        if self.error == "":
            self.set_training_values(settings_file_loc)
        if self.error == "":  # no need to check the data if the settings could not be read
            self.check_training_file(chosen_features_file)

    def set_training_values(self, settings_file_loc):
        """Read training file path, score and feature selection method.

        The settings file is a .csv with a "Setting" column (used as index)
        and a "Value" column. Any failure to read or convert a value is
        reported through ``self.error``.
        """
        try:
            settings = pd.read_csv(settings_file_loc, index_col="Setting")
            values = settings["Value"]
            self.training_file = values["Training File"]
            self.training_score = float(values["Final Model Score"])
            self.feature_selection_method = str(values["Feature Selection Method"])
        except Exception:
            self.error = "{} does not exist or has been altered. Please correct.".format(settings_file_loc)

    def check_training_file(self, chosen_features_file):
        """Re-score the model on its training data and compare with the stored score.

        Steps, each of which sets ``self.error`` and returns on failure:
        open the training table, split off "RT" and drop "SMILES", verify
        the chosen features (when feature selection was used), score the
        model and compare the result with ``self.training_score``.

        Args:
            chosen_features_file: path of the .csv with a "Chosen Features"
                column, or None.
        """
        # --- open the training table -------------------------------------
        try:
            if self.training_file.endswith(".csv"):
                training_data = pd.read_csv(self.training_file)
            elif self.training_file.endswith(".xlsx"):
                training_data = pd.read_excel(self.training_file)
            else:
                self.error = "{} is not a .csv or .xlsx. settings file has been altered. Please correct".format(self.training_file)
                return
        except Exception:
            self.error = "{} could not be opened or does not exist. Please correct".format(self.training_file)
            return

        # --- separate target and descriptors ------------------------------
        try:
            training_data = training_data.set_index("Compound Name")
            training_rt = training_data["RT"]
            descriptors = list(training_data.columns)
            descriptors.remove("RT")
            descriptors.remove("SMILES")
            training_data = training_data[descriptors]
        except Exception:
            self.error = "header of {} has been changed. This is case sensitive. Please correct.".format(self.training_file)
            return
        if not descriptors:
            self.error = "No descriptors present in {}. Please correct".format(self.training_file)
            return

        # --- verify the chosen features -----------------------------------
        # Only needed when features were chosen; otherwise the full
        # descriptor set is assumed.
        if self.feature_selection_method in ("Manual", "Automatic"):
            if chosen_features_file is None:
                self.error = "There is no file containing chosen features when this is necessary for this model. Please correct"
                return
            try:
                self.needed_features = list(pd.read_csv(chosen_features_file)["Chosen Features"])
            except Exception:
                # Previously an unreadable file escaped as an exception
                # instead of being reported like every other problem.
                self.error = "{} could not be opened or has no \"Chosen Features\" column. Please correct".format(chosen_features_file)
                return

            if self.feature_selection_method == "Manual":
                selector = self.ml_object["gridsearchcv_step"].best_estimator_.named_steps["feature_selection_step"]
                num_chosen_features = sum(selector.get_support())  # number of features the model needs
                if num_chosen_features != len(self.needed_features):
                    self.error = "Requried number of chosen features do not match the number of features in {}. Please correct".format(chosen_features_file)
                    return
                missing_message = "Features in {} were not present in {}. Please correct"
            else:
                missing_message = "Features in {} were not the same as those in {}. Some were missing or altered. This is case sensitive. Please correct."

            try:
                # Only test that the columns exist. The table itself is not
                # narrowed: the original notes that reassigning makes the
                # later scoring fail.
                training_data[self.needed_features]
            except KeyError:
                self.error = missing_message.format(chosen_features_file, self.training_file)
                return  # the "Automatic" case used to fall through to scoring

        # --- score and compare --------------------------------------------
        try:
            current_score = self.ml_object.score(training_data, training_rt)
        except ValueError:
            self.error = "Training data is a different shape than during fitting. This is likely due to a change in training file (usually adding or removing descriptor columns). Please Correct."
            return

        score_gap = abs(current_score - self.training_score)
        baseline = abs(self.training_score)  # magnitude, so a negative stored score cannot hide a mismatch
        if baseline == 0:
            # A relative difference is undefined here; compare the absolute gap.
            scores_disagree = score_gap > self.allowed_diffence_in_score
        else:
            scores_disagree = score_gap / baseline > self.allowed_diffence_in_score
        if scores_disagree:
            self.error = "Training data and machine learning model scores do not agree. Files may have been changed or replaced. Please correct."
            return

        self.all_descriptors = list(training_data.columns)