-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
285 lines (241 loc) · 12.3 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import matplotlib.patches as patches
import matplotlib.gridspec as gridspec
import sklearn as sklearn
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_squared_error, auc, roc_curve, precision_recall_fscore_support
import composition
class MidpointNormalize(Normalize):
    """Colour normalisation with a user-chosen centre value.

    Data values are mapped piecewise-linearly so that ``vmin`` lands on 0,
    ``midpoint`` on 0.5 and ``vmax`` on 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        # The centre value is stored first, then the base class sets up
        # vmin / vmax / clip.
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` mapped through the three anchor points.

        The ``clip`` argument is accepted for interface compatibility but is
        not consulted here.
        """
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        interpolated = np.interp(value, anchors, targets)
        return np.ma.masked_array(interpolated)
def plot_2d_grid_search(grid, midpoint=0.7, vmin=0, vmax=1):
    """Draw a heat map of mean test scores for a two-parameter grid search.

    Parameters
    ----------
    grid : fitted search object exposing ``cv_results_``
        The first two ``param_*`` entries of ``cv_results_`` are used as the
        y axis (first parameter) and x axis (second parameter).
    midpoint : float, optional (default=0.7)
        Score mapped to the centre of the colormap.
    vmin, vmax : float, optional (defaults 0 and 1)
        Scores mapped to the two ends of the colormap.

    Raises
    ------
    ValueError
        If fewer than two parameters were searched.

    Notes
    -----
    ``mean_test_score`` is reshaped to ``(n_first, n_second)``, so the number
    of candidates must equal the product of the two unique-value counts.
    Parameter values are rounded to two decimals for the tick labels, so they
    are expected to be numeric. For ``class_weight`` the weight stored under
    key ``1`` of each entry is used as the value.
    """
    text_size = 20
    results = grid.cv_results_

    # Keys of the form 'param_<name>' hold the value each candidate used.
    parameters = [key[len('param_'):] for key in results
                  if key.startswith('param_')]
    if len(parameters) < 2:
        raise ValueError(
            'plot_2d_grid_search needs a search over two parameters, '
            'found %d' % len(parameters))
    row_name, col_name = parameters[0], parameters[1]

    # De-duplicate in order of first appearance. The score matrix rows and
    # columns follow the order in which candidates were evaluated, so the
    # tick labels must follow that same order (a set + sorted() would only
    # agree with the matrix when the grid was given in ascending order).
    row_values = list(dict.fromkeys(results['param_' + row_name]))
    if col_name == 'class_weight':
        col_values = list(dict.fromkeys(
            weights[1] for weights in results['param_' + col_name]))
    else:
        col_values = list(dict.fromkeys(results['param_' + col_name]))

    scores = results['mean_test_score'].reshape(len(row_values),
                                                len(col_values))

    row_labels = [round(value, 2) for value in row_values]
    col_labels = [round(value, 2) for value in col_values]

    plt.figure(figsize=(12, 10))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=vmin, vmax=vmax, midpoint=midpoint))
    plt.xlabel(col_name, size=text_size)
    plt.ylabel(row_name, size=text_size)
    plt.tick_params(direction='in', length=5, bottom=True, top=True,
                    left=True, right=True)
    plt.colorbar()
    plt.xticks(np.arange(len(col_labels)), col_labels, rotation=90,
               size=text_size)
    plt.yticks(np.arange(len(row_labels)), row_labels, size=text_size)
    plt.title('grid search')
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation scores against training-set size.

    Adapted from the scikit-learn "plot_learning_curve" example:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve
    .html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

    A new 6x6 figure is created; the mean score of each curve is drawn as a
    line with markers and a shaded band of one standard deviation around it.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``sklearn.model_selection.learning_curve``.
    title : string
        Title for the chart.
    X : array-like
        Feature data handed to ``learning_curve``.
    y : array-like
        Target data handed to ``learning_curve``.
    ylim : tuple (ymin, ymax), optional
        If given, fixes the y-axis limits of the plot.
    cv : optional
        Cross-validation setting, passed to ``learning_curve`` unchanged.
    n_jobs : int or None, optional (default=None)
        Parallelism setting, passed to ``learning_curve`` unchanged.
    train_sizes : array-like, optional (default: np.linspace(0.1, 1.0, 5))
        Training-set sizes, passed to ``learning_curve`` unchanged.
    """
    plt.figure(figsize=(6, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.tick_params(direction='in', length=5, bottom=True, top=True, left=True, right=True)

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, std, colour, legend label), training first.
    curves = [
        (np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1), colour, label)
        for fold_scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded bands are drawn first so both mean lines sit on top of them.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
def rf_feature_importance(rf, X_train, N='all', std_deviation=False):
    """Print and plot the most important features of a fitted forest.

    Parameters
    ----------
    rf : fitted estimator
        Must expose ``feature_importances_``; ``estimators_`` is also read
        when ``std_deviation`` is requested.
    X_train : pandas.DataFrame
        Training features. Its column names label the ranking and its
        column count defines what ``N='all'`` means.
    N : int or 'all', optional (default='all')
        Number of top-ranked features to report. Values larger than the
        number of columns are reduced to the number of columns.
    std_deviation : bool, optional (default=False)
        If true, draw error bars showing the standard deviation of each
        feature's importance across the individual estimators.

    Returns
    -------
    numpy.ndarray
        Column names of the reported features, most important first.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> rf = RandomForestRegressor(max_depth=20, random_state=0)
    >>> rf.fit(X_train, y_train)
    >>> rf_feature_importance(rf, X_train, N=15)
    """
    n_features = X_train.shape[1]
    # Clamp so that asking for more features than exist cannot index past
    # the end of the ranking.
    N = n_features if N == 'all' else min(N, n_features)

    importances = rf.feature_importances_
    # Indices of the N largest importances, in descending order.
    indices = np.argsort(importances)[::-1][:N]
    feature_names = X_train.columns.values[indices]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, (index, name) in enumerate(zip(indices, feature_names), start=1):
        print(("%d. feature %d (%.3f)" % (rank, index, importances[index])), ':', name)

    # Plot the feature importances of the forest
    positions = range(0, N)
    plt.figure(figsize=(6, 6))
    plt.title("Feature importances")
    if std_deviation:
        # The spread across estimators is only computed when it is drawn.
        std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                     axis=0)
        plt.bar(positions, importances[indices], color="r",
                yerr=std[indices], align="center")
    else:
        plt.bar(positions, importances[indices], color="r", align="center")
    plt.tick_params(direction='in', length=5, bottom=True, top=True, left=True, right=True)
    plt.xticks(positions, indices, rotation=90)
    plt.xlim([-1, N])
    return feature_names
def plot_act_vs_pred(y_actual, y_predicted):
    """Scatter predicted values against actual values with a parity line.

    A new 10x10 figure is created. Both axes share the same limits so the
    dashed ``y = x`` line runs corner to corner across the data range.

    Parameters
    ----------
    y_actual : iterable of numbers
        Reference values (x axis).
    y_predicted : iterable of numbers
        Model predictions (y axis).
    """
    text_size = 20

    # Overall data range, computed once and shared by the parity line and
    # the axis limits.
    lowest = min(min(y_actual), min(y_predicted))
    highest = max(max(y_actual), max(y_predicted))

    plt.figure(figsize=(10, 10))
    plt.plot(y_actual, y_predicted, marker='o', markersize=14, mfc='#0077be',
             color='k', linestyle='none', alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], 'k--')
    plt.minorticks_on()
    plt.tick_params(direction='in', length=15, bottom=True, top=True, left=True, right=True)
    plt.tick_params(direction='in', length=7, bottom=True, top=True, left=True, right=True, which='minor')

    # Axes start at zero for non-negative data, but extend downwards when
    # any value is negative so that no point is pushed off the canvas.
    limits = [min(0, lowest), highest]
    plt.xlim(limits)
    plt.ylim(limits)
    plt.xticks(size=text_size)
    plt.yticks(size=text_size)
    plt.xlabel('Actual', size=text_size)
    plt.ylabel('Predicted', size=text_size)
def get_roc_auc(actual, probability, plot=False):
    """Compute the area under the ROC curve, optionally drawing the curve.

    Parameters
    ----------
    actual : array-like
        True labels; the label ``1`` is treated as the positive class.
    probability : array-like
        Scores passed to ``roc_curve`` for the positive class.
    plot : bool, optional (default=False)
        The curve is drawn (on figure number 2) only when this is exactly
        ``True``.

    Returns
    -------
    Area under the ROC curve as returned by ``auc``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(actual, probability, pos_label=1)
    area = auc(false_pos_rate, true_pos_rate)

    # Guard clause: nothing to draw unless plotting was explicitly requested.
    if plot is not True:
        return area

    line_width = 2
    plt.figure(2, figsize=(6, 6))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange',
             lw=line_width, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([-0.02, 1.02])
    plt.ylim([-0.02, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.tick_params(direction='in', length=5, bottom=True, top=True, left=True, right=True)
    return area
def get_classification_performance_metrics(actual, predicted, probability, plot=False):
    """Return the F-score and ROC AUC of a binary classifier, as percentages.

    Parameters
    ----------
    actual : array-like
        True labels. The confusion matrix is unpacked into four cells, so
        exactly two classes must be present.
    predicted : array-like
        Predicted labels.
    probability : array-like
        Positive-class scores, forwarded to ``get_roc_auc``.
    plot : bool, optional (default=False)
        Forwarded to ``get_roc_auc`` to draw the ROC curve.

    Returns
    -------
    fscore : float
        Harmonic mean of precision and recall, on a 0-100 scale. It is 0.0
        when precision and recall are both zero or undefined.
    roc_auc : float
        Area under the ROC curve multiplied by 100.
    """
    _tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()
    roc_auc = get_roc_auc(actual, probability, plot=plot) * 100

    # With no actual positives (recall) or no predicted positives
    # (precision) the ratio is undefined; report 0 instead of NaN.
    actual_positives = fn + tp
    predicted_positives = tp + fp
    recall = tp / actual_positives * 100 if actual_positives else 0.0
    precision = tp / predicted_positives * 100 if predicted_positives else 0.0

    denominator = recall + precision
    fscore = 2 * (recall * precision) / denominator if denominator else 0.0
    return fscore, roc_auc
class MaterialsModel():
    """Bundle a trained model with the scaler and normalizer fitted alongside it.

    Parameters
    ----------
    trained_model : object with a ``predict`` method
    scalar : object with a ``transform`` method
        Applied to the generated features first.
    normalizer : object with a ``transform`` method
        Applied to the scaled features before prediction.
    """

    def __init__(self, trained_model, scalar, normalizer):
        self.model = trained_model
        self.scalar = scalar
        self.normalizer = normalizer

    def predict(self, formula):
        '''
        Predict values for one or more chemical formulae.

        Parameters
        ----------
        formula: str or sequence of str
            A single chemical formula, or a list / tuple / pandas Series /
            numpy array of formulae.

        Return
        ----------
        prediction: pd.DataFrame()
            built from the formulae returned by
            composition.generate_features(), with the model output stored
            in the 'predicted value' column

        Raises
        ----------
        TypeError
            if formula is neither a string nor one of the supported
            sequence types
        '''
        # Store the formulae in a dataframe with a dummy 'target' column;
        # composition.generate_features() is called with that dataframe.
        if isinstance(formula, str):
            formulae = [formula]
            targets = [0]
        elif isinstance(formula, (list, tuple, pd.Series, np.ndarray)):
            formulae = list(formula)
            targets = np.zeros(len(formulae))
        else:
            raise TypeError(
                'formula must be a str or a sequence of str, got %s'
                % type(formula).__name__)
        df_formula = pd.DataFrame()
        df_formula['formula'] = formulae
        df_formula['target'] = targets

        # Features for each formula. The dummy targets come back too and
        # are ignored. (composition is a project-local module; its return
        # is unpacked as features, targets, formulae.)
        X, _, formulae_out = composition.generate_features(df_formula)

        # Apply the stored transforms in order: scaler first, then normalizer.
        X_scaled = self.scalar.transform(X)
        X_scaled = self.normalizer.transform(X_scaled)
        y_predicted = self.model.predict(X_scaled)

        # Collect the predictions next to their formulae.
        prediction = pd.DataFrame(formulae_out)
        prediction['predicted value'] = y_predicted
        return prediction