-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmy_knn.py
64 lines (47 loc) · 2.21 KB
/
my_knn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from data_utils import * # Utility functions we wrote
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
"""
Findings
The model performs about the same for K between 3 and 21, which suggests that the classes in the data are very well separated.
Accuracy begins to drop steadily once K is larger than 21.
Chose K=3: there are no statistically significant differences between small values of K, and a simpler model usually generalizes better.
"""
# Warning
# Regarding the Nearest Neighbors algorithms, if it is found that two neighbors, neighbor k+1 and k, have identical distances but different labels, the results will depend on the ordering of the training data.
def best_knn():
    """Return an unfitted KNN classifier configured with the chosen k.

    The neighbor count is fixed at 3; according to the author's experiments,
    k=3 was empirically best under both 10-fold and leave-one-out CV.
    """
    chosen_k = 3
    return KNeighborsClassifier(n_neighbors=chosen_k)
# TODO Pulled this from randomForest -- move to util?
# Would have to generalize, pass in labels
def plotAccuracy(accuracy, k, title):
    """Display a line chart of accuracy versus number of neighbors.

    Args:
        accuracy: y-values to plot, one per entry in ``k``.
        k: x-values to plot (the neighbor counts that were evaluated).
        title: text shown above the axes.

    Returns nothing; the figure is shown via ``plt.show()``.
    """
    # One 10x4 figure holding a single set of axes.
    figure, axes = plt.subplots(figsize=(10, 4), tight_layout=True)
    axes.plot(k, accuracy)

    axes.set_title(title, fontsize=12)
    axes.set_xlabel("Number of Neighbors (k)")
    axes.set_ylabel("Accuracy")

    plt.show()
def getScores(X, Y, k_list, num_cv_folds):
    """Cross-validate a KNN classifier once for every candidate k.

    Args:
        X: training samples, passed straight through to ``cross_val_score``.
        Y: training labels, passed straight through to ``cross_val_score``.
        k_list: iterable of neighbor counts to evaluate, in order.
        num_cv_folds: number of splits handed to ``KFold``.

    Returns:
        A list holding, for each k in ``k_list`` (same order), the mean of
        the per-fold scores reported by ``cross_val_score``.
    """
    # The same splitter object is reused for every k; it is built with only
    # n_splits, so every other KFold setting is the library default.
    splitter = KFold(n_splits=num_cv_folds)

    def mean_cv_score(n_neighbors):
        # Progress message first, then fit/score across all folds.
        print("Evaluating KNN with k=%2d" % n_neighbors)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        fold_scores = cross_val_score(model, X, Y, cv=splitter)
        return fold_scores.mean()

    return [mean_cv_score(n_neighbors) for n_neighbors in k_list]
if __name__ == '__main__':
    # Script entry point: score KNN over a range of k values with two
    # cross-validation schemes, print a comparison table, then plot both.
    X, Y = get_training()
    k_list = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 31, 51, 71, 91]

    # 10-fold CV first, then leave-one-out (one fold per training sample).
    ten_scores = getScores(X, Y, k_list, 10)
    loocv_scores = getScores(X, Y, k_list, len(X))

    # Horizontal rule printed above and below the table.
    rule = "____________________________________________________________"
    print(rule)
    for k, acc1, acc2 in zip(k_list, ten_scores, loocv_scores):
        print("| k = %2d | 10-Fold Accuracy: %.3f | LOOCV Accuracy: %.3f |" % (k, acc1, acc2))
    print(rule)

    # Generate graphs of k vs. accuracy
    plotAccuracy(ten_scores, k_list, "K-Nearest Neighbors Accuracy (10 Fold CV)")
    plotAccuracy(loocv_scores, k_list, "K-Nearest Neighbors Accuracy (LOOCV)")