forked from google/active-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrepresent_cluster_centers.py
78 lines (68 loc) · 3.11 KB
/
represent_cluster_centers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Another informative and diverse sampler that mirrors the algorithm described
in Xu, et. al., Representative Sampling for Text Classification Using
Support Vector Machines, 2003
Batch is created by clustering points within the margin of the classifier and
choosing points closest to the k centroids.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn.cluster import MiniBatchKMeans
import numpy as np
from sampling_methods.sampling_def import SamplingMethod
class RepresentativeClusterMeanSampling(SamplingMethod):
"""Selects batch based on informative and diverse criteria.
Returns points within the margin of the classifier that are closest to the
k-means centers of those points.
"""
def __init__(self, X, y, seed):
self.name = 'cluster_mean'
self.X = X
self.flat_X = self.flatten_X()
self.y = y
self.seed = seed
def select_batch_(self, model, N, already_selected, **kwargs):
# Probably okay to always use MiniBatchKMeans
# Should standardize data before clustering
# Can cluster on standardized data but train on raw features if desired
try:
distances = model.decision_function(self.X)
except:
distances = model.predict_proba(self.X)
if len(distances.shape) < 2:
min_margin = abs(distances)
else:
sort_distances = np.sort(distances, 1)[:, -2:]
min_margin = sort_distances[:, 1] - sort_distances[:, 0]
rank_ind = np.argsort(min_margin)
rank_ind = [i for i in rank_ind if i not in already_selected]
distances = abs(model.decision_function(self.X))
min_margin_by_class = np.min(abs(distances[already_selected]),axis=0)
unlabeled_in_margin = np.array([i for i in range(len(self.y))
if i not in already_selected and
any(distances[i]<min_margin_by_class)])
if len(unlabeled_in_margin) < N:
print("Not enough points within margin of classifier, using simple uncertainty sampling")
return rank_ind[0:N]
clustering_model = MiniBatchKMeans(n_clusters=N)
dist_to_centroid = clustering_model.fit_transform(self.flat_X[unlabeled_in_margin])
medoids = np.argmin(dist_to_centroid,axis=0)
medoids = list(set(medoids))
selected_indices = unlabeled_in_margin[medoids]
selected_indices = sorted(selected_indices,key=lambda x: min_margin[x])
remaining = [i for i in rank_ind if i not in selected_indices]
selected_indices.extend(remaining[0:N-len(selected_indices)])
return selected_indices