-
Notifications
You must be signed in to change notification settings - Fork 876
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
1,271 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+14.8 KB
docs/sources/user_guide/data/three_blobs_data_files/three_blobs_data_13_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+16.8 KB
docs/sources/user_guide/data/three_blobs_data_files/three_blobs_data_14_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Sebastian Raschka 2014-2016 | ||
# mlxtend Machine Learning Library Extensions | ||
# Author: Sebastian Raschka <sebastianraschka.com> | ||
# | ||
# License: BSD 3 clause | ||
|
||
from .kmeans import Kmeans | ||
|
||
__all__ = ["Kmeans"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Sebastian Raschka 2014-2016 | ||
# mlxtend Machine Learning Library Extensions | ||
# | ||
# Base Cluster (Clustering Parent Class) | ||
# Author: Sebastian Raschka <sebastianraschka.com> | ||
# | ||
# License: BSD 3 clause | ||
|
||
import numpy as np | ||
from sys import stderr | ||
from time import time | ||
|
||
|
||
class _BaseCluster(object):
    """Parent class for clustering estimators.

    Provides the public ``fit``/``predict`` entry points (input validation,
    random seeding, fitted-state tracking) and a progress printer. Child
    classes supply the actual algorithm by overriding ``_fit`` and
    ``_predict``.

    Parameters
    ----------
    print_progress : int (default: 0)
        Verbosity of ``_print_progress``; values <= 0 print nothing,
        1 prints the iteration counter, 2 adds the elapsed time and
        3 adds the estimated time until completion.
    random_seed : int (default: None)
        If not None, passed to ``np.random.seed`` at the start of ``fit``.

    """

    def __init__(self, print_progress=0, random_seed=None):
        self.print_progress = print_progress
        self.random_seed = random_seed
        self._is_fitted = False

    def fit(self, X):
        """Learn cluster centroids from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object

        """
        # Mark as unfitted first so that a failure inside _fit never leaves
        # the estimator claiming to be fitted.
        self._is_fitted = False
        self._check_array(X=X)
        if self.random_seed is not None:
            # NOTE: this seeds NumPy's *global* random state.
            np.random.seed(self.random_seed)
        # Reference point for the elapsed/ETA output of _print_progress.
        self.init_time_ = time()
        self._fit(X=X)
        self._is_fitted = True
        return self

    def _fit(self, X):
        # Implemented in child class
        pass

    def predict(self, X):
        """Predict cluster labels of X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        cluster_labels : array-like, shape = [n_samples]
            Predicted cluster labels.

        Raises
        ------
        ValueError
            If X fails the checks in ``_check_array``.
        AttributeError
            If ``fit`` has not completed successfully yet.

        """
        self._check_array(X=X)
        if not self._is_fitted:
            raise AttributeError('Model is not fitted, yet.')
        return self._predict(X)

    def _predict(self, X):
        # Implemented in child class
        pass

    def _shuffle(self, arrays):
        """Shuffle arrays in unison."""
        # One permutation, sized by the first array, is applied to all.
        r = np.random.permutation(len(arrays[0]))
        return [ary[r] for ary in arrays]

    def _print_progress(self, iteration, cost=None, time_interval=10):
        """Write a one-line progress report to stderr.

        Parameters
        ----------
        iteration : int
            Current iteration number.
        cost : float (default: None)
            Current cost value; omitted from the output when None.
        time_interval : int (default: 10)
            The elapsed/ETA strings are only refreshed when ``iteration``
            is a multiple of this value.

        """
        if self.print_progress <= 0:
            return

        # Total iteration count: prefer `n_iter`, fall back to `max_iter`
        # because not every child class defines `n_iter`.
        n_total = getattr(self, 'n_iter', None)
        if n_total is None:
            n_total = getattr(self, 'max_iter', None)

        if n_total is None:
            s = '\rIteration: %d' % iteration
        else:
            s = '\rIteration: %d/%d' % (iteration, n_total)

        # Explicit None check so that a cost of exactly 0 is still shown.
        if cost is not None:
            s += ' | Cost %.2f' % cost

        if self.print_progress > 1:
            refresh = not iteration % time_interval
            if not hasattr(self, 'init_time_'):
                # Called outside of fit(): start the clock now.
                self.init_time_ = time()
            if not hasattr(self, 'ela_str_'):
                self.ela_str_ = '00:00:00'
            ela_sec = time() - self.init_time_
            if refresh:
                self.ela_str_ = self._to_hhmmss(ela_sec)
            s += ' | Elapsed: %s' % self.ela_str_

            if self.print_progress > 2:
                if not hasattr(self, 'eta_str_'):
                    self.eta_str_ = '00:00:00'
                # An ETA needs at least one finished iteration (avoids a
                # division by zero) and a known total.
                if refresh and iteration > 0 and n_total is not None:
                    eta_sec = ((ela_sec / float(iteration)) *
                               n_total - ela_sec)
                    self.eta_str_ = self._to_hhmmss(eta_sec)
                s += ' | ETA: %s' % self.eta_str_

        stderr.write(s)
        stderr.flush()

    def _to_hhmmss(self, sec):
        """Format a duration in seconds as 'h:mm:ss'."""
        m, s = divmod(sec, 60)
        h, m = divmod(m, 60)
        return "%d:%02d:%02d" % (h, m, s)

    def _check_array(self, X):
        """Raise ValueError unless X is a 2D array-like with a ``shape``."""
        # Lists (and anything else without a `shape`, e.g. tuples) are
        # rejected up front instead of failing later with AttributeError.
        if isinstance(X, list) or not hasattr(X, 'shape'):
            raise ValueError('X must be a numpy array')
        if not len(X.shape) == 2:
            raise ValueError('X must be a 2D array. Try X[:, numpy.newaxis]')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Sebastian Raschka 2014-2016 | ||
# mlxtend Machine Learning Library Extensions | ||
# | ||
# Estimator for K-means Clustering | ||
# Author: Sebastian Raschka <sebastianraschka.com> | ||
# | ||
# License: BSD 3 clause | ||
|
||
from .base import _BaseCluster | ||
import numpy as np | ||
from time import time | ||
from scipy.spatial.distance import euclidean | ||
|
||
|
||
class Kmeans(_BaseCluster):
    """K-means clustering class.

    Added in 0.4.1dev

    Parameters
    ------------
    k : int
        Number of clusters
    max_iter : int (default: 10)
        Number of iterations during cluster assignment.
        Cluster re-assignment stops automatically when the algorithm
        converged.
    random_seed : int (default: None)
        Set random state for the initial centroid assignment.
    print_progress : int (default: 0)
        Prints progress in fitting to stderr.
        0: No output
        1: Iterations elapsed
        2: 1 plus time elapsed
        3: 2 plus estimated time until completion

    Attributes
    -----------
    centroids_ : 2d-array, shape={k, n_features}
        Feature values of the k cluster centroids.
    clusters_ : dictionary
        The cluster assignments stored as a Python dictionary;
        the dictionary keys denote the cluster indices and the items are
        Python lists of the sample indices that were assigned to each
        cluster.
    iterations_ : int
        Number of centroid updates performed before the centroids
        stopped changing (or `max_iter` was reached).

    """

    def __init__(self, k, max_iter=10, random_seed=None, print_progress=0):
        super(Kmeans, self).__init__(print_progress=print_progress,
                                     random_seed=random_seed)
        self.k = k
        self.max_iter = max_iter

    def _fit(self, X):
        """Learn cluster centroids from training data.

        Called in self.fit

        Raises
        ------
        ValueError
            If `k` is not between 1 and the number of samples in X.

        """
        self.iterations_ = 0
        n_samples = X.shape[0]

        if not 0 < self.k <= n_samples:
            raise ValueError('k must be between 1 and the number of '
                             'samples (%d), got %r' % (n_samples, self.k))

        # initialize centroids with k distinct, randomly chosen samples
        idx = np.random.choice(n_samples, self.k, replace=False)
        self.centroids_ = X[idx]

        for _ in range(self.max_iter):

            # assign samples to their nearest cluster centroid
            self.clusters_ = {i: [] for i in range(self.k)}
            for sample_idx, cluster_idx in enumerate(
                    self._get_cluster_idx(X=X, centroids=self.centroids_)):
                self.clusters_[cluster_idx].append(sample_idx)

            # recompute centroids; a cluster that received no samples
            # (possible when X contains duplicate rows) keeps its previous
            # centroid, since the mean of an empty selection is NaN and
            # would make the convergence test below fail forever
            new_centroids = np.array(
                [np.mean(X[members], axis=0) if members
                 else self.centroids_[c]
                 for c, members in sorted(self.clusters_.items())])

            # stop once the centroids no longer move
            if (self.centroids_ == new_centroids).all():
                break
            else:
                self.centroids_ = new_centroids

            self.iterations_ += 1

        return self

    def _get_cluster_idx(self, X, centroids):
        """Yield, for each row of X, the index of its nearest centroid.

        Distances are Euclidean; on ties the lowest centroid index wins
        (first minimum returned by `np.argmin`).
        """
        for sample in X:
            dist = [euclidean(sample, c) for c in centroids]
            yield np.argmin(dist)

    def _predict(self, X):
        """Predict cluster labels of X.

        Called in self.predict
        """
        pred = np.array(list(self._get_cluster_idx(
            X=X, centroids=self.centroids_)))
        return pred
Oops, something went wrong.