Skip to content

Commit

Permalink
Merge pull request #49 from rasbt/tfcluster
Browse files Browse the repository at this point in the history
Tfcluster
  • Loading branch information
rasbt committed Apr 27, 2016
2 parents abe10b5 + 341d10b commit 4e6ac4c
Show file tree
Hide file tree
Showing 17 changed files with 905 additions and 14 deletions.
4 changes: 3 additions & 1 deletion ci/.travis_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ if [ "$TENSORFLOW" == "true" ]; then
if [[ "$COVERAGE" == "true" ]]; then
nosetests -s -v mlxtend/tf_classifier --nologcapture --with-coverage
nosetests -s -v mlxtend/tf_regressor --nologcapture --with-coverage
nosetests -s -v mlxtend/tf_cluster --nologcapture --with-coverage
else
nosetests -s -v mlxtend/tf_classifier --nologcapture
nosetests -s -v mlxtend/tf_regressor --nologcapture
nosetests -s -v mlxtend/tf_cluster --nologcapture
else
if [[ "$COVERAGE" == "true" ]]; then
nosetests -s -v --with-coverage --exclude-dir=mlxtend/tf_classifier --exclude-dir=mlxtend/tf_regressor --exclude-dir=mlxtend/data --exclude-dir=mlxtend/general_plotting
nosetests -s -v --with-coverage --exclude-dir=mlxtend/tf_classifier --exclude-dir=mlxtend/tf_regressor --exclude-dir=mlxtend/tf_cluster --exclude-dir=mlxtend/data --exclude-dir=mlxtend/general_plotting
else
nosetests -s -v --exclude-dir=mlxtend/tf_classifier --exclude-dir=mlxtend/data --exclude-dir=mlxtend/general_plotting
fi
Expand Down
5 changes: 4 additions & 1 deletion docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ pages:
- user_guide/feature_extraction/RBFKernelPCA.md
- cluster:
- user_guide/cluster/Kmeans.md
- tf_cluster:
- user_guide/tf_cluster/TfKmeans.md
- evaluate:
- user_guide/evaluate/confusion_matrix.md
- user_guide/evaluate/plot_decision_regions.md
Expand Down Expand Up @@ -107,7 +109,8 @@ pages:
- api_subpackages/mlxtend.evaluate.md
- api_subpackages/mlxtend.feature_selection.md
- api_subpackages/mlxtend.feature_extraction.md
- api_subpackages/mlxtend.cluster.md
- api_subpackages/mlxtend.cluster.md
- api_subpackages/mlxtend.tf_cluster.md
- api_subpackages/mlxtend.file_io.md
- api_subpackages/mlxtend.general_plotting.md
- api_subpackages/mlxtend.preprocessing.md
Expand Down
1 change: 1 addition & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

- New TensorFlow estimator for Linear Regression ([`tf_regressor.TfLinearRegression`](./user_guide/tf_regressor/TfLinearRegression.md))
- New k-means clustering estimator ([`cluster.Kmeans`](./user_guide/cluster/Kmeans.md))
- New TensorFlow k-means clustering estimator ([`tf_cluster.TfKmeans`](./user_guide/tf_cluster/TfKmeans.md))

##### Changes

Expand Down
3 changes: 3 additions & 0 deletions docs/sources/USER_GUIDE_INDEX.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
## `cluster`
- [`Kmeans`](user_guide/cluster/Kmeans.md) (new in 0.4.1dev)

## `tf_cluster`
- [`TfKmeans`](user_guide/tf_cluster/TfKmeans.md) (new in 0.4.1dev)

## `evaluate`
- [`confusion_matrix`](user_guide/evaluate/confusion_matrix.md)
- [`plot_decision_regions`](user_guide/evaluate/plot_decision_regions.md)
Expand Down
18 changes: 13 additions & 5 deletions docs/sources/user_guide/cluster/Kmeans.ipynb

Large diffs are not rendered by default.

Binary file modified docs/sources/user_guide/cluster/Kmeans_files/Kmeans_17_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
411 changes: 411 additions & 0 deletions docs/sources/user_guide/tf_cluster/TfKmeans.ipynb

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7 changes: 4 additions & 3 deletions mlxtend/cluster/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,10 @@ def _shuffle(self, arrays):
r = np.random.permutation(len(arrays[0]))
return [ary[r] for ary in arrays]

def _print_progress(self, iteration, cost=None, time_interval=10):
def _print_progress(self, iteration, n_iter,
cost=None, time_interval=10):
if self.print_progress > 0:
s = '\rIteration: %d/%d' % (iteration, self.n_iter)
s = '\rIteration: %d/%d' % (iteration, n_iter)
if cost:
s += ' | Cost %.2f' % cost
if self.print_progress > 1:
Expand All @@ -96,7 +97,7 @@ def _print_progress(self, iteration, cost=None, time_interval=10):
self.eta_str_ = '00:00:00'
if not iteration % time_interval:
eta_sec = ((ela_sec / float(iteration)) *
self.n_iter - ela_sec)
n_iter - ela_sec)
self.eta_str_ = self._to_hhmmss(eta_sec)
s += ' | ETA: %s' % self.eta_str_
stderr.write(s)
Expand Down
9 changes: 7 additions & 2 deletions mlxtend/cluster/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .base import _BaseCluster
import numpy as np
from time import time
from scipy.spatial.distance import euclidean
# from scipy.spatial.distance import euclidean


class Kmeans(_BaseCluster):
Expand Down Expand Up @@ -86,12 +86,17 @@ def _fit(self, X):
self.centroids_ = new_centroids

self.iterations_ += 1
if self.print_progress:
self._print_progress(iteration=self.iterations_,
n_iter=self.max_iter)

return self

def _get_cluster_idx(self, X, centroids):
for sample_idx, sample in enumerate(X):
dist = [euclidean(sample, c) for c in self.centroids_]
# dist = [euclidean(sample, c) for c in self.centroids_]
dist = np.sqrt(np.sum(np.square(sample - self.centroids_), axis=1))

yield np.argmin(dist)

def _predict(self, X):
Expand Down
15 changes: 13 additions & 2 deletions mlxtend/cluster/tests/test_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_nonfitted():
X)


def test_three_blobs_1():
def test_three_blobs_multi():
km = Kmeans(k=3,
max_iter=50,
random_seed=1,
Expand All @@ -34,7 +34,18 @@ def test_three_blobs_1():
assert (y_pred == y).all()


def test_three_blobs_2():
def test_three_blobs_1sample():
    """Predicting a single-row array returns that row's known label."""
    model = Kmeans(print_progress=0,
                   random_seed=1,
                   max_iter=50,
                   k=3)
    # Row 1 of the data, kept 2D as a (1, 2) array for predict().
    single_row = X[1, :].reshape(1, 2)

    fitted = model.fit(X)
    predicted = fitted.predict(single_row)
    assert predicted[0] == y[1]


def test_three_blobs_centroids():
km = Kmeans(k=3,
max_iter=50,
random_seed=1,
Expand Down
9 changes: 9 additions & 0 deletions mlxtend/tf_cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

from .tf_kmeans import TfKmeans

__all__ = ["TfKmeans"]
71 changes: 71 additions & 0 deletions mlxtend/tf_cluster/tests/test_tf_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

from mlxtend.tf_cluster.tf_base import _TfBaseCluster
import numpy as np
from mlxtend.utils import assert_raises


def test_init():
    """Constructing the base cluster with explicit arguments must not raise."""
    _TfBaseCluster(random_seed=1, print_progress=0)


def test_check_array_1():
    """A 1D array is rejected by `_check_array` with a ValueError."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    one_dim = np.array([1, 2, 3])
    expected_msg = 'X must be a 2D array. Try X[:, numpy.newaxis]'
    assert_raises(ValueError, expected_msg, estimator._check_array, one_dim)


def test_check_array_2():
    """A plain Python list is rejected by `_check_array` with a ValueError."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    nested_list = [[1], [2], [3]]
    expected_msg = 'X must be a numpy array'
    assert_raises(ValueError, expected_msg, estimator._check_array, nested_list)


def test_check_array_3():
    """A 2D numpy array passes `_check_array` without raising."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    column = np.array([[1], [2], [3]])
    estimator._check_array(column)


def test_fit():
    """Calling `fit` on the base class with a valid 2D array must not raise."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    column = np.array([[1], [2], [3]])
    estimator.fit(column)


def test_predict_1():
    """`predict` on an unfitted estimator raises AttributeError."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    column = np.array([[1], [2], [3]])
    expected_msg = 'Model is not fitted, yet.'
    assert_raises(AttributeError, expected_msg, estimator.predict, column)


def test_predict_2():
    """`predict` runs without raising once `fit` has been called."""
    estimator = _TfBaseCluster(random_seed=1, print_progress=0)
    column = np.array([[1], [2], [3]])

    estimator.fit(column)
    estimator.predict(column)


def test_shuffle():
    """`_shuffle` permutes all arrays in unison using NumPy's global RNG."""
    X = np.array([[1], [2], [3]])
    y = np.array([1, 2, 3])
    cl = _TfBaseCluster(print_progress=0, random_seed=1)
    # The estimator only applies `random_seed` inside fit(), which this test
    # never calls. Seed explicitly so the expected permutation does not
    # depend on RNG state left behind by whichever test happened to run first.
    np.random.seed(1)
    X_sh, y_sh = cl._shuffle(arrays=[X, y])
    np.testing.assert_equal(X_sh, np.array([[1], [3], [2]]))
    np.testing.assert_equal(y_sh, np.array([1, 3, 2]))
59 changes: 59 additions & 0 deletions mlxtend/tf_cluster/tests/test_tf_kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

from mlxtend.data import three_blobs_data
from mlxtend.tf_cluster import TfKmeans
from mlxtend.utils import assert_raises
import numpy as np


# Module-level data shared by every test below: the array `X` that the
# estimators are fitted on, and the reference values `y` that the tests
# compare predictions against.
X, y = three_blobs_data()


def test_nonfitted():
    """`predict` before `fit` raises AttributeError."""
    model = TfKmeans(print_progress=0,
                     random_seed=1,
                     max_iter=50,
                     k=3)
    expected_msg = 'Model is not fitted, yet.'
    assert_raises(AttributeError, expected_msg, model.predict, X)


def test_three_blobs_multi():
    """Fitting on the data and predicting it back reproduces every value in `y`."""
    model = TfKmeans(print_progress=0,
                     random_seed=1,
                     max_iter=50,
                     k=3)
    fitted = model.fit(X)
    predicted = fitted.predict(X)
    assert (predicted == y).all()


def test_three_blobs_1sample():
    """Predicting a single-row array returns that row's known label."""
    model = TfKmeans(print_progress=0,
                     random_seed=1,
                     max_iter=50,
                     k=3)
    # Row 1 of the data, kept 2D as a (1, 2) array for predict().
    single_row = X[1, :].reshape(1, 2)

    fitted = model.fit(X)
    predicted = fitted.predict(single_row)
    assert predicted[0] == y[1]


def test_three_blobs_centroids():
    """The fitted centroids match the reference values to two decimals."""
    expected_centroids = np.array([[-1.5947298, 2.92236966],
                                   [2.06521743, 0.96137409],
                                   [0.9329651, 4.35420713]])

    model = TfKmeans(print_progress=0,
                     random_seed=1,
                     max_iter=50,
                     k=3)
    model.fit(X)

    np.testing.assert_almost_equal(expected_centroids,
                                   model.centroids_,
                                   decimal=2)
120 changes: 120 additions & 0 deletions mlxtend/tf_cluster/tf_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Sebastian Raschka 2014-2016
# mlxtend Machine Learning Library Extensions
#
# Base Cluster (Clustering Parent Class)
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from sys import stderr
from time import time
import tensorflow as tf


class _TfBaseCluster(object):

    """Parent class for TensorFlow-based clustering estimators.

    Provides input validation, seeding, fitted-state tracking and progress
    reporting. Child classes supply the actual algorithm by overriding
    `_fit` and `_predict`.

    Parameters
    ----------
    print_progress : int (default: 0)
        Verbosity of the progress line written to stderr.
        0: no output
        1: iteration counter (and cost, if given)
        2: 1 plus time elapsed
        3: 2 plus estimated time until completion
    random_seed : int or None (default: None)
        If not None, used to seed NumPy's global random number generator
        at the start of every `fit` call.
    dtype : object (default: None)
        Stored as `self.dtype`; `tf.float32` is used when None.

    """
    def __init__(self, print_progress=0, random_seed=None, dtype=None):
        self.print_progress = print_progress
        self.random_seed = random_seed
        if dtype is None:
            self.dtype = tf.float32
        else:
            self.dtype = dtype
        self._is_fitted = False

    def fit(self, X):
        """Learn cluster centroids from training data.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features. Python lists are rejected.

        Returns
        -------
        self : object

        """
        # Reset first so a failed re-fit does not leave the model marked
        # as fitted.
        self._is_fitted = False
        self._check_array(X=X)
        if self.random_seed is not None:
            np.random.seed(self.random_seed)
        self._fit(X=X)
        self._is_fitted = True
        return self

    def _fit(self, X):
        # Implemented in child class
        pass

    def predict(self, X):
        """Predict cluster labels of X.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Vectors to assign to clusters, where n_samples is the number of
            samples and n_features is the number of features.

        Returns
        -------
        cluster_labels : array-like, shape = [n_samples]
            Predicted cluster labels.

        Raises
        ------
        ValueError
            If X is a list or is not 2-dimensional.
        AttributeError
            If the model has not been fitted.

        """
        self._check_array(X=X)
        if not self._is_fitted:
            raise AttributeError('Model is not fitted, yet.')
        return self._predict(X)

    def _predict(self, X):
        # Implemented in child class
        pass

    def _shuffle(self, arrays):
        """Shuffle arrays in unison."""
        # One permutation, sized by the first array, is applied to all
        # arrays so that corresponding rows stay aligned.
        r = np.random.permutation(len(arrays[0]))
        return [ary[r] for ary in arrays]

    def _print_progress(self, iteration, n_iter,
                        cost=None, time_interval=10):
        """Write a one-line progress report to stderr.

        Does nothing unless `self.print_progress` is greater than 0.
        Elapsed time and ETA are only recomputed when `iteration` is a
        multiple of `time_interval`; otherwise the previously formatted
        strings are reused. Reporting elapsed time reads `self.init_time_`,
        which is not set by this class.

        Parameters
        ----------
        iteration : int
            Current iteration number.
        n_iter : int
            Total number of iterations.
        cost : float or None (default: None)
            Current cost; appended to the line when not None.
        time_interval : int (default: 10)
            Refresh period, in iterations, of the time estimates.

        """
        if self.print_progress > 0:
            s = '\rIteration: %d/%d' % (iteration, n_iter)
            # Compare against None so that a cost of exactly 0 is still shown.
            if cost is not None:
                s += ' | Cost %.2f' % cost
            if self.print_progress > 1:
                refresh = not iteration % time_interval
                if not hasattr(self, 'ela_str_'):
                    self.ela_str_ = '00:00:00'
                if refresh:
                    ela_sec = time() - self.init_time_
                    self.ela_str_ = self._to_hhmmss(ela_sec)
                s += ' | Elapsed: %s' % self.ela_str_
                if self.print_progress > 2:
                    if not hasattr(self, 'eta_str_'):
                        self.eta_str_ = '00:00:00'
                    # At iteration 0 there is no per-iteration rate to
                    # extrapolate from (and dividing by it would fail).
                    if refresh and iteration > 0:
                        eta_sec = ((ela_sec / float(iteration)) *
                                   n_iter - ela_sec)
                        # Float rounding can push the estimate marginally
                        # below zero on the final iteration; clamp it so it
                        # is not formatted as a negative time.
                        eta_sec = max(eta_sec, 0.0)
                        self.eta_str_ = self._to_hhmmss(eta_sec)
                    s += ' | ETA: %s' % self.eta_str_
            stderr.write(s)
            stderr.flush()

    def _to_hhmmss(self, sec):
        """Format a number of seconds as 'H:MM:SS'."""
        m, s = divmod(sec, 60)
        h, m = divmod(m, 60)
        return "%d:%02d:%02d" % (h, m, s)

    def _check_array(self, X):
        """Raise ValueError if X is a list or is not 2-dimensional."""
        if isinstance(X, list):
            raise ValueError('X must be a numpy array')
        if not len(X.shape) == 2:
            raise ValueError('X must be a 2D array. Try X[:, numpy.newaxis]')
Loading

0 comments on commit 4e6ac4c

Please sign in to comment.