Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
modified: plot.py
Browse files Browse the repository at this point in the history
  • Loading branch information
freesinger committed Jan 10, 2019
1 parent e51f667 commit 4d15428
Show file tree
Hide file tree
Showing 20 changed files with 119 additions and 34 deletions.
Binary file modified __pycache__/cluster.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/data_process.cpython-36.pyc
Binary file not shown.
12 changes: 8 additions & 4 deletions cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def locate_center(self, judge, maxid, threshold):
# result showed in rank.png
# 6 clusters should be divided in given dataset

cluster_centers = list(c[0] for c in result[0:5])
cluster_centers = list(c[0] for c in result[0:3])
# given dataset: [1061, 1515, 400, 6, 1566, 614]
# generate dataset: [80, 460, 463, 500, 954, 984]
# generate dataset: [642, 877, 123]

tag_info = dict()
cluster_id = 1
Expand All @@ -47,8 +47,12 @@ def classify(self, taginfo, srt_dens, min_num, maxid):
for ele in srt_dens:
dens_dict[ele[0]] = ele[1]
for i in dens_dict.keys():
if taginfo[i] == -1:
taginfo[i] = taginfo[min_num[i]]
try:
if taginfo[i] == -1:
taginfo[i] = taginfo[min_num[i]]
except KeyError:
raise 'Key error: key does not exist!'

return taginfo

def analysis(self, centers, taginfo, distance, maxid):
Expand Down
29 changes: 20 additions & 9 deletions data_process.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import math
import numpy as np
import matplotlib.pyplot as plt

Expand Down Expand Up @@ -33,39 +32,51 @@ def entropy(self, distance, maxid, factor):
for i in range(1, maxid + 1):
tmp = 0
for j in range(1, maxid + 1):
tmp += math.exp(-pow(distance[(i, j)] / factor, 2))
tmp += np.exp(-pow(distance[(i, j)] / factor, 2))
potential[i] = tmp
z = sum(potential.values())
H = 0
for i in range(1, maxid + 1):
x = potential[i] / z
H += x * math.log(x)
H += x * np.log(x)
return -H

def threshold(self, dist, max_id):
'''
:rtype: factor value makes H smallest
'''
entro = 10.0
# given data:
# 0.02139999999999999 7.203581306901208
# 0.02149999999999999 7.203577254067677
# 0.02159999999999999 7.203577734107922
scape = np.arange(0.021+1e-4, 0.022, 1e-4)

# generate data:
# 0.367020, 6.943842
# 0.368959, 6.943840
# 0.370898, 6.943841

scape = np.linspace(0.330, 0.430, 50)
# 通用数据使用以下一行
# scape = np.linspace(0.001, 1.001, 100)
for factor in scape:
value = self.entropy(dist, max_id, factor)
# print(factor, value)
print('factor: {0:.6f}, entropy: {1:.8f}'.format(factor, value))
# plt.scatter(factor, value, c='r', s=1)
if value and value < entro:
entro, thresh = value, factor
thresh = 3 * thresh / pow(2, 0.5)

"""
plt.xlabel(r'$\sigma$')
plt.ylabel(r'H')
plt.savefig('./images/Entropy.png')
plt.savefig('./images/Entropy test.png')
plt.close()
"""
# print('current: ', entro, thresh)
# current: 7.203577254067677 0.04560838738653229

print('current: ', entro, thresh)
# given data: 7.203577254067677 0.04560838738653229
# generate data: 6.943840312796875 0.7828967189629044
return thresh

def CutOff(self, distance, max_id, threshold):
Expand All @@ -90,7 +101,7 @@ def Guasse(self, distance, max_id, threshold):
for i in range(1, max_id + 1):
tmp = 0
for j in range(1, max_id + 1):
tmp += math.exp(-pow((distance[(i, j)] / threshold), 2))
tmp += np.exp(-pow((distance[(i, j)] / threshold), 2))
guasse[i] = tmp
sorted_guasse = sorted(guasse.items(), key=lambda k:k[1], reverse=True)
return sorted_guasse
Expand Down
10 changes: 5 additions & 5 deletions generatePoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
r = np.random.RandomState(24)
p = r.randn(400, 2)
q = r.randn(400, 2) + 7
r = r.randn(400, 2) + 4
s = r.randn(400, 2) + 4

t = np.concatenate((p, q, r), axis=0)
t = np.concatenate((p, q, s), axis=0)

with open(GENERATE_POINTS, 'w', encoding='utf-8') as f:
for pos in range(len(t)):
Expand All @@ -18,7 +18,7 @@

d = lambda x, y: np.sqrt(np.power((x[0] - y[0]), 2) + np.power((x[1] - y[1]), 2))

with open(GENERATE_POINTS_DIST, 'a', encoding='utf-8') as f:
with open(GENERATE_POINTS_DIST, 'w', encoding='utf-8') as f:
for i in range(len(t)):
for j in range(i + 1, len(t)):
distance = d(t[i], t[j])
Expand All @@ -29,8 +29,8 @@
plt.plot(x, y, 'or', markersize=1, alpha=0.5, label='1')
# plt.show()

x = r[:, 0]
y = r[:, 1]
x = s[:, 0]
y = s[:, 1]
plt.plot(x, y, 'ob', markersize=1, alpha=0.5, label='2')

x = q[:, 0]
Expand Down
Binary file modified images/Cluster1 test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/Cluster2 test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/Cluster3 test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed images/Cluster4 test.png
Binary file not shown.
Binary file removed images/Cluster5 test.png
Binary file not shown.
Binary file modified images/Decision Graph Cutoff test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Entropy test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/cluster_cutoff_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/generatedPoints.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/rank cutoff test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/result.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions others/report.html

Large diffs are not rendered by default.

54 changes: 49 additions & 5 deletions others/report.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

- **Cut-off Kernel**

给定截断距离$d_{c} > 0$,采用Cut-off kernel方式计算局部密度,由$\rho_{i}=\Sigma_{j}\chi(d_{ij}-d_{c})$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$,这种方式计算局部密度$\rho_i$为连续值。
给定截断距离$d_{c} > 0$,采用Cut-off kernel方式计算局部密度,局部密度定义为$\rho_{i}=\large\Sigma_{j}\chi(d_{ij}-d_{c})$,其中$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$。$\rho_i$即为与点$i$的距离小于$d_c$的点的个数,因此这种方式计算得到的局部密度$\rho_i$为离散值。

- **Gaussian kernel**

给定截断距离$d_{c} > 0$,采用Gaussian kernel方式计算局部密度,由$\rho_{i}=\Sigma_{j}e^-(\frac{d_{ij}}{d_{c}})^2$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$,这种方式计算局部密度$\rho_i$为离散值。
给定截断距离$d_{c} > 0$,采用Gaussian kernel方式计算局部密度,局部密度定义为$\rho_{i}=\large\Sigma_{j}e^{-(\frac{d_{ij}}{d_{c}})^2}$,这种方式计算得到的局部密度$\rho_i$为连续值。

#### 2.1.2 最小距离$\delta_i$

Expand Down Expand Up @@ -46,11 +46,11 @@

#### 2.2.1 Potential Of Point(POP)

对一个数据集$\{x_1,x_2,...,x_n\}$,每个点的potential计算公式为$\varphi(x)=\Sigma_{i=1}^n\big(e^-(\frac{||x-x_i||}{\sigma})^2\big)$,类似Gaussian kernel的计算,其中$||x-x_i||$代表欧式几何空间的$x$与$x_i$的距离,$\sigma$为需要确定的变量值。
对一个数据集$\{x_1,x_2,...,x_n\}$,每个点的potential计算公式为$\varphi(x)=\large\Sigma_{i=1}^n\big(e^{-(\frac{||x-x_i||}{\sigma})^2}\big)$,类似Gaussian kernel的计算,其中$||x-x_i||$代表欧氏几何空间中$x$与$x_i$的距离,$\sigma$为需要确定的变量值。

#### 2.2.2 Entropy

对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$,定义数据域的熵值$H=-\Sigma_{i=1}^n(\frac{\varphi_i}{Z})log(\frac{\varphi_i}{Z})$,熵值代表数据域的混乱度,我们需要求使得$H$最小的变量$\sigma$。 下图直观展示了$H$随$\sigma$的变化趋势:
对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$,定义数据域的熵值$H=-\large\Sigma_{i=1}^n(\frac{\varphi_i}{Z})\log(\frac{\varphi_i}{Z})$,其中$Z=\large\Sigma_{i=1}^n\varphi_i$为归一化因子。熵值代表数据域的混乱度,我们需要求使得$H$最小的变量$\sigma$。下图直观展示了$H$随$\sigma$的变化趋势:

![entropy](../images/entropy.png)

Expand Down Expand Up @@ -118,9 +118,53 @@ def classify(self, taginfo, srt_dens, min_num, maxid):

由之前的实验结果可知聚类中心共6个,对六个簇的分类情况进行了简单的可视化,横坐标为点标号,纵坐标为点到聚类中心的距离。由于点的个数较多,故采用面积图,如上图所示是第六个簇的效果图。

### 2.4 聚类测试

#### 2.4.1 测试数据

编写`generatePoints.py`来生成三个簇,每个簇400个点且均服从高斯分布,分布图如下所示。

![generatedPoints](../images/generatedPoints.png)

#### 2.4.2 聚类效果

通过求熵值来确定截断距离最佳取值的图如下:

![Entropy test](../images/Entropy%20test.png)

由此画出决策图如下:

![Decision Graph Cutoff test](../images/Decision%20Graph%20Cutoff%20test.png)

定义$\gamma_i=\rho_i\delta_i$为聚类中心的划分标准,画出图像如下:

![rank cutoff test](../images/rank%20cutoff%20test.png)

截断距离选择`0.7828`为最佳值,由图能直观看出此时应该划分三个类,和生成三个簇的数据基本相符。

对三个簇进行可视化,画出相应的结果如下图,黑色加粗点为聚类中心:

![result](../images/result.png)

#### 2.4.3 结果分析

聚类结果与生成图对比发现有的边缘点被忽略了,生成每个聚类簇元素视图如下:

![cluster_cutoff_test](../images/cluster_cutoff_test.png)

1. 可见有一部分点被分到第-1个簇中,这些是在非聚类中心点的分类过程中与三个聚类中心距离都很远的离群点。因此在可视化过程中由聚类中心生成对应的簇时,这些点会被忽略,从而导致聚类结果图中点的缺失。

**对这些离群点进行有效的信息处理和聚类划分,可以是对该算法优化的下一步工作。**

2. 对于一些交错点的划分,可见该算法较为朴素,在处理维度过高或者密度过大的点时容易出现交错点的错误划分。

**对交错点进行有效的处理可以有效解决这个问题,同时可以提升该算法的健壮性。**

## 3. 总结

由于对距离定义未知,所以没有进行六类cluster的plot。文章中提到的聚类算法其实只实现了聚类中心的选择,在这基础上阅读了文章的增补内容,进行了聚类过程算法的补全,同时对截断距离的选取进行优化。在这基础之上还可以对聚类边界进行讨论,对离群点和交叉点进行划分。
由于对距离定义未知,所以没有对初始数据进行六类cluster的plot,只在测试数据集上进行了相关的聚类可视化处理。文章中提到的聚类算法其实只实现了聚类中心的选择,在这基础上阅读了文章的增补内容,进行了聚类过程算法的补全,同时对截断距离的选取进行优化。

在这基础之上还可以对聚类边界进行讨论,对离群点和交叉点进行划分。

对聚类算法的聚类中心选择一直是个研究热点,该算法很朴素但切中要点,能很好地解决聚类中心问题,但是在聚类中心个数的选择上和k-means算法一样,还是需要人为选择,联系对局部密度算法的优化,猜测是否可以对每个点进行熵值计算,寻找聚类中心熵值的特性,从而实现聚类中心个数的自动选择。

Expand Down
24 changes: 13 additions & 11 deletions plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
def main():
solution = data_process.ProcessData()
dist, maxid = solution.data_process(TEST_DATA)
threshold = solution.threshold(dist, maxid)
# 通用数据使用以下一行求截断距离(耗时较长)
# threshold = solution.threshold(dist, maxid)
threshold = 0.7828967189629044
sort_dst = solution.CutOff(dist, maxid, threshold)
# sort_dst = solution.Guasse(dist, maxid, threshold)
min_dist, min_num = solution.min_distance(dist, sort_dst, maxid)
Expand Down Expand Up @@ -37,35 +39,35 @@ def main():
p, x, y = int(p), float(x), float(y)
coords[p] = [x, y]
# print(coords[center[0]])
for i in range(len(center) - 1):
for i in range(len(center)):
c = coords[center[i]]
plt.plot(c[0], c[1], 'ok', markersize=5, alpha=0.8)

color = {0:'k', 1:'b', 2:'g', 3:'r', 4:'c', 5:'m', 6:'y'}
color = {0:'r', 1:'b', 2:'g', 3:'k', 4:'c', 5:'m', 6:'y'}
for p in temp:
for i in range(len(center)):
c = coords[p[0]]
try:
c = coords[p[0]]
if p[1] == i:
# 标号从1开始,故i+1
if p[1] == i + 1:
plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
except KeyError:
continue
# plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
raise 'Key map not exis!'

plt.xlabel('x')
plt.ylabel('y')
plt.title('Plot Result')
plt.savefig('./images/result.png')
plt.show()
# plt.show()
plt.close()

"""
y, x = zip(*temp)
plt.scatter(x, y)
plt.xlabel('Cluster Number')
plt.ylabel('Point Number')
plt.title(r'$d_c=$' + str(threshold))
plt.savefig('./images/cluster_cutoff_test.png')
plt.show()
"""
# plt.show()

if __name__ == '__main__':
main()
Expand Down
Binary file modified report.pdf
Binary file not shown.

0 comments on commit 4d15428

Please sign in to comment.