modified: plot.py

freesinger · Jan 10, 2019 · 4d15428 · 4d15428
1 parent e51f667
commit 4d15428
Show file tree

Hide file tree

Showing 20 changed files with 119 additions and 34 deletions.
diff --git a/__pycache__/cluster.cpython-36.pyc b/__pycache__/cluster.cpython-36.pyc
diff --git a/__pycache__/data_process.cpython-36.pyc b/__pycache__/data_process.cpython-36.pyc
diff --git a/cluster.py b/cluster.py
@@ -23,9 +23,9 @@ def locate_center(self, judge, maxid, threshold):
         # result showed in rank.png
         # 6 clusters should be divided in given dataset
 
-        cluster_centers = list(c[0] for c in result[0:5])
+        cluster_centers = list(c[0] for c in result[0:3])
         # given dataset: [1061, 1515, 400, 6, 1566, 614]
-        # generate dataset: [80, 460, 463, 500, 954, 984]
+        # generate dataset: [642, 877, 123]
 
         tag_info = dict()
         cluster_id = 1
@@ -47,8 +47,12 @@ def classify(self, taginfo, srt_dens, min_num, maxid):
         for ele in srt_dens:
             dens_dict[ele[0]] = ele[1]
         for i in dens_dict.keys():
-            if taginfo[i] == -1:
-                taginfo[i] = taginfo[min_num[i]]
+            try:
+                if taginfo[i] == -1:
+                    taginfo[i] = taginfo[min_num[i]]
+            except KeyError:
+                raise 'Key error: key does not exist!'
+
         return taginfo
 
     def analysis(self, centers, taginfo, distance, maxid):

diff --git a/data_process.py b/data_process.py
@@ -1,4 +1,3 @@
-import math
 import numpy as np
 import matplotlib.pyplot as plt
 
@@ -33,39 +32,51 @@ def entropy(self, distance, maxid, factor):
         for i in range(1, maxid + 1):
             tmp = 0
             for j in range(1, maxid + 1):
-                tmp += math.exp(-pow(distance[(i, j)] / factor, 2))
+                tmp += np.exp(-pow(distance[(i, j)] / factor, 2))
             potential[i] = tmp
         z = sum(potential.values())
         H = 0
         for i in range(1, maxid + 1):
             x = potential[i] / z
-            H += x * math.log(x)
+            H += x * np.log(x)
         return -H
 
     def threshold(self, dist, max_id):
         '''
         :rtype: factor value makes H smallest
         '''
         entro = 10.0
+        # given data:
         # 0.02139999999999999 7.203581306901208
         # 0.02149999999999999 7.203577254067677
         # 0.02159999999999999 7.203577734107922
-        scape = np.arange(0.021+1e-4, 0.022, 1e-4)
+
+        # generate data:
+        # 0.367020, 6.943842
+        # 0.368959, 6.943840
+        # 0.370898, 6.943841
+
+        scape = np.linspace(0.330, 0.430, 50)
+        # 通用数据使用以下一行
+        # scape = np.linspace(0.001, 1.001, 100)
         for factor in scape:
             value = self.entropy(dist, max_id, factor)
-            # print(factor, value)
+            print('factor: {0:.6f}, entropy: {1:.8f}'.format(factor, value))
             # plt.scatter(factor, value, c='r', s=1)
             if value and value < entro:
                 entro, thresh = value, factor
         thresh = 3 * thresh / pow(2, 0.5)
+
         """
         plt.xlabel(r'$\sigma$')
         plt.ylabel(r'H')
-        plt.savefig('./images/Entropy.png')
+        plt.savefig('./images/Entropy test.png')
         plt.close()
         """
-        # print('current: ', entro, thresh)
-        # current:  7.203577254067677 0.04560838738653229
+
+        print('current: ', entro, thresh)
+        # given data:  7.203577254067677 0.04560838738653229
+        # generate data: 6.943840312796875 0.7828967189629044
         return thresh
 
     def CutOff(self, distance, max_id, threshold):
@@ -90,7 +101,7 @@ def Guasse(self, distance, max_id, threshold):
         for i in range(1, max_id + 1):
             tmp = 0
             for j in range(1, max_id + 1):
-                tmp += math.exp(-pow((distance[(i, j)] / threshold), 2))
+                tmp += np.exp(-pow((distance[(i, j)] / threshold), 2))
             guasse[i] = tmp
         sorted_guasse = sorted(guasse.items(), key=lambda k:k[1], reverse=True)
         return sorted_guasse

diff --git a/generatePoints.py b/generatePoints.py
@@ -7,9 +7,9 @@
 r = np.random.RandomState(24)
 p = r.randn(400, 2)
 q = r.randn(400, 2) + 7
-r = r.randn(400, 2) + 4
+s = r.randn(400, 2) + 4
 
-t = np.concatenate((p, q, r), axis=0)
+t = np.concatenate((p, q, s), axis=0)
 
 with open(GENERATE_POINTS, 'w', encoding='utf-8') as f:
     for pos in range(len(t)):
@@ -18,7 +18,7 @@
 
 d = lambda x, y: np.sqrt(np.power((x[0] - y[0]), 2) + np.power((x[1] - y[1]), 2))
 
-with open(GENERATE_POINTS_DIST, 'a', encoding='utf-8') as f:
+with open(GENERATE_POINTS_DIST, 'w', encoding='utf-8') as f:
     for i in range(len(t)):
         for j in range(i + 1, len(t)):
             distance = d(t[i], t[j])
@@ -29,8 +29,8 @@
 plt.plot(x, y, 'or', markersize=1, alpha=0.5, label='1')
 # plt.show()
 
-x = r[:, 0]
-y = r[:, 1]
+x = s[:, 0]
+y = s[:, 1]
 plt.plot(x, y, 'ob', markersize=1, alpha=0.5, label='2')
 
 x = q[:, 0]

diff --git a/images/Cluster1 test.png b/images/Cluster1 test.png
diff --git a/images/Cluster2 test.png b/images/Cluster2 test.png
diff --git a/images/Cluster3 test.png b/images/Cluster3 test.png
diff --git a/images/Cluster4 test.png b/images/Cluster4 test.png
diff --git a/images/Cluster5 test.png b/images/Cluster5 test.png
diff --git a/images/Decision Graph Cutoff test.png b/images/Decision Graph Cutoff test.png
diff --git a/images/Entropy test.png b/images/Entropy test.png
diff --git a/images/cluster_cutoff_test.png b/images/cluster_cutoff_test.png
diff --git a/images/generatedPoints.png b/images/generatedPoints.png
diff --git a/images/rank cutoff test.png b/images/rank cutoff test.png
diff --git a/images/result.png b/images/result.png
diff --git a/others/report.html b/others/report.html
diff --git a/others/report.md b/others/report.md
@@ -14,11 +14,11 @@
 
 - **Cut-off Kernel**
 
-给定截断距离$d_{c} > 0$，采用Cut-off kernel方式计算局部密度，由$\rho_{i}=\Sigma_{j}\chi(d_{ij}-d_{c})$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$，这种方式计算局部密度$\rho_i$为连续值。
+给定截断距离$d_{c} > 0$，采用Cut-off kernel方式计算局部密度，定义为$\rho_{i}=\large\Sigma_{j}\chi(d_{ij}-d_{c})$，其中$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$，即$\rho_i$等于与点$i$的距离小于$d_c$的点的个数，因此这种方式计算出的局部密度$\rho_i$为离散值。
 
 - **Gaussian kernel**
 
-给定截断距离$d_{c} > 0$，采用Gaussian kernel方式计算局部密度，由$\rho_{i}=\Sigma_{j}e^-(\frac{d_{ij}}{d_{c}})^2$且$\chi(x) = 1\text{ if } x<0\text{ and }\chi(x)=0\text{ otherwise}$，这种方式计算局部密度$\rho_i$为离散值。
+给定截断距离$d_{c} > 0$，采用Gaussian kernel方式计算局部密度，定义为$\rho_{i}=\large\Sigma_{j}e^{-(\frac{d_{ij}}{d_{c}})^2}$，这种方式计算出的局部密度$\rho_i$为连续值。
 
 #### 2.1.2 最小距离$\delta_i$
 
@@ -46,11 +46,11 @@
 
 #### 2.2.1 Potential Of Point(POP)
 
-对一个数据集$\{x_1,x_2,...,x_n\}$，每个点的potential计算公式为$\varphi(x)=\Sigma_{i=1}^n\big(e^-(\frac{||x-x_i||}{\sigma})^2\big)$，类似Gaussian kernel的计算，其中$||x-x_i||$代表欧式几何空间的$x$与$x_i$的距离，$\sigma$为需要确定的变量值。
+对一个数据集$\{x_1,x_2,...,x_n\}$，每个点的potential计算公式为$\varphi(x)=\large\Sigma_{i=1}^n e^{-(\frac{\|x-x_i\|}{\sigma})^2}$，类似Gaussian kernel的计算，其中$\|x-x_i\|$代表欧氏空间中$x$与$x_i$之间的距离，$\sigma$为需要确定的变量值。
 
 #### 2.2.2 Entropy
 
-对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$，定义数据域的熵值$H=-\Sigma_{i=1}^n(\frac{\varphi_i}{Z})log(\frac{\varphi_i}{Z})$，熵值代表数据域的混乱度，我们需要求使得$H$最小的变量$\sigma$。 下图直观展示了$H$随$\sigma$的变化趋势：
+对一个POP集$\{\varphi_1,\varphi_2,...,\varphi_n\}$，定义数据域的熵值$H=-\large\Sigma_{i=1}^n(\frac{\varphi_i}{Z})\log(\frac{\varphi_i}{Z})$，其中$Z=\Sigma_{i=1}^n\varphi_i$为归一化因子。熵值代表数据域的混乱度，我们需要求使得$H$最小的变量$\sigma$。 下图直观展示了$H$随$\sigma$的变化趋势：
 
 ![entropy](../images/entropy.png)
 
@@ -118,9 +118,53 @@ def classify(self, taginfo, srt_dens, min_num, maxid):
 
 由之前的实验结果可知聚类中心共6个，简单对六个簇的分类情况进行的可视化，横坐标为点标号，纵坐标为点到聚类中心的距离。由于点的个数较多，故采用面积图，如上图所示是第六个簇的效果图。
 
+### 2.4 聚类测试
+
+#### 2.4.1 测试数据
+
+编写`generatePoints.py`来生成三个簇，每个簇400个点且均服从高斯分布，分布图如下所示。
+
+![generatedPoints](../images/generatedPoints.png)
+
+#### 2.4.2 聚类效果
+
+通过求熵值来确定截断距离最佳取值的图如下：
+
+![Entropy test](../images/Entropy test.png)
+
+由此画出决策图如下：
+
+![Decision Graph Cutoff test](../images/Decision Graph Cutoff test.png)
+
+定义$\gamma_i=\rho_i\delta_i$为聚类中心的划分标准，画出图像如下：
+
+![rank cutoff test](../images/rank cutoff test.png)
+
+截断距离选择`0.7828`为最佳值，由图能直观看出此时应该划分三个类，和生成三个簇的数据基本相符。
+
+对三个簇进行可视化，画出相应的结果如下图，黑色加粗点为聚类中心：
+
+![result](../images/result.png)
+
+#### 2.4.3 结果分析
+
+聚类结果与生成图对比发现有的边缘点被忽略了，生成每个聚类簇元素视图如下：
+
+![cluster_cutoff_test](../images/cluster_cutoff_test.png)
+
+1. 可见有一部分点被分到第-1个簇中，这些点是在非聚类中心点的分类过程中距离三个聚类中心都很远的离群点，因此在可视化过程中由聚类中心生成对应的簇时，这些点会被忽略，从而导致聚类结果图中点的缺失。
+
+   **对这些离群点进行有效的信息处理和聚类划分，可以是对该算法优化的下一步工作。**
+
+2. 对于一些交错点的划分，可见该算法性能较为朴素，在处理维度过高或者密度过大的点时容易出现交错点的错误划分。
+
+   **对交错点进行有效的处理可以有效解决这个问题，同时可以提升该算法的健壮性。**
+
 ## 3. 总结
 
-由于对距离定义未知，所以没有进行六类cluster的plot。文章中提到的聚类算法其实只实现了聚类中心的选择，在这基础上阅读了文章的增补内容，进行了聚类过程算法的补全，同时对截断距离的选取进行优化。在这基础之上还可以对聚类边界进行讨论，对离群点和交叉点进行划分。
+由于对距离定义未知，所以没有对初始数据进行六类cluster的plot，只在测试数据集上进行了相关的聚类可视化处理。文章中提到的聚类算法其实只实现了聚类中心的选择，在这基础上阅读了文章的增补内容，进行了聚类过程算法的补全，同时对截断距离的选取进行优化。
+
+在这基础之上还可以对聚类边界进行讨论，对离群点和交叉点进行划分。
 
 对聚类算法的聚类中心选择一直是个研究热点，该算法很朴素但切中要点，能很好地解决聚类中心问题，但是在聚类中心个数的选择上和k-means算法一样，还是需要人为选择，联系对局部密度算法的优化，猜测是否可以对每个点进行熵值计算，寻找聚类中心熵值的特性，从而实现聚类中心个数的自动选择。 
 

diff --git a/plot.py b/plot.py
@@ -9,7 +9,9 @@
 def main():
     solution = data_process.ProcessData()
     dist, maxid = solution.data_process(TEST_DATA)
-    threshold = solution.threshold(dist, maxid)
+    # 通用数据使用以下一行求截断距离（耗时较长）
+    # threshold = solution.threshold(dist, maxid)
+    threshold = 0.7828967189629044
     sort_dst = solution.CutOff(dist, maxid, threshold)
     # sort_dst = solution.Guasse(dist, maxid, threshold)
     min_dist, min_num = solution.min_distance(dist, sort_dst, maxid)
@@ -37,35 +39,35 @@ def main():
             p, x, y = int(p), float(x), float(y)
             coords[p] = [x, y]
     # print(coords[center[0]])
-    for i in range(len(center) - 1):
+    for i in range(len(center)):
         c = coords[center[i]]
         plt.plot(c[0], c[1], 'ok', markersize=5, alpha=0.8)
 
-    color = {0:'k', 1:'b', 2:'g', 3:'r', 4:'c', 5:'m', 6:'y'}
+    color = {0:'r', 1:'b', 2:'g', 3:'k', 4:'c', 5:'m', 6:'y'}
     for p in temp:
         for i in range(len(center)):
+            c = coords[p[0]]
             try:
-                c = coords[p[0]]
-                if p[1] == i:
+                # 标号从1开始，故i+1
+                if p[1] == i + 1:
                     plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
             except KeyError:
-                continue
-                # plt.scatter(c[0], c[1], c=color[i], alpha=0.6, s=1)
+                raise 'Key map not exis!'
+
     plt.xlabel('x')
     plt.ylabel('y')
     plt.title('Plot Result')
     plt.savefig('./images/result.png')
-    plt.show()
+    # plt.show()
+    plt.close()
 
-    """
     y, x = zip(*temp)
     plt.scatter(x, y)
     plt.xlabel('Cluster Number')
     plt.ylabel('Point Number')
     plt.title(r'$d_c=$' + str(threshold))
     plt.savefig('./images/cluster_cutoff_test.png')
-    plt.show()
-    """
+    # plt.show()
 
 if __name__ == '__main__':
     main()

diff --git a/report.pdf b/report.pdf