-
Notifications
You must be signed in to change notification settings - Fork 14
/
cluster_lines_kmeans_with_ideal_number_of_clusters.py
135 lines (108 loc) · 3.89 KB
/
cluster_lines_kmeans_with_ideal_number_of_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
##Edge bundling=group
##input=vector
##metric=string euclidean
##target_cluster_size=number 0.1
##kmeans_output=output vector
import numpy as np
import processing
from processing.tools.vector import VectorWriter
from qgis.core import *
from PyQt4.QtCore import *
from sklearn.cluster import KMeans
from datetime import datetime
from math import sqrt
t_start = datetime.now()
print '{0}: Clustering using KMeans'.format(t_start)
layer = processing.getObject(input)
provider = layer.dataProvider()
fields = provider.fields()
fields.append(QgsField('CLUSTER', QVariant.Int))
fields.append(QgsField('CLUSTER_N', QVariant.Int))
writer = VectorWriter(kmeans_output, None,fields, provider.geometryType(), layer.crs() )
# Perform clustering
X = []
for edge in processing.features(layer):
geom = edge.geometry()
azimuth = geom.vertexAt(0).azimuth(geom.vertexAt(1))/4
X.append([geom.vertexAt(0).x(),geom.vertexAt(0).y()])
X.append([geom.vertexAt(1).x(),geom.vertexAt(1).y()])
clusters = 2 # initial number of clusters
mean = target_cluster_size * target_cluster_size * 100 # make sure it's big enough
labels=[]
print sqrt(mean)
print target_cluster_size
while sqrt(mean) > target_cluster_size:
print("Clusters: {0}".format(clusters))
db = KMeans(n_clusters=clusters).fit(X)
pt_labels = list(db.labels_)
# labels[0] = cluster of line0 start
# labels[1] = cluster of line0 end
# labels[2] = cluster of line1 start
# labels[3] = cluster of line1 end
# ...
label_pairs=[]
for ptl in pt_labels:
try:
label_pairs[-1]
except IndexError:
label_pairs.append([])
if len(label_pairs[-1]) < 2:
label_pairs[-1].append(ptl)
else:
label_pairs[-1].sort() # to ignore line direction
label_pairs.append([ptl])
unique_line_labels = []
labels = []
for pair in label_pairs:
if pair not in unique_line_labels:
unique_line_labels.append(pair)
pair_line_label = unique_line_labels.index(pair)
labels.append(pair_line_label)
# Score cluster quality
# size of convex hull of start points or end points per cluster, respectively
start_points = []
end_points = []
for l in range(0,max(labels)+1):
start_points.append(QgsMultiPointV2())
end_points.append(QgsMultiPointV2())
for i,inFeat in enumerate(processing.features(layer)):
geom = inFeat.geometry()
label = int(labels[i])
if label >= 0:
start_points[label].addGeometry(QgsPointV2(geom.vertexAt(0).x(),geom.vertexAt(0).y()))
end_points[label].addGeometry(QgsPointV2(geom.vertexAt(1).x(),geom.vertexAt(1).y()))
start_areas = [QgsGeometry(pts).convexHull().area() for pts in start_points]
end_areas = [QgsGeometry(pts).convexHull().area() for pts in end_points]
sizes = start_areas + end_areas
mean = sum(sizes) / len(sizes)
print("Mean area: {0} or approx {1}^2".format(mean, sqrt(mean) ))
print("Max area: {0} or approx {1}^2".format(max(sizes), sqrt(max(sizes)) ))
if sqrt(mean) > (10*target_cluster_size):
clusters += 2
else:
clusters += 1
# Determine number of edges per cluster
cluster_sizes = []
for l in range(0,max(labels)+1):
cluster_sizes.append(0)
for label in labels:
if label >= 0:
cluster_sizes[label] = cluster_sizes[label]+1
# Create output
outFeat = QgsFeature()
for i,inFeat in enumerate(processing.features(layer)):
inGeom = inFeat.geometry()
outFeat.setGeometry(inGeom)
attrs = inFeat.attributes()
label = int(labels[i])
attrs.append(label)
size = 1
if label >= 0:
size = int(cluster_sizes[label])
attrs.append(size)
outFeat.setAttributes(attrs)
writer.addFeature(outFeat)
del writer
t_end = datetime.now()
print('{0}: Finished!'.format(t_end))
print('Run time: {0}'.format(t_end - t_start))