/*
 * Filename: /Users/bao/code/allhic/cluster.go
 * Path: /Users/bao/code/allhic
 * Created Date: Saturday, April 21st 2018, 4:09:48 pm
 * Author: bao
 *
 * Copyright (c) 2018 Haibao Tang
 */

package allhic

import (
	"bufio"
	"fmt"
	"os"
	"sort"
	"strings"
)
// merge stores a potential merge between two clusters and its score
type merge struct {
	a     int
	b     int
	score float64
}

// Clusters stores all the contig IDs per cluster
type Clusters map[int][]int

// clusterLen helps sorting based on the total length of a cluster
type clusterLen struct {
	cID    int
	length int
}

// linkage stores the average linkage of a contig to a cluster (for sorting)
type linkage struct {
	avgLinkage float64
	cID        int
}
// Cluster performs the hierarchical clustering
// This function is a re-implementation of the AHClustering() function in LACHESIS
func (r *Partitioner) Cluster() {
	// LACHESIS also skips contigs that are thought to be centromeric
	G := r.matrix
	nclusters := r.K
	N := len(r.contigs)

	// Auxiliary data structures to facilitate cluster merging
	clusterID := make([]int, N)
	clusterSize := make([]int, 2*N)
	clusterExists := make([]bool, 2*N)
	nonSingletonClusters := 0
	nContigsSkipped := 0

	// Initially all contigs are in their own cluster
	for i, contig := range r.contigs {
		if contig.skip {
			clusterID[i] = -1
			nContigsSkipped++
			continue
		}
		clusterID[i] = i
		clusterSize[i] = 1
		clusterExists[i] = true
	}

	nNonSkipped := N - nContigsSkipped
	if nNonSkipped == 0 {
		log.Noticef("There are no informative contigs for clustering. Contigs are either SHORT or REPETITIVE.")
	}
	log.Noticef("Clustering starts with %d (%d informative) contigs with target of %d clusters",
		N, nNonSkipped, nclusters)

	// merges holds all possible pairwise merge scores
	// We keep a slice containing all potential merges. A priority queue would be a
	// natural fit: the original LACHESIS implementation used a C++ multimap much
	// like one. However, the performance benefit is not obvious here since every
	// round we also need to update all merges (remove stale merges and insert
	// new ones); see the mergeHeap sketch after this function
	merges := make([]*merge, 0)
	for i := 0; i < N; i++ {
		if r.contigs[i].skip {
			continue
		}
		for j := i + 1; j < N; j++ {
			if !r.contigs[j].skip && G[i][j] > MinAvgLinkage {
				merges = append(merges, &merge{
					a:     i,
					b:     j,
					score: float64(G[i][j]),
				})
			}
		}
	}

	nMerges := 0
	// The core hierarchical clustering
	for {
		if len(merges) == 0 {
			log.Notice("No more merges to do since the queue is empty")
			break
		}

		// Step 1. Find the pair of clusters with the highest merge score
		bestMerge := merges[0]
		for _, merge := range merges {
			if merge.score > bestMerge.score {
				bestMerge = merge
			}
		}

		// Step 2. Merge the cluster pair
		newClusterID := N + nMerges
		clusterExists[bestMerge.a] = false
		clusterExists[bestMerge.b] = false
		clusterExists[newClusterID] = true
		clusterSize[newClusterID] = clusterSize[bestMerge.a] + clusterSize[bestMerge.b]

		// IDs below N are singletons; each merge of two clusters reduces the
		// total cluster count by one
		if bestMerge.a < N {
			nonSingletonClusters++
		}
		if bestMerge.b < N {
			nonSingletonClusters++
		}
		nonSingletonClusters--

		var newCluster []int
		for i := 0; i < N; i++ {
			if clusterID[i] == bestMerge.a || clusterID[i] == bestMerge.b {
				clusterID[i] = newClusterID
				newCluster = append(newCluster, i)
			}
		}
		nMerges++

		// Step 3. Calculate new score entries for the new cluster
		// Remove all merges that reference the two retired clusters
		newMerges := make([]*merge, 0)
		for _, merge := range merges {
			if clusterExists[merge.a] && clusterExists[merge.b] {
				newMerges = append(newMerges, merge)
			}
		}

		// Add all merges with the new cluster
		totalLinkageByCluster := make([]int64, 2*N)
		for i := 0; i < N; i++ {
			cID := clusterID[i]
			if cID == newClusterID { // No need to calculate linkages within the cluster
				continue
			}
			if cID == -1 { // This happens if the contig is skipped
				continue
			}
			for _, j := range newCluster {
				totalLinkageByCluster[cID] += G[i][j]
			}
		}

		for i := 0; i < 2*N; i++ {
			if totalLinkageByCluster[i] <= 0 {
				continue
			}
			if !clusterExists[i] {
				log.Errorf("Cluster %d does not exist", i)
			}
			// Average linkage: total links divided by the sizes of both clusters
			avgLinkage := float64(totalLinkageByCluster[i]) / float64(clusterSize[i]) /
				float64(clusterSize[newClusterID])
			if avgLinkage < MinAvgLinkage {
				continue
			}
			p := &merge{
				a:     min(i, newClusterID),
				b:     max(i, newClusterID),
				score: avgLinkage,
			}
			newMerges = append(newMerges, p)
		}

		// Analyze the current clusters once enough merges have occurred
		if nMerges > nNonSkipped/2 && nonSingletonClusters <= nclusters {
			if nonSingletonClusters == nclusters {
				log.Noticef("%d merges made so far; this leaves %d clusters, and so we are done!",
					nMerges, nonSingletonClusters)
				break
			}
		}
		if nMerges%50 == 0 {
			log.Noticef("Merge #%d: Clusters\t%d + %d -> %d, Linkage = %g",
				nMerges, bestMerge.a, bestMerge.b, newClusterID, bestMerge.score)
		}
		merges = newMerges
	}
	r.setClusters(clusterID)
}
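
// The note in Cluster() above mentions that a priority queue could replace the
// linear scan for the best merge. The sketch below shows one possible shape for
// that idea; it is illustrative only and is not wired into Cluster(), and the
// mergeHeap name is ours, not from LACHESIS. With these five methods defined,
// mergeHeap satisfies heap.Interface from the standard container/heap package,
// so heap.Init(&h) followed by heap.Pop(&h) would yield the highest-scoring
// merge in O(log n) per extraction instead of O(n).
type mergeHeap []*merge

func (h mergeHeap) Len() int           { return len(h) }
func (h mergeHeap) Less(i, j int) bool { return h[i].score > h[j].score } // max-heap on score
func (h mergeHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(*merge)) }

func (h *mergeHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}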

// setClusters assigns contigs into clusters per clusterID
// When there are contigs skipped (either SHORT or REPETITIVE), we assign the skipped contigs based
// on how well they match the non-skipped contigs. Each skipped contig needs to link to a cluster with
// at least NonInformativeRatio times as many links as to any other cluster.
func (r *Partitioner) setClusters(clusterID []int) {
	clusters := Clusters{}
	for i, cID := range clusterID {
		// i == cID means the contig was never merged (still a singleton);
		// cID == -1 means the contig was skipped
		if i == cID || cID == -1 {
			continue
		}
		clusters[cID] = append(clusters[cID], i)
	}
	r.clusters = clusters

	if !(r.NonInformativeRatio == 0 || r.NonInformativeRatio > 1) {
		log.Errorf("NonInformativeRatio needs to be either 0 or > 1")
	}

	// Now try to recover previously skipped contigs
	if r.NonInformativeRatio == 0 {
		r.sortClusters()
		return
	}

	N := len(r.contigs)
	nPassRatio := 0
	nFailRatio := 0
	nFailCluster := 0
	skippedClusters := map[int]int{}

	// NonInformativeRatio > 1
	// Loop through all skipped contigs and determine the cluster with the largest average linkage
	for i := 0; i < N; i++ {
		if clusterID[i] != -1 {
			continue
		}
		linkages := r.findClusterLinkage(i)
		if len(linkages) == 0 { // Did not cluster with any
			nFailCluster++
			continue
		}
		sort.Slice(linkages, func(i, j int) bool {
			return linkages[i].avgLinkage > linkages[j].avgLinkage
		})
		// The best cluster must reach the ratio itself and beat the runner-up
		// by the same factor (restated in passesNonInformativeRatio below)
		passRatio := linkages[0].avgLinkage >= float64(r.NonInformativeRatio) &&
			(len(linkages) == 1 || linkages[1].avgLinkage == 0 ||
				linkages[0].avgLinkage/linkages[1].avgLinkage >= float64(r.NonInformativeRatio))
		if !passRatio {
			nFailRatio++
			continue
		}
		skippedClusters[i] = linkages[0].cID
		nPassRatio++
	}
	log.Noticef("setClusters summary (NonInformativeRatio = %d): nPassRatio = %d, nFailRatio = %d, nFailCluster = %d",
		r.NonInformativeRatio, nPassRatio, nFailRatio, nFailCluster)

	// Insert the skipped contigs into clusters
	for contigID, cID := range skippedClusters {
		r.clusters[cID] = append(r.clusters[cID], contigID)
	}
	r.sortClusters()
}
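
// passesNonInformativeRatio restates the rescue test from setClusters in
// isolation, purely for clarity: the best average linkage must itself reach
// the ratio, and it must beat the runner-up by the same factor. This helper
// is a sketch of the logic above and is not called anywhere; it expects the
// linkages already sorted by decreasing avgLinkage.
func passesNonInformativeRatio(sorted []*linkage, ratio float64) bool {
	if len(sorted) == 0 {
		return false
	}
	if sorted[0].avgLinkage < ratio {
		return false
	}
	// A lone candidate, or a runner-up with zero linkage, passes trivially
	if len(sorted) == 1 || sorted[1].avgLinkage == 0 {
		return true
	}
	return sorted[0].avgLinkage/sorted[1].avgLinkage >= ratio
}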

// findClusterLinkage returns the average linkage between a contig and each current cluster
func (r *Partitioner) findClusterLinkage(contigID int) []*linkage {
	linkages := make([]*linkage, 0)
	for i, cl := range r.clusters {
		totalLinkage := int64(0)
		clusterSize := len(cl)
		// Calculate the average linkage between the contig and the cluster
		for _, id := range cl {
			if contigID == id { // Exclude the contig itself from its own linkage
				clusterSize--
			} else {
				totalLinkage += r.matrix[contigID][id]
			}
		}
		if totalLinkage > 0 {
			linkages = append(linkages, &linkage{
				avgLinkage: float64(totalLinkage) / float64(clusterSize),
				cID:        i,
			})
		}
	}
	return linkages
}
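
// For example, for a contig c and a cluster containing contigs {4, 7}, the
// self entry is excluded, so avgLinkage = (r.matrix[c][4] + r.matrix[c][7]) / 2.
// The contig IDs here are made up purely for illustration.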

// sortClusters reorders the clusters by total length, largest first
func (r *Partitioner) sortClusters() {
	clusterLens := make([]*clusterLen, 0)
	for cID, cl := range r.clusters {
		c := &clusterLen{
			cID:    cID,
			length: 0,
		}
		for _, ci := range cl {
			c.length += r.contigs[ci].length
		}
		clusterLens = append(clusterLens, c)
	}

	// Reorder the clusters based on the size
	sort.Slice(clusterLens, func(i, j int) bool {
		return clusterLens[i].length > clusterLens[j].length
	})

	// Renumber clusters 0..n-1 in order of decreasing total length
	newClusters := Clusters{}
	for i, cl := range clusterLens {
		newClusters[i] = r.clusters[cl.cID]
	}
	r.clusters = newClusters
}

// printClusters writes the contents of the clusters to a file
func (r *Partitioner) printClusters() {
	clusterfile := RemoveExt(RemoveExt(r.PairsFile)) + ".clusters.txt"
	f, _ := os.Create(clusterfile)
	w := bufio.NewWriter(f)
	_, _ = fmt.Fprintf(w, "#Group\tnContigs\tContigs\n")
	for j := 0; j < len(r.clusters); j++ {
		ids := r.clusters[j]
		names := make([]string, len(ids))
		for i, id := range ids {
			names[i] = r.contigs[id].name
		}
		sort.Strings(names)
		// Groups are named Kg1, Kg2, ...; e.g. 8g1 is the first group when K = 8
		_, _ = fmt.Fprintf(w, "%dg%d\t%d\t%s\n", r.K, j+1, len(names), strings.Join(names, " "))
	}
	_ = w.Flush()
	log.Noticef("Write %d partitions to `%s`", len(r.clusters), clusterfile)
	_ = f.Close()
}
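
// The resulting clusters file has one tab-separated line per group; with
// K = 8 it would look like the following (contig names are hypothetical):
//
//	#Group	nContigs	Contigs
//	8g1	3	tig001 tig005 tig009
//	8g2	2	tig002 tig004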