-
Notifications
You must be signed in to change notification settings - Fork 15
/
util-networks-misc.R
277 lines (225 loc) · 11.9 KB
/
util-networks-misc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
## This file is part of coronet, which is free software: you
## can redistribute it and/or modify it under the terms of the GNU General
## Public License as published by the Free Software Foundation, version 2.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
##
## Copyright 2016-2017 by Sofie Kemper <[email protected]>
## Copyright 2016-2017 by Claus Hunsen <[email protected]>
## Copyright 2016-2018 by Thomas Bock <[email protected]>
## Copyright 2020, 2023-2024 by Thomas Bock <[email protected]>
## Copyright 2017 by Angelika Schmid <[email protected]>
## Copyright 2019 by Jakob Kronawitter <[email protected]>
## Copyright 2019-2020 by Anselm Fehnker <[email protected]>
## Copyright 2024 by Leo Sendelbach <[email protected]>
## All Rights Reserved.
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Libraries ---------------------------------------------------------------
requireNamespace("parallel") # for parallel computation
requireNamespace("igraph") # networks
requireNamespace("Matrix") # for sparse matrices
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Get active authors -----------------------------------------------------
#' Collect the names of all authors that occur as vertices in at least one of the given networks.
#'
#' @param networks the list of networks whose vertex names (i.e., author names) are collected
#' @param globally whether the names of all networks are merged into one single vector (\code{TRUE})
#'                 or kept separately for each network (\code{FALSE}) [default: TRUE]
#'
#' @return a list of sorted vectors of author names: a list holding exactly one duplicate-free vector
#'         if \code{globally} is set, and one vector per network otherwise
get.author.names.from.networks = function(networks, globally = TRUE) {

    ## collect the sorted vertex names of every single network
    names.per.network = lapply(networks, function(net) sort(igraph::V(net)$name))

    if (globally) {
        ## merge the per-network vectors into one vector and
        ## drop the element names inherited from the list of networks
        all.names = unname(unlist(names.per.network, recursive = FALSE))

        ## keep every author only once, order ascending, and wrap the vector in a list
        return(list(sort(unique(all.names))))
    }

    ## per-network result requested: the sorted vectors are returned as they are
    return(names.per.network)
}
#' Collect the names of all authors that are active in at least one of the given data sources,
#' for each of the given data ranges.
#'
#' @param data.ranges the list of the data ranges; each element has to provide the method
#'                    \code{get.authors.by.data.source}
#' @param data.sources the data sources from which the author names should be retrieved,
#'                     can be either \code{"commits"}, \code{"mails"}, or \code{"issues"},
#'                     or any combination of them [default: c("commits", "mails", "issues")]
#' @param globally whether the names of all ranges are merged into one single vector (\code{TRUE})
#'                 or kept separately for each range (\code{FALSE}) [default: TRUE]
#'
#' @return a list of sorted vectors of author names: a list holding exactly one duplicate-free vector
#'         if \code{globally} is set, and one vector per data range otherwise
get.author.names.from.data = function(data.ranges, data.sources = c("commits", "mails", "issues"), globally = TRUE) {

    data.sources = match.arg.or.default(data.sources, several.ok = TRUE)

    ## per range: the sorted names of the authors returned for the selected data sources
    names.per.range = lapply(data.ranges, function(range.data) {
        authors.of.range = range.data$get.authors.by.data.source(data.sources)
        return(sort(authors.of.range[["author.name"]]))
    })

    if (globally) {
        ## merge the per-range vectors into one vector and
        ## drop the element names inherited from the list of ranges
        all.names = unname(unlist(names.per.range, recursive = FALSE))

        ## keep every author only once, order ascending, and wrap the vector in a list
        return(list(sort(unique(all.names))))
    }

    ## per-range result requested: the sorted vectors are returned as they are
    return(names.per.range)
}
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Adjacency matrices ----------------------------------------------------
#' Get a sparse expanded adjacency matrix (in triplet format) for a given network.
#'
#' The adjacency matrix is expanded as it may contain rows and columns for authors which are not part of the network
#' but given in the \code{authors} parameter. However, this also means that authors present in the network
#' but not given in the \code{authors} parameter are not contained in the expanded adjacency matrix.
#' If authors of the network are left out for that reason, a warning stating their number is emitted.
#'
#' @param network the given network
#' @param authors all authors that are wanted in the adjacency matrix
#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
#'
#' @return the sparse adjacency matrix of the network, with one row and one column per entry of \code{authors}
get.expanded.adjacency = function(network, authors, weighted = FALSE) {

    ## create an empty sparse matrix using the triplet form with the right size.
    ## x = 0 indicates that the matrix should contain numeric values (i.e., it is a 'dgTMatrix';
    ## without setting x = 0 it would be a binary 'ngTMatrix')
    matrix = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors), length(authors)), repr = "T")

    ## add row and column names
    rownames(matrix) = authors
    colnames(matrix) = authors

    if (igraph::vcount(network) > 0) {

        if (weighted) {
            ## get the weighted adjacency matrix for the current network
            matrix.data = igraph::get.adjacency(network, attr = "weight")
        } else {
            ## get the unweighted sparse adjacency matrix for the current network
            matrix.data = igraph::get.adjacency(network)
        }

        network.authors = rownames(matrix.data)
        network.authors.num = nrow(matrix.data)

        ## determine the positions (with respect to the unfiltered adjacency matrix) of all authors that
        ## are wanted, and bring these positions into the alphabetical order of the author names.
        ## The order has to be applied to the positions and not to the filtered names, as the filtered
        ## names are indexed differently than the matrix as soon as an author is filtered out.
        wanted = which(network.authors %in% authors)
        wanted = wanted[order(network.authors[wanted])]

        ## 'drop = FALSE' keeps the matrix structure even if only one or no author remains
        matrix.data = matrix.data[wanted, wanted, drop = FALSE]

        if (network.authors.num > nrow(matrix.data)) {
            ## write a warning with the number of authors from the network that we ignore
            warning.string = sprintf("The network had %d authors that will not be displayed in the matrix!",
                                     network.authors.num - nrow(matrix.data))
            warning(warning.string)
        }

        ## save the activity data per author
        if (nrow(matrix.data) > 0) {
            matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data
        }

        ## in the unweighted case, every positive entry just denotes the existence of an edge
        if (!weighted) {
            matrix[matrix > 0] = 1
        }
    }

    return(matrix)
}
#' Compute one sparse adjacency matrix per network of the given list.
#' Every matrix is expanded to the very same set of authors, namely to all authors
#' that occur in at least one of the networks of the list.
#'
#' @param networks list of networks
#' @param weighted decides if the adjacency matrices shall be weighted [default: FALSE]
#'
#' @return the list of expanded adjacency matrices, one per network
get.expanded.adjacency.matrices = function(networks, weighted = FALSE){

    ## the common author set is the single vector obtained from the global name lookup
    all.authors = get.author.names.from.networks(networks)[[1]]

    ## expand each network to the common author set
    expanded.matrices = parallel::mclapply(networks, get.expanded.adjacency,
                                           authors = all.authors, weighted = weighted)

    return(expanded.matrices)
}
#' Gets a list of networks, converts them to sparse adjacency matrices, and sums up the adjacency matrices cumulatively.
#' This means that the first entry of the returned list is just the adjacency matrix from the first network,
#' the second entry is the sum of the first and the second entry, and so on.
#'
#' In the unweighted case, all stored entries of a cumulated matrix are set to 1, i.e., a cumulated matrix only
#' states whether there has been an edge in any of the networks up to the respective position.
#'
#' @param networks list of networks
#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
#'
#' @return the list of cumulated adjacency matrices (an empty list if \code{networks} is empty)
get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) {

    ## get expanded adjacency matrices first
    matrices = get.expanded.adjacency.matrices(networks, weighted)

    ## without any matrix, there is nothing to cumulate
    if (length(matrices) < 1) {
        return(list())
    }

    ## pair-wise sum of matrices: m.cumul(m) = m.cumul(m - 1) + matrices(m)
    ## (intermediate results consecutively stored in matrices.cumulated)
    matrices.cumulated = vector("list", length(matrices))
    matrices.cumulated[[1]] = matrices[[1]] # first one is complete already

    for (m in seq_along(matrices)[-1]) {

        cumulated = matrices.cumulated[[m - 1]] + matrices[[m]]
        rownames(cumulated) = rownames(matrices.cumulated[[m - 1]])
        colnames(cumulated) = colnames(matrices.cumulated[[m - 1]])

        if (!weighted) {
            ## A matrix in triplet form may store several triplets for one and the same cell (their values
            ## add up to the value of the cell), which can be the case for the sum computed above.
            ## Hence, the values must not be overwritten before such triplets are merged, as otherwise a cell
            ## stored twice would end up with the value 2. Converting the matrix to the column-compressed
            ## form and back to the triplet form merges them, so that there is exactly one stored value
            ## per non-zero cell, which can be set to 1 safely afterwards.
            cumulated = methods::as(methods::as(cumulated, "CsparseMatrix"), "TsparseMatrix")
            cumulated@x = rep(1, length(cumulated@x))
        }

        matrices.cumulated[[m]] = cumulated
    }

    return(matrices.cumulated)
}
#' Converts a list of adjacency matrices to an array.
#' Expects matrices of equal dimension with equal column- and rownames.
#'
#' An error is logged and raised if the list is empty or if the row or column names
#' of any matrix differ from the ones of the first matrix.
#'
#' @param adjacency.list the list of adjacency matrices
#'
#' @return the converted 3-dimensional array: rows and columns (including their names) are taken from the
#'         matrices, and the third dimension is the position of the matrix in \code{adjacency.list}
convert.adjacency.matrix.list.to.array = function(adjacency.list){

    if (length(adjacency.list) < 1) {
        error.string = "The method 'convert.adjacency.matrix.list.to.array' received an empty list!"
        logging::logerror(error.string)
        stop(error.string)
    }

    ## the first matrix serves as reference for the names and the dimension of all matrices
    first.matrix = adjacency.list[[1]]
    expected.rownames = rownames(first.matrix)
    expected.colnames = colnames(first.matrix)

    ## Check if all matrices have equal column- and rownames
    for (i in seq_along(adjacency.list)[-1]) {
        if (!identical(expected.rownames, rownames(adjacency.list[[i]])) ||
            !identical(expected.colnames, colnames(adjacency.list[[i]]))) {
            error.string = sprintf("The matrix at position %d has different col or rownames from the first!", i)
            logging::logerror(error.string)
            stop(error.string)
        }
    }

    ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result
    result = array(data = 0, dim = c(nrow(first.matrix), ncol(first.matrix), length(adjacency.list)))
    rownames(result) = expected.rownames
    colnames(result) = expected.colnames

    ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
    for (i in seq_along(adjacency.list)) {
        adjacency = adjacency.list[[i]]

        ## (row, column) positions of all non-zero entries, one position per row of 'activity.indices'
        activity.indices = Matrix::which(adjacency != 0, arr.ind = TRUE)

        ## copy all non-zero entries of this matrix at once: the positions are extended
        ## by the index of the slice to address the corresponding cells of the array
        if (nrow(activity.indices) > 0) {
            result[cbind(activity.indices, i)] = adjacency[activity.indices]
        }
    }

    return(result)
}