#Supplies functions for efficient data storage and processing
#Possibly unnecessary with heavy restructuring (i.e., if graph creation can be avoided)
library(dplyr)
library(data.table)
library(parallel) #Some thread-safety problems observed (see GAICRun)
#Creates a set of data.tables representing a graph of sequences, with the edges between those sequences weighted by TN93 distance.
#The time (and any additional variables) associated with each sequence is taken directly from the sequence header.
#The minimum retrospective edges from sequences at the newest time points are identified and left unfiltered.
impTN93 <- function(iFile, reVars="_", varInd=c(1, 2), addVarN=NA, dateFormat="%Y", partQ=0.95) {
#@param iFile: The name/path of the input file (expecting tn93 output csv)
#@param reVars: The regular expression used to extract variables from column headers. This is passed to strsplit, creating a vector of values from each column header
#@param varInd: A vector of indices describing the order of variables in the split string: the unique ID first, then the timepoint, then any additional variables (e.g. location)
# ex. If the header is split such that the 4th element is the unique ID, then 4 should be the first number in this vector
# ID and timepoint are currently required; additional variables are optional
#@param addVarN: The names of additional variables beyond the first two (ID and timepoint)
#@param dateFormat: The date format string passed to as.Date() when parsing timepoints (default "%Y")
#@param partQ: The quantile of collection times that defines "known" cases for the purposes of cluster growth. Cases beyond this quantile are marked as new
#@return: A list of two data.tables: an edge list (weighted by TN93 genetic distance) and a vertex list.
# The minimum retrospective edges of new cases are left unfiltered, for the future establishment of a timepoint-based model
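#Illustration (hypothetical header): with reVars="_", varInd=c(1, 2) and dateFormat="%Y",
# a column header such as "KU189999_2012" splits into ID "KU189999" and a 2012 timepoint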
#Obtain tn93 edge list from file
idt <- fread(iFile)
#Reformat edge list as data table object with predictors extracted from sequence header
splitHeaders <- sapply(c(idt$ID1, idt$ID2), function(x) ((strsplit(x,reVars)[[1]]))[varInd])
el <- data.table(ID1=as.character(splitHeaders[1,1:nrow(idt)]), t1=as.Date(splitHeaders[2,1:nrow(idt)], format=dateFormat),
ID2=as.character(splitHeaders[1,(nrow(idt)+1):(2*nrow(idt))]), t2=as.Date(splitHeaders[2,(nrow(idt)+1):(2*nrow(idt))], format=dateFormat),
Distance = as.numeric(idt$Distance), stringsAsFactors= F)
#Obtain the maximum time and time difference between the head and tail of each edge
el[,"tMax" := pmax(t1,t2)]
el[,"tDiff" := (t1-t2)]
#Obtain list of unique sequences (also data table)
vl <- data.table(ID = c(el$ID1, el$ID2), Time = c(el$t1, el$t2), stringsAsFactors=F)
#In the event of additional variables
if(!all(is.na(addVarN))){
#Add additional variables
for(i in seq_along(addVarN)) {
#Add variables named based on the character strings in addVarN
addVars <- splitHeaders[2+i,]
#To catch the event that the inputted variable is likely a date
#This is determined by whether over half of a sample of the input can be converted to a date
sampN <- min(100, length(addVars))
if(sum(sapply(sample(addVars, sampN, replace=F), function(x) {
o <- tryCatch({as.Date(x, format=dateFormat)},
error=function(cond){return(NA)})
return(is.na(o))
}))/sampN < 0.5) {
addVars <- as.Date(addVars, format=dateFormat)
} else {
#If not a date, type.convert() infers the variable type automatically (as.is=FALSE yields factors for strings)
addVars <- type.convert(addVars, as.is=FALSE)
}
#Obtain the type abbreviation from str() output; unlike typeof(), this distinguishes Date and Factor values
addVarT <- (strsplit(capture.output(str(addVars)), " |\\[")[[1]])[2]
#Append the variable to the edge list, with suffix 1 for the head and 2 for the tail of each edge
#How the variable is summarized below depends on its type (addVarT)
v1 <- (paste0(addVarN[i],"1"))
v2 <- (paste0(addVarN[i],"2"))
el[,(v1):= addVars[1:nrow(idt)]]
el[,(v2):= addVars[(nrow(idt)+1):(2*nrow(idt))]]
#Add this variable to the vertex list (c() preserves the Date class; factors combine cleanly in R >= 4.1)
vl[, (addVarN[i]) := c(el[[v1]], el[[v2]])]
#Calculate differences for edges if applicable
if(addVarT%in%c("num", "int", "Date")) {
el[,paste0(addVarN[i],"Diff") := abs(el[,..v1]-el[,..v2])]
}
#Calculate shared value if applicable
if(addVarT%in%c("Factor")) {
el[,paste0(addVarN[i],"Match") := F]
el[get(v1) == get(v2), paste0(addVarN[i],"Match") := T]
}
#Calculate logical notation if applicable
#This will be 1 if discordant, 0 if both FALSE, or 2 if both TRUE
if(addVarT%in%c("logi")) {
el[,paste0(addVarN[i],"Logic") := el[,..v1]+el[,..v2]]
}
}
}
#Collapse the vertex list to one row per unique sequence
vl <- vl[match(unique(vl$ID), vl$ID),]
#Vertices and edges lists together start a graph object
g <- list(v=vl[order(vl$Time),], e=el[order(el$tMax),])
g$e[, "I" := .I]
g$v[, "I" := .I]
#Set a "New Point", such that cases after this point are considered new for the purposes of growth
newPoint <- quantile(as.numeric(g$v$Time), partQ)
#Split into training and testing partitions
#Label new vertices as New; new edges are those that contain a new vertex
g$v[, "New" := F]
g$e[, "New" := F]
g$v[as.numeric(Time)>=newPoint, "New" := T]
g$e[as.numeric(tMax)>=newPoint, "New" := T]
#The Filtered flag marks edges excluded from clustering; initially, all edges are kept
#New edges are filtered below, except each new case's minimum retrospective edge; more edges may later be filtered by a distance cutoff
g$e[,"Filtered" := F]
#Obtain minimum retrospective edges for new cases
retE <- g$e[(New)&((as.numeric(t1)<newPoint)|(as.numeric(t2)<newPoint)),]
iMRE <- sapply(g$v[(New), (ID)], function(id){
iE <- retE[(ID1%in%id)|(ID2%in%id), ]
#Guard against new cases with no retrospective edges
if(nrow(iE)==0){return(NA_integer_)}
iE[which.min(iE$Distance)[[1]], I]
})
#Only the minimum retrospective edges for all new cases remain unfiltered
g$e[(New), "Filtered" := T]
g$e[na.omit(iMRE), "Filtered" := F]
return(g)
}
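#Example usage (a sketch; the file name and header layout are hypothetical):
#assumes TN93 output whose sequence headers look like "KU189999_2012_Toronto",
#i.e. ID, collection year and location separated by "_"
#g <- impTN93("tn93_output.csv", reVars="_", varInd=c(1, 2, 3), addVarN="Location")
#head(g$v); head(g$e)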
#Create clusters based on component clustering by some measure of genetic distance
#If a new case clusters with a known case, this is considered growth
compClu <- function(iG, maxD) {
#@param iG: The inputted graph. Expecting all vertices, but some edges filtered by distance.
#@param maxD: The maximum TN93 distance. Edges above this distance are filtered out of the graph
#@return: The inputted graph, annotated with a cluster size summary and case membership in the vertices section
#Because data.tables are modified by reference, copy() prevents the input graph from being altered
iG <- copy(iG)
#Filter out edges above the distance cutoff
iG$e[(Distance>maxD), "Filtered" := T]
#Simplify the list of unsorted vertices (just IDs) and edges (just head and tail IDs)
#These are separated by whether or not they are new
vid <- iG$v[!(New), (ID)]
adj <- as.matrix(iG$e[!(Filtered)&!(New),c("ID1","ID2")])
adjN <- as.matrix(iG$e[!(Filtered)&(New),c("ID1","ID2")])
#Initialize the first cluster name and a column for cluster membership.
iG$v$Cluster <- vector(mode="numeric", length=nrow(iG$v))
#The search vertex becomes the first member of the first cluster and is removed from the searchable set of vertices
ci <- 1
srchV <- vid[ci]
memV <- srchV
vid <- setdiff(vid, memV)
growth <- integer(0)
#Assigning Cluster Membership
repeat {
#Remove edges internal to search query and list outgoing edges
adj <- adj[!((adj[,"ID1"]%in%srchV) & (adj[,"ID2"]%in%srchV)),,drop=F]
exE <- adj[((adj[,"ID1"]%in%srchV) | (adj[,"ID2"]%in%srchV)),,drop=F]
#Find all neighboring vertices to the search vertex (or search vertices) through external edges
#These are then added to the list of member vertices and removed from the list of searchable vertices
nbV <- setdiff(c(exE[,"ID1"],exE[,"ID2"]), srchV)
memV <- c(memV, nbV)
vid <- setdiff(vid, nbV)
#If there are no more neighbours to the search vertices, the cluster is complete and we reset the search parameters
if (length(nbV)==0) {
#Update the growth of this cluster
memVN <- c(adjN[adjN[,"ID1"]%in%memV,"ID2"], adjN[adjN[,"ID2"]%in%memV,"ID1"])
growth <- c(growth, length(memVN))
#Update the vertices with their cluster membership
iG$v[(ID)%in%memV, "Cluster" := ci]
#The end condition, catching the event that there are no vertices to assign to clusters
if (length(vid)==0) {break}
#Reset search parameters
ci <- ci+1
srchV <- vid[1]
memV <- srchV
vid <- setdiff(vid, memV)
next
}
#Remove all edges within the current cluster from the adjacency list
adj <- adj[!((adj[,"ID1"]%in%srchV) | (adj[,"ID2"]%in%srchV)),,drop=F]
srchV <- nbV
}
#Summarize the clusters: their membership, their overall size among known cases ("Old"),
#and their connectivity to new cases ("New", i.e. growth)
iG$c <- list()
iG$c$Membership <- lapply(1:ci, function(x){iG$v[(Cluster)==x, (ID)]})
iG$c$Info <- data.table(Old = as.numeric(table(iG$v[!(New), (Cluster)])), New = growth)
return(iG)
}
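#Example usage (a sketch, assuming g was built by impTN93 above):
#g15 <- compClu(g, maxD=0.015) #0.015 is an arbitrary illustrative cutoff
#g15$c$Info #One row per cluster: Old (known size) and New (growth)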
#Run across a set of subgraphs created at various distance cutoffs, clustering with compClu and analyzing GAIC at each
##- TO-DO: Monitor Thread-safety -##
GAICRun <- function(iG, maxDs=NA, runID=0, nCores=1, addVarInd=NA, plotGAIC=F, modFormula=(New~Old+Recency)) {
#@param iG: Expecting the entire Graph, but in some cases may take a subset
#@param maxDs: A list of cutoff thresholds
#@param modFormula: The predictive model formula. This may be changed with additional variables
# Recency is always calculated for all clusters.
#@param runID: An identifier to stash this particular run and compare it to others
#@param nCores: The number of cores for parallel functionality. -CURRENTLY SOMETIMES ERROR-PRONE
#@param plotGAIC: If TRUE, plots a visual of the key result (GAIC)
#@param addVarInd: The indices of additional variables to be used
#@return: A data table of each run's cluster information:
# Both null and proposed model AIC values, as well as the AIC loss ($nullAIC, $modAIC and $GAIC)
# The max size, average size and number of singletons ($SizeMax, $MeanSize and $Singletons)
# The total growth, largest growth and number of growing clusters ($GrowthTot, $GrowthMax, and $nGrowing)
# The ID of the largest cluster and the cluster with the highest growth ($SizeMaxID and $GrowthMaxID)
# The effect ratio of mean recency in growing clusters over mean recency in non-growing clusters ($xMag)
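#Illustration (hypothetical): with an additional variable "Location" imported via addVarN,
#a richer model could be specified as modFormula=(New~Old+Recency+Location)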
#Initialize a set of cutoffs to observe (based on the genetic distance distribution)
if (all(is.na(maxDs))) {
steps <- head(hist(subset(iG$e, Distance<0.05)$Distance, plot=FALSE)$breaks, -5)
maxDs <- seq(0, max(steps), max(steps)/50)
}
#Initialize additional variables as the complete set of possible additional vars
addVarN <- setdiff(colnames(iG$v), c("ID", "Time", "I", "New", "Cluster"))
if (!all(is.na(addVarInd))) {
addVarN <- addVarN[addVarInd]
}
#This function runs several comparisons of a model weighted by predictors against a model without those variables
df <- mclapply(maxDs, function(d) {
#Obtain clusters
subG <- compClu(iG, d)
#Obtain recency (the mean recency of member tips' collection dates, relative to the earliest date in the data) for each cluster
subG$c$Info[, "Recency" := sapply(subG$c$Membership, function(x) {
mean(as.numeric(subG$v[ID%in%x, (Time)]) - min(as.numeric(subG$v[,(Time)])))
})]
#Loop through additional variables
for(nm in addVarN){
#For categorical data, a string is returned and factorized
if(typeof(subG$v[[nm]]) %in% c("integer", "character")) {
#The most common value is used to represent the value of a given cluster
subG$c$Info[, (nm) := as.factor(sapply(subG$c$Membership, function(x) {
t <- table(subG$v[(ID)%in%x, get(nm)])
names(t[which.max(t)])
}))]
}
#For numeric data, the average across cluster members is returned
## - UNTESTED - ##
if(typeof(subG$v[[nm]]) %in% c("double")) {
subG$c$Info[, (nm) := sapply(subG$c$Membership, function(x) {mean(subG$v[(ID)%in%x, get(nm)])})]
}
}
#Compare a model including the predictor variables (fit1) to a null model of cluster size alone (fit2)
fit1 <- glm(formula = modFormula, data = subG$c$Info, family = "poisson")
fit2 <- glm(formula = New~Old, data = subG$c$Info, family = "poisson")
#GAIC is the difference between the AIC of two models
#Put another way, this is the AIC loss associated with predictive variables
#Other descriptive data characteristics
dfi <- data.frame(modAIC=fit1$aic, nullAIC=fit2$aic, GAIC=(fit1$aic-fit2$aic),
GrowthTot=sum(subG$c$Info[,(New)]), Singletons=nrow(subG$c$Info[(Old)==1,]), MeanSize=mean(subG$c$Info[,(Old)]),
GrowthMax=max(subG$c$Info[,(New)])[[1]], GrowthMaxID=which.max(subG$c$Info[,(New)]), nGrowing=nrow(subG$c$Info[(New)>0,]),
SizeMax=max(subG$c$Info[,(Old)]), SizeMaxID=which.max(subG$c$Info[,(Old)]))
#To track the potentially many variable effect sizes
for(cf in names(fit1$coefficients)){
dfi[,paste0(cf, "_Coefficient")]=fit1$coefficients[cf]
}
return(dfi)
}, mc.cores=nCores)
#Convert output to data table and add reference labels based on the run information
dt <- as.data.table(bind_rows(df))
dt[,"RunID" := runID]
dt[,"MaxDistance" := maxDs]
#For quick visual feedback
if(plotGAIC) {
plot(dt$MaxDistance, dt$GAIC, xlab="Threshold", ylab="AIC Loss")
lines(dt$MaxDistance, dt$GAIC)
abline(h=0, lty=2)
abline(v=dt$MaxDistance[which.max(abs(dt$GAIC))], lty=2)
}
return(dt)
}
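#Example usage (a sketch, assuming g was built by impTN93 above):
#res <- GAICRun(g, runID=1, plotGAIC=TRUE)
#res[which.max(abs(GAIC)), MaxDistance] #Cutoff yielding the largest AIC loss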
#Obtain several run results for a much larger set of sub-sampled graphs
multiGAICRun <- function(iG, n, maxDs=NA, prop=0.80, modFormula=(New~Old+Recency)) {
#@param iG: Expecting the entire Graph
#@param maxDs: A list of cutoff thresholds
#@param modFormula: The predictive model formula. This may be changed with additional variables
# Recency is always calculated for all clusters.
#@param n: The number of subsample runs to perform
#@param prop: The proportion of the data set sampled for each sub-sample.
#@return: A data table of multiple runs worth of cluster information.
# Each run will be labelled with a particular run ID
#Initialize a set of cutoffs to observe (based on the genetic distance distribution)
if (all(is.na(maxDs))) {
steps <- head(hist(subset(iG$e, Distance<0.05)$Distance, plot=FALSE)$breaks, -5)
maxDs <- seq(0, max(steps), max(steps)/50)
}
#Run n different runs, each labelled with their own runID
dt <- bind_rows(lapply(1:n, function(i){
#Copy a sampled instance of the input graph
sampG <- copy(iG)
#Sample a proportion of the old IDs; all new IDs are retained
sampIDs <- c(sample(iG$v[!(New),(ID)], round(prop*nrow(iG$v[!(New)]))),
iG$v[(New), (ID)])
sampG$v <- iG$v[(ID)%in%sampIDs,]
sampG$e <- iG$e[((ID1)%in%sampIDs)&((ID2)%in%sampIDs),]
print(i) #Progress feedback
GAICRun(sampG, maxDs, runID=i, modFormula=modFormula)
}))
return(dt)
}
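#Example usage (a sketch, assuming g was built by impTN93 above):
#mres <- multiGAICRun(g, n=10, prop=0.80) #10 subsamples of 80% of known cases
#aggregate(GAIC~MaxDistance, data=mres, FUN=mean) #Mean GAIC per cutoff across runs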