-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel selection.R
129 lines (110 loc) · 4.21 KB
/
model selection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# model selection performance with sequential test
library(nett)
library(EnvStats)
library(ggplot2)
library(Matrix)
library(dplyr)
library(parallel)
library(foreach)
library(randnet)
source('bootstrap debiasing.R')
# parameters set up -------------------------------------------------------
n <- 5000 # number of nodes
Ktru <- 4 # number of communities
Kmax <- 6 # maximum community number tested
Pi <- rep(1,Ktru)/Ktru # distribution of cluster labels
#Pi <- c(1,1,2,3)/7
beta <- 0.2 # out-in-ratio for planted partition model
lamax = 25
# average degree
lambdas <- round(10^seq(log10(3),log10(lamax), length.out = 10), 2)
nlam = length(lambdas)
# connectivity matrix
# B1
Bs <- lapply(lambdas, function(lambda) pp_conn(n, beta, lambda, Pi)$B )
# B2
# gamma <- 0.3
# B3, different numbers on the diagonal
# Bs <- lapply(lambdas, function(lambda) pp_conn(n, beta, lambda, Pi, d = c(1,2,3,1))$B)
# simulation --------------------------------------------------------------
bootrep <- 10 # bootstrap debaising repetition
alpha <- 1e-6 # significance level
nitr <- 200 # number of simulated adjacency matrix
runs <- expand.grid(lam_idx = 1:nlam, itr = 1:nitr)
simulate_run <- function(j) {
set.seed(j)
idx <- runs[j,"lam_idx"]
itr <- runs[j,"itr"]
lambda <- lambdas[idx]
# B1 or B3
B <- Bs[[idx]]
# B2
# B = gen_rand_conn(n, Ktru, lambda, gamma)
theta <- rpareto(n, 3/4, 4)
z <- sample(Ktru, n, replace=T, prob=Pi)
A <- nett::sample_dcsbm(z, B, theta = theta)
# compute all the estimated labels first
labels = sapply(1:(Kmax+1), function(k) nett::spec_clust(A, k))
nac_results <- model_select(A, labels, alpha=alpha, bootrep=bootrep, Kmax=Kmax) # NAC methods
# all statistic
res = tidyr::tribble(
~method, ~Kest,
"SNAC+", nac_results$snac,
"SNAC+ boot", nac_results$snac_boot,
"NAC+ boot", nac_results$nac_boot,
"AS boot", nac_results$as_boot,
"BIC", nac_results$LRBIC,
"BH",bethe_hessian_select(A, Kmax)$K, # BHMC
"NCV", which.min(NCV.select(A,Kmax)$dc.l2), # network cv
"ECV", which.min(ECV.block(A, Kmax)$dc.l2) # edge cv
)
res$lambda = lambda
res$Ktru = Ktru
res$iter = itr
res$nmi = compute_mutual_info(z, spec_clust(A, Ktru, tau=0.25))
return(res)
}
CPU_CORES_TO_USE <- min(30, detectCores()-1)
printf('Using %d cores ...', CPU_CORES_TO_USE)
elapsed_time <- system.time(
result <- do.call(rbind, mclapply(1:nrow(runs), simulate_run, mc.cores = CPU_CORES_TO_USE))
)["elapsed"]
printf('Total time = %3.1f (s)\nTime per iter = %3.1f (s)', elapsed_time, elapsed_time/nrow(runs))
# visualization -----------------------------------------------------------
nmi <- result %>%
group_by(lambda) %>%
summarize(nmi = mean(nmi))
acc_res <- result %>%
group_by(method,lambda) %>%
summarize(acc = mean(Kest==Ktru), acc_sd = sd(Kest==Ktru)/sqrt(nitr))
acc_res$method <- factor(acc_res$method,
levels = c("NAC+ boot","SNAC+ boot", "AS boot", "SNAC+", "BIC", "BH", "ECV", "NCV"
))
acc_res$lambda <- as.numeric(acc_res$lambda)
## all methods
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
as_tibble(acc_res) %>%
filter(!grepl("th",method)) %>%
ggplot(aes(x=lambda, y=acc, col=method)) +
geom_line(aes(linetype=method), size=2) +
scale_linetype_manual(values=c(1,2,5,1,4,6,6,6))+
scale_colour_manual(values=cbbPalette)+
theme_bw()+
geom_point(aes(shape=method), size = 5)+
scale_shape_manual(
values = c(15, 17, 23, 13, 19, 25, 18, 16)
)+
theme(text = element_text(size=20))+
# geom_errorbar(aes(ymin=acc-acc_sd, ymax=acc+acc_sd), width=.2)+
ylab("Expected Accuracy") + xlab("Average Degree")+
theme(legend.background=element_blank(),
legend.title=element_blank(),
legend.position = c(0.23, 0.8),
legend.text = element_text(size=25),
text = element_text(size=26),
legend.key.width = unit(4, "line")) +
guides(fill=guide_legend(keywidth=0.25,keyheight=0.25,default.unit="inch"))
# save results
# base_fname = gsub("\\.","p", sprintf("seq_BerDCSBM_Ktru%d_Kmax%d_n%d_beta%2.2f_m%d", Ktru, Kmax, n, beta, nitr))
# ggsave(paste0(base_fname,".pdf"))
# saveRDS(result, paste0("result_", base_fname,".rds"))