-
Notifications
You must be signed in to change notification settings - Fork 0
/
powerEs.R
278 lines (217 loc) · 9.11 KB
/
powerEs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
###
### Setup, functions for simulations for Roettger et al. data
###
### M. Sonderegger & J. Kirby
## last update: 3/2018
##
## Requires: data file, called roettgerEtAlData.csv
##
## To run:
## - Source file, and execute simulations using the appropriate function call -- as in runSimsJphon.R.
##
## Note that this code:
## - Does not implement variation in size of between-subjects, between items, observation-level variability
## - Is somewhat specific to the Roettger et al. 2014 data case, but should be adaptable for other cases.
##
library(lme4)
library(arm)
library(plyr)
library(dplyr)
library(ggplot2)
library(simr)
## name of effect of interest
## 'voiceless' for Roettger et al. data
effectOfInterest <- 'voiceless'
## names of factors in the dataset
factorVars <- c("place", "vowel", "accent_type", "prosodic_boundary")
####
## 1. PRELIMINAIRES
####
## load data
E1 = read.csv("roettgerEtAlData.csv",comment.char="")
## REMOVE THIS ROW, there are NAs for variables
E1 <- E1[-481,]
## turn items, subjects into factors
E1$item_pair = as.factor(E1$item_pair)
E1$subject = as.factor(E1$subject)
## MS: centered voicing var
E1$voiceless <- rescale(E1$voicing)
## WHATEVER IS DONE ABOVE, name dataset 'data'
data <- E1
## fit original model
E1.mdl = lmer(vowel_dur ~ voiceless + # critical fixed effect
accent_type + prosodic_boundary + # prosodic control variables
place + vowel + # phonological control variables
norming_voiceless_count + # norming
(1+voiceless|subject) + (1+voiceless|item_pair),
data=E1)
## THIS IS THE NAME OF THE MODEL TO BE USED THROUGHOUT
origMod <- E1.mdl
####
## 2. PROPERTIES OF THE DATASET AND ORIGINAL FITTED MODEL
####
## number of subjects, items, observations
nSubject <- nrow(ranef(origMod)$subject)
nItem <- nrow(ranef(origMod)$item_pair)
n <- nrow(model.frame(origMod))
## original effect size
origEffectSize <- fixef(origMod)[effectOfInterest][[1]]
origNRep <- 1
#####
### 3. FUNCTIONS FOR POWER SIMULATIONS
#####
## Function to prep a new dataset, with smaller/larger sample size
## and changed effect size
##
## mod: original model, which will be re-run for new dataset and
## with new effect size in your simulation
## nS, nI, nRep: sample size params for new dataset
## beta: effect size you want to get power for, for effect of interest
sampleNewDataset <- function(mod, nS = nSubject, nI = nItem, nRep = origNRep,
beta = origEffectSize){
## note for understanding this code: for the purposes of input to simr functions,
## to change the dataset underlying a model, we change the result of calling
## getData on the model.
newMod <- mod
## this is the dataframe used to fit the original model
data <- model.frame(newMod)
## this changes effect size of effect we're interested in.
fixef(newMod)[[effectOfInterest]] <- beta
##
## begin process of changing sample size for subjects and items
##
## If we want to do fewer subjects or items than in the original dataset,
## we need to choose a subset of subjects/items s.t. we don't fall into one of
## two traps:
## 1. not having all values of factors present (that were in original dataset)
## 2. having a model matrix that's not full rank
## these vars flag if we will need to get a subset of subjects or items
chooseSubjects <- ifelse(nS < nSubject, TRUE, FALSE)
chooseItems <- ifelse(nI < nItem, TRUE, FALSE)
## keep choosing a subset of items until we get one that does *not* have just a
## single value for item-level variables (this gives an error about contrasts not
## being able to be applied to factors with 1 level), and also
## doesn't get the model matrix downsized (excluding a fixed-effect term)
## because of something like choosing items s.t. place can be in part predicted
## based on vowel.
selected <- FALSE
while(!selected){
## choose subjects
if(chooseSubjects){
newSubjects <- sample(levels(data$subject), nS, replace = FALSE)
} else{
newSubjects <- levels(data$subject)
}
## choose items
if(chooseItems){
newItems <- sample(levels(data$item_pair), nI, replace = FALSE)
} else{
newItems <- levels(data$item_pair)
}
## check that this isn't a subset where there is only one value for item-level variables
selected = TRUE
temp <- subset(data, subject%in%newSubjects & item_pair %in% newItems)
for(var in factorVars){
if(length(unique(temp[[var]]))==1){
selected = FALSE
cat("issue with var", var, "\n")
}
}
## provided we haven't already ruled out this subset of subjects/items,
##
## have to check if the resulting model has fewer fixed effects
## than original model, which can happen due to data not being
## fully crossed
if(selected && (chooseSubjects || chooseItems)){
tempMod <- lmer(formula(mod), temp)
if(length(fixef(tempMod))<length(fixef(mod))){
cat("fewer fixed effects",'\n')
selected = FALSE
}
}
}
## these are messages you should see a lot for small enough subject or item numbers
if(chooseSubjects){cat("chose working subset of subjects",'\n')}
if(chooseItems){cat("chose working subset of items",'\n')}
## subset the data to just the subset of subjects and items chosen, and
## make this the dataframe returned by the simr function getData
data <- filter(data, subject%in%newSubjects & item_pair %in% newItems)
getData(newMod) <- data
## if we want a *larger* sample size for subjects or items than in original data, just
## use simR extension method (which just repeats items/subjects as necessary)
if(nS>nSubject){
newMod <- extend(newMod, along='subject', n=nS)
}
if(nI>nItem){
newMod <- extend(newMod, along='item_pair', n=nI)
}
## dealing with changing nRep is easy, and we
## only have to worry about the case where nRep > origNRep, because the
## latter is 1.
##
## NB: not clear if this will work if for data (not Roettger et al.) where in the original
## data nRep>1
if(nRep!=origNRep){
newMod <- extend(newMod, within='subject+item_pair', n=nRep)
}
return(newMod)
}
## Run
## Function to run a simulation for one set of parameter values
##
## parameters::
## nS = # subjects
## nI = # items (= pairs of voiced/voiceless words)
## nRep = # repetitions (per subject:item)
## beta: effect size (for R et al data: difference between voiced and voiceless)
##
runSim <- function(mod, nS = nSubject, nI = nItem, nRep = origNRep,
beta = origEffectSize, nSims=nSims){
results <- data.frame()
## changes effect size (beta) if applicable, and
## changes nS, nI, or nRep.
newMod <- sampleNewDataset(mod, nS=nS, nI=nI, nRep=nRep,beta=beta)
## dummy function to fit new models and put info summarizing them in dataframe
tempFun <- function(dummy){
## simr function: simulate the response variable for this new dataset, once
y <- doSim(newMod)
## simr function: refit the model using the new dataset with new response variable
z <- doFit(y, newMod)
## get the coeff value of interest in simulated model
newBeta <- fixef(z)[effectOfInterest]
## p value for z test of effect of interest
lrTestP <- as.numeric(doTest(z, fixed(effectOfInterest, 'lr')))
lrTestModSelect=ifelse(lrTestP<0.05, 'yes', 'no')
## dataframe just giving beta value, p value, and yes/no for 'is this term sig?'
results <- data.frame(beta=newBeta,
lrTestP = lrTestP,
## asks: " ", based on LR test?
lrTestModSelect = lrTestModSelect)
}
## do this nSims time, in parallel
results <- ldply(seq(1,nSims), tempFun, .parallel = useParallel)
## now add in parameter values for this run
results <- data.frame(results, nS=nS, nI=nI, nRep=nRep, n=nrow(getData(newMod)),trueBeta=beta, nSims=nSims)
return(results)
}
runSweep <- function(mod, betaVals, nSVals, nIVals, nRepVals, nSims, saveFName){
sweepRuns <- data.frame()
ptm <- proc.time()
for(beta in betaVals){
cat("beta =", beta, "\n")
for(nS in nSVals){
cat("nS =", nS, "\n")
for(nI in nIVals){
cat("nI =", nI, "\n")
for(nRep in nRepVals){
cat("nRep =", nRep, "\n")
sweepRuns <- rbind(sweepRuns, runSim(origMod, nRep=nRep, nS=nS, nI=nI, beta=beta, nSims=nSims))
}
}
}
}
t <- (proc.time() - ptm)
cat(t[3])
saveRDS(sweepRuns, file= saveFName)
return(sweepRuns)
}