-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathae_5_matching.R
323 lines (290 loc) · 13.1 KB
/
ae_5_matching.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
library(tidyverse)
library(foreach)
library(optmatch)
library(lubridate)
library(biglm)
library(tictoc)
options("optmatch_max_problem_size"=Inf)
MAX_TREATMENT <- 1000
CONTROL_MULTIPLIER <- 50
# Function to allow rbinding dataframes with foreach even when some dataframes
# may not have any rows
foreach_rbind <- function(d1, d2) {
if (is.null(d1) & is.null(d2)) {
return(NULL)
} else if (!is.null(d1) & is.null(d2)) {
return(d1)
} else if (is.null(d1) & !is.null(d2)) {
return(d2)
} else {
return(bind_rows(d1, d2))
}
}
# Basic function to extract variable names from a formula object
get_names <- function(f) {
f <- paste0(as.character(f), collapse=' ')
v <- strsplit(f, split='[+ ~]')[[1]]
v <- v[v != '']
gsub('strata\\(([a-zA-Z_]*)\\)', '\\1', v)
}
get_matches <- function(d, dists) {
# If the controls are too far from the treatments (due to a caliper) then
# the matching may fail. Can test for this by seeing if subdim runs
# successfully
subdim_works <- tryCatch(is.data.frame(subdim(dists)),
error=function(e) return(FALSE))
if (subdim_works) {
#m <- pairmatch(dists, data=d, remove.unmatchables = TRUE)
m <- fullmatch(dists, min.controls=1, max.controls=1, data=d)
d <- d[matched(m), ]
} else {
d <- data.frame()
}
return(d)
}
match_ae <- function(d, f) {
m <- foreach(this_group=unique(d$group), .combine=foreach_rbind) %do% {
this_d <- filter(d, group == this_group)
# Calculate propensity scores with a GLM, or else use Mahalanobis
# distance if there aren't enough points to run a glm
if (sum(this_d$treatment) > 30) {
model <- glm(f, data=this_d, family=binomial())
dists <- match_on(model, data=this_d)
} else {
dists <- match_on(f, data=this_d)
}
return(get_matches(this_d, dists))
}
# Need to handle the possibility that there were no matches for this
# treatment, meaning d will be an empty data.frame
if (nrow(m) == 0) {
return(NULL)
} else {
return(m)
}
}
###############################################################################
### Load sites and covariates
treatment_key <- readRDS('ae_output/treatment_cell_key.RDS')
readRDS('data/avoided_emissions/sites.RDS') %>%
select(-geometry) %>%
as_tibble() -> sites
# Filter to include only values of group that appear in the treatment pixels,
# and to not include values that appear only in the treatment pixels
filter_groups <- function(vals) {
vals$group <- interaction(vals$region, vals$ecoregion, vals$pa)
vals <- filter(vals, group %in% unique(filter(vals, treatment)$group))
treatment_groups <- unique(filter(vals, treatment)$group)
control_groups <- unique(filter(vals, !treatment)$group)
vals <- filter(vals, group %in% treatment_groups[treatment_groups %in% control_groups])
# Filter out values of group that appear ONLY in the treatment pixels
vals$group <- droplevels(vals$group)
return(vals)
}
###############################################################################
### Run matching
set.seed(31)
ae <- foreach(this_year=unique(treatment_key$Data_Year),
.combine=foreach_rbind, .inorder=FALSE) %do% {
foreach(this_CI_ID=unique(treatment_key$CI_ID),
.combine=foreach_rbind, .inorder=FALSE) %do% {
tic()
###############
# Load datasets
site <- filter(sites,
CI_ID == this_CI_ID,
Data_Year == this_year)
if (file.exists(paste0('ae_output/m_', this_CI_ID, '_', this_year, '.RDS'))) {
print(paste0('Skipping ', this_CI_ID, ' for year ', this_year, '. Already processed.'))
return(NULL)
} else {
print(paste0('Processing ', this_CI_ID, ' for year ', this_year, '.'))
}
treatment_cell_IDs <- filter(treatment_key,
CI_ID == this_CI_ID,
!is.na(region),
Data_Year == this_year)
n_treatment_cells_total <- nrow(treatment_cell_IDs)
if (n_treatment_cells_total == 0) {
print(paste0('Skipping ', this_CI_ID, ' for year ', this_year, '. No treatment cells.'))
return(NULL)
}
vals <- foreach(this_region = unique(treatment_cell_IDs$region),
.combine=rbind) %do% {
v <- readRDS(paste0('ae_output/treatments_and_controls_', this_region,
'.RDS'))
filter(v, region == this_region)
}
vals %>% full_join(
treatment_cell_IDs %>%
select(cell, Data_Year) %>%
mutate(treatment=TRUE)
, by='cell') -> vals
vals$treatment <- as.logical(vals$treatment)
vals$treatment[is.na(vals$treatment)] <- FALSE
vals$Data_Year <- this_year
# Remove areas falling within another CI site from the control sample
# (but DON'T remove those areas falling within this site)
filter(vals,
!(cell %in% filter(treatment_key,
!(cell %in% filter(treatment_key,
CI_ID == this_CI_ID)$cell))$cell)) -> vals
################
# Setup grouping
# Eliminate any pixels with NAs in group variables (happens occasionally
# where polygons overlap some ocean, leading to undefined ecoregion, for
# example)
n_filtered <- nrow(vals)
vals <- filter(vals,
!is.na(region),
!is.na(ecoregion),
!is.na(pa))
n_filtered <- n_filtered - nrow(vals)
if (n_filtered > 0) {
print(paste0(this_CI_ID, ': Filtered ', n_filtered, ' rows due to missing data in grouping variables.'))
}
# Eliminate any groups that are only in the control pixels, or only in the
# treatment pixels
vals <- filter_groups(vals)
sample_sizes <- vals %>%
count(treatment, group)
# Sample the treatment cells if there are more than MAX_TREATMENT pixels,
# and the control cells if there are more than CONTROL_MULTIPLIER *
# MAX_TREATMENT pixels
bind_rows(
filter(vals, treatment) %>%
group_by(group) %>%
sample_n(min(MAX_TREATMENT, n())),
filter(vals, !treatment) %>%
group_by(this_group=group) %>%
sample_n(min(CONTROL_MULTIPLIER * filter(sample_sizes,
treatment == TRUE,
group == this_group[1])$n,
n()))
) %>%
ungroup() %>%
select(-this_group) -> vals
# Refilter in case any groups were lost due to the sampling
vals <- filter_groups(vals)
# Project all items to cylindrical equal area
# d_crop <- projectRaster(d_crop, crs=CRS('+proj=cea'), method='ngb')
################
# Add defor data
# For sites that were established in or after 2005, match on the five years
# of deforestation data preceding the year of establishment. For sites
# estab prior to 2005, don't match on defor rate
estab_year <- year(site$CI_Start_Date_clean)
f <- readRDS('ae_output/formula.RDS')
if (estab_year >= 2005) {
init <- vals[, grepl(paste0('fc_', substr(estab_year - 5, 3, 4)), names(vals))]
final <- vals[,grepl(paste0('fc_', substr(estab_year, 3, 4)), names(vals))]
defor_pre_intervention <- ((final - init) / init) * 100
# Correct for division by zero in places that had no forest cover in
# year 0
defor_pre_intervention[init == 0] <- 0
names(defor_pre_intervention) <- 'defor_pre_intervention'
vals <- cbind(vals, defor_pre_intervention)
f <- update(f, ~ . + defor_pre_intervention)
}
#vals <- vals %>% select(-starts_with('fc_'), -starts_with('fcc_'))
sample_sizes <- vals %>%
count(treatment, group)
print(paste0(this_CI_ID, ': ', paste(filter(sample_sizes, treatment)$n, collapse=', '), ' treatment pixels'))
print(paste0(this_CI_ID, ': ', paste(filter(sample_sizes, !treatment)$n, collapse=', '), ' control pixels'))
##############
# Run matching
if (nrow(filter(vals, treatment)) == 0) {
print(paste0(this_CI_ID, ': No treatment values remaining after filtering'))
return(NULL)
} else {
m <- match_ae(vals, f)
print(paste0(this_CI_ID, ': Formatting output'))
if (is.null(m)) {
print(paste0(this_CI_ID, ': no matches'))
} else {
m$CI_ID <- this_CI_ID
m$Data_Year <- this_year
m <- m %>% dplyr::select(CI_ID, everything())
print(paste0(this_CI_ID, ': saving output'))
m$sampled_fraction <- sum(vals$treatment) / n_treatment_cells_total
saveRDS(m, paste0('ae_output/m_', this_CI_ID, '_', this_year, '.RDS'))
}
}
toc()
return(m)
}
}
###############################################################################
# Load all output and resave in one file
m <- foreach(f=list.files('ae_output', pattern ='^m_[0-9]*_[0-9]{4}.RDS$'),
.combine=foreach_rbind) %do% {
readRDS(paste0('ae_output/', f))
}
saveRDS(m, 'ae_output/m_ALL.RDS')
###############################################################################
# Summarize results by site
readRDS('data/avoided_emissions/sites.RDS') %>%
select(CI_ID,
Data_Year,
CI_Start_Date_clean,
CI_End_Date_clean) %>%
mutate(CI_Start_Year=year(CI_Start_Date_clean),
CI_End_Year=ifelse(is.na(year(CI_End_Date_clean)), 2099, year(CI_End_Date_clean))) %>%
select(-CI_Start_Date_clean, -CI_End_Date_clean) -> sites
get_chunk <- function(d, n, n_chunks=10) {
start_ind <- round(seq(1, length(d), length.out=n_chunks + 1))[1:(n_chunks)]
end_ind <- c(start_ind[2:n_chunks] - 1, length(d))
return(d[start_ind[n]:end_ind[n]])
}
# Process in chunks to save memory
n_chunks <- 20
# data_files <- list.files('ae_output', pattern ='^m_[0-9]*_[0-9]{4}.RDS$') # think we want letters too?
data_files <- list.files('ae_output', pattern ='^m_[A-Z]{3}[0-9]*_[0-9]{4}.RDS$')
m_site <- foreach (i=1:n_chunks, .combine=bind_rows) %do% {
#m_site <- foreach (i=10:11, .combine=bind_rows) %do% {
print(paste0('Progress: ', ((i-1)/n_chunks)*100, '%'))
tic()
this_m <- foreach(f=get_chunk(data_files, i, n_chunks),
.combine=bind_rows, .inorder=FALSE) %do% {
readRDS(paste0('ae_output/', f)) %>%
select(cell,
CI_ID,
Data_Year,
treatment,
sampled_fraction,
total_biomass,
starts_with('fc_')) %>%
left_join(sites, by=c('CI_ID', 'Data_Year')) %>%
gather(year, forest_at_year_end, starts_with('fc_')) %>%
mutate(year=2000 + as.numeric(str_replace(year, 'fc_', ''))) %>%
group_by(CI_ID, Data_Year, cell, treatment) %>%
filter(between(year, CI_Start_Year[1] - 1, CI_End_Year[1])) %>% # include one year prior to project start to get initial forest cover
arrange(cell, year) %>%
mutate(forest_loss_during_year=c(NA, diff(forest_at_year_end)),
forest_frac_remaining = forest_at_year_end / forest_at_year_end[1],
biomass_at_year_end = total_biomass * forest_frac_remaining,
# to convert biomass to carbon * .5
C_change=c(NA, diff(biomass_at_year_end)) * .5,
# to convert change in C to CO2e * 3.67
Emissions_MgCO2e=C_change * -3.67) %>%
filter(between(year, CI_Start_Year[1], CI_End_Year[1])) %>% # drop year prior to project start as no longer needed
as_tibble() -> x
}
this_m %>%
group_by(CI_ID, Data_Year, year, treatment) %>%
summarise(CI_Start_Year=CI_Start_Year[1],
CI_End_Year=CI_End_Year[1],
# correct totals for areas where only a partial sample was used
# by taking into account the fraction sampled
forest_loss_ha=sum(forest_loss_during_year, na.rm=TRUE) * (1 / sampled_fraction[1]),
Emissions_MgCO2e=sum(Emissions_MgCO2e, na.rm=TRUE) * (1 / sampled_fraction[1]),
n_pixels=n()) %>%
as_tibble() -> this_m_site
toc()
gc()
return(this_m_site)
}
saveRDS(m_site, file='ae_output/output_raw_by_site.RDS')
write_csv(m_site, 'ae_output/output_raw_by_site.csv')