-
Notifications
You must be signed in to change notification settings - Fork 1
/
0_temporal_kelp_prep.R
175 lines (133 loc) · 5.17 KB
/
0_temporal_kelp_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#####################
# Script to prep the kelp timeseries data
# for other analyses yielding
# rawData - the rawData loaded and cleaned
# rawDataMod - the rawData loaded and cleaned with no singletons
# rawDataMod2 - the rawData loaded and cleaned with no singletons or doubletons
# rawDataMod_3points_3years - the rawData loaded and cleaned with
# n years>=3 and 3 data points >=3
#
# Last Changed 11/1/2015
# 11/1/2015 - fixed too few years bug in shortData2
# 9/12/2015 - added shortData2 to filter by nyears and n data points
#####################
library(plyr)
library(ggplot2)
getValues <- function (alist) {
avec <- vector()
sapply(alist, function(x) avec <<- c(avec, x)) -> blank
avec
}
rawData <- read.csv("./CleanDataWithRegions.csv")
rawData$sasDate <- as.Date(as.character(rawData$sasDate), format="%m/%d/%Y")
#get - out of ecoregions
rawData$ECOREGION <- as.character(rawData$ECOREGION )
#rawData$ECOREGION <- gsub("/", "-", rawData$ECOREGION )
rawData$ECOREGION <- factor(rawData$ECOREGION)
###########
#deal with sites that only have one or sample point and exclude them
###########
singletons <- ddply(rawData, .(study_ID, Study, Site, focalUnit), summarise, len = length(focalKelp))
singletons <- singletons[which(singletons$len==1),]
singletonIDX <- apply(singletons, 1, function(arow) which(rawData$Study==arow[2] & rawData$Site==arow[3] & rawData$focalUnit==arow[4]))
if(length(singletonIDX)>0) rawDataMod <- rawData[-singletonIDX,]
###########
#deal with sites that only have kelp
###########
nokelp <- ddply(rawDataMod, .(study_ID, Study, Site), summarise, kelp = sum(focalKelp, na.rm=T))
nokelp <- nokelp[which(nokelp$kelp==0),]
nokelpIDX <- getValues(apply(nokelp, 1, function(arow) which(rawDataMod$Study==arow[2] & rawDataMod$Site==arow[3]))) #used getvalues b/c list return
if(length(nokelpIDX)>0) rawDataMod <- rawDataMod[-nokelpIDX,]
#############
# Rebalance factor levels
#############
rawDataMod$Study <- factor(rawDataMod$Study)
rawDataMod$Site <- factor(rawDataMod$Site)
rawDataMod$StudySite <- with(rawDataMod, paste(Study, Site, study_ID,trajectory_ID,sep="-"))
#############
# Add a more meaningful date label
#############
rawDataMod$Date <- as.numeric(gsub("-", "", rawDataMod$sasDate))
rawDataMod$DaysSince1900 <- as.numeric(julian(as.POSIXlt(rawDataMod$sasDate, format = "%y-%m-%d"),
origin = as.POSIXct("1900-01-01", tz = "UTC")))
rawDataMod$YearsSince1900 <- rawDataMod$DaysSince1900/365.25
rawDataMod$Decade <- floor(rawDataMod$year/10)*10
###########
# Create some standardized columns
#
# I bet this is easier with dplyr, but no time to learn
###########
stdMax <- function(x, samp=3) {
m <- mean(sort(x, decreasing=T, na.last=T)[1:samp])
x/m
}
ln_stdMax <- function(x, samp=3) {
m <- mean(sort(x, decreasing=T, na.last=T)[1:samp])
log(x/m+1/m)
}
stdZ <- function(x) (x)/sd(x, na.rm=T)
rawDataMod <- ddply(rawDataMod, .(ECOREGION, focalUnit), function(adf){
adf$stdByECOREGION <- stdMax(adf$focalKelp)
adf$stdByECOREGIONZ <- stdZ(adf$focalKelp)
adf$ln_stdByECOREGION <- ln_stdMax(adf$focalKelp)
adf
}
)
rawDataMod <- ddply(rawDataMod, .(REALM, focalUnit), function(adf){
adf$stdByREALM <- stdMax(adf$focalKelp)
adf$stdByREALMZ <- stdZ(adf$focalKelp)
adf$ln_stdByREALM <- ln_stdMax(adf$focalKelp)
adf
}
)
rawDataMod <- ddply(rawDataMod, .(PROVINCE, focalUnit), function(adf){
adf$stdByPROVINCE <- stdMax(adf$focalKelp)
adf$stdByPROVINCEZ <- stdZ(adf$focalKelp)
adf$ln_stdByPROVINCE <- ln_stdMax(adf$focalKelp)
adf
}
)
#########
# Add a relative Latitude by Realm
#########
rawDataMod <- ddply(rawDataMod, .(REALM), function(adf){
adf$realLatByRealm <- abs(adf$Latitude)-min(abs(adf$Latitude))
adf
}
)
###########
#make subdatasets of studies that have n or greater points
###########
shortData <- function(studyPoints=2, d = rawDataMod){
smalls <- ddply(d, .(study_ID, Study, Site, trajectory_ID, focalUnit), summarise, len = length(focalKelp))
smalls <- smalls[which(smalls$len<=studyPoints),]
smallsIDX <- apply(smalls, 1, function(arow) which(d$Study==arow[2] & d$Site==arow[3] & d$focalUnit==arow[5]))
smallsIDX <- getValues(smallsIDX)
if(length(smallsIDX)>0) return( d[-smallsIDX,])
return(NA)
}
shortData2 <- function(studyPoints=3, studyYears=3, d = rawDataMod){
require(dplyr)
#group the data
d %>% group_by(study_ID, Study, Site, trajectory_ID, focalUnit) %>%
# what is the study length and number of data points
mutate(len=length(study_ID), num_years = max(year) - min(year)) %>%
ungroup() %>%
#boot studies without studyPoints or more data points
filter(len >= studyPoints) %>%
#boot studies with fewer than studyYears years
filter(num_years>=studyYears-1)
}
#add filter by year?
###########
#dataset of sites that only have > two sample points (i.e., 3 or more points)
###########
rawDataMod2 <- shortData()
rawDataMod_3points_3years <- shortData2()
#length(unique(rawDataMod2$trajectory_ID))
#length(unique(rawDataMod_3points_3years$trajectory_ID))
###########
#dataset of sites that only >= five sample points
###########
rawDataMod5 <- shortData(5)
rawDataMod_5points_3years <- shortData2(5, 3)