PiGu_DataProc_early.Rmd

---
title: "Pigeon Guillemot Data Exploration - Whidbey"
output:
  word_document:
    reference_docx: mytemplate.docx
---

```{r setup, include=FALSE}
# Global chunk defaults: hide code, warnings and messages, emit results as-is,
# and use an 11 x 8 figure size.
# The knitr chunk option is `message` (singular); the previous `messages = F`
# is not a recognised option, so package/start-up messages were not suppressed.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, message = FALSE, results = 'asis', fig.width = 11, fig.height = 8)
```

```{r, results = 'hide'}

library(tidyr)
#library(ggmap)
library(data.table)
library(ggfortify)
library(dplyr)
library(ggplot2)
library(cowplot)
library(ggmap)
library(maps)
library(mapdata)
library(scales)
#library(captioner)
library(knitr)
library(reshape2)
library(stringr)
library(magrittr) 
library(IDPmisc)
#library(ggTimeSeries)
library(stats) 
library(zoo)
library(sciplot) #se()
library(gvlma) #universal assumptions test gvlma(mod)
library(chron) #times()

# Shared ggplot2 theme used by the figures in this report.
# Extra theme settings supplied through `...` (for example
# `legend.position = 'none'`) are forwarded to theme() after the defaults.
plot_theme <- function(...) {
  # Elements that are reused for several theme slots.
  black_axis_line <- element_line(colour = "black")
  no_element <- element_blank()

  theme(
    #text = element_text(size = 11),
    axis.text.x = element_text(
      angle = 90, hjust = 1, vjust = 0.5, color = "black", size = 10
    ),
    axis.text = element_text(vjust = 0.5, color = "black", size = 10),
    axis.title = element_text(size = 11),
    axis.line.y = black_axis_line,
    axis.line.x = black_axis_line,
    plot.background = element_rect(),
    panel.background = element_rect(fill = 'white'),
    panel.border = element_rect(fill = NA),
    panel.grid = no_element,
    legend.key = no_element,
    strip.background = no_element,
    strip.text = element_text(size = 10),
    legend.text = element_text(size = 9),
    ...
  )
}

# Alias kept so that figure code can call either name.
fig_theme <- plot_theme

# Four-colour palette (blue, green, red, yellow).
color3 <- c(
  "#3b98ee",
  "#a3d39c",
  "#e45f56",
  "#f6b61c"
)

```


```{r data, results = 'hide', message = FALSE}

# NOTE(review): setwd() ties this chunk to one machine's directory layout;
# every relative path below is resolved against this folder.
setwd("~/Documents/SAFS/PigeonGuillemots/PiGuData")

#manually added raw website output for 2015-2017 to earlier years that were compiled from annual report spreadsheets

# Colony-level counts: parse the date, derive year / week / day-of-year,
# normalise site names, drop intern rows and exact duplicates, and keep only
# the Whidbey region.
data_count <- read.csv("PGdata_pre_proc_allyrs.csv", stringsAsFactors = F) %>%
  transform(date = as.Date(date, format = "%m/%d/%y")) %>%
  transform(year = as.numeric(format(date, "%Y"))) %>%
  #mutate(PG_count = as.numeric(PG_count)) %>%
  # transform(week = strftime(date, format = "%V")) %>%
  # #transform(week = as.numeric(week)) %>%
  # transform(week = as.numeric(as.character(week))) %>%
  transform(week = week(date)) %>%
  transform(yday = yday(date)) %>%
  filter(site != 'All South Sound') %>%
  #mutate(intern_data = ifelse(intern_data == "on", "Y", intern_data)) %>%
  # Strip intern markers, "@" characters, "- " separators and apostrophes so
  # that the same colony is spelled the same way in every row.
  mutate(site = gsub(" -- INTERN", "", site)) %>%
  mutate(site = gsub("- INTERN", "", site)) %>%
  mutate(site = gsub(" @E", " E", site)) %>%
  mutate(site = gsub("@", " ", site)) %>%
  mutate(site = gsub("- ", " ", site)) %>%
  mutate(site = gsub("'", "", site)) %>%
  filter(intern_data != 'on') %>%
  distinct() %>%
  filter(region == 'Whidbey')
# Replace every NA in every column with 0.
# NOTE(review): this is applied to all columns, not only the count columns --
# confirm that is intended (e.g. for rows whose date failed to parse).
data_count[is.na(data_count)] <- 0

#contains all multiple counts (2018 onward)
data_count_mult <- data_count %>%
  filter(count_type != 'pg')

write.csv(data_count_mult, 'data_2018_proc.csv', row.names = F)

#to include/exclude 2018 counts
# From here on data_count holds only rows whose count_type is 'pg'.
data_count <- data_count %>%
  filter(count_type == 'pg')

#cleaning website output for burrow-level data 2015-2017 so that it matches earlier years
# Reads the long-format website export, normalises burrow and site names,
# recodes visit_type, and reshapes to one row per
# region/site/date/burrow/intern_data with one column per visit type.
burrow_2015_18 <- read.csv("burrow_data_2015_2018.csv", stringsAsFactors = F) %>%
  distinct() %>%
  # NOTE(review): "b." is a regular expression here (a "b" followed by ANY
  # character), not the literal string "b." -- confirm against the raw names.
  mutate(burrow_name = gsub("b.", "", burrow_name)) %>%
  filter(site != 'All South Sound') %>%
  #mutate(intern_data = ifelse(intern_data == "on", "Y", intern_data)) %>%
  mutate(site = gsub(" -- INTERN", "", site)) %>%
  mutate(site = gsub("- INTERN", "", site)) %>%
  mutate(site = gsub(" @E", " E", site)) %>%
  mutate(site = gsub("@", " ", site)) %>%
  mutate(site = gsub("- ", " ", site)) %>%
  mutate(site = gsub("'", "", site)) %>%
  # Anything that is not gunnels / sculpin / noprey is recoded to 'Other'.
  mutate(visit_type = ifelse(visit_type == 'gunnels', 'Gunnel', 
                             ifelse(visit_type == 'sculpin', 'Sculpin',
                                    ifelse(visit_type == 'noprey', 'burrow_visit', 'Other')))) %>%
  # Long -> wide; repeated rows for the same key are averaged and missing
  # visit types are filled with 0.
  dcast(region + site + date + burrow_name + intern_data ~ visit_type, value.var = 'visits', 
        fun.aggregate = mean, fill = 0) %>%
  select(region, site, date, burrow_name, Gunnel, Sculpin, Other, burrow_visit, intern_data) %>%
  transform(burrow_name = gsub(' ', '', burrow_name)) %>%
  filter(region == 'Whidbey') 

# Earlier years' burrow data, with the cleaned 2015-2018 rows appended.
burrow_allyrs <- read.csv("burrow_pre_proc.csv", stringsAsFactors = F) %>%
  distinct() %>%
  mutate(Gunnel = coalesce(as.numeric(Gunnel), 0)) %>% #forcing NAs to 0, other ways weren't working
  mutate(Sculpin = coalesce(as.numeric(Sculpin), 0)) %>%
  mutate(Other = coalesce(as.numeric(Other), 0)) %>%
  mutate(burrow_visit = coalesce(as.numeric(burrow_visit), 0)) %>%
  bind_rows(burrow_2015_18) %>%
  transform(burrow_name = gsub(' ', '', burrow_name)) %>%
  filter(region == 'Whidbey')
  
#this df contains dates where no burrows were visited (but surveys were done) but does NOT contain
#dates for certain burrows that were not visited when others were
# Burrow-level records without intern rows ('on' or 'Y'), with parsed dates,
# year / week / day-of-year, and total prey deliveries per row.
data_burrow <- burrow_allyrs %>%
  filter(intern_data != 'on') %>%
  filter(intern_data != 'Y') %>%
  transform(date = as.Date(date, format = "%m/%d/%y")) %>%
  transform(year = format(date, "%Y")) %>%
  mutate(burrow_name = gsub("-","", burrow_name)) %>%
  # transform(week = strftime(date, format = "%V")) %>%
  # #transform(week = as.numeric(week)) %>%
  # transform(week = as.numeric(as.character(week))) %>%
  transform(week = week(date)) %>% 
  transform(yday = yday(date)) %>%
  transform(tot_prey = Gunnel + Sculpin + Other) %>%
  # burrow_visit was listed twice here; the duplicate had no effect and is removed.
  select(region, site, year, date, yday, week, burrow_name, burrow_visit,
         Gunnel, Sculpin, Other, tot_prey) #%>%
  #filter(site == 'Shore Meadows' & year == 2009 & burrow_name %in% c(15, 31))

# First and last week with any burrow record, per region/year/site.
weekLims <- data_burrow %>%
  group_by(region, year, site) %>%
  summarize(minWeek = min(week), maxWeek = max(week))

#dates for empty burrow visits (non-visits not recorded in burrow-level data, so use dates of count data and merge in)
#BUT BE CAREFUL! sometimes more than one survey was done in a week, which messes up burrow data
# Keeps only count-survey dates that fall outside the week range already
# covered by the burrow data for the same region/year/site.
count_dates <- data_count %>%
  filter(intern_data != 'on') %>%
  distinct(region, site, date, year, week, yday, start_time) %>% 
  filter(region == 'Whidbey') %>%
  merge(weekLims, by = c('region', 'year', 'site')) %>%
  filter(week < minWeek | week > maxWeek) %>%
  dplyr::select(-c(minWeek, maxWeek))

# Full outer merge: count-only survey dates are added as rows whose burrow
# columns are NA.
data_burrow <- data_burrow %>%
  merge(count_dates, by = c('region', 'site', 'year', 'date', 'week', 'yday'), all = T)  %>%
  filter(region == 'Whidbey') %>% 
  distinct()

#adding on columns for number of weeks of prey deliveries
# Per burrow and year: first and last week with any prey delivery, and the
# inclusive span between them (end - start + 1).
prey_weeks_range <- data_burrow %>% 
  group_by(region, site, year, burrow_name) %>%
  filter(tot_prey > 0) %>%
  summarize(start_prey = min(week), end_prey = max(week)) %>%
  transform(prey_week_range = end_prey - (start_prey-1)) %>%
  filter(region == 'Whidbey')
# Per burrow and year: number of distinct weeks with deliveries, the gap
# between that and the inclusive span (prey_discrep), and a success flag that
# is 'Y' when the span exceeds two weeks.
prey_weeks <- data_burrow %>% 
  group_by(region, site, year, burrow_name) %>%
  filter(tot_prey > 0) %>%
  #filter(intern_data == 'Y') %>%
  summarize(prey_weeks = n_distinct(week)) %>%
  #transform(nest_succ = ifelse(prey_weeks > 2, 'Y', 'N')) %>%
  merge(prey_weeks_range, by = c('region', 'year', 'site', 'burrow_name')) %>%
  transform(prey_discrep = prey_week_range - prey_weeks) %>%
  transform(nest_succ = ifelse(prey_week_range > 2, 'Y', 'N')) %>%
  filter(region == 'Whidbey')

# Write the processed tables to the project root.
# NOTE(review): the working directory stays changed for all later chunks.
setwd("~/Documents/SAFS/PigeonGuillemots")
write.csv(prey_weeks, "prey_weeks.csv", row.names = F)
write.csv(data_burrow, "data_burrow.csv", row.names = F)
write.csv(data_count, "data_count.csv", row.names = F)

#combining Rolling Hills sites for count data, but not burrow data - edited by hand
# RH <- data_count %>%
#   filter(intern_data != 'on') %>%
#   filter(grepl("Rolling Hills #", site)) %>%
#   #group_by(year, week) %>%
#   group_by(year, week) %>%
#   summarize(PG_count = sum(PG_count), burrow_count = sum(burrow_count), burrow_visit = sum(burrow_visit))
# dates <- data_count %>%
#     filter(intern_data != 'on') %>%
#   filter(grepl("Rolling Hills #", site)) %>%
#   group_by(year, week) %>%
#   summarize(date = min(date)) %>%
#   select(year, week, date)
# RH <- RH %>%
#   merge(dates, by = c('year', 'week'), all.x = T)
# write.csv(RH, "rollinghillsfix.csv", row.names = F)

```


```{r}
# bringing in tide data

#MOVED TO ME_SETUP for CH covariates, but keep this for counts and time to hilo and visualizing data
# Tide table with parsed date/time; rows for region 'SS' are dropped.
tides <- read.csv("tides_all.csv", stringsAsFactors = F, header = T) %>%
  transform(date = as.Date(date)) %>%
  transform(month = month(date)) %>%
  transform(time = as.POSIXct(time)) %>%
  filter(region != 'SS')

# Counts joined to the tide rows for the same site/date/region, with the
# signed number of minutes between the survey start and each tide time.
PG_count_tides <- data_count %>%
  filter(intern_data != 'on' & PG_count != '') %>%
  select(region, site, date, PG_count, start_time, yday, year, week) %>%
  merge(tides, by = c('site', 'date', 'region'), all.x = T) %>%
  # Missing or "0:00" start times are replaced with 8:00.
  transform(start_time = ifelse(start_time == "0:00" | start_time == "", "8:00", start_time)) %>%
  transform(survey = as.POSIXlt(paste0(paste(date, start_time, sep = " "), ":00"))) %>%
  # Tide time minus survey time, in minutes; filled only for the matching tide type.
  transform(from_low = ifelse(type == "L", as.numeric(time-survey, units = "mins"), NA),
            from_high = ifelse(type == "H", as.numeric(time-survey, units = "mins"), NA)) %>%
  transform(from_hilo = as.numeric(coalesce(from_low, from_high))) %>%
  select(-c(t, month, from_low, from_high)) %>%
  transform(time_stamp = times(format(time, format = "%H:%M:%S"))) %>%
  # Rank tide rows by clock time; only ranks 2 and 3 are kept below.
  # NOTE(review): tide times after 2:30 and up to 3:00 match neither of the
  # first two rules and fall through to rank 3 -- confirm this is intended.
  transform(rank = ifelse(is.na(station), 2, #for 2018 rows without tidal info
                          ifelse(time_stamp <= "2:30:00" | time_stamp >= "21:00:00", 1,
                          ifelse(time_stamp > "3:00:00" & time_stamp <= "9:00:00", 2,
                                 ifelse(time_stamp > "17:00:00", 4, 3))))) %>%
  filter(rank == 2 | rank == 3) %>%
  filter(region == 'Whidbey')

#table(PG_count_tides$rank)

# # test <- PG_count_tides %>%
# #   group_by(site, date) %>%
# #   summarize(cnt = n_distinct(v)) %>%
# #   filter(cnt < 2)

# Tide-matched counts restricted to July and August.
PG_count_tides_trim <- PG_count_tides %>%
  transform(month = month(date)) %>%
  filter(month > 6, month < 9) %>%
  filter(region == 'Whidbey')

# Linear model of count on absolute minutes from low tide, with a site effect.
mod <- lm(
  PG_count ~ abs(from_hilo) + site,
  data = PG_count_tides_trim %>% filter(type == 'L')
)
summary(mod)

#from low tide; loess
from_low <- ggplot(
  PG_count_tides_trim %>% filter(type == 'L'),
  aes(x = from_hilo, y = PG_count)
) +
  geom_smooth(se = TRUE) +
  labs(x = "Minutes from Low Tide", y = "PG Count") +
  plot_theme()

############

fig_theme <- plot_theme
#for frances
from_high <- ggplot(
  PG_count_tides_trim %>% filter(type == 'H'),
  aes(x = from_hilo, y = PG_count)
) +
  geom_smooth(se = TRUE) +
  labs(x = "Minutes from High Tide", y = "Jul-Aug Guillemot Counts per Colony") +
  fig_theme()

# Distribution of survey timing relative to high tide.
ggplot(PG_count_tides_trim %>% filter(type == 'H'), aes(x = from_hilo)) +
  geom_histogram()

#from low and high tide; loess
ggplot(
  PG_count_tides_trim,
  aes(x = from_hilo, y = PG_count, col = type, group = type)
) +
  geom_smooth(se = TRUE) +
  labs(x = "Minutes from Low or High Tide", y = "PG Count") +
  fig_theme()

#gam method; abs value
ggplot(
  PG_count_tides_trim,
  aes(x = abs(from_hilo), y = PG_count, col = type, group = type)
) +
  geom_smooth(se = FALSE) +
  labs(x = "Minutes from Low or High Tide", y = "PG Count") +
  fig_theme()


################


```

```{r}
#temp data

#only up through 2015?
# Daily sea-surface temperature for station 9444900 only.
sst_dat <- read.csv('sst_day.csv', header = TRUE, stringsAsFactors = FALSE) %>%
  filter(station == 9444900)

# Spread of daily SST for each day of the year, across years.
ggplot(sst_dat, aes(x = yday, y = temp, group = yday)) +
  geom_boxplot() +
  labs(x = 'Day of Year', y = 'Mean Daily SST') +
  facet_grid(~station) +
  theme_bw()

# One line per year over the day of the year.
ggplot(sst_dat, aes(x = yday, y = temp, group = factor(year), col = factor(year))) +
  geom_line() +
  labs(x = 'Day of Year', y = 'Mean Daily SST') +
  scale_color_brewer(palette = 'Set3', name = '') +
  facet_grid(~station) +
  theme_bw()

# One panel per year, one line per month over the day of the month.
ggplot(sst_dat, aes(x = day, y = temp, group = factor(month), col = factor(month))) +
  geom_line() +
  labs(x = 'Day', y = 'Mean Daily SST') +
  facet_wrap(~year) +
  scale_color_brewer(palette = 'Set1') +
  theme_bw()

##maybe this is older?
#read in upwelling and SST data; https://www.pfeg.noaa.gov/products/PFEL/modeled/indices/upwelling/NA/data_download.html
#surface pressure, cubic meters per second per 100m coastline
#average daily value; #monthly anomalies relative to mean monthly value from 1948-1967 

#upwelling and temp at the monthly level; up through 2018
setwd("~/Documents/SAFS/PigeonGuillemots/PiGuData/EnvData")

# Daily upwelling, years after 2007.
# NOTE(review): upwell_day is not used again in the visible part of this file.
upwell_day <- read.csv("Upwell_day.csv", header = T, stringsAsFactors = F) %>%
  transform(year = year(as.Date(date, format = '%m/%d/%y'))) %>%
  filter(year > 2007)
#monthly upwelling anomalies
# Wide (one column per month) -> long, kept for 48N, Jun-Oct, years after 2007.
upwell_anom <- read.csv("Upwell_anom.csv", header = T, stringsAsFactors = F) %>%
  filter(year > 2007) %>%
  filter(lat == '48N') %>%
  melt(id.vars = c('lat', 'long', 'year'), variable.name = 'month', value.name = 'up_anom') %>%
  # Month number derived from the position of the month column in the melted factor.
  # NOTE(review): presumably the CSV month columns run Jan..Dec in order, so
  # that the codes are 1..12 -- confirm against the file.
  transform(mo = as.numeric(as.factor(as.numeric(month)))) %>%
  filter(month %in% c('Jun', 'Jul', 'Aug', 'Sep', 'Oct')) %>%
  select(-c(month, lat, long))
#monthly
# SST_44 <- read.csv("SST_44N.csv", header = T, stringsAsFactors = F) %>%
#   transform(date = as.Date(date), year = year(date)) %>%
#   filter(year > 2007)

# Tide-matched counts with daily SST attached by year and day-of-year
# (left join: counts without an SST match are kept), plus a month number.
covs <- PG_count_tides %>%
  merge(sst_dat %>% select(-c(day, month, station)), by = c('year', 'yday'), all.x = T, all.y = F) %>%
  transform(mo = month(date)) #%>% filter(site == 'Ledgewood' & year == 2014)#%>%
  #merge(upwell_anom, by = c('mo', 'year'), all.y = F)

setwd("~/Documents/SAFS/PigeonGuillemots")

```

```{r}

#all covariates
#PG_covs from mod_setup file

# Count covariates collapsed to one row per year/week/site/tide type/month by
# averaging; only high-tide rows (or rows without a tide type) are kept.
covs_cnt <- covs %>%
  #transform(week = week(date)) %>%
  select(year, week, site, yday, PG_count, v, type, from_hilo, temp, mo) %>%
  rename(from_hilo_orig = from_hilo) %>%
  transform(v = round(v, 1), temp = round(temp, 2)) %>%
  group_by(year, week, site, type, mo) %>% #getting rid of duplicates from multiple visits per week
  summarize(PG_count = mean(PG_count), v = mean(v), yday = mean(yday),
            from_hilo_orig = mean(from_hilo_orig), temp = mean(temp)) %>%
  filter(type == 'H' | is.na(type)) 

#test <- covs_cnt %>% filter(site == 'Lagoon North #2' & year == 2010)

#filtered for only high tide in *_setup file
# Burrow visits (bv) and prey visits (pv) summed per year/site/day and
# covariate combination.
covs_burrow <- read.csv('PG_covs.csv', header = T, stringsAsFactors = F) %>%
  group_by(year, site, yday, from_hilo, v, temp) %>%
  summarize(bv = sum(burrow_visit), pv = sum(tot_prey)) %>%
  #filter(site == 'Ledgewood' & year == 2014) %>%
  transform(v = round(v, 1), temp = round(temp, 2)) 

# Colony-level table: counts + burrow/prey visits + monthly upwelling anomaly,
# with v, minutes-from-tide, temp and upwelling centred and scaled.
# NOTE(review): the first merge matches on the rounded numeric columns v and
# temp as well as year/site/yday -- confirm both sides round identically.
covs_col <- covs_cnt %>%
  merge(covs_burrow, by = c('year', 'site', 'yday', 'v', 'temp'), all.x = T) %>%
  merge(upwell_anom, by = c('year', 'mo')) %>%
  select(site, year, week, mo, yday, PG_count, bv, pv, type, v, from_hilo_orig, temp, up_anom) %>%
  transform(v = as.numeric(round(scale(v),2)), mins = round(scale(from_hilo_orig),2) , 
            temp = as.numeric(round(scale(temp),2)), upwell = round(scale(up_anom),2)) %>%
  select(-c(from_hilo_orig, up_anom))
# Every remaining NA (in any column) becomes 0.
covs_col[is.na(covs_col)] <- 0

#write.csv(covs_col, 'covs_col.csv', row.names = F)

count_dat <- covs_col 
write.csv(count_dat, 'count_dat.csv', row.names = F)

# Island-level table: counts and visits summed, covariates averaged, per
# year/week/day; restricted to weeks 25-34 with year as a factor.
covs_isl <- covs_col %>%
  group_by(year, week, yday) %>%
  summarize(PG_count = sum(PG_count, na.rm = T), bv = sum(bv, na.rm = T), pv = sum(pv, na.rm = T),
            temp = mean(temp, na.rm = T), v = mean(v, na.rm = T), upwell = mean(upwell), 
            mins = mean(mins, na.rm = T)) %>%
  transform(year = factor(year)) %>%
  filter(week > 24 & week < 35) 

# Exploratory island-level linear models for counts, burrow visits and prey visits.
summary(lm(PG_count ~ v + upwell + week, covs_isl))
summary(lm(bv ~ temp + v + upwell + mins + year, covs_isl))
summary(lm(pv ~ temp + v + upwell + mins + year, covs_isl))


```

```{r}
#trim with covariates df instead

# Maximum count per site and year.
max_col_yr_w <- covs_col %>%
  select(year, week, site, PG_count) %>%
  distinct() %>%
  group_by(year, site) %>%
  summarize(max_cnt = max(PG_count, na.rm = T)) 
# Week(s) in which each site-year reached its maximum, turned into a window
# from three weeks before to one week after that week.
# NOTE(review): this object is named `max`, which shadows base::max as a
# variable (function calls to max() still resolve to the base function).
max <- covs_col %>%
  transform(v = as.numeric(v), temp = as.numeric(temp)) %>%
  select(year, week, site, PG_count) %>%
  group_by(week, year, site) %>%
  distinct() %>%
  merge(max_col_yr_w, by = c('year', 'site')) %>%
  transform(max_week = ifelse(max_cnt == PG_count, 'y', 'n')) %>%
  filter(max_week == 'y') %>%
  select(year, site, week, PG_count) %>%
  transform(min_week = week-3, max_week = week+1) %>% #adjust here
  select(-c(week, PG_count))

# Colony rows that fall inside the window around the peak week.
# NOTE(review): a site-year whose maximum occurs in several weeks has several
# windows, so rows inside more than one window appear more than once.
col_trim <- covs_col %>% transform(v = as.numeric(v), temp = as.numeric(temp)) %>%
  merge(max, by = c('year', 'site')) %>%
  filter(week <= max_week & week >= min_week) %>% select(-c(min_week, max_week))
  
# Trimmed counts by week, one line per year, one panel per site.
check_plot <- ggplot(col_trim, aes(week, PG_count, group = year, color = year)) +
  geom_line() + geom_point() +
  facet_wrap(~site) +
  plot_theme(legend.position = 'none')

#mean annual per site, go back up to max_week to adjust
mean_trim <- col_trim %>% 
  group_by(site, week) %>%
  summarize(mean_cnt = mean(PG_count))

ggplot(mean_trim, aes(week, mean_cnt)) +
  geom_line() + geom_point() +
  facet_wrap(~site) +
  plot_theme(legend.position = 'none')

#sum to island level
# NOTE(review): despite the comment above, this takes the mean (not the sum)
# of PG_count per year and week.
isl_sum_trim <- col_trim %>%
  group_by(year, week) %>%
  summarize(cnt = mean(PG_count))

ggplot(isl_sum_trim, aes(week, cnt, group = year, color = year)) +
  geom_line() + geom_point() +
  facet_wrap(~year) +
  plot_theme(legend.position = 'none')

#count_dat <- col_trim

#count
summary(lm(PG_count ~ week + year, count_dat)) #declining a little with week, but non-sig
summary(lm(PG_count ~ v + upwell + week, count_dat))

#bv
# mod1 is overwritten by each call; the first and third models are identical.
summary(mod1 <- lm(bv ~ temp + v + upwell + mins + year, count_dat))
summary(mod1 <- lm(bv ~ v + upwell + mins + year, count_dat))
summary(mod1 <- lm(bv ~ temp + v + upwell + mins + year, count_dat))

#pv
summary(mod1 <- lm(pv ~ temp + v + upwell + year, count_dat))


```


```{r}

# Distinct burrow names per region, year and site.
tot_nest <- data_burrow %>%
  filter(region == 'Whidbey') %>%
  # site stays in the grouping because burrow names repeat across sites
  group_by(region, year, site) %>%
  summarize(cnt = n_distinct(burrow_name))
sum(tot_nest[["cnt"]])

# Collapse the site-level counts to one total per region and year.
tot_nest <- tot_nest %>%
  group_by(region, year) %>%
  summarise(tot = sum(cnt))
sum(tot_nest[["tot"]])

```


# Data Availability and Status

### Adult counts  
+ 2008-2017 Whidbey
+ Have added intern counts for 2010, 2011, 2012 at Rolling Hills, Harrington, Mutiny Sands, and Shore Meadows (multiple counts/day, once/week); 
+ Intern data are different depending on the intern/year. 2010-2012 is timed counts for 4-5 hours, more recent years are one count per day but multiple survey days per week. Can these be combined for multiple counts per week?  

+ South Sound: Terence's intern is working on it.
+ Sequim: spoke with Ed and Jeff - must be pulled from website (Ryan)

### Prey deliveries/nest survival  
+ 2008-2014: needs a lot of checking  
+ 2015-2017: must be compiled by hand (Ryan)
+ Intern data 2010-2012 do not contain non-prey visits (VB), which makes it hard to confirm that the survey started before prey were being delivered. Apparently they use the 3-week delivery criterion for a successful fledge. I'm thinking we can improve on that, because they probably don't exclude weeks where deliveries have already started.

### Quality control  
+ (mis)matching site-date combinations  
+ Prey deliveries when PG_count = 0
+ 10% of cases where day-level total visits don't match summed burrow-level visits

### Questions for Frances  
+ Why do some colonies have so few observation days (Coupeville Wharf)?
+ Why wouldn't a colony have data for a given year (Hancock South, 2016)?


```{r, results = 'hide', message = FALSE}
##Questions to answer and issues to fix

##UPDATE - dfs are separate now, so will have to do fancier magic to look at these issues
#There are a few cases where PG_count is zero and there are prey deliveries
# mystery_prey <- data %>%
#   filter(PG_count == 0 & burrow_visits_day > 0)


```

```{r, results = 'hide', message = FALSE}

##data exploration

#check duplicate site-date-burrow: counts burrow/date keys that occur in more than one row
# The rows are grouped by region/site/date/burrow_name, so n_distinct(date)
# inside a group is always 1 and the old check could never report a duplicate.
# Counting rows per key with n() actually detects repeated keys.
# dups is the number of duplicated keys (0 means no duplicates).
dups <- data_burrow %>%
  group_by(region, site, date, burrow_name) %>%
  summarize(cnt = n()) %>%
  filter(cnt > 1) %>% nrow()

#consistency in number of week observations per site per season
# First and last survey week per region/site/year and the span between them.
# The intern filter previously compared against "no"; every other filter in
# this file excludes intern rows with 'on', so this is treated as a typo.
weeks <- data_count %>% 
  filter(intern_data != 'on') %>%
  group_by(region, site, year) %>%
  summarize(start_week = min(week), end_week = max(week)) %>%
  transform(study_length = end_week - start_week)

# Per-site range, mean and standard error of the span, joined back onto the
# yearly rows so that each row carries its site's summary.
weeks_plot_data <- weeks %>% 
  group_by(site, region) %>%
  summarize(ymin = min(study_length), ymax = max(study_length), 
            mean = mean(study_length), se = se(study_length)) %>%
  merge(weeks, by = c("site", "region"))

# Span per year, one panel per site.
weeks_plot <- ggplot(weeks_plot_data, 
                     aes(as.numeric(as.character(year)), study_length, group = region, col = region)) +
  geom_line() + geom_point() +
  xlab("") + ylab("Number of Weeks") +
  facet_wrap(~site) + plot_theme()

# Same points with each site's min-max span drawn as a vertical range.
weeks_range <- ggplot(weeks_plot_data, 
                      aes(as.numeric(as.character(year)), study_length, group = region, col = region)) +
  geom_linerange(aes(ymin = ymin, ymax = ymax)) + geom_point() +
  xlab("") + ylab("Number of Weeks") + scale_x_continuous(breaks = c(2008, 2010, 2012, 2014, 2016, 2018)) +
  facet_wrap(~site) + plot_theme()

#differences in start weeks over years?
#visually inspected 'weeks' - mostly start within three weeks across years, aside from an early 
#start in 2008 in a few sites and a very late start in 2009

#check number of burrows per site across years: looks fairly consistent with the exception of shore meadows (2010) and double bluff north/south (2010)
# Distinct burrow names per site and year.
burrow_count <- data_burrow %>%
  group_by(site, year) %>%
  summarise(cnt = n_distinct(burrow_name))

# Burrow count over time, one panel per site.
burrow_count_plot <- ggplot(
  burrow_count,
  aes(x = as.numeric(as.character(year)), y = cnt)
) +
  geom_line() +
  geom_point() +
  labs(x = "", y = "Number of Burrows") +
  facet_wrap(~site) +
  plot_theme() +
  scale_x_continuous(breaks = seq(2010, 2016, by = 2))

#number of prey burrows per site/year - for scott
#need count of active burrows per site per year
#site level
# prey_weeks has one row per burrow with at least one prey delivery in a given
# site and year, so the number of rows per group is the number of active
# burrows. The previous version attached id = 1:1712 and counted distinct ids,
# which gives the same number but errors as soon as prey_weeks has any other
# row count.
prey_burrows <- prey_weeks %>%
  filter(region == 'Whidbey') %>% 
  group_by(site, year) %>% 
  summarize(cnt = n()) %>%
  transform(year = as.numeric(as.character(year)))

ggplot(prey_burrows, aes(year, cnt)) +
  geom_point() + geom_line() +
  facet_wrap(~site) +
  xlab('') + ylab('Active Burrows') +
  theme(axis.text.x = element_text(size = 10, angle = 90)) +
  scale_x_continuous(breaks = c(seq(2008,2018, 5)))

#all sites combined
# Same count, pooled over sites: active burrows per year.
prey_burrows <- prey_weeks %>%
  #filter(region == 'Whidbey') %>% 
  #filter(nest_succ == 'Y') %>%
  group_by(year) %>% 
  summarize(cnt = n()) %>%
  transform(year = as.numeric(as.character(year)))

ggplot(prey_burrows, aes(year, cnt)) + 
  geom_point() + geom_line() +
  geom_smooth() + 
  xlab('') + ylab('Active Burrows') +
  scale_x_continuous(breaks = c(seq(2008,2018, 2))) +
  plot_theme()

# Yearly burrow totals per region.
# tot_nest may already have been collapsed to region/year totals (columns
# region, year, tot) by an earlier chunk, in which case `cnt` no longer exists
# and summing it again fails. Only aggregate when the site-level `cnt` column
# is still present.
if ("cnt" %in% names(tot_nest)) {
  tot_nest <- tot_nest %>%
    group_by(region, year) %>%
    summarize(tot = sum(cnt))
}

# year is not numeric in data_burrow (it is built with format(date, "%Y")), so
# convert it before mapping it to the continuous x scale used below.
ggplot(tot_nest, aes(as.numeric(as.character(year)), tot)) +
  geom_point() + geom_line() +
  geom_smooth() +
  #facet_wrap(~site) +
  xlab('') + ylab('Active Burrows') +
  theme(axis.text.x = element_text(size = 10, angle = 90)) +
  scale_x_continuous(breaks = c(seq(2008,2018, 5)))

#number of years of data per site: 30 of the 40 unique site names have more than two years of data
# Number of site/region combinations with more than two distinct years.
site_yrs <- data_count %>%
  group_by(site, region) %>%
  summarise(cnt = n_distinct(year)) %>%
  with(sum(cnt > 2))
  
```

### Burrow counts and study length  

Number of burrows identified per site per year seems reasonably consistent (ignore year > 2014). Some sites range more than others in terms of how long the study was conducted each year.  Not all sites can be used for looking at nest fate.


### Adult counts  

Many sites show relative consistency in the average number of adults counted per week across years. This consistency is also reflected at the island-level. Counts dropped in 2015.

```{r, results = 'hide', message = FALSE}

#region-level total; mean of the season - compare to trimmed

# Per region and year: counts summed across colonies within each week, then
# averaged over the weeks of the season.
island_count_yr <- data_count %>%
  filter(intern_data != 'on') %>%
  select(region, site, week, year, PG_count, date) %>%
  distinct() %>%
  group_by(region, week, year) %>%
  summarize(cnt = sum(PG_count, na.rm = T)) %>% #sum across colonies
  group_by(region, year) %>%
  summarize(PG_count = mean(cnt, na.rm = T)) #mean of weeks

# NOTE(review): weekly_count_isl_trim is not created anywhere in the visible
# part of this file -- confirm where it is defined before knitting.
island_count_yr_trim <- weekly_count_isl_trim %>%
  group_by(year, region) %>%
  summarize(PG_count = mean(cnt, na.rm = T))

# Untrimmed and trimmed season means side by side, in long format
# (variable is PG_count or PG_count.trim).
trim_compare_data <- island_count_yr %>%
  merge(island_count_yr_trim, by = c("region", "year"), suffixes = c("", ".trim")) %>%
  melt(id.vars = c("year", "region"))

trim_compare_plot <- ggplot(trim_compare_data, aes(as.numeric(as.character(year)), as.numeric(value), group = variable, col = variable)) +
  geom_point() + xlab("") + ylab("Season Mean of Regional Weekly Counts") +
  geom_line() + facet_wrap(~region) +
  scale_x_continuous(breaks = 2008:2018) +
  plot_theme(legend.position = 'top')

#I think this is less useful than the sum to the regional level
#island-level average; averages across dates and across colony sites - more consistent than I expected?
#should zeros be included here?
# Mean and standard error of all individual counts per year and region.
island_mean <- data_count %>% 
  filter(intern_data != "on") %>%
  group_by(year, region) %>%
  summarize(mean_cnt = mean(PG_count, na.rm = T), 
            se = se(PG_count, na.rm = T))

# Yearly mean with a +/- 1 SE ribbon.
# The theme call used to be plot_theme_1(), which is not defined in this file;
# plot_theme() is the shared theme defined in the first chunk.
isl_mean <- ggplot(island_mean, aes(as.numeric(as.character(year)), mean_cnt, group = region, col = region)) +
  geom_line() +
  geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.4, linetype = 'blank') +
  xlab("") + ylab("Mean Daily Site Count") + scale_x_continuous(breaks = 2008:2018) +
  plot_theme()


```

```{r}

#for frances
# Yearly mean and standard error of the trimmed weekly counts, at island and
# at colony level.
# NOTE(review): weekly_count_isl_trim and weekly_count_col_trim are not created
# anywhere in the visible part of this file -- confirm where they are defined.
countplotdata_isl <- weekly_count_isl_trim %>%
  group_by(region, year) %>%
  summarize(mean_cnt = mean(cnt), se = se(cnt))

countplotdata_col <- weekly_count_col_trim %>%
  group_by(region, year, site) %>%
  summarize(mean_cnt = mean(PG_count), se = se(PG_count))

# Island-level yearly mean with a +/- 1 SE ribbon.
# The theme call used to be plot_theme_1(), which is not defined in this file;
# plot_theme() is the shared theme defined in the first chunk.
ggplot(countplotdata_isl %>% filter(region == 'Whidbey'), aes(as.numeric(as.character(year)), mean_cnt, group = region, col = region)) +
  geom_line(col = 'black') +
  geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.5, linetype = 'blank') +
  xlab("") + ylab("Mean Daily Island Count") + scale_x_continuous(breaks = 2008:2018) +
  plot_theme(legend.position = 'none')

# Colony-level yearly means, one panel per site ("Rolling Hills #2" excluded).
ggplot(countplotdata_col %>% filter(region == 'Whidbey' & 
                                      site != "Rolling Hills #2"), 
       aes(as.numeric(as.character(year)), mean_cnt)) +
  geom_line(col = 'black') +
  geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.5, linetype = 'blank') +
  xlab("") + ylab("Mean Daily Colony Count") + scale_x_continuous(breaks = c(2008, 2012, 2015, 2018)) +
  plot_theme(legend.position = 'none') +
  facet_wrap(~site)

#prey deliveries

# get start/stops for each colony and year, then average across years and then get length
# First and last day-of-year with a prey delivery per site and year, averaged
# over years within each site.
prey_stop_start_col_plot_data <- data_burrow %>%
  filter(!is.na(tot_prey), tot_prey > 0) %>%
  group_by(site, year) %>%
  summarise(start = min(yday), stop = max(yday)) %>%
  group_by(site) %>%
  summarise(start_mean = mean(start), stop_mean = mean(stop)) %>%
  transform(length = stop_mean - start_mean)

# Horizontal bar per site from mean start day to mean stop day.
ggplot(prey_stop_start_col_plot_data, aes(x = site, y = length)) +
  geom_linerange(aes(ymin = start_mean, ymax = stop_mean)) +
  coord_flip() +
  labs(x = "", y = "Day of Year") +
  scale_y_continuous(limits = c(170, 230)) +
  plot_theme()

# Same idea pooled over sites: earliest and latest delivery day per year.
prey_stop_start_isl_plot_data <- data_burrow %>%
  filter(!is.na(tot_prey), tot_prey > 0) %>%
  group_by(year) %>%
  summarise(start = min(yday), stop = max(yday)) %>%
  transform(length = stop - start)

ggplot(prey_stop_start_isl_plot_data, aes(x = year, y = length)) +
  geom_linerange(aes(ymin = start, ymax = stop)) +
  coord_flip() +
  labs(x = "", y = "Day of Year") +
  scale_y_continuous(limits = c(150, 270)) +
  plot_theme()

```


```{r, results = 'hide', message = FALSE}
# Print previously built plot objects.
# NOTE(review): col_count_week and isl_tot are not created anywhere in the
# visible part of this file -- confirm where they are defined; isl_mean and
# check_plot are built in earlier chunks.
col_count_week
isl_mean
isl_tot
check_plot
```

#### Multiple counts - intern data  

We have multiple counts for ~5 colonies. For 2010-2012, we have 10-11 counts per week. For 2015-2017, we have 2-3 counts per week. Variation is less in earlier years because there are 10 counts on the same day, and so are less affected by anything that could be different throughout the week - some days may just be quieter than others (weather/ocean conditions, feeding opportunities, beach disturbance), so probably best not to combine the two types?  

The group made a point to estimate fledgling success in 2013 for 2009-2013.  
```{r, results = 'hide', message = FALSE}
# intern_counts <- data_count %>%
#   filter(intern_data == 'Y') %>%
#   transform(week_year = strftime(date, format = "%V")) %>%
#   transform(week = as.numeric(week_year)) 
# 
# intern_week_counts <- intern_counts %>%
#   group_by(site, year, week) %>%
#   summarize(cnt = n_distinct(id))
# 
# int_count_data <- intern_counts %>%
#   group_by(site, year, week) %>%
#   summarize(mean_cnt = mean(PG_count), se = se(PG_count))
# 
# int_count <- ggplot(int_count_data, aes(week, mean_cnt)) +
#   geom_line() + geom_point() +
#   geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.4, linetype = 'blank') +
#   xlab("") + ylab("Mean Weekly Site Count") +
#   facet_wrap(~site + year) +
#   scale_x_continuous(limit = c(0,15), breaks = c(3,6,9,12), labels = c(3,6,9,12)) +
#   plot_theme()
# 
# int_count

```


```{r}

#assumptions testing

#daily counts at island level
# island_counts_daily <- data %>%
#   select(year, date, week, PG_count) %>%
#   filter(!is.na(PG_count)) %>% distinct()
# 
# mod <- lm(PG_count ~ year + week, data = island_counts_daily)
# gvlma(mod)

#daily counts at colony level
# One row per region/year/date/week/site/count, intern rows excluded.
site_counts_daily <- data_count %>% 
  filter(intern_data != 'on') %>%
  select(region, year, date, week, site, PG_count) %>% 
  filter(!is.na(PG_count)) %>% distinct()

# data_count is restricted to region == 'Whidbey' when it is loaded, so
# `region` has a single value here. lm() cannot build contrasts for a
# one-level predictor and stops with an error, so region is left out of the
# model.
mod <- lm(PG_count ~ year + site, data = site_counts_daily)
#mod <- lm(PG_count ~ year + site + week, data = site_counts_daily)
# Standard lm diagnostic plots in a 2 x 2 grid, followed by the gvlma
# global assumptions test.
par(mfrow = c(2, 2))
plot(mod)
gvlma(mod)

#yearly counts at colony level
# NOTE(review): the grouping includes week, so this is a mean per
# site/year/week rather than one value per site and year -- confirm intent.
site_counts_yearly <- data_count %>% 
  group_by(site, year, week) %>%
  summarize(mean_cnt = mean(PG_count, na.rm = T), 
            se = se(PG_count, na.rm = T))

mod_1<- lm(mean_cnt ~ year + site, data = site_counts_yearly)
# summary(mod_1)
# gvlma(mod_1)
# plot(mod_1)


```

### Prey deliveries

My suspicion is that the seasonal deliveries pattern/peak is smoothed out across years, which is why it is more evident in the island-level figure than at each site.  
```{r, results = 'hide', message = FALSE}
#prey delivery timing

#all prey deliveries - colony level
prey_dels_daily <- data_burrow %>% filter(as.numeric(year) < 8) %>%
  group_by(week, site) %>% 
  summarize(mean_dels = mean(tot_prey, na.rm = T), se = se(tot_prey, na.rm = T)) %>% #week averages across years
  filter(!is.na(mean_dels)) #taking out single-year rows, I think

#for frances
ggplot(prey_dels_daily, 
       aes(week, mean_dels, group = site)) +
  geom_line() + geom_point() + 
  #scale_x_continuous(limit = c(0,16), breaks = c(3,6,9,12,15),
                                                #labels = c(3,6,9,12,15)) +
  geom_ribbon(aes(ymin = mean_dels - se, ymax = mean_dels + se), alpha = 0.4, linetype = 'blank') +
  xlab("Week of the Year") + ylab("Mean Weekly Prey Deliveries") +
  facet_wrap(~site) + 
  #scale_color_brewer(values = color3) +
  plot_theme(legend.position = 'none')

# Island level: deliveries are first summed over all sites for each
# week x year, then the mean and SE of those totals are taken across years.
isl_prey_dels_weekly <- data_burrow %>%
  filter(as.numeric(year) < 8) %>%
  group_by(week, year) %>%
  summarize(tot_dels = sum(tot_prey, na.rm = TRUE)) %>%
  group_by(week) %>%
  summarize(
    mean_dels = mean(tot_dels, na.rm = TRUE),
    se = se(tot_dels, na.rm = TRUE)
  )

# Weekly island-wide mean with a +/- 1 SE ribbon.
# Disabled x-axis override kept for reference:
#   scale_x_continuous(limit = c(23,38), breaks = c(3,6,9,12,15), labels = c(3,6,9,12,15))
ggplot(data = isl_prey_dels_weekly, mapping = aes(x = week, y = mean_dels)) +
  geom_line() +
  geom_point() +
  geom_ribbon(
    aes(ymin = mean_dels - se, ymax = mean_dels + se),
    alpha = 0.4, linetype = 'blank'
  ) +
  labs(x = "Week of the Year", y = "Mean Weekly Prey Deliveries") +
  fig_theme(legend.position = 'none')

# For Frances: the same figure drawn with plot_theme.
# NOTE(review): the setup chunk defines fig_theme <- plot_theme, so unless one
# of them is redefined later this repeats the figure above -- confirm whether
# both copies are wanted.
ggplot(data = isl_prey_dels_weekly, mapping = aes(x = week, y = mean_dels)) +
  geom_line() +
  geom_point() +
  geom_ribbon(
    aes(ymin = mean_dels - se, ymax = mean_dels + se),
    alpha = 0.4, linetype = 'blank'
  ) +
  labs(x = "Week of the Year", y = "Mean Weekly Prey Deliveries") +
  plot_theme(legend.position = 'none')

```

```{r, results = 'hide', message = FALSE}
  
#all visits (prey and no prey) according to counts df (rather than burrow level); not sure if this is meaningful

# #island-level
# isl_tot_visits_weekly <- data %>% 
#   group_by(week, year) %>%
#   summarize(tot_visits = sum(burrow_visits_day), na.rm = T) %>% #first sum all sites
#   group_by(week) %>%
#   summarize(mean_visits = mean(tot_visits, na.rm = T), se = se(tot_visits, na.rm = T))
# ggplot(isl_tot_visits_weekly, 
#        aes(week, mean_visits)) +
#   geom_line() + geom_point() + scale_x_continuous(limit = c(0,16), breaks = c(3,6,9,12,15),
#                                                 labels = c(3,6,9,12,15)) +
#   geom_ribbon(aes(ymin = mean_visits - se, ymax = mean_visits + se), alpha = 0.4, linetype = 'blank') +
#   xlab("Week") + ylab("Mean Weekly Total Visits") +
#   fig_theme(legend.position = 'none') 
# 
# #colony-level
# tot_visits_weekly <- data %>%
#   group_by(week, site) %>% 
#   summarize(mean_BV = mean(burrow_visits_day, na.rm = T), se = se(burrow_visits_day, na.rm = T)) #week averages across years
# ggplot(tot_visits_weekly, 
#        aes(week, mean_BV, group = site, color = site)) +
#   geom_line() + geom_point() + scale_x_continuous(limit = c(0,16), breaks = c(3,6,9,12),
#                                                 labels = c(3,6,9,12)) +
#   geom_ribbon(aes(ymin = mean_BV - se, ymax = mean_BV + se), alpha = 0.4, linetype = 'blank') +
#   xlab("Week") + ylab("Mean Weekly Burrow Visits") +
#   facet_wrap(~site) +
#   fig_theme(legend.position = 'none')
#   

```

```{r}

# Works from prey_weeks; its prey_week_range column is treated as the more
# believable measure until it has been explored further.

# Has the range of prey-delivery weeks changed over time, and does it differ
# among colonies? Mean prey_week_range for each year x colony, with year
# converted to a number for plotting.
prey_weeks_mean <- prey_weeks %>%
  group_by(year, site) %>%
  summarize(mean_cnt = mean(prey_week_range)) %>%
  transform(year = as.numeric(as.character(year)))

# Scatter with a dotted loess trend line. The figure is stored in `plot` and
# is not printed by this statement.
plot <- ggplot(prey_weeks_mean, aes(x = year, y = mean_cnt)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, col = 'black', linetype = 3) +
  plot_theme()

# Looks like a slight increase in the mean range of weeks over time... maybe
# coinciding with a potential increase in study length?
# summary(lm(mean_cnt ~ year, data = prey_weeks_mean))
  

# Denominators: number of distinct burrows recorded per colony per year.
tot_burrow <- summarize(
  group_by(prey_weeks, year, site),
  tot = n_distinct(burrow_name)
)

# Per-colony minimum, mean and maximum of the yearly burrow totals.
burrow_range <- summarize(
  group_by(tot_burrow, site),
  min = min(tot), mean = mean(tot), max = max(tot)
)

# Dot = mean, bar = min-max range; colonies listed down the vertical axis.
ggplot(burrow_range, aes(x = site, y = mean)) +
  geom_point() +
  geom_linerange(aes(ymin = min, ymax = max)) +
  coord_flip() +
  labs(x = '', y = 'Number of Active Burrows') +
  plot_theme()
# Smallest and largest colony means.
range(burrow_range$mean)

# Naive percent success at colony level: for each year x colony, the share of
# distinct burrows (tot, from tot_burrow) whose prey_week_range exceeds 2.
# NOTE(review): the island-level calculation uses a > 4 threshold -- confirm
# which one is intended.
# NOTE(review): merge() keeps only year x colony pairs present on both sides,
# so a colony-year with no burrow above the threshold is dropped rather than
# scored as 0 -- confirm that is acceptable.
perc_suc_col <- prey_weeks %>%
  filter(prey_week_range > 2) %>%
  group_by(year, site) %>%
  summarize(cnt = n_distinct(burrow_name)) %>%
  merge(tot_burrow, by = c('year', 'site')) %>%
  transform(perc_suc = round((cnt/tot), 2)) %>%
  transform(year = as.numeric(as.character(year)))
# Success is the response and year the predictor, so the slope reads as the
# change in success per year (the original formula had the two reversed).
summary(lm(perc_suc ~ year, data = perc_suc_col)) #perc success increasing over time

# Colony-year success against year with ggplot's default smoother.
ggplot(perc_suc_col, aes(year, perc_suc)) +
  geom_point() +
  geom_smooth() +
  fig_theme()

# Naive percent fledgling success at island level.
# Denominator: distinct burrows recorded in each year.
tot_burrow_isl <- prey_weeks %>%
  group_by(year) %>%
  summarize(tot = n_distinct(burrow_name))
# Numerator: distinct burrows per year whose prey_week_range exceeds 4.
# NOTE(review): the colony-level calculation uses a > 2 threshold -- confirm
# which one is intended.
perc_suc_isl <- prey_weeks %>%
  filter(prey_week_range > 4) %>%
  group_by(year) %>%
  summarize(cnt = n_distinct(burrow_name)) %>%
  merge(tot_burrow_isl, by = c('year')) %>%
  transform(perc_suc = round((cnt/tot), 2)) %>%
  transform(year = as.numeric(as.character(year)))

# Success is the response and year the predictor, so the slope reads as the
# change in success per year (the original formula had the two reversed).
summary(lm(perc_suc ~ year, data = perc_suc_isl)) #increasing success

# Two-panel figure: (a) active burrows per year, (b) naive nest success per
# year. Per the original note, tot_nest comes from an earlier chunk.
# (Unused alternatives: par(mfrow = c(1,2)); facet_wrap(~site) on panel a.)
plot_a <- ggplot(tot_nest, aes(x = year, y = tot)) +
  geom_point() +
  geom_line() +
  geom_smooth() +
  labs(x = '', y = 'Active Burrows') +
  theme(axis.text.x = element_text(size = 10, angle = 90)) +
  scale_x_continuous(breaks = seq(2008, 2016, by = 2)) +
  fig_theme()

plot_b <- ggplot(perc_suc_isl, aes(x = year, y = perc_suc)) +
  geom_point() +
  geom_smooth() +
  geom_line() +
  labs(x = '', y = 'Naive Nest Success') + #here
  scale_x_continuous(breaks = seq(2008, 2016, by = 2)) +
  fig_theme()

# Side-by-side panels labelled (a) and (b), then the mean island-level success.
plot_grid(plot_a, plot_b, labels = c('(a)', '(b)'), label_size = 10)
mean(perc_suc_isl$perc_suc)

##write.csv(perc_suc_col, "perc_suc_col.csv", row.names = F)
##write.csv(perc_suc_isl, "perc_suc_isl.csv", row.names = F)


```

```{r}

# Number of potential fledglings per colony and at island level ----

# Colony level: count of distinct burrows per year x colony whose
# prey_week_range exceeds 2, with year converted to a number.
fledge_col <- prey_weeks %>%
  filter(prey_week_range > 2) %>%
  group_by(year, site) %>%
  summarize(cnt = n_distinct(burrow_name)) %>%
  transform(year = as.numeric(as.character(year)))
# Fit on fledge_col (the table built just above; the original referenced an
# object named `fledge`) with the count as the response and year as the
# predictor, so the slope is the change in count per year.
summary(lm(cnt ~ year, data = fledge_col)) #no change over time

# NOTE(review): fig_theme_1() is not defined in the visible part of this
# file -- confirm it exists (fig_theme() is the helper used in nearby chunks).
ggplot(fledge_col, aes(year, cnt)) +
  geom_point() +
  geom_smooth() +
  fig_theme_1()

# Island level: count of distinct burrows per year whose prey_week_range
# exceeds 2, with year converted to a number.
fledge_isl <- prey_weeks %>%
  filter(prey_week_range > 2) %>%
  group_by(year) %>%
  summarize(cnt = n_distinct(burrow_name)) %>%
  transform(year = as.numeric(as.character(year)))
# Count is the response and year the predictor, so the slope is the change in
# count per year (the original formula had the two reversed).
summary(lm(cnt ~ year, data = fledge_isl)) #no change over time

# NOTE(review): fig_theme_1() is not defined in the visible part of this
# file -- confirm it exists (fig_theme() is the helper used in nearby chunks).
ggplot(fledge_isl, aes(year, cnt)) +
  geom_point() +
  geom_smooth() +
  fig_theme_1()

```

### Prey composition

Not sure we have a use for this, but I was mainly interested in looking for (in)consistencies and general patterns.
```{r, results = 'hide', message = FALSE}
# Prey delivery composition ----

# Island level over the season: prey-type totals for each week x year (only
# records with at least one delivery), averaged across years, then reshaped
# to long form (one row per week x prey type).
# NOTE(review): unlike the colony-level summary, no na.rm is used here, so a
# missing prey count propagates to the weekly value -- confirm intended.
isl_prey_comp <- data_burrow %>%
  filter(tot_prey > 0) %>%
  group_by(week, year) %>%
  summarize(
    Gunnel = sum(Gunnel),
    Sculpin = sum(Sculpin),
    Other = sum(Other)
  ) %>%
  group_by(week) %>%
  summarize(
    Gunnel = mean(Gunnel),
    Sculpin = mean(Sculpin),
    Other = mean(Other)
  ) %>%
  melt(id.vars = "week")

# Dodged bars: one bar per prey type within each week.
ggplot(
  isl_prey_comp,
  aes(x = week, y = value, group = variable, fill = variable)
) +
  geom_bar(stat = 'identity', position = 'dodge') +
  labs(x = "Week", y = "Mean Prey Deliveries") +
  scale_fill_manual(values = color3) +
  fig_theme_1(legend.position = 'top')

# Long-format table of every dated, non-zero prey count with its day of year;
# input for the density plot that is currently commented out below.
isl_prey_comp_dens <- data_burrow %>%
  select(date, Gunnel, Sculpin, Other) %>%
  melt(id.vars = "date") %>%
  filter(value > 0) %>%
  transform(yday = yday(date))
# ggplot(isl_prey_comp_dens, aes(yday, color = variable)) +
#   geom_line(stat = 'density') + 
#   geom_rug(sides = "b", aes(y = 0), position = 'jitter', alpha = 0.25) +
#   xlab("") + ylab("Density") + 
#   fig_theme() + scale_color_manual(values = color3) +
#   scale_fill_manual(values = color3) +
#   scale_y_continuous(limit = c(0, 0.03)) +
#   scale_x_continuous(limit = c(150, 270), breaks = c(150, 180, 210, 240, 270),
#                      labels = c('Jun', 'Jul', 'Aug', 'Sep', 'Oct'))

# Colony level over the season: mean count of each prey type for each
# week x site (only records with at least one delivery, missing values
# ignored), reshaped to long form.
col_prey_comp <- data_burrow %>%
  filter(tot_prey > 0) %>%
  group_by(week, site) %>%
  summarize(
    Gunnel = mean(Gunnel, na.rm = TRUE),
    Sculpin = mean(Sculpin, na.rm = TRUE),
    Other = mean(Other, na.rm = TRUE)
  ) %>%
  melt(id.vars = c("week", "site"))

# Dodged bars per prey type, one panel per colony.
ggplot(col_prey_comp, aes(x = week, y = value, fill = variable)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  labs(x = "Week", y = "Mean Prey Deliveries") +
  facet_wrap(~site) +
  scale_fill_manual(values = color3) +
  fig_theme(legend.position = 'bottom')

```

```{r}

# Map outlines from the maps package: the Washington state polygon and the
# Island County polygon. Within the active code of this chunk neither object
# is used; islcounty_outline appears only in a commented-out geom_path layer.
wa_state <- map_data("state", region = c("washington")) 
islcounty_outline <- map_data("county", region = c("washington"), subregion = 'island') %>%
  filter(subregion == 'island')

# Colony locations plotted on the map: 26 colony names with hard-coded
# latitude / longitude in decimal degrees. The three vectors are matched by
# position, so entries must stay in the same order across Colony, lat, long.
col_locs <- data.frame(Colony = c('Deception Pass', 'Cliffside', 'Swantown', 'Hastie Lake', 'Fort Casey', 'Keystone', 'Ledgewood', 'Hancock', 'Lagoon N.', 'Lagoon S.', 'Malmo Bluff', 'Shore Meadows', 'Mutiny Sands', 'Limpet Lane', 'Double Bluff', 'Possession Point', 'Langley', 'Pratts Bluff', 'Harrington', 'Coupeville',
                                  'Rolling Hills', 'Monroe Landing', 'Maylor Point', 'Forbes Point',
                                  'Crescent Harbor', 'Mariners Cove'), 
                       lat = c(48.401, 48.368, 48.299, 48.264, 48.1719, 48.163, 48.1416, 48.12, 48.09,
                               48.068, 48.0255, 48.0113, 48.008, 47.98, 47.9717, 47.906, 48.0363, 48.1168,
                               48.2099, 48.2236, 48.2332, 48.2378, 48.268, 48.2731, 48.2974, 48.299),
                       long = c(-122.655, -122.6697, -122.725, -122.749, -122.687, -122.655, -122.606,
                                -122.60, -122.60, -122.610, -122.5948, -122.582, -122.562, -122.5488,
                                -122.5344, -122.3862, -122.3931, -122.559, -122.6077, -122.673, -122.7308,
                                -122.695, -122.6414, -122.6249, -122.5876, -122.505))
# Load the US states shapefile (layer gz_2010_us_040_00_500k) and keep
# Washington as a plain table for plotting.
# NOTE(review): rgdal was retired from CRAN in 2023, so readOGR() will not be
# available on a fresh install; replacing it (e.g. with sf) needs a new
# dependency and is left for a deliberate migration.
library(rgdal)
library(broom)
#wd <- getwd()
# NOTE(review): hard-coded path into a different project's data directory
# ("Stellers") -- this chunk only runs on a machine that has that folder.
wd <- "~/Documents/SAFS/Stellers/Data/"
#dir_spatial <- paste0(wd, "/Data/")
dir_shp <- paste0(wd, "gz_2010_us_040_00_500k")
us_shp <- readOGR(dsn = dir_shp, layer = "gz_2010_us_040_00_500k")

# Subset to the Washington feature, then flatten it with tidy() into a
# data.table; the map below reads its long, lat and group columns.
wa_shp <- us_shp[us_shp$NAME == 'Washington',]
wa <- tidy(wa_shp, region = "GEO_ID") %>%
  data.table() 

# Study-area map: Washington polygon in light grey, cropped to the
# latitude / longitude window given in coord_fixed() (aspect ratio 1.5), with
# one point per colony from col_locs. Axes are blanked and a black panel
# border is drawn. Commented-out layers below are earlier labelling attempts.
# NOTE(review): xlim is given larger value first (-122.168, -122.93) --
# confirm the installed ggplot2 does not flip the axis for reversed limits.
ggplot() +
geom_polygon(data = wa, aes(x = long, y = lat, group = group), 
             fill = "grey93", color = "grey50", size = 0.2) + 
  xlab("") + ylab("") + coord_fixed(1.5, ylim = c(47.891, 48.4599),
                                    xlim = c(-122.168, -122.93)) +
    geom_point(data = col_locs, aes(x = long, y = lat), size = 0.6) +
# annotate("text", x = max(col_locs$long) + 2.6,
#                  y = min(col_locs$lat) + 4.25 - 0.18*(26:1),
#          label = col_locs$Colony, size = 2.5, hjust = 0, fontface = 'bold') +
  #geom_path(data = islcounty_outline, colour = "grey20", size = 0.3) +
  # geom_label_repel(data = col_locs, inherit.aes = FALSE,
  #                  aes(x = long, y = lat, label = Colony), show.legend = TRUE,
  #   fontface = 'bold', size = 1.5, color = 'black', fill = 'white', force = 1, point.padding = .5) +
theme_classic() +
   theme(
    axis.line = element_blank(),
    axis.text = element_blank(), axis.ticks = element_blank(),
    strip.text = element_text(size = 9),
    strip.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA, size = 1))

```

## Archive
```{r}
#peak/trimmed counts - before having added covariates

################################################################################################
################################################################################################
##################################       Whidbey       #########################################
################################################################################################
################################################################################################

# #mean counts per week in each year and region
# peak_mean_w <- data_count %>% 
#   filter(region == 'Whidbey') %>%
#   filter(intern_data != 'on') %>%
#   group_by(year, week) %>%
#   summarize(mean_cnt = mean(PG_count, na.rm = T), se = se(PG_count, na.rm = T)) 
# 
# #sum of weekly counts at the regional level each year
# peak_sum_w <- data_count %>%
#   filter(region == 'Whidbey') %>%
#   filter(intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(sum_cnt = sum(PG_count, na.rm = T)) %>%
#   transform(quant = 0.9*sum_cnt)
# 
# #colony noise makes it harder to see the peaks when looking at means
# peak_plot_mean <- ggplot(peak_mean_w, aes(week, mean_cnt)) +
#   geom_line() + 
#   geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.4, linetype = 'blank') +
#   plot_theme(legend.position = 'none') + facet_wrap(~year) #+
#  #guides(color = guide_legend(override.aes=list(fill=NA)))
# 
# #for frances
# #apparent seasonal peak when looking at sum counts across the regions each week
# peak_plot_sum <- ggplot(peak_sum_w, aes(week, sum_cnt)) +
#   geom_line() + xlab("Week of the Year") + ylab("Total Guillemot Count") +
#   facet_wrap(~year) +
#   plot_theme() 
# 
# #taking max - this looks much better/clearer than the mean
# max_yr_w <- data_count %>%
#   filter(region == 'Whidbey' & intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(sum = sum(PG_count, na.rm = T)) %>% #first sum counts across colonies each week
#   group_by(year) %>%
#   summarize(max_cnt = max(sum, na.rm = T)) %>% #then take the week w/ max count
#   transform(quant = 0.9*max_cnt)
# week_perc_w <- data_count %>%
#   filter(region == 'Whidbey' & intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(cnt = sum(PG_count, na.rm = T)) %>% #sum across colonies
#   merge(max_yr_w, by = c('year')) %>%
#   transform(perc_week = cnt/max_cnt)  #proportion of max that each week comprises
# 
# #only keep weeks where count was at least 70% of max count in that year
# #first tried 50%, but that captured a steep downturn for a few years, this looks much better
# isl_trim <- week_perc_w %>%
#   filter(perc_week > 0.5) %>%
#   select(-c(max_cnt, perc_week)) %>%
#   transform(region = 'Whidbey')
# check_plot <- ggplot(isl_trim, aes(week, cnt, group = year, color = year)) +
#   geom_line() + geom_point() +
#   facet_wrap(~year) +
#   plot_theme(legend.position = 'none')
# 
# #colony level
# max_col_yr_w <- data_count %>%
#   filter(region == 'Whidbey' & intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, site) %>%
#   summarize(max_cnt = max(PG_count, na.rm = T)) 
# week_perc_yr_w <- data_count %>%
#   filter(region == 'Whidbey' & intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   group_by(week, year, site) %>%
#   distinct() %>%
#   merge(max_col_yr_w, by = c('year', 'site')) %>%
#   transform(perc_week = PG_count/max_cnt) %>%
#   transform(max_week = ifelse(max_cnt == PG_count, 'y', 'n'))
# 
# max_week <- week_perc_yr_w %>%
#   filter(max_week == 'y') %>%
#   select(year, site, week, PG_count) %>%
#   transform(min_week = week-3, max_week = week) %>%
#   select(-c(week, PG_count))
# 
# # col_trim <- week_perc_yr_w %>%
# #   filter(perc_week > 0.9) %>%
# #   select(-c(max_cnt, perc_week)) %>%
# #   transform(region = 'Whidbey')
# 
# col_trim <- week_perc_yr_w %>% select(-max_week) %>%
#   merge(max_week, by = c('year', 'site')) %>%
#   filter(week <= max_week & week >= min_week)
#   
# check_plot <- ggplot(col_trim, aes(week, PG_count, group = year, color = year)) +
#   geom_line() + geom_point() +
#   facet_wrap(~site) +
#   plot_theme(legend.position = 'none')
# 
# #mean annual per site, go back up to max_week to adjust
# mean_trim <- col_trim %>% 
#   group_by(site, week) %>%
#   summarize(mean_cnt = mean(PG_count))
# 
# ggplot(mean_trim, aes(week, mean_cnt)) +
#   geom_line() + geom_point() +
#   facet_wrap(~site) +
#   plot_theme(legend.position = 'none')
# 
# #sum to island level
# isl_sum_trim <- col_trim %>%
#   group_by(year, week) %>%
#   summarize(cnt = mean(PG_count))
# 
# ggplot(isl_sum_trim, aes(week, cnt, group = year, color = year)) +
#   geom_line() + geom_point() +
#   facet_wrap(~year) +
#   plot_theme(legend.position = 'none')
# 
# # col_dat <- col_trim %>%
# #   merge(covs_col, by = c('year', 'site', 'week', 'PG_count'), all.x = T, all.y = F) %>%
# #   filter(bv != 'NA')
# 
# count_dat <- col_trim
# 
# summary(lm(PG_count ~ week + year, count_dat))

```

```{r}

################################################################################################
################################################################################################
##################################    South Sound      #########################################
################################################################################################
################################################################################################

# #mean counts per week in each year and region
# peak_mean_SS <- data_count %>% 
#   filter(region == 'SS') %>%
#   filter(intern_data != 'on') %>%
#   group_by(year, week) %>%
#   summarize(mean_cnt = mean(PG_count, na.rm = T), se = se(PG_count, na.rm = T)) 
# 
# #sum of weekly counts at the regional level each year
# peak_sum_SS <- data_count %>%
#   filter(region == 'SS') %>%
#   filter(intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(sum_cnt = sum(PG_count, na.rm = T))
# 
# #colony noise makes it harder to see the peaks when looking at means
# peak_plot_mean <- ggplot(peak_mean_SS, aes(week, mean_cnt)) +
#   geom_line() + 
#   geom_ribbon(aes(ymin = mean_cnt - se, ymax = mean_cnt + se), alpha = 0.4, linetype = 'blank') +
#   plot_theme(legend.position = 'none') + facet_wrap(~year) #+
#  #guides(color = guide_legend(override.aes=list(fill=NA)))
# 
# #apparent seasonal peak when looking at sum counts across the regions each week
# peak_plot_sum <- ggplot(peak_sum_SS, aes(week, sum_cnt)) +
#   geom_line() + 
#   #facet_wrap(~year) +
#   plot_theme(legend.position = 'none') 
# 
# #taking max - this looks much better/clearer than the mean
# max_yr_SS <- data_count %>%
#   filter(region == 'SS') %>%
#   filter(intern_data != 'on') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(sum = sum(PG_count, na.rm = T)) %>% #first sum counts across colonies each week
#   group_by(year) %>%
#   summarize(max_cnt = max(sum, na.rm = T))  #then take the week w/ max count
# week_perc_SS <- data_count %>%
#   filter(intern_data != 'on') %>%
#   filter(region == 'SS') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, week) %>%
#   summarize(cnt = sum(PG_count, na.rm = T)) %>% #sum across colonies
#   merge(max_yr_SS, by = c('year')) %>%
#   transform(perc_week = cnt/max_cnt)  #proportion of max that each week comprises
# 
# #only keep weeks where count was at least 65% of max count in that year
# #first tried 50%, but that captured a steep downturn for a few years, this looks much better
# weekly_count_isl_trim_SS <- week_perc_SS %>%
#   filter(perc_week > 0.499) %>%
#   select(-c(max_cnt, perc_week)) %>%
#   transform(region = 'SS')
# # check_plot <- ggplot(weekly_count_isl_trim, aes(week, cnt, group = year, color = year)) +
# #   geom_line() + geom_point() +
# #   facet_wrap(~year) +
# #   plot_theme(legend.position = 'none') 
# 
# #colony level
# max_col_yr_SS <- data_count %>%
#   filter(intern_data != 'on') %>%
#   filter(region == 'SS') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   group_by(year, site) %>%
#   summarize(max_cnt = max(PG_count, na.rm = T)) 
# week_perc_yr_SS <- data_count %>%
#   filter(intern_data != 'on') %>%
#   filter(region == 'SS') %>%
#   select(year, week, site, PG_count) %>%
#   distinct() %>%
#   merge(max_col_yr_SS, by = c('year', 'site')) %>%
#   transform(perc_week = PG_count/max_cnt)
# 
# weekly_count_col_trim_SS <- week_perc_yr_SS %>%
#   filter(perc_week > 0.49) %>%
#   select(-c(max_cnt, perc_week)) %>%
#   transform(region = 'SS')
# # check_plot <- ggplot(weekly_count_col_trim, aes(week, PG_count, group = year, color = year)) +
# #   geom_line() + geom_point() +
# #   facet_wrap(~site) +
# #   plot_theme(legend.position = 'none')

########combined regions##########

# weekly_count_isl_trim <- weekly_count_isl_trim_w
# weekly_count_col_trim <- weekly_count_col_trim_w
# # weekly_count_isl_trim <- bind_rows(weekly_count_isl_trim_w, weekly_count_isl_trim_SS)
# # weekly_count_col_trim <- bind_rows(weekly_count_col_trim_w, weekly_count_col_trim_SS)
# 
# ### count data moving forward: weekly_count_col_trim and weekly_count_isl_trim
# write.csv(weekly_count_col_trim, "weekly_count_col_trim.csv", row.names = F)
# write.csv(weekly_count_isl_trim, "weekly_count_isl_trim.csv", row.names = F)
```