From 4c803220b4715ee054c1e7ff3e21c6930c342554 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Tue, 15 Oct 2019 16:37:18 -0400
Subject: [PATCH 01/11] added updated thredds function

---
 modules/data.remote/R/download.thredds.R | 194 ++++++++++++++---------
 1 file changed, 122 insertions(+), 72 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 4b3fadacd2f..7c1babaa7a7 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -1,17 +1,27 @@
 #
-##' @title download.thredds.AGB
-##' @name download.thredds.AGB
+##' @title download.thredds.data
+##' @name download.thredds.data
 ##'
-##' @param outdir Where to place output
-##' @param site_ids What locations to download data at?
+##' @param outdir file location to place output
+##' @param site_info information about the site, i.e. site_id, latitude, longitude
+##' @param dates character vector of start and end date for dataset as YYYYmmdd
+##' @param varid character vector of shorthand variable name, i.e. LAI
+##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
+##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
 ##' @param run_parallel Logical. Download and extract files in parallel?
-##' @param ncores Optional. If run_parallel=TRUE how many cores to use? If left as NULL will select max number -1
 ##'
 ##' @return data.frame summarize the results of the function call
 ##'
 ##' @examples
 ##' \dontrun{
-##' outdir <- "~/scratch/abg_data/"
+##' outdir <- directory to store downloaded data
+##' site_info <- dataframe that contains information about site_id, latitude, longitude, and site_names
+##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
+##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
+##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
+##' data_url <- OpenDAP URL that actually downloads the netcdf file.
+##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
+##'
 ##' results <- PEcAn.data.remote::download.thredds.AGB(outdir=outdir,
 ##'           site_ids = c(676, 678, 679, 755, 767, 1000000030, 1000000145, 1000025731),
 ##'           run_parallel = TRUE, ncores = 8)
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download.thredds.AGB <- function(outdir = NULL, site_ids, run_parallel = FALSE,
-                                 ncores = NULL) {
+download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201", "19961215"),
+                                  varid = "LAI",
+                                  dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
+                                  data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
+                                  run_parallel = TRUE) {
+  # require("XML")
+  # require("RCurl")
+  require("foreach")
+
+  # check that dates are within the date range of the dataset
+  dates = c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  if (!(is.null(dir_url)))
+  {
+    #https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files/1981/catalog.html -> link for directory files, not downloads
+    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=TRUE, dirlistonly = TRUE)
+    files = XML::getHTMLLinks(result)
+
+    date_year_range = unique(range(c(year(as.Date(dates[1], "%Y")), year(as.Date(dates[2], "%Y")))))
+    if (all((!(substr(files, 1, 4) %in% date_year_range))))
+    {
+      # give warning that dates aren't available
+      warning("requested dates are not available for this dataset")
+    }
+
+  }
+
+  # get list of catalog file links to determine actual dates that can be downloaded within the user's range
+  links = vector()
+  for (i in 1:length(date_year_range))
+  {
+    links[i] = RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=T, dirlistonly = T)
+  }
+  # get list of all dates available from year range provided
+  files = foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
-  bety <- list(user='bety', password='bety', host='localhost',
-               dbname='bety', driver='PostgreSQL',write=TRUE)
-  con <- PEcAn.DB::db.open(bety)
-  bety$con <- con
-  site_ID <- as.character(site_ids)
-  suppressWarnings(site_qry <- glue::glue_sql("SELECT *, ST_X(ST_CENTROID(geometry)) AS lon,
-                                              ST_Y(ST_CENTROID(geometry)) AS lat FROM sites WHERE id IN ({ids*})",
-                                              ids = site_ID, .con = con))
-  suppressWarnings(qry_results <- DBI::dbSendQuery(con,site_qry))
-  suppressWarnings(qry_results <- DBI::dbFetch(qry_results))
-  site_info <- list(site_id=qry_results$id, site_name=qry_results$sitename, lat=qry_results$lat,
-                    lon=qry_results$lon, time_zone=qry_results$time_zone)
+  #remove files with no dates and get list of dates available.
+  index_dates = regexpr(pattern = "[0-9]{8}", files)
+  files = files[-(which(index_dates < 0))]
+  index_dates = index_dates[which(index_dates > 0)]
 
-  mylat = site_info$lat
-  mylon = site_info$lon
+  # get list of files that fall within the specific date range user asks for (Ymd, not Y)
+  dates_avail = as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
+  date_range = seq(dates[1], dates[2], by = "day")
+  get_dates = date_range[which(date_range %in% dates_avail)]
 
-  # site specific URL for dataset --> these will be made to work for all THREDDS datasets in the future, but for now, just testing with
-  # this one dataset. This specific dataset only has 1 year (2005), so no temporal looping for now.
-  obs_file = "https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1221/agb_5k.nc4"
-  obs_err = "https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1221/agb_SE_5k.nc4"
-  files = c(obs_file, obs_err)
+  # only keep files that are within the true yyyymmdd date range user requested
+  files = files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
+  filenames = basename(files)
 
-  # function to extract ncdf data from lat and lon values for value + SE URLs
-  get_data = function(i)
+  # the user must supply a data_url or the netcdf files cannot be downloaded through thredds. if no data_url is supplied, the job will fail,
+  # so supply a warning
+  if (!(is.null(data_url)))
   {
-    data = ncdf4::nc_open(files[1])
-    agb_lats = ncdf4::ncvar_get(data, "latitude")
-    agb_lons = ncdf4::ncvar_get(data, "longitude")
-
-    agb_x = which(abs(agb_lons- mylon[i]) == min(abs(agb_lons - mylon[i])))
-    agb_y = which(abs(agb_lats- mylat[i]) == min(abs(agb_lats - mylat[i])))
-
-    start = c(agb_x, agb_y)
-    count = c(1,1)
-    d = ncdf4::ncvar_get(ncdf4::nc_open(files[1]), "abvgrndbiomass", start=start, count = count)
-    if (is.na(d)) d <- NA
-    sd = ncdf4::ncvar_get(ncdf4::nc_open(files[2]), "agbSE", start=start, count = count)
-    if (is.na(sd)) sd <- NA
-    date = "2005"
-    site = site_ID[i]
-    output = as.data.frame(cbind(d, sd, date, site))
-    names(output) = c("value", "sd", "date", "siteID")
+    #https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1981/AVHRR-Land_v005_AVH15C1_NOAA-07_19810624_c20181025194251.nc.html
+    # this is what a link looks like to download thredds data.
+    urls = sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    # option to save output dataset to directory for user.
-    if (!(is.null(outdir)))
+    extract_nc = function(site_info, url, run_parallel)
     {
-      write.csv(output, file = paste0(outdir, "THREDDS_", sub("^([^.]*).*", "\\1",basename(files[1])), "_site_", site, ".csv"), row.names = FALSE)
+      require("foreach")
+      require("ncdf4")
+
+      mylats = site_info$lat
+      mylons = site_info$lon
+      sites = site_info$site_id
+
+      # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
+      data = ncdf4::nc_open(url)
+      vars = names(data$var)
+      var = vars[grep(vars, pattern = varid, ignore.case = T)]
+
+      # get list of all xy coordinates in netcdf
+      lats = ncdf4::ncvar_get(data, "latitude")
+      lons = ncdf4::ncvar_get(data, "longitude")
+
+      # find the cell that site coordinates are located in
+      dist_y = foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+      dist_x = foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+      y = foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = T)
+      x = foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = T)
+
+      scale = data$var[[var]]$scaleFact
+
+      d = as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+
+      info = as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = F)
+      names(info) = c("site_id", "lon", "lat", "value")
+
+      return(info)
     }
-    return(output)
-  }
-
-  ## setup parallel
-  if (run_parallel) {
-    if (!is.null(ncores)) {
-      ncores <- ncores
-    } else {
-      ncores <- parallel::detectCores() -1
-    }
-    require(doParallel)
-    PEcAn.logger::logger.info(paste0("Running in parallel with: ", ncores))
-    cl = parallel::makeCluster(ncores)
-    doParallel::registerDoParallel(cl)
-    data = foreach(i = seq_along(mylat), .combine = rbind) %dopar% get_data(i)
-    stopCluster(cl)
-  } else {
-    # setup sequential run
-    data = data.frame()
-    for (i in seq_along(mylat))
+
+    if (run_parallel)
     {
-      data = rbind(data, get_data(i))
+      require("parallel")
+      require("doParallel")
+      ncores = parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      if (ncores >= 3)
+      {
+        # failsafe in case someone has a computer with 2 cores.
+        ncores = ncores-2
+      }
+      # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
+      if (ncores >= 10)
+      {
+        ncores = 9 # went 1 less because it still fails sometimes
+      }
+      cl <- parallel::makeCluster(ncores, outfile="")
+      doParallel::registerDoParallel(cl)
+      output = foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
+      stopCluster(cl)
+    } else {
+      output = foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+    }
+
+    return(output)
+
   }
-
-  return(data)
 }
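
A note on the nearest-cell search added in extract_nc() above: sqrt((lats - i)^2) is just abs(lats - i), so each site lookup reduces to an argmin over absolute coordinate differences. A minimal standalone sketch of the same idea, with a toy 0.05-degree grid and hypothetical site coordinates (no ncdf4 needed):

    lats <- seq(90, -90, by = -0.05)    # toy AVHRR-like latitude axis
    lons <- seq(-180, 180, by = 0.05)   # toy longitude axis
    site_lat <- 45.2041                 # hypothetical site
    site_lon <- -68.7403
    y <- which.min(abs(lats - site_lat))  # row index of nearest grid cell
    x <- which.min(abs(lons - site_lon))  # column index of nearest grid cell
    c(x = x, y = y)

which.min() returns the first index of the minimum, matching what the foreach/which construction above computes one column at a time.
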
From 2071b3eb7492e1ad7c886f45ce1c7111d2ce174d Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 12:53:08 -0400
Subject: [PATCH 02/11] split BETY site query into get_site_info; rename to
 download_thredds

---
 modules/data.remote/R/download.thredds.R | 151 +++++++++++++++--------
 1 file changed, 97 insertions(+), 54 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 7c1babaa7a7..aed5d5ed835 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -1,9 +1,58 @@
-#
-##' @title download.thredds.data
-##' @name download.thredds.data
+##' @title get_site_info
+##' @name get_site_info
+##'
+##'
+##' @param xmlfile full path to pecan xml settings file
+##'
+##'
+##' @return a list of site information derived from BETY using a pecan .xml settings file with site_id, site_name, lat, lon, and time_zone.
+##'
+##' @examples
+##' \dontrun{
+##' xmlfile <- the full path to a pecan .xml settings file.
+##'
+##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+##'
+##' @export
+##' @author Bailey Morrison
+##'
+get_site_info <- function(xmlfile) {
+  require(PEcAn.all)
+
+  settings <- read.settings(xmlfile)
+
+  observation <- c()
+  for (i in seq_along(settings$run)) {
+    command <- paste0("settings$run$settings.", i, "$site$id")
+    obs <- eval(parse(text = command))
+    observation <- c(observation, obs)
+  }
+
+  PEcAn.logger::logger.info("**** Querying BETY for site info at the requested sites ****")
+  bety <- list(user = 'bety', password = 'bety', host = 'localhost',
+               dbname = 'bety', driver = 'PostgreSQL', write = TRUE)
+  con <- PEcAn.DB::db.open(bety)
+  bety$con <- con
+  site_ID <- observation
+  suppressWarnings(site_qry <- glue::glue_sql("SELECT *, ST_X(ST_CENTROID(geometry)) AS lon,
+                                              ST_Y(ST_CENTROID(geometry)) AS lat FROM sites WHERE id IN ({ids*})",
+                                              ids = site_ID, .con = con))
+  suppressWarnings(qry_results <- DBI::dbSendQuery(con,site_qry))
+  suppressWarnings(qry_results <- DBI::dbFetch(qry_results))
+  site_info <- list(site_id = qry_results$id, site_name = qry_results$sitename, lat = qry_results$lat,
+                    lon = qry_results$lon, time_zone = qry_results$time_zone)
+  return(site_info)
+}
+
+
+##' @title download.thredds
+##' @name download.thredds
+##'
 ##'
 ##' @param outdir file location to place output
-##' @param site_info information about the site, i.e. site_id, latitude, longitude
+##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' @param dates character vector of start and end date for dataset as YYYYmmdd
 ##' @param varid character vector of shorthand variable name, i.e. LAI
 ##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
 ##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
@@ -15,7 +64,7 @@
 ##' @examples
 ##' \dontrun{
 ##' outdir <- directory to store downloaded data
-##' site_info <- dataframe that contains information about site_id, latitude, longitude, and site_names
+##' site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
 ##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
 ##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
 ##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
 ##' data_url <- OpenDAP URL that actually downloads the netcdf file.
 ##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
 ##'
@@ -23,62 +72,56 @@
-##' results <- PEcAn.data.remote::download.thredds.AGB(outdir=outdir,
-##'           site_ids = c(676, 678, 679, 755, 767, 1000000030, 1000000145, 1000025731),
-##'           run_parallel = TRUE, ncores = 8)
+##' results <- download_thredds(outdir = NULL, site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE)
+##'
 ##'
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201", "19961215"),
-                                  varid = "LAI",
-                                  dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
-                                  data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
-                                  run_parallel = TRUE) {
-  # require("XML")
-  # require("RCurl")
+download_thredds <- function(outdir = NULL, site_info, dates, varid, dir_url, data_url,run_parallel = TRUE) {
+
   require("foreach")
 
   # check that dates are within the date range of the dataset
-  dates = c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
   if (!(is.null(dir_url)))
   {
     #https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files/1981/catalog.html -> link for directory files, not downloads
-    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=TRUE, dirlistonly = TRUE)
-    files = XML::getHTMLLinks(result)
+    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
+    files <- XML::getHTMLLinks(result)
 
-    date_year_range = unique(range(c(year(as.Date(dates[1], "%Y")), year(as.Date(dates[2], "%Y")))))
+    date_year_range <- unique(range(c(lubridate::year(as.Date(dates[1], "%Y")), lubridate::year(as.Date(dates[2], "%Y")))))
     if (all((!(substr(files, 1, 4) %in% date_year_range))))
     {
       # give warning that dates aren't available
-      warning("requested dates are not available for this dataset")
+      PEcAn.logger::logger.warn("Requested dates are not available for this dataset.")
     }
 
   }
 
   # get list of catalog file links to determine actual dates that can be downloaded within the user's range
-  links = vector()
+  links <- vector()
   for (i in 1:length(date_year_range))
   {
-    links[i] = RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=T, dirlistonly = T)
+    links[i] <- RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
   }
   # get list of all dates available from year range provided
-  files = foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
+  files <- foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
   #remove files with no dates and get list of dates available.
-  index_dates = regexpr(pattern = "[0-9]{8}", files)
-  files = files[-(which(index_dates < 0))]
-  index_dates = index_dates[which(index_dates > 0)]
+  index_dates <- regexpr(pattern = "[0-9]{8}", files)
+  files <- files[-(which(index_dates < 0))]
+  index_dates <- index_dates[which(index_dates > 0)]
 
   # get list of files that fall within the specific date range user asks for (Ymd, not Y)
-  dates_avail = as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
-  date_range = seq(dates[1], dates[2], by = "day")
-  get_dates = date_range[which(date_range %in% dates_avail)]
+  dates_avail <- as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
+  date_range <- seq(dates[1], dates[2], by = "day")
+  get_dates <- date_range[which(date_range %in% dates_avail)]
 
   # only keep files that are within the true yyyymmdd date range user requested
-  files = files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
-  filenames = basename(files)
+  files <- files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
+  filenames <- basename(files)
 
   # the user must supply a data_url or the netcdf files cannot be downloaded through thredds. if no data_url is supplied, the job will fail,
   # so supply a warning
@@ -86,38 +129,38 @@ download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201"
   {
     #https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1981/AVHRR-Land_v005_AVH15C1_NOAA-07_19810624_c20181025194251.nc.html
     # this is what a link looks like to download thredds data.
-    urls = sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
+    urls <- sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    extract_nc = function(site_info, url, run_parallel)
+    extract_nc <- function(site_info, url, run_parallel)
     {
       require("foreach")
       require("ncdf4")
 
-      mylats = site_info$lat
-      mylons = site_info$lon
-      sites = site_info$site_id
+      mylats <- site_info$lat
+      mylons <- site_info$lon
+      sites <- site_info$site_id
 
       # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
-      data = ncdf4::nc_open(url)
-      vars = names(data$var)
-      var = vars[grep(vars, pattern = varid, ignore.case = T)]
+      data <- ncdf4::nc_open(url)
+      vars <- names(data$var)
+      var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
 
       # get list of all xy coordinates in netcdf
-      lats = ncdf4::ncvar_get(data, "latitude")
-      lons = ncdf4::ncvar_get(data, "longitude")
+      lats <- ncdf4::ncvar_get(data, "latitude")
+      lons <- ncdf4::ncvar_get(data, "longitude")
 
       # find the cell that site coordinates are located in
-      dist_y = foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
-      dist_x = foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
-      y = foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = T)
-      x = foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = T)
+      dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+      dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+      y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
+      x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
 
-      scale = data$var[[var]]$scaleFact
+      scale <- data$var[[var]]$scaleFact
 
-      d = as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+      d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
 
-      info = as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = F)
-      names(info) = c("site_id", "lon", "lat", "value")
+      info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
+      names(info) <- c("site_id", "lon", "lat", "value")
 
       return(info)
     }
@@ -128,23 +171,23 @@ download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201"
     {
       require("parallel")
       require("doParallel")
-      ncores = parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      ncores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)
       if (ncores >= 3)
       {
         # failsafe in case someone has a computer with 2 cores.
-        ncores = ncores-2
+        ncores <- ncores-2
       }
       # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
      if (ncores >= 10)
       {
-        ncores = 9 # went 1 less because it still fails sometimes
+        ncores <- 9 # went 1 less because it still fails sometimes
       }
       cl <- parallel::makeCluster(ncores, outfile="")
       doParallel::registerDoParallel(cl)
-      output = foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
       stopCluster(cl)
     } else {
-      output = foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
     }
 
     return(output)
From 0ff2b048ffb869de53ec32297cc41dba63e520c2 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:08:05 -0400
Subject: [PATCH 03/11] added outdir option in function

---
 modules/data.remote/R/download.thredds.R | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index aed5d5ed835..0f94781948e 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -72,13 +72,13 @@ get_site_info <- function(xmlfile) {
 ##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
 ##'
-##' results <- download_thredds(outdir = NULL, site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE)
+##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
 ##'
 ##'
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(outdir = NULL, site_info, dates, varid, dir_url, data_url,run_parallel = TRUE) {
+download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = TRUE, outdir = NULL) {
 
   require("foreach")
 
@@ -107,7 +107,7 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
   }
   # get list of all dates available from year range provided
-  files <- foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
+  files <- foreach::foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
   #remove files with no dates and get list of dates available.
   index_dates <- regexpr(pattern = "[0-9]{8}", files)
@@ -190,6 +190,11 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
   }
 
+  if (!(is.null(outdir)))
+  {
+    write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
+  }
+
   return(output)
 }

From bd136a267c55844631937998b399cec9da9112ea Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:24:25 -0400
Subject: [PATCH 04/11] added date corrections

---
 modules/data.remote/R/download.thredds.R | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 0f94781948e..a82eb30e35e 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -82,8 +82,24 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 
   require("foreach")
 
-  # check that dates are within the date range of the dataset
-  dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  #### check that dates are within the date range of the dataset
+
+  # first make sure dates are in date format. Correct if not.
+  if (!(lubridate::is.Date(dates))) {
+    if (!(is.character(dates))) {
+      dates <- as.character(dates)
+    }
+    if (length(grep(dates, pattern = "-")) > 0) {
+      dates <- c(as.Date(dates[1], "%Y-%m-%d"), as.Date(dates[2], "%Y-%m-%d"))
+    } else if (any(nchar(dates) == 7)) {
+      # Julian date (YYYYjjj) -- must be checked before the YYYYmmdd conversion,
+      # since nchar() on an already-converted Date no longer sees the original string
+      dates <- c(as.Date(dates[1], "%Y%j"), as.Date(dates[2], "%Y%j"))
+    } else {
+      dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+    }
+  }
+
   if (!(is.null(dir_url)))
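
The format sniffing added in PATCH 04 has to branch on the raw character input, which is why the Julian test belongs inside the chain above. The same logic can be packed into a small helper; a hedged sketch in base R (parse_dates is not part of the patch):

    parse_dates <- function(dates) {
      dates <- as.character(dates)
      # pick the format from the shape of the input string
      fmt <- if (any(grepl("-", dates))) "%Y-%m-%d"
             else if (all(nchar(dates) == 7)) "%Y%j"   # Julian: YYYYjjj
             else "%Y%m%d"
      as.Date(dates, fmt)
    }
    parse_dates(c("19950201", "19961215"))  # "1995-02-01" "1996-12-15"
    parse_dates(c("1995032", "1996350"))    # Julian day-of-year inputs
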
From 89af55e67fe90a46daa0d33521c7e38e0a6e16a1 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:27:58 -0400
Subject: [PATCH 05/11] updated @params

---
 modules/data.remote/R/download.thredds.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index a82eb30e35e..99bc83a50b8 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -53,7 +53,7 @@ get_site_info <- function(xmlfile) {
 ##'
 ##' @param outdir file location to place output
 ##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
-##' @param dates character vector of start and end date for dataset as YYYYmmdd
+##' @param dates vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
 ##' @param varid character vector of shorthand variable name, i.e. LAI
 ##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
 ##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
@@ -65,7 +65,7 @@ get_site_info <- function(xmlfile) {
 ##' \dontrun{
 ##' outdir <- directory to store downloaded data
 ##' site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
-##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
+##' dates <- date range to download data. Should be a vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
 ##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
 ##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
 ##' data_url <- OpenDAP URL that actually downloads the netcdf file.

From 0629c0b986798f7c28cd4fab7aab78ebbbfbd75c Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:37:35 -0400
Subject: [PATCH 06/11] updated date_year_range

---
 modules/data.remote/R/download.thredds.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 99bc83a50b8..d316df02415 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -106,7 +106,7 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
     files <- XML::getHTMLLinks(result)
 
-    date_year_range <- unique(range(c(lubridate::year(as.Date(dates[1], "%Y")), lubridate::year(as.Date(dates[2], "%Y")))))
+    date_year_range <- unique(lubridate::year(dates))
     if (all((!(substr(files, 1, 4) %in% date_year_range))))
     {
       # give warning that dates aren't available
       PEcAn.logger::logger.warn("Requested dates are not available for this dataset.")
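
One reason the per-file ncvar_get() calls above stay cheap: when ncdf4 opens an OPeNDAP URL, start/count subsetting is pushed to the server, so only the requested slab crosses the network rather than the full global layer. A hedged sketch of a single-cell read (the URL is the one from the examples; the "LAI" variable name and the x/y indices are assumptions, normally produced by the nearest-cell lookup):

    url <- "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc"
    x <- 3600  # column index from the nearest-cell lookup (hypothetical)
    y <- 900   # row index (hypothetical)
    nc  <- ncdf4::nc_open(url)
    # fetch exactly one grid cell for the first (only) time step
    val <- ncdf4::ncvar_get(nc, "LAI", start = c(x, y, 1), count = c(1, 1, 1))
    ncdf4::nc_close(nc)
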
From 10eae350b20bee234656ab059e09494e997010d0 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 14:36:45 -0400
Subject: [PATCH 07/11] separated nc extract function from download function

---
 modules/data.remote/R/download.thredds.R | 113 ++++++++++++++---------
 1 file changed, 70 insertions(+), 43 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index d316df02415..0060d4c1e2f 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -78,8 +78,10 @@ get_site_info <- function(xmlfile) {
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = TRUE, outdir = NULL) {
+download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
 
+  # until the issues with parallel runs are fixed.
+  run_parallel = FALSE
   require("foreach")
 
@@ -147,50 +149,16 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     # this is what a link looks like to download thredds data.
     urls <- sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    extract_nc <- function(site_info, url, run_parallel)
-    {
-      require("foreach")
-      require("ncdf4")
-
-      mylats <- site_info$lat
-      mylons <- site_info$lon
-      sites <- site_info$site_id
-
-      # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
-      data <- ncdf4::nc_open(url)
-      vars <- names(data$var)
-      var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
-
-      # get list of all xy coordinates in netcdf
-      lats <- ncdf4::ncvar_get(data, "latitude")
-      lons <- ncdf4::ncvar_get(data, "longitude")
-
-      # find the cell that site coordinates are located in
-      dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
-      dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
-      y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
-      x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
-
-      scale <- data$var[[var]]$scaleFact
-
-      d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
-
-      info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
-      names(info) <- c("site_id", "lon", "lat", "value")
-
-      return(info)
-    }
-
-
-
+    # parallel seems to have a problem right now with > 500 urls.
     if (run_parallel)
     {
-      require("parallel")
+      #require("parallel")
       require("doParallel")
       ncores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      # This is a failsafe for computers with low numbers of CPUs to reduce risk of blowing RAM.
       if (ncores >= 3)
       {
-        # failsafe in case someone has a computer with 2 cores.
+        # failsafe in case someone has a computer with 2-4 cores.
         ncores <- ncores-2
       }
       # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
       if (ncores >= 10)
       {
         ncores <- 9 # went 1 less because it still fails sometimes
       }
       cl <- parallel::makeCluster(ncores, outfile="")
       doParallel::registerDoParallel(cl)
-      output <- foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
-      stopCluster(cl)
+      output <- foreach(i = urls, .combine = rbind) %dopar% extract_thredds_nc(site_info = site_info, url_info = i)
+      parallel::stopCluster(cl)
     } else {
-      output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %do% extract_thredds_nc(site_info, url_info = i)
     }
 
     if (!(is.null(outdir)))
     {
+      # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
       write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
     }
 
     return(output)
 
   }
 }
+
+##' @title extract_thredds_nc
+##' @name extract_thredds_nc
+##'
+##'
+##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+##' @param url_info a THREDDS url of a .nc file to extract data from.
+##' @param run_parallel TRUE/FALSE option to extract data in parallel.
+##'
+##'
+##' @return a dataframe with the values for each date/site combination from a THREDDS file
+##'
+##' @examples
+##' \dontrun{
+##' site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+##' url_info <- a THREDDS url of a .nc file to extract data from.
+##' run_parallel <- TRUE/FALSE option to extract data in parallel.
+##'
+##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+##'
+##' @export
+##' @author Bailey Morrison
+##'
+extract_thredds_nc <- function(site_info, url_info, run_parallel)
+{
+  #print(url)
+  require("foreach")
+  require("ncdf4")
+
+  mylats <- site_info$lat
+  mylons <- site_info$lon
+  sites <- site_info$site_id
+
+  # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
+  data <- ncdf4::nc_open(url_info)
+  vars <- names(data$var)
+  var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
+
+  # get list of all xy coordinates in netcdf
+  lats <- ncdf4::ncvar_get(data, "latitude")
+  lons <- ncdf4::ncvar_get(data, "longitude")
+
+  # find the cell that site coordinates are located in
+  dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+  dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+  y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
+  x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
+
+  scale <- data$var[[var]]$scaleFact
+
+  d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+
+  info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
+  names(info) <- c("site_id", "lon", "lat", "value")
+
+  return(info)
+}
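
Two cautions on extract_thredds_nc() as split out above. First, `varid` is not an argument: it is resolved in the function's enclosing environment rather than in download_thredds()'s call frame, so it only works if a `varid` object happens to exist there, and %dopar% workers will not see it at all. Second, files opened with nc_open() are never closed. A hedged refactor sketch addressing both (extract_thredds_nc2 is not part of the patch):

    extract_thredds_nc2 <- function(site_info, url_info, varid = "LAI") {
      data <- ncdf4::nc_open(url_info)
      on.exit(ncdf4::nc_close(data))  # always release the connection
      # resolve the netcdf variable name from the explicit varid argument,
      # so parallel workers do not depend on the caller's environment
      var <- grep(varid, names(data$var), ignore.case = TRUE, value = TRUE)[1]
      var
    }

With foreach, the alternative is exporting the symbol to workers via the .export argument, but passing it explicitly keeps the function self-contained.
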
From 24b835fa41271c66d8b2a6b388473e9ca5d7cba3 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 14:45:28 -0400
Subject: [PATCH 08/11] fixed documentation issue

---
 modules/data.remote/NAMESPACE                 |  4 ++-
 modules/data.remote/R/download.thredds.R      | 14 ++++----
 .../data.remote/man/download.thredds.AGB.Rd   | 27 ---------------
 modules/data.remote/man/download.thredds.Rd   | 34 +++++++++++++++++++
 modules/data.remote/man/extract_thredds_nc.Rd | 24 +++++++++++++
 modules/data.remote/man/get_site_info.Rd      | 20 +++++++++++
 6 files changed, 87 insertions(+), 36 deletions(-)
 delete mode 100644 modules/data.remote/man/download.thredds.AGB.Rd
 create mode 100644 modules/data.remote/man/download.thredds.Rd
 create mode 100644 modules/data.remote/man/extract_thredds_nc.Rd
 create mode 100644 modules/data.remote/man/get_site_info.Rd

diff --git a/modules/data.remote/NAMESPACE b/modules/data.remote/NAMESPACE
index d84c728b44c..2bb4941b69a 100644
--- a/modules/data.remote/NAMESPACE
+++ b/modules/data.remote/NAMESPACE
@@ -3,6 +3,8 @@
 export(call_MODIS)
 export(download.LandTrendr.AGB)
 export(download.NLCD)
-export(download.thredds.AGB)
+export(download_thredds)
 export(extract.LandTrendr.AGB)
 export(extract_NLCD)
+export(extract_thredds_nc)
+export(get_site_info)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 0060d4c1e2f..b3b17b59891 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -13,7 +13,7 @@
 ##'
 ##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
-##'
+##' }
 ##' @export
 ##' @author Bailey Morrison
 ##'
@@ -73,7 +73,7 @@
 ##'
 ##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
-##'
+##' }
 ##'
 ##' @export
 ##' @author Bailey Morrison
@@ -191,7 +191,6 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 ##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' @param url_info a THREDDS url of a .nc file to extract data from.
-##' @param run_parallel TRUE/FALSE option to extract data in parallel.
 ##'
 ##'
 ##' @return a dataframe with the values for each date/site combination from a THREDDS file
@@ -200,14 +199,13 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 ##' \dontrun{
 ##' site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' url_info <- a THREDDS url of a .nc file to extract data from.
-##' run_parallel <- TRUE/FALSE option to extract data in parallel.
-##'
-##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
-##'
+##'
+##' output <- extract_thredds_nc(site_info = site_info, url_info = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc")
+##'}
 ##' @export
 ##' @author Bailey Morrison
 ##'
-extract_thredds_nc <- function(site_info, url_info, run_parallel)
+extract_thredds_nc <- function(site_info, url_info)
 {
   #print(url)

diff --git a/modules/data.remote/man/download.thredds.AGB.Rd b/modules/data.remote/man/download.thredds.AGB.Rd
deleted file mode 100644
index 35dfd405cd5..00000000000
--- a/modules/data.remote/man/download.thredds.AGB.Rd
+++ /dev/null
@@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/download.thredds.R
-\name{download.thredds.AGB}
-\alias{download.thredds.AGB}
-\title{download.thredds.AGB}
-\usage{
-download.thredds.AGB(outdir = NULL, site_ids, run_parallel = FALSE,
-  ncores = NULL)
-}
-\arguments{
-\item{outdir}{Where to place output}
-
-\item{site_ids}{What locations to download data at?}
-
-\item{run_parallel}{Logical. Download and extract files in parallel?}
-
-\item{ncores}{Optional. If run_parallel=TRUE how many cores to use? If left as NULL will select max number -1}
-}
-\value{
-data.frame summarize the results of the function call
-}
-\description{
-download.thredds.AGB
-}
-\author{
-Bailey Morrison
-}

diff --git a/modules/data.remote/man/download.thredds.Rd b/modules/data.remote/man/download.thredds.Rd
new file mode 100644
index 00000000000..048f78957ac
--- /dev/null
+++ b/modules/data.remote/man/download.thredds.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{download.thredds}
+\alias{download.thredds}
+\alias{download_thredds}
+\title{download.thredds}
+\usage{
+download_thredds(site_info, dates, varid, dir_url, data_url,
+  run_parallel = FALSE, outdir = NULL)
+}
+\arguments{
+\item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
+
+\item{dates}{vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.}
+
+\item{varid}{character vector of shorthand variable name, i.e. LAI}
+
+\item{dir_url}{catalog url of data from ncei.noaa.gov/thredds website}
+
+\item{data_url}{opendap url of data from ncei.noaa.gov/thredds website}
+
+\item{run_parallel}{Logical. Download and extract files in parallel?}
+
+\item{outdir}{file location to place output}
+}
+\value{
+data.frame summarize the results of the function call
+}
+\description{
+download.thredds
+}
+\author{
+Bailey Morrison
+}

diff --git a/modules/data.remote/man/extract_thredds_nc.Rd b/modules/data.remote/man/extract_thredds_nc.Rd
new file mode 100644
index 00000000000..1f8e41ed231
--- /dev/null
+++ b/modules/data.remote/man/extract_thredds_nc.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{extract_thredds_nc}
+\alias{extract_thredds_nc}
+\title{extract_thredds_nc}
+\usage{
+extract_thredds_nc(site_info, url_info, run_parallel)
+}
+\arguments{
+\item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
+
+\item{run_parallel}{TRUE/FALSE option to extract data in parallel.}
+
+\item{url_info}{a THREDDS url of a .nc file to extract data from.}
+}
+\value{
+a dataframe with the values for each date/site combination from a THREDDS file
+}
+\description{
+extract_thredds_nc
+}
+\author{
+Bailey Morrison
+}

diff --git a/modules/data.remote/man/get_site_info.Rd b/modules/data.remote/man/get_site_info.Rd
new file mode 100644
index 00000000000..e73834879ce
--- /dev/null
+++ b/modules/data.remote/man/get_site_info.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{get_site_info}
+\alias{get_site_info}
+\title{get_site_info}
+\usage{
+get_site_info(xmlfile)
+}
+\arguments{
+\item{xmlfile}{full path to pecan xml settings file}
+}
+\value{
+a list of site information derived from BETY using a pecan .xml settings file with site_id, site_name, lat, lon, and time_zone.
+}
+\description{
+get_site_info
+}
+\author{
+Bailey Morrison
+}
From bd6d4b57e8f101f2dcbac16d659de6e3125b94ee Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Thu, 24 Oct 2019 15:49:49 -0400
Subject: [PATCH 09/11] regenerate Rd documentation with updated usage and
 examples

---
 modules/data.remote/man/download.thredds.Rd   | 14 ++++++++++++++
 modules/data.remote/man/extract_thredds_nc.Rd | 12 +++++++++---
 modules/data.remote/man/get_site_info.Rd      |  7 +++++++
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/modules/data.remote/man/download.thredds.Rd b/modules/data.remote/man/download.thredds.Rd
index 048f78957ac..9983594ad5a 100644
--- a/modules/data.remote/man/download.thredds.Rd
+++ b/modules/data.remote/man/download.thredds.Rd
@@ -28,6 +28,20 @@ data.frame summarize the results of the function call
 }
 \description{
 download.thredds
+}
+\examples{
+\dontrun{
+outdir <- directory to store downloaded data
+site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
+dates <- date range to download data. Should be a vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
+varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
+dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
+data_url <- OpenDAP URL that actually downloads the netcdf file.
+run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
+
+results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
+}
+
 }
 \author{
 Bailey Morrison
 }

diff --git a/modules/data.remote/man/extract_thredds_nc.Rd b/modules/data.remote/man/extract_thredds_nc.Rd
index 1f8e41ed231..694fdafcc1c 100644
--- a/modules/data.remote/man/extract_thredds_nc.Rd
+++ b/modules/data.remote/man/extract_thredds_nc.Rd
@@ -4,13 +4,11 @@
 \alias{extract_thredds_nc}
 \title{extract_thredds_nc}
 \usage{
-extract_thredds_nc(site_info, url_info, run_parallel)
+extract_thredds_nc(site_info, url_info)
 }
 \arguments{
 \item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
 
-\item{run_parallel}{TRUE/FALSE option to extract data in parallel.}
-
 \item{url_info}{a THREDDS url of a .nc file to extract data from.}
 }
 \value{
 a dataframe with the values for each date/site combination from a THREDDS file
 }
 \description{
 extract_thredds_nc
 }
+\examples{
+\dontrun{
+site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+url_info <- a THREDDS url of a .nc file to extract data from.
+
+output <- extract_thredds_nc(site_info = site_info, url_info = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc")
+}
+}
 \author{
 Bailey Morrison
 }

diff --git a/modules/data.remote/man/get_site_info.Rd b/modules/data.remote/man/get_site_info.Rd
index e73834879ce..98d06bff0f5 100644
--- a/modules/data.remote/man/get_site_info.Rd
+++ b/modules/data.remote/man/get_site_info.Rd
@@ -15,6 +15,13 @@ a list of site information derived from BETY using a pecan .xml settings file wi
 }
 \description{
 get_site_info
 }
+\examples{
+\dontrun{
+xmlfile <- the full path to a pecan .xml settings file.
+
+site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+ }
+}
 \author{
 Bailey Morrison
 }

From 1e1fbefda2a094d2f26838189ba091ec619cc3ff Mon Sep 17 00:00:00 2001
From: Morrison
Date: Thu, 14 May 2020 15:37:45 -0400
Subject: [PATCH 10/11] switch download_thredds to year/month selection with
 per-site max aggregation

---
 .../R/download.thredds.AVHRR.monthAGG.R       | 78 +++++++++----------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R b/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
index 0084adda9fd..16aaf2b2e6b 100755
--- a/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
+++ b/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
@@ -72,49 +72,32 @@ get_site_info <- function(xmlfile) {
 ##' run_parallel <- optional. Can be used to speed up download process if there are more than 2 cores available on computer
 ##'
-##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = FALSE, outdir = NULL)
+##' results <- download_thredds(site_info = site_info, years = c(2000, 2003), months = c(6,7,8), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = FALSE, outdir = NULL)
 ##' }
 ##' @importFrom foreach %do% %dopar%
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
+download_thredds <- function(site_info, years, months, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
 
   #until the issues with parallel runs are fixed.
   run_parallel = FALSE
-  #require("foreach")
-
-
-  #### check that dates are within the date range of the dataset
-
-  #first make sure dates are in date format. Correct if not.
-  if (!(lubridate::is.Date(dates))){
-    if (!(is.character(dates))) {
-      dates = as.character(dates)
-    }
-    if (length(grep(dates, pattern = "-")) > 0) {
-      dates <- c(as.Date(dates[1], "%Y-%m-%d"), as.Date(dates[2], "%Y-%m-%d"))
-    } else {
-      dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
-    }
-    # Julien Date
-    if (any(nchar(dates) == 7)) {
-      dates <- c(as.Date(dates[1], "%Y%j"), as.Date(dates[2], "%Y%j"))
-    }
-  }
-
-  date_range = unique(lubridate::year(seq(dates[1], dates[2], by = '1 year')))
 
-  output = data.frame()
+  # repeat each year 31 times (the maximum number of days in any month), which also covers leap years
+  years_range = sort(rep(seq(years[1], years[2]), 31))
 
   if (!(is.null(dir_url)))
   {
-    for (i in seq_along(date_range))
+    output = data.frame()
+
+    for (i in seq_along(unique(years_range)))
     {
-      result <- RCurl::getURL(paste(dir_url, date_range[i], "/catalog.html", sep = "/"),
+      result <- RCurl::getURL(paste(dir_url, unique(years_range)[i], "/catalog.html", sep = "/"),
                               verbose=FALSE ,ftp.use.epsv = TRUE, dirlistonly = TRUE)
       files <- XML::getHTMLLinks(result)
 
-      index_dates <- regexpr(pattern = "_[0-9]{8}_", files)
+      index_dates <- regexpr(pattern = paste0("_[0-9]{4}0[", months[1], "-", months[length(months)], "]{1}[0-9]{2}_"), files)
       files <- files[-(which(index_dates < 0))]
       index_dates <- index_dates[which(index_dates > 0)]
 
@@ -148,19 +131,35 @@ download_thredds <- function(site_info, years, months, varid, dir_url, data_url,
       } else {
         out <- foreach::foreach(i = urls, .combine = rbind) %do%
           extract_thredds_nc(site_info, url_info = i)
-      }
-      output = rbind(output, out)
-
-      if (!(is.null(outdir)))
-      {
-        # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
-        write.csv(out, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
-      }
+
+        # keep only the max LAI for each site, rather than every day with NA filler values
+        site_max = foreach::foreach(i = unique(out$site_id), .combine = rbind) %do%
+          max_lai(x = out, site = i)
+        site_max$date = lubridate::year(site_max$date)
+
+        output = rbind(output, site_max)
+
+      }
     }
-  }
+
+    # if (!(is.null(outdir)))
+    # {
+    #   # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
+    #   write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", years[1], "-", years[2], "_",months[1], "-", months[length(months)], ".csv", sep = ""))
+    # }
+
     return(output)
   }
-}
+}
+
+
+
+
+max_lai = function(x, site)
+{
+  site_info_max = as.data.frame(x[x$site_id == site,][1,1:4], stringsAsFactors = FALSE)
+  site_info_max$max = as.numeric(max(x[x$site_id == site,]$value, na.rm = TRUE))
+  return(site_info_max)
+}
@@ -186,9 +185,6 @@ max_lai = function(x, site)
 extract_thredds_nc <- function(site_info, url_info)
 {
-  #print(url)
-  #require("foreach")
-  #require("ncdf4")
   index = regexpr(pattern = "_[0-9]{8}_", url_info)
   date<- as.Date(substr(url_info, index+1, index+8), "%Y%m%d")

From 936e3b77c4daa29d5530e674306b1adab4a6e008 Mon Sep 17 00:00:00 2001
From: Morrison
Date: Mon, 1 Jun 2020 22:54:08 -0400
Subject: [PATCH 11/11] update thredds function

---
 modules/data.remote/R/download.thredds.AVHRR.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.AVHRR.R b/modules/data.remote/R/download.thredds.AVHRR.R
index 9dfad069062..5575396db58 100755
--- a/modules/data.remote/R/download.thredds.AVHRR.R
+++ b/modules/data.remote/R/download.thredds.AVHRR.R
@@ -146,8 +146,11 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
       #   extract_thredds_nc(site_info = site_info, url_info = i)
       # parallel::stopCluster(cl)
     } else {
-      out <- foreach::foreach(i = urls, .combine = rbind) %do%
-        extract_thredds_nc(site_info, url_info = i)
+      #start_time <- Sys.time()
+      out <- foreach::foreach(j = urls, .combine = rbind) %do%
+        extract_thredds_nc(site_info, url_info = j)
+      # end_time <- Sys.time()
+      # end_time - start_time
     }
     output = rbind(output, out)
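
Taken together, the series leaves the monthAGG variant with a year/month interface that returns one max-LAI row per site and year. A hedged end-to-end sketch of the resulting workflow; the xmlfile path comes from the examples above, and the month-window regex only supports single-digit, consecutive months:

    # build the site list from a PEcAn settings file (path from the doc examples)
    site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")

    # summer max LAI per site for 2000-2003 (years numeric, per the fixed example)
    lai <- download_thredds(site_info = site_info,
                            years = c(2000, 2003), months = c(6, 7, 8),
                            varid = "LAI",
                            dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
                            data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
                            run_parallel = FALSE, outdir = NULL)
    head(lai)  # site_id, lon, lat, date (year), max
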