From 4c803220b4715ee054c1e7ff3e21c6930c342554 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Tue, 15 Oct 2019 16:37:18 -0400
Subject: [PATCH 01/11] added updated thredds function

---
 modules/data.remote/R/download.thredds.R | 194 ++++++++++++++---------
 1 file changed, 122 insertions(+), 72 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 4b3fadacd2f..7c1babaa7a7 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -1,17 +1,27 @@
 #
-##' @title download.thredds.AGB
-##' @name download.thredds.AGB
+##' @title download.thredds.data
+##' @name download.thredds.data
 ##'
-##' @param outdir Where to place output
-##' @param site_ids What locations to download data at?
+##' @param outdir file location to place output
+##' @param site_info information about the site, i.e. site_id, latitude, longitude
+##' @param dates character vector of start and end date for dataset as YYYYmmdd
+##' @param varid character vector of shorthand variable name, i.e. LAI
+##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
+##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
 ##' @param run_parallel Logical. Download and extract files in parallel?
-##' @param ncores Optional. If run_parallel=TRUE how many cores to use? If left as NULL will select max number -1
 ##'
 ##' @return data.frame summarize the results of the function call
 ##'
 ##' @examples
 ##' \dontrun{
-##' outdir <- "~/scratch/abg_data/"
+##' outdir <- directory to store downloaded data
+##' site_info <- dataframe that contains information about site_id, latitude, longitude, and site_names
+##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
+##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
+##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
+##' data_url <- OpenDAP URL that actually downloads the netcdf file.
+##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
+##'
 ##' results <- PEcAn.data.remote::download.thredds.AGB(outdir=outdir,
 ##'           site_ids = c(676, 678, 679, 755, 767, 1000000030, 1000000145, 1000025731),
 ##'           run_parallel = TRUE, ncores = 8)
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download.thredds.AGB <- function(outdir = NULL, site_ids, run_parallel = FALSE,
-                                 ncores = NULL) {
+download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201", "19961215"),
+                                  varid = "LAI",
+                                  dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
+                                  data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
+                                  run_parallel = TRUE) {
+  # require("XML")
+  # require("RCurl")
+  require("foreach")
+
+  # check that dates are within the date range of the dataset
+  dates = c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  if (!(is.null(dir_url)))
+  {
+    #https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files/1981/catalog.html -> link for directory files, not downloads
+    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=TRUE, dirlistonly = TRUE)
+    files = XML::getHTMLLinks(result)
+
+    date_year_range = unique(range(c(year(as.Date(dates[1], "%Y")), year(as.Date(dates[2], "%Y")))))
+    if (all((!(substr(files, 1, 4) %in% date_year_range))))
+    {
+      # give warning that dates aren't available
+      warning("requested dates are not available for this dataset")
+    }
+
+  }
+
+  # get list of catalog file links to determine actual dates that can be downloaded within the user's range
+  links = vector()
+  for (i in 1:length(date_year_range))
+  {
+    links[i] = RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=T, dirlistonly = T)
+  }
+  # get list of all dates available from year range provided
+  files = foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
-  bety <- list(user='bety', password='bety', host='localhost',
-               dbname='bety', driver='PostgreSQL',write=TRUE)
-  con <- PEcAn.DB::db.open(bety)
-  bety$con <- con
-  site_ID <- as.character(site_ids)
-  suppressWarnings(site_qry <- glue::glue_sql("SELECT *, ST_X(ST_CENTROID(geometry)) AS lon,
-                                              ST_Y(ST_CENTROID(geometry)) AS lat FROM sites WHERE id IN ({ids*})",
-                                              ids = site_ID, .con = con))
-  suppressWarnings(qry_results <- DBI::dbSendQuery(con,site_qry))
-  suppressWarnings(qry_results <- DBI::dbFetch(qry_results))
-  site_info <- list(site_id=qry_results$id, site_name=qry_results$sitename, lat=qry_results$lat,
-                    lon=qry_results$lon, time_zone=qry_results$time_zone)
+  #remove files with no dates and get list of dates available.
+  index_dates = regexpr(pattern = "[0-9]{8}", files)
+  files = files[-(which(index_dates < 0))]
+  index_dates = index_dates[which(index_dates > 0)]
 
-  mylat = site_info$lat
-  mylon = site_info$lon
+  # get list of files that fall within the specific date range user asks for (Ymd, not Y)
+  dates_avail = as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
+  date_range = seq(dates[1], dates[2], by = "day")
+  get_dates = date_range[which(date_range %in% dates_avail)]
 
-  # site specific URL for dataset --> these will be made to work for all THREDDS datasets in the future, but for now, just testing with
-  # this one dataset. This specific dataset only has 1 year (2005), so no temporal looping for now.
-  obs_file = "https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1221/agb_5k.nc4"
-  obs_err = "https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1221/agb_SE_5k.nc4"
-  files = c(obs_file, obs_err)
+  # only keep files that are within the true yyyymmdd date range user requested
+  files = files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
+  filenames = basename(files)
 
-  # function to extract ncdf data from lat and lon values for value + SE URLs
-  get_data = function(i)
+  # the user must supply a data_url or the netcdf files cannot be downloaded through thredds. if no data_url is supplied, the job will fail,
+  # so supply a warning
+  if (!(is.null(data_url)))
   {
-    data = ncdf4::nc_open(files[1])
-    agb_lats = ncdf4::ncvar_get(data, "latitude")
-    agb_lons = ncdf4::ncvar_get(data, "longitude")
-
-    agb_x = which(abs(agb_lons- mylon[i]) == min(abs(agb_lons - mylon[i])))
-    agb_y = which(abs(agb_lats- mylat[i]) == min(abs(agb_lats - mylat[i])))
-
-    start = c(agb_x, agb_y)
-    count = c(1,1)
-    d = ncdf4::ncvar_get(ncdf4::nc_open(files[1]), "abvgrndbiomass", start=start, count = count)
-    if (is.na(d)) d <- NA
-    sd = ncdf4::ncvar_get(ncdf4::nc_open(files[2]), "agbSE", start=start, count = count)
-    if (is.na(sd)) sd <- NA
-    date = "2005"
-    site = site_ID[i]
-    output = as.data.frame(cbind(d, sd, date, site))
-    names(output) = c("value", "sd", "date", "siteID")
+    #https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1981/AVHRR-Land_v005_AVH15C1_NOAA-07_19810624_c20181025194251.nc.html
+    # this is what a link looks like to download thredds data.
+    urls = sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    # option to save output dataset to directory for user.
-    if (!(is.null(outdir)))
+    extract_nc = function(site_info, url, run_parallel)
     {
-      write.csv(output, file = paste0(outdir, "THREDDS_", sub("^([^.]*).*", "\\1",basename(files[1])), "_site_", site, ".csv"), row.names = FALSE)
+      require("foreach")
+      require("ncdf4")
+
+      mylats = site_info$lat
+      mylons = site_info$lon
+      sites = site_info$site_id
+
+      # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
+      data = ncdf4::nc_open(url)
+      vars = names(data$var)
+      var = vars[grep(vars, pattern = varid, ignore.case = T)]
+
+      # get list of all xy coordinates in netcdf
+      lats = ncdf4::ncvar_get(data, "latitude")
+      lons = ncdf4::ncvar_get(data, "longitude")
+
+      # find the cell that site coordinates are located in
+      dist_y = foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+      dist_x = foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+      y = foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = T)
+      x = foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = T)
+
+      scale = data$var[[var]]$scaleFact
+
+      d = as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+
+      info = as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = F)
+      names(info) = c("site_id", "lon", "lat", "value")
+
+      return(info)
     }
-    return(output)
-  }
-
-  ## setup parallel
-  if (run_parallel) {
-    if (!is.null(ncores)) {
-      ncores <- ncores
-    } else {
-      ncores <- parallel::detectCores() -1
-    }
-    require(doParallel)
-    PEcAn.logger::logger.info(paste0("Running in parallel with: ", ncores))
-    cl = parallel::makeCluster(ncores)
-    doParallel::registerDoParallel(cl)
-    data = foreach(i = seq_along(mylat), .combine = rbind) %dopar% get_data(i)
-    stopCluster(cl)
-  } else {
-    # setup sequential run
-    data = data.frame()
-    for (i in seq_along(mylat))
+
+    if (run_parallel)
     {
-      data = rbind(data, get_data(i))
+      require("parallel")
+      require("doParallel")
+      ncores = parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      if (ncores >= 3)
+      {
+        # failsafe in case someone has a computer with 2 cores.
+        ncores = ncores-2
+      }
+      # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
+      if (ncores >= 10)
+      {
+        ncores = 9 # went 1 less because it still fails sometimes
+      }
+      cl <- parallel::makeCluster(ncores, outfile="")
+      doParallel::registerDoParallel(cl)
+      output = foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
+      stopCluster(cl)
+    } else {
+      output = foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+    }
+
+    return(output)
+
   }
-
-  return(data)
 }
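
A note on the nearest-cell search added in extract_nc() above: sqrt((lats - i)^2) is just abs(lats - i), so each site lookup reduces to an argmin over absolute coordinate differences. A minimal standalone sketch of the same idea, with a toy 0.05-degree grid and hypothetical site coordinates (no ncdf4 needed):

    lats <- seq(90, -90, by = -0.05)    # toy AVHRR-like latitude axis
    lons <- seq(-180, 180, by = 0.05)   # toy longitude axis
    site_lat <- 45.2041                 # hypothetical site
    site_lon <- -68.7403
    y <- which.min(abs(lats - site_lat))  # row index of nearest grid cell
    x <- which.min(abs(lons - site_lon))  # column index of nearest grid cell
    c(x = x, y = y)

which.min() returns the first index of the minimum, matching what the foreach/which construction above computes one column at a time.
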
From 2071b3eb7492e1ad7c886f45ce1c7111d2ce174d Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 12:53:08 -0400
Subject: [PATCH 02/11] split BETY site query into get_site_info; rename to
 download_thredds

---
 modules/data.remote/R/download.thredds.R | 151 +++++++++++++++--------
 1 file changed, 97 insertions(+), 54 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 7c1babaa7a7..aed5d5ed835 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -1,9 +1,58 @@
-#
-##' @title download.thredds.data
-##' @name download.thredds.data
+##' @title get_site_info
+##' @name get_site_info
+##'
+##'
+##' @param xmlfile full path to pecan xml settings file
+##'
+##'
+##' @return a list of site information derived from BETY using a pecan .xml settings file with site_id, site_name, lat, lon, and time_zone.
+##'
+##' @examples
+##' \dontrun{
+##' xmlfile <- the full path to a pecan .xml settings file.
+##'
+##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+##'
+##' @export
+##' @author Bailey Morrison
+##'
+get_site_info <- function(xmlfile) {
+  require(PEcAn.all)
+
+  settings <- read.settings(xmlfile)
+
+  observation <- c()
+  for (i in seq_along(settings$run)) {
+    command <- paste0("settings$run$settings.", i, "$site$id")
+    obs <- eval(parse(text = command))
+    observation <- c(observation, obs)
+  }
+
+  PEcAn.logger::logger.info("**** Querying BETY for site info at the requested sites ****")
+  bety <- list(user = 'bety', password = 'bety', host = 'localhost',
+               dbname = 'bety', driver = 'PostgreSQL', write = TRUE)
+  con <- PEcAn.DB::db.open(bety)
+  bety$con <- con
+  site_ID <- observation
+  suppressWarnings(site_qry <- glue::glue_sql("SELECT *, ST_X(ST_CENTROID(geometry)) AS lon,
+                                              ST_Y(ST_CENTROID(geometry)) AS lat FROM sites WHERE id IN ({ids*})",
+                                              ids = site_ID, .con = con))
+  suppressWarnings(qry_results <- DBI::dbSendQuery(con,site_qry))
+  suppressWarnings(qry_results <- DBI::dbFetch(qry_results))
+  site_info <- list(site_id = qry_results$id, site_name = qry_results$sitename, lat = qry_results$lat,
+                    lon = qry_results$lon, time_zone = qry_results$time_zone)
+  return(site_info)
+}
+
+
+##' @title download.thredds
+##' @name download.thredds
+##'
 ##'
 ##' @param outdir file location to place output
-##' @param site_info information about the site, i.e. site_id, latitude, longitude
+##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' @param dates character vector of start and end date for dataset as YYYYmmdd
 ##' @param varid character vector of shorthand variable name, i.e. LAI
 ##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
 ##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
@@ -15,7 +64,7 @@
 ##' @examples
 ##' \dontrun{
 ##' outdir <- directory to store downloaded data
-##' site_info <- dataframe that contains information about site_id, latitude, longitude, and site_names
+##' site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
 ##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
 ##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
 ##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
 ##' data_url <- OpenDAP URL that actually downloads the netcdf file.
 ##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
 ##'
@@ -23,62 +72,56 @@
-##' results <- PEcAn.data.remote::download.thredds.AGB(outdir=outdir,
-##'           site_ids = c(676, 678, 679, 755, 767, 1000000030, 1000000145, 1000025731),
-##'           run_parallel = TRUE, ncores = 8)
+##' results <- download_thredds(outdir = NULL, site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE)
+##'
 ##'
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201", "19961215"),
-                                  varid = "LAI",
-                                  dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
-                                  data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
-                                  run_parallel = TRUE) {
-  # require("XML")
-  # require("RCurl")
+download_thredds <- function(outdir = NULL, site_info, dates, varid, dir_url, data_url,run_parallel = TRUE) {
+
   require("foreach")
 
   # check that dates are within the date range of the dataset
-  dates = c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
   if (!(is.null(dir_url)))
   {
     #https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files/1981/catalog.html -> link for directory files, not downloads
-    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=TRUE, dirlistonly = TRUE)
-    files = XML::getHTMLLinks(result)
+    result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
+    files <- XML::getHTMLLinks(result)
 
-    date_year_range = unique(range(c(year(as.Date(dates[1], "%Y")), year(as.Date(dates[2], "%Y")))))
+    date_year_range <- unique(range(c(lubridate::year(as.Date(dates[1], "%Y")), lubridate::year(as.Date(dates[2], "%Y")))))
     if (all((!(substr(files, 1, 4) %in% date_year_range))))
     {
       # give warning that dates aren't available
-      warning("requested dates are not available for this dataset")
+      PEcAn.logger::logger.warn("Requested dates are not available for this dataset.")
     }
 
   }
 
   # get list of catalog file links to determine actual dates that can be downloaded within the user's range
-  links = vector()
+  links <- vector()
   for (i in 1:length(date_year_range))
   {
-    links[i] = RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose=F,ftp.use.epsv=T, dirlistonly = T)
+    links[i] <- RCurl::getURL(paste(dir_url, date_year_range[i], "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
   }
   # get list of all dates available from year range provided
-  files = foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
+  files <- foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
   #remove files with no dates and get list of dates available.
-  index_dates = regexpr(pattern = "[0-9]{8}", files)
-  files = files[-(which(index_dates < 0))]
-  index_dates = index_dates[which(index_dates > 0)]
+  index_dates <- regexpr(pattern = "[0-9]{8}", files)
+  files <- files[-(which(index_dates < 0))]
+  index_dates <- index_dates[which(index_dates > 0)]
 
   # get list of files that fall within the specific date range user asks for (Ymd, not Y)
-  dates_avail = as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
-  date_range = seq(dates[1], dates[2], by = "day")
-  get_dates = date_range[which(date_range %in% dates_avail)]
+  dates_avail <- as.Date(substr(files, index_dates, index_dates+7), "%Y%m%d")
+  date_range <- seq(dates[1], dates[2], by = "day")
+  get_dates <- date_range[which(date_range %in% dates_avail)]
 
   # only keep files that are within the true yyyymmdd date range user requested
-  files = files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
-  filenames = basename(files)
+  files <- files[foreach(i = seq_along(get_dates), .combine = c) %do% grep(files, pattern = format(get_dates[i], '%Y%m%d'))]
+  filenames <- basename(files)
 
   # the user must supply a data_url or the netcdf files cannot be downloaded through thredds. if no data_url is supplied, the job will fail,
   # so supply a warning
@@ -86,38 +129,38 @@ download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201"
   {
     #https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1981/AVHRR-Land_v005_AVH15C1_NOAA-07_19810624_c20181025194251.nc.html
     # this is what a link looks like to download thredds data.
-    urls = sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
+    urls <- sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    extract_nc = function(site_info, url, run_parallel)
+    extract_nc <- function(site_info, url, run_parallel)
     {
       require("foreach")
       require("ncdf4")
 
-      mylats = site_info$lat
-      mylons = site_info$lon
-      sites = site_info$site_id
+      mylats <- site_info$lat
+      mylons <- site_info$lon
+      sites <- site_info$site_id
 
       # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
-      data = ncdf4::nc_open(url)
-      vars = names(data$var)
-      var = vars[grep(vars, pattern = varid, ignore.case = T)]
+      data <- ncdf4::nc_open(url)
+      vars <- names(data$var)
+      var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
 
       # get list of all xy coordinates in netcdf
-      lats = ncdf4::ncvar_get(data, "latitude")
-      lons = ncdf4::ncvar_get(data, "longitude")
+      lats <- ncdf4::ncvar_get(data, "latitude")
+      lons <- ncdf4::ncvar_get(data, "longitude")
 
       # find the cell that site coordinates are located in
-      dist_y = foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
-      dist_x = foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
-      y = foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = T)
-      x = foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = T)
+      dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+      dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+      y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
+      x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
 
-      scale = data$var[[var]]$scaleFact
+      scale <- data$var[[var]]$scaleFact
 
-      d = as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+      d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
 
-      info = as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = F)
-      names(info) = c("site_id", "lon", "lat", "value")
+      info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
+      names(info) <- c("site_id", "lon", "lat", "value")
 
       return(info)
     }
@@ -128,23 +171,23 @@ download.thredds.data <- function(outdir = NULL, site_info, dates = c("19950201"
     {
       require("parallel")
       require("doParallel")
-      ncores = parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      ncores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)
       if (ncores >= 3)
       {
         # failsafe in case someone has a computer with 2 cores.
-        ncores = ncores-2
+        ncores <- ncores-2
       }
       # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
      if (ncores >= 10)
       {
-        ncores = 9 # went 1 less because it still fails sometimes
+        ncores <- 9 # went 1 less because it still fails sometimes
       }
       cl <- parallel::makeCluster(ncores, outfile="")
       doParallel::registerDoParallel(cl)
-      output = foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
       stopCluster(cl)
     } else {
-      output = foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
     }
 
     return(output)
From 0ff2b048ffb869de53ec32297cc41dba63e520c2 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:08:05 -0400
Subject: [PATCH 03/11] added outdir option in function

---
 modules/data.remote/R/download.thredds.R | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index aed5d5ed835..0f94781948e 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -72,13 +72,13 @@ get_site_info <- function(xmlfile) {
 ##' run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
 ##'
-##' results <- download_thredds(outdir = NULL, site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE)
+##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
 ##'
 ##'
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(outdir = NULL, site_info, dates, varid, dir_url, data_url,run_parallel = TRUE) {
+download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = TRUE, outdir = NULL) {
 
   require("foreach")
 
@@ -107,7 +107,7 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
   }
   # get list of all dates available from year range provided
-  files <- foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
+  files <- foreach::foreach(i = 1:length(links), .combine = c) %do% XML::getHTMLLinks(links[i])
 
   #remove files with no dates and get list of dates available.
   index_dates <- regexpr(pattern = "[0-9]{8}", files)
@@ -190,6 +190,11 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
   }
 
+  if (!(is.null(outdir)))
+  {
+    write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
+  }
+
   return(output)
 }

From bd136a267c55844631937998b399cec9da9112ea Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:24:25 -0400
Subject: [PATCH 04/11] added date corrections

---
 modules/data.remote/R/download.thredds.R | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 0f94781948e..a82eb30e35e 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -82,8 +82,24 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 
   require("foreach")
 
-  # check that dates are within the date range of the dataset
-  dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+  #### check that dates are within the date range of the dataset
+
+  # first make sure dates are in date format. Correct if not.
+  if (!(lubridate::is.Date(dates))) {
+    if (!(is.character(dates))) {
+      dates <- as.character(dates)
+    }
+    if (length(grep(dates, pattern = "-")) > 0) {
+      dates <- c(as.Date(dates[1], "%Y-%m-%d"), as.Date(dates[2], "%Y-%m-%d"))
+    } else if (any(nchar(dates) == 7)) {
+      # Julian date (YYYYjjj) -- must be checked before the YYYYmmdd conversion,
+      # since nchar() on an already-converted Date no longer sees the original string
+      dates <- c(as.Date(dates[1], "%Y%j"), as.Date(dates[2], "%Y%j"))
+    } else {
+      dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
+    }
+  }
+
   if (!(is.null(dir_url)))
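
The format sniffing added in PATCH 04 has to branch on the raw character input, which is why the Julian test belongs inside the chain above. The same logic can be packed into a small helper; a hedged sketch in base R (parse_dates is not part of the patch):

    parse_dates <- function(dates) {
      dates <- as.character(dates)
      # pick the format from the shape of the input string
      fmt <- if (any(grepl("-", dates))) "%Y-%m-%d"
             else if (all(nchar(dates) == 7)) "%Y%j"   # Julian: YYYYjjj
             else "%Y%m%d"
      as.Date(dates, fmt)
    }
    parse_dates(c("19950201", "19961215"))  # "1995-02-01" "1996-12-15"
    parse_dates(c("1995032", "1996350"))    # Julian day-of-year inputs
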
From 89af55e67fe90a46daa0d33521c7e38e0a6e16a1 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:27:58 -0400
Subject: [PATCH 05/11] updated @params

---
 modules/data.remote/R/download.thredds.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index a82eb30e35e..99bc83a50b8 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -53,7 +53,7 @@ get_site_info <- function(xmlfile) {
 ##'
 ##' @param outdir file location to place output
 ##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
-##' @param dates character vector of start and end date for dataset as YYYYmmdd
+##' @param dates vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
 ##' @param varid character vector of shorthand variable name, i.e. LAI
 ##' @param dir_url catalog url of data from ncei.noaa.gov/thredds website
 ##' @param data_url opendap url of data from ncei.noaa.gov/thredds website
@@ -65,7 +65,7 @@ get_site_info <- function(xmlfile) {
 ##' \dontrun{
 ##' outdir <- directory to store downloaded data
 ##' site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
-##' dates <- date range to download data. Should be a character vector with start and end date as YYYYmmdd
+##' dates <- date range to download data. Should be a vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
 ##' varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
 ##' dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
 ##' data_url <- OpenDAP URL that actually downloads the netcdf file.

From 0629c0b986798f7c28cd4fab7aab78ebbbfbd75c Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 13:37:35 -0400
Subject: [PATCH 06/11] updated date_year_range

---
 modules/data.remote/R/download.thredds.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 99bc83a50b8..d316df02415 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -106,7 +106,7 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     result <- RCurl::getURL(paste(dir_url, "catalog.html", sep = "/"), verbose = FALSE, ftp.use.epsv = TRUE, dirlistonly = TRUE)
     files <- XML::getHTMLLinks(result)
 
-    date_year_range <- unique(range(c(lubridate::year(as.Date(dates[1], "%Y")), lubridate::year(as.Date(dates[2], "%Y")))))
+    date_year_range <- unique(lubridate::year(dates))
     if (all((!(substr(files, 1, 4) %in% date_year_range))))
     {
       # give warning that dates aren't available
       PEcAn.logger::logger.warn("Requested dates are not available for this dataset.")
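
One reason the per-file ncvar_get() calls above stay cheap: when ncdf4 opens an OPeNDAP URL, start/count subsetting is pushed to the server, so only the requested slab crosses the network rather than the full global layer. A hedged sketch of a single-cell read (the URL is the one from the examples; the "LAI" variable name and the x/y indices are assumptions, normally produced by the nearest-cell lookup):

    url <- "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc"
    x <- 3600  # column index from the nearest-cell lookup (hypothetical)
    y <- 900   # row index (hypothetical)
    nc  <- ncdf4::nc_open(url)
    # fetch exactly one grid cell for the first (only) time step
    val <- ncdf4::ncvar_get(nc, "LAI", start = c(x, y, 1), count = c(1, 1, 1))
    ncdf4::nc_close(nc)
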
From 10eae350b20bee234656ab059e09494e997010d0 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 14:36:45 -0400
Subject: [PATCH 07/11] separated nc extract function from download function

---
 modules/data.remote/R/download.thredds.R | 113 ++++++++++++++---------
 1 file changed, 70 insertions(+), 43 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index d316df02415..0060d4c1e2f 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -78,8 +78,10 @@ get_site_info <- function(xmlfile) {
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = TRUE, outdir = NULL) {
+download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
 
+  # until the issues with parallel runs are fixed.
+  run_parallel = FALSE
   require("foreach")
 
@@ -147,50 +149,16 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
     # this is what a link looks like to download thredds data.
     urls <- sort(paste(data_url, substr(dates_avail, 1, 4), filenames, sep = "/"))
 
-    extract_nc <- function(site_info, url, run_parallel)
-    {
-      require("foreach")
-      require("ncdf4")
-
-      mylats <- site_info$lat
-      mylons <- site_info$lon
-      sites <- site_info$site_id
-
-      # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
-      data <- ncdf4::nc_open(url)
-      vars <- names(data$var)
-      var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
-
-      # get list of all xy coordinates in netcdf
-      lats <- ncdf4::ncvar_get(data, "latitude")
-      lons <- ncdf4::ncvar_get(data, "longitude")
-
-      # find the cell that site coordinates are located in
-      dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
-      dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
-      y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
-      x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
-
-      scale <- data$var[[var]]$scaleFact
-
-      d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
-
-      info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
-      names(info) <- c("site_id", "lon", "lat", "value")
-
-      return(info)
-    }
-
-
-
+    # parallel seems to have a problem right now with > 500 urls.
     if (run_parallel)
     {
-      require("parallel")
+      #require("parallel")
       require("doParallel")
       ncores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)
+      # This is a failsafe for computers with low numbers of CPUs to reduce risk of blowing RAM.
       if (ncores >= 3)
       {
-        # failsafe in case someone has a computer with 2 cores.
+        # failsafe in case someone has a computer with 2-4 cores.
         ncores <- ncores-2
       }
       # THREDDS has a 10 job limit. Will fail if you try to download more than 10 values at a time
       if (ncores >= 10)
       {
         ncores <- 9 # went 1 less because it still fails sometimes
       }
       cl <- parallel::makeCluster(ncores, outfile="")
       doParallel::registerDoParallel(cl)
-      output <- foreach(i = urls, .combine = rbind) %dopar% extract_nc(site_info, i, run_parallel)
-      stopCluster(cl)
+      output <- foreach(i = urls, .combine = rbind) %dopar% extract_thredds_nc(site_info = site_info, url_info = i)
+      parallel::stopCluster(cl)
     } else {
-      output <- foreach(i = urls, .combine = rbind) %do% extract_nc(site_info, i, run_parallel)
+      output <- foreach(i = urls, .combine = rbind) %do% extract_thredds_nc(site_info, url_info = i)
     }
 
     if (!(is.null(outdir)))
     {
+      # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
       write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
     }
 
     return(output)
 
   }
 }
+
+##' @title extract_thredds_nc
+##' @name extract_thredds_nc
+##'
+##'
+##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+##' @param url_info a THREDDS url of a .nc file to extract data from.
+##' @param run_parallel TRUE/FALSE option to extract data in parallel.
+##'
+##'
+##' @return a dataframe with the values for each date/site combination from a THREDDS file
+##'
+##' @examples
+##' \dontrun{
+##' site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+##' url_info <- a THREDDS url of a .nc file to extract data from.
+##' run_parallel <- TRUE/FALSE option to extract data in parallel.
+##'
+##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+##'
+##' @export
+##' @author Bailey Morrison
+##'
+extract_thredds_nc <- function(site_info, url_info, run_parallel)
+{
+  #print(url)
+  require("foreach")
+  require("ncdf4")
+
+  mylats <- site_info$lat
+  mylons <- site_info$lon
+  sites <- site_info$site_id
+
+  # open netcdf file and get the correct variable name based on varid parameter + var names of netcdf
+  data <- ncdf4::nc_open(url_info)
+  vars <- names(data$var)
+  var <- vars[grep(vars, pattern = varid, ignore.case = TRUE)]
+
+  # get list of all xy coordinates in netcdf
+  lats <- ncdf4::ncvar_get(data, "latitude")
+  lons <- ncdf4::ncvar_get(data, "longitude")
+
+  # find the cell that site coordinates are located in
+  dist_y <- foreach(i = mylats, .combine = cbind) %do% sqrt((lats - i)^2)
+  dist_x <- foreach(i = mylons, .combine = cbind) %do% sqrt((lons - i)^2)
+  y <- foreach(i = 1:ncol(dist_y), .combine = c) %do% which(dist_y[,i] == min(dist_y[,i]), arr.ind = TRUE)
+  x <- foreach(i = 1:ncol(dist_x), .combine = c) %do% which(dist_x[,i] == min(dist_x[,i]), arr.ind = TRUE)
+
+  scale <- data$var[[var]]$scaleFact
+
+  d <- as.vector(foreach(i = seq_along(x), .combine = rbind) %do% ncdf4::ncvar_get(data, var, start = c(x[i], y[i], 1), count = c(1,1,1)))
+
+  info <- as.data.frame(cbind(sites, mylons, mylats, d), stringsAsFactors = FALSE)
+  names(info) <- c("site_id", "lon", "lat", "value")
+
+  return(info)
+}
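
Two cautions on extract_thredds_nc() as split out above. First, `varid` is not an argument: it is resolved in the function's enclosing environment rather than in download_thredds()'s call frame, so it only works if a `varid` object happens to exist there, and %dopar% workers will not see it at all. Second, files opened with nc_open() are never closed. A hedged refactor sketch addressing both (extract_thredds_nc2 is not part of the patch):

    extract_thredds_nc2 <- function(site_info, url_info, varid = "LAI") {
      data <- ncdf4::nc_open(url_info)
      on.exit(ncdf4::nc_close(data))  # always release the connection
      # resolve the netcdf variable name from the explicit varid argument,
      # so parallel workers do not depend on the caller's environment
      var <- grep(varid, names(data$var), ignore.case = TRUE, value = TRUE)[1]
      var
    }

With foreach, the alternative is exporting the symbol to workers via the .export argument, but passing it explicitly keeps the function self-contained.
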
From 24b835fa41271c66d8b2a6b388473e9ca5d7cba3 Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Wed, 16 Oct 2019 14:45:28 -0400
Subject: [PATCH 08/11] fixed documentation issue

---
 modules/data.remote/NAMESPACE                 |  4 ++-
 modules/data.remote/R/download.thredds.R      | 14 ++++----
 .../data.remote/man/download.thredds.AGB.Rd   | 27 ---------------
 modules/data.remote/man/download.thredds.Rd   | 34 +++++++++++++++++++
 modules/data.remote/man/extract_thredds_nc.Rd | 24 +++++++++++++
 modules/data.remote/man/get_site_info.Rd      | 20 +++++++++++
 6 files changed, 87 insertions(+), 36 deletions(-)
 delete mode 100644 modules/data.remote/man/download.thredds.AGB.Rd
 create mode 100644 modules/data.remote/man/download.thredds.Rd
 create mode 100644 modules/data.remote/man/extract_thredds_nc.Rd
 create mode 100644 modules/data.remote/man/get_site_info.Rd

diff --git a/modules/data.remote/NAMESPACE b/modules/data.remote/NAMESPACE
index d84c728b44c..2bb4941b69a 100644
--- a/modules/data.remote/NAMESPACE
+++ b/modules/data.remote/NAMESPACE
@@ -3,6 +3,8 @@
 export(call_MODIS)
 export(download.LandTrendr.AGB)
 export(download.NLCD)
-export(download.thredds.AGB)
+export(download_thredds)
 export(extract.LandTrendr.AGB)
 export(extract_NLCD)
+export(extract_thredds_nc)
+export(get_site_info)

diff --git a/modules/data.remote/R/download.thredds.R b/modules/data.remote/R/download.thredds.R
index 0060d4c1e2f..b3b17b59891 100755
--- a/modules/data.remote/R/download.thredds.R
+++ b/modules/data.remote/R/download.thredds.R
@@ -13,7 +13,7 @@
 ##'
 ##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
-##'
+##' }
 ##' @export
 ##' @author Bailey Morrison
 ##'
@@ -73,7 +73,7 @@
 ##'
 ##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
-##'
+##' }
 ##'
 ##' @export
 ##' @author Bailey Morrison
@@ -191,7 +191,6 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 ##' @param site_info list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' @param url_info a THREDDS url of a .nc file to extract data from.
-##' @param run_parallel TRUE/FALSE option to extract data in parallel.
 ##'
 ##'
 ##' @return a dataframe with the values for each date/site combination from a THREDDS file
@@ -200,14 +199,13 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
 ##' \dontrun{
 ##' site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
 ##' url_info <- a THREDDS url of a .nc file to extract data from.
-##' run_parallel <- TRUE/FALSE option to extract data in parallel.
-##'
-##' site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
-##'
+##'
+##' output <- extract_thredds_nc(site_info = site_info, url_info = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc")
+##'}
 ##' @export
 ##' @author Bailey Morrison
 ##'
-extract_thredds_nc <- function(site_info, url_info, run_parallel)
+extract_thredds_nc <- function(site_info, url_info)
 {
   #print(url)

diff --git a/modules/data.remote/man/download.thredds.AGB.Rd b/modules/data.remote/man/download.thredds.AGB.Rd
deleted file mode 100644
index 35dfd405cd5..00000000000
--- a/modules/data.remote/man/download.thredds.AGB.Rd
+++ /dev/null
@@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/download.thredds.R
-\name{download.thredds.AGB}
-\alias{download.thredds.AGB}
-\title{download.thredds.AGB}
-\usage{
-download.thredds.AGB(outdir = NULL, site_ids, run_parallel = FALSE,
-  ncores = NULL)
-}
-\arguments{
-\item{outdir}{Where to place output}
-
-\item{site_ids}{What locations to download data at?}
-
-\item{run_parallel}{Logical. Download and extract files in parallel?}
-
-\item{ncores}{Optional. If run_parallel=TRUE how many cores to use? If left as NULL will select max number -1}
-}
-\value{
-data.frame summarize the results of the function call
-}
-\description{
-download.thredds.AGB
-}
-\author{
-Bailey Morrison
-}

diff --git a/modules/data.remote/man/download.thredds.Rd b/modules/data.remote/man/download.thredds.Rd
new file mode 100644
index 00000000000..048f78957ac
--- /dev/null
+++ b/modules/data.remote/man/download.thredds.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{download.thredds}
+\alias{download.thredds}
+\alias{download_thredds}
+\title{download.thredds}
+\usage{
+download_thredds(site_info, dates, varid, dir_url, data_url,
+  run_parallel = FALSE, outdir = NULL)
+}
+\arguments{
+\item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
+
+\item{dates}{vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.}
+
+\item{varid}{character vector of shorthand variable name, i.e. LAI}
+
+\item{dir_url}{catalog url of data from ncei.noaa.gov/thredds website}
+
+\item{data_url}{opendap url of data from ncei.noaa.gov/thredds website}
+
+\item{run_parallel}{Logical. Download and extract files in parallel?}
+
+\item{outdir}{file location to place output}
+}
+\value{
+data.frame summarize the results of the function call
+}
+\description{
+download.thredds
+}
+\author{
+Bailey Morrison
+}

diff --git a/modules/data.remote/man/extract_thredds_nc.Rd b/modules/data.remote/man/extract_thredds_nc.Rd
new file mode 100644
index 00000000000..1f8e41ed231
--- /dev/null
+++ b/modules/data.remote/man/extract_thredds_nc.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{extract_thredds_nc}
+\alias{extract_thredds_nc}
+\title{extract_thredds_nc}
+\usage{
+extract_thredds_nc(site_info, url_info, run_parallel)
+}
+\arguments{
+\item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
+
+\item{run_parallel}{TRUE/FALSE option to extract data in parallel.}
+
+\item{url_info}{a THREDDS url of a .nc file to extract data from.}
+}
+\value{
+a dataframe with the values for each date/site combination from a THREDDS file
+}
+\description{
+extract_thredds_nc
+}
+\author{
+Bailey Morrison
+}

diff --git a/modules/data.remote/man/get_site_info.Rd b/modules/data.remote/man/get_site_info.Rd
new file mode 100644
index 00000000000..e73834879ce
--- /dev/null
+++ b/modules/data.remote/man/get_site_info.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.thredds.R
+\name{get_site_info}
+\alias{get_site_info}
+\title{get_site_info}
+\usage{
+get_site_info(xmlfile)
+}
+\arguments{
+\item{xmlfile}{full path to pecan xml settings file}
+}
+\value{
+a list of site information derived from BETY using a pecan .xml settings file with site_id, site_name, lat, lon, and time_zone.
+}
+\description{
+get_site_info
+}
+\author{
+Bailey Morrison
+}
From bd6d4b57e8f101f2dcbac16d659de6e3125b94ee Mon Sep 17 00:00:00 2001
From: "bmorrison@bnl.gov"
Date: Thu, 24 Oct 2019 15:49:49 -0400
Subject: [PATCH 09/11] regenerate Rd documentation with updated usage and
 examples

---
 modules/data.remote/man/download.thredds.Rd   | 14 ++++++++++++++
 modules/data.remote/man/extract_thredds_nc.Rd | 12 +++++++++---
 modules/data.remote/man/get_site_info.Rd      |  7 +++++++
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/modules/data.remote/man/download.thredds.Rd b/modules/data.remote/man/download.thredds.Rd
index 048f78957ac..9983594ad5a 100644
--- a/modules/data.remote/man/download.thredds.Rd
+++ b/modules/data.remote/man/download.thredds.Rd
@@ -28,6 +28,20 @@ data.frame summarize the results of the function call
 }
 \description{
 download.thredds
+}
+\examples{
+\dontrun{
+outdir <- directory to store downloaded data
+site_info <- list that contains information about site_id, site_name, latitude, longitude, and time_zone
+dates <- date range to download data. Should be a vector of start and end date for dataset as YYYYmmdd, YYYY-mm-dd, YYYYjjj, or date object.
+varid <- character shorthand name of variable to download. Example: LAI for leaf area index.
+dir_url <- catalog url from THREDDS that is used to determine which files are available for download using OPENDAP
+data_url <- OpenDAP URL that actually downloads the netcdf file.
+run_parallel <- optional. Can be used to speed up the download process if there are more than 2 cores available on the computer
+
+results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = TRUE, outdir = NULL)
+}
+
 }
 \author{
 Bailey Morrison
 }

diff --git a/modules/data.remote/man/extract_thredds_nc.Rd b/modules/data.remote/man/extract_thredds_nc.Rd
index 1f8e41ed231..694fdafcc1c 100644
--- a/modules/data.remote/man/extract_thredds_nc.Rd
+++ b/modules/data.remote/man/extract_thredds_nc.Rd
@@ -4,13 +4,11 @@
 \alias{extract_thredds_nc}
 \title{extract_thredds_nc}
 \usage{
-extract_thredds_nc(site_info, url_info, run_parallel)
+extract_thredds_nc(site_info, url_info)
 }
 \arguments{
 \item{site_info}{list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.}
 
-\item{run_parallel}{TRUE/FALSE option to extract data in parallel.}
-
 \item{url_info}{a THREDDS url of a .nc file to extract data from.}
 }
 \value{
 a dataframe with the values for each date/site combination from a THREDDS file
 }
 \description{
 extract_thredds_nc
 }
+\examples{
+\dontrun{
+site_info <- list of information with the site_id, site_name, lat, lon, and time_zone. Derived from BETY using a PEcAn .xml settings file with site information. Can use the get_site_info function to generate this list.
+url_info <- a THREDDS url of a .nc file to extract data from.
+
+output <- extract_thredds_nc(site_info = site_info, url_info = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files/1995/AVHRR-Land_v005_AVH15C1_NOAA-14_19950201_c20180831220722.nc")
+}
+}
 \author{
 Bailey Morrison
 }

diff --git a/modules/data.remote/man/get_site_info.Rd b/modules/data.remote/man/get_site_info.Rd
index e73834879ce..98d06bff0f5 100644
--- a/modules/data.remote/man/get_site_info.Rd
+++ b/modules/data.remote/man/get_site_info.Rd
@@ -15,6 +15,13 @@ a list of site information derived from BETY using a pecan .xml settings file wi
 }
 \description{
 get_site_info
 }
+\examples{
+\dontrun{
+xmlfile <- the full path to a pecan .xml settings file.
+
+site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")
+ }
+}
 \author{
 Bailey Morrison
 }

From 1e1fbefda2a094d2f26838189ba091ec619cc3ff Mon Sep 17 00:00:00 2001
From: Morrison
Date: Thu, 14 May 2020 15:37:45 -0400
Subject: [PATCH 10/11] switch download_thredds to year/month selection with
 per-site max aggregation

---
 .../R/download.thredds.AVHRR.monthAGG.R       | 78 +++++++++----------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R b/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
index 0084adda9fd..16aaf2b2e6b 100755
--- a/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
+++ b/modules/data.remote/R/download.thredds.AVHRR.monthAGG.R
@@ -72,49 +72,32 @@ get_site_info <- function(xmlfile) {
 ##' run_parallel <- optional. Can be used to speed up download process if there are more than 2 cores available on computer
 ##'
-##' results <- download_thredds(site_info = site_info, dates = c("19950201", "19961215"), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = FALSE, outdir = NULL)
+##' results <- download_thredds(site_info = site_info, years = c(2000, 2003), months = c(6,7,8), varid = "LAI", dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files", data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files", run_parallel = FALSE, outdir = NULL)
 ##' }
 ##' @importFrom foreach %do% %dopar%
 ##' @export
 ##' @author Bailey Morrison
 ##'
-download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
+download_thredds <- function(site_info, years, months, varid, dir_url, data_url,run_parallel = FALSE, outdir = NULL) {
 
   #until the issues with parallel runs are fixed.
   run_parallel = FALSE
-  #require("foreach")
-
-
-  #### check that dates are within the date range of the dataset
-
-  #first make sure dates are in date format. Correct if not.
-  if (!(lubridate::is.Date(dates))){
-    if (!(is.character(dates))) {
-      dates = as.character(dates)
-    }
-    if (length(grep(dates, pattern = "-")) > 0) {
-      dates <- c(as.Date(dates[1], "%Y-%m-%d"), as.Date(dates[2], "%Y-%m-%d"))
-    } else {
-      dates <- c(as.Date(dates[1], "%Y%m%d"), as.Date(dates[2], "%Y%m%d"))
-    }
-    # Julien Date
-    if (any(nchar(dates) == 7)) {
-      dates <- c(as.Date(dates[1], "%Y%j"), as.Date(dates[2], "%Y%j"))
-    }
-  }
-
-  date_range = unique(lubridate::year(seq(dates[1], dates[2], by = '1 year')))
 
-  output = data.frame()
+  # repeat each year 31 times (the maximum number of days in any month), which also covers leap years
+  years_range = sort(rep(seq(years[1], years[2]), 31))
 
   if (!(is.null(dir_url)))
   {
-    for (i in seq_along(date_range))
+    output = data.frame()
+
+    for (i in seq_along(unique(years_range)))
     {
-      result <- RCurl::getURL(paste(dir_url, date_range[i], "/catalog.html", sep = "/"),
+      result <- RCurl::getURL(paste(dir_url, unique(years_range)[i], "/catalog.html", sep = "/"),
                               verbose=FALSE ,ftp.use.epsv = TRUE, dirlistonly = TRUE)
       files <- XML::getHTMLLinks(result)
 
-      index_dates <- regexpr(pattern = "_[0-9]{8}_", files)
+      index_dates <- regexpr(pattern = paste0("_[0-9]{4}0[", months[1], "-", months[length(months)], "]{1}[0-9]{2}_"), files)
       files <- files[-(which(index_dates < 0))]
       index_dates <- index_dates[which(index_dates > 0)]
 
@@ -148,19 +131,35 @@ download_thredds <- function(site_info, years, months, varid, dir_url, data_url,
       } else {
         out <- foreach::foreach(i = urls, .combine = rbind) %do%
           extract_thredds_nc(site_info, url_info = i)
-      }
-      output = rbind(output, out)
-
-      if (!(is.null(outdir)))
-      {
-        # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
-        write.csv(out, file = paste(outdir, "/THREDDS_", varid, "_", dates[1], "-", dates[2], ".csv", sep = ""))
-      }
+
+        # keep only the max LAI for each site, rather than every day with NA filler values
+        site_max = foreach::foreach(i = unique(out$site_id), .combine = rbind) %do%
+          max_lai(x = out, site = i)
+        site_max$date = lubridate::year(site_max$date)
+
+        output = rbind(output, site_max)
+
+      }
     }
-  }
+
+    # if (!(is.null(outdir)))
+    # {
+    #   # this will need to be changed in the future if users want to be able to save data they haven't already extracted at different sites/dates.
+    #   write.csv(output, file = paste(outdir, "/THREDDS_", varid, "_", years[1], "-", years[2], "_",months[1], "-", months[length(months)], ".csv", sep = ""))
+    # }
+
     return(output)
   }
-}
+}
+
+
+
+
+max_lai = function(x, site)
+{
+  site_info_max = as.data.frame(x[x$site_id == site,][1,1:4], stringsAsFactors = FALSE)
+  site_info_max$max = as.numeric(max(x[x$site_id == site,]$value, na.rm = TRUE))
+  return(site_info_max)
+}
@@ -186,9 +185,6 @@ max_lai = function(x, site)
 extract_thredds_nc <- function(site_info, url_info)
 {
-  #print(url)
-  #require("foreach")
-  #require("ncdf4")
   index = regexpr(pattern = "_[0-9]{8}_", url_info)
   date<- as.Date(substr(url_info, index+1, index+8), "%Y%m%d")

From 936e3b77c4daa29d5530e674306b1adab4a6e008 Mon Sep 17 00:00:00 2001
From: Morrison
Date: Mon, 1 Jun 2020 22:54:08 -0400
Subject: [PATCH 11/11] update thredds function

---
 modules/data.remote/R/download.thredds.AVHRR.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/data.remote/R/download.thredds.AVHRR.R b/modules/data.remote/R/download.thredds.AVHRR.R
index 9dfad069062..5575396db58 100755
--- a/modules/data.remote/R/download.thredds.AVHRR.R
+++ b/modules/data.remote/R/download.thredds.AVHRR.R
@@ -146,8 +146,11 @@ download_thredds <- function(site_info, dates, varid, dir_url, data_url,run_para
       #   extract_thredds_nc(site_info = site_info, url_info = i)
       # parallel::stopCluster(cl)
     } else {
-      out <- foreach::foreach(i = urls, .combine = rbind) %do%
-        extract_thredds_nc(site_info, url_info = i)
+      #start_time <- Sys.time()
+      out <- foreach::foreach(j = urls, .combine = rbind) %do%
+        extract_thredds_nc(site_info, url_info = j)
+      # end_time <- Sys.time()
+      # end_time - start_time
     }
     output = rbind(output, out)
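
Taken together, the series leaves the monthAGG variant with a year/month interface that returns one max-LAI row per site and year. A hedged end-to-end sketch of the resulting workflow; the xmlfile path comes from the examples above, and the month-window regex only supports single-digit, consecutive months:

    # build the site list from a PEcAn settings file (path from the doc examples)
    site_info <- get_site_info(xmlfile = "/data/bmorrison/sda/lai/pecan_MultiSite_SDA_LAI_AGB_8_Sites_2009.xml")

    # summer max LAI per site for 2000-2003 (years numeric, per the fixed example)
    lai <- download_thredds(site_info = site_info,
                            years = c(2000, 2003), months = c(6, 7, 8),
                            varid = "LAI",
                            dir_url = "https://www.ncei.noaa.gov/thredds/catalog/cdr/lai/files",
                            data_url = "https://www.ncei.noaa.gov/thredds/dodsC/cdr/lai/files",
                            run_parallel = FALSE, outdir = NULL)
    head(lai)  # site_id, lon, lat, date (year), max
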