package processing updates

moving preprocessing into the package structure to ensure traceability. However, vector size could be a concern, requiring larger memory.
JGCRI · Aug 7, 2023 · 6647ca6 · 6647ca6
1 parent dfb7c88
commit 6647ca6
Show file tree

Hide file tree

Showing 13 changed files with 2,784 additions and 1,037 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -28,11 +28,11 @@ export(standardize_iso)
 export(unprotect_integer_cols)
 importFrom(assertthat,assert_that)
 importFrom(data.table,data.table)
-importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,distinct)
 importFrom(dplyr,filter)
+importFrom(dplyr,first)
 importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_at)
@@ -48,6 +48,7 @@ importFrom(dplyr,right_join)
 importFrom(dplyr,select)
 importFrom(dplyr,summarise)
 importFrom(dplyr,summarise_all)
+importFrom(dplyr,summarize)
 importFrom(dplyr,ungroup)
 importFrom(dplyr,vars)
 importFrom(grDevices,rainbow)

diff --git a/R/utils-data.R b/R/utils-data.R
@@ -233,6 +233,61 @@ get_data <- function(all_data, name, strip_attributes = FALSE) {
   }
 }
 
+#' get_data_list
+#'
+#' This function calls \code{get_data} for each data name in \code{data_list} and assigns
+#' the resulting data into the given \code{environ} with the data name being used as the
+#' variable name.  Note: for values in data_list "named" FILE the "basename" of the string
+#' will be used as the variable name.
+#'
+#' @param all_data Data structure
+#' @param data_list A character vector of data to load into the given environment
+#' @param strip_attributes A logical vector which will be passed on to \code{get_data}.  The length
+#' must be 1 (use the save logical for all values of data_list) or match the length of
+#' \code{data_list} (one logical value for each data_list item).
+#' @param environ The environment into which the data should be loaded.  If NULL (the default)
+#' the caller's environment will be used.
+get_data_list <- function(all_data, data_list, strip_attributes = FALSE, environ = NULL) {
+  # expecting a (potentially named) character vector for data_list
+  assertthat::assert_that(is.character(data_list))
+  data_list_names <- names(data_list)
+
+  # strip_attributes must be logical and either be length 1 (in which case the same value
+  # will be used for all calls to get_data) or match the length of data_list (values will be
+  # matched up per index)
+  assertthat::assert_that(is.logical(strip_attributes))
+  assertthat::assert_that(length(strip_attributes) == 1 || length(strip_attributes) == length(data_list))
+  if(length(strip_attributes) == 1) {
+    strip_attributes = rep_len(strip_attributes, length(data_list))
+  }
+
+  # if no environment was explicitly given the default behavior is to load into the caller's
+  # environment
+  if(is.null(environ)) {
+    environ = parent.frame()
+  }
+
+  # loop over each data_list, call get_data and assign the result to the same data name in
+  # the given environment
+  for(i in seq_along(data_list)) {
+    curr_var_name <- data_list[i]
+    # the variable name to assign for FILE is the "basename" of the file
+    # i.e. `FILE = "common/GCAM_region_names"` will result in `GCAM_region_names` being set
+    if(!is.null(data_list_names) && data_list_names[i] %in% c("FILE", "OPTIONAL_FILE")) {
+      # Note: strsplit returns a list (one per each str to be split) of character vector
+      # (one for each token split out).  Given we are split one string at a time
+      # we will just grab the first element of the list (`[[1]]`)
+      data_name_split = strsplit(curr_var_name, "/")[[1]]
+      # get the last element of the char vec to use as the var name
+      curr_var_name = tail(data_name_split, n = 1)
+    }
+    # get the data
+    data = get_data(all_data, data_list[i], strip_attributes[i])
+    # assign it into the environment
+    assign(curr_var_name, data, envir = environ)
+  }
+}
+
 
 #' return_data
 #'

diff --git a/R/xfaostat_L100_constants.R b/R/xfaostat_L100_constants.R
@@ -0,0 +1,6 @@
+# Copyright 2019 Battelle Memorial Institute; see the LICENSE file.
+
+# General behavior constants ======================================================================
+
+# having issues with package check here
+ DIR_RAW_DATA_FAOSTAT <- system.file("extdata", "aglu/FAO/FAOSTAT", package = "gcamdata")
diff --git a/R/xfaostat_L101_RawDataPreProcessing1.R b/R/xfaostat_L101_RawDataPreProcessing1.R
@@ -0,0 +1,196 @@
+# Copyright 2019 Battelle Memorial Institute; see the LICENSE file.
+
+#' module_xfaostat_L101_RawDataPreProcessing1
+#'
+#' Preprocess raw faostat data
+#'
+#' @param command API command to execute
+#' @param ... other optional parameters, depending on command
+#' @return Depends on \code{command}: either a vector of required inputs, a vector of output names, or (if
+#'   \code{command} is "MAKE") all the generated outputs
+#' @details This chunk compiles balanced supply utilization data in primary equivalent in GCAM region and commodities.
+#' @importFrom assertthat assert_that
+#' @importFrom dplyr summarize bind_rows filter if_else inner_join left_join mutate rename select n group_by_at
+#' first case_when vars
+#' @importFrom tibble tibble
+#' @importFrom tidyr complete drop_na gather nesting spread replace_na
+#' @author XZ 2023
+module_xfaostat_L101_RawDataPreProcessing1 <- function(command, ...) {
+
+  MODULE_INPUTS <-
+    c(FILE = "aglu/AGLU_ctry")
+
+  MODULE_OUTPUTS <-
+    c("QCL",              # Ag production quantity and harvested area
+      "PP",               # Producer prices
+      "SCL",              # Supply utilization accounting
+      "FBS",              # New food balance sheet
+      "QCL_area_code_map" # Country code
+    )
+
+  if(command == driver.DECLARE_INPUTS) {
+    return(MODULE_INPUTS)
+  } else if(command == driver.DECLARE_OUTPUTS) {
+    return(MODULE_OUTPUTS)
+  } else if(command == driver.MAKE) {
+
+    year <- value <- Year <- Value <- FAO_country <- iso <- NULL    # silence package check.
+
+    all_data <- list(...)[[1]]
+
+    # Load required inputs ----
+
+    get_data_list(all_data, MODULE_INPUTS, strip_attributes = TRUE)
+
+    #source("data-raw/generate_package_data_faostat_helper_funcs.R")
+    #DIR_RAW_DATA_FAOSTAT <- system.file("extdata", "aglu/FAO/FAOSTAT", package = "gcamdata.faostat")
+
+    # *[QCL] FAOSTAT Production and area ----
+
+    ## Load raw data
+    FAOSTAT_load_raw_data(DATASETCODE = "QCL", DATA_FOLDER = DIR_RAW_DATA_FAOSTAT)
+
+
+    QCL %>%
+      # Remove aggregated areas and items
+      filter(area_code < 350, item_code < 1700) %>%
+      select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
+      # When dealing with animal/livestock data, units are important
+      # Prod Popultn (5314) for Beewax and honey is removed since data is only before 1990
+      filter(element_code != 5314) %>%
+      # Remove NA for simplicity for now; expend.grid later
+      # All Coir (coconut fiber) is filtered out due to NA
+      filter(!is.na(value)) %>%
+      # remove accent
+      rm_accent("item", "area") -> QCL1
+
+
+  QCL1 %>%
+    distinct(area_code, area) %>%
+      add_title("FAO primary production country and code") %>%
+      add_units("NA") %>%
+      add_comments("FAO Country and code") ->
+      QCL_area_code_map
+
+    # Other data uses OCL area for consistency
+    QCL_area_code <- QCL1 %>% distinct(area_code) %>% pull()
+
+
+    QCL1 %>%
+      add_title("FAO primary production") %>%
+      add_units("USD/tonne") %>%
+      add_comments("Preprocessed FAOSTAT primary production") ->
+      QCL
+
+
+
+    # *[PP] Producer price ----
+
+    FAOSTAT_load_raw_data(DATASETCODE = "PP", DATA_FOLDER = DIR_RAW_DATA_FAOSTAT)
+
+    PP %>% distinct(element, element_code, unit)
+
+
+    PP %>%
+      filter(area_code < 350,  # rm aggregated regions
+             item_code < 1700, #rm aggregated items
+             area_code %in% QCL_area_code, # only keep regions with production
+             element_code %in% c(5532, 5539)) %>% #keep USD/tonne and index
+      rm_accent("item", "area") -> PP1
+
+
+    # Using index to fill in missing across years
+    PP1 %>%
+      filter(element_code %in% c(5532, 5539)) %>%
+      select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
+      # Not completing year and area here
+      spread(element, value) %>%
+      left_join(
+        PP1 %>% filter(element_code %in% c(5532, 5539)) %>%
+          select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
+          spread(element, value) %>%
+          rename(pp_base = `Producer Price (USD/tonne)`,
+                 pp_baseindex = `Producer Price Index (2014-2016 = 100)`) %>%
+          filter(!is.na(pp_base)) %>%
+          group_by(area, area_code, item) %>%
+          filter(year == 2015) %>% within(rm(year)) %>%
+          ungroup(),
+        by = c("area_code", "area", "item_code", "item")
+      ) %>% mutate(
+        `Producer Price (USD/tonne)` = if_else(is.na(`Producer Price (USD/tonne)`),
+                                               pp_base* `Producer Price Index (2014-2016 = 100)` /pp_baseindex,
+                                               `Producer Price (USD/tonne)`)
+      )  %>%
+      select(area_code, area, item_code, item, year, `Producer Price (USD/tonne)`) %>%
+      gather(element, value, `Producer Price (USD/tonne)`) %>%
+      mutate(element_code = 5532) -> PP2
+
+
+    ### output PP and clean memory ----
+    PP2 %>%
+      add_title("FAO producer prices") %>%
+      add_units("USD/tonne") %>%
+      add_comments("Preprocessed FAOSTAT producer prices") ->
+      PP
+
+    rm(PP, PP1, PP2)
+
+    # Food balance and Supply-Utilization-Account
+
+    ## *[FBS] new food balance sheet (2010-) ----
+
+    ## Load raw data
+    FAOSTAT_load_raw_data("FBS") # New FBS  2010+
+    FBS %>% distinct(element, element_code, unit)
+
+    FBS %>% filter(item_code < 2901, item_code != 2501,
+                   !element_code %in% c(511, 5301),
+                   area_code %in% QCL_area_code) %>%
+      select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
+      rm_accent("item", "area") -> FBS1
+
+
+    ### output FBS and clean memory ----
+    FBS1 %>%
+      add_title("FAO SCL") %>%
+      add_units("tonne") %>%
+      add_comments("Preprocessed FAOSTAT SCL") ->
+      FBS
+    rm(FBS, FBS1)
+
+
+    ## *[SCL] SUA: supply utilization accounting ----
+
+    FAOSTAT_load_raw_data("SCL")   # SUA      2010+
+    SCL %>% distinct(element, element_code, unit)
+    # FAOSTAT "accidentally" used CPC code in SCL;
+    if (is.numeric(SCL$item_code)) {
+      SCL %>% filter(item_code <= 1700, item_code != 1) -> SCL
+    }
+
+    SCL %>% filter(!element_code %in% c(664, 665, 674, 684, 511),
+                   # it is not useful to calculate cal/g using `Food supply (kcal/capita/day)` /`Food supply quantity (g/capita/day)`
+                   # unit too small so remove them here
+                   # `Calories/Year` / `Food supply quantity (tonnes)` is more accurate!
+                   # similarly for protein and fat
+                   # Use annual value in SUA to calculate the conversion rate!
+                   area_code %in% QCL_area_code) %>%
+      select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
+      rm_accent("item", "area") -> SCL1
+
+
+     ### output SCL and clean memory ----
+    SCL1 %>%
+       add_title("FAO SCL") %>%
+       add_units("tonne") %>%
+       add_comments("Preprocessed FAOSTAT SCL") ->
+      SCL
+     rm(SCL, SCL1)
+
+
+    return_data(MODULE_OUTPUTS)
+
+  } else {
+    stop("Unknown command")
+  }
+}