Skip to content

Commit

Permalink
package processing updates
Browse files Browse the repository at this point in the history
moving preprocessing into the package structure to ensure traceability.
However, vector size could be a concern, requiring larger memory.
  • Loading branch information
realxinzhao committed Aug 7, 2023
1 parent dfb7c88 commit 6647ca6
Show file tree
Hide file tree
Showing 13 changed files with 2,784 additions and 1,037 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ export(standardize_iso)
export(unprotect_integer_cols)
importFrom(assertthat,assert_that)
importFrom(data.table,data.table)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,first)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_at)
Expand All @@ -48,6 +48,7 @@ importFrom(dplyr,right_join)
importFrom(dplyr,select)
importFrom(dplyr,summarise)
importFrom(dplyr,summarise_all)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(dplyr,vars)
importFrom(grDevices,rainbow)
Expand Down
55 changes: 55 additions & 0 deletions R/utils-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,61 @@ get_data <- function(all_data, name, strip_attributes = FALSE) {
}
}

#' get_data_list
#'
#' This function calls \code{get_data} for each data name in \code{data_list} and assigns
#' the resulting data into the given \code{environ} with the data name being used as the
#' variable name. Note: for values in data_list "named" FILE the "basename" of the string
#' will be used as the variable name.
#'
#' @param all_data Data structure
#' @param data_list A character vector of data to load into the given environment
#' @param strip_attributes A logical vector which will be passed on to \code{get_data}. The length
#' must be 1 (use the save logical for all values of data_list) or match the length of
#' \code{data_list} (one logical value for each data_list item).
#' @param environ The environment into which the data should be loaded. If NULL (the default)
#' the caller's environment will be used.
get_data_list <- function(all_data, data_list, strip_attributes = FALSE, environ = NULL) {
# expecting a (potentially named) character vector for data_list
assertthat::assert_that(is.character(data_list))
data_list_names <- names(data_list)

# strip_attributes must be logical and either be length 1 (in which case the same value
# will be used for all calls to get_data) or match the length of data_list (values will be
# matched up per index)
assertthat::assert_that(is.logical(strip_attributes))
assertthat::assert_that(length(strip_attributes) == 1 || length(strip_attributes) == length(data_list))
if(length(strip_attributes) == 1) {
strip_attributes = rep_len(strip_attributes, length(data_list))
}

# if no environment was explicitly given the default behavior is to load into the caller's
# environment
if(is.null(environ)) {
environ = parent.frame()
}

# loop over each data_list, call get_data and assign the result to the same data name in
# the given environment
for(i in seq_along(data_list)) {
curr_var_name <- data_list[i]
# the variable name to assign for FILE is the "basename" of the file
# i.e. `FILE = "common/GCAM_region_names"` will result in `GCAM_region_names` being set
if(!is.null(data_list_names) && data_list_names[i] %in% c("FILE", "OPTIONAL_FILE")) {
# Note: strsplit returns a list (one per each str to be split) of character vector
# (one for each token split out). Given we are split one string at a time
# we will just grab the first element of the list (`[[1]]`)
data_name_split = strsplit(curr_var_name, "/")[[1]]
# get the last element of the char vec to use as the var name
curr_var_name = tail(data_name_split, n = 1)
}
# get the data
data = get_data(all_data, data_list[i], strip_attributes[i])
# assign it into the environment
assign(curr_var_name, data, envir = environ)
}
}


#' return_data
#'
Expand Down
6 changes: 6 additions & 0 deletions R/xfaostat_L100_constants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright 2019 Battelle Memorial Institute; see the LICENSE file.

# General behavior constants ======================================================================

# having issues with package check here
DIR_RAW_DATA_FAOSTAT <- system.file("extdata", "aglu/FAO/FAOSTAT", package = "gcamdata")
196 changes: 196 additions & 0 deletions R/xfaostat_L101_RawDataPreProcessing1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Copyright 2019 Battelle Memorial Institute; see the LICENSE file.

#' module_xfaostat_L101_RawDataPreProcessing1
#'
#' Preprocess raw faostat data
#'
#' @param command API command to execute
#' @param ... other optional parameters, depending on command
#' @return Depends on \code{command}: either a vector of required inputs, a vector of output names, or (if
#' \code{command} is "MAKE") all the generated outputs
#' @details This chunk compiles balanced supply utilization data in primary equivalent in GCAM region and commodities.
#' @importFrom assertthat assert_that
#' @importFrom dplyr summarize bind_rows filter if_else inner_join left_join mutate rename select n group_by_at
#' first case_when vars
#' @importFrom tibble tibble
#' @importFrom tidyr complete drop_na gather nesting spread replace_na
#' @author XZ 2023
module_xfaostat_L101_RawDataPreProcessing1 <- function(command, ...) {

MODULE_INPUTS <-
c(FILE = "aglu/AGLU_ctry")

MODULE_OUTPUTS <-
c("QCL", # Ag production quantity and harvested area
"PP", # Producer prices
"SCL", # Supply utilization accounting
"FBS", # New food balance sheet
"QCL_area_code_map" # Country code
)

if(command == driver.DECLARE_INPUTS) {
return(MODULE_INPUTS)
} else if(command == driver.DECLARE_OUTPUTS) {
return(MODULE_OUTPUTS)
} else if(command == driver.MAKE) {

year <- value <- Year <- Value <- FAO_country <- iso <- NULL # silence package check.

all_data <- list(...)[[1]]

# Load required inputs ----

get_data_list(all_data, MODULE_INPUTS, strip_attributes = TRUE)

#source("data-raw/generate_package_data_faostat_helper_funcs.R")
#DIR_RAW_DATA_FAOSTAT <- system.file("extdata", "aglu/FAO/FAOSTAT", package = "gcamdata.faostat")

# *[QCL] FAOSTAT Production and area ----

## Load raw data
FAOSTAT_load_raw_data(DATASETCODE = "QCL", DATA_FOLDER = DIR_RAW_DATA_FAOSTAT)


QCL %>%
# Remove aggregated areas and items
filter(area_code < 350, item_code < 1700) %>%
select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
# When dealing with animal/livestock data, units are important
# Prod Popultn (5314) for Beewax and honey is removed since data is only before 1990
filter(element_code != 5314) %>%
# Remove NA for simplicity for now; expend.grid later
# All Coir (coconut fiber) is filtered out due to NA
filter(!is.na(value)) %>%
# remove accent
rm_accent("item", "area") -> QCL1


QCL1 %>%
distinct(area_code, area) %>%
add_title("FAO primary production country and code") %>%
add_units("NA") %>%
add_comments("FAO Country and code") ->
QCL_area_code_map

# Other data uses OCL area for consistency
QCL_area_code <- QCL1 %>% distinct(area_code) %>% pull()


QCL1 %>%
add_title("FAO primary production") %>%
add_units("USD/tonne") %>%
add_comments("Preprocessed FAOSTAT primary production") ->
QCL



# *[PP] Producer price ----

FAOSTAT_load_raw_data(DATASETCODE = "PP", DATA_FOLDER = DIR_RAW_DATA_FAOSTAT)

PP %>% distinct(element, element_code, unit)


PP %>%
filter(area_code < 350, # rm aggregated regions
item_code < 1700, #rm aggregated items
area_code %in% QCL_area_code, # only keep regions with production
element_code %in% c(5532, 5539)) %>% #keep USD/tonne and index
rm_accent("item", "area") -> PP1


# Using index to fill in missing across years
PP1 %>%
filter(element_code %in% c(5532, 5539)) %>%
select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
# Not completing year and area here
spread(element, value) %>%
left_join(
PP1 %>% filter(element_code %in% c(5532, 5539)) %>%
select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
spread(element, value) %>%
rename(pp_base = `Producer Price (USD/tonne)`,
pp_baseindex = `Producer Price Index (2014-2016 = 100)`) %>%
filter(!is.na(pp_base)) %>%
group_by(area, area_code, item) %>%
filter(year == 2015) %>% within(rm(year)) %>%
ungroup(),
by = c("area_code", "area", "item_code", "item")
) %>% mutate(
`Producer Price (USD/tonne)` = if_else(is.na(`Producer Price (USD/tonne)`),
pp_base* `Producer Price Index (2014-2016 = 100)` /pp_baseindex,
`Producer Price (USD/tonne)`)
) %>%
select(area_code, area, item_code, item, year, `Producer Price (USD/tonne)`) %>%
gather(element, value, `Producer Price (USD/tonne)`) %>%
mutate(element_code = 5532) -> PP2


### output PP and clean memory ----
PP2 %>%
add_title("FAO producer prices") %>%
add_units("USD/tonne") %>%
add_comments("Preprocessed FAOSTAT producer prices") ->
PP

rm(PP, PP1, PP2)

# Food balance and Supply-Utilization-Account

## *[FBS] new food balance sheet (2010-) ----

## Load raw data
FAOSTAT_load_raw_data("FBS") # New FBS 2010+
FBS %>% distinct(element, element_code, unit)

FBS %>% filter(item_code < 2901, item_code != 2501,
!element_code %in% c(511, 5301),
area_code %in% QCL_area_code) %>%
select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
rm_accent("item", "area") -> FBS1


### output FBS and clean memory ----
FBS1 %>%
add_title("FAO SCL") %>%
add_units("tonne") %>%
add_comments("Preprocessed FAOSTAT SCL") ->
FBS
rm(FBS, FBS1)


## *[SCL] SUA: supply utilization accounting ----

FAOSTAT_load_raw_data("SCL") # SUA 2010+
SCL %>% distinct(element, element_code, unit)
# FAOSTAT "accidentally" used CPC code in SCL;
if (is.numeric(SCL$item_code)) {
SCL %>% filter(item_code <= 1700, item_code != 1) -> SCL
}

SCL %>% filter(!element_code %in% c(664, 665, 674, 684, 511),
# it is not useful to calculate cal/g using `Food supply (kcal/capita/day)` /`Food supply quantity (g/capita/day)`
# unit too small so remove them here
# `Calories/Year` / `Food supply quantity (tonnes)` is more accurate!
# similarly for protein and fat
# Use annual value in SUA to calculate the conversion rate!
area_code %in% QCL_area_code) %>%
select(area_code, area, item_code, item, element_code, element, year, value, unit) %>%
rm_accent("item", "area") -> SCL1


### output SCL and clean memory ----
SCL1 %>%
add_title("FAO SCL") %>%
add_units("tonne") %>%
add_comments("Preprocessed FAOSTAT SCL") ->
SCL
rm(SCL, SCL1)


return_data(MODULE_OUTPUTS)

} else {
stop("Unknown command")
}
}
Loading

0 comments on commit 6647ca6

Please sign in to comment.