Run styler::style_pkg() to make code formatting more consistent #160

Open · wants to merge 3 commits into base: main
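For reference, the restyling in this PR can presumably be reproduced from the package root with the styler package. A minimal sketch, assuming styler is installed:

# Restyle every R source file in the package using styler's default
# (tidyverse) style guide; only files whose formatting deviates from the
# guide are rewritten.
# install.packages("styler")   # uncomment if styler is not yet installed
library(styler)
style_pkg()

Individual files or directories can be restyled the same way with style_file() and style_dir().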
30 changes: 18 additions & 12 deletions R/check_date_sequence.R
@@ -37,9 +37,11 @@
#' target_columns = c("date_first_pcr_positive_test", "date.of.admission")
#' )
check_date_sequence <- function(data, target_columns) {
checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,
max.len = dim(data)[2], null.ok = FALSE,
unique = TRUE)
checkmate::assert_vector(target_columns,
any.missing = FALSE, min.len = 1L,
max.len = dim(data)[2], null.ok = FALSE,
unique = TRUE
)
checkmate::assert_data_frame(data, null.ok = FALSE)

# get the correct names in case some have been modified - see the
@@ -52,27 +54,31 @@ check_date_sequence <- function(data, target_columns) {
# check if all columns are part of the data frame
if (any(missing_cols)) {
warning("Removing unrecognised column name: ", target_columns[missing_cols],
call. = FALSE)
call. = FALSE
)
target_columns <- target_columns[!missing_cols]
if (length(target_columns) < 2L) {
stop("\nAt least 2 event dates are required!")
}
}

# checking the date sequence
tmp_data <- data %>% dplyr::select(dplyr::all_of(target_columns))
tmp_data <- data %>% dplyr::select(dplyr::all_of(target_columns))
order_date <- apply(tmp_data, 1L, is_date_sequence_ordered)
bad_order <- which(!order_date)
bad_order <- which(!order_date)
if (!all(order_date)) {
tmp_data <- tmp_data[bad_order, ]
# adding incorrect records to the report
data <- add_to_report(x = data,
key = "incorrect_date_sequence",
value = tmp_data)
data <- add_to_report(
x = data,
key = "incorrect_date_sequence",
value = tmp_data
)
warning("Detected ", length(bad_order),
" incorrect date sequences at line(s): ",
toString(bad_order),
call. = FALSE)
" incorrect date sequences at line(s): ",
toString(bad_order),
call. = FALSE
)
}

return(data)
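To illustrate the kind of change applied in this file, here is a small sketch using styler::style_text() on the original assert_vector() call, assuming the default tidyverse style guide (exact output may differ slightly between styler versions):

library(styler)

# Re-style a single multi-line call; this should move the arguments onto
# indented continuation lines and place the closing parenthesis on its own
# line, matching the new formatting shown in the diff above.
style_text(c(
  "checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,",
  "                         max.len = dim(data)[2], null.ok = FALSE,",
  "                         unique = TRUE)"
))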
55 changes: 32 additions & 23 deletions R/clean_data.R
@@ -62,36 +62,43 @@
#' replace_missing_values <- list(target_columns = NULL, na_strings = "-99")
#'
#' # Parameters for duplicates removal across all columns
#' remove_duplicates <- list(target_columns = NULL)
#' remove_duplicates <- list(target_columns = NULL)
#'
#' # Parameters for dates standardization
#' standardize_dates <- list(target_columns = NULL,
#' error_tolerance = 0.4,
#' format = NULL,
#' timeframe = as.Date(c("1973-05-29",
#' "2023-05-29")),
#' orders = list(
#' world_named_months = c("Ybd", "dby"),
#' world_digit_months = c("dmy", "Ymd"),
#' US_formats = c("Omdy", "YOmd")
#' ),
#' modern_excel = TRUE)
#' standardize_dates <- list(
#' target_columns = NULL,
#' error_tolerance = 0.4,
#' format = NULL,
#' timeframe = as.Date(c(
#' "1973-05-29",
#' "2023-05-29"
#' )),
#' orders = list(
#' world_named_months = c("Ybd", "dby"),
#' world_digit_months = c("dmy", "Ymd"),
#' US_formats = c("Omdy", "YOmd")
#' ),
#' modern_excel = TRUE
#' )
#'
#' # Parameters for subject IDs standardization
#' standardize_subject_ids <- list(target_columns = "study_id",
#' prefix = "PS",
#' suffix = "P2",
#' range = c(1, 100),
#' nchar = 7)
#' standardize_subject_ids <- list(
#' target_columns = "study_id",
#' prefix = "PS",
#' suffix = "P2",
#' range = c(1, 100),
#' nchar = 7
#' )
#'
#' to_numeric <- list(target_columns = "sex", lang = "en")
#'
#'
#' # dictionary = NULL the dictionary-based cleaning will not be performed here
#'
#' cleaned_data <- clean_data(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi"
#' )),
#' params = list(
#' standardize_column_names = standardize_col_names,
#' remove_constants = remove_cte,
@@ -112,10 +119,12 @@ clean_data <- function(data, params = NULL) {
checkmate::assert_list(params, min.len = 1L, max.len = 10L, null.ok = TRUE)
checkmate::check_names(
params,
subset.of = c("standardize_column_names", "remove_constants",
"replace_missing_values", "remove_duplicates",
"standardize_dates", "standardize_subject_ids",
"to_numeric", "dictionary", "check_date_sequence", "span")
subset.of = c(
"standardize_column_names", "remove_constants",
"replace_missing_values", "remove_duplicates",
"standardize_dates", "standardize_subject_ids",
"to_numeric", "dictionary", "check_date_sequence", "span"
)
)

## -----
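For context, a condensed sketch of the documented clean_data() example above, using only the parameter lists fully visible in this hunk (operations left out of params should simply not be applied, per the name checks in clean_data()):

library(cleanepi)

# Minimal params list assembled from the documented example above; each
# element configures one cleaning operation.
params <- list(
  replace_missing_values = list(target_columns = NULL, na_strings = "-99"),
  remove_duplicates = list(target_columns = NULL),
  to_numeric = list(target_columns = "sex", lang = "en")
)

cleaned_data <- clean_data(
  data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")),
  params = params
)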
40 changes: 24 additions & 16 deletions R/clean_data_helpers.R
@@ -15,35 +15,40 @@ scan_columns <- function(x) {

# --- get the proportion of NA ---
are_na <- round((sum(is.na(x)) / n_rows), 6L)
x <- x[!is.na(x)]
x <- x[!is.na(x)]

# --- get the proportion of numeric values ---
tmp <- suppressWarnings(as.numeric(x))
tmp <- suppressWarnings(as.numeric(x))
are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L)

# --- get the proportion of date values ---
x <- x[which(is.na(tmp))]
x <- x[which(is.na(tmp))]
are_date <- 0L
if (!is.null(lubridate::guess_formats(x, c("ymd", "ydm", "dmy", "mdy", "myd",
"dym", "Ymd", "Ydm", "dmY", "mdY",
"mYd", "dYm")))) {
if (!is.null(lubridate::guess_formats(x, c(
"ymd", "ydm", "dmy", "mdy", "myd",
"dym", "Ymd", "Ydm", "dmY", "mdY",
"mYd", "dYm"
)))) {
x <- suppressWarnings(
as.Date(
lubridate::parse_date_time(
x, orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm")
x,
orders = c(
"ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm"
)
)
)
)
are_date <- round((sum(!is.na(x)) / n_rows), 6L)
}

# --- get the proportion of logical values ---
are_logical <- round((sum(is.logical(x)) / n_rows), 6L)
are_logical <- round((sum(is.logical(x)) / n_rows), 6L)

# --- get the proportion of character values ---
are_character <- round((1.0 - (are_na + are_numeric +
are_date + are_logical)), 6L)
are_date + are_logical)), 6L)

# --- return the output ---
return(c(are_na, are_numeric, are_date, are_character, are_logical))
@@ -63,14 +68,17 @@ scan_columns <- function(x) {
#' @examples
#' scan_result <- scan_data(
#' data = readRDS(system.file("extdata", "messy_data.RDS",
#' package = "cleanepi"))
#' package = "cleanepi"
#' ))
#' )
scan_data <- function(data) {
scan_result <- data.frame(t(apply(data, 2L, scan_columns)))
names(scan_result) <- c("missing", "numeric", "date", "character",
"logical")
row_names <- rownames(scan_result)
scan_result <- data.frame(t(apply(data, 2L, scan_columns)))
names(scan_result) <- c(
"missing", "numeric", "date", "character",
"logical"
)
row_names <- rownames(scan_result)
rownames(scan_result) <- NULL
scan_result <- cbind(Field_names = row_names, scan_result)
scan_result <- cbind(Field_names = row_names, scan_result)
return(scan_result)
}
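As a self-contained complement to the documented example above, a sketch of scan_data() on a small inline data frame (the column names and values here are made up for illustration):

library(cleanepi)

# scan_data() reports, for every column, the proportion of missing, numeric,
# date, character and logical values (the five quantities computed in
# scan_columns() above).
toy <- data.frame(
  id = c("1", "2", "abc", NA),
  date_of_test = c("2023-05-29", "29/05/2023", "unknown", "2022-01-15"),
  notes = c("ok", "follow up", NA, "ok"),
  stringsAsFactors = FALSE
)
scan_data(data = toy)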
51 changes: 29 additions & 22 deletions R/column_name_standardization.R
@@ -19,48 +19,55 @@
#' # do not rename 'date.of.admission'
#' cleaned_data <- standardize_column_names(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' package = "cleanepi"
#' )),
#' keep = "date.of.admission"
#' )
#'
#' # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to
#' # 'DOB' and 'gender' respectively
#' cleaned_data <- standardize_column_names(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' keep = "date.of.admission",
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi"
#' )),
#' keep = "date.of.admission",
#' rename = c(DOB = "dateOfBirth", gender = "sex")
#' )
#'
standardize_column_names <- function(data, keep = NULL, rename = NULL) {
checkmate::assert_vector(keep, min.len = 0L, max.len = ncol(data),
null.ok = TRUE,
any.missing = FALSE)
checkmate::assert_character(rename, min.len = 0L, null.ok = TRUE,
any.missing = FALSE)
checkmate::assert_vector(keep,
min.len = 0L, max.len = ncol(data),
null.ok = TRUE,
any.missing = FALSE
)
checkmate::assert_character(rename,
min.len = 0L, null.ok = TRUE,
any.missing = FALSE
)
before <- colnames(data)

# when rename is not NULL, get the indices of the old column names as a vector
# and name them with the new names
if (!is.null(rename)) {
new_names <- names(rename)
new_names <- names(rename)
current_names <- unname(rename)
stopifnot(
"Unrecognised column names specified in 'rename'" =
all(current_names %in% before),
"Replace column names already exists" =
!any(new_names %in% before)
)
rename <- match(current_names, before)
rename <- match(current_names, before)
names(rename) <- new_names
}

# when keep is 'linelist_tags', keep the tagged variables
# also account for when target columns are provided as a vector or column
# name or column indices or NULL
keep <- get_target_column_names(data,
target_columns = keep,
cols = NULL)
target_columns = keep,
cols = NULL
)
kept <- before %in% keep

# if they're anything apart from ASCII e.g. arabic, throw error
@@ -71,12 +78,12 @@ standardize_column_names <- function(data, keep = NULL, rename = NULL) {
sep = "_"
)
if (!all(kept)) {
after[kept] <- before[kept]
after[kept] <- before[kept]
}
after[rename] <- names(rename)
after[rename] <- names(rename)
colnames(data) <- after
colnames_info <- data.frame(before, after)
data <- add_to_report(data, "colnames", colnames_info)
colnames_info <- data.frame(before, after)
data <- add_to_report(data, "colnames", colnames_info)
return(data)
}

@@ -103,7 +110,7 @@ retrieve_column_names <- function(data, target_columns) {
}

# extract the report object to make it easily accessible
report <- attr(data, "report")
report <- attr(data, "report")
if (!"colnames" %in% names(report)) {
return(target_columns)
}
@@ -115,13 +122,13 @@

# detect the current names
# identify the old names
new_names <- target_columns[target_columns %in% names(data)]
new_names <- target_columns[target_columns %in% names(data)]
target_columns <- target_columns[!(target_columns %in% names(data))]
if ("colnames" %in% names(report) &&
all(target_columns %in% report[["colnames"]][["before"]])) {
all(target_columns %in% report[["colnames"]][["before"]])) {
all_column_names <- report[["colnames"]]
idx <- match(target_columns, all_column_names[["before"]])
new_names <- c(new_names, all_column_names[["after"]][idx])
idx <- match(target_columns, all_column_names[["before"]])
new_names <- c(new_names, all_column_names[["after"]][idx])
}

return(new_names)
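A self-contained sketch of standardize_column_names() with the same keep/rename arguments as the documented examples above, but on a toy data frame (the inline column names are made up for illustration):

library(cleanepi)

# Keep 'date.of.admission' unchanged and rename 'dateOfBirth' to 'DOB';
# the remaining column names are standardized.
toy <- data.frame(
  `case ID` = 1:3,
  dateOfBirth = c("1990-01-01", "1985-06-15", "2000-12-31"),
  date.of.admission = c("2023-01-10", "2023-02-11", "2023-03-12"),
  check.names = FALSE
)
standardize_column_names(
  data = toy,
  keep = "date.of.admission",
  rename = c(DOB = "dateOfBirth")
)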
12 changes: 8 additions & 4 deletions R/convert_numeric_to_date.R
@@ -23,11 +23,15 @@
convert_numeric_to_date <- function(data, target_columns, ref_date,
forward = TRUE) {
if (!checkmate::test_character(ref_date, len = 1L, null.ok = FALSE)) {
checkmate::assert_date(ref_date, any.missing = FALSE, min.len = 1L,
max.len = 1L, null.ok = FALSE)
checkmate::assert_date(ref_date,
any.missing = FALSE, min.len = 1L,
max.len = 1L, null.ok = FALSE
)
}
checkmate::assert_vector(target_columns, min.len = 1, max.len = ncol(data),
null.ok = FALSE, any.missing = FALSE)
checkmate::assert_vector(target_columns,
min.len = 1, max.len = ncol(data),
null.ok = FALSE, any.missing = FALSE
)
checkmate::assert_data_frame(data, null.ok = FALSE, min.cols = 1L)

# get the correct names in case some have been modified - see the
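Finally, a sketch of a call to convert_numeric_to_date() as touched above (the data frame and reference date are made up, and forward = TRUE is assumed to mean that the numeric values count days forward from ref_date):

library(cleanepi)

# Toy data where 'time_since_admission' holds day counts rather than dates.
toy <- data.frame(
  case_id = 1:3,
  time_since_admission = c(0, 14, 30)
)

# Convert the numeric column into Date values relative to a reference date.
convert_numeric_to_date(
  data = toy,
  target_columns = "time_since_admission",
  ref_date = as.Date("2023-01-01"),
  forward = TRUE
)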