Run styler::style_pkg() to make code formatting more consistent #160

Open · wants to merge 3 commits into base: main
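For reference, the restyling in this PR can presumably be reproduced from the package root with the styler package. A minimal sketch, assuming styler is installed:

# Restyle every R source file in the package using styler's default
# (tidyverse) style guide; only files whose formatting deviates from the
# guide are rewritten.
# install.packages("styler")   # uncomment if styler is not yet installed
library(styler)
style_pkg()

Individual files or directories can be restyled the same way with style_file() and style_dir().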
30 changes: 18 additions & 12 deletions R/check_date_sequence.R
@@ -37,9 +37,11 @@
#' target_columns = c("date_first_pcr_positive_test", "date.of.admission")
#' )
check_date_sequence <- function(data, target_columns) {
checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,
max.len = dim(data)[2], null.ok = FALSE,
unique = TRUE)
checkmate::assert_vector(target_columns,
any.missing = FALSE, min.len = 1L,
max.len = dim(data)[2], null.ok = FALSE,
unique = TRUE
)
checkmate::assert_data_frame(data, null.ok = FALSE)

# get the correct names in case some have been modified - see the
@@ -52,27 +54,31 @@ check_date_sequence <- function(data, target_columns) {
# check if all columns are part of the data frame
if (any(missing_cols)) {
warning("Removing unrecognised column name: ", target_columns[missing_cols],
call. = FALSE)
call. = FALSE
)
target_columns <- target_columns[!missing_cols]
if (length(target_columns) < 2L) {
stop("\nAt least 2 event dates are required!")
}
}

# checking the date sequence
tmp_data <- data %>% dplyr::select(dplyr::all_of(target_columns))
tmp_data <- data %>% dplyr::select(dplyr::all_of(target_columns))
order_date <- apply(tmp_data, 1L, is_date_sequence_ordered)
bad_order <- which(!order_date)
bad_order <- which(!order_date)
if (!all(order_date)) {
tmp_data <- tmp_data[bad_order, ]
# adding incorrect records to the report
data <- add_to_report(x = data,
key = "incorrect_date_sequence",
value = tmp_data)
data <- add_to_report(
x = data,
key = "incorrect_date_sequence",
value = tmp_data
)
warning("Detected ", length(bad_order),
" incorrect date sequences at line(s): ",
toString(bad_order),
call. = FALSE)
" incorrect date sequences at line(s): ",
toString(bad_order),
call. = FALSE
)
}

return(data)
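To illustrate the kind of change applied in this file, here is a small sketch using styler::style_text() on the original assert_vector() call, assuming the default tidyverse style guide (exact output may differ slightly between styler versions):

library(styler)

# Re-style a single multi-line call; this should move the arguments onto
# indented continuation lines and place the closing parenthesis on its own
# line, matching the new formatting shown in the diff above.
style_text(c(
  "checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,",
  "                         max.len = dim(data)[2], null.ok = FALSE,",
  "                         unique = TRUE)"
))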
55 changes: 32 additions & 23 deletions R/clean_data.R
@@ -62,36 +62,43 @@
#' replace_missing_values <- list(target_columns = NULL, na_strings = "-99")
#'
#' # Parameters for duplicates removal across all columns
#' remove_duplicates <- list(target_columns = NULL)
#' remove_duplicates <- list(target_columns = NULL)
#'
#' # Parameters for dates standardization
#' standardize_dates <- list(target_columns = NULL,
#' error_tolerance = 0.4,
#' format = NULL,
#' timeframe = as.Date(c("1973-05-29",
#' "2023-05-29")),
#' orders = list(
#' world_named_months = c("Ybd", "dby"),
#' world_digit_months = c("dmy", "Ymd"),
#' US_formats = c("Omdy", "YOmd")
#' ),
#' modern_excel = TRUE)
#' standardize_dates <- list(
#' target_columns = NULL,
#' error_tolerance = 0.4,
#' format = NULL,
#' timeframe = as.Date(c(
#' "1973-05-29",
#' "2023-05-29"
#' )),
#' orders = list(
#' world_named_months = c("Ybd", "dby"),
#' world_digit_months = c("dmy", "Ymd"),
#' US_formats = c("Omdy", "YOmd")
#' ),
#' modern_excel = TRUE
#' )
#'
#' # Parameters for subject IDs standardization
#' standardize_subject_ids <- list(target_columns = "study_id",
#' prefix = "PS",
#' suffix = "P2",
#' range = c(1, 100),
#' nchar = 7)
#' standardize_subject_ids <- list(
#' target_columns = "study_id",
#' prefix = "PS",
#' suffix = "P2",
#' range = c(1, 100),
#' nchar = 7
#' )
#'
#' to_numeric <- list(target_columns = "sex", lang = "en")
#'
#'
#' # dictionary = NULL the dictionary-based cleaning will not be performed here
#'
#' cleaned_data <- clean_data(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi"
#' )),
#' params = list(
#' standardize_column_names = standardize_col_names,
#' remove_constants = remove_cte,
@@ -112,10 +119,12 @@ clean_data <- function(data, params = NULL) {
checkmate::assert_list(params, min.len = 1L, max.len = 10L, null.ok = TRUE)
checkmate::check_names(
params,
subset.of = c("standardize_column_names", "remove_constants",
"replace_missing_values", "remove_duplicates",
"standardize_dates", "standardize_subject_ids",
"to_numeric", "dictionary", "check_date_sequence", "span")
subset.of = c(
"standardize_column_names", "remove_constants",
"replace_missing_values", "remove_duplicates",
"standardize_dates", "standardize_subject_ids",
"to_numeric", "dictionary", "check_date_sequence", "span"
)
)

## -----
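For context, a condensed sketch of the documented clean_data() example above, using only the parameter lists fully visible in this hunk (operations left out of params should simply not be applied, per the name checks in clean_data()):

library(cleanepi)

# Minimal params list assembled from the documented example above; each
# element configures one cleaning operation.
params <- list(
  replace_missing_values = list(target_columns = NULL, na_strings = "-99"),
  remove_duplicates = list(target_columns = NULL),
  to_numeric = list(target_columns = "sex", lang = "en")
)

cleaned_data <- clean_data(
  data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")),
  params = params
)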
40 changes: 24 additions & 16 deletions R/clean_data_helpers.R
@@ -15,35 +15,40 @@ scan_columns <- function(x) {

# --- get the proportion of NA ---
are_na <- round((sum(is.na(x)) / n_rows), 6L)
x <- x[!is.na(x)]
x <- x[!is.na(x)]

# --- get the proportion of numeric values ---
tmp <- suppressWarnings(as.numeric(x))
tmp <- suppressWarnings(as.numeric(x))
are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L)

# --- get the proportion of date values ---
x <- x[which(is.na(tmp))]
x <- x[which(is.na(tmp))]
are_date <- 0L
if (!is.null(lubridate::guess_formats(x, c("ymd", "ydm", "dmy", "mdy", "myd",
"dym", "Ymd", "Ydm", "dmY", "mdY",
"mYd", "dYm")))) {
if (!is.null(lubridate::guess_formats(x, c(
"ymd", "ydm", "dmy", "mdy", "myd",
"dym", "Ymd", "Ydm", "dmY", "mdY",
"mYd", "dYm"
)))) {
x <- suppressWarnings(
as.Date(
lubridate::parse_date_time(
x, orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm")
x,
orders = c(
"ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm"
)
)
)
)
are_date <- round((sum(!is.na(x)) / n_rows), 6L)
}

# --- get the proportion of logical values ---
are_logical <- round((sum(is.logical(x)) / n_rows), 6L)
are_logical <- round((sum(is.logical(x)) / n_rows), 6L)

# --- get the proportion of character values ---
are_character <- round((1.0 - (are_na + are_numeric +
are_date + are_logical)), 6L)
are_date + are_logical)), 6L)

# --- return the output ---
return(c(are_na, are_numeric, are_date, are_character, are_logical))
@@ -63,14 +68,17 @@ scan_columns <- function(x) {
#' @examples
#' scan_result <- scan_data(
#' data = readRDS(system.file("extdata", "messy_data.RDS",
#' package = "cleanepi"))
#' package = "cleanepi"
#' ))
#' )
scan_data <- function(data) {
scan_result <- data.frame(t(apply(data, 2L, scan_columns)))
names(scan_result) <- c("missing", "numeric", "date", "character",
"logical")
row_names <- rownames(scan_result)
scan_result <- data.frame(t(apply(data, 2L, scan_columns)))
names(scan_result) <- c(
"missing", "numeric", "date", "character",
"logical"
)
row_names <- rownames(scan_result)
rownames(scan_result) <- NULL
scan_result <- cbind(Field_names = row_names, scan_result)
scan_result <- cbind(Field_names = row_names, scan_result)
return(scan_result)
}
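As a self-contained complement to the documented example above, a sketch of scan_data() on a small inline data frame (the column names and values here are made up for illustration):

library(cleanepi)

# scan_data() reports, for every column, the proportion of missing, numeric,
# date, character and logical values (the five quantities computed in
# scan_columns() above).
toy <- data.frame(
  id = c("1", "2", "abc", NA),
  date_of_test = c("2023-05-29", "29/05/2023", "unknown", "2022-01-15"),
  notes = c("ok", "follow up", NA, "ok"),
  stringsAsFactors = FALSE
)
scan_data(data = toy)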
51 changes: 29 additions & 22 deletions R/column_name_standardization.R
@@ -19,48 +19,55 @@
#' # do not rename 'date.of.admission'
#' cleaned_data <- standardize_column_names(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' package = "cleanepi"
#' )),
#' keep = "date.of.admission"
#' )
#'
#' # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to
#' # 'DOB' and 'gender' respectively
#' cleaned_data <- standardize_column_names(
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' keep = "date.of.admission",
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi"
#' )),
#' keep = "date.of.admission",
#' rename = c(DOB = "dateOfBirth", gender = "sex")
#' )
#'
standardize_column_names <- function(data, keep = NULL, rename = NULL) {
checkmate::assert_vector(keep, min.len = 0L, max.len = ncol(data),
null.ok = TRUE,
any.missing = FALSE)
checkmate::assert_character(rename, min.len = 0L, null.ok = TRUE,
any.missing = FALSE)
checkmate::assert_vector(keep,
min.len = 0L, max.len = ncol(data),
null.ok = TRUE,
any.missing = FALSE
)
checkmate::assert_character(rename,
min.len = 0L, null.ok = TRUE,
any.missing = FALSE
)
before <- colnames(data)

# when rename is not NULL, get the indices of the old column names as a vector
# and name them with the new names
if (!is.null(rename)) {
new_names <- names(rename)
new_names <- names(rename)
current_names <- unname(rename)
stopifnot(
"Unrecognised column names specified in 'rename'" =
all(current_names %in% before),
"Replace column names already exists" =
!any(new_names %in% before)
)
rename <- match(current_names, before)
rename <- match(current_names, before)
names(rename) <- new_names
}

# when keep is 'linelist_tags', keep the tagged variables
# also account for when target columns are provided as a vector or column
# name or column indices or NULL
keep <- get_target_column_names(data,
target_columns = keep,
cols = NULL)
target_columns = keep,
cols = NULL
)
kept <- before %in% keep

# if they're anything apart from ASCII e.g. arabic, throw error
@@ -71,12 +78,12 @@ standardize_column_names <- function(data, keep = NULL, rename = NULL) {
sep = "_"
)
if (!all(kept)) {
after[kept] <- before[kept]
after[kept] <- before[kept]
}
after[rename] <- names(rename)
after[rename] <- names(rename)
colnames(data) <- after
colnames_info <- data.frame(before, after)
data <- add_to_report(data, "colnames", colnames_info)
colnames_info <- data.frame(before, after)
data <- add_to_report(data, "colnames", colnames_info)
return(data)
}

@@ -103,7 +110,7 @@ retrieve_column_names <- function(data, target_columns) {
}

# extract the report object to make it easily accessible
report <- attr(data, "report")
report <- attr(data, "report")
if (!"colnames" %in% names(report)) {
return(target_columns)
}
@@ -115,13 +122,13 @@

# detect the current names
# identify the old names
new_names <- target_columns[target_columns %in% names(data)]
new_names <- target_columns[target_columns %in% names(data)]
target_columns <- target_columns[!(target_columns %in% names(data))]
if ("colnames" %in% names(report) &&
all(target_columns %in% report[["colnames"]][["before"]])) {
all(target_columns %in% report[["colnames"]][["before"]])) {
all_column_names <- report[["colnames"]]
idx <- match(target_columns, all_column_names[["before"]])
new_names <- c(new_names, all_column_names[["after"]][idx])
idx <- match(target_columns, all_column_names[["before"]])
new_names <- c(new_names, all_column_names[["after"]][idx])
}

return(new_names)
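A self-contained sketch of standardize_column_names() with the same keep/rename arguments as the documented examples above, but on a toy data frame (the inline column names are made up for illustration):

library(cleanepi)

# Keep 'date.of.admission' unchanged and rename 'dateOfBirth' to 'DOB';
# the remaining column names are standardized.
toy <- data.frame(
  `case ID` = 1:3,
  dateOfBirth = c("1990-01-01", "1985-06-15", "2000-12-31"),
  date.of.admission = c("2023-01-10", "2023-02-11", "2023-03-12"),
  check.names = FALSE
)
standardize_column_names(
  data = toy,
  keep = "date.of.admission",
  rename = c(DOB = "dateOfBirth")
)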
12 changes: 8 additions & 4 deletions R/convert_numeric_to_date.R
@@ -23,11 +23,15 @@
convert_numeric_to_date <- function(data, target_columns, ref_date,
forward = TRUE) {
if (!checkmate::test_character(ref_date, len = 1L, null.ok = FALSE)) {
checkmate::assert_date(ref_date, any.missing = FALSE, min.len = 1L,
max.len = 1L, null.ok = FALSE)
checkmate::assert_date(ref_date,
any.missing = FALSE, min.len = 1L,
max.len = 1L, null.ok = FALSE
)
}
checkmate::assert_vector(target_columns, min.len = 1, max.len = ncol(data),
null.ok = FALSE, any.missing = FALSE)
checkmate::assert_vector(target_columns,
min.len = 1, max.len = ncol(data),
null.ok = FALSE, any.missing = FALSE
)
checkmate::assert_data_frame(data, null.ok = FALSE, min.cols = 1L)

# get the correct names in case some have been modified - see the
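Finally, a sketch of a call to convert_numeric_to_date() as touched above (the data frame and reference date are made up, and forward = TRUE is assumed to mean that the numeric values count days forward from ref_date):

library(cleanepi)

# Toy data where 'time_since_admission' holds day counts rather than dates.
toy <- data.frame(
  case_id = 1:3,
  time_since_admission = c(0, 14, 30)
)

# Convert the numeric column into Date values relative to a reference date.
convert_numeric_to_date(
  data = toy,
  target_columns = "time_since_admission",
  ref_date = as.Date("2023-01-01"),
  forward = TRUE
)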