report_participants: gender supports alternate spellings, convert a…

…ge to numeric (#389) * Gender now also supports more alternate spellings, and age converts the respective column to numeric. * snapshots * styler * comma * lints * lints2 (stringsAsFactors = FALSE) * lints3 (stringsAsFactors = FALSE in tests, etc.)
easystats · Aug 21, 2023 · b5821f5 · b5821f5
1 parent 0b1046d
commit b5821f5
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 49 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -56,7 +56,7 @@ Depends:
     R (>= 3.6)
 Imports:
     bayestestR (>= 0.13.0),
-    effectsize (> 0.8.2),
+    effectsize (>= 0.8.5),
     insight (>= 0.19.3.2),
     parameters (>= 0.20.2),
     performance (>= 0.10.2),
@@ -143,4 +143,6 @@ Collate:
     'utils_grouped_df.R'
     'zzz.R'
 Roxygen: list(markdown = TRUE)
-Remotes: easystats/insight
+Remotes: 
+  easystats/insight,
+  easystats/effectsize
diff --git a/NEWS.md b/NEWS.md
@@ -26,6 +26,8 @@ Bug fixes
 
 * Fixed issue in `report_participants`, which did not print the `"gender"`
   category for grouped output when that argument was written in lower-case.
+  Gender now also supports more alternate spellings, and age converts the
+  respective column to numeric.
 
 * Fixed printing issue for intercept-only models.
 

diff --git a/R/report_participants.R b/R/report_participants.R
@@ -133,7 +133,7 @@ report_participants <- function(data,
     x[which(x == "")] <- NA
     x
   })
-  data <- as.data.frame(data_list)
+  data <- as.data.frame(data_list, stringsAsFactors = FALSE)
 
   # find age variable automatically
   if (is.null(age)) {
@@ -256,33 +256,37 @@ report_participants <- function(data,
   country <- .replace_names(data, country)
   race <- .replace_names(data, race)
 
+  # Set age as numeric
+  data[[age]] <- as.numeric(data[[age]])
+
   # Grouped data
   if (!is.null(participants)) {
     data <- data.frame(
-      "Age" = stats::aggregate(data[[age]],
+      Age = stats::aggregate(data[[age]],
         by = list(data[[participants]]),
         FUN = mean
       )[[2]],
-      "Sex" = stats::aggregate(data[[sex]],
+      Sex = stats::aggregate(data[[sex]],
         by = list(data[[participants]]),
         FUN = utils::head, n = 1
       )[[2]],
-      "Gender" = stats::aggregate(data[[gender]],
+      Gender = stats::aggregate(data[[gender]],
         by = list(data[[participants]]),
         FUN = utils::head, n = 1
       )[[2]],
-      "Education" = stats::aggregate(data[[education]],
+      Education = stats::aggregate(data[[education]],
         by = list(data[[participants]]),
         FUN = utils::head, n = 1
       )[[2]],
-      "Country" = stats::aggregate(data[[country]],
+      Country = stats::aggregate(data[[country]],
         by = list(data[[participants]]),
         FUN = utils::head, n = 1
       )[[2]],
-      "Race" = stats::aggregate(data[[race]],
+      Race = stats::aggregate(data[[race]],
         by = list(data[[participants]]),
         FUN = utils::head, n = 1
-      )[[2]]
+      )[[2]],
+      stringsAsFactors = FALSE
     )
     age <- "Age"
     sex <- "Sex"
@@ -344,22 +348,32 @@ report_participants <- function(data,
     )
   }
 
+  genders_woman <- c(
+    "woman", "w", "female", "women", "girl",
+    "lady", "miss", "madam", "dame", "lass"
+  )
+  genders_man <- c(
+    "man", "m", "male", "men", "boy",
+    "guy", "dude", "lad", "sir"
+  )
+  both_genders <- c(genders_woman, genders_man, NA, "na")
+
   text_gender <- if (all(is.na(data[[gender]]))) {
     ""
   } else {
     paste0(
       "Gender: ",
       insight::format_value(length(data[[gender]][tolower(
         data[[gender]]
-      ) %in% c("woman", "w", "f", "female")]) / nrow(data) * 100, digits = digits),
+      ) %in% genders_woman]) / nrow(data) * 100, digits = digits),
       "% women, ",
       insight::format_value(length(data[[gender]][tolower(
         data[[gender]]
-      ) %in% c("man", "m", "male")]) / nrow(data) * 100, digits = digits),
+      ) %in% genders_man]) / nrow(data) * 100, digits = digits),
       "% men, ",
       insight::format_value(100 - length(data[[gender]][tolower(
         data[[gender]]
-      ) %in% c("woman", "w", "f", "female", "man", "m", "male", NA, "na")]) /
+      ) %in% both_genders]) /
         nrow(data) * 100), "% non-binary",
       if (!insight::format_value(length(data[[gender]][tolower(
         data[[gender]]
@@ -405,14 +419,16 @@ report_participants <- function(data,
   } else {
     data[[country]] <- as.character(data[[country]])
     data[which(data[[country]] %in% c(NA, "NA")), country] <- "missing"
-    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[country]]))[c(2, 4)]
+    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[country]]),
+                                     stringsAsFactors = FALSE)[c(2, 4)]
     names(frequency_table)[2] <- "Percent"
     frequency_table <- frequency_table[-which(is.na(frequency_table$Value)), ]
     frequency_table <- frequency_table[order(-frequency_table$Percent), ]
     upper <- frequency_table[which(frequency_table$Percent >= threshold), ]
     lower <- frequency_table[which(frequency_table$Percent < threshold), ]
     if (nrow(lower) > 0) {
-      lower_sum <- data.frame(Value = "other", Percent = sum(lower$Percent), stringsAsFactors = FALSE)
+      lower_sum <- data.frame(Value = "other", Percent = sum(lower$Percent),
+                              stringsAsFactors = FALSE)
       combined <- rbind(upper, lower_sum)
     } else {
       combined <- upper
@@ -427,14 +443,16 @@ report_participants <- function(data,
   } else {
     data[[race]] <- as.character(data[[race]])
     data[which(data[[race]] %in% c(NA, "NA")), race] <- "missing"
-    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[race]]))[c(2, 4)]
+    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[race]]),
+                                     stringsAsFactors = FALSE)[c(2, 4)]
     names(frequency_table)[2] <- "Percent"
     frequency_table <- frequency_table[-which(is.na(frequency_table$Value)), ]
     frequency_table <- frequency_table[order(-frequency_table$Percent), ]
     upper <- frequency_table[which(frequency_table$Percent >= threshold), ]
     lower <- frequency_table[which(frequency_table$Percent < threshold), ]
     if (nrow(lower) > 0) {
-      lower_sum <- data.frame(Value = "other", Percent = sum(lower$Percent), stringsAsFactors = FALSE)
+      lower_sum <- data.frame(Value = "other", Percent = sum(lower$Percent),
+                              stringsAsFactors = FALSE)
       combined <- rbind(upper, lower_sum)
     } else {
       combined <- upper

diff --git a/tests/testthat/_snaps/windows/report.htest-chi2.md b/tests/testthat/_snaps/windows/report.htest-chi2.md
@@ -84,9 +84,9 @@
     Code
       report_effectsize(x, type = "tschuprows_t")
     Output
-       
+      Effect sizes were labelled following Funder's (2019) recommendations. 
       
-       (Adjusted Tschuprow's t = 0.08, 95% CI [0.06, 1.00])
+      very small (Adjusted Tschuprow's t = 0.08, 95% CI [0.06, 1.00])
 
 ---
 

diff --git a/tests/testthat/_snaps/windows/report_participants.md b/tests/testthat/_snaps/windows/report_participants.md
@@ -59,5 +59,19 @@
     Code
       report_participants(data, age = "Age", sex = "Sex")
     Output
-      [1] "6 participants (8, n = 1; 21, n = 1; 22, n = 1; 23, n = 1; 42, n = 1; 54, n = 1; Sex: 16.7% females, 33.3% males, 16.7% other, 33.33% missing; Gender: 33.3% women, 16.7% men, 16.67% non-binary, 33.33% missing; Education: -5, 16.67%; -3, 16.67%; 0, 16.67%; 3, 16.67%; 5, 16.67%; 8, 16.67%; Country: 33.33% Canada, 16.67% Germany, 16.67% India, 16.67% USA, 16.67% missing; Race: 16.67% A, 16.67% B, 16.67% C, 16.67% D, 16.67% E, 16.67% missing)"
+      [1] "6 participants (Mean age = 3.5, SD = 1.9, range: [1, 6]; Sex: 16.7% females, 33.3% males, 16.7% other, 33.33% missing; Gender: 33.3% women, 16.7% men, 16.67% non-binary, 33.33% missing; Education: -5, 16.67%; -3, 16.67%; 0, 16.67%; 3, 16.67%; 5, 16.67%; 8, 16.67%; Country: 33.33% Canada, 16.67% Germany, 16.67% India, 16.67% USA, 16.67% missing; Race: 16.67% A, 16.67% B, 16.67% C, 16.67% D, 16.67% E, 16.67% missing)"
+
+# report_participants age as character
+
+    Code
+      report_participants(data)
+    Output
+      [1] "6 participants (Mean age = 29.5, SD = 15.0, range: [11, 52])"
+
+# report_participants different gender spellings
+
+    Code
+      report_participants(data)
+    Output
+      [1] "20 participants (Gender: 50.0% women, 45.0% men, 0.00% non-binary, 5.00% missing)"
 
diff --git a/tests/testthat/test-report_participants.R b/tests/testthat/test-report_participants.R
@@ -1,9 +1,9 @@
 test_that("report_participants, argument gender works", {
   # Works when capitalized
   data <- data.frame(
-    "Age" = c(22, 22, 54, 54, 8, 8, 42, 42),
-    "Gender" = c("N", "N", "W", "M", "M", "M", "Non-Binary", "Non-Binary"),
-    "Condition" = c("A", "A", "A", "A", "B", "B", "B", "B"),
+    Age = c(22, 22, 54, 54, 8, 8, 42, 42),
+    Gender = c("N", "N", "W", "M", "M", "M", "Non-Binary", "Non-Binary"),
+    Condition = c("A", "A", "A", "A", "B", "B", "B", "B"),
     stringsAsFactors = FALSE
   )
   out <- report_participants(data)
@@ -41,10 +41,10 @@ test_that("report_participants, argument gender works", {
 
 test_that("report_participants", {
   data <- data.frame(
-    "Age" = c(22, 22, 54, 54, 8, 8),
-    "Sex" = c("F", "F", "M", "M", "I", "I"),
-    "Gender" = c("W", "W", "M", "M", "N", "N"),
-    "Participant" = c("S1", "S1", "s2", "s2", "s3", "s3"),
+    Age = c(22, 22, 54, 54, 8, 8),
+    Sex = c("F", "F", "M", "M", "I", "I"),
+    Gender = c("W", "W", "M", "M", "N", "N"),
+    Participant = c("S1", "S1", "s2", "s2", "s3", "s3"),
     stringsAsFactors = FALSE
   )
 
@@ -70,9 +70,9 @@ test_that("report_participants", {
   )
 
   data2 <- data.frame(
-    "Age" = c(22, 22, 54, 54, 8, 8),
-    "Sex" = c("F", "F", "M", "O", "F", "O"),
-    "Gender" = c("W", "W", "M", "O", "W", "O"),
+    Age = c(22, 22, 54, 54, 8, 8),
+    Sex = c("F", "F", "M", "O", "F", "O"),
+    Gender = c("W", "W", "M", "O", "W", "O"),
     stringsAsFactors = FALSE
   )
 
@@ -82,9 +82,9 @@ test_that("report_participants", {
   )
 
   data3 <- data.frame(
-    "Age" = c(22, 82, NA, NA, NA, NA),
-    "Sex" = c("F", "F", "", "", "NA", NA),
-    "Gender" = c("W", "W", "", "", "NA", NA),
+    Age = c(22, 82, NA, NA, NA, NA),
+    Sex = c("F", "F", "", "", "NA", NA),
+    Gender = c("W", "W", "", "", "NA", NA),
     stringsAsFactors = FALSE
   )
 
@@ -95,13 +95,13 @@ test_that("report_participants", {
 
   # Add tests for education, country, race
   data4 <- data.frame(
-    "Education" = c(0, 8, -3, -5, 3, 5, NA),
-    "Education2" = c(
+    Education = c(0, 8, -3, -5, 3, 5, NA),
+    Education2 = c(
       "Bachelor", "PhD", "Highschool", "Highschool",
       "Bachelor", "Bachelor", NA
     ),
-    "Country" = c("USA", "Canada", "Canada", "India", "Germany", "USA", NA),
-    "Race" = c("Black", NA, "White", "Asian", "Black", "Black", "White"),
+    Country = c("USA", "Canada", "Canada", "India", "Germany", "USA", NA),
+    Race = c("Black", NA, "White", "Asian", "Black", "Black", "White"),
     stringsAsFactors = FALSE
   )
 
@@ -121,28 +121,55 @@ test_that("report_participants", {
 
 test_that("report_participants test NAs no warning", {
   data <- data.frame(
-    "Age" = c(22, 23, 54, 21, 8, 42),
-    "Sex" = (c("Intersex", "F", "M", "M", "NA", NA)),
-    "Gender" = (c("N", "W", "W", "M", "NA", NA)),
-    "Country" = (c("USA", NA, "Canada", "Canada", "India", "Germany")),
-    "Education" = factor(c(0, 8, -3, -5, 3, 5)),
-    "Race" = c(LETTERS[1:5], NA)
+    Age = c(22, 23, 54, 21, 8, 42),
+    Sex = (c("Intersex", "F", "M", "M", "NA", NA)),
+    Gender = (c("N", "W", "W", "M", "NA", NA)),
+    Country = (c("USA", NA, "Canada", "Canada", "India", "Germany")),
+    Education = factor(c(0, 8, -3, -5, 3, 5)),
+    Race = c(LETTERS[1:5], NA)
   )
   expect_snapshot(
     variant = "windows",
     report_participants(data)
   )
 
   data <- data.frame(
-    "Age" = factor(c(22, 23, 54, 21, 8, 42)),
-    "Sex" = factor(c("Intersex", "F", "M", "M", "NA", NA)),
-    "Gender" = factor(c("N", "W", "W", "M", "NA", NA)),
-    "Country" = factor(c("USA", NA, "Canada", "Canada", "India", "Germany")),
-    "Education" = factor(c(0, 8, -3, -5, 3, 5)),
-    "Race" = factor(c(LETTERS[1:5], NA))
+    Age = factor(c(22, 23, 54, 21, 8, 42)),
+    Sex = factor(c("Intersex", "F", "M", "M", "NA", NA)),
+    Gender = factor(c("N", "W", "W", "M", "NA", NA)),
+    Country = factor(c("USA", NA, "Canada", "Canada", "India", "Germany")),
+    Education = factor(c(0, 8, -3, -5, 3, 5)),
+    Race = factor(c(LETTERS[1:5], NA))
   )
   expect_snapshot(
     variant = "windows",
     report_participants(data, age = "Age", sex = "Sex")
   )
 })
+
+test_that("report_participants age as character", {
+  data <- data.frame(
+    Age = as.character(c(22, 22, 28, 11, 42, 52), stringsAsFactors = FALSE),
+    stringsAsFactors = FALSE
+  )
+  expect_snapshot(
+    variant = "windows",
+    report_participants(data)
+  )
+})
+
+test_that("report_participants different gender spellings", {
+  data <- data.frame(
+    Gender = c(
+      "Man", "M", "Male", "Men", "Boy", "Guy",
+      "Dude", "Lad", "Sir",
+      "Woman", "W", "Female", "Women", "Girl",
+      "Lady", "Miss", "Madam", "Dame", "Lass",
+      NA
+    ), stringsAsFactors = FALSE
+  )
+  expect_snapshot(
+    variant = "windows",
+    report_participants(data)
+  )
+})