pfmc-assessments · chantelwetzel-noaa · May 8, 2024 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,7 @@ export(check_dir)
 export(check_survey)
 export(codify_sex)
 export(codify_sex_SS3)
+export(combine_tows)
 export(createMatrix)
 export(draw_USEEZ)
 export(draw_land)
@@ -67,6 +68,8 @@ import(glue)
 import(janitor)
 import(reshape2)
 import(stringr)
+importFrom(dplyr,distinct)
+importFrom(dplyr,group_by)
 importFrom(dplyr,left_join)
 importFrom(dplyr,rename)
 importFrom(grDevices,dev.off)

diff --git a/R/PullCatch.fn.R b/R/PullCatch.fn.R
@@ -47,7 +47,7 @@
 #'   * NWFSC.Hook.Line (not yet working),
 #'   * NWFSC.Video,
 #'   * Triennial.Canada
-#' 
+#'
 #' Currently, you must pull data one survey at a time, though we are working on
 #' allowing for a vector of survey names and
 #' `NWFSC.Shelf.Rockfish` and `NWFSC.Hook.Line` are not supported.
@@ -58,6 +58,7 @@
 #'   The name of the file within `Dir` will start with Catch_ and end with .rdata.
 #'   Default NULL which will not save an output file.
 #' @template verbose
+#' @template sample_types
 #'
 #' @author Chantel Wetzel (maintainer) based on code by John Wallace
 #' @export
@@ -95,7 +96,8 @@ PullCatch.fn <- function(
     SurveyName = NULL,
     SaveFile = lifecycle::deprecated(),
     Dir = NULL,
-    verbose = TRUE) {
+    verbose = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")[1:2]) {
 
   lifecycle::deprecate_soft(
     when = "2.3",
@@ -117,7 +119,8 @@ PullCatch.fn <- function(
     survey = SurveyName,
     dir = Dir,
     convert = TRUE,
-    verbose = verbose)
+    verbose = verbose,
+    sample_types = sample_types)
 
   return(Out)
 }
diff --git a/R/combine_tows.R b/R/combine_tows.R
@@ -0,0 +1,58 @@
+#' Combine catch data by trawl_id for data pulled using the
+#' pull_catch function. Generally, only a single tow is returned
+#' by species and in those instances this function is not needed.
+#' This function is only needed in unique data situations. For
+#' example, if data are pulled with all sample_types included then
+#' there will often be data from the same trawl_id included as
+#' multiple records. In these instances this function allows for
+#' the sample data to be combined for unique trawl_id.
+#'
+#'
+#' @template dir
+#' @param data A data frame of catches obtained by using the pull_catch
+#' function.
+#' @template verbose
+#'
+#' @author Chantel Wetzel
+#' @export
+#'
+#' @importFrom dplyr group_by distinct
+#'
+#'
+#'
+combine_tows <- function(dir = NULL, data, verbose = TRUE){
+
+  check_dir(dir = dir, verbose = verbose)
+
+  if(!"total_catch_numbers" %in% colnames(data)){
+    stop("The data object needs to be a data frame of pulled catches from the `pull_catch` function.")
+  }
+
+  original_colname <- colnames(data)
+  colnames(data) <- tolower(colnames(data))
+
+  find <- grep("trawl_id", colnames(data), ignore.case = TRUE)
+  n_id <- table(data[, find])
+  if(sum(n_id == 2) == 0){
+    stop("All trawl_ids are unique and there is no need to combine data.")
+  }
+
+  catch <- data |>
+    dplyr::group_by(common_name, trawl_id) |>
+    mutate(
+      total_catch_numbers = sum(total_catch_numbers),
+      total_catch_wt_kg = sum(total_catch_wt_kg),
+      subsample_count = sum(subsample_count),
+      subsample_wt_kg = sum(subsample_wt_kg),
+      cpue_kg_per_ha_der = sum(cpue_kg_per_ha_der),
+      cpue_kg_km2 = sum(cpue_kg_km2),
+      partition_sample_types = NA,
+      partition = NA
+    ) |>
+    dplyr::distinct(trawl_id, .keep_all = TRUE)
+
+  catch <- as.data.frame(catch)
+  colnames(catch) <- original_colname
+
+  return(catch)
+}
diff --git a/R/pull_catch.R b/R/pull_catch.R
@@ -10,15 +10,12 @@
 #' with the intent that they provide the best available information for use
 #' in an index-standardization procedure. The removed samples may be of use
 #' to others with a less-restrictive goal than producing an index of abundance.
-#' For example, life-stage samples are excluded because they are not collected
-#' using the same protocols as standard samples.
+#' For example, depths sampled outside the standard protocol are excluded.
 #' To download all data, we currently recommend going to the
 #' [NWFSC data warehouse](https://www.webapps.nwfsc.noaa.gov/data)
 #' and using the csv link to extract data for a single species at a time.
 #' In the future, we hope to add functionality to this package such that
 #' downloading all data can be done easily within this function.
-#' See [Issue #43](https://github.com/pfmc-assessments/nwfscSurvey/issues/43)
-#' for more information.
 #'
 #' @template common_name
 #' @template sci_name
@@ -27,6 +24,7 @@
 #' @template dir
 #' @template convert
 #' @template verbose
+#' @template sample_types
 #'
 #' @author Chantel Wetzel
 #' @export
@@ -65,7 +63,8 @@ pull_catch <- function(common_name = NULL,
                        survey,
                        dir = NULL,
                        convert = TRUE,
-                       verbose = TRUE) {
+                       verbose = TRUE,
+                       sample_types = c("NA", NA, "Life Stage", "Size")[1:2]) {
 
   if (survey %in% c("NWFSC.Shelf.Rockfish", "NWFSC.Hook.Line")) {
     stop("The catch pull currently does not work for NWFSC Hook & Line Survey data.",
@@ -105,14 +104,15 @@ pull_catch <- function(common_name = NULL,
   # would allow us to eliminate vars_long form the main pull
 
   perf_codes <- c(
-    "operation_dim$legacy_performance_code",
-    "statistical_partition_dim$statistical_partition_type"
+    "operation_dim$legacy_performance_code"
   )
 
   vars_long <- c(
     "common_name", "scientific_name", "project", "year", "vessel", "tow",
     "total_catch_numbers", "total_catch_wt_kg",
     "subsample_count", "subsample_wt_kg",  "cpue_kg_per_ha_der",
+    "statistical_partition_dim$statistical_partition_type",
+    "partition",
     perf_codes
   )
 
@@ -149,13 +149,9 @@ pull_catch <- function(common_name = NULL,
     positive_tows[water_hauls, "operation_dim$legacy_performance_code"] <- -999
   }
 
-  # Retain on standard survey samples
-  # whether values are NA or "NA" varies based on the presence of "Life Stage" samples
-  standard_samples <- sum(is.na(positive_tows[, "statistical_partition_dim$statistical_partition_type"])) != nrow(positive_tows)
-  if (standard_samples) {
-    keep <- positive_tows[, "statistical_partition_dim$statistical_partition_type"] == "NA"
-    positive_tows <- positive_tows[keep, ]
-  }
+  positive_tows <- positive_tows[
+    positive_tows[, "statistical_partition_dim$statistical_partition_type"] %in% sample_types,
+  ]
 
   good_tows <- positive_tows[, "operation_dim$legacy_performance_code"] != 8
   positive_tows <- positive_tows[good_tows, ]
@@ -191,9 +187,6 @@ pull_catch <- function(common_name = NULL,
 
   all_tows <- all_tows[
     !duplicated(paste(all_tows$year, all_tows$pass, all_tows$vessel, all_tows$tow)),
-    #c("project", "trawl_id", "year", "pass", "vessel", "tow", "datetime_utc_iso", "depth_m",
-    #  "longitude_dd", "latitude_dd", "area_swept_ha_der"
-    #)
   ]
 
   positive_tows_grouped <- dplyr::group_by(
@@ -238,6 +231,8 @@ pull_catch <- function(common_name = NULL,
   ) |>
     dplyr::arrange(common_name, trawl_id)
 
+  colnames(catch)[colnames(catch) == "statistical_partition_dim$statistical_partition_type"] <- "partition_sample_types"
+
   # Need to check what this is doing
   no_area <- which(is.na(catch$area_swept_ha_der))
   if (length(no_area) > 0) {
@@ -252,6 +247,8 @@ pull_catch <- function(common_name = NULL,
 
   # Fill in zeros where needed
   catch[is.na(catch)] <- 0
+  catch[catch[,"partition_sample_types"] == 0, "partition_sample_types"] <- NA
+  catch[catch[,"partition"] == 0, "partition"] <- NA
 
   catch$date <- chron::chron(
     format(as.POSIXlt(catch$datetime_utc_iso, format = "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d"),
@@ -262,17 +259,25 @@ pull_catch <- function(common_name = NULL,
   catch$cpue_kg_km2 <- catch$cpue_kg_per_ha_der * 100
   colnames(catch)[which(colnames(catch) == "area_swept_ha_der")] <- "area_swept_ha"
 
+  if(sum(c("Life Stage", "Size") %in% sample_types) == 2) {
+    n_id <- table(catch$trawl_id)
+    if(sum(n_id == 2) > 0){
+      print("Warning: Pulling all sample types (Life Stage and Size) has resulted in multiple records for unique tows (Trawl_id).")
+      print("The `combine_tows` function can be used to combine these multiple records for unique tows if needed.")
+    }
+  }
+
   if(convert) {
 
     firstup <- function(x) {
       substr(x, 1, 1) <- toupper(substr(x, 1, 1))
       x
     }
     colnames(catch) <- firstup(colnames(catch))
-    colnames(catch)[colnames(catch)=="Cpue_kg_km2"] <- "cpue_kg_km2"
-    colnames(catch)[colnames(catch)=="Cpue_kg_per_ha_der"] <- "cpue_kg_per_ha_der"
-    colnames(catch)[colnames(catch)=="Total_catch_numbers"] <- "total_catch_numbers"
-    colnames(catch)[colnames(catch)=="Total_catch_wt_kg"] <- "total_catch_wt_kg"
+    colnames(catch)[colnames(catch) == "Cpue_kg_km2"] <- "cpue_kg_km2"
+    colnames(catch)[colnames(catch) == "Cpue_kg_per_ha_der"] <- "cpue_kg_per_ha_der"
+    colnames(catch)[colnames(catch) == "Total_catch_numbers"] <- "total_catch_numbers"
+    colnames(catch)[colnames(catch) == "Total_catch_wt_kg"] <- "total_catch_wt_kg"
   }
 
   save_rdata(

diff --git a/man-roxygen/sample_types.R b/man-roxygen/sample_types.R
@@ -0,0 +1,10 @@
+#' @param sample_types A character vector of sample types, i.e.,
+#' `"statistical_partition_dim"`, that you would like to keep. The default is
+#' to only keep `NA` values, both real and character NA. But, for some
+#' instances you may want to keep Life Stage and Size samples. The majority
+#' of samples with `"statistical_partition_dim"` of Size and Life Stage are
+#' Pacific hake and should not be considered different than regular survey
+#' samples. The other types of samples that may be designated Life Stage are
+#' egg cases that can be caught and identified for select elasmobranch
+#' species. These type of samples would not be included in the data to estimate
+#' indices of abundance and are omitted by default.
diff --git a/man/PullCatch.fn.Rd b/man/PullCatch.fn.Rd
diff --git a/man/combine_tows.Rd b/man/combine_tows.Rd
diff --git a/man/pull_catch.Rd b/man/pull_catch.Rd
diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
@@ -57,6 +57,30 @@ test_that("pull_catch-multispecies", {
   )
 })
 
+test_that("pull-sample-types", {
+  skip_on_cran()
+
+  set.seed(123)
+  data_hake <- pull_catch(
+    common_name = "Pacific hake",
+    years = c(2014, 2018),
+    survey = "NWFSC.Combo",
+    verbose = TRUE,
+    convert = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")[1:4]
+  )
+  expect_is(data_hake, "data.frame")
+  expect_equal(nrow(data_hake), 3556)
+  expect_equal(length(which(data_hake$cpue_kg_km2 == 0)), 1622)
+  expect_equal(length(unique(data_hake$Trawl_id)), 3442)
+
+  combine_hake <- combine_tows(
+    data = data_hake
+  )
+  expect_equal(length(unique(data_hake$Trawl_id)), nrow(combine_hake))
+  expect_equal(sum(data_hake$total_catch_numbers), sum(combine_hake$total_catch_numbers))
+})
+
 test_that("PullHaul", {
   skip_on_cran()