From 71a5686ea4532bbcec8af1ef60a328a9d0bd008c Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 30 Apr 2024 16:35:07 -0700
Subject: [PATCH 01/12] add sample type for pulling catch data

---
 R/PullCatch.fn.R           |  9 +++++---
 R/pull_catch.R             | 47 +++++++++++++++++++++-----------------
 man-roxygen/sample_types.R |  4 ++++
 3 files changed, 36 insertions(+), 24 deletions(-)
 create mode 100644 man-roxygen/sample_types.R

diff --git a/R/PullCatch.fn.R b/R/PullCatch.fn.R
index fd96572..95a66c4 100644
--- a/R/PullCatch.fn.R
+++ b/R/PullCatch.fn.R
@@ -47,7 +47,7 @@
 #'   * NWFSC.Hook.Line (not yet working),
 #'   * NWFSC.Video,
 #'   * Triennial.Canada
-#' 
+#'
 #' Currently, you must pull data one survey at a time, though we are working on
 #' allowing for a vector of survey names and
 #' `NWFSC.Shelf.Rockfish` and `NWFSC.Hook.Line` are not supported.
@@ -58,6 +58,7 @@
 #'   The name of the file within `Dir` will start with Catch_ and end with .rdata.
 #'   Default NULL which will not save an output file.
 #' @template verbose
+#' @template sample_types
 #'
 #' @author Chantel Wetzel (maintainer) based on code by John Wallace
 #' @export
@@ -95,7 +96,8 @@ PullCatch.fn <- function(
     SurveyName = NULL,
     SaveFile = lifecycle::deprecated(),
     Dir = NULL,
-    verbose = TRUE) {
+    verbose = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")[1:2]) {
 
   lifecycle::deprecate_soft(
     when = "2.3",
@@ -117,7 +119,8 @@ PullCatch.fn <- function(
     survey = SurveyName,
     dir = Dir,
     convert = TRUE,
-    verbose = verbose)
+    verbose = verbose,
+    sample_types = sample_types)
 
   return(Out)
 }
diff --git a/R/pull_catch.R b/R/pull_catch.R
index 80bfc25..6205dad 100644
--- a/R/pull_catch.R
+++ b/R/pull_catch.R
@@ -10,15 +10,12 @@
 #' with the intent that they provide the best available information for use
 #' in an index-standardization procedure. The removed samples may be of use
 #' to others with a less-restrictive goal than producing an index of abundance.
-#' For example, life-stage samples are excluded because they are not collected
-#' using the same protocols as standard samples.
+#' For example, depths sampled outside the standard protocol are excluded.
 #' To download all data, we currently recommend going to the
 #' [NWFSC data warehouse](https://www.webapps.nwfsc.noaa.gov/data)
 #' and using the csv link to extract data for a single species at a time.
 #' In the future, we hope to add functionality to this package such that
 #' downloading all data can be done easily within this function.
-#' See [Issue #43](https://github.com/pfmc-assessments/nwfscSurvey/issues/43)
-#' for more information.
 #'
 #' @template common_name
 #' @template sci_name
@@ -27,6 +24,7 @@
 #' @template dir
 #' @template convert
 #' @template verbose
+#' @template sample_types
 #'
 #' @author Chantel Wetzel
 #' @export
@@ -65,7 +63,8 @@ pull_catch <- function(common_name = NULL,
                        survey,
                        dir = NULL,
                        convert = TRUE,
-                       verbose = TRUE) {
+                       verbose = TRUE,
+                       sample_types = c("NA", NA, "Life Stage", "Size")[1:2]) {
 
   if (survey %in% c("NWFSC.Shelf.Rockfish", "NWFSC.Hook.Line")) {
     stop("The catch pull currently does not work for NWFSC Hook & Line Survey data.",
@@ -105,14 +104,15 @@ pull_catch <- function(common_name = NULL,
   # would allow us to eliminate vars_long form the main pull
 
   perf_codes <- c(
-    "operation_dim$legacy_performance_code",
-    "statistical_partition_dim$statistical_partition_type"
+    "operation_dim$legacy_performance_code"
   )
 
   vars_long <- c(
     "common_name", "scientific_name", "project", "year", "vessel", "tow",
     "total_catch_numbers", "total_catch_wt_kg",
     "subsample_count", "subsample_wt_kg",  "cpue_kg_per_ha_der",
+    "statistical_partition_dim$statistical_partition_type",
+    "partition",
     perf_codes
   )
 
@@ -149,13 +149,9 @@ pull_catch <- function(common_name = NULL,
     positive_tows[water_hauls, "operation_dim$legacy_performance_code"] <- -999
   }
 
-  # Retain on standard survey samples
-  # whether values are NA or "NA" varies based on the presence of "Life Stage" samples
-  standard_samples <- sum(is.na(positive_tows[, "statistical_partition_dim$statistical_partition_type"])) != nrow(positive_tows)
-  if (standard_samples) {
-    keep <- positive_tows[, "statistical_partition_dim$statistical_partition_type"] == "NA"
-    positive_tows <- positive_tows[keep, ]
-  }
+  positive_tows <- positive_tows[
+    positive_tows[, "statistical_partition_dim$statistical_partition_type"] %in% sample_types,
+  ]
 
   good_tows <- positive_tows[, "operation_dim$legacy_performance_code"] != 8
   positive_tows <- positive_tows[good_tows, ]
@@ -191,9 +187,6 @@ pull_catch <- function(common_name = NULL,
 
   all_tows <- all_tows[
     !duplicated(paste(all_tows$year, all_tows$pass, all_tows$vessel, all_tows$tow)),
-    #c("project", "trawl_id", "year", "pass", "vessel", "tow", "datetime_utc_iso", "depth_m",
-    #  "longitude_dd", "latitude_dd", "area_swept_ha_der"
-    #)
   ]
 
   positive_tows_grouped <- dplyr::group_by(
@@ -238,6 +231,8 @@ pull_catch <- function(common_name = NULL,
   ) |>
     dplyr::arrange(common_name, trawl_id)
 
+  colnames(catch)[colnames(catch) == "statistical_partition_dim$statistical_partition_type"] <- "partition_sample_types"
+
   # Need to check what this is doing
   no_area <- which(is.na(catch$area_swept_ha_der))
   if (length(no_area) > 0) {
@@ -252,6 +247,8 @@ pull_catch <- function(common_name = NULL,
 
   # Fill in zeros where needed
   catch[is.na(catch)] <- 0
+  catch[catch[,"partition_sample_types"] == 0, "partition_sample_types"] <- NA
+  catch[catch[,"partition"] == 0, "partition"] <- NA
 
   catch$date <- chron::chron(
     format(as.POSIXlt(catch$datetime_utc_iso, format = "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d"),
@@ -262,6 +259,14 @@ pull_catch <- function(common_name = NULL,
   catch$cpue_kg_km2 <- catch$cpue_kg_per_ha_der * 100
   colnames(catch)[which(colnames(catch) == "area_swept_ha_der")] <- "area_swept_ha"
 
+  if(sum(c("Life Stage", "Size") %in% sample_types) == 2) {
+    n_id <- table(catch$trawl_id)
+    if(sum(n_id == 2) > 0){
+      print("Warning: Pulling all sample types (Life Stage and Size) has resulted in multiple records for unique tows (Trawl_id).")
+      print("The `combine_tows` function can be used to combine these multiple records for unique tows if needed.")
+    }
+  }
+
   if(convert) {
 
     firstup <- function(x) {
@@ -269,10 +274,10 @@ pull_catch <- function(common_name = NULL,
       x
     }
     colnames(catch) <- firstup(colnames(catch))
-    colnames(catch)[colnames(catch)=="Cpue_kg_km2"] <- "cpue_kg_km2"
-    colnames(catch)[colnames(catch)=="Cpue_kg_per_ha_der"] <- "cpue_kg_per_ha_der"
-    colnames(catch)[colnames(catch)=="Total_catch_numbers"] <- "total_catch_numbers"
-    colnames(catch)[colnames(catch)=="Total_catch_wt_kg"] <- "total_catch_wt_kg"
+    colnames(catch)[colnames(catch) == "Cpue_kg_km2"] <- "cpue_kg_km2"
+    colnames(catch)[colnames(catch) == "Cpue_kg_per_ha_der"] <- "cpue_kg_per_ha_der"
+    colnames(catch)[colnames(catch) == "Total_catch_numbers"] <- "total_catch_numbers"
+    colnames(catch)[colnames(catch) == "Total_catch_wt_kg"] <- "total_catch_wt_kg"
   }
 
   save_rdata(
diff --git a/man-roxygen/sample_types.R b/man-roxygen/sample_types.R
new file mode 100644
index 0000000..9eb970a
--- /dev/null
+++ b/man-roxygen/sample_types.R
@@ -0,0 +1,4 @@
+#' @param sample_types A character vector of sample types, i.e.,
+#' `"statistical_partition_dim"`, that you would like to keep. The default is
+#' to only keep `NA` values, both real and character NA. But, for some
+#' instances you may want to keep Life Stage and Size samples.

From b2e1daa911dd925f1e99046a76f37f34f884f3d0 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 30 Apr 2024 16:35:31 -0700
Subject: [PATCH 02/12] add new function to combine multiple catch records for
 trawls

Function that can be used if sample types have been used and multiple records are returned for the same trawl_id.  This function would be up to the user to apply but a message has been added to the pull_catch function that prints if trawl_ids are not uniuqe.
---
 R/combine_tows.R | 62 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 R/combine_tows.R

diff --git a/R/combine_tows.R b/R/combine_tows.R
new file mode 100644
index 0000000..ca9fedc
--- /dev/null
+++ b/R/combine_tows.R
@@ -0,0 +1,62 @@
+#' Combine catch data by trawl_id for data pulled using the
+#' pull_catch function. Generally, only a single tow is returned
+#' by species and in those instances this function is not needed.
+#' This function is only needed in unique data situations. For
+#' example, if data are pulled with all sample_types included then
+#' there will often be data from the same trawl_id included as
+#' multiple records. In these instances this function allows for
+#' the sample data to be combined for unique trawl_id.
+#'
+#' @details
+#'
+#'
+#' @template dir
+#' @param data A data frame of catches obtained by using the pull_catch
+#' function.
+#' @template verbose
+#'
+#' @author Chantel Wetzel
+#' @export
+#'
+#' @importFrom dplyr group_by distinct
+#'
+#' @examples
+#' \dontrun{
+#'
+#'
+combine_tows <- function(dir = NULL, data, verbose = TRUE){
+
+  check_dir(dir = dir, verbose = verbose)
+
+  if(!"total_catch_numbers" %in% colnames(data)){
+    stop("The data object needs to be a data frame of pulled catches from the `pull_catch` function.")
+  }
+
+  original_colname <- colnames(data)
+  colnames(data) <- tolower(colnames(data))
+
+  find <- grep("trawl_id", colnames(data), ignore.case = TRUE)
+  n_id <- table(data[, find])
+  if(sum(n_id == 2) == 0){
+    stop("All trawl_ids are unique and there is no need to combine data.")
+  }
+
+  catch <- data |>
+    dplyr::group_by(common_name, trawl_id) |>
+    mutate(
+      total_catch_numbers = sum(total_catch_numbers),
+      total_catch_wt_kg = sum(total_catch_wt_kg),
+      subsample_count = sum(subsample_count),
+      subsample_wt_kg = sum(subsample_wt_kg),
+      cpue_kg_per_ha_der = sum(cpue_kg_per_ha_der),
+      cpue_kg_km2 = sum(cpue_kg_km2),
+      partition_sample_types = NA,
+      partition = NA
+    ) |>
+    dplyr::distinct(trawl_id, .keep_all = TRUE)
+
+  catch <- as.data.frame(catch)
+  colnames(catch) <- original_colname
+
+  return(catch)
+}

From c55a9914c6c84be5b96acec5c4489b37a72f98cc Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 30 Apr 2024 16:35:47 -0700
Subject: [PATCH 03/12] add tests for pulling with sample type

---
 tests/testthat/test-data.R | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
index 7868526..bd2433b 100644
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -55,6 +55,25 @@ test_that("pull_catch-multispecies", {
     label = "entries of 2 species filtered for lingcod",
     expected.label = "entries of lingcod"
   )
+
+  data_hake <- pull_catch(
+    common_name = "Pacific hake",
+    years = c(2014, 2018),
+    survey = "NWFSC.Combo",
+    verbose = TRUE,
+    covert = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")[1:4]
+  )
+  expect_is(data_hake, "data.frame")
+  expect_equal(nrow(data_hake), 3556)
+  expect_equal(length(which(data_hake$cpue_kg_km2 == 0)), 1622)
+  expect_equal(length(unique(data_hake$Trawl_id)) == 3442)
+
+  combine_hake <- combine_tows(
+    data = data_hake
+  )
+  expect_equal(length(unique(data_hake$Trawl_id)) == nrow(combine_hake))
+  expect_equal(sum(data_hake$total_catch_numbers) == sum(combine_hake$total_catch_numbers))
 })
 
 test_that("PullHaul", {

From 1f882a18bc3ecf99b3a347ec062c6808d64ecfca Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Wed, 1 May 2024 10:54:25 -0700
Subject: [PATCH 04/12] fix test

---
 tests/testthat/test-data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
index bd2433b..f93438a 100644
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -61,7 +61,7 @@ test_that("pull_catch-multispecies", {
     years = c(2014, 2018),
     survey = "NWFSC.Combo",
     verbose = TRUE,
-    covert = TRUE,
+    convert = TRUE,
     sample_types = c("NA", NA, "Life Stage", "Size")[1:4]
   )
   expect_is(data_hake, "data.frame")

From 33bae9c6acd061f0ad59fdd0115c605364f65435 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Wed, 1 May 2024 10:55:08 -0700
Subject: [PATCH 05/12] documentation

---
 NAMESPACE                  |  3 +++
 R/combine_tows.R           |  4 ----
 man-roxygen/sample_types.R |  8 +++++++-
 man/PullCatch.fn.Rd        | 14 +++++++++++++-
 man/combine_tows.Rd        | 38 ++++++++++++++++++++++++++++++++++++++
 man/pull_catch.Rd          | 19 ++++++++++++++-----
 6 files changed, 75 insertions(+), 11 deletions(-)
 create mode 100644 man/combine_tows.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 217758b..d0db6d2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -35,6 +35,7 @@ export(check_dir)
 export(check_survey)
 export(codify_sex)
 export(codify_sex_SS3)
+export(combine_tows)
 export(createMatrix)
 export(draw_USEEZ)
 export(draw_land)
@@ -67,6 +68,8 @@ import(glue)
 import(janitor)
 import(reshape2)
 import(stringr)
+importFrom(dplyr,distinct)
+importFrom(dplyr,group_by)
 importFrom(dplyr,left_join)
 importFrom(dplyr,rename)
 importFrom(grDevices,dev.off)
diff --git a/R/combine_tows.R b/R/combine_tows.R
index ca9fedc..1dd3ba3 100644
--- a/R/combine_tows.R
+++ b/R/combine_tows.R
@@ -7,8 +7,6 @@
 #' multiple records. In these instances this function allows for
 #' the sample data to be combined for unique trawl_id.
 #'
-#' @details
-#'
 #'
 #' @template dir
 #' @param data A data frame of catches obtained by using the pull_catch
@@ -20,8 +18,6 @@
 #'
 #' @importFrom dplyr group_by distinct
 #'
-#' @examples
-#' \dontrun{
 #'
 #'
 combine_tows <- function(dir = NULL, data, verbose = TRUE){
diff --git a/man-roxygen/sample_types.R b/man-roxygen/sample_types.R
index 9eb970a..b89078e 100644
--- a/man-roxygen/sample_types.R
+++ b/man-roxygen/sample_types.R
@@ -1,4 +1,10 @@
 #' @param sample_types A character vector of sample types, i.e.,
 #' `"statistical_partition_dim"`, that you would like to keep. The default is
 #' to only keep `NA` values, both real and character NA. But, for some
-#' instances you may want to keep Life Stage and Size samples.
+#' instances you may want to keep Life Stage and Size samples. The majority
+#' of samples with `"statistical_partition_dim"` of Size and Life Stage are
+#' Pacific hake and should not be considered different than regular survey
+#' samples. The other types of samples that may be designated Life Stage are
+#' egg cases that can be caught and identified for select elasmobranch
+#' species. These type of samples would not be included in the data to estimate
+#' indices of abundance and are omitted by default.
diff --git a/man/PullCatch.fn.Rd b/man/PullCatch.fn.Rd
index 05cf462..f5d049a 100644
--- a/man/PullCatch.fn.Rd
+++ b/man/PullCatch.fn.Rd
@@ -11,7 +11,8 @@ PullCatch.fn(
   SurveyName = NULL,
   SaveFile = lifecycle::deprecated(),
   Dir = NULL,
-  verbose = TRUE
+  verbose = TRUE,
+  sample_types = c("NA", NA, "Life Stage", "Size")[1:2]
 )
 }
 \arguments{
@@ -62,6 +63,17 @@ Default NULL which will not save an output file.}
 
 \item{verbose}{A logical that specifies if you want to print messages and
 warnings to the console. The default is \code{TRUE}.}
+
+\item{sample_types}{A character vector of sample types, i.e.,
+\code{"statistical_partition_dim"}, that you would like to keep. The default is
+to only keep \code{NA} values, both real and character NA. But, for some
+instances you may want to keep Life Stage and Size samples. The majority
+of samples with \code{"statistical_partition_dim"} of Size and Life Stage are
+Pacific hake and should not be considered different than regular survey
+samples. The other types of samples that may be designated Life Stage are
+egg cases that can be caught and identified for select elasmobranch
+species. These type of samples would not be included in the data to estimate
+indices of abundance and are omitted by default.}
 }
 \description{
 Pull catch data from the
diff --git a/man/combine_tows.Rd b/man/combine_tows.Rd
new file mode 100644
index 0000000..63036f2
--- /dev/null
+++ b/man/combine_tows.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/combine_tows.R
+\name{combine_tows}
+\alias{combine_tows}
+\title{Combine catch data by trawl_id for data pulled using the
+pull_catch function. Generally, only a single tow is returned
+by species and in those instances this function is not needed.
+This function is only needed in unique data situations. For
+example, if data are pulled with all sample_types included then
+there will often be data from the same trawl_id included as
+multiple records. In these instances this function allows for
+the sample data to be combined for unique trawl_id.}
+\usage{
+combine_tows(dir = NULL, data, verbose = TRUE)
+}
+\arguments{
+\item{dir}{directory where ouptut will be saved. The directory where the file should be saved.
+If dir = NULL no output will be saved.}
+
+\item{data}{A data frame of catches obtained by using the pull_catch
+function.}
+
+\item{verbose}{A logical that specifies if you want to print messages and
+warnings to the console. The default is \code{TRUE}.}
+}
+\description{
+Combine catch data by trawl_id for data pulled using the
+pull_catch function. Generally, only a single tow is returned
+by species and in those instances this function is not needed.
+This function is only needed in unique data situations. For
+example, if data are pulled with all sample_types included then
+there will often be data from the same trawl_id included as
+multiple records. In these instances this function allows for
+the sample data to be combined for unique trawl_id.
+}
+\author{
+Chantel Wetzel
+}
diff --git a/man/pull_catch.Rd b/man/pull_catch.Rd
index 8b95eda..6af87da 100644
--- a/man/pull_catch.Rd
+++ b/man/pull_catch.Rd
@@ -11,7 +11,8 @@ pull_catch(
   survey,
   dir = NULL,
   convert = TRUE,
-  verbose = TRUE
+  verbose = TRUE,
+  sample_types = c("NA", NA, "Life Stage", "Size")[1:2]
 )
 }
 \arguments{
@@ -76,6 +77,17 @@ which aligns with the expected names in data processing functions.}
 
 \item{verbose}{A logical that specifies if you want to print messages and
 warnings to the console. The default is \code{TRUE}.}
+
+\item{sample_types}{A character vector of sample types, i.e.,
+\code{"statistical_partition_dim"}, that you would like to keep. The default is
+to only keep \code{NA} values, both real and character NA. But, for some
+instances you may want to keep Life Stage and Size samples. The majority
+of samples with \code{"statistical_partition_dim"} of Size and Life Stage are
+Pacific hake and should not be considered different than regular survey
+samples. The other types of samples that may be designated Life Stage are
+egg cases that can be caught and identified for select elasmobranch
+species. These type of samples would not be included in the data to estimate
+indices of abundance and are omitted by default.}
 }
 \description{
 Pull catch data from the
@@ -88,15 +100,12 @@ The data available in the warehouse are cleaned prior to being downloaded
 with the intent that they provide the best available information for use
 in an index-standardization procedure. The removed samples may be of use
 to others with a less-restrictive goal than producing an index of abundance.
-For example, life-stage samples are excluded because they are not collected
-using the same protocols as standard samples.
+For example, depths sampled outside the standard protocol are excluded.
 To download all data, we currently recommend going to the
 \href{https://www.webapps.nwfsc.noaa.gov/data}{NWFSC data warehouse}
 and using the csv link to extract data for a single species at a time.
 In the future, we hope to add functionality to this package such that
 downloading all data can be done easily within this function.
-See \href{https://github.com/pfmc-assessments/nwfscSurvey/issues/43}{Issue #43}
-for more information.
 }
 \examples{
 \dontrun{

From 8d00fcf5f0b6f0ce6694d5244247e0010e8a9309 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Wed, 1 May 2024 11:37:01 -0700
Subject: [PATCH 06/12] separate out sample_type tests

---
 tests/testthat/test-data.R | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
index f93438a..550bb65 100644
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -55,7 +55,12 @@ test_that("pull_catch-multispecies", {
     label = "entries of 2 species filtered for lingcod",
     expected.label = "entries of lingcod"
   )
+})
+
+test_that("pull-sample-types", {
+  skip_on_cran()
 
+  set.seed(123)
   data_hake <- pull_catch(
     common_name = "Pacific hake",
     years = c(2014, 2018),
@@ -67,13 +72,13 @@ test_that("pull_catch-multispecies", {
   expect_is(data_hake, "data.frame")
   expect_equal(nrow(data_hake), 3556)
   expect_equal(length(which(data_hake$cpue_kg_km2 == 0)), 1622)
-  expect_equal(length(unique(data_hake$Trawl_id)) == 3442)
+  expect_equal(length(unique(data_hake$Trawl_id)), 3442)
 
   combine_hake <- combine_tows(
     data = data_hake
   )
-  expect_equal(length(unique(data_hake$Trawl_id)) == nrow(combine_hake))
-  expect_equal(sum(data_hake$total_catch_numbers) == sum(combine_hake$total_catch_numbers))
+  expect_equal(length(unique(data_hake$Trawl_id)), nrow(combine_hake))
+  expect_equal(sum(data_hake$total_catch_numbers), sum(combine_hake$total_catch_numbers))
 })
 
 test_that("PullHaul", {

From cc545b1b7bf64f17f218f1b5a5f564e7e7fe81a9 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 7 May 2024 10:32:55 -0700
Subject: [PATCH 07/12] add tests for partition samples

Removed the set.seed that was used throughout this function when these tests were created.
---
 tests/testthat/test-data.R | 40 ++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
index 550bb65..8d014ef 100644
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -7,7 +7,6 @@ set.seed(1)
 test_that("pull_catch", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- pull_catch(
     common_name = "lingcod",
     years = c(2003, 2018),
@@ -21,7 +20,6 @@ test_that("pull_catch", {
 test_that("pull_catch-multispecies", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- pull_catch(
     years = 2017,
     survey = "NWFSC.Combo",
@@ -60,14 +58,13 @@ test_that("pull_catch-multispecies", {
 test_that("pull-sample-types", {
   skip_on_cran()
 
-  set.seed(123)
   data_hake <- pull_catch(
     common_name = "Pacific hake",
     years = c(2014, 2018),
     survey = "NWFSC.Combo",
     verbose = TRUE,
     convert = TRUE,
-    sample_types = c("NA", NA, "Life Stage", "Size")[1:4]
+    sample_types = c("NA", NA, "Life Stage", "Size")
   )
   expect_is(data_hake, "data.frame")
   expect_equal(nrow(data_hake), 3556)
@@ -79,12 +76,42 @@ test_that("pull-sample-types", {
   )
   expect_equal(length(unique(data_hake$Trawl_id)), nrow(combine_hake))
   expect_equal(sum(data_hake$total_catch_numbers), sum(combine_hake$total_catch_numbers))
+
+  data_hake_3_types <- pull_catch(
+    common_name = "Pacific hake",
+    years = c(2014, 2018),
+    survey = "NWFSC.Combo",
+    verbose = TRUE,
+    convert = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")[1:3]
+  )
+  expect_equal(
+    sum(table(data_hake[which(data_hake$Partition_sample_types != "Size"), "Partition_sample_types"])),
+    sum(table(data_hake_3_types[, "Partition_sample_types"])))
+
+  data_eggs <- pull_catch(
+    common_name = "big skate",
+    years = c(2014, 2019),
+    survey = "NWFSC.Combo",
+    verbose = TRUE,
+    convert = TRUE,
+    sample_types = c("NA", NA, "Life Stage", "Size")
+  )
+
+  combine_eggs <- combine_tows(
+    data = data_eggs
+  )
+  expect_equal(
+    nrow(data_eggs) - sum(data_eggs$Partition %in% c("Eggs", "Egg Cases")),
+    nrow(combine_eggs))
+  expect_equal(sum(data_eggs$total_catch_numbers[which(!data_eggs$Partition %in% c("Eggs", "Egg Cases"))]),
+               sum(combine_eggs$total_catch_numbers))
+
 })
 
 test_that("PullHaul", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- PullHaul.fn(
     YearRange = c(2003, 2018),
     SurveyName = "NWFSC.Combo",
@@ -97,7 +124,6 @@ test_that("PullHaul", {
 test_that("pull_bio", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- pull_bio(
     common_name = "lingcod",
     years = c(2016, 2017),
@@ -111,7 +137,6 @@ test_that("pull_bio", {
 test_that("pull_bio_triennial", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- pull_bio(
     common_name = "lingcod",
     years = c(1980, 1992),
@@ -126,7 +151,6 @@ test_that("pull_bio_triennial", {
 test_that("pull_biological_samples", {
   skip_on_cran()
 
-  set.seed(123)
   dat <- pull_biological_samples(
     common_name = "lingcod",
     years = c(2003, 2017),

From 9771222ebf772a43cba5892cec34bc99aac50b5d Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 7 May 2024 10:33:17 -0700
Subject: [PATCH 08/12] rm sample_type function input

---
 R/PullCatch.fn.R | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/R/PullCatch.fn.R b/R/PullCatch.fn.R
index 95a66c4..f02dcdb 100644
--- a/R/PullCatch.fn.R
+++ b/R/PullCatch.fn.R
@@ -58,7 +58,6 @@
 #'   The name of the file within `Dir` will start with Catch_ and end with .rdata.
 #'   Default NULL which will not save an output file.
 #' @template verbose
-#' @template sample_types
 #'
 #' @author Chantel Wetzel (maintainer) based on code by John Wallace
 #' @export
@@ -96,8 +95,7 @@ PullCatch.fn <- function(
     SurveyName = NULL,
     SaveFile = lifecycle::deprecated(),
     Dir = NULL,
-    verbose = TRUE,
-    sample_types = c("NA", NA, "Life Stage", "Size")[1:2]) {
+    verbose = TRUE) {
 
   lifecycle::deprecate_soft(
     when = "2.3",
@@ -119,8 +117,7 @@ PullCatch.fn <- function(
     survey = SurveyName,
     dir = Dir,
     convert = TRUE,
-    verbose = verbose,
-    sample_types = sample_types)
+    verbose = verbose)
 
   return(Out)
 }

From 4ee063d8588327cdae43cbf0bbfe2dce66f9110b Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 7 May 2024 10:33:45 -0700
Subject: [PATCH 09/12] refactor to warning rather than print

---
 R/pull_catch.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pull_catch.R b/R/pull_catch.R
index 6205dad..87159cb 100644
--- a/R/pull_catch.R
+++ b/R/pull_catch.R
@@ -262,8 +262,8 @@ pull_catch <- function(common_name = NULL,
   if(sum(c("Life Stage", "Size") %in% sample_types) == 2) {
     n_id <- table(catch$trawl_id)
     if(sum(n_id == 2) > 0){
-      print("Warning: Pulling all sample types (Life Stage and Size) has resulted in multiple records for unique tows (Trawl_id).")
-      print("The `combine_tows` function can be used to combine these multiple records for unique tows if needed.")
+      warning("Warning: Pulling all sample types (Life Stage and Size) has resulted in multiple records for unique tows (Trawl_id).
+      \n The `combine_tows` function can be used to combine these multiple records for unique tows if needed.")
     }
   }
 

From afcda13ed80787b445de3ffed2bc014655acdd57 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 7 May 2024 10:35:37 -0700
Subject: [PATCH 10/12] add code to rm specific partition and add save

1. Remove partition samples that should not be used for index creation.
2. Add code to save the combined tows data
3. Add spaces for code style
4. Revise function documentation
---
 R/combine_tows.R | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/R/combine_tows.R b/R/combine_tows.R
index 1dd3ba3..874243a 100644
--- a/R/combine_tows.R
+++ b/R/combine_tows.R
@@ -1,30 +1,31 @@
-#' Combine catch data by trawl_id for data pulled using the
+#' Combine data with multiple records for unique tows
+#'
+#' Combine catch data by `trawl_id` for data pulled using the
 #' pull_catch function. Generally, only a single tow is returned
 #' by species and in those instances this function is not needed.
-#' This function is only needed in unique data situations. For
-#' example, if data are pulled with all sample_types included then
-#' there will often be data from the same trawl_id included as
-#' multiple records. In these instances this function allows for
-#' the sample data to be combined for unique trawl_id.
+#' For example, if data are pulled with all sample_types included,
+#' then there will often be data from the same `trawl_id` included
+#' as multiple records. In these instances, this function allows
+#' for the sample data to be combined for each unique `trawl_id`.
 #'
 #'
-#' @template dir
-#' @param data A data frame of catches obtained by using the pull_catch
+#' @param data A data frame of catches obtained by using the `pull_catch`
 #' function.
+#' @template dir
 #' @template verbose
 #'
 #' @author Chantel Wetzel
 #' @export
 #'
-#' @importFrom dplyr group_by distinct
+#' @importFrom dplyr group_by reframe
 #'
 #'
 #'
-combine_tows <- function(dir = NULL, data, verbose = TRUE){
+combine_tows <- function(data, dir = NULL, verbose = TRUE){
 
   check_dir(dir = dir, verbose = verbose)
 
-  if(!"total_catch_numbers" %in% colnames(data)){
+  if (!"total_catch_numbers" %in% colnames(data)) {
     stop("The data object needs to be a data frame of pulled catches from the `pull_catch` function.")
   }
 
@@ -33,26 +34,40 @@ combine_tows <- function(dir = NULL, data, verbose = TRUE){
 
   find <- grep("trawl_id", colnames(data), ignore.case = TRUE)
   n_id <- table(data[, find])
-  if(sum(n_id == 2) == 0){
+  if (sum(n_id == 2) == 0) {
     stop("All trawl_ids are unique and there is no need to combine data.")
   }
 
+  partition_to_keep <- c("NA", NA, "Large", "Small", "Unspecified", "YOY")
+
   catch <- data |>
+    dplyr::filter(partition %in% partition_to_keep) |>
     dplyr::group_by(common_name, trawl_id) |>
-    mutate(
+    dplyr::mutate(
       total_catch_numbers = sum(total_catch_numbers),
       total_catch_wt_kg = sum(total_catch_wt_kg),
       subsample_count = sum(subsample_count),
       subsample_wt_kg = sum(subsample_wt_kg),
       cpue_kg_per_ha_der = sum(cpue_kg_per_ha_der),
       cpue_kg_km2 = sum(cpue_kg_km2),
-      partition_sample_types = NA,
-      partition = NA
+      partition_sample_types = paste0(partition_sample_types, collapse = "_"),
+      partition = paste0(partition, collapse = "_")
     ) |>
     dplyr::distinct(trawl_id, .keep_all = TRUE)
 
   catch <- as.data.frame(catch)
+
+  sp <- unique(data$common_name)
+  if (length(sp) > 1) {
+    sp <- "multispecies"
+  }
+  survey <- unique(data$project)
+
   colnames(catch) <- original_colname
+  if (!is.null(dir)) {
+    save(catch, file = file.path(dir, paste0("combined_catch_data_", sp, "_", survey, "_", Sys.Date(), ".rdata")))
+  }
+
 
   return(catch)
 }

From dd46b8da35de45857b753224d34f8cb4192dc0ad Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Tue, 7 May 2024 10:36:49 -0700
Subject: [PATCH 11/12] udpate documentation

---
 NAMESPACE                  |  2 +-
 man-roxygen/sample_types.R |  4 ++--
 man/Format.AKSlope.fn.Rd   |  8 ++++----
 man/PullCatch.fn.Rd        | 14 +-------------
 man/combine_tows.Rd        | 28 ++++++++++------------------
 man/pull_catch.Rd          |  4 ++--
 6 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index d0db6d2..674fb95 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -68,9 +68,9 @@ import(glue)
 import(janitor)
 import(reshape2)
 import(stringr)
-importFrom(dplyr,distinct)
 importFrom(dplyr,group_by)
 importFrom(dplyr,left_join)
+importFrom(dplyr,reframe)
 importFrom(dplyr,rename)
 importFrom(grDevices,dev.off)
 importFrom(grDevices,gray)
diff --git a/man-roxygen/sample_types.R b/man-roxygen/sample_types.R
index b89078e..13590e3 100644
--- a/man-roxygen/sample_types.R
+++ b/man-roxygen/sample_types.R
@@ -6,5 +6,5 @@
 #' Pacific hake and should not be considered different than regular survey
 #' samples. The other types of samples that may be designated Life Stage are
 #' egg cases that can be caught and identified for select elasmobranch
-#' species. These type of samples would not be included in the data to estimate
-#' indices of abundance and are omitted by default.
+#' species. These type of samples should not be included in the data used
+#' to estimate indices of abundance and are omitted by default.
diff --git a/man/Format.AKSlope.fn.Rd b/man/Format.AKSlope.fn.Rd
index 9838a9f..14e5573 100644
--- a/man/Format.AKSlope.fn.Rd
+++ b/man/Format.AKSlope.fn.Rd
@@ -48,14 +48,14 @@ of containing catch, length, and age data.
 \dontrun{
   # load data files for catch and biological data
    load("Tri.Shelf.and.AFSC.Slope.canary.Catch.24.May.11.dmp")
-   catch = Tri.Shelf.and.AFSC.Slope.canary.Catch.24.May.11 
+   catch = Tri.Shelf.and.AFSC.Slope.canary.Catch.24.May.11
    load("AFSC.Slope.Shelf.sable.bio.5.24.11.dmp")
    bio = AK.Surveys.Bio.sablefish.24.May.11
    # call function and reformat the data
    filter.dat = Format.AKSlope.fn(
-     datTows = catch, 
-     datL = bio, 
-     start.year = 1997) 
+     datTows = catch,
+     datL = bio,
+     start.year = 1997)
    catch = filter.dat$datTows
    len = filter.dat$length
    age = filter.dat$age
diff --git a/man/PullCatch.fn.Rd b/man/PullCatch.fn.Rd
index f5d049a..05cf462 100644
--- a/man/PullCatch.fn.Rd
+++ b/man/PullCatch.fn.Rd
@@ -11,8 +11,7 @@ PullCatch.fn(
   SurveyName = NULL,
   SaveFile = lifecycle::deprecated(),
   Dir = NULL,
-  verbose = TRUE,
-  sample_types = c("NA", NA, "Life Stage", "Size")[1:2]
+  verbose = TRUE
 )
 }
 \arguments{
@@ -63,17 +62,6 @@ Default NULL which will not save an output file.}
 
 \item{verbose}{A logical that specifies if you want to print messages and
 warnings to the console. The default is \code{TRUE}.}
-
-\item{sample_types}{A character vector of sample types, i.e.,
-\code{"statistical_partition_dim"}, that you would like to keep. The default is
-to only keep \code{NA} values, both real and character NA. But, for some
-instances you may want to keep Life Stage and Size samples. The majority
-of samples with \code{"statistical_partition_dim"} of Size and Life Stage are
-Pacific hake and should not be considered different than regular survey
-samples. The other types of samples that may be designated Life Stage are
-egg cases that can be caught and identified for select elasmobranch
-species. These type of samples would not be included in the data to estimate
-indices of abundance and are omitted by default.}
 }
 \description{
 Pull catch data from the
diff --git a/man/combine_tows.Rd b/man/combine_tows.Rd
index 63036f2..6349468 100644
--- a/man/combine_tows.Rd
+++ b/man/combine_tows.Rd
@@ -2,36 +2,28 @@
 % Please edit documentation in R/combine_tows.R
 \name{combine_tows}
 \alias{combine_tows}
-\title{Combine catch data by trawl_id for data pulled using the
-pull_catch function. Generally, only a single tow is returned
-by species and in those instances this function is not needed.
-This function is only needed in unique data situations. For
-example, if data are pulled with all sample_types included then
-there will often be data from the same trawl_id included as
-multiple records. In these instances this function allows for
-the sample data to be combined for unique trawl_id.}
+\title{Combine data with multiple records for unique tows}
 \usage{
-combine_tows(dir = NULL, data, verbose = TRUE)
+combine_tows(data, dir = NULL, verbose = TRUE)
 }
 \arguments{
+\item{data}{A data frame of catches obtained by using the \code{pull_catch}
+function.}
+
 \item{dir}{directory where ouptut will be saved. The directory where the file should be saved.
 If dir = NULL no output will be saved.}
 
-\item{data}{A data frame of catches obtained by using the pull_catch
-function.}
-
 \item{verbose}{A logical that specifies if you want to print messages and
 warnings to the console. The default is \code{TRUE}.}
 }
 \description{
-Combine catch data by trawl_id for data pulled using the
+Combine catch data by \code{trawl_id} for data pulled using the
 pull_catch function. Generally, only a single tow is returned
 by species and in those instances this function is not needed.
-This function is only needed in unique data situations. For
-example, if data are pulled with all sample_types included then
-there will often be data from the same trawl_id included as
-multiple records. In these instances this function allows for
-the sample data to be combined for unique trawl_id.
+For example, if data are pulled with all sample_types included,
+then there will often be data from the same \code{trawl_id} included
+as multiple records. In these instances, this function allows
+for the sample data to be combined for each unique \code{trawl_id}.
 }
 \author{
 Chantel Wetzel
diff --git a/man/pull_catch.Rd b/man/pull_catch.Rd
index 6af87da..fbaba14 100644
--- a/man/pull_catch.Rd
+++ b/man/pull_catch.Rd
@@ -86,8 +86,8 @@ of samples with \code{"statistical_partition_dim"} of Size and Life Stage are
 Pacific hake and should not be considered different than regular survey
 samples. The other types of samples that may be designated Life Stage are
 egg cases that can be caught and identified for select elasmobranch
-species. These type of samples would not be included in the data to estimate
-indices of abundance and are omitted by default.}
+species. These type of samples should not be included in the data used
+to estimate indices of abundance and are omitted by default.}
 }
 \description{
 Pull catch data from the

From 36441667308c00e68517cfda833c48d3abf33718 Mon Sep 17 00:00:00 2001
From: Chantel Wetzel <chantelwetzel@noaa.gov>
Date: Wed, 8 May 2024 10:19:03 -0700
Subject: [PATCH 12/12] make tow number check more robust

---
 R/combine_tows.R |  2 +-
 R/pull_catch.R   | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/combine_tows.R b/R/combine_tows.R
index 874243a..7900504 100644
--- a/R/combine_tows.R
+++ b/R/combine_tows.R
@@ -34,7 +34,7 @@ combine_tows <- function(data, dir = NULL, verbose = TRUE){
 
   find <- grep("trawl_id", colnames(data), ignore.case = TRUE)
   n_id <- table(data[, find])
-  if (sum(n_id == 2) == 0) {
+  if (all(n_id == 1)) {
     stop("All trawl_ids are unique and there is no need to combine data.")
   }
 
diff --git a/R/pull_catch.R b/R/pull_catch.R
index 87159cb..c806164 100644
--- a/R/pull_catch.R
+++ b/R/pull_catch.R
@@ -72,7 +72,7 @@ pull_catch <- function(common_name = NULL,
       "\nContact John Harms (john.harms@noaa.gov) for the full data set.")
   }
 
-  if(length(c(common_name, sci_name)) != max(c(length(common_name), length(sci_name)))){
+  if (length(c(common_name, sci_name)) != max(c(length(common_name), length(sci_name)))) {
     stop("Can not pull data using both the common_name or sci_name together.
          \n Please retry using only one." )
   }
@@ -139,7 +139,7 @@ pull_catch <- function(common_name = NULL,
 
   # Pull data from positive tows for selected species
   positive_tows <- try(get_json(url = url_text))
-  if(!is.data.frame(positive_tows)){
+  if (!is.data.frame(positive_tows)) {
     stop()
   }
 
@@ -157,7 +157,7 @@ pull_catch <- function(common_name = NULL,
   positive_tows <- positive_tows[good_tows, ]
   positive_tows <- positive_tows[, vars_short]
 
-  if(sum(is.na(positive_tows[, "common_name"])) > 0) {
+  if (sum(is.na(positive_tows[, "common_name"])) > 0) {
     replace <- which(is.na(positive_tows[, "common_name"]))
     positive_tows[replace, "common_name"] <- positive_tows[replace, "scientific_name"]
   }
@@ -259,15 +259,15 @@ pull_catch <- function(common_name = NULL,
   catch$cpue_kg_km2 <- catch$cpue_kg_per_ha_der * 100
   colnames(catch)[which(colnames(catch) == "area_swept_ha_der")] <- "area_swept_ha"
 
-  if(sum(c("Life Stage", "Size") %in% sample_types) == 2) {
+  if (sum(c("Life Stage", "Size") %in% sample_types) == 2) {
     n_id <- table(catch$trawl_id)
-    if(sum(n_id == 2) > 0){
+    if (any(n_id > 0)) {
       warning("Warning: Pulling all sample types (Life Stage and Size) has resulted in multiple records for unique tows (Trawl_id).
       \n The `combine_tows` function can be used to combine these multiple records for unique tows if needed.")
     }
   }
 
-  if(convert) {
+  if (convert) {
 
     firstup <- function(x) {
       substr(x, 1, 1) <- toupper(substr(x, 1, 1))