From 2d9acb8b335a9dc281b7a4b460706e90e1a3d218 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 5 Oct 2024 08:58:46 +0300 Subject: [PATCH 01/61] Add parameter definitions to summarize.R Signed-off-by: Awa Synthia --- NAMESPACE | 1 - R/summarize.R | 157 +++++++++++++++++++++++++++++++---------- man/count_bycol.Rd | 22 +++++- man/elements2words.Rd | 23 ++++-- man/filter_freq.Rd | 10 ++- man/summ.DA.Rd | 13 +++- man/summ.DA.byLin.Rd | 9 ++- man/summ.GC.Rd | 14 +++- man/summ.GC.byDALin.Rd | 15 +++- man/summarize_bylin.Rd | 15 +++- man/total_counts.Rd | 24 +++++-- man/words2wc.Rd | 11 ++- 12 files changed, 249 insertions(+), 65 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 16cf0813..9d73120a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -131,7 +131,6 @@ importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) -importFrom(dplyr,n_distinct) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,right_join) diff --git a/R/summarize.R b/R/summarize.R index a9b13e43..e03ca463 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -91,18 +91,31 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov ## Function to obtain element counts (DA, GC) #' Count Bycol #' -#' @param prot -#' @param column -#' @param min.freq +#' @param prot A data frame containing the dataset to analyze, typically with +#' multiple columns including the one specified by the `column` parameter. +#' @param column A character string specifying the name of the column to analyze. +#' The default is "DomArch". +#' @param min.freq An integer specifying the minimum frequency an element must +#' have to be included in the output. Default is 1. 
#' #' @importFrom dplyr arrange as_tibble filter select #' -#' @return Describe return, in detail +#' @return A tibble with two columns: +#' \describe{ +#' \item{`column`}{The unique elements from the specified column +#' (e.g., "DomArch").} +#' \item{`freq`}{The frequency of each element, i.e., the number of times +#' each element appears in the specified column.} +#' } +#' The tibble is filtered to only include elements that have a frequency +#' greater than or equal to `min.freq` and does not include elements with `NA` +#' values or those starting with a hyphen ("-"). +#' #' @export #' #' @examples #' \dontrun{ -#' count_bycol() +#' count_bycol(prot = my_data, column = "DomArch", min.freq = 10) #' } count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { counts <- prot %>% @@ -123,19 +136,30 @@ count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic #' context (GC) #' -#' @param prot [dataframe] -#' @param column [string] column name -#' @param conversion_type [string] type of conversion: 'da2doms': domain architectures to -#' domains. 'gc2da' genomic context to domain architectures +#' @param prot A dataframe containing the dataset to analyze. The specified +#' `column` contains the string elements to be processed. +#' @param column A character string specifying the name of the column to analyze. +#' Default is "DomArch". +#' @param conversion_type A character string specifying the type of conversion. 
+#' Two options are available: +#' \describe{ +#' \item{`da2doms`}{Convert domain architectures into individual domains by +#' replacing `+` symbols with spaces.} +#' \item{`gc2da`}{Convert genomic context into domain architectures by +#' replacing directional symbols (`<-`, `->`, and `|`) with spaces.} +#' } #' #' @importFrom dplyr pull #' @importFrom stringr str_replace_all #' -#' @return [string] with words delimited by spaces +#' @return A single string where elements are delimited by spaces. The function +#' performs necessary substitutions based on the `conversion_type` and cleans up +#' extraneous characters like newlines, tabs, and multiple spaces. #' #' @examples #' \dontrun{ -#' tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2words() +#' tibble::tibble(DomArch = c("aaa+bbb", +#' "a+b", "b+c", "b-c")) |> elements2words() #' } #' elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms") { @@ -175,11 +199,19 @@ elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] #' -#' @param string +#' @param string A character string containing the elements (words) to count. +#' This would typically be a space-delimited string representing domain +#' architectures or genomic contexts. 
#' -#' @importFrom dplyr as_tibble filter +#' @importFrom dplyr as_tibble filter arrange +#' @importFrom stringr str_replace_all #' -#' @return [tbl_df] table with 2 columns: 1) words & 2) counts/frequency +#' @return A tibble (tbl_df) with two columns: +#' \describe{ +#' \item{`words`}{A column containing the individual words +#' (domains or domain architectures).} +#' \item{`freq`}{A column containing the frequency counts for each word.} +#' } #' #' @examples #' \dontrun{ @@ -219,10 +251,15 @@ words2wc <- function(string) { ## Function to filter based on frequencies #' Filter Frequency #' -#' @param x -#' @param min.freq +#' @param x A tibble (tbl_df) containing at least two columns: one for +#' elements (e.g., `words`) and one for their frequency (e.g., `freq`). +#' @param min.freq A numeric value specifying the minimum frequency threshold. +#' Only elements with frequencies greater than or equal to this value will be +#' retained. +#' +#' @return A tibble with the same structure as `x`, but filtered to include +#' only rows where the frequency is greater than or equal to `min.freq`. #' -#' @return Describe return, in detail #' @export #' #' @examples @@ -239,15 +276,20 @@ filter_freq <- function(x, min.freq) { ######################### #' Summarize by Lineage #' -#' @param prot -#' @param column -#' @param by -#' @param query +#' @param prot A dataframe or tibble containing the data. +#' @param column A string representing the column to be summarized +#' (e.g., `DomArch`). Default is "DomArch". +#' @param by A string representing the grouping column (e.g., `Lineage`). +#' Default is "Lineage". +#' @param query A string specifying the query pattern for filtering the target +#' column. Use "all" to skip filtering and include all rows. 
#' #' @importFrom dplyr arrange filter group_by summarise #' @importFrom rlang sym #' -#' @return Describe return, in detail +#' @return A tibble summarizing the counts of occurrences of elements in +#' the `column`, grouped by the `by` column. The result includes the number +#' of occurrences (`count`) and is arranged in descending order of count. #' @export #' #' @examples @@ -283,11 +325,17 @@ summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", #' Function to summarize and retrieve counts by Domains & Domains+Lineage #' #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `DomArch` and `Lineage`. #' #' @importFrom dplyr arrange count desc filter group_by summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing the counts of unique domain architectures +#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three +#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency +#' of each domain architecture for each lineage. The results are arranged in +#' descending order of `count`. +#' #' @export #' #' @examples @@ -309,11 +357,18 @@ summ.DA.byLin <- function(x) { #' @description #' Function to retrieve counts of how many lineages a DomArch appears in #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have a column +#' named `DomArch` and a count column, such as `count`, which represents the +#' occurrences of each architecture in various lineages. #' #' @importFrom dplyr arrange group_by filter summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing each unique `DomArch`, along with the following +#' columns: +#' - `totalcount`: The total occurrences of each `DomArch` across all lineages. +#' - `totallin`: The total number of unique lineages in which each `DomArch` +#' appears. +#' The results are arranged in descending order of `totallin` and `totalcount`. 
#' @export #' #' @examples @@ -332,11 +387,20 @@ summ.DA <- function(x) { #' summ.GC.byDALin #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' -#' @return Define return, in detail +#' @return A tibble summarizing each unique combination of `GenContext`, +#' `DomArch`, and `Lineage`, along with the following columns: +#' - `GenContext`: The genomic context for each entry. +#' - `DomArch`: The domain architecture for each entry. +#' - `Lineage`: The lineage associated with each entry. +#' - `count`: The total number of occurrences for each combination of +#' `GenContext`, `DomArch`, and `Lineage`. +#' +#' The results are arranged in descending order of `count`. #' @export #' #' @examples @@ -382,11 +446,19 @@ summ.GC.byLin <- function(x) { #' summ.GC #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `GenContext`, `DomArch`, and `Lineage`. #' -#' @importFrom dplyr arrange desc filter group_by n_distinct summarise +#' @importFrom dplyr arrange desc filter group_by n summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing each unique combination of `GenContext` and +#' `Lineage`, along with the following columns: +#' - `GenContext`: The genomic context for each entry. +#' - `Lineage`: The lineage associated with each entry. +#' - `count`: The total number of occurrences for each combination of +#' `GenContext` and `Lineage`. +#' +#' The results are arranged in descending order of `count`. #' @export #' #' @examples @@ -419,16 +491,27 @@ summ.GC <- function(x) { #' #' @param prot A data frame that must contain columns: #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count} -#' @param column Character. The column to summarize -#' @param lineage_col -#' @param cutoff Numeric. Cutoff for total count. 
Counts below cutoff value will not be shown. Default is 0. -#' @param RowsCutoff -#' @param digits +#' @param column Character. The column to summarize, default is "DomArch". +#' @param lineage_col Character. The name of the lineage column, default is +#' "Lineage". +#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value +#' will not be shown. Default is 0. +#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage +#' cutoff. Default is FALSE. +#' @param digits Numeric. Number of decimal places for percentage columns. +#' Default is 2. +#' #' #' @importFrom dplyr arrange distinct filter group_by left_join mutate select summarise ungroup #' @importFrom rlang as_string sym #' -#' @return Define return, in detail +#' @return A data frame with the following columns: +#' - `{{ column }}`: Unique values from the specified column. +#' - `totalcount`: The total count of occurrences for each unique value in +#' the specified column. +#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to +#' the overall count. +#' - `CumulativePercent`: The cumulative percentage of total counts. #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or diff --git a/man/count_bycol.Rd b/man/count_bycol.Rd index 884c0f0f..946a7ea2 100644 --- a/man/count_bycol.Rd +++ b/man/count_bycol.Rd @@ -7,16 +7,32 @@ count_bycol(prot = prot, column = "DomArch", min.freq = 1) } \arguments{ -\item{min.freq}{} +\item{prot}{A data frame containing the dataset to analyze, typically with +multiple columns including the one specified by the \code{column} parameter.} + +\item{column}{A character string specifying the name of the column to analyze. +The default is "DomArch".} + +\item{min.freq}{An integer specifying the minimum frequency an element must +have to be included in the output. 
Default is 1.} } \value{ -Describe return, in detail +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). } \description{ Count Bycol } \examples{ \dontrun{ -count_bycol() +count_bycol(prot = my_data, column = "DomArch", min.freq = 10) } } diff --git a/man/elements2words.Rd b/man/elements2words.Rd index 80fcbafb..bda447db 100644 --- a/man/elements2words.Rd +++ b/man/elements2words.Rd @@ -7,15 +7,25 @@ elements2words(prot, column = "DomArch", conversion_type = "da2doms") } \arguments{ -\item{prot}{\link{dataframe}} +\item{prot}{A dataframe containing the dataset to analyze. The specified +\code{column} contains the string elements to be processed.} -\item{column}{\link{string} column name} +\item{column}{A character string specifying the name of the column to analyze. +Default is "DomArch".} -\item{conversion_type}{\link{string} type of conversion: 'da2doms': domain architectures to -domains. 'gc2da' genomic context to domain architectures} +\item{conversion_type}{A character string specifying the type of conversion. +Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} } \value{ -\link{string} with words delimited by spaces +A single string where elements are delimited by spaces. 
The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. } \description{ Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -23,7 +33,8 @@ context (GC) } \examples{ \dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2words() +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2words() } } diff --git a/man/filter_freq.Rd b/man/filter_freq.Rd index ce4db5ac..9dfba73b 100644 --- a/man/filter_freq.Rd +++ b/man/filter_freq.Rd @@ -7,10 +7,16 @@ filter_freq(x, min.freq) } \arguments{ -\item{min.freq}{} +\item{x}{A tibble (tbl_df) containing at least two columns: one for +elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} } \value{ -Describe return, in detail +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. } \description{ Filter Frequency diff --git a/man/summ.DA.Rd b/man/summ.DA.Rd index 13717140..01d15b3c 100644 --- a/man/summ.DA.Rd +++ b/man/summ.DA.Rd @@ -7,10 +7,19 @@ summ.DA(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have a column +named \code{DomArch} and a count column, such as \code{count}, which represents the +occurrences of each architecture in various lineages.} } \value{ -Describe return, in detail +A tibble summarizing each unique \code{DomArch}, along with the following +columns: +\itemize{ +\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. +\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} +appears. 
+The results are arranged in descending order of \code{totallin} and \code{totalcount}. +} } \description{ Function to retrieve counts of how many lineages a DomArch appears in diff --git a/man/summ.DA.byLin.Rd b/man/summ.DA.byLin.Rd index 66555fd5..d88e5d37 100644 --- a/man/summ.DA.byLin.Rd +++ b/man/summ.DA.byLin.Rd @@ -7,10 +7,15 @@ summ.DA.byLin(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{DomArch} and \code{Lineage}.} } \value{ -Describe return, in detail +A tibble summarizing the counts of unique domain architectures +(\code{DomArch}) per lineage (\code{Lineage}). The resulting table contains three +columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency +of each domain architecture for each lineage. The results are arranged in +descending order of \code{count}. } \description{ Function to summarize and retrieve counts by Domains & Domains+Lineage diff --git a/man/summ.GC.Rd b/man/summ.GC.Rd index fa52a6bf..2ec4d651 100644 --- a/man/summ.GC.Rd +++ b/man/summ.GC.Rd @@ -7,10 +7,20 @@ summ.GC(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} } \value{ -Describe return, in detail +A tibble summarizing each unique combination of \code{GenContext} and +\code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext} and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. 
} \description{ summ.GC diff --git a/man/summ.GC.byDALin.Rd b/man/summ.GC.byDALin.Rd index 34c9f84d..7fc8d443 100644 --- a/man/summ.GC.byDALin.Rd +++ b/man/summ.GC.byDALin.Rd @@ -7,10 +7,21 @@ summ.GC.byDALin(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} } \value{ -Define return, in detail +A tibble summarizing each unique combination of \code{GenContext}, +\code{DomArch}, and \code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{DomArch}: The domain architecture for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext}, \code{DomArch}, and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. } \description{ summ.GC.byDALin diff --git a/man/summarize_bylin.Rd b/man/summarize_bylin.Rd index a94c54c1..92b93652 100644 --- a/man/summarize_bylin.Rd +++ b/man/summarize_bylin.Rd @@ -7,10 +7,21 @@ summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) } \arguments{ -\item{query}{} +\item{prot}{A dataframe or tibble containing the data.} + +\item{column}{A string representing the column to be summarized +(e.g., \code{DomArch}). Default is "DomArch".} + +\item{by}{A string representing the grouping column (e.g., \code{Lineage}). +Default is "Lineage".} + +\item{query}{A string specifying the query pattern for filtering the target +column. Use "all" to skip filtering and include all rows.} } \value{ -Describe return, in detail +A tibble summarizing the counts of occurrences of elements in +the \code{column}, grouped by the \code{by} column. The result includes the number +of occurrences (\code{count}) and is arranged in descending order of count. 
} \description{ Summarize by Lineage diff --git a/man/total_counts.Rd b/man/total_counts.Rd index 49db8822..53d70096 100644 --- a/man/total_counts.Rd +++ b/man/total_counts.Rd @@ -17,14 +17,30 @@ total_counts( \item{prot}{A data frame that must contain columns: \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} -\item{column}{Character. The column to summarize} +\item{column}{Character. The column to summarize, default is "DomArch".} -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} +\item{lineage_col}{Character. The name of the lineage column, default is +"Lineage".} -\item{digits}{} +\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value +will not be shown. Default is 0.} + +\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage +cutoff. Default is FALSE.} + +\item{digits}{Numeric. Number of decimal places for percentage columns. +Default is 2.} } \value{ -Define return, in detail +A data frame with the following columns: +\itemize{ +\item \code{{{ column }}}: Unique values from the specified column. +\item \code{totalcount}: The total count of occurrences for each unique value in +the specified column. +\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to +the overall count. +\item \code{CumulativePercent}: The cumulative percentage of total counts. +} } \description{ Creates a data frame with a totalcount column diff --git a/man/words2wc.Rd b/man/words2wc.Rd index 1eba5dc4..69d006d5 100644 --- a/man/words2wc.Rd +++ b/man/words2wc.Rd @@ -7,10 +7,17 @@ words2wc(string) } \arguments{ -\item{string}{} +\item{string}{A character string containing the elements (words) to count. 
+This would typically be a space-delimited string representing domain +architectures or genomic contexts.} } \value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} } \description{ Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} From 30d4bf3ab57c6296a81d6f792911c87586ca896e Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Sat, 5 Oct 2024 12:29:37 +0100 Subject: [PATCH 02/61] usethis::pr_init("Implement error handling in acc2lin.R functions - Added validation checks for input parameters (accessions, ipg_file, assembly_path, lineagelookup_path). - Included error messages for missing or invalid inputs and file existence checks. - Wrapped main logic in tryCatch for graceful error handling during execution. ") --- R/acc2lin.R | 267 ++++++++++++++++++++++++++++++++++------------ man/acc2lin.Rd | 3 +- man/efetch_ipg.Rd | 3 +- man/ipg2lin.Rd | 3 +- man/sink.reset.Rd | 1 + 5 files changed, 207 insertions(+), 70 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index f8d71949..dfb33da9 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -10,6 +10,8 @@ #' Sink Reset #' #' @return No return, but run to close all outstanding `sink()`s +#' and handles any errors or warnings that occur during the process. 
+#' #' @export #' #' @examples @@ -17,9 +19,19 @@ #' sink.reset() #' } sink.reset <- function() { + # Handle all errors and warnings + tryCatch({ for (i in seq_len(sink.number())) { - sink(NULL) + sink(NULL) } + print("All sinks closed") + }, error = function(e) { + print(paste("Error: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("resetSink function execution completed.") + }) } @@ -44,23 +56,61 @@ sink.reset <- function() { #' add_lins() #' } add_lins <- function(df, acc_col = "AccNum", assembly_path, - lineagelookup_path, ipgout_path = NULL, plan = "sequential") { - s_acc_col <- sym(acc_col) - accessions <- df %>% pull(acc_col) - lins <- acc2lin(accessions, assembly_path, lineagelookup_path, ipgout_path, plan) + lineagelookup_path, ipgout_path = NULL, + plan = "sequential") { + # check for validate inputs + if (!is.data.frame(df)) { + stop("Input 'df' must be a data frame.") + } + + if (!acc_col %in% colnames(df)) { + stop(paste("Column", acc_col, "not found in data frame.")) + } + + # Ensure paths are character strings + if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { + stop("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.") + } + + # Ensure paths exist + if (!file.exists(assembly_path)) { + stop(paste("Assembly file not found at:", assembly_path)) + } - # Drop a lot of the unimportant columns for now? will make merging much easier - lins <- lins[, c( + if (!file.exists(lineagelookup_path)) { + stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + } + tryCatch({ + # Attempt to add lineages + acc_col <- sym(acc_col) + accessions <- df %>% pull(acc_col) + lins <- acc2lin( + accessions, assembly_path, lineagelookup_path, ipgout_path, plan + ) + + # Drop a lot of the unimportant columns for now? 
+ # will make merging much easier + lins <- lins[, c( "Strand", "Start", "Stop", "Nucleotide Accession", "Source", "Id", "Strain" - ) := NULL] - lins <- unique(lins) + ) := NULL] + lins <- unique(lins) + + # dup <- lins %>% group_by(Protein) %>% + # summarize(count = n()) %>% filter(count > 1) %>% + # pull(Protein) - # dup <- lins %>% group_by(Protein) %>% summarize(count = n()) %>% filter(count > 1) %>% - # pull(Protein) + merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) + return(merged) + }, error = function(e) { + print(paste("Error: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("addLineages function execution completed.") + }) - merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) - return(merged) } @@ -68,7 +118,8 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' #' @author Samuel Chen, Janani Ravi #' -#' @description This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set +#' @description This function combines 'efetch_ipg()' +#' and 'ipg2lin()' to map a set #' of protein accessions to their assembly (GCA_ID), tax ID, and lineage. #' #' @param accessions Character vector of protein accessions @@ -76,7 +127,8 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' @param ipgout_path Path to write the results of the efetch run of the accessions +#' @param ipgout_path Path to write the results +#' of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. 
Defaults to NULL #' @param plan #' @@ -87,27 +139,43 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' \dontrun{ #' acc2lin() #' } -acc2lin <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "sequential") { - tmp_ipg <- F - if (is.null(ipgout_path)) { - tmp_ipg <- T - ipgout_path <- tempfile("ipg", fileext = ".txt") - } +acc2lin <- function(accessions, assembly_path, + lineagelookup_path, ipgout_path = NULL, + plan = "sequential") { + tmp_ipg <- F + if (is.null(ipgout_path)) { + tmp_ipg <- T + ipgout_path <- tempfile("ipg", fileext = ".txt") + } + + lins <- NULL + tryCatch({ + # Attempt to fetch IPG efetch_ipg(accessions, out_path = ipgout_path, plan) + # Attempt to process IPG to lineages lins <- ipg2lin(accessions, ipgout_path, assembly_path, lineagelookup_path) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("acc2lin function execution completed.") + }) - if (tmp_ipg) { - unlink(tempdir(), recursive = T) - } - return(lins) + if (tmp_ipg) { + unlink(tempdir(), recursive = T) + } + return(lins) } + #' efetch_ipg #' #' @author Samuel Chen, Janani Ravi #' -#' @description Perform efetch on the ipg database and write the results to out_path +#' @description Perform efetch on the ipg database +#' and write the results to out_path #' #' @param accnums Character vector containing the accession numbers to query on #' the ipg database @@ -126,57 +194,84 @@ acc2lin <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = #' efetch_ipg() #' } efetch_ipg <- function(accnums, out_path, plan = "sequential") { - if (length(accnums) > 0) { - partition <- function(in_data, groups) { - # \\TODO This function should be defined outside of efetch_ipg(). 
It can be non-exported/internal - # Partition data to limit number of queries per second for rentrez fetch: - # limit of 10/second w/ key - l <- length(in_data) - - partitioned <- list() - for (i in 1:groups) - { - partitioned[[i]] <- in_data[seq.int(i, l, groups)] - } - - return(partitioned) - } + # Argument validation + if (!is.character(accnums) || length(accnums) == 0) { + stop("Error: 'accnums' must be a non-empty character vector.") + } + + if (!is.character(out_path) || nchar(out_path) == 0) { + stop("Error: 'out_path' must be a non-empty string.") + } + + if (!is.function(plan)) { + stop("Error: 'plan' must be a valid plan function.") + } + if (length(accnums) > 0) { + partition <- function(in_data, groups) { + # \\TODO This function should be defined outside of efetch_ipg(). + # It can be non-exported/internal + # Partition data to limit number of queries per second for rentrez fetch: + # limit of 10/second w/ key + l <- length(in_data) - plan(strategy = plan, .skip = T) - - - min_groups <- length(accnums) / 200 - groups <- min(max(min_groups, 15), length(accnums)) - partitioned_acc <- partition(accnums, groups) - sink(out_path) - - a <- future_map(1:length(partitioned_acc), function(x) { - # Avoid hitting the rate API limit - if (x %% 9 == 0) { - Sys.sleep(1) - } - cat( - entrez_fetch( - id = partitioned_acc[[x]], - db = "ipg", - rettype = "xml", - api_key = "YOUR_KEY_HERE" ## Can this be included in public package? 
- ) - ) - }) - sink(NULL) + partitioned <- list() + for (i in 1:groups){ + partitioned[[i]] <- in_data[seq.int(i, l, groups)] + } + + return(partitioned) } + tryCatch({ + # Set the future plan strategy + plan(strategy = plan, .skip = T) + + + min_groups <- length(accnums) / 200 + groups <- min(max(min_groups, 15), length(accnums)) + partitioned_acc <- partition(accnums, groups) + + # Open the sink to the output path + sink(out_path) + + a <- future_map(1:length(partitioned_acc), function(x) { + # Avoid hitting the rate API limit + if (x %% 9 == 0) { + Sys.sleep(1) + } + cat( + entrez_fetch( + id = partitioned_acc[[x]], + db = "ipg", + rettype = "xml", + api_key = "YOUR_KEY_HERE" ## Can this be included in public package? + ) + ) + }) + sink(NULL) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("efetch_ipg function execution completed.") + }) + } } + + #' ipg2lin #' #' @author Samuel Chen, Janani Ravi #' -#' @description Takes the resulting file of an efetch run on the ipg database and +#' @description Takes the resulting file +#' of an efetch run on the ipg database and #' #' @param accessions Character vector of protein accessions -#' @param ipg_file Filepath to the file containing results of an efetch run on the -#' ipg database. The protein accession in 'accessions' should be contained in this +#' @param ipg_file Filepath to the file +#' containing results of an efetch run on the +#' ipg database. 
The protein accession in +#' 'accessions' should be contained in this #' file #' @param assembly_path String of the path to the assembly_summary path #' This file can be generated using the "DownloadAssemblySummary()" function @@ -195,16 +290,54 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") { #' } #' ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) { + # Argument validation for accessions + if (!is.character(accessions) || length(accessions) == 0) { + stop("Input 'accessions' must be a non-empty character vector.") + } + + # check for validate inputs + if (!is.character(ipg_file)) { + stop("Input 'ipg_file' must be a character string.") + } + # Ensure paths are character strings + if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { + stop("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.") + } + + # Ensure paths exist + if (!file.exists(assembly_path)) { + stop(paste("Assembly file not found at:", assembly_path)) + } + + if (!file.exists(lineagelookup_path)) { + stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + } + + try({ + # Attempt to read the IPG file ipg_dt <- fread(ipg_file, sep = "\t", fill = T) + # Filter the IPG data table to only include the accessions ipg_dt <- ipg_dt[Protein %in% accessions] + # Rename the 'Assembly' column to 'GCA_ID' ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID") + # Convert the IPG data table to a lineage data table lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path) + + # Filter out rows with missing lineage information lins <- lins[!is.na(Lineage)] %>% unique() return(lins) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("ipg2lin function execution completed.") + }) } diff --git a/man/acc2lin.Rd b/man/acc2lin.Rd index 6255b290..d3f2468b 100644 --- a/man/acc2lin.Rd +++ 
b/man/acc2lin.Rd @@ -38,7 +38,8 @@ on the ipg database. If NULL, the file will not be written. Defaults to NULL} Describe return, in detail } \description{ -This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set +This function combines 'efetch_ipg()' +and 'ipg2lin()' to map a set of protein accessions to their assembly (GCA_ID), tax ID, and lineage. Function to map protein accession numbers to lineage diff --git a/man/efetch_ipg.Rd b/man/efetch_ipg.Rd index ec5b6bcb..1fbb9d92 100644 --- a/man/efetch_ipg.Rd +++ b/man/efetch_ipg.Rd @@ -23,7 +23,8 @@ the ipg database} Describe return, in detail } \description{ -Perform efetch on the ipg database and write the results to out_path +Perform efetch on the ipg database +and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd index 3a14eada..453668b0 100644 --- a/man/ipg2lin.Rd +++ b/man/ipg2lin.Rd @@ -38,7 +38,8 @@ This file can be generated using the "DownloadAssemblySummary()" function} Describe return, in detail } \description{ -Takes the resulting file of an efetch run on the ipg database and +Takes the resulting file +of an efetch run on the ipg database and Takes the resulting file of an efetch run on the ipg database and append lineage, and taxid columns diff --git a/man/sink.reset.Rd b/man/sink.reset.Rd index a31b841d..64087c49 100644 --- a/man/sink.reset.Rd +++ b/man/sink.reset.Rd @@ -8,6 +8,7 @@ sink.reset() } \value{ No return, but run to close all outstanding \code{sink()}s +and handles any errors or warnings that occur during the process. } \description{ Sink Reset From aff97e433e5a0c367dfbb8f284ea200e1876a5da Mon Sep 17 00:00:00 2001 From: teddyCodex <15376476+teddyCodex@users.noreply.github.com> Date: Sat, 5 Oct 2024 16:16:51 +0100 Subject: [PATCH 03/61] Update CONTRIBUTING.md Added a couple of clearer steps to the pull request process. 
--- .github/CONTRIBUTING.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 180ecf6c..5f240176 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -18,8 +18,17 @@ See our guide on [how to create a great issue](https://code-review.tidyverse.org ### Pull request process -* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE)`. - +* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis`. + +* Install and load the `usethis` package with: + ``` + install.packages("usethis") + library(usethis) + ``` +* Clone and fork the MolEvolvR package using: + ``` + usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE) + ``` * Install all development dependencies with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`. If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing. * Create a Git branch for your pull request (PR). We recommend using `usethis::pr_init("brief-description-of-change")`. 
From 3a0376fc7024f6069580ce2059c27510bffb16d0 Mon Sep 17 00:00:00 2001 From: teddyCodex <15376476+teddyCodex@users.noreply.github.com> Date: Sat, 5 Oct 2024 16:29:11 +0100 Subject: [PATCH 04/61] Update CONTRIBUTING.md --- .github/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 5f240176..9465c683 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -23,7 +23,7 @@ See our guide on [how to create a great issue](https://code-review.tidyverse.org * Install and load the `usethis` package with: ``` install.packages("usethis") - library(usethis) + library("usethis") ``` * Clone and fork the MolEvolvR package using: ``` From 67a6d0eaeded334e6869621a70c781cd917dd3bc Mon Sep 17 00:00:00 2001 From: teddyCodex <15376476+teddyCodex@users.noreply.github.com> Date: Sun, 6 Oct 2024 09:30:55 +0100 Subject: [PATCH 05/61] Update CONTRIBUTING.md to include explicit installation steps and improved clarity for development process - Added explicit instructions for installing and loading the `usethis`, `devtools`, and `lintr` packages. - Overall improvements to make the documentation more user-friendly, especially for new contributors. --- .github/CONTRIBUTING.md | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 9465c683..5db3f961 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -23,15 +23,29 @@ See our guide on [how to create a great issue](https://code-review.tidyverse.org * Install and load the `usethis` package with: ``` install.packages("usethis") + library("usethis") ``` * Clone and fork the MolEvolvR package using: ``` usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE) ``` -* Install all development dependencies with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`. 
- If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing. -* Create a Git branch for your pull request (PR). We recommend using `usethis::pr_init("brief-description-of-change")`. +* Install all development dependencies and then make sure the package passes R CMD check using `devtools`: + ``` + install.packages("devtools") + + library("devtools") + + devtools::install_dev_deps() + + devtools::check() + ``` + _If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing._ + +* Create a Git branch for your pull request (PR). We recommend using + ``` + usethis::pr_init("brief-description-of-change") + ``` * Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser. The title of your PR should briefly describe the change. @@ -44,7 +58,14 @@ See our guide on [how to create a great issue](https://code-review.tidyverse.org * New code should follow the tidyverse [style guide](https://style.tidyverse.org). You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR. -* Lint Your Code: Ensure your code adheres to our style guidelines by using [lintr](https://lintr.r-lib.org/): `lintr::lint("path/to/your/file.R")` +* Lint Your Code: Ensure your code adheres to our style guidelines by using [lintr](https://lintr.r-lib.org/): + ``` + install.packages("lintr") + + library("lintr") + + lintr::lint("path/to/your/file.R") + ``` * We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. 
From d9fa04bc729586ab336275083d67fb75420ac138 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Mon, 7 Oct 2024 07:42:08 +0300 Subject: [PATCH 06/61] use one documentation file Signed-off-by: Awa Synthia --- R/summarize.R | 16 ++- man/count_bycol.Rd | 38 ------ man/elements2words.Rd | 40 ------- man/filter_by_doms.Rd | 44 ------- man/filter_freq.Rd | 28 ----- man/summ.DA.Rd | 31 ----- man/summ.DA.byLin.Rd | 27 ----- man/summ.GC.Rd | 32 ----- man/summ.GC.byDALin.Rd | 33 ------ man/summ.GC.byLin.Rd | 22 ---- man/summarize.Rd | 260 +++++++++++++++++++++++++++++++++++++++++ man/summarize_bylin.Rd | 36 ------ man/total_counts.Rd | 58 --------- man/words2wc.Rd | 32 ----- 14 files changed, 274 insertions(+), 423 deletions(-) delete mode 100644 man/count_bycol.Rd delete mode 100644 man/elements2words.Rd delete mode 100644 man/filter_by_doms.Rd delete mode 100644 man/filter_freq.Rd delete mode 100644 man/summ.DA.Rd delete mode 100644 man/summ.DA.byLin.Rd delete mode 100644 man/summ.GC.Rd delete mode 100644 man/summ.GC.byDALin.Rd delete mode 100644 man/summ.GC.byLin.Rd create mode 100644 man/summarize.Rd delete mode 100644 man/summarize_bylin.Rd delete mode 100644 man/total_counts.Rd delete mode 100644 man/words2wc.Rd diff --git a/R/summarize.R b/R/summarize.R index e03ca463..0580c15d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -29,6 +29,7 @@ #' #' @return Filtered data frame #' @note There is no need to make the domains 'regex safe', that will be handled by this function +#' @name summarize #' @export #' #' @examples @@ -110,7 +111,7 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov #' The tibble is filtered to only include elements that have a frequency #' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). 
-#' +#' @name summarize #' @export #' #' @examples @@ -155,6 +156,7 @@ count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @return A single string where elements are delimited by spaces. The function #' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. +#' @name summarize #' #' @examples #' \dontrun{ @@ -212,6 +214,8 @@ elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' (domains or domain architectures).} #' \item{`freq`}{A column containing the frequency counts for each word.} #' } +#' +#' @name summarize #' #' @examples #' \dontrun{ @@ -259,6 +263,7 @@ words2wc <- function(string) { #' #' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. +#' @name summarize #' #' @export #' @@ -290,6 +295,7 @@ filter_freq <- function(x, min.freq) { #' @return A tibble summarizing the counts of occurrences of elements in #' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. +#' @name summarize #' @export #' #' @examples @@ -335,6 +341,7 @@ summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", #' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency #' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. +#' @name summarize #' #' @export #' @@ -369,6 +376,7 @@ summ.DA.byLin <- function(x) { #' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. +#' @name summarize #' @export #' #' @examples @@ -401,6 +409,7 @@ summ.DA <- function(x) { #' `GenContext`, `DomArch`, and `Lineage`. 
#' #' The results are arranged in descending order of `count`. +#' @name summarize #' @export #' #' @examples @@ -421,11 +430,12 @@ summ.GC.byDALin <- function(x) { #' summ.GC.byLin #' -#' @param x +#' @param x A dataframe or tibble containing the data. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' #' @return Describe return, in detail +#' @name summarize #' @export #' #' @examples @@ -459,6 +469,7 @@ summ.GC.byLin <- function(x) { #' `GenContext` and `Lineage`. #' #' The results are arranged in descending order of `count`. +#' @name summarize #' @export #' #' @examples @@ -512,6 +523,7 @@ summ.GC <- function(x) { #' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. +#' @name summarize #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or diff --git a/man/count_bycol.Rd b/man/count_bycol.Rd deleted file mode 100644 index 946a7ea2..00000000 --- a/man/count_bycol.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{count_bycol} -\alias{count_bycol} -\title{Count Bycol} -\usage{ -count_bycol(prot = prot, column = "DomArch", min.freq = 1) -} -\arguments{ -\item{prot}{A data frame containing the dataset to analyze, typically with -multiple columns including the one specified by the \code{column} parameter.} - -\item{column}{A character string specifying the name of the column to analyze. -The default is "DomArch".} - -\item{min.freq}{An integer specifying the minimum frequency an element must -have to be included in the output. 
Default is 1.} -} -\value{ -A tibble with two columns: -\describe{ -\item{\code{column}}{The unique elements from the specified column -(e.g., "DomArch").} -\item{\code{freq}}{The frequency of each element, i.e., the number of times -each element appears in the specified column.} -} -The tibble is filtered to only include elements that have a frequency -greater than or equal to \code{min.freq} and does not include elements with \code{NA} -values or those starting with a hyphen ("-"). -} -\description{ -Count Bycol -} -\examples{ -\dontrun{ -count_bycol(prot = my_data, column = "DomArch", min.freq = 10) -} -} diff --git a/man/elements2words.Rd b/man/elements2words.Rd deleted file mode 100644 index bda447db..00000000 --- a/man/elements2words.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{elements2words} -\alias{elements2words} -\title{Elements 2 Words} -\usage{ -elements2words(prot, column = "DomArch", conversion_type = "da2doms") -} -\arguments{ -\item{prot}{A dataframe containing the dataset to analyze. The specified -\code{column} contains the string elements to be processed.} - -\item{column}{A character string specifying the name of the column to analyze. -Default is "DomArch".} - -\item{conversion_type}{A character string specifying the type of conversion. -Two options are available: -\describe{ -\item{\code{da2doms}}{Convert domain architectures into individual domains by -replacing \code{+} symbols with spaces.} -\item{\code{gc2da}}{Convert genomic context into domain architectures by -replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} -}} -} -\value{ -A single string where elements are delimited by spaces. The function -performs necessary substitutions based on the \code{conversion_type} and cleans up -extraneous characters like newlines, tabs, and multiple spaces. 
-} -\description{ -Break string ELEMENTS into WORDS for domain architecture (DA) and genomic -context (GC) -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", -"a+b", "b+c", "b-c")) |> elements2words() -} - -} diff --git a/man/filter_by_doms.Rd b/man/filter_by_doms.Rd deleted file mode 100644 index cfe255ca..00000000 --- a/man/filter_by_doms.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filter_by_doms} -\alias{filter_by_doms} -\title{Filter by Domains} -\usage{ -filter_by_doms( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) -} -\arguments{ -\item{prot}{Dataframe to filter} - -\item{column}{Column to search for domains in (DomArch column)} - -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} -} -\value{ -Filtered data frame -} -\description{ -filter_by_doms filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain -} -\note{ -There is no need to make the domains 'regex safe', that will be handled by this function -} -\examples{ -\dontrun{ -filter_by_doms() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/filter_freq.Rd b/man/filter_freq.Rd deleted file mode 100644 index 9dfba73b..00000000 --- a/man/filter_freq.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filter_freq} -\alias{filter_freq} -\title{Filter Frequency} -\usage{ -filter_freq(x, min.freq) -} -\arguments{ -\item{x}{A tibble (tbl_df) containing at least two columns: one for -elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} - -\item{min.freq}{A numeric 
value specifying the minimum frequency threshold. -Only elements with frequencies greater than or equal to this value will be -retained.} -} -\value{ -A tibble with the same structure as \code{x}, but filtered to include -only rows where the frequency is greater than or equal to \code{min.freq}. -} -\description{ -Filter Frequency -} -\examples{ -\dontrun{ -filter_freq() -} -} diff --git a/man/summ.DA.Rd b/man/summ.DA.Rd deleted file mode 100644 index 01d15b3c..00000000 --- a/man/summ.DA.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.DA} -\alias{summ.DA} -\title{summ.DA} -\usage{ -summ.DA(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have a column -named \code{DomArch} and a count column, such as \code{count}, which represents the -occurrences of each architecture in various lineages.} -} -\value{ -A tibble summarizing each unique \code{DomArch}, along with the following -columns: -\itemize{ -\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. -\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} -appears. -The results are arranged in descending order of \code{totallin} and \code{totalcount}. -} -} -\description{ -Function to retrieve counts of how many lineages a DomArch appears in -} -\examples{ -\dontrun{ -summ.DA() -} -} diff --git a/man/summ.DA.byLin.Rd b/man/summ.DA.byLin.Rd deleted file mode 100644 index d88e5d37..00000000 --- a/man/summ.DA.byLin.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.DA.byLin} -\alias{summ.DA.byLin} -\title{summ.DA.byLin} -\usage{ -summ.DA.byLin(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. 
It must have columns -named \code{DomArch} and \code{Lineage}.} -} -\value{ -A tibble summarizing the counts of unique domain architectures -(\code{DomArch}) per lineage (\code{Lineage}). The resulting table contains three -columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency -of each domain architecture for each lineage. The results are arranged in -descending order of \code{count}. -} -\description{ -Function to summarize and retrieve counts by Domains & Domains+Lineage -} -\examples{ -\dontrun{ -summ.DA.byLin() -} -} diff --git a/man/summ.GC.Rd b/man/summ.GC.Rd deleted file mode 100644 index 2ec4d651..00000000 --- a/man/summ.GC.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC} -\alias{summ.GC} -\title{summ.GC} -\usage{ -summ.GC(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} -} -\value{ -A tibble summarizing each unique combination of \code{GenContext} and -\code{Lineage}, along with the following columns: -\itemize{ -\item \code{GenContext}: The genomic context for each entry. -\item \code{Lineage}: The lineage associated with each entry. -\item \code{count}: The total number of occurrences for each combination of -\code{GenContext} and \code{Lineage}. -} - -The results are arranged in descending order of \code{count}. -} -\description{ -summ.GC -} -\examples{ -\dontrun{ -summ.GC() -} -} diff --git a/man/summ.GC.byDALin.Rd b/man/summ.GC.byDALin.Rd deleted file mode 100644 index 7fc8d443..00000000 --- a/man/summ.GC.byDALin.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC.byDALin} -\alias{summ.GC.byDALin} -\title{summ.GC.byDALin} -\usage{ -summ.GC.byDALin(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. 
It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} -} -\value{ -A tibble summarizing each unique combination of \code{GenContext}, -\code{DomArch}, and \code{Lineage}, along with the following columns: -\itemize{ -\item \code{GenContext}: The genomic context for each entry. -\item \code{DomArch}: The domain architecture for each entry. -\item \code{Lineage}: The lineage associated with each entry. -\item \code{count}: The total number of occurrences for each combination of -\code{GenContext}, \code{DomArch}, and \code{Lineage}. -} - -The results are arranged in descending order of \code{count}. -} -\description{ -summ.GC.byDALin -} -\examples{ -\dontrun{ -summ.GC.byDALin -} -} diff --git a/man/summ.GC.byLin.Rd b/man/summ.GC.byLin.Rd deleted file mode 100644 index df2a8fb8..00000000 --- a/man/summ.GC.byLin.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC.byLin} -\alias{summ.GC.byLin} -\title{summ.GC.byLin} -\usage{ -summ.GC.byLin(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summ.GC.byLin -} -\examples{ -\dontrun{ -summ.GC.byLin() -} -} diff --git a/man/summarize.Rd b/man/summarize.Rd new file mode 100644 index 00000000..f149f686 --- /dev/null +++ b/man/summarize.Rd @@ -0,0 +1,260 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{summarize} +\alias{summarize} +\alias{filter_by_doms} +\alias{count_bycol} +\alias{elements2words} +\alias{words2wc} +\alias{filter_freq} +\alias{summarize_bylin} +\alias{summ.DA.byLin} +\alias{summ.DA} +\alias{summ.GC.byDALin} +\alias{summ.GC.byLin} +\alias{summ.GC} +\alias{total_counts} +\title{Filter by Domains} +\usage{ +filter_by_doms( + prot, + column = "DomArch", + doms_keep = c(), + doms_remove = c(), + ignore.case = FALSE +) + +count_bycol(prot = prot, column = "DomArch", min.freq = 1) + +elements2words(prot, 
column = "DomArch", conversion_type = "da2doms") + +words2wc(string) + +filter_freq(x, min.freq) + +summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) + +summ.DA.byLin(x) + +summ.DA(x) + +summ.GC.byDALin(x) + +summ.GC.byLin(x) + +summ.GC(x) + +total_counts( + prot, + column = "DomArch", + lineage_col = "Lineage", + cutoff = 90, + RowsCutoff = FALSE, + digits = 2 +) +} +\arguments{ +\item{prot}{A data frame that must contain columns: +\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} + +\item{column}{Character. The column to summarize, default is "DomArch".} + +\item{doms_keep}{Vector of domains that must be identified within column in order for +observation to be kept} + +\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} + +\item{ignore.case}{Should the matching be non case sensitive} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} + +\item{conversion_type}{A character string specifying the type of conversion. +Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} + +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} + +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} + +\item{by}{A string representing the grouping column (e.g., \code{Lineage}). +Default is "Lineage".} + +\item{query}{A string specifying the query pattern for filtering the target +column. 
Use "all" to skip filtering and include all rows.} + +\item{lineage_col}{Character. The name of the lineage column, default is +"Lineage".} + +\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value +will not be shown. Default is 0.} + +\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage +cutoff. Default is FALSE.} + +\item{digits}{Numeric. Number of decimal places for percentage columns. +Default is 2.} +} +\value{ +Filtered data frame + +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). + +A single string where elements are delimited by spaces. The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. + +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} + +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. + +A tibble summarizing the counts of occurrences of elements in +the \code{column}, grouped by the \code{by} column. The result includes the number +of occurrences (\code{count}) and is arranged in descending order of count. + +A tibble summarizing the counts of unique domain architectures +(\code{DomArch}) per lineage (\code{Lineage}). 
The resulting table contains three +columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency +of each domain architecture for each lineage. The results are arranged in +descending order of \code{count}. + +A tibble summarizing each unique \code{DomArch}, along with the following +columns: +\itemize{ +\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. +\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} +appears. +The results are arranged in descending order of \code{totallin} and \code{totalcount}. +} + +A tibble summarizing each unique combination of \code{GenContext}, +\code{DomArch}, and \code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{DomArch}: The domain architecture for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext}, \code{DomArch}, and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. + +Describe return, in detail + +A tibble summarizing each unique combination of \code{GenContext} and +\code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext} and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. + +A data frame with the following columns: +\itemize{ +\item \code{{{ column }}}: Unique values from the specified column. +\item \code{totalcount}: The total count of occurrences for each unique value in +the specified column. +\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to +the overall count. 
+\item \code{CumulativePercent}: The cumulative percentage of total counts. +} +} +\description{ +filter_by_doms filters a data frame by identifying exact domain matches +and either keeping or removing rows with the identified domain + +Break string ELEMENTS into WORDS for domain architecture (DA) and genomic +context (GC) + +Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} + +Function to summarize and retrieve counts by Domains & Domains+Lineage + +Function to retrieve counts of how many lineages a DomArch appears in + +Creates a data frame with a totalcount column + +This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. +} +\note{ +There is no need to make the domains 'regex safe', that will be handled by this function + +Please refer to the source code if you have alternate file formats and/or +column names. +} +\examples{ +\dontrun{ +filter_by_doms() +} +\dontrun{ +count_bycol(prot = my_data, column = "DomArch", min.freq = 10) +} +\dontrun{ +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2words() +} + +\dontrun{ +tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> + elements2words() |> + words2wc() +} + +\dontrun{ +filter_freq() +} +\dontrun{ +library(tidyverse) +tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> + summarize_bylin(query = "all") +} + +\dontrun{ +summ.DA.byLin() +} +\dontrun{ +summ.DA() +} +\dontrun{ +summ.GC.byDALin +} +\dontrun{ +summ.GC.byLin() +} +\dontrun{ +summ.GC() +} +\dontrun{ +total_counts(pspa - gc_lin_counts, 0, "GC") +} +} +\author{ +Samuel Chen, Janani Ravi +} diff --git a/man/summarize_bylin.Rd b/man/summarize_bylin.Rd deleted file mode 100644 index 92b93652..00000000 --- a/man/summarize_bylin.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarize_bylin} 
-\alias{summarize_bylin} -\title{Summarize by Lineage} -\usage{ -summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{prot}{A dataframe or tibble containing the data.} - -\item{column}{A string representing the column to be summarized -(e.g., \code{DomArch}). Default is "DomArch".} - -\item{by}{A string representing the grouping column (e.g., \code{Lineage}). -Default is "Lineage".} - -\item{query}{A string specifying the query pattern for filtering the target -column. Use "all" to skip filtering and include all rows.} -} -\value{ -A tibble summarizing the counts of occurrences of elements in -the \code{column}, grouped by the \code{by} column. The result includes the number -of occurrences (\code{count}) and is arranged in descending order of count. -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarize_bylin(query = "all") -} - -} diff --git a/man/total_counts.Rd b/man/total_counts.Rd deleted file mode 100644 index 53d70096..00000000 --- a/man/total_counts.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{total_counts} -\alias{total_counts} -\title{Total Counts} -\usage{ -total_counts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize, default is "DomArch".} - -\item{lineage_col}{Character. The name of the lineage column, default is -"Lineage".} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value -will not be shown. Default is 0.} - -\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage -cutoff. 
Default is FALSE.} - -\item{digits}{Numeric. Number of decimal places for percentage columns. -Default is 2.} -} -\value{ -A data frame with the following columns: -\itemize{ -\item \code{{{ column }}}: Unique values from the specified column. -\item \code{totalcount}: The total count of occurrences for each unique value in -the specified column. -\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to -the overall count. -\item \code{CumulativePercent}: The cumulative percentage of total counts. -} -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. -} -\examples{ -\dontrun{ -total_counts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wc.Rd b/man/words2wc.Rd deleted file mode 100644 index 69d006d5..00000000 --- a/man/words2wc.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{words2wc} -\alias{words2wc} -\title{Words 2 Word Counts} -\usage{ -words2wc(string) -} -\arguments{ -\item{string}{A character string containing the elements (words) to count. 
-This would typically be a space-delimited string representing domain -architectures or genomic contexts.} -} -\value{ -A tibble (tbl_df) with two columns: -\describe{ -\item{\code{words}}{A column containing the individual words -(domains or domain architectures).} -\item{\code{freq}}{A column containing the frequency counts for each word.} -} -} -\description{ -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2words() |> - words2wc() -} - -} From 5758ad993fa2b80cba9297a83786a4a59556e544 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Tue, 8 Oct 2024 00:29:13 +0300 Subject: [PATCH 07/61] add error handling in tree.R Signed-off-by: Awa Synthia --- R/tree.R | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/R/tree.R b/R/tree.R index 01e9ead5..9386bbfe 100755 --- a/R/tree.R +++ b/R/tree.R @@ -51,6 +51,30 @@ convert_fa2tre <- function(fa_path = here("data/alns/pspa_snf7.fa"), # fa_path=here("data/alns/pspa_snf7.fa") # tre_path=here("data/alns/pspa_snf7.tre") # fasttree_path=here("src/FastTree") + + # Check if the FASTA file exists + if (!file.exists(fa_path)) { + stop(paste("Error: The FASTA file does not exist at:", fa_path)) + } + + # Check if the FastTree executable exists + if (!file.exists(fasttree_path)) { + stop(paste("Error: The FastTree executable does not exist at:", + fasttree_path)) + } + + # Check if the output directory exists + tre_dir <- dirname(tre_path) + if (!dir.exists(tre_dir)) { + stop(paste("Error: The output directory does not exist:", tre_dir)) + } + + # Check if the output file already exists + if (file.exists(tre_path)) { + cat("Warning: The output file already exists and will be overwritten:", + tre_path, "\n") + } + print(fa_path) system2( command = fasttree_path, @@ -83,8 +107,18 @@ convert_fa2tre <- function(fa_path = here("data/alns/pspa_snf7.fa"), #' #' @examples 
generate_trees <- function(aln_path = here("data/alns/")) { + + # Check if the alignment directory exists + if (!dir.exists(aln_path)) { + stop(paste("Error: The alignment directory does not exist:", aln_path)) + } # finding all fasta alignment files fa_filenames <- list.files(path = aln_path, pattern = "*.fa") + # Check if any FASTA files were found + if (length(fa_filenames) == 0) { + stop("Error: No FASTA files found in the specified directory.") + } + fa_paths <- paste0(aln_path, fa_filenames) variable <- str_replace_all(basename(fa_filenames), pattern = ".fa", replacement = "" @@ -139,6 +173,23 @@ generate_fa2tre <- function(fa_file = "data/alns/pspa_snf7.fa", ## SAMPLE ARGS # fa_file="data/alns/pspa_snf7.fa" # out_file="data/alns/pspa_snf7.tre" + + # Check if the FASTA file exists + if (!file.exists(fa_file)) { + stop(paste("Error: The FASTA file does not exist at:", fa_file)) + } + + # Check if the output directory exists + out_dir <- dirname(out_file) + if (!dir.exists(out_dir)) { + stop(paste("Error: The output directory does not exist:", out_dir)) + } + + # Check if the output file already exists + if (file.exists(out_file)) { + cat("Warning: The output file already exists and will be overwritten:", + out_file, "\n") + } ########################### ## Approach 1 From bf40f2da6cb35beb466a92dadf5e39c943b35d5d Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Tue, 8 Oct 2024 00:45:02 +0300 Subject: [PATCH 08/61] add error handling Signed-off-by: Awa Synthia --- R/summarize.R | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/R/summarize.R b/R/summarize.R index a9b13e43..4b0eaa55 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -41,6 +41,23 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov # Any row containing a domain in doms_remove will be removed # ^word$|(?<=\+)word$|(?<=\+)word(?=\+)|word(?=\+) + + # Check if prot is a data frame + if (!is.data.frame(prot)) { + 
stop("Error: 'prot' must be a data frame.") + } + + # Check if the specified column exists in the data frame + if (!column %in% names(prot)) { + stop(paste("Error: The specified column '", column, "' does not exist + in the data frame.", sep = "")) + } + + # If doms_keep or doms_remove are not provided, inform the user + if (length(doms_keep) == 0 && length(doms_remove) == 0) { + warning("Warning: No domains specified to keep or remove. Returning the + original data frame.") + } # Make regex safe doms_keep <- str_replace_all(string = doms_keep, pattern = "\\(", replacement = "\\\\(") @@ -105,6 +122,23 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov #' count_bycol() #' } count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { + + # Check if 'prot' is a data frame + if (!is.data.frame(prot)) { + stop("Error: 'prot' must be a data frame.") + } + + # Check if the specified column exists in the data frame + if (!column %in% names(prot)) { + stop(paste("Error: The specified column '", column, "' does not exist in + the data frame.", sep = "")) + } + + # Check if min.freq is a positive integer + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + floor(min.freq) != min.freq) { + stop("Error: 'min.freq' must be a positive integer.") + } counts <- prot %>% select(column) %>% table() %>% @@ -139,6 +173,24 @@ count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { #' } #' elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms") { + # Check if 'prot' is a data frame + if (!is.data.frame(prot)) { + stop("Error: 'prot' must be a data frame.") + } + + # Check if the specified column exists in the data frame + if (!column %in% names(prot)) { + stop(paste("Error: The specified column '", column, "' does not exist in + the data frame.", sep = "")) + } + + # Check for valid conversion_type values + valid_types <- c("da2doms", "doms2da") + if (!conversion_type %in% 
valid_types) { + stop(paste("Error: Invalid 'conversion_type'. Must be one of:", + paste(valid_types, collapse = ", "))) + } + z1 <- prot %>% dplyr::pull(column) %>% str_replace_all("\\,", " ") %>% @@ -189,6 +241,11 @@ elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' } #' words2wc <- function(string) { + # Check if 'string' is a character vector of length 1 + if (!is.character(string) || length(string) != 1) { + stop("Error: 'string' must be a single character vector.") + } + df_word_count <- string %>% # reduce spaces with length 2 or greater to a single space str_replace_all("\\s{2,}", " ") %>% @@ -230,6 +287,22 @@ words2wc <- function(string) { #' filter_freq() #' } filter_freq <- function(x, min.freq) { + + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } + + # Check if 'min.freq' is a positive integer + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + floor(min.freq) != min.freq) { + stop("Error: 'min.freq' must be a positive integer.") + } + + # Check if the 'freq' column exists in the data frame + if (!"freq" %in% names(x)) { + stop("Error: The data frame must contain a 'freq' column.") + } x %>% filter(freq >= min.freq) } @@ -259,6 +332,23 @@ filter_freq <- function(x, min.freq) { #' summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", query) { + # Check if 'prot' is a data frame + if (!is.data.frame(prot)) { + stop("Error: 'prot' must be a data frame.") + } + + # Check if the specified column exists in the data frame + if (!column %in% names(prot)) { + stop(paste("Error: The specified column '", column, "' does not exist in + the data frame.", sep = "")) + } + + # Check if the 'by' column exists in the data frame + if (!by %in% names(prot)) { + stop(paste("Error: The specified 'by' column '", by, "' does not exist + n the data frame.", sep = "")) + } + column <- sym(column) by <- sym(by) if (query == "all") { @@ -295,6 
+385,19 @@ summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", #' summ.DA.byLin() #' } summ.DA.byLin <- function(x) { + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } + + # Check if required columns exist in the data frame + required_columns <- c("DomArch", "Lineage") + missing_columns <- setdiff(required_columns, names(x)) + + if (length(missing_columns) > 0) { + stop(paste("Error: The following required columns are + missing:", paste(missing_columns, collapse = ", "))) + } ## Note: it is better to reserve dots for S3 Objects. Consider replacing '.' with '_' x %>% filter(!grepl("^-$", DomArch)) %>% @@ -321,6 +424,10 @@ summ.DA.byLin <- function(x) { #' summ.DA() #' } summ.DA <- function(x) { + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } ## Note: it is better to reserve dots for S3 Objects. Consider replacing '.' with '_' x %>% group_by(DomArch) %>% @@ -344,6 +451,10 @@ summ.DA <- function(x) { #' summ.GC.byDALin #' } summ.GC.byDALin <- function(x) { + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } ## Note: it is better to reserve dots for S3 Objects. Consider replacing '.' with '_' x %>% filter(!grepl("^-$", GenContext)) %>% @@ -369,6 +480,10 @@ summ.GC.byDALin <- function(x) { #' summ.GC.byLin() #' } summ.GC.byLin <- function(x) { + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } ## Note: it is better to reserve dots for S3 Objects. Consider replacing '.' with '_' x %>% filter(!grepl("^-$", GenContext)) %>% @@ -394,6 +509,10 @@ summ.GC.byLin <- function(x) { #' summ.GC() #' } summ.GC <- function(x) { + # Check if 'x' is a data frame + if (!is.data.frame(x)) { + stop("Error: 'x' must be a data frame.") + } ## Note: it is better to reserve dots for S3 Objects. Consider replacing '.' 
with '_' x %>% group_by(GenContext) %>% @@ -442,6 +561,31 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", cutoff = 90, RowsCutoff = FALSE, digits = 2 # type = "GC" ) { + # Check if 'prot' is a data frame + if (!is.data.frame(prot)) { + stop("Error: 'prot' must be a data frame.") + } + + # Check if the specified columns exist in the data frame + required_columns <- c(column, lineage_col) + missing_columns <- setdiff(required_columns, names(prot)) + + if (length(missing_columns) > 0) { + stop(paste("Error: The following required columns are missing:", + paste(missing_columns, collapse = ", "))) + } + + # Check that cutoff is a numeric value between 0 and 100 + if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) { + stop("Error: 'cutoff' must be a numeric value between 0 and 100.") + } + + # Check that digits is a non-negative integer + if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || + floor(digits) != digits) { + stop("Error: 'digits' must be a non-negative integer.") + } + column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% @@ -601,6 +745,11 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", #' find_paralogs(pspa) #' } find_paralogs <- function(prot) { + # Check if 'prot' is a data frame + if (!is.data.frame(prot)) { + stop("Error: 'prot' must be a data frame.") + } + # Remove eukaryotes prot <- prot %>% filter(!grepl("^eukaryota", Lineage)) paralogTable <- prot %>% From 4aeaa113927b6f94b21c9f0dd0956bb7e48004a5 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Mon, 7 Oct 2024 22:50:16 +0100 Subject: [PATCH 09/61] Add error handling to multiple functions - Implement error handling for mapOption2Process, get_proc_medians, write_proc_medians_table, get_proc_weights, advanced_opts2est_walltime, assign_job_queue, and plot_estimated_walltimes . - Validate input arguments for each function to ensure they meet expected criteria. 
- Use tryCatch blocks to gracefully handle errors and warnings. - Provide informative error messages and detailed logging where appropriate. - Ensure functions fail gracefully and provide useful feedback. Also renamed the functions to the following; assign_job_queue -> assignJobQueue make_opts2procs -> mapOption2Process map_advanced_opts2procs -> mapAdvOption2Process get_proc_medians - calculateProcessRuntime write_proc_medians_table -> writeProcessRuntime2TSV write_proc_medians_yml -> writeProcessRuntime2YML get_proc_weights -> getProcessRuntimeWeights advanced_opts2est_walltime -> calculateEstimatedWallTimeFromOpts plot_estimated_walltimes -> plotEstimatedWallTimes --- NAMESPACE | 26 +- R/assign_job_queue.R | 484 ++++++++++++------ R/clean_clust_file.R | 4 +- R/combine_analysis.R | 4 +- R/combine_files.R | 10 +- R/create_lineage_lookup.R | 6 +- ...{assign_job_queue.Rd => assignJobQueue.Rd} | 13 +- ... calculateEstimatedWallTimeFromOptions.Rd} | 12 +- ..._medians.Rd => calculateProcessRuntime.Rd} | 10 +- ...lean_clust_file.Rd => cleanClusterFile.Rd} | 8 +- man/{combine_files.Rd => combineFiles.Rd} | 6 +- ...combine_full.Rd => combineFullAnalysis.Rd} | 6 +- man/{combine_ipr.Rd => combineIPR.Rd} | 6 +- ...neage_lookup.Rd => createLineageLookup.Rd} | 6 +- ...weights.Rd => getProcessRuntimeWeights.Rd} | 8 +- ..._opts2procs.Rd => mapAdvOption2Process.Rd} | 8 +- ...ake_opts2procs.Rd => mapOption2Process.Rd} | 8 +- ...walltimes.Rd => plotEstimatedWallTimes.Rd} | 11 +- ...ns_table.Rd => writeProcessRuntime2TSV.Rd} | 8 +- ...ans_yml.Rd => writeProcessRuntimeToYML.Rd} | 13 +- 20 files changed, 416 insertions(+), 241 deletions(-) rename man/{assign_job_queue.Rd => assignJobQueue.Rd} (64%) rename man/{advanced_opts2est_walltime.Rd => calculateEstimatedWallTimeFromOptions.Rd} (68%) rename man/{get_proc_medians.Rd => calculateProcessRuntime.Rd} (76%) rename man/{clean_clust_file.Rd => cleanClusterFile.Rd} (82%) rename man/{combine_files.Rd => combineFiles.Rd} (92%) rename 
man/{combine_full.Rd => combineFullAnalysis.Rd} (69%) rename man/{combine_ipr.Rd => combineIPR.Rd} (74%) rename man/{create_lineage_lookup.Rd => createLineageLookup.Rd} (91%) rename man/{get_proc_weights.Rd => getProcessRuntimeWeights.Rd} (73%) rename man/{map_advanced_opts2procs.Rd => mapAdvOption2Process.Rd} (76%) rename man/{make_opts2procs.Rd => mapOption2Process.Rd} (75%) rename man/{plot_estimated_walltimes.Rd => plotEstimatedWallTimes.Rd} (55%) rename man/{write_proc_medians_table.Rd => writeProcessRuntime2TSV.Rd} (77%) rename man/{write_proc_medians_yml.Rd => writeProcessRuntimeToYML.Rd} (61%) diff --git a/NAMESPACE b/NAMESPACE index 16cf0813..9c038631 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,26 +12,27 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) -export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assign_job_queue) +export(assignJobQueue) +export(calculateEstimatedWallTimeFromOptions) +export(calculateProcessRuntime) export(cleanup_GeneDesc) export(cleanup_clust) export(cleanup_domarch) export(cleanup_gencontext) export(cleanup_lineage) export(cleanup_species) -export(combine_files) -export(combine_full) -export(combine_ipr) +export(combineFiles) +export(combineFullAnalysis) +export(combineIPR) export(convert_aln2fa) export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) +export(createLineageLookup) export(create_all_col_params) -export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) @@ -45,10 +46,9 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) +export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) -export(get_proc_medians) -export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -58,12 +58,12 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) 
export(make_job_results_url) -export(make_opts2procs) +export(mapAdvOption2Process) +export(mapOption2Process) export(map_acc2name) -export(map_advanced_opts2procs) export(msa_pdf) export(pick_longer_duplicate) -export(plot_estimated_walltimes) +export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) export(remove_astrk) @@ -95,8 +95,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(write_proc_medians_table) -export(write_proc_medians_yml) +export(writeProcessRuntime2TSV) +export(writeProcessRuntimeToYML) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index bc5253d4..f1fcb6db 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,22 +3,32 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") +# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- make_opts2procs +#' example: list_opts2procs <- mapOption2Process #' @export -make_opts2procs <- function() { +mapOption2Process <- function() { + tryCatch({ opts2processes <- list( - "homology_search" = c("dblast", "dblast_cleanup"), - "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), - "always" = c("blast_clust", "clust2table") # processes always present agnostic of advanced options + "homology_search" = c("dblast", "dblast_cleanup"), + "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), + # processes always present agnostic of advanced options + "always" = 
c("blast_clust", "clust2table") ) return(opts2processes) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("mapOption2Process function execution completed.") + }) + } #' Use MolEvolvR advanced options to get associated processes @@ -30,17 +40,29 @@ make_opts2procs <- function() { #' #' example: #' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- map_advanced_opts2procs(advanced_opts) +#' procs <- mapAdvOption2Process(advanced_opts) #' @export -map_advanced_opts2procs <- function(advanced_opts) { +mapAdvOption2Process <- function(advanced_opts) { + if (!is.character(advanced_opts)) { + stop("Argument must be a character vector!") + } + tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- make_opts2procs() + opts2proc <- mapOption2Process() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run procs <- opts2proc[idx] |> unlist() return(procs) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("mapOption2Process function execution completed.") + }) + } #' Scrape MolEvolvR logs and calculate median processes @@ -58,47 +80,68 @@ map_advanced_opts2procs <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- 
calculateProcessRuntime(dir_job_results) #' @export -get_proc_medians <- function(dir_job_results) { +calculateProcessRuntime <- function(dir_job_results) { + tryCatch({ + # Check if dir_job_results is a character string + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + # Check if dir_job_results exists + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } + source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) # aggregate logs from - path_log_data <- file.path(common_root, "molevol_scripts", "log_data", "prod_logs.rda") + path_log_data <- file.path(common_root, + "molevol_scripts", "log_data", "prod_logs.rda") # ensure the folder exists to the location if (!dir.exists(path_log_data)) { - dir.create(dirname(path_log_data), recursive = TRUE, showWarnings = FALSE) + dir.create(dirname(path_log_data), + recursive = TRUE, showWarnings = FALSE) } # attempt to load pre-generated logdata if (!file.exists(path_log_data)) { - logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) - save(logs, file = path_log_data) + logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) + save(logs, file = path_log_data) } else { - load(path_log_data) # loads the logs object + load(path_log_data) # loads the logs object } df_log <- logs$df_log procs <- c( - "dblast", "dblast_cleanup", "iprscan", - "ipr2lineage", "ipr2da", "blast_clust", - "clust2table" + "dblast", "dblast_cleanup", "iprscan", + "ipr2lineage", "ipr2da", "blast_clust", + "clust2table" ) list_proc_medians <- df_log |> - dplyr::select(dplyr::all_of(procs)) |> - dplyr::summarise( - dplyr::across( - dplyr::everything(), - \(x) median(x, na.rm = TRUE) - ) - ) |> - as.list() + dplyr::select(dplyr::all_of(procs)) |> + dplyr::summarise( + dplyr::across( + dplyr::everything(), + \(x) median(x, na.rm = TRUE) + ) + ) |> + as.list() 
return(list_proc_medians) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("calculateProcessRuntime function execution completed.") + }) + } #' Write a table of 2 columns: 1) process and 2) median seconds @@ -113,51 +156,99 @@ get_proc_medians <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: write_proc_medians_table( +#' example: writeProcessRuntime2TSV( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -write_proc_medians_table <- function(dir_job_results, filepath) { - df_proc_medians <- get_proc_medians(dir_job_results) |> - tibble::as_tibble() |> - tidyr::pivot_longer( - dplyr::everything(), - names_to = "process", - values_to = "median_seconds" - ) |> - dplyr::arrange(dplyr::desc(median_seconds)) +writeProcessRuntime2TSV <- function(dir_job_results, filepath) { + tryCatch({ + # Error handling for input arguments + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } + + if (!is.character(filepath) || length(filepath) != 1) { + stop("Input 'filepath' must be a single character string.") + } + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + tibble::as_tibble() |> + tidyr::pivot_longer( + dplyr::everything(), + names_to = "process", + values_to = "median_seconds" + ) |> + dplyr::arrange(dplyr::desc(median_seconds)) + + # Write the resulting tibble to a TSV file readr::write_tsv(df_proc_medians, file = filepath) return(df_proc_medians) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { 
+ message("writeProcessRuntime2TSV function execution completed.") + }) + } #' Compute median process runtimes, then write a YAML list of the processes and #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which get_proc_weights() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory -#' @param filepath [chr] path to save YAML file; if NULL, uses ./molevol_scripts/log_data/job_proc_weights.yml +#' @param filepath [chr] path to save YAML file; if NULL, +#' uses ./molevol_scripts/log_data/job_proc_weights.yml #' #' @importFrom yaml write_yaml #' #' @examples #' \dontrun{ -#' write_proc_medians_yml( +#' writeProcessRuntimeToYML( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { +writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { + tryCatch({ + # Error handling for dir_job_results arguments + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } if (is.null(filepath)) { - filepath <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") + filepath <- file.path(common_root, + "molevol_scripts", + "log_data", + "job_proc_weights.yml") + } + if (!is.character(filepath) || length(filepath) != 1) { + stop("Input 'filepath' must be a single character string.") } - medians <- get_proc_medians(dir_job_results) + medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) + }, error = function(e) { + 
message(paste("Encountered an error: "), e$message) + }, warning = function(w) { + message(paste("Warning: "), w$message) + }, finally = { + message("write_proc_medians_table function execution completed.") + } + ) + } #' Quickly get the runtime weights for MolEvolvR backend processes @@ -170,50 +261,52 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: get_proc_weights() +#' example: writeProcessRuntimeToYML() #' @export -get_proc_weights <- function(medians_yml_path = NULL) { - if (is.null(medians_yml_path)) { - medians_yml_path <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") +getProcessRuntimeWeights <- function(medians_yml_path = NULL) { + if (is.null(medians_yml_path)) { + medians_yml_path <- file.path(common_root, + "molevol_scripts", + "log_data", + "job_proc_weights.yml") + } + + proc_weights <- tryCatch({ + # attempt to read the weights from the YAML file produced by + # writeProcessRuntimeToYML() + if (stringr::str_trim(medians_yml_path) == "") { + stop( + stringr::str_glue("medians_yml_path is empty + ({medians_yml_path}), returning default weights") + ) } - proc_weights <- tryCatch( - { - # attempt to read the weights from the YAML file produced by - # write_proc_medians_yml() - if (stringr::str_trim(medians_yml_path) == "") { - stop( - stringr::str_glue("medians_yml_path is empty ({medians_yml_path}), returning default weights") - ) - } - - proc_weights <- yaml::read_yaml(medians_yml_path) - }, - # to avoid fatal errors in reading the proc weights yaml, - # some median process runtimes have been hardcoded based on - # the result of get_proc_medians() from Jan 2024 - error = function(cond) { - proc_weights <- list( - "dblast" = 2810, - "iprscan" = 1016, - "dblast_cleanup" = 79, - "ipr2lineage" = 18, - "ipr2da" = 12, - "blast_clust" = 2, - "clust2table" = 2 - ) - proc_weights - } + proc_weights <- 
yaml::read_yaml(medians_yml_path) + }, + # to avoid fatal errors in reading the proc weights yaml, + # some median process runtimes have been hardcoded based on + # the result of calculateProcessRuntime() from Jan 2024 + error = function(cond) { + proc_weights <- list( + "dblast" = 2810, + "iprscan" = 1016, + "dblast_cleanup" = 79, + "ipr2lineage" = 18, + "ipr2da" = 12, + "blast_clust" = 2, + "clust2table" = 2 ) + proc_weights + }) - return(proc_weights) + return(proc_weights) } #' Given MolEvolvR advanced options and number of inputs, #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see make_opts2procs for the options) +#' (see mapOption2Process for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -221,68 +314,129 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: advanced_opts2est_walltime(c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) +#' example: calculateEstimatedWallTimeFromOptions(c("homology_search", +#' "domain_architecture"), +#' n_inputs = 3, n_hits = 50L) #' @export -advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { +calculateEstimatedWallTimeFromOptions <- function(advanced_opts, + n_inputs = 1L, + n_hits = NULL, + verbose = FALSE) { + + tryCatch({ # to calculate est walltime for a homology search job, the number of hits # must be provided validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts stopifnot(!validation_fail) - proc_weights <- get_proc_weights() + # Validate advanced_opts + if (!is.character(advanced_opts)) { + stop("Argument 'advanced_opts' must be a character vector.") + } + + # Validate n_inputs + if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { + stop("Argument 'n_inputs' must be a 
single positive numeric value.") + } + + # Validate n_hits if homology_search is in advanced_opts + if ("homology_search" %in% advanced_opts && + (is.null(n_hits)|| !is.numeric(n_hits) + || length(n_hits) != 1 || n_hits < 0)) { + stop("Argument 'n_hits' must be a single non-negative numeric value when + 'homology_search' is in 'advanced_opts'.") + } + + # Get process weights + proc_weights <- writeProcessRuntimeToYML() + if (!is.list(proc_weights)) { + stop("Process weights could not be retrieved correctly.") + } + # sort process weights by names and convert to vec proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- map_advanced_opts2procs(advanced_opts) + procs_from_opts <- mapAdvOption2Process(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) # dot product of weights and procs to run; scaled by the number of inputs est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> - as.numeric() + as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- make_opts2procs() - # exclude the homology search processes for the homologous hits - procs2exclude_for_homologs <- opts2procs[["homology_search"]] - procs_homologs <- procs_from_opts[!(procs_from_opts %in% procs2exclude_for_homologs)] - binary_proc_vec_homolog <- dplyr::if_else(all_procs %in% procs_homologs, 1L, 0L) - # add the estimated walltime for processes run on the homologous hits - est_walltime <- est_walltime + - (n_hits * (binary_proc_vec_homolog %*% proc_weights) |> as.numeric()) + opts2procs <- mapOption2Process() + # exclude the homology search processes for the homologous hits + procs2exclude_for_homologs <- opts2procs[["homology_search"]] + procs_homologs <- 
procs_from_opts[!(procs_from_opts + %in% procs2exclude_for_homologs)] + binary_proc_vec_homolog <- dplyr::if_else(all_procs + %in% procs_homologs, 1L, 0L) + # add the estimated walltime for processes run on the homologous hits + est_walltime <- est_walltime + + (n_hits * (binary_proc_vec_homolog + %*% proc_weights) |> as.numeric()) } if (verbose) { - msg <- stringr::str_glue( - "warnings from advanced_opts2est_walltime():\n", - "\tn_inputs={n_inputs}\n", - "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", - "\test_walltime={est_walltime}\n\n" - ) - cat(file = stderr(), msg) + msg <- stringr::str_glue( + "warnings from calculateEstimatedWallTimeFromOptions():\n", + "\tn_inputs={n_inputs}\n", + "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", + "\test_walltime={est_walltime}\n\n" + ) + cat(file = stderr(), msg) } return(est_walltime) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("calculateEstimatedWallTimeFromOptions + function execution completed.") + }) + } + #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from advanced_opts2est_walltime()) +#' (from calculateEstimatedWallTimeFromOptions()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' advanced_opts2est_walltime(c("homology_search", "domain_architecture"), 3) |> -#' assign_job_queue() +#' calculateEstimatedWallTimeFromOptions(c("homology_search", +#' "domain_architecture"), 3) |> +#' assignJobQueue() #' @export -assign_job_queue <- function( - t_sec_estimate, - t_cutoff = 21600 # 6 hours - ) { +assignJobQueue <- function( + t_sec_estimate, + t_cutoff = 21600 # 6 hours +) { + tryCatch({ + if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { + stop("Argument 
't_sec_estimate' must be a single numeric value.") + } + + if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { + stop("Argument 't_cutoff' must be a single non-negative numeric value.") + } + queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") return(queue) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("assignJobQueue function execution completed.") + }) + } #' Plot the estimated runtimes for different advanced options and number @@ -297,81 +451,97 @@ assign_job_queue <- function( #' @return line plot object #' #' example: -#' p <- plot_estimated_walltimes() -#' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) +#' p <- plotEstimatedWallTimes() +#' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ +#' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plot_estimated_walltimes <- function() { - opts <- make_opts2procs() |> names() +plotEstimatedWallTimes <- function() { + tryCatch({ + opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { - # generate powerset (do not include empty set) - n <- length(vec) - indices <- 1:n - powerset <- lapply(1:n, function(x) combn(indices, x, simplify = FALSE)) - powerset <- unlist(powerset, recursive = FALSE) - powerset <- lapply(powerset, function(index) vec[index]) - powerset + # generate powerset (do not include empty set) + n <- length(vec) + indices <- 1:n + powerset <- lapply(1:n, function(x) combn(indices, x, simplify = FALSE)) + powerset <- unlist(powerset, recursive = FALSE) + powerset <- lapply(powerset, function(index) vec[index]) + powerset } opts_power_set <- get_powerset(opts) est_walltimes <- list() for (i in 1:20) { - est_walltimes <- append( - x = est_walltimes, - values = sapply( - 
opts_power_set, - FUN = function(advanced_opts) { - # for simplicity, assume the default number of homologus hits (100) - n_hits <- if ("homology_search" %in% advanced_opts) { - 100 - } else { - NULL - } - est_walltime <- advanced_opts2est_walltime( - advanced_opts, - n_inputs = i, - n_hits = n_hits, - verbose = TRUE - ) - names(est_walltime) <- paste0(advanced_opts, collapse = "_") - est_walltime - } + est_walltimes <- append( + x = est_walltimes, + values = sapply( + opts_power_set, + FUN = function(advanced_opts) { + # for simplicity, assume the default number of homologus hits (100) + n_hits <- if ("homology_search" %in% advanced_opts) { + 100 + } else { + NULL + } + est_walltime <- calculateEstimatedWallTimeFromOptions( + advanced_opts, + n_inputs = i, + n_hits = n_hits, + verbose = TRUE ) + names(est_walltime) <- paste0(advanced_opts, collapse = "_") + est_walltime + } ) + ) } # concat all results to their unique names est_walltimes <- tapply( - unlist( - est_walltimes, - use.names = FALSE - ), - rep( - names(est_walltimes), - lengths(est_walltimes) - ), - FUN = c + unlist( + est_walltimes, + use.names = FALSE + ), + rep( + names(est_walltimes), + lengths(est_walltimes) + ), + FUN = c ) df_walltimes <- est_walltimes |> - unlist() |> - matrix(nrow = length(est_walltimes[[1]]), ncol = length(names(est_walltimes))) + unlist() |> + matrix(nrow = length(est_walltimes[[1]]), + ncol = length(names(est_walltimes))) colnames(df_walltimes) <- names(est_walltimes) df_walltimes <- df_walltimes |> tibble::as_tibble() # rm always col or powerset outcome without the "always" processes col_idx_keep <- grep(pattern = "always$", x = names(df_walltimes)) df_walltimes <- df_walltimes |> - dplyr::select(col_idx_keep) + dplyr::select(col_idx_keep) # bind n_inputs df_walltimes <- df_walltimes |> - dplyr::mutate(n_inputs = 1:20) - df_walltimes <- tidyr::gather(df_walltimes, key = "advanced_opts", value = "est_walltime", -n_inputs) + dplyr::mutate(n_inputs = 1:20) + df_walltimes <- 
tidyr::gather(df_walltimes, + key = "advanced_opts", + value = "est_walltime", + n_inputs) # sec to hrs df_walltimes <- df_walltimes |> - dplyr::mutate(est_walltime = est_walltime / 3600) - p <- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, y = est_walltime, color = advanced_opts)) + - ggplot2::geom_line() + - ggplot2::labs( - title = "MolEvolvR estimated runtimes", - x = "Number of inputs", - y = "Estimated walltime (hours)" - ) + dplyr::mutate(est_walltime = est_walltime / 3600) + p <- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, + y = est_walltime, + color = advanced_opts)) + + ggplot2::geom_line() + + ggplot2::labs( + title = "MolEvolvR estimated runtimes", + x = "Number of inputs", + y = "Estimated walltime (hours)" + ) return(p) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("plotEstimatedWallTimes function execution completed.") + }) + } diff --git a/R/clean_clust_file.R b/R/clean_clust_file.R index d3f813e5..87dcde70 100755 --- a/R/clean_clust_file.R +++ b/R/clean_clust_file.R @@ -55,9 +55,9 @@ #' #' @examples #' \dontrun{ -#' clean_clust_file("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") +#' cleanClusterFile("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") #' } -clean_clust_file <- function(path, writepath = NULL, query) { +cleanClusterFile <- function(path, writepath = NULL, query) { # ?? does the following line need to be changed to read_lines()? 
prot <- read_tsv(path, col_names = F) diff --git a/R/combine_analysis.R b/R/combine_analysis.R index bb3b3ce2..58ce1f14 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,7 +17,7 @@ #' @export #' #' @examples -combine_full <- function(inpath, ret = FALSE) { +combineFullAnalysis <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combine_files(inpath, pattern = "*.full_analysis.tsv", skip = 0, @@ -44,7 +44,7 @@ combine_full <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combine_ipr <- function(inpath, ret = FALSE) { +combineIPR <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combine_files(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, diff --git a/R/combine_files.R b/R/combine_files.R index 76c5fa09..455ddd53 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combine_files(inpath, +# full_combnd <- combineFiles(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combine_files(inpath, +# cln_combnd <- combineFiles(inpath, # pattern="^.*cln.txt", skip=0, # col_names=T) # @@ -86,14 +86,14 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # ## Less helpful examples! 
# ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combine_files(inpath, +# cl_blast_combnd <- combineFiles(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! -# ipr_combnd <- combine_files(inpath, +# ipr_combnd <- combineFiles(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index e7374df3..d911934a 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), +createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - shorten_NA <- function(Lineage) { + .shortenNA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, shorten_NA))) + mutate(Lineage = unlist(map(Lineage, .shortenNA))) diff --git a/man/assign_job_queue.Rd b/man/assignJobQueue.Rd similarity index 64% rename from man/assign_job_queue.Rd rename to man/assignJobQueue.Rd index ceb6fa77..27511b6a 100644 --- a/man/assign_job_queue.Rd +++ b/man/assignJobQueue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assign_job_queue} -\alias{assign_job_queue} +\name{assignJobQueue} +\alias{assignJobQueue} \title{Decision function to assign job queue} \usage{ -assign_job_queue(t_sec_estimate, t_cutoff = 21600) 
+assignJobQueue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will process -(from advanced_opts2est_walltime())} +(from calculateEstimatedWallTimeFromOptions())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,8 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -advanced_opts2est_walltime(c("homology_search", "domain_architecture"), 3) |> -assign_job_queue() +calculateEstimatedWallTimeFromOptions(c("homology_search", +"domain_architecture"), 3) |> +assignJobQueue() } \description{ Decision function to assign job queue diff --git a/man/advanced_opts2est_walltime.Rd b/man/calculateEstimatedWallTimeFromOptions.Rd similarity index 68% rename from man/advanced_opts2est_walltime.Rd rename to man/calculateEstimatedWallTimeFromOptions.Rd index ea4b29e6..e4eec3fd 100644 --- a/man/advanced_opts2est_walltime.Rd +++ b/man/calculateEstimatedWallTimeFromOptions.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{advanced_opts2est_walltime} -\alias{advanced_opts2est_walltime} +\name{calculateEstimatedWallTimeFromOptions} +\alias{calculateEstimatedWallTimeFromOptions} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -advanced_opts2est_walltime( +calculateEstimatedWallTimeFromOptions( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,16 @@ advanced_opts2est_walltime( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see make_opts2procs for the options)} +(see mapOption2Process for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: advanced_opts2est_walltime(c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) +example: 
calculateEstimatedWallTimeFromOptions(c("homology_search", +"domain_architecture"), +n_inputs = 3, n_hits = 50L) } \description{ Given MolEvolvR advanced options and number of inputs, diff --git a/man/get_proc_medians.Rd b/man/calculateProcessRuntime.Rd similarity index 76% rename from man/get_proc_medians.Rd rename to man/calculateProcessRuntime.Rd index b6db0b56..bb6dd1ed 100644 --- a/man/get_proc_medians.Rd +++ b/man/calculateProcessRuntime.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_medians} -\alias{get_proc_medians} +\name{calculateProcessRuntime} +\alias{calculateProcessRuntime} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -get_proc_medians(dir_job_results) +calculateProcessRuntime(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) } } \description{ diff --git a/man/clean_clust_file.Rd b/man/cleanClusterFile.Rd similarity index 82% rename from man/clean_clust_file.Rd rename to man/cleanClusterFile.Rd index bba3072e..d2818662 100644 --- a/man/clean_clust_file.Rd +++ b/man/cleanClusterFile.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/clean_clust_file.R -\name{clean_clust_file} -\alias{clean_clust_file} +\name{cleanClusterFile} +\alias{cleanClusterFile} \title{Clean Cluster File} \usage{ -clean_clust_file(path, writepath = NULL, query) 
+cleanClusterFile(path, writepath = NULL, query) } \arguments{ \item{path}{A character to the path of the cluster file to be cleaned} @@ -24,6 +24,6 @@ This function reads a space-separated cluster file and converts it to a cleaned } \examples{ \dontrun{ -clean_clust_file("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") +cleanClusterFile("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") } } diff --git a/man/combine_files.Rd b/man/combineFiles.Rd similarity index 92% rename from man/combine_files.Rd rename to man/combineFiles.Rd index 4126eb9e..3b56b923 100644 --- a/man/combine_files.Rd +++ b/man/combineFiles.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combine_files} -\alias{combine_files} +\name{combineFiles} +\alias{combineFiles} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combine_files( +combineFiles( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combine_full.Rd b/man/combineFullAnalysis.Rd similarity index 69% rename from man/combine_full.Rd rename to man/combineFullAnalysis.Rd index f4e6597b..35925e86 100644 --- a/man/combine_full.Rd +++ b/man/combineFullAnalysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_full} -\alias{combine_full} +\name{combineFullAnalysis} +\alias{combineFullAnalysis} \title{Combining full_analysis files} \usage{ -combine_full(inpath, ret = FALSE) +combineFullAnalysis(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combine_ipr.Rd b/man/combineIPR.Rd similarity index 74% rename from man/combine_ipr.Rd rename to man/combineIPR.Rd index 52aa3057..035c4274 100644 --- a/man/combine_ipr.Rd +++ b/man/combineIPR.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R 
-\name{combine_ipr} -\alias{combine_ipr} +\name{combineIPR} +\alias{combineIPR} \title{Combining clean ipr files} \usage{ -combine_ipr(inpath, ret = FALSE) +combineIPR(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/create_lineage_lookup.Rd b/man/createLineageLookup.Rd similarity index 91% rename from man/create_lineage_lookup.Rd rename to man/createLineageLookup.Rd index 51670f35..5dbab978 100644 --- a/man/create_lineage_lookup.Rd +++ b/man/createLineageLookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{create_lineage_lookup} -\alias{create_lineage_lookup} +\name{createLineageLookup} +\alias{createLineageLookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -create_lineage_lookup( +createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" diff --git a/man/get_proc_weights.Rd b/man/getProcessRuntimeWeights.Rd similarity index 73% rename from man/get_proc_weights.Rd rename to man/getProcessRuntimeWeights.Rd index 0f4beb57..8eff0347 100644 --- a/man/get_proc_weights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_weights} -\alias{get_proc_weights} +\name{getProcessRuntimeWeights} +\alias{getProcessRuntimeWeights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -get_proc_weights(medians_yml_path = NULL) +getProcessRuntimeWeights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: get_proc_weights() +example: writeProcessRuntimeToYML() } \description{ Quickly get the runtime weights for MolEvolvR backend processes diff --git a/man/map_advanced_opts2procs.Rd b/man/mapAdvOption2Process.Rd 
similarity index 76% rename from man/map_advanced_opts2procs.Rd rename to man/mapAdvOption2Process.Rd index 631708b4..5bd9ee65 100644 --- a/man/map_advanced_opts2procs.Rd +++ b/man/mapAdvOption2Process.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{map_advanced_opts2procs} -\alias{map_advanced_opts2procs} +\name{mapAdvOption2Process} +\alias{mapAdvOption2Process} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -map_advanced_opts2procs(advanced_opts) +mapAdvOption2Process(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- map_advanced_opts2procs(advanced_opts) +procs <- mapAdvOption2Process(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/make_opts2procs.Rd b/man/mapOption2Process.Rd similarity index 75% rename from man/make_opts2procs.Rd rename to man/mapOption2Process.Rd index 07e208b2..ff6905c5 100644 --- a/man/make_opts2procs.Rd +++ b/man/mapOption2Process.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{make_opts2procs} -\alias{make_opts2procs} +\name{mapOption2Process} +\alias{mapOption2Process} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -make_opts2procs() +mapOption2Process() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- make_opts2procs +example: list_opts2procs <- mapOption2Process } \description{ Construct list where names (MolEvolvR advanced options) point to processes diff --git a/man/plot_estimated_walltimes.Rd b/man/plotEstimatedWallTimes.Rd similarity index 55% rename from man/plot_estimated_walltimes.Rd rename to man/plotEstimatedWallTimes.Rd index 
3669e0e0..0d53cb32 100644 --- a/man/plot_estimated_walltimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -1,18 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plot_estimated_walltimes} -\alias{plot_estimated_walltimes} +\name{plotEstimatedWallTimes} +\alias{plotEstimatedWallTimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plot_estimated_walltimes() +plotEstimatedWallTimes() } \value{ line plot object example: -p <- plot_estimated_walltimes() -ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) +p <- plotEstimatedWallTimes() +ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ +dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } \description{ this function was just for fun; very, very messy code diff --git a/man/write_proc_medians_table.Rd b/man/writeProcessRuntime2TSV.Rd similarity index 77% rename from man/write_proc_medians_table.Rd rename to man/writeProcessRuntime2TSV.Rd index 2ae7a97b..03cbbd68 100644 --- a/man/write_proc_medians_table.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_table} -\alias{write_proc_medians_table} +\name{writeProcessRuntime2TSV} +\alias{writeProcessRuntime2TSV} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -write_proc_medians_table(dir_job_results, filepath) +writeProcessRuntime2TSV(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ write_proc_medians_table(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: write_proc_medians_table( +example: writeProcessRuntime2TSV( "/data/scratch/janani/molevolvr_out/", 
"/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git a/man/write_proc_medians_yml.Rd b/man/writeProcessRuntimeToYML.Rd similarity index 61% rename from man/write_proc_medians_yml.Rd rename to man/writeProcessRuntimeToYML.Rd index a3d8ee5f..e4a5c8ad 100644 --- a/man/write_proc_medians_yml.Rd +++ b/man/writeProcessRuntimeToYML.Rd @@ -1,25 +1,26 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_yml} -\alias{write_proc_medians_yml} +\name{writeProcessRuntimeToYML} +\alias{writeProcessRuntimeToYML} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -write_proc_medians_yml(dir_job_results, filepath = NULL) +writeProcessRuntimeToYML(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} -\item{filepath}{\link{chr} path to save YAML file; if NULL, uses ./molevol_scripts/log_data/job_proc_weights.yml} +\item{filepath}{\link{chr} path to save YAML file; if NULL, +uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which get_proc_weights() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default read location. 
} \examples{ \dontrun{ -write_proc_medians_yml( +writeProcessRuntimeToYML( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From 091d32ebb31b6f295268b4e0a38ef0fab1066358 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Tue, 8 Oct 2024 07:17:56 +0100 Subject: [PATCH 10/61] fixing merge issue in NAMESPACE --- NAMESPACE | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 739c76d7..d2ef5463 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,9 @@ export(cleanSpecies) export(combineFiles) export(combineFullAnalysis) export(combineIPR) +export(condenseRepeatedDomains) +export(convert2TitleCase) +export(convertAlignment2FA) export(convert_aln2fa) export(convert_fa2tre) export(count_bycol) @@ -63,13 +66,15 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) +export(mapAcc2Name) export(mapAdvOption2Process) export(mapOption2Process) -export(mapAcc2Name) +export(map_acc2name) export(msa_pdf) export(pick_longer_duplicate) export(plotEstimatedWallTimes) export(prot2tax) +export(prot2tax_old) export(removeAsterisks) export(removeEmptyRows) export(removeTails) From fc63187c4985d8a9fad15582691b4ee4f9c273e6 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Tue, 8 Oct 2024 08:18:42 +0100 Subject: [PATCH 11/61] Added updated function name to NAMESPACE and removed unused argument in readAAStringSet --- NAMESPACE | 3 +-- R/msa.R | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d2ef5463..cd135cc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,9 +20,9 @@ export(assert_count_df) export(assignJobQueue) export(calculateEstimatedWallTimeFromOptions) export(calculateProcessRuntime) -export(cleanGeneDescription) export(cleanClusters) export(cleanDomainArchitecture) +export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) 
export(cleanSpecies) @@ -71,7 +71,6 @@ export(mapAdvOption2Process) export(mapOption2Process) export(map_acc2name) export(msa_pdf) -export(pick_longer_duplicate) export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) diff --git a/R/msa.R b/R/msa.R index e56cc32c..0b1b6e34 100644 --- a/R/msa.R +++ b/R/msa.R @@ -197,21 +197,21 @@ msa_pdf <- function(fasta_path, out_path = NULL, #' #' @examples generate_msa <- function(fa_file = "", outfile = "") { - prot_aa <- readAAStringSet( - path = fa_file, - format = "fasta" - ) - prot_aa + prot_aa <- readAAStringSet( + fa_file, + format = "fasta" + ) + prot_aa - ## Install kalign ?rMSA_INSTALL - ## Messed up! Reimplement from kalign.R - ## https://github.com/mhahsler/rMSA/blob/master/R/kalign.R + ## Install kalign ?rMSA_INSTALL + ## Messed up! Reimplement from kalign.R + ## https://github.com/mhahsler/rMSA/blob/master/R/kalign.R - # source("scripts/c2r.R") + # source("scripts/c2r.R") - ## align the sequences - al <- kalign(prot_aa) # !! won't work! - al + ## align the sequences + al <- kalign(prot_aa) # !! won't work! 
+ al } ############################ From 208b9e02d0bedfd6d16d663dfb109fcce23040ac Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Tue, 8 Oct 2024 18:42:40 +0100 Subject: [PATCH 12/61] refactor function names in R/ipr2vis.R --- R/ipr2viz.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index bf3650f7..5d8a0a03 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -13,7 +13,7 @@ ################################# ## Modified gggenes::theme_genes ################################# -## theme_genes2 adapted from theme_genes (w/o strip.text()) +## themeGenes2 adapted from theme_genes (w/o strip.text()) ## https://github.com/wilkox/gggenes/blob/master/R/theme_genes.R #' Theme Genes2 #' @@ -23,7 +23,7 @@ #' @export #' #' @examples -theme_genes2 <- function() { +themeGenes2 <- function() { ggplot2::theme_grey() + ggplot2::theme( panel.background = ggplot2::element_blank(), panel.grid.major.y = ggplot2::element_line(colour = "grey80", size = 0.2), @@ -58,7 +58,7 @@ theme_genes2 <- function() { #' @export #' #' @examples -find_top_acc <- function(infile_full, +getTopAccByLinDomArch <- function(infile_full, DA_col = "DomArch.Pfam", lin_col = "Lineage_short", n = 20, @@ -113,7 +113,7 @@ find_top_acc <- function(infile_full, #' @export #' #' @examples -ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), +plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), group_by = "Analysis", # "Analysis" topn = 20, name = "Name", text_size = 15, query = "All") { @@ -141,8 +141,8 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ## To filter by Analysis analysis <- paste(analysis, collapse = "|") ## @SAM: This can't be set in stone since the analysis may change! 
- ## Getting top n accession numbers using find_top_acc() - top_acc <- find_top_acc( + ## Getting top n accession numbers using getTopAccByLinDomArch() + top_acc <- getTopAccByLinDomArch( infile_full = infile_full, DA_col = "DomArch.Pfam", ## @SAM, you could pick by the Analysis w/ max rows! @@ -202,7 +202,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), # , ncol = 1 + #scales = "free", scale_fill_manual(values = CPCOLS, na.value = "#A9A9A9") + theme_minimal() + - theme_genes2() + + themeGenes2() + theme( legend.position = "bottom", legend.box = "horizontal", @@ -232,7 +232,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ) + scale_fill_manual(values = CPCOLS, na.value = "#A9A9A9") + theme_minimal() + - theme_genes2() + + themeGenes2() + theme( legend.position = "bottom", legend.box = "horizontal", @@ -268,7 +268,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' @export #' #' @examples -ipr2viz_web <- function(infile_ipr, +plotIPR2VizWeb <- function(infile_ipr, accessions, analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), group_by = "Analysis", name = "Name", @@ -344,7 +344,7 @@ ipr2viz_web <- function(infile_ipr, # , ncol = 1 + #scales = "free", scale_fill_manual(values = CPCOLS, na.value = "#A9A9A9") + theme_minimal() + - theme_genes2() + + themeGenes2() + theme( legend.position = "bottom", legend.box = "horizontal", @@ -374,7 +374,7 @@ ipr2viz_web <- function(infile_ipr, ) + scale_fill_manual(values = CPCOLS, na.value = "#A9A9A9") + theme_minimal() + - theme_genes2() + + themeGenes2() + theme( legend.position = "bottom", legend.box = "horizontal", From 44f0a766f29b36cdab6d7fbddc9c31cd4d0df20d Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Tue, 8 Oct 2024 18:51:23 +0100 Subject: [PATCH 13/61] update namespace and rd files with roxygen2 --- NAMESPACE | 8 ++-- man/countbycolumn.Rd | 22 ---------- man/filterbydomains.Rd | 44 ------------------- 
man/filterbyfrequency.Rd | 22 ---------- man/findparalogs.Rd | 26 ----------- ...nd_top_acc.Rd => getTopAccByLinDomArch.Rd} | 6 +-- man/{ipr2viz.Rd => plotIPR2Viz.Rd} | 6 +-- man/{ipr2viz_web.Rd => plotIPR2VizWeb.Rd} | 6 +-- man/summarizebylineage.Rd | 25 ----------- man/{theme_genes2.Rd => themeGenes2.Rd} | 6 +-- man/totalgencontextordomarchcounts.Rd | 42 ------------------ man/words2wordcounts.Rd | 25 ----------- 12 files changed, 16 insertions(+), 222 deletions(-) delete mode 100644 man/countbycolumn.Rd delete mode 100644 man/filterbydomains.Rd delete mode 100644 man/filterbyfrequency.Rd delete mode 100644 man/findparalogs.Rd rename man/{find_top_acc.Rd => getTopAccByLinDomArch.Rd} (79%) rename man/{ipr2viz.Rd => plotIPR2Viz.Rd} (87%) rename man/{ipr2viz_web.Rd => plotIPR2VizWeb.Rd} (85%) delete mode 100644 man/summarizebylineage.Rd rename man/{theme_genes2.Rd => themeGenes2.Rd} (72%) delete mode 100644 man/totalgencontextordomarchcounts.Rd delete mode 100644 man/words2wordcounts.Rd diff --git a/NAMESPACE b/NAMESPACE index 53332439..ddbd1dd5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,22 +46,22 @@ export(extractAccNum) export(filterByDomains) export(filterByFrequency) export(findParalogs) -export(find_top_acc) export(formatJobArgumentsHTML) export(gc_undirected_network) export(generateAllAlignments2FA) export(generate_all_aln2fa) export(generate_msa) +export(getTopAccByLinDomArch) export(get_accnums_from_fasta_file) export(get_proc_medians) export(get_proc_weights) -export(ipr2viz) -export(ipr2viz_web) export(make_opts2procs) export(mapAcc2Name) export(map_acc2name) export(map_advanced_opts2procs) export(msa_pdf) +export(plotIPR2Viz) +export(plotIPR2VizWeb) export(plotLineageDA) export(plotLineageDomainRepeats) export(plotLineageHeatmap) @@ -97,7 +97,7 @@ export(summarizeDomArch_ByLineage) export(summarizeGenContext) export(summarizeGenContext_ByDomArchLineage) export(summarizeGenContext_ByLineage) -export(theme_genes2) +export(themeGenes2) 
export(to_titlecase) export(totalGenContextOrDomArchCounts) export(validateCountDF) diff --git a/man/countbycolumn.Rd b/man/countbycolumn.Rd deleted file mode 100644 index 34fcc3e0..00000000 --- a/man/countbycolumn.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{countByColumn} -\alias{countByColumn} -\title{Count By Column} -\usage{ -countByColumn(prot = prot, column = "DomArch", min.freq = 1) -} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Count By Column -} -\examples{ -\dontrun{ -countByColumn() -} -} diff --git a/man/filterbydomains.Rd b/man/filterbydomains.Rd deleted file mode 100644 index 8c885363..00000000 --- a/man/filterbydomains.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByDomains} -\alias{filterByDomains} -\title{Filter by Domains} -\usage{ -filterByDomains( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) -} -\arguments{ -\item{prot}{Dataframe to filter} - -\item{column}{Column to search for domains in (DomArch column)} - -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} -} -\value{ -Filtered data frame -} -\description{ -filterByDomains filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain -} -\note{ -There is no need to make the domains 'regex safe', that will be handled by this function -} -\examples{ -\dontrun{ -filterByDomains() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/filterbyfrequency.Rd b/man/filterbyfrequency.Rd deleted file mode 100644 index d2c5f9cd..00000000 --- 
a/man/filterbyfrequency.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByFrequency} -\alias{filterByFrequency} -\title{Filter Frequency} -\usage{ -filterByFrequency(x, min.freq) -} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Filter Frequency -} -\examples{ -\dontrun{ -filterByFrequency() -} -} diff --git a/man/findparalogs.Rd b/man/findparalogs.Rd deleted file mode 100644 index 4b5edbcf..00000000 --- a/man/findparalogs.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{findParalogs} -\alias{findParalogs} -\title{Find Paralogs} -\usage{ -findParalogs(prot) -} -\arguments{ -\item{prot}{A data frame filtered by a Query, containing columns Species and Lineage} -} -\value{ -returns a dataframe containing paralogs and the counts. -} -\description{ -Creates a data frame of paralogs. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. 
-} -\examples{ -\dontrun{ -findParalogs(pspa) -} -} diff --git a/man/find_top_acc.Rd b/man/getTopAccByLinDomArch.Rd similarity index 79% rename from man/find_top_acc.Rd rename to man/getTopAccByLinDomArch.Rd index 780cde11..a00da5c7 100644 --- a/man/find_top_acc.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ipr2viz.R -\name{find_top_acc} -\alias{find_top_acc} +\name{getTopAccByLinDomArch} +\alias{getTopAccByLinDomArch} \title{Group by lineage + DA then take top 20} \usage{ -find_top_acc( +getTopAccByLinDomArch( infile_full, DA_col = "DomArch.Pfam", lin_col = "Lineage_short", diff --git a/man/ipr2viz.Rd b/man/plotIPR2Viz.Rd similarity index 87% rename from man/ipr2viz.Rd rename to man/plotIPR2Viz.Rd index 79063497..22297312 100644 --- a/man/ipr2viz.Rd +++ b/man/plotIPR2Viz.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ipr2viz.R -\name{ipr2viz} -\alias{ipr2viz} +\name{plotIPR2Viz} +\alias{plotIPR2Viz} \title{IPR2Viz} \usage{ -ipr2viz( +plotIPR2Viz( infile_ipr = NULL, infile_full = NULL, accessions = c(), diff --git a/man/ipr2viz_web.Rd b/man/plotIPR2VizWeb.Rd similarity index 85% rename from man/ipr2viz_web.Rd rename to man/plotIPR2VizWeb.Rd index 896445bd..4b4394ad 100644 --- a/man/ipr2viz_web.Rd +++ b/man/plotIPR2VizWeb.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ipr2viz.R -\name{ipr2viz_web} -\alias{ipr2viz_web} +\name{plotIPR2VizWeb} +\alias{plotIPR2VizWeb} \title{IPR2Viz Web} \usage{ -ipr2viz_web( +plotIPR2VizWeb( infile_ipr, accessions, analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), diff --git a/man/summarizebylineage.Rd b/man/summarizebylineage.Rd deleted file mode 100644 index 2e445913..00000000 --- a/man/summarizebylineage.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R 
-\name{summarizeByLineage} -\alias{summarizeByLineage} -\title{Summarize by Lineage} -\usage{ -summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{query}{} -} -\value{ -Describe return, in detail -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarizeByLineage(query = "all") -} - -} diff --git a/man/theme_genes2.Rd b/man/themeGenes2.Rd similarity index 72% rename from man/theme_genes2.Rd rename to man/themeGenes2.Rd index 29f79673..1553e019 100644 --- a/man/theme_genes2.Rd +++ b/man/themeGenes2.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ipr2viz.R -\name{theme_genes2} -\alias{theme_genes2} +\name{themeGenes2} +\alias{themeGenes2} \title{Theme Genes2} \usage{ -theme_genes2() +themeGenes2() } \description{ Theme Genes2 diff --git a/man/totalgencontextordomarchcounts.Rd b/man/totalgencontextordomarchcounts.Rd deleted file mode 100644 index f457cb6a..00000000 --- a/man/totalgencontextordomarchcounts.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{totalGenContextOrDomArchCounts} -\alias{totalGenContextOrDomArchCounts} -\title{Total Counts} -\usage{ -totalGenContextOrDomArchCounts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. 
Default is 0.} - -\item{digits}{} -} -\value{ -Define return, in detail -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. -} -\examples{ -\dontrun{ -totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wordcounts.Rd b/man/words2wordcounts.Rd deleted file mode 100644 index 7f60f226..00000000 --- a/man/words2wordcounts.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{words2WordCounts} -\alias{words2WordCounts} -\title{Words 2 Word Counts} -\usage{ -words2WordCounts(string) -} -\arguments{ -\item{string}{} -} -\value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency -} -\description{ -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2Words() |> - words2WordCounts() -} - -} From ae9e737616acc95e03ee4b7f4ca997e68675cc0d Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Tue, 8 Oct 2024 22:20:07 +0100 Subject: [PATCH 14/61] refactor: externalize internal functions for global use --- .gitignore | 1 + R/plotting.R | 87 +++++++++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 50d1aa13..ef11006e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .Rproj.user docs .Rhistory +.DS_Store \ No newline at end of file diff --git a/R/plotting.R b/R/plotting.R index da95ea5f..5d949cd5 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -18,6 +18,47 @@ # suppressPackageStartupMessages(library(d3r)) # suppressPackageStartupMessages(library(viridis)) +######################## +## 
Internal Functions ## +######################## +#' +#' +.LevelReduction <- function(lin, level) { + if (level == 1) { + gt_loc <- str_locate(lin, ">")[[1]] + if (is.na(gt_loc)) { + # No '>' in lineage + return(lin) + } else { + lin <- substring(lin, first = 0, last = (gt_loc - 1)) + return(lin) + } + } + # Out of bounds guard + gt_loc <- str_locate_all(lin, ">")[[1]] + l <- length(gt_loc) / 2 + if (level > l) { + # Not enough '>' in lineage + return(lin) + } else { + gt_loc <- gt_loc[level, ][1] %>% as.numeric() + lin <- substring(lin, first = 0, last = (gt_loc - 1)) + return(lin) + } +} + +.GetKingdom <- function(lin) { + gt_loc <- str_locate(lin, ">")[, "start"] + if (is.na(gt_loc)) { + # No '>' in lineage + return(lin) + } else { + kingdom <- substring(lin, first = 0, last = (gt_loc - 1)) + return(kingdom) + } +} + + #' Shorten Lineage #' #' @param data @@ -665,30 +706,6 @@ plotLineageDomainRepeats <- function(query_data, colname) { #' } #' plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size = 8) { - .LevelReduction <- function(lin) { - if (level == 1) { - gt_loc <- str_locate(lin, ">")[[1]] - if (is.na(gt_loc)) { - # No '>' in lineage - return(lin) - } else { - lin <- substring(lin, first = 0, last = (gt_loc - 1)) - return(lin) - } - } - #### Add guard here to protect from out of bounds - gt_loc <- str_locate_all(lin, ">")[[1]] # [(level-1),][1] - l <- length(gt_loc) / 2 - if (level > l) { - # Not enough '>' in lineage - return(lin) - } else { - gt_loc <- gt_loc[level, ][1] %>% as.numeric() - lin <- substring(lin, first = 0, last = (gt_loc - 1)) - return(lin) - } - } - all_grouped <- data.frame("Query" = character(0), "Lineage" = character(0), "count" = integer()) for (dom in domains_of_interest) { @@ -703,19 +720,7 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size all_grouped <- dplyr::union(all_grouped, domSub) } - .GetKingdom <- function(lin) { - gt_loc <- str_locate(lin, ">")[, "start"] - - if 
(is.na(gt_loc)) { - # No '>' in lineage - return(lin) - } else { - kingdom <- substring(lin, first = 0, last = (gt_loc - 1)) - return(kingdom) - } - } - - all_grouped <- all_grouped %>% mutate(ReducedLin = unlist(purrr::map(Lineage, .LevelReduction))) + all_grouped <- all_grouped %>% mutate(ReducedLin = unlist(purrr::map(Lineage, ~.LevelReduction(.x, level)))) all_grouped_reduced <- all_grouped %>% group_by(Query, ReducedLin) %>% @@ -739,6 +744,10 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size append(eukaryota_colors) %>% append(virus_colors) + if (length(colors) < length(unique(all_grouped_reduced$ReducedLin))) { + colors <- rep("black", length(unique(all_grouped_reduced$ReducedLin))) # Fallback to black + } + all_grouped_reduced$ReducedLin <- map( all_grouped_reduced$ReducedLin, function(lin) { @@ -766,7 +775,7 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size ) ggplot( data = all_grouped_reduced, - aes_string(x = "ReducedLin", y = "Query") + aes(x = "ReducedLin", y = "Query") ) + geom_tile( data = subset( @@ -774,7 +783,7 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size !is.na(count) ), aes(fill = count), - colour = "darkred", size = 0.3 + colour = "darkred", linewidth = 0.3 ) + # , width=0.7, height=0.7), scale_fill_gradient(low = "white", high = "darkred") + # scale_x_discrete(position="top") + From a246339f47d37ff60bdfb76a6861807b546c93f3 Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Tue, 8 Oct 2024 23:36:16 +0100 Subject: [PATCH 15/61] refactor function names in R/pre-msa-tree and R/reverse-operons.R --- NAMESPACE | 7 ++- R/pre-msa-tree.R | 16 +++---- R/reverse_operons.R | 12 ++--- man/RepresentativeAccNums.Rd | 4 +- man/countbycolumn.Rd | 22 ---------- man/createRepresentativeAccNum.Rd | 27 ++++++++++++ man/filterbydomains.Rd | 44 ------------------- man/filterbyfrequency.Rd | 22 ---------- man/findparalogs.Rd | 26 ----------- man/getAccNumFromFA.Rd | 14 
++++++ man/get_accnums_from_fasta_file.Rd | 6 +-- man/{reveql.Rd => reverseOperonSeq.Rd} | 10 ++--- ...verse_operon.Rd => straightenOperonSeq.Rd} | 10 ++--- man/summarizebylineage.Rd | 25 ----------- man/totalgencontextordomarchcounts.Rd | 42 ------------------ man/words2wordcounts.Rd | 25 ----------- man/write.MsaAAMultipleAlignment.Rd | 8 +--- man/writeMSA_AA2FA.Rd | 21 +++++++++ 18 files changed, 94 insertions(+), 247 deletions(-) delete mode 100644 man/countbycolumn.Rd create mode 100644 man/createRepresentativeAccNum.Rd delete mode 100644 man/filterbydomains.Rd delete mode 100644 man/filterbyfrequency.Rd delete mode 100644 man/findparalogs.Rd create mode 100644 man/getAccNumFromFA.Rd rename man/{reveql.Rd => reverseOperonSeq.Rd} (56%) rename man/{reverse_operon.Rd => straightenOperonSeq.Rd} (53%) delete mode 100644 man/summarizebylineage.Rd delete mode 100644 man/totalgencontextordomarchcounts.Rd delete mode 100644 man/words2wordcounts.Rd create mode 100644 man/writeMSA_AA2FA.Rd diff --git a/NAMESPACE b/NAMESPACE index 53332439..fe2ad999 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,7 @@ export(countByColumn) export(createFA2Tree) export(createJobResultsURL) export(createJobStatusEmailMessage) +export(createRepresentativeAccNum) export(createWordCloud2Element) export(createWordCloudElement) export(create_lineage_lookup) @@ -52,6 +53,7 @@ export(gc_undirected_network) export(generateAllAlignments2FA) export(generate_all_aln2fa) export(generate_msa) +export(getAccNumFromFA) export(get_accnums_from_fasta_file) export(get_proc_medians) export(get_proc_weights) @@ -83,14 +85,14 @@ export(removeTails) export(renameFA) export(rename_fasta) export(replaceQuestionMarks) -export(reveql) -export(reverse_operon) +export(reverseOperonSeq) export(run_deltablast) export(run_rpsblast) export(selectLongestDuplicate) export(sendJobStatusEmail) export(shortenLineage) export(sinkReset) +export(straightenOperonSeq) export(summarizeByLineage) export(summarizeDomArch) 
export(summarizeDomArch_ByLineage) @@ -103,6 +105,7 @@ export(totalGenContextOrDomArchCounts) export(validateCountDF) export(wordcloud3) export(write.MsaAAMultipleAlignment) +export(writeMSA_AA2FA) export(write_proc_medians_table) export(write_proc_medians_yml) importFrom(Biostrings,AAStringSet) diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 44979c3c..fed495f4 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -546,7 +546,7 @@ acc2fa <- function(accessions, outpath, plan = "sequential") { return(result) } -#' RepresentativeAccNums +#' createRepresentativeAccNum #' #' @description #' Function to generate a vector of one Accession number per distinct observation from 'reduced' column @@ -566,7 +566,7 @@ acc2fa <- function(accessions, outpath, plan = "sequential") { #' @export #' #' @examples -RepresentativeAccNums <- function(prot_data, +createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column @@ -623,15 +623,15 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { ) if (typeof(outpath) == "character") { - write.MsaAAMultipleAlignment(aligned, outpath) + writeMSA_AA2FA(aligned, outpath) } return(aligned) } -#' write.MsaAAMultipleAlignment +#' writeMSA_AA2FA #' #' @description -#' Write MsaAAMultpleAlignment Objects as algined fasta sequence +#' Write MsaAAMultpleAlignment Objects as aligned fasta sequence #' MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega #' and msaMuscle from the 'msa' package #' @@ -647,7 +647,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @export #' #' @examples -write.MsaAAMultipleAlignment <- function(alignment, outpath) { +writeMSA_AA2FA <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" for (i in 1:l) @@ -660,7 +660,7 @@ write.MsaAAMultipleAlignment <- function(alignment, outpath) { 
return(fasta) } -#' get_accnums_from_fasta_file +#' getAccNumFromFA #' #' @param fasta_file #' @@ -671,7 +671,7 @@ write.MsaAAMultipleAlignment <- function(alignment, outpath) { #' @export #' #' @examples -get_accnums_from_fasta_file <- function(fasta_file) { +getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] return(accnums) diff --git a/R/reverse_operons.R b/R/reverse_operons.R index e4bbd50e..a2570e8d 100755 --- a/R/reverse_operons.R +++ b/R/reverse_operons.R @@ -3,7 +3,7 @@ # Modified by Janani Ravi and Samuel Chen -#' reveql +#' straightenOperonSeq #' #' @param prot #' @@ -11,7 +11,7 @@ #' @export #' #' @examples -reveql <- function(prot) { +straightenOperonSeq <- function(prot) { w <- prot # $GenContext.orig # was 'x' y <- rep(NA, length(w)) @@ -57,7 +57,7 @@ reveql <- function(prot) { ## The function to reverse operons -#' reverse_operon +#' reverseOperonSeq #' #' @param prot #' @@ -65,7 +65,7 @@ reveql <- function(prot) { #' @export #' #' @examples -reverse_operon <- function(prot) { +reverseOperonSeq <- function(prot) { gencontext <- prot$GenContext gencontext <- gsub(pattern = ">", replacement = ">|", x = gencontext) @@ -108,7 +108,7 @@ reverse_operon <- function(prot) { - ge <- lapply(1:length(ge), function(x) reveql(ge[[x]])) + ge <- lapply(1:length(ge), function(x) straightenOperonSeq(ge[[x]])) ye <- te[withouteq] @@ -141,4 +141,4 @@ reverse_operon <- function(prot) { # colnames(prot) <- c("AccNum","GenContext.orig","len", "GeneName","TaxID","Species") ## ??? 
straighten operons -# prot$GenContext.orig <- reverse_operon(prot) +# prot$GenContext.orig <- reverseOperonSeq(prot) diff --git a/man/RepresentativeAccNums.Rd b/man/RepresentativeAccNums.Rd index f617cde4..57d1f1ab 100644 --- a/man/RepresentativeAccNums.Rd +++ b/man/RepresentativeAccNums.Rd @@ -1,11 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R \name{RepresentativeAccNums} \alias{RepresentativeAccNums} \title{Function to generate a vector of one Accession number per distinct observation from 'reduced' column} \usage{ -RepresentativeAccNums(prot_data, reduced = "Lineage", accnum_col = "AccNum") - RepresentativeAccNums(prot_data, reduced = "Lineage", accnum_col = "AccNum") } \arguments{ diff --git a/man/countbycolumn.Rd b/man/countbycolumn.Rd deleted file mode 100644 index 34fcc3e0..00000000 --- a/man/countbycolumn.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{countByColumn} -\alias{countByColumn} -\title{Count By Column} -\usage{ -countByColumn(prot = prot, column = "DomArch", min.freq = 1) -} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Count By Column -} -\examples{ -\dontrun{ -countByColumn() -} -} diff --git a/man/createRepresentativeAccNum.Rd b/man/createRepresentativeAccNum.Rd new file mode 100644 index 00000000..3703fe1a --- /dev/null +++ b/man/createRepresentativeAccNum.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pre-msa-tree.R +\name{createRepresentativeAccNum} +\alias{createRepresentativeAccNum} +\title{createRepresentativeAccNum} +\usage{ +createRepresentativeAccNum( + prot_data, + reduced = "Lineage", + accnum_col = "AccNum" +) +} +\arguments{ +\item{prot_data}{Data frame containing Accession Numbers} + +\item{reduced}{Column from 
prot_data from which distinct observations +will be generated from. +One accession number will be assigned for each of these observations} + +\item{accnum_col}{Column from prot_data that contains Accession Numbers} +} +\description{ +Function to generate a vector of one Accession number per distinct observation from 'reduced' column +} +\author{ +Samuel Chen, Janani Ravi +} diff --git a/man/filterbydomains.Rd b/man/filterbydomains.Rd deleted file mode 100644 index 8c885363..00000000 --- a/man/filterbydomains.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByDomains} -\alias{filterByDomains} -\title{Filter by Domains} -\usage{ -filterByDomains( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) -} -\arguments{ -\item{prot}{Dataframe to filter} - -\item{column}{Column to search for domains in (DomArch column)} - -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} -} -\value{ -Filtered data frame -} -\description{ -filterByDomains filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain -} -\note{ -There is no need to make the domains 'regex safe', that will be handled by this function -} -\examples{ -\dontrun{ -filterByDomains() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/filterbyfrequency.Rd b/man/filterbyfrequency.Rd deleted file mode 100644 index d2c5f9cd..00000000 --- a/man/filterbyfrequency.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByFrequency} -\alias{filterByFrequency} -\title{Filter Frequency} -\usage{ -filterByFrequency(x, min.freq) 
-} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Filter Frequency -} -\examples{ -\dontrun{ -filterByFrequency() -} -} diff --git a/man/findparalogs.Rd b/man/findparalogs.Rd deleted file mode 100644 index 4b5edbcf..00000000 --- a/man/findparalogs.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{findParalogs} -\alias{findParalogs} -\title{Find Paralogs} -\usage{ -findParalogs(prot) -} -\arguments{ -\item{prot}{A data frame filtered by a Query, containing columns Species and Lineage} -} -\value{ -returns a dataframe containing paralogs and the counts. -} -\description{ -Creates a data frame of paralogs. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. -} -\examples{ -\dontrun{ -findParalogs(pspa) -} -} diff --git a/man/getAccNumFromFA.Rd b/man/getAccNumFromFA.Rd new file mode 100644 index 00000000..f2409965 --- /dev/null +++ b/man/getAccNumFromFA.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pre-msa-tree.R +\name{getAccNumFromFA} +\alias{getAccNumFromFA} +\title{getAccNumFromFA} +\usage{ +getAccNumFromFA(fasta_file) +} +\arguments{ +\item{fasta_file}{} +} +\description{ +getAccNumFromFA +} diff --git a/man/get_accnums_from_fasta_file.Rd b/man/get_accnums_from_fasta_file.Rd index 84c163cc..f545d1a0 100644 --- a/man/get_accnums_from_fasta_file.Rd +++ b/man/get_accnums_from_fasta_file.Rd @@ -1,11 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R \name{get_accnums_from_fasta_file} \alias{get_accnums_from_fasta_file} \title{Get accnums from fasta file} \usage{ -get_accnums_from_fasta_file(fasta_file) - get_accnums_from_fasta_file(fasta_file) } \arguments{ @@ -13,6 +11,4 @@ 
get_accnums_from_fasta_file(fasta_file) } \description{ Get accnums from fasta file - -get_accnums_from_fasta_file } diff --git a/man/reveql.Rd b/man/reverseOperonSeq.Rd similarity index 56% rename from man/reveql.Rd rename to man/reverseOperonSeq.Rd index 9dc2bcb8..d61ec5f2 100644 --- a/man/reveql.Rd +++ b/man/reverseOperonSeq.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/reverse_operons.R -\name{reveql} -\alias{reveql} -\title{reveql} +\name{reverseOperonSeq} +\alias{reverseOperonSeq} +\title{reverseOperonSeq} \usage{ -reveql(prot) +reverseOperonSeq(prot) } \arguments{ \item{prot}{} } \description{ -reveql +reverseOperonSeq } diff --git a/man/reverse_operon.Rd b/man/straightenOperonSeq.Rd similarity index 53% rename from man/reverse_operon.Rd rename to man/straightenOperonSeq.Rd index 270e2a62..fcd0c923 100644 --- a/man/reverse_operon.Rd +++ b/man/straightenOperonSeq.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/reverse_operons.R -\name{reverse_operon} -\alias{reverse_operon} -\title{reverse_operon} +\name{straightenOperonSeq} +\alias{straightenOperonSeq} +\title{straightenOperonSeq} \usage{ -reverse_operon(prot) +straightenOperonSeq(prot) } \arguments{ \item{prot}{} } \description{ -reverse_operon +straightenOperonSeq } diff --git a/man/summarizebylineage.Rd b/man/summarizebylineage.Rd deleted file mode 100644 index 2e445913..00000000 --- a/man/summarizebylineage.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeByLineage} -\alias{summarizeByLineage} -\title{Summarize by Lineage} -\usage{ -summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{query}{} -} -\value{ -Describe return, in detail -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), 
Lineage = c("l1", "l1", "l1", "l2")) |> - summarizeByLineage(query = "all") -} - -} diff --git a/man/totalgencontextordomarchcounts.Rd b/man/totalgencontextordomarchcounts.Rd deleted file mode 100644 index f457cb6a..00000000 --- a/man/totalgencontextordomarchcounts.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{totalGenContextOrDomArchCounts} -\alias{totalGenContextOrDomArchCounts} -\title{Total Counts} -\usage{ -totalGenContextOrDomArchCounts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} - -\item{digits}{} -} -\value{ -Define return, in detail -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. 
-} -\examples{ -\dontrun{ -totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wordcounts.Rd b/man/words2wordcounts.Rd deleted file mode 100644 index 7f60f226..00000000 --- a/man/words2wordcounts.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{words2WordCounts} -\alias{words2WordCounts} -\title{Words 2 Word Counts} -\usage{ -words2WordCounts(string) -} -\arguments{ -\item{string}{} -} -\value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency -} -\description{ -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2Words() |> - words2WordCounts() -} - -} diff --git a/man/write.MsaAAMultipleAlignment.Rd b/man/write.MsaAAMultipleAlignment.Rd index 17a05f50..e26f26e7 100644 --- a/man/write.MsaAAMultipleAlignment.Rd +++ b/man/write.MsaAAMultipleAlignment.Rd @@ -1,11 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R \name{write.MsaAAMultipleAlignment} \alias{write.MsaAAMultipleAlignment} \title{Write MsaAAMultpleAlignment Objects as algined fasta sequence} \usage{ -write.MsaAAMultipleAlignment(alignment, outpath) - write.MsaAAMultipleAlignment(alignment, outpath) } \arguments{ @@ -16,10 +14,6 @@ write.MsaAAMultipleAlignment(alignment, outpath) \description{ MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package - -Write MsaAAMultpleAlignment Objects as algined fasta sequence -MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega -and msaMuscle from the 'msa' package } \author{ Samuel Chen, Janani Ravi diff --git a/man/writeMSA_AA2FA.Rd b/man/writeMSA_AA2FA.Rd new file mode 100644 index 00000000..068e5b63 --- 
/dev/null +++ b/man/writeMSA_AA2FA.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pre-msa-tree.R +\name{writeMSA_AA2FA} +\alias{writeMSA_AA2FA} +\title{writeMSA_AA2FA} +\usage{ +writeMSA_AA2FA(alignment, outpath) +} +\arguments{ +\item{alignment}{MsaAAMultipleAlignment object to be written as a fasta} + +\item{outpath}{Where the resulting FASTA file should be written to} +} +\description{ +Write MsaAAMultpleAlignment Objects as aligned fasta sequence +MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega +and msaMuscle from the 'msa' package +} +\author{ +Samuel Chen, Janani Ravi +} From 38f3cb000ddf35028c1e7c940920dd051db1a2dc Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Wed, 9 Oct 2024 11:32:03 +0100 Subject: [PATCH 16/61] added error handling functionality for the run_deltablast and run_rpsblast functions. This includes arguments check before wrapping code logic in a tryCatch block. --- R/blastWrappers.R | 109 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 25 deletions(-) diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 552b1ff6..15484a1b 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -18,25 +18,56 @@ #' #' @examples run_deltablast <- function(deltablast_path, db_search_path, - db = "refseq", query, evalue = "1e-5", - out, num_alignments, num_threads = 1) { - start <- Sys.time() + db = "refseq", query, evalue = "1e-5", + out, num_alignments, num_threads = 1) { + # Argument validation + if (!file.exists(deltablast_path)) { + stop("The DELTABLAST executable path is invalid: ", deltablast_path) + } + if (!dir.exists(db_search_path)) { + stop("The database search path is invalid: ", db_search_path) + } + if (!file.exists(query)) { + stop("The query file path is invalid: ", query) + } + if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { + stop("The evalue must be a positive number: ", evalue) + } + if 
(!is.numeric(num_alignments) || num_alignments <= 0) { + stop("The number of alignments must be a + positive integer: ", num_alignments) + } + if (!is.numeric(num_threads) || num_threads <= 0) { + stop("The number of threads must be a positive integer: ", num_threads) + } + + start <- Sys.time() + + tryCatch({ system(paste0("export BLASTDB=/", db_search_path)) system2( - command = deltablast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads, - "-num_alignments", num_alignments - # ,"-outfmt", outfmt - ) + command = deltablast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads, + "-num_alignments", num_alignments + # ,"-outfmt", outfmt + ) ) print(Sys.time() - start) + }, error = function(e) { + message(paste("Error in run_deltablast: ", e)) + }, warning = function(w) { + message(paste("Warning in run_deltablast: ", w)) + }, finally = { + message("run_deltablast completed") + }) + } @@ -55,20 +86,48 @@ run_deltablast <- function(deltablast_path, db_search_path, #' #' @examples run_rpsblast <- function(rpsblast_path, db_search_path, - db = "refseq", query, evalue = "1e-5", - out, num_threads = 1) { - start <- Sys.time() + db = "refseq", query, evalue = "1e-5", + out, num_threads = 1) { + # Argument validation + if (!file.exists(rpsblast_path)) { + stop("The RPSBLAST executable path is invalid: ", rpsblast_path) + } + if (!dir.exists(db_search_path)) { + stop("The database search path is invalid: ", db_search_path) + } + if (!file.exists(query)) { + stop("The query file path is invalid: ", query) + } + if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { + stop("The evalue must be a positive number: ", evalue) + } + if (!is.numeric(num_threads) || num_threads <= 0) { + stop("The number of threads must be a positive integer: ", num_threads) + } + + start <- Sys.time() + + tryCatch({ + system(paste0("export BLASTDB=/", 
db_search_path)) + system2( - command = rpsblast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads - # , "-outfmt", outfmt - ) + command = rpsblast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads + ) ) print(Sys.time() - start) + }, error = function(e) { + message(paste("Error in run_rpsblast: ", e)) + }, warning = function(w) { + message(paste("Warning in run_rpsblast: ", w)) + }, finally = { + message("run_rpsblast completed") + }) + } From 527c470104805b093f7da3e9f45335f53945cb1a Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Wed, 9 Oct 2024 22:55:15 +0100 Subject: [PATCH 17/61] update CONTRIBUTING.md --- .github/CONTRIBUTING.md | 123 ++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 54 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 5db3f961..9fcd6b7f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -5,72 +5,87 @@ For a detailed discussion on contributing to this and other tidyverse packages, ## Fixing typos -You can fix typos, spelling mistakes, or grammatical errors in the documentation directly using the GitHub web interface, as long as the changes are made in the _source_ file. -This generally means you'll need to edit [roxygen2 comments](https://roxygen2.r-lib.org/articles/roxygen2.html) in an `.R`, not a `.Rd` file. +You can fix typos, spelling mistakes, or grammatical errors in the documentation directly using the GitHub web interface, as long as the changes are made in the _source_ file. +This generally means you'll need to edit [roxygen2 comments](https://roxygen2.r-lib.org/articles/roxygen2.html) in an `.R`, not a `.Rd` file. You can find the `.R` file that generates the `.Rd` by reading the comment in the first line. 
## Bigger changes -If you want to make a bigger change, it's a good idea to first file an issue and make sure someone from the team agrees that it’s needed. -If you’ve found a bug, please file an issue that illustrates the bug with a minimal +If you want to make a bigger change, it's a good idea to first file an issue and make sure someone from the team agrees that it’s needed. +If you’ve found a bug, please file an issue that illustrates the bug with a minimal [reprex](https://www.tidyverse.org/help/#reprex) (this will also help you write a unit test, if needed). See our guide on [how to create a great issue](https://code-review.tidyverse.org/issues/) for more advice. ### Pull request process -* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis`. - -* Install and load the `usethis` package with: - ``` - install.packages("usethis") - - library("usethis") - ``` -* Clone and fork the MolEvolvR package using: - ``` - usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE) - ``` -* Install all development dependencies and then make sure the package passes R CMD check using `devtools`: - ``` - install.packages("devtools") - - library("devtools") - - devtools::install_dev_deps() - - devtools::check() - ``` - _If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing._ - -* Create a Git branch for your pull request (PR). We recommend using - ``` - usethis::pr_init("brief-description-of-change") - ``` - -* Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser. - The title of your PR should briefly describe the change. - The body of your PR should contain `Fixes #issue-number`. - -* For user-facing changes, add a bullet to the top of `NEWS.md` (i.e. just below the first header). Follow the style described in . +- Fork the package and clone onto your computer. 
If you haven't done this before, we recommend using `usethis`. + +- Install and load the `usethis` package with: + + ``` + install.packages("usethis") + + library("usethis") + ``` + +- Clone and fork the MolEvolvR package using: + ``` + usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE) + ``` +- Install Bioconductor dependencies: + + ``` + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install(version = "3.19") + ``` + +- Install other development dependencies and then ensure that the package passes R CMD check using `devtools`: + + ``` + install.packages("devtools") + + library("devtools") + + devtools::install_dev_deps() + + devtools::check() + ``` + + _If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing._ + +- Create a Git branch for your pull request (PR). We recommend using: + + ``` + usethis::pr_init("brief-description-of-change") + ``` + +- Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser. + The title of your PR should briefly describe the change. + The body of your PR should contain `Fixes #issue-number`. + + + + ### Code style -* New code should follow the tidyverse [style guide](https://style.tidyverse.org). - You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR. - -* Lint Your Code: Ensure your code adheres to our style guidelines by using [lintr](https://lintr.r-lib.org/): - ``` - install.packages("lintr") - - library("lintr") - - lintr::lint("path/to/your/file.R") - ``` - -* We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. - -* We use [testthat](https://cran.r-project.org/package=testthat) for unit tests. 
- Contributions with test cases included are easier to accept. +- New code should follow the tidyverse [style guide](https://style.tidyverse.org). + You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR. +- Lint Your Code: Ensure your code adheres to our style guidelines by using [lintr](https://lintr.r-lib.org/): + + ``` + install.packages("lintr") + + library("lintr") + + lintr::lint("path/to/your/file.R") + ``` + +- We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. + +- We use [testthat](https://cran.r-project.org/package=testthat) for unit tests. + Contributions with test cases included are easier to accept. ## Code of Conduct From 4ff68fb06395842093879dea47e45aaae1967225 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 08:27:02 +0100 Subject: [PATCH 18/61] Reverting to old function names for the following functions to create a separate pr for their updates and on a different branch: R/combine_analysis.R combine_full combine_ipr R/combine_files.R combine_files R/create_lineage_lookup.R create_lineage_lookup shorten_NA --- R/combine_analysis.R | 4 ++-- R/combine_files.R | 10 +++++----- R/create_lineage_lookup.R | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/combine_analysis.R b/R/combine_analysis.R index 58ce1f14..bb3b3ce2 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,7 +17,7 @@ #' @export #' #' @examples -combineFullAnalysis <- function(inpath, ret = FALSE) { +combine_full <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combine_files(inpath, pattern = "*.full_analysis.tsv", skip = 0, @@ -44,7 +44,7 @@ combineFullAnalysis <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combineIPR <- function(inpath, 
ret = FALSE) { +combine_ipr <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combine_files(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, diff --git a/R/combine_files.R b/R/combine_files.R index 455ddd53..76c5fa09 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combineFiles(inpath, +# full_combnd <- combine_files(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combineFiles(inpath, +# cln_combnd <- combine_files(inpath, # pattern="^.*cln.txt", skip=0, # col_names=T) # @@ -86,14 +86,14 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ # ## Less helpful examples! # ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combineFiles(inpath, +# cl_blast_combnd <- combine_files(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! 
-# ipr_combnd <- combineFiles(inpath, +# ipr_combnd <- combine_files(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index d911934a..8e365cbb 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), +create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - .shortenNA <- function(Lineage) { + shorten_NA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, .shortenNA))) + mutate(Lineage = unlist(map(Lineage, shorten_NA))) @@ -101,7 +101,7 @@ createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), -#' CreateLineageLookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") +#' create_lineage_lookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") #' { #' #' Create a look up table that goes from GCA_ID, to TaxID, to Lineage #' #' @author Samuel Chen From 035c5e13b4cfe54b4ba7ff1d5c7618ade13720d1 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 08:41:47 +0100 Subject: [PATCH 19/61] minor updates to namespace and Rd files after running devtool::check() --- NAMESPACE | 8 ++++---- man/{combineFiles.Rd => combine_files.Rd} | 6 +++--- man/{combineFullAnalysis.Rd => combine_full.Rd} | 6 +++--- man/{combineIPR.Rd => combine_ipr.Rd} | 6 +++--- man/{createLineageLookup.Rd => create_lineage_lookup.Rd} | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) rename man/{combineFiles.Rd => 
combine_files.Rd} (92%) rename man/{combineFullAnalysis.Rd => combine_full.Rd} (69%) rename man/{combineIPR.Rd => combine_ipr.Rd} (74%) rename man/{createLineageLookup.Rd => create_lineage_lookup.Rd} (91%) diff --git a/NAMESPACE b/NAMESPACE index cd135cc8..f49975b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,9 +26,9 @@ export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) export(cleanSpecies) -export(combineFiles) -export(combineFullAnalysis) -export(combineIPR) +export(combine_files) +export(combine_full) +export(combine_ipr) export(condenseRepeatedDomains) export(convert2TitleCase) export(convertAlignment2FA) @@ -37,8 +37,8 @@ export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) -export(createLineageLookup) export(create_all_col_params) +export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) diff --git a/man/combineFiles.Rd b/man/combine_files.Rd similarity index 92% rename from man/combineFiles.Rd rename to man/combine_files.Rd index 3b56b923..4126eb9e 100644 --- a/man/combineFiles.Rd +++ b/man/combine_files.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combineFiles} -\alias{combineFiles} +\name{combine_files} +\alias{combine_files} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combineFiles( +combine_files( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combineFullAnalysis.Rd b/man/combine_full.Rd similarity index 69% rename from man/combineFullAnalysis.Rd rename to man/combine_full.Rd index 35925e86..f4e6597b 100644 --- a/man/combineFullAnalysis.Rd +++ b/man/combine_full.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combineFullAnalysis} -\alias{combineFullAnalysis} +\name{combine_full} 
+\alias{combine_full} \title{Combining full_analysis files} \usage{ -combineFullAnalysis(inpath, ret = FALSE) +combine_full(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combineIPR.Rd b/man/combine_ipr.Rd similarity index 74% rename from man/combineIPR.Rd rename to man/combine_ipr.Rd index 035c4274..52aa3057 100644 --- a/man/combineIPR.Rd +++ b/man/combine_ipr.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combineIPR} -\alias{combineIPR} +\name{combine_ipr} +\alias{combine_ipr} \title{Combining clean ipr files} \usage{ -combineIPR(inpath, ret = FALSE) +combine_ipr(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/createLineageLookup.Rd b/man/create_lineage_lookup.Rd similarity index 91% rename from man/createLineageLookup.Rd rename to man/create_lineage_lookup.Rd index 5dbab978..51670f35 100644 --- a/man/createLineageLookup.Rd +++ b/man/create_lineage_lookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{createLineageLookup} -\alias{createLineageLookup} +\name{create_lineage_lookup} +\alias{create_lineage_lookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -createLineageLookup( +create_lineage_lookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" From fb5ac23f8a3e8e5709498aa24308a950802d1c29 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 09:20:22 +0100 Subject: [PATCH 20/61] Renamed the following function; R/combine_analysis.R combine_full combine_ipr R/combine_files.R combine_files R/create_lineage_lookup.R create_lineage_lookup shorten_NA with approved names from #44 --- NAMESPACE | 8 ++++---- R/acc2lin.R | 2 +- R/combine_analysis.R | 8 ++++---- R/combine_files.R | 10 +++++----- R/create_lineage_lookup.R | 8 ++++---- R/lineage.R | 4 ++-- man/GCA2lin.Rd | 2 +- man/{combine_files.Rd => 
combineFiles.Rd} | 6 +++--- man/{combine_full.Rd => combineFullAnalysis.Rd} | 6 +++--- man/{combine_ipr.Rd => combineIPR.Rd} | 6 +++--- ...create_lineage_lookup.Rd => createLineageLookup.Rd} | 6 +++--- man/ipg2lin.Rd | 2 +- 12 files changed, 34 insertions(+), 34 deletions(-) rename man/{combine_files.Rd => combineFiles.Rd} (92%) rename man/{combine_full.Rd => combineFullAnalysis.Rd} (69%) rename man/{combine_ipr.Rd => combineIPR.Rd} (74%) rename man/{create_lineage_lookup.Rd => createLineageLookup.Rd} (91%) diff --git a/NAMESPACE b/NAMESPACE index f49975b4..cd135cc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,9 +26,9 @@ export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) export(cleanSpecies) -export(combine_files) -export(combine_full) -export(combine_ipr) +export(combineFiles) +export(combineFullAnalysis) +export(combineIPR) export(condenseRepeatedDomains) export(convert2TitleCase) export(convertAlignment2FA) @@ -37,8 +37,8 @@ export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) +export(createLineageLookup) export(create_all_col_params) -export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) diff --git a/R/acc2lin.R b/R/acc2lin.R index dfb33da9..a6551247 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -277,7 +277,7 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") { #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). 
This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' #' @importFrom data.table fread #' diff --git a/R/combine_analysis.R b/R/combine_analysis.R index bb3b3ce2..55e36925 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,9 +17,9 @@ #' @export #' #' @examples -combine_full <- function(inpath, ret = FALSE) { +combineFullAnalysis <- function(inpath, ret = FALSE) { ## Combining full_analysis files - full_combnd <- combine_files(inpath, + full_combnd <- combineFiles(inpath, pattern = "*.full_analysis.tsv", skip = 0, col_names = T ) @@ -44,9 +44,9 @@ combine_full <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combine_ipr <- function(inpath, ret = FALSE) { +combineIPR <- function(inpath, ret = FALSE) { ## Combining clean ipr files - ipr_combnd <- combine_files(inpath, + ipr_combnd <- combineFiles(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, col_names = T ) diff --git a/R/combine_files.R b/R/combine_files.R index 76c5fa09..455ddd53 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combine_files(inpath, +# full_combnd <- combineFiles(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combine_files(inpath, +# cln_combnd <- combineFiles(inpath, # pattern="^.*cln.txt", skip=0, # 
col_names=T) # @@ -86,14 +86,14 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # ## Less helpful examples! # ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combine_files(inpath, +# cl_blast_combnd <- combineFiles(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! -# ipr_combnd <- combine_files(inpath, +# ipr_combnd <- combineFiles(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index 8e365cbb..78e79048 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), +createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - shorten_NA <- function(Lineage) { + .shortenNA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, shorten_NA))) + mutate(Lineage = unlist(map(Lineage, .shortenNA))) @@ -101,7 +101,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), -#' create_lineage_lookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") +#' createLineageLookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") #' { #' #' Create a look up table that goes from GCA_ID, to TaxID, to Lineage #' #' @author Samuel Chen diff --git a/R/lineage.R 
b/R/lineage.R index 20acec04..7ceed847 100644 --- a/R/lineage.R +++ b/R/lineage.R @@ -77,7 +77,7 @@ DownloadAssemblySummary <- function(outpath, #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' @param acc_col #' #' @importFrom dplyr pull @@ -309,7 +309,7 @@ efetch_ipg <- function(accessions, out_path, plan = "multicore") { #' @param genbank_assembly_path #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' #' @importFrom data.table fread setnames #' diff --git a/man/GCA2lin.Rd b/man/GCA2lin.Rd index ad83ca39..47acc3d7 100644 --- a/man/GCA2lin.Rd +++ b/man/GCA2lin.Rd @@ -19,7 +19,7 @@ This file can be generated using the "DownloadAssemblySummary()" function} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). 
This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{acc_col}{} } diff --git a/man/combine_files.Rd b/man/combineFiles.Rd similarity index 92% rename from man/combine_files.Rd rename to man/combineFiles.Rd index 4126eb9e..3b56b923 100644 --- a/man/combine_files.Rd +++ b/man/combineFiles.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combine_files} -\alias{combine_files} +\name{combineFiles} +\alias{combineFiles} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combine_files( +combineFiles( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combine_full.Rd b/man/combineFullAnalysis.Rd similarity index 69% rename from man/combine_full.Rd rename to man/combineFullAnalysis.Rd index f4e6597b..35925e86 100644 --- a/man/combine_full.Rd +++ b/man/combineFullAnalysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_full} -\alias{combine_full} +\name{combineFullAnalysis} +\alias{combineFullAnalysis} \title{Combining full_analysis files} \usage{ -combine_full(inpath, ret = FALSE) +combineFullAnalysis(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combine_ipr.Rd b/man/combineIPR.Rd similarity index 74% rename from man/combine_ipr.Rd rename to man/combineIPR.Rd index 52aa3057..035c4274 100644 --- a/man/combine_ipr.Rd +++ b/man/combineIPR.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_ipr} -\alias{combine_ipr} +\name{combineIPR} +\alias{combineIPR} \title{Combining clean ipr files} \usage{ -combine_ipr(inpath, ret = FALSE) +combineIPR(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/create_lineage_lookup.Rd b/man/createLineageLookup.Rd 
similarity index 91% rename from man/create_lineage_lookup.Rd rename to man/createLineageLookup.Rd index 51670f35..5dbab978 100644 --- a/man/create_lineage_lookup.Rd +++ b/man/createLineageLookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{create_lineage_lookup} -\alias{create_lineage_lookup} +\name{createLineageLookup} +\alias{createLineageLookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -create_lineage_lookup( +createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd index 453668b0..5850e86c 100644 --- a/man/ipg2lin.Rd +++ b/man/ipg2lin.Rd @@ -29,7 +29,7 @@ file} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{assembly_path}{String of the path to the assembly_summary path This file can be generated using the "DownloadAssemblySummary()" function} From 106eb14b4e2eace66737a07cf5840011e490d116 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 10:24:49 +0100 Subject: [PATCH 21/61] reverting to old function names; make_opts2procs, map_advanced_opts2procs, get_proc_medians, write_proc_medians_table, write_proc_medians_yml, get_proc_weights, advanced_opts2est_walltime in R/assign_job_queue.R to be updated in a separate full request --- NAMESPACE | 18 ++-- R/assign_job_queue.R | 84 +++++++++---------- ...tions.Rd => advanced_opts2est_walltime.Rd} | 10 +-- ...{assignJobQueue.Rd => assign_job_queue.Rd} | 12 +-- ...eProcessRuntime.Rd => get_proc_medians.Rd} | 10 +-- ...sRuntimeWeights.Rd => get_proc_weights.Rd} | 8 +- ...apOption2Process.Rd => make_opts2procs.Rd} | 8 +- ...2Process.Rd => map_advanced_opts2procs.Rd} | 8 +- ...llTimes.Rd => plot_estimated_walltimes.Rd} | 8 +- ...ime2TSV.Rd 
=> write_proc_medians_table.Rd} | 8 +- ...timeToYML.Rd => write_proc_medians_yml.Rd} | 10 +-- 11 files changed, 92 insertions(+), 92 deletions(-) rename man/{calculateEstimatedWallTimeFromOptions.Rd => advanced_opts2est_walltime.Rd} (73%) rename man/{assignJobQueue.Rd => assign_job_queue.Rd} (68%) rename man/{calculateProcessRuntime.Rd => get_proc_medians.Rd} (76%) rename man/{getProcessRuntimeWeights.Rd => get_proc_weights.Rd} (73%) rename man/{mapOption2Process.Rd => make_opts2procs.Rd} (75%) rename man/{mapAdvOption2Process.Rd => map_advanced_opts2procs.Rd} (76%) rename man/{plotEstimatedWallTimes.Rd => plot_estimated_walltimes.Rd} (77%) rename man/{writeProcessRuntime2TSV.Rd => write_proc_medians_table.Rd} (77%) rename man/{writeProcessRuntimeToYML.Rd => write_proc_medians_yml.Rd} (74%) diff --git a/NAMESPACE b/NAMESPACE index f49975b4..b4be51ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,11 +15,10 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) +export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assignJobQueue) -export(calculateEstimatedWallTimeFromOptions) -export(calculateProcessRuntime) +export(assign_job_queue) export(cleanClusters) export(cleanDomainArchitecture) export(cleanGeneDescription) @@ -54,9 +53,10 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) -export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) +export(get_proc_medians) +export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -66,12 +66,12 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) +export(make_opts2procs) export(mapAcc2Name) -export(mapAdvOption2Process) -export(mapOption2Process) export(map_acc2name) +export(map_advanced_opts2procs) export(msa_pdf) -export(plotEstimatedWallTimes) +export(plot_estimated_walltimes) export(prot2tax) 
export(prot2tax_old) export(removeAsterisks) @@ -103,8 +103,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(writeProcessRuntime2TSV) -export(writeProcessRuntimeToYML) +export(write_proc_medians_table) +export(write_proc_medians_yml) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index f1fcb6db..c531fb09 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,16 +3,16 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") +# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- mapOption2Process +#' example: list_opts2procs <- make_opts2procs #' @export -mapOption2Process <- function() { +make_opts2procs <- function() { tryCatch({ opts2processes <- list( "homology_search" = c("dblast", "dblast_cleanup"), @@ -26,7 +26,7 @@ mapOption2Process <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("mapOption2Process function execution completed.") + message("make_opts2procs function execution completed.") }) } @@ -40,16 +40,16 @@ mapOption2Process <- function() { #' #' example: #' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- mapAdvOption2Process(advanced_opts) +#' procs <- map_advanced_opts2procs(advanced_opts) #' @export -mapAdvOption2Process <- function(advanced_opts) { +map_advanced_opts2procs <- function(advanced_opts) { if (!is.character(advanced_opts)) { stop("Argument must 
be a character vector!") } tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- mapOption2Process() + opts2proc <- make_opts2procs() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run @@ -60,7 +60,7 @@ mapAdvOption2Process <- function(advanced_opts) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("mapOption2Process function execution completed.") + message("make_opts2procs function execution completed.") }) } @@ -80,14 +80,14 @@ mapAdvOption2Process <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- calculateProcessRuntime(dir_job_results) +#' list_proc_medians <- get_proc_medians(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- calculateProcessRuntime(dir_job_results) +#' list_proc_medians <- get_proc_medians(dir_job_results) #' @export -calculateProcessRuntime <- function(dir_job_results) { +get_proc_medians <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -139,7 +139,7 @@ calculateProcessRuntime <- function(dir_job_results) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("calculateProcessRuntime function execution completed.") + message("get_proc_medians function execution completed.") }) } @@ -156,12 +156,12 @@ calculateProcessRuntime <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: writeProcessRuntime2TSV( +#' example: write_proc_medians_table( #' "/data/scratch/janani/molevolvr_out/", #' 
"/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -writeProcessRuntime2TSV <- function(dir_job_results, filepath) { +write_proc_medians_table <- function(dir_job_results, filepath) { tryCatch({ # Error handling for input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -175,7 +175,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { if (!is.character(filepath) || length(filepath) != 1) { stop("Input 'filepath' must be a single character string.") } - df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + df_proc_medians <- get_proc_medians(dir_job_results) |> tibble::as_tibble() |> tidyr::pivot_longer( dplyr::everything(), @@ -192,7 +192,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("writeProcessRuntime2TSV function execution completed.") + message("write_proc_medians_table function execution completed.") }) } @@ -201,7 +201,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default #' read location. 
#' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -212,13 +212,13 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' #' @examples #' \dontrun{ -#' writeProcessRuntimeToYML( +#' write_proc_medians_yml( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { +write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -238,7 +238,7 @@ writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { stop("Input 'filepath' must be a single character string.") } - medians <- calculateProcessRuntime(dir_job_results) + medians <- get_proc_medians(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { message(paste("Encountered an error: "), e$message) @@ -261,9 +261,9 @@ writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: writeProcessRuntimeToYML() +#' example: write_proc_medians_yml() #' @export -getProcessRuntimeWeights <- function(medians_yml_path = NULL) { +get_proc_weights <- function(medians_yml_path = NULL) { if (is.null(medians_yml_path)) { medians_yml_path <- file.path(common_root, "molevol_scripts", @@ -273,7 +273,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { proc_weights <- tryCatch({ # attempt to read the weights from the YAML file produced by - # writeProcessRuntimeToYML() + # write_proc_medians_yml() if (stringr::str_trim(medians_yml_path) == "") { stop( stringr::str_glue("medians_yml_path is empty @@ -285,7 +285,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { }, # to avoid fatal errors in reading the proc weights yaml, # some median process runtimes have been 
hardcoded based on - # the result of calculateProcessRuntime() from Jan 2024 + # the result of get_proc_medians() from Jan 2024 error = function(cond) { proc_weights <- list( "dblast" = 2810, @@ -306,7 +306,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see mapOption2Process for the options) +#' (see make_opts2procs for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -314,11 +314,11 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: calculateEstimatedWallTimeFromOptions(c("homology_search", +#' example: advanced_opts2est_walltime (c("homology_search", #' "domain_architecture"), #' n_inputs = 3, n_hits = 50L) #' @export -calculateEstimatedWallTimeFromOptions <- function(advanced_opts, +advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { @@ -348,7 +348,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, } # Get process weights - proc_weights <- writeProcessRuntimeToYML() + proc_weights <- write_proc_medians_yml() if (!is.list(proc_weights)) { stop("Process weights could not be retrieved correctly.") } @@ -357,7 +357,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- mapAdvOption2Process(advanced_opts) + procs_from_opts <- map_advanced_opts2procs(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) @@ -366,7 +366,7 @@ calculateEstimatedWallTimeFromOptions <- 
function(advanced_opts, as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- mapOption2Process() + opts2procs <- make_opts2procs() # exclude the homology search processes for the homologous hits procs2exclude_for_homologs <- opts2procs[["homology_search"]] procs_homologs <- procs_from_opts[!(procs_from_opts @@ -380,7 +380,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, } if (verbose) { msg <- stringr::str_glue( - "warnings from calculateEstimatedWallTimeFromOptions():\n", + "warnings from advanced_opts2est_walltime ():\n", "\tn_inputs={n_inputs}\n", "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", "\test_walltime={est_walltime}\n\n" @@ -393,7 +393,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("calculateEstimatedWallTimeFromOptions + message("advanced_opts2est_walltime function execution completed.") }) @@ -403,18 +403,18 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from calculateEstimatedWallTimeFromOptions()) +#' (from advanced_opts2est_walltime ()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' calculateEstimatedWallTimeFromOptions(c("homology_search", +#' advanced_opts2est_walltime (c("homology_search", #' "domain_architecture"), 3) |> -#' assignJobQueue() +#' assign_job_queue() #' @export -assignJobQueue <- function( +assign_job_queue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { @@ -434,7 +434,7 @@ assignJobQueue <- function( }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("assignJobQueue function execution 
completed.") + message("assign_job_queue function execution completed.") }) } @@ -451,13 +451,13 @@ assignJobQueue <- function( #' @return line plot object #' #' example: -#' p <- plotEstimatedWallTimes() +#' p <- plot_estimated_walltimes() #' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plotEstimatedWallTimes <- function() { +plot_estimated_walltimes <- function() { tryCatch({ - opts <- mapOption2Process() |> names() + opts <- make_opts2procs() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { # generate powerset (do not include empty set) @@ -482,7 +482,7 @@ plotEstimatedWallTimes <- function() { } else { NULL } - est_walltime <- calculateEstimatedWallTimeFromOptions( + est_walltime <- advanced_opts2est_walltime ( advanced_opts, n_inputs = i, n_hits = n_hits, @@ -541,7 +541,7 @@ plotEstimatedWallTimes <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("plotEstimatedWallTimes function execution completed.") + message("plot_estimated_walltimes function execution completed.") }) } diff --git a/man/calculateEstimatedWallTimeFromOptions.Rd b/man/advanced_opts2est_walltime.Rd similarity index 73% rename from man/calculateEstimatedWallTimeFromOptions.Rd rename to man/advanced_opts2est_walltime.Rd index e4eec3fd..02ae9621 100644 --- a/man/calculateEstimatedWallTimeFromOptions.Rd +++ b/man/advanced_opts2est_walltime.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{calculateEstimatedWallTimeFromOptions} -\alias{calculateEstimatedWallTimeFromOptions} +\name{advanced_opts2est_walltime} +\alias{advanced_opts2est_walltime} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -calculateEstimatedWallTimeFromOptions( 
+advanced_opts2est_walltime( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,14 @@ calculateEstimatedWallTimeFromOptions( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see mapOption2Process for the options)} +(see make_opts2procs for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: calculateEstimatedWallTimeFromOptions(c("homology_search", +example: advanced_opts2est_walltime (c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) } diff --git a/man/assignJobQueue.Rd b/man/assign_job_queue.Rd similarity index 68% rename from man/assignJobQueue.Rd rename to man/assign_job_queue.Rd index 27511b6a..d2650fed 100644 --- a/man/assignJobQueue.Rd +++ b/man/assign_job_queue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assignJobQueue} -\alias{assignJobQueue} +\name{assign_job_queue} +\alias{assign_job_queue} \title{Decision function to assign job queue} \usage{ -assignJobQueue(t_sec_estimate, t_cutoff = 21600) +assign_job_queue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will process -(from calculateEstimatedWallTimeFromOptions())} +(from advanced_opts2est_walltime ())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,9 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -calculateEstimatedWallTimeFromOptions(c("homology_search", +advanced_opts2est_walltime (c("homology_search", "domain_architecture"), 3) |> -assignJobQueue() +assign_job_queue() } \description{ Decision function to assign job queue diff --git a/man/calculateProcessRuntime.Rd b/man/get_proc_medians.Rd similarity index 76% rename from man/calculateProcessRuntime.Rd rename to man/get_proc_medians.Rd index bb6dd1ed..b6db0b56 100644 --- 
a/man/calculateProcessRuntime.Rd +++ b/man/get_proc_medians.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{calculateProcessRuntime} -\alias{calculateProcessRuntime} +\name{get_proc_medians} +\alias{get_proc_medians} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -calculateProcessRuntime(dir_job_results) +get_proc_medians(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- calculateProcessRuntime(dir_job_results) +list_proc_medians <- get_proc_medians(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- calculateProcessRuntime(dir_job_results) +list_proc_medians <- get_proc_medians(dir_job_results) } } \description{ diff --git a/man/getProcessRuntimeWeights.Rd b/man/get_proc_weights.Rd similarity index 73% rename from man/getProcessRuntimeWeights.Rd rename to man/get_proc_weights.Rd index 8eff0347..f48585cc 100644 --- a/man/getProcessRuntimeWeights.Rd +++ b/man/get_proc_weights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{getProcessRuntimeWeights} -\alias{getProcessRuntimeWeights} +\name{get_proc_weights} +\alias{get_proc_weights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -getProcessRuntimeWeights(medians_yml_path = NULL) +get_proc_weights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: writeProcessRuntimeToYML() +example: write_proc_medians_yml() } \description{ Quickly get 
the runtime weights for MolEvolvR backend processes diff --git a/man/mapOption2Process.Rd b/man/make_opts2procs.Rd similarity index 75% rename from man/mapOption2Process.Rd rename to man/make_opts2procs.Rd index ff6905c5..07e208b2 100644 --- a/man/mapOption2Process.Rd +++ b/man/make_opts2procs.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{mapOption2Process} -\alias{mapOption2Process} +\name{make_opts2procs} +\alias{make_opts2procs} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -mapOption2Process() +make_opts2procs() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- mapOption2Process +example: list_opts2procs <- make_opts2procs } \description{ Construct list where names (MolEvolvR advanced options) point to processes diff --git a/man/mapAdvOption2Process.Rd b/man/map_advanced_opts2procs.Rd similarity index 76% rename from man/mapAdvOption2Process.Rd rename to man/map_advanced_opts2procs.Rd index 5bd9ee65..631708b4 100644 --- a/man/mapAdvOption2Process.Rd +++ b/man/map_advanced_opts2procs.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{mapAdvOption2Process} -\alias{mapAdvOption2Process} +\name{map_advanced_opts2procs} +\alias{map_advanced_opts2procs} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -mapAdvOption2Process(advanced_opts) +map_advanced_opts2procs(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- mapAdvOption2Process(advanced_opts) +procs <- map_advanced_opts2procs(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/plotEstimatedWallTimes.Rd 
b/man/plot_estimated_walltimes.Rd similarity index 77% rename from man/plotEstimatedWallTimes.Rd rename to man/plot_estimated_walltimes.Rd index 0d53cb32..884fed50 100644 --- a/man/plotEstimatedWallTimes.Rd +++ b/man/plot_estimated_walltimes.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plotEstimatedWallTimes} -\alias{plotEstimatedWallTimes} +\name{plot_estimated_walltimes} +\alias{plot_estimated_walltimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plotEstimatedWallTimes() +plot_estimated_walltimes() } \value{ line plot object example: -p <- plotEstimatedWallTimes() +p <- plot_estimated_walltimes() ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } diff --git a/man/writeProcessRuntime2TSV.Rd b/man/write_proc_medians_table.Rd similarity index 77% rename from man/writeProcessRuntime2TSV.Rd rename to man/write_proc_medians_table.Rd index 03cbbd68..2ae7a97b 100644 --- a/man/writeProcessRuntime2TSV.Rd +++ b/man/write_proc_medians_table.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{writeProcessRuntime2TSV} -\alias{writeProcessRuntime2TSV} +\name{write_proc_medians_table} +\alias{write_proc_medians_table} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -writeProcessRuntime2TSV(dir_job_results, filepath) +write_proc_medians_table(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ writeProcessRuntime2TSV(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: writeProcessRuntime2TSV( +example: write_proc_medians_table( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git 
a/man/writeProcessRuntimeToYML.Rd b/man/write_proc_medians_yml.Rd similarity index 74% rename from man/writeProcessRuntimeToYML.Rd rename to man/write_proc_medians_yml.Rd index e4a5c8ad..74757f1f 100644 --- a/man/writeProcessRuntimeToYML.Rd +++ b/man/write_proc_medians_yml.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{writeProcessRuntimeToYML} -\alias{writeProcessRuntimeToYML} +\name{write_proc_medians_yml} +\alias{write_proc_medians_yml} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -writeProcessRuntimeToYML(dir_job_results, filepath = NULL) +write_proc_medians_yml(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} @@ -15,12 +15,12 @@ uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default read location. 
} \examples{ \dontrun{ -writeProcessRuntimeToYML( +write_proc_medians_yml( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From a543898c8579065cbe3125f40b8cdf66200fc06f Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 11:00:41 +0100 Subject: [PATCH 22/61] Renamed the following functions in R/assign_job_queue.R; MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | Original | Modified | User Facing | |---------------------------------|----------------------------------|----------------------------------| | assign_job_queue | assignJobQueue | ✔️ | | make_opts2procs | mapOption2Process | ✔️ | | map_advanced_opts2procs | mapAdvOption2Process | ✔️ | | get_proc_medians | calculateProcessRuntime | ✔️ | | write_proc_medians_table | writeProcessRuntime2TSV | ✔️ | | write_proc_medians_yml | writeProcessRuntime2YML | ✔️ | | get_proc_weights | getProcessRuntimeWeights | ✔️ | | advanced_opts2est_walltime | calculateEstimatedWallTimeFromOpts| ✔️ | | plot_estimated_walltimes | plotEstimatedWallTimes | ✔️ | --- NAMESPACE | 18 ++-- R/assign_job_queue.R | 86 +++++++++---------- ...{assign_job_queue.Rd => assignJobQueue.Rd} | 12 +-- ... 
=> calculateEstimatedWallTimeFromOpts.Rd} | 10 +-- ..._medians.Rd => calculateProcessRuntime.Rd} | 10 +-- ...weights.Rd => getProcessRuntimeWeights.Rd} | 8 +- ..._opts2procs.Rd => mapAdvOption2Process.Rd} | 8 +- ...ake_opts2procs.Rd => mapOption2Process.Rd} | 8 +- ...walltimes.Rd => plotEstimatedWallTimes.Rd} | 8 +- ...ns_table.Rd => writeProcessRuntime2TSV.Rd} | 8 +- ...ians_yml.Rd => writeProcessRuntime2YML.Rd} | 10 +-- 11 files changed, 93 insertions(+), 93 deletions(-) rename man/{assign_job_queue.Rd => assignJobQueue.Rd} (68%) rename man/{advanced_opts2est_walltime.Rd => calculateEstimatedWallTimeFromOpts.Rd} (74%) rename man/{get_proc_medians.Rd => calculateProcessRuntime.Rd} (76%) rename man/{get_proc_weights.Rd => getProcessRuntimeWeights.Rd} (73%) rename man/{map_advanced_opts2procs.Rd => mapAdvOption2Process.Rd} (76%) rename man/{make_opts2procs.Rd => mapOption2Process.Rd} (75%) rename man/{plot_estimated_walltimes.Rd => plotEstimatedWallTimes.Rd} (77%) rename man/{write_proc_medians_table.Rd => writeProcessRuntime2TSV.Rd} (77%) rename man/{write_proc_medians_yml.Rd => writeProcessRuntime2YML.Rd} (74%) diff --git a/NAMESPACE b/NAMESPACE index c811bac3..65cc791e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,10 +15,11 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) -export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assign_job_queue) +export(assignJobQueue) +export(calculateEstimatedWallTimeFromOpts) +export(calculateProcessRuntime) export(cleanClusters) export(cleanDomainArchitecture) export(cleanGeneDescription) @@ -53,10 +54,9 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) +export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) -export(get_proc_medians) -export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -66,12 +66,12 @@ export(lineage.domain_repeats.plot) 
export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) -export(make_opts2procs) export(mapAcc2Name) +export(mapAdvOption2Process) +export(mapOption2Process) export(map_acc2name) -export(map_advanced_opts2procs) export(msa_pdf) -export(plot_estimated_walltimes) +export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) export(removeAsterisks) @@ -103,8 +103,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(write_proc_medians_table) -export(write_proc_medians_yml) +export(writeProcessRuntime2TSV) +export(writeProcessRuntime2YML) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index c531fb09..10df1e3a 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,16 +3,16 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") +# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- make_opts2procs +#' example: list_opts2procs <- mapOption2Process #' @export -make_opts2procs <- function() { +mapOption2Process <- function() { tryCatch({ opts2processes <- list( "homology_search" = c("dblast", "dblast_cleanup"), @@ -26,7 +26,7 @@ make_opts2procs <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("make_opts2procs function execution completed.") + message("mapOption2Process function execution completed.") }) } @@ -40,16 +40,16 @@ make_opts2procs <- function() { #' #' example: 
#' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- map_advanced_opts2procs(advanced_opts) +#' procs <- mapAdvOption2Process(advanced_opts) #' @export -map_advanced_opts2procs <- function(advanced_opts) { +mapAdvOption2Process <- function(advanced_opts) { if (!is.character(advanced_opts)) { stop("Argument must be a character vector!") } tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- make_opts2procs() + opts2proc <- mapOption2Process() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run @@ -60,7 +60,7 @@ map_advanced_opts2procs <- function(advanced_opts) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("make_opts2procs function execution completed.") + message("mapOption2Process function execution completed.") }) } @@ -80,14 +80,14 @@ map_advanced_opts2procs <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' @export -get_proc_medians <- function(dir_job_results) { +calculateProcessRuntime <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -139,7 +139,7 @@ get_proc_medians <- function(dir_job_results) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("get_proc_medians function execution completed.") + 
message("calculateProcessRuntime function execution completed.") }) } @@ -156,12 +156,12 @@ get_proc_medians <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: write_proc_medians_table( +#' example: writeProcessRuntime2TSV( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -write_proc_medians_table <- function(dir_job_results, filepath) { +writeProcessRuntime2TSV <- function(dir_job_results, filepath) { tryCatch({ # Error handling for input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -175,7 +175,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { if (!is.character(filepath) || length(filepath) != 1) { stop("Input 'filepath' must be a single character string.") } - df_proc_medians <- get_proc_medians(dir_job_results) |> + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> tibble::as_tibble() |> tidyr::pivot_longer( dplyr::everything(), @@ -192,7 +192,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("write_proc_medians_table function execution completed.") + message("writeProcessRuntime2TSV function execution completed.") }) } @@ -201,7 +201,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default #' read location. 
#' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -212,13 +212,13 @@ write_proc_medians_table <- function(dir_job_results, filepath) { #' #' @examples #' \dontrun{ -#' write_proc_medians_yml( +#' writeProcessRuntime2YML( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { +writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -238,14 +238,14 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { stop("Input 'filepath' must be a single character string.") } - medians <- get_proc_medians(dir_job_results) + medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { message(paste("Encountered an error: "), e$message) }, warning = function(w) { message(paste("Warning: "), w$message) }, finally = { - message("write_proc_medians_table function execution completed.") + message("writeProcessRuntime2TSV function execution completed.") } ) @@ -261,9 +261,9 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: write_proc_medians_yml() +#' example: writeProcessRuntime2YML() #' @export -get_proc_weights <- function(medians_yml_path = NULL) { +getProcessRuntimeWeights <- function(medians_yml_path = NULL) { if (is.null(medians_yml_path)) { medians_yml_path <- file.path(common_root, "molevol_scripts", @@ -273,7 +273,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { proc_weights <- tryCatch({ # attempt to read the weights from the YAML file produced by - # write_proc_medians_yml() + # writeProcessRuntime2YML() if (stringr::str_trim(medians_yml_path) == "") { stop( 
stringr::str_glue("medians_yml_path is empty @@ -285,7 +285,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { }, # to avoid fatal errors in reading the proc weights yaml, # some median process runtimes have been hardcoded based on - # the result of get_proc_medians() from Jan 2024 + # the result of calculateProcessRuntime() from Jan 2024 error = function(cond) { proc_weights <- list( "dblast" = 2810, @@ -306,7 +306,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see make_opts2procs for the options) +#' (see mapOption2Process for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -314,11 +314,11 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: advanced_opts2est_walltime (c("homology_search", +#' example: calculateEstimatedWallTimeFromOpts (c("homology_search", #' "domain_architecture"), #' n_inputs = 3, n_hits = 50L) #' @export -advanced_opts2est_walltime <- function(advanced_opts, +calculateEstimatedWallTimeFromOpts <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { @@ -348,7 +348,7 @@ advanced_opts2est_walltime <- function(advanced_opts, } # Get process weights - proc_weights <- write_proc_medians_yml() + proc_weights <- writeProcessRuntime2YML() if (!is.list(proc_weights)) { stop("Process weights could not be retrieved correctly.") } @@ -357,7 +357,7 @@ advanced_opts2est_walltime <- function(advanced_opts, proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- map_advanced_opts2procs(advanced_opts) + procs_from_opts <- mapAdvOption2Process(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary 
encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) @@ -366,7 +366,7 @@ advanced_opts2est_walltime <- function(advanced_opts, as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- make_opts2procs() + opts2procs <- mapOption2Process() # exclude the homology search processes for the homologous hits procs2exclude_for_homologs <- opts2procs[["homology_search"]] procs_homologs <- procs_from_opts[!(procs_from_opts @@ -380,7 +380,7 @@ advanced_opts2est_walltime <- function(advanced_opts, } if (verbose) { msg <- stringr::str_glue( - "warnings from advanced_opts2est_walltime ():\n", + "warnings from calculateEstimatedWallTimeFromOpts ():\n", "\tn_inputs={n_inputs}\n", "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", "\test_walltime={est_walltime}\n\n" @@ -393,7 +393,7 @@ advanced_opts2est_walltime <- function(advanced_opts, }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("advanced_opts2est_walltime + message("calculateEstimatedWallTimeFromOpts function execution completed.") }) @@ -403,18 +403,18 @@ advanced_opts2est_walltime <- function(advanced_opts, #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from advanced_opts2est_walltime ()) +#' (from calculateEstimatedWallTimeFromOpts ()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' advanced_opts2est_walltime (c("homology_search", +#' calculateEstimatedWallTimeFromOpts (c("homology_search", #' "domain_architecture"), 3) |> -#' assign_job_queue() +#' assignJobQueue() #' @export -assign_job_queue <- function( +assignJobQueue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { @@ -434,7 +434,7 @@ assign_job_queue <- function( }, warning = 
function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("assign_job_queue function execution completed.") + message("assignJobQueue function execution completed.") }) } @@ -451,13 +451,13 @@ assign_job_queue <- function( #' @return line plot object #' #' example: -#' p <- plot_estimated_walltimes() +#' p <- plotEstimatedWallTimes() #' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plot_estimated_walltimes <- function() { +plotEstimatedWallTimes <- function() { tryCatch({ - opts <- make_opts2procs() |> names() + opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { # generate powerset (do not include empty set) @@ -482,7 +482,7 @@ plot_estimated_walltimes <- function() { } else { NULL } - est_walltime <- advanced_opts2est_walltime ( + est_walltime <- calculateEstimatedWallTimeFromOpts ( advanced_opts, n_inputs = i, n_hits = n_hits, @@ -541,7 +541,7 @@ plot_estimated_walltimes <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("plot_estimated_walltimes function execution completed.") + message("plotEstimatedWallTimes function execution completed.") }) } diff --git a/man/assign_job_queue.Rd b/man/assignJobQueue.Rd similarity index 68% rename from man/assign_job_queue.Rd rename to man/assignJobQueue.Rd index d2650fed..3663ce56 100644 --- a/man/assign_job_queue.Rd +++ b/man/assignJobQueue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assign_job_queue} -\alias{assign_job_queue} +\name{assignJobQueue} +\alias{assignJobQueue} \title{Decision function to assign job queue} \usage{ -assign_job_queue(t_sec_estimate, t_cutoff = 21600) +assignJobQueue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will 
process -(from advanced_opts2est_walltime ())} +(from calculateEstimatedWallTimeFromOpts ())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,9 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -advanced_opts2est_walltime (c("homology_search", +calculateEstimatedWallTimeFromOpts (c("homology_search", "domain_architecture"), 3) |> -assign_job_queue() +assignJobQueue() } \description{ Decision function to assign job queue diff --git a/man/advanced_opts2est_walltime.Rd b/man/calculateEstimatedWallTimeFromOpts.Rd similarity index 74% rename from man/advanced_opts2est_walltime.Rd rename to man/calculateEstimatedWallTimeFromOpts.Rd index 02ae9621..c09cf6a6 100644 --- a/man/advanced_opts2est_walltime.Rd +++ b/man/calculateEstimatedWallTimeFromOpts.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{advanced_opts2est_walltime} -\alias{advanced_opts2est_walltime} +\name{calculateEstimatedWallTimeFromOpts} +\alias{calculateEstimatedWallTimeFromOpts} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -advanced_opts2est_walltime( +calculateEstimatedWallTimeFromOpts( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,14 @@ advanced_opts2est_walltime( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see make_opts2procs for the options)} +(see mapOption2Process for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: advanced_opts2est_walltime (c("homology_search", +example: calculateEstimatedWallTimeFromOpts (c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) } diff --git a/man/get_proc_medians.Rd b/man/calculateProcessRuntime.Rd similarity index 76% rename from man/get_proc_medians.Rd rename to 
man/calculateProcessRuntime.Rd index b6db0b56..bb6dd1ed 100644 --- a/man/get_proc_medians.Rd +++ b/man/calculateProcessRuntime.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_medians} -\alias{get_proc_medians} +\name{calculateProcessRuntime} +\alias{calculateProcessRuntime} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -get_proc_medians(dir_job_results) +calculateProcessRuntime(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) } } \description{ diff --git a/man/get_proc_weights.Rd b/man/getProcessRuntimeWeights.Rd similarity index 73% rename from man/get_proc_weights.Rd rename to man/getProcessRuntimeWeights.Rd index f48585cc..ff3c8e5d 100644 --- a/man/get_proc_weights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_weights} -\alias{get_proc_weights} +\name{getProcessRuntimeWeights} +\alias{getProcessRuntimeWeights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -get_proc_weights(medians_yml_path = NULL) +getProcessRuntimeWeights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: write_proc_medians_yml() 
+example: writeProcessRuntime2YML() } \description{ Quickly get the runtime weights for MolEvolvR backend processes diff --git a/man/map_advanced_opts2procs.Rd b/man/mapAdvOption2Process.Rd similarity index 76% rename from man/map_advanced_opts2procs.Rd rename to man/mapAdvOption2Process.Rd index 631708b4..5bd9ee65 100644 --- a/man/map_advanced_opts2procs.Rd +++ b/man/mapAdvOption2Process.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{map_advanced_opts2procs} -\alias{map_advanced_opts2procs} +\name{mapAdvOption2Process} +\alias{mapAdvOption2Process} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -map_advanced_opts2procs(advanced_opts) +mapAdvOption2Process(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- map_advanced_opts2procs(advanced_opts) +procs <- mapAdvOption2Process(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/make_opts2procs.Rd b/man/mapOption2Process.Rd similarity index 75% rename from man/make_opts2procs.Rd rename to man/mapOption2Process.Rd index 07e208b2..ff6905c5 100644 --- a/man/make_opts2procs.Rd +++ b/man/mapOption2Process.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{make_opts2procs} -\alias{make_opts2procs} +\name{mapOption2Process} +\alias{mapOption2Process} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -make_opts2procs() +mapOption2Process() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- make_opts2procs +example: list_opts2procs <- mapOption2Process } \description{ Construct list where names (MolEvolvR advanced options) point to processes 
diff --git a/man/plot_estimated_walltimes.Rd b/man/plotEstimatedWallTimes.Rd similarity index 77% rename from man/plot_estimated_walltimes.Rd rename to man/plotEstimatedWallTimes.Rd index 884fed50..0d53cb32 100644 --- a/man/plot_estimated_walltimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plot_estimated_walltimes} -\alias{plot_estimated_walltimes} +\name{plotEstimatedWallTimes} +\alias{plotEstimatedWallTimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plot_estimated_walltimes() +plotEstimatedWallTimes() } \value{ line plot object example: -p <- plot_estimated_walltimes() +p <- plotEstimatedWallTimes() ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } diff --git a/man/write_proc_medians_table.Rd b/man/writeProcessRuntime2TSV.Rd similarity index 77% rename from man/write_proc_medians_table.Rd rename to man/writeProcessRuntime2TSV.Rd index 2ae7a97b..03cbbd68 100644 --- a/man/write_proc_medians_table.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_table} -\alias{write_proc_medians_table} +\name{writeProcessRuntime2TSV} +\alias{writeProcessRuntime2TSV} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -write_proc_medians_table(dir_job_results, filepath) +writeProcessRuntime2TSV(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ write_proc_medians_table(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: write_proc_medians_table( +example: writeProcessRuntime2TSV( "/data/scratch/janani/molevolvr_out/", 
"/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git a/man/write_proc_medians_yml.Rd b/man/writeProcessRuntime2YML.Rd similarity index 74% rename from man/write_proc_medians_yml.Rd rename to man/writeProcessRuntime2YML.Rd index 74757f1f..b43f39ee 100644 --- a/man/write_proc_medians_yml.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_yml} -\alias{write_proc_medians_yml} +\name{writeProcessRuntime2YML} +\alias{writeProcessRuntime2YML} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -write_proc_medians_yml(dir_job_results, filepath = NULL) +writeProcessRuntime2YML(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} @@ -15,12 +15,12 @@ uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default read location. 
} \examples{ \dontrun{ -write_proc_medians_yml( +writeProcessRuntime2YML( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From 823af96d484a1ec075548ce181f52147cff54af5 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Thu, 10 Oct 2024 09:13:26 -0600 Subject: [PATCH 23/61] - remove old .Rd leftovers and update with new docs - let R-CMD sort NAMESPACE --- NAMESPACE | 1 - man/IPG2Lineage.Rd | 3 ++- man/acc2Lineage.Rd | 3 ++- man/acc2lin.Rd | 0 man/efetchIPG.Rd | 3 ++- man/efetch_ipg.Rd | 0 man/ipg2lin.Rd | 0 man/sink.reset.Rd | 0 man/sinkReset.Rd | 1 + 9 files changed, 7 insertions(+), 4 deletions(-) delete mode 100644 man/acc2lin.Rd delete mode 100644 man/efetch_ipg.Rd delete mode 100644 man/ipg2lin.Rd delete mode 100644 man/sink.reset.Rd diff --git a/NAMESPACE b/NAMESPACE index 50af36df..078f971b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -77,7 +77,6 @@ export(prepareColumnParams) export(prepareSingleColumnParams) export(proteinAcc2TaxID) export(proteinAcc2TaxID_old) -export(prot2tax_old) export(removeAsterisks) export(removeEmptyRows) export(removeTails) diff --git a/man/IPG2Lineage.Rd b/man/IPG2Lineage.Rd index e24ab617..f8434c7f 100644 --- a/man/IPG2Lineage.Rd +++ b/man/IPG2Lineage.Rd @@ -38,7 +38,8 @@ This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} f Describe return, in detail } \description{ -Takes the resulting file of an efetch run on the ipg database and +Takes the resulting file +of an efetch run on the ipg database and Takes the resulting file of an efetch run on the ipg database and append lineage, and taxid columns diff --git a/man/acc2Lineage.Rd b/man/acc2Lineage.Rd index a24bdc9a..836a677f 100644 --- a/man/acc2Lineage.Rd +++ b/man/acc2Lineage.Rd @@ -38,7 +38,8 @@ on the ipg database. If NULL, the file will not be written. 
Defaults to NULL} Describe return, in detail } \description{ -This function combines 'efetchIPG()' and 'IPG2Lineage()' to map a set +This function combines 'efetchIPG()' +and 'IPG2Lineage()' to map a set of protein accessions to their assembly (GCA_ID), tax ID, and lineage. Function to map protein accession numbers to lineage diff --git a/man/acc2lin.Rd b/man/acc2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index 6a5d85a4..5d2e8372 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -23,7 +23,8 @@ the ipg database} Describe return, in detail } \description{ -Perform efetch on the ipg database and write the results to out_path +Perform efetch on the ipg database +and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/efetch_ipg.Rd b/man/efetch_ipg.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/sink.reset.Rd b/man/sink.reset.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index 0285c0b2..e3fc7ce4 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,6 +8,7 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s +and handles any errors or warnings that occur during the process. 
} \description{ Sink Reset From b116442be77ea2dc267b638f4ecd604a090a9ede Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Fri, 11 Oct 2024 01:40:21 +0300 Subject: [PATCH 24/61] document functions Signed-off-by: Awa Synthia --- NAMESPACE | 1 + R/CHANGED-pre-msa-tree.R | 108 ++++++-- R/blastWrappers.R | 51 ++-- R/cleanup.R | 81 +++--- R/combine_analysis.R | 28 ++- R/combine_files.R | 24 +- R/create_lineage_lookup.R | 17 +- R/fa2domain.R | 21 +- R/ipr2viz.R | 121 ++++++--- R/lineage.R | 155 +++++++++--- R/msa.R | 20 +- R/networks_domarch.R | 39 +-- R/networks_gencontext.R | 36 ++- R/plotme.R | 43 ++-- R/plotting.R | 230 ++++++++++++------ R/pre-msa-tree.R | 114 ++++++--- R/reverse_operons.R | 38 ++- man/BinaryDomainNetwork.Rd | 24 +- man/GCA2Lineage.Rd | 15 +- man/GenContextNetwork.Rd | 11 +- man/IPG2Lineage.Rd | 16 ++ man/RepresentativeAccNums.Rd | 23 +- man/acc2FA.Rd | 39 +++ man/acc2Lineage.Rd | 15 +- man/acc2fa.Rd | 16 +- man/addLeaves2Alignment.Rd | 4 + man/addLineage.Rd | 32 ++- man/addName.Rd | 10 + man/addTaxID.Rd | 20 +- man/add_leaves.Rd | 4 + man/add_name.Rd | 9 +- man/alignFasta.Rd | 18 +- man/cleanDomainArchitecture.Rd | 27 +- man/cleanFAHeaders.Rd | 4 +- man/cleanGeneDescription.Rd | 5 +- man/cleanLineage.Rd | 9 +- man/cleanSpecies.Rd | 2 +- man/combine_files.Rd | 26 +- man/combine_full.Rd | 16 +- man/combine_ipr.Rd | 16 +- man/condenseRepeatedDomains.Rd | 2 +- man/convert2TitleCase.Rd | 8 + man/convertAlignment2FA.Rd | 5 + man/convert_aln2fa.Rd | 9 +- man/{countbycolumn.Rd => countByColumn.Rd} | 0 man/createWordCloud2Element.Rd | 13 +- man/createWordCloudElement.Rd | 13 +- man/create_lineage_lookup.Rd | 19 +- man/domain_network.Rd | 17 +- man/downloadAssemblySummary.Rd | 16 +- man/efetchIPG.Rd | 12 +- man/extractAccNum.Rd | 3 +- ...{filterbydomains.Rd => filterByDomains.Rd} | 0 ...terbyfrequency.Rd => filterByFrequency.Rd} | 0 man/{findparalogs.Rd => findParalogs.Rd} | 0 man/find_top_acc.Rd | 26 +- man/gc_undirected_network.Rd | 27 +- 
man/generateAllAlignments2FA.Rd | 19 +- man/generate_all_aln2fa.Rd | 18 +- man/generate_msa.Rd | 15 +- man/get_accnums_from_fasta_file.Rd | 19 +- man/ipr2viz.Rd | 45 +++- man/ipr2viz_web.Rd | 46 +++- man/mapAcc2Name.Rd | 15 +- man/map_acc2name.Rd | 15 +- man/msa_pdf.Rd | 8 +- man/plotLineageDA.Rd | 8 + man/plotLineageDomainRepeats.Rd | 11 +- man/plotLineageHeatmap.Rd | 5 + man/plotLineageNeighbors.Rd | 5 + man/plotLineageQuery.Rd | 20 +- man/plotLineageSunburst.Rd | 31 ++- man/plotStackedLineage.Rd | 39 ++- man/plotSunburst.Rd | 6 +- man/plotUpSet.Rd | 19 +- man/prepareColumnParams.Rd | 17 +- man/prepareSingleColumnParams.Rd | 18 +- man/proteinAcc2TaxID.Rd | 26 +- man/proteinAcc2TaxID_old.Rd | 20 +- man/removeAsterisks.Rd | 10 +- man/removeEmptyRows.Rd | 3 +- man/removeTails.Rd | 3 +- man/renameFA.Rd | 9 + man/rename_fasta.Rd | 9 + man/replaceQuestionMarks.Rd | 4 +- man/reveql.Rd | 19 +- man/reverse_operon.Rd | 21 +- man/runIPRScan.Rd | 24 +- man/run_deltablast.Rd | 29 ++- man/run_rpsblast.Rd | 27 +- man/selectLongestDuplicate.Rd | 9 +- man/shortenLineage.Rd | 24 +- ...rizebylineage.Rd => summarizeByLineage.Rd} | 0 man/theme_genes2.Rd | 13 + man/to_titlecase.Rd | 7 + ...s.Rd => totalGenContextOrDomArchCounts.Rd} | 0 man/validateCountDF.Rd | 10 +- man/wordcloud3.Rd | 54 +++- ...ords2wordcounts.Rd => words2WordCounts.Rd} | 0 man/write.MsaAAMultipleAlignment.Rd | 16 ++ 100 files changed, 1913 insertions(+), 461 deletions(-) create mode 100644 man/acc2FA.Rd rename man/{countbycolumn.Rd => countByColumn.Rd} (100%) rename man/{filterbydomains.Rd => filterByDomains.Rd} (100%) rename man/{filterbyfrequency.Rd => filterByFrequency.Rd} (100%) rename man/{findparalogs.Rd => findParalogs.Rd} (100%) rename man/{summarizebylineage.Rd => summarizeByLineage.Rd} (100%) rename man/{totalgencontextordomarchcounts.Rd => totalGenContextOrDomArchCounts.Rd} (100%) rename man/{words2wordcounts.Rd => words2WordCounts.Rd} (100%) diff --git a/NAMESPACE b/NAMESPACE index 078f971b..50943690 
100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -230,6 +230,7 @@ importFrom(purrr,map2) importFrom(purrr,map_chr) importFrom(purrr,pmap) importFrom(purrr,pmap_dfr) +importFrom(rMSA,kalign) importFrom(readr,cols) importFrom(readr,read_delim) importFrom(readr,read_file) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index c4a97589..76c13859 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -40,10 +40,14 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param y Delimitter. Default is space (" "). #' @seealso chartr, toupper, and tolower. #' -#' @return +#' @return Character vector with the input strings converted to title case. +#' #' @export #' #' @examples +#' # Convert a single string to title case +#' convert2TitleCase("hello world") # Returns "Hello World" +#' convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -76,7 +80,8 @@ convert2TitleCase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return +#' @return A data frame containing the enriched alignment data with lineage +#' information. #' #' @details The alignment file would need two columns: 1. accession + #' number and 2. alignment. The protein homolog accession to lineage mapping + @@ -203,6 +208,14 @@ addLeaves2Alignment <- function(aln_file = "", #' @export #' #' @examples +#' # Example usage of the addName function +#' data <- data.frame( +#' AccNum = c("ACC123", "ACC456"), +#' Species = c("Homo sapiens", "Mus musculus"), +#' Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") +#' ) +#' enriched_data <- addName(data) +#' print(enriched_data) addName <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { @@ -278,7 +291,9 @@ addName <- function(data, #' @note Please refer to the source code if you have alternate + #' file formats and/or column names. 
#' -#' @return +#' @return A character string representing the FASTA formatted sequences. +#' If `fa_outpath` is provided, the FASTA will also be saved to the specified +#' file. #' @export #' #' @examples @@ -323,18 +338,24 @@ convertAlignment2FA <- function(aln_file = "", #' Default renameFA() replacement function. Maps an accession number to its name #' #' @param line The line of a fasta file starting with '>' -#' @param acc2name Data Table containing a column of accession numbers and a name column +#' @param acc2name Data Table containing a column of accession numbers and a +#' name column #' @param acc_col Name of the column containing Accession numbers -#' @param name_col Name of the column containing the names that the accession numbers +#' @param name_col Name of the column containing the names that the accession +#' numbers #' are mapped to #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character string representing the updated FASTA line, where the +#' accession number is replaced with its corresponding name. #' @export #' #' @examples +#' \dontrun{ +#' mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name") +#' } mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an addNames column # Find the first ' ' @@ -360,10 +381,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { #' @importFrom purrr map #' @importFrom readr read_lines write_lines #' -#' @return +#' @return A character vector of the modified lines in the FASTA file. #' @export #' #' @examples +#' \dontrun{ +#' renameFA("path/to/input.fasta", +#' "path/to/output.fasta", mapAcc2Name, acc2name) +#' } renameFA <- function(fa_path, outpath, replacement_function = mapAcc2Name, ...) { lines <- read_lines(fa_path) @@ -389,20 +414,26 @@ renameFA <- function(fa_path, outpath, #' #' @param aln_path Character. Path to alignment files. 
#' Default is 'here("data/rawdata_aln/")' -#' @param fa_outpath Character. Path to file. Master protein file with AccNum & lineages. +#' @param fa_outpath Character. Path to file. Master protein file with AccNum & +#' lineages. #' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' #' @param lin_file Character. Path to the written fasta file. #' Default is 'here("data/alns/")'. -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. #' Default is 'FALSE'. #' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return +#' @return NULL. The function saves the output FASTA files to the specified +#' directory. #' -#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats and/or column names. +#' @details The alignment files would need two columns separated by spaces: +#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, +#' Species, Lineages. +#' @note Please refer to the source code if you have alternate + file formats +#' and/or column names. #' #' @export #' @@ -449,24 +480,29 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' -#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for. +#' @param accessions Character vector containing protein accession numbers to +#' generate fasta sequences for. #' Function may not work for vectors of length > 10,000 #' @param outpath [str] Location where fasta file should be written to. -#' @param plan +#' @param plan Character string specifying the parallel processing strategy to +#' use with the `future` package. 
Default is "sequential". #' #' @importFrom Biostrings readAAStringSet #' @importFrom future future plan value #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return A logical value indicating whether the retrieval and conversion were +#' successful. Returns `TRUE` if successful and `FALSE` otherwise. #' @export #' #' @examples #' \dontrun{ -#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") +#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' outpath = "my_proteins.fasta") #' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") +#' EBI:accessions <- c("P12345", "Q9UHC1", +#' "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") #' } acc2FA <- function(accessions, outpath, plan = "sequential") { # validation @@ -539,7 +575,8 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { return(result) } -#' Function to generate a vector of one Accession number per distinct observation from 'reduced' column +#' Function to generate a vector of one Accession number per distinct +#' observation from 'reduced' column #' #' @author Samuel Chen, Janani Ravi #' @@ -552,14 +589,20 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character vector containing one Accession number per distinct +#' observation from the specified reduced column. 
#' @export #' #' @examples +#' \dontrun{ +#' representative_accessions <- RepresentativeAccNums(prot_data, +#' reduced = "Lineage", accnum_col = "AccNum") +#' } RepresentativeAccNums <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column + # Get Unique reduced column and then bind the AccNums back to get one + # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -590,8 +633,10 @@ RepresentativeAccNums <- function(prot_data, #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written +#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' "ClustalO", or "ClustalW" +#' @param outpath Path to write the resulting alignment to as a FASTA file. +#' If NULL, no file is written #' #' @importFrom Biostrings readAAStringSet #' @importFrom msa msaClustalOmega msaMuscle msaClustalW @@ -600,6 +645,10 @@ RepresentativeAccNums <- function(prot_data, #' @export #' #' @examples +#' \dontrun{ +#' aligned_sequences <- alignFasta("my_sequences.fasta", +#' tool = "Muscle", outpath = "aligned_output.fasta") +#' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { fasta <- readAAStringSet(fasta_file) @@ -628,10 +677,14 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @importFrom Biostrings toString unmasked #' @importFrom readr write_file #' -#' @return +#' @return Character string representing the content of the written FASTA file. 
#' @export #' #' @examples +#' \dontrun{ +#' alignment <- msaMuscle("my_sequences.fasta") +#' write.MsaAAMultipleAlignment(alignment, "aligned_sequences.fasta") +#' } write.MsaAAMultipleAlignment <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" @@ -647,14 +700,19 @@ write.MsaAAMultipleAlignment <- function(alignment, outpath) { #' Get accnums from fasta file #' -#' @param fasta_file +#' @param fasta_file Character. The path to the FASTA file from which +#' accession numbers will be extracted. #' #' @importFrom stringi stri_extract_all_regex #' -#' @return +#' @return A character vector containing the extracted accession numbers. #' @export #' #' @examples +#' \dontrun{ +#' accnums <- get_accnums_from_fasta_file("my_sequences.fasta") +#' print(accnums) +#' } get_accnums_from_fasta_file <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 552b1ff6..2a0325ca 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -3,20 +3,28 @@ #' Run DELTABLAST to find homologs for proteins of interest #' #' @author Samuel Chen, Janani Ravi +#' @description +#' This function executes a Delta-BLAST search using the specified parameters +#' and database. It sets the BLAST database path, runs the Delta-BLAST command +#' with the given query, and outputs the results. #' -#' @param deltablast_path -#' @param db_search_path Path to the BLAST databases -#' @param db -#' @param query -#' @param evalue -#' @param out -#' @param num_alignments -#' @param num_threads +#' @param deltablast_path Path to the Delta-BLAST executable. +#' @param db_search_path Path to the BLAST databases. +#' @param db Name of the BLAST database to search against (default is "refseq"). +#' @param query Path to the input query file. +#' @param evalue E-value threshold for reporting matches (default is "1e-5"). 
+#' @param out Path to the output file where results will be saved. +#' @param num_alignments Number of alignments to report. +#' @param num_threads Number of threads to use for the search (default is 1). #' -#' @return +#' @return This function does not return a value; it outputs results to the +#' specified file. #' @export #' #' @examples +#' \dontrun{ +#' run_deltablast(deltablast_path, db_search_path, query, out, num_alignments) +#' } run_deltablast <- function(deltablast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_alignments, num_threads = 1) { @@ -42,18 +50,27 @@ run_deltablast <- function(deltablast_path, db_search_path, #' Run RPSBLAST to generate domain architectures for proteins of interest #' -#' @param rpsblast_path -#' @param db_search_path Path to the BLAST databases -#' @param db -#' @param query -#' @param evalue -#' @param out -#' @param num_threads +#' @description +#' This function executes an RPS-BLAST search to generate domain architectures +#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST +#' command with the provided query, and outputs the results. #' -#' @return +#' @param rpsblast_path Path to the RPS-BLAST executable. +#' @param db_search_path Path to the BLAST databases. +#' @param db Name of the BLAST database to search against (default is "refseq"). +#' @param query Path to the input query file. +#' @param evalue E-value threshold for reporting matches (default is "1e-5"). +#' @param out Path to the output file where results will be saved. +#' @param num_threads Number of threads to use for the search (default is 1). +#' +#' @return This function does not return a value; it outputs results to the +#' specified file. 
#' @export #' #' @examples +#' \dontrun{ +#' run_rpsblast(rpsblast_path, db_search_path, query, out) +#' } run_rpsblast <- function(rpsblast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_threads = 1) { diff --git a/R/cleanup.R b/R/cleanup.R index 4fe074ee..a8e79e33 100755 --- a/R/cleanup.R +++ b/R/cleanup.R @@ -46,7 +46,8 @@ cleanString <- function(string) { # get_sequences() function to extract accession numbers #' extractAccNum #' -#' @param string +#' @param string A string from which to extract the accession number. +#' The string may contain accession information delimited by `|` or spaces. #' #' @return Describe return, in detail #' @export @@ -103,7 +104,9 @@ ensureUniqAccNum <- function(accnums) { #' Parse accesion numbers from fasta and add a #' suffix of the ith occurence to handle duplicates #' -#' @param fasta +#' @param fasta An [XStringSet] object representing the sequences from a +#' FASTA file. The sequence names (headers) will be adjusted for uniqueness +#' and sanitized. #' #' @importFrom purrr map_chr #' @importFrom fs path_sanitize @@ -148,7 +151,8 @@ cleanFAHeaders <- function(fasta) { #' #' @importFrom dplyr as_tibble filter #' -#' @return Describe return, in detail +#' @return A tibble with rows removed where the specified column contains +#' `"-"`, `"NA"`, or an empty string. #' @export #' #' @examples @@ -183,7 +187,7 @@ removeEmptyRows <- function(prot, by_column = "DomArch") { #' @param by_column Column in which repeats are condensed to domain+domain -> domain(s). #' @param excluded_prots Vector of strings that condenseRepeatedDomains should not reduce to (s). Defaults to c() #' -#' @return Describe return, in detail +#' @return A data frame with condensed repeated domains in the specified column. 
#' @export #' #' @importFrom dplyr pull @@ -244,7 +248,9 @@ condenseRepeatedDomains <- function(prot, by_column = "DomArch", excluded_prots #' @param prot DataTable to operate on #' @param by_column Column to operate on #' -#' @return Describe return, in detail +#' @return The original data frame with the specified column updated. All +#' consecutive '?' characters will be replaced with 'X(s)', and individual '?' +#' characters will be replaced with 'X'. #' @export #' #' @importFrom dplyr pull @@ -273,19 +279,21 @@ replaceQuestionMarks <- function(prot, by_column = "GenContext") { } -#' Remove Astrk +#' Remove Asterisk #' #' @description #' Remove the asterisks from a column of data #' Used for removing * from GenContext columns #' -#' @param query_data -#' @param colname +#' @param query_data A data frame containing the data to be processed. +#' @param colname The name of the column from which asterisks should be removed. +#' Defaults to "GenContext". #' #' @importFrom purrr map #' @importFrom stringr str_remove_all #' -#' @return Describe return, in detail +#' @return The original data frame with asterisks removed from the specified +#' column. #' @export #' #' @examples @@ -315,7 +323,8 @@ removeAsterisks <- function(query_data, colname = "GenContext") { #' @param by_column Default column is 'DomArch'. Can also take 'ClustName', 'GenContext' as input. #' @param keep_domains Default is False Keeps tail entries that contain the query domains. #' -#' @return Describe return, in detail +#' @return The original data frame with singletons removed from the specified +#' column. #' @export #' #' @importFrom dplyr count filter group_by pull n summarize @@ -374,7 +383,7 @@ removeTails <- function(prot, by_column = "DomArch", #' #' @importFrom stringr coll str_replace_all #' -#' @return Describe return, in detail +#' @return The original data frame with Species cleaned. 
#' @export #' #' @examples @@ -504,25 +513,34 @@ cleanClusters <- function(prot, #' The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column. #' #' @param prot A data frame containing a 'DomArch' column -#' @param old -#' @param new +#' @param old The name of the original column containing domain architecture. +#' Defaults to "DomArch.orig". +#' @param new The name of the cleaned column to be created. Defaults to +#' "DomArch". #' @param domains_keep A data frame containing the domain names to be retained. -#' @param domains_rename A data frame containing the domain names to be replaced in a column 'old' and the +#' @param domains_rename A data frame containing the domain names to be replaced +#' in a column 'old' and the #' corresponding replacement values in a column 'new'. -#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE. -#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE. -#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE. -#' @param domains_ignore A data frame containing the domain names to be removed in a column called 'domains' +#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in +#' 'DomArch' are condensed. Default is TRUE. +#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on +#' domains to keep/remove. Default is FALSE. +#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values +#' in 'DomArch' are removed. Default is FALSE. 
+#' @param domains_ignore A data frame containing the domain names to be removed
+#' in a column called 'domains'
 #'
 #' @importFrom dplyr pull
 #' @importFrom stringr coll str_replace_all
 #'
-#' @return The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column.
+#' @return The original data frame is returned with the clean DomArchs column
+#' and the old domains in the DomArchs.old column.
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' cleanDomainArchitecture(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL)
+#' cleanDomainArchitecture(prot, TRUE, FALSE,
+#' domains_keep, domains_rename, domains_ignore = NULL)
 #' }
 cleanDomainArchitecture <- function(prot, old = "DomArch.orig", new = "DomArch",
                                     domains_keep, domains_rename,
@@ -658,8 +676,9 @@ cleanGenomicContext <- function(prot, domains_rename = data.frame("old" = charac
 #' Cleanup GeneDesc
 #'
-#' @param prot
-#' @param column
+#' @param prot A data frame containing the gene descriptions.
+#' @param column The name of the column from which gene descriptions are pulled
+#' for cleanup.
 #'
 #' @return Return trailing period that occurs in GeneDesc column
 #' @export
@@ -677,13 +696,16 @@ cleanGeneDescription <- function(prot, column) {
 #' Pick Longer Duplicate
 #'
-#' @param prot
-#' @param column
+#' @param prot A data frame containing the data, with at least one column
+#' named 'AccNum' for identification of duplicates.
+#' @param column The name of the column from which the longest entry among
+#' duplicates will be selected.
 #'
 #' @importFrom dplyr arrange filter group_by pull n select summarize
 #' @importFrom rlang sym
 #'
-#' @return Describe return, in detail
+#' @return A data frame containing only the longest entries among duplicates
+#' based on the specified column.
#' @export #' #' @examples @@ -728,10 +750,13 @@ selectLongestDuplicate <- function(prot, column) { #' Cleanup Lineage #' -#' @param prot -#' @param lins_rename +#' @param prot A data frame containing a 'Lineage' column that needs to be +#' cleaned up. +#' @param lins_rename A data frame with two columns: 'old' containing terms +#' to be replaced and 'new' containing the corresponding replacement terms. #' -#' @return Describe return, in detail +#' @return The original data frame with the 'Lineage' column updated based on +#' the provided replacements. #' @export #' #' @examples diff --git a/R/combine_analysis.R b/R/combine_analysis.R index bb3b3ce2..2361c213 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -8,15 +8,23 @@ #' Combining full_analysis files #' -#' @param inpath -#' @param ret +#' @param inpath Character. The path to the directory containing the +#' `.full_analysis.tsv` files to be combined. +#' @param ret Logical. If TRUE, the function will return the combined data frame. +#' Default is FALSE, meaning it will only write the file and not return the data. #' #' @importFrom readr write_tsv #' -#' @return +#' @return If `ret` is TRUE, a data frame containing the combined data from all +#' input files. If `ret` is FALSE, the function writes the combined data to a +#' TSV file named `cln_combined.tsv` in the specified directory and returns NULL. +#' #' @export #' #' @examples +#' \dontrun{ +#' combined_data <- combine_full("path/to/full_analysis/files", ret = TRUE) +#' } combine_full <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combine_files(inpath, @@ -35,15 +43,23 @@ combine_full <- function(inpath, ret = FALSE) { #' Combining clean ipr files #' -#' @param inpath -#' @param ret +#' @param inpath Character. The path to the directory containing the +#' `.iprscan_cln.tsv` files to be combined. +#' @param ret Logical. If TRUE, the function will return the combined data frame. 
+#' Default is FALSE, meaning it will only write the file and not return the data. #' #' @importFrom readr write_tsv #' -#' @return +#' @return If `ret` is TRUE, a data frame containing the combined data from all +#' input files. If `ret` is FALSE, the function writes the combined data to a +#' TSV file named `ipr_combined.tsv` in the specified directory and returns NULL. +#' #' @export #' #' @examples +#' \dontrun{ +#' combined_ipr_data <- combine_ipr("path/to/ipr/files", ret = TRUE) +#' } combine_ipr <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combine_files(inpath, diff --git a/R/combine_files.R b/R/combine_files.R index 76c5fa09..088f2d7b 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -24,20 +24,30 @@ #' #' @author Janani Ravi #' -#' @param inpath String of 'master' path where the files reside (recursive=T) -#' @param pattern Character vector containing search pattern for files -#' @param delim -#' @param skip -#' @param col_names Takes logical T/F arguments OR column names vector; -#' usage similar to col_names parameter in `readr::read_delim` +#' @param inpath Character. The master directory path where the files reside. +#' The search is recursive (i.e., it will look in subdirectories as well). +#' @param pattern Character. A search pattern to identify files to be combined. +#' Default is "*full_analysis.tsv". +#' @param delim Character. The delimiter used in the input files. +#' Default is tab ("\t"). +#' @param skip Integer. The number of lines to skip at the beginning of each file. +#' Default is 0. +#' @param col_names Logical or character vector. If TRUE, the first row of each file +#' is treated as column names. Alternatively, a character vector can +#' be provided to specify custom column names. #' #' @importFrom purrr pmap_dfr #' @importFrom readr cols #' -#' @return +#' @return A data frame containing the combined contents of all matched files. 
+#' Each row will include a new column "ByFile" indicating the source file of the data.
+#'
 #' @export
 #'
 #' @examples
+#' \dontrun{
+#' combined_data <- combine_files(inpath = "../molevol_data/project_data/phage_defense/")
+#' }
 combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"),
                           pattern = "*full_analysis.tsv",
                           delim = "\t", skip = 0,
diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R
index e7374df3..7c581d2e 100644
--- a/R/create_lineage_lookup.R
+++ b/R/create_lineage_lookup.R
@@ -7,12 +7,12 @@
 #'
 #' @author Samuel Chen
 #'
-#' @param lineage_file Path to the rankedlineage.dmp file containing taxid's and their
-#' corresponding taxonomic rank. rankedlineage.dmp can be downloaded at
+#' @param lineage_file Path to the rankedlineage.dmp file containing taxid's
+#' and their corresponding taxonomic rank. rankedlineage.dmp can be downloaded at
 #' https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/
 #' @param outfile File the resulting lineage lookup table should be written to
-#' @param taxonomic_rank The upperbound of taxonomic rank that the lineage includes. The lineaege will
-#' include superkingdom>...>taxonomic_rank.
+#' @param taxonomic_rank The upperbound of taxonomic rank that the lineage
+#' includes. The lineage will include superkingdom>...>taxonomic_rank.
 #' Choices include: "supperkingdom", "phylum", "class","order", "family",
 #' "genus", and "species"
 #'
@@ -22,10 +22,17 @@
 #' @importFrom stringr str_locate str_replace_all
 #' @importFrom tidyr unite
 #'
-#' @return
+#' @return A tibble containing the tax IDs and their respective lineages up to
+#' the specified taxonomic rank, saved as a tab-separated file.
+#' #' @export #' #' @examples +#' \dontrun{ +#' create_lineage_lookup(lineage_file = "data/rankedlineage.dmp", +#' outfile = "data/lineage_lookup.tsv", +#' taxonomic_rank = "family") +#' } create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { shorten_NA <- function(Lineage) { diff --git a/R/fa2domain.R b/R/fa2domain.R index 6dc6f622..29803b85 100644 --- a/R/fa2domain.R +++ b/R/fa2domain.R @@ -5,16 +5,29 @@ # interproscan CLI will return a completely empty file (0Bytes) #' runIPRScan +#' +#' Run InterProScan on a given FASTA file and save the results to an +#' output file. #' -#' @param filepath_fasta -#' @param filepath_out -#' @param appl +#' @param filepath_fasta A string representing the path to the input FASTA file. +#' @param filepath_out A string representing the base path for the output file. +#' @param appl A character vector specifying the InterProScan applications to +#' use (e.g., "Pfam", "Gene3D"). Default is `c("Pfam", "Gene3D")`. #' #' @importFrom stringr str_glue #' -#' @return +#' @return A data frame containing the results from the InterProScan output +#' TSV file. #' #' @examples +#' \dontrun{ +#' results <- runIPRScan( +#' filepath_fasta = "path/to/your_fasta_file.fasta", +#' filepath_out = "path/to/output_file", +#' appl = c("Pfam", "Gene3D") +#' ) +#' print(results) +#' } runIPRScan <- function( filepath_fasta, filepath_out, # do not inlucde file extension since ipr handles this diff --git a/R/ipr2viz.R b/R/ipr2viz.R index 0d417be0..c4006e51 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -19,10 +19,17 @@ #' #' @importFrom ggplot2 element_blank element_line theme theme_grey #' -#' @return +#' @return A ggplot2 theme object. 
#' @export -#' #' @examples +#' library(ggplot2) +#' +#' # Create a sample plot using the custom theme +#' ggplot(mtcars, aes(x = wt, y = mpg)) + +#' geom_point() + +#' theme_genes2() + +#' labs(title = "Car Weight vs MPG") +#' theme_genes2 <- function() { ggplot2::theme_grey() + ggplot2::theme( panel.background = ggplot2::element_blank(), @@ -43,11 +50,16 @@ theme_genes2 <- function() { ################################## #' Group by lineage + DA then take top 20 #' -#' @param infile_full -#' @param DA_col -#' @param lin_col -#' @param n -#' @param query +#' @param infile_full A data frame containing the full dataset with lineage and +#' domain architecture information. +#' @param DA_col A string representing the name of the domain architecture +#' column. Default is "DomArch.Pfam". +#' @param lin_col A string representing the name of the lineage column. +#' Default is "Lineage_short". +#' @param n An integer specifying the number of top accession numbers to return. +#' Default is 20. +#' @param query A string for filtering a specific query name. If it is not +#' "All", only the data matching this query will be processed. #' #' @importFrom dplyr arrange filter group_by select summarise #' @importFrom shiny showNotification @@ -55,10 +67,16 @@ theme_genes2 <- function() { #' @importFrom rlang sym #' @importFrom rlang .data #' -#' @return +#' @return A vector of the top N accession numbers (`AccNum`) based on counts +#' grouped by lineage and domain architecture. 
#' @export #' #' @examples +#' \dontrun{ +#' top_accessions <- find_top_acc(infile_full = my_data, +#' DA_col = "DomArch.Pfam", lin_col = "Lineage_short", +#' n = 20, query = "specific_query_name") +#' } find_top_acc <- function(infile_full, DA_col = "DomArch.Pfam", lin_col = "Lineage_short", @@ -94,16 +112,26 @@ find_top_acc <- function(infile_full, ############################################# #' IPR2Viz #' -#' @param infile_ipr -#' @param infile_full -#' @param accessions -#' @param analysis -#' @param group_by -#' @param topn -#' @param name -#' @param text_size -#' @param query -#' +#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' domain information. +#' @param infile_full A path to the full input file (TSV format) containing +#' lineage and accession information. +#' @param accessions A character vector of accession numbers to filter the +#' analysis. Default is an empty vector. +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a +#' vector of these analyses. +#' @param group_by A string specifying how to group the visualization. +#' Default is "Analysis". Options include "Analysis" or "Query". +#' @param topn An integer specifying the number of top accessions to visualize. +#' Default is 20. +#' @param name A string representing the name to use for y-axis labels. +#' Default is "Name". +#' @param text_size An integer specifying the text size for the plot. +#' Default is 15. +#' @param query A string for filtering a specific query name. If it is not +#' "All", only the data matching this query will be processed. 
+#' #' @importFrom dplyr distinct filter select #' @importFrom gggenes geom_gene_arrow geom_subgene_arrow #' @importFrom ggplot2 aes aes_string as_labeller element_text facet_wrap ggplot guides margin scale_fill_manual theme theme_minimal unit ylab @@ -111,10 +139,22 @@ find_top_acc <- function(infile_full, #' @importFrom tidyr pivot_wider #' @importFrom stats as.formula #' -#' @return +#' @return A ggplot object representing the domain architecture visualization. #' @export #' #' @examples +#' \dontrun{ +#' plot <- ipr2viz(infile_ipr = "path/to/ipr_file.tsv", +#' infile_full = "path/to/full_file.tsv", +#' accessions = c("ACC123", "ACC456"), +#' analysis = c("Pfam", "TMHMM"), +#' group_by = "Analysis", +#' topn = 20, +#' name = "Gene Name", +#' text_size = 15, +#' query = "All") +#' print(plot) +#' } ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), group_by = "Analysis", # "Analysis" @@ -250,15 +290,25 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' IPR2Viz Web #' -#' @param infile_ipr -#' @param accessions -#' @param analysis -#' @param group_by -#' @param name -#' @param text_size -#' @param legend_name -#' @param cols -#' @param rows +#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' domain information. +#' @param accessions A character vector of accession numbers to filter the +#' analysis. +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector +#' of these analyses. +#' @param group_by A string specifying how to group the visualization. +#' Default is "Analysis". Options include "Analysis" or "Query". +#' @param name A string representing the name to use for y-axis labels. +#' Default is "Name". +#' @param text_size An integer specifying the text size for the plot. +#' Default is 15. 
+#' @param legend_name A string representing the column to use for legend labels.
+#' Default is "ShortName".
+#' @param cols An integer specifying the number of columns in the facet wrap.
+#' Default is 5.
+#' @param rows An integer specifying the number of rows in the legend.
+#' Default is 10.
 #'
 #' @importFrom dplyr arrange distinct filter select
 #' @importFrom gggenes geom_gene_arrow geom_subgene_arrow
 #' @importFrom ggplot2 aes aes_string as_labeller element_text facet_wrap ggplot guides margin scale_fill_manual theme theme_minimal unit ylab
 #' @importFrom readr read_tsv
 #' @importFrom tidyr pivot_wider
 #'
-#' @return
+#' @return A ggplot object representing the domain architecture visualization
+#' for web display.
 #' @export
 #'
 #' @examples
+#' \dontrun{
+#' plot <- ipr2viz_web(infile_ipr = "path/to/ipr_file.tsv",
+#' accessions = c("ACC123", "ACC456"),
+#' analysis = c("Pfam", "TMHMM"),
+#' group_by = "Analysis",
+#' name = "Gene Name",
+#' text_size = 15,
+#' legend_name = "ShortName",
+#' cols = 5,
+#' rows = 10)
+#' print(plot)
+#' }
 ipr2viz_web <- function(infile_ipr, accessions,
                         analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
diff --git a/R/lineage.R b/R/lineage.R
index d14246d7..ea1cd13a 100644
--- a/R/lineage.R
+++ b/R/lineage.R
@@ -11,17 +11,24 @@
 #'
 #' @author Samuel Chen, Janani Ravi
 #'
-#' @param outpath String of path where the assembly summary file should be written
-#' @param keep Character vector containing which columns should be retained and downloaded
+#' @param outpath String of path where the assembly summary file should be
+#' written
+#' @param keep Character vector containing which columns should be retained and
+#' downloaded
 #'
 #' @importFrom data.table fwrite setnames
 #' @importFrom dplyr bind_rows select
 #' @importFrom biomartr getKingdomAssemblySummary
 #'
-#' @return
+#' @return A tab-separated file containing the assembly summary. The function
+#' does not return any value but writes the output directly to the specified file.
#' @export #' #' @examples +#' \dontrun{ +#' downloadAssemblySummary(outpath = "assembly_summary.tsv", +#' keep = c("assembly_accession", "taxid", "organism_name")) +#' } downloadAssemblySummary <- function(outpath, keep = c( "assembly_accession", "taxid", @@ -78,15 +85,24 @@ downloadAssemblySummary <- function(outpath, #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the #' "create_lineage_lookup()" function -#' @param acc_col +#' @param acc_col Character. The name of the column in `prot_data` containing +#' accession numbers. Default is "AccNum". #' #' @importFrom dplyr pull #' @importFrom data.table fread setnames #' -#' @return +#' @return A dataframe containing the merged information of GCA_IDs, TaxIDs, +#' and their corresponding lineage up to the phylum level. The dataframe +#' will include information from the input `prot_data` and lineage data. +#' #' @export #' #' @examples +#' \dontrun{ +#' result <- GCA2Lineage(prot_data = my_prot_data, +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv") +#' } GCA2Lineage <- function(prot_data, assembly_path = "/data/research/jravilab/common_data/assembly_summary_genbank.txt", lineagelookup_path = "/data/research/jravilab/common_data/lineage_lookup.tsv", @@ -135,20 +151,34 @@ GCA2Lineage <- function(prot_data, ################################### #' addLineage #' -#' @param df -#' @param acc_col -#' @param assembly_path -#' @param lineagelookup_path -#' @param ipgout_path -#' @param plan +#' @param df Dataframe containing accession numbers. The dataframe should +#' have a column specified by `acc_col` that contains these accession numbers. +#' @param acc_col Character. The name of the column in `df` containing +#' accession numbers. Default is "AccNum". +#' @param assembly_path String. 
The path to the assembly summary file generated +#' using the `downloadAssemblySummary()` function. +#' @param lineagelookup_path String. The path to the lineage lookup file (taxid +#' to lineage mapping) generated using the `create_lineage_lookup()` function. +#' @param ipgout_path String. Optional path to save intermediate output files. +#' Default is NULL. +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' #' @importFrom dplyr pull #' @importFrom rlang sym #' -#' @return +#' @return A dataframe that combines the original dataframe `df` with lineage +#' information retrieved based on the provided accession numbers. +#' #' @export #' #' @examples +#' \dontrun{ +#' enriched_df <- addLineage(df = my_data, +#' acc_col = "AccNum", +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv") +#' } addLineage <- function(df, acc_col = "AccNum", assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "multicore") { acc_sym <- sym(acc_col) @@ -194,12 +224,23 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' (taxid to lineage mapping). This file can be generated using the #' @param ipgout_path Path to write the results of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL -#' @param plan +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' -#' @return +#' @return A dataframe containing lineage information mapped to the given protein +#' accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, +#' Protein, Protein Name, Species, and Lineage. 
#' @export #' #' @examples +#' \dontrun{ +#' lineage_data <- acc2Lineage( +#' accessions = c("P12345", "Q67890"), +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv", +#' ipgout_path = "path/to/output.txt" +#' ) +#' } acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "multicore") { tmp_ipg <- F @@ -235,16 +276,25 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, #' @param accessions Character vector containing the accession numbers to query on #' the ipg database #' @param out_path Path to write the efetch results to -#' @param plan +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' #' @importFrom future future plan #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return The function does not return a value but writes the efetch results +#' directly to the specified `out_path`. +#' #' @export #' #' @examples +#' \dontrun{ +#' efetchIPG( +#' accessions = c("P12345", "Q67890", "A12345"), +#' out_path = "path/to/efetch_results.xml" +#' ) +#' } efetchIPG <- function(accessions, out_path, plan = "multicore") { if (length(accessions) > 0) { partition <- function(v, groups) { @@ -305,18 +355,28 @@ efetchIPG <- function(accessions, out_path, plan = "multicore") { #' @param ipg_file Path to the file containing results of an efetch run on the #' ipg database. The protein accession in 'accessions' should be contained in this #' file -#' @param refseq_assembly_path -#' @param genbank_assembly_path +#' @param refseq_assembly_path String. Path to the RefSeq assembly summary file. +#' @param genbank_assembly_path String. Path to the GenBank assembly summary file. #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). 
This file can be generated using the #' "create_lineage_lookup()" function #' #' @importFrom data.table fread setnames #' -#' @return +#' @return A data table containing protein accessions along with their +#' corresponding TaxIDs and lineage information. #' @export #' #' @examples +#' \dontrun{ +#' lins <- IPG2Lineage( +#' accessions = c("P12345", "Q67890"), +#' ipg_file = "path/to/ipg_results.txt", +#' refseq_assembly_path = "path/to/refseq_assembly_summary.txt", +#' genbank_assembly_path = "path/to/genbank_assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv" +#' ) +#' } IPG2Lineage <- function(accessions, ipg_file, refseq_assembly_path, genbank_assembly_path, lineagelookup_path) { @@ -383,16 +443,25 @@ IPG2Lineage <- function(accessions, ipg_file, ######################################### #' addTaxID #' -#' @param data -#' @param acc_col -#' @param version +#' @param data A data frame or data table containing protein accession numbers. +#' @param acc_col A string specifying the column name in `data` that contains +#' the accession numbers. Defaults to "AccNum". +#' @param version A logical indicating whether to remove the last two characters +#' from the accession numbers for TaxID retrieval. Defaults to TRUE. #' #' @importFrom data.table as.data.table #' -#' @return +#' @return A data table that includes the original data along with a new column +#' containing the corresponding TaxIDs. 
#' @export #' #' @examples +#' \dontrun{ +#' # Create a sample data table with accession numbers +#' sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) +#' enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) +#' print(enriched_data) +#' } addTaxID <- function(data, acc_col = "AccNum", version = T) { if (!is.data.table(data)) { data <- as.data.table(data) @@ -421,17 +490,30 @@ addTaxID <- function(data, acc_col = "AccNum", version = T) { ################################## #' proteinAcc2TaxID #' -#' @param accnums -#' @param suffix -#' @param out_path -#' @param return_dt +#' @param accnums A character vector of protein accession numbers to be mapped +#' to TaxIDs. +#' @param suffix A string suffix used to name the output file generated by the +#' script. +#' @param out_path A string specifying the directory where the output file will +#' be saved. +#' @param return_dt A logical indicating whether to return the result as a data +#' table. Defaults to FALSE. If TRUE, the output file is read into a data table +#' and returned. #' #' @importFrom data.table fread #' -#' @return +#' @return If `return_dt` is TRUE, a data table containing the mapping of protein +#' accession numbers to TaxIDs. If FALSE, the function returns NULL. 
#' @export #' #' @examples +#' \dontrun{ +#' # Example accession numbers +#' accessions <- c("ABC123", "XYZ456", "LMN789") +#' tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +#' out_path = "/path/to/output", return_dt = TRUE) +#' print(tax_data) +#' } proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { # Write accnums to a file acc_file <- tempfile() @@ -456,18 +538,25 @@ proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { #' @description Perform elink to go from protein database to taxonomy database #' and write the resulting file of taxid and lineage to out_path #' -#' @param accessions Character vector containing the accession numbers to query on -#' the ipg database -#' @param out_path Path to write the efetch results to -#' @param plan +#' @param accessions A character vector containing the accession numbers to query +#' in the protein database. +#' @param out_path A string specifying the path where the results of the query +#' will be written. If set to NULL, a temporary directory will be used. +#' @param plan A character string that specifies the execution plan for parallel +#' processing. The default is "multicore". #' #' @importFrom future plan #' @importFrom purrr map #' -#' @return +#' @return This function does not return a value. It writes the results to the +#' specified output path. #' @export #' #' @examples +#' \dontrun{ +#' accessions <- c("ABC123", "XYZ456", "LMN789") +#' proteinAcc2TaxID_old(accessions, out_path = "/path/to/output") +#' } proteinAcc2TaxID_old <- function(accessions, out_path, plan = "multicore") { if (length(accessions) > 0) { partition <- function(v, groups) { diff --git a/R/msa.R b/R/msa.R index e56cc32c..20089dba 100644 --- a/R/msa.R +++ b/R/msa.R @@ -50,12 +50,15 @@ #' @importFrom msa msa msaPrettyPrint #' @importFrom stringr str_replace #' -#' @return +#' @return A PDF file containing the multiple sequence alignment. 
#' @export #' #' @examples #' \dontrun{ -#' msa_pdf() +#' msa_pdf(fasta_path = "path/to/your/file.fasta", +#' out_path = "path/to/output/alignment.pdf", +#' lowerbound = 10, +#' upperbound = 200) #' } msa_pdf <- function(fasta_path, out_path = NULL, lowerbound = NULL, upperbound = NULL) { @@ -187,15 +190,22 @@ msa_pdf <- function(fasta_path, out_path = NULL, ## https://github.com/mhahsler/rMSA #' Function to generate MSA using kalign #' -#' @param fa_file -#' @param outfile +#' @param fa_file Character. The path to the input FASTA file containing protein +#' sequences. +#' @param outfile Character. The path to the output file where the alignment +#' will be saved. #' #' @importFrom Biostrings readAAStringSet +#' @importFrom rMSA kalign #' -#' @return +#' @return A list containing the alignment object and the output file path. #' @export #' #' @examples +#' \dontrun{ +#' generate_msa(fa_file = "path/to/sequences.fasta", +#' outfile = "path/to/alignment.txt") +#' } generate_msa <- function(fa_file = "", outfile = "") { prot_aa <- readAAStringSet( path = fa_file, diff --git a/R/networks_domarch.R b/R/networks_domarch.R index fea0a195..65090fa4 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,13 +24,17 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage. +#' @param column Name of column containing Domain architecture from which nodes +#' and edges are generated. +#' @param domains_of_interest Character vector specifying domains of interest. +#' @param cutoff Integer. 
Only use domains that occur at or above the cutoff for +#' total counts if cutoff_type is "Total Count". +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color +#' @param query_color Character. Color to represent the queried domain in the +#' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select #' @importFrom igraph delete_vertices graph_from_edgelist vertex @@ -41,7 +45,7 @@ #' @importFrom tidyr pivot_wider #' @importFrom visNetwork visIgraph visIgraphLayout visNetwork visOptions #' -#' @return +#' @return A network visualization of domain architectures. #' @export #' #' @examples @@ -227,15 +231,20 @@ domain_network <- function(prot, column = "DomArch", domains_of_interest, cutoff #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage. +#' @param column Name of column containing Domain architecture from which nodes +#' and edges are generated. +#' @param domains_of_interest Character vector specifying the domains of interest. +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' total counts if cutoff_type is "Total Count". +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the domains_of_interest vector are colored -#' @param border_color +#' @param query_color Color that the nodes of the domains in the +#' domains_of_interest vector are colored +#' @param partner_color Color that the nodes that are not part of the +#' domains_of_interest vector are colored +#' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows #' #' @importFrom dplyr distinct filter group_by mutate pull select summarize @@ -245,12 +254,12 @@ domain_network <- function(prot, column = "DomArch", domains_of_interest, cutoff #' @importFrom stringr str_replace_all str_split #' @importFrom visNetwork visEdges visGroups visIgraphLayout visLegend visNetwork visOptions #' -#' @return +#' @return A network visualization of domain architectures. #' @export #' #' @examples #' \dontrun{ -#' domain_network(pspa) +#' BinaryDomainNetwork(pspa) #' } BinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, cutoff = 70, layout = "nice", query_color = adjustcolor("yellow", alpha.f = .5), diff --git a/R/networks_gencontext.R b/R/networks_gencontext.R index e0dd63da..02733cdf 100755 --- a/R/networks_gencontext.R +++ b/R/networks_gencontext.R @@ -17,13 +17,19 @@ #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff_type Character. Used to determine how data should be filtered. 
Either
-#' \itemize{\item "Lineage" to filter domains based off how many lineages the Domain architecture appears in
-#' \item "Total Count" to filter off the total amount of times a domain architecture occurs }
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.
+#' @param column Name of column containing Domain architecture from which nodes
+#' and edges are generated.
+#' @param domains_of_interest Character vector specifying the domains of interest.
+#' @param cutoff_type Character. Used to determine how data should be filtered.
+#' Either
+#' \itemize{\item "Lineage" to filter domains based off how many lineages the
+#' Domain architecture appears in
+#' \item "Total Count" to filter off the total amount of times a
+#' domain architecture occurs }
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff
+#' for total counts if cutoff_type is "Total Count".
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
+#' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
 #'
@@ -32,12 +38,14 @@
 #' @importFrom grDevices adjustcolor
 #' @importFrom igraph E graph_from_edgelist layout.auto layout.circle layout_on_grid layout_randomly plot.igraph V
 #' @importFrom stringr str_replace_all str_split
 #'
-#' @return
+#' @return A plot of the domain architecture network.
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' domain_network(pspa)
+#' gc_undirected_network(pspa, column = "GenContext",
+#' domains_of_interest = c("Domain1", "Domain2"),
+#' cutoff_type = "Total Count", cutoff = 10)
 #' }
 gc_undirected_network <- function(prot, column = "GenContext", domains_of_interest, cutoff_type = "Lineage", cutoff = 1, layout = "grid") { # by domain networks or all, as required.
@@ -127,8 +135,10 @@ gc_undirected_network <- function(prot, column = "GenContext", domains_of_intere #' #' @param prot A data frame that contains the column 'GenContext'. #' @param domains_of_interest Character vector of domains of interest. -#' @param column Name of column containing Genomic Context from which nodes and edges are generated. -#' @param cutoff Integer. Only use GenContexts that occur at or above the cutoff percentage for total count +#' @param column Name of column containing Genomic Context from which nodes and +#' edges are generated. +#' @param cutoff Integer. Only use GenContexts that occur at or above the cutoff +#' percentage for total count #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto" \item "nice"} #' @param directed Is the network directed? @@ -139,12 +149,12 @@ gc_undirected_network <- function(prot, column = "GenContext", domains_of_intere #' @importFrom stringr str_replace_all #' @importFrom visNetwork visIgraphLayout visLegend visNetwork visOptions #' -#' @return +#' @return A plot of the genomic context network. #' @export #' #' @examples #' \dontrun{ -#' gc_directed_network(pspa, column = "GenContex", cutoff = 55) +#' gc_directed_network(pspa, column = "GenContext", cutoff = 55) #' } GenContextNetwork <- function(prot, domains_of_interest, column = "GenContext", cutoff = 40, diff --git a/R/plotme.R b/R/plotme.R index 906e85ec..3527f170 100644 --- a/R/plotme.R +++ b/R/plotme.R @@ -44,10 +44,9 @@ plotSunburst <- function(count_data, fill_by_n = FALSE, sort_by_n = FALSE, maxde } -#' @param count_data -#' -#' @param fill_by_n -#' @param sort_by_n +#' @param count_data A data frame containing the data. +#' @param fill_by_n Logical indicating if fill color is based on counts. +#' @param sort_by_n Logical indicating if data should be sorted by counts. 
#' #' @importFrom plotly plot_ly #' @importFrom purrr exec @@ -68,18 +67,24 @@ plotTreemap <- function(count_data, fill_by_n = FALSE, sort_by_n = FALSE) { #' prepareColumnParams #' -#' @param count_data -#' @param fill_by_n -#' @param sort_by_n +#' @param count_data A data frame containing the data. +#' @param fill_by_n Logical indicating if fill color is based on counts. +#' @param sort_by_n Logical indicating if data should be sorted by counts. #' #' @importFrom assertthat assert_that #' @importFrom dplyr bind_rows mutate #' @importFrom purrr map #' -#' @return +#' @return A data frame of parameters for treemap visualization. #' @export #' #' @examples +#' \dontrun{ +#' count_data <- data.frame(Category = c("A", "B", "C"), +#' n = c(10, 20, 15)) +#' params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) +#' print(params) +#' } prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { validateCountDF(count_data) assertthat::assert_that(is.logical(fill_by_n), @@ -116,17 +121,24 @@ prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { #' prepareSingleColumnParams #' -#' @param df -#' @param col_num -#' @param root +#' @param df A data frame containing the data to be processed. +#' @param col_num An integer representing the column number to process. +#' @param root A string representing the root node for the treemap. #' #' @importFrom dplyr c_across group_by mutate rowwise select summarise ungroup #' @importFrom stringr str_glue #' -#' @return +#' @return A data frame containing parameters for the specified column for +#' treemap visualization. 
#' @export
#'
#' @examples
+#' \dontrun{
+#' df <- data.frame(Category = c("A", "A", "B", "B", "C"),
+#' n = c(10, 20, 30, 40, 50))
+#' params <- prepareSingleColumnParams(df, col_num = 1, root = "Root")
+#' print(params)
+#' }
prepareSingleColumnParams <- function(df,
col_num,
root) {
@@ -158,15 +170,18 @@ prepareSingleColumnParams <- function(df,
}
#' validateCountDF
#'
-#' @param var
+#' @param var A data frame to validate as a count data frame.
#'
#' @importFrom assertthat assert_that has_name
#' @importFrom dplyr across mutate
#'
-#' @return
+#' @return Called for its validation side effect; errors if `var` is not a count data frame.
#' @export
#'
#' @examples
+#' \dontrun{
+#' validateCountDF(my_data)
+#' }
validateCountDF <- function(var) {
msg <- paste(substitute(var), "must be a count dataframe (output of dplyr::count)")
assertthat::assert_that(is.data.frame(var),
diff --git a/R/plotting.R b/R/plotting.R
index 7191eace..b9a2758a 100644
--- a/R/plotting.R
+++ b/R/plotting.R
@@ -18,20 +18,34 @@
# suppressPackageStartupMessages(library(d3r))
# suppressPackageStartupMessages(library(viridis))
-#' Shorten Lineage
+#' Shorten Lineage Names
#'
-#' @param data
-#' @param colname
-#' @param abr_len
+#' @description
+#' This function abbreviates lineage names by shortening the first part of the
+#' string (up to a given delimiter).
+#'
+#' @param data A data frame that contains a column with lineage names to be
+#' shortened.
+#' @param colname Character. The name of the column in the data frame containing
+#' the lineage strings to be shortened. Default is `"Lineage"`.
+#' @param abr_len Integer. The number of characters to retain after the first
+#' letter. If set to 1, only the first letter of each segment before the
+#' delimiter (`>`) is retained. Default is 1.
#'
#' @importFrom stringr str_locate
+#' @importFrom purrr pmap
+#'
+#' @return A modified data frame where the specified lineage column has been
+#' shortened. 
#'
-#' @return
#' @export
#'
#' @examples
#' \dontrun{
-#' shortenLineage()
+#' df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia",
+#' "Archaea>Euryarchaeota>Thermococci"))
+#' shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1)
+#' print(shortened_df)
#' }
shortenLineage <- function(data, colname = "Lineage", abr_len = 1) {
abbrv <- function(x) {
@@ -65,23 +79,29 @@ shortenLineage <- function(data, colname = "Lineage", abr_len = 1) {
#'
#' @param query_data Data frame of protein homologs with the usual 11 columns +
#' additional word columns (0/1 format). Default is toast_rack.sub
-#' @param colname
+#' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep",
+#' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep".
#' @param cutoff Numeric. Cutoff for word frequency. Default is 90.
-#' @param RowsCutoff
-#' @param text.scale Allows scaling of axis title, tick lables, and numbers above the intersection size bars.
+#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows
+#' based on a certain condition. Default is FALSE.
+#' @param text.scale Allows scaling of axis title, tick labels, and numbers
+#' above the intersection size bars.
#' text.scale can either take a universal scale in the form of an integer,
#' or a vector of specific scales in the format: c(intersection size title,
#' intersection size tick labels, set size title, set size tick labels, set names,
#' numbers above bars)
-#' @param point.size
-#' @param line.size
+#' @param point.size Numeric. Sets the size of points in the UpSet plot.
+#' Default is 2.2.
+#' @param line.size Numeric. Sets the line width in the UpSet plot.
+#' Default is 0.8.
#'
#' @importFrom dplyr across distinct filter if_else mutate pull select where
#' @importFrom rlang sym
#' @importFrom stringr str_detect str_replace_all str_split
#' @importFrom UpSetR upset
#'
-#' @return
+#' @return An UpSet plot object. 
The plot visualizes intersections of sets based +#' on the provided colname in query_data. #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or @@ -230,8 +250,9 @@ plotUpSet <- function(query_data = "toast_rack.sub", #' Default is prot (variable w/ protein data). #' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep", #' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep". -#' @param cutoff -#' @param RowsCutoff +#' @param cutoff Numeric. Cutoff for word frequency. Default is 90. +#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows +#' based on a certain condition. Default is FALSE. #' @param color Color for the heatmap. One of six options: "default", "magma", "inferno", #' "plasma", "viridis", or "cividis" #' @@ -243,7 +264,7 @@ plotUpSet <- function(query_data = "toast_rack.sub", #' @importFrom viridis scale_fill_viridis #' @importFrom rlang sym #' -#' @return +#' @return A LineageDA plot object. #' @export #' #' @details @@ -325,7 +346,7 @@ plotLineageDA <- function(query_data = "prot", #' Lineage Plot: Heatmap of Queries vs Lineages #' -#' @authors Janani Ravi, Samuel Chen +#' @author Janani Ravi, Samuel Chen #' @keywords Lineages, Domains, Domain Architectures, GenomicContexts #' @description #' Lineage plot for queries. Heatmap. @@ -333,10 +354,14 @@ plotLineageDA <- function(query_data = "prot", #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). #' Default is prot (variable w/ protein data). -#' @param queries Character Vector containing the queries that will be used for the categories -#' @param colname -#' @param cutoff -#' @param color +#' @param queries Character Vector containing the queries that will be used for +#' the categories. +#' @param colname Character. The column used for filtering based on the `queries`. +#' Default is "ClustName". 
+#' @param cutoff Numeric. The cutoff value for filtering rows based on their +#' total count. Rows with values below this cutoff are excluded. +#' @param color Character. Defines the color palette used for the heatmap. +#' Default is a red gradient. #' #' @importFrom dplyr arrange desc filter group_by select summarise union #' @importFrom ggplot2 aes aes_string element_rect element_text geom_tile ggplot scale_fill_gradient scale_x_discrete theme theme_minimal @@ -346,7 +371,9 @@ plotLineageDA <- function(query_data = "prot", #' @importFrom tidyr drop_na #' @importFrom viridis scale_fill_viridis #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) showing the +#' relationship between queries and lineages, with the intensity of color +#' representing the count of matching records. #' @export #' #' @note @@ -476,7 +503,9 @@ plotLineageQuery <- function(query_data = all, #' @importFrom stringr str_replace_all #' @importFrom tidyr gather #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of lineage versus +#' the top neighboring domain architectures, with color intensity representing +#' the frequency of occurrences. #' @export #' #' @details @@ -554,15 +583,19 @@ plotLineageNeighbors <- function(query_data = "prot", query = "pspa", #' Lineage Domain Repeats Plot #' -#' @param query_data -#' @param colname +#' @param query_data Data frame containing protein homolog data, including +#' relevant domain architectures and lineages. +#' @param colname Character. The name of the column in query_data that contains +#' domain architectures or other structural information. 
#' #' @importFrom dplyr across mutate select where #' @importFrom ggplot2 aes element_text geom_tile ggplot scale_fill_gradient scale_x_discrete theme theme_minimal #' @importFrom stringr str_count str_replace_all #' @importFrom tidyr gather #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the +#' occurrence of domains. #' @export #' #' @examples @@ -646,7 +679,9 @@ plotLineageDomainRepeats <- function(query_data, colname) { #' @importFrom purrr map #' @importFrom stringr str_locate str_locate_all #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the +#' occurrence of domains. #' @export #' #' @examples @@ -791,25 +826,35 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size #' Stacked Lineage Plot #' -#' @param prot -#' @param column -#' @param cutoff -#' @param Lineage_col -#' @param xlabel -#' @param reduce_lineage -#' @param label.size -#' @param legend.position -#' @param legend.text.size -#' @param legend.cols -#' @param legend.size -#' @param coord_flip -#' @param legend +#' @param prot Data frame containing protein data including domain architecture +#' and lineage information. +#' @param column Character. The name of the column in prot representing domain +#' architectures (default is "DomArch"). +#' @param cutoff Numeric. A threshold value for filtering domain architectures +#' or protein counts. +#' @param Lineage_col Character. The name of the column representing lineage +#' data (default is "Lineage"). +#' @param xlabel Character. Label for the x-axis +#' (default is "Domain Architecture"). +#' @param reduce_lineage Logical. Whether to shorten lineage names +#' (default is TRUE). +#' @param label.size Numeric. The size of axis text labels (default is 8). 
+#' @param legend.position Numeric vector. Coordinates for placing the legend +#' (default is c(0.7, 0.4)). +#' @param legend.text.size Numeric. Size of the text in the legend +#' (default is 10). +#' @param legend.cols Numeric. Number of columns in the legend (default is 2). +#' @param legend.size Numeric. Size of the legend keys (default is 0.7). +#' @param coord_flip Logical. Whether to flip the coordinates of the plot +#' (default is TRUE). +#' @param legend Logical. Whether to display the legend (default is TRUE). #' #' @importFrom dplyr pull select #' @importFrom ggplot2 aes_string coord_flip element_blank element_line element_rect element_text geom_bar ggplot guides guide_legend scale_fill_manual xlab ylab theme theme_minimal #' @importFrom purrr map #' -#' @return +#' @return A ggplot object representing a stacked bar plot showing the +#' distribution of protein domain architectures across lineages. #' @export #' #' @examples @@ -937,31 +982,46 @@ plotStackedLineage <- function(prot, column = "DomArch", cutoff, Lineage_col = " #' plotWordCloud3 #' -#' @param data -#' @param size -#' @param minSize -#' @param gridSize -#' @param fontFamily -#' @param fontWeight -#' @param color -#' @param backgroundColor -#' @param minRotation -#' @param maxRotation -#' @param shuffle -#' @param rotateRatio -#' @param shape -#' @param ellipticity -#' @param widgetsize -#' @param figPath -#' @param hoverFunction +#' @param data Data frame or table containing words and their frequencies for +#' the word cloud. +#' @param size Numeric. Scaling factor for word sizes (default is 1). +#' @param minSize Numeric. Minimum font size for the smallest word +#' (default is 0). +#' @param gridSize Numeric. Size of the grid for placing words (default is 0). +#' @param fontFamily Character. Font family to use for the words +#' (default is "Segoe UI"). +#' @param fontWeight Character. Font weight for the words (default is "bold"). +#' @param color Character or vector. Color of the words. 
Use "random-dark" for +#' random dark colors (default) or specify a color. +#' @param backgroundColor Character. Background color of the word cloud +#' (default is "white"). +#' @param minRotation Numeric. Minimum rotation angle of words in radians +#' (default is -π/4). +#' @param maxRotation Numeric. Maximum rotation angle of words in radians +#' (default is π/4). +#' @param shuffle Logical. Whether to shuffle the words (default is TRUE). +#' @param rotateRatio Numeric. Proportion of words that are rotated +#' (default is 0.4). +#' @param shape Character. Shape of the word cloud ("circle" is default, but +#' you can use "cardioid", "star", "triangle", etc.). +#' @param ellipticity Numeric. Degree of ellipticity (default is 0.65). +#' @param widgetsize Numeric vector. Width and height of the widget +#' (default is NULL, which uses default size). +#' @param figPath Character. Path to an image file to use as a mask for the +#' word cloud (optional). +#' @param hoverFunction JS function. JavaScript function to run when hovering +#' over words (optional). #' #' @importFrom base64enc base64encode #' @importFrom htmlwidgets createWidget JS sizingPolicy #' -#' @return +#' @return An HTML widget object displaying a word cloud. #' @export #' #' @examples +#' \dontrun{ +#' wordcloud3(data = your_data, size = 1.5, color = "random-light") +#' } wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = "Segoe UI", fontWeight = "bold", color = "random-dark", backgroundColor = "white", minRotation = -pi / 4, maxRotation = pi / 4, shuffle = TRUE, @@ -1022,16 +1082,20 @@ wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = " #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname -#' @param cutoff -#' @param UsingRowsCutoff +#' @param colname Character. The name of the column in `query_data` to generate +#' the word cloud from. 
Default is "DomArch". +#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' frequency. Default is 70. +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of +#' a frequency cutoff. Default is FALSE. #' #' @importFrom dplyr filter pull #' @importFrom RColorBrewer brewer.pal #' @importFrom rlang sym #' @importFrom wordcloud wordcloud #' -#' @return +#' @return A word cloud plot showing the frequency of elements from the selected +#' column. #' @export #' #' @details @@ -1102,14 +1166,18 @@ createWordCloudElement <- function(query_data = "prot", #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname -#' @param cutoff -#' @param UsingRowsCutoff +#' @param colname Character. The name of the column in `query_data` to generate +#' the word cloud from. Default is "DomArch". +#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' frequency. Default is 70. +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of +#' a frequency cutoff. Default is FALSE. #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A word cloud plot showing the frequency of elements from the selected +#' column. #' @export #' #' @details @@ -1172,16 +1240,23 @@ createWordCloud2Element <- function(query_data = "prot", #### Sunburst ##### #' Lineage Sunburst #' -#' @param prot Data frame containing a lineage column that the sunburst plot will be generated for -#' @param lineage_column String. Name of the lineage column within the data frame. Defaults to "Lineage" -#' @param type String, either "sunburst" or "sund2b". If type is "sunburst", a sunburst plot of the lineage +#' @param prot Data frame containing a lineage column that the sunburst plot +#' will be generated for +#' @param lineage_column String. 
Name of the lineage column within the
+#' data frame. Defaults to "Lineage"
+#' @param type String, either "sunburst" or "sund2b". If type is "sunburst",
+#' a sunburst plot of the lineage will be rendered.
#' @param levels Integer. Number of levels the sunburst will have.
-#' @param colors
-#' @param legendOrder String vector. The order of the legend. If legendOrder is NULL,
-#' @param showLegend Boolean. If TRUE, the legend will be enabled when the component first renders.
-#' @param maxLevels Integer, the maximum number of levels to display in the sunburst; 5 by default, NULL to disable
-#' then the legend will be in the descending order of the top level hierarchy.
-#' will be rendered. If the type is sund2b, a sund2b plot will be rendered.
+#' @param colors A vector of colors for the sunburst plot.
+#' If NULL, default colors are used.
+#' @param legendOrder String vector. The order of the legend. If legendOrder
+#' is NULL, the legend is in the descending order of the top level hierarchy.
+#' @param showLegend Boolean. If TRUE, the legend will be enabled when the
+#' component first renders.
+#' @param maxLevels Integer, the maximum number of levels to display in the
+#' sunburst; 5 by default, NULL to disable.
+#' If type is "sunburst", a sunburst plot will be rendered; if "sund2b",
+#' a sund2b plot will be rendered.
#'
#' @importFrom d3r d3_nest
#' @importFrom dplyr arrange desc group_by_at select summarise
@@ -1190,12 +1265,13 @@ createWordCloud2Element <- function(query_data = "prot",
#' @importFrom sunburstR sunburst sund2b
#' @importFrom tidyr drop_na separate
#'
-#' @return
+#' @return A sunburst or sund2b plot based on the input lineage data. 
#' @export #' #' @examples #' \dontrun{ -#' plotLineageSunburst() +#' plotLineageSunburst(prot, lineage_column = "Lineage", +#' type = "sunburst", levels = 3) #' } plotLineageSunburst <- function(prot, lineage_column = "Lineage", type = "sunburst", diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 44979c3c..5904a522 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -45,10 +45,12 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param x Character vector. #' @param y Delimitter. Default is space (" "). #' -#' @return +#' @return A character vector in title case. #' @export #' #' @examples +#' to_titlecase("hello world") +#' to_titlecase("this is a test", "_") to_titlecase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -87,7 +89,8 @@ to_titlecase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return +#' @return A data frame containing the combined alignment and lineage +#' information. 
#' @export #' #' @note Please refer to the source code if you have alternate + @@ -188,8 +191,8 @@ add_leaves <- function(aln_file = "", #' #' @author Samuel Chen, Janani Ravi #' -#' @description This function adds a new 'Name' column that is comprised of components from -#' Kingdom, Phylum, Genus, and species, as well as the accession +#' @description This function adds a new 'Name' column that is comprised of +#' components from Kingdom, Phylum, Genus, and species, as well as the accession #' #' @param data Data to add name column to #' @param accnum_col Column containing accession numbers @@ -209,6 +212,9 @@ add_leaves <- function(aln_file = "", #' @export #' #' @examples +#' \dontrun{ +#' add_name(data_frame) +#' } add_name <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { @@ -272,8 +278,8 @@ add_name <- function(data, #' Default is 'pspa.txt' #' @param fa_outpath Character. Path to the written fasta file. #' Default is 'NULL' -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -#' Default is 'FALSE' +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. Default is 'FALSE' #' #' @details The alignment file would need two columns: 1. accession + #' number and 2. alignment. The protein homolog accession to lineage mapping + @@ -283,7 +289,9 @@ add_name <- function(data, #' #' @importFrom readr write_file #' -#' @return +#' @return Character string containing the Fasta formatted sequences. +#' If `fa_outpath` is specified, the function also writes the sequences to the +#' Fasta file. #' @export #' #' @examples @@ -326,7 +334,7 @@ convert_aln2fa <- function(aln_file = "", #' Default rename_fasta() replacement function. 
Maps an accession number to its name #' -#' @param line he line of a fasta file starting with '>' +#' @param line The line of a fasta file starting with '>' #' @param acc2name Data Table containing a column of accession numbers and a name column #' @param acc_col Name of the column containing Accession numbers #' @param name_col Name of the column containing the names that the accession numbers @@ -336,10 +344,18 @@ convert_aln2fa <- function(aln_file = "", #' @importFrom stringr str_locate #' @importFrom rlang sym #' -#' @return +#' @return Character string. The modified line from the Fasta file header with +#' the name instead of the accession number. #' @export #' #' @examples +#' \dontrun{ +#' acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +#' Name = c("Species A", "Species B")) +#' line <- ">ACC001 some additional info" +#' mapped_line <- map_acc2name(line, acc2name_table) +#' print(mapped_line) # Expected output: ">Species A" +#' } map_acc2name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an add_names column # Find the first ' ' @@ -365,10 +381,14 @@ map_acc2name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") #' @importFrom purrr map #' @importFrom readr read_lines write_lines #' -#' @return +#' @return Character vector containing the modified lines of the Fasta file. #' @export #' #' @examples +#' \dontrun{ +#' rename_fasta("input.fasta", "output.fasta", +#' replacement_function = map_acc2name, acc2name = acc2name_table) +#' } rename_fasta <- function(fa_path, outpath, replacement_function = map_acc2name, ...) { lines <- read_lines(fa_path) @@ -397,18 +417,21 @@ rename_fasta <- function(fa_path, outpath, #' Default is 'here("data/rawdata_aln/")' #' @param fa_outpath Character. Path to the written fasta file. #' Default is 'here("data/alns/")'. -#' @param lin_file Character. Path to file. Master protein file with AccNum & lineages. 
-#' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -#' Default is 'FALSE'. +#' @param lin_file Character. Path to file. Master protein file with AccNum & +#' lineages. Default is 'here("data/rawdata_tsv/all_semiclean.txt")' +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. Default is 'FALSE'. #' -#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats and/or column names. +#' @details The alignment files would need two columns separated by spaces: 1. +#' AccNum and 2. alignment. The protein homolog file should have AccNum, +#' Species, Lineages. +#' @note Please refer to the source code if you have alternate + file +#' formats and/or column names. #' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return +#' @return A list of paths to the generated Fasta files. #' @export #' #' @examples @@ -456,24 +479,27 @@ generate_all_aln2fa <- function(aln_path = here("data/rawdata_aln/"), #' Resulting fasta file is written to the outpath. #' #' -#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for. -#' Function may not work for vectors of length > 10,000 +#' @param accessions Character vector containing protein accession numbers to +#' generate fasta sequences for. Function may not work for vectors of +#' length > 10,000 #' @param outpath [str]. Location where fasta file should be written to. -#' @param plan +#' @param plan Character. The plan to use for processing. Default is "sequential". 
#' #' @importFrom Biostrings readAAStringSet #' @importFrom future future plan #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return A Fasta file is written to the specified `outpath`. #' @export #' #' @examples #' \dontrun{ -#' acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") +#' acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' outpath = "my_proteins.fasta") #' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2fa(outpath = "ebi.fa") +#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +#' acc2fa(outpath = "ebi.fa") #' } acc2fa <- function(accessions, outpath, plan = "sequential") { # validation @@ -562,14 +588,23 @@ acc2fa <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character vector containing representative accession numbers, +#' one for each distinct observation in the specified 'reduced' column. #' @export #' #' @examples +#' \dontrun{ +#' # Example usage with a data frame called `protein_data` +#' representative_accessions <- RepresentativeAccNums(prot_data = protein_data, +#' reduced = "Lineage", +#' accnum_col = "AccNum") +#' print(representative_accessions) +#' } RepresentativeAccNums <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column + # Get Unique reduced column and then bind the AccNums back to get one + # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -603,8 +638,10 @@ RepresentativeAccNums <- function(prot_data, #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use. 
One of three options: "Muscle", "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written +#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' "ClustalO", or "ClustalW" +#' @param outpath Path to write the resulting alignment to as a FASTA file. If +#' NULL, no file is written #' #' @importFrom Biostrings readAAStringSet #' @importFrom msa msaMuscle msaClustalOmega msaClustalW @@ -613,6 +650,12 @@ RepresentativeAccNums <- function(prot_data, #' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' aligned_sequences <- alignFasta("path/to/sequences.fasta", +#' tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") +#' print(aligned_sequences) +#' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { fasta <- readAAStringSet(fasta_file) @@ -643,10 +686,15 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @importFrom Biostrings unmasked #' @importFrom readr write_file #' -#' @return +#' @return Character string of the FASTA content that was written to the file. #' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' alignment <- alignFasta("path/to/sequences.fasta") +#' write.MsaAAMultipleAlignment(alignment, "path/to/aligned_sequences.fasta") +#' } write.MsaAAMultipleAlignment <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" @@ -662,15 +710,21 @@ write.MsaAAMultipleAlignment <- function(alignment, outpath) { #' get_accnums_from_fasta_file #' -#' @param fasta_file +#' @param fasta_file Character. Path to the FASTA file from which +#' accession numbers will be extracted. #' #' @importFrom readr read_file #' @importFrom stringi stri_extract_all_regex #' -#' @return +#' @return A character vector containing the extracted accession numbers. 
#' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' accnums <- get_accnums_from_fasta_file("path/to/sequences.fasta") +#' print(accnums) +#' } get_accnums_from_fasta_file <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] diff --git a/R/reverse_operons.R b/R/reverse_operons.R index e4bbd50e..b165ef72 100755 --- a/R/reverse_operons.R +++ b/R/reverse_operons.R @@ -3,14 +3,26 @@ # Modified by Janani Ravi and Samuel Chen -#' reveql +#' reveql: Reverse Equalities in Genomic Context #' -#' @param prot +#' @description +#' This function processes the genomic context strings (GenContext) and reverses +#' directional signs based on the presence of an equal sign ("="). +#' +#' @param prot [vector] A vector of genomic context strings to be processed. +#' +#' @return [vector] A vector of the same length as the input, where each genomic +#' element is annotated with either a forward ("->") or reverse ("<-") direction, +#' depending on its position relative to the "=" symbols. #' -#' @return #' @export #' #' @examples +#' # Example input: Genomic context with directional symbols and an asterisk +#' genomic_context <- c("A", "B", "*", "C", "D", "=", "E", "F") +#' reveql(genomic_context) +#' +#' # Output: "A->", "B->", "*", "<-C", "<-D", "=", "E->", "F->" reveql <- function(prot) { w <- prot # $GenContext.orig # was 'x' @@ -57,14 +69,28 @@ reveql <- function(prot) { ## The function to reverse operons -#' reverse_operon +#' reverse_operon: Reverse the Direction of Operons in Genomic Context +#' +#' @description +#' This function processes a genomic context data frame to reverse the direction +#' of operons based on specific patterns in the GenContext column. It handles +#' elements represented by ">" and "<" and restructures the genomic context by +#' flipping the direction of operons while preserving the relationships +#' indicated by "=". 
+#'
+#' @param prot [data.frame] A data frame containing at least a column named
+#' 'GenContext', which represents the genomic contexts that need to be reversed.
 #'
-#' @param prot
+#' @return [data.frame] The input data frame with the 'GenContext' column updated
+#' to reflect the reversed operons.
 #'
-#' @return
 #' @export
 #'
 #' @examples
+#' # Example genomic context data frame
+#' prot <- data.frame(GenContext = c("A>B", "CI"))
+#' reversed_prot <- reverse_operon(prot)
+#' print(reversed_prot)
 reverse_operon <- function(prot) {
     gencontext <- prot$GenContext

diff --git a/man/BinaryDomainNetwork.Rd b/man/BinaryDomainNetwork.Rd
index bb7e2353..5c35be0f 100644
--- a/man/BinaryDomainNetwork.Rd
+++ b/man/BinaryDomainNetwork.Rd
@@ -19,20 +19,32 @@ BinaryDomainNetwork(
 \arguments{
 \item{prot}{A data frame that contains the column 'DomArch'.}
 
-\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.}
+\item{column}{Name of column containing Domain architecture from which nodes
+and edges are generated.}
 
-\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count".
-Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.}
+\item{domains_of_interest}{Character vector specifying the domains of interest.}
+
+\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for
+total counts if cutoff_type is "Total Count".
+Only use domains that appear in cutoff or greater lineages if cutoff_type is
+Lineage.}
 
 \item{layout}{Character. Layout type to be used for the network. 
Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} -\item{query_color}{Color that the nodes of the domains in the domains_of_interest vector are colored} +\item{query_color}{Color that the nodes of the domains in the +domains_of_interest vector are colored} + +\item{partner_color}{Color that the nodes that are not part of the +domains_of_interest vector are colored} -\item{partner_color}{Color that the nodes that are not part of the domains_of_interest vector are colored} +\item{border_color}{Color for the borders of the nodes.} \item{IsDirected}{Is the network directed? Set to false to eliminate arrows} } +\value{ +A network visualization of domain architectures. +} \description{ This function creates a domain network from the 'DomArch' column. @@ -42,6 +54,6 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -domain_network(pspa) +BinaryDomainNetwork(pspa) } } diff --git a/man/GCA2Lineage.Rd b/man/GCA2Lineage.Rd index 9ec0ce56..796c2efb 100644 --- a/man/GCA2Lineage.Rd +++ b/man/GCA2Lineage.Rd @@ -21,7 +21,13 @@ This file can be generated using the "downloadAssemblySummary()" function} (taxid to lineage mapping). This file can be generated using the "create_lineage_lookup()" function} -\item{acc_col}{} +\item{acc_col}{Character. The name of the column in \code{prot_data} containing +accession numbers. Default is "AccNum".} +} +\value{ +A dataframe containing the merged information of GCA_IDs, TaxIDs, +and their corresponding lineage up to the phylum level. The dataframe +will include information from the input \code{prot_data} and lineage data. 
} \description{ Function to map GCA_ID to TaxID, and TaxID to Lineage @@ -29,6 +35,13 @@ Function to map GCA_ID to TaxID, and TaxID to Lineage \note{ Currently configured to have at most kingdom and phylum } +\examples{ +\dontrun{ +result <- GCA2Lineage(prot_data = my_prot_data, + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv") +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/GenContextNetwork.Rd b/man/GenContextNetwork.Rd index 2eeebbc5..08d4f476 100644 --- a/man/GenContextNetwork.Rd +++ b/man/GenContextNetwork.Rd @@ -18,15 +18,20 @@ GenContextNetwork( \item{domains_of_interest}{Character vector of domains of interest.} -\item{column}{Name of column containing Genomic Context from which nodes and edges are generated.} +\item{column}{Name of column containing Genomic Context from which nodes and +edges are generated.} -\item{cutoff}{Integer. Only use GenContexts that occur at or above the cutoff percentage for total count} +\item{cutoff}{Integer. Only use GenContexts that occur at or above the cutoff +percentage for total count} \item{layout}{Character. Layout type to be used for the network. Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto" \item "nice"}} \item{directed}{Is the network directed?} } +\value{ +A plot of the genomic context network. +} \description{ This function creates a Genomic Context network from the 'GenContext' column. @@ -34,6 +39,6 @@ A network of Genomic Context is returned. } \examples{ \dontrun{ -gc_directed_network(pspa, column = "GenContex", cutoff = 55) +gc_directed_network(pspa, column = "GenContext", cutoff = 55) } } diff --git a/man/IPG2Lineage.Rd b/man/IPG2Lineage.Rd index 282d5cbf..42b9b943 100644 --- a/man/IPG2Lineage.Rd +++ b/man/IPG2Lineage.Rd @@ -27,6 +27,10 @@ IPG2Lineage( ipg database. The protein accession in 'accessions' should be contained in this file} +\item{refseq_assembly_path}{String. 
Path to the RefSeq assembly summary file.} + +\item{genbank_assembly_path}{String. Path to the GenBank assembly summary file.} + \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the "create_lineage_lookup()" function} @@ -37,6 +41,9 @@ This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} f \value{ A \code{data.table} with the lineage information for the provided protein accessions. + +A data table containing protein accessions along with their +corresponding TaxIDs and lineage information. } \description{ Takes the resulting file of an efetch run on the ipg database and @@ -49,6 +56,15 @@ append lineage, and taxid columns IPG2Lineage() } +\dontrun{ +lins <- IPG2Lineage( + accessions = c("P12345", "Q67890"), + ipg_file = "path/to/ipg_results.txt", + refseq_assembly_path = "path/to/refseq_assembly_summary.txt", + genbank_assembly_path = "path/to/genbank_assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/RepresentativeAccNums.Rd b/man/RepresentativeAccNums.Rd index f617cde4..49192f8e 100644 --- a/man/RepresentativeAccNums.Rd +++ b/man/RepresentativeAccNums.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{RepresentativeAccNums} \alias{RepresentativeAccNums} -\title{Function to generate a vector of one Accession number per distinct observation from 'reduced' column} +\title{Function to generate a vector of one Accession number per distinct +observation from 'reduced' column} \usage{ RepresentativeAccNums(prot_data, reduced = "Lineage", accnum_col = "AccNum") @@ -17,9 +18,29 @@ One accession number will be assigned for each of these observations} \item{accnum_col}{Column from prot_data that contains Accession Numbers} } +\value{ +A character vector containing one Accession number per distinct +observation from the specified reduced 
column. + +A character vector containing representative accession numbers, +one for each distinct observation in the specified 'reduced' column. +} \description{ Function to generate a vector of one Accession number per distinct observation from 'reduced' column } +\examples{ +\dontrun{ +representative_accessions <- RepresentativeAccNums(prot_data, +reduced = "Lineage", accnum_col = "AccNum") +} +\dontrun{ +# Example usage with a data frame called `protein_data` +representative_accessions <- RepresentativeAccNums(prot_data = protein_data, + reduced = "Lineage", + accnum_col = "AccNum") +print(representative_accessions) +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/acc2FA.Rd b/man/acc2FA.Rd new file mode 100644 index 00000000..6c6ea43c --- /dev/null +++ b/man/acc2FA.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CHANGED-pre-msa-tree.R +\name{acc2FA} +\alias{acc2FA} +\title{acc2FA converts protein accession numbers to a fasta format.} +\usage{ +acc2FA(accessions, outpath, plan = "sequential") +} +\arguments{ +\item{accessions}{Character vector containing protein accession numbers to +generate fasta sequences for. +Function may not work for vectors of length > 10,000} + +\item{outpath}{\link{str} Location where fasta file should be written to.} + +\item{plan}{Character string specifying the parallel processing strategy to +use with the \code{future} package. Default is "sequential".} +} +\value{ +A logical value indicating whether the retrieval and conversion were +successful. Returns \code{TRUE} if successful and \code{FALSE} otherwise. +} +\description{ +Resulting fasta file is written to the outpath. 
+} +\examples{ +\dontrun{ +acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +outpath = "my_proteins.fasta") +Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") +EBI:accessions <- c("P12345", "Q9UHC1", +"O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") +} +} +\author{ +Samuel Chen, Janani Ravi +} +\keyword{accnum,} +\keyword{fasta} diff --git a/man/acc2Lineage.Rd b/man/acc2Lineage.Rd index a46b6f20..ce499592 100644 --- a/man/acc2Lineage.Rd +++ b/man/acc2Lineage.Rd @@ -32,11 +32,16 @@ This file can be generated using the "downloadAssemblySummary()" function} \item{ipgout_path}{Path to write the results of the efetch run of the accessions on the ipg database. If NULL, the file will not be written. Defaults to NULL} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} } \value{ A \code{data.table} that contains the lineage information, mapping protein accessions to their tax IDs and lineages. + +A dataframe containing lineage information mapped to the given protein +accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, +Protein, Protein Name, Species, and Lineage. } \description{ This function combines 'efetchIPG()' and 'IPG2Lineage()' to map a set @@ -51,6 +56,14 @@ of protein accessions to their assembly (GCA_ID), tax ID, and lineage. \dontrun{ acc2Lineage() } +\dontrun{ +lineage_data <- acc2Lineage( + accessions = c("P12345", "Q67890"), + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv", + ipgout_path = "path/to/output.txt" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/acc2fa.Rd b/man/acc2fa.Rd index 158b2d51..517ee3d6 100644 --- a/man/acc2fa.Rd +++ b/man/acc2fa.Rd @@ -7,12 +7,16 @@ acc2fa(accessions, outpath, plan = "sequential") } \arguments{ -\item{accessions}{Character vector containing protein accession numbers to generate fasta sequences for. 
-Function may not work for vectors of length > 10,000} +\item{accessions}{Character vector containing protein accession numbers to +generate fasta sequences for. Function may not work for vectors of +length > 10,000} \item{outpath}{\link{str}. Location where fasta file should be written to.} -\item{plan}{} +\item{plan}{Character. The plan to use for processing. Default is "sequential".} +} +\value{ +A Fasta file is written to the specified \code{outpath}. } \description{ acc2fa converts protein accession numbers to a fasta format. @@ -20,9 +24,11 @@ Resulting fasta file is written to the outpath. } \examples{ \dontrun{ -acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") +acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +outpath = "my_proteins.fasta") Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2fa(outpath = "ebi.fa") +EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +acc2fa(outpath = "ebi.fa") } } \author{ diff --git a/man/addLeaves2Alignment.Rd b/man/addLeaves2Alignment.Rd index a758ebd5..d7055fbf 100644 --- a/man/addLeaves2Alignment.Rd +++ b/man/addLeaves2Alignment.Rd @@ -22,6 +22,10 @@ Default is 'pspa.txt'} \item{reduced}{Boolean. If TRUE, a reduced data frame will be generated with only one sequence per lineage. Default is FALSE.} } +\value{ +A data frame containing the enriched alignment data with lineage +information. +} \description{ Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. diff --git a/man/addLineage.Rd b/man/addLineage.Rd index ab02a5ab..e2363463 100644 --- a/man/addLineage.Rd +++ b/man/addLineage.Rd @@ -23,26 +23,30 @@ addLineage( ) } \arguments{ -\item{df}{A \code{data.frame} containing the input data. One column must contain -the accession numbers.} +\item{df}{Dataframe containing accession numbers. 
The dataframe should +have a column specified by \code{acc_col} that contains these accession numbers.} -\item{acc_col}{A string specifying the column name in \code{df} that holds the -accession numbers. Defaults to \code{"AccNum"}.} +\item{acc_col}{Character. The name of the column in \code{df} containing +accession numbers. Default is "AccNum".} -\item{assembly_path}{A string specifying the path to the \code{assembly_summary.txt} -file. This file contains metadata about assemblies.} +\item{assembly_path}{String. The path to the assembly summary file generated +using the \code{downloadAssemblySummary()} function.} -\item{lineagelookup_path}{A string specifying the path to the lineage lookup -file, which contains a mapping from tax IDs to their corresponding lineages.} +\item{lineagelookup_path}{String. The path to the lineage lookup file (taxid +to lineage mapping) generated using the \code{create_lineage_lookup()} function.} -\item{ipgout_path}{(Optional) A string specifying the path where IPG database -fetch results will be saved. If \code{NULL}, the results are not written to a file.} +\item{ipgout_path}{String. Optional path to save intermediate output files. +Default is NULL.} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} } \value{ A \code{data.frame} that combines the original \code{df} with the lineage information. + +A dataframe that combines the original dataframe \code{df} with lineage +information retrieved based on the provided accession numbers. 
} \description{ addLineage @@ -53,4 +57,10 @@ addLineage \dontrun{ addLineage() } +\dontrun{ +enriched_df <- addLineage(df = my_data, + acc_col = "AccNum", + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv") +} } diff --git a/man/addName.Rd b/man/addName.Rd index e04f9849..5bf400b4 100644 --- a/man/addName.Rd +++ b/man/addName.Rd @@ -34,6 +34,16 @@ Original data with a 'Name' column This function adds a new 'Name' column that is comprised of components from Kingdom, Phylum, Genus, and species, as well as the accession } +\examples{ +# Example usage of the addName function +data <- data.frame( + AccNum = c("ACC123", "ACC456"), + Species = c("Homo sapiens", "Mus musculus"), + Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") +) +enriched_data <- addName(data) +print(enriched_data) +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/addTaxID.Rd b/man/addTaxID.Rd index d2fe139d..e960769b 100644 --- a/man/addTaxID.Rd +++ b/man/addTaxID.Rd @@ -7,8 +7,26 @@ addTaxID(data, acc_col = "AccNum", version = T) } \arguments{ -\item{version}{} +\item{data}{A data frame or data table containing protein accession numbers.} + +\item{acc_col}{A string specifying the column name in \code{data} that contains +the accession numbers. Defaults to "AccNum".} + +\item{version}{A logical indicating whether to remove the last two characters +from the accession numbers for TaxID retrieval. Defaults to TRUE.} +} +\value{ +A data table that includes the original data along with a new column +containing the corresponding TaxIDs. 
} \description{ addTaxID } +\examples{ +\dontrun{ +# Create a sample data table with accession numbers +sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) +enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) +print(enriched_data) +} +} diff --git a/man/add_leaves.Rd b/man/add_leaves.Rd index f1eeed10..5e462a2b 100644 --- a/man/add_leaves.Rd +++ b/man/add_leaves.Rd @@ -22,6 +22,10 @@ Default is 'pspa.txt'} \item{reduced}{Boolean. If TRUE, a reduced data frame will be generated with only one sequence per lineage. Default is FALSE.} } +\value{ +A data frame containing the combined alignment and lineage +information. +} \description{ Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. diff --git a/man/add_name.Rd b/man/add_name.Rd index f19139e1..db7b7339 100644 --- a/man/add_name.Rd +++ b/man/add_name.Rd @@ -31,8 +31,13 @@ Lineage, and AccNum info} Original data with a 'Name' column } \description{ -This function adds a new 'Name' column that is comprised of components from -Kingdom, Phylum, Genus, and species, as well as the accession +This function adds a new 'Name' column that is comprised of +components from Kingdom, Phylum, Genus, and species, as well as the accession +} +\examples{ +\dontrun{ +add_name(data_frame) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/alignFasta.Rd b/man/alignFasta.Rd index 21b020cf..54678d0a 100644 --- a/man/alignFasta.Rd +++ b/man/alignFasta.Rd @@ -11,9 +11,11 @@ alignFasta(fasta_file, tool = "Muscle", outpath = NULL) \arguments{ \item{fasta_file}{Path to the FASTA file to be aligned} -\item{tool}{Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW"} +\item{tool}{Type of alignment tool to use. One of three options: "Muscle", +"ClustalO", or "ClustalW"} -\item{outpath}{Path to write the resulting alignment to as a FASTA file. 
If NULL, no file is written} +\item{outpath}{Path to write the resulting alignment to as a FASTA file. If +NULL, no file is written} } \value{ aligned fasta sequence as a MsaAAMultipleAlignment object @@ -23,6 +25,18 @@ aligned fasta sequence as a MsaAAMultipleAlignment object \description{ Perform a Multiple Sequence Alignment on a FASTA file. } +\examples{ +\dontrun{ +aligned_sequences <- alignFasta("my_sequences.fasta", +tool = "Muscle", outpath = "aligned_output.fasta") +} +\dontrun{ +# Example usage +aligned_sequences <- alignFasta("path/to/sequences.fasta", +tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") +print(aligned_sequences) +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/cleanDomainArchitecture.Rd b/man/cleanDomainArchitecture.Rd index 887b5388..f12f1083 100644 --- a/man/cleanDomainArchitecture.Rd +++ b/man/cleanDomainArchitecture.Rd @@ -19,21 +19,33 @@ cleanDomainArchitecture( \arguments{ \item{prot}{A data frame containing a 'DomArch' column} +\item{old}{The name of the original column containing domain architecture. +Defaults to "DomArch.orig".} + +\item{new}{The name of the cleaned column to be created. Defaults to +"DomArch".} + \item{domains_keep}{A data frame containing the domain names to be retained.} -\item{domains_rename}{A data frame containing the domain names to be replaced in a column 'old' and the +\item{domains_rename}{A data frame containing the domain names to be replaced +in a column 'old' and the corresponding replacement values in a column 'new'.} -\item{condenseRepeatedDomains}{Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE.} +\item{condenseRepeatedDomains}{Boolean. If TRUE, repeated domains in +'DomArch' are condensed. Default is TRUE.} -\item{removeTails}{Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.} +\item{removeTails}{Boolean. If TRUE, 'ClustName' will be filtered based on +domains to keep/remove. 
Default is FALSE.}
 
-\item{removeEmptyRows}{Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE.}
+\item{removeEmptyRows}{Boolean. If TRUE, rows with empty/unnecessary values
+in 'DomArch' are removed. Default is FALSE.}
 
-\item{domains_ignore}{A data frame containing the domain names to be removed in a column called 'domains'}
+\item{domains_ignore}{A data frame containing the domain names to be removed
+in a column called 'domains'}
 }
 \value{
-The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column.
+The original data frame is returned with the clean DomArchs column
+and the old domains in the DomArchs.old column.
 }
 \description{
 Cleanup Domain Architectures
@@ -46,6 +58,7 @@ The original data frame is returned with the clean DomArchs column and the old d
 }
 \examples{
 \dontrun{
-cleanDomainArchitecture(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL)
+cleanDomainArchitecture(prot, TRUE, FALSE,
+domains_keep, domains_rename, domains_ignore = NULL)
 }
 }
diff --git a/man/cleanFAHeaders.Rd b/man/cleanFAHeaders.Rd
index e9ad9b30..e93d0ca3 100644
--- a/man/cleanFAHeaders.Rd
+++ b/man/cleanFAHeaders.Rd
@@ -7,7 +7,9 @@
 cleanFAHeaders(fasta)
 }
 \arguments{
-\item{fasta}{}
+\item{fasta}{An \link{XStringSet} object representing the sequences from a
+FASTA file. 
The sequence names (headers) will be adjusted for uniqueness +and sanitized.} } \value{ \link{XStringSet} fasta with adjusted names (headers) diff --git a/man/cleanGeneDescription.Rd b/man/cleanGeneDescription.Rd index f98a25d4..3d106ae6 100644 --- a/man/cleanGeneDescription.Rd +++ b/man/cleanGeneDescription.Rd @@ -7,7 +7,10 @@ cleanGeneDescription(prot, column) } \arguments{ -\item{column}{} +\item{prot}{A data frame containing the gene descriptions.} + +\item{column}{The name of the column from which gene descriptions are pulled +for cleanup.} } \value{ Return trailing period that occurs in GeneDesc column diff --git a/man/cleanLineage.Rd b/man/cleanLineage.Rd index adcea312..071b37d2 100644 --- a/man/cleanLineage.Rd +++ b/man/cleanLineage.Rd @@ -7,10 +7,15 @@ cleanLineage(prot, lins_rename) } \arguments{ -\item{lins_rename}{} +\item{prot}{A data frame containing a 'Lineage' column that needs to be +cleaned up.} + +\item{lins_rename}{A data frame with two columns: 'old' containing terms +to be replaced and 'new' containing the corresponding replacement terms.} } \value{ -Describe return, in detail +The original data frame with the 'Lineage' column updated based on +the provided replacements. } \description{ Cleanup Lineage diff --git a/man/cleanSpecies.Rd b/man/cleanSpecies.Rd index 82b5444c..93fc2e05 100644 --- a/man/cleanSpecies.Rd +++ b/man/cleanSpecies.Rd @@ -13,7 +13,7 @@ cleanSpecies(prot, removeEmptyRows = FALSE) Default is false.} } \value{ -Describe return, in detail +The original data frame with Species cleaned. } \description{ Cleanup Species diff --git a/man/combine_files.Rd b/man/combine_files.Rd index 4126eb9e..432513d6 100644 --- a/man/combine_files.Rd +++ b/man/combine_files.Rd @@ -13,16 +13,34 @@ combine_files( ) } \arguments{ -\item{inpath}{String of 'master' path where the files reside (recursive=T)} +\item{inpath}{Character. The master directory path where the files reside. 
+The search is recursive (i.e., it will look in subdirectories as well).} -\item{pattern}{Character vector containing search pattern for files} +\item{pattern}{Character. A search pattern to identify files to be combined. +Default is "*full_analysis.tsv".} -\item{col_names}{Takes logical T/F arguments OR column names vector; -usage similar to col_names parameter in \code{readr::read_delim}} +\item{delim}{Character. The delimiter used in the input files. +Default is tab ("\t").} + +\item{skip}{Integer. The number of lines to skip at the beginning of each file. +Default is 0.} + +\item{col_names}{Logical or character vector. If TRUE, the first row of each file +is treated as column names. Alternatively, a character vector can +be provided to specify custom column names.} +} +\value{ +A data frame containing the combined contents of all matched files. +Each row will include a new column "ByFile" indicating the source file of the data. } \description{ Download the combined assembly summaries of genbank and refseq } +\examples{ +\dontrun{ +combined_data <- combine_files(inpath = "../molevol_data/project_data/phage_defense/") +} +} \author{ Janani Ravi } diff --git a/man/combine_full.Rd b/man/combine_full.Rd index f4e6597b..563a5450 100644 --- a/man/combine_full.Rd +++ b/man/combine_full.Rd @@ -7,8 +7,22 @@ combine_full(inpath, ret = FALSE) } \arguments{ -\item{ret}{} +\item{inpath}{Character. The path to the directory containing the +\code{.full_analysis.tsv} files to be combined.} + +\item{ret}{Logical. If TRUE, the function will return the combined data frame. +Default is FALSE, meaning it will only write the file and not return the data.} +} +\value{ +If \code{ret} is TRUE, a data frame containing the combined data from all +input files. If \code{ret} is FALSE, the function writes the combined data to a +TSV file named \code{cln_combined.tsv} in the specified directory and returns NULL. 
} \description{ Combining full_analysis files } +\examples{ +\dontrun{ +combined_data <- combine_full("path/to/full_analysis/files", ret = TRUE) +} +} diff --git a/man/combine_ipr.Rd b/man/combine_ipr.Rd index 52aa3057..ddb3e6af 100644 --- a/man/combine_ipr.Rd +++ b/man/combine_ipr.Rd @@ -7,8 +7,22 @@ combine_ipr(inpath, ret = FALSE) } \arguments{ -\item{ret}{} +\item{inpath}{Character. The path to the directory containing the +\code{.iprscan_cln.tsv} files to be combined.} + +\item{ret}{Logical. If TRUE, the function will return the combined data frame. +Default is FALSE, meaning it will only write the file and not return the data.} +} +\value{ +If \code{ret} is TRUE, a data frame containing the combined data from all +input files. If \code{ret} is FALSE, the function writes the combined data to a +TSV file named \code{ipr_combined.tsv} in the specified directory and returns NULL. } \description{ Combining clean ipr files } +\examples{ +\dontrun{ +combined_ipr_data <- combine_ipr("path/to/ipr/files", ret = TRUE) +} +} diff --git a/man/condenseRepeatedDomains.Rd b/man/condenseRepeatedDomains.Rd index 3b239129..ee51a544 100644 --- a/man/condenseRepeatedDomains.Rd +++ b/man/condenseRepeatedDomains.Rd @@ -14,7 +14,7 @@ condenseRepeatedDomains(prot, by_column = "DomArch", excluded_prots = c()) \item{excluded_prots}{Vector of strings that condenseRepeatedDomains should not reduce to (s). Defaults to c()} } \value{ -Describe return, in detail +A data frame with condensed repeated domains in the specified column. } \description{ Condense repeated domains diff --git a/man/convert2TitleCase.Rd b/man/convert2TitleCase.Rd index 84e7fa00..cd8634ef 100644 --- a/man/convert2TitleCase.Rd +++ b/man/convert2TitleCase.Rd @@ -13,8 +13,16 @@ convert2TitleCase(text, delimitter) \item{y}{Delimitter. Default is space (" ").} } +\value{ +Character vector with the input strings converted to title case. +} \description{ Translate string to Title Case w/ delimitter. 
+} +\examples{ +# Convert a single string to title case +convert2TitleCase("hello world") # Returns "Hello World" + } \seealso{ chartr, toupper, and tolower. diff --git a/man/convertAlignment2FA.Rd b/man/convertAlignment2FA.Rd index d6b4dc56..027267ad 100644 --- a/man/convertAlignment2FA.Rd +++ b/man/convertAlignment2FA.Rd @@ -26,6 +26,11 @@ Default is 'NULL'} \item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. Default is 'FALSE'} } +\value{ +A character string representing the FASTA formatted sequences. +If \code{fa_outpath} is provided, the FASTA will also be saved to the specified +file. +} \description{ Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. diff --git a/man/convert_aln2fa.Rd b/man/convert_aln2fa.Rd index 8bebe31d..8ca9a3a0 100644 --- a/man/convert_aln2fa.Rd +++ b/man/convert_aln2fa.Rd @@ -23,8 +23,13 @@ Default is 'pspa.txt'} \item{fa_outpath}{Character. Path to the written fasta file. Default is 'NULL'} -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -Default is 'FALSE'} +\item{reduced}{Boolean. If TRUE, the fasta file will contain only one +sequence per lineage. Default is 'FALSE'} +} +\value{ +Character string containing the Fasta formatted sequences. +If \code{fa_outpath} is specified, the function also writes the sequences to the +Fasta file. } \description{ Adding Leaves to an alignment file w/ accessions diff --git a/man/countbycolumn.Rd b/man/countByColumn.Rd similarity index 100% rename from man/countbycolumn.Rd rename to man/countByColumn.Rd diff --git a/man/createWordCloud2Element.Rd b/man/createWordCloud2Element.Rd index a6279e2f..b1fd827f 100644 --- a/man/createWordCloud2Element.Rd +++ b/man/createWordCloud2Element.Rd @@ -15,7 +15,18 @@ createWordCloud2Element( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). 
Default is "prot".} -\item{UsingRowsCutoff}{} +\item{colname}{Character. The name of the column in \code{query_data} to generate +the word cloud from. Default is "DomArch".} + +\item{cutoff}{Numeric. The cutoff value for filtering elements based on their +frequency. Default is 70.} + +\item{UsingRowsCutoff}{Logical. Whether to use a row-based cutoff instead of +a frequency cutoff. Default is FALSE.} +} +\value{ +A word cloud plot showing the frequency of elements from the selected +column. } \description{ Wordclouds for the predominant domains (from DAs) and DAs (from GC) diff --git a/man/createWordCloudElement.Rd b/man/createWordCloudElement.Rd index 7f27ef41..42b32da0 100644 --- a/man/createWordCloudElement.Rd +++ b/man/createWordCloudElement.Rd @@ -15,7 +15,18 @@ createWordCloudElement( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). Default is "prot".} -\item{UsingRowsCutoff}{} +\item{colname}{Character. The name of the column in \code{query_data} to generate +the word cloud from. Default is "DomArch".} + +\item{cutoff}{Numeric. The cutoff value for filtering elements based on their +frequency. Default is 70.} + +\item{UsingRowsCutoff}{Logical. Whether to use a row-based cutoff instead of +a frequency cutoff. Default is FALSE.} +} +\value{ +A word cloud plot showing the frequency of elements from the selected +column. } \description{ Wordclouds for the predominant domains (from DAs) and DAs (from GC) diff --git a/man/create_lineage_lookup.Rd b/man/create_lineage_lookup.Rd index 51670f35..869db71a 100644 --- a/man/create_lineage_lookup.Rd +++ b/man/create_lineage_lookup.Rd @@ -11,20 +11,31 @@ create_lineage_lookup( ) } \arguments{ -\item{lineage_file}{Path to the rankedlineage.dmp file containing taxid's and their -corresponding taxonomic rank. rankedlineage.dmp can be downloaded at +\item{lineage_file}{Path to the rankedlineage.dmp file containing taxid's +and their corresponding taxonomic rank. 
rankedlineage.dmp can be downloaded at
 https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/}
 
 \item{outfile}{File the resulting lineage lookup table should be written to}
 
-\item{taxonomic_rank}{The upperbound of taxonomic rank that the lineage includes. The lineaege will
-include superkingdom>...>taxonomic_rank.
+\item{taxonomic_rank}{The upperbound of taxonomic rank that the lineage
+includes. The lineage will include superkingdom>...>taxonomic_rank.
 Choices include: "supperkingdom", "phylum", "class","order", "family",
 "genus", and "species"}
 }
+\value{
+A tibble containing the tax IDs and their respective lineages up to
+the specified taxonomic rank, saved as a tab-separated file.
+}
 \description{
 Create a look up table that goes from TaxID, to Lineage
 }
+\examples{
+\dontrun{
+create_lineage_lookup(lineage_file = "data/rankedlineage.dmp",
+                      outfile = "data/lineage_lookup.tsv",
+                      taxonomic_rank = "family")
+}
+}
 \author{
 Samuel Chen
 }
diff --git a/man/domain_network.Rd b/man/domain_network.Rd
index 528e4924..0580b4d2 100644
--- a/man/domain_network.Rd
+++ b/man/domain_network.Rd
@@ -16,15 +16,24 @@
 domain_network(
 \arguments{
 \item{prot}{A data frame that contains the column 'DomArch'.}
 
-\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.}
+\item{column}{Name of column containing Domain architecture from which nodes
+and edges are generated.}
 
-\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count".
-Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.}
+\item{domains_of_interest}{Character vector specifying domains of interest.}
+
+\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for
+total counts if cutoff_type is "Total Count".
+Only use domains that appear in cutoff or greater lineages if cutoff_type is
+Lineage.}
 
 \item{layout}{Character. Layout type to be used for the network. 
Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} -\item{query_color}{} +\item{query_color}{Character. Color to represent the queried domain in the +network.} +} +\value{ +A network visualization of domain architectures. } \description{ This function creates a domain network from the 'DomArch' column. diff --git a/man/downloadAssemblySummary.Rd b/man/downloadAssemblySummary.Rd index 636af878..bad2b603 100644 --- a/man/downloadAssemblySummary.Rd +++ b/man/downloadAssemblySummary.Rd @@ -10,13 +10,25 @@ downloadAssemblySummary( ) } \arguments{ -\item{outpath}{String of path where the assembly summary file should be written} +\item{outpath}{String of path where the assembly summary file should be +written} -\item{keep}{Character vector containing which columns should be retained and downloaded} +\item{keep}{Character vector containing which columns should be retained and +downloaded} +} +\value{ +A tab-separated file containing the assembly summary. The function +does not return any value but writes the output directly to the specified file. } \description{ Download the combined assembly summaries of genbank and refseq } +\examples{ +\dontrun{ +downloadAssemblySummary(outpath = "assembly_summary.tsv", + keep = c("assembly_accession", "taxid", "organism_name")) +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index 047e2652..e55c342a 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -14,13 +14,17 @@ the ipg database} \item{out_path}{Path to write the efetch results to} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} \item{accnums}{Character vector containing the accession numbers to query on the ipg database} } \value{ No return value. The function writes the fetched results to \code{out_path}. + +The function does not return a value but writes the efetch results +directly to the specified \code{out_path}.
} \description{ Perform efetch on the ipg database and write the results to out_path @@ -31,6 +35,12 @@ Perform efetch on the ipg database and write the results to out_path \dontrun{ efetchIPG() } +\dontrun{ +efetchIPG( + accessions = c("P12345", "Q67890", "A12345"), + out_path = "path/to/efetch_results.xml" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/extractAccNum.Rd b/man/extractAccNum.Rd index 15870f3f..caf9e5db 100644 --- a/man/extractAccNum.Rd +++ b/man/extractAccNum.Rd @@ -7,7 +7,8 @@ extractAccNum(string) } \arguments{ -\item{string}{} +\item{string}{A string from which to extract the accession number. +The string may contain accession information delimited by \code{|} or spaces.} } \value{ Describe return, in detail diff --git a/man/filterbydomains.Rd b/man/filterByDomains.Rd similarity index 100% rename from man/filterbydomains.Rd rename to man/filterByDomains.Rd diff --git a/man/filterbyfrequency.Rd b/man/filterByFrequency.Rd similarity index 100% rename from man/filterbyfrequency.Rd rename to man/filterByFrequency.Rd diff --git a/man/findparalogs.Rd b/man/findParalogs.Rd similarity index 100% rename from man/findparalogs.Rd rename to man/findParalogs.Rd diff --git a/man/find_top_acc.Rd b/man/find_top_acc.Rd index 780cde11..ffce1640 100644 --- a/man/find_top_acc.Rd +++ b/man/find_top_acc.Rd @@ -13,8 +13,32 @@ find_top_acc( ) } \arguments{ -\item{query}{} +\item{infile_full}{A data frame containing the full dataset with lineage and +domain architecture information.} + +\item{DA_col}{A string representing the name of the domain architecture +column. Default is "DomArch.Pfam".} + +\item{lin_col}{A string representing the name of the lineage column. +Default is "Lineage_short".} + +\item{n}{An integer specifying the number of top accession numbers to return. +Default is 20.} + +\item{query}{A string for filtering a specific query name. 
If it is not +"All", only the data matching this query will be processed.} +} +\value{ +A vector of the top N accession numbers (\code{AccNum}) based on counts +grouped by lineage and domain architecture. } \description{ Group by lineage + DA then take top 20 } +\examples{ +\dontrun{ +top_accessions <- find_top_acc(infile_full = my_data, +DA_col = "DomArch.Pfam", lin_col = "Lineage_short", +n = 20, query = "specific_query_name") +} +} diff --git a/man/gc_undirected_network.Rd b/man/gc_undirected_network.Rd index 28cf1abb..5dab8a70 100644 --- a/man/gc_undirected_network.Rd +++ b/man/gc_undirected_network.Rd @@ -16,18 +16,29 @@ gc_undirected_network( \arguments{ \item{prot}{A data frame that contains the column 'DomArch'.} -\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.} +\item{column}{Name of column containing Domain architecture from which nodes +and edges are generated.} -\item{cutoff_type}{Character. Used to determine how data should be filtered. Either -\itemize{\item "Lineage" to filter domains based off how many lineages the Domain architecture appears in -\item "Total Count" to filter off the total amount of times a domain architecture occurs }} +\item{domains_of_interest}{Character vector specifying the domains of interest.} -\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.} +\item{cutoff_type}{Character. Used to determine how data should be filtered. +Either +\itemize{\item "Lineage" to filter domains based off how many lineages the +Domain architecture appears in +\item "Total Count" to filter off the total amount of times a +domain architecture occurs }} + +\item{cutoff}{Integer. Only use domains that occur at or above the cutoff +for total counts if cutoff_type is "Total Count". 
+Only use domains that appear in cutoff or greater lineages if cutoff_type is +Lineage.} \item{layout}{Character. Layout type to be used for the network. Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} } +\value{ +A plot of the domain architecture network. +} \description{ This function creates a domain network from the 'DomArch' column. @@ -35,6 +46,8 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -domain_network(pspa) +domain_network(pspa, column = "DomArch", +domains_of_interest = c("Domain1", "Domain2"), +cutoff_type = "Total Count", cutoff = 10) } } diff --git a/man/generateAllAlignments2FA.Rd b/man/generateAllAlignments2FA.Rd index 3bf9938a..1100f241 100644 --- a/man/generateAllAlignments2FA.Rd +++ b/man/generateAllAlignments2FA.Rd @@ -15,23 +15,34 @@ generateAllAlignments2FA( \item{aln_path}{Character. Path to alignment files. Default is 'here("data/rawdata_aln/")'} -\item{fa_outpath}{Character. Path to file. Master protein file with AccNum & lineages. +\item{fa_outpath}{Character. Path to file. Master protein file with AccNum & +lineages. Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} \item{lin_file}{Character. Path to the written fasta file. Default is 'here("data/alns/")'.} -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. +\item{reduced}{Boolean. If TRUE, the fasta file will contain only one +sequence per lineage. Default is 'FALSE'.} } +\value{ +NULL. The function saves the output FASTA files to the specified +directory. +} \description{ Adding Leaves to all alignment files w/ accessions & DAs? } \details{ -The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. +The alignment files would need two columns separated by spaces: +\enumerate{ +\item AccNum and 2. alignment. 
The protein homolog file should have AccNum, +Species, Lineages. +} } \note{ -Please refer to the source code if you have alternate + file formats and/or column names. +Please refer to the source code if you have alternate + file formats +and/or column names. } \examples{ \dontrun{ diff --git a/man/generate_all_aln2fa.Rd b/man/generate_all_aln2fa.Rd index ad6b7136..0a9b7e0f 100644 --- a/man/generate_all_aln2fa.Rd +++ b/man/generate_all_aln2fa.Rd @@ -18,20 +18,26 @@ Default is 'here("data/rawdata_aln/")'} \item{fa_outpath}{Character. Path to the written fasta file. Default is 'here("data/alns/")'.} -\item{lin_file}{Character. Path to file. Master protein file with AccNum & lineages. -Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} +\item{lin_file}{Character. Path to file. Master protein file with AccNum & +lineages. Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -Default is 'FALSE'.} +\item{reduced}{Boolean. If TRUE, the fasta file will contain only one +sequence per lineage. Default is 'FALSE'.} +} +\value{ +A list of paths to the generated Fasta files. } \description{ Adding Leaves to all alignment files w/ accessions & DAs? } \details{ -The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. +The alignment files would need two columns separated by spaces: 1. +AccNum and 2. alignment. The protein homolog file should have AccNum, +Species, Lineages. } \note{ -Please refer to the source code if you have alternate + file formats and/or column names. +Please refer to the source code if you have alternate + file +formats and/or column names. 
} \examples{ \dontrun{ diff --git a/man/generate_msa.Rd b/man/generate_msa.Rd index a68eb8b4..90f2ca91 100644 --- a/man/generate_msa.Rd +++ b/man/generate_msa.Rd @@ -7,8 +7,21 @@ generate_msa(fa_file = "", outfile = "") } \arguments{ -\item{outfile}{} +\item{fa_file}{Character. The path to the input FASTA file containing protein +sequences.} + +\item{outfile}{Character. The path to the output file where the alignment +will be saved.} +} +\value{ +A list containing the alignment object and the output file path. } \description{ Function to generate MSA using kalign } +\examples{ +\dontrun{ +generate_msa(fa_file = "path/to/sequences.fasta", +outfile = "path/to/alignment.txt") +} +} diff --git a/man/get_accnums_from_fasta_file.Rd b/man/get_accnums_from_fasta_file.Rd index 84c163cc..3a3c1784 100644 --- a/man/get_accnums_from_fasta_file.Rd +++ b/man/get_accnums_from_fasta_file.Rd @@ -9,10 +9,27 @@ get_accnums_from_fasta_file(fasta_file) get_accnums_from_fasta_file(fasta_file) } \arguments{ -\item{fasta_file}{} +\item{fasta_file}{Character. Path to the FASTA file from which +accession numbers will be extracted.} +} +\value{ +A character vector containing the extracted accession numbers. + +A character vector containing the extracted accession numbers. 
} \description{ Get accnums from fasta file get_accnums_from_fasta_file } +\examples{ +\dontrun{ +accnums <- get_accnums_from_fasta_file("my_sequences.fasta") +print(accnums) +} +\dontrun{ +# Example usage +accnums <- get_accnums_from_fasta_file("path/to/sequences.fasta") +print(accnums) +} +} diff --git a/man/ipr2viz.Rd b/man/ipr2viz.Rd index 79063497..728c188c 100644 --- a/man/ipr2viz.Rd +++ b/man/ipr2viz.Rd @@ -17,8 +17,51 @@ ipr2viz( ) } \arguments{ -\item{query}{} +\item{infile_ipr}{A path to the input IPR file (TSV format) containing +domain information.} + +\item{infile_full}{A path to the full input file (TSV format) containing +lineage and accession information.} + +\item{accessions}{A character vector of accession numbers to filter the +analysis. Default is an empty vector.} + +\item{analysis}{A character vector specifying the types of analysis to +include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a +vector of these analyses.} + +\item{group_by}{A string specifying how to group the visualization. +Default is "Analysis". Options include "Analysis" or "Query".} + +\item{topn}{An integer specifying the number of top accessions to visualize. +Default is 20.} + +\item{name}{A string representing the name to use for y-axis labels. +Default is "Name".} + +\item{text_size}{An integer specifying the text size for the plot. +Default is 15.} + +\item{query}{A string for filtering a specific query name. If it is not +"All", only the data matching this query will be processed.} +} +\value{ +A ggplot object representing the domain architecture visualization. 
} \description{ IPR2Viz } +\examples{ +\dontrun{ +plot <- ipr2viz(infile_ipr = "path/to/ipr_file.tsv", + infile_full = "path/to/full_file.tsv", + accessions = c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + topn = 20, + name = "Gene Name", + text_size = 15, + query = "All") +print(plot) +} +} diff --git a/man/ipr2viz_web.Rd b/man/ipr2viz_web.Rd index 896445bd..defa5b2d 100644 --- a/man/ipr2viz_web.Rd +++ b/man/ipr2viz_web.Rd @@ -17,8 +17,52 @@ ipr2viz_web( ) } \arguments{ -\item{rows}{} +\item{infile_ipr}{A path to the input IPR file (TSV format) containing +domain information.} + +\item{accessions}{A character vector of accession numbers to filter the +analysis.} + +\item{analysis}{A character vector specifying the types of analysis to +include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector +of these analyses.} + +\item{group_by}{A string specifying how to group the visualization. +Default is "Analysis". Options include "Analysis" or "Query".} + +\item{name}{A string representing the name to use for y-axis labels. +Default is "Name".} + +\item{text_size}{An integer specifying the text size for the plot. +Default is 15.} + +\item{legend_name}{A string representing the column to use for legend labels. +Default is "ShortName".} + +\item{cols}{An integer specifying the number of columns in the facet wrap. +Default is 5.} + +\item{rows}{An integer specifying the number of rows in the legend. +Default is 10.} +} +\value{ +A ggplot object representing the domain architecture visualization +for web display. 
} \description{ IPR2Viz Web } +\examples{ +\dontrun{ +plot <- ipr2viz_web(infile_ipr = "path/to/ipr_file.tsv", + accessions = c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + name = "Gene Name", + text_size = 15, + legend_name = "ShortName", + cols = 5, + rows = 10) +print(plot) +} +} diff --git a/man/mapAcc2Name.Rd b/man/mapAcc2Name.Rd index 0f5d447d..a59c8760 100644 --- a/man/mapAcc2Name.Rd +++ b/man/mapAcc2Name.Rd @@ -9,13 +9,24 @@ mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") \arguments{ \item{line}{The line of a fasta file starting with '>'} -\item{acc2name}{Data Table containing a column of accession numbers and a name column} +\item{acc2name}{Data Table containing a column of accession numbers and a +name column} \item{acc_col}{Name of the column containing Accession numbers} -\item{name_col}{Name of the column containing the names that the accession numbers +\item{name_col}{Name of the column containing the names that the accession +numbers are mapped to} } +\value{ +A character string representing the updated FASTA line, where the +accession number is replaced with its corresponding name. +} \description{ Default renameFA() replacement function. 
Maps an accession number to its name } +\examples{ +\dontrun{ +mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name") +} +} diff --git a/man/map_acc2name.Rd b/man/map_acc2name.Rd index fcdb3023..88377eea 100644 --- a/man/map_acc2name.Rd +++ b/man/map_acc2name.Rd @@ -7,7 +7,7 @@ map_acc2name(line, acc2name, acc_col = "AccNum", name_col = "Name") } \arguments{ -\item{line}{he line of a fasta file starting with '>'} +\item{line}{The line of a fasta file starting with '>'} \item{acc2name}{Data Table containing a column of accession numbers and a name column} @@ -16,6 +16,19 @@ map_acc2name(line, acc2name, acc_col = "AccNum", name_col = "Name") \item{name_col}{Name of the column containing the names that the accession numbers are mapped to} } +\value{ +Character string. The modified line from the Fasta file header with +the name instead of the accession number. +} \description{ Default rename_fasta() replacement function. Maps an accession number to its name } +\examples{ +\dontrun{ +acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +Name = c("Species A", "Species B")) +line <- ">ACC001 some additional info" +mapped_line <- map_acc2name(line, acc2name_table) +print(mapped_line) # Expected output: ">Species A" +} +} diff --git a/man/msa_pdf.Rd b/man/msa_pdf.Rd index 4d5fed17..0f42eb9f 100644 --- a/man/msa_pdf.Rd +++ b/man/msa_pdf.Rd @@ -18,6 +18,9 @@ Default is NULL. If value is NULL, the entire multiple sequence alignment is pri \item{upperbound}{Numeric. The column that determines the ending location of the MSA. Default is NULL. If value is NULL, the entire multiple sequence alignment is printed.} } +\value{ +A PDF file containing the multiple sequence alignment. 
+} \description{ Generates a multiple sequence alignment from a fasta file @@ -26,6 +29,9 @@ a pdf } \examples{ \dontrun{ -msa_pdf() +msa_pdf(fasta_path = "path/to/your/file.fasta", + out_path = "path/to/output/alignment.pdf", + lowerbound = 10, + upperbound = 200) } } diff --git a/man/plotLineageDA.Rd b/man/plotLineageDA.Rd index 7e84bcfd..a752eb9b 100644 --- a/man/plotLineageDA.Rd +++ b/man/plotLineageDA.Rd @@ -20,9 +20,17 @@ Default is prot (variable w/ protein data).} \item{colname}{Column name from query_data: "DomArch.norep", "GenContext.norep", "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep".} +\item{cutoff}{Numeric. Cutoff for word frequency. Default is 90.} + +\item{RowsCutoff}{Boolean. If TRUE, applies a row cutoff to remove data rows +based on a certain condition. Default is FALSE.} + \item{color}{Color for the heatmap. One of six options: "default", "magma", "inferno", "plasma", "viridis", or "cividis"} } +\value{ +A LineageDA plot object. +} \description{ Lineage plot for Domains, Domain Architectures and Genomic Contexts. Heatmap. diff --git a/man/plotLineageDomainRepeats.Rd b/man/plotLineageDomainRepeats.Rd index 8ccfba41..45d31d68 100644 --- a/man/plotLineageDomainRepeats.Rd +++ b/man/plotLineageDomainRepeats.Rd @@ -7,7 +7,16 @@ plotLineageDomainRepeats(query_data, colname) } \arguments{ -\item{colname}{} +\item{query_data}{Data frame containing protein homolog data, including +relevant domain architectures and lineages.} + +\item{colname}{Character. The name of the column in query_data that contains +domain architectures or other structural information.} +} +\value{ +A ggplot object representing a heatmap (tile plot) of domain repeat +counts across different lineages, with color intensity representing the +occurrence of domains. 
} \description{ Lineage Domain Repeats Plot diff --git a/man/plotLineageHeatmap.Rd b/man/plotLineageHeatmap.Rd index 5449f8ec..e6870edb 100644 --- a/man/plotLineageHeatmap.Rd +++ b/man/plotLineageHeatmap.Rd @@ -15,6 +15,11 @@ plotLineageHeatmap(prot, domains_of_interest, level = 3, label.size = 8) \item{label.size}{Size of the text labels} } +\value{ +A ggplot object representing a heatmap (tile plot) of domain repeat +counts across different lineages, with color intensity representing the +occurrence of domains. +} \description{ Generate a lineage plot } diff --git a/man/plotLineageNeighbors.Rd b/man/plotLineageNeighbors.Rd index 85adf175..2c7ca448 100644 --- a/man/plotLineageNeighbors.Rd +++ b/man/plotLineageNeighbors.Rd @@ -18,6 +18,11 @@ additional word columns (0/1 format). Default is pspa_data.} \item{colname}{Column name from query_data. Default is "GenContext.norep".} } +\value{ +A ggplot object representing a heatmap (tile plot) of lineage versus +the top neighboring domain architectures, with color intensity representing +the frequency of occurrences. +} \description{ Lineage plot for top neighbors obtained from DAs of Genomic Contexts. diff --git a/man/plotLineageQuery.Rd b/man/plotLineageQuery.Rd index ad52a4d2..aa3793b7 100644 --- a/man/plotLineageQuery.Rd +++ b/man/plotLineageQuery.Rd @@ -17,9 +17,22 @@ plotLineageQuery( additional word columns (0/1 format). Default is prot (variable w/ protein data).} -\item{queries}{Character Vector containing the queries that will be used for the categories} +\item{queries}{Character Vector containing the queries that will be used for +the categories.} -\item{color}{} +\item{colname}{Character. The column used for filtering based on the \code{queries}. +Default is "ClustName".} + +\item{cutoff}{Numeric. The cutoff value for filtering rows based on their +total count. Rows with values below this cutoff are excluded.} + +\item{color}{Character. Defines the color palette used for the heatmap. 
+Default is a red gradient.} +} +\value{ +A ggplot object representing a heatmap (tile plot) showing the +relationship between queries and lineages, with the intensity of color +representing the count of matching records. } \description{ Lineage plot for queries. Heatmap. @@ -33,6 +46,9 @@ column names. plotLineageQuery(prot, c("PspA", "PspB", "PspC", "PspM", "PspN"), 95) } } +\author{ +Janani Ravi, Samuel Chen +} \keyword{Architectures,} \keyword{Domain} \keyword{Domains,} diff --git a/man/plotLineageSunburst.Rd b/man/plotLineageSunburst.Rd index 972bbe5d..3240d77d 100644 --- a/man/plotLineageSunburst.Rd +++ b/man/plotLineageSunburst.Rd @@ -16,27 +16,40 @@ plotLineageSunburst( ) } \arguments{ -\item{prot}{Data frame containing a lineage column that the sunburst plot will be generated for} +\item{prot}{Data frame containing a lineage column that the sunburst plot +will be generated for} -\item{lineage_column}{String. Name of the lineage column within the data frame. Defaults to "Lineage"} +\item{lineage_column}{String. Name of the lineage column within the +data frame. Defaults to "Lineage"} -\item{type}{String, either "sunburst" or "sund2b". If type is "sunburst", a sunburst plot of the lineage} +\item{type}{String, either "sunburst" or "sund2b". If type is "sunburst", +a sunburst plot of the lineage} \item{levels}{Integer. Number of levels the sunburst will have.} -\item{legendOrder}{String vector. The order of the legend. If legendOrder is NULL,} +\item{colors}{A vector of colors for the sunburst plot. +If NULL, default colors are used.} -\item{showLegend}{Boolean. If TRUE, the legend will be enabled when the component first renders.} +\item{legendOrder}{String vector. The order of the legend. If legendOrder +is NULL,} -\item{maxLevels}{Integer, the maximum number of levels to display in the sunburst; 5 by default, NULL to disable -then the legend will be in the descending order of the top level hierarchy. -will be rendered. 
If the type is sund2b, a sund2b plot will be rendered.} +\item{showLegend}{Boolean. If TRUE, the legend will be enabled when the +component first renders.} + +\item{maxLevels}{Integer, the maximum number of levels to display in the +sunburst; 5 by default, NULL to disable then the legend will be in the +descending order of the top level hierarchy. will be rendered. If the type is +sund2b, a sund2b plot will be rendered.} +} +\value{ +A sunburst or sund2b plot based on the input lineage data. } \description{ Lineage Sunburst } \examples{ \dontrun{ -plotLineageSunburst() +plotLineageSunburst(prot, lineage_column = "Lineage", +type = "sunburst", levels = 3) } } diff --git a/man/plotStackedLineage.Rd b/man/plotStackedLineage.Rd index 9d1cde6d..63ae9b66 100644 --- a/man/plotStackedLineage.Rd +++ b/man/plotStackedLineage.Rd @@ -21,7 +21,44 @@ plotStackedLineage( ) } \arguments{ -\item{legend}{} +\item{prot}{Data frame containing protein data including domain architecture +and lineage information.} + +\item{column}{Character. The name of the column in prot representing domain +architectures (default is "DomArch").} + +\item{cutoff}{Numeric. A threshold value for filtering domain architectures +or protein counts.} + +\item{Lineage_col}{Character. The name of the column representing lineage +data (default is "Lineage").} + +\item{xlabel}{Character. Label for the x-axis +(default is "Domain Architecture").} + +\item{reduce_lineage}{Logical. Whether to shorten lineage names +(default is TRUE).} + +\item{label.size}{Numeric. The size of axis text labels (default is 8).} + +\item{legend.position}{Numeric vector. Coordinates for placing the legend +(default is c(0.7, 0.4)).} + +\item{legend.text.size}{Numeric. Size of the text in the legend +(default is 10).} + +\item{legend.cols}{Numeric. Number of columns in the legend (default is 2).} + +\item{legend.size}{Numeric. Size of the legend keys (default is 0.7).} + +\item{coord_flip}{Logical. 
Whether to flip the coordinates of the plot +(default is TRUE).} + +\item{legend}{Logical. Whether to display the legend (default is TRUE).} +} +\value{ +A ggplot object representing a stacked bar plot showing the +distribution of protein domain architectures across lineages. } \description{ Stacked Lineage Plot diff --git a/man/plotSunburst.Rd b/man/plotSunburst.Rd index 5ee465a6..37da9df5 100644 --- a/man/plotSunburst.Rd +++ b/man/plotSunburst.Rd @@ -10,11 +10,11 @@ plotSunburst(count_data, fill_by_n = FALSE, sort_by_n = FALSE, maxdepth = 2) plotTreemap(count_data, fill_by_n = FALSE, sort_by_n = FALSE) } \arguments{ -\item{count_data}{} +\item{count_data}{A data frame containing the data.} -\item{fill_by_n}{If TRUE, uses a continuous scale to fill plot by group size} +\item{fill_by_n}{Logical indicating if fill color is based on counts.} -\item{sort_by_n}{} +\item{sort_by_n}{Logical indicating if data should be sorted by counts.} } \description{ These functions help you quickly create interactive hierarchical plots diff --git a/man/plotUpSet.Rd b/man/plotUpSet.Rd index 84169987..47dd12e1 100644 --- a/man/plotUpSet.Rd +++ b/man/plotUpSet.Rd @@ -18,15 +18,30 @@ plotUpSet( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). Default is toast_rack.sub} +\item{colname}{Column name from query_data: "DomArch.norep", "GenContext.norep", +"DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep".} + \item{cutoff}{Numeric. Cutoff for word frequency. Default is 90.} -\item{text.scale}{Allows scaling of axis title, tick lables, and numbers above the intersection size bars. +\item{RowsCutoff}{Boolean. If TRUE, applies a row cutoff to remove data rows +based on a certain condition. Default is FALSE.} + +\item{text.scale}{Allows scaling of axis title, tick lables, and numbers +above the intersection size bars. 
text.scale can either take a universal scale in the form of an integer, or a vector of specific scales in the format: c(intersection size title, intersection size tick labels, set size title, set size tick labels, set names, numbers above bars)} -\item{line.size}{} +\item{point.size}{Numeric. Sets the size of points in the UpSet plot. +Default is 2.2.} + +\item{line.size}{Numeric. Sets the line width in the UpSet plot. +Default is 0.8.} +} +\value{ +An UpSet plot object. The plot visualizes intersections of sets based +on the provided colname in query_data. } \description{ UpSet plot for Domain Architectures vs Domains and diff --git a/man/prepareColumnParams.Rd b/man/prepareColumnParams.Rd index bb0b9a29..8a9f566b 100644 --- a/man/prepareColumnParams.Rd +++ b/man/prepareColumnParams.Rd @@ -7,8 +7,23 @@ prepareColumnParams(count_data, fill_by_n, sort_by_n) } \arguments{ -\item{sort_by_n}{} +\item{count_data}{A data frame containing the data.} + +\item{fill_by_n}{Logical indicating if fill color is based on counts.} + +\item{sort_by_n}{Logical indicating if data should be sorted by counts.} +} +\value{ +A data frame of parameters for treemap visualization. } \description{ prepareColumnParams } +\examples{ +\dontrun{ +count_data <- data.frame(Category = c("A", "B", "C"), + n = c(10, 20, 15)) +params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) +print(params) +} +} diff --git a/man/prepareSingleColumnParams.Rd b/man/prepareSingleColumnParams.Rd index d823852b..0070497e 100644 --- a/man/prepareSingleColumnParams.Rd +++ b/man/prepareSingleColumnParams.Rd @@ -7,8 +7,24 @@ prepareSingleColumnParams(df, col_num, root) } \arguments{ -\item{root}{} +\item{df}{A data frame containing the data to be processed.} + +\item{col_num}{An integer representing the column number to process.} + +\item{root}{A string representing the root node for the treemap.} +} +\value{ +A data frame containing parameters for the specified column for +treemap visualization. 
} \description{ prepareSingleColumnParams } +\examples{ +\dontrun{ +df <- data.frame(Category = c("A", "A", "B", "B", "C"), + n = c(10, 20, 30, 40, 50)) +params <- prepareSingleColumnParams(df, col_num = 1, root = "Root") +print(params) +} +} diff --git a/man/proteinAcc2TaxID.Rd b/man/proteinAcc2TaxID.Rd index c0317bba..9be09d53 100644 --- a/man/proteinAcc2TaxID.Rd +++ b/man/proteinAcc2TaxID.Rd @@ -7,8 +7,32 @@ proteinAcc2TaxID(accnums, suffix, out_path, return_dt = FALSE) } \arguments{ -\item{return_dt}{} +\item{accnums}{A character vector of protein accession numbers to be mapped +to TaxIDs.} + +\item{suffix}{A string suffix used to name the output file generated by the +script.} + +\item{out_path}{A string specifying the directory where the output file will +be saved.} + +\item{return_dt}{A logical indicating whether to return the result as a data +table. Defaults to FALSE. If TRUE, the output file is read into a data table +and returned.} +} +\value{ +If \code{return_dt} is TRUE, a data table containing the mapping of protein +accession numbers to TaxIDs. If FALSE, the function returns NULL. 
} \description{ proteinAcc2TaxID } +\examples{ +\dontrun{ +# Example accession numbers +accessions <- c("ABC123", "XYZ456", "LMN789") +tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +out_path = "/path/to/output", return_dt = TRUE) +print(tax_data) +} +} diff --git a/man/proteinAcc2TaxID_old.Rd b/man/proteinAcc2TaxID_old.Rd index 0c2a85ba..fb6cd5a0 100644 --- a/man/proteinAcc2TaxID_old.Rd +++ b/man/proteinAcc2TaxID_old.Rd @@ -7,17 +7,29 @@ proteinAcc2TaxID_old(accessions, out_path, plan = "multicore") } \arguments{ -\item{accessions}{Character vector containing the accession numbers to query on -the ipg database} +\item{accessions}{A character vector containing the accession numbers to query +in the protein database.} -\item{out_path}{Path to write the efetch results to} +\item{out_path}{A string specifying the path where the results of the query +will be written. If set to NULL, a temporary directory will be used.} -\item{plan}{} +\item{plan}{A character string that specifies the execution plan for parallel +processing. The default is "multicore".} +} +\value{ +This function does not return a value. It writes the results to the +specified output path. 
} \description{ Perform elink to go from protein database to taxonomy database and write the resulting file of taxid and lineage to out_path } +\examples{ +\dontrun{ +accessions <- c("ABC123", "XYZ456", "LMN789") +proteinAcc2TaxID_old(accessions, out_path = "/path/to/output") +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/removeAsterisks.Rd b/man/removeAsterisks.Rd index 691a7adf..c62b7651 100644 --- a/man/removeAsterisks.Rd +++ b/man/removeAsterisks.Rd @@ -2,15 +2,19 @@ % Please edit documentation in R/cleanup.R \name{removeAsterisks} \alias{removeAsterisks} -\title{Remove Astrk} +\title{Remove Asterisk} \usage{ removeAsterisks(query_data, colname = "GenContext") } \arguments{ -\item{colname}{} +\item{query_data}{A data frame containing the data to be processed.} + +\item{colname}{The name of the column from which asterisks should be removed. +Defaults to "GenContext".} } \value{ -Describe return, in detail +The original data frame with asterisks removed from the specified +column. } \description{ Remove the asterisks from a column of data diff --git a/man/removeEmptyRows.Rd b/man/removeEmptyRows.Rd index 66551810..4e52cc99 100644 --- a/man/removeEmptyRows.Rd +++ b/man/removeEmptyRows.Rd @@ -13,7 +13,8 @@ removeEmptyRows(prot, by_column = "DomArch") Default column is 'DomArch'. Can also take the following as input, 'Species', 'GenContext', 'ClustName'.} } \value{ -Describe return, in detail +A tibble with rows removed where the specified column contains +\code{"-"}, \code{"NA"}, or an empty string. 
} \description{ Remove empty rows by column diff --git a/man/removeTails.Rd b/man/removeTails.Rd index 76d1e18a..0c63e89d 100644 --- a/man/removeTails.Rd +++ b/man/removeTails.Rd @@ -14,7 +14,8 @@ removeTails(prot, by_column = "DomArch", keep_domains = FALSE) \item{keep_domains}{Default is False Keeps tail entries that contain the query domains.} } \value{ -Describe return, in detail +The original data frame with singletons removed from the specified +column. } \description{ Remove tails/singletons diff --git a/man/renameFA.Rd b/man/renameFA.Rd index 7b6fd579..da7d339b 100644 --- a/man/renameFA.Rd +++ b/man/renameFA.Rd @@ -15,6 +15,15 @@ renameFA(fa_path, outpath, replacement_function = mapAcc2Name, ...) \item{...}{Additional arguments to pass to replacement_function} } +\value{ +A character vector of the modified lines in the FASTA file. +} \description{ Rename the labels of fasta files } +\examples{ +\dontrun{ +renameFA("path/to/input.fasta", +"path/to/output.fasta", mapAcc2Name, acc2name) +} +} diff --git a/man/rename_fasta.Rd b/man/rename_fasta.Rd index 6b4e5dd7..3089d530 100644 --- a/man/rename_fasta.Rd +++ b/man/rename_fasta.Rd @@ -15,6 +15,15 @@ rename_fasta(fa_path, outpath, replacement_function = map_acc2name, ...) \item{...}{Additional arguments to pass to replacement_function} } +\value{ +Character vector containing the modified lines of the Fasta file. +} \description{ Rename the labels of fasta files } +\examples{ +\dontrun{ +rename_fasta("input.fasta", "output.fasta", +replacement_function = map_acc2name, acc2name = acc2name_table) +} +} diff --git a/man/replaceQuestionMarks.Rd b/man/replaceQuestionMarks.Rd index 0949568f..8b16992a 100644 --- a/man/replaceQuestionMarks.Rd +++ b/man/replaceQuestionMarks.Rd @@ -12,7 +12,9 @@ replaceQuestionMarks(prot, by_column = "GenContext") \item{by_column}{Column to operate on} } \value{ -Describe return, in detail +The original data frame with the specified column updated. All +consecutive '?' 
characters will be replaced with 'X(s)', and individual '?'
+characters will be replaced with 'X'.
 }
 \description{
 Replace consecutive '?' separated by '->', '<-' or '||' with 'X(s)'
diff --git a/man/reveql.Rd b/man/reveql.Rd
index 9dc2bcb8..b16ed7be 100644
--- a/man/reveql.Rd
+++ b/man/reveql.Rd
@@ -2,13 +2,26 @@
 % Please edit documentation in R/reverse_operons.R
 \name{reveql}
 \alias{reveql}
-\title{reveql}
+\title{reveql: Reverse Equalities in Genomic Context}
 \usage{
 reveql(prot)
 }
 \arguments{
-\item{prot}{}
+\item{prot}{\link{vector} A vector of genomic context strings to be processed.}
+}
+\value{
+\link{vector} A vector of the same length as the input, where each genomic
+element is annotated with either a forward ("->") or reverse ("<-") direction,
+depending on its position relative to the "=" symbols.
 }
 \description{
-reveql
+This function processes the genomic context strings (GenContext) and reverses
+directional signs based on the presence of an equal sign ("=").
+}
+\examples{
+# Example input: Genomic context with directional symbols and an asterisk
+genomic_context <- c("A", "B", "*", "C", "D", "=", "E", "F")
+reveql(genomic_context)
+
+# Output: "A->", "B->", "*", "<-C", "<-D", "=", "E->", "F->"
 }
diff --git a/man/reverse_operon.Rd b/man/reverse_operon.Rd
index 270e2a62..1c27aecc 100644
--- a/man/reverse_operon.Rd
+++ b/man/reverse_operon.Rd
@@ -2,13 +2,28 @@
 % Please edit documentation in R/reverse_operons.R
 \name{reverse_operon}
 \alias{reverse_operon}
-\title{reverse_operon}
+\title{reverse_operon: Reverse the Direction of Operons in Genomic Context}
 \usage{
 reverse_operon(prot)
 }
 \arguments{
-\item{prot}{}
+\item{prot}{\link{data.frame} A data frame containing at least a column named
+'GenContext', which represents the genomic contexts that need to be reversed.}
+}
+\value{
+\link{data.frame} The input data frame with the 'GenContext' column updated
+to reflect the reversed operons.
} \description{ -reverse_operon +This function processes a genomic context data frame to reverse the direction +of operons based on specific patterns in the GenContext column. It handles +elements represented by ">" and "<" and restructures the genomic context by +flipping the direction of operons while preserving the relationships +indicated by "=". +} +\examples{ +# Example genomic context data frame +prot <- data.frame(GenContext = c("A>B", "CI")) +reversed_prot <- reverse_operon(prot) +print(reversed_prot) } diff --git a/man/runIPRScan.Rd b/man/runIPRScan.Rd index 678d8652..8431efb4 100644 --- a/man/runIPRScan.Rd +++ b/man/runIPRScan.Rd @@ -7,8 +7,28 @@ runIPRScan(filepath_fasta, filepath_out, appl = c("Pfam", "Gene3D")) } \arguments{ -\item{appl}{} +\item{filepath_fasta}{A string representing the path to the input FASTA file.} + +\item{filepath_out}{A string representing the base path for the output file.} + +\item{appl}{A character vector specifying the InterProScan applications to +use (e.g., "Pfam", "Gene3D"). Default is \code{c("Pfam", "Gene3D")}.} +} +\value{ +A data frame containing the results from the InterProScan output +TSV file. } \description{ -runIPRScan +Run InterProScan on a given FASTA file and save the results to an +output file. 
+} +\examples{ +\dontrun{ +results <- runIPRScan( + filepath_fasta = "path/to/your_fasta_file.fasta", + filepath_out = "path/to/output_file", + appl = c("Pfam", "Gene3D") +) +print(results) +} } diff --git a/man/run_deltablast.Rd b/man/run_deltablast.Rd index 3c934d77..2a9f01b0 100644 --- a/man/run_deltablast.Rd +++ b/man/run_deltablast.Rd @@ -16,12 +16,35 @@ run_deltablast( ) } \arguments{ -\item{db_search_path}{Path to the BLAST databases} +\item{deltablast_path}{Path to the Delta-BLAST executable.} -\item{num_threads}{} +\item{db_search_path}{Path to the BLAST databases.} + +\item{db}{Name of the BLAST database to search against (default is "refseq").} + +\item{query}{Path to the input query file.} + +\item{evalue}{E-value threshold for reporting matches (default is "1e-5").} + +\item{out}{Path to the output file where results will be saved.} + +\item{num_alignments}{Number of alignments to report.} + +\item{num_threads}{Number of threads to use for the search (default is 1).} +} +\value{ +This function does not return a value; it outputs results to the +specified file. } \description{ -Run DELTABLAST to find homologs for proteins of interest +This function executes a Delta-BLAST search using the specified parameters +and database. It sets the BLAST database path, runs the Delta-BLAST command +with the given query, and outputs the results. 
+} +\examples{ +\dontrun{ +run_deltablast(deltablast_path, db_search_path, query, out, num_alignments) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/run_rpsblast.Rd b/man/run_rpsblast.Rd index bc4474f1..4b638a72 100644 --- a/man/run_rpsblast.Rd +++ b/man/run_rpsblast.Rd @@ -15,10 +15,31 @@ run_rpsblast( ) } \arguments{ -\item{db_search_path}{Path to the BLAST databases} +\item{rpsblast_path}{Path to the RPS-BLAST executable.} -\item{num_threads}{} +\item{db_search_path}{Path to the BLAST databases.} + +\item{db}{Name of the BLAST database to search against (default is "refseq").} + +\item{query}{Path to the input query file.} + +\item{evalue}{E-value threshold for reporting matches (default is "1e-5").} + +\item{out}{Path to the output file where results will be saved.} + +\item{num_threads}{Number of threads to use for the search (default is 1).} +} +\value{ +This function does not return a value; it outputs results to the +specified file. } \description{ -Run RPSBLAST to generate domain architectures for proteins of interest +This function executes an RPS-BLAST search to generate domain architectures +for specified proteins. It sets the BLAST database path, runs the RPS-BLAST +command with the provided query, and outputs the results. +} +\examples{ +\dontrun{ +run_rpsblast(rpsblast_path, db_search_path, query, out) +} } diff --git a/man/selectLongestDuplicate.Rd b/man/selectLongestDuplicate.Rd index c177d289..bd535455 100644 --- a/man/selectLongestDuplicate.Rd +++ b/man/selectLongestDuplicate.Rd @@ -7,10 +7,15 @@ selectLongestDuplicate(prot, column) } \arguments{ -\item{column}{} +\item{prot}{A data frame containing the data, with at least one column +named 'AccNum' for identification of duplicates.} + +\item{column}{The name of the column from which the longest entry among +duplicates will be selected.} } \value{ -Describe return, in detail +A data frame containing only the longest entries among duplicates +based on the specified column. 
} \description{ Pick Longer Duplicate diff --git a/man/shortenLineage.Rd b/man/shortenLineage.Rd index f495fb32..00200f96 100644 --- a/man/shortenLineage.Rd +++ b/man/shortenLineage.Rd @@ -2,18 +2,34 @@ % Please edit documentation in R/plotting.R \name{shortenLineage} \alias{shortenLineage} -\title{Shorten Lineage} +\title{Shorten Lineage Names} \usage{ shortenLineage(data, colname = "Lineage", abr_len = 1) } \arguments{ -\item{abr_len}{} +\item{data}{A data frame that contains a column with lineage names to be +shortened.} + +\item{colname}{Character. The name of the column in the data frame containing +the lineage strings to be shortened. Default is \code{"Lineage"}.} + +\item{abr_len}{Integer. The number of characters to retain after the first +letter. If set to 1, only the first letter of each segment before the +delimiter (\code{>}) is retained. Default is 1.} +} +\value{ +A modified data frame where the specified lineage column has been +shortened. } \description{ -Shorten Lineage +This function abbreviates lineage names by shortening the first part of the +string (up to a given delimiter). } \examples{ \dontrun{ -shortenLineage() +df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", +"Archaea>Euryarchaeota>Thermococci")) +shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1) +print(shortened_df) } } diff --git a/man/summarizebylineage.Rd b/man/summarizeByLineage.Rd similarity index 100% rename from man/summarizebylineage.Rd rename to man/summarizeByLineage.Rd diff --git a/man/theme_genes2.Rd b/man/theme_genes2.Rd index 29f79673..d1420067 100644 --- a/man/theme_genes2.Rd +++ b/man/theme_genes2.Rd @@ -6,6 +6,19 @@ \usage{ theme_genes2() } +\value{ +A ggplot2 theme object. 
+}
 \description{
 Theme Genes2
 }
+\examples{
+library(ggplot2)
+
+# Create a sample plot using the custom theme
+ggplot(mtcars, aes(x = wt, y = mpg)) +
+    geom_point() +
+    theme_genes2() +
+    labs(title = "Car Weight vs MPG")
+
+}
diff --git a/man/to_titlecase.Rd b/man/to_titlecase.Rd
index 45139d3b..1b142875 100644
--- a/man/to_titlecase.Rd
+++ b/man/to_titlecase.Rd
@@ -13,10 +13,17 @@ to_titlecase(text, delimitter)
 
 \item{y}{Delimitter. Default is space (" ").}
 }
+\value{
+A character vector in title case.
+}
 \description{
 Translate string to Title Case w/ delimitter.
 Changing case to 'Title Case'
 }
+\examples{
+to_titlecase("hello world")
+to_titlecase("this is a test", "_")
+}
 \seealso{
 chartr, toupper, and tolower.
 }
diff --git a/man/totalgencontextordomarchcounts.Rd b/man/totalGenContextOrDomArchCounts.Rd
similarity index 100%
rename from man/totalgencontextordomarchcounts.Rd
rename to man/totalGenContextOrDomArchCounts.Rd
diff --git a/man/validateCountDF.Rd b/man/validateCountDF.Rd
index fc4aefa2..5943723e 100644
--- a/man/validateCountDF.Rd
+++ b/man/validateCountDF.Rd
@@ -7,8 +7,16 @@
 validateCountDF(var)
 }
 \arguments{
-\item{var}{}
+\item{var}{A data frame whose columns are to be converted.}
+}
+\value{
+A data frame with non-'n' columns converted to character type.
 }
 \description{
 validateCountDF
 }
+\examples{
+\dontrun{
+new_df <- validateCountDF(my_data)
+}
+}
diff --git a/man/wordcloud3.Rd b/man/wordcloud3.Rd
index cce07a82..1406ea0d 100644
--- a/man/wordcloud3.Rd
+++ b/man/wordcloud3.Rd
@@ -25,8 +25,60 @@ wordcloud3(
 )
 }
 \arguments{
-\item{hoverFunction}{}
+\item{data}{Data frame or table containing words and their frequencies for
+the word cloud.}
+
+\item{size}{Numeric. Scaling factor for word sizes (default is 1).}
+
+\item{minSize}{Numeric. Minimum font size for the smallest word
+(default is 0).}
+
+\item{gridSize}{Numeric. Size of the grid for placing words (default is 0).}
+
+\item{fontFamily}{Character. 
Font family to use for the words +(default is "Segoe UI").} + +\item{fontWeight}{Character. Font weight for the words (default is "bold").} + +\item{color}{Character or vector. Color of the words. Use "random-dark" for +random dark colors (default) or specify a color.} + +\item{backgroundColor}{Character. Background color of the word cloud +(default is "white").} + +\item{minRotation}{Numeric. Minimum rotation angle of words in radians +(default is -π/4).} + +\item{maxRotation}{Numeric. Maximum rotation angle of words in radians +(default is π/4).} + +\item{shuffle}{Logical. Whether to shuffle the words (default is TRUE).} + +\item{rotateRatio}{Numeric. Proportion of words that are rotated +(default is 0.4).} + +\item{shape}{Character. Shape of the word cloud ("circle" is default, but +you can use "cardioid", "star", "triangle", etc.).} + +\item{ellipticity}{Numeric. Degree of ellipticity (default is 0.65).} + +\item{widgetsize}{Numeric vector. Width and height of the widget +(default is NULL, which uses default size).} + +\item{figPath}{Character. Path to an image file to use as a mask for the +word cloud (optional).} + +\item{hoverFunction}{JS function. JavaScript function to run when hovering +over words (optional).} +} +\value{ +An HTML widget object displaying a word cloud. 
} \description{ plotWordCloud3 } +\examples{ +\dontrun{ +wordcloud3(data = your_data, size = 1.5, color = "random-light") +} +} diff --git a/man/words2wordcounts.Rd b/man/words2WordCounts.Rd similarity index 100% rename from man/words2wordcounts.Rd rename to man/words2WordCounts.Rd diff --git a/man/write.MsaAAMultipleAlignment.Rd b/man/write.MsaAAMultipleAlignment.Rd index 17a05f50..6d660b9e 100644 --- a/man/write.MsaAAMultipleAlignment.Rd +++ b/man/write.MsaAAMultipleAlignment.Rd @@ -13,6 +13,11 @@ write.MsaAAMultipleAlignment(alignment, outpath) \item{outpath}{Where the resulting FASTA file should be written to} } +\value{ +Character string representing the content of the written FASTA file. + +Character string of the FASTA content that was written to the file. +} \description{ MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package @@ -21,6 +26,17 @@ Write MsaAAMultpleAlignment Objects as algined fasta sequence MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package } +\examples{ +\dontrun{ +alignment <- msaMuscle("my_sequences.fasta") +write.MsaAAMultipleAlignment(alignment, "aligned_sequences.fasta") +} +\dontrun{ +# Example usage +alignment <- alignFasta("path/to/sequences.fasta") +write.MsaAAMultipleAlignment(alignment, "path/to/aligned_sequences.fasta") +} +} \author{ Samuel Chen, Janani Ravi } From 74b83ab58bbd3463217f211b861918f5daa2b6dd Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Fri, 11 Oct 2024 01:59:14 +0300 Subject: [PATCH 25/61] remove import Signed-off-by: Awa Synthia --- NAMESPACE | 1 - R/msa.R | 1 - 2 files changed, 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 50943690..078f971b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -230,7 +230,6 @@ importFrom(purrr,map2) importFrom(purrr,map_chr) importFrom(purrr,pmap) importFrom(purrr,pmap_dfr) -importFrom(rMSA,kalign) importFrom(readr,cols) importFrom(readr,read_delim) 
importFrom(readr,read_file) diff --git a/R/msa.R b/R/msa.R index 20089dba..7d0d9be5 100644 --- a/R/msa.R +++ b/R/msa.R @@ -196,7 +196,6 @@ msa_pdf <- function(fasta_path, out_path = NULL, #' will be saved. #' #' @importFrom Biostrings readAAStringSet -#' @importFrom rMSA kalign #' #' @return A list containing the alignment object and the output file path. #' @export From 2da3d1a1eadb1c3d6f140700444e15db46c341d2 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 08:40:17 -0600 Subject: [PATCH 26/61] summarize.R adjustments - add back importFrom n_distinct() as it appears to be used by summarizeGenContext() - use function call as title -- may specify this in MolEvolvR style guide for consistency - adjust Rd grouping with MolEvolvR_summary @rdname tag for functions that had a clear summary element. This will hopefully avoid confusion with the rather ubiquitous dplyr::summarize - converted some code comments to placeholder descriptions --- NAMESPACE | 1 + R/summarize.R | 58 +++---- man/{summarize.Rd => MolEvolvR_summary.Rd} | 159 ++++---------------- man/countbycolumn.Rd | 26 +++- man/elements2Words.Rd | 25 ++- man/filterbydomains.Rd | 2 +- man/filterbyfrequency.Rd | 14 +- man/findparalogs.Rd | 2 +- man/summarizeDomArch.Rd | 22 --- man/summarizeDomArch_ByLineage.Rd | 22 --- man/summarizeGenContext.Rd | 22 --- man/summarizeGenContext_ByDomArchLineage.Rd | 22 --- man/summarizeGenContext_ByLineage.Rd | 22 --- man/summarizebylineage.Rd | 25 --- man/totalgencontextordomarchcounts.Rd | 42 ------ man/words2wordcounts.Rd | 13 +- 16 files changed, 122 insertions(+), 355 deletions(-) rename man/{summarize.Rd => MolEvolvR_summary.Rd} (52%) delete mode 100644 man/summarizeDomArch.Rd delete mode 100644 man/summarizeDomArch_ByLineage.Rd delete mode 100644 man/summarizeGenContext.Rd delete mode 100644 man/summarizeGenContext_ByDomArchLineage.Rd delete mode 100644 man/summarizeGenContext_ByLineage.Rd delete mode 100644 man/summarizebylineage.Rd delete mode 100644 
man/totalgencontextordomarchcounts.Rd diff --git a/NAMESPACE b/NAMESPACE index 2326fc1f..53332439 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,6 +139,7 @@ importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) +importFrom(dplyr,n_distinct) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,right_join) diff --git a/R/summarize.R b/R/summarize.R index 321a0488..2816f174 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -10,7 +10,7 @@ # suppressPackageStartupMessages(library(rlang)) # conflicted::conflict_prefer("filter", "dplyr") -#' Filter by Domains +#' filterByDomains #' #' @author Samuel Chen, Janani Ravi #' @description filterByDomains filters a data frame by identifying exact domain matches @@ -29,7 +29,6 @@ #' #' @return Filtered data frame #' @note There is no need to make the domains 'regex safe', that will be handled by this function -#' @name summarize #' @export #' #' @examples @@ -89,9 +88,11 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo ## COUNTS of DAs and GCs ## ## Before/after break up ## ########################### -## Function to obtain element counts (DA, GC) -#' Count By Column -#' + +#' countByColumn +#' @description +#' Function to obtain element counts (DA, GC) +#' #' @param prot A data frame containing the dataset to analyze, typically with #' multiple columns including the one specified by the `column` parameter. #' @param column A character string specifying the name of the column to analyze. @@ -111,7 +112,6 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' The tibble is filtered to only include elements that have a frequency #' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). 
-#' @name summarize #' @export #' #' @examples @@ -131,7 +131,7 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { return(counts) } -#' Elements 2 Words +#' elements2Words #' #' @description #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -156,7 +156,6 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @return A single string where elements are delimited by spaces. The function #' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. -#' @name summarize #' #' @examples #' \dontrun{ @@ -196,7 +195,7 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" return(z3) } -#' Words 2 Word Counts +#' words2WordCounts #' #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] @@ -215,7 +214,6 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' \item{`freq`}{A column containing the frequency counts for each word.} #' } #' -#' @name summarize #' #' @examples #' \dontrun{ @@ -252,9 +250,11 @@ words2WordCounts <- function(string) { arrange(-freq) return(df_word_count) } -## Function to filter based on frequencies -#' Filter Frequency -#' + +#' filterByFrequency +#' @description +#' Function to filter based on frequencies +#' #' @param x A tibble (tbl_df) containing at least two columns: one for #' elements (e.g., `words`) and one for their frequency (e.g., `freq`). #' @param min.freq A numeric value specifying the minimum frequency threshold. @@ -263,7 +263,6 @@ words2WordCounts <- function(string) { #' #' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. 
-#' @name summarize #' #' @export #' @@ -279,7 +278,14 @@ filterByFrequency <- function(x, min.freq) { ######################### ## SUMMARY FUNCTIONS #### ######################### -#' Summarize by Lineage +#' MolEvolvR Summary +#' @name MolEvolvR_summary +#' @description +#' A collection of summary functions for the MolEvolvR package. +#' +NULL + +#' summarizeByLineage #' #' @param prot A dataframe or tibble containing the data. #' @param column A string representing the column to be summarized @@ -295,7 +301,7 @@ filterByFrequency <- function(x, min.freq) { #' @return A tibble summarizing the counts of occurrences of elements in #' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -341,7 +347,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency #' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. -#' @name summarize +#' @rdname MolEvolvR_summary #' #' @export #' @@ -357,7 +363,7 @@ summarizeDomArch_ByLineage <- function(x) { arrange(desc(count)) } -## Function to retrieve counts of how many lineages a DomArch appears in + #' summarizeDomArch #' #' @description @@ -375,7 +381,7 @@ summarizeDomArch_ByLineage <- function(x) { #' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -407,7 +413,7 @@ summarizeDomArch <- function(x) { #' `GenContext`, `DomArch`, and `Lineage`. #' #' The results are arranged in descending order of `count`. 
-#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -432,7 +438,7 @@ summarizeGenContext_ByDomArchLineage <- function(x) { #' @importFrom dplyr arrange desc filter group_by n summarise #' #' @return Describe return, in detail -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -455,7 +461,7 @@ summarizeGenContext_ByLineage <- function(x) { #' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' -#' @importFrom dplyr arrange desc filter group_by n summarise +#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise #' #' @return A tibble summarizing each unique combination of `GenContext` and #' `Lineage`, along with the following columns: @@ -465,7 +471,7 @@ summarizeGenContext_ByLineage <- function(x) { #' `GenContext` and `Lineage`. #' #' The results are arranged in descending order of `count`. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -487,7 +493,7 @@ summarizeGenContext <- function(x) { ################## -#' Total Counts +#' totalGenContextOrDomArchCounts #' #' @description #' Creates a data frame with a totalcount column @@ -518,7 +524,7 @@ summarizeGenContext <- function(x) { #' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or @@ -670,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col -#' Find Paralogs +#' findParalogs #' #' @description #' Creates a data frame of paralogs. 
diff --git a/man/summarize.Rd b/man/MolEvolvR_summary.Rd similarity index 52% rename from man/summarize.Rd rename to man/MolEvolvR_summary.Rd index f149f686..262c4719 100644 --- a/man/summarize.Rd +++ b/man/MolEvolvR_summary.Rd @@ -1,50 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/summarize.R -\name{summarize} -\alias{summarize} -\alias{filter_by_doms} -\alias{count_bycol} -\alias{elements2words} -\alias{words2wc} -\alias{filter_freq} -\alias{summarize_bylin} -\alias{summ.DA.byLin} -\alias{summ.DA} -\alias{summ.GC.byDALin} -\alias{summ.GC.byLin} -\alias{summ.GC} -\alias{total_counts} -\title{Filter by Domains} +\name{MolEvolvR_summary} +\alias{MolEvolvR_summary} +\alias{summarizeByLineage} +\alias{summarizeDomArch_ByLineage} +\alias{summarizeDomArch} +\alias{summarizeGenContext_ByDomArchLineage} +\alias{summarizeGenContext_ByLineage} +\alias{summarizeGenContext} +\alias{totalGenContextOrDomArchCounts} +\title{MolEvolvR Summary} \usage{ -filter_by_doms( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) - -count_bycol(prot = prot, column = "DomArch", min.freq = 1) - -elements2words(prot, column = "DomArch", conversion_type = "da2doms") - -words2wc(string) +summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -filter_freq(x, min.freq) +summarizeDomArch_ByLineage(x) -summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) +summarizeDomArch(x) -summ.DA.byLin(x) +summarizeGenContext_ByDomArchLineage(x) -summ.DA(x) +summarizeGenContext_ByLineage(x) -summ.GC.byDALin(x) +summarizeGenContext(x) -summ.GC.byLin(x) - -summ.GC(x) - -total_counts( +totalGenContextOrDomArchCounts( prot, column = "DomArch", lineage_col = "Lineage", @@ -59,39 +38,15 @@ total_counts( \item{column}{Character. 
The column to summarize, default is "DomArch".} -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} - -\item{min.freq}{A numeric value specifying the minimum frequency threshold. -Only elements with frequencies greater than or equal to this value will be -retained.} - -\item{conversion_type}{A character string specifying the type of conversion. -Two options are available: -\describe{ -\item{\code{da2doms}}{Convert domain architectures into individual domains by -replacing \code{+} symbols with spaces.} -\item{\code{gc2da}}{Convert genomic context into domain architectures by -replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} -}} - -\item{string}{A character string containing the elements (words) to count. -This would typically be a space-delimited string representing domain -architectures or genomic contexts.} - -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} - \item{by}{A string representing the grouping column (e.g., \code{Lineage}). Default is "Lineage".} \item{query}{A string specifying the query pattern for filtering the target column. Use "all" to skip filtering and include all rows.} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} + \item{lineage_col}{Character. The name of the lineage column, default is "Lineage".} @@ -105,33 +60,6 @@ cutoff. 
Default is FALSE.} Default is 2.} } \value{ -Filtered data frame - -A tibble with two columns: -\describe{ -\item{\code{column}}{The unique elements from the specified column -(e.g., "DomArch").} -\item{\code{freq}}{The frequency of each element, i.e., the number of times -each element appears in the specified column.} -} -The tibble is filtered to only include elements that have a frequency -greater than or equal to \code{min.freq} and does not include elements with \code{NA} -values or those starting with a hyphen ("-"). - -A single string where elements are delimited by spaces. The function -performs necessary substitutions based on the \code{conversion_type} and cleans up -extraneous characters like newlines, tabs, and multiple spaces. - -A tibble (tbl_df) with two columns: -\describe{ -\item{\code{words}}{A column containing the individual words -(domains or domain architectures).} -\item{\code{freq}}{A column containing the frequency counts for each word.} -} - -A tibble with the same structure as \code{x}, but filtered to include -only rows where the frequency is greater than or equal to \code{min.freq}. - A tibble summarizing the counts of occurrences of elements in the \code{column}, grouped by the \code{by} column. The result includes the number of occurrences (\code{count}) and is arranged in descending order of count. @@ -187,13 +115,7 @@ the overall count. } } \description{ -filter_by_doms filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain - -Break string ELEMENTS into WORDS for domain architecture (DA) and genomic -context (GC) - -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} +A collection of summary functions for the MolEvolvR package. 
Function to summarize and retrieve counts by Domains & Domains+Lineage @@ -204,57 +126,32 @@ Creates a data frame with a totalcount column This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. } \note{ -There is no need to make the domains 'regex safe', that will be handled by this function - Please refer to the source code if you have alternate file formats and/or column names. } \examples{ \dontrun{ -filter_by_doms() -} -\dontrun{ -count_bycol(prot = my_data, column = "DomArch", min.freq = 10) -} -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", -"a+b", "b+c", "b-c")) |> elements2words() -} - -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2words() |> - words2wc() -} - -\dontrun{ -filter_freq() -} -\dontrun{ library(tidyverse) tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarize_bylin(query = "all") + summarizeByLineage(query = "all") } \dontrun{ -summ.DA.byLin() +summarizeDomArch_ByLineage() } \dontrun{ -summ.DA() +summarizeDomArch() } \dontrun{ -summ.GC.byDALin +summarizeGenContext_ByDomArchLineage } \dontrun{ -summ.GC.byLin() +summarizeGenContext_ByLineage() } \dontrun{ -summ.GC() +summarizeGenContext() } \dontrun{ -total_counts(pspa - gc_lin_counts, 0, "GC") -} +totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") } -\author{ -Samuel Chen, Janani Ravi } diff --git a/man/countbycolumn.Rd b/man/countbycolumn.Rd index 34fcc3e0..57ff9ac4 100644 --- a/man/countbycolumn.Rd +++ b/man/countbycolumn.Rd @@ -2,21 +2,37 @@ % Please edit documentation in R/summarize.R \name{countByColumn} \alias{countByColumn} -\title{Count By Column} +\title{countByColumn} \usage{ countByColumn(prot = prot, column = "DomArch", min.freq = 1) } \arguments{ -\item{min.freq}{} +\item{prot}{A data frame containing the dataset to analyze, typically with +multiple columns including the one specified by the 
\code{column} parameter.} + +\item{column}{A character string specifying the name of the column to analyze. +The default is "DomArch".} + +\item{min.freq}{An integer specifying the minimum frequency an element must +have to be included in the output. Default is 1.} } \value{ -Describe return, in detail +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). } \description{ -Count By Column +Function to obtain element counts (DA, GC) } \examples{ \dontrun{ -countByColumn() +countByColumn(prot = my_data, column = "DomArch", min.freq = 10) } } diff --git a/man/elements2Words.Rd b/man/elements2Words.Rd index 1094d363..bfd3071b 100644 --- a/man/elements2Words.Rd +++ b/man/elements2Words.Rd @@ -2,20 +2,30 @@ % Please edit documentation in R/summarize.R \name{elements2Words} \alias{elements2Words} -\title{Elements 2 Words} +\title{elements2Words} \usage{ elements2Words(prot, column = "DomArch", conversion_type = "da2doms") } \arguments{ -\item{prot}{\link{dataframe}} +\item{prot}{A dataframe containing the dataset to analyze. The specified +\code{column} contains the string elements to be processed.} -\item{column}{\link{string} column name} +\item{column}{A character string specifying the name of the column to analyze. +Default is "DomArch".} -\item{conversion_type}{\link{string} type of conversion: 'da2doms': domain architectures to -domains. 'gc2da' genomic context to domain architectures} +\item{conversion_type}{A character string specifying the type of conversion. 
+Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} } \value{ -\link{string} with words delimited by spaces +A single string where elements are delimited by spaces. The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. } \description{ Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -23,7 +33,8 @@ context (GC) } \examples{ \dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2Words() +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2Words() } } diff --git a/man/filterbydomains.Rd b/man/filterbydomains.Rd index 8c885363..afb3e5cb 100644 --- a/man/filterbydomains.Rd +++ b/man/filterbydomains.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarize.R \name{filterByDomains} \alias{filterByDomains} -\title{Filter by Domains} +\title{filterByDomains} \usage{ filterByDomains( prot, diff --git a/man/filterbyfrequency.Rd b/man/filterbyfrequency.Rd index d2c5f9cd..15d06d67 100644 --- a/man/filterbyfrequency.Rd +++ b/man/filterbyfrequency.Rd @@ -2,18 +2,24 @@ % Please edit documentation in R/summarize.R \name{filterByFrequency} \alias{filterByFrequency} -\title{Filter Frequency} +\title{filterByFrequency} \usage{ filterByFrequency(x, min.freq) } \arguments{ -\item{min.freq}{} +\item{x}{A tibble (tbl_df) containing at least two columns: one for +elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. 
+Only elements with frequencies greater than or equal to this value will be +retained.} } \value{ -Describe return, in detail +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. } \description{ -Filter Frequency +Function to filter based on frequencies } \examples{ \dontrun{ diff --git a/man/findparalogs.Rd b/man/findparalogs.Rd index 4b5edbcf..d92edf71 100644 --- a/man/findparalogs.Rd +++ b/man/findparalogs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarize.R \name{findParalogs} \alias{findParalogs} -\title{Find Paralogs} +\title{findParalogs} \usage{ findParalogs(prot) } diff --git a/man/summarizeDomArch.Rd b/man/summarizeDomArch.Rd deleted file mode 100644 index 11db1afa..00000000 --- a/man/summarizeDomArch.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeDomArch} -\alias{summarizeDomArch} -\title{summarizeDomArch} -\usage{ -summarizeDomArch(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -Function to retrieve counts of how many lineages a DomArch appears in -} -\examples{ -\dontrun{ -summarizeDomArch() -} -} diff --git a/man/summarizeDomArch_ByLineage.Rd b/man/summarizeDomArch_ByLineage.Rd deleted file mode 100644 index cf5fac22..00000000 --- a/man/summarizeDomArch_ByLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeDomArch_ByLineage} -\alias{summarizeDomArch_ByLineage} -\title{summarizeDomArch_ByLineage} -\usage{ -summarizeDomArch_ByLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -Function to summarize and retrieve counts by Domains & Domains+Lineage -} -\examples{ -\dontrun{ -summarizeDomArch_ByLineage() -} -} diff --git a/man/summarizeGenContext.Rd b/man/summarizeGenContext.Rd 
deleted file mode 100644 index 5a40811b..00000000 --- a/man/summarizeGenContext.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext} -\alias{summarizeGenContext} -\title{summarizeGenContext} -\usage{ -summarizeGenContext(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summarizeGenContext -} -\examples{ -\dontrun{ -summarizeGenContext() -} -} diff --git a/man/summarizeGenContext_ByDomArchLineage.Rd b/man/summarizeGenContext_ByDomArchLineage.Rd deleted file mode 100644 index 59e0376e..00000000 --- a/man/summarizeGenContext_ByDomArchLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext_ByDomArchLineage} -\alias{summarizeGenContext_ByDomArchLineage} -\title{summarizeGenContext_ByDomArchLineage} -\usage{ -summarizeGenContext_ByDomArchLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Define return, in detail -} -\description{ -summarizeGenContext_ByDomArchLineage -} -\examples{ -\dontrun{ -summarizeGenContext_ByDomArchLineage -} -} diff --git a/man/summarizeGenContext_ByLineage.Rd b/man/summarizeGenContext_ByLineage.Rd deleted file mode 100644 index 932fe6a7..00000000 --- a/man/summarizeGenContext_ByLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext_ByLineage} -\alias{summarizeGenContext_ByLineage} -\title{summarizeGenContext_ByLineage} -\usage{ -summarizeGenContext_ByLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summarizeGenContext_ByLineage -} -\examples{ -\dontrun{ -summarizeGenContext_ByLineage() -} -} diff --git a/man/summarizebylineage.Rd b/man/summarizebylineage.Rd deleted file mode 100644 index 2e445913..00000000 --- a/man/summarizebylineage.Rd +++ 
/dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeByLineage} -\alias{summarizeByLineage} -\title{Summarize by Lineage} -\usage{ -summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{query}{} -} -\value{ -Describe return, in detail -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarizeByLineage(query = "all") -} - -} diff --git a/man/totalgencontextordomarchcounts.Rd b/man/totalgencontextordomarchcounts.Rd deleted file mode 100644 index f457cb6a..00000000 --- a/man/totalgencontextordomarchcounts.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{totalGenContextOrDomArchCounts} -\alias{totalGenContextOrDomArchCounts} -\title{Total Counts} -\usage{ -totalGenContextOrDomArchCounts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} - -\item{digits}{} -} -\value{ -Define return, in detail -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. 
-} -\examples{ -\dontrun{ -totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wordcounts.Rd b/man/words2wordcounts.Rd index 7f60f226..370dec7f 100644 --- a/man/words2wordcounts.Rd +++ b/man/words2wordcounts.Rd @@ -2,15 +2,22 @@ % Please edit documentation in R/summarize.R \name{words2WordCounts} \alias{words2WordCounts} -\title{Words 2 Word Counts} +\title{words2WordCounts} \usage{ words2WordCounts(string) } \arguments{ -\item{string}{} +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} } \value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} } \description{ Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} From 11b22113b52087c6a72e7df4b845d8f0323c367b Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 09:02:19 -0600 Subject: [PATCH 27/61] minor phrasing adjustment --- .github/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 9fcd6b7f..f9f8de97 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -32,7 +32,7 @@ See our guide on [how to create a great issue](https://code-review.tidyverse.org ``` usethis::create_from_github("JRaviLab/MolEvolvR", fork = TRUE) ``` -- Install Bioconductor dependencies: +- Install BiocManager from Bioconductor: ``` if (!require("BiocManager", quietly = TRUE)) From 851d8796c9d9f4d895fd92f5eacb8f1eab45eda9 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 09:02:36 -0600 Subject: [PATCH 28/61] skip sending quarto files to Git --- .github/.gitignore | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/.github/.gitignore b/.github/.gitignore index 2d19fc76..5c86aa40 100644 --- a/.github/.gitignore +++ b/.github/.gitignore @@ -1 +1,3 @@ *.html + +/.quarto/ From 2d00b6fa42b124acf8cd3cd63e60cec745d71a10 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 13:46:03 -0600 Subject: [PATCH 29/61] modify .Rd names --- R/ipr2viz.R | 9 ++++---- man/countByColumn.Rd | 38 +++++++++++++++++++++++++++++++ man/filterByDomains.Rd | 44 ++++++++++++++++++++++++++++++++++++ man/filterByFrequency.Rd | 28 +++++++++++++++++++++++ man/findParalogs.Rd | 26 +++++++++++++++++++++ man/getTopAccByLinDomArch.Rd | 2 +- man/plotIPR2Viz.Rd | 4 ++-- man/plotIPR2VizWeb.Rd | 4 ++-- man/themeGenes2.Rd | 4 ++-- man/words2WordCounts.Rd | 32 ++++++++++++++++++++++++++ 10 files changed, 180 insertions(+), 11 deletions(-) create mode 100644 man/countByColumn.Rd create mode 100644 man/filterByDomains.Rd create mode 100644 man/filterByFrequency.Rd create mode 100644 man/findParalogs.Rd create mode 100644 man/words2WordCounts.Rd diff --git a/R/ipr2viz.R b/R/ipr2viz.R index dff6e67a..9b625d4e 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -15,7 +15,7 @@ ################################# ## themeGenes2 adapted from theme_genes (w/o strip.text()) ## https://github.com/wilkox/gggenes/blob/master/R/theme_genes.R -#' Theme Genes2 +#' themeGenes2 #' #' @importFrom ggplot2 element_blank element_line theme theme_grey #' @@ -41,7 +41,8 @@ themeGenes2 <- function() { ################################## ## Get Top N AccNum by Lin+DomArch ################################## -#' Group by lineage + DA then take top 20 +#' getTopAccByLinDomArch +#' @description Group by lineage + DA then take top 20 #' #' @param infile_full #' @param DA_col @@ -92,7 +93,7 @@ getTopAccByLinDomArch <- function(infile_full, ############################################# ## IPR + FULL files --> DomArch Visualization ############################################# -#' IPR2Viz +#' plotIPR2Viz #' #' @param 
infile_ipr #' @param infile_full @@ -248,7 +249,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), return(plot) } -#' IPR2Viz Web +#' plotIPR2VizWeb #' #' @param infile_ipr #' @param accessions diff --git a/man/countByColumn.Rd b/man/countByColumn.Rd new file mode 100644 index 00000000..57ff9ac4 --- /dev/null +++ b/man/countByColumn.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{countByColumn} +\alias{countByColumn} +\title{countByColumn} +\usage{ +countByColumn(prot = prot, column = "DomArch", min.freq = 1) +} +\arguments{ +\item{prot}{A data frame containing the dataset to analyze, typically with +multiple columns including the one specified by the \code{column} parameter.} + +\item{column}{A character string specifying the name of the column to analyze. +The default is "DomArch".} + +\item{min.freq}{An integer specifying the minimum frequency an element must +have to be included in the output. Default is 1.} +} +\value{ +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). 
+} +\description{ +Function to obtain element counts (DA, GC) +} +\examples{ +\dontrun{ +countByColumn(prot = my_data, column = "DomArch", min.freq = 10) +} +} diff --git a/man/filterByDomains.Rd b/man/filterByDomains.Rd new file mode 100644 index 00000000..afb3e5cb --- /dev/null +++ b/man/filterByDomains.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{filterByDomains} +\alias{filterByDomains} +\title{filterByDomains} +\usage{ +filterByDomains( + prot, + column = "DomArch", + doms_keep = c(), + doms_remove = c(), + ignore.case = FALSE +) +} +\arguments{ +\item{prot}{Dataframe to filter} + +\item{column}{Column to search for domains in (DomArch column)} + +\item{doms_keep}{Vector of domains that must be identified within column in order for +observation to be kept} + +\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} + +\item{ignore.case}{Should the matching be non case sensitive} +} +\value{ +Filtered data frame +} +\description{ +filterByDomains filters a data frame by identifying exact domain matches +and either keeping or removing rows with the identified domain +} +\note{ +There is no need to make the domains 'regex safe', that will be handled by this function +} +\examples{ +\dontrun{ +filterByDomains() +} +} +\author{ +Samuel Chen, Janani Ravi +} diff --git a/man/filterByFrequency.Rd b/man/filterByFrequency.Rd new file mode 100644 index 00000000..15d06d67 --- /dev/null +++ b/man/filterByFrequency.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{filterByFrequency} +\alias{filterByFrequency} +\title{filterByFrequency} +\usage{ +filterByFrequency(x, min.freq) +} +\arguments{ +\item{x}{A tibble (tbl_df) containing at least two columns: one for +elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} + +\item{min.freq}{A numeric value specifying the minimum 
frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} +} +\value{ +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. +} +\description{ +Function to filter based on frequencies +} +\examples{ +\dontrun{ +filterByFrequency() +} +} diff --git a/man/findParalogs.Rd b/man/findParalogs.Rd new file mode 100644 index 00000000..d92edf71 --- /dev/null +++ b/man/findParalogs.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{findParalogs} +\alias{findParalogs} +\title{findParalogs} +\usage{ +findParalogs(prot) +} +\arguments{ +\item{prot}{A data frame filtered by a Query, containing columns Species and Lineage} +} +\value{ +returns a dataframe containing paralogs and the counts. +} +\description{ +Creates a data frame of paralogs. +} +\note{ +Please refer to the source code if you have alternate file formats and/or +column names. 
+} +\examples{ +\dontrun{ +findParalogs(pspa) +} +} diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index a00da5c7..b8571350 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ipr2viz.R \name{getTopAccByLinDomArch} \alias{getTopAccByLinDomArch} -\title{Group by lineage + DA then take top 20} +\title{getTopAccByLinDomArch} \usage{ getTopAccByLinDomArch( infile_full, diff --git a/man/plotIPR2Viz.Rd b/man/plotIPR2Viz.Rd index 22297312..7ed420c9 100644 --- a/man/plotIPR2Viz.Rd +++ b/man/plotIPR2Viz.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ipr2viz.R \name{plotIPR2Viz} \alias{plotIPR2Viz} -\title{IPR2Viz} +\title{plotIPR2Viz} \usage{ plotIPR2Viz( infile_ipr = NULL, @@ -20,5 +20,5 @@ plotIPR2Viz( \item{query}{} } \description{ -IPR2Viz +plotIPR2Viz } diff --git a/man/plotIPR2VizWeb.Rd b/man/plotIPR2VizWeb.Rd index 4b4394ad..3b94a5a7 100644 --- a/man/plotIPR2VizWeb.Rd +++ b/man/plotIPR2VizWeb.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ipr2viz.R \name{plotIPR2VizWeb} \alias{plotIPR2VizWeb} -\title{IPR2Viz Web} +\title{plotIPR2VizWeb} \usage{ plotIPR2VizWeb( infile_ipr, @@ -20,5 +20,5 @@ plotIPR2VizWeb( \item{rows}{} } \description{ -IPR2Viz Web +plotIPR2VizWeb } diff --git a/man/themeGenes2.Rd b/man/themeGenes2.Rd index 1553e019..64ae9273 100644 --- a/man/themeGenes2.Rd +++ b/man/themeGenes2.Rd @@ -2,10 +2,10 @@ % Please edit documentation in R/ipr2viz.R \name{themeGenes2} \alias{themeGenes2} -\title{Theme Genes2} +\title{themeGenes2} \usage{ themeGenes2() } \description{ -Theme Genes2 +themeGenes2 } diff --git a/man/words2WordCounts.Rd b/man/words2WordCounts.Rd new file mode 100644 index 00000000..370dec7f --- /dev/null +++ b/man/words2WordCounts.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{words2WordCounts} +\alias{words2WordCounts} +\title{words2WordCounts} +\usage{ 
+words2WordCounts(string) +} +\arguments{ +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} +} +\value{ +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} +} +\description{ +Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} +} +\examples{ +\dontrun{ +tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> + elements2Words() |> + words2WordCounts() +} + +} From 56b39da61292ae0facc31c104e90927f2483413e Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 13:54:22 -0600 Subject: [PATCH 30/61] let R manage NAMESPACE sort order --- NAMESPACE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 08f3aa92..dc5c95a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,8 +52,8 @@ export(gc_undirected_network) export(generateAllAlignments2FA) export(generate_all_aln2fa) export(generate_msa) -export(getTopAccByLinDomArch) export(getAccNumFromFA) +export(getTopAccByLinDomArch) export(get_accnums_from_fasta_file) export(get_proc_medians) export(get_proc_weights) From a74fb69a54f6a6ca39005f0b4d8cbf4dc15ee91c Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 18:41:02 -0600 Subject: [PATCH 31/61] maintain function name consistency with CHANGED-pre-msa-tree.R and pre-msa-tree.R while we determine where these functions should live. 
--- NAMESPACE | 1 - R/CHANGED-pre-msa-tree.R | 6 +++--- man/write.MsaAAMultipleAlignment.Rd | 20 -------------------- man/writeMSA_AA2FA.Rd | 7 ++++++- 4 files changed, 9 insertions(+), 25 deletions(-) delete mode 100644 man/write.MsaAAMultipleAlignment.Rd diff --git a/NAMESPACE b/NAMESPACE index dc5c95a4..7271b65f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -104,7 +104,6 @@ export(to_titlecase) export(totalGenContextOrDomArchCounts) export(validateCountDF) export(wordcloud3) -export(write.MsaAAMultipleAlignment) export(writeMSA_AA2FA) export(write_proc_medians_table) export(write_proc_medians_yml) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index c4a97589..a755df8c 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -610,12 +610,12 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { ) if (typeof(outpath) == "character") { - write.MsaAAMultipleAlignment(aligned, outpath) + writeMSA_AA2FA(aligned, outpath) } return(aligned) } -#' Write MsaAAMultpleAlignment Objects as algined fasta sequence +#' writeMSA_AA2FA #' #' @description #' MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega @@ -632,7 +632,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @export #' #' @examples -write.MsaAAMultipleAlignment <- function(alignment, outpath) { +writeMSA_AA2FA <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" for (i in 1:l) diff --git a/man/write.MsaAAMultipleAlignment.Rd b/man/write.MsaAAMultipleAlignment.Rd deleted file mode 100644 index e26f26e7..00000000 --- a/man/write.MsaAAMultipleAlignment.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R -\name{write.MsaAAMultipleAlignment} -\alias{write.MsaAAMultipleAlignment} -\title{Write MsaAAMultpleAlignment Objects as algined fasta sequence} -\usage{ -write.MsaAAMultipleAlignment(alignment, outpath) -} 
-\arguments{ -\item{alignment}{MsaAAMultipleAlignment object to be written as a fasta} - -\item{outpath}{Where the resulting FASTA file should be written to} -} -\description{ -MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega -and msaMuscle from the 'msa' package -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/writeMSA_AA2FA.Rd b/man/writeMSA_AA2FA.Rd index 068e5b63..a6798469 100644 --- a/man/writeMSA_AA2FA.Rd +++ b/man/writeMSA_AA2FA.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{writeMSA_AA2FA} \alias{writeMSA_AA2FA} \title{writeMSA_AA2FA} \usage{ +writeMSA_AA2FA(alignment, outpath) + writeMSA_AA2FA(alignment, outpath) } \arguments{ @@ -12,6 +14,9 @@ writeMSA_AA2FA(alignment, outpath) \item{outpath}{Where the resulting FASTA file should be written to} } \description{ +MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega +and msaMuscle from the 'msa' package + Write MsaAAMultpleAlignment Objects as aligned fasta sequence MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package From 5fcd985a88ab270245a554a44adb557fa02acaed Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 18:42:56 -0600 Subject: [PATCH 32/61] maintain function name consistency across .R files while other determinations are made - getAccNumFromFA() --- NAMESPACE | 1 - R/CHANGED-pre-msa-tree.R | 4 ++-- man/getAccNumFromFA.Rd | 6 +++++- man/get_accnums_from_fasta_file.Rd | 14 -------------- 4 files changed, 7 insertions(+), 18 deletions(-) delete mode 100644 man/get_accnums_from_fasta_file.Rd diff --git a/NAMESPACE b/NAMESPACE index 7271b65f..23b29248 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -54,7 +54,6 @@ export(generate_all_aln2fa) export(generate_msa) export(getAccNumFromFA) export(getTopAccByLinDomArch) 
-export(get_accnums_from_fasta_file) export(get_proc_medians) export(get_proc_weights) export(make_opts2procs) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index a755df8c..767d51aa 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -645,7 +645,7 @@ writeMSA_AA2FA <- function(alignment, outpath) { return(fasta) } -#' Get accnums from fasta file +#' getAccNumFromFA #' #' @param fasta_file #' @@ -655,7 +655,7 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' @export #' #' @examples -get_accnums_from_fasta_file <- function(fasta_file) { +getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] return(accnums) diff --git a/man/getAccNumFromFA.Rd b/man/getAccNumFromFA.Rd index f2409965..d3ab8177 100644 --- a/man/getAccNumFromFA.Rd +++ b/man/getAccNumFromFA.Rd @@ -1,14 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{getAccNumFromFA} \alias{getAccNumFromFA} \title{getAccNumFromFA} \usage{ +getAccNumFromFA(fasta_file) + getAccNumFromFA(fasta_file) } \arguments{ \item{fasta_file}{} } \description{ +getAccNumFromFA + getAccNumFromFA } diff --git a/man/get_accnums_from_fasta_file.Rd b/man/get_accnums_from_fasta_file.Rd deleted file mode 100644 index f545d1a0..00000000 --- a/man/get_accnums_from_fasta_file.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R -\name{get_accnums_from_fasta_file} -\alias{get_accnums_from_fasta_file} -\title{Get accnums from fasta file} -\usage{ -get_accnums_from_fasta_file(fasta_file) -} -\arguments{ -\item{fasta_file}{} -} -\description{ -Get accnums from fasta file -} From d544f7ef932be8b44f04d1fae85bf715d976260b Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 18:54:49 -0600 
Subject: [PATCH 33/61] additional cross .R file consistency while other function placement decisions are made --- NAMESPACE | 7 ---- R/CHANGED-pre-msa-tree.R | 25 +++++++++++---- R/pre-msa-tree.R | 36 +++++++++++++-------- man/RepresentativeAccNums.Rd | 23 -------------- man/acc2fa.Rd | 3 ++ man/addLeaves2Alignment.Rd | 25 +++++++++++++-- man/addName.Rd | 18 +++++++++-- man/add_leaves.Rd | 50 ----------------------------- man/add_name.Rd | 39 ----------------------- man/alignFasta.Rd | 4 ++- man/convert2TitleCase.Rd | 9 +++++- man/convertAlignment2FA.Rd | 21 ++++++++++-- man/convert_aln2fa.Rd | 53 ------------------------------- man/createRepresentativeAccNum.Rd | 10 +++++- man/generateAllAlignments2FA.Rd | 35 ++++++++++++++++---- man/generate_all_aln2fa.Rd | 48 ---------------------------- man/mapAcc2Name.Rd | 10 ++++-- man/map_acc2name.Rd | 21 ------------ man/to_titlecase.Rd | 25 --------------- 19 files changed, 158 insertions(+), 304 deletions(-) delete mode 100644 man/RepresentativeAccNums.Rd delete mode 100644 man/add_leaves.Rd delete mode 100644 man/add_name.Rd delete mode 100644 man/convert_aln2fa.Rd delete mode 100644 man/generate_all_aln2fa.Rd delete mode 100644 man/map_acc2name.Rd delete mode 100644 man/to_titlecase.Rd diff --git a/NAMESPACE b/NAMESPACE index 23b29248..fe4c23d6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ export(BinaryDomainNetwork) export(GCA2Lineage) export(GenContextNetwork) export(IPG2Lineage) -export(RepresentativeAccNums) export(acc2FA) export(acc2Lineage) export(acc2fa) @@ -12,8 +11,6 @@ export(addLeaves2Alignment) export(addLineage) export(addName) export(addTaxID) -export(add_leaves) -export(add_name) export(advanced_opts2est_walltime) export(alignFasta) export(assign_job_queue) @@ -31,7 +28,6 @@ export(convert2TitleCase) export(convertAlignment2FA) export(convertAlignment2Trees) export(convertFA2Tree) -export(convert_aln2fa) export(countByColumn) export(createFA2Tree) export(createJobResultsURL) @@ -50,7 +46,6 @@ 
export(findParalogs) export(formatJobArgumentsHTML) export(gc_undirected_network) export(generateAllAlignments2FA) -export(generate_all_aln2fa) export(generate_msa) export(getAccNumFromFA) export(getTopAccByLinDomArch) @@ -58,7 +53,6 @@ export(get_proc_medians) export(get_proc_weights) export(make_opts2procs) export(mapAcc2Name) -export(map_acc2name) export(map_advanced_opts2procs) export(msa_pdf) export(plotIPR2Viz) @@ -99,7 +93,6 @@ export(summarizeGenContext) export(summarizeGenContext_ByDomArchLineage) export(summarizeGenContext_ByLineage) export(themeGenes2) -export(to_titlecase) export(totalGenContextOrDomArchCounts) export(validateCountDF) export(wordcloud3) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index 767d51aa..2f6c8a62 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -54,7 +54,7 @@ convert2TitleCase <- function(x, y = " ") { ################################ ## Function to add leaves to an alignment file ## !! Add DA to leaves? -#' Adding Leaves to an alignment file w/ accessions +#' addLeaves2Alignment #' #' @author Janani Ravi #' @keywords alignment, accnum, leaves, lineage, species @@ -178,7 +178,7 @@ addLeaves2Alignment <- function(aln_file = "", } -#' Add Name +#' addName #' #' @author Samuel Chen, Janani Ravi #' @description This function adds a new 'Name' column that is comprised of components from @@ -252,7 +252,7 @@ addName <- function(data, ################################ ## Function to convert alignment 'aln' to fasta format for MSA + Tree -#' Adding Leaves to an alignment file w/ accessions +#' convertAlignment2FA #' #' @author Janani Ravi #' @keywords alignment, accnum, leaves, lineage, species @@ -320,6 +320,9 @@ convertAlignment2FA <- function(aln_file = "", return(fasta) } +#' mapAcc2Name +#' +#' @description #' Default renameFA() replacement function. 
Maps an accession number to its name #' #' @param line The line of a fasta file starting with '>' @@ -382,6 +385,9 @@ renameFA <- function(fa_path, outpath, ################################ ## generateAllAlignments2FA +#' generateAllAlignments2FA +#' +#' @description #' Adding Leaves to an alignment file w/ accessions #' #' @keywords alignment, accnum, leaves, lineage, species @@ -441,10 +447,11 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), # accessions <- c("P12345","Q9UHC1","O15530","Q14624","P0DTD1") # accessions <- rep("ANY95992.1", 201) -#' acc2FA converts protein accession numbers to a fasta format. +#' acc2FA #' #' @description -#' Resulting fasta file is written to the outpath. +#' converts protein accession numbers to a fasta format. Resulting +#' fasta file is written to the outpath. #' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta @@ -539,6 +546,9 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { return(result) } +#' createRepresentativeAccNum +#' +#' @description #' Function to generate a vector of one Accession number per distinct observation from 'reduced' column #' #' @author Samuel Chen, Janani Ravi @@ -556,7 +566,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { #' @export #' #' @examples -RepresentativeAccNums <- function(prot_data, +createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column @@ -585,6 +595,9 @@ RepresentativeAccNums <- function(prot_data, return(accessions) } +#' alignFasta +#' +#' @description #' Perform a Multiple Sequence Alignment on a FASTA file. 
#' #' @author Samuel Chen, Janani Ravi diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index fed495f4..290a1644 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -49,7 +49,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @export #' #' @examples -to_titlecase <- function(x, y = " ") { +convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), sep = "", collapse = y @@ -59,7 +59,7 @@ to_titlecase <- function(x, y = " ") { ################################ ## Function to add leaves to an alignment file ## !! Add DA to leaves? -#' Adding Leaves to an alignment file w/ accessions +#' addLeaves2Alignment #' #' @author Janani Ravi #' @@ -95,9 +95,9 @@ to_titlecase <- function(x, y = " ") { #' #' @examples #' \dontrun{ -#' add_leaves("pspa_snf7.aln", "pspa.txt") +#' addLeaves2Alignment("pspa_snf7.aln", "pspa.txt") #' } -add_leaves <- function(aln_file = "", +addLeaves2Alignment <- function(aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! 
# lin_file="data/rawdata_tsv/PspA.txt", reduced = FALSE) { @@ -184,7 +184,7 @@ add_leaves <- function(aln_file = "", } -#' Title +#' addName #' #' @author Samuel Chen, Janani Ravi #' @@ -209,7 +209,7 @@ add_leaves <- function(aln_file = "", #' @export #' #' @examples -add_name <- function(data, +addName <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp") @@ -258,7 +258,7 @@ add_name <- function(data, ################################ ## Function to convert alignment 'aln' to fasta format for MSA + Tree -#' Adding Leaves to an alignment file w/ accessions +#' convertAlignment2FA #' #' @author Janani Ravi #' @@ -288,9 +288,9 @@ add_name <- function(data, #' #' @examples #' \dontrun{ -#' add_leaves("pspa_snf7.aln", "pspa.txt") +#' convertAlignment2FA("pspa_snf7.aln", "pspa.txt") #' } -convert_aln2fa <- function(aln_file = "", +convertAlignment2FA <- function(aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! fa_outpath = "", reduced = FALSE) { @@ -324,6 +324,9 @@ convert_aln2fa <- function(aln_file = "", return(fasta) } +#' mapAcc2Name +#' +#' @description #' Default rename_fasta() replacement function. 
Maps an accession number to its name #' #' @param line he line of a fasta file starting with '>' @@ -340,7 +343,7 @@ convert_aln2fa <- function(aln_file = "", #' @export #' #' @examples -map_acc2name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { +mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an add_names column # Find the first ' ' end_acc <- str_locate(line, " ")[[1]] @@ -386,7 +389,10 @@ rename_fasta <- function(fa_path, outpath, } ################################ -## generate_all_aln2fa +## generateAllAlignments2FA +#' generateAllAlignments2FA +#' +#' @description #' Adding Leaves to an alignment file w/ accessions #' #' @author Janani Ravi @@ -413,9 +419,9 @@ rename_fasta <- function(fa_path, outpath, #' #' @examples #' \dontrun{ -#' generate_all_aln2fa() +#' generateAllAlignments2FA() #' } -generate_all_aln2fa <- function(aln_path = here("data/rawdata_aln/"), +generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), fa_outpath = here("data/alns/"), lin_file = here("data/rawdata_tsv/all_semiclean.txt"), reduced = F) { @@ -448,6 +454,10 @@ generate_all_aln2fa <- function(aln_path = here("data/rawdata_aln/"), # accessions <- rep("ANY95992.1", 201) #' acc2fa #' +#' @description +#' converts protein accession numbers to a fasta format. Resulting +#' fasta file is written to the outpath. 
+#' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' diff --git a/man/RepresentativeAccNums.Rd b/man/RepresentativeAccNums.Rd deleted file mode 100644 index 57d1f1ab..00000000 --- a/man/RepresentativeAccNums.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R -\name{RepresentativeAccNums} -\alias{RepresentativeAccNums} -\title{Function to generate a vector of one Accession number per distinct observation from 'reduced' column} -\usage{ -RepresentativeAccNums(prot_data, reduced = "Lineage", accnum_col = "AccNum") -} -\arguments{ -\item{prot_data}{Data frame containing Accession Numbers} - -\item{reduced}{Column from prot_data from which distinct observations -will be generated from. -One accession number will be assigned for each of these observations} - -\item{accnum_col}{Column from prot_data that contains Accession Numbers} -} -\description{ -Function to generate a vector of one Accession number per distinct observation from 'reduced' column -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/acc2fa.Rd b/man/acc2fa.Rd index 158b2d51..3e7a756d 100644 --- a/man/acc2fa.Rd +++ b/man/acc2fa.Rd @@ -15,6 +15,9 @@ Function may not work for vectors of length > 10,000} \item{plan}{} } \description{ +converts protein accession numbers to a fasta format. Resulting +fasta file is written to the outpath. + acc2fa converts protein accession numbers to a fasta format. Resulting fasta file is written to the outpath. 
} diff --git a/man/addLeaves2Alignment.Rd b/man/addLeaves2Alignment.Rd index a758ebd5..d00e6df7 100644 --- a/man/addLeaves2Alignment.Rd +++ b/man/addLeaves2Alignment.Rd @@ -1,9 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{addLeaves2Alignment} \alias{addLeaves2Alignment} -\title{Adding Leaves to an alignment file w/ accessions} +\title{addLeaves2Alignment} \usage{ +addLeaves2Alignment( + aln_file = "", + lin_file = "data/rawdata_tsv/all_semiclean.txt", + reduced = FALSE +) + addLeaves2Alignment( aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", @@ -11,7 +17,7 @@ addLeaves2Alignment( ) } \arguments{ -\item{aln_file}{haracter. Path to file. Input tab-delimited file + +\item{aln_file}{Character. Path to file. Input tab-delimited file + alignment file accnum & alignment. Default is 'pspa_snf7.aln'} @@ -23,15 +29,25 @@ Default is 'pspa.txt'} only one sequence per lineage. Default is FALSE.} } \description{ +Adding Leaves to an alignment file w/ accessions +Genomic Contexts vs Domain Architectures. + Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. } \details{ +The alignment file would need two columns: 1. accession + +number and 2. alignment. The protein homolog accession to lineage mapping + +file should have + The alignment file would need two columns: 1. accession + number and 2. alignment. The protein homolog accession to lineage mapping + file should have } \note{ +Please refer to the source code if you have alternate + +file formats and/or column names. + Please refer to the source code if you have alternate + file formats and/or column names. } @@ -39,6 +55,9 @@ file formats and/or column names. 
\dontrun{ addLeaves2Alignment("pspa_snf7.aln", "pspa.txt") } +\dontrun{ +addLeaves2Alignment("pspa_snf7.aln", "pspa.txt") +} } \author{ Janani Ravi diff --git a/man/addName.Rd b/man/addName.Rd index e04f9849..6f171456 100644 --- a/man/addName.Rd +++ b/man/addName.Rd @@ -1,9 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{addName} \alias{addName} -\title{Add Name} +\title{addName} \usage{ +addName( + data, + accnum_col = "AccNum", + spec_col = "Species", + lin_col = "Lineage", + lin_sep = ">", + out_col = "Name" +) + addName( data, accnum_col = "AccNum", @@ -28,9 +37,14 @@ addName( Lineage, and AccNum info} } \value{ +Original data with a 'Name' column + Original data with a 'Name' column } \description{ +This function adds a new 'Name' column that is comprised of components from +Kingdom, Phylum, Genus, and species, as well as the accession + This function adds a new 'Name' column that is comprised of components from Kingdom, Phylum, Genus, and species, as well as the accession } diff --git a/man/add_leaves.Rd b/man/add_leaves.Rd deleted file mode 100644 index f1eeed10..00000000 --- a/man/add_leaves.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{add_leaves} -\alias{add_leaves} -\title{Adding Leaves to an alignment file w/ accessions} -\usage{ -add_leaves( - aln_file = "", - lin_file = "data/rawdata_tsv/all_semiclean.txt", - reduced = FALSE -) -} -\arguments{ -\item{aln_file}{Character. Path to file. Input tab-delimited file + -alignment file accnum & alignment. -Default is 'pspa_snf7.aln'} - -\item{lin_file}{Character. Path to file. Protein file with accession + -number to lineage mapping. -Default is 'pspa.txt'} - -\item{reduced}{Boolean. If TRUE, a reduced data frame will be generated with -only one sequence per lineage. 
Default is FALSE.} -} -\description{ -Adding Leaves to an alignment file w/ accessions -Genomic Contexts vs Domain Architectures. -} -\details{ -The alignment file would need two columns: 1. accession + -number and 2. alignment. The protein homolog accession to lineage mapping + -file should have -} -\note{ -Please refer to the source code if you have alternate + -file formats and/or column names. -} -\examples{ -\dontrun{ -add_leaves("pspa_snf7.aln", "pspa.txt") -} -} -\author{ -Janani Ravi -} -\keyword{accnum,} -\keyword{alignment,} -\keyword{leaves,} -\keyword{lineage,} -\keyword{species} diff --git a/man/add_name.Rd b/man/add_name.Rd deleted file mode 100644 index f19139e1..00000000 --- a/man/add_name.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{add_name} -\alias{add_name} -\title{Title} -\usage{ -add_name( - data, - accnum_col = "AccNum", - spec_col = "Species", - lin_col = "Lineage", - lin_sep = ">", - out_col = "Name" -) -} -\arguments{ -\item{data}{Data to add name column to} - -\item{accnum_col}{Column containing accession numbers} - -\item{spec_col}{Column containing species} - -\item{lin_col}{Column containing lineage} - -\item{lin_sep}{Character separating lineage levels} - -\item{out_col}{Column that contains the new 'Name' derived from Species, -Lineage, and AccNum info} -} -\value{ -Original data with a 'Name' column -} -\description{ -This function adds a new 'Name' column that is comprised of components from -Kingdom, Phylum, Genus, and species, as well as the accession -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/alignFasta.Rd b/man/alignFasta.Rd index 21b020cf..02a3026b 100644 --- a/man/alignFasta.Rd +++ b/man/alignFasta.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{alignFasta} \alias{alignFasta} -\title{Perform a Multiple Sequence Alignment on a FASTA file.} +\title{alignFasta} \usage{ 
alignFasta(fasta_file, tool = "Muscle", outpath = NULL) @@ -21,6 +21,8 @@ aligned fasta sequence as a MsaAAMultipleAlignment object aligned fasta sequence as a MsaAAMultipleAlignment object } \description{ +Perform a Multiple Sequence Alignment on a FASTA file. + Perform a Multiple Sequence Alignment on a FASTA file. } \author{ diff --git a/man/convert2TitleCase.Rd b/man/convert2TitleCase.Rd index 84e7fa00..72619285 100644 --- a/man/convert2TitleCase.Rd +++ b/man/convert2TitleCase.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{convert2TitleCase} \alias{convert2TitleCase} \alias{totitle,} @@ -7,6 +7,8 @@ \title{Changing case to 'Title Case'} \usage{ convert2TitleCase(text, delimitter) + +to_titlecase(text, delimitter) } \arguments{ \item{x}{Character vector.} @@ -15,8 +17,13 @@ convert2TitleCase(text, delimitter) } \description{ Translate string to Title Case w/ delimitter. + +Translate string to Title Case w/ delimitter. +Changing case to 'Title Case' } \seealso{ +chartr, toupper, and tolower. + chartr, toupper, and tolower. 
} \author{ diff --git a/man/convertAlignment2FA.Rd b/man/convertAlignment2FA.Rd index d6b4dc56..8e9ceb94 100644 --- a/man/convertAlignment2FA.Rd +++ b/man/convertAlignment2FA.Rd @@ -1,9 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{convertAlignment2FA} \alias{convertAlignment2FA} -\title{Adding Leaves to an alignment file w/ accessions} +\title{convertAlignment2FA} \usage{ +convertAlignment2FA( + aln_file = "", + lin_file = "data/rawdata_tsv/all_semiclean.txt", + fa_outpath = "", + reduced = FALSE +) + convertAlignment2FA( aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", @@ -31,11 +38,18 @@ Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. } \details{ +The alignment file would need two columns: 1. accession + +number and 2. alignment. The protein homolog accession to lineage mapping + +file should have + The alignment file would need two columns: 1. accession + number and 2. alignment. The protein homolog accession to lineage mapping + file should have } \note{ +Please refer to the source code if you have alternate + +file formats and/or column names. + Please refer to the source code if you have alternate + file formats and/or column names. } @@ -44,6 +58,9 @@ file formats and/or column names. 
addLeaves2Alignment("pspa_snf7.aln", "pspa.txt") } +\dontrun{ +convertAlignment2FA("pspa_snf7.aln", "pspa.txt") +} } \author{ Janani Ravi diff --git a/man/convert_aln2fa.Rd b/man/convert_aln2fa.Rd deleted file mode 100644 index 8bebe31d..00000000 --- a/man/convert_aln2fa.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{convert_aln2fa} -\alias{convert_aln2fa} -\title{Adding Leaves to an alignment file w/ accessions} -\usage{ -convert_aln2fa( - aln_file = "", - lin_file = "data/rawdata_tsv/all_semiclean.txt", - fa_outpath = "", - reduced = FALSE -) -} -\arguments{ -\item{aln_file}{Character. Path to file. Input tab-delimited file + -alignment file accnum & alignment. -Default is 'pspa_snf7.aln'} - -\item{lin_file}{Character. Path to file. Protein file with accession + -number to lineage mapping. -Default is 'pspa.txt'} - -\item{fa_outpath}{Character. Path to the written fasta file. -Default is 'NULL'} - -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -Default is 'FALSE'} -} -\description{ -Adding Leaves to an alignment file w/ accessions -} -\details{ -The alignment file would need two columns: 1. accession + -number and 2. alignment. The protein homolog accession to lineage mapping + -file should have -} -\note{ -Please refer to the source code if you have alternate + -file formats and/or column names. 
-} -\examples{ -\dontrun{ -add_leaves("pspa_snf7.aln", "pspa.txt") -} -} -\author{ -Janani Ravi -} -\keyword{accnum,} -\keyword{alignment,} -\keyword{leaves,} -\keyword{lineage,} -\keyword{species} diff --git a/man/createRepresentativeAccNum.Rd b/man/createRepresentativeAccNum.Rd index 3703fe1a..3bd20522 100644 --- a/man/createRepresentativeAccNum.Rd +++ b/man/createRepresentativeAccNum.Rd @@ -1,9 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{createRepresentativeAccNum} \alias{createRepresentativeAccNum} \title{createRepresentativeAccNum} \usage{ +createRepresentativeAccNum( + prot_data, + reduced = "Lineage", + accnum_col = "AccNum" +) + createRepresentativeAccNum( prot_data, reduced = "Lineage", @@ -20,6 +26,8 @@ One accession number will be assigned for each of these observations} \item{accnum_col}{Column from prot_data that contains Accession Numbers} } \description{ +Function to generate a vector of one Accession number per distinct observation from 'reduced' column + Function to generate a vector of one Accession number per distinct observation from 'reduced' column } \author{ diff --git a/man/generateAllAlignments2FA.Rd b/man/generateAllAlignments2FA.Rd index 3bf9938a..8f9d8ffc 100644 --- a/man/generateAllAlignments2FA.Rd +++ b/man/generateAllAlignments2FA.Rd @@ -1,9 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{generateAllAlignments2FA} \alias{generateAllAlignments2FA} -\title{Adding Leaves to an alignment file w/ accessions} +\title{generateAllAlignments2FA} \usage{ +generateAllAlignments2FA( + aln_path = here("data/rawdata_aln/"), + fa_outpath = here("data/alns/"), + lin_file = here("data/rawdata_tsv/all_semiclean.txt"), + reduced = F +) + generateAllAlignments2FA( aln_path = 
here("data/rawdata_aln/"), fa_outpath = here("data/alns/"), @@ -15,28 +22,44 @@ generateAllAlignments2FA( \item{aln_path}{Character. Path to alignment files. Default is 'here("data/rawdata_aln/")'} -\item{fa_outpath}{Character. Path to file. Master protein file with AccNum & lineages. -Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} - -\item{lin_file}{Character. Path to the written fasta file. +\item{fa_outpath}{Character. Path to the written fasta file. Default is 'here("data/alns/")'.} +\item{lin_file}{Character. Path to file. Master protein file with AccNum & lineages. +Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} + \item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. Default is 'FALSE'.} } \description{ +Adding Leaves to an alignment file w/ accessions + +Adding Leaves to all alignment files w/ accessions & DAs? + +Adding Leaves to an alignment file w/ accessions + Adding Leaves to all alignment files w/ accessions & DAs? } \details{ +The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. + The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. } \note{ +Please refer to the source code if you have alternate + file formats and/or column names. + Please refer to the source code if you have alternate + file formats and/or column names. 
} \examples{ \dontrun{ generateAllAlignments2FA() } +\dontrun{ +generateAllAlignments2FA() +} +} +\author{ +Janani Ravi } \keyword{accnum,} \keyword{alignment,} diff --git a/man/generate_all_aln2fa.Rd b/man/generate_all_aln2fa.Rd deleted file mode 100644 index ad6b7136..00000000 --- a/man/generate_all_aln2fa.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{generate_all_aln2fa} -\alias{generate_all_aln2fa} -\title{Adding Leaves to an alignment file w/ accessions} -\usage{ -generate_all_aln2fa( - aln_path = here("data/rawdata_aln/"), - fa_outpath = here("data/alns/"), - lin_file = here("data/rawdata_tsv/all_semiclean.txt"), - reduced = F -) -} -\arguments{ -\item{aln_path}{Character. Path to alignment files. -Default is 'here("data/rawdata_aln/")'} - -\item{fa_outpath}{Character. Path to the written fasta file. -Default is 'here("data/alns/")'.} - -\item{lin_file}{Character. Path to file. Master protein file with AccNum & lineages. -Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} - -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -Default is 'FALSE'.} -} -\description{ -Adding Leaves to all alignment files w/ accessions & DAs? -} -\details{ -The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. -} -\note{ -Please refer to the source code if you have alternate + file formats and/or column names. 
-} -\examples{ -\dontrun{ -generate_all_aln2fa() -} -} -\author{ -Janani Ravi -} -\keyword{accnum,} -\keyword{alignment,} -\keyword{leaves,} -\keyword{lineage,} -\keyword{species} diff --git a/man/mapAcc2Name.Rd b/man/mapAcc2Name.Rd index 0f5d447d..39ecb065 100644 --- a/man/mapAcc2Name.Rd +++ b/man/mapAcc2Name.Rd @@ -1,13 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R \name{mapAcc2Name} \alias{mapAcc2Name} -\title{Default renameFA() replacement function. Maps an accession number to its name} +\title{mapAcc2Name} \usage{ +mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") + mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") } \arguments{ -\item{line}{The line of a fasta file starting with '>'} +\item{line}{The line of a fasta file starting with '>'} \item{acc2name}{Data Table containing a column of accession numbers and a name column} \item{acc_col}{Name of the column containing Accession numbers} \item{name_col}{Name of the column containing the names that the accession numbers are mapped to} } \description{ Default renameFA() replacement function. Maps an accession number to its name + +Default rename_fasta() replacement function. Maps an accession number to its name } diff --git a/man/map_acc2name.Rd b/man/map_acc2name.Rd deleted file mode 100644 index fcdb3023..00000000 --- a/man/map_acc2name.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{map_acc2name} -\alias{map_acc2name} -\title{Default rename_fasta() replacement function. 
Maps an accession number to its name} -\usage{ -map_acc2name(line, acc2name, acc_col = "AccNum", name_col = "Name") -} -\arguments{ -\item{line}{he line of a fasta file starting with '>'} - -\item{acc2name}{Data Table containing a column of accession numbers and a name column} - -\item{acc_col}{Name of the column containing Accession numbers} - -\item{name_col}{Name of the column containing the names that the accession numbers -are mapped to} -} -\description{ -Default rename_fasta() replacement function. Maps an accession number to its name -} diff --git a/man/to_titlecase.Rd b/man/to_titlecase.Rd deleted file mode 100644 index 45139d3b..00000000 --- a/man/to_titlecase.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{to_titlecase} -\alias{to_titlecase} -\alias{totitle,} -\alias{to_title} -\title{To Titlecase} -\usage{ -to_titlecase(text, delimitter) -} -\arguments{ -\item{x}{Character vector.} - -\item{y}{Delimitter. Default is space (" ").} -} -\description{ -Translate string to Title Case w/ delimitter. -Changing case to 'Title Case' -} -\seealso{ -chartr, toupper, and tolower. 
-} -\author{ -Andrie, Janani Ravi -} From 6500e367effd56d9db7bada9505ec42aa3bb8dfa Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Fri, 11 Oct 2024 19:50:52 -0700 Subject: [PATCH 34/61] refactor function names in R/blastWrappers.R --- R/blastWrappers.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 552b1ff6..dc11f589 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -17,7 +17,7 @@ #' @export #' #' @examples -run_deltablast <- function(deltablast_path, db_search_path, +runDeltaBlast <- function(deltablast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_alignments, num_threads = 1) { start <- Sys.time() @@ -54,7 +54,7 @@ run_deltablast <- function(deltablast_path, db_search_path, #' @export #' #' @examples -run_rpsblast <- function(rpsblast_path, db_search_path, +runRPSBlast <- function(rpsblast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_threads = 1) { start <- Sys.time() From e45bb21f97ba1ecc1e7ee5fdaaa69349a6eca0e0 Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Fri, 11 Oct 2024 19:52:32 -0700 Subject: [PATCH 35/61] update .rd files and NAMESPACE --- NAMESPACE | 4 +- man/countbycolumn.Rd | 22 ----------- man/filterbydomains.Rd | 44 --------------------- man/filterbyfrequency.Rd | 22 ----------- man/findparalogs.Rd | 26 ------------ man/{run_deltablast.Rd => runDeltaBlast.Rd} | 6 +-- man/{run_rpsblast.Rd => runRPSBlast.Rd} | 6 +-- man/summarizebylineage.Rd | 25 ------------ man/totalgencontextordomarchcounts.Rd | 42 -------------------- man/words2wordcounts.Rd | 25 ------------ 10 files changed, 8 insertions(+), 214 deletions(-) delete mode 100644 man/countbycolumn.Rd delete mode 100644 man/filterbydomains.Rd delete mode 100644 man/filterbyfrequency.Rd delete mode 100644 man/findparalogs.Rd rename man/{run_deltablast.Rd => runDeltaBlast.Rd} (88%) rename man/{run_rpsblast.Rd => runRPSBlast.Rd} (89%) delete mode 100644 man/summarizebylineage.Rd 
delete mode 100644 man/totalgencontextordomarchcounts.Rd delete mode 100644 man/words2wordcounts.Rd diff --git a/NAMESPACE b/NAMESPACE index 53332439..dbab97b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,8 +85,8 @@ export(rename_fasta) export(replaceQuestionMarks) export(reveql) export(reverse_operon) -export(run_deltablast) -export(run_rpsblast) +export(runDeltaBlast) +export(runRPSBlast) export(selectLongestDuplicate) export(sendJobStatusEmail) export(shortenLineage) diff --git a/man/countbycolumn.Rd b/man/countbycolumn.Rd deleted file mode 100644 index 34fcc3e0..00000000 --- a/man/countbycolumn.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{countByColumn} -\alias{countByColumn} -\title{Count By Column} -\usage{ -countByColumn(prot = prot, column = "DomArch", min.freq = 1) -} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Count By Column -} -\examples{ -\dontrun{ -countByColumn() -} -} diff --git a/man/filterbydomains.Rd b/man/filterbydomains.Rd deleted file mode 100644 index 8c885363..00000000 --- a/man/filterbydomains.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByDomains} -\alias{filterByDomains} -\title{Filter by Domains} -\usage{ -filterByDomains( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) -} -\arguments{ -\item{prot}{Dataframe to filter} - -\item{column}{Column to search for domains in (DomArch column)} - -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} -} -\value{ -Filtered data frame -} -\description{ -filterByDomains filters a data frame by identifying exact 
domain matches -and either keeping or removing rows with the identified domain -} -\note{ -There is no need to make the domains 'regex safe', that will be handled by this function -} -\examples{ -\dontrun{ -filterByDomains() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/filterbyfrequency.Rd b/man/filterbyfrequency.Rd deleted file mode 100644 index d2c5f9cd..00000000 --- a/man/filterbyfrequency.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filterByFrequency} -\alias{filterByFrequency} -\title{Filter Frequency} -\usage{ -filterByFrequency(x, min.freq) -} -\arguments{ -\item{min.freq}{} -} -\value{ -Describe return, in detail -} -\description{ -Filter Frequency -} -\examples{ -\dontrun{ -filterByFrequency() -} -} diff --git a/man/findparalogs.Rd b/man/findparalogs.Rd deleted file mode 100644 index 4b5edbcf..00000000 --- a/man/findparalogs.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{findParalogs} -\alias{findParalogs} -\title{Find Paralogs} -\usage{ -findParalogs(prot) -} -\arguments{ -\item{prot}{A data frame filtered by a Query, containing columns Species and Lineage} -} -\value{ -returns a dataframe containing paralogs and the counts. -} -\description{ -Creates a data frame of paralogs. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. 
-} -\examples{ -\dontrun{ -findParalogs(pspa) -} -} diff --git a/man/run_deltablast.Rd b/man/runDeltaBlast.Rd similarity index 88% rename from man/run_deltablast.Rd rename to man/runDeltaBlast.Rd index 3c934d77..8a32b954 100644 --- a/man/run_deltablast.Rd +++ b/man/runDeltaBlast.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/blastWrappers.R -\name{run_deltablast} -\alias{run_deltablast} +\name{runDeltaBlast} +\alias{runDeltaBlast} \title{Run DELTABLAST to find homologs for proteins of interest} \usage{ -run_deltablast( +runDeltaBlast( deltablast_path, db_search_path, db = "refseq", diff --git a/man/run_rpsblast.Rd b/man/runRPSBlast.Rd similarity index 89% rename from man/run_rpsblast.Rd rename to man/runRPSBlast.Rd index bc4474f1..088254ea 100644 --- a/man/run_rpsblast.Rd +++ b/man/runRPSBlast.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/blastWrappers.R -\name{run_rpsblast} -\alias{run_rpsblast} +\name{runRPSBlast} +\alias{runRPSBlast} \title{Run RPSBLAST to generate domain architectures for proteins of interest} \usage{ -run_rpsblast( +runRPSBlast( rpsblast_path, db_search_path, db = "refseq", diff --git a/man/summarizebylineage.Rd b/man/summarizebylineage.Rd deleted file mode 100644 index 2e445913..00000000 --- a/man/summarizebylineage.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeByLineage} -\alias{summarizeByLineage} -\title{Summarize by Lineage} -\usage{ -summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{query}{} -} -\value{ -Describe return, in detail -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarizeByLineage(query = "all") -} - -} diff --git 
a/man/totalgencontextordomarchcounts.Rd b/man/totalgencontextordomarchcounts.Rd deleted file mode 100644 index f457cb6a..00000000 --- a/man/totalgencontextordomarchcounts.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{totalGenContextOrDomArchCounts} -\alias{totalGenContextOrDomArchCounts} -\title{Total Counts} -\usage{ -totalGenContextOrDomArchCounts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} - -\item{digits}{} -} -\value{ -Define return, in detail -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. 
-} -\examples{ -\dontrun{ -totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wordcounts.Rd b/man/words2wordcounts.Rd deleted file mode 100644 index 7f60f226..00000000 --- a/man/words2wordcounts.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{words2WordCounts} -\alias{words2WordCounts} -\title{Words 2 Word Counts} -\usage{ -words2WordCounts(string) -} -\arguments{ -\item{string}{} -} -\value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency -} -\description{ -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2Words() |> - words2WordCounts() -} - -} From e9460610fb054c1c3109cf728561efe2e6619104 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:09:40 -0600 Subject: [PATCH 36/61] remove outdated .Rd --- man/GCA2lin.Rd | 0 man/acc2lin.Rd | 57 ----------------------------------------------- man/efetch_ipg.Rd | 0 man/ipg2lin.Rd | 0 man/sink.reset.Rd | 0 5 files changed, 57 deletions(-) delete mode 100644 man/GCA2lin.Rd delete mode 100644 man/acc2lin.Rd delete mode 100644 man/efetch_ipg.Rd delete mode 100644 man/ipg2lin.Rd delete mode 100644 man/sink.reset.Rd diff --git a/man/GCA2lin.Rd b/man/GCA2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/acc2lin.Rd b/man/acc2lin.Rd deleted file mode 100644 index d3f2468b..00000000 --- a/man/acc2lin.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/acc2lin.R, R/lineage.R -\name{acc2lin} -\alias{acc2lin} -\title{acc2lin} -\usage{ -acc2lin( - accessions, - assembly_path, - lineagelookup_path, - ipgout_path = NULL, - plan = "multicore" -) - -acc2lin( - accessions, - assembly_path, - lineagelookup_path, - ipgout_path = NULL, - plan = "multicore" -) -} -\arguments{ 
-\item{accessions}{Character vector of protein accessions} - -\item{assembly_path}{String of the path to the assembly_summary path -This file can be generated using the "DownloadAssemblySummary()" function} - -\item{lineagelookup_path}{String of the path to the lineage lookup file -(taxid to lineage mapping). This file can be generated using the} - -\item{ipgout_path}{Path to write the results of the efetch run of the accessions -on the ipg database. If NULL, the file will not be written. Defaults to NULL} - -\item{plan}{} -} -\value{ -Describe return, in detail -} -\description{ -This function combines 'efetch_ipg()' -and 'ipg2lin()' to map a set -of protein accessions to their assembly (GCA_ID), tax ID, and lineage. - -Function to map protein accession numbers to lineage - -This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set -of protein accessions to their assembly (GCA_ID), tax ID, and lineage. -} -\examples{ -\dontrun{ -acc2lin() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/efetch_ipg.Rd b/man/efetch_ipg.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/sink.reset.Rd b/man/sink.reset.Rd deleted file mode 100644 index e69de29b..00000000 From 9571333c44ac879d9b2b6bc1a38d454fdda69a39 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:10 -0600 Subject: [PATCH 37/61] let R sort NAMESPACE --- NAMESPACE | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 60bec5b1..c448ff13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,9 +11,7 @@ export(addLeaves2Alignment) export(addLineage) export(addName) export(addTaxID) -export(advanced_opts2est_walltime) export(alignFasta) -export(assert_count_df) export(assignJobQueue) export(calculateEstimatedWallTimeFromOpts) export(calculateProcessRuntime) @@ -35,9 +33,9 @@ export(countByColumn) export(createFA2Tree) 
export(createJobResultsURL) export(createJobStatusEmailMessage) +export(createLineageLookup) export(createRepresentativeAccNum) export(createWordCloud2Element) -export(createLineageLookup) export(createWordCloudElement) export(domain_network) export(downloadAssemblySummary) @@ -50,14 +48,14 @@ export(formatJobArgumentsHTML) export(gc_undirected_network) export(generateAllAlignments2FA) export(generate_msa) -export(getProcessRuntimeWeights) export(getAccNumFromFA) +export(getProcessRuntimeWeights) export(getTopAccByLinDomArch) export(mapAcc2Name) export(mapAdvOption2Process) export(mapOption2Process) -export(map_acc2name) export(msa_pdf) +export(plotEstimatedWallTimes) export(plotIPR2Viz) export(plotIPR2VizWeb) export(plotLineageDA) @@ -70,12 +68,10 @@ export(plotStackedLineage) export(plotSunburst) export(plotTreemap) export(plotUpSet) -export(plotEstimatedWallTimes) export(prepareColumnParams) export(prepareSingleColumnParams) export(proteinAcc2TaxID) export(proteinAcc2TaxID_old) -export(prot2tax_old) export(removeAsterisks) export(removeEmptyRows) export(removeTails) From 8c573693b92f2aa216b269e24244d2d63fe0d3a9 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:26 -0600 Subject: [PATCH 38/61] regen new .Rd --- man/GCA2Lineage.Rd | 2 +- man/IPG2Lineage.Rd | 5 +++-- man/efetchIPG.Rd | 3 ++- man/sinkReset.Rd | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/man/GCA2Lineage.Rd b/man/GCA2Lineage.Rd index 9ec0ce56..9a2a7a30 100644 --- a/man/GCA2Lineage.Rd +++ b/man/GCA2Lineage.Rd @@ -19,7 +19,7 @@ This file can be generated using the "downloadAssemblySummary()" function} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). 
This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{acc_col}{} } diff --git a/man/IPG2Lineage.Rd b/man/IPG2Lineage.Rd index 282d5cbf..118812ab 100644 --- a/man/IPG2Lineage.Rd +++ b/man/IPG2Lineage.Rd @@ -29,7 +29,7 @@ file} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{assembly_path}{String of the path to the assembly_summary path This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function} @@ -39,7 +39,8 @@ A \code{data.table} with the lineage information for the provided protein accessions. } \description{ -Takes the resulting file of an efetch run on the ipg database and +Takes the resulting file +of an efetch run on the ipg database and Takes the resulting file of an efetch run on the ipg database and append lineage, and taxid columns diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index 047e2652..db63024f 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -23,7 +23,8 @@ the ipg database} No return value. The function writes the fetched results to \code{out_path}. } \description{ -Perform efetch on the ipg database and write the results to out_path +Perform efetch on the ipg database +and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index 0285c0b2..e3fc7ce4 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,6 +8,7 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s +and handles any errors or warnings that occur during the process. 
} \description{ Sink Reset From 2061d7a24b7a699bfeac72270817ae7225365ffa Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:48 -0600 Subject: [PATCH 39/61] remove old tryCatch code (for now) --- R/acc2lin.R | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 42315ece..a0a95033 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -72,14 +72,6 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) return(merged) - }, error = function(e) { - print(paste("Error: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("addLineages function execution completed.") - }) - } @@ -247,13 +239,6 @@ IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, lins <- lins[!is.na(Lineage)] %>% unique() return(lins) - }, error = function(e) { - print(paste("An error occurred: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("ipg2lin function execution completed.") - }) } From 48b7fd697b6c6cac7826ae3f09d315025db1a438 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Sun, 13 Oct 2024 18:02:36 +0100 Subject: [PATCH 40/61] Update error handling to use rlang functions in acc2lin.R file - Replaced base R error handling with rlang functions: `abort()`, `warn()`, and `inform()`. - Improved clarity and consistency in error and warning messages. - Enhanced robustness with detailed context for errors and warnings. 
--- R/acc2lin.R | 209 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 141 insertions(+), 68 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 08cb7d76..bd5cc289 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -5,6 +5,7 @@ # suppressPackageStartupMessages(library(data.table)) # suppressPackageStartupMessages(library(tidyverse)) # suppressPackageStartupMessages(library(biomartr)) +suppressPackageStartupMessages(library(rlang)) # https://stackoverflow.com/questions/18730491/sink-does-not-release-file #' Sink Reset @@ -24,13 +25,18 @@ sinkReset <- function() { for (i in seq_len(sink.number())) { sink(NULL) } - print("All sinks closed") + inform("All sinks closed", class = "sink_reset_info") }, error = function(e) { - print(paste("Error: ", e$message)) + abort(paste("Error: ", e$message), class = "sink_reset_error") }, warning = function(w) { - print(paste("Warning: ", w$message)) + warn(paste("Warning: ", w$message), class = "sink_reset_warning") }, finally = { - print("resetSink function execution completed.") + # If any additional cleanup is needed, it can be done here + if (sink.number() > 0) { + # Additional cleanup if sinks are still open + inform("Some sinks remain open, ensure proper cleanup.", + class = "sink_cleanup_warning") + } }) } @@ -56,60 +62,64 @@ sinkReset <- function() { #' addLineage() #' } addLineage <- function(df, acc_col = "AccNum", assembly_path, - lineagelookup_path, ipgout_path = NULL, - plan = "sequential", ...) { + lineagelookup_path, ipgout_path = NULL, + plan = "sequential", ...) 
{ # check for validate inputs if (!is.data.frame(df)) { - stop("Input 'df' must be a data frame.") + abort("Input 'df' must be a data frame.", class = "input_error") } if (!acc_col %in% colnames(df)) { - stop(paste("Column", acc_col, "not found in data frame.")) + abort(paste("Column", acc_col, + "not found in data frame."), class = "column_error") } # Ensure paths are character strings if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { - stop("Both 'assembly_path' and - 'lineagelookup_path' must be character strings.") + abort("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.", + class = "path_type_error") } # Ensure paths exist if (!file.exists(assembly_path)) { - stop(paste("Assembly file not found at:", assembly_path)) + abort(paste("Assembly file not found at:", + assembly_path), class = "file_not_found_error") } if (!file.exists(lineagelookup_path)) { - stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + abort(paste("Lineage lookup file not found at:", + lineagelookup_path), class = "file_not_found_error") } - tryCatch({ - # Attempt to add lineages - acc_col <- sym(acc_col) - accessions <- df %>% pull(acc_col) - lins <- acc2Lineage( - accessions, assembly_path, lineagelookup_path, ipgout_path, plan - ) - - # Drop a lot of the unimportant columns for now? 
- # will make merging much easier - lins <- lins[, c( - "Strand", "Start", "Stop", "Nucleotide Accession", "Source", - "Id", "Strain" - ) := NULL] - lins <- unique(lins) - - # dup <- lins %>% group_by(Protein) %>% - # summarize(count = n()) %>% filter(count > 1) %>% - # pull(Protein) - - merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) - return(merged) - }, error = function(e) { - print(paste("Error: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("addLineages function execution completed.") - }) + tryCatch({ + # Attempt to add lineages + acc_col <- sym(acc_col) + accessions <- df %>% pull(acc_col) + lins <- acc2Lineage( + accessions, assembly_path, lineagelookup_path, ipgout_path, plan + ) + + # Drop a lot of the unimportant columns for now? + # will make merging much easier + lins <- lins[, c( + "Strand", "Start", "Stop", "Nucleotide Accession", "Source", + "Id", "Strain" + ) := NULL] + lins <- unique(lins) + + # dup <- lins %>% group_by(Protein) %>% + # summarize(count = n()) %>% filter(count > 1) %>% + # pull(Protein) + + merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) + return(merged) + }, error = function(e) { + abort(paste("Error during lineage addition:", e$message), + class = "lineage_addition_error") + }, warning = function(w) { + warn(paste("Warning during lineage addition:", w$message), + class = "lineage_addition_warning") + }) } @@ -140,11 +150,11 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' acc2Lineage() #' } acc2Lineage <- function(accessions, assembly_path, - lineagelookup_path, ipgout_path = NULL, - plan = "sequential", ...) { + lineagelookup_path, ipgout_path = NULL, + plan = "sequential", ...) 
{ tmp_ipg <- F if (is.null(ipgout_path)) { - tmp_ipg <- T + tmp_ipg <- TRUE ipgout_path <- tempfile("ipg", fileext = ".txt") } @@ -154,18 +164,41 @@ acc2Lineage <- function(accessions, assembly_path, efetchIPG(accessions, out_path = ipgout_path, plan) # Attempt to process IPG to lineages - lins <- IPG2Lineage(accessions, ipgout_path, assembly_path, lineagelookup_path) + lins <- IPG2Lineage(accessions, ipgout_path, + assembly_path, lineagelookup_path) }, error = function(e) { - print(paste("An error occurred: ", e$message)) + abort( + message = paste("An error occurred during IPG fetching + or lineage processing:", e$message), + class = "lineage_processing_error", + # capturing the call stack + call = sys.call(), + # adding additional context + accessions = accessions, + assembly_path = assembly_path, + lineagelookup_path = lineagelookup_path, + ipgout_path = ipgout_path, + plan = plan + ) }, warning = function(w) { - print(paste("Warning: ", w$message)) + warn( + message = paste("Warning during IPG fetching + or lineage processing:", w$message), + class = "lineage_processing_warning", + call = sys.call(), # capturing the call stack + accessions = accessions, + assembly_path = assembly_path, + lineagelookup_path = lineagelookup_path, + ipgout_path = ipgout_path, + plan = plan + ) }, finally = { - print("acc2lin function execution completed.") + # Cleanup: delete temporary IPG file if it was created + if (tmp_ipg && file.exists(ipgout_path)) { + unlink(ipgout_path) + } }) - if (tmp_ipg) { - unlink(tempdir(), recursive = T) - } return(lins) } @@ -196,15 +229,18 @@ acc2Lineage <- function(accessions, assembly_path, efetchIPG <- function(accnums, out_path, plan = "sequential", ...) 
{ # Argument validation if (!is.character(accnums) || length(accnums) == 0) { - stop("Error: 'accnums' must be a non-empty character vector.") + abort("Error: 'accnums' must be a non-empty character vector.", + class = "validation_error") } if (!is.character(out_path) || nchar(out_path) == 0) { - stop("Error: 'out_path' must be a non-empty string.") + abort("Error: 'out_path' must be a non-empty string.", + class = "validation_error") } if (!is.function(plan)) { - stop("Error: 'plan' must be a valid plan function.") + abort("Error: 'plan' must be a valid plan function.", + class = "validation_error") } if (length(accnums) > 0) { partition <- function(in_data, groups) { @@ -249,11 +285,26 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { }) sink(NULL) }, error = function(e) { - print(paste("An error occurred: ", e$message)) + abort( + message = paste("An error occurred: ", e$message), + class = "fetch_error", + call = sys.call(), + accnums = accnums, + out_path = out_path, + plan = plan + ) }, warning = function(w) { - print(paste("Warning: ", w$message)) + warn( + message = paste("Warning: ", w$message), + class = "fetch_warning", + call = sys.call(), + accnums = accnums, + out_path = out_path, + plan = plan + ) }, finally = { - print("efetch_ipg function execution completed.") + # Ensure the sink is closed in case of errors + if (sink.number() > 0) sink(NULL) }) } } @@ -289,31 +340,38 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { #' IPG2Lineage() #' } #' -IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, ...) { +IPG2Lineage <- function(accessions, ipg_file, + assembly_path, lineagelookup_path, ...) 
{ # Argument validation for accessions if (!is.character(accessions) || length(accessions) == 0) { - stop("Input 'accessions' must be a non-empty character vector.") + abort("Input 'accessions' must be a non-empty + character vector.", class = "validation_error") } # check for validate inputs if (!is.character(ipg_file)) { - stop("Input 'ipg_file' must be a character string.") + abort("Input 'ipg_file' must be a + character string.", class = "validation_error") } + # Ensure paths are character strings if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { - stop("Both 'assembly_path' and - 'lineagelookup_path' must be character strings.") + abort("Both 'assembly_path' and 'lineagelookup_path' + must be character strings.", class = "validation_error") } # Ensure paths exist if (!file.exists(assembly_path)) { - stop(paste("Assembly file not found at:", assembly_path)) + abort(paste("Assembly file not found at:", assembly_path), + class = "file_error") } if (!file.exists(lineagelookup_path)) { - stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + abort(paste("Lineage lookup file not found at:", lineagelookup_path), + class = "file_error") } + # Process the IPG file try({ # Attempt to read the IPG file ipg_dt <- fread(ipg_file, sep = "\t", fill = T) @@ -332,12 +390,27 @@ IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, return(lins) }, error = function(e) { - print(paste("An error occurred: ", e$message)) + abort( + message = paste("An error occurred: ", e$message), + class = "processing_error", + call = sys.call(), + accessions = accessions, + ipg_file = ipg_file, + assembly_path = assembly_path, + lineagelookup_path = lineagelookup_path + ) }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("ipg2lin function execution completed.") + warn( + message = paste("Warning: ", w$message), + class = "processing_warning", + call = sys.call(), + accessions = accessions, + 
ipg_file = ipg_file, + assembly_path = assembly_path, + lineagelookup_path = lineagelookup_path + ) }) + } From 70f0de8c57d610eaad122e59d4bf1e96fc455963 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sun, 13 Oct 2024 19:21:41 -0600 Subject: [PATCH 41/61] remove code not relevant to PR --- R/acc2lin.R | 50 +++--- R/assign_job_queue.R | 359 +++++++++++++------------------------------ R/blastWrappers.R | 105 +++---------- 3 files changed, 153 insertions(+), 361 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index a0a95033..61aae87c 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -157,40 +157,34 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { return(partitioned) } - tryCatch({ - # Set the future plan strategy - plan(strategy = plan, .skip = T) + # Set the future plan strategy + plan(strategy = plan, .skip = T) - min_groups <- length(accnums) / 200 - groups <- min(max(min_groups, 15), length(accnums)) - partitioned_acc <- partition(accnums, groups) - # Open the sink to the output path - sink(out_path) + min_groups <- length(accnums) / 200 + groups <- min(max(min_groups, 15), length(accnums)) + partitioned_acc <- partition(accnums, groups) - a <- future_map(1:length(partitioned_acc), function(x) { - # Avoid hitting the rate API limit - if (x %% 9 == 0) { - Sys.sleep(1) - } - cat( - entrez_fetch( - id = partitioned_acc[[x]], - db = "ipg", - rettype = "xml", - api_key = "YOUR_KEY_HERE" ## Can this be included in public package? - ) + # Open the sink to the output path + sink(out_path) + + a <- future_map(1:length(partitioned_acc), function(x) { + # Avoid hitting the rate API limit + if (x %% 9 == 0) { + Sys.sleep(1) + } + cat( + entrez_fetch( + id = partitioned_acc[[x]], + db = "ipg", + rettype = "xml", + api_key = "YOUR_KEY_HERE" ## Can this be included in public package? 
) - }) - sink(NULL) - }, error = function(e) { - print(paste("An error occurred: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("efetch_ipg function execution completed.") + ) }) + sink(NULL) + } } diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 10df1e3a..4791b4a1 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -13,22 +13,13 @@ common_root <- Sys.getenv("COMMON_SRC_ROOT") #' example: list_opts2procs <- mapOption2Process #' @export mapOption2Process <- function() { - tryCatch({ - opts2processes <- list( - "homology_search" = c("dblast", "dblast_cleanup"), - "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), - # processes always present agnostic of advanced options - "always" = c("blast_clust", "clust2table") - ) - return(opts2processes) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("mapOption2Process function execution completed.") - }) - + opts2processes <- list( + "homology_search" = c("dblast", "dblast_cleanup"), + "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), + # processes always present agnostic of advanced options + "always" = c("blast_clust", "clust2table") + ) + return(opts2processes) } #' Use MolEvolvR advanced options to get associated processes @@ -43,26 +34,14 @@ mapOption2Process <- function() { #' procs <- mapAdvOption2Process(advanced_opts) #' @export mapAdvOption2Process <- function(advanced_opts) { - if (!is.character(advanced_opts)) { - stop("Argument must be a character vector!") - } - tryCatch({ - # append 'always' to add procs that always run - advanced_opts <- c(advanced_opts, "always") - opts2proc <- mapOption2Process() - # setup index for opts2proc based on advanced options - idx <- which(names(opts2proc) %in% advanced_opts) - # extract processes that will run - procs <- opts2proc[idx] |> 
unlist() - return(procs) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("mapOption2Process function execution completed.") - }) - + # append 'always' to add procs that always run + advanced_opts <- c(advanced_opts, "always") + opts2proc <- mapOption2Process() + # setup index for opts2proc based on advanced options + idx <- which(names(opts2proc) %in% advanced_opts) + # extract processes that will run + procs <- opts2proc[idx] |> unlist() + return(procs) } #' Scrape MolEvolvR logs and calculate median processes @@ -88,60 +67,41 @@ mapAdvOption2Process <- function(advanced_opts) { #' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' @export calculateProcessRuntime <- function(dir_job_results) { - tryCatch({ - # Check if dir_job_results is a character string - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } + source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) - # Check if dir_job_results exists - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } + # aggregate logs from + path_log_data <- file.path(common_root, + "molevol_scripts", "log_data", "prod_logs.rda") - source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) - - # aggregate logs from - path_log_data <- file.path(common_root, - "molevol_scripts", "log_data", "prod_logs.rda") - - # ensure the folder exists to the location - if (!dir.exists(path_log_data)) { - dir.create(dirname(path_log_data), - recursive = TRUE, showWarnings = FALSE) - } - - # attempt to load pre-generated logdata - if (!file.exists(path_log_data)) { - logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) - save(logs, file = path_log_data) - } else { - load(path_log_data) # loads the logs object - } - 
df_log <- logs$df_log - procs <- c( - "dblast", "dblast_cleanup", "iprscan", - "ipr2lineage", "ipr2da", "blast_clust", - "clust2table" - ) - list_proc_medians <- df_log |> - dplyr::select(dplyr::all_of(procs)) |> - dplyr::summarise( - dplyr::across( - dplyr::everything(), - \(x) median(x, na.rm = TRUE) - ) - ) |> - as.list() - return(list_proc_medians) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("calculateProcessRuntime function execution completed.") - }) + # ensure the folder exists to the location + if (!dir.exists(path_log_data)) { + dir.create(dirname(path_log_data), + recursive = TRUE, showWarnings = FALSE) + } + # attempt to load pre-generated logdata + if (!file.exists(path_log_data)) { + logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) + save(logs, file = path_log_data) + } else { + load(path_log_data) # loads the logs object + } + df_log <- logs$df_log + procs <- c( + "dblast", "dblast_cleanup", "iprscan", + "ipr2lineage", "ipr2da", "blast_clust", + "clust2table" + ) + list_proc_medians <- df_log |> + dplyr::select(dplyr::all_of(procs)) |> + dplyr::summarise( + dplyr::across( + dplyr::everything(), + \(x) median(x, na.rm = TRUE) + ) + ) |> + as.list() + return(list_proc_medians) } #' Write a table of 2 columns: 1) process and 2) median seconds @@ -162,39 +122,18 @@ calculateProcessRuntime <- function(dir_job_results) { #' ) #' @export writeProcessRuntime2TSV <- function(dir_job_results, filepath) { - tryCatch({ - # Error handling for input arguments - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } - - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } - - if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single 
character string.") - } - df_proc_medians <- calculateProcessRuntime(dir_job_results) |> - tibble::as_tibble() |> - tidyr::pivot_longer( - dplyr::everything(), - names_to = "process", - values_to = "median_seconds" - ) |> - dplyr::arrange(dplyr::desc(median_seconds)) - - # Write the resulting tibble to a TSV file - readr::write_tsv(df_proc_medians, file = filepath) - return(df_proc_medians) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("writeProcessRuntime2TSV function execution completed.") - }) - + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + tibble::as_tibble() |> + tidyr::pivot_longer( + dplyr::everything(), + names_to = "process", + values_to = "median_seconds" + ) |> + dplyr::arrange(dplyr::desc(median_seconds)) + + # Write the resulting tibble to a TSV file + readr::write_tsv(df_proc_medians, file = filepath) + return(df_proc_medians) } #' Compute median process runtimes, then write a YAML list of the processes and @@ -219,36 +158,8 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' } #' @export writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { - tryCatch({ - # Error handling for dir_job_results arguments - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } - - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } - if (is.null(filepath)) { - filepath <- file.path(common_root, - "molevol_scripts", - "log_data", - "job_proc_weights.yml") - } - if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single character string.") - } - - medians <- calculateProcessRuntime(dir_job_results) - yaml::write_yaml(medians, filepath) - }, error = function(e) { - message(paste("Encountered an error: "), 
e$message) - }, warning = function(w) { - message(paste("Warning: "), w$message) - }, finally = { - message("writeProcessRuntime2TSV function execution completed.") - } - ) - + medians <- calculateProcessRuntime(dir_job_results) + yaml::write_yaml(medians, filepath) } #' Quickly get the runtime weights for MolEvolvR backend processes @@ -322,81 +233,49 @@ calculateEstimatedWallTimeFromOpts <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { - - tryCatch({ - # to calculate est walltime for a homology search job, the number of hits - # must be provided - validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts - stopifnot(!validation_fail) - - # Validate advanced_opts - if (!is.character(advanced_opts)) { - stop("Argument 'advanced_opts' must be a character vector.") - } - - # Validate n_inputs - if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { - stop("Argument 'n_inputs' must be a single positive numeric value.") - } - - # Validate n_hits if homology_search is in advanced_opts - if ("homology_search" %in% advanced_opts && - (is.null(n_hits)|| !is.numeric(n_hits) - || length(n_hits) != 1 || n_hits < 0)) { - stop("Argument 'n_hits' must be a single non-negative numeric value when - 'homology_search' is in 'advanced_opts'.") - } - - # Get process weights - proc_weights <- writeProcessRuntime2YML() - if (!is.list(proc_weights)) { - stop("Process weights could not be retrieved correctly.") - } - - # sort process weights by names and convert to vec - proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() - all_procs <- names(proc_weights) |> sort() - # get processes from advanced options and sort by names - procs_from_opts <- mapAdvOption2Process(advanced_opts) - procs_from_opts <- sort(procs_from_opts) - # binary encode: yes proc will run (1); else 0 - binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) - # dot product of weights and procs to run; scaled by the number of 
inputs - est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> - as.numeric() - # calculate the additional processes to run for the homologous hits - if ("homology_search" %in% advanced_opts) { - opts2procs <- mapOption2Process() - # exclude the homology search processes for the homologous hits - procs2exclude_for_homologs <- opts2procs[["homology_search"]] - procs_homologs <- procs_from_opts[!(procs_from_opts - %in% procs2exclude_for_homologs)] - binary_proc_vec_homolog <- dplyr::if_else(all_procs - %in% procs_homologs, 1L, 0L) - # add the estimated walltime for processes run on the homologous hits - est_walltime <- est_walltime + - (n_hits * (binary_proc_vec_homolog - %*% proc_weights) |> as.numeric()) - } - if (verbose) { - msg <- stringr::str_glue( - "warnings from calculateEstimatedWallTimeFromOpts ():\n", - "\tn_inputs={n_inputs}\n", - "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", - "\test_walltime={est_walltime}\n\n" - ) - cat(file = stderr(), msg) - } - return(est_walltime) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("calculateEstimatedWallTimeFromOpts - function execution completed.") - }) - + # to calculate est walltime for a homology search job, the number of hits + # must be provided + validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts + stopifnot(!validation_fail) + + # Get process weights + proc_weights <- writeProcessRuntime2YML() + + # sort process weights by names and convert to vec + proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() + all_procs <- names(proc_weights) |> sort() + # get processes from advanced options and sort by names + procs_from_opts <- mapAdvOption2Process(advanced_opts) + procs_from_opts <- sort(procs_from_opts) + # binary encode: yes proc will run (1); else 0 + binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 
0L) + # dot product of weights and procs to run; scaled by the number of inputs + est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> + as.numeric() + # calculate the additional processes to run for the homologous hits + if ("homology_search" %in% advanced_opts) { + opts2procs <- mapOption2Process() + # exclude the homology search processes for the homologous hits + procs2exclude_for_homologs <- opts2procs[["homology_search"]] + procs_homologs <- procs_from_opts[!(procs_from_opts + %in% procs2exclude_for_homologs)] + binary_proc_vec_homolog <- dplyr::if_else(all_procs + %in% procs_homologs, 1L, 0L) + # add the estimated walltime for processes run on the homologous hits + est_walltime <- est_walltime + + (n_hits * (binary_proc_vec_homolog + %*% proc_weights) |> as.numeric()) + } + if (verbose) { + msg <- stringr::str_glue( + "warnings from calculateEstimatedWallTimeFromOpts ():\n", + "\tn_inputs={n_inputs}\n", + "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", + "\test_walltime={est_walltime}\n\n" + ) + cat(file = stderr(), msg) + } + return(est_walltime) } @@ -418,25 +297,8 @@ assignJobQueue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { - tryCatch({ - if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { - stop("Argument 't_sec_estimate' must be a single numeric value.") - } - - if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { - stop("Argument 't_cutoff' must be a single non-negative numeric value.") - } - - queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") - return(queue) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("assignJobQueue function execution completed.") - }) - + queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") + return(queue) } #' Plot the estimated runtimes for different advanced options and number @@ -456,7 +318,6 @@ assignJobQueue 
<- function( #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export plotEstimatedWallTimes <- function() { - tryCatch({ opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { @@ -536,12 +397,4 @@ plotEstimatedWallTimes <- function() { y = "Estimated walltime (hours)" ) return(p) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("plotEstimatedWallTimes function execution completed.") - }) - } diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 15484a1b..9b55f3ee 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -21,52 +21,24 @@ run_deltablast <- function(deltablast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_alignments, num_threads = 1) { - # Argument validation - if (!file.exists(deltablast_path)) { - stop("The DELTABLAST executable path is invalid: ", deltablast_path) - } - if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) - } - if (!file.exists(query)) { - stop("The query file path is invalid: ", query) - } - if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) - } - if (!is.numeric(num_alignments) || num_alignments <= 0) { - stop("The number of alignments must be a - positive integer: ", num_alignments) - } - if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) - } - start <- Sys.time() - tryCatch({ - system(paste0("export BLASTDB=/", db_search_path)) - system2( - command = deltablast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads, - "-num_alignments", num_alignments - # ,"-outfmt", outfmt - ) + system(paste0("export BLASTDB=/", db_search_path)) + + 
system2( + command = deltablast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads, + "-num_alignments", num_alignments + # ,"-outfmt", outfmt ) - print(Sys.time() - start) - }, error = function(e) { - message(paste("Error in run_deltablast: ", e)) - }, warning = function(w) { - message(paste("Warning in run_deltablast: ", w)) - }, finally = { - message("run_deltablast completed") - }) + ) + print(Sys.time() - start) } @@ -88,46 +60,19 @@ run_deltablast <- function(deltablast_path, db_search_path, run_rpsblast <- function(rpsblast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_threads = 1) { - # Argument validation - if (!file.exists(rpsblast_path)) { - stop("The RPSBLAST executable path is invalid: ", rpsblast_path) - } - if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) - } - if (!file.exists(query)) { - stop("The query file path is invalid: ", query) - } - if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) - } - if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) - } start <- Sys.time() + system(paste0("export BLASTDB=/", db_search_path)) - tryCatch({ - - system(paste0("export BLASTDB=/", db_search_path)) - - system2( - command = rpsblast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads - ) + system2( + command = rpsblast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads ) - print(Sys.time() - start) - }, error = function(e) { - message(paste("Error in run_rpsblast: ", e)) - }, warning = function(w) { - message(paste("Warning in run_rpsblast: ", w)) - }, finally = { - message("run_rpsblast completed") - }) - + ) + print(Sys.time() - start) } From 
392775de92dfc33b198b41a5a2843f5313dd2e0d Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sun, 13 Oct 2024 19:43:58 -0600 Subject: [PATCH 42/61] adjust .Rd title tags for renamed functions --- R/assign_job_queue.R | 27 +++++++++++++++++++++++ R/create_lineage_lookup.R | 3 +++ man/assignJobQueue.Rd | 2 +- man/calculateEstimatedWallTimeFromOpts.Rd | 3 +-- man/calculateProcessRuntime.Rd | 2 +- man/createLineageLookup.Rd | 2 +- man/getProcessRuntimeWeights.Rd | 2 +- man/mapAdvOption2Process.Rd | 2 +- man/mapOption2Process.Rd | 2 +- man/plotEstimatedWallTimes.Rd | 6 +++-- man/writeProcessRuntime2TSV.Rd | 2 +- man/writeProcessRuntime2YML.Rd | 6 +++-- 12 files changed, 46 insertions(+), 13 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 4791b4a1..20ba841f 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -6,6 +6,9 @@ # file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") +#' mapOption2Process +#' +#' @description #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes @@ -22,6 +25,9 @@ mapOption2Process <- function() { return(opts2processes) } +#' mapAdvOption2Process +#' +#' @description #' Use MolEvolvR advanced options to get associated processes #' #' @param advanced_opts character vector of MolEvolvR advanced options @@ -44,6 +50,9 @@ mapAdvOption2Process <- function(advanced_opts) { return(procs) } +#' calculateProcessRuntime +#' +#' @description #' Scrape MolEvolvR logs and calculate median processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -104,6 +113,9 @@ calculateProcessRuntime <- function(dir_job_results) { return(list_proc_medians) } +#' writeProcessRuntime2TSV +#' +#' @description #' Write a table of 2 columns: 1) process and 2) median seconds #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -136,6 +148,9 @@ 
writeProcessRuntime2TSV <- function(dir_job_results, filepath) { return(df_proc_medians) } +#' writeProcessRuntime2YML +#' +#' @description #' Compute median process runtimes, then write a YAML list of the processes and #' their median runtimes in seconds to the path specified by 'filepath'. #' @@ -162,6 +177,9 @@ writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { yaml::write_yaml(medians, filepath) } +#' getProcessRuntimeWeights +#' +#' @description #' Quickly get the runtime weights for MolEvolvR backend processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -213,6 +231,9 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { return(proc_weights) } +#' calculateEstimatedWallTimeFromOpts +#' +#' @description #' Given MolEvolvR advanced options and number of inputs, #' calculate the total estimated walltime for the job #' @@ -279,6 +300,9 @@ calculateEstimatedWallTimeFromOpts <- function(advanced_opts, } +#' assignJobQueue +#' +#' @description #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process @@ -301,6 +325,9 @@ assignJobQueue <- function( return(queue) } +#' plotEstimatedWallTimes +#' +#' @description #' Plot the estimated runtimes for different advanced options and number #' of inputs #' diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index 78e79048..2408c5e6 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -3,6 +3,9 @@ # library(biomartr) +#' createLineageLookup +#' +#' @description #' Create a look up table that goes from TaxID, to Lineage #' #' @author Samuel Chen diff --git a/man/assignJobQueue.Rd b/man/assignJobQueue.Rd index 3663ce56..de646a82 100644 --- a/man/assignJobQueue.Rd +++ b/man/assignJobQueue.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{assignJobQueue} \alias{assignJobQueue} -\title{Decision function to assign job queue} +\title{assignJobQueue} \usage{ 
assignJobQueue(t_sec_estimate, t_cutoff = 21600) } diff --git a/man/calculateEstimatedWallTimeFromOpts.Rd b/man/calculateEstimatedWallTimeFromOpts.Rd index c09cf6a6..d5361001 100644 --- a/man/calculateEstimatedWallTimeFromOpts.Rd +++ b/man/calculateEstimatedWallTimeFromOpts.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{calculateEstimatedWallTimeFromOpts} \alias{calculateEstimatedWallTimeFromOpts} -\title{Given MolEvolvR advanced options and number of inputs, -calculate the total estimated walltime for the job} +\title{calculateEstimatedWallTimeFromOpts} \usage{ calculateEstimatedWallTimeFromOpts( advanced_opts, diff --git a/man/calculateProcessRuntime.Rd b/man/calculateProcessRuntime.Rd index bb6dd1ed..579ea2b6 100644 --- a/man/calculateProcessRuntime.Rd +++ b/man/calculateProcessRuntime.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{calculateProcessRuntime} \alias{calculateProcessRuntime} -\title{Scrape MolEvolvR logs and calculate median processes} +\title{calculateProcessRuntime} \usage{ calculateProcessRuntime(dir_job_results) } diff --git a/man/createLineageLookup.Rd b/man/createLineageLookup.Rd index 5dbab978..132019ce 100644 --- a/man/createLineageLookup.Rd +++ b/man/createLineageLookup.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/create_lineage_lookup.R \name{createLineageLookup} \alias{createLineageLookup} -\title{Create a look up table that goes from TaxID, to Lineage} +\title{createLineageLookup} \usage{ createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), diff --git a/man/getProcessRuntimeWeights.Rd b/man/getProcessRuntimeWeights.Rd index ff3c8e5d..de0e2ea6 100644 --- a/man/getProcessRuntimeWeights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{getProcessRuntimeWeights} \alias{getProcessRuntimeWeights} -\title{Quickly get the runtime weights for MolEvolvR backend processes} 
+\title{getProcessRuntimeWeights} \usage{ getProcessRuntimeWeights(medians_yml_path = NULL) } diff --git a/man/mapAdvOption2Process.Rd b/man/mapAdvOption2Process.Rd index 5bd9ee65..6a210a20 100644 --- a/man/mapAdvOption2Process.Rd +++ b/man/mapAdvOption2Process.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{mapAdvOption2Process} \alias{mapAdvOption2Process} -\title{Use MolEvolvR advanced options to get associated processes} +\title{mapAdvOption2Process} \usage{ mapAdvOption2Process(advanced_opts) } diff --git a/man/mapOption2Process.Rd b/man/mapOption2Process.Rd index ff6905c5..9645617b 100644 --- a/man/mapOption2Process.Rd +++ b/man/mapOption2Process.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{mapOption2Process} \alias{mapOption2Process} -\title{Construct list where names (MolEvolvR advanced options) point to processes} +\title{mapOption2Process} \usage{ mapOption2Process() } diff --git a/man/plotEstimatedWallTimes.Rd b/man/plotEstimatedWallTimes.Rd index 0d53cb32..36b0ecd5 100644 --- a/man/plotEstimatedWallTimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{plotEstimatedWallTimes} \alias{plotEstimatedWallTimes} -\title{Plot the estimated runtimes for different advanced options and number -of inputs} +\title{plotEstimatedWallTimes} \usage{ plotEstimatedWallTimes() } @@ -16,5 +15,8 @@ ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } \description{ +Plot the estimated runtimes for different advanced options and number +of inputs + this function was just for fun; very, very messy code } diff --git a/man/writeProcessRuntime2TSV.Rd b/man/writeProcessRuntime2TSV.Rd index 03cbbd68..0e045a5c 100644 --- a/man/writeProcessRuntime2TSV.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{writeProcessRuntime2TSV} 
\alias{writeProcessRuntime2TSV} -\title{Write a table of 2 columns: 1) process and 2) median seconds} +\title{writeProcessRuntime2TSV} \usage{ writeProcessRuntime2TSV(dir_job_results, filepath) } diff --git a/man/writeProcessRuntime2YML.Rd b/man/writeProcessRuntime2YML.Rd index b43f39ee..865f23f7 100644 --- a/man/writeProcessRuntime2YML.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{writeProcessRuntime2YML} \alias{writeProcessRuntime2YML} -\title{Compute median process runtimes, then write a YAML list of the processes and -their median runtimes in seconds to the path specified by 'filepath'.} +\title{writeProcessRuntime2YML} \usage{ writeProcessRuntime2YML(dir_job_results, filepath = NULL) } @@ -14,6 +13,9 @@ writeProcessRuntime2YML(dir_job_results, filepath = NULL) uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ +Compute median process runtimes, then write a YAML list of the processes and +their median runtimes in seconds to the path specified by 'filepath'. + The default value of filepath is the value of the env var MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default read location. 
From 2057aa57a8101381adb9dffdd5a05e741843791e Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Mon, 14 Oct 2024 19:00:08 +0100 Subject: [PATCH 43/61] refactor functions in multiple files --- R/msa.R | 10 +++++----- R/networks_domarch.R | 8 ++++---- R/networks_gencontext.R | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/msa.R b/R/msa.R index e56cc32c..4c48f323 100644 --- a/R/msa.R +++ b/R/msa.R @@ -24,7 +24,7 @@ ############# ## Sample Runs -# msa_pdf(fasta_path="data/alns/pspb.gismo.fa" )#, out_path="data/msapdf") +# createMSA_PDF(fasta_path="data/alns/pspb.gismo.fa" )#, out_path="data/msapdf") ######################################### ## Generates MSA PDF from a Fasta file ## @@ -34,7 +34,7 @@ #' @description #' Generates a multiple sequence alignment from a fasta file #' -#' msa_pdf is a function that reads a fasta file and generates a multiple sequence alignment as +#' createMSA_PDF is a function that reads a fasta file and generates a multiple sequence alignment as #' a pdf #' #' @@ -55,9 +55,9 @@ #' #' @examples #' \dontrun{ -#' msa_pdf() +#' createMSA_PDF() #' } -msa_pdf <- function(fasta_path, out_path = NULL, +createMSA_PDF <- function(fasta_path, out_path = NULL, lowerbound = NULL, upperbound = NULL) { ## SAMPLE ARGUMENTS to test run # fasta_path=here("../molevol_data/project_data/phage_defense/full_analysis_20210108/g3d.both_lin.gen.da_sub.fa") @@ -196,7 +196,7 @@ msa_pdf <- function(fasta_path, out_path = NULL, #' @export #' #' @examples -generate_msa <- function(fa_file = "", outfile = "") { +createMSA_Kalign <- function(fa_file = "", outfile = "") { prot_aa <- readAAStringSet( path = fa_file, format = "fasta" diff --git a/R/networks_domarch.R b/R/networks_domarch.R index fea0a195..9215aa93 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -46,9 +46,9 @@ #' #' @examples #' \dontrun{ -#' domain_network(pspa) +#' createDomainNetwork(pspa) #' } -domain_network <- function(prot, column = "DomArch", 
domains_of_interest, cutoff = 70, layout = "nice", query_color = adjustcolor("green", alpha.f = .5)) { +createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, cutoff = 70, layout = "nice", query_color = adjustcolor("green", alpha.f = .5)) { # by domain networks or all, as required. tryCatch( { @@ -250,9 +250,9 @@ domain_network <- function(prot, column = "DomArch", domains_of_interest, cutoff #' #' @examples #' \dontrun{ -#' domain_network(pspa) +#' createDomainNetwork(pspa) #' } -BinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, cutoff = 70, +createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, cutoff = 70, layout = "nice", query_color = adjustcolor("yellow", alpha.f = .5), partner_color = adjustcolor("skyblue", alpha.f = .5), border_color = adjustcolor("grey", alpha.f = .8), diff --git a/R/networks_gencontext.R b/R/networks_gencontext.R index e0dd63da..7df6c270 100755 --- a/R/networks_gencontext.R +++ b/R/networks_gencontext.R @@ -39,7 +39,7 @@ #' \dontrun{ #' domain_network(pspa) #' } -gc_undirected_network <- function(prot, column = "GenContext", domains_of_interest, cutoff_type = "Lineage", cutoff = 1, layout = "grid") { +createUndirectedGenomicContextNetwork <- function(prot, column = "GenContext", domains_of_interest, cutoff_type = "Lineage", cutoff = 1, layout = "grid") { # by domain networks or all, as required. 
# ye is either all of prot.list or centered on one domain @@ -146,7 +146,7 @@ gc_undirected_network <- function(prot, column = "GenContext", domains_of_intere #' \dontrun{ #' gc_directed_network(pspa, column = "GenContex", cutoff = 55) #' } -GenContextNetwork <- function(prot, domains_of_interest, column = "GenContext", +createGenomicContextNetwork <- function(prot, domains_of_interest, column = "GenContext", cutoff = 40, layout = "grid", directed = TRUE) { From b665a4dfd062f3a359cb907040cb8c384a23450c Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Mon, 14 Oct 2024 19:03:29 +0100 Subject: [PATCH 44/61] Update NAMESPACE and .Rd files --- NAMESPACE | 12 ++++++------ ...ainNetwork.Rd => createBinaryDomainNetwork.Rd} | 8 ++++---- man/{domain_network.Rd => createDomainNetwork.Rd} | 8 ++++---- ...tNetwork.Rd => createGenomicContextNetwork.Rd} | 6 +++--- man/{generate_msa.Rd => createMSA_Kalign.Rd} | 6 +++--- man/{msa_pdf.Rd => createMSA_PDF.Rd} | 15 ++++++++++----- ...d => createUndirectedGenomicContextNetwork.Rd} | 6 +++--- 7 files changed, 33 insertions(+), 28 deletions(-) rename man/{BinaryDomainNetwork.Rd => createBinaryDomainNetwork.Rd} (92%) rename man/{domain_network.Rd => createDomainNetwork.Rd} (90%) rename man/{GenContextNetwork.Rd => createGenomicContextNetwork.Rd} (91%) rename man/{generate_msa.Rd => createMSA_Kalign.Rd} (70%) rename man/{msa_pdf.Rd => createMSA_PDF.Rd} (77%) rename man/{gc_undirected_network.Rd => createUndirectedGenomicContextNetwork.Rd} (90%) diff --git a/NAMESPACE b/NAMESPACE index fe4c23d6..4c05dc94 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(BinaryDomainNetwork) export(GCA2Lineage) -export(GenContextNetwork) export(IPG2Lineage) export(acc2FA) export(acc2Lineage) @@ -29,14 +27,19 @@ export(convertAlignment2FA) export(convertAlignment2Trees) export(convertFA2Tree) export(countByColumn) +export(createBinaryDomainNetwork) +export(createDomainNetwork) export(createFA2Tree) 
+export(createGenomicContextNetwork) export(createJobResultsURL) export(createJobStatusEmailMessage) +export(createMSA_Kalign) +export(createMSA_PDF) export(createRepresentativeAccNum) +export(createUndirectedGenomicContextNetwork) export(createWordCloud2Element) export(createWordCloudElement) export(create_lineage_lookup) -export(domain_network) export(downloadAssemblySummary) export(efetchIPG) export(extractAccNum) @@ -44,9 +47,7 @@ export(filterByDomains) export(filterByFrequency) export(findParalogs) export(formatJobArgumentsHTML) -export(gc_undirected_network) export(generateAllAlignments2FA) -export(generate_msa) export(getAccNumFromFA) export(getTopAccByLinDomArch) export(get_proc_medians) @@ -54,7 +55,6 @@ export(get_proc_weights) export(make_opts2procs) export(mapAcc2Name) export(map_advanced_opts2procs) -export(msa_pdf) export(plotIPR2Viz) export(plotIPR2VizWeb) export(plotLineageDA) diff --git a/man/BinaryDomainNetwork.Rd b/man/createBinaryDomainNetwork.Rd similarity index 92% rename from man/BinaryDomainNetwork.Rd rename to man/createBinaryDomainNetwork.Rd index bb7e2353..4f0bdc5a 100644 --- a/man/BinaryDomainNetwork.Rd +++ b/man/createBinaryDomainNetwork.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/networks_domarch.R -\name{BinaryDomainNetwork} -\alias{BinaryDomainNetwork} +\name{createBinaryDomainNetwork} +\alias{createBinaryDomainNetwork} \title{Domain Network} \usage{ -BinaryDomainNetwork( +createBinaryDomainNetwork( prot, column = "DomArch", domains_of_interest, @@ -42,6 +42,6 @@ A network of domains is returned based on shared domain architectures. 
} \examples{ \dontrun{ -domain_network(pspa) +createDomainNetwork(pspa) } } diff --git a/man/domain_network.Rd b/man/createDomainNetwork.Rd similarity index 90% rename from man/domain_network.Rd rename to man/createDomainNetwork.Rd index 528e4924..1588af17 100644 --- a/man/domain_network.Rd +++ b/man/createDomainNetwork.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/networks_domarch.R -\name{domain_network} -\alias{domain_network} +\name{createDomainNetwork} +\alias{createDomainNetwork} \title{Domain Network} \usage{ -domain_network( +createDomainNetwork( prot, column = "DomArch", domains_of_interest, @@ -33,6 +33,6 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -domain_network(pspa) +createDomainNetwork(pspa) } } diff --git a/man/GenContextNetwork.Rd b/man/createGenomicContextNetwork.Rd similarity index 91% rename from man/GenContextNetwork.Rd rename to man/createGenomicContextNetwork.Rd index 2eeebbc5..ac6deb84 100644 --- a/man/GenContextNetwork.Rd +++ b/man/createGenomicContextNetwork.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/networks_gencontext.R -\name{GenContextNetwork} -\alias{GenContextNetwork} +\name{createGenomicContextNetwork} +\alias{createGenomicContextNetwork} \title{Genomic Context Directed Network} \usage{ -GenContextNetwork( +createGenomicContextNetwork( prot, domains_of_interest, column = "GenContext", diff --git a/man/generate_msa.Rd b/man/createMSA_Kalign.Rd similarity index 70% rename from man/generate_msa.Rd rename to man/createMSA_Kalign.Rd index a68eb8b4..946f04ae 100644 --- a/man/generate_msa.Rd +++ b/man/createMSA_Kalign.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/msa.R -\name{generate_msa} -\alias{generate_msa} +\name{createMSA_Kalign} +\alias{createMSA_Kalign} \title{Function to generate MSA using kalign} \usage{ 
-generate_msa(fa_file = "", outfile = "") +createMSA_Kalign(fa_file = "", outfile = "") } \arguments{ \item{outfile}{} diff --git a/man/msa_pdf.Rd b/man/createMSA_PDF.Rd similarity index 77% rename from man/msa_pdf.Rd rename to man/createMSA_PDF.Rd index 4d5fed17..7cd7516a 100644 --- a/man/msa_pdf.Rd +++ b/man/createMSA_PDF.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/msa.R -\name{msa_pdf} -\alias{msa_pdf} +\name{createMSA_PDF} +\alias{createMSA_PDF} \title{Multiple Sequence Alignment} \usage{ -msa_pdf(fasta_path, out_path = NULL, lowerbound = NULL, upperbound = NULL) +createMSA_PDF( + fasta_path, + out_path = NULL, + lowerbound = NULL, + upperbound = NULL +) } \arguments{ \item{fasta_path}{Character. The path location of the fasta file to be read.} @@ -21,11 +26,11 @@ Default is NULL. If value is NULL, the entire multiple sequence alignment is pri \description{ Generates a multiple sequence alignment from a fasta file -msa_pdf is a function that reads a fasta file and generates a multiple sequence alignment as +createMSA_PDF is a function that reads a fasta file and generates a multiple sequence alignment as a pdf } \examples{ \dontrun{ -msa_pdf() +createMSA_PDF() } } diff --git a/man/gc_undirected_network.Rd b/man/createUndirectedGenomicContextNetwork.Rd similarity index 90% rename from man/gc_undirected_network.Rd rename to man/createUndirectedGenomicContextNetwork.Rd index 28cf1abb..d61c23df 100644 --- a/man/gc_undirected_network.Rd +++ b/man/createUndirectedGenomicContextNetwork.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/networks_gencontext.R -\name{gc_undirected_network} -\alias{gc_undirected_network} +\name{createUndirectedGenomicContextNetwork} +\alias{createUndirectedGenomicContextNetwork} \title{Domain Network} \usage{ -gc_undirected_network( +createUndirectedGenomicContextNetwork( prot, column = "GenContext", domains_of_interest, From 
6babffe95d2729857b921c9305f25dcbc0c0ed49 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Tue, 15 Oct 2024 11:57:15 +0100 Subject: [PATCH 45/61] Update error handling to use rlang functions in R/assign_job_queue.R file - Replaced base R error handling with rlang functions: `abort()`, `warn()`, and `inform()`. - Improved clarity and consistency in error and warning messages. - Enhanced robustness with detailed context for errors and warnings. --- R/assign_job_queue.R | 227 +++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 61 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index c531fb09..df4f97e7 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -1,3 +1,4 @@ +suppressPackageStartupMessages(library(rlang)) # for now, we're using an env var, COMMON_SRC_ROOT, to specify this folder since # the working directory is changed in many parts of the current molevolvr # pipeline. @@ -22,11 +23,9 @@ make_opts2procs <- function() { ) return(opts2processes) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort(paste("Error: ", e$message), class = "Opts_to_process_error") }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("make_opts2procs function execution completed.") + warn(paste("Warning: ", w$message), class = "Opts_to_process_warning") }) } @@ -44,7 +43,7 @@ make_opts2procs <- function() { #' @export map_advanced_opts2procs <- function(advanced_opts) { if (!is.character(advanced_opts)) { - stop("Argument must be a character vector!") + abort("Argument must be a character vector!", class = "validation_error") } tryCatch({ # append 'always' to add procs that always run @@ -56,11 +55,19 @@ map_advanced_opts2procs <- function(advanced_opts) { procs <- opts2proc[idx] |> unlist() return(procs) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort( + message = paste("Encountered an error: ", e$message), + 
class = "map_advanced_opts2procs_error", + call = sys.call(), + advanced_opts = advanced_opts + ) }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("make_opts2procs function execution completed.") + warn( + message = paste("Warning: ", w$message), + class = "map_advanced_opts2procs_warning", + call = sys.call(), + advanced_opts = advanced_opts + ) }) } @@ -91,12 +98,14 @@ get_proc_medians <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") + abort("Input 'dir_job_results' must be a single character string.", + class = "validation_error") } # Check if dir_job_results exists if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) + abort(paste("The directory", dir_job_results, "does not exist."), + class = "file_error") } source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) @@ -135,11 +144,10 @@ get_proc_medians <- function(dir_job_results) { as.list() return(list_proc_medians) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort(paste("Encountered an error: ", e$message), + class = "processing_error") }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("get_proc_medians function execution completed.") + warn(paste("Warning: ", w$message), class = "processing_warning") }) } @@ -165,15 +173,18 @@ write_proc_medians_table <- function(dir_job_results, filepath) { tryCatch({ # Error handling for input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") + abort("Input 'dir_job_results' must be a single character string.", + class = "validation_error") } if (!dir.exists(dir_job_results)) { - stop(paste("The directory", 
dir_job_results, "does not exist.")) + abort(paste("The directory", dir_job_results, "does not exist."), + class = "file_error") } if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single character string.") + abort("Input 'filepath' must be a single character string.", + class = "validation_error") } df_proc_medians <- get_proc_medians(dir_job_results) |> tibble::as_tibble() |> @@ -188,11 +199,21 @@ write_proc_medians_table <- function(dir_job_results, filepath) { readr::write_tsv(df_proc_medians, file = filepath) return(df_proc_medians) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort( + message = paste("Encountered an error: ", e$message), + class = "processing_error", + call = sys.call(), + dir_job_results = dir_job_results, + filepath = filepath + ) }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("write_proc_medians_table function execution completed.") + warn( + message = paste("Warning: ", w$message), + class = "processing_warning", + call = sys.call(), + dir_job_results = dir_job_results, + filepath = filepath + ) }) } @@ -222,12 +243,21 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") + abort( + message = "Input 'dir_job_results' must be a single character string.", + class = "validation_error", + dir_job_results = dir_job_results + ) } if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) + abort( + message = paste("The directory", dir_job_results, "does not exist."), + class = "file_error", + dir_job_results = dir_job_results + ) } + if (is.null(filepath)) { filepath <- file.path(common_root, "molevol_scripts", @@ -235,20 +265,32 @@ write_proc_medians_yml <- 
function(dir_job_results, filepath = NULL) { "job_proc_weights.yml") } if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single character string.") + abort( + message = "Input 'filepath' must be a single character string.", + class = "validation_error", + filepath = filepath + ) } medians <- get_proc_medians(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { - message(paste("Encountered an error: "), e$message) + abort( + message = paste("Encountered an error: ", e$message), + class = "processing_error", + call = sys.call(), + dir_job_results = dir_job_results, + filepath = filepath + ) }, warning = function(w) { - message(paste("Warning: "), w$message) - }, finally = { - message("write_proc_medians_table function execution completed.") - } - ) - + warn( + message = paste("Warning: ", w$message), + class = "processing_warning", + call = sys.call(), + dir_job_results = dir_job_results, + filepath = filepath + ) + }) } #' Quickly get the runtime weights for MolEvolvR backend processes @@ -275,13 +317,24 @@ get_proc_weights <- function(medians_yml_path = NULL) { # attempt to read the weights from the YAML file produced by # write_proc_medians_yml() if (stringr::str_trim(medians_yml_path) == "") { - stop( - stringr::str_glue("medians_yml_path is empty - ({medians_yml_path}), returning default weights") + abort( + message = stringr::str_glue("medians_yml_path is empty + ({medians_yml_path}), returning default weights"), + class = "input_error", + medians_yml_path = medians_yml_path ) } proc_weights <- yaml::read_yaml(medians_yml_path) + + if (!is.list(proc_weights) || length(proc_weights) == 0) { + abort( + message = "The loaded YAML file does not + contain valid process weights.", + class = "file_error", + medians_yml_path = medians_yml_path + ) + } }, # to avoid fatal errors in reading the proc weights yaml, # some median process runtimes have been hardcoded based on @@ -318,10 +371,9 @@ 
get_proc_weights <- function(medians_yml_path = NULL) { #' "domain_architecture"), #' n_inputs = 3, n_hits = 50L) #' @export -advanced_opts2est_walltime <- function(advanced_opts, - n_inputs = 1L, - n_hits = NULL, - verbose = FALSE) { +advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, + n_hits = NULL, + verbose = FALSE) { tryCatch({ # to calculate est walltime for a homology search job, the number of hits @@ -331,26 +383,42 @@ advanced_opts2est_walltime <- function(advanced_opts, # Validate advanced_opts if (!is.character(advanced_opts)) { - stop("Argument 'advanced_opts' must be a character vector.") + abort( + message = "Argument 'advanced_opts' must be a character vector.", + class = "validation_error", + advanced_opts = advanced_opts + ) } # Validate n_inputs if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { - stop("Argument 'n_inputs' must be a single positive numeric value.") + abort( + message = "Argument 'n_inputs' + must be a single positive numeric value.", + class = "validation_error", + n_inputs = n_inputs + ) } # Validate n_hits if homology_search is in advanced_opts if ("homology_search" %in% advanced_opts && - (is.null(n_hits)|| !is.numeric(n_hits) - || length(n_hits) != 1 || n_hits < 0)) { - stop("Argument 'n_hits' must be a single non-negative numeric value when - 'homology_search' is in 'advanced_opts'.") + (is.null(n_hits) || !is.numeric(n_hits) || + length(n_hits) != 1 || n_hits < 0)) { + abort( + message = "Argument 'n_hits' must be a single non-negative numeric + value when 'homology_search' is in 'advanced_opts'.", + class = "validation_error", + n_hits = n_hits + ) } # Get process weights proc_weights <- write_proc_medians_yml() if (!is.list(proc_weights)) { - stop("Process weights could not be retrieved correctly.") + abort( + message = "Process weights could not be retrieved correctly.", + class = "processing_error" + ) } # sort process weights by names and convert to vec @@ -389,12 +457,23 @@ 
advanced_opts2est_walltime <- function(advanced_opts, } return(est_walltime) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort( + message = paste("Encountered an error: ", e$message), + class = "processing_error", + call = sys.call(), + advanced_opts = advanced_opts, + n_inputs = n_inputs, + n_hits = n_hits + ) }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("advanced_opts2est_walltime - function execution completed.") + warn( + message = paste("Warning: ", w$message), + class = "processing_warning", + call = sys.call(), + advanced_opts = advanced_opts, + n_inputs = n_inputs, + n_hits = n_hits + ) }) } @@ -419,22 +498,44 @@ assign_job_queue <- function( t_cutoff = 21600 # 6 hours ) { tryCatch({ + # Validate t_sec_estimate if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { - stop("Argument 't_sec_estimate' must be a single numeric value.") + abort( + message = "Argument 't_sec_estimate' must be a single numeric value.", + class = "validation_error", + t_sec_estimate = t_sec_estimate + ) } + # Validate t_cutoff if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { - stop("Argument 't_cutoff' must be a single non-negative numeric value.") + abort( + message = "Argument 't_cutoff' must be a + single non-negative numeric value.", + class = "validation_error", + t_cutoff = t_cutoff + ) } + queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") return(queue) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort( + message = paste("Encountered an error: ", e$message), + class = "processing_error", + call = sys.call(), + t_sec_estimate = t_sec_estimate, + t_cutoff = t_cutoff + ) }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("assign_job_queue function execution completed.") + warn( + message = paste("Warning: ", w$message), + class = "processing_warning", + call = sys.call(), 
+ t_sec_estimate = t_sec_estimate, + t_cutoff = t_cutoff + ) }) } @@ -537,11 +638,15 @@ plot_estimated_walltimes <- function() { ) return(p) }, error = function(e) { - message(paste("Encountered an error: ", e$message)) + abort( + message = paste("Encountered an error:", e$message), + .internal = TRUE + ) }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("plot_estimated_walltimes function execution completed.") + warn( + message = paste("Warning:", w$message), + .internal = TRUE + ) }) } From 57a635671795984f5ace17076ef0029c6ff0336c Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Sun, 20 Oct 2024 12:01:02 +0100 Subject: [PATCH 46/61] Enhance error handling and validation across functions - Added robust error handling in run_deltablast and run_rpsblast functions. - Updated Roxygen documentation to import rlang::abort, rlang::warn and rlang::inform for better error management. - Refactored code for clarity and consistency based on the suggestion from the last review. 
--- NAMESPACE | 3 + R/acc2lin.R | 105 ++++++++++++++++++----------------- R/assign_job_queue.R | 128 ++++++++++++++++++++++--------------------- R/blastWrappers.R | 84 +++++++++++++++++++++------- 4 files changed, 184 insertions(+), 136 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 078f971b..9449e14b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -240,8 +240,11 @@ importFrom(readr,write_lines) importFrom(readr,write_tsv) importFrom(rentrez,entrez_fetch) importFrom(rlang,.data) +importFrom(rlang,abort) importFrom(rlang,as_string) +importFrom(rlang,inform) importFrom(rlang,sym) +importFrom(rlang,warn) importFrom(sendmailR,mime_part) importFrom(sendmailR,sendmail) importFrom(seqinr,dist.alignment) diff --git a/R/acc2lin.R b/R/acc2lin.R index bd5cc289..c1f3b34e 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -5,11 +5,13 @@ # suppressPackageStartupMessages(library(data.table)) # suppressPackageStartupMessages(library(tidyverse)) # suppressPackageStartupMessages(library(biomartr)) -suppressPackageStartupMessages(library(rlang)) + # https://stackoverflow.com/questions/18730491/sink-does-not-release-file #' Sink Reset #' +#' @importFrom rlang warn abort inform +#' #' @return No return, but run to close all outstanding `sink()`s #' and handles any errors or warnings that occur during the process. 
#' @@ -25,17 +27,17 @@ sinkReset <- function() { for (i in seq_len(sink.number())) { sink(NULL) } - inform("All sinks closed", class = "sink_reset_info") + rlang::inform("All sinks closed", class = "sink_reset_info") }, error = function(e) { - abort(paste("Error: ", e$message), class = "sink_reset_error") + rlang::abort(paste("Error: ", e$message), class = "sink_reset_error") }, warning = function(w) { - warn(paste("Warning: ", w$message), class = "sink_reset_warning") + rlang::warn(paste("Warning: ", w$message), class = "sink_reset_warning") }, finally = { # If any additional cleanup is needed, it can be done here if (sink.number() > 0) { # Additional cleanup if sinks are still open - inform("Some sinks remain open, ensure proper cleanup.", - class = "sink_cleanup_warning") + rlang::inform("Some sinks remain open, ensure proper cleanup.", + class = "sink_cleanup_warning") } }) } @@ -52,7 +54,7 @@ sinkReset <- function() { #' #' @importFrom dplyr pull #' @importFrom magrittr %>% -#' @importFrom rlang sym +#' @importFrom rlang sym warn abort inform #' #' @return Describe return, in detail #' @export @@ -66,30 +68,30 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, plan = "sequential", ...) 
{ # check for validate inputs if (!is.data.frame(df)) { - abort("Input 'df' must be a data frame.", class = "input_error") + rlang::abort("Input 'df' must be a data frame.", class = "input_error") } if (!acc_col %in% colnames(df)) { - abort(paste("Column", acc_col, - "not found in data frame."), class = "column_error") + rlang::abort(paste("Column", acc_col, + "not found in data frame."), class = "column_error") } # Ensure paths are character strings if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { - abort("Both 'assembly_path' and - 'lineagelookup_path' must be character strings.", - class = "path_type_error") + rlang::abort("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.", + class = "path_type_error") } # Ensure paths exist if (!file.exists(assembly_path)) { - abort(paste("Assembly file not found at:", - assembly_path), class = "file_not_found_error") + rlang::abort(paste("Assembly file not found at:", + assembly_path), class = "file_not_found_error") } if (!file.exists(lineagelookup_path)) { - abort(paste("Lineage lookup file not found at:", - lineagelookup_path), class = "file_not_found_error") + rlang::abort(paste("Lineage lookup file not found at:", + lineagelookup_path), class = "file_not_found_error") } tryCatch({ # Attempt to add lineages @@ -99,7 +101,7 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, accessions, assembly_path, lineagelookup_path, ipgout_path, plan ) - # Drop a lot of the unimportant columns for now? + # Drop a lot of the unimportant columns for now? 
# will make merging much easier lins <- lins[, c( "Strand", "Start", "Stop", "Nucleotide Accession", "Source", @@ -107,18 +109,18 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, ) := NULL] lins <- unique(lins) - # dup <- lins %>% group_by(Protein) %>% + # dup <- lins %>% group_by(Protein) %>% # summarize(count = n()) %>% filter(count > 1) %>% # pull(Protein) merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) return(merged) }, error = function(e) { - abort(paste("Error during lineage addition:", e$message), - class = "lineage_addition_error") + rlang::abort(paste("Error during lineage addition:", e$message), + class = "lineage_addition_error") }, warning = function(w) { - warn(paste("Warning during lineage addition:", w$message), - class = "lineage_addition_warning") + rlang::warn(paste("Warning during lineage addition:", w$message), + class = "lineage_addition_warning") }) } @@ -137,11 +139,13 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' @param ipgout_path Path to write the results +#' @param ipgout_path Path to write the results #' of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL #' @param plan #' +#' @importFrom rlang warn abort inform +#' #' @return Describe return, in detail #' @export #' @@ -149,8 +153,8 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' \dontrun{ #' acc2Lineage() #' } -acc2Lineage <- function(accessions, assembly_path, - lineagelookup_path, ipgout_path = NULL, +acc2Lineage <- function(accessions, assembly_path, + lineagelookup_path, ipgout_path = NULL, plan = "sequential", ...) 
{ tmp_ipg <- F if (is.null(ipgout_path)) { @@ -167,12 +171,10 @@ acc2Lineage <- function(accessions, assembly_path, lins <- IPG2Lineage(accessions, ipgout_path, assembly_path, lineagelookup_path) }, error = function(e) { - abort( + rlang::abort( message = paste("An error occurred during IPG fetching or lineage processing:", e$message), class = "lineage_processing_error", - # capturing the call stack - call = sys.call(), # adding additional context accessions = accessions, assembly_path = assembly_path, @@ -181,11 +183,10 @@ acc2Lineage <- function(accessions, assembly_path, plan = plan ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning during IPG fetching or lineage processing:", w$message), class = "lineage_processing_warning", - call = sys.call(), # capturing the call stack accessions = accessions, assembly_path = assembly_path, lineagelookup_path = lineagelookup_path, @@ -218,6 +219,7 @@ acc2Lineage <- function(accessions, assembly_path, #' @importFrom furrr future_map #' @importFrom future plan #' @importFrom rentrez entrez_fetch +#' @importFrom rlang warn abort inform #' #' @return Describe return, in detail #' @export @@ -229,18 +231,18 @@ acc2Lineage <- function(accessions, assembly_path, efetchIPG <- function(accnums, out_path, plan = "sequential", ...) 
{ # Argument validation if (!is.character(accnums) || length(accnums) == 0) { - abort("Error: 'accnums' must be a non-empty character vector.", - class = "validation_error") + rlang::abort("Error: 'accnums' must be a non-empty character vector.", + class = "validation_error") } if (!is.character(out_path) || nchar(out_path) == 0) { - abort("Error: 'out_path' must be a non-empty string.", - class = "validation_error") + rlang::abort("Error: 'out_path' must be a non-empty string.", + class = "validation_error") } if (!is.function(plan)) { - abort("Error: 'plan' must be a valid plan function.", - class = "validation_error") + rlang::abort("Error: 'plan' must be a valid plan function.", + class = "validation_error") } if (length(accnums) > 0) { partition <- function(in_data, groups) { @@ -285,19 +287,17 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { }) sink(NULL) }, error = function(e) { - abort( + rlang::abort( message = paste("An error occurred: ", e$message), class = "fetch_error", - call = sys.call(), accnums = accnums, out_path = out_path, plan = plan ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "fetch_warning", - call = sys.call(), accnums = accnums, out_path = out_path, plan = plan @@ -331,6 +331,7 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { #' "create_lineage_lookup()" function #' #' @importFrom data.table fread +#' @importFrom rlang warn abort inform #' #' @return Describe return, in detail #' @export @@ -344,31 +345,31 @@ IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, ...) 
{ # Argument validation for accessions if (!is.character(accessions) || length(accessions) == 0) { - abort("Input 'accessions' must be a non-empty + rlang::abort("Input 'accessions' must be a non-empty character vector.", class = "validation_error") } # check for validate inputs if (!is.character(ipg_file)) { - abort("Input 'ipg_file' must be a + rlang::abort("Input 'ipg_file' must be a character string.", class = "validation_error") } # Ensure paths are character strings if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { - abort("Both 'assembly_path' and 'lineagelookup_path' - must be character strings.", class = "validation_error") + rlang::abort("Both 'assembly_path' and 'lineagelookup_path' + must be character strings.", class = "validation_error") } # Ensure paths exist if (!file.exists(assembly_path)) { - abort(paste("Assembly file not found at:", assembly_path), - class = "file_error") + rlang::abort(paste("Assembly file not found at:", assembly_path), + class = "file_error") } if (!file.exists(lineagelookup_path)) { - abort(paste("Lineage lookup file not found at:", lineagelookup_path), - class = "file_error") + rlang::abort(paste("Lineage lookup file not found at:", lineagelookup_path), + class = "file_error") } # Process the IPG file @@ -390,20 +391,18 @@ IPG2Lineage <- function(accessions, ipg_file, return(lins) }, error = function(e) { - abort( + rlang::abort( message = paste("An error occurred: ", e$message), class = "processing_error", - call = sys.call(), accessions = accessions, ipg_file = ipg_file, assembly_path = assembly_path, lineagelookup_path = lineagelookup_path ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "processing_warning", - call = sys.call(), accessions = accessions, ipg_file = ipg_file, assembly_path = assembly_path, diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index df4f97e7..8b227979 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R 
@@ -1,4 +1,4 @@ -suppressPackageStartupMessages(library(rlang)) + # for now, we're using an env var, COMMON_SRC_ROOT, to specify this folder since # the working directory is changed in many parts of the current molevolvr # pipeline. @@ -9,6 +9,8 @@ common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' +#' @importFrom rlang warn abort inform +#' #' @return list where names (MolEvolvR advanced options) point to processes #' #' example: list_opts2procs <- make_opts2procs @@ -23,9 +25,10 @@ make_opts2procs <- function() { ) return(opts2processes) }, error = function(e) { - abort(paste("Error: ", e$message), class = "Opts_to_process_error") + rlang::abort(paste("Error: ", e$message), class = "Opts_to_process_error") }, warning = function(w) { - warn(paste("Warning: ", w$message), class = "Opts_to_process_warning") + rlang::warn(paste("Warning: ", w$message), + class = "Opts_to_process_warning") }) } @@ -34,6 +37,8 @@ make_opts2procs <- function() { #' #' @param advanced_opts character vector of MolEvolvR advanced options #' +#' @importFrom rlang warn abort inform +#' #' @return character vector of process names that will execute given #' the advanced options #' @@ -43,7 +48,8 @@ make_opts2procs <- function() { #' @export map_advanced_opts2procs <- function(advanced_opts) { if (!is.character(advanced_opts)) { - abort("Argument must be a character vector!", class = "validation_error") + rlang::abort("Argument must be a character vector!", + class = "validation_error") } tryCatch({ # append 'always' to add procs that always run @@ -55,17 +61,15 @@ map_advanced_opts2procs <- function(advanced_opts) { procs <- opts2proc[idx] |> unlist() return(procs) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error: ", e$message), class = "map_advanced_opts2procs_error", - call = sys.call(), advanced_opts = advanced_opts ) }, warning = function(w) { - warn( + rlang::warn( message = 
paste("Warning: ", w$message), class = "map_advanced_opts2procs_warning", - call = sys.call(), advanced_opts = advanced_opts ) }) @@ -78,6 +82,7 @@ map_advanced_opts2procs <- function(advanced_opts) { #' directory #' #' @importFrom dplyr across everything select summarise +#' @importFrom rlang warn abort inform #' #' @return [list] names: processes; values: median runtime (seconds) #' @@ -98,14 +103,14 @@ get_proc_medians <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - abort("Input 'dir_job_results' must be a single character string.", - class = "validation_error") + rlang::abort("Input 'dir_job_results' must be a single character string.", + class = "validation_error") } # Check if dir_job_results exists if (!dir.exists(dir_job_results)) { - abort(paste("The directory", dir_job_results, "does not exist."), - class = "file_error") + rlang::abort(paste("The directory", dir_job_results, "does not exist."), + class = "file_error") } source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) @@ -144,10 +149,10 @@ get_proc_medians <- function(dir_job_results) { as.list() return(list_proc_medians) }, error = function(e) { - abort(paste("Encountered an error: ", e$message), - class = "processing_error") + rlang::abort(paste("Encountered an error: ", e$message), + class = "processing_error") }, warning = function(w) { - warn(paste("Warning: ", w$message), class = "processing_warning") + rlang::warn(paste("Warning: ", w$message), class = "processing_warning") }) } @@ -161,6 +166,7 @@ get_proc_medians <- function(dir_job_results) { #' @importFrom tibble as_tibble #' @importFrom readr write_tsv #' @importFrom tidyr pivot_longer +#' @importFrom rlang warn abort inform #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' @@ -173,18 +179,18 @@ write_proc_medians_table <- function(dir_job_results, filepath) { tryCatch({ # Error handling for 
input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - abort("Input 'dir_job_results' must be a single character string.", - class = "validation_error") + rlang::abort("Input 'dir_job_results' must be a single character string.", + class = "validation_error") } if (!dir.exists(dir_job_results)) { - abort(paste("The directory", dir_job_results, "does not exist."), - class = "file_error") + rlang::abort(paste("The directory", dir_job_results, "does not exist."), + class = "file_error") } if (!is.character(filepath) || length(filepath) != 1) { - abort("Input 'filepath' must be a single character string.", - class = "validation_error") + rlang::abort("Input 'filepath' must be a single character string.", + class = "validation_error") } df_proc_medians <- get_proc_medians(dir_job_results) |> tibble::as_tibble() |> @@ -199,18 +205,16 @@ write_proc_medians_table <- function(dir_job_results, filepath) { readr::write_tsv(df_proc_medians, file = filepath) return(df_proc_medians) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error: ", e$message), class = "processing_error", - call = sys.call(), dir_job_results = dir_job_results, filepath = filepath ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "processing_warning", - call = sys.call(), dir_job_results = dir_job_results, filepath = filepath ) @@ -226,10 +230,11 @@ write_proc_medians_table <- function(dir_job_results, filepath) { #' read location. 
#' #' @param dir_job_results [chr] path to MolEvolvR job_results directory -#' @param filepath [chr] path to save YAML file; if NULL, +#' @param filepath [chr] path to save YAML file; if NULL, #' uses ./molevol_scripts/log_data/job_proc_weights.yml #' #' @importFrom yaml write_yaml +#' @importFrom rlang warn abort inform #' #' @examples #' \dontrun{ @@ -243,7 +248,7 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - abort( + rlang::abort( message = "Input 'dir_job_results' must be a single character string.", class = "validation_error", dir_job_results = dir_job_results @@ -251,7 +256,7 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { } if (!dir.exists(dir_job_results)) { - abort( + rlang::abort( message = paste("The directory", dir_job_results, "does not exist."), class = "file_error", dir_job_results = dir_job_results @@ -265,7 +270,7 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { "job_proc_weights.yml") } if (!is.character(filepath) || length(filepath) != 1) { - abort( + rlang::abort( message = "Input 'filepath' must be a single character string.", class = "validation_error", filepath = filepath @@ -275,18 +280,16 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { medians <- get_proc_medians(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error: ", e$message), class = "processing_error", - call = sys.call(), dir_job_results = dir_job_results, filepath = filepath ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "processing_warning", - call = sys.call(), dir_job_results = dir_job_results, filepath = filepath ) @@ -300,6 +303,7 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { #' 
#' @importFrom stringr str_glue str_trim #' @importFrom yaml read_yaml +#' @importFrom rlang warn abort inform #' #' @return [list] names: processes; values: median runtime (seconds) #' @@ -317,9 +321,9 @@ get_proc_weights <- function(medians_yml_path = NULL) { # attempt to read the weights from the YAML file produced by # write_proc_medians_yml() if (stringr::str_trim(medians_yml_path) == "") { - abort( - message = stringr::str_glue("medians_yml_path is empty - ({medians_yml_path}), returning default weights"), + rlang::abort( + message = stringr::str_glue("medians_yml_path is empty + ({medians_yml_path}), returning default weights"), class = "input_error", medians_yml_path = medians_yml_path ) @@ -328,7 +332,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { proc_weights <- yaml::read_yaml(medians_yml_path) if (!is.list(proc_weights) || length(proc_weights) == 0) { - abort( + rlang::abort( message = "The loaded YAML file does not contain valid process weights.", class = "file_error", @@ -364,6 +368,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' #' @importFrom dplyr if_else #' @importFrom stringr str_glue +#' @importFrom rlang warn abort inform #' #' @return total estimated number of seconds a job will process (walltime) #' @@ -383,7 +388,7 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, # Validate advanced_opts if (!is.character(advanced_opts)) { - abort( + rlang::abort( message = "Argument 'advanced_opts' must be a character vector.", class = "validation_error", advanced_opts = advanced_opts @@ -392,8 +397,8 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, # Validate n_inputs if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { - abort( - message = "Argument 'n_inputs' + rlang::abort( + message = "Argument 'n_inputs' must be a single positive numeric value.", class = "validation_error", n_inputs = n_inputs @@ -404,8 +409,8 @@ advanced_opts2est_walltime <- 
function(advanced_opts, n_inputs = 1L, if ("homology_search" %in% advanced_opts && (is.null(n_hits) || !is.numeric(n_hits) || length(n_hits) != 1 || n_hits < 0)) { - abort( - message = "Argument 'n_hits' must be a single non-negative numeric + rlang::abort( + message = "Argument 'n_hits' must be a single non-negative numeric value when 'homology_search' is in 'advanced_opts'.", class = "validation_error", n_hits = n_hits @@ -415,7 +420,7 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, # Get process weights proc_weights <- write_proc_medians_yml() if (!is.list(proc_weights)) { - abort( + rlang::abort( message = "Process weights could not be retrieved correctly.", class = "processing_error" ) @@ -437,9 +442,9 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, opts2procs <- make_opts2procs() # exclude the homology search processes for the homologous hits procs2exclude_for_homologs <- opts2procs[["homology_search"]] - procs_homologs <- procs_from_opts[!(procs_from_opts + procs_homologs <- procs_from_opts[!(procs_from_opts %in% procs2exclude_for_homologs)] - binary_proc_vec_homolog <- dplyr::if_else(all_procs + binary_proc_vec_homolog <- dplyr::if_else(all_procs %in% procs_homologs, 1L, 0L) # add the estimated walltime for processes run on the homologous hits est_walltime <- est_walltime + @@ -457,19 +462,17 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, } return(est_walltime) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error: ", e$message), class = "processing_error", - call = sys.call(), advanced_opts = advanced_opts, n_inputs = n_inputs, n_hits = n_hits ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "processing_warning", - call = sys.call(), advanced_opts = advanced_opts, n_inputs = n_inputs, n_hits = n_hits @@ -486,6 +489,8 @@ advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, #' @param 
t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' +#' @importFrom rlang warn abort inform +#' #' @return a string of "short" or "long" #' #' example: @@ -500,7 +505,7 @@ assign_job_queue <- function( tryCatch({ # Validate t_sec_estimate if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { - abort( + rlang::abort( message = "Argument 't_sec_estimate' must be a single numeric value.", class = "validation_error", t_sec_estimate = t_sec_estimate @@ -509,8 +514,8 @@ assign_job_queue <- function( # Validate t_cutoff if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { - abort( - message = "Argument 't_cutoff' must be a + rlang::abort( + message = "Argument 't_cutoff' must be a single non-negative numeric value.", class = "validation_error", t_cutoff = t_cutoff @@ -521,18 +526,16 @@ assign_job_queue <- function( queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") return(queue) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error: ", e$message), class = "processing_error", - call = sys.call(), t_sec_estimate = t_sec_estimate, t_cutoff = t_cutoff ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning: ", w$message), class = "processing_warning", - call = sys.call(), t_sec_estimate = t_sec_estimate, t_cutoff = t_cutoff ) @@ -548,6 +551,7 @@ assign_job_queue <- function( #' @importFrom dplyr mutate select #' @importFrom ggplot2 aes geom_line ggplot labs #' @importFrom tibble as_tibble +#' @importFrom rlang warn abort inform #' #' @return line plot object #' @@ -581,8 +585,8 @@ plot_estimated_walltimes <- function() { n_hits <- if ("homology_search" %in% advanced_opts) { 100 } else { - NULL - } + NULL + } est_walltime <- advanced_opts2est_walltime ( advanced_opts, n_inputs = i, @@ -627,8 +631,8 @@ plot_estimated_walltimes <- function() { # sec to hrs df_walltimes <- df_walltimes |> dplyr::mutate(est_walltime = est_walltime / 3600) - p 
<- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, - y = est_walltime, + p <- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, + y = est_walltime, color = advanced_opts)) + ggplot2::geom_line() + ggplot2::labs( @@ -638,12 +642,12 @@ plot_estimated_walltimes <- function() { ) return(p) }, error = function(e) { - abort( + rlang::abort( message = paste("Encountered an error:", e$message), .internal = TRUE ) }, warning = function(w) { - warn( + rlang::warn( message = paste("Warning:", w$message), .internal = TRUE ) diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 15484a1b..95643e24 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -13,6 +13,8 @@ #' @param num_alignments #' @param num_threads #' +#' @importFrom rlang warn abort inform +#' #' @return #' @export #' @@ -23,23 +25,25 @@ run_deltablast <- function(deltablast_path, db_search_path, # Argument validation if (!file.exists(deltablast_path)) { - stop("The DELTABLAST executable path is invalid: ", deltablast_path) + rlang::abort(paste("The DELTABLAST executable path is invalid:", + deltablast_path)) } if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) + rlang::abort(paste("The database search path is invalid:", db_search_path)) } if (!file.exists(query)) { - stop("The query file path is invalid: ", query) + rlang::abort(paste("The query file path is invalid:", query)) } if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) + rlang::abort(paste("The evalue must be a positive number:", evalue)) } if (!is.numeric(num_alignments) || num_alignments <= 0) { - stop("The number of alignments must be a - positive integer: ", num_alignments) + rlang::abort(paste("The number of alignments must be a positive integer:", + num_alignments)) } if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) + 
rlang::abort(paste("The number of threads must be a positive integer:", + num_threads)) } start <- Sys.time() @@ -61,13 +65,28 @@ run_deltablast <- function(deltablast_path, db_search_path, ) print(Sys.time() - start) }, error = function(e) { - message(paste("Error in run_deltablast: ", e)) + rlang::abort( + message = paste("Error in run_deltablast:", e$message), + class = "processing_error", + deltablast_path = deltablast_path, + db_search_path = db_search_path, + query = query, + out = out, + num_alignments = num_alignments, + num_threads = num_threads + ) }, warning = function(w) { - message(paste("Warning in run_deltablast: ", w)) - }, finally = { - message("run_deltablast completed") + rlang::warn( + message = paste("Warning in run_deltablast:", w$message), + class = "processing_warning", + deltablast_path = deltablast_path, + db_search_path = db_search_path, + query = query, + out = out, + num_alignments = num_alignments, + num_threads = num_threads + ) }) - } @@ -81,6 +100,8 @@ run_deltablast <- function(deltablast_path, db_search_path, #' @param out #' @param num_threads #' +#' @importFrom rlang warn abort inform +#' #' @return #' @export #' @@ -90,19 +111,26 @@ run_rpsblast <- function(rpsblast_path, db_search_path, out, num_threads = 1) { # Argument validation if (!file.exists(rpsblast_path)) { - stop("The RPSBLAST executable path is invalid: ", rpsblast_path) + rlang::abort(paste("The RPSBLAST executable path is invalid:", + rpsblast_path), + class = "file_error") } if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) + rlang::abort(paste("The database search path is invalid:", db_search_path), + class = "file_error") } if (!file.exists(query)) { - stop("The query file path is invalid: ", query) + rlang::abort(paste("The query file path is invalid:", query), + class = "file_error") } if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) + 
rlang::abort(paste("The evalue must be a positive number:", evalue), + class = "validation_error") } if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) + rlang::abort(paste("The number of threads must be a positive integer:", + num_threads), + class = "validation_error") } start <- Sys.time() @@ -123,11 +151,25 @@ run_rpsblast <- function(rpsblast_path, db_search_path, ) print(Sys.time() - start) }, error = function(e) { - message(paste("Error in run_rpsblast: ", e)) + rlang::abort( + message = paste("Error in run_rpsblast:", e$message), + class = "processing_error", + rpsblast_path = rpsblast_path, + db_search_path = db_search_path, + query = query, + out = out, + num_threads = num_threads + ) }, warning = function(w) { - message(paste("Warning in run_rpsblast: ", w)) - }, finally = { - message("run_rpsblast completed") + rlang::warn( + message = paste("Warning in run_rpsblast:", w$message), + class = "processing_warning", + rpsblast_path = rpsblast_path, + db_search_path = db_search_path, + query = query, + out = out, + num_threads = num_threads + ) }) } From df602dfd63cbab0d84dbcc8229e3da9c7646b9d5 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 13:52:56 -0600 Subject: [PATCH 47/61] https://github.com/JRaviLab/MolEvolvR/pull/95/files#r1805272251 - re-implement dropped check - fix .Rd --- R/assign_job_queue.R | 5 ++++- man/writeProcessRuntime2YML.Rd | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 20ba841f..69609417 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -155,7 +155,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. 
#' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which getProcessRuntimeWeights() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -173,6 +173,9 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' } #' @export writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { + if (is.null(filepath)) { + filepath <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") + } medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) } diff --git a/man/writeProcessRuntime2YML.Rd b/man/writeProcessRuntime2YML.Rd index 865f23f7..5e0a05a4 100644 --- a/man/writeProcessRuntime2YML.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -17,7 +17,7 @@ Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'. The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which getProcessRuntimeWeights() also uses as its default read location. } \examples{ From 1a0b66358eac637736a18868ae27e4049aa22628 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 14:43:47 -0600 Subject: [PATCH 48/61] https://github.com/JRaviLab/MolEvolvR/pull/95#discussion_r1805166466 - adjust roxygen skeleton readability --- R/acc2lin.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 61aae87c..7b6f570c 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -198,10 +198,8 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) 
{ #' of an efetch run on the ipg database and #' #' @param accessions Character vector of protein accessions -#' @param ipg_file Filepath to the file -#' containing results of an efetch run on the -#' ipg database. The protein accession in -#' 'accessions' should be contained in this +#' @param ipg_file Filepath to the file containing results of an efetch run on the +#' ipg database. The protein accession in 'accessions' should be contained in this #' file #' @param assembly_path String of the path to the assembly_summary path #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function From 13e70c75a197c02c395cbef2d7b3c5b991ea7649 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 15:02:39 -0600 Subject: [PATCH 49/61] formatting --- R/acc2lin.R | 8 ++------ man/efetchIPG.Rd | 3 +-- man/sinkReset.Rd | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 7b6f570c..5f25afe2 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -10,7 +10,6 @@ #' Sink Reset #' #' @return No return, but run to close all outstanding `sink()`s -#' and handles any errors or warnings that occur during the process. #' #' @export #' @@ -87,8 +86,7 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' @param ipgout_path Path to write the results -#' of the efetch run of the accessions +#' @param ipgout_path Path to write the results of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL #' @param plan A string specifying the parallelization strategy for the future #' package, such as `"sequential"` or `"multisession"`. 
@@ -122,9 +120,7 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_pa #' #' @author Samuel Chen, Janani Ravi #' -#' @description Perform efetch on the ipg database -#' and write the results to out_path -#' +#' @description Perform efetch on the ipg database and write the results to out_path #' @param accnums Character vector containing the accession numbers to query on #' the ipg database #' @param out_path Path to write the efetch results to diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index db63024f..047e2652 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -23,8 +23,7 @@ the ipg database} No return value. The function writes the fetched results to \code{out_path}. } \description{ -Perform efetch on the ipg database -and write the results to out_path +Perform efetch on the ipg database and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index e3fc7ce4..0285c0b2 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,7 +8,6 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s -and handles any errors or warnings that occur during the process. 
} \description{ Sink Reset From cdac9a3cc8a446596474fdc27892c2cc5fffbb3b Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 15:27:32 -0600 Subject: [PATCH 50/61] let R sort NAMESPACE --- NAMESPACE | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 025f00cf..d91f16c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,9 +34,9 @@ export(createFA2Tree) export(createGenomicContextNetwork) export(createJobResultsURL) export(createJobStatusEmailMessage) +export(createLineageLookup) export(createMSA_Kalign) export(createMSA_PDF) -export(createLineageLookup) export(createRepresentativeAccNum) export(createUndirectedGenomicContextNetwork) export(createWordCloud2Element) @@ -55,7 +55,6 @@ export(getTopAccByLinDomArch) export(mapAcc2Name) export(mapAdvOption2Process) export(mapOption2Process) -export(msa_pdf) export(plotEstimatedWallTimes) export(plotIPR2Viz) export(plotIPR2VizWeb) From 22504868261a7a56fa93c4889ac42a9becb66fff Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 15:33:13 -0600 Subject: [PATCH 51/61] function/doc consistency --- R/networks_gencontext.R | 4 ++-- man/createUndirectedGenomicContextNetwork.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/networks_gencontext.R b/R/networks_gencontext.R index 7df6c270..ca1ef52d 100755 --- a/R/networks_gencontext.R +++ b/R/networks_gencontext.R @@ -8,7 +8,7 @@ ## GC Undirected Network ## ########################### -#' Domain Network +#' createUndirectedGenomicContextNetwork #' #' @description #' This function creates a domain network from the 'DomArch' column. @@ -37,7 +37,7 @@ #' #' @examples #' \dontrun{ -#' domain_network(pspa) +#' createUndirectedGenomicContextNetwork(pspa) #' } createUndirectedGenomicContextNetwork <- function(prot, column = "GenContext", domains_of_interest, cutoff_type = "Lineage", cutoff = 1, layout = "grid") { # by domain networks or all, as required. 
diff --git a/man/createUndirectedGenomicContextNetwork.Rd b/man/createUndirectedGenomicContextNetwork.Rd index d61c23df..b74da141 100644 --- a/man/createUndirectedGenomicContextNetwork.Rd +++ b/man/createUndirectedGenomicContextNetwork.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/networks_gencontext.R \name{createUndirectedGenomicContextNetwork} \alias{createUndirectedGenomicContextNetwork} -\title{Domain Network} +\title{createUndirectedGenomicContextNetwork} \usage{ createUndirectedGenomicContextNetwork( prot, @@ -35,6 +35,6 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -domain_network(pspa) +createUndirectedGenomicContextNetwork(pspa) } } From 6632fe4cc4a26451c17831ef25a5a03fa182bb81 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 16:01:06 -0600 Subject: [PATCH 52/61] replace rd --- man/acc2FA.Rd | 31 +++++++++++++++---------------- man/acc2fa.Rd | 38 -------------------------------------- 2 files changed, 15 insertions(+), 54 deletions(-) delete mode 100644 man/acc2fa.Rd diff --git a/man/acc2FA.Rd b/man/acc2FA.Rd index 6c6ea43c..517ee3d6 100644 --- a/man/acc2FA.Rd +++ b/man/acc2FA.Rd @@ -1,35 +1,34 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CHANGED-pre-msa-tree.R -\name{acc2FA} -\alias{acc2FA} -\title{acc2FA converts protein accession numbers to a fasta format.} +% Please edit documentation in R/pre-msa-tree.R +\name{acc2fa} +\alias{acc2fa} +\title{acc2fa} \usage{ -acc2FA(accessions, outpath, plan = "sequential") +acc2fa(accessions, outpath, plan = "sequential") } \arguments{ \item{accessions}{Character vector containing protein accession numbers to -generate fasta sequences for. -Function may not work for vectors of length > 10,000} +generate fasta sequences for. Function may not work for vectors of +length > 10,000} -\item{outpath}{\link{str} Location where fasta file should be written to.} +\item{outpath}{\link{str}. 
Location where fasta file should be written to.} -\item{plan}{Character string specifying the parallel processing strategy to -use with the \code{future} package. Default is "sequential".} +\item{plan}{Character. The plan to use for processing. Default is "sequential".} } \value{ -A logical value indicating whether the retrieval and conversion were -successful. Returns \code{TRUE} if successful and \code{FALSE} otherwise. +A Fasta file is written to the specified \code{outpath}. } \description{ +acc2fa converts protein accession numbers to a fasta format. Resulting fasta file is written to the outpath. } \examples{ \dontrun{ -acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") -Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", -"O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") +Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") +EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +acc2fa(outpath = "ebi.fa") } } \author{ diff --git a/man/acc2fa.Rd b/man/acc2fa.Rd deleted file mode 100644 index 517ee3d6..00000000 --- a/man/acc2fa.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{acc2fa} -\alias{acc2fa} -\title{acc2fa} -\usage{ -acc2fa(accessions, outpath, plan = "sequential") -} -\arguments{ -\item{accessions}{Character vector containing protein accession numbers to -generate fasta sequences for. Function may not work for vectors of -length > 10,000} - -\item{outpath}{\link{str}. Location where fasta file should be written to.} - -\item{plan}{Character. The plan to use for processing. Default is "sequential".} -} -\value{ -A Fasta file is written to the specified \code{outpath}. 
-} -\description{ -acc2fa converts protein accession numbers to a fasta format. -Resulting fasta file is written to the outpath. -} -\examples{ -\dontrun{ -acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), -outpath = "my_proteins.fasta") -Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> -acc2fa(outpath = "ebi.fa") -} -} -\author{ -Samuel Chen, Janani Ravi -} -\keyword{accnum,} -\keyword{fasta} From 5bedeee27a7fbd20eb17847bc1e4833d09f9d439 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Thu, 24 Oct 2024 13:45:02 -0600 Subject: [PATCH 53/61] update .Rd --- man/generateAllAlignments2FA.Rd | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/man/generateAllAlignments2FA.Rd b/man/generateAllAlignments2FA.Rd index 5babd22d..421d8cf7 100644 --- a/man/generateAllAlignments2FA.Rd +++ b/man/generateAllAlignments2FA.Rd @@ -22,18 +22,21 @@ generateAllAlignments2FA( \item{aln_path}{Character. Path to alignment files. Default is 'here("data/rawdata_aln/")'} -\item{fa_outpath}{Character. Path to the written fasta file. -Default is 'here("data/alns/")'.} - -\item{lin_file}{Character. Path to file. Master protein file with AccNum & +\item{fa_outpath}{Character. Path to file. Master protein file with AccNum & lineages. Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} +\item{lin_file}{Character. Path to the written fasta file. +Default is 'here("data/alns/")'.} + \item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. Default is 'FALSE'.} } \value{ +NULL. The function saves the output FASTA files to the specified +directory. + NULL. The function saves the output FASTA files to the specified directory. } @@ -47,6 +50,12 @@ Adding Leaves to an alignment file w/ accessions Adding Leaves to all alignment files w/ accessions & DAs? 
} \details{ +The alignment files would need two columns separated by spaces: +\enumerate{ +\item AccNum and 2. alignment. The protein homolog file should have AccNum, +Species, Lineages. +} + The alignment files would need two columns separated by spaces: \enumerate{ \item AccNum and 2. alignment. The protein homolog file should have AccNum, @@ -54,6 +63,9 @@ Species, Lineages. } } \note{ +Please refer to the source code if you have alternate + file formats +and/or column names. + Please refer to the source code if you have alternate + file formats and/or column names. } @@ -64,12 +76,6 @@ generateAllAlignments2FA() \dontrun{ generateAllAlignments2FA() } -\dontrun{ -generateAllAlignments2FA() -} -} -\author{ -Janani Ravi } \keyword{accnum,} \keyword{alignment,} From cb76c69eba5586c255834a370bc7ffa035700b8c Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 26 Oct 2024 12:40:34 +0300 Subject: [PATCH 54/61] change 'print(results)' to 'results' for brevity and to avoid potential issues Signed-off-by: Awa Synthia --- R/CHANGED-pre-msa-tree.R | 77 +++++++++-------- R/blastWrappers.R | 17 ++-- R/fa2domain.R | 10 +-- R/ipr2viz.R | 98 ++++++++++----------- R/lineage.R | 78 ++++++++--------- R/plotme.R | 6 +- R/plotting.R | 138 +++++++++++++++--------------- R/pre-msa-tree.R | 74 ++++++++-------- R/reverse_operons.R | 16 ++-- man/acc2FA.Rd | 8 +- man/addName.Rd | 2 +- man/addTaxID.Rd | 2 +- man/alignFasta.Rd | 6 +- man/convert2TitleCase.Rd | 4 +- man/createRepresentativeAccNum.Rd | 9 +- man/downloadAssemblySummary.Rd | 2 +- man/getAccNumFromFA.Rd | 5 +- man/getTopAccByLinDomArch.Rd | 4 +- man/mapAcc2Name.Rd | 4 +- man/plotIPR2Viz.Rd | 18 ++-- man/plotIPR2VizWeb.Rd | 18 ++-- man/plotLineageSunburst.Rd | 2 +- man/prepareColumnParams.Rd | 2 +- man/prepareSingleColumnParams.Rd | 2 +- man/proteinAcc2TaxID.Rd | 4 +- man/renameFA.Rd | 2 +- man/rename_fasta.Rd | 2 +- man/reverseOperonSeq.Rd | 2 +- man/runDeltaBlast.Rd | 11 ++- man/runIPRScan.Rd | 2 +- man/shortenLineage.Rd | 4 
+- man/writeMSA_AA2FA.Rd | 3 + 32 files changed, 329 insertions(+), 303 deletions(-) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index 40bd672e..48d1abf9 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -47,7 +47,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @examples #' # Convert a single string to title case #' convert2TitleCase("hello world") # Returns "Hello World" -#' +#' convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -80,7 +80,7 @@ convert2TitleCase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return A data frame containing the enriched alignment data with lineage +#' @return A data frame containing the enriched alignment data with lineage #' information. #' #' @details The alignment file would need two columns: 1. accession + @@ -215,7 +215,7 @@ addLeaves2Alignment <- function(aln_file = "", #' Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") #' ) #' enriched_data <- addName(data) -#' print(enriched_data) +#' enriched_data addName <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { @@ -292,7 +292,7 @@ addName <- function(data, #' file formats and/or column names. #' #' @return A character string representing the FASTA formatted sequences. -#' If `fa_outpath` is provided, the FASTA will also be saved to the specified +#' If `fa_outpath` is provided, the FASTA will also be saved to the specified #' file. #' @export #' @@ -336,22 +336,22 @@ convertAlignment2FA <- function(aln_file = "", } #' mapAcc2Name -#' +#' #' @description #' Default renameFA() replacement function. 
Maps an accession number to its name #' #' @param line The line of a fasta file starting with '>' -#' @param acc2name Data Table containing a column of accession numbers and a +#' @param acc2name Data Table containing a column of accession numbers and a #' name column #' @param acc_col Name of the column containing Accession numbers -#' @param name_col Name of the column containing the names that the accession +#' @param name_col Name of the column containing the names that the accession #' numbers #' are mapped to #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return A character string representing the updated FASTA line, where the +#' @return A character string representing the updated FASTA line, where the #' accession number is replaced with its corresponding name. #' @export #' @@ -389,7 +389,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { #' #' @examples #' \dontrun{ -#' renameFA("path/to/input.fasta", +#' renameFA("path/to/input.fasta", #' "path/to/output.fasta", mapAcc2Name, acc2name) #' } renameFA <- function(fa_path, outpath, @@ -411,8 +411,8 @@ renameFA <- function(fa_path, outpath, ################################ ## generateAllAlignments2FA #' generateAllAlignments2FA -#' -#' @description +#' +#' @description #' Adding Leaves to an alignment file w/ accessions #' #' @keywords alignment, accnum, leaves, lineage, species @@ -420,25 +420,25 @@ renameFA <- function(fa_path, outpath, #' #' @param aln_path Character. Path to alignment files. #' Default is 'here("data/rawdata_aln/")' -#' @param fa_outpath Character. Path to file. Master protein file with AccNum & +#' @param fa_outpath Character. Path to file. Master protein file with AccNum & #' lineages. #' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' #' @param lin_file Character. Path to the written fasta file. #' Default is 'here("data/alns/")'. -#' @param reduced Boolean. 
If TRUE, the fasta file will contain only one +#' @param reduced Boolean. If TRUE, the fasta file will contain only one #' sequence per lineage. #' Default is 'FALSE'. #' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return NULL. The function saves the output FASTA files to the specified +#' @return NULL. The function saves the output FASTA files to the specified #' directory. #' -#' @details The alignment files would need two columns separated by spaces: -#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, +#' @details The alignment files would need two columns separated by spaces: +#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, #' Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats +#' @note Please refer to the source code if you have alternate + file formats #' and/or column names. #' #' @export @@ -478,20 +478,20 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), # accessions <- c("P12345","Q9UHC1","O15530","Q14624","P0DTD1") # accessions <- rep("ANY95992.1", 201) -#' acc2FA +#' acc2FA #' #' @description -#' converts protein accession numbers to a fasta format. Resulting +#' converts protein accession numbers to a fasta format. Resulting #' fasta file is written to the outpath. #' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' -#' @param accessions Character vector containing protein accession numbers to +#' @param accessions Character vector containing protein accession numbers to #' generate fasta sequences for. #' Function may not work for vectors of length > 10,000 #' @param outpath [str] Location where fasta file should be written to. -#' @param plan Character string specifying the parallel processing strategy to +#' @param plan Character string specifying the parallel processing strategy to #' use with the `future` package. Default is "sequential". 
#' #' @importFrom Biostrings readAAStringSet @@ -499,16 +499,16 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return A logical value indicating whether the retrieval and conversion were +#' @return A logical value indicating whether the retrieval and conversion were #' successful. Returns `TRUE` if successful and `FALSE` otherwise. #' @export #' #' @examples #' \dontrun{ -#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), #' outpath = "my_proteins.fasta") #' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", +#' EBI:accessions <- c("P12345", "Q9UHC1", #' "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") #' } acc2FA <- function(accessions, outpath, plan = "sequential") { @@ -583,9 +583,9 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { } #' createRepresentativeAccNum -#' +#' #' @description -#' Function to generate a vector of one Accession number per distinct +#' Function to generate a vector of one Accession number per distinct #' observation from 'reduced' column #' #' @author Samuel Chen, Janani Ravi @@ -599,15 +599,18 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return A character vector containing one Accession number per distinct +#' @return A character vector containing one Accession number per distinct #' observation from the specified reduced column. 
#' @export #' #' @examples +#' \dontrun{ +#' createRepresentativeAccNum(prot) +#' } createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one + # Get Unique reduced column and then bind the AccNums back to get one # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -635,16 +638,16 @@ createRepresentativeAccNum <- function(prot_data, } #' alignFasta -#' +#' #' @description #' Perform a Multiple Sequence Alignment on a FASTA file. #' #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' @param tool Type of alignment tool to use. One of three options: "Muscle", #' "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. +#' @param outpath Path to write the resulting alignment to as a FASTA file. #' If NULL, no file is written #' #' @importFrom Biostrings readAAStringSet @@ -655,7 +658,7 @@ createRepresentativeAccNum <- function(prot_data, #' #' @examples #' \dontrun{ -#' aligned_sequences <- alignFasta("my_sequences.fasta", +#' aligned_sequences <- alignFasta("my_sequences.fasta", #' tool = "Muscle", outpath = "aligned_output.fasta") #' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { @@ -690,7 +693,10 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @export #' #' @examples -writeMSA_AA2FA <- function(alignment, outpath) { +#' \dontrun{ +#' writeMSA_AA2FA("my_sequences.fasta", outpath = "aligned_output.fasta") +#' } +writeMSA_AA2FA <- function(writeMSA_AA2FA, outpath) { l <- length(rownames(alignment)) fasta <- "" for (i in 1:l) @@ -705,7 +711,7 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' getAccNumFromFA #' -#' @param fasta_file Character. 
The path to the FASTA file from which +#' @param fasta_file Character. The path to the FASTA file from which #' accession numbers will be extracted. #' #' @importFrom stringi stri_extract_all_regex @@ -714,6 +720,9 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' @export #' #' @examples +#' \dontrun{ +#' getAccNumFromFA("my_sequences.fasta") +#' } getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] diff --git a/R/blastWrappers.R b/R/blastWrappers.R index d89f9b95..3c9c4192 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -4,8 +4,8 @@ #' #' @author Samuel Chen, Janani Ravi #' @description -#' This function executes a Delta-BLAST search using the specified parameters -#' and database. It sets the BLAST database path, runs the Delta-BLAST command +#' This function executes a Delta-BLAST search using the specified parameters +#' and database. It sets the BLAST database path, runs the Delta-BLAST command #' with the given query, and outputs the results. #' #' @param deltablast_path Path to the Delta-BLAST executable. @@ -17,12 +17,15 @@ #' @param num_alignments Number of alignments to report. #' @param num_threads Number of threads to use for the search (default is 1). #' -#' @return This function does not return a value; it outputs results to the +#' @return This function does not return a value; it outputs results to the #' specified file. 
#' @export
#'
#' @examples
-runDeltaBlast <- function(deltablast_path, db_search_path,
+#' \dontrun{
+#' runDeltaBlast(deltablast_path, db_search_path)
+#' }
+runDeltaBlast <- function(deltablast_path, db_search_path,
    db = "refseq", query, evalue = "1e-5",
    out, num_alignments, num_threads = 1) {
    start <- Sys.time()
@@ -49,8 +52,8 @@ runDeltaBlast <- function(deltablast_path, db_search_path,

#' Run RPSBLAST to generate domain architectures for proteins of interest
#'
#' @description
-#' This function executes an RPS-BLAST search to generate domain architectures 
-#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST 
+#' This function executes an RPS-BLAST search to generate domain architectures
+#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST
#' command with the provided query, and outputs the results.
#'
#' @param rpsblast_path Path to the RPS-BLAST executable.
@@ -61,7 +64,7 @@ runDeltaBlast <- function(deltablast_path, db_search_path,
#' @param out Path to the output file where results will be saved.
#' @param num_threads Number of threads to use for the search (default is 1).
#'
-#' @return This function does not return a value; it outputs results to the 
+#' @return This function does not return a value; it outputs results to the
#' specified file.
#' @export
#'
diff --git a/R/fa2domain.R b/R/fa2domain.R
index 29803b85..f53322ca 100644
--- a/R/fa2domain.R
+++ b/R/fa2domain.R
@@ -5,18 +5,18 @@
# interproscan CLI will return a completely empty file (0Bytes)

#' runIPRScan
-#' 
+#'
-#' Run InterProScan on a given FASTA file and save the results to an 
+#' Run InterProScan on a given FASTA file and save the results to an
#' output file.
#'
#' @param filepath_fasta A string representing the path to the input FASTA file.
#' @param filepath_out A string representing the base path for the output file. 
-#' @param appl A character vector specifying the InterProScan applications to +#' @param appl A character vector specifying the InterProScan applications to #' use (e.g., "Pfam", "Gene3D"). Default is `c("Pfam", "Gene3D")`. #' #' @importFrom stringr str_glue #' -#' @return A data frame containing the results from the InterProScan output +#' @return A data frame containing the results from the InterProScan output #' TSV file. #' #' @examples @@ -26,7 +26,7 @@ #' filepath_out = "path/to/output_file", #' appl = c("Pfam", "Gene3D") #' ) -#' print(results) +#' results #' } runIPRScan <- function( filepath_fasta, diff --git a/R/ipr2viz.R b/R/ipr2viz.R index c976276d..e582ab09 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -23,7 +23,7 @@ #' @export #' @examples #' library(ggplot2) -#' +#' #' # Create a sample plot using the custom theme #' ggplot(mtcars, aes(x = wt, y = mpg)) + #' geom_point() + @@ -51,15 +51,15 @@ themeGenes2 <- function() { #' getTopAccByLinDomArch #' @description Group by lineage + DA then take top 20 #' -#' @param infile_full A data frame containing the full dataset with lineage and +#' @param infile_full A data frame containing the full dataset with lineage and #' domain architecture information. -#' @param DA_col A string representing the name of the domain architecture +#' @param DA_col A string representing the name of the domain architecture #' column. Default is "DomArch.Pfam". -#' @param lin_col A string representing the name of the lineage column. +#' @param lin_col A string representing the name of the lineage column. #' Default is "Lineage_short". -#' @param n An integer specifying the number of top accession numbers to return. +#' @param n An integer specifying the number of top accession numbers to return. #' Default is 20. -#' @param query A string for filtering a specific query name. If it is not +#' @param query A string for filtering a specific query name. If it is not #' "All", only the data matching this query will be processed. 
#' #' @importFrom dplyr arrange filter group_by select summarise @@ -68,14 +68,14 @@ themeGenes2 <- function() { #' @importFrom rlang sym #' @importFrom rlang .data #' -#' @return A vector of the top N accession numbers (`AccNum`) based on counts +#' @return A vector of the top N accession numbers (`AccNum`) based on counts #' grouped by lineage and domain architecture. #' @export #' #' @examples #' \dontrun{ -#' top_accessions <- getTopAccByLinDomArch(infile_full = my_data, -#' DA_col = "DomArch.Pfam", lin_col = "Lineage_short", +#' top_accessions <- getTopAccByLinDomArch(infile_full = my_data, +#' DA_col = "DomArch.Pfam", lin_col = "Lineage_short", #' n = 20, query = "specific_query_name") #' } getTopAccByLinDomArch <- function(infile_full, @@ -113,26 +113,26 @@ getTopAccByLinDomArch <- function(infile_full, ############################################# #' plotIPR2Viz #' -#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' @param infile_ipr A path to the input IPR file (TSV format) containing #' domain information. -#' @param infile_full A path to the full input file (TSV format) containing +#' @param infile_full A path to the full input file (TSV format) containing #' lineage and accession information. -#' @param accessions A character vector of accession numbers to filter the +#' @param accessions A character vector of accession numbers to filter the #' analysis. Default is an empty vector. -#' @param analysis A character vector specifying the types of analysis to -#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a #' vector of these analyses. -#' @param group_by A string specifying how to group the visualization. +#' @param group_by A string specifying how to group the visualization. #' Default is "Analysis". Options include "Analysis" or "Query". 
-#' @param topn An integer specifying the number of top accessions to visualize. +#' @param topn An integer specifying the number of top accessions to visualize. #' Default is 20. -#' @param name A string representing the name to use for y-axis labels. +#' @param name A string representing the name to use for y-axis labels. #' Default is "Name". -#' @param text_size An integer specifying the text size for the plot. +#' @param text_size An integer specifying the text size for the plot. #' Default is 15. -#' @param query A string for filtering a specific query name. If it is not +#' @param query A string for filtering a specific query name. If it is not #' "All", only the data matching this query will be processed. -#' +#' #' @importFrom dplyr distinct filter select #' @importFrom gggenes geom_gene_arrow geom_subgene_arrow #' @importFrom ggplot2 aes aes_string as_labeller element_text facet_wrap ggplot guides margin scale_fill_manual theme theme_minimal unit ylab @@ -145,16 +145,16 @@ getTopAccByLinDomArch <- function(infile_full, #' #' @examples #' \dontrun{ -#' plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", -#' infile_full = "path/to/full_file.tsv", -#' accessions = c("ACC123", "ACC456"), -#' analysis = c("Pfam", "TMHMM"), -#' group_by = "Analysis", -#' topn = 20, -#' name = "Gene Name", -#' text_size = 15, +#' plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", +#' infile_full = "path/to/full_file.tsv", +#' accessions = c("ACC123", "ACC456"), +#' analysis = c("Pfam", "TMHMM"), +#' group_by = "Analysis", +#' topn = 20, +#' name = "Gene Name", +#' text_size = 15, #' query = "All") -#' print(plot) +#' plot #' } plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), @@ -291,24 +291,24 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' plotIPR2VizWeb #' -#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' @param 
infile_ipr A path to the input IPR file (TSV format) containing #' domain information. -#' @param accessions A character vector of accession numbers to filter the +#' @param accessions A character vector of accession numbers to filter the #' analysis. -#' @param analysis A character vector specifying the types of analysis to -#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector #' of these analyses. -#' @param group_by A string specifying how to group the visualization. +#' @param group_by A string specifying how to group the visualization. #' Default is "Analysis". Options include "Analysis" or "Query". -#' @param name A string representing the name to use for y-axis labels. +#' @param name A string representing the name to use for y-axis labels. #' Default is "Name". -#' @param text_size An integer specifying the text size for the plot. +#' @param text_size An integer specifying the text size for the plot. #' Default is 15. -#' @param legend_name A string representing the column to use for legend labels. +#' @param legend_name A string representing the column to use for legend labels. #' Default is "ShortName". -#' @param cols An integer specifying the number of columns in the facet wrap. +#' @param cols An integer specifying the number of columns in the facet wrap. #' Default is 5. -#' @param rows An integer specifying the number of rows in the legend. +#' @param rows An integer specifying the number of rows in the legend. #' Default is 10. 
#' #' @importFrom dplyr arrange distinct filter select @@ -317,22 +317,22 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' @importFrom readr read_tsv #' @importFrom tidyr pivot_wider #' -#' @return A ggplot object representing the domain architecture visualization +#' @return A ggplot object representing the domain architecture visualization #' for web display. #' @export #' #' @examples #' \dontrun{ -#' plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", -#' accessions = c("ACC123", "ACC456"), -#' analysis = c("Pfam", "TMHMM"), -#' group_by = "Analysis", -#' name = "Gene Name", -#' text_size = 15, -#' legend_name = "ShortName", -#' cols = 5, +#' plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", +#' accessions = c("ACC123", "ACC456"), +#' analysis = c("Pfam", "TMHMM"), +#' group_by = "Analysis", +#' name = "Gene Name", +#' text_size = 15, +#' legend_name = "ShortName", +#' cols = 5, #' rows = 10) -#' print(plot) +#' plot #' } plotIPR2VizWeb <- function(infile_ipr, accessions, diff --git a/R/lineage.R b/R/lineage.R index 73fa008a..46249c91 100644 --- a/R/lineage.R +++ b/R/lineage.R @@ -11,22 +11,22 @@ #' #' @author Samuel Chen, Janani Ravi #' -#' @param outpath String of path where the assembly summary file should be +#' @param outpath String of path where the assembly summary file should be #' written -#' @param keep Character vector containing which columns should be retained and +#' @param keep Character vector containing which columns should be retained and #' downloaded #' #' @importFrom data.table fwrite setnames #' @importFrom dplyr bind_rows select #' @importFrom biomartr getKingdomAssemblySummary #' -#' @return A tab-separated file containing the assembly summary. The function +#' @return A tab-separated file containing the assembly summary. The function #' does notreturn any value but writes the output directly to the specified file. 
#' @export #' #' @examples #' \dontrun{ -#' downloadAssemblySummary(outpath = "assembly_summary.tsv", +#' downloadAssemblySummary(outpath = "assembly_summary.tsv", #' keep = c("assembly_accession", "taxid", "organism_name")) #' } downloadAssemblySummary <- function(outpath, @@ -85,16 +85,16 @@ downloadAssemblySummary <- function(outpath, #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the #' "createLineageLookup()" function -#' @param acc_col Character. The name of the column in `prot_data` containing +#' @param acc_col Character. The name of the column in `prot_data` containing #' accession numbers. Default is "AccNum". #' #' @importFrom dplyr pull #' @importFrom data.table fread setnames #' -#' @return A dataframe containing the merged information of GCA_IDs, TaxIDs, -#' and their corresponding lineage up to the phylum level. The dataframe +#' @return A dataframe containing the merged information of GCA_IDs, TaxIDs, +#' and their corresponding lineage up to the phylum level. The dataframe #' will include information from the input `prot_data` and lineage data. -#' +#' #' @export #' #' @examples @@ -151,25 +151,25 @@ GCA2Lineage <- function(prot_data, ################################### #' addLineage #' -#' @param df Dataframe containing accession numbers. The dataframe should +#' @param df Dataframe containing accession numbers. The dataframe should #' have a column specified by `acc_col` that contains these accession numbers. -#' @param acc_col Character. The name of the column in `df` containing +#' @param acc_col Character. The name of the column in `df` containing #' accession numbers. Default is "AccNum". -#' @param assembly_path String. The path to the assembly summary file generated +#' @param assembly_path String. The path to the assembly summary file generated #' using the `downloadAssemblySummary()` function. -#' @param lineagelookup_path String. 
The path to the lineage lookup file (taxid +#' @param lineagelookup_path String. The path to the lineage lookup file (taxid #' to lineage mapping) generated using the `create_lineage_lookup()` function. -#' @param ipgout_path String. Optional path to save intermediate output files. +#' @param ipgout_path String. Optional path to save intermediate output files. #' Default is NULL. -#' @param plan Character. Specifies the execution plan for parallel processing. +#' @param plan Character. Specifies the execution plan for parallel processing. #' Default is "multicore". #' #' @importFrom dplyr pull #' @importFrom rlang sym #' -#' @return A dataframe that combines the original dataframe `df` with lineage +#' @return A dataframe that combines the original dataframe `df` with lineage #' information retrieved based on the provided accession numbers. -#' +#' #' @export #' #' @examples @@ -224,11 +224,11 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' (taxid to lineage mapping). This file can be generated using the #' @param ipgout_path Path to write the results of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL -#' @param plan Character. Specifies the execution plan for parallel processing. +#' @param plan Character. Specifies the execution plan for parallel processing. #' Default is "multicore". #' -#' @return A dataframe containing lineage information mapped to the given protein -#' accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, +#' @return A dataframe containing lineage information mapped to the given protein +#' accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, #' Protein, Protein Name, Species, and Lineage. 
#' @export #' @@ -276,16 +276,16 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, #' @param accessions Character vector containing the accession numbers to query on #' the ipg database #' @param out_path Path to write the efetch results to -#' @param plan Character. Specifies the execution plan for parallel processing. +#' @param plan Character. Specifies the execution plan for parallel processing. #' Default is "multicore". #' #' @importFrom future future plan #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return The function does not return a value but writes the efetch results +#' @return The function does not return a value but writes the efetch results #' directly to the specified `out_path`. -#' +#' #' @export #' #' @examples @@ -363,7 +363,7 @@ efetchIPG <- function(accessions, out_path, plan = "multicore") { #' #' @importFrom data.table fread setnames #' -#' @return A data table containing protein accessions along with their +#' @return A data table containing protein accessions along with their #' corresponding TaxIDs and lineage information. #' @export #' @@ -444,14 +444,14 @@ IPG2Lineage <- function(accessions, ipg_file, #' addTaxID #' #' @param data A data frame or data table containing protein accession numbers. -#' @param acc_col A string specifying the column name in `data` that contains +#' @param acc_col A string specifying the column name in `data` that contains #' the accession numbers. Defaults to "AccNum". -#' @param version A logical indicating whether to remove the last two characters +#' @param version A logical indicating whether to remove the last two characters #' from the accession numbers for TaxID retrieval. Defaults to TRUE. #' #' @importFrom data.table as.data.table #' -#' @return A data table that includes the original data along with a new column +#' @return A data table that includes the original data along with a new column #' containing the corresponding TaxIDs. 
#' @export #' @@ -460,7 +460,7 @@ IPG2Lineage <- function(accessions, ipg_file, #' # Create a sample data table with accession numbers #' sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) #' enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) -#' print(enriched_data) +#' enriched_data #' } addTaxID <- function(data, acc_col = "AccNum", version = T) { if (!is.data.table(data)) { @@ -490,19 +490,19 @@ addTaxID <- function(data, acc_col = "AccNum", version = T) { ################################## #' proteinAcc2TaxID #' -#' @param accnums A character vector of protein accession numbers to be mapped +#' @param accnums A character vector of protein accession numbers to be mapped #' to TaxIDs. -#' @param suffix A string suffix used to name the output file generated by the +#' @param suffix A string suffix used to name the output file generated by the #' script. -#' @param out_path A string specifying the directory where the output file will +#' @param out_path A string specifying the directory where the output file will #' be saved. -#' @param return_dt A logical indicating whether to return the result as a data -#' table. Defaults to FALSE. If TRUE, the output file is read into a data table +#' @param return_dt A logical indicating whether to return the result as a data +#' table. Defaults to FALSE. If TRUE, the output file is read into a data table #' and returned. #' #' @importFrom data.table fread #' -#' @return If `return_dt` is TRUE, a data table containing the mapping of protein +#' @return If `return_dt` is TRUE, a data table containing the mapping of protein #' accession numbers to TaxIDs. If FALSE, the function returns NULL. 
#' @export #' @@ -510,9 +510,9 @@ addTaxID <- function(data, acc_col = "AccNum", version = T) { #' \dontrun{ #' # Example accession numbers #' accessions <- c("ABC123", "XYZ456", "LMN789") -#' tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +#' tax_data <- proteinAcc2TaxID(accessions, suffix = "example", #' out_path = "/path/to/output", return_dt = TRUE) -#' print(tax_data) +#' tax_data #' } proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { # Write accnums to a file @@ -538,17 +538,17 @@ proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { #' @description Perform elink to go from protein database to taxonomy database #' and write the resulting file of taxid and lineage to out_path #' -#' @param accessions A character vector containing the accession numbers to query +#' @param accessions A character vector containing the accession numbers to query #' in the protein database. -#' @param out_path A string specifying the path where the results of the query +#' @param out_path A string specifying the path where the results of the query #' will be written. If set to NULL, a temporary directory will be used. -#' @param plan A character string that specifies the execution plan for parallel +#' @param plan A character string that specifies the execution plan for parallel #' processing. The default is "multicore". #' #' @importFrom future plan #' @importFrom purrr map #' -#' @return This function does not return a value. It writes the results to the +#' @return This function does not return a value. It writes the results to the #' specified output path. 
#' @export #' diff --git a/R/plotme.R b/R/plotme.R index 3527f170..3cfd54f8 100644 --- a/R/plotme.R +++ b/R/plotme.R @@ -83,7 +83,7 @@ plotTreemap <- function(count_data, fill_by_n = FALSE, sort_by_n = FALSE) { #' count_data <- data.frame(Category = c("A", "B", "C"), #' n = c(10, 20, 15)) #' params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) -#' print(params) +#' params #' } prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { validateCountDF(count_data) @@ -128,7 +128,7 @@ prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { #' @importFrom dplyr c_across group_by mutate rowwise select summarise ungroup #' @importFrom stringr str_glue #' -#' @return A data frame containing parameters for the specified column for +#' @return A data frame containing parameters for the specified column for #' treemap visualization. #' @export #' @@ -137,7 +137,7 @@ prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { #' df <- data.frame(Category = c("A", "A", "B", "B", "C"), #' n = c(10, 20, 30, 40, 50)) #' params <- prepareSingleColumnParams(df, col_num = 1, root = "Root") -#' print(params) +#' params #' } prepareSingleColumnParams <- function(df, col_num, diff --git a/R/plotting.R b/R/plotting.R index b9a2758a..102ab6af 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -21,31 +21,31 @@ #' Shorten Lineage Names #' #' @description -#' This function abbreviates lineage names by shortening the first part of the -#' string (up to a given delimiter). +#' This function abbreviates lineage names by shortening the first part of the +#' string (up to a given delimiter). #' -#' @param data A data frame that contains a column with lineage names to be +#' @param data A data frame that contains a column with lineage names to be #' shortened. -#' @param colname Character. The name of the column in the data frame containing +#' @param colname Character. 
The name of the column in the data frame containing #' the lineage strings to be shortened. Default is `"Lineage"`. -#' @param abr_len Integer. The number of characters to retain after the first -#' letter. If set to 1, only the first letter of each segment before the +#' @param abr_len Integer. The number of characters to retain after the first +#' letter. If set to 1, only the first letter of each segment before the #' delimiter (`>`) is retained. Default is 1. #' #' @importFrom stringr str_locate #' @importFrom purrr pmap #' -#' @return A modified data frame where the specified lineage column has been +#' @return A modified data frame where the specified lineage column has been #' shortened. #' #' @export #' #' @examples #' \dontrun{ -#' df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", +#' df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", #' "Archaea>Euryarchaeota>Thermococci")) #' shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1) -#' print(shortened_df) +#' shortened_df #' } shortenLineage <- function(data, colname = "Lineage", abr_len = 1) { abbrv <- function(x) { @@ -82,17 +82,17 @@ shortenLineage <- function(data, colname = "Lineage", abr_len = 1) { #' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep", #' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep". #' @param cutoff Numeric. Cutoff for word frequency. Default is 90. -#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows +#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows #' based on a certain condition. Default is FALSE. -#' @param text.scale Allows scaling of axis title, tick lables, and numbers +#' @param text.scale Allows scaling of axis title, tick lables, and numbers #' above the intersection size bars. 
#' text.scale can either take a universal scale in the form of an integer, #' or a vector of specific scales in the format: c(intersection size title, #' intersection size tick labels, set size title, set size tick labels, set names, #' numbers above bars) -#' @param point.size Numeric. Sets the size of points in the UpSet plot. +#' @param point.size Numeric. Sets the size of points in the UpSet plot. #' Default is 2.2. -#' @param line.size Numeric. Sets the line width in the UpSet plot. +#' @param line.size Numeric. Sets the line width in the UpSet plot. #' Default is 0.8. #' #' @importFrom dplyr across distinct filter if_else mutate pull select where @@ -100,7 +100,7 @@ shortenLineage <- function(data, colname = "Lineage", abr_len = 1) { #' @importFrom stringr str_detect str_replace_all str_split #' @importFrom UpSetR upset #' -#' @return An UpSet plot object. The plot visualizes intersections of sets based +#' @return An UpSet plot object. The plot visualizes intersections of sets based #' on the provided colname in query_data. #' @export #' @@ -251,7 +251,7 @@ plotUpSet <- function(query_data = "toast_rack.sub", #' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep", #' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep". #' @param cutoff Numeric. Cutoff for word frequency. Default is 90. -#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows +#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows #' based on a certain condition. Default is FALSE. #' @param color Color for the heatmap. One of six options: "default", "magma", "inferno", #' "plasma", "viridis", or "cividis" @@ -354,13 +354,13 @@ plotLineageDA <- function(query_data = "prot", #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). #' Default is prot (variable w/ protein data). 
-#' @param queries Character Vector containing the queries that will be used for +#' @param queries Character Vector containing the queries that will be used for #' the categories. -#' @param colname Character. The column used for filtering based on the `queries`. +#' @param colname Character. The column used for filtering based on the `queries`. #' Default is "ClustName". -#' @param cutoff Numeric. The cutoff value for filtering rows based on their +#' @param cutoff Numeric. The cutoff value for filtering rows based on their #' total count. Rows with values below this cutoff are excluded. -#' @param color Character. Defines the color palette used for the heatmap. +#' @param color Character. Defines the color palette used for the heatmap. #' Default is a red gradient. #' #' @importFrom dplyr arrange desc filter group_by select summarise union @@ -371,8 +371,8 @@ plotLineageDA <- function(query_data = "prot", #' @importFrom tidyr drop_na #' @importFrom viridis scale_fill_viridis #' -#' @return A ggplot object representing a heatmap (tile plot) showing the -#' relationship between queries and lineages, with the intensity of color +#' @return A ggplot object representing a heatmap (tile plot) showing the +#' relationship between queries and lineages, with the intensity of color #' representing the count of matching records. #' @export #' @@ -503,8 +503,8 @@ plotLineageQuery <- function(query_data = all, #' @importFrom stringr str_replace_all #' @importFrom tidyr gather #' -#' @return A ggplot object representing a heatmap (tile plot) of lineage versus -#' the top neighboring domain architectures, with color intensity representing +#' @return A ggplot object representing a heatmap (tile plot) of lineage versus +#' the top neighboring domain architectures, with color intensity representing #' the frequency of occurrences. 
#' @export #' @@ -583,9 +583,9 @@ plotLineageNeighbors <- function(query_data = "prot", query = "pspa", #' Lineage Domain Repeats Plot #' -#' @param query_data Data frame containing protein homolog data, including +#' @param query_data Data frame containing protein homolog data, including #' relevant domain architectures and lineages. -#' @param colname Character. The name of the column in query_data that contains +#' @param colname Character. The name of the column in query_data that contains #' domain architectures or other structural information. #' #' @importFrom dplyr across mutate select where @@ -593,8 +593,8 @@ plotLineageNeighbors <- function(query_data = "prot", query = "pspa", #' @importFrom stringr str_count str_replace_all #' @importFrom tidyr gather #' -#' @return A ggplot object representing a heatmap (tile plot) of domain repeat -#' counts across different lineages, with color intensity representing the +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the #' occurrence of domains. #' @export #' @@ -679,8 +679,8 @@ plotLineageDomainRepeats <- function(query_data, colname) { #' @importFrom purrr map #' @importFrom stringr str_locate str_locate_all #' -#' @return A ggplot object representing a heatmap (tile plot) of domain repeat -#' counts across different lineages, with color intensity representing the +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the #' occurrence of domains. #' @export #' @@ -826,26 +826,26 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size #' Stacked Lineage Plot #' -#' @param prot Data frame containing protein data including domain architecture +#' @param prot Data frame containing protein data including domain architecture #' and lineage information. -#' @param column Character. 
The name of the column in prot representing domain +#' @param column Character. The name of the column in prot representing domain #' architectures (default is "DomArch"). -#' @param cutoff Numeric. A threshold value for filtering domain architectures +#' @param cutoff Numeric. A threshold value for filtering domain architectures #' or protein counts. -#' @param Lineage_col Character. The name of the column representing lineage +#' @param Lineage_col Character. The name of the column representing lineage #' data (default is "Lineage"). -#' @param xlabel Character. Label for the x-axis +#' @param xlabel Character. Label for the x-axis #' (default is "Domain Architecture"). -#' @param reduce_lineage Logical. Whether to shorten lineage names +#' @param reduce_lineage Logical. Whether to shorten lineage names #' (default is TRUE). #' @param label.size Numeric. The size of axis text labels (default is 8). -#' @param legend.position Numeric vector. Coordinates for placing the legend +#' @param legend.position Numeric vector. Coordinates for placing the legend #' (default is c(0.7, 0.4)). -#' @param legend.text.size Numeric. Size of the text in the legend +#' @param legend.text.size Numeric. Size of the text in the legend #' (default is 10). #' @param legend.cols Numeric. Number of columns in the legend (default is 2). #' @param legend.size Numeric. Size of the legend keys (default is 0.7). -#' @param coord_flip Logical. Whether to flip the coordinates of the plot +#' @param coord_flip Logical. Whether to flip the coordinates of the plot #' (default is TRUE). #' @param legend Logical. Whether to display the legend (default is TRUE). 
#' @@ -853,7 +853,7 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size #' @importFrom ggplot2 aes_string coord_flip element_blank element_line element_rect element_text geom_bar ggplot guides guide_legend scale_fill_manual xlab ylab theme theme_minimal #' @importFrom purrr map #' -#' @return A ggplot object representing a stacked bar plot showing the +#' @return A ggplot object representing a stacked bar plot showing the #' distribution of protein domain architectures across lineages. #' @export #' @@ -982,34 +982,34 @@ plotStackedLineage <- function(prot, column = "DomArch", cutoff, Lineage_col = " #' plotWordCloud3 #' -#' @param data Data frame or table containing words and their frequencies for +#' @param data Data frame or table containing words and their frequencies for #' the word cloud. #' @param size Numeric. Scaling factor for word sizes (default is 1). -#' @param minSize Numeric. Minimum font size for the smallest word +#' @param minSize Numeric. Minimum font size for the smallest word #' (default is 0). #' @param gridSize Numeric. Size of the grid for placing words (default is 0). -#' @param fontFamily Character. Font family to use for the words +#' @param fontFamily Character. Font family to use for the words #' (default is "Segoe UI"). #' @param fontWeight Character. Font weight for the words (default is "bold"). -#' @param color Character or vector. Color of the words. Use "random-dark" for +#' @param color Character or vector. Color of the words. Use "random-dark" for #' random dark colors (default) or specify a color. -#' @param backgroundColor Character. Background color of the word cloud +#' @param backgroundColor Character. Background color of the word cloud #' (default is "white"). -#' @param minRotation Numeric. Minimum rotation angle of words in radians +#' @param minRotation Numeric. Minimum rotation angle of words in radians #' (default is -π/4). -#' @param maxRotation Numeric. 
Maximum rotation angle of words in radians +#' @param maxRotation Numeric. Maximum rotation angle of words in radians #' (default is π/4). #' @param shuffle Logical. Whether to shuffle the words (default is TRUE). -#' @param rotateRatio Numeric. Proportion of words that are rotated +#' @param rotateRatio Numeric. Proportion of words that are rotated #' (default is 0.4). -#' @param shape Character. Shape of the word cloud ("circle" is default, but +#' @param shape Character. Shape of the word cloud ("circle" is default, but #' you can use "cardioid", "star", "triangle", etc.). #' @param ellipticity Numeric. Degree of ellipticity (default is 0.65). -#' @param widgetsize Numeric vector. Width and height of the widget +#' @param widgetsize Numeric vector. Width and height of the widget #' (default is NULL, which uses default size). -#' @param figPath Character. Path to an image file to use as a mask for the +#' @param figPath Character. Path to an image file to use as a mask for the #' word cloud (optional). -#' @param hoverFunction JS function. JavaScript function to run when hovering +#' @param hoverFunction JS function. JavaScript function to run when hovering #' over words (optional). #' #' @importFrom base64enc base64encode @@ -1082,11 +1082,11 @@ wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = " #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname Character. The name of the column in `query_data` to generate +#' @param colname Character. The name of the column in `query_data` to generate #' the word cloud from. Default is "DomArch". -#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' @param cutoff Numeric. The cutoff value for filtering elements based on their #' frequency. Default is 70. -#' @param UsingRowsCutoff Logical. 
Whether to use a row-based cutoff instead of +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of #' a frequency cutoff. Default is FALSE. #' #' @importFrom dplyr filter pull @@ -1094,7 +1094,7 @@ wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = " #' @importFrom rlang sym #' @importFrom wordcloud wordcloud #' -#' @return A word cloud plot showing the frequency of elements from the selected +#' @return A word cloud plot showing the frequency of elements from the selected #' column. #' @export #' @@ -1166,17 +1166,17 @@ createWordCloudElement <- function(query_data = "prot", #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname Character. The name of the column in `query_data` to generate +#' @param colname Character. The name of the column in `query_data` to generate #' the word cloud from. Default is "DomArch". -#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' @param cutoff Numeric. The cutoff value for filtering elements based on their #' frequency. Default is 70. -#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of #' a frequency cutoff. Default is FALSE. #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return A word cloud plot showing the frequency of elements from the selected +#' @return A word cloud plot showing the frequency of elements from the selected #' column. #' @export #' @@ -1240,22 +1240,22 @@ createWordCloud2Element <- function(query_data = "prot", #### Sunburst ##### #' Lineage Sunburst #' -#' @param prot Data frame containing a lineage column that the sunburst plot +#' @param prot Data frame containing a lineage column that the sunburst plot #' will be generated for -#' @param lineage_column String. 
Name of the lineage column within the +#' data frame. Defaults to "Lineage" -#' @param type String, either "sunburst" or "sund2b". If type is "sunburst", +#' a sunburst plot of the lineage #' @param levels Integer. Number of levels the sunburst will have. -#' @param colors A vector of colors for the sunburst plot. +#' If NULL, default colors are used. -#' @param legendOrder String vector. The order of the legend. If legendOrder +#' is NULL, -#' @param showLegend Boolean. If TRUE, the legend will be enabled when the +#' component first renders. -#' @param maxLevels Integer, the maximum number of levels to display in the -#' sunburst; 5 by default, NULL to disable then the legend will be in the -#' descending order of the top level hierarchy. will be rendered. If the type is +#' @param maxLevels Integer, the maximum number of levels to display in the +#' sunburst; 5 by default, NULL to disable. If type is "sunburst", a sunburst +#' plot of the lineage will be rendered. If the type is #' sund2b, a sund2b plot will be rendered. 
#' #' @importFrom d3r d3_nest @@ -1270,7 +1270,7 @@ createWordCloud2Element <- function(query_data = "prot", #' #' @examples #' \dontrun{ -#' plotLineageSunburst(prot, lineage_column = "Lineage", +#' plotLineageSunburst(prot, lineage_column = "Lineage", #' type = "sunburst", levels = 3) #' } plotLineageSunburst <- function(prot, lineage_column = "Lineage", diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 2f9c7832..e2a8a39c 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -49,8 +49,8 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @export #' #' @examples -#' convert2TitleCase("hello world") -#' convert2TitleCase("this is a test", "_") +#' convert2TitleCase("hello world") +#' convert2TitleCase("this is a test", "_") convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -89,7 +89,7 @@ convert2TitleCase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return A data frame containing the combined alignment and lineage +#' @return A data frame containing the combined alignment and lineage #' information. #' @export #' @@ -191,7 +191,7 @@ addLeaves2Alignment <- function(aln_file = "", #' #' @author Samuel Chen, Janani Ravi #' -#' @description This function adds a new 'Name' column that is comprised of +#' @description This function adds a new 'Name' column that is comprised of #' components from Kingdom, Phylum, Genus, and species, as well as the accession #' #' @param data Data to add name column to @@ -278,7 +278,7 @@ addName <- function(data, #' Default is 'pspa.txt' #' @param fa_outpath Character. Path to the written fasta file. #' Default is 'NULL' -#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' @param reduced Boolean. If TRUE, the fasta file will contain only one #' sequence per lineage. Default is 'FALSE' #' #' @details The alignment file would need two columns: 1. 
accession + @@ -289,8 +289,8 @@ addName <- function(data, #' #' @importFrom readr write_file #' -#' @return Character string containing the Fasta formatted sequences. -#' If `fa_outpath` is specified, the function also writes the sequences to the +#' @return Character string containing the Fasta formatted sequences. +#' If `fa_outpath` is specified, the function also writes the sequences to the #' Fasta file. #' @export #' @@ -333,7 +333,7 @@ convertAlignment2FA <- function(aln_file = "", } #' mapAcc2Name -#' +#' #' @description #' Default rename_fasta() replacement function. Maps an accession number to its name #' @@ -347,17 +347,17 @@ convertAlignment2FA <- function(aln_file = "", #' @importFrom stringr str_locate #' @importFrom rlang sym #' -#' @return Character string. The modified line from the Fasta file header with +#' @return Character string. The modified line from the Fasta file header with #' the name instead of the accession number. #' @export #' #' @examples #' \dontrun{ -#' acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +#' acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), #' Name = c("Species A", "Species B")) #' line <- ">ACC001 some additional info" #' mapped_line <- mapAcc2Name(line, acc2name_table) -#' print(mapped_line) # Expected output: ">Species A" +#' mapped_line # Expected output: ">Species A" #' } mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an add_names column @@ -389,7 +389,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { #' #' @examples #' \dontrun{ -#' rename_fasta("input.fasta", "output.fasta", +#' rename_fasta("input.fasta", "output.fasta", #' replacement_function = map_acc2name, acc2name = acc2name_table) #' } rename_fasta <- function(fa_path, outpath, @@ -411,8 +411,8 @@ rename_fasta <- function(fa_path, outpath, ################################ ## generateAllAlignments2FA #' 
generateAllAlignments2FA -#' -#' @description +#' +#' @description #' Adding Leaves to an alignment file w/ accessions #' #' @keywords alignment, accnum, leaves, lineage, species @@ -420,25 +420,25 @@ rename_fasta <- function(fa_path, outpath, #' #' @param aln_path Character. Path to alignment files. #' Default is 'here("data/rawdata_aln/")' -#' @param fa_outpath Character. Path to file. Master protein file with AccNum & +#' @param fa_outpath Character. Path to file. Master protein file with AccNum & #' lineages. #' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' #' @param lin_file Character. Path to the written fasta file. #' Default is 'here("data/alns/")'. -#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' @param reduced Boolean. If TRUE, the fasta file will contain only one #' sequence per lineage. #' Default is 'FALSE'. #' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return NULL. The function saves the output FASTA files to the specified +#' @return NULL. The function saves the output FASTA files to the specified #' directory. #' -#' @details The alignment files would need two columns separated by spaces: -#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, +#' @details The alignment files would need two columns separated by spaces: +#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, #' Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats +#' @note Please refer to the source code if you have alternate + file formats #' and/or column names. #' #' @export @@ -481,9 +481,9 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), #' acc2FA #' #' @description -#' converts protein accession numbers to a fasta format. Resulting +#' converts protein accession numbers to a fasta format. Resulting #' fasta file is written to the outpath. 
-#' +#' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' @@ -492,8 +492,8 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), #' Resulting fasta file is written to the outpath. #' #' -#' @param accessions Character vector containing protein accession numbers to -#' generate fasta sequences for. Function may not work for vectors of +#' @param accessions Character vector containing protein accession numbers to +#' generate fasta sequences for. Function may not work for vectors of #' length > 10,000 #' @param outpath [str]. Location where fasta file should be written to. #' @param plan Character. The plan to use for processing. Default is "sequential". @@ -508,10 +508,10 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), #' #' @examples #' \dontrun{ -#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), #' outpath = "my_proteins.fasta") #' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> #' acc2FA(outpath = "ebi.fa") #' } acc2FA <- function(accessions, outpath, plan = "sequential") { @@ -601,22 +601,22 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return A character vector containing representative accession numbers, +#' @return A character vector containing representative accession numbers, #' one for each distinct observation in the specified 'reduced' column. 
#' @export #' #' @examples #' \dontrun{ #' # Example usage with a data frame called `protein_data` -#' createRepresentativeAccNum <- RepresentativeAccNums(prot_data = protein_data, -#' reduced = "Lineage", +#' createRepresentativeAccNum <- RepresentativeAccNums(prot_data = protein_data, +#' reduced = "Lineage", #' accnum_col = "AccNum") -#' print(representative_accessions) +#' representative_accessions #' } createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one + # Get Unique reduced column and then bind the AccNums back to get one # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -651,9 +651,9 @@ createRepresentativeAccNum <- function(prot_data, #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' @param tool Type of alignment tool to use. One of three options: "Muscle", #' "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. If +#' @param outpath Path to write the resulting alignment to as a FASTA file. If #' NULL, no file is written #' #' @importFrom Biostrings readAAStringSet @@ -665,9 +665,9 @@ createRepresentativeAccNum <- function(prot_data, #' @examples #' \dontrun{ #' # Example usage -#' aligned_sequences <- alignFasta("path/to/sequences.fasta", +#' aligned_sequences <- alignFasta("path/to/sequences.fasta", #' tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") -#' print(aligned_sequences) +#' aligned_sequences #' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { fasta <- readAAStringSet(fasta_file) @@ -723,7 +723,7 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' getAccNumFromFA #' -#' @param fasta_file Character. Path to the FASTA file from which +#' @param fasta_file Character. 
Path to the FASTA file from which #' accession numbers will be extracted. #' #' @importFrom readr read_file @@ -736,7 +736,7 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' \dontrun{ #' # Example usage #' accnums <- getAccNumFromFA("path/to/sequences.fasta") -#' print(accnums) +#' accnums #' } getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) diff --git a/R/reverse_operons.R b/R/reverse_operons.R index 5e1cb423..9094598b 100755 --- a/R/reverse_operons.R +++ b/R/reverse_operons.R @@ -7,12 +7,12 @@ #' #' @description #' This function processes the genomic context strings (GenContext) and reverses -#' directional signs based on the presence of an equal sign ("="). +#' directional signs based on the presence of an equal sign ("="). #' #' @param prot [vector] A vector of genomic context strings to be processed. #' -#' @return [vector] A vector of the same length as the input, where each genomic -#' element is annotated with either a forward ("->") or reverse ("<-") direction, +#' @return [vector] A vector of the same length as the input, where each genomic +#' element is annotated with either a forward ("->") or reverse ("<-") direction, #' depending on its position relative to the "=" symbols. #' #' @export @@ -73,12 +73,12 @@ straightenOperonSeq <- function(prot) { #' #' @description #' This function processes a genomic context data frame to reverse the direction -#' of operons based on specific patterns in the GenContext column. It handles -#' elements represented by ">" and "<" and restructures the genomic context by -#' flipping the direction of operons while preserving the relationships +#' of operons based on specific patterns in the GenContext column. It handles +#' elements represented by ">" and "<" and restructures the genomic context by +#' flipping the direction of operons while preserving the relationships #' indicated by "=". 
#' -#' @param prot [data.frame] A data frame containing at least a column named +#' @param prot [data.frame] A data frame containing at least a column named #' 'GenContext', which represents the genomic contexts that need to be reversed. #' #' @return [data.frame] The input data frame with the 'GenContext' column updated t @@ -90,7 +90,7 @@ straightenOperonSeq <- function(prot) { #' # Example genomic context data frame #' prot <- data.frame(GenContext = c("A>B", "CI")) #' reversed_prot <- reverseOperonSeq(prot) -#' print(reversed_prot) +#' reversed_prot reverseOperonSeq <- function(prot) { gencontext <- prot$GenContext diff --git a/man/acc2FA.Rd b/man/acc2FA.Rd index c878403b..ae7101d7 100644 --- a/man/acc2FA.Rd +++ b/man/acc2FA.Rd @@ -35,17 +35,17 @@ Resulting fasta file is written to the outpath. } \examples{ \dontrun{ -acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", +EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") } \dontrun{ -acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") } } diff --git a/man/addName.Rd b/man/addName.Rd index e4a745c5..b681f349 100644 --- a/man/addName.Rd +++ b/man/addName.Rd @@ -56,7 +56,7 @@ data <- data.frame( Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") ) enriched_data <- addName(data) -print(enriched_data) +enriched_data \dontrun{ addName(data_frame) } diff --git a/man/addTaxID.Rd 
b/man/addTaxID.Rd index e960769b..9e68321c 100644 --- a/man/addTaxID.Rd +++ b/man/addTaxID.Rd @@ -27,6 +27,6 @@ addTaxID # Create a sample data table with accession numbers sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) -print(enriched_data) +enriched_data } } diff --git a/man/alignFasta.Rd b/man/alignFasta.Rd index e9bd22d7..61e880ab 100644 --- a/man/alignFasta.Rd +++ b/man/alignFasta.Rd @@ -29,14 +29,14 @@ Perform a Multiple Sequence Alignment on a FASTA file. } \examples{ \dontrun{ -aligned_sequences <- alignFasta("my_sequences.fasta", +aligned_sequences <- alignFasta("my_sequences.fasta", tool = "Muscle", outpath = "aligned_output.fasta") } \dontrun{ # Example usage -aligned_sequences <- alignFasta("path/to/sequences.fasta", +aligned_sequences <- alignFasta("path/to/sequences.fasta", tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") -print(aligned_sequences) +aligned_sequences } } \author{ diff --git a/man/convert2TitleCase.Rd b/man/convert2TitleCase.Rd index a4078141..4769efea 100644 --- a/man/convert2TitleCase.Rd +++ b/man/convert2TitleCase.Rd @@ -30,8 +30,8 @@ Changing case to 'Title Case' # Convert a single string to title case convert2TitleCase("hello world") # Returns "Hello World" -convert2TitleCase("hello world") -convert2TitleCase("this is a test", "_") +convert2TitleCase("hello world") +convert2TitleCase("this is a test", "_") } \seealso{ chartr, toupper, and tolower. 
diff --git a/man/createRepresentativeAccNum.Rd b/man/createRepresentativeAccNum.Rd index 639a36d4..53902940 100644 --- a/man/createRepresentativeAccNum.Rd +++ b/man/createRepresentativeAccNum.Rd @@ -40,11 +40,14 @@ Function to generate a vector of one Accession number per distinct observation f } \examples{ \dontrun{ +createRepresentativeAccNum(prot) +} +\dontrun{ # Example usage with a data frame called `protein_data` -createRepresentativeAccNum <- RepresentativeAccNums(prot_data = protein_data, - reduced = "Lineage", +representative_accessions <- createRepresentativeAccNum(prot_data = protein_data, + reduced = "Lineage", accnum_col = "AccNum") -print(representative_accessions) +representative_accessions } } \author{ diff --git a/man/downloadAssemblySummary.Rd b/man/downloadAssemblySummary.Rd index bad2b603..e67aba70 100644 --- a/man/downloadAssemblySummary.Rd +++ b/man/downloadAssemblySummary.Rd @@ -25,7 +25,7 @@ Download the combined assembly summaries of genbank and refseq } \examples{ \dontrun{ -downloadAssemblySummary(outpath = "assembly_summary.tsv", +downloadAssemblySummary(outpath = "assembly_summary.tsv", keep = c("assembly_accession", "taxid", "organism_name")) } } diff --git a/man/getAccNumFromFA.Rd b/man/getAccNumFromFA.Rd index d2d9216a..4c6179a1 100644 --- a/man/getAccNumFromFA.Rd +++ b/man/getAccNumFromFA.Rd @@ -24,8 +24,11 @@ getAccNumFromFA } \examples{ \dontrun{ +getAccNumFromFA("my_sequences.fasta") +} +\dontrun{ # Example usage accnums <- getAccNumFromFA("path/to/sequences.fasta") -print(accnums) +accnums } } diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index c76931f1..0eeb0610 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -37,8 +37,8 @@ Group by lineage + DA then take top 20 } \examples{ \dontrun{ -top_accessions <- getTopAccByLinDomArch(infile_full = my_data, -DA_col = 
"DomArch.Pfam", lin_col = "Lineage_short", n = 20, query = "specific_query_name") } } diff --git a/man/mapAcc2Name.Rd b/man/mapAcc2Name.Rd index 7ef04955..3213201a 100644 --- a/man/mapAcc2Name.Rd +++ b/man/mapAcc2Name.Rd @@ -35,10 +35,10 @@ Default rename_fasta() replacement function. Maps an accession number to its nam mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name") } \dontrun{ -acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), Name = c("Species A", "Species B")) line <- ">ACC001 some additional info" mapped_line <- mapAcc2Name(line, acc2name_table) -print(mapped_line) # Expected output: ">Species A" +mapped_line # Expected output: ">Species A" } } diff --git a/man/plotIPR2Viz.Rd b/man/plotIPR2Viz.Rd index 8d06eae1..13ac06c1 100644 --- a/man/plotIPR2Viz.Rd +++ b/man/plotIPR2Viz.Rd @@ -53,15 +53,15 @@ plotIPR2Viz } \examples{ \dontrun{ -plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", - infile_full = "path/to/full_file.tsv", - accessions = c("ACC123", "ACC456"), - analysis = c("Pfam", "TMHMM"), - group_by = "Analysis", - topn = 20, - name = "Gene Name", - text_size = 15, +plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", + infile_full = "path/to/full_file.tsv", + accessions = c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + topn = 20, + name = "Gene Name", + text_size = 15, query = "All") -print(plot) +plot } } diff --git a/man/plotIPR2VizWeb.Rd b/man/plotIPR2VizWeb.Rd index 9de7413f..e56d917e 100644 --- a/man/plotIPR2VizWeb.Rd +++ b/man/plotIPR2VizWeb.Rd @@ -54,15 +54,15 @@ plotIPR2VizWeb } \examples{ \dontrun{ -plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", - accessions = c("ACC123", "ACC456"), - analysis = c("Pfam", "TMHMM"), - group_by = "Analysis", - name = "Gene Name", - text_size = 15, - legend_name = "ShortName", - cols = 5, +plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", + accessions = 
c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + name = "Gene Name", + text_size = 15, + legend_name = "ShortName", + cols = 5, rows = 10) -print(plot) +plot } } diff --git a/man/plotLineageSunburst.Rd b/man/plotLineageSunburst.Rd index 3240d77d..363e8c27 100644 --- a/man/plotLineageSunburst.Rd +++ b/man/plotLineageSunburst.Rd @@ -49,7 +49,7 @@ Lineage Sunburst } \examples{ \dontrun{ -plotLineageSunburst(prot, lineage_column = "Lineage", +plotLineageSunburst(prot, lineage_column = "Lineage", type = "sunburst", levels = 3) } } diff --git a/man/prepareColumnParams.Rd b/man/prepareColumnParams.Rd index 8a9f566b..f685624e 100644 --- a/man/prepareColumnParams.Rd +++ b/man/prepareColumnParams.Rd @@ -24,6 +24,6 @@ prepareColumnParams count_data <- data.frame(Category = c("A", "B", "C"), n = c(10, 20, 15)) params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) -print(params) +params } } diff --git a/man/prepareSingleColumnParams.Rd b/man/prepareSingleColumnParams.Rd index 0070497e..0261f9c1 100644 --- a/man/prepareSingleColumnParams.Rd +++ b/man/prepareSingleColumnParams.Rd @@ -25,6 +25,6 @@ prepareSingleColumnParams df <- data.frame(Category = c("A", "A", "B", "B", "C"), n = c(10, 20, 30, 40, 50)) params <- prepareSingleColumnParams(df, col_num = 1, root = "Root") -print(params) +params } } diff --git a/man/proteinAcc2TaxID.Rd b/man/proteinAcc2TaxID.Rd index 9be09d53..1ccafe4f 100644 --- a/man/proteinAcc2TaxID.Rd +++ b/man/proteinAcc2TaxID.Rd @@ -31,8 +31,8 @@ proteinAcc2TaxID \dontrun{ # Example accession numbers accessions <- c("ABC123", "XYZ456", "LMN789") -tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +tax_data <- proteinAcc2TaxID(accessions, suffix = "example", out_path = "/path/to/output", return_dt = TRUE) -print(tax_data) +tax_data } } diff --git a/man/renameFA.Rd b/man/renameFA.Rd index da7d339b..18eca8b9 100644 --- a/man/renameFA.Rd +++ b/man/renameFA.Rd @@ -23,7 +23,7 @@ Rename the labels 
of fasta files } \examples{ \dontrun{ -renameFA("path/to/input.fasta", +renameFA("path/to/input.fasta", "path/to/output.fasta", mapAcc2Name, acc2name) } } diff --git a/man/rename_fasta.Rd b/man/rename_fasta.Rd index 3089d530..35658437 100644 --- a/man/rename_fasta.Rd +++ b/man/rename_fasta.Rd @@ -23,7 +23,7 @@ Rename the labels of fasta files } \examples{ \dontrun{ -rename_fasta("input.fasta", "output.fasta", +rename_fasta("input.fasta", "output.fasta", replacement_function = map_acc2name, acc2name = acc2name_table) } } diff --git a/man/reverseOperonSeq.Rd b/man/reverseOperonSeq.Rd index 3709bbe1..03e68a94 100644 --- a/man/reverseOperonSeq.Rd +++ b/man/reverseOperonSeq.Rd @@ -25,5 +25,5 @@ indicated by "=". # Example genomic context data frame prot <- data.frame(GenContext = c("A>B", "CI")) reversed_prot <- reverseOperonSeq(prot) -print(reversed_prot) +reversed_prot } diff --git a/man/runDeltaBlast.Rd b/man/runDeltaBlast.Rd index fc9cd09e..c3384d12 100644 --- a/man/runDeltaBlast.Rd +++ b/man/runDeltaBlast.Rd @@ -5,7 +5,7 @@ \title{Run DELTABLAST to find homologs for proteins of interest} \usage{ runDeltaBlast( - deltablast_path, + runDeltaBlast, db_search_path, db = "refseq", query, @@ -16,8 +16,6 @@ runDeltaBlast( ) } \arguments{ -\item{deltablast_path}{Path to the Delta-BLAST executable.} - \item{db_search_path}{Path to the BLAST databases.} \item{db}{Name of the BLAST database to search against (default is "refseq").} @@ -31,6 +29,8 @@ runDeltaBlast( \item{num_alignments}{Number of alignments to report.} \item{num_threads}{Number of threads to use for the search (default is 1).} + +\item{deltablast_path}{Path to the Delta-BLAST executable.} } \value{ This function does not return a value; it outputs results to the @@ -41,6 +41,11 @@ This function executes a Delta-BLAST search using the specified parameters and database. It sets the BLAST database path, runs the Delta-BLAST command with the given query, and outputs the results. 
} +\examples{ +\dontrun{ +runDeltaBlast(runDeltaBlast, db_search_path) +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/runIPRScan.Rd b/man/runIPRScan.Rd index 8431efb4..f675314d 100644 --- a/man/runIPRScan.Rd +++ b/man/runIPRScan.Rd @@ -29,6 +29,6 @@ results <- runIPRScan( filepath_out = "path/to/output_file", appl = c("Pfam", "Gene3D") ) -print(results) +results } } diff --git a/man/shortenLineage.Rd b/man/shortenLineage.Rd index 00200f96..161d0260 100644 --- a/man/shortenLineage.Rd +++ b/man/shortenLineage.Rd @@ -27,9 +27,9 @@ string (up to a given delimiter). } \examples{ \dontrun{ -df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", +df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", "Archaea>Euryarchaeota>Thermococci")) shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1) -print(shortened_df) +shortened_df } } diff --git a/man/writeMSA_AA2FA.Rd b/man/writeMSA_AA2FA.Rd index c9551102..d0d5d305 100644 --- a/man/writeMSA_AA2FA.Rd +++ b/man/writeMSA_AA2FA.Rd @@ -28,6 +28,9 @@ and msaMuscle from the 'msa' package } \examples{ \dontrun{ +writeMSA_AA2FA("my_sequences.fasta", outpath = "aligned_output.fasta") +} +\dontrun{ # Example usage alignment <- alignFasta("path/to/sequences.fasta") writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta") From ecdd69e27f422c4c0e13dc9ce4ef9818ccdfb828 Mon Sep 17 00:00:00 2001 From: teddyCodex Date: Sun, 27 Oct 2024 16:34:58 +0100 Subject: [PATCH 55/61] added boundary guard and error handling to .LevelReduction --- R/plotting.R | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/R/plotting.R b/R/plotting.R index 5d949cd5..853b377f 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -23,22 +23,12 @@ ######################## #' #' -.LevelReduction <- function(lin, level) { - if (level == 1) { - gt_loc <- str_locate(lin, ">")[[1]] - if (is.na(gt_loc)) { - # No '>' in lineage - return(lin) - } else { - lin <- substring(lin, first = 0, last = 
(gt_loc - 1)) - return(lin) - } - } - # Out of bounds guard - gt_loc <- str_locate_all(lin, ">")[[1]] - l <- length(gt_loc) / 2 - if (level > l) { - # Not enough '>' in lineage +.LevelReduction <- function(lin, level) { + gt_loc <- str_locate_all(lin, ">")[[1]] + available_levels <- length(gt_loc) / 2 # Since `str_locate_all` returns a matrix + + # Guard against out-of-bounds level requests + if (level > available_levels || level < 1) { return(lin) } else { gt_loc <- gt_loc[level, ][1] %>% as.numeric() @@ -47,6 +37,8 @@ } } + + .GetKingdom <- function(lin) { gt_loc <- str_locate(lin, ">")[, "start"] if (is.na(gt_loc)) { @@ -1359,4 +1351,4 @@ plotLineageSunburst <- function(prot, lineage_column = "Lineage", # # theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5), # # axis.text.y=element_text(angle=90,hjust=1,vjust=0.5)) # -# } +# } \ No newline at end of file From 64d16bec64a607c9cfb427b01acad378dd064ab0 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 09:20:49 -0600 Subject: [PATCH 56/61] Rd consistency --- man/shortenLineage.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/shortenLineage.Rd b/man/shortenLineage.Rd index 161d0260..7390b254 100644 --- a/man/shortenLineage.Rd +++ b/man/shortenLineage.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/plotting.R \name{shortenLineage} \alias{shortenLineage} -\title{Shorten Lineage Names} +\title{shortenLineage} \usage{ shortenLineage(data, colname = "Lineage", abr_len = 1) } From 32418b39ba9b40550ef425c771d4c857741a8446 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 09:21:11 -0600 Subject: [PATCH 57/61] disable example code for reverseOperonSeq() - see https://github.com/JRaviLab/MolEvolvR/issues/118 --- R/reverse_operons.R | 5 ++++- man/reverseOperonSeq.Rd | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/reverse_operons.R b/R/reverse_operons.R index 9094598b..f250e8c0 100755 --- a/R/reverse_operons.R +++ b/R/reverse_operons.R 
@@ -87,10 +87,13 @@ straightenOperonSeq <- function(prot) { #' @export #' #' @examples +#' \dontrun{ #' # Example genomic context data frame -#' prot <- data.frame(GenContext = c("A>B", "CI")) +#' ## Rework example data, does not pass R-CMD Check +#' prot <- data.frame(GenContext = c("A>B", "CI")) #' reversed_prot <- reverseOperonSeq(prot) #' reversed_prot +#' } reverseOperonSeq <- function(prot) { gencontext <- prot$GenContext diff --git a/man/reverseOperonSeq.Rd b/man/reverseOperonSeq.Rd index 03e68a94..812d0e89 100644 --- a/man/reverseOperonSeq.Rd +++ b/man/reverseOperonSeq.Rd @@ -22,8 +22,11 @@ flipping the direction of operons while preserving the relationships indicated by "=". } \examples{ +\dontrun{ # Example genomic context data frame -prot <- data.frame(GenContext = c("A>B", "CI")) +## Rework example data, does not pass R-CMD Check +prot <- data.frame(GenContext = c("A>B", "CI")) reversed_prot <- reverseOperonSeq(prot) reversed_prot } +} From f4b50f4c387f7a797e630eb2b00c06d473ee75da Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 20:39:13 -0600 Subject: [PATCH 58/61] fix error introduced by merge --- R/assign_job_queue.R | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 6f3dde17..e0c22ec6 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -127,33 +127,6 @@ calculateProcessRuntime <- function(dir_job_results) { dir.create(dirname(path_log_data), recursive = TRUE, showWarnings = FALSE) } - - # attempt to load pre-generated logdata - if (!file.exists(path_log_data)) { - logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) - save(logs, file = path_log_data) - } else { - load(path_log_data) # loads the logs object - } - df_log <- logs$df_log - procs <- c( - "dblast", "dblast_cleanup", "iprscan", - "ipr2lineage", "ipr2da", "blast_clust", - "clust2table" - ) - list_proc_medians <- df_log |> - 
dplyr::select(dplyr::all_of(procs)) |> - dplyr::summarise( - dplyr::across( - dplyr::everything(), - \(x) median(x, na.rm = TRUE) - ) - ) |> - as.list() - return(list_proc_medians) -} - - # attempt to load pre-generated logdata if (!file.exists(path_log_data)) { logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) @@ -600,6 +573,7 @@ assignJobQueue <- function( #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export plotEstimatedWallTimes <- function() { + tryCatch({ opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { From dd86b3ce04e68345297ee2f0f095f2999ff286f1 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 20:45:29 -0600 Subject: [PATCH 59/61] fix .Rd --- R/assign_job_queue.R | 18 ++++++++++++++++++ man/acc2Lineage.Rd | 3 +-- man/efetchIPG.Rd | 3 +-- man/sinkReset.Rd | 1 - 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index e0c22ec6..52af46bf 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -36,6 +36,9 @@ mapOption2Process <- function() { } +#' mapAdvOption2Process +#' +#' @description #' Use MolEvolvR advanced options to get associated processes #' #' @param advanced_opts character vector of MolEvolvR advanced options @@ -79,6 +82,9 @@ mapAdvOption2Process <- function(advanced_opts) { } +#' calculateProcessRuntime +#' +#' @description #' Scrape MolEvolvR logs and calculate median processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -227,6 +233,9 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { } +#' writeProcessRuntime2YML +#' +#' @description #' Compute median process runtimes, then write a YAML list of the processes and #' their median runtimes in seconds to the path specified by 'filepath'. 
#' @@ -304,6 +313,9 @@ writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { }) } +#' getProcessRuntimeWeights +#' +#' @description #' Quickly get the runtime weights for MolEvolvR backend processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -494,6 +506,9 @@ calculateEstimatedWallTimeFromOpts <- function(advanced_opts, } +#' assignJobQueue +#' +#' @description #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process @@ -555,6 +570,9 @@ assignJobQueue <- function( } +#' plotEstimatedWallTimes +#' +#' @description #' Plot the estimated runtimes for different advanced options and number #' of inputs #' diff --git a/man/acc2Lineage.Rd b/man/acc2Lineage.Rd index fd4eeceb..ce499592 100644 --- a/man/acc2Lineage.Rd +++ b/man/acc2Lineage.Rd @@ -44,8 +44,7 @@ accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, Protein, Protein Name, Species, and Lineage. } \description{ -This function combines 'efetchIPG()' -and 'IPG2Lineage()' to map a set +This function combines 'efetchIPG()' and 'IPG2Lineage()' to map a set of protein accessions to their assembly (GCA_ID), tax ID, and lineage. Function to map protein accession numbers to lineage diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index eb5ca678..e55c342a 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -27,8 +27,7 @@ The function does not return a value but writes the efetch results directly to the specified \code{out_path}. 
} \description{ -Perform efetch on the ipg database -and write the results to out_path +Perform efetch on the ipg database and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index e3fc7ce4..0285c0b2 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,7 +8,6 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s -and handles any errors or warnings that occur during the process. } \description{ Sink Reset From f6c8188c9eb27df33bbb74a5c1b0febff549153a Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 21:17:23 -0600 Subject: [PATCH 60/61] swap rlang::abort() for base::stop() - allows for additional metadata to be added to error - pkg consistency, abort is used elsewhere print -> message --- R/tree.R | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/R/tree.R b/R/tree.R index 82eb11db..ddbf9d61 100755 --- a/R/tree.R +++ b/R/tree.R @@ -43,6 +43,8 @@ #' be saved. Default is the path to "data/alns/pspa_snf7.tre". #' @param fasttree_path Path to the FastTree executable, which is used to #' generate the phylogenetic tree. Default is "src/FastTree". +#' +#' @importFrom rlang abort #' #' @return No return value. The function generates a tree file (.tre) from the #' input FASTA file. 
@@ -63,19 +65,19 @@ convertFA2Tree <- function(fa_path = here("data/alns/pspa_snf7.fa"), # Check if the FASTA file exists if (!file.exists(fa_path)) { - stop(paste("Error: The FASTA file does not exist at:", fa_path)) + abort(paste("Error: The FASTA file does not exist at:", fa_path)) } # Check if the FastTree executable exists if (!file.exists(fasttree_path)) { - stop(paste("Error: The FastTree executable does not exist at:", + abort(paste("Error: The FastTree executable does not exist at:", fasttree_path)) } # Check if the output directory exists tre_dir <- dirname(tre_path) if (!dir.exists(tre_dir)) { - stop(paste("Error: The output directory does not exist:", tre_dir)) + abort(paste("Error: The output directory does not exist:", tre_dir)) } # Check if the output file already exists @@ -84,7 +86,7 @@ convertFA2Tree <- function(fa_path = here("data/alns/pspa_snf7.fa"), tre_path, "\n") } - print(fa_path) + message(fa_path) system2( command = fasttree_path, args = paste(c(fa_path, ">", tre_path), @@ -125,13 +127,13 @@ convertAlignment2Trees <- function(aln_path = here("data/alns/")) { # Check if the alignment directory exists if (!dir.exists(aln_path)) { - stop(paste("Error: The alignment directory does not exist:", aln_path)) + abort(paste("Error: The alignment directory does not exist:", aln_path)) } # finding all fasta alignment files fa_filenames <- list.files(path = aln_path, pattern = "*.fa") # Check if any FASTA files were found if (length(fa_filenames) == 0) { - stop("Error: No FASTA files found in the specified directory.") + abort("Error: No FASTA files found in the specified directory.") } fa_paths <- paste0(aln_path, fa_filenames) @@ -194,13 +196,13 @@ createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", # Check if the FASTA file exists if (!file.exists(fa_file)) { - stop(paste("Error: The FASTA file does not exist at:", fa_file)) + abort(paste("Error: The FASTA file does not exist at:", fa_file)) } # Check if the output directory exists 
out_dir <- dirname(out_file) if (!dir.exists(out_dir)) { - stop(paste("Error: The output directory does not exist:", out_dir)) + abort(paste("Error: The output directory does not exist:", out_dir)) } # Check if the output file already exists @@ -233,7 +235,7 @@ createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", ## Model Testing & Distance Matrices ## Comparison of different nucleotide or amino acid substitution models mt <- modelTest(prot10, model = "all") - print(mt) + message(mt) # estimate a distance matrix using a Jules-Cantor Model dna_dist <- dist.ml(prot10, model = "JC69") @@ -254,7 +256,7 @@ createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", ## Maximum likelihood and Bootstrapping # ml estimation w/ distance matrix fit <- pml(prot_NJ, prot10) - print(fit) + message(fit) fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic") logLik(fitJC) bs <- bootstrap.pml(fitJC, @@ -267,7 +269,7 @@ createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", prot10_dm <- dist.ml(prot10) prot10_NJ <- NJ(prot10_dm) fit2 <- pml(prot10_NJ, data = prot10) - print(fit2) + message(fit2) fitJC2 <- optim.pml(fit2, model = "JC", rearrangement = "stochastic") logLik(fitJC2) bs_subset <- bootstrap.pml(fitJC2, From 01ac8b233f75df76d12320dbbfc0fe09441b1910 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 29 Oct 2024 21:34:12 -0600 Subject: [PATCH 61/61] use rlang::abort() --- R/summarize.R | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/R/summarize.R b/R/summarize.R index 504da767..e76a86da 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -25,7 +25,7 @@ #' #' @importFrom dplyr filter #' @importFrom stringr str_replace_all -#' @importFrom rlang sym +#' @importFrom rlang abort sym #' #' @return Filtered data frame #' @note There is no need to make the domains 'regex safe', that will be handled by this function @@ -44,12 +44,12 @@ filterByDomains <- function(prot, column = 
"DomArch", doms_keep = c(), doms_remo # Check if prot is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - stop(paste("Error: The specified column '", column, "' does not exist + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } @@ -139,19 +139,19 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - stop(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } # Check if min.freq is a positive integer if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { - stop("Error: 'min.freq' must be a positive integer.") + abort("Error: 'min.freq' must be a positive integer.") } counts <- prot %>% select(column) %>% @@ -200,19 +200,19 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms") { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - stop(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } # Check for valid conversion_type values valid_types <- c("da2doms", "doms2da") if (!conversion_type %in% 
valid_types) { - stop(paste("Error: Invalid 'conversion_type'. Must be one of:", + abort(paste("Error: Invalid 'conversion_type'. Must be one of:", paste(valid_types, collapse = ", "))) } @@ -277,7 +277,7 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" words2WordCounts <- function(string) { # Check if 'string' is a character vector of length 1 if (!is.character(string) || length(string) != 1) { - stop("Error: 'string' must be a single character vector.") + abort("Error: 'string' must be a single character vector.") } df_word_count <- string %>% @@ -331,18 +331,18 @@ filterByFrequency <- function(x, min.freq) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } # Check if 'min.freq' is a positive integer if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { - stop("Error: 'min.freq' must be a positive integer.") + abort("Error: 'min.freq' must be a positive integer.") } # Check if the 'freq' column exists in the data frame if (!"freq" %in% names(x)) { - stop("Error: The data frame must contain a 'freq' column.") + abort("Error: The data frame must contain a 'freq' column.") } x %>% filter(freq >= min.freq) @@ -388,18 +388,18 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" query) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - stop(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } # Check if the 'by' column exists in the data frame if (!by %in% names(prot)) { - stop(paste("Error: The specified 'by' column '", by, "' does not exist + 
abort(paste("Error: The specified 'by' column '", by, "' does not exist n the data frame.", sep = "")) } @@ -448,7 +448,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" summarizeDomArch_ByLineage <- function(x) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } # Check if required columns exist in the data frame @@ -456,7 +456,7 @@ summarizeDomArch_ByLineage <- function(x) { missing_columns <- setdiff(required_columns, names(x)) if (length(missing_columns) > 0) { - stop(paste("Error: The following required columns are + abort(paste("Error: The following required columns are missing:", paste(missing_columns, collapse = ", "))) } x %>% @@ -494,7 +494,7 @@ summarizeDomArch_ByLineage <- function(x) { summarizeDomArch <- function(x) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } x %>% group_by(DomArch) %>% @@ -530,7 +530,7 @@ summarizeDomArch <- function(x) { summarizeGenContext_ByDomArchLineage <- function(x) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } x %>% filter(!grepl("^-$", GenContext)) %>% @@ -559,7 +559,7 @@ summarizeGenContext_ByDomArchLineage <- function(x) { summarizeGenContext_ByLineage <- function(x) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } x %>% filter(!grepl("^-$", GenContext)) %>% @@ -596,7 +596,7 @@ summarizeGenContext_ByLineage <- function(x) { summarizeGenContext <- function(x) { # Check if 'x' is a data frame if (!is.data.frame(x)) { - stop("Error: 'x' must be a data frame.") + abort("Error: 'x' must be a data frame.") } x %>% group_by(GenContext) %>% @@ -659,7 +659,7 @@ totalGenContextOrDomArchCounts <- 
function(prot, column = "DomArch", lineage_col ) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Check if the specified columns exist in the data frame @@ -667,19 +667,19 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col missing_columns <- setdiff(required_columns, names(prot)) if (length(missing_columns) > 0) { - stop(paste("Error: The following required columns are missing:", + abort(paste("Error: The following required columns are missing:", paste(missing_columns, collapse = ", "))) } # Check that cutoff is a numeric value between 0 and 100 if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) { - stop("Error: 'cutoff' must be a numeric value between 0 and 100.") + abort("Error: 'cutoff' must be a numeric value between 0 and 100.") } # Check that digits is a non-negative integer if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || floor(digits) != digits) { - stop("Error: 'digits' must be a non-negative integer.") + abort("Error: 'digits' must be a non-negative integer.") } column <- sym(column) @@ -843,7 +843,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col findParalogs <- function(prot) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { - stop("Error: 'prot' must be a data frame.") + abort("Error: 'prot' must be a data frame.") } # Remove eukaryotes