Skip to content

Commit

Permalink
Merge pull request #74 from awasyn/premsaerror
Browse files Browse the repository at this point in the history
add input checks in pre-msa-tree
  • Loading branch information
the-mayer authored Oct 30, 2024
2 parents ee5756b + 6949a68 commit c838c4e
Showing 1 changed file with 165 additions and 5 deletions.
170 changes: 165 additions & 5 deletions R/pre-msa-tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,20 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
#' @param x Character vector.
#' @param y Delimitter. Default is space (" ").
#'
#' @importFrom rlang abort
#'
#' @return A character vector in title case.
#' @export
#'
#' @examples
#' convert2TitleCase("hello world")
#' convert2TitleCase("this is a test", "_")

convert2TitleCase <- function(x, y = " ") {
# Check if the input is NULL or not a character
if (is.null(x) || !is.character(x)) {
abort("Error: Input must be a non-null character string.")
}
s <- strsplit(x, y)[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep = "", collapse = y
Expand Down Expand Up @@ -88,6 +95,7 @@ convert2TitleCase <- function(x, y = " ") {
#' @importFrom stats complete.cases
#' @importFrom stringr str_sub
#' @importFrom tidyr replace_na separate
#' @importFrom rlang abort
#'
#' @return A data frame containing the combined alignment and lineage
#' information.
Expand All @@ -104,6 +112,25 @@ addLeaves2Alignment <- function(aln_file = "",
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
# lin_file="data/rawdata_tsv/PspA.txt",
reduced = FALSE) {

#Check if the alignment file is provided and exists
if (nchar(aln_file) == 0) {
abort("Error: Alignment file path must be provided.")
}

if (!file.exists(aln_file)) {
abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
}

# Check if the lineage file exists
if (!file.exists(lin_file)) {
abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
}

# Check that the 'reduced' parameter is logical
if (!is.logical(reduced) || length(reduced) != 1) {
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
}
## SAMPLE ARGS
# aln_file <- "data/rawdata_aln/pspc.gismo.aln"
# lin_file <- "data/rawdata_tsv/all_semiclean.txt"
Expand Down Expand Up @@ -206,7 +233,7 @@ addLeaves2Alignment <- function(aln_file = "",
#' @importFrom dplyr mutate pull select
#' @importFrom stringi stri_replace_all_regex
#' @importFrom tidyr separate
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return Original data with a 'Name' column
#' @export
Expand All @@ -218,6 +245,19 @@ addLeaves2Alignment <- function(aln_file = "",
addName <- function(data,
accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage",
lin_sep = ">", out_col = "Name") {
# Check if the data is a data fram
if (!is.data.frame(data)) {
abort("Error: The input 'data' must be a data frame")
}

# Check that the specified columns exist in the data
required_cols <- c(accnum_col, spec_col, lin_col)
missing_cols <- setdiff(required_cols, names(data))
if (length(missing_cols) > 0) {
abort(paste("Error: The following columns are missing from the data:",
paste(missing_cols, collapse = ", ")))
}

cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp")
split_data <- data %>%
separate(
Expand Down Expand Up @@ -288,6 +328,7 @@ addName <- function(data,
#' file formats and/or column names.
#'
#' @importFrom readr write_file
#' @importFrom rlang abort
#'
#' @return Character string containing the Fasta formatted sequences.
#' If `fa_outpath` is specified, the function also writes the sequences to the
Expand All @@ -302,6 +343,24 @@ convertAlignment2FA <- function(aln_file = "",
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
fa_outpath = "",
reduced = FALSE) {
#Check if the alignment file is provided and exists
if (nchar(aln_file) == 0) {
abort("Error: Alignment file path must be provided.")
}

if (!file.exists(aln_file)) {
abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
}

# Check if the lineage file exists
if (!file.exists(lin_file)) {
abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
}

# Check that the 'reduced' parameter is logical
if (!is.logical(reduced) || length(reduced) != 1) {
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
}
## SAMPLE ARGS
# aln_file <- "data/rawdata_aln/pspc.gismo.aln"
# lin_file <- "data/rawdata_tsv/all_semiclean.txt"
Expand Down Expand Up @@ -345,7 +404,7 @@ convertAlignment2FA <- function(aln_file = "",
#'
#' @importFrom dplyr filter pull
#' @importFrom stringr str_locate
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return Character string. The modified line from the Fasta file header with
#' the name instead of the accession number.
Expand All @@ -359,7 +418,23 @@ convertAlignment2FA <- function(aln_file = "",
#' mapped_line <- mapAcc2Name(line, acc2name_table)
#' mapped_line # Expected output: ">Species A"
#' }

mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
# Check if acc2name is a data frame
if (!is.data.frame(acc2name)) {
abort("Error: acc2name must be a data frame.")
}

# Check if the specified columns exist in the data frame
if (!(acc_col %in% colnames(acc2name))) {
abort("Error: The specified acc_col '", acc_col, "' does not exist in
acc2name.")
}
if (!(name_col %in% colnames(acc2name))) {
abort("Error: The specified name_col '", name_col, "' does not exist in
acc2name.")
}

# change to be the name equivalent to an add_names column
# Find the first ' '
end_acc <- str_locate(line, " ")[[1]]
Expand All @@ -383,6 +458,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#'
#' @importFrom purrr map
#' @importFrom readr read_lines write_lines
#' @importFrom rlang abort
#'
#' @return Character vector containing the modified lines of the Fasta file.
#' @export
Expand All @@ -394,6 +470,17 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#' }
rename_fasta <- function(fa_path, outpath,
replacement_function = map_acc2name, ...) {
# Check if the input FASTA file exists
if (!file.exists(fa_path)) {
abort("Error: The input FASTA file does not exist at the specified
path: ", fa_path)
}

# Check if the output path is writable
outdir <- dirname(outpath)
if (!dir.exists(outdir)) {
abort("Error: The output directory does not exist: ", outdir)
}
lines <- read_lines(fa_path)
res <- map(lines, function(x) {
if (strtrim(x, 1) == ">") {
Expand Down Expand Up @@ -431,6 +518,7 @@ rename_fasta <- function(fa_path, outpath,
#'
#' @importFrom purrr pmap
#' @importFrom stringr str_replace_all
#' @importFrom rlang abort
#'
#' @return NULL. The function saves the output FASTA files to the specified
#' directory.
Expand All @@ -451,6 +539,24 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
fa_outpath = here("data/alns/"),
lin_file = here("data/rawdata_tsv/all_semiclean.txt"),
reduced = F) {
# Check if the alignment path exists
if (!dir.exists(aln_path)) {
abort("Error: The alignment directory does not exist at the specified
path: ", aln_path)
}

# Check if the output path exists; if not, attempt to create it
if (!dir.exists(fa_outpath)) {
dir.create(fa_outpath, recursive = TRUE)
message("Note: The output directory did not exist and has been created: ",
fa_outpath)
}

# Check if the linear file exists
if (!file.exists(lin_file)) {
abort("Error: The linear file does not exist at the specified path: ",
lin_file)
}
# library(here)
# library(tidyverse)
# aln_path <- here("data/rawdata_aln/")
Expand Down Expand Up @@ -502,6 +608,7 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
#' @importFrom future future plan
#' @importFrom purrr map
#' @importFrom rentrez entrez_fetch
#' @importFrom rlang abort
#'
#' @return A Fasta file is written to the specified `outpath`.
#' @export
Expand All @@ -514,7 +621,16 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |>
#' acc2FA(outpath = "ebi.fa")
#' }

acc2FA <- function(accessions, outpath, plan = "sequential") {
if (!is.character(accessions) || length(accessions) == 0) {
abort("Error: 'accessions' must be a non-empty character vector.")
}

if (!dir.exists(dirname(outpath))) {
abort("Error: The output directory does not exist: ", dirname(outpath))
}

# validation
stopifnot(length(accessions) > 0)

Expand Down Expand Up @@ -599,7 +715,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
#' @param accnum_col Column from prot_data that contains Accession Numbers
#'
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return A character vector containing representative accession numbers,
#' one for each distinct observation in the specified 'reduced' column.
Expand All @@ -616,8 +732,25 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
createRepresentativeAccNum <- function(prot_data,
reduced = "Lineage",
accnum_col = "AccNum") {
# Get Unique reduced column and then bind the AccNums back to get one
# AccNum per reduced column

# Validate input
if (!is.data.frame(prot_data)) {
abort("Error: 'prot_data' must be a data frame.")
}

# Check if the reduced column exists in prot_data
if (!(reduced %in% colnames(prot_data))) {
abort("Error: The specified reduced column '", reduced, "' does not
exist in the data frame.")
}

# Check if the accnum_col exists in prot_data
if (!(accnum_col %in% colnames(prot_data))) {
abort("Error: The specified accession number column '", accnum_col, "'
does not exist in the data frame.")
}
# Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column

reduced_sym <- sym(reduced)
accnum_sym <- sym(accnum_col)

Expand Down Expand Up @@ -658,6 +791,7 @@ createRepresentativeAccNum <- function(prot_data,
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom msa msaMuscle msaClustalOmega msaClustalW
#' @importFrom rlang abort
#'
#' @return aligned fasta sequence as a MsaAAMultipleAlignment object
#' @export
Expand All @@ -670,6 +804,14 @@ createRepresentativeAccNum <- function(prot_data,
#' aligned_sequences
#' }
alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
# Validate the input FASTA file
if (!file.exists(fasta_file)) {
abort("Error: The FASTA file does not exist: ", fasta_file)
}

if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
}
fasta <- readAAStringSet(fasta_file)

aligned <- switch(tool,
Expand Down Expand Up @@ -698,6 +840,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#'
#' @importFrom Biostrings unmasked
#' @importFrom readr write_file
#' @importFrom rlang abort
#'
#' @return Character string of the FASTA content that was written to the file.
#' @export
Expand All @@ -708,7 +851,24 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#' alignment <- alignFasta("path/to/sequences.fasta")
#' writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta")
#' }

writeMSA_AA2FA <- function(alignment, outpath) {
# Validate input alignment
if (!inherits(alignment, "AAMultipleAlignment")) {
abort("Error: The alignment must be of type 'AAMultipleAlignment'.")
}

# Check the output path is a character string
if (!is.character(outpath) || nchar(outpath) == 0) {
abort("Error: Invalid output path specified.")
}

# Check if the output directory exists
outdir <- dirname(outpath)
if (!dir.exists(outdir)) {
abort("Error: The output directory does not exist: ", outdir)
}

l <- length(rownames(alignment))
fasta <- ""
for (i in 1:l)
Expand Down

0 comments on commit c838c4e

Please sign in to comment.