diff --git a/R/ncbi_byname.R b/R/ncbi_byname.R index ae1b991..61aa391 100644 --- a/R/ncbi_byname.R +++ b/R/ncbi_byname.R @@ -20,8 +20,7 @@ #' species <- c("Colletes similis","Halictus ligatus","Perdita californica") #' ncbi_byname(taxa=species, gene = c("coi", "co1"), seqrange = "1:2000") #' } -ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, - verbose=TRUE, ...) { +ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, verbose=TRUE, batch_size = 100, ...) { foo <- function(xx) { mssg(verbose, paste("Working on ", xx, "...", sep = "")) mssg(verbose, "...retrieving sequence IDs...") @@ -51,9 +50,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, if (!getrelated) { mssg(verbose, paste("no sequences of ", gene, " for ", xx, sep = "")) res <- data.frame( - xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_, + taxon = xx, gene_desc = NA_character_, + gi_no = NA_real_, acc_no = NA_character_, length = NA_real_, + sequence = NA_character_, spused = NA_character_, stringsAsFactors = FALSE) - names(res) <- NULL } else { mssg(verbose, "...retrieving sequence IDs for related species...") newname <- strsplit(xx, " ")[[1]][[1]] @@ -68,9 +68,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, if (as.numeric(xml2::xml_text(xml2::xml_find_all(out, "//Count")[[1]])) == 0) { mssg(verbose, paste("no sequences of ", gene, " for ", xx, " or ", newname, sep = "")) res <- data.frame( - xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_, + taxon = xx, gene_desc = NA_character_, + gi_no = NA_real_, acc_no = NA_character_, length = NA_real_, + sequence = NA_character_, spused = NA_character_, stringsAsFactors = FALSE) - names(res) <- NULL } else { ## For each species = get GI number with longest sequence mssg(verbose, "...retrieving sequence ID with longest sequence length...") @@ -88,23 +89,43 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, ## For each species = get GI number with longest sequence mssg(verbose, "...retrieving sequence ID with longest sequence length...") # construct query for species - querysum <- list(db = "nucleotide", api_key = ncbi_key(), - id = paste(make_ids(out), collapse = " ")) - z <- con$get("entrez/eutils/esummary.fcgi", query = querysum) - txt <- z$parse("UTF-8") - res <- parse_ncbi(xx, xml2::xml_find_all(xml2::read_xml(txt), "//eSummaryResult")[[1]], verbose) + ids <- make_ids(out) + res_list <- lapply(seq(1, length(ids), by = batch_size), function(i) { + querysum <- list(db = "nucleotide", + id = paste(ids[i:min(i + batch_size - 1, length(ids))], collapse = " "), + api_key = ncbi_key()) + z <- con$get("entrez/eutils/esummary.fcgi", query = querysum) + z$raise_for_status() + xml2::xml_find_all(xml2::read_xml(z$parse("UTF-8")), "//eSummaryResult") + }) + + res <- do.call(rbind, lapply(res_list, function(x) parse_ncbi(xx, x, verbose))) } - + mssg(verbose, "...done.") stats::setNames(res, c("taxon", "gene_desc", "gi_no", "acc_no", "length", "sequence", "spused")) } - + foo_safe <- tryfail(NULL, foo) - if (length(taxa) == 1){ foo_safe(taxa) } else { lapply(taxa, foo_safe) } + if (length(taxa) == 1){ + return(foo_safe(taxa)) + } else { + return(do.call(rbind, lapply(taxa, foo_safe))) + } } parse_ncbi <- function(xx, z, verbose) { + names <- xml2::xml_attr(xml2::xml_find_all(z, "//Item"), "Name") # gets names of values in summary + + if(length(names) == 0) { + message("No sequences found for ", xx) + return(data.frame(taxon = xx, gene_desc = NA, + gi_no = NA, acc_no = NA, length = NA, + sequence = NA, spused = NA, + stringsAsFactors = FALSE)) + } + prd <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Caption"]')) # get access numbers prd <- sapply(prd, function(x) strsplit(x, "_")[[1]][[1]], USE.NAMES=FALSE) l_ <- as.numeric(xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Length"]'))) # gets seq lengths @@ -112,6 +133,9 @@ parse_ncbi <- function(xx, z, verbose) { sns <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Title"]')) # gets seq lengths # get spp names df <- data.frame(gis=gis, length=l_, spnames=sns, predicted=prd, stringsAsFactors = FALSE) df <- df[!df$predicted %in% c("XM","XR"),] # remove predicted sequences + if (nrow(df) == 0) { + return(data.frame(taxon = xx, gene_desc = NA, gi_no = NA, acc_no = NA, length = NA, sequence = NA, spused = NA, stringsAsFactors = FALSE)) + } gisuse <- df[which.max(x=df$length),] # picks longest sequnence length if (NROW(gisuse) > 1) { gisuse <- gisuse[sample(NROW(gisuse), 1), ] @@ -126,12 +150,25 @@ parse_ncbi <- function(xx, z, verbose) { outseq <- w$parse("UTF-8") seq <- gsub("\n", "", strsplit(sub("\n", "<<<", outseq), "<<<")[[1]][[2]]) accessnum <- strsplit(outseq, "\\|")[[1]][4] - outt <- list(xx, as.character(gisuse[,3]), gisuse[,1], accessnum, gisuse[,2], seq) + outt <- list(taxon = xx, gene_desc = as.character(gisuse[,3]), + gi_no = gisuse[,1], acc_no = accessnum, + length = gisuse[,2], sequence = seq, + spused = paste( + strsplit(as.character(gisuse[,3]), + " ")[[1]][1:2], + collapse = " ")) spused <- paste(strsplit(outt[[2]], " ")[[1]][1:2], sep = "", collapse = " ") - outoutout <- data.frame(outt, spused = spused, stringsAsFactors = FALSE) - names(outoutout) <- NULL - outoutout + outoutout <- data.frame(outt, stringsAsFactors = FALSE) + + return(outoutout) } make_ids <- function(x) as.numeric(xml2::xml_text(xml2::xml_find_all(x, "//IdList//Id"))) + +tryfail <- function(default, code) { + tryCatch(code, error = function(e) { + message(e) + return(default) + }) +} \ No newline at end of file diff --git a/tests/testthat/test-ncbi.R b/tests/testthat/test-ncbi.R new file mode 100644 index 0000000..37b4611 --- /dev/null +++ b/tests/testthat/test-ncbi.R @@ -0,0 +1,54 @@ +context("NCBI tests") + +test_one_species <- "Acer rubrum" +test_three_species <- c("Colletes similis", "Halictus ligatus", "Perdita californica") +test_genes <- c("coi", "co1") +test_that("ncbi_byname gets seq for single species", { + skip_on_cran() + result <- ncbi_byname("Acer rubrum") + expect_equal(ncol(result), 7) + expect_equal(nrow(result), 1) + expect_true(!is.na(result$taxon)) + expect_true(!is.na(result$sequence)) +}) + +test_that("ncbi_byname gets seq for multiple species", { + skip_on_cran() + result <- ncbi_byname(test_three_species) + expect_equal(ncol(result), 7) + expect_gte(nrow(result), 3) + expect_true(all(!is.na(result$taxon))) + expect_true(all(!is.na(result$sequence))) +}) + +test_that("ncbi_byname handles cases with no results"), { + skip_on_cran() + result <- ncbi_byname("This is not a species") + expect_equal(ncol(result), 7) + expect_equal(nrow(result), 1) + expect_true(all(is.na(result[,2:7]))) +}) + +test_that("ncbi_byname gets seq for single gene", { + skip_on_cran() + result <- ncbi_byname(test_one_species, gene = "coi") + expect_equal(ncol(result), 7) + expect_gte(nrow(result), 1) + expect_true(all(!is.na(result$taxon))) + expect_true(all(!is.na(result$sequence))) +}) + +## regression tests for https://github.com/ropensci/traits/issues/126 + +test_that("ncbi_byname works under subset of conditions from issue #126", { + skip_on_cran() + + res <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "1:2000") + expect_true(is.data.frame(res)) + res2 <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "500:750") + expect_true(is.data.frame(res2)) + + res3 <- ncbi_byname(taxa = "Sardinops melanostictus", gene = c("12s"), seqrange = "1:2000")) + expect_true(is.data.frame(res3)) + +}) \ No newline at end of file