Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix errors with ncbi_byname reported in #126 #132

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 55 additions & 18 deletions R/ncbi_byname.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
#' species <- c("Colletes similis","Halictus ligatus","Perdita californica")
#' ncbi_byname(taxa=species, gene = c("coi", "co1"), seqrange = "1:2000")
#' }
ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
verbose=TRUE, ...) {
ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE, verbose=TRUE, batch_size = 100, ...) {
foo <- function(xx) {
mssg(verbose, paste("Working on ", xx, "...", sep = ""))
mssg(verbose, "...retrieving sequence IDs...")
Expand Down Expand Up @@ -51,9 +50,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
if (!getrelated) {
mssg(verbose, paste("no sequences of ", gene, " for ", xx, sep = ""))
res <- data.frame(
xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_,
taxon = xx, gene_desc = NA_character_,
gi_no = NA_real_, acc_no = NA_character_, length = NA_real_,
sequence = NA_character_, spused = NA_character_,
stringsAsFactors = FALSE)
names(res) <- NULL
} else {
mssg(verbose, "...retrieving sequence IDs for related species...")
newname <- strsplit(xx, " ")[[1]][[1]]
Expand All @@ -68,9 +68,10 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
if (as.numeric(xml2::xml_text(xml2::xml_find_all(out, "//Count")[[1]])) == 0) {
mssg(verbose, paste("no sequences of ", gene, " for ", xx, " or ", newname, sep = ""))
res <- data.frame(
xx, NA_character_, NA_real_, NA_character_, NA_real_, NA_character_, NA_character_,
taxon = xx, gene_desc = NA_character_,
gi_no = NA_real_, acc_no = NA_character_, length = NA_real_,
sequence = NA_character_, spused = NA_character_,
stringsAsFactors = FALSE)
names(res) <- NULL
} else {
## For each species = get GI number with longest sequence
mssg(verbose, "...retrieving sequence ID with longest sequence length...")
Expand All @@ -88,30 +89,53 @@ ncbi_byname <- function(taxa, gene="COI", seqrange="1:3000", getrelated=FALSE,
## For each species = get GI number with longest sequence
mssg(verbose, "...retrieving sequence ID with longest sequence length...")
# construct query for species
querysum <- list(db = "nucleotide", api_key = ncbi_key(),
id = paste(make_ids(out), collapse = " "))
z <- con$get("entrez/eutils/esummary.fcgi", query = querysum)
txt <- z$parse("UTF-8")
res <- parse_ncbi(xx, xml2::xml_find_all(xml2::read_xml(txt), "//eSummaryResult")[[1]], verbose)
ids <- make_ids(out)
res_list <- lapply(seq(1, length(ids), by = batch_size), function(i) {
querysum <- list(db = "nucleotide",
id = paste(ids[i:min(i + batch_size - 1, length(ids))], collapse = " "),
api_key = ncbi_key())
z <- con$get("entrez/eutils/esummary.fcgi", query = querysum)
z$raise_for_status()
xml2::xml_find_all(xml2::read_xml(z$parse("UTF-8")), "//eSummaryResult")
})

res <- do.call(rbind, lapply(res_list, function(x) parse_ncbi(xx, x, verbose)))
}

mssg(verbose, "...done.")
stats::setNames(res, c("taxon", "gene_desc", "gi_no", "acc_no", "length", "sequence", "spused"))
}

foo_safe <- tryfail(NULL, foo)
if (length(taxa) == 1){ foo_safe(taxa) } else { lapply(taxa, foo_safe) }
if (length(taxa) == 1){
return(foo_safe(taxa))
} else {
return(do.call(rbind, lapply(taxa, foo_safe)))
}
}

parse_ncbi <- function(xx, z, verbose) {

names <- xml2::xml_attr(xml2::xml_find_all(z, "//Item"), "Name") # gets names of values in summary

if(length(names) == 0) {
message("No sequences found for ", xx)
return(data.frame(taxon = xx, gene_desc = NA,
gi_no = NA, acc_no = NA, length = NA,
sequence = NA, spused = NA,
stringsAsFactors = FALSE))
}

prd <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Caption"]')) # get access numbers
prd <- sapply(prd, function(x) strsplit(x, "_")[[1]][[1]], USE.NAMES=FALSE)
l_ <- as.numeric(xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Length"]'))) # gets seq lengths
gis <- as.numeric(xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Gi"]'))) # gets GI numbers
sns <- xml2::xml_text(xml2::xml_find_all(z, '//Item[@Name="Title"]')) # gets seq lengths # get spp names
df <- data.frame(gis=gis, length=l_, spnames=sns, predicted=prd, stringsAsFactors = FALSE)
df <- df[!df$predicted %in% c("XM","XR"),] # remove predicted sequences
if (nrow(df) == 0) {
return(data.frame(taxon = xx, gene_desc = NA, gi_no = NA, acc_no = NA, length = NA, sequence = NA, spused = NA, stringsAsFactors = FALSE))
}
gisuse <- df[which.max(x=df$length),] # picks longest sequnence length
if (NROW(gisuse) > 1) {
gisuse <- gisuse[sample(NROW(gisuse), 1), ]
Expand All @@ -126,12 +150,25 @@ parse_ncbi <- function(xx, z, verbose) {
outseq <- w$parse("UTF-8")
seq <- gsub("\n", "", strsplit(sub("\n", "<<<", outseq), "<<<")[[1]][[2]])
accessnum <- strsplit(outseq, "\\|")[[1]][4]
outt <- list(xx, as.character(gisuse[,3]), gisuse[,1], accessnum, gisuse[,2], seq)
outt <- list(taxon = xx, gene_desc = as.character(gisuse[,3]),
gi_no = gisuse[,1], acc_no = accessnum,
length = gisuse[,2], sequence = seq,
spused = paste(
strsplit(as.character(gisuse[,3]),
" ")[[1]][1:2],
collapse = " "))

spused <- paste(strsplit(outt[[2]], " ")[[1]][1:2], sep = "", collapse = " ")
outoutout <- data.frame(outt, spused = spused, stringsAsFactors = FALSE)
names(outoutout) <- NULL
outoutout
outoutout <- data.frame(outt, stringsAsFactors = FALSE)

return(outoutout)
}

make_ids <- function(x) as.numeric(xml2::xml_text(xml2::xml_find_all(x, "//IdList//Id")))

tryfail <- function(default, code) {
tryCatch(code, error = function(e) {
message(e)
return(default)
})
}
55 changes: 55 additions & 0 deletions tests/testthat/test-ncbi.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
context("NCBI tests")

test_one_species <- "Acer rubrum"
test_three_species <- c("Colletes similis", "Halictus ligatus", "Perdita californica")
test_genes <- c("coi", "co1")
test_that("ncbi_byname gets seq for single species", {
skip_on_cran()
result <- ncbi_byname("Acer rubrum")
expect_equal(ncol(result), 7)
expect_equal(nrow(result), 1)
expect_true(!is.na(result$taxon))
expect_true(!is.na(result$sequence))
})

test_that("ncbi_byname gets seq for multiple species", {
skip_on_cran()
result <- ncbi_byname(test_three_species)
expect_equal(ncol(result), 7)
expect_gte(nrow(result), 3)
expect_true(all(!is.na(result$taxon)))
expect_true(all(!is.na(result$sequence)))
})

test_that("ncbi_byname handles cases with no results"), {
skip_on_cran()
result <- ncbi_byname("This is not a species")
expect_equal(ncol(result), 7)
expect_equal(nrow(result), 1)
expect_true(all(is.na(result[,2:7])))
})

test_that("ncbi_byname gets seq for single gene", {
skip_on_cran()
result <- ncbi_byname(test_one_species, gene = "coi")
expect_equal(ncol(result), 7)
expect_gte(nrow(result), 1)
expect_true(all(!is.na(result$taxon)))
expect_true(all(!is.na(result$sequence)))
})

## regression tests for https://github.com/ropensci/traits/issues/126

test_that("ncbi_byname works under subset of conditions from issue #126", {
skip_on_cran()

res <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "1:2000")
expect_true(is.data.frame(res))
res <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "1:2000")
dlebauer marked this conversation as resolved.
Show resolved Hide resolved
res2 <- ncbi_byname(taxa = "Coryphaena hippurus", gene = c("Coi"), seqrange = "500:750")
expect_true(is.data.frame(res2))

res3 <- ncbi_byname(taxa = "Sardinops melanostictus", gene = c("12s"), seqrange = "1:2000"))
expect_true(is.data.frame(res3))

})