Skip to content

Commit

Permalink
simplify/improve ENSEMBL base transcript name matching
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed Sep 16, 2024
1 parent c8c8ab0 commit 81723a2
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
.Rdata
.httr-oauth
.DS_Store
dcs04_data
inst/doc
33 changes: 20 additions & 13 deletions R/getDegTx.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,35 @@
#' @import rlang
#'
#' @examples
#' getDegTx(covComb_tx_deg)
#' stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1)
getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = select_transcripts(type), assayname = "tpm") {

type = arg_match(type)

#' degTx <- getDegTx(rse_tx, "standard")
getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = NULL, assayname = "tpm") {

#type = arg_match(type)
if (is.null(sig_transcripts)) {
type = arg_match(type)
sig_transcripts <- select_transcripts(type)
} else {
type = "custom"
}
# Validate rse_tx is a RangedSummarizedExperiment object
if (!is(rse_tx, "RangedSummarizedExperiment")) {
stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE)
}

# Check if assayname is in assayNames
if (!assayname %in% assayNames(rse_tx)) {
stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE)
}

# Check for validity and matching of tx names
sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts')

# Subset rse_tx to include sig_transcripts
rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE]

wtx <- check_tx_names(rownames(rse_tx), sig_transcripts)
if (length(wtx) < 10) {
stop("Not enough transcript names were found in the '",type, "' degradation model transcripts" )
}
# if (verbose)
message(" '",type,"' degradation model transcripts found: ", length(wtx))
rse_tx <- rse_tx[wtx, , drop = FALSE]

# Check if the row means is greater than 1
if (mean(rowMeans(assays(rse_tx)[[assayname]])) < 1) {
warning("The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.")
Expand Down
51 changes: 12 additions & 39 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,46 +14,19 @@
#' @export
#'
#' @examples
#' sig_transcripts = select_transcripts("cell_component")
#' check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts')
#' sig_tx <- select_transcripts("cell_component")
#' whichTx <- check_tx_names(rownames(rse_tx), sig_tx)


check_tx_names = function(tx1, tx2, arg_name1, arg_name2) {
check_tx_names = function(txnames, sig_transcripts) {
# Functions for checking whether a vector of transcripts all match GENCODE
# or ENSEMBL naming conventions
is_gencode = function(x) all(grepl("^ENST.*?\\.", x))
is_ensembl = function(x) all(grepl("^ENST", x) & !grepl("\\.", x))

# Check that both vectors either follow GENCODE or ENSEMBL
if (!is_gencode(tx1) && !is_ensembl(tx1)) {
stop(
sprintf(
"'%s' must use either all GENCODE or all ENSEMBL transcript IDs",
arg_name1
)
)
}
if (!is_gencode(tx2) && !is_ensembl(tx2)) {
stop(
sprintf(
"'%s' must use either all GENCODE or all ENSEMBL transcript IDs",
arg_name2
)
)
}

# Change 'tx2' to match 'tx1', noting that the case where 'tx1' is GENCODE
# but 'tx2' is ENSEMBL is not allowed (and an error will be thrown further
# down)
if (is_gencode(tx2) && is_ensembl(tx1)) {
tx2 = sub('\\..*', '', tx2)
}

# At least some transcripts must overlap between 'tx1' and 'tx2'
if (!any(tx2 %in% tx1)) {
stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1))
## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers.
## Since release 44, these have their own IDs
if (!all(grepl("^ENST\\d+", txnames))) {
stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" )
}

# Since only 'tx2' was modified, return the changed copy
return(tx2)
}
## normalize the transcript names
sig_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', sig_transcripts, perl=TRUE)
r_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE)
which(r_tx %in% sig_tx)
}

0 comments on commit 81723a2

Please sign in to comment.