From ec25f84216708d507d315f3906b638f65d79340b Mon Sep 17 00:00:00 2001 From: Gene233 Date: Tue, 26 Mar 2024 23:37:46 +1100 Subject: [PATCH] Bump R version to >= 4.4 and update vignette and function details. --- DESCRIPTION | 19 +++--- NEWS.md | 4 ++ R/{objects.R => AllClasses.R} | 0 R/AllGenerics.R | 4 +- R/tf_idf_iae_wrappers.R | 113 ++++++++++++++++++++++++---------- README.md | 8 +++ man/cal_score.Rd | 4 +- man/iae.Rd | 5 +- man/iae_hdb.Rd | 5 +- man/iae_igm.Rd | 9 ++- man/iae_m.Rd | 5 +- man/iae_prob.Rd | 5 +- man/iae_rf.Rd | 5 +- man/iae_sd.Rd | 5 +- man/idf.Rd | 5 +- man/idf_hdb.Rd | 9 ++- man/idf_igm.Rd | 9 ++- man/idf_m.Rd | 9 ++- man/idf_prob.Rd | 5 +- man/idf_rf.Rd | 5 +- man/idf_sd.Rd | 5 +- man/tf.Rd | 9 ++- vignettes/smartid_Demo.Rmd | 57 ++++++++++------- 23 files changed, 212 insertions(+), 92 deletions(-) rename R/{objects.R => AllClasses.R} (100%) diff --git a/DESCRIPTION b/DESCRIPTION index 8f48b57..0e4ce19 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,25 +1,26 @@ Package: smartid Title: Scoring and Marker Selection Method Based on Modified TF-IDF -Version: 0.99.2 +Version: 0.99.3 Authors@R: person("Jinjin", "Chen", email = "chen.j@wehi.edu.au", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7923-5723")) -Description: This package enables automated selection of group specific signature, - especially for rare population. The package is developed for generating - specifc lists of signature genes based on TF-IDF modified methods. It can - also be used as a new gene-set scoring method or data transformation method. - Multiple visualization functions are implemented in this package. +Description: This package enables automated selection of group specific + signature, especially for rare population. The package is developed for + generating specifc lists of signature genes based on Term Frequency-Inverse + Document Frequency (TF-IDF) modified methods. It can also be used as a new + gene-set scoring method or data transformation method. Multiple visualization + functions are implemented in this package. biocViews: Software, GeneExpression, Transcriptomics License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 Collate: + 'AllClasses.R' 'tf_idf_iae_wrappers.R' 'score.R' 'AllGenerics.R' 'gs_score-methods.R' - 'objects.R' 'plot.R' 'scale_mgm.R' 'score-methods.R' @@ -28,7 +29,7 @@ Collate: 'top_markers.R' 'top_markers-methods.R' Depends: - R (>= 4.3) + R (>= 4.4) Imports: dplyr, ggplot2, @@ -57,4 +58,4 @@ URL: https://davislaboratory.github.io/smartid BugReports: https://github.com/DavisLaboratory/smartid/issues VignetteBuilder: knitr Config/testthat/edition: 3 -LazyData: true +LazyData: false diff --git a/NEWS.md b/NEWS.md index 0d25c84..e10a3e2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,3 +9,7 @@ # smartid 0.99.2 * Added test for `gs_score` function. + +# smartid 0.99.3 + +* Bump R version dependency to >= 4.4 and add details for TF, IDF, IAE functions. diff --git a/R/objects.R b/R/AllClasses.R similarity index 100% rename from R/objects.R rename to R/AllClasses.R diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 8722f13..74b268a 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -10,7 +10,9 @@ ## --------------------------------------------------------------------------- #' @title calculate combined score #' -#' @description compute TF, IDF, IAE and combine the score +#' @description compute TF (term/feature frequency), IDF (inverse document/cell +#' frequency), IAE (inverse average expression of features) and combine the +#' the final score #' #' @inheritParams cal_score_init #' @param data an expression object, can be matrix or SummarizedExperiment diff --git a/R/tf_idf_iae_wrappers.R b/R/tf_idf_iae_wrappers.R index c811f02..7e0e656 100644 --- a/R/tf_idf_iae_wrappers.R +++ b/R/tf_idf_iae_wrappers.R @@ -2,13 +2,18 @@ #-----------------TF variants------------------# ################################################# +### $$\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}$$ + ## term frequency -#' compute term/gene frequency within each cell +#' compute term/feature frequency within each cell +#' +#' @details +#' \deqn{\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}} #' #' @param expr a count matrix, features in row and cells in column #' @param log logical, if to do log-transformation #' -#' @return a matrix of tf +#' @return a matrix of term/gene frequency #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -34,9 +39,12 @@ tf <- function(expr, log = FALSE) { #' standard inverse cell frequency #' +#' @details +#' \deqn{\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})} +#' #' @inheritParams idf_rf #' -#' @return a vector of idf score for each feature +#' @return a vector of inverse cell frequency score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -54,14 +62,17 @@ idf <- function(expr, features = NULL, thres = 0) { } ## inverse document frequency max -### $$\mathbf{IDF_{i,d}} = log(\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$ +### $$\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})$$ ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$ -#' inverse document frequency: max +#' inverse cell frequency: max +#' +#' @details +#' \deqn{\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})} #' #' @inheritParams idf_rf #' -#' @return a vector of idf score for each feature +#' @return a matrix of inverse cell frequency score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -84,14 +95,17 @@ idf_m <- function(expr, features = NULL, thres = 0) { } ## inverse document frequency sd -### $$\mathbf{IDF} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$ +### $$\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$ ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$ #' inverse cell frequency using standard deviation (SD) #' +#' @details +#' \deqn{\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})} +#' #' @inheritParams idf_rf #' -#' @return a vector of idf score for each feature +#' @return a vector of inverse cell frequency score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -110,14 +124,17 @@ idf_sd <- function(expr, features = NULL, thres = 0) { } ## inverse document frequency using hdbscan cluster as label -#' Title +#' inverse document frequency using hdbscan cluster as label +#' +#' @details +#' Details as [smartid:::idf_prob()]. #' #' @inheritParams idf_rf #' @param minPts integer, minimum size of clusters, default 2. #' Details in [dbscan::hdbscan()]. #' @param ... parameters for [dbscan::hdbscan()] #' -#' @return a matrix of IDF score +#' @return a matrix of inverse cell frequency score #' #' @examples #' set.seed(123) @@ -137,9 +154,9 @@ idf_hdb <- function(expr, features = NULL, multi = TRUE, ## factor cluster cluster <- factor(cluster) - idf <- idf_rf(expr = expr, features = features, - label = cluster, multi = multi, - thres = thres) + idf <- idf_prob(expr = expr, features = features, + label = cluster, multi = multi, + thres = thres) return(idf) } @@ -150,13 +167,16 @@ idf_hdb <- function(expr, features = NULL, multi = TRUE, #' labeled inverse cell frequency: relative frequency #' +#' @details +#' \deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}})} +#' #' @param expr a matrix, features in row and cells in column #' @param features vector, feature names or indexes to compute #' @param label vector, group label of each cell #' @param multi logical, if to compute based on binary (FALSE) or multi-class (TRUE) #' @param thres numeric, cell only counts when expr > threshold, default 0 #' -#' @return a matrix of IDF score +#' @return a matrix of inverse cell frequency score #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -197,9 +217,12 @@ idf_rf <- function(expr, features = NULL, label, #' labeled inverse cell frequency: probability based #' +#' @details +#' \deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})} +#' #' @inheritParams idf_rf #' -#' @return a matrix of IDF score +#' @return a matrix of inverse cell frequency score #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -234,16 +257,19 @@ idf_prob <- function(expr, features = NULL, label, } ## labeled inverse document frequency IGM -### $$\mathbf{IGM} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})$$ +### $$\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})$$ ### $$\mathbf{k}: type\ in\ total\ group\ K$$ ### $$\mathbf{r_{k}}: rank\ of\ n_{i,j\in D})_{k}\ in\ total\ group\ K$$ -#' labeled inverse document frequency IGM +#' labeled inverse cell frequency: IGM +#' +#' @details +#' \deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})} #' #' @inheritParams idf_rf #' @param lambda numeric, hyperparameter for IGM #' -#' @return a vector of igm score for each feature +#' @return a vector of inverse gravity moment score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -274,14 +300,17 @@ idf_igm <- function(expr, features = NULL, label, lambda = 7, thres = 0) { ##-----------------unlabeled-------------------## ## inverse average expression -### $$\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})$$ +### $$\mathbf{IAE_i} = log(1+\frac{n}{\sum_j^n\hat N_{i,j}+1})$$ ### $$n: total\ counts\ of\ documents;\ \hat N_{i,j}: max(0, N_{i,j} - threshold)$$ #' standard inverse average expression #' +#' @details +#' \deqn{\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})} +#' #' @inheritParams idf_rf #' -#' @return a vector of iae score for each feature +#' @return a vector of inverse average expression score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -301,14 +330,17 @@ iae <- function(expr, features = NULL, thres = 0) { } ## inverse average expression max -### $$\mathbf{IAE_i} = log(1+\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$ +### $$\mathbf{IAE_{i,d}} = log(1+\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$ ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$ #' inverse average expression: max #' +#' @details +#' \deqn{\mathbf{IAE_{i,j}} = log(1+\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{\sum_{j = 1}^{n} sign(N_{i,j} > threshold)+1})} +#' #' @inheritParams idf_rf #' -#' @return a vector of iae score for each feature +#' @return a matrix of inverse average expression score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -337,9 +369,12 @@ iae_m <- function(expr, features = NULL, thres = 0) { #' inverse average expression using standard deviation (SD) #' +#' @details +#' \deqn{\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})} +#' #' @inheritParams idf_rf #' -#' @return a vector of iae score for each feature +#' @return a vector of inverse average expression score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -362,12 +397,15 @@ iae_sd <- function(expr, features = NULL, thres = 0) { ## inverse average expression using hdbscan cluster as label #' inverse average expression using hdbscan cluster as label #' +#' @details +#' Details as [smartid:::iae_prob()]. +#' #' @inheritParams idf_rf #' @param minPts integer, minimum size of clusters, default 2. #' Details in [dbscan::hdbscan()]. #' @param ... parameters for [dbscan::hdbscan()] #' -#' @return a matrix of IAE score +#' @return a matrix of inverse average expression score #' #' @examples #' set.seed(123) @@ -401,9 +439,9 @@ iae_hdb <- function(expr, features = NULL, multi = TRUE, # # iae <- log1p((mean_row_in/(mean_row_notin+0.01))[, cluster]) ## IDF scores - iae <- iae_rf(expr = expr, features = features, - label = cluster, multi = multi, - thres = thres) + iae <- iae_prob(expr = expr, features = features, + label = cluster, multi = multi, + thres = thres) return(iae) } @@ -416,9 +454,12 @@ iae_hdb <- function(expr, features = NULL, multi = TRUE, #' labeled inverse average expression: relative frequency #' +#' @details +#' \deqn{\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}})} +#' #' @inheritParams idf_rf #' -#' @return a matrix of IAE score +#' @return a matrix of inverse average expression score #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -452,16 +493,19 @@ iae_rf <- function(expr, features = NULL, label, } ## labeled inverse average expression: probability based -### $$\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$ +### $$\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$ ### modified from ### $$\mathbf{IDF_{i,j}} = log(1+\frac{A}{max(B)+1}*\frac{A}{C+1})$$ ### A denotes the number of cells belonging to category D where the gene i occurs at least once; B denotes the number of cells not belonging to category D where the gene i occurs at least once; C denotes the number of cells belonging to category D where the gene i does not occur; D denotes the number of cells not belonging to category D where the gene i does not occur. #' labeled inverse average expression: probability based #' +#' @details +#' \deqn{\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))} +#' #' @inheritParams idf_rf #' -#' @return a matrix of IAE score +#' @return a matrix of inverse average expression score #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) @@ -495,16 +539,19 @@ iae_prob <- function(expr, features = NULL, label, } ## labeled inverse average expression IGM -### $$\mathbf{IGM} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})$$ +### $$\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})$$ ### $$\mathbf{k}: type\ in\ total\ group\ K$$ ### $$\mathbf{r_{k}}: rank\ of\ mean(N_{i,j\in D})_{k}\ in\ total\ group\ K$$ -#' labeled inverse average expression IGM +#' labeled inverse average expression: IGM +#' +#' @details +#' \deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})} #' #' @inheritParams idf_rf #' @param lambda numeric, hyperparameter for IGM #' -#' @return a vector of igm score for each feature +#' @return a vector of inverse gravity moment score for each feature #' #' @examples #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/README.md b/README.md index 134d2cb..17cd6c2 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,12 @@ You can install the development version of smartid like so: devtools::install("DavisLaboratory/smartid") ``` +smartid can be installed from Bioconductor directly as follows: + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("smartid") +``` diff --git a/man/cal_score.Rd b/man/cal_score.Rd index afe429a..df23b7a 100644 --- a/man/cal_score.Rd +++ b/man/cal_score.Rd @@ -62,7 +62,9 @@ optional, default 'score'} A list of matrices or se object containing combined score } \description{ -compute TF, IDF, IAE and combine the score +compute TF (term/feature frequency), IDF (inverse document/cell +frequency), IAE (inverse average expression of features) and combine the +the final score } \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/man/iae.Rd b/man/iae.Rd index 78f4ef8..6798938 100644 --- a/man/iae.Rd +++ b/man/iae.Rd @@ -14,11 +14,14 @@ iae(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of iae score for each feature +a vector of inverse average expression score for each feature } \description{ standard inverse average expression } +\details{ +\deqn{\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::iae(data) diff --git a/man/iae_hdb.Rd b/man/iae_hdb.Rd index 1200f79..6ce6435 100644 --- a/man/iae_hdb.Rd +++ b/man/iae_hdb.Rd @@ -21,11 +21,14 @@ Details in \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}.} \item{...}{parameters for \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}} } \value{ -a matrix of IAE score +a matrix of inverse average expression score } \description{ inverse average expression using hdbscan cluster as label } +\details{ +Details as \code{\link[smartid:::iae_prob]{smartid::::iae_prob()}}. +} \examples{ set.seed(123) data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/man/iae_igm.Rd b/man/iae_igm.Rd index b166999..86f9ec6 100644 --- a/man/iae_igm.Rd +++ b/man/iae_igm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tf_idf_iae_wrappers.R \name{iae_igm} \alias{iae_igm} -\title{labeled inverse average expression IGM} +\title{labeled inverse average expression: IGM} \usage{ iae_igm(expr, features = NULL, label, lambda = 7, thres = 0) } @@ -18,10 +18,13 @@ iae_igm(expr, features = NULL, label, lambda = 7, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of igm score for each feature +a vector of inverse gravity moment score for each feature } \description{ -labeled inverse average expression IGM +labeled inverse average expression: IGM +} +\details{ +\deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})} } \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/man/iae_m.Rd b/man/iae_m.Rd index d646156..b57cbd6 100644 --- a/man/iae_m.Rd +++ b/man/iae_m.Rd @@ -14,11 +14,14 @@ iae_m(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of iae score for each feature +a matrix of inverse average expression score for each feature } \description{ inverse average expression: max } +\details{ +\deqn{\mathbf{IAE_{i,j}} = log(1+\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{\sum_{j = 1}^{n} sign(N_{i,j} > threshold)+1})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::iae_m(data) diff --git a/man/iae_prob.Rd b/man/iae_prob.Rd index aab7677..1a96eeb 100644 --- a/man/iae_prob.Rd +++ b/man/iae_prob.Rd @@ -18,11 +18,14 @@ iae_prob(expr, features = NULL, label, multi = TRUE, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a matrix of IAE score +a matrix of inverse average expression score } \description{ labeled inverse average expression: probability based } +\details{ +\deqn{\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::iae_prob(data, label = sample(c("A", "B"), 10, replace = TRUE)) diff --git a/man/iae_rf.Rd b/man/iae_rf.Rd index 2f35563..8603ef6 100644 --- a/man/iae_rf.Rd +++ b/man/iae_rf.Rd @@ -18,11 +18,14 @@ iae_rf(expr, features = NULL, label, multi = TRUE, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a matrix of IAE score +a matrix of inverse average expression score } \description{ labeled inverse average expression: relative frequency } +\details{ +\deqn{\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::iae_rf(data, label = sample(c("A", "B"), 10, replace = TRUE)) diff --git a/man/iae_sd.Rd b/man/iae_sd.Rd index 23adfa2..ff3091e 100644 --- a/man/iae_sd.Rd +++ b/man/iae_sd.Rd @@ -14,11 +14,14 @@ iae_sd(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of iae score for each feature +a vector of inverse average expression score for each feature } \description{ inverse average expression using standard deviation (SD) } +\details{ +\deqn{\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::iae_sd(data) diff --git a/man/idf.Rd b/man/idf.Rd index a7a26d9..5e5cc6e 100644 --- a/man/idf.Rd +++ b/man/idf.Rd @@ -14,11 +14,14 @@ idf(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of idf score for each feature +a vector of inverse cell frequency score for each feature } \description{ standard inverse cell frequency } +\details{ +\deqn{\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::idf(data) diff --git a/man/idf_hdb.Rd b/man/idf_hdb.Rd index d329845..392ec79 100644 --- a/man/idf_hdb.Rd +++ b/man/idf_hdb.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tf_idf_iae_wrappers.R \name{idf_hdb} \alias{idf_hdb} -\title{Title} +\title{inverse document frequency using hdbscan cluster as label} \usage{ idf_hdb(expr, features = NULL, multi = TRUE, thres = 0, minPts = 2, ...) } @@ -21,10 +21,13 @@ Details in \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}.} \item{...}{parameters for \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}} } \value{ -a matrix of IDF score +a matrix of inverse cell frequency score } \description{ -Title +inverse document frequency using hdbscan cluster as label +} +\details{ +Details as \code{\link[smartid:::idf_prob]{smartid::::idf_prob()}}. } \examples{ set.seed(123) diff --git a/man/idf_igm.Rd b/man/idf_igm.Rd index c6c6f99..9b42ba5 100644 --- a/man/idf_igm.Rd +++ b/man/idf_igm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tf_idf_iae_wrappers.R \name{idf_igm} \alias{idf_igm} -\title{labeled inverse document frequency IGM} +\title{labeled inverse cell frequency: IGM} \usage{ idf_igm(expr, features = NULL, label, lambda = 7, thres = 0) } @@ -18,10 +18,13 @@ idf_igm(expr, features = NULL, label, lambda = 7, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of igm score for each feature +a vector of inverse gravity moment score for each feature } \description{ -labeled inverse document frequency IGM +labeled inverse cell frequency: IGM +} +\details{ +\deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})} } \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/man/idf_m.Rd b/man/idf_m.Rd index c818958..3c65bd2 100644 --- a/man/idf_m.Rd +++ b/man/idf_m.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tf_idf_iae_wrappers.R \name{idf_m} \alias{idf_m} -\title{inverse document frequency: max} +\title{inverse cell frequency: max} \usage{ idf_m(expr, features = NULL, thres = 0) } @@ -14,10 +14,13 @@ idf_m(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of idf score for each feature +a matrix of inverse cell frequency score for each feature } \description{ -inverse document frequency: max +inverse cell frequency: max +} +\details{ +\deqn{\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})} } \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/man/idf_prob.Rd b/man/idf_prob.Rd index f0d014d..15ca94e 100644 --- a/man/idf_prob.Rd +++ b/man/idf_prob.Rd @@ -18,11 +18,14 @@ idf_prob(expr, features = NULL, label, multi = TRUE, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a matrix of IDF score +a matrix of inverse cell frequency score } \description{ labeled inverse cell frequency: probability based } +\details{ +\deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::idf_prob(data, label = sample(c("A", "B"), 10, replace = TRUE)) diff --git a/man/idf_rf.Rd b/man/idf_rf.Rd index e08ce2b..df952fc 100644 --- a/man/idf_rf.Rd +++ b/man/idf_rf.Rd @@ -18,11 +18,14 @@ idf_rf(expr, features = NULL, label, multi = TRUE, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a matrix of IDF score +a matrix of inverse cell frequency score } \description{ labeled inverse cell frequency: relative frequency } +\details{ +\deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::idf_rf(data, label = sample(c("A", "B"), 10, replace = TRUE)) diff --git a/man/idf_sd.Rd b/man/idf_sd.Rd index 08f0c65..8f79c81 100644 --- a/man/idf_sd.Rd +++ b/man/idf_sd.Rd @@ -14,11 +14,14 @@ idf_sd(expr, features = NULL, thres = 0) \item{thres}{numeric, cell only counts when expr > threshold, default 0} } \value{ -a vector of idf score for each feature +a vector of inverse cell frequency score for each feature } \description{ inverse cell frequency using standard deviation (SD) } +\details{ +\deqn{\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})} +} \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) smartid:::idf_sd(data) diff --git a/man/tf.Rd b/man/tf.Rd index 2677116..37c895c 100644 --- a/man/tf.Rd +++ b/man/tf.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tf_idf_iae_wrappers.R \name{tf} \alias{tf} -\title{compute term/gene frequency within each cell} +\title{compute term/feature frequency within each cell} \usage{ tf(expr, log = FALSE) } @@ -12,10 +12,13 @@ tf(expr, log = FALSE) \item{log}{logical, if to do log-transformation} } \value{ -a matrix of tf +a matrix of term/gene frequency } \description{ -compute term/gene frequency within each cell +compute term/feature frequency within each cell +} +\details{ +\deqn{\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}} } \examples{ data <- matrix(rpois(100, 2), 10, dimnames = list(1:10)) diff --git a/vignettes/smartid_Demo.Rmd b/vignettes/smartid_Demo.Rmd index c914476..39d7246 100644 --- a/vignettes/smartid_Demo.Rmd +++ b/vignettes/smartid_Demo.Rmd @@ -32,7 +32,7 @@ knitr::opts_chunk$set( **smartid** -smartid is a package that enables automated selection of group specific signature, especially for rare population. This package is developed for generating specifc lists of signature genes based on TF-IDF modified methods for labeled data. It can also be used as a new gene-set scoring method or data transformation method for un-labeled data. Multiple visualization functions are implemented in this package. +`smartid` is a package that enables automated selection of group specific signature, especially for rare population. This package is developed for generating specifc lists of signature genes based on TF-IDF modified methods for labeled data. It can also be used as a new gene-set scoring method or data transformation method for un-labeled data. Multiple visualization functions are implemented in this package. # Installation @@ -43,13 +43,6 @@ smartid is a package that enables automated selection of group specific signatur The most updated version of `smartid` is hosted on GitHub and can be installed using `devtools::install_github()` function provided by [devtools](https://cran.r-project.org/package=devtools). ```{r installation, eval=FALSE} -# if (!requireNamespace("devtools", quietly = TRUE)) { -# install.packages("devtools") -# } -# if (!requireNamespace("smartid", quietly = TRUE)) { -# devtools::install_github("DavisLaboratory/smartid") -# } - if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } @@ -62,7 +55,7 @@ if (!requireNamespace("smartid", quietly = TRUE)) { ------------------------------------------------------------------------ -To show a quick start guide of smartid, here we use package `splatter` to simulate a scRNA-seq data of 1000 genes * 3000 cells. This data consists of 4 groups, each has 2% DEGs except Group 4, which has no DEG as a negative control group. +To show a quick start guide of `smartid`, here we use package `r BiocStyle::Biocpkg("splatter")` to simulate a scRNA-seq data of 1000 genes * 3000 cells. This data consists of 4 groups, each has 2% DEGs except Group 4, which has no DEG as a negative control group. ```{r set up, message=FALSE} library(smartid) @@ -104,7 +97,7 @@ data_sim `smartid` can be easily used to accurately identify specific marker genes on labeled data. By adapting and modifying TF-IDF approach, `smartid` shows robust power in finding marker genes, especially for rare population which many methods fail in. -**marker identification of smartid includes 3 key steps:** +**marker identification of `smartid` includes 3 key steps:** step 1. score samples step 2. scale and transform scores @@ -112,15 +105,30 @@ data_sim ## Score Samples -The first step is to score all samples/cells by using specified approach. The score can be composed of 3 terms: TF (term frequency), IDF (inverse document frequency) and IAE (inverse average expression). Each term has a couple of available choices with different formats to suit labeled or un-labeled data. +The first step is to score all samples/cells by using specified approach. The score can be composed of 3 terms: TF (term/feature frequency), IDF (inverse document/cell frequency) and IAE (inverse average expression of features). Each term has a couple of available choices with different formats to suit labeled or un-labeled data. -TF here stands for gene frequency, which is similar to CPM, while IDF represents the inverse cell/sample frequency for scRNA-seq data, and IAE is the inverse average expression of each gene across all cells or cells in each labeled group. +The basic version of TF, IDF and IAE can be termed as: +$$\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}$$ +$$\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})$$ +$$\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})$$ + +$$where\ N_{i,j}\ is\ the\ counts\ of\ feature\ i\ in\ cell\ j;\ \hat N_{i,j}\ is\ max(0,\ N_{i,j} - threshold);$$ +$$\ n\ is\ total\ counts\ of\ documents(cells);\ n_i\ is\ \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$ Here for labeled data, we can choose logTF * IDF_prob * IAE_prob for marker identification. -$$\mathbf{score}=logTF*IDF_{prob}*IAE_{prob}$$ +$\mathbf{score}=logTF*IDF_{prob}*IAE_{prob}$ -Another advantage of smartid is that it can start with raw counts data, with no need for pre-processed data. And the scoring is quite fast. +The probability version of IDF can be termed as: +$$\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})$$ + +And the probability version of IAE can be termed as: +$$\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$ +$$where\ D\ is\ the\ category\ of\ cell\ j;\ \hat D\ is\ the\ category\ other\ than\ D.$$ + +TF here stands for gene frequency, which is similar to CPM, while IDF represents the inverse cell/sample frequency for scRNA-seq data, and IAE is the inverse average expression of each gene across all cells or cells in each labeled group. + +Another advantage of `smartid` is that it can start with raw counts data, with no need for pre-processed data. And the scoring is quite fast. ```{r} ## compute score @@ -142,7 +150,7 @@ names(metadata(data_sim)) ## Scale and Transform Score -Scaling is needed to find the markers specific to the group, however, standard scaling might fail due to the rare populations. Here smartid uses a special scaling strategy `scale_mgm`, which can scale im-balanced data by given group labels. +Scaling is needed to find the markers specific to the group, however, standard scaling might fail due to the rare populations. Here `smartid` uses a special scaling strategy `scale_mgm()`, which can scale im-balanced data by given group labels. The score will be transformed using softmax before passing to EM algorithm. @@ -185,9 +193,9 @@ data_sim@rowRanges@elementMetadata[76,] As we can see from above, there is a distinctly different distribution of feature score between group with DEGs and without DEGs. And there is a clear separation (break point) between the real DEGs and non-DEGs. -To help automatically select real markers for each group, smartid used an expectation maximization (EM) approach to identify which genes fall into the real DEGs distribution and which are not. +To help automatically select real markers for each group, `smartid` used an expectation maximization (EM) approach to identify which genes fall into the real DEGs distribution and which are not. -Regarding the distribution of scores as a mixture model, here we can choose function `markers_mixmdl` in smartid to separate features. There are 2 available mixture model to choose: normal (Gaussian) or gamma. We choose "norm" here as it runs faster. +Regarding the distribution of scores as a mixture model, here we can choose function `markers_mixmdl()` in `smartid` to separate features. There are 2 available mixture model to choose: normal (Gaussian) or gamma. We choose "norm" here as it runs faster. `smartid` also allows to plot the mixture distribution plot after EM. It's obvious that the top 2 components of Group 4 share quite similar distribution, thus no markers will be selected for this group. @@ -203,7 +211,7 @@ marker_ls <- markers_mixmdl( marker_ls ``` -We can also compare our selected markers with real DEGs. As there is no markers or DEG in group 4, only show overlap from Group1-3. It's clear that all the markers identified by smartid are real DEGs, with a couple of missing genes in Group 1 and 2. But as what we showed above, those genes only exhibit low mean expression across all cells, thus not confident enough to be selected as markers. +We can also compare our selected markers with real DEGs. As there is no markers or DEG in group 4, only show overlap from Group1-3. It's clear that all the markers identified by `smartid` are real DEGs, with a couple of missing genes in Group 1 and 2. But as what we showed above, those genes only exhibit low mean expression across all cells, thus not confident enough to be selected as markers. ```{r} library(UpSetR) @@ -211,7 +219,7 @@ library(UpSetR) upset(fromList(c(data_sim@metadata$up_markers, marker_ls)), nsets = 6) ``` -`smartid` also provides some other implementation of marker selection. Here is another example using `mclust`. Different from `markers_mixmdl`, `markers_mclust` doesn't need a pre-defined number of components (which is 3 in `markers_mixmdl`), instead, it will select the number of components by searching a series of potential numbers. This method is sometimes more robust than `markers_mixmdl`. +`smartid` also provides some other implementation of marker selection. Here is another example using `mclust`. Different from `markers_mixmdl()`, `markers_mclust()` doesn't need a pre-defined number of components (which is 3 in `markers_mixmdl()`), instead, it will select the number of components by searching a series of potential numbers. This method is sometimes more robust than `markers_mixmdl()`. Similary, this method also allows to plot the mixture distribution for each component, but separately. @@ -221,7 +229,7 @@ set.seed(123) marker_ls_new <- markers_mclust( top_markers = top_m, column = ".dot", - dist = "norm", + method = "max.one", plot = TRUE ) names(marker_ls_new) <- paste(names(marker_ls_new), "new") @@ -242,11 +250,14 @@ While for the unlabeled data, `smartid` also provides the score methods with no Here we choose logTF * IDF_sd * IAE_sd for for gene-set scoring as a use case. -$$\mathbf{score}=logTF*IDF_{sd}*IAE_{sd}$$ +$\mathbf{score}=logTF*IDF_{sd}*IAE_{sd}$ + +$$where\ \mathbf{IDF} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$ +$$\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})$$ ## Score Samples -Similary, the first step is to score samples/cells using the specified method. This step also starts with raw counts data, without need for data pre-processing, which is quite convenient and fast. +Similarly, the first step is to score samples/cells using the specified method. This step also starts with raw counts data, without need for data pre-processing, which is quite convenient and fast. ```{r} ## compute score without label @@ -267,7 +278,7 @@ names(metadata(data_sim)) ## Compute Overall Score for Gene-set -To compare overall score of the given gene-set, we don't need to scale and tranform score this time. Using `gs_score` can easily compute the overall score for each cell based on the given gene-set list. +To compare overall score of the given gene-set, we don't need to scale and tranform score this time. Using `gs_score()` can easily compute the overall score for each cell based on the given gene-set list. ```{r} ## compute score for each group marker list