From ec25f84216708d507d315f3906b638f65d79340b Mon Sep 17 00:00:00 2001
From: Gene233 <chen_jj97@163.com>
Date: Tue, 26 Mar 2024 23:37:46 +1100
Subject: [PATCH] Bump R version to >= 4.4 and update vignette and function
 details.

---
 DESCRIPTION                   |  19 +++---
 NEWS.md                       |   4 ++
 R/{objects.R => AllClasses.R} |   0
 R/AllGenerics.R               |   4 +-
 R/tf_idf_iae_wrappers.R       | 113 ++++++++++++++++++++++++----------
 README.md                     |   8 +++
 man/cal_score.Rd              |   4 +-
 man/iae.Rd                    |   5 +-
 man/iae_hdb.Rd                |   5 +-
 man/iae_igm.Rd                |   9 ++-
 man/iae_m.Rd                  |   5 +-
 man/iae_prob.Rd               |   5 +-
 man/iae_rf.Rd                 |   5 +-
 man/iae_sd.Rd                 |   5 +-
 man/idf.Rd                    |   5 +-
 man/idf_hdb.Rd                |   9 ++-
 man/idf_igm.Rd                |   9 ++-
 man/idf_m.Rd                  |   9 ++-
 man/idf_prob.Rd               |   5 +-
 man/idf_rf.Rd                 |   5 +-
 man/idf_sd.Rd                 |   5 +-
 man/tf.Rd                     |   9 ++-
 vignettes/smartid_Demo.Rmd    |  57 ++++++++++-------
 23 files changed, 212 insertions(+), 92 deletions(-)
 rename R/{objects.R => AllClasses.R} (100%)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8f48b57..0e4ce19 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,25 +1,26 @@
 Package: smartid
 Title: Scoring and Marker Selection Method Based on Modified TF-IDF
-Version: 0.99.2
+Version: 0.99.3
 Authors@R: 
     person("Jinjin", "Chen", email = "chen.j@wehi.edu.au", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-7923-5723"))
-Description: This package enables automated selection of group specific signature,
-    especially for rare population. The package is developed for generating
-    specifc lists of signature genes based on TF-IDF modified methods. It can
-    also be used as a new gene-set scoring method or data transformation method.
-    Multiple visualization functions are implemented in this package.
+Description: This package enables automated selection of group specific
+    signature, especially for rare population. The package is developed for
+    generating specifc lists of signature genes based on Term Frequency-Inverse
+    Document Frequency (TF-IDF) modified methods. It can also be used as a new
+    gene-set scoring method or data transformation method. Multiple visualization
+    functions are implemented in this package.
 biocViews: Software, GeneExpression, Transcriptomics
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.3
 Collate: 
+    'AllClasses.R'
     'tf_idf_iae_wrappers.R'
     'score.R'
     'AllGenerics.R'
     'gs_score-methods.R'
-    'objects.R'
     'plot.R'
     'scale_mgm.R'
     'score-methods.R'
@@ -28,7 +29,7 @@ Collate:
     'top_markers.R'
     'top_markers-methods.R'
 Depends: 
-    R (>= 4.3)
+    R (>= 4.4)
 Imports: 
     dplyr,
     ggplot2,
@@ -57,4 +58,4 @@ URL: https://davislaboratory.github.io/smartid
 BugReports: https://github.com/DavisLaboratory/smartid/issues
 VignetteBuilder: knitr
 Config/testthat/edition: 3
-LazyData: true
+LazyData: false
diff --git a/NEWS.md b/NEWS.md
index 0d25c84..e10a3e2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,3 +9,7 @@
 # smartid 0.99.2
 
 * Added test for `gs_score` function.
+
+# smartid 0.99.3
+
+* Bump R version dependency to >= 4.4 and add details for TF, IDF, IAE functions.
diff --git a/R/objects.R b/R/AllClasses.R
similarity index 100%
rename from R/objects.R
rename to R/AllClasses.R
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 8722f13..74b268a 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -10,7 +10,9 @@
 ## ---------------------------------------------------------------------------
 #' @title calculate combined score
 #'
-#' @description compute TF, IDF, IAE and combine the score
+#' @description compute TF (term/feature frequency), IDF (inverse document/cell
+#'     frequency), IAE (inverse average expression of features) and combine the
+#'     the final score
 #'
 #' @inheritParams cal_score_init
 #' @param data an expression object, can be matrix or SummarizedExperiment
diff --git a/R/tf_idf_iae_wrappers.R b/R/tf_idf_iae_wrappers.R
index c811f02..7e0e656 100644
--- a/R/tf_idf_iae_wrappers.R
+++ b/R/tf_idf_iae_wrappers.R
@@ -2,13 +2,18 @@
 #-----------------TF variants------------------#
 #################################################
 
+### $$\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}$$
+
 ## term frequency
-#' compute term/gene frequency within each cell
+#' compute term/feature frequency within each cell
+#'
+#' @details
+#' \deqn{\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}}
 #'
 #' @param expr a count matrix, features in row and cells in column
 #' @param log logical, if to do log-transformation
 #'
-#' @return a matrix of tf
+#' @return a matrix of term/gene frequency
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -34,9 +39,12 @@ tf <- function(expr, log = FALSE) {
 
 #' standard inverse cell frequency
 #'
+#' @details
+#' \deqn{\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of idf score for each feature
+#' @return a vector of inverse cell frequency score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -54,14 +62,17 @@ idf <- function(expr, features = NULL, thres = 0) {
 }
 
 ## inverse document frequency max
-### $$\mathbf{IDF_{i,d}} = log(\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$
+### $$\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})$$
 ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$
 
-#' inverse document frequency: max
+#' inverse cell frequency: max
+#'
+#' @details
+#' \deqn{\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})}
 #'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of idf score for each feature
+#' @return a matrix of inverse cell frequency score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -84,14 +95,17 @@ idf_m <- function(expr, features = NULL, thres = 0) {
 }
 
 ## inverse document frequency sd
-### $$\mathbf{IDF} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$
+### $$\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$
 ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$
 
 #' inverse cell frequency using standard deviation (SD)
 #'
+#' @details
+#' \deqn{\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of idf score for each feature
+#' @return a vector of inverse cell frequency score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -110,14 +124,17 @@ idf_sd <- function(expr, features = NULL, thres = 0) {
 }
 
 ## inverse document frequency using hdbscan cluster as label
-#' Title
+#' inverse document frequency using hdbscan cluster as label
+#'
+#' @details
+#' Details as [smartid:::idf_prob()].
 #'
 #' @inheritParams idf_rf
 #' @param minPts integer, minimum size of clusters, default 2.
 #'               Details in [dbscan::hdbscan()].
 #' @param ... parameters for [dbscan::hdbscan()]
 #'
-#' @return a matrix of IDF score
+#' @return a matrix of inverse cell frequency score
 #'
 #' @examples
 #' set.seed(123)
@@ -137,9 +154,9 @@ idf_hdb <- function(expr, features = NULL, multi = TRUE,
   ## factor cluster
   cluster <- factor(cluster)
 
-  idf <- idf_rf(expr = expr, features = features,
-                label = cluster, multi = multi,
-                thres = thres)
+  idf <- idf_prob(expr = expr, features = features,
+                  label = cluster, multi = multi,
+                  thres = thres)
   return(idf)
 }
 
@@ -150,13 +167,16 @@ idf_hdb <- function(expr, features = NULL, multi = TRUE,
 
 #' labeled inverse cell frequency: relative frequency
 #'
+#' @details
+#' \deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}})}
+#'
 #' @param expr a matrix, features in row and cells in column
 #' @param features vector, feature names or indexes to compute
 #' @param label vector, group label of each cell
 #' @param multi logical, if to compute based on binary (FALSE) or multi-class (TRUE)
 #' @param thres numeric, cell only counts when expr > threshold, default 0
 #'
-#' @return a matrix of IDF score
+#' @return a matrix of inverse cell frequency score
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -197,9 +217,12 @@ idf_rf <- function(expr, features = NULL, label,
 
 #' labeled inverse cell frequency: probability based
 #'
+#' @details
+#' \deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a matrix of IDF score
+#' @return a matrix of inverse cell frequency score
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -234,16 +257,19 @@ idf_prob <- function(expr, features = NULL, label,
 }
 
 ## labeled inverse document frequency IGM
-### $$\mathbf{IGM} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})$$
+### $$\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})$$
 ### $$\mathbf{k}: type\ in\ total\ group\ K$$
 ### $$\mathbf{r_{k}}: rank\ of\ n_{i,j\in D})_{k}\ in\ total\ group\ K$$
 
-#' labeled inverse document frequency IGM
+#' labeled inverse cell frequency: IGM
+#'
+#' @details
+#' \deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})}
 #'
 #' @inheritParams idf_rf
 #' @param lambda numeric, hyperparameter for IGM
 #'
-#' @return a vector of igm score for each feature
+#' @return a vector of inverse gravity moment score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -274,14 +300,17 @@ idf_igm <- function(expr, features = NULL, label, lambda = 7, thres = 0) {
 ##-----------------unlabeled-------------------##
 
 ## inverse average expression
-### $$\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})$$
+### $$\mathbf{IAE_i} = log(1+\frac{n}{\sum_j^n\hat N_{i,j}+1})$$
 ### $$n: total\ counts\ of\ documents;\ \hat N_{i,j}: max(0, N_{i,j} - threshold)$$
 
 #' standard inverse average expression
 #'
+#' @details
+#' \deqn{\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of iae score for each feature
+#' @return a vector of inverse average expression score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -301,14 +330,17 @@ iae <- function(expr, features = NULL, thres = 0) {
 }
 
 ## inverse average expression max
-### $$\mathbf{IAE_i} = log(1+\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$
+### $$\mathbf{IAE_{i,d}} = log(1+\frac{max_{\{i^{'}\in d\}}(n_{i^{'}})}{n_i+1})$$
 ### $$n: total\ counts\ of\ documents;\ n_i: \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$
 
 #' inverse average expression: max
 #'
+#' @details
+#' \deqn{\mathbf{IAE_{i,j}} = log(1+\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{\sum_{j = 1}^{n} sign(N_{i,j} > threshold)+1})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of iae score for each feature
+#' @return a matrix of inverse average expression score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -337,9 +369,12 @@ iae_m <- function(expr, features = NULL, thres = 0) {
 
 #' inverse average expression using standard deviation (SD)
 #'
+#' @details
+#' \deqn{\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a vector of iae score for each feature
+#' @return a vector of inverse average expression score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -362,12 +397,15 @@ iae_sd <- function(expr, features = NULL, thres = 0) {
 ## inverse average expression using hdbscan cluster as label
 #' inverse average expression using hdbscan cluster as label
 #'
+#' @details
+#' Details as [smartid:::iae_prob()].
+#'
 #' @inheritParams idf_rf
 #' @param minPts integer, minimum size of clusters, default 2.
 #'               Details in [dbscan::hdbscan()].
 #' @param ... parameters for [dbscan::hdbscan()]
 #'
-#' @return a matrix of IAE score
+#' @return a matrix of inverse average expression score
 #'
 #' @examples
 #' set.seed(123)
@@ -401,9 +439,9 @@ iae_hdb <- function(expr, features = NULL, multi = TRUE,
   #
   # iae <- log1p((mean_row_in/(mean_row_notin+0.01))[, cluster])  ## IDF scores
 
-  iae <- iae_rf(expr = expr, features = features,
-                label = cluster, multi = multi,
-                thres = thres)
+  iae <- iae_prob(expr = expr, features = features,
+                  label = cluster, multi = multi,
+                  thres = thres)
   return(iae)
 }
 
@@ -416,9 +454,12 @@ iae_hdb <- function(expr, features = NULL, multi = TRUE,
 
 #' labeled inverse average expression: relative frequency
 #'
+#' @details
+#' \deqn{\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}})}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a matrix of IAE score
+#' @return a matrix of inverse average expression score
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -452,16 +493,19 @@ iae_rf <- function(expr, features = NULL, label,
 }
 
 ## labeled inverse average expression: probability based
-### $$\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$
+### $$\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$
 ### modified from
 ### $$\mathbf{IDF_{i,j}} = log(1+\frac{A}{max(B)+1}*\frac{A}{C+1})$$
 ### A denotes the number of cells belonging to category D where the gene i occurs at least once; B denotes the number of cells not belonging to category D where the gene i occurs at least once; C denotes the number of cells belonging to category D where the gene i does not occur; D denotes the number of cells not belonging to category D where the gene i does not occur.
 
 #' labeled inverse average expression: probability based
 #'
+#' @details
+#' \deqn{\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))}
+#'
 #' @inheritParams idf_rf
 #'
-#' @return a matrix of IAE score
+#' @return a matrix of inverse average expression score
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -495,16 +539,19 @@ iae_prob <- function(expr, features = NULL, label,
 }
 
 ## labeled inverse average expression IGM
-### $$\mathbf{IGM} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})$$
+### $$\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})$$
 ### $$\mathbf{k}: type\ in\ total\ group\ K$$
 ### $$\mathbf{r_{k}}: rank\ of\ mean(N_{i,j\in D})_{k}\ in\ total\ group\ K$$
 
-#' labeled inverse average expression IGM
+#' labeled inverse average expression: IGM
+#'
+#' @details
+#' \deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})}
 #'
 #' @inheritParams idf_rf
 #' @param lambda numeric, hyperparameter for IGM
 #'
-#' @return a vector of igm score for each feature
+#' @return a vector of inverse gravity moment score for each feature
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/README.md b/README.md
index 134d2cb..17cd6c2 100644
--- a/README.md
+++ b/README.md
@@ -24,4 +24,12 @@ You can install the development version of smartid like so:
 devtools::install("DavisLaboratory/smartid")
 ```
 
+smartid can be installed from Bioconductor directly as follows:
+
+``` r
+if (!requireNamespace("BiocManager", quietly = TRUE))
+    install.packages("BiocManager")
+
+BiocManager::install("smartid")
+```
 
diff --git a/man/cal_score.Rd b/man/cal_score.Rd
index afe429a..df23b7a 100644
--- a/man/cal_score.Rd
+++ b/man/cal_score.Rd
@@ -62,7 +62,9 @@ optional, default 'score'}
 A list of matrices or se object containing combined score
 }
 \description{
-compute TF, IDF, IAE and combine the score
+compute TF (term/feature frequency), IDF (inverse document/cell
+frequency), IAE (inverse average expression of features) and combine the
+the final score
 }
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/man/iae.Rd b/man/iae.Rd
index 78f4ef8..6798938 100644
--- a/man/iae.Rd
+++ b/man/iae.Rd
@@ -14,11 +14,14 @@ iae(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of iae score for each feature
+a vector of inverse average expression score for each feature
 }
 \description{
 standard inverse average expression
 }
+\details{
+\deqn{\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::iae(data)
diff --git a/man/iae_hdb.Rd b/man/iae_hdb.Rd
index 1200f79..6ce6435 100644
--- a/man/iae_hdb.Rd
+++ b/man/iae_hdb.Rd
@@ -21,11 +21,14 @@ Details in \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}.}
 \item{...}{parameters for \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}}
 }
 \value{
-a matrix of IAE score
+a matrix of inverse average expression score
 }
 \description{
 inverse average expression using hdbscan cluster as label
 }
+\details{
+Details as \code{\link[smartid:::iae_prob]{smartid::::iae_prob()}}.
+}
 \examples{
 set.seed(123)
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/man/iae_igm.Rd b/man/iae_igm.Rd
index b166999..86f9ec6 100644
--- a/man/iae_igm.Rd
+++ b/man/iae_igm.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/tf_idf_iae_wrappers.R
 \name{iae_igm}
 \alias{iae_igm}
-\title{labeled inverse average expression IGM}
+\title{labeled inverse average expression: IGM}
 \usage{
 iae_igm(expr, features = NULL, label, lambda = 7, thres = 0)
 }
@@ -18,10 +18,13 @@ iae_igm(expr, features = NULL, label, lambda = 7, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of igm score for each feature
+a vector of inverse gravity moment score for each feature
 }
 \description{
-labeled inverse average expression IGM
+labeled inverse average expression: IGM
+}
+\details{
+\deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(mean(N_{i,j\in D})_{k})}{\sum_{k}^{K}(mean(N_{i,j\in D})_{k}*r_{k})+e^{-8}})}
 }
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/man/iae_m.Rd b/man/iae_m.Rd
index d646156..b57cbd6 100644
--- a/man/iae_m.Rd
+++ b/man/iae_m.Rd
@@ -14,11 +14,14 @@ iae_m(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of iae score for each feature
+a matrix of inverse average expression score for each feature
 }
 \description{
 inverse average expression: max
 }
+\details{
+\deqn{\mathbf{IAE_{i,j}} = log(1+\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{\sum_{j = 1}^{n} sign(N_{i,j} > threshold)+1})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::iae_m(data)
diff --git a/man/iae_prob.Rd b/man/iae_prob.Rd
index aab7677..1a96eeb 100644
--- a/man/iae_prob.Rd
+++ b/man/iae_prob.Rd
@@ -18,11 +18,14 @@ iae_prob(expr, features = NULL, label, multi = TRUE, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a matrix of IAE score
+a matrix of inverse average expression score
 }
 \description{
 labeled inverse average expression: probability based
 }
+\details{
+\deqn{\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::iae_prob(data, label = sample(c("A", "B"), 10, replace = TRUE))
diff --git a/man/iae_rf.Rd b/man/iae_rf.Rd
index 2f35563..8603ef6 100644
--- a/man/iae_rf.Rd
+++ b/man/iae_rf.Rd
@@ -18,11 +18,14 @@ iae_rf(expr, features = NULL, label, multi = TRUE, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a matrix of IAE score
+a matrix of inverse average expression score
 }
 \description{
 labeled inverse average expression: relative frequency
 }
+\details{
+\deqn{\mathbf{IAE} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::iae_rf(data, label = sample(c("A", "B"), 10, replace = TRUE))
diff --git a/man/iae_sd.Rd b/man/iae_sd.Rd
index 23adfa2..ff3091e 100644
--- a/man/iae_sd.Rd
+++ b/man/iae_sd.Rd
@@ -14,11 +14,14 @@ iae_sd(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of iae score for each feature
+a vector of inverse average expression score for each feature
 }
 \description{
 inverse average expression using standard deviation (SD)
 }
+\details{
+\deqn{\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::iae_sd(data)
diff --git a/man/idf.Rd b/man/idf.Rd
index a7a26d9..5e5cc6e 100644
--- a/man/idf.Rd
+++ b/man/idf.Rd
@@ -14,11 +14,14 @@ idf(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of idf score for each feature
+a vector of inverse cell frequency score for each feature
 }
 \description{
 standard inverse cell frequency
 }
+\details{
+\deqn{\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::idf(data)
diff --git a/man/idf_hdb.Rd b/man/idf_hdb.Rd
index d329845..392ec79 100644
--- a/man/idf_hdb.Rd
+++ b/man/idf_hdb.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/tf_idf_iae_wrappers.R
 \name{idf_hdb}
 \alias{idf_hdb}
-\title{Title}
+\title{inverse document frequency using hdbscan cluster as label}
 \usage{
 idf_hdb(expr, features = NULL, multi = TRUE, thres = 0, minPts = 2, ...)
 }
@@ -21,10 +21,13 @@ Details in \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}.}
 \item{...}{parameters for \code{\link[dbscan:hdbscan]{dbscan::hdbscan()}}}
 }
 \value{
-a matrix of IDF score
+a matrix of inverse cell frequency score
 }
 \description{
-Title
+inverse document frequency using hdbscan cluster as label
+}
+\details{
+Details as \code{\link[smartid:::idf_prob]{smartid::::idf_prob()}}.
 }
 \examples{
 set.seed(123)
diff --git a/man/idf_igm.Rd b/man/idf_igm.Rd
index c6c6f99..9b42ba5 100644
--- a/man/idf_igm.Rd
+++ b/man/idf_igm.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/tf_idf_iae_wrappers.R
 \name{idf_igm}
 \alias{idf_igm}
-\title{labeled inverse document frequency IGM}
+\title{labeled inverse cell frequency: IGM}
 \usage{
 idf_igm(expr, features = NULL, label, lambda = 7, thres = 0)
 }
@@ -18,10 +18,13 @@ idf_igm(expr, features = NULL, label, lambda = 7, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of igm score for each feature
+a vector of inverse gravity moment score for each feature
 }
 \description{
-labeled inverse document frequency IGM
+labeled inverse cell frequency: IGM
+}
+\details{
+\deqn{\mathbf{IGM_i} = log(1+\lambda\frac{max(n_{i,j\in D})_{k}}{\sum_{k}^{K}((n_{i,j\in D})_{k}*r_{k})+e^{-8}})}
 }
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/man/idf_m.Rd b/man/idf_m.Rd
index c818958..3c65bd2 100644
--- a/man/idf_m.Rd
+++ b/man/idf_m.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/tf_idf_iae_wrappers.R
 \name{idf_m}
 \alias{idf_m}
-\title{inverse document frequency: max}
+\title{inverse cell frequency: max}
 \usage{
 idf_m(expr, features = NULL, thres = 0)
 }
@@ -14,10 +14,13 @@ idf_m(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of idf score for each feature
+a matrix of inverse cell frequency score for each feature
 }
 \description{
-inverse document frequency: max
+inverse cell frequency: max
+}
+\details{
+\deqn{\mathbf{IDF_{i,j}} = log(\frac{max_{\{i^{'}\in j\}}(n_{i^{'}})}{n_i+1})}
 }
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/man/idf_prob.Rd b/man/idf_prob.Rd
index f0d014d..15ca94e 100644
--- a/man/idf_prob.Rd
+++ b/man/idf_prob.Rd
@@ -18,11 +18,14 @@ idf_prob(expr, features = NULL, label, multi = TRUE, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a matrix of IDF score
+a matrix of inverse cell frequency score
 }
 \description{
 labeled inverse cell frequency: probability based
 }
+\details{
+\deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::idf_prob(data, label = sample(c("A", "B"), 10, replace = TRUE))
diff --git a/man/idf_rf.Rd b/man/idf_rf.Rd
index e08ce2b..df952fc 100644
--- a/man/idf_rf.Rd
+++ b/man/idf_rf.Rd
@@ -18,11 +18,14 @@ idf_rf(expr, features = NULL, label, multi = TRUE, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a matrix of IDF score
+a matrix of inverse cell frequency score
 }
 \description{
 labeled inverse cell frequency: relative frequency
 }
+\details{
+\deqn{\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::idf_rf(data, label = sample(c("A", "B"), 10, replace = TRUE))
diff --git a/man/idf_sd.Rd b/man/idf_sd.Rd
index 08f0c65..8f79c81 100644
--- a/man/idf_sd.Rd
+++ b/man/idf_sd.Rd
@@ -14,11 +14,14 @@ idf_sd(expr, features = NULL, thres = 0)
 \item{thres}{numeric, cell only counts when expr > threshold, default 0}
 }
 \value{
-a vector of idf score for each feature
+a vector of inverse cell frequency score for each feature
 }
 \description{
 inverse cell frequency using standard deviation (SD)
 }
+\details{
+\deqn{\mathbf{IDF_i} = log(1+sd(N_{i})*\frac{n}{n_i+1})}
+}
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 smartid:::idf_sd(data)
diff --git a/man/tf.Rd b/man/tf.Rd
index 2677116..37c895c 100644
--- a/man/tf.Rd
+++ b/man/tf.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/tf_idf_iae_wrappers.R
 \name{tf}
 \alias{tf}
-\title{compute term/gene frequency within each cell}
+\title{compute term/feature frequency within each cell}
 \usage{
 tf(expr, log = FALSE)
 }
@@ -12,10 +12,13 @@ tf(expr, log = FALSE)
 \item{log}{logical, if to do log-transformation}
 }
 \value{
-a matrix of tf
+a matrix of term/gene frequency
 }
 \description{
-compute term/gene frequency within each cell
+compute term/feature frequency within each cell
+}
+\details{
+\deqn{\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}}
 }
 \examples{
 data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
diff --git a/vignettes/smartid_Demo.Rmd b/vignettes/smartid_Demo.Rmd
index c914476..39d7246 100644
--- a/vignettes/smartid_Demo.Rmd
+++ b/vignettes/smartid_Demo.Rmd
@@ -32,7 +32,7 @@ knitr::opts_chunk$set(
 
 **smartid**
 
-smartid is a package that enables automated selection of group specific signature, especially for rare population. This package is developed for generating specifc lists of signature genes based on TF-IDF modified methods for labeled data. It can also be used as a new gene-set scoring method or data transformation method for un-labeled data. Multiple visualization functions are implemented in this package.
+`smartid` is a package that enables automated selection of group specific signature, especially for rare population. This package is developed for generating specifc lists of signature genes based on TF-IDF modified methods for labeled data. It can also be used as a new gene-set scoring method or data transformation method for un-labeled data. Multiple visualization functions are implemented in this package.
 
 # Installation
 
@@ -43,13 +43,6 @@ smartid is a package that enables automated selection of group specific signatur
 The most updated version of `smartid` is hosted on GitHub and can be installed using `devtools::install_github()` function provided by [devtools](https://cran.r-project.org/package=devtools).
 
 ```{r installation, eval=FALSE}
-# if (!requireNamespace("devtools", quietly = TRUE)) {
-#   install.packages("devtools")
-# }
-# if (!requireNamespace("smartid", quietly = TRUE)) {
-#   devtools::install_github("DavisLaboratory/smartid")
-# }
-
 if (!requireNamespace("BiocManager", quietly = TRUE)) {
   install.packages("BiocManager")
 }
@@ -62,7 +55,7 @@ if (!requireNamespace("smartid", quietly = TRUE)) {
 
 ------------------------------------------------------------------------
 
-To show a quick start guide of smartid, here we use package `splatter` to simulate a scRNA-seq data of 1000 genes * 3000 cells. This data consists of 4 groups, each has 2% DEGs except Group 4, which has no DEG as a negative control group.
+To show a quick start guide of `smartid`, here we use package `r BiocStyle::Biocpkg("splatter")` to simulate a scRNA-seq data of 1000 genes * 3000 cells. This data consists of 4 groups, each has 2% DEGs except Group 4, which has no DEG as a negative control group.
 
 ```{r set up, message=FALSE}
 library(smartid)
@@ -104,7 +97,7 @@ data_sim
 
 `smartid` can be easily used to accurately identify specific marker genes on labeled data. By adapting and modifying TF-IDF approach, `smartid` shows robust power in finding marker genes, especially for rare population which many methods fail in.
 
-**marker identification of smartid includes 3 key steps:**
+**marker identification of `smartid` includes 3 key steps:**
 
   step 1. score samples  
   step 2. scale and transform scores  
@@ -112,15 +105,30 @@ data_sim
 
 ## Score Samples
 
-The first step is to score all samples/cells by using specified approach. The score can be composed of 3 terms: TF (term frequency), IDF (inverse document frequency) and IAE (inverse average expression). Each term has a couple of available choices with different formats to suit labeled or un-labeled data.
+The first step is to score all samples/cells by using specified approach. The score can be composed of 3 terms: TF (term/feature frequency), IDF (inverse document/cell frequency) and IAE (inverse average expression of features). Each term has a couple of available choices with different formats to suit labeled or un-labeled data.
 
-TF here stands for gene frequency, which is similar to CPM, while IDF represents the inverse cell/sample frequency for scRNA-seq data, and IAE is the inverse average expression of each gene across all cells or cells in each labeled group.
+The basic version of TF, IDF and IAE can be termed as:
+$$\mathbf{TF_{i,j}}=\frac{N_{i,j}}{\sum_j{N_{i,j}}}$$
+$$\mathbf{IDF_i} = log(1+\frac{n}{n_i+1})$$
+$$\mathbf{IAE_i} = log(1+\frac{n}{\hat N_{i,j}+1})$$
+
+$$where\ N_{i,j}\ is\ the\ counts\ of\ feature\ i\ in\ cell\ j;\ \hat N_{i,j}\ is\ max(0,\ N_{i,j} - threshold);$$
+$$\ n\ is\ total\ counts\ of\ documents(cells);\ n_i\ is\ \sum_{j = 1}^{n} sign(N_{i,j} > threshold)$$
 
 Here for labeled data, we can choose logTF * IDF_prob * IAE_prob for marker identification.
 
-$$\mathbf{score}=logTF*IDF_{prob}*IAE_{prob}$$
+$\mathbf{score}=logTF*IDF_{prob}*IAE_{prob}$
 
-Another advantage of smartid is that it can start with raw counts data, with no need for pre-processed data. And the scoring is quite fast.
+The probability version of IDF can be termed as:
+$$\mathbf{IDF_{i,j}} = log(1+\frac{\frac{n_{i,j\in D}}{n_{j\in D}}}{max(\frac{n_{i,j\in \hat D}}{n_{j\in \hat D}})+ e^{-8}}\frac{n_{i,j\in D}}{n_{j\in D}})$$
+
+And the probability version of IAE can be termed as:
+$$\mathbf{IAE_{i,j}} = log(1+\frac{mean(N_{i,j\in D})}{max(mean(N_{i,j\in \hat D}))+ e^{-8}}*mean(N_{i,j\in D}))$$
+$$where\ D\ is\ the\ category\ of\ cell\ j;\ \hat D\ is\ the\ category\ other\ than\ D.$$
+
+TF here stands for gene frequency, which is similar to CPM, while IDF represents the inverse cell/sample frequency for scRNA-seq data, and IAE is the inverse average expression of each gene across all cells or cells in each labeled group.
+
+Another advantage of `smartid` is that it can start with raw counts data, with no need for pre-processed data. And the scoring is quite fast.
 
 ```{r}
 ## compute score
@@ -142,7 +150,7 @@ names(metadata(data_sim))
 
 ## Scale and Transform Score
 
-Scaling is needed to find the markers specific to the group, however, standard scaling might fail due to the rare populations. Here smartid uses a special scaling strategy `scale_mgm`, which can scale im-balanced data by given group labels.
+Scaling is needed to find the markers specific to the group, however, standard scaling might fail due to the rare populations. Here `smartid` uses a special scaling strategy `scale_mgm()`, which can scale im-balanced data by given group labels.
 
 The score will be transformed using softmax before passing to EM algorithm.
 
@@ -185,9 +193,9 @@ data_sim@rowRanges@elementMetadata[76,]
 
 As we can see from above, there is a distinctly different distribution of feature score between group with DEGs and without DEGs. And there is a clear separation (break point) between the real DEGs and non-DEGs.
 
-To help automatically select real markers for each group, smartid used an expectation maximization (EM) approach to identify which genes fall into the real DEGs distribution and which are not.
+To help automatically select real markers for each group, `smartid` used an expectation maximization (EM) approach to identify which genes fall into the real DEGs distribution and which are not.
 
-Regarding the distribution of scores as a mixture model, here we can choose function `markers_mixmdl` in smartid to separate features. There are 2 available mixture model to choose: normal (Gaussian) or gamma. We choose "norm" here as it runs faster.
+Regarding the distribution of scores as a mixture model, here we can choose function `markers_mixmdl()` in `smartid` to separate features. There are 2 available mixture model to choose: normal (Gaussian) or gamma. We choose "norm" here as it runs faster.
 
 `smartid` also allows to plot the mixture distribution plot after EM. It's obvious that the top 2 components of Group 4 share quite similar distribution, thus no markers will be selected for this group.
 
@@ -203,7 +211,7 @@ marker_ls <- markers_mixmdl(
 marker_ls
 ```
 
-We can also compare our selected markers with real DEGs. As there is no markers or DEG in group 4, only show overlap from Group1-3. It's clear that all the markers identified by smartid are real DEGs, with a couple of missing genes in Group 1 and 2. But as what we showed above, those genes only exhibit low mean expression across all cells, thus not confident enough to be selected as markers.
+We can also compare our selected markers with real DEGs. As there is no markers or DEG in group 4, only show overlap from Group1-3. It's clear that all the markers identified by `smartid` are real DEGs, with a couple of missing genes in Group 1 and 2. But as what we showed above, those genes only exhibit low mean expression across all cells, thus not confident enough to be selected as markers.
 
 ```{r}
 library(UpSetR)
@@ -211,7 +219,7 @@ library(UpSetR)
 upset(fromList(c(data_sim@metadata$up_markers, marker_ls)), nsets = 6)
 ```
 
-`smartid` also provides some other implementation of marker selection. Here is another example using `mclust`. Different from `markers_mixmdl`, `markers_mclust` doesn't need a pre-defined number of components (which is 3 in `markers_mixmdl`), instead, it will select the number of components by searching a series of potential numbers. This method is sometimes more robust than `markers_mixmdl`.
+`smartid` also provides some other implementation of marker selection. Here is another example using `mclust`. Different from `markers_mixmdl()`, `markers_mclust()` doesn't need a pre-defined number of components (which is 3 in `markers_mixmdl()`), instead, it will select the number of components by searching a series of potential numbers. This method is sometimes more robust than `markers_mixmdl()`.
 
 Similary, this method also allows to plot the mixture distribution for each component, but separately.
 
@@ -221,7 +229,7 @@ set.seed(123)
 marker_ls_new <- markers_mclust(
   top_markers = top_m,
   column = ".dot",
-  dist = "norm",
+  method = "max.one",
   plot = TRUE
 )
 names(marker_ls_new) <- paste(names(marker_ls_new), "new")
@@ -242,11 +250,14 @@ While for the unlabeled data, `smartid` also provides the score methods with no
 
 Here we choose logTF * IDF_sd * IAE_sd for for gene-set scoring as a use case.
 
-$$\mathbf{score}=logTF*IDF_{sd}*IAE_{sd}$$
+$\mathbf{score}=logTF*IDF_{sd}*IAE_{sd}$
+
+$$where\ \mathbf{IDF} = log(1+sd(N_{i})*\frac{n}{n_i+1})$$
+$$\mathbf{IAE} = log(1+sd(N_{i})*\frac{n}{\sum_{j=1}^{n}N_{i,j}+1})$$
 
 ## Score Samples
 
-Similary, the first step is to score samples/cells using the specified method. This step also starts with raw counts data, without need for data pre-processing, which is quite convenient and fast.
+Similarly, the first step is to score samples/cells using the specified method. This step also starts with raw counts data, without need for data pre-processing, which is quite convenient and fast.
 
 ```{r}
 ## compute score without label
@@ -267,7 +278,7 @@ names(metadata(data_sim))
 
 ## Compute Overall Score for Gene-set
 
-To compare overall score of the given gene-set, we don't need to scale and tranform score this time. Using `gs_score` can easily compute the overall score for each cell based on the given gene-set list.
+To compare overall score of the given gene-set, we don't need to scale and tranform score this time. Using `gs_score()` can easily compute the overall score for each cell based on the given gene-set list.
 
 ```{r}
 ## compute score for each group marker list