Merge branch 'master' of github.com:Transipedia/KaMRaT

Transipedia · Dec 21, 2021 · fb8e816 · fb8e816
2 parents cd5fd6d + 2b3cf6e
commit fb8e816
Show file tree

Hide file tree

Showing 12 changed files with 395 additions and 192 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ bin/
 toyroom/tmp/
 test*/
 kamrat-simg/
+toyroom/output/
 notes.txt
 .snakejobs/
 snakejobs*

diff --git a/related-tools/checkers/rank-checker/calcDIDS.R b/related-tools/checkers/rank-checker/calcDIDS.R
diff --git a/related-tools/checkers/rank-checker/calcScores.R b/related-tools/checkers/rank-checker/calcScores.R
diff --git a/related-tools/checkers/rank-checker/general_checker.R b/related-tools/checkers/rank-checker/general_checker.R
@@ -0,0 +1,126 @@
+rm (list = ls())
+
+library(stringr)
+library(magrittr)
+library(parallel)
+library(foreach)
+library(tidyr)
+
+work.dir <- "/home/haoliang.xue/Documents/development/KaMRaT/toyroom"
+
+tab.in <- read.table(gzfile(paste0(work.dir, "/data/kmer-counts.subset4toy.tsv.gz")), 
+                     header = T, row.names = 1)
+for (i in colnames(tab.in)) {
+    tab.in[, i] <- tab.in[, i] / sum(tab.in[, i]) * 10000000
+}
+
+evalRow <- function(X) {
+    library(magrittr)
+    library(MLmetrics) # Accuracy
+    library(e1071) # NBC, LR
+    library(DescTools) # Entropy
+
+    work.dir <- "/home/haoliang.xue/Documents/development/KaMRaT/toyroom"
+
+    smp.condi.categ <- read.table(paste0(work.dir, "/data/sample-condition.toy.tsv"))
+    names(smp.condi.categ) <- c("sample", "condition")
+    smp.condi.categ$condition <- as.numeric(smp.condi.categ$condition) - 1
+
+    smp.condi.cntnu <- read.table(paste0(work.dir, "/data/sample-condition.toy2.tsv"))
+    names(smp.condi.cntnu) <- c("sample", "condition")
+    smp.condi.cntnu$condition <- as.numeric(smp.condi.cntnu$condition)
+
+    X.df <- data.frame("sample" = names(X), "count" = as.numeric(X))
+    s <- sd(X.df$count) # sd
+    m <- min(X.df$count)
+    rsd <- s / max(c(1, m)) # rsd
+    etp <- Entropy(X.df$count + 1) # entropy
+
+    df.categ <- merge(x = X.df, y = smp.condi.categ, by = "sample")
+    x1 <- df.categ$count[df.categ$condition == 0]
+    x2 <- df.categ$count[df.categ$condition == 1]
+
+    if (s != 0) {
+        ttest.praw <- t.test(x = log2(x1 + 1), y = log2(x2 + 1), 
+                             alternative = "two.sided")$p.value # to calculate ttest.padj
+        log2fc <- abs(mean(log2(x1 + 1)) - mean(log2(x2 + 1)))
+        ttest.pi <- -log10(ttest.praw) * log2fc # ttest.pi
+        snr <- (mean(x1) - mean(x2))/(sd(x1) + sd(x2)) # SNR
+    } else {
+        ttest.praw <- 1
+        ttest.pi <- 0
+        snr <- 0
+    }
+
+    dids <- lapply(unique(df.categ$condition), 
+                   FUN = function(i) {
+                       grp.max <- max(df.categ$count[df.categ$condition == i])
+                       grp.dist <- df.categ$count[df.categ$condition != i] - grp.max
+                       grp.dist[grp.dist < 0] <- 0
+                       grp.dids <- sum(sqrt(grp.dist))
+                   }) %>% unlist() %>% max()
+
+    df.categ$count <- (df.categ$count - mean(df.categ$count)) / sd(df.categ$count) # standardization for ML
+
+    nbc.acc <- naiveBayes(x = as.matrix(df.categ$count), y = factor(df.categ$condition)) %>% 
+        predict(newdata = as.matrix(df.categ$count), type = "class") %>%
+        Accuracy(y_true = df.categ$condition) # Bayes
+
+    if (s > 0) {
+        lr.pred <- glm(formula = condition ~ count, data = df.categ, family = binomial) %>%
+            predict(newdata = df.categ, type = "response")
+        lr.acc <- Accuracy(y_pred = factor(ifelse(lr.pred < 0.5, yes = 0, no = 1), levels = c(0, 1)), 
+                           y_true = df.categ$condition) # LR
+    } else {
+        lr.acc <- 0
+    }
+
+    df.cntnu <- merge(x = X.df, y = smp.condi.cntnu, by = "sample")
+
+    ps.cor <- ifelse(s == 0, 
+                     no = cor(df.cntnu$count, df.cntnu$condition, method = "pearson"),
+                     yes = 0)
+
+    sp.cor <- ifelse(s == 0,
+                     no = cor(df.cntnu$count, df.cntnu$condition, method = "spearman"),
+                     yes = 0)
+
+    return(data.frame("ttest.praw" = ttest.praw,
+                      "ttest.pi" = ttest.pi,
+                      "SNR" = snr,
+                      "LR.acc" = lr.acc,
+                      "DIDS.score" = dids,
+                      "Bayes.acc" = nbc.acc,
+                      "pearson" = ps.cor,
+                      "spearman" = sp.cor,
+                      "stddev" = s,
+                      "relat.stddev" = rsd,
+                      "entropy" = etp))
+}
+
+cl <- makeCluster(10)
+eval.res <- parApply(cl = cl, tab.in, MARGIN = 1, FUN = evalRow) %>%
+    do.call(what = rbind)
+stopCluster(cl)
+eval.res$ttest.padj <- p.adjust(eval.res$ttest.praw, method = "BH") # ttest.padj
+
+eval.res$tag <- rownames(eval.res)
+eval.res.long <- pivot_longer(eval.res, cols = -tag, names_to = "method", values_to = "score.R")
+
+kamrat.res <- NULL
+for (f in dir(paste0(work.dir, "/output/kamrat-rank/"))) {
+    if (is.null(kamrat.res)) {
+        kamrat.res <- read.table(paste0(work.dir, "/output/kamrat-rank/", f), header = T)[, 1 : 2]
+    } else {
+        kamrat.res <- merge(x = kamrat.res,
+                            y = read.table(paste0(work.dir, "/output/kamrat-rank/", f), 
+                                       header = T)[, 1 : 2],
+                            by = "tag")
+    }
+}
+
+kamrat.res.long <- pivot_longer(kamrat.res, cols = -tag, names_to = "method", values_to = "score.kamrat")
+
+cmp.res <- merge(kamrat.res.long, eval.res.long, by = c("tag", "method"))
+cmp.res$diff <- abs(cmp.res$score.kamrat - cmp.res$score.R)
+cmp.res$relat.diff <- abs(cmp.res$diff / cmp.res$score.R)
diff --git a/related-tools/checkers/rank-checker/run_kamratRank.bash b/related-tools/checkers/rank-checker/run_kamratRank.bash
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+
+cd ../../../
+
+bin/kamrat index -intab toyroom/data/kmer-counts.subset4toy.tsv.gz -outdir toyroom/output/index -klen 31 -unstrand -nfbase 10000000
+
+for m in `echo ttest.padj ttest.pi snr dids lr:1 bayes:1`
+do
+	bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -design toyroom/data/sample-condition.toy.tsv -outpath toyroom/output/kamrat-rank/kamrat-rank-${m/:/}.txt -withcounts
+done
+
+for m in `echo pearson spearman`
+do
+	bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -design toyroom/data/sample-condition.toy2.tsv -outpath toyroom/output/kamrat-rank/kamrat-rank-$m.txt -withcounts
+done
+
+for m in `echo sd rsd entropy`
+do
+	bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -outpath toyroom/output/kamrat-rank/kamrat-rank-$m.txt -withcounts
+done
diff --git a/src/Makefile.Template b/src/Makefile.Template
@@ -11,8 +11,8 @@ kamratIndex : kamratIndex.cpp utils
 kamratMerge : kamratMerge.cpp data_struct/contig_elem.cpp data_struct/merge_knot.cpp utils
 	g++ -c kamratMerge.cpp data_struct/contig_elem.cpp data_struct/merge_knot.cpp utils/seq_coding.cpp utils/index_loading.cpp utils/vect_opera.cpp
 
-kamratRank : kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp
-	g++ -c kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp -fopenmp -lmlpack -larmadillo -I/path_to_conda_mlpack_env/include $(CPPFLAGS)
+kamratRank : kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp utils/vect_opera.cpp
+	g++ -c kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp utils/vect_opera.cpp -fopenmp -lmlpack -larmadillo -I/path_to_conda_mlpack_env/include $(CPPFLAGS)
 
 kamratFilter : kamratFilter.cpp utils
 	g++ -c kamratFilter.cpp utils/index_loading.cpp $(CPPFLAGS)