Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Transipedia/KaMRaT
Browse files Browse the repository at this point in the history
  • Loading branch information
haoliang.xue committed Dec 21, 2021
2 parents cd5fd6d + 2b3cf6e commit fb8e816
Show file tree
Hide file tree
Showing 12 changed files with 395 additions and 192 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ bin/
toyroom/tmp/
test*/
kamrat-simg/
toyroom/output/
notes.txt
.snakejobs/
snakejobs*
Expand Down
25 changes: 0 additions & 25 deletions related-tools/checkers/rank-checker/calcDIDS.R

This file was deleted.

52 changes: 0 additions & 52 deletions related-tools/checkers/rank-checker/calcScores.R

This file was deleted.

126 changes: 126 additions & 0 deletions related-tools/checkers/rank-checker/general_checker.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
rm (list = ls())

library(stringr)
library(magrittr)
library(parallel)
library(foreach)
library(tidyr)

work.dir <- "/home/haoliang.xue/Documents/development/KaMRaT/toyroom"

tab.in <- read.table(gzfile(paste0(work.dir, "/data/kmer-counts.subset4toy.tsv.gz")),
header = T, row.names = 1)
for (i in colnames(tab.in)) {
tab.in[, i] <- tab.in[, i] / sum(tab.in[, i]) * 10000000
}

evalRow <- function(X) {
library(magrittr)
library(MLmetrics) # Accuracy
library(e1071) # NBC, LR
library(DescTools) # Entropy

work.dir <- "/home/haoliang.xue/Documents/development/KaMRaT/toyroom"

smp.condi.categ <- read.table(paste0(work.dir, "/data/sample-condition.toy.tsv"))
names(smp.condi.categ) <- c("sample", "condition")
smp.condi.categ$condition <- as.numeric(smp.condi.categ$condition) - 1

smp.condi.cntnu <- read.table(paste0(work.dir, "/data/sample-condition.toy2.tsv"))
names(smp.condi.cntnu) <- c("sample", "condition")
smp.condi.cntnu$condition <- as.numeric(smp.condi.cntnu$condition)

X.df <- data.frame("sample" = names(X), "count" = as.numeric(X))
s <- sd(X.df$count) # sd
m <- min(X.df$count)
rsd <- s / max(c(1, m)) # rsd
etp <- Entropy(X.df$count + 1) # entropy

df.categ <- merge(x = X.df, y = smp.condi.categ, by = "sample")
x1 <- df.categ$count[df.categ$condition == 0]
x2 <- df.categ$count[df.categ$condition == 1]

if (s != 0) {
ttest.praw <- t.test(x = log2(x1 + 1), y = log2(x2 + 1),
alternative = "two.sided")$p.value # to calculate ttest.padj
log2fc <- abs(mean(log2(x1 + 1)) - mean(log2(x2 + 1)))
ttest.pi <- -log10(ttest.praw) * log2fc # ttest.pi
snr <- (mean(x1) - mean(x2))/(sd(x1) + sd(x2)) # SNR
} else {
ttest.praw <- 1
ttest.pi <- 0
snr <- 0
}

dids <- lapply(unique(df.categ$condition),
FUN = function(i) {
grp.max <- max(df.categ$count[df.categ$condition == i])
grp.dist <- df.categ$count[df.categ$condition != i] - grp.max
grp.dist[grp.dist < 0] <- 0
grp.dids <- sum(sqrt(grp.dist))
}) %>% unlist() %>% max()

df.categ$count <- (df.categ$count - mean(df.categ$count)) / sd(df.categ$count) # standardization for ML

nbc.acc <- naiveBayes(x = as.matrix(df.categ$count), y = factor(df.categ$condition)) %>%
predict(newdata = as.matrix(df.categ$count), type = "class") %>%
Accuracy(y_true = df.categ$condition) # Bayes

if (s > 0) {
lr.pred <- glm(formula = condition ~ count, data = df.categ, family = binomial) %>%
predict(newdata = df.categ, type = "response")
lr.acc <- Accuracy(y_pred = factor(ifelse(lr.pred < 0.5, yes = 0, no = 1), levels = c(0, 1)),
y_true = df.categ$condition) # LR
} else {
lr.acc <- 0
}

df.cntnu <- merge(x = X.df, y = smp.condi.cntnu, by = "sample")

ps.cor <- ifelse(s == 0,
no = cor(df.cntnu$count, df.cntnu$condition, method = "pearson"),
yes = 0)

sp.cor <- ifelse(s == 0,
no = cor(df.cntnu$count, df.cntnu$condition, method = "spearman"),
yes = 0)

return(data.frame("ttest.praw" = ttest.praw,
"ttest.pi" = ttest.pi,
"SNR" = snr,
"LR.acc" = lr.acc,
"DIDS.score" = dids,
"Bayes.acc" = nbc.acc,
"pearson" = ps.cor,
"spearman" = sp.cor,
"stddev" = s,
"relat.stddev" = rsd,
"entropy" = etp))
}

cl <- makeCluster(10)
eval.res <- parApply(cl = cl, tab.in, MARGIN = 1, FUN = evalRow) %>%
do.call(what = rbind)
stopCluster(cl)
eval.res$ttest.padj <- p.adjust(eval.res$ttest.praw, method = "BH") # ttest.padj

eval.res$tag <- rownames(eval.res)
eval.res.long <- pivot_longer(eval.res, cols = -tag, names_to = "method", values_to = "score.R")

kamrat.res <- NULL
for (f in dir(paste0(work.dir, "/output/kamrat-rank/"))) {
if (is.null(kamrat.res)) {
kamrat.res <- read.table(paste0(work.dir, "/output/kamrat-rank/", f), header = T)[, 1 : 2]
} else {
kamrat.res <- merge(x = kamrat.res,
y = read.table(paste0(work.dir, "/output/kamrat-rank/", f),
header = T)[, 1 : 2],
by = "tag")
}
}

kamrat.res.long <- pivot_longer(kamrat.res, cols = -tag, names_to = "method", values_to = "score.kamrat")

cmp.res <- merge(kamrat.res.long, eval.res.long, by = c("tag", "method"))
cmp.res$diff <- abs(cmp.res$score.kamrat - cmp.res$score.R)
cmp.res$relat.diff <- abs(cmp.res$diff / cmp.res$score.R)
22 changes: 22 additions & 0 deletions related-tools/checkers/rank-checker/run_kamratRank.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

set -e

cd ../../../

bin/kamrat index -intab toyroom/data/kmer-counts.subset4toy.tsv.gz -outdir toyroom/output/index -klen 31 -unstrand -nfbase 10000000

for m in `echo ttest.padj ttest.pi snr dids lr:1 bayes:1`
do
bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -design toyroom/data/sample-condition.toy.tsv -outpath toyroom/output/kamrat-rank/kamrat-rank-${m/:/}.txt -withcounts
done

for m in `echo pearson spearman`
do
bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -design toyroom/data/sample-condition.toy2.tsv -outpath toyroom/output/kamrat-rank/kamrat-rank-$m.txt -withcounts
done

for m in `echo sd rsd entropy`
do
bin/kamrat rank -idxdir toyroom/output/index/ -rankby $m -outpath toyroom/output/kamrat-rank/kamrat-rank-$m.txt -withcounts
done
4 changes: 2 additions & 2 deletions src/Makefile.Template
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ kamratIndex : kamratIndex.cpp utils
kamratMerge : kamratMerge.cpp data_struct/contig_elem.cpp data_struct/merge_knot.cpp utils
g++ -c kamratMerge.cpp data_struct/contig_elem.cpp data_struct/merge_knot.cpp utils/seq_coding.cpp utils/index_loading.cpp utils/vect_opera.cpp

kamratRank : kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp
g++ -c kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp -fopenmp -lmlpack -larmadillo -I/path_to_conda_mlpack_env/include $(CPPFLAGS)
kamratRank : kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp utils/vect_opera.cpp
g++ -c kamratRank.cpp data_struct/scorer.cpp data_struct/feature_elem.cpp utils/vect_opera.cpp -fopenmp -lmlpack -larmadillo -I/path_to_conda_mlpack_env/include $(CPPFLAGS)

kamratFilter : kamratFilter.cpp utils
g++ -c kamratFilter.cpp utils/index_loading.cpp $(CPPFLAGS)
Expand Down
Loading

0 comments on commit fb8e816

Please sign in to comment.