Skip to content
This repository has been archived by the owner on Nov 28, 2020. It is now read-only.

Coverage comparisons

Marek Wiewiórka edited this page Mar 31, 2019 · 62 revisions

SEQUILA-COV

results

Big cluster

WES - local

----------|1 . | . 2| . 3| . 5| . 10| 


WGS   with GKL
cores     time(count)[s]    times(save)[s]
-------------------------------------------
1        5583.9                 6283.1
5        1592.7                 1853.1
10       634.6                  1050.4
25       245.5                   339.2
50       129.1                   206.8
100       78.6                   130.5
200       56.5                    86.5
300       47.8                    78.4
400       47.7                    70.7
500       43.3                    65.7

WES
cores .   times(save)[s]
------------------------
1         410.9
5         105.8
10 .       63.4
25         27.5
50         19.2
100        16.1

GATK DepthOfCoverage

command

### exome
{ time docker run -it  -v /data/samples/NA12878/WES:/data/ -v /data/samples/hg_builds/:/ref/ broadinstitute/gatk3:3.8-1 \
  java -jar GenomeAnalysisTK.jar \
    -T DepthOfCoverage \
    -R /ref/Homo_sapiens_assembly18.fasta \
    -o /data/gatk_doc_test.txt \
    -I /data/NA12878.proper.wes.bam \
    -omitIntervals \
    -nt 1} 2>> gatk_wes_time_1.txt

MOSDEPTH

results

Environment: cdh00. WGS file: NA12878.proper.wgs.bam - 272G, WES file NA12878.proper.wes.bam - 17G

data threads time [s] time [m]
WES(0.2.3) 1 387 6m27
WES(0.2.4) 1 411 6m50
WES(0.2.4 -x) 1 372 6m12
WES(0.2.3) 5 136 2m16
WES(0.2.4) 5 147 2m27
WES(0.2.4 -x) 5 141 2m21
WES 10 135 2m15
WGS 1 6495 108m15
WGS 5 1959 32m39
WGS 10 1810 30m10

commands

export LD_LIBRARY_PATH=/usr/local/lib

### exome

{ time mos/mosdepth prefix NA12878.proper.wes.bam ; } 2>> mos_wes_time_1.txt
{ time mos/mosdepth -t 4 prefix NA12878.proper.wes.bam ; } 2>> mos_wes_time_5.txt
{ time mos/mosdepth -t 9 prefix NA12878.proper.wes.bam ; } 2>> mos_wes_time_10.txt

### exome 0.2.4
{ time mos/mosdepth_0.2.4 prefix NA12878.proper.wes.bam ; } 2>> mos_wes_time_1_0.2.4.txt
{ time mos/mosdepth_0.2.4 -x prefix NA12878.proper.wes.bam ; } 2>> mos_wes_time_1_0.2.4.txt


### genome

{ time mos/mosdepthh prefix NA12878.proper.wgs.bam ; } 2>> wgs_time_1.txt
{ time mos/mosdepth -t 4 prefix NA12878.proper.wgs.bam ; } 2>> wgs_time_5.txt
{ time mos/mosdepth -t 9 prefix NA12878.proper.wgs.bam ; } 2>> wgs_time_9.txt

### fixed-length window for exome
{ time mos/mosdepth -n --by 500 win NA12878.proper.wes.bam ; } 2>> mos_wes_win_time_1.txt
{ time mos/mosdepth -n -t 4 --by 500 win NA12878.proper.wes.bam ; } 2>> mos_wes_win_time_5.txt
{ time mos/mosdepth -n -t 9 --by 500 win NA12878.proper.wes.bam ; } 2>> mos_wes_win_time_10.txt

### fixed-length window for genome
{ time mos/mosdepth -n --by 500 win NA12878.proper.wgs.bam ; } 2>> mos_wgs_win_time_1.txt
{ time mos/mosdepth -n -t 4 --by 500 win NA12878.proper.wgs.bam ; } 2>> mos_wgs_win_time_5.txt
{ time mos/mosdepth -n -t 9 --by 500 win NA12878.proper.wgs.bam ; } 2>> mos_wgs_win_time_10.txt

SAMTOOLS

results

data threads time [s] time [m]
WES 1 720 12.5
WGS 1 8370 139m30

commands

# exome
{ time samtools depth NA12878.ga2.exome.maq.recal.bam > samtools/NA12878.ga2.exome.maq.recal.depth ; } 2>> samtools/wes_time.txt

# genome
{ time samtools depth NA12878.hiseq.wgs.bwa.recal.bam > samtools/NA12878.hiseq.wgs.bwa.recal.depth ; } 2>> samtools/wgs_time.txt

bedtools

exome (blocks)

docker run -it --rm -u root -v /data/samples/NA12878:/data biocontainers/bedtools bash

root@131d1e99ad03:/data# time bedtools genomecov -ibam /data/NA12878.ga2.exome.maq.recal.bam -bga -strand + >> /data/bedcov.bed


real    16m55.177s
user    15m23.940s
sys     1m29.920s

sequila local

rm -rf ~/.ivy2/cache/org.biodatageeks/bdg-sequila_2.11/*
rm -rf ~/.ivy2/jars/org.biodatageeks_bdg-sequila_2.11*
cd /data/local/opt/spark-2.3.2-bin-hadoop2.7/bin/
#unset in local mode export HADOOP_CONF_DIR=/etc/hadoop/conf
export cores=1
./spark-shell --conf "spark.driver.extraJavaOptions=-Dmapred.max.split.size=134217728" --conf "spark.sql.catalogImplementation=in-memory" --master=local[$cores] --driver-memory=12g --packages org.biodatageeks:bdg-sequila_2.11:0.4.1-SNAPSHOT --repositories https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ -v
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
spark.sqlContext.setConf(BDGInternalParams.InputSplitSize, "134217728")
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")

/* WES -bases-blocks*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/samples/NA12878/WES/NA12878*.bam')""")
 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks')").write.format("parquet").save("/data/samples/NA12878/output_tmp/wes_1_9.parquet")}

/* WGS -bases-blocks*/
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/*bases-blocks*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/samples/NA12878/NA12878*.bam')""")
 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_genome','NA12878', 'blocks')").write.format("parquet").save("/data/samples/NA12878/output_tmp/wgs_1_1.parquet")}




/*windows - 500*/
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/*bases-blocks*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/*.bam')""")
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks', '500')").write.format("parquet").save("/tmp/fp16yq/data/32MB_w500_3.parquet") }

/*CRAM*/
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
//spark.sqlContext.setConf(BDGInternalParams.InputSplitSize, "134217728")
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")

/* WES -bases-blocks*/
//32MB
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome_cram USING org.biodatageeks.datasources.BAM.CRAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/NA12878.*.cram', refPath 'hdfs://datalab/tmp/fp16yq/data/ref/Homo_sapiens_assembly18.fasta')""")

 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_exome_cram','NA12878', 'blocks')").write.format("parquet").save("/tmp/fp16yq/data/wes_5_3.parquet")}

  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome_cram USING org.biodatageeks.datasources.BAM.CRAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/NA12878.*.cram', refPath 'hdfs://datalab/tmp/fp16yq/data/ref/Homo_sapiens_assembly18.fasta')""")




  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome_bam USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/NA12878*.bam')""")


 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_exome_bam','NA12878', 'blocks')").write.format("parquet").save("/tmp/fp16yq/data/wes_bam_10_2.parquet")}


/*WGS*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_genome_cram USING org.biodatageeks.datasources.BAM.CRAMDataSource OPTIONS(path '/tmp/fp16yq/data/genome/NA12878.*.cram', refPath 'hdfs://datalab/tmp/fp16yq/data/ref/Homo_sapiens_assembly18.fasta')""")

 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_exome_cram','NA12878', 'blocks')").write.format("parquet").save("/tmp/fp16yq/data/wes_5_3.parquet")}

 spark.time{
 ss.sql(s"SELECT count(*) FROM bdg_coverage('reads_genome_cram','NA12878', 'blocks')").show}

sambamba

commands

time docker run --rm -v /data/samples/NA12878:/data/samples/NA12878 -w /data/samples/NA12878 wkusmirek/sambamba /opt/sambamba-0.6.8-linux-static depth base --output-file=sambamba_base_coverage.txt --nthreads=1 /data/samples/NA12878/WES/NA12878.proper.wes.bam

time docker run --rm -v /data/samples/NA12878:/data/samples/NA12878 -w /data/samples/NA12878 wkusmirek/sambamba /opt/sambamba-0.6.8-linux-static depth window --output-file=sambamba_window_coverage.txt --nthreads=1 --window-size=500 /data/samples/NA12878/WES/NA12878.proper.wes.bam


time docker run --rm -v /data/samples/NA12878:/data/samples/NA12878 -w /data/samples/NA12878 wkusmirek/sambamba /opt/sambamba-0.6.8-linux-static depth base --output-file=sambamba_base_coverage.txt --nthreads=1 /data/samples/NA12878/WGS/NA12878.proper.wgs.bam

time docker run --rm -v /data/samples/NA12878:/data/samples/NA12878 -w /data/samples/NA12878 wkusmirek/sambamba /opt/sambamba-0.6.8-linux-static depth window --output-file=sambamba_window_coverage.txt --nthreads=1 --window-size=500 /data/samples/NA12878/WGS/NA12878.proper.wgs.bam