Merge pull request #1347 from deeptools/bamcov_rs

Resolving memory issues in cov calc
deeptools · Dec 3, 2024 · 010a1cf · 010a1cf
2 parents 90ce478 + e1602e2
commit 010a1cf
Show file tree

Hide file tree

Showing 12 changed files with 707 additions and 473 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -48,10 +48,6 @@ jobs:
     - name: pip install
       run: |
         pip install .[actions]
-    - name: PEP8
-      run: |
-        micromamba activate test_and_build
-        flake8 . --exclude=.venv,.build,build --ignore=E501,F403,E402,F999,F405,E722,W504,W605
     - name: Test deepTools
       run: |
         pytest -v

diff --git a/Cargo.toml b/Cargo.toml
@@ -15,4 +15,5 @@ rayon = "1.10.0"
 itertools = "0.12.1"
 bigtools = "0.5.3"
 tokio = "*"
-flate2 = "*"
+flate2 = "*"
+tempfile = "*"
diff --git a/pydeeptools/deeptools/bamCompare2.py b/pydeeptools/deeptools/bamCompare2.py
@@ -244,6 +244,12 @@ def main(args=None):
         args.scaleFactorsMethod = 'None'
     else:
         args.normalizeUsing = 'None'
+    if not args.minMappingQuality:
+        args.minMappingQuality = 0
+    if not args.samFlagInclude:
+        args.samFlagInclude = 0
+    if not args.samFlagExclude:
+        args.samFlagExclude = 0
     print(args)
     args.pseudocount = 1
     r_bamcompare(
@@ -252,11 +258,18 @@ def main(args=None):
         args.outFileName, # output file
         args.outFileFormat, # output file format
         args.normalizeUsing, # normalization method
-        args.scaleFactorsMethod, # scaling method
         args.effectiveGenomeSize, # effective genome size
+        args.scaleFactorsMethod, # scaling method
         args.operation,
         args.pseudocount,
+        args.ignoreDuplicates,
+        args.minMappingQuality,
+        args.samFlagInclude,
+        args.samFlagExclude,
+        args.minFragmentLength,
+        args.maxFragmentLength,
         args.numberOfProcessors, # threads
+        args.ignoreForNormalization,
         args.binSize, # bin size
         [], # regions
         True # verbose

diff --git a/pydeeptools/deeptools/bamCoverage.py b/pydeeptools/deeptools/bamCoverage.py
@@ -211,7 +211,7 @@ def main(args=None):
             if args.Offset[0] == 0:
                 sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
             if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
-                sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
+                sys.exit("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
         else:
             if args.Offset[0] == 0:
                 sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")

diff --git a/pydeeptools/deeptools/bamCoverage2.py b/pydeeptools/deeptools/bamCoverage2.py
@@ -117,11 +117,12 @@ def scaleFactor(string):
 
 def process_args(args=None):
     args = parseArguments().parse_args(args)
-
+    if not args.smoothLength:
+        args.smoothLength = 0
     if args.smoothLength and args.smoothLength <= args.binSize:
         print("Warning: the smooth length given ({}) is smaller than the bin "
               "size ({}).\n\n No smoothing will be done".format(args.smoothLength, args.binSize))
-        args.smoothLength = None
+        args.smoothLength = 0
 
     if not args.ignoreForNormalization:
         args.ignoreForNormalization = []
@@ -130,24 +131,55 @@ def process_args(args=None):
 
 def main(args=None):
     args = process_args(args)
-    print(args)
-    # Fail if user tries RPGC without setting the effective genome size
-    if args.normalizeUsing == 'RPGC' and args.effectiveGenomeSize is None:
-        print("Error: You must specify the effective genome size when using "
-              "RPGC normalization. Use --effectiveGenomeSize to set this value.")
-        sys.exit()
     if not args.effectiveGenomeSize:
         args.effectiveGenomeSize = 0
     if not args.normalizeUsing:
         args.normalizeUsing = 'None'
+    if not args.Offset:
+        args.Offset = [1, -1]
+    if not args.extendReads:
+        args.extendReads = 0
+    if not args.filterRNAstrand:
+        args.filterRNAstrand = 'None'
+    if not args.blackListFileName:
+        args.blackListFileName = 'None'
+    if not args.minMappingQuality:
+        args.minMappingQuality = 0
+    if not args.samFlagInclude:
+        args.samFlagInclude = 0
+    if not args.samFlagExclude:
+        args.samFlagExclude = 0
+
+    print(args)
+
     r_bamcoverage(
-       args.bam, # bam file
-       args.outFileName, # output file
-       args.outFileFormat, # output format
-       args.normalizeUsing, # normalization
-       args.effectiveGenomeSize, # effective genome size
-       args.numberOfProcessors, # threads
-       args.binSize, # bin size
-       [], # regions
-       args.verbose # verbose
+        args.bam, # bam file
+        args.outFileName, # output file
+        args.outFileFormat, # output format
+        ## Normalization options
+        args.normalizeUsing, # normalization
+        args.effectiveGenomeSize, # effective genome size
+        args.scaleFactor,
+        # processing options
+        args.MNase,
+        args.Offset,
+        args.extendReads,
+        args.centerReads,
+        args.filterRNAstrand,
+        args.blackListFileName,
+        args.ignoreForNormalization,
+        args.skipNonCoveredRegions,
+        args.smoothLength,
+        args.binSize, # bin size
+        # Filtering options
+        args.ignoreDuplicates,
+        args.minMappingQuality,
+        args.samFlagInclude,
+        args.samFlagExclude,
+        args.minFragmentLength,
+        args.maxFragmentLength,
+        # running options
+        args.numberOfProcessors, # threads
+        [], # regions
+        args.verbose # verbose
     )
diff --git a/src/bamcompare.rs b/src/bamcompare.rs
@@ -1,79 +1,154 @@
 use pyo3::prelude::*;
+use pyo3::types::PyList;
 use rayon::prelude::*;
 use rayon::ThreadPoolBuilder;
-use crate::filehandler::{bam_ispaired, write_file};
-use crate::covcalc::{bam_pileup, parse_regions, collapse_bgvecs};
+use std::io::prelude::*;
+use std::io::{BufReader};
+use std::fs::File;
+use itertools::Itertools;
+use bigtools::{Value};
+use crate::filehandler::{bam_ispaired, write_covfile};
+use crate::covcalc::{bam_pileup, parse_regions, alignmentfilters};
 use crate::normalization::scale_factor_bamcompare;
-use crate::calc::median;
+use crate::calc::{median, calc_ratio};
+use tempfile::{NamedTempFile, TempPath};
 
 #[pyfunction]
 pub fn r_bamcompare(
-    bam_ifile1: &str,
-    bam_ifile2: &str,
-    ofile: &str,
-    ofiletype: &str,
+    // input and output
+    bamifile1: &str, // input bamfile 1
+    bamifile2: &str, // input bamfile 2
+    ofile: &str, // output file
+    ofiletype: &str, // ouput file type, bedgraph or bigwig
+    // norm options
     norm: &str,
-    scalefactorsmethod: &str,
     effective_genome_size: u64,
+    scalefactorsmethod: &str,
     operation: &str,
-    pseudocount: f64,
+    pseudocount: f32,
+    // filtering options
+    ignoreduplicates: bool,
+    minmappingquality: u8, // 
+    samflaginclude: u16,
+    samflagexclude: u16,
+    minfraglen: u32,
+    maxfraglen: u32,
     nproc: usize,
+    _ignorechr: Py<PyList>,
     binsize: u32,
-    regions: Vec<(String, u64, u64)>,
+    regions: Vec<(String, u32, u32)>,
     verbose: bool
 ) -> PyResult<()> {
-    let ispe1 = bam_ispaired(bam_ifile1);
-    let ispe2 = bam_ispaired(bam_ifile2);
+    let ispe1 = bam_ispaired(bamifile1);
+    let ispe2 = bam_ispaired(bamifile2);
 
     if verbose {
-        println!("Sample1: {} is-paired: {}", bam_ifile1, ispe1);
-        println!("Sample2: {} is-paired: {}", bam_ifile2, ispe2);
+        println!("Sample1: {} is-paired: {}", bamifile1, ispe1);
+        println!("Sample2: {} is-paired: {}", bamifile2, ispe2);
     }
+    let mut ignorechr: Vec<String> = Vec::new();
+    Python::with_gil(|py| {
+        ignorechr = _ignorechr.extract(py).expect("Failed to retrieve ignorechr.");
+    });
+    // Set alignment filters
+    let filters = alignmentfilters {
+        minmappingquality: minmappingquality,
+        samflaginclude: samflaginclude,
+        samflagexclude: samflagexclude,
+        minfraglen: minfraglen,
+        maxfraglen: maxfraglen
+    };
 
-    // Parse regions & calculate coverage
-    let (regions, chromsizes)  = parse_regions(&regions, bam_ifile1);
+    // Parse regions & calculate coverage. Note that 
+    let (regions, chromsizes)  = parse_regions(&regions, bamifile1);
     let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
 
-    // Parse first bamfile
-    let (bg1, mapped1, _unmapped1, readlen1, fraglen1) = pool.install(|| {
-        regions.par_iter()
-            .map(|i| bam_pileup(bam_ifile1, &i, &binsize, &ispe1))
-            .reduce(
-                || (vec![], 0, 0, vec![], vec![]),
-                |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
-                    _bg.extend(bg);
-                    _readlen.extend(readlen);
-                    _fraglen.extend(fraglen);
-                    _mapped += mapped;
-                    _unmapped += unmapped;
-                    (_bg, _mapped, _unmapped, _readlen, _fraglen)
+    // Set up the bam files in a Vec.
+    let bamfiles = vec![(bamifile1, ispe1), (bamifile2, ispe2)];
+
+    let covcalcs: Vec<ParsedBamFile> = pool.install(|| {
+        bamfiles.par_iter()
+            .map(|(bamfile, ispe)| {
+                let (bg, mapped, unmapped, readlen, fraglen) = regions.par_iter()
+                    .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
+                    .reduce(
+                        || (vec![], 0, 0, vec![], vec![]),
+                        |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
+                            _bg.extend(bg);
+                            _readlen.extend(readlen);
+                            _fraglen.extend(fraglen);
+                            _mapped += mapped;
+                            _unmapped += unmapped;
+                            (_bg, _mapped, _unmapped, _readlen, _fraglen)
+                        }
+                    );
+                ParsedBamFile {
+                    bamfile: bamfile,
+                    ispe: *ispe,
+                    bg: bg,
+                    mapped: mapped,
+                    unmapped: unmapped,
+                    readlen: median(readlen),
+                    fraglen: median(fraglen)
                 }
-            )
+            })
+        .collect()
     });
-    let _readlen1 = median(readlen1);
-    let _fraglen1 = median(fraglen1);
 
-    // Parse first bamfile
-    let (bg2, mapped2, _unmapped2, readlen2, fraglen2) = pool.install(|| {
-        regions.par_iter()
-            .map(|i| bam_pileup(bam_ifile2, &i, &binsize, &ispe2))
-            .reduce(
-                || (vec![], 0, 0, vec![], vec![]),
-                |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
-                    _bg.extend(bg);
-                    _readlen.extend(readlen);
-                    _fraglen.extend(fraglen);
-                    _mapped += mapped;
-                    _unmapped += unmapped;
-                    (_bg, _mapped, _unmapped, _readlen, _fraglen)
+    // Calculate scale factors.
+    let sf = scale_factor_bamcompare(scalefactorsmethod, covcalcs[0].mapped, covcalcs[1].mapped, binsize, effective_genome_size, norm);
+    println!("scale factor1 = {}, scale factor2 = {}", sf.0, sf.1);
+    // Create output stream
+    let mut chrom = "".to_string();
+    let lines = covcalcs[0].bg.iter().zip(covcalcs[1].bg.iter()).flat_map(
+        |(t1, t2)| {
+            let reader1 = BufReader::new(File::open(t1).unwrap()).lines();
+            let reader2 = BufReader::new(File::open(t2).unwrap()).lines();
+
+            reader1.zip(reader2).map(
+                |(l1, l2)| {
+                    let l1 = l1.unwrap();
+                    let l2 = l2.unwrap();
+                    let fields1: Vec<&str> = l1.split('\t').collect();
+                    let fields2: Vec<&str> = l2.split('\t').collect();
+
+                    let chrom1: String = fields1[0].to_string();
+                    let chrom2: String = fields2[0].to_string();
+                    let start1: u32 = fields1[1].parse().unwrap();
+                    let start2: u32 = fields2[1].parse().unwrap();
+                    let end1: u32 = fields1[2].parse().unwrap();
+                    let end2: u32 = fields2[2].parse().unwrap();
+
+                    // Assert the regions are equal.
+                    assert_eq!(chrom1, chrom2);
+                    assert_eq!(start1, start2);
+                    assert_eq!(end1, end2);
+
+                    // Calculate the coverage.
+                    let cov1: f32 = fields1[3].parse().unwrap();
+                    let cov2: f32 = fields2[3].parse().unwrap();
+                    let cov = calc_ratio(cov1, cov2, &sf.0, &sf.1, &pseudocount, operation);
+
+                    (chrom1, Value { start: start1, end: end1, value: cov })
+                }).coalesce(|p, c| {
+                if p.1.value == c.1.value {
+                    Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
+                } else {
+                    Err((p, c))
                 }
-            )
-    });
-    let _readlen2 = median(readlen2);
-    let _fraglen2 = median(fraglen2);
-    let (sf1, sf2) = scale_factor_bamcompare(scalefactorsmethod, mapped1, mapped2, binsize, effective_genome_size, norm);
-    println!("scale factor1 = {}, scale factor2 = {}", sf1, sf2);
-    let bge = collapse_bgvecs(bg1, bg2, sf1, sf2, pseudocount, operation);
-    write_file(ofile, ofiletype, bge, chromsizes);
+            })
+        }
+    );
+    write_covfile(lines, ofile, ofiletype, chromsizes);
     Ok(())
+}
+
+struct ParsedBamFile<'a> {
+    bamfile: &'a str,
+    ispe: bool,
+    bg: Vec<TempPath>,
+    mapped: u32,
+    unmapped: u32,
+    readlen: f32,
+    fraglen: f32
 }