Skip to content

Commit

Permalink
Merge pull request #1347 from deeptools/bamcov_rs
Browse files Browse the repository at this point in the history
Resolving memory issues in cov calc
  • Loading branch information
WardDeb authored Dec 3, 2024
2 parents 90ce478 + e1602e2 commit 010a1cf
Show file tree
Hide file tree
Showing 12 changed files with 707 additions and 473 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,6 @@ jobs:
- name: pip install
run: |
pip install .[actions]
- name: PEP8
run: |
micromamba activate test_and_build
flake8 . --exclude=.venv,.build,build --ignore=E501,F403,E402,F999,F405,E722,W504,W605
- name: Test deepTools
run: |
pytest -v
Expand Down
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ rayon = "1.10.0"
itertools = "0.12.1"
bigtools = "0.5.3"
tokio = "*"
flate2 = "*"
flate2 = "*"
tempfile = "*"
15 changes: 14 additions & 1 deletion pydeeptools/deeptools/bamCompare2.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,12 @@ def main(args=None):
args.scaleFactorsMethod = 'None'
else:
args.normalizeUsing = 'None'
if not args.minMappingQuality:
args.minMappingQuality = 0
if not args.samFlagInclude:
args.samFlagInclude = 0
if not args.samFlagExclude:
args.samFlagExclude = 0
print(args)
args.pseudocount = 1
r_bamcompare(
Expand All @@ -252,11 +258,18 @@ def main(args=None):
args.outFileName, # output file
args.outFileFormat, # output file format
args.normalizeUsing, # normalization method
args.scaleFactorsMethod, # scaling method
args.effectiveGenomeSize, # effective genome size
args.scaleFactorsMethod, # scaling method
args.operation,
args.pseudocount,
args.ignoreDuplicates,
args.minMappingQuality,
args.samFlagInclude,
args.samFlagExclude,
args.minFragmentLength,
args.maxFragmentLength,
args.numberOfProcessors, # threads
args.ignoreForNormalization,
args.binSize, # bin size
[], # regions
True # verbose
Expand Down
2 changes: 1 addition & 1 deletion pydeeptools/deeptools/bamCoverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def main(args=None):
if args.Offset[0] == 0:
sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
sys.exit("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
else:
if args.Offset[0] == 0:
sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
Expand Down
66 changes: 49 additions & 17 deletions pydeeptools/deeptools/bamCoverage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,12 @@ def scaleFactor(string):

def process_args(args=None):
args = parseArguments().parse_args(args)

if not args.smoothLength:
args.smoothLength = 0
if args.smoothLength and args.smoothLength <= args.binSize:
print("Warning: the smooth length given ({}) is smaller than the bin "
"size ({}).\n\n No smoothing will be done".format(args.smoothLength, args.binSize))
args.smoothLength = None
args.smoothLength = 0

if not args.ignoreForNormalization:
args.ignoreForNormalization = []
Expand All @@ -130,24 +131,55 @@ def process_args(args=None):

def main(args=None):
args = process_args(args)
print(args)
# Fail if user tries RPGC without setting the effective genome size
if args.normalizeUsing == 'RPGC' and args.effectiveGenomeSize is None:
print("Error: You must specify the effective genome size when using "
"RPGC normalization. Use --effectiveGenomeSize to set this value.")
sys.exit()
if not args.effectiveGenomeSize:
args.effectiveGenomeSize = 0
if not args.normalizeUsing:
args.normalizeUsing = 'None'
if not args.Offset:
args.Offset = [1, -1]
if not args.extendReads:
args.extendReads = 0
if not args.filterRNAstrand:
args.filterRNAstrand = 'None'
if not args.blackListFileName:
args.blackListFileName = 'None'
if not args.minMappingQuality:
args.minMappingQuality = 0
if not args.samFlagInclude:
args.samFlagInclude = 0
if not args.samFlagExclude:
args.samFlagExclude = 0

print(args)

r_bamcoverage(
args.bam, # bam file
args.outFileName, # output file
args.outFileFormat, # output format
args.normalizeUsing, # normalization
args.effectiveGenomeSize, # effective genome size
args.numberOfProcessors, # threads
args.binSize, # bin size
[], # regions
args.verbose # verbose
args.bam, # bam file
args.outFileName, # output file
args.outFileFormat, # output format
## Normalization options
args.normalizeUsing, # normalization
args.effectiveGenomeSize, # effective genome size
args.scaleFactor,
# processing options
args.MNase,
args.Offset,
args.extendReads,
args.centerReads,
args.filterRNAstrand,
args.blackListFileName,
args.ignoreForNormalization,
args.skipNonCoveredRegions,
args.smoothLength,
args.binSize, # bin size
# Filtering options
args.ignoreDuplicates,
args.minMappingQuality,
args.samFlagInclude,
args.samFlagExclude,
args.minFragmentLength,
args.maxFragmentLength,
# running options
args.numberOfProcessors, # threads
[], # regions
args.verbose # verbose
)
181 changes: 128 additions & 53 deletions src/bamcompare.rs
Original file line number Diff line number Diff line change
@@ -1,79 +1,154 @@
use pyo3::prelude::*;
use pyo3::types::PyList;
use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
use crate::filehandler::{bam_ispaired, write_file};
use crate::covcalc::{bam_pileup, parse_regions, collapse_bgvecs};
use std::io::prelude::*;
use std::io::{BufReader};
use std::fs::File;
use itertools::Itertools;
use bigtools::{Value};
use crate::filehandler::{bam_ispaired, write_covfile};
use crate::covcalc::{bam_pileup, parse_regions, alignmentfilters};
use crate::normalization::scale_factor_bamcompare;
use crate::calc::median;
use crate::calc::{median, calc_ratio};
use tempfile::{NamedTempFile, TempPath};

#[pyfunction]
pub fn r_bamcompare(
bam_ifile1: &str,
bam_ifile2: &str,
ofile: &str,
ofiletype: &str,
// input and output
bamifile1: &str, // input bamfile 1
bamifile2: &str, // input bamfile 2
ofile: &str, // output file
ofiletype: &str, // ouput file type, bedgraph or bigwig
// norm options
norm: &str,
scalefactorsmethod: &str,
effective_genome_size: u64,
scalefactorsmethod: &str,
operation: &str,
pseudocount: f64,
pseudocount: f32,
// filtering options
ignoreduplicates: bool,
minmappingquality: u8, //
samflaginclude: u16,
samflagexclude: u16,
minfraglen: u32,
maxfraglen: u32,
nproc: usize,
_ignorechr: Py<PyList>,
binsize: u32,
regions: Vec<(String, u64, u64)>,
regions: Vec<(String, u32, u32)>,
verbose: bool
) -> PyResult<()> {
let ispe1 = bam_ispaired(bam_ifile1);
let ispe2 = bam_ispaired(bam_ifile2);
let ispe1 = bam_ispaired(bamifile1);
let ispe2 = bam_ispaired(bamifile2);

if verbose {
println!("Sample1: {} is-paired: {}", bam_ifile1, ispe1);
println!("Sample2: {} is-paired: {}", bam_ifile2, ispe2);
println!("Sample1: {} is-paired: {}", bamifile1, ispe1);
println!("Sample2: {} is-paired: {}", bamifile2, ispe2);
}
let mut ignorechr: Vec<String> = Vec::new();
Python::with_gil(|py| {
ignorechr = _ignorechr.extract(py).expect("Failed to retrieve ignorechr.");
});
// Set alignment filters
let filters = alignmentfilters {
minmappingquality: minmappingquality,
samflaginclude: samflaginclude,
samflagexclude: samflagexclude,
minfraglen: minfraglen,
maxfraglen: maxfraglen
};

// Parse regions & calculate coverage
let (regions, chromsizes) = parse_regions(&regions, bam_ifile1);
// Parse regions & calculate coverage. Note that
let (regions, chromsizes) = parse_regions(&regions, bamifile1);
let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();

// Parse first bamfile
let (bg1, mapped1, _unmapped1, readlen1, fraglen1) = pool.install(|| {
regions.par_iter()
.map(|i| bam_pileup(bam_ifile1, &i, &binsize, &ispe1))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
_bg.extend(bg);
_readlen.extend(readlen);
_fraglen.extend(fraglen);
_mapped += mapped;
_unmapped += unmapped;
(_bg, _mapped, _unmapped, _readlen, _fraglen)
// Set up the bam files in a Vec.
let bamfiles = vec![(bamifile1, ispe1), (bamifile2, ispe2)];

let covcalcs: Vec<ParsedBamFile> = pool.install(|| {
bamfiles.par_iter()
.map(|(bamfile, ispe)| {
let (bg, mapped, unmapped, readlen, fraglen) = regions.par_iter()
.map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
_bg.extend(bg);
_readlen.extend(readlen);
_fraglen.extend(fraglen);
_mapped += mapped;
_unmapped += unmapped;
(_bg, _mapped, _unmapped, _readlen, _fraglen)
}
);
ParsedBamFile {
bamfile: bamfile,
ispe: *ispe,
bg: bg,
mapped: mapped,
unmapped: unmapped,
readlen: median(readlen),
fraglen: median(fraglen)
}
)
})
.collect()
});
let _readlen1 = median(readlen1);
let _fraglen1 = median(fraglen1);

// Parse first bamfile
let (bg2, mapped2, _unmapped2, readlen2, fraglen2) = pool.install(|| {
regions.par_iter()
.map(|i| bam_pileup(bam_ifile2, &i, &binsize, &ispe2))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
_bg.extend(bg);
_readlen.extend(readlen);
_fraglen.extend(fraglen);
_mapped += mapped;
_unmapped += unmapped;
(_bg, _mapped, _unmapped, _readlen, _fraglen)
// Calculate scale factors.
let sf = scale_factor_bamcompare(scalefactorsmethod, covcalcs[0].mapped, covcalcs[1].mapped, binsize, effective_genome_size, norm);
println!("scale factor1 = {}, scale factor2 = {}", sf.0, sf.1);
// Create output stream
let mut chrom = "".to_string();
let lines = covcalcs[0].bg.iter().zip(covcalcs[1].bg.iter()).flat_map(
|(t1, t2)| {
let reader1 = BufReader::new(File::open(t1).unwrap()).lines();
let reader2 = BufReader::new(File::open(t2).unwrap()).lines();

reader1.zip(reader2).map(
|(l1, l2)| {
let l1 = l1.unwrap();
let l2 = l2.unwrap();
let fields1: Vec<&str> = l1.split('\t').collect();
let fields2: Vec<&str> = l2.split('\t').collect();

let chrom1: String = fields1[0].to_string();
let chrom2: String = fields2[0].to_string();
let start1: u32 = fields1[1].parse().unwrap();
let start2: u32 = fields2[1].parse().unwrap();
let end1: u32 = fields1[2].parse().unwrap();
let end2: u32 = fields2[2].parse().unwrap();

// Assert the regions are equal.
assert_eq!(chrom1, chrom2);
assert_eq!(start1, start2);
assert_eq!(end1, end2);

// Calculate the coverage.
let cov1: f32 = fields1[3].parse().unwrap();
let cov2: f32 = fields2[3].parse().unwrap();
let cov = calc_ratio(cov1, cov2, &sf.0, &sf.1, &pseudocount, operation);

(chrom1, Value { start: start1, end: end1, value: cov })
}).coalesce(|p, c| {
if p.1.value == c.1.value {
Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
} else {
Err((p, c))
}
)
});
let _readlen2 = median(readlen2);
let _fraglen2 = median(fraglen2);
let (sf1, sf2) = scale_factor_bamcompare(scalefactorsmethod, mapped1, mapped2, binsize, effective_genome_size, norm);
println!("scale factor1 = {}, scale factor2 = {}", sf1, sf2);
let bge = collapse_bgvecs(bg1, bg2, sf1, sf2, pseudocount, operation);
write_file(ofile, ofiletype, bge, chromsizes);
})
}
);
write_covfile(lines, ofile, ofiletype, chromsizes);
Ok(())
}

struct ParsedBamFile<'a> {
bamfile: &'a str,
ispe: bool,
bg: Vec<TempPath>,
mapped: u32,
unmapped: u32,
readlen: f32,
fraglen: f32
}
Loading

0 comments on commit 010a1cf

Please sign in to comment.