Skip to content

Commit

Permalink
Merge pull request #24 from jeffersonfparil/dev
Browse files Browse the repository at this point in the history
feat: min_coverage_breadth
  • Loading branch information
jeffersonfparil authored Jul 7, 2024
2 parents 8d5a45c + 3b43690 commit a78f40b
Show file tree
Hide file tree
Showing 16 changed files with 116 additions and 69 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,18 @@ jobs:
run: |
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage 10
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage-depth 10
cargo run -- pileup2sync -f ./tests/test.pileup -p ./tests/test.csv --n-threads 2 --max-base-error-rate 0.0001 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- fisher_exact_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- chisq_test -f ./tests/test.sync -p ./tests/test.csv --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- pearson_corr -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- ols_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage 10 --min-allele-frequency 0.01
cargo run -- mle_iter -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2 --min-coverage-depth 10 --min-allele-frequency 0.01
cargo run -- gwalpha -f ./tests/test.sync -p ./tests/test.py --n-threads 2 --gwalpha-method LS
cargo run -- gwalpha -f ./tests/test.sync -p ./tests/test.py --n-threads 2 --gwalpha-method ML
cargo run -- sync2csv -f ./tests/test.sync -p ./tests/test.csv --phen-delim , --phen-name-col 0 --phen-value-col 2,3 --n-threads 2
Expand Down
48 changes: 20 additions & 28 deletions src/base/pileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use crate::base::*;
use ndarray::prelude::*;
use std::collections::HashSet;
use std::fs::{File, OpenOptions};
use std::io::{self, prelude::*, BufReader, BufWriter, Error, ErrorKind, SeekFrom};
use std::str;
Expand Down Expand Up @@ -267,39 +266,32 @@ impl Filter for PileupLine {
}
}

if filter_stats.keep_if_any_meets_coverage {
// At least 1 pool needs to have been covered min_coverage times
let mut met_coverage_requirement = false;
for c in &self.coverages {
if c >= &filter_stats.min_coverage {
met_coverage_requirement = true;
break;
}
}
if !met_coverage_requirement{
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let pools_covered = self.coverages.iter()
.filter(|&&c| c >= filter_stats.min_coverage_depth)
.take(min_coverage_breadth as usize)
.count();

} else {
// All the pools need to have been covered at least min_coverage times
for c in &self.coverages {
if c < &filter_stats.min_coverage {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
if pools_covered != min_coverage_breadth as usize {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each locus must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
let mut covered_alleles = HashSet::new();
let mut unique_alleles = Vec::new();

for pool in &self.read_codes{
for read in pool{
covered_alleles.insert(read);
'outer: for pool in &self.read_codes {
for &read in pool {
if !unique_alleles.contains(&read) {
unique_alleles.push(read);
if unique_alleles.len() == 2 {
break 'outer;
}
}
}
}

if covered_alleles.len() < 2{
if unique_alleles.len() < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
Expand Down Expand Up @@ -591,10 +583,10 @@ mod tests {
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_if_any_meets_coverage: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
4 changes: 2 additions & 2 deletions src/base/structs_and_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ pub struct FileSyncPhen {
pub struct FilterStats {
pub remove_ns: bool,
pub remove_monoallelic: bool,
pub keep_if_any_meets_coverage: bool,
pub keep_lowercase_reference: bool,
pub max_base_error_rate: f64,
pub min_coverage: u64,
pub min_coverage_depth: u64,
pub min_coverage_breadth: f64,
pub min_allele_frequency: f64,
pub max_missingness_rate: f64,
pub pool_sizes: Vec<f64>,
Expand Down
14 changes: 10 additions & 4 deletions src/base/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ impl Filter for LocusCounts {
sum_coverage
.iter()
.fold(sum_coverage[0], |min, &x| if x < min { x } else { min });
if min_sum_coverage < filter_stats.min_coverage as f64 {
if min_sum_coverage < filter_stats.min_coverage_depth as f64 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus
// for i in 0..self.matrix.nrows() {
// if sum_coverage[i] < filter_stats.min_coverage as f64 {
// if sum_coverage[i] < filter_stats.min_coverage_depth as f64 {
// for j in 0..self.matrix.ncols() {
// self.matrix[(i, j)] = f64::NAN as u64;
// }
Expand Down Expand Up @@ -1569,8 +1569,11 @@ mod tests {
let frequencies = *(counts.to_frequencies().unwrap());
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -1675,8 +1678,11 @@ mod tests {
// let phen = file_phen.lparse().unwrap();
// let filter_stats = FilterStats {
// remove_ns: true,
// remove_monoallelic: false,
// keep_lowercase_reference: false,
// max_base_error_rate: 0.005,
// min_coverage: 0,
// min_coverage_depth: 0,
// min_coverage_breadth: 1.0,
// min_allele_frequency: 0.0001,
// max_missingness_rate: 0.2,
// pool_sizes: phen.pool_sizes,
Expand Down
30 changes: 26 additions & 4 deletions src/base/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,29 @@ impl Filter for VcfLine {
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
// All the pools need to have been covered at least min_coverage times
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let mut pools_covered = 0;
for i in 0..self.allele_depths.len() {
let c = &self.allele_depths[i].iter().fold(0, |sum, x| sum + x);
if c < &filter_stats.min_coverage {
if c >= &filter_stats.min_coverage_depth {
pools_covered += 1;
}
if pools_covered == min_coverage_breadth {
break;
}
}
if pools_covered != min_coverage_breadth {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each locus must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
if self.alternative_alleles.len() == 0 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}

// Filter by minimum allele frequency,
//// First convert the counts per pool into frequencies
let allele_frequencies = match self.to_frequencies() {
Expand Down Expand Up @@ -509,16 +525,22 @@ mod tests {
);
let filter_stats_1 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
};
let filter_stats_2 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 10,
min_coverage_depth: 10,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
Expand Down
5 changes: 4 additions & 1 deletion src/gp/cv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,11 @@ mod tests {
let n_threads = 2;
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,11 @@ mod tests {
Array2::from_shape_vec((5, 2), vec![1, 9, 2, 8, 3, 7, 4, 6, 5, 5]).unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/gwalpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,11 @@ mod tests {
.reversed_axes();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
2 changes: 1 addition & 1 deletion src/gwas/ols.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ mod tests {
// let filter_stats = FilterStats {
// remove_ns: true,
// max_base_error_rate: 0.005,
// min_coverage: 1,
// min_coverage_depth: 1,
// min_allele_frequency: 0.005,
// pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
// };
Expand Down
7 changes: 5 additions & 2 deletions src/imputation/adaptive_ld_knn_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -610,7 +613,7 @@ mod tests {
// --phen-pool-size-col 1 \
// --phen-value-col 2 \
// --min-allele-frequency 0.0001 \
// --min-coverage 0 \
// --min-coverage-depth 0 \
// --min-quality 0.01 \
// --max-missingness-rate 0.2 \
// --min-depth-set-to-missing 5 \
Expand Down
5 changes: 4 additions & 1 deletion src/imputation/filtering_missing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,11 @@ mod tests {
////////////////////////////////////////////////////////////////////////////////////////////////////////
let filter_stats = FilterStats {
remove_ns: false,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 1.0,
min_coverage: 0,
min_coverage_depth: 0,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.000001,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down
7 changes: 5 additions & 2 deletions src/imputation/mean_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -263,7 +266,7 @@ mod tests {
// --phen-pool-size-col 1 \
// --phen-value-col 2 \
// --min-allele-frequency 0.0001 \
// --min-coverage 0 \
// --min-coverage-depth 0 \
// --min-quality 0.01 \
// --max-missingness-rate 0.75 \
// --min-depth-set-to-missing 5 \
Expand Down
Loading

0 comments on commit a78f40b

Please sign in to comment.